From 9ca8dbcc65cfc63d6f5ef3312a33184e1d726e00 Mon Sep 17 00:00:00 2001
From: Yunhong Jiang
Date: Tue, 4 Aug 2015 12:17:53 -0700
Subject: Add the rt linux 4.1.3-rt3 as base

Import the rt linux 4.1.3-rt3 as the OPNFV kvm base. It is taken from
git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git,
branch linux-4.1.y-rt, and the base is:

commit 0917f823c59692d751951bf5ea699a2d1e2f26a2
Author: Sebastian Andrzej Siewior
Date: Sat Jul 25 12:13:34 2015 +0200

    Prepare v4.1.3-rt3

    Signed-off-by: Sebastian Andrzej Siewior

Importing this way loses all the git history, which is not good; we
should use a separate OPNFV project repository for this in the future.

Change-Id: I87543d81c9df70d99c5001fbdf646b202c19f423
Signed-off-by: Yunhong Jiang
---
 kernel/drivers/staging/lustre/Kconfig | 3 + kernel/drivers/staging/lustre/Makefile | 2 + kernel/drivers/staging/lustre/README.txt | 87 + kernel/drivers/staging/lustre/TODO | 12 + .../staging/lustre/include/linux/libcfs/curproc.h | 97 + .../staging/lustre/include/linux/libcfs/libcfs.h | 187 + .../lustre/include/linux/libcfs/libcfs_cpu.h | 219 + .../lustre/include/linux/libcfs/libcfs_crypto.h | 199 + .../lustre/include/linux/libcfs/libcfs_debug.h | 262 ++ .../lustre/include/linux/libcfs/libcfs_fail.h | 171 + .../lustre/include/linux/libcfs/libcfs_hash.h | 843 ++++ .../lustre/include/linux/libcfs/libcfs_ioctl.h | 214 + .../include/linux/libcfs/libcfs_kernelcomm.h | 118 + .../lustre/include/linux/libcfs/libcfs_prim.h | 87 + .../lustre/include/linux/libcfs/libcfs_private.h | 556 +++ .../lustre/include/linux/libcfs/libcfs_string.h | 107 + .../lustre/include/linux/libcfs/libcfs_time.h | 131 + .../lustre/include/linux/libcfs/libcfs_workitem.h | 110 + .../lustre/include/linux/libcfs/linux/libcfs.h | 147 + .../lustre/include/linux/libcfs/linux/linux-cpu.h | 82 + .../lustre/include/linux/libcfs/linux/linux-mem.h | 80 + .../lustre/include/linux/libcfs/linux/linux-time.h | 144 + .../lustre/include/linux/lnet/api-support.h | 44 + .../staging/lustre/include/linux/lnet/api.h | 217 + .../staging/lustre/include/linux/lnet/lib-lnet.h | 883 ++++ .../staging/lustre/include/linux/lnet/lib-types.h | 760 ++++ .../lustre/include/linux/lnet/linux/api-support.h | 42 + .../lustre/include/linux/lnet/linux/lib-lnet.h | 71 + .../lustre/include/linux/lnet/linux/lib-types.h | 45 + .../staging/lustre/include/linux/lnet/linux/lnet.h | 56 + .../lustre/include/linux/lnet/lnet-sysctl.h | 49 + .../staging/lustre/include/linux/lnet/lnet.h | 51 + .../staging/lustre/include/linux/lnet/lnetctl.h | 80 + .../staging/lustre/include/linux/lnet/lnetst.h | 491 +++ .../staging/lustre/include/linux/lnet/ptllnd.h | 93 + .../lustre/include/linux/lnet/ptllnd_wire.h | 119 + .../staging/lustre/include/linux/lnet/socklnd.h | 103 + .../staging/lustre/include/linux/lnet/types.h | 492 +++ kernel/drivers/staging/lustre/lnet/Kconfig | 40 + kernel/drivers/staging/lustre/lnet/Makefile | 1 + kernel/drivers/staging/lustre/lnet/klnds/Makefile | 1 + .../staging/lustre/lnet/klnds/o2iblnd/Makefile | 2 + .../staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c | 3118 ++++++++++++++ .../staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h | 1030 +++++ .../staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c | 3519 +++++++++++++++ .../lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c | 230 + .../staging/lustre/lnet/klnds/socklnd/Makefile | 3 + .../staging/lustre/lnet/klnds/socklnd/socklnd.c | 2886 +++++++++++++ .../staging/lustre/lnet/klnds/socklnd/socklnd.h | 588 +++ .../staging/lustre/lnet/klnds/socklnd/socklnd_cb.c | 2634 ++++++++++++ .../lustre/lnet/klnds/socklnd/socklnd_lib-linux.c | 714
++++ .../lustre/lnet/klnds/socklnd/socklnd_lib-linux.h | 86 + .../lustre/lnet/klnds/socklnd/socklnd_modparams.c | 188 + .../lustre/lnet/klnds/socklnd/socklnd_proto.c | 797 ++++ kernel/drivers/staging/lustre/lnet/lnet/Makefile | 5 + kernel/drivers/staging/lustre/lnet/lnet/acceptor.c | 500 +++ kernel/drivers/staging/lustre/lnet/lnet/api-ni.c | 1940 +++++++++ kernel/drivers/staging/lustre/lnet/lnet/config.c | 1292 ++++++ kernel/drivers/staging/lustre/lnet/lnet/lib-eq.c | 441 ++ kernel/drivers/staging/lustre/lnet/lnet/lib-md.c | 454 ++ kernel/drivers/staging/lustre/lnet/lnet/lib-me.c | 298 ++ kernel/drivers/staging/lustre/lnet/lnet/lib-move.c | 2460 +++++++++++ kernel/drivers/staging/lustre/lnet/lnet/lib-msg.c | 647 +++ kernel/drivers/staging/lustre/lnet/lnet/lib-ptl.c | 935 ++++ kernel/drivers/staging/lustre/lnet/lnet/lo.c | 120 + kernel/drivers/staging/lustre/lnet/lnet/module.c | 155 + kernel/drivers/staging/lustre/lnet/lnet/peer.c | 338 ++ kernel/drivers/staging/lustre/lnet/lnet/router.c | 1706 ++++++++ .../drivers/staging/lustre/lnet/lnet/router_proc.c | 968 +++++ .../drivers/staging/lustre/lnet/selftest/Makefile | 4 + .../staging/lustre/lnet/selftest/brw_test.c | 508 +++ .../drivers/staging/lustre/lnet/selftest/conctl.c | 929 ++++ .../drivers/staging/lustre/lnet/selftest/conrpc.c | 1396 ++++++ .../drivers/staging/lustre/lnet/selftest/conrpc.h | 146 + .../drivers/staging/lustre/lnet/selftest/console.c | 2096 +++++++++ .../drivers/staging/lustre/lnet/selftest/console.h | 235 + .../staging/lustre/lnet/selftest/framework.c | 1804 ++++++++ .../drivers/staging/lustre/lnet/selftest/module.c | 159 + .../staging/lustre/lnet/selftest/ping_test.c | 230 + kernel/drivers/staging/lustre/lnet/selftest/rpc.c | 1673 ++++++++ kernel/drivers/staging/lustre/lnet/selftest/rpc.h | 302 ++ .../staging/lustre/lnet/selftest/selftest.h | 624 +++ .../drivers/staging/lustre/lnet/selftest/timer.c | 248 ++ .../drivers/staging/lustre/lnet/selftest/timer.h | 53 + kernel/drivers/staging/lustre/lustre/Kconfig | 62 + kernel/drivers/staging/lustre/lustre/Makefile | 2 + kernel/drivers/staging/lustre/lustre/fid/Makefile | 3 + .../staging/lustre/lustre/fid/fid_internal.h | 56 + kernel/drivers/staging/lustre/lustre/fid/fid_lib.c | 95 + .../staging/lustre/lustre/fid/fid_request.c | 572 +++ .../drivers/staging/lustre/lustre/fid/lproc_fid.c | 225 + kernel/drivers/staging/lustre/lustre/fld/Makefile | 3 + .../drivers/staging/lustre/lustre/fld/fld_cache.c | 546 +++ .../staging/lustre/lustre/fld/fld_internal.h | 193 + .../staging/lustre/lustre/fld/fld_request.c | 526 +++ .../drivers/staging/lustre/lustre/fld/lproc_fld.c | 172 + .../staging/lustre/lustre/include/cl_object.h | 3287 ++++++++++++++ .../staging/lustre/lustre/include/dt_object.h | 1499 +++++++ .../staging/lustre/lustre/include/interval_tree.h | 124 + .../staging/lustre/lustre/include/lclient.h | 433 ++ .../lustre/lustre/include/linux/lustre_compat25.h | 216 + .../lustre/lustre/include/linux/lustre_lite.h | 98 + .../lustre/include/linux/lustre_patchless_compat.h | 85 + .../lustre/lustre/include/linux/lustre_user.h | 70 + .../staging/lustre/lustre/include/linux/obd.h | 125 + .../staging/lustre/lustre/include/lprocfs_status.h | 1015 +++++ .../staging/lustre/lustre/include/lu_object.h | 1340 ++++++ .../drivers/staging/lustre/lustre/include/lu_ref.h | 182 + .../staging/lustre/lustre/include/lustre/libiam.h | 145 + .../lustre/lustre/include/lustre/ll_fiemap.h | 121 + .../lustre/include/lustre/lustre_build_version.h | 2 + .../lustre/lustre/include/lustre/lustre_errno.h | 215 + 
.../lustre/lustre/include/lustre/lustre_idl.h | 3734 ++++++++++++++++ .../lustre/include/lustre/lustre_lfsck_user.h | 95 + .../lustre/lustre/include/lustre/lustre_user.h | 1179 +++++ .../staging/lustre/lustre/include/lustre_acl.h | 49 + .../staging/lustre/lustre/include/lustre_capa.h | 305 ++ .../staging/lustre/lustre/include/lustre_cfg.h | 293 ++ .../staging/lustre/lustre/include/lustre_debug.h | 56 + .../staging/lustre/lustre/include/lustre_disk.h | 547 +++ .../staging/lustre/lustre/include/lustre_dlm.h | 1480 +++++++ .../lustre/lustre/include/lustre_dlm_flags.h | 476 +++ .../staging/lustre/lustre/include/lustre_eacl.h | 95 + .../staging/lustre/lustre/include/lustre_export.h | 406 ++ .../staging/lustre/lustre/include/lustre_fid.h | 767 ++++ .../staging/lustre/lustre/include/lustre_fld.h | 160 + .../staging/lustre/lustre/include/lustre_ha.h | 64 + .../staging/lustre/lustre/include/lustre_handles.h | 97 + .../staging/lustre/lustre/include/lustre_import.h | 385 ++ .../staging/lustre/lustre/include/lustre_intent.h | 62 + .../staging/lustre/lustre/include/lustre_lib.h | 666 +++ .../staging/lustre/lustre/include/lustre_lite.h | 150 + .../staging/lustre/lustre/include/lustre_log.h | 545 +++ .../staging/lustre/lustre/include/lustre_mdc.h | 191 + .../staging/lustre/lustre/include/lustre_mds.h | 81 + .../staging/lustre/lustre/include/lustre_net.h | 2967 +++++++++++++ .../staging/lustre/lustre/include/lustre_param.h | 121 + .../staging/lustre/lustre/include/lustre_quota.h | 241 ++ .../lustre/lustre/include/lustre_req_layout.h | 341 ++ .../staging/lustre/lustre/include/lustre_sec.h | 1147 +++++ .../staging/lustre/lustre/include/lustre_ver.h | 26 + kernel/drivers/staging/lustre/lustre/include/obd.h | 1496 +++++++ .../staging/lustre/lustre/include/obd_cache.h | 39 + .../staging/lustre/lustre/include/obd_cksum.h | 176 + .../staging/lustre/lustre/include/obd_class.h | 1929 +++++++++ .../staging/lustre/lustre/include/obd_support.h | 862 ++++ .../staging/lustre/lustre/lclient/glimpse.c | 269 ++ .../staging/lustre/lustre/lclient/lcommon_cl.c | 1287 ++++++ .../staging/lustre/lustre/lclient/lcommon_misc.c | 199 + .../staging/lustre/lustre/ldlm/interval_tree.c | 751 ++++ kernel/drivers/staging/lustre/lustre/ldlm/l_lock.c | 76 + .../staging/lustre/lustre/ldlm/ldlm_extent.c | 241 ++ .../staging/lustre/lustre/ldlm/ldlm_flock.c | 859 ++++ .../staging/lustre/lustre/ldlm/ldlm_inodebits.c | 74 + .../staging/lustre/lustre/ldlm/ldlm_internal.h | 316 ++ .../drivers/staging/lustre/lustre/ldlm/ldlm_lib.c | 870 ++++ .../drivers/staging/lustre/lustre/ldlm/ldlm_lock.c | 2322 ++++++++++ .../staging/lustre/lustre/ldlm/ldlm_lockd.c | 1191 ++++++ .../staging/lustre/lustre/ldlm/ldlm_plain.c | 72 + .../drivers/staging/lustre/lustre/ldlm/ldlm_pool.c | 1455 +++++++ .../staging/lustre/lustre/ldlm/ldlm_request.c | 2294 ++++++++++ .../staging/lustre/lustre/ldlm/ldlm_resource.c | 1425 +++++++ .../drivers/staging/lustre/lustre/libcfs/Makefile | 18 + .../drivers/staging/lustre/lustre/libcfs/debug.c | 460 ++ kernel/drivers/staging/lustre/lustre/libcfs/fail.c | 138 + kernel/drivers/staging/lustre/lustre/libcfs/hash.c | 2098 +++++++++ .../lustre/lustre/libcfs/kernel_user_comm.c | 240 ++ .../staging/lustre/lustre/libcfs/libcfs_cpu.c | 224 + .../staging/lustre/lustre/libcfs/libcfs_lock.c | 189 + .../staging/lustre/lustre/libcfs/libcfs_mem.c | 202 + .../staging/lustre/lustre/libcfs/libcfs_string.c | 562 +++ .../staging/lustre/lustre/libcfs/linux/linux-cpu.c | 1056 +++++ .../lustre/libcfs/linux/linux-crypto-adler.c | 141 + 
.../lustre/lustre/libcfs/linux/linux-crypto.c | 291 ++ .../lustre/lustre/libcfs/linux/linux-crypto.h | 29 + .../lustre/lustre/libcfs/linux/linux-curproc.c | 111 + .../lustre/lustre/libcfs/linux/linux-debug.c | 200 + .../lustre/lustre/libcfs/linux/linux-module.c | 183 + .../lustre/lustre/libcfs/linux/linux-prim.c | 217 + .../lustre/lustre/libcfs/linux/linux-tcpip.c | 623 +++ .../lustre/lustre/libcfs/linux/linux-tracefile.c | 275 ++ .../lustre/lustre/libcfs/linux/linux-tracefile.h | 48 + .../drivers/staging/lustre/lustre/libcfs/module.c | 976 +++++ .../staging/lustre/lustre/libcfs/nidstrings.c | 842 ++++ kernel/drivers/staging/lustre/lustre/libcfs/prng.c | 139 + .../staging/lustre/lustre/libcfs/tracefile.c | 1196 ++++++ .../staging/lustre/lustre/libcfs/tracefile.h | 340 ++ .../staging/lustre/lustre/libcfs/workitem.c | 479 +++ .../drivers/staging/lustre/lustre/llite/Makefile | 11 + .../drivers/staging/lustre/lustre/llite/dcache.c | 363 ++ kernel/drivers/staging/lustre/lustre/llite/dir.c | 1971 +++++++++ kernel/drivers/staging/lustre/lustre/llite/file.c | 3624 ++++++++++++++++ .../staging/lustre/lustre/llite/llite_capa.c | 654 +++ .../staging/lustre/lustre/llite/llite_close.c | 393 ++ .../staging/lustre/lustre/llite/llite_internal.h | 1521 +++++++ .../staging/lustre/lustre/llite/llite_lib.c | 2354 ++++++++++ .../staging/lustre/lustre/llite/llite_mmap.c | 492 +++ .../staging/lustre/lustre/llite/llite_nfs.c | 335 ++ .../staging/lustre/lustre/llite/llite_rmtacl.c | 300 ++ kernel/drivers/staging/lustre/lustre/llite/lloop.c | 877 ++++ .../staging/lustre/lustre/llite/lproc_llite.c | 1536 +++++++ kernel/drivers/staging/lustre/lustre/llite/namei.c | 1178 +++++ .../staging/lustre/lustre/llite/remote_perm.c | 331 ++ kernel/drivers/staging/lustre/lustre/llite/rw.c | 1289 ++++++ kernel/drivers/staging/lustre/lustre/llite/rw26.c | 553 +++ .../staging/lustre/lustre/llite/statahead.c | 1729 ++++++++ .../drivers/staging/lustre/lustre/llite/super25.c | 226 + .../drivers/staging/lustre/lustre/llite/symlink.c | 170 + .../drivers/staging/lustre/lustre/llite/vvp_dev.c | 547 +++ .../staging/lustre/lustre/llite/vvp_internal.h | 62 + .../drivers/staging/lustre/lustre/llite/vvp_io.c | 1209 ++++++ .../drivers/staging/lustre/lustre/llite/vvp_lock.c | 85 + .../staging/lustre/lustre/llite/vvp_object.c | 201 + .../drivers/staging/lustre/lustre/llite/vvp_page.c | 551 +++ kernel/drivers/staging/lustre/lustre/llite/xattr.c | 621 +++ .../staging/lustre/lustre/llite/xattr_cache.c | 538 +++ kernel/drivers/staging/lustre/lustre/lmv/Makefile | 3 + kernel/drivers/staging/lustre/lustre/lmv/lmv_fld.c | 83 + .../drivers/staging/lustre/lustre/lmv/lmv_intent.c | 323 ++ .../staging/lustre/lustre/lmv/lmv_internal.h | 157 + kernel/drivers/staging/lustre/lustre/lmv/lmv_obd.c | 2892 +++++++++++++ .../drivers/staging/lustre/lustre/lmv/lproc_lmv.c | 237 ++ kernel/drivers/staging/lustre/lustre/lov/Makefile | 6 + .../staging/lustre/lustre/lov/lov_cl_internal.h | 839 ++++ kernel/drivers/staging/lustre/lustre/lov/lov_dev.c | 528 +++ kernel/drivers/staging/lustre/lustre/lov/lov_ea.c | 363 ++ .../staging/lustre/lustre/lov/lov_internal.h | 319 ++ kernel/drivers/staging/lustre/lustre/lov/lov_io.c | 990 +++++ .../drivers/staging/lustre/lustre/lov/lov_lock.c | 1198 ++++++ .../drivers/staging/lustre/lustre/lov/lov_merge.c | 186 + kernel/drivers/staging/lustre/lustre/lov/lov_obd.c | 2395 +++++++++++ .../drivers/staging/lustre/lustre/lov/lov_object.c | 1001 +++++ .../drivers/staging/lustre/lustre/lov/lov_offset.c | 264 ++ 
.../drivers/staging/lustre/lustre/lov/lov_pack.c | 511 +++ .../drivers/staging/lustre/lustre/lov/lov_page.c | 232 + .../drivers/staging/lustre/lustre/lov/lov_pool.c | 673 +++ .../staging/lustre/lustre/lov/lov_request.c | 773 ++++ .../drivers/staging/lustre/lustre/lov/lovsub_dev.c | 209 + .../drivers/staging/lustre/lustre/lov/lovsub_io.c | 55 + .../staging/lustre/lustre/lov/lovsub_lock.c | 466 ++ .../staging/lustre/lustre/lov/lovsub_object.c | 164 + .../staging/lustre/lustre/lov/lovsub_page.c | 71 + .../drivers/staging/lustre/lustre/lov/lproc_lov.c | 311 ++ kernel/drivers/staging/lustre/lustre/mdc/Makefile | 3 + .../drivers/staging/lustre/lustre/mdc/lproc_mdc.c | 220 + .../staging/lustre/lustre/mdc/mdc_internal.h | 181 + kernel/drivers/staging/lustre/lustre/mdc/mdc_lib.c | 593 +++ .../drivers/staging/lustre/lustre/mdc/mdc_locks.c | 1313 ++++++ .../drivers/staging/lustre/lustre/mdc/mdc_reint.c | 483 +++ .../staging/lustre/lustre/mdc/mdc_request.c | 2731 ++++++++++++ kernel/drivers/staging/lustre/lustre/mgc/Makefile | 3 + .../drivers/staging/lustre/lustre/mgc/lproc_mgc.c | 80 + .../staging/lustre/lustre/mgc/mgc_internal.h | 73 + .../staging/lustre/lustre/mgc/mgc_request.c | 1762 ++++++++ .../staging/lustre/lustre/obdclass/Makefile | 11 + .../drivers/staging/lustre/lustre/obdclass/acl.c | 548 +++ .../drivers/staging/lustre/lustre/obdclass/capa.c | 421 ++ .../staging/lustre/lustre/obdclass/cl_internal.h | 121 + .../drivers/staging/lustre/lustre/obdclass/cl_io.c | 1669 ++++++++ .../staging/lustre/lustre/obdclass/cl_lock.c | 2239 ++++++++++ .../staging/lustre/lustre/obdclass/cl_object.c | 1139 +++++ .../staging/lustre/lustre/obdclass/cl_page.c | 1553 +++++++ .../staging/lustre/lustre/obdclass/class_obd.c | 704 +++ .../drivers/staging/lustre/lustre/obdclass/debug.c | 109 + .../staging/lustre/lustre/obdclass/dt_object.c | 1059 +++++ .../staging/lustre/lustre/obdclass/genops.c | 1833 ++++++++ .../lustre/lustre/obdclass/linux/linux-module.c | 449 ++ .../lustre/lustre/obdclass/linux/linux-obdo.c | 222 + .../lustre/lustre/obdclass/linux/linux-sysctl.c | 405 ++ .../drivers/staging/lustre/lustre/obdclass/llog.c | 1007 +++++ .../staging/lustre/lustre/obdclass/llog_cat.c | 815 ++++ .../staging/lustre/lustre/obdclass/llog_internal.h | 98 + .../staging/lustre/lustre/obdclass/llog_obd.c | 262 ++ .../staging/lustre/lustre/obdclass/llog_swab.c | 415 ++ .../lustre/lustre/obdclass/lprocfs_counters.c | 139 + .../lustre/lustre/obdclass/lprocfs_status.c | 2059 +++++++++ .../staging/lustre/lustre/obdclass/lu_object.c | 2192 ++++++++++ .../staging/lustre/lustre/obdclass/lu_ref.c | 50 + .../lustre/lustre/obdclass/lustre_handles.c | 257 ++ .../staging/lustre/lustre/obdclass/lustre_peer.c | 217 + .../staging/lustre/lustre/obdclass/obd_config.c | 1953 +++++++++ .../staging/lustre/lustre/obdclass/obd_mount.c | 1319 ++++++ .../drivers/staging/lustre/lustre/obdclass/obdo.c | 362 ++ .../staging/lustre/lustre/obdclass/statfs_pack.c | 75 + .../drivers/staging/lustre/lustre/obdclass/uuid.c | 82 + .../drivers/staging/lustre/lustre/obdecho/Makefile | 2 + .../staging/lustre/lustre/obdecho/echo_client.c | 2197 ++++++++++ .../staging/lustre/lustre/obdecho/echo_internal.h | 47 + .../staging/lustre/lustre/obdecho/lproc_echo.c | 57 + kernel/drivers/staging/lustre/lustre/osc/Makefile | 4 + .../drivers/staging/lustre/lustre/osc/lproc_osc.c | 751 ++++ .../drivers/staging/lustre/lustre/osc/osc_cache.c | 2944 +++++++++++++ .../staging/lustre/lustre/osc/osc_cl_internal.h | 685 +++ kernel/drivers/staging/lustre/lustre/osc/osc_dev.c | 262 ++ 
.../staging/lustre/lustre/osc/osc_internal.h | 203 + kernel/drivers/staging/lustre/lustre/osc/osc_io.c | 819 ++++ .../drivers/staging/lustre/lustre/osc/osc_lock.c | 1613 +++++++ .../drivers/staging/lustre/lustre/osc/osc_object.c | 271 ++ .../drivers/staging/lustre/lustre/osc/osc_page.c | 916 ++++ .../drivers/staging/lustre/lustre/osc/osc_quota.c | 327 ++ .../staging/lustre/lustre/osc/osc_request.c | 3379 +++++++++++++++ .../drivers/staging/lustre/lustre/ptlrpc/Makefile | 20 + .../drivers/staging/lustre/lustre/ptlrpc/client.c | 3149 ++++++++++++++ .../staging/lustre/lustre/ptlrpc/connection.c | 241 ++ .../drivers/staging/lustre/lustre/ptlrpc/errno.c | 380 ++ .../drivers/staging/lustre/lustre/ptlrpc/events.c | 585 +++ .../drivers/staging/lustre/lustre/ptlrpc/import.c | 1642 +++++++ .../drivers/staging/lustre/lustre/ptlrpc/layout.c | 2442 +++++++++++ .../staging/lustre/lustre/ptlrpc/llog_client.c | 366 ++ .../staging/lustre/lustre/ptlrpc/llog_net.c | 72 + .../staging/lustre/lustre/ptlrpc/lproc_ptlrpc.c | 1366 ++++++ .../drivers/staging/lustre/lustre/ptlrpc/niobuf.c | 731 ++++ kernel/drivers/staging/lustre/lustre/ptlrpc/nrs.c | 1754 ++++++++ .../staging/lustre/lustre/ptlrpc/nrs_fifo.c | 270 ++ .../staging/lustre/lustre/ptlrpc/pack_generic.c | 2536 +++++++++++ kernel/drivers/staging/lustre/lustre/ptlrpc/pers.c | 75 + .../drivers/staging/lustre/lustre/ptlrpc/pinger.c | 678 +++ .../staging/lustre/lustre/ptlrpc/ptlrpc_internal.h | 312 ++ .../staging/lustre/lustre/ptlrpc/ptlrpc_module.c | 171 + .../drivers/staging/lustre/lustre/ptlrpc/ptlrpcd.c | 811 ++++ .../drivers/staging/lustre/lustre/ptlrpc/recover.c | 379 ++ kernel/drivers/staging/lustre/lustre/ptlrpc/sec.c | 2459 +++++++++++ .../staging/lustre/lustre/ptlrpc/sec_bulk.c | 884 ++++ .../staging/lustre/lustre/ptlrpc/sec_config.c | 901 ++++ .../drivers/staging/lustre/lustre/ptlrpc/sec_gc.c | 252 ++ .../staging/lustre/lustre/ptlrpc/sec_lproc.c | 199 + .../staging/lustre/lustre/ptlrpc/sec_null.c | 458 ++ .../staging/lustre/lustre/ptlrpc/sec_plain.c | 1013 +++++ .../drivers/staging/lustre/lustre/ptlrpc/service.c | 3105 ++++++++++++++ .../staging/lustre/lustre/ptlrpc/wiretest.c | 4492 ++++++++++++++++++++ 330 files changed, 219991 insertions(+) create mode 100644 kernel/drivers/staging/lustre/Kconfig create mode 100644 kernel/drivers/staging/lustre/Makefile create mode 100644 kernel/drivers/staging/lustre/README.txt create mode 100644 kernel/drivers/staging/lustre/TODO create mode 100644 kernel/drivers/staging/lustre/include/linux/libcfs/curproc.h create mode 100644 kernel/drivers/staging/lustre/include/linux/libcfs/libcfs.h create mode 100644 kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_cpu.h create mode 100644 kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_crypto.h create mode 100644 kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_debug.h create mode 100644 kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_fail.h create mode 100644 kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_hash.h create mode 100644 kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_ioctl.h create mode 100644 kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_kernelcomm.h create mode 100644 kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_prim.h create mode 100644 kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h create mode 100644 kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_string.h create mode 100644 kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_time.h create 
mode 100644 kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_workitem.h create mode 100644 kernel/drivers/staging/lustre/include/linux/libcfs/linux/libcfs.h create mode 100644 kernel/drivers/staging/lustre/include/linux/libcfs/linux/linux-cpu.h create mode 100644 kernel/drivers/staging/lustre/include/linux/libcfs/linux/linux-mem.h create mode 100644 kernel/drivers/staging/lustre/include/linux/libcfs/linux/linux-time.h create mode 100644 kernel/drivers/staging/lustre/include/linux/lnet/api-support.h create mode 100644 kernel/drivers/staging/lustre/include/linux/lnet/api.h create mode 100644 kernel/drivers/staging/lustre/include/linux/lnet/lib-lnet.h create mode 100644 kernel/drivers/staging/lustre/include/linux/lnet/lib-types.h create mode 100644 kernel/drivers/staging/lustre/include/linux/lnet/linux/api-support.h create mode 100644 kernel/drivers/staging/lustre/include/linux/lnet/linux/lib-lnet.h create mode 100644 kernel/drivers/staging/lustre/include/linux/lnet/linux/lib-types.h create mode 100644 kernel/drivers/staging/lustre/include/linux/lnet/linux/lnet.h create mode 100644 kernel/drivers/staging/lustre/include/linux/lnet/lnet-sysctl.h create mode 100644 kernel/drivers/staging/lustre/include/linux/lnet/lnet.h create mode 100644 kernel/drivers/staging/lustre/include/linux/lnet/lnetctl.h create mode 100644 kernel/drivers/staging/lustre/include/linux/lnet/lnetst.h create mode 100644 kernel/drivers/staging/lustre/include/linux/lnet/ptllnd.h create mode 100644 kernel/drivers/staging/lustre/include/linux/lnet/ptllnd_wire.h create mode 100644 kernel/drivers/staging/lustre/include/linux/lnet/socklnd.h create mode 100644 kernel/drivers/staging/lustre/include/linux/lnet/types.h create mode 100644 kernel/drivers/staging/lustre/lnet/Kconfig create mode 100644 kernel/drivers/staging/lustre/lnet/Makefile create mode 100644 kernel/drivers/staging/lustre/lnet/klnds/Makefile create mode 100644 kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/Makefile create mode 100644 kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c create mode 100644 kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h create mode 100644 kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c create mode 100644 kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c create mode 100644 kernel/drivers/staging/lustre/lnet/klnds/socklnd/Makefile create mode 100644 kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c create mode 100644 kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h create mode 100644 kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c create mode 100644 kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c create mode 100644 kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.h create mode 100644 kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c create mode 100644 kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c create mode 100644 kernel/drivers/staging/lustre/lnet/lnet/Makefile create mode 100644 kernel/drivers/staging/lustre/lnet/lnet/acceptor.c create mode 100644 kernel/drivers/staging/lustre/lnet/lnet/api-ni.c create mode 100644 kernel/drivers/staging/lustre/lnet/lnet/config.c create mode 100644 kernel/drivers/staging/lustre/lnet/lnet/lib-eq.c create mode 100644 kernel/drivers/staging/lustre/lnet/lnet/lib-md.c create mode 100644 kernel/drivers/staging/lustre/lnet/lnet/lib-me.c create mode 100644 kernel/drivers/staging/lustre/lnet/lnet/lib-move.c create mode 100644 
kernel/drivers/staging/lustre/lnet/lnet/lib-msg.c create mode 100644 kernel/drivers/staging/lustre/lnet/lnet/lib-ptl.c create mode 100644 kernel/drivers/staging/lustre/lnet/lnet/lo.c create mode 100644 kernel/drivers/staging/lustre/lnet/lnet/module.c create mode 100644 kernel/drivers/staging/lustre/lnet/lnet/peer.c create mode 100644 kernel/drivers/staging/lustre/lnet/lnet/router.c create mode 100644 kernel/drivers/staging/lustre/lnet/lnet/router_proc.c create mode 100644 kernel/drivers/staging/lustre/lnet/selftest/Makefile create mode 100644 kernel/drivers/staging/lustre/lnet/selftest/brw_test.c create mode 100644 kernel/drivers/staging/lustre/lnet/selftest/conctl.c create mode 100644 kernel/drivers/staging/lustre/lnet/selftest/conrpc.c create mode 100644 kernel/drivers/staging/lustre/lnet/selftest/conrpc.h create mode 100644 kernel/drivers/staging/lustre/lnet/selftest/console.c create mode 100644 kernel/drivers/staging/lustre/lnet/selftest/console.h create mode 100644 kernel/drivers/staging/lustre/lnet/selftest/framework.c create mode 100644 kernel/drivers/staging/lustre/lnet/selftest/module.c create mode 100644 kernel/drivers/staging/lustre/lnet/selftest/ping_test.c create mode 100644 kernel/drivers/staging/lustre/lnet/selftest/rpc.c create mode 100644 kernel/drivers/staging/lustre/lnet/selftest/rpc.h create mode 100644 kernel/drivers/staging/lustre/lnet/selftest/selftest.h create mode 100644 kernel/drivers/staging/lustre/lnet/selftest/timer.c create mode 100644 kernel/drivers/staging/lustre/lnet/selftest/timer.h create mode 100644 kernel/drivers/staging/lustre/lustre/Kconfig create mode 100644 kernel/drivers/staging/lustre/lustre/Makefile create mode 100644 kernel/drivers/staging/lustre/lustre/fid/Makefile create mode 100644 kernel/drivers/staging/lustre/lustre/fid/fid_internal.h create mode 100644 kernel/drivers/staging/lustre/lustre/fid/fid_lib.c create mode 100644 kernel/drivers/staging/lustre/lustre/fid/fid_request.c create mode 100644 kernel/drivers/staging/lustre/lustre/fid/lproc_fid.c create mode 100644 kernel/drivers/staging/lustre/lustre/fld/Makefile create mode 100644 kernel/drivers/staging/lustre/lustre/fld/fld_cache.c create mode 100644 kernel/drivers/staging/lustre/lustre/fld/fld_internal.h create mode 100644 kernel/drivers/staging/lustre/lustre/fld/fld_request.c create mode 100644 kernel/drivers/staging/lustre/lustre/fld/lproc_fld.c create mode 100644 kernel/drivers/staging/lustre/lustre/include/cl_object.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/dt_object.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/interval_tree.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lclient.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/linux/lustre_compat25.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/linux/lustre_lite.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/linux/lustre_user.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/linux/obd.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lprocfs_status.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lu_object.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lu_ref.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre/libiam.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre/ll_fiemap.h create mode 100644 
kernel/drivers/staging/lustre/lustre/include/lustre/lustre_build_version.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre/lustre_errno.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre/lustre_idl.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre/lustre_lfsck_user.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre/lustre_user.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre_acl.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre_capa.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre_cfg.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre_debug.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre_disk.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre_dlm.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre_dlm_flags.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre_eacl.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre_export.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre_fid.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre_fld.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre_ha.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre_handles.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre_import.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre_intent.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre_lib.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre_lite.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre_log.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre_mdc.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre_mds.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre_net.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre_param.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre_quota.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre_req_layout.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre_sec.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/lustre_ver.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/obd.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/obd_cache.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/obd_cksum.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/obd_class.h create mode 100644 kernel/drivers/staging/lustre/lustre/include/obd_support.h create mode 100644 kernel/drivers/staging/lustre/lustre/lclient/glimpse.c create mode 100644 kernel/drivers/staging/lustre/lustre/lclient/lcommon_cl.c create mode 100644 kernel/drivers/staging/lustre/lustre/lclient/lcommon_misc.c create mode 100644 kernel/drivers/staging/lustre/lustre/ldlm/interval_tree.c create mode 100644 kernel/drivers/staging/lustre/lustre/ldlm/l_lock.c create mode 100644 kernel/drivers/staging/lustre/lustre/ldlm/ldlm_extent.c create mode 100644 kernel/drivers/staging/lustre/lustre/ldlm/ldlm_flock.c create mode 100644 kernel/drivers/staging/lustre/lustre/ldlm/ldlm_inodebits.c create mode 100644 kernel/drivers/staging/lustre/lustre/ldlm/ldlm_internal.h create mode 100644 
kernel/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c create mode 100644 kernel/drivers/staging/lustre/lustre/ldlm/ldlm_lock.c create mode 100644 kernel/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c create mode 100644 kernel/drivers/staging/lustre/lustre/ldlm/ldlm_plain.c create mode 100644 kernel/drivers/staging/lustre/lustre/ldlm/ldlm_pool.c create mode 100644 kernel/drivers/staging/lustre/lustre/ldlm/ldlm_request.c create mode 100644 kernel/drivers/staging/lustre/lustre/ldlm/ldlm_resource.c create mode 100644 kernel/drivers/staging/lustre/lustre/libcfs/Makefile create mode 100644 kernel/drivers/staging/lustre/lustre/libcfs/debug.c create mode 100644 kernel/drivers/staging/lustre/lustre/libcfs/fail.c create mode 100644 kernel/drivers/staging/lustre/lustre/libcfs/hash.c create mode 100644 kernel/drivers/staging/lustre/lustre/libcfs/kernel_user_comm.c create mode 100644 kernel/drivers/staging/lustre/lustre/libcfs/libcfs_cpu.c create mode 100644 kernel/drivers/staging/lustre/lustre/libcfs/libcfs_lock.c create mode 100644 kernel/drivers/staging/lustre/lustre/libcfs/libcfs_mem.c create mode 100644 kernel/drivers/staging/lustre/lustre/libcfs/libcfs_string.c create mode 100644 kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c create mode 100644 kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto-adler.c create mode 100644 kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto.c create mode 100644 kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto.h create mode 100644 kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-curproc.c create mode 100644 kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-debug.c create mode 100644 kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-module.c create mode 100644 kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-prim.c create mode 100644 kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-tcpip.c create mode 100644 kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.c create mode 100644 kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.h create mode 100644 kernel/drivers/staging/lustre/lustre/libcfs/module.c create mode 100644 kernel/drivers/staging/lustre/lustre/libcfs/nidstrings.c create mode 100644 kernel/drivers/staging/lustre/lustre/libcfs/prng.c create mode 100644 kernel/drivers/staging/lustre/lustre/libcfs/tracefile.c create mode 100644 kernel/drivers/staging/lustre/lustre/libcfs/tracefile.h create mode 100644 kernel/drivers/staging/lustre/lustre/libcfs/workitem.c create mode 100644 kernel/drivers/staging/lustre/lustre/llite/Makefile create mode 100644 kernel/drivers/staging/lustre/lustre/llite/dcache.c create mode 100644 kernel/drivers/staging/lustre/lustre/llite/dir.c create mode 100644 kernel/drivers/staging/lustre/lustre/llite/file.c create mode 100644 kernel/drivers/staging/lustre/lustre/llite/llite_capa.c create mode 100644 kernel/drivers/staging/lustre/lustre/llite/llite_close.c create mode 100644 kernel/drivers/staging/lustre/lustre/llite/llite_internal.h create mode 100644 kernel/drivers/staging/lustre/lustre/llite/llite_lib.c create mode 100644 kernel/drivers/staging/lustre/lustre/llite/llite_mmap.c create mode 100644 kernel/drivers/staging/lustre/lustre/llite/llite_nfs.c create mode 100644 kernel/drivers/staging/lustre/lustre/llite/llite_rmtacl.c create mode 100644 kernel/drivers/staging/lustre/lustre/llite/lloop.c create mode 100644 kernel/drivers/staging/lustre/lustre/llite/lproc_llite.c create mode 100644 
kernel/drivers/staging/lustre/lustre/llite/namei.c create mode 100644 kernel/drivers/staging/lustre/lustre/llite/remote_perm.c create mode 100644 kernel/drivers/staging/lustre/lustre/llite/rw.c create mode 100644 kernel/drivers/staging/lustre/lustre/llite/rw26.c create mode 100644 kernel/drivers/staging/lustre/lustre/llite/statahead.c create mode 100644 kernel/drivers/staging/lustre/lustre/llite/super25.c create mode 100644 kernel/drivers/staging/lustre/lustre/llite/symlink.c create mode 100644 kernel/drivers/staging/lustre/lustre/llite/vvp_dev.c create mode 100644 kernel/drivers/staging/lustre/lustre/llite/vvp_internal.h create mode 100644 kernel/drivers/staging/lustre/lustre/llite/vvp_io.c create mode 100644 kernel/drivers/staging/lustre/lustre/llite/vvp_lock.c create mode 100644 kernel/drivers/staging/lustre/lustre/llite/vvp_object.c create mode 100644 kernel/drivers/staging/lustre/lustre/llite/vvp_page.c create mode 100644 kernel/drivers/staging/lustre/lustre/llite/xattr.c create mode 100644 kernel/drivers/staging/lustre/lustre/llite/xattr_cache.c create mode 100644 kernel/drivers/staging/lustre/lustre/lmv/Makefile create mode 100644 kernel/drivers/staging/lustre/lustre/lmv/lmv_fld.c create mode 100644 kernel/drivers/staging/lustre/lustre/lmv/lmv_intent.c create mode 100644 kernel/drivers/staging/lustre/lustre/lmv/lmv_internal.h create mode 100644 kernel/drivers/staging/lustre/lustre/lmv/lmv_obd.c create mode 100644 kernel/drivers/staging/lustre/lustre/lmv/lproc_lmv.c create mode 100644 kernel/drivers/staging/lustre/lustre/lov/Makefile create mode 100644 kernel/drivers/staging/lustre/lustre/lov/lov_cl_internal.h create mode 100644 kernel/drivers/staging/lustre/lustre/lov/lov_dev.c create mode 100644 kernel/drivers/staging/lustre/lustre/lov/lov_ea.c create mode 100644 kernel/drivers/staging/lustre/lustre/lov/lov_internal.h create mode 100644 kernel/drivers/staging/lustre/lustre/lov/lov_io.c create mode 100644 kernel/drivers/staging/lustre/lustre/lov/lov_lock.c create mode 100644 kernel/drivers/staging/lustre/lustre/lov/lov_merge.c create mode 100644 kernel/drivers/staging/lustre/lustre/lov/lov_obd.c create mode 100644 kernel/drivers/staging/lustre/lustre/lov/lov_object.c create mode 100644 kernel/drivers/staging/lustre/lustre/lov/lov_offset.c create mode 100644 kernel/drivers/staging/lustre/lustre/lov/lov_pack.c create mode 100644 kernel/drivers/staging/lustre/lustre/lov/lov_page.c create mode 100644 kernel/drivers/staging/lustre/lustre/lov/lov_pool.c create mode 100644 kernel/drivers/staging/lustre/lustre/lov/lov_request.c create mode 100644 kernel/drivers/staging/lustre/lustre/lov/lovsub_dev.c create mode 100644 kernel/drivers/staging/lustre/lustre/lov/lovsub_io.c create mode 100644 kernel/drivers/staging/lustre/lustre/lov/lovsub_lock.c create mode 100644 kernel/drivers/staging/lustre/lustre/lov/lovsub_object.c create mode 100644 kernel/drivers/staging/lustre/lustre/lov/lovsub_page.c create mode 100644 kernel/drivers/staging/lustre/lustre/lov/lproc_lov.c create mode 100644 kernel/drivers/staging/lustre/lustre/mdc/Makefile create mode 100644 kernel/drivers/staging/lustre/lustre/mdc/lproc_mdc.c create mode 100644 kernel/drivers/staging/lustre/lustre/mdc/mdc_internal.h create mode 100644 kernel/drivers/staging/lustre/lustre/mdc/mdc_lib.c create mode 100644 kernel/drivers/staging/lustre/lustre/mdc/mdc_locks.c create mode 100644 kernel/drivers/staging/lustre/lustre/mdc/mdc_reint.c create mode 100644 kernel/drivers/staging/lustre/lustre/mdc/mdc_request.c create mode 100644 
kernel/drivers/staging/lustre/lustre/mgc/Makefile create mode 100644 kernel/drivers/staging/lustre/lustre/mgc/lproc_mgc.c create mode 100644 kernel/drivers/staging/lustre/lustre/mgc/mgc_internal.h create mode 100644 kernel/drivers/staging/lustre/lustre/mgc/mgc_request.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/Makefile create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/acl.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/capa.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/cl_internal.h create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/cl_io.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/cl_lock.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/cl_object.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/cl_page.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/class_obd.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/debug.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/dt_object.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/genops.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/linux/linux-module.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/linux/linux-obdo.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/llog.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/llog_cat.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/llog_internal.h create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/llog_obd.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/llog_swab.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/lprocfs_counters.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/lprocfs_status.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/lu_object.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/lu_ref.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/lustre_handles.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/lustre_peer.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/obd_config.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/obd_mount.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/obdo.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/statfs_pack.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdclass/uuid.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdecho/Makefile create mode 100644 kernel/drivers/staging/lustre/lustre/obdecho/echo_client.c create mode 100644 kernel/drivers/staging/lustre/lustre/obdecho/echo_internal.h create mode 100644 kernel/drivers/staging/lustre/lustre/obdecho/lproc_echo.c create mode 100644 kernel/drivers/staging/lustre/lustre/osc/Makefile create mode 100644 kernel/drivers/staging/lustre/lustre/osc/lproc_osc.c create mode 100644 kernel/drivers/staging/lustre/lustre/osc/osc_cache.c create mode 100644 kernel/drivers/staging/lustre/lustre/osc/osc_cl_internal.h create mode 100644 kernel/drivers/staging/lustre/lustre/osc/osc_dev.c create mode 100644 kernel/drivers/staging/lustre/lustre/osc/osc_internal.h create mode 100644 kernel/drivers/staging/lustre/lustre/osc/osc_io.c create mode 100644 kernel/drivers/staging/lustre/lustre/osc/osc_lock.c create 
mode 100644 kernel/drivers/staging/lustre/lustre/osc/osc_object.c create mode 100644 kernel/drivers/staging/lustre/lustre/osc/osc_page.c create mode 100644 kernel/drivers/staging/lustre/lustre/osc/osc_quota.c create mode 100644 kernel/drivers/staging/lustre/lustre/osc/osc_request.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/Makefile create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/client.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/connection.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/errno.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/events.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/import.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/layout.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/llog_client.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/llog_net.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/lproc_ptlrpc.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/niobuf.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/nrs.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/nrs_fifo.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/pack_generic.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/pers.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/pinger.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_internal.h create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_module.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/ptlrpcd.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/recover.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/sec.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/sec_bulk.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/sec_config.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/sec_gc.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/sec_lproc.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/sec_null.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/sec_plain.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/service.c create mode 100644 kernel/drivers/staging/lustre/lustre/ptlrpc/wiretest.c (limited to 'kernel/drivers/staging/lustre') diff --git a/kernel/drivers/staging/lustre/Kconfig b/kernel/drivers/staging/lustre/Kconfig new file mode 100644 index 000000000..a224d88bf --- /dev/null +++ b/kernel/drivers/staging/lustre/Kconfig @@ -0,0 +1,3 @@ +source "drivers/staging/lustre/lustre/Kconfig" + +source "drivers/staging/lustre/lnet/Kconfig" diff --git a/kernel/drivers/staging/lustre/Makefile b/kernel/drivers/staging/lustre/Makefile new file mode 100644 index 000000000..95ffe337a --- /dev/null +++ b/kernel/drivers/staging/lustre/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_LNET) += lnet/ +obj-$(CONFIG_LUSTRE_FS) += lustre/ diff --git a/kernel/drivers/staging/lustre/README.txt b/kernel/drivers/staging/lustre/README.txt new file mode 100644 index 000000000..cf0ca50ff --- /dev/null +++ b/kernel/drivers/staging/lustre/README.txt @@ -0,0 +1,87 @@ +Lustre Parallel Filesystem Client +================================= + +The Lustre file system is an open-source, parallel file system +that supports many requirements of leadership class HPC simulation +environments. 
+Born from a research project at Carnegie Mellon University,
+the Lustre file system is a widely-used option in HPC.
+The Lustre file system provides a POSIX-compliant file system interface,
+can scale to thousands of clients, petabytes of storage and
+hundreds of gigabytes per second of I/O bandwidth.
+
+Unlike shared disk storage cluster filesystems (e.g. OCFS2, GFS, GPFS),
+Lustre has independent metadata and data servers that clients can access
+in parallel to maximize performance.
+
+To use the Lustre client you will need to download the Lustre client
+tools from
+https://downloads.hpdd.intel.com/public/lustre/latest-feature-release/
+(the package name is lustre-client).
+
+You will need to install and configure your Lustre servers separately.
+
+Mount Syntax
+============
+After you have installed the lustre-client tools, including the mount.lustre
+binary, you can mount your Lustre filesystem with:
+
+mount -t lustre mgs:/fsname mnt
+
+where mgs is the host name or IP address of your Lustre MGS (management service)
+and fsname is the name of the filesystem you would like to mount.
+
+
+Mount Options
+=============
+
+  noflock
+    Disable POSIX file locking (applications trying to use
+    the functionality will get ENOSYS)
+
+  localflock
+    Enable local flock support, using only client-local flock
+    (faster, for applications that require flock but do not run
+    on multiple nodes).
+
+  flock
+    Enable cluster-global POSIX file locking coherent across all
+    client nodes.
+
+  user_xattr, nouser_xattr
+    Support "user." extended attributes (or not)
+
+  user_fid2path, nouser_fid2path
+    Enable FID to path translation by regular users (or not)
+
+  checksum, nochecksum
+    Verify data consistency on the wire and in memory as it passes
+    between the layers (or not).
+
+  lruresize, nolruresize
+    Allow lock LRU to be controlled by memory pressure on the server
+    (or only 100 (default, controlled by lru_size proc parameter) locks
+    per CPU per server on this client).
+
+  lazystatfs, nolazystatfs
+    Do not block in statfs() if some of the servers are down.
+
+  32bitapi
+    Shrink inode numbers to fit into 32 bits. This is necessary
+    if you plan to re-export the Lustre filesystem from this client via
+    NFSv4.
+
+  verbose, noverbose
+    Enable mount/umount console messages (or not)
+
+More Information
+================
+You can get more information at
+OpenSFS website: http://lustre.opensfs.org/about/
+Intel HPDD wiki: https://wiki.hpdd.intel.com
+
+Out-of-tree Lustre client and server code is available at:
+http://git.whamcloud.com/fs/lustre-release.git
+
+Latest binary packages:
+http://lustre.opensfs.org/download-lustre/
diff --git a/kernel/drivers/staging/lustre/TODO b/kernel/drivers/staging/lustre/TODO
new file mode 100644
index 000000000..0512594b5
--- /dev/null
+++ b/kernel/drivers/staging/lustre/TODO
@@ -0,0 +1,12 @@
+* Fix possible remaining coding style issues.
+* Remove dead code.
+* Separate client/server functionality. Functions only used by the server can be
+  removed from the client.
+* Clean up libcfs layer. Ideally we can remove include/linux/libcfs entirely.
+* Clean up CLIO layer. Lustre client readahead/writeback control needs to better
+  suit what the kernel provides.
+* Add documentation in Documentation/.
+* Other minor cleanups...
+
+Please send any patches to Greg Kroah-Hartman , Andreas Dilger
+, and Oleg Drokin .
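As a quick illustration of how the options listed in README.txt above are used,
several of them can be passed together via -o in one mount invocation. This is
only a sketch: the MGS NID (mgs01@tcp0), filesystem name (lustrefs) and mount
point (/mnt/lustre) are hypothetical placeholders, not values defined by this
patch.

  # example only: adjust the MGS NID, fsname and mount point for your site
  mount -t lustre -o flock,user_xattr,lazystatfs mgs01@tcp0:/lustrefs /mnt/lustre

Here flock enables cluster-global POSIX locking, user_xattr enables "user."
extended attributes, and lazystatfs keeps statfs() from blocking when some
servers are down, as described in the option list above.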
diff --git a/kernel/drivers/staging/lustre/include/linux/libcfs/curproc.h b/kernel/drivers/staging/lustre/include/linux/libcfs/curproc.h new file mode 100644 index 000000000..1edfca58c --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/libcfs/curproc.h @@ -0,0 +1,97 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/curproc.h + * + * Lustre curproc API declaration + * + * Author: Nikita Danilov + */ + +#ifndef __LIBCFS_CURPROC_H__ +#define __LIBCFS_CURPROC_H__ + +/* + * Plus, platform-specific constant + * + * CFS_CURPROC_COMM_MAX, + * + * and opaque scalar type + * + * kernel_cap_t + */ + +/* check if task is running in compat mode.*/ +#define current_pid() (current->pid) +#define current_comm() (current->comm) + +typedef __u32 cfs_cap_t; + +#define CFS_CAP_CHOWN 0 +#define CFS_CAP_DAC_OVERRIDE 1 +#define CFS_CAP_DAC_READ_SEARCH 2 +#define CFS_CAP_FOWNER 3 +#define CFS_CAP_FSETID 4 +#define CFS_CAP_LINUX_IMMUTABLE 9 +#define CFS_CAP_SYS_ADMIN 21 +#define CFS_CAP_SYS_BOOT 23 +#define CFS_CAP_SYS_RESOURCE 24 + +#define CFS_CAP_FS_MASK ((1 << CFS_CAP_CHOWN) | \ + (1 << CFS_CAP_DAC_OVERRIDE) | \ + (1 << CFS_CAP_DAC_READ_SEARCH) | \ + (1 << CFS_CAP_FOWNER) | \ + (1 << CFS_CAP_FSETID) | \ + (1 << CFS_CAP_LINUX_IMMUTABLE) | \ + (1 << CFS_CAP_SYS_ADMIN) | \ + (1 << CFS_CAP_SYS_BOOT) | \ + (1 << CFS_CAP_SYS_RESOURCE)) + +void cfs_cap_raise(cfs_cap_t cap); +void cfs_cap_lower(cfs_cap_t cap); +int cfs_cap_raised(cfs_cap_t cap); +cfs_cap_t cfs_curproc_cap_pack(void); + +/* __LIBCFS_CURPROC_H__ */ +#endif +/* + * Local variables: + * c-indentation-style: "K&R" + * c-basic-offset: 8 + * tab-width: 8 + * fill-column: 80 + * scroll-step: 1 + * End: + */ diff --git a/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs.h b/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs.h new file mode 100644 index 000000000..4410d7fdc --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs.h @@ -0,0 +1,187 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LIBCFS_LIBCFS_H__ +#define __LIBCFS_LIBCFS_H__ + +#if !__GNUC__ +#define __attribute__(x) +#endif + +#include "linux/libcfs.h" +#include + +#include "curproc.h" + +#ifndef offsetof +# define offsetof(typ, memb) ((long)(long_ptr_t)((char *)&(((typ *)0)->memb))) +#endif + +#ifndef ARRAY_SIZE +#define ARRAY_SIZE(a) ((sizeof(a)) / (sizeof((a)[0]))) +#endif + +#if !defined(swap) +#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0) +#endif + +#if !defined(container_of) +/* given a pointer @ptr to the field @member embedded into type (usually + * struct) @type, return pointer to the embedding instance of @type. */ +#define container_of(ptr, type, member) \ + ((type *)((char *)(ptr)-(char *)(&((type *)0)->member))) +#endif + +static inline int __is_po2(unsigned long long val) +{ + return !(val & (val - 1)); +} + +#define IS_PO2(val) __is_po2((unsigned long long)(val)) + +#define LOWEST_BIT_SET(x) ((x) & ~((x) - 1)) + +/* + * Lustre Error Checksum: calculates checksum + * of Hex number by XORing each bit. 
+ */ +#define LERRCHKSUM(hexnum) (((hexnum) & 0xf) ^ ((hexnum) >> 4 & 0xf) ^ \ + ((hexnum) >> 8 & 0xf)) + +#define LUSTRE_SRV_LNET_PID LUSTRE_LNET_PID + +#include + +int libcfs_arch_init(void); +void libcfs_arch_cleanup(void); + +/* libcfs tcpip */ +int libcfs_ipif_query(char *name, int *up, __u32 *ip, __u32 *mask); +int libcfs_ipif_enumerate(char ***names); +void libcfs_ipif_free_enumeration(char **names, int n); +int libcfs_sock_listen(struct socket **sockp, __u32 ip, int port, int backlog); +int libcfs_sock_accept(struct socket **newsockp, struct socket *sock); +void libcfs_sock_abort_accept(struct socket *sock); +int libcfs_sock_connect(struct socket **sockp, int *fatal, + __u32 local_ip, int local_port, + __u32 peer_ip, int peer_port); +int libcfs_sock_setbuf(struct socket *socket, int txbufsize, int rxbufsize); +int libcfs_sock_getbuf(struct socket *socket, int *txbufsize, int *rxbufsize); +int libcfs_sock_getaddr(struct socket *socket, int remote, __u32 *ip, int *port); +int libcfs_sock_write(struct socket *sock, void *buffer, int nob, int timeout); +int libcfs_sock_read(struct socket *sock, void *buffer, int nob, int timeout); +void libcfs_sock_release(struct socket *sock); + +/* need both kernel and user-land acceptor */ +#define LNET_ACCEPTOR_MIN_RESERVED_PORT 512 +#define LNET_ACCEPTOR_MAX_RESERVED_PORT 1023 + +/* + * libcfs pseudo device operations + * + * It's just draft now. + */ + +struct cfs_psdev_file { + unsigned long off; + void *private_data; + unsigned long reserved1; + unsigned long reserved2; +}; + +struct cfs_psdev_ops { + int (*p_open)(unsigned long, void *); + int (*p_close)(unsigned long, void *); + int (*p_read)(struct cfs_psdev_file *, char *, unsigned long); + int (*p_write)(struct cfs_psdev_file *, char *, unsigned long); + int (*p_ioctl)(struct cfs_psdev_file *, unsigned long, void *); +}; + +/* + * Drop into debugger, if possible. Implementation is provided by platform. 
+ */ + +void cfs_enter_debugger(void); + +/* + * Defined by platform + */ +int unshare_fs_struct(void); +sigset_t cfs_get_blocked_sigs(void); +sigset_t cfs_block_allsigs(void); +sigset_t cfs_block_sigs(unsigned long sigs); +sigset_t cfs_block_sigsinv(unsigned long sigs); +void cfs_restore_sigs(sigset_t); +int cfs_signal_pending(void); +void cfs_clear_sigpending(void); + +/* + * Random number handling + */ + +/* returns a random 32-bit integer */ +unsigned int cfs_rand(void); +/* seed the generator */ +void cfs_srand(unsigned int, unsigned int); +void cfs_get_random_bytes(void *buf, int size); + +#include "libcfs_debug.h" +#include "libcfs_cpu.h" +#include "libcfs_private.h" +#include "libcfs_ioctl.h" +#include "libcfs_prim.h" +#include "libcfs_time.h" +#include "libcfs_string.h" +#include "libcfs_kernelcomm.h" +#include "libcfs_workitem.h" +#include "libcfs_hash.h" +#include "libcfs_fail.h" +#include "libcfs_crypto.h" + +/* container_of depends on "likely" which is defined in libcfs_private.h */ +static inline void *__container_of(void *ptr, unsigned long shift) +{ + if (unlikely(IS_ERR(ptr) || ptr == NULL)) + return ptr; + return (char *)ptr - shift; +} + +#define container_of0(ptr, type, member) \ + ((type *)__container_of((void *)(ptr), offsetof(type, member))) + +#define _LIBCFS_H + +#endif /* _LIBCFS_H */ diff --git a/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_cpu.h b/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_cpu.h new file mode 100644 index 000000000..787867847 --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_cpu.h @@ -0,0 +1,219 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/libcfs_cpu.h + * + * CPU partition + * . CPU partition is virtual processing unit + * + * . CPU partition can present 1-N cores, or 1-N NUMA nodes, + * in other words, CPU partition is a processors pool. + * + * CPU Partition Table (CPT) + * . a set of CPU partitions + * + * . There are two modes for CPT: CFS_CPU_MODE_NUMA and CFS_CPU_MODE_SMP + * + * . User can specify total number of CPU partitions while creating a + * CPT, ID of CPU partition is always start from 0. + * + * Example: if there are 8 cores on the system, while creating a CPT + * with cpu_npartitions=4: + * core[0, 1] = partition[0], core[2, 3] = partition[1] + * core[4, 5] = partition[2], core[6, 7] = partition[3] + * + * cpu_npartitions=1: + * core[0, 1, ... 7] = partition[0] + * + * . 
User can also specify CPU partitions by string pattern + * + * Examples: cpu_partitions="0[0,1], 1[2,3]" + * cpu_partitions="N 0[0-3], 1[4-8]" + * + * The first character "N" means following numbers are numa ID + * + * . NUMA allocators, CPU affinity threads are built over CPU partitions, + * instead of HW CPUs or HW nodes. + * + * . By default, Lustre modules should refer to the global cfs_cpt_table, + * instead of accessing HW CPUs directly, so concurrency of Lustre can be + * configured by cpu_npartitions of the global cfs_cpt_table + * + * . If cpu_npartitions=1(all CPUs in one pool), lustre should work the + * same way as 2.2 or earlier versions + * + * Author: liang@whamcloud.com + */ + +#ifndef __LIBCFS_CPU_H__ +#define __LIBCFS_CPU_H__ + +/* any CPU partition */ +#define CFS_CPT_ANY (-1) + +#ifdef CONFIG_SMP +/** + * return cpumask of CPU partition \a cpt + */ +cpumask_t *cfs_cpt_cpumask(struct cfs_cpt_table *cptab, int cpt); +/** + * print string information of cpt-table + */ +int cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len); +#else /* !CONFIG_SMP */ +struct cfs_cpt_table { + /* # of CPU partitions */ + int ctb_nparts; + /* cpu mask */ + cpumask_t ctb_mask; + /* node mask */ + nodemask_t ctb_nodemask; + /* version */ + __u64 ctb_version; +}; + +static inline cpumask_t * +cfs_cpt_cpumask(struct cfs_cpt_table *cptab, int cpt) +{ + return NULL; +} + +static inline int +cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len) +{ + return 0; +} +#endif /* CONFIG_SMP */ + +extern struct cfs_cpt_table *cfs_cpt_table; + +/** + * destroy a CPU partition table + */ +void cfs_cpt_table_free(struct cfs_cpt_table *cptab); +/** + * create a cfs_cpt_table with \a ncpt number of partitions + */ +struct cfs_cpt_table *cfs_cpt_table_alloc(unsigned int ncpt); +/** + * return total number of CPU partitions in \a cptab + */ +int +cfs_cpt_number(struct cfs_cpt_table *cptab); +/** + * return number of HW cores or hyper-threadings in a CPU partition \a cpt + */ +int cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt); +/** + * is there any online CPU in CPU partition \a cpt + */ +int cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt); +/** + * return nodemask of CPU partition \a cpt + */ +nodemask_t *cfs_cpt_nodemask(struct cfs_cpt_table *cptab, int cpt); +/** + * shadow current HW processor ID to CPU-partition ID of \a cptab + */ +int cfs_cpt_current(struct cfs_cpt_table *cptab, int remap); +/** + * shadow HW processor ID \a CPU to CPU-partition ID by \a cptab + */ +int cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu); +/** + * bind current thread on a CPU-partition \a cpt of \a cptab + */ +int cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt); +/** + * add \a cpu to CPU partition @cpt of \a cptab, return 1 for success, + * otherwise 0 is returned + */ +int cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu); +/** + * remove \a cpu from CPU partition \a cpt of \a cptab + */ +void cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu); +/** + * add all cpus in \a mask to CPU partition \a cpt + * return 1 if successfully set all CPUs, otherwise return 0 + */ +int cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, + int cpt, cpumask_t *mask); +/** + * remove all cpus in \a mask from CPU partition \a cpt + */ +void cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab, + int cpt, cpumask_t *mask); +/** + * add all cpus in NUMA node \a node to CPU partition \a cpt + * return 1 if successfully set all CPUs, otherwise return 0 + */ +int 
cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node); +/** + * remove all cpus in NUMA node \a node from CPU partition \a cpt + */ +void cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node); + +/** + * add all cpus in node mask \a mask to CPU partition \a cpt + * return 1 if successfully set all CPUs, otherwise return 0 + */ +int cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, + int cpt, nodemask_t *mask); +/** + * remove all cpus in node mask \a mask from CPU partition \a cpt + */ +void cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab, + int cpt, nodemask_t *mask); +/** + * unset all cpus for CPU partition \a cpt + */ +void cfs_cpt_clear(struct cfs_cpt_table *cptab, int cpt); +/** + * convert partition id \a cpt to numa node id, if there are more than one + * nodes in this partition, it might return a different node id each time. + */ +int cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt); + +/** + * return number of HTs in the same core of \a cpu + */ +int cfs_cpu_ht_nsiblings(int cpu); + +/** + * iterate over all CPU partitions in \a cptab + */ +#define cfs_cpt_for_each(i, cptab) \ + for (i = 0; i < cfs_cpt_number(cptab); i++) + +int cfs_cpu_init(void); +void cfs_cpu_fini(void); + +#endif /* __LIBCFS_CPU_H__ */ diff --git a/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_crypto.h b/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_crypto.h new file mode 100644 index 000000000..e8663697e --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_crypto.h @@ -0,0 +1,199 @@ +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please visit http://www.xyratex.com/contact if you need additional + * information or have any questions. 
+ * + * GPL HEADER END + */ + +/* + * Copyright 2012 Xyratex Technology Limited + */ + +#ifndef _LIBCFS_CRYPTO_H +#define _LIBCFS_CRYPTO_H + +struct cfs_crypto_hash_type { + char *cht_name; /**< hash algorithm name, equal to + * format name for crypto api */ + unsigned int cht_key; /**< init key by default (valid for + * 4 bytes context like crc32, adler */ + unsigned int cht_size; /**< hash digest size */ +}; + +enum cfs_crypto_hash_alg { + CFS_HASH_ALG_NULL = 0, + CFS_HASH_ALG_ADLER32, + CFS_HASH_ALG_CRC32, + CFS_HASH_ALG_MD5, + CFS_HASH_ALG_SHA1, + CFS_HASH_ALG_SHA256, + CFS_HASH_ALG_SHA384, + CFS_HASH_ALG_SHA512, + CFS_HASH_ALG_CRC32C, + CFS_HASH_ALG_MAX +}; + +static struct cfs_crypto_hash_type hash_types[] = { + [CFS_HASH_ALG_NULL] = { "null", 0, 0 }, + [CFS_HASH_ALG_ADLER32] = { "adler32", 1, 4 }, + [CFS_HASH_ALG_CRC32] = { "crc32", ~0, 4 }, + [CFS_HASH_ALG_CRC32C] = { "crc32c", ~0, 4 }, + [CFS_HASH_ALG_MD5] = { "md5", 0, 16 }, + [CFS_HASH_ALG_SHA1] = { "sha1", 0, 20 }, + [CFS_HASH_ALG_SHA256] = { "sha256", 0, 32 }, + [CFS_HASH_ALG_SHA384] = { "sha384", 0, 48 }, + [CFS_HASH_ALG_SHA512] = { "sha512", 0, 64 }, +}; + +/** Return pointer to type of hash for valid hash algorithm identifier */ +static inline const struct cfs_crypto_hash_type * + cfs_crypto_hash_type(unsigned char hash_alg) +{ + struct cfs_crypto_hash_type *ht; + + if (hash_alg < CFS_HASH_ALG_MAX) { + ht = &hash_types[hash_alg]; + if (ht->cht_name) + return ht; + } + return NULL; +} + +/** Return hash name for valid hash algorithm identifier or "unknown" */ +static inline const char *cfs_crypto_hash_name(unsigned char hash_alg) +{ + const struct cfs_crypto_hash_type *ht; + + ht = cfs_crypto_hash_type(hash_alg); + if (ht) + return ht->cht_name; + return "unknown"; +} + +/** Return digest size for valid algorithm identifier or 0 */ +static inline int cfs_crypto_hash_digestsize(unsigned char hash_alg) +{ + const struct cfs_crypto_hash_type *ht; + + ht = cfs_crypto_hash_type(hash_alg); + if (ht) + return ht->cht_size; + return 0; +} + +/** Return hash identifier for valid hash algorithm name or 0xFF */ +static inline unsigned char cfs_crypto_hash_alg(const char *algname) +{ + unsigned char i; + + for (i = 0; i < CFS_HASH_ALG_MAX; i++) + if (!strcmp(hash_types[i].cht_name, algname)) + break; + return (i == CFS_HASH_ALG_MAX ? 0xFF : i); +} + +/** Calculate hash digest for buffer. + * @param alg id of hash algorithm + * @param buf buffer of data + * @param buf_len buffer len + * @param key initial value for algorithm, if it is NULL, + * default initial value should be used. + * @param key_len len of initial value + * @param hash [out] pointer to hash, if it is NULL, hash_len is + * set to valid digest size in bytes, retval -ENOSPC. + * @param hash_len [in,out] size of hash buffer + * @returns status of operation + * @retval -EINVAL if buf, buf_len, hash_len or alg_id is invalid + * @retval -ENODEV if this algorithm is unsupported + * @retval -ENOSPC if pointer to hash is NULL, or hash_len less than + * digest size + * @retval 0 for success + * @retval < 0 other errors from lower layers. + */ +int cfs_crypto_hash_digest(unsigned char alg, + const void *buf, unsigned int buf_len, + unsigned char *key, unsigned int key_len, + unsigned char *hash, unsigned int *hash_len); + +/* cfs crypto hash descriptor */ +struct cfs_crypto_hash_desc; + +/** Allocate and initialize descriptor for hash algorithm. + * @param alg algorithm id + * @param key initial value for algorithm, if it is NULL, + * default initial value should be used. 
+ * @param key_len len of initial value + * @returns pointer to descriptor of hash instance + * @retval ERR_PTR(error) when errors occurred. + */ +struct cfs_crypto_hash_desc* + cfs_crypto_hash_init(unsigned char alg, + unsigned char *key, unsigned int key_len); + +/** Update digest by part of data. + * @param desc hash descriptor + * @param page data page + * @param offset data offset + * @param len data len + * @returns status of operation + * @retval 0 for success. + */ +int cfs_crypto_hash_update_page(struct cfs_crypto_hash_desc *desc, + struct page *page, unsigned int offset, + unsigned int len); + +/** Update digest by part of data. + * @param desc hash descriptor + * @param buf pointer to data buffer + * @param buf_len size of data at buffer + * @returns status of operation + * @retval 0 for success. + */ +int cfs_crypto_hash_update(struct cfs_crypto_hash_desc *desc, const void *buf, + unsigned int buf_len); + +/** Finalize hash calculation, copy hash digest to buffer, destroy hash + * descriptor. + * @param desc hash descriptor + * @param hash buffer pointer to store hash digest + * @param hash_len pointer to hash buffer size, if NULL + * destroy hash descriptor + * @returns status of operation + * @retval -ENOSPC if hash is NULL, or *hash_len less than + * digest size + * @retval 0 for success + * @retval < 0 other errors from lower layers. + */ +int cfs_crypto_hash_final(struct cfs_crypto_hash_desc *desc, + unsigned char *hash, unsigned int *hash_len); +/** + * Register crypto hash algorithms + */ +int cfs_crypto_register(void); + +/** + * Unregister + */ +void cfs_crypto_unregister(void); + +/** Return hash speed in Mbytes per second for valid hash algorithm + * identifier. If test was unsuccessful -1 would be returned. + */ +int cfs_crypto_hash_speed(unsigned char hash_alg); +#endif diff --git a/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_debug.h b/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_debug.h new file mode 100644 index 000000000..8251ac932 --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_debug.h @@ -0,0 +1,262 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * libcfs/include/libcfs/libcfs_debug.h + * + * Debug messages and assertions + * + */ + +#ifndef __LIBCFS_DEBUG_H__ +#define __LIBCFS_DEBUG_H__ + +/* + * Debugging + */ +extern unsigned int libcfs_subsystem_debug; +extern unsigned int libcfs_stack; +extern unsigned int libcfs_debug; +extern unsigned int libcfs_printk; +extern unsigned int libcfs_console_ratelimit; +extern unsigned int libcfs_watchdog_ratelimit; +extern unsigned int libcfs_console_max_delay; +extern unsigned int libcfs_console_min_delay; +extern unsigned int libcfs_console_backoff; +extern unsigned int libcfs_debug_binary; +extern char libcfs_debug_file_path_arr[PATH_MAX]; + +int libcfs_debug_mask2str(char *str, int size, int mask, int is_subsys); +int libcfs_debug_str2mask(int *mask, const char *str, int is_subsys); + +/* Has there been an LBUG? */ +extern unsigned int libcfs_catastrophe; +extern unsigned int libcfs_panic_on_lbug; + +/** + * Format for debug message headers + */ +struct ptldebug_header { + __u32 ph_len; + __u32 ph_flags; + __u32 ph_subsys; + __u32 ph_mask; + __u16 ph_cpu_id; + __u16 ph_type; + __u32 ph_sec; + __u64 ph_usec; + __u32 ph_stack; + __u32 ph_pid; + __u32 ph_extern_pid; + __u32 ph_line_num; +} __packed; + +#define PH_FLAG_FIRST_RECORD 1 + +/* Debugging subsystems (32 bits, non-overlapping) */ +/* keep these in sync with lnet/utils/debug.c and lnet/libcfs/debug.c */ +#define S_UNDEFINED 0x00000001 +#define S_MDC 0x00000002 +#define S_MDS 0x00000004 +#define S_OSC 0x00000008 +#define S_OST 0x00000010 +#define S_CLASS 0x00000020 +#define S_LOG 0x00000040 +#define S_LLITE 0x00000080 +#define S_RPC 0x00000100 +#define S_MGMT 0x00000200 +#define S_LNET 0x00000400 +#define S_LND 0x00000800 /* ALL LNDs */ +#define S_PINGER 0x00001000 +#define S_FILTER 0x00002000 +/* unused */ +#define S_ECHO 0x00008000 +#define S_LDLM 0x00010000 +#define S_LOV 0x00020000 +#define S_LQUOTA 0x00040000 +#define S_OSD 0x00080000 +/* unused */ +/* unused */ +/* unused */ +#define S_LMV 0x00800000 /* b_new_cmd */ +/* unused */ +#define S_SEC 0x02000000 /* upcall cache */ +#define S_GSS 0x04000000 /* b_new_cmd */ +/* unused */ +#define S_MGC 0x10000000 +#define S_MGS 0x20000000 +#define S_FID 0x40000000 /* b_new_cmd */ +#define S_FLD 0x80000000 /* b_new_cmd */ +/* keep these in sync with lnet/utils/debug.c and lnet/libcfs/debug.c */ + +/* Debugging masks (32 bits, non-overlapping) */ +/* keep these in sync with lnet/utils/debug.c and lnet/libcfs/debug.c */ +#define D_TRACE 0x00000001 /* ENTRY/EXIT markers */ +#define D_INODE 0x00000002 +#define D_SUPER 0x00000004 +#define D_EXT2 0x00000008 /* anything from ext2_debug */ +#define D_MALLOC 0x00000010 /* print malloc, free information */ +#define D_CACHE 0x00000020 /* cache-related items */ +#define D_INFO 0x00000040 /* general information */ +#define D_IOCTL 0x00000080 /* ioctl related information */ +#define D_NETERROR 0x00000100 /* network errors */ +#define D_NET 0x00000200 /* network communications */ +#define D_WARNING 0x00000400 /* CWARN(...) == CDEBUG (D_WARNING, ...) */ +#define D_BUFFS 0x00000800 +#define D_OTHER 0x00001000 +#define D_DENTRY 0x00002000 +#define D_NETTRACE 0x00004000 +#define D_PAGE 0x00008000 /* bulk page handling */ +#define D_DLMTRACE 0x00010000 +#define D_ERROR 0x00020000 /* CERROR(...) == CDEBUG (D_ERROR, ...) */ +#define D_EMERG 0x00040000 /* CEMERG(...) == CDEBUG (D_EMERG, ...) 
*/ +#define D_HA 0x00080000 /* recovery and failover */ +#define D_RPCTRACE 0x00100000 /* for distributed debugging */ +#define D_VFSTRACE 0x00200000 +#define D_READA 0x00400000 /* read-ahead */ +#define D_MMAP 0x00800000 +#define D_CONFIG 0x01000000 +#define D_CONSOLE 0x02000000 +#define D_QUOTA 0x04000000 +#define D_SEC 0x08000000 +#define D_LFSCK 0x10000000 /* For both OI scrub and LFSCK */ +/* keep these in sync with lnet/{utils,libcfs}/debug.c */ + +#define D_HSM D_TRACE + +#define D_CANTMASK (D_ERROR | D_EMERG | D_WARNING | D_CONSOLE) + +#ifndef DEBUG_SUBSYSTEM +# define DEBUG_SUBSYSTEM S_UNDEFINED +#endif + +#define CDEBUG_DEFAULT_MAX_DELAY (cfs_time_seconds(600)) /* jiffies */ +#define CDEBUG_DEFAULT_MIN_DELAY ((cfs_time_seconds(1) + 1) / 2) /* jiffies */ +#define CDEBUG_DEFAULT_BACKOFF 2 +struct cfs_debug_limit_state { + unsigned long cdls_next; + unsigned int cdls_delay; + int cdls_count; +}; + +struct libcfs_debug_msg_data { + const char *msg_file; + const char *msg_fn; + int msg_subsys; + int msg_line; + int msg_mask; + struct cfs_debug_limit_state *msg_cdls; +}; + +#define LIBCFS_DEBUG_MSG_DATA_INIT(data, mask, cdls) \ +do { \ + (data)->msg_subsys = DEBUG_SUBSYSTEM; \ + (data)->msg_file = __FILE__; \ + (data)->msg_fn = __func__; \ + (data)->msg_line = __LINE__; \ + (data)->msg_cdls = (cdls); \ + (data)->msg_mask = (mask); \ +} while (0) + +#define LIBCFS_DEBUG_MSG_DATA_DECL(dataname, mask, cdls) \ + static struct libcfs_debug_msg_data dataname = { \ + .msg_subsys = DEBUG_SUBSYSTEM, \ + .msg_file = __FILE__, \ + .msg_fn = __func__, \ + .msg_line = __LINE__, \ + .msg_cdls = (cdls) }; \ + dataname.msg_mask = (mask) + +/** + * Filters out logging messages based on mask and subsystem. + */ +static inline int cfs_cdebug_show(unsigned int mask, unsigned int subsystem) +{ + return mask & D_CANTMASK || + ((libcfs_debug & mask) && (libcfs_subsystem_debug & subsystem)); +} + +#define __CDEBUG(cdls, mask, format, ...) \ +do { \ + static struct libcfs_debug_msg_data msgdata; \ + \ + CFS_CHECK_STACK(&msgdata, mask, cdls); \ + \ + if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \ + LIBCFS_DEBUG_MSG_DATA_INIT(&msgdata, mask, cdls); \ + libcfs_debug_msg(&msgdata, format, ## __VA_ARGS__); \ + } \ +} while (0) + +#define CDEBUG(mask, format, ...) __CDEBUG(NULL, mask, format, ## __VA_ARGS__) + +#define CDEBUG_LIMIT(mask, format, ...) \ +do { \ + static struct cfs_debug_limit_state cdls; \ + \ + __CDEBUG(&cdls, mask, format, ## __VA_ARGS__); \ +} while (0) + +#define CWARN(format, ...) CDEBUG_LIMIT(D_WARNING, format, ## __VA_ARGS__) +#define CERROR(format, ...) CDEBUG_LIMIT(D_ERROR, format, ## __VA_ARGS__) +#define CNETERR(format, a...) CDEBUG_LIMIT(D_NETERROR, format, ## a) +#define CEMERG(format, ...) CDEBUG_LIMIT(D_EMERG, format, ## __VA_ARGS__) + +#define LCONSOLE(mask, format, ...) CDEBUG(D_CONSOLE | (mask), format, ## __VA_ARGS__) +#define LCONSOLE_INFO(format, ...) CDEBUG_LIMIT(D_CONSOLE, format, ## __VA_ARGS__) +#define LCONSOLE_WARN(format, ...) CDEBUG_LIMIT(D_CONSOLE | D_WARNING, format, ## __VA_ARGS__) +#define LCONSOLE_ERROR_MSG(errnum, format, ...) CDEBUG_LIMIT(D_CONSOLE | D_ERROR, \ + "%x-%x: " format, errnum, LERRCHKSUM(errnum), ## __VA_ARGS__) +#define LCONSOLE_ERROR(format, ...) LCONSOLE_ERROR_MSG(0x00, format, ## __VA_ARGS__) + +#define LCONSOLE_EMERG(format, ...) CDEBUG(D_CONSOLE | D_EMERG, format, ## __VA_ARGS__) + +int libcfs_debug_msg(struct libcfs_debug_msg_data *msgdata, + const char *format1, ...) 
+ __printf(2, 3); + +int libcfs_debug_vmsg2(struct libcfs_debug_msg_data *msgdata, + const char *format1, + va_list args, const char *format2, ...) + __printf(4, 5); + +/* other external symbols that tracefile provides: */ +int cfs_trace_copyin_string(char *knl_buffer, int knl_buffer_nob, + const char __user *usr_buffer, int usr_buffer_nob); +int cfs_trace_copyout_string(char __user *usr_buffer, int usr_buffer_nob, + const char *knl_buffer, char *append); + +#define LIBCFS_DEBUG_FILE_PATH_DEFAULT "/tmp/lustre-log" + +#endif /* __LIBCFS_DEBUG_H__ */ diff --git a/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_fail.h b/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_fail.h new file mode 100644 index 000000000..eea55d94e --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_fail.h @@ -0,0 +1,171 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please contact Oracle Corporation, Inc., 500 Oracle Parkway, Redwood Shores, + * CA 94065 USA or visit www.oracle.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Oracle Corporation, Inc. 
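+ *
+ * The fail-injection hooks declared below are normally driven through the
+ * CFS_FAIL_* wrappers; a minimal sketch (illustrative only, 0x123/0x124 are
+ * made-up fail_loc values):
+ *
+ *	if (CFS_FAIL_CHECK(0x123))	-- hit when cfs_fail_loc matches 0x123
+ *		return -EIO;
+ *
+ *	CFS_FAIL_TIMEOUT(0x124, 5);	-- sleep 5 seconds when 0x124 is armed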
+ */ + +#ifndef _LIBCFS_FAIL_H +#define _LIBCFS_FAIL_H + +extern unsigned long cfs_fail_loc; +extern unsigned int cfs_fail_val; + +extern wait_queue_head_t cfs_race_waitq; +extern int cfs_race_state; + +int __cfs_fail_check_set(__u32 id, __u32 value, int set); +int __cfs_fail_timeout_set(__u32 id, __u32 value, int ms, int set); + +enum { + CFS_FAIL_LOC_NOSET = 0, + CFS_FAIL_LOC_ORSET = 1, + CFS_FAIL_LOC_RESET = 2, + CFS_FAIL_LOC_VALUE = 3 +}; + +/* Failure injection control */ +#define CFS_FAIL_MASK_SYS 0x0000FF00 +#define CFS_FAIL_MASK_LOC (0x000000FF | CFS_FAIL_MASK_SYS) + +#define CFS_FAILED_BIT 30 +/* CFS_FAILED is 0x40000000 */ +#define CFS_FAILED (1 << CFS_FAILED_BIT) + +#define CFS_FAIL_ONCE_BIT 31 +/* CFS_FAIL_ONCE is 0x80000000 */ +#define CFS_FAIL_ONCE (1 << CFS_FAIL_ONCE_BIT) + +/* The following flags aren't made to be combined */ +#define CFS_FAIL_SKIP 0x20000000 /* skip N times then fail */ +#define CFS_FAIL_SOME 0x10000000 /* only fail N times */ +#define CFS_FAIL_RAND 0x08000000 /* fail 1/N of the times */ +#define CFS_FAIL_USR1 0x04000000 /* user flag */ + +#define CFS_FAIL_PRECHECK(id) (cfs_fail_loc && \ + (cfs_fail_loc & CFS_FAIL_MASK_LOC) == \ + ((id) & CFS_FAIL_MASK_LOC)) + +static inline int cfs_fail_check_set(__u32 id, __u32 value, + int set, int quiet) +{ + int ret = 0; + + if (unlikely(CFS_FAIL_PRECHECK(id) && + (ret = __cfs_fail_check_set(id, value, set)))) { + if (quiet) { + CDEBUG(D_INFO, "*** cfs_fail_loc=%x, val=%u***\n", + id, value); + } else { + LCONSOLE_INFO("*** cfs_fail_loc=%x, val=%u***\n", + id, value); + } + } + + return ret; +} + +/* If id hit cfs_fail_loc, return 1, otherwise return 0 */ +#define CFS_FAIL_CHECK(id) \ + cfs_fail_check_set(id, 0, CFS_FAIL_LOC_NOSET, 0) +#define CFS_FAIL_CHECK_QUIET(id) \ + cfs_fail_check_set(id, 0, CFS_FAIL_LOC_NOSET, 1) + +/* If id hit cfs_fail_loc and cfs_fail_val == (-1 or value) return 1, + * otherwise return 0 */ +#define CFS_FAIL_CHECK_VALUE(id, value) \ + cfs_fail_check_set(id, value, CFS_FAIL_LOC_VALUE, 0) +#define CFS_FAIL_CHECK_VALUE_QUIET(id, value) \ + cfs_fail_check_set(id, value, CFS_FAIL_LOC_VALUE, 1) + +/* If id hit cfs_fail_loc, cfs_fail_loc |= value and return 1, + * otherwise return 0 */ +#define CFS_FAIL_CHECK_ORSET(id, value) \ + cfs_fail_check_set(id, value, CFS_FAIL_LOC_ORSET, 0) +#define CFS_FAIL_CHECK_ORSET_QUIET(id, value) \ + cfs_fail_check_set(id, value, CFS_FAIL_LOC_ORSET, 1) + +/* If id hit cfs_fail_loc, cfs_fail_loc = value and return 1, + * otherwise return 0 */ +#define CFS_FAIL_CHECK_RESET(id, value) \ + cfs_fail_check_set(id, value, CFS_FAIL_LOC_RESET, 0) +#define CFS_FAIL_CHECK_RESET_QUIET(id, value) \ + cfs_fail_check_set(id, value, CFS_FAIL_LOC_RESET, 1) + +static inline int cfs_fail_timeout_set(__u32 id, __u32 value, int ms, int set) +{ + if (unlikely(CFS_FAIL_PRECHECK(id))) + return __cfs_fail_timeout_set(id, value, ms, set); + return 0; +} + +/* If id hit cfs_fail_loc, sleep for seconds or milliseconds */ +#define CFS_FAIL_TIMEOUT(id, secs) \ + cfs_fail_timeout_set(id, 0, secs * 1000, CFS_FAIL_LOC_NOSET) + +#define CFS_FAIL_TIMEOUT_MS(id, ms) \ + cfs_fail_timeout_set(id, 0, ms, CFS_FAIL_LOC_NOSET) + +/* If id hit cfs_fail_loc, cfs_fail_loc |= value and + * sleep seconds or milliseconds */ +#define CFS_FAIL_TIMEOUT_ORSET(id, value, secs) \ + cfs_fail_timeout_set(id, value, secs * 1000, CFS_FAIL_LOC_ORSET) + +#define CFS_FAIL_TIMEOUT_MS_ORSET(id, value, ms) \ + cfs_fail_timeout_set(id, value, ms, CFS_FAIL_LOC_ORSET) + +/* The idea here is to synchronise two threads to force a 
race. The + * first thread that calls this with a matching fail_loc is put to + * sleep. The next thread that calls with the same fail_loc wakes up + * the first and continues. */ +static inline void cfs_race(__u32 id) +{ + + if (CFS_FAIL_PRECHECK(id)) { + if (unlikely(__cfs_fail_check_set(id, 0, CFS_FAIL_LOC_NOSET))) { + int rc; + + cfs_race_state = 0; + CERROR("cfs_race id %x sleeping\n", id); + rc = wait_event_interruptible(cfs_race_waitq, + cfs_race_state != 0); + CERROR("cfs_fail_race id %x awake, rc=%d\n", id, rc); + } else { + CERROR("cfs_fail_race id %x waking\n", id); + cfs_race_state = 1; + wake_up(&cfs_race_waitq); + } + } +} + +#define CFS_RACE(id) cfs_race(id) + +#endif /* _LIBCFS_FAIL_H */ diff --git a/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_hash.h b/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_hash.h new file mode 100644 index 000000000..c40814591 --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_hash.h @@ -0,0 +1,843 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/libcfs_hash.h + * + * Hashing routines + * + */ + +#ifndef __LIBCFS_HASH_H__ +#define __LIBCFS_HASH_H__ +/* + * Knuth recommends primes in approximately golden ratio to the maximum + * integer representable by a machine word for multiplicative hashing. + * Chuck Lever verified the effectiveness of this technique: + * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf + * + * These primes are chosen to be bit-sparse, that is operations on + * them can use shifts and additions instead of multiplications for + * machines where multiplications are slow. + */ +/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */ +#define CFS_GOLDEN_RATIO_PRIME_32 0x9e370001UL +/* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */ +#define CFS_GOLDEN_RATIO_PRIME_64 0x9e37fffffffc0001ULL + +/* + * Ideally we would use HAVE_HASH_LONG for this, but on linux we configure + * the linux kernel and user space at the same time, so we need to differentiate + * between them explicitly. If this is not needed on other architectures, then + * we'll need to move the functions to architecture specific headers. 
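+ *
+ * Either way the multiplicative scheme is the same; e.g. (illustrative) a
+ * 32-bit key is spread over 2^bits hash lists as
+ *
+ *	(key * CFS_GOLDEN_RATIO_PRIME_32) & ((1 << bits) - 1)
+ *
+ * which is what cfs_hash_u32_hash() below computes when called with
+ * mask == (1 << bits) - 1.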
+ */ + +#include + +/** disable debug */ +#define CFS_HASH_DEBUG_NONE 0 +/** record hash depth and output to console when it's too deep, + * computing overhead is low but consume more memory */ +#define CFS_HASH_DEBUG_1 1 +/** expensive, check key validation */ +#define CFS_HASH_DEBUG_2 2 + +#define CFS_HASH_DEBUG_LEVEL CFS_HASH_DEBUG_NONE + +struct cfs_hash_ops; +struct cfs_hash_lock_ops; +struct cfs_hash_hlist_ops; + +union cfs_hash_lock { + rwlock_t rw; /**< rwlock */ + spinlock_t spin; /**< spinlock */ +}; + +/** + * cfs_hash_bucket is a container of: + * - lock, counter ... + * - array of hash-head starting from hsb_head[0], hash-head can be one of + * . cfs_hash_head_t + * . cfs_hash_head_dep_t + * . cfs_hash_dhead_t + * . cfs_hash_dhead_dep_t + * which depends on requirement of user + * - some extra bytes (caller can require it while creating hash) + */ +struct cfs_hash_bucket { + union cfs_hash_lock hsb_lock; /**< bucket lock */ + __u32 hsb_count; /**< current entries */ + __u32 hsb_version; /**< change version */ + unsigned int hsb_index; /**< index of bucket */ + int hsb_depmax; /**< max depth on bucket */ + long hsb_head[0]; /**< hash-head array */ +}; + +/** + * cfs_hash bucket descriptor, it's normally in stack of caller + */ +struct cfs_hash_bd { + struct cfs_hash_bucket *bd_bucket; /**< address of bucket */ + unsigned int bd_offset; /**< offset in bucket */ +}; + +#define CFS_HASH_NAME_LEN 16 /**< default name length */ +#define CFS_HASH_BIGNAME_LEN 64 /**< bigname for param tree */ + +#define CFS_HASH_BKT_BITS 3 /**< default bits of bucket */ +#define CFS_HASH_BITS_MAX 30 /**< max bits of bucket */ +#define CFS_HASH_BITS_MIN CFS_HASH_BKT_BITS + +/** + * common hash attributes. + */ +enum cfs_hash_tag { + /** + * don't need any lock, caller will protect operations with it's + * own lock. With this flag: + * . CFS_HASH_NO_BKTLOCK, CFS_HASH_RW_BKTLOCK, CFS_HASH_SPIN_BKTLOCK + * will be ignored. + * . Some functions will be disabled with this flag, i.e: + * cfs_hash_for_each_empty, cfs_hash_rehash + */ + CFS_HASH_NO_LOCK = 1 << 0, + /** no bucket lock, use one spinlock to protect the whole hash */ + CFS_HASH_NO_BKTLOCK = 1 << 1, + /** rwlock to protect bucket */ + CFS_HASH_RW_BKTLOCK = 1 << 2, + /** spinlock to protect bucket */ + CFS_HASH_SPIN_BKTLOCK = 1 << 3, + /** always add new item to tail */ + CFS_HASH_ADD_TAIL = 1 << 4, + /** hash-table doesn't have refcount on item */ + CFS_HASH_NO_ITEMREF = 1 << 5, + /** big name for param-tree */ + CFS_HASH_BIGNAME = 1 << 6, + /** track global count */ + CFS_HASH_COUNTER = 1 << 7, + /** rehash item by new key */ + CFS_HASH_REHASH_KEY = 1 << 8, + /** Enable dynamic hash resizing */ + CFS_HASH_REHASH = 1 << 9, + /** can shrink hash-size */ + CFS_HASH_SHRINK = 1 << 10, + /** assert hash is empty on exit */ + CFS_HASH_ASSERT_EMPTY = 1 << 11, + /** record hlist depth */ + CFS_HASH_DEPTH = 1 << 12, + /** + * rehash is always scheduled in a different thread, so current + * change on hash table is non-blocking + */ + CFS_HASH_NBLK_CHANGE = 1 << 13, + /** NB, we typed hs_flags as __u16, please change it + * if you need to extend >=16 flags */ +}; + +/** most used attributes */ +#define CFS_HASH_DEFAULT (CFS_HASH_RW_BKTLOCK | \ + CFS_HASH_COUNTER | CFS_HASH_REHASH) + +/** + * cfs_hash is a hash-table implementation for general purpose, it can support: + * . two refcount modes + * hash-table with & without refcount + * . four lock modes + * nolock, one-spinlock, rw-bucket-lock, spin-bucket-lock + * . 
general operations + * lookup, add(add_tail or add_head), delete + * . rehash + * grows or shrink + * . iteration + * locked iteration and unlocked iteration + * . bigname + * support long name hash + * . debug + * trace max searching depth + * + * Rehash: + * When the htable grows or shrinks, a separate task (cfs_hash_rehash_worker) + * is spawned to handle the rehash in the background, it's possible that other + * processes can concurrently perform additions, deletions, and lookups + * without being blocked on rehash completion, because rehash will release + * the global wrlock for each bucket. + * + * rehash and iteration can't run at the same time because it's too tricky + * to keep both of them safe and correct. + * As they are relatively rare operations, so: + * . if iteration is in progress while we try to launch rehash, then + * it just giveup, iterator will launch rehash at the end. + * . if rehash is in progress while we try to iterate the hash table, + * then we just wait (shouldn't be very long time), anyway, nobody + * should expect iteration of whole hash-table to be non-blocking. + * + * During rehashing, a (key,object) pair may be in one of two buckets, + * depending on whether the worker task has yet to transfer the object + * to its new location in the table. Lookups and deletions need to search both + * locations; additions must take care to only insert into the new bucket. + */ + +struct cfs_hash { + /** serialize with rehash, or serialize all operations if + * the hash-table has CFS_HASH_NO_BKTLOCK */ + union cfs_hash_lock hs_lock; + /** hash operations */ + struct cfs_hash_ops *hs_ops; + /** hash lock operations */ + struct cfs_hash_lock_ops *hs_lops; + /** hash list operations */ + struct cfs_hash_hlist_ops *hs_hops; + /** hash buckets-table */ + struct cfs_hash_bucket **hs_buckets; + /** total number of items on this hash-table */ + atomic_t hs_count; + /** hash flags, see cfs_hash_tag for detail */ + __u16 hs_flags; + /** # of extra-bytes for bucket, for user saving extended attributes */ + __u16 hs_extra_bytes; + /** wants to iterate */ + __u8 hs_iterating; + /** hash-table is dying */ + __u8 hs_exiting; + /** current hash bits */ + __u8 hs_cur_bits; + /** min hash bits */ + __u8 hs_min_bits; + /** max hash bits */ + __u8 hs_max_bits; + /** bits for rehash */ + __u8 hs_rehash_bits; + /** bits for each bucket */ + __u8 hs_bkt_bits; + /** resize min threshold */ + __u16 hs_min_theta; + /** resize max threshold */ + __u16 hs_max_theta; + /** resize count */ + __u32 hs_rehash_count; + /** # of iterators (caller of cfs_hash_for_each_*) */ + __u32 hs_iterators; + /** rehash workitem */ + cfs_workitem_t hs_rehash_wi; + /** refcount on this hash table */ + atomic_t hs_refcount; + /** rehash buckets-table */ + struct cfs_hash_bucket **hs_rehash_buckets; +#if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1 + /** serialize debug members */ + spinlock_t hs_dep_lock; + /** max depth */ + unsigned int hs_dep_max; + /** id of the deepest bucket */ + unsigned int hs_dep_bkt; + /** offset in the deepest bucket */ + unsigned int hs_dep_off; + /** bits when we found the max depth */ + unsigned int hs_dep_bits; + /** workitem to output max depth */ + cfs_workitem_t hs_dep_wi; +#endif + /** name of htable */ + char hs_name[0]; +}; + +typedef struct cfs_hash_lock_ops { + /** lock the hash table */ + void (*hs_lock)(union cfs_hash_lock *lock, int exclusive); + /** unlock the hash table */ + void (*hs_unlock)(union cfs_hash_lock *lock, int exclusive); + /** lock the hash bucket */ + void 
(*hs_bkt_lock)(union cfs_hash_lock *lock, int exclusive); + /** unlock the hash bucket */ + void (*hs_bkt_unlock)(union cfs_hash_lock *lock, int exclusive); +} cfs_hash_lock_ops_t; + +typedef struct cfs_hash_hlist_ops { + /** return hlist_head of hash-head of @bd */ + struct hlist_head *(*hop_hhead)(struct cfs_hash *hs, struct cfs_hash_bd *bd); + /** return hash-head size */ + int (*hop_hhead_size)(struct cfs_hash *hs); + /** add @hnode to hash-head of @bd */ + int (*hop_hnode_add)(struct cfs_hash *hs, + struct cfs_hash_bd *bd, struct hlist_node *hnode); + /** remove @hnode from hash-head of @bd */ + int (*hop_hnode_del)(struct cfs_hash *hs, + struct cfs_hash_bd *bd, struct hlist_node *hnode); +} cfs_hash_hlist_ops_t; + +typedef struct cfs_hash_ops { + /** return hashed value from @key */ + unsigned (*hs_hash)(struct cfs_hash *hs, const void *key, unsigned mask); + /** return key address of @hnode */ + void * (*hs_key)(struct hlist_node *hnode); + /** copy key from @hnode to @key */ + void (*hs_keycpy)(struct hlist_node *hnode, void *key); + /** + * compare @key with key of @hnode + * returns 1 on a match + */ + int (*hs_keycmp)(const void *key, struct hlist_node *hnode); + /** return object address of @hnode, i.e: container_of(...hnode) */ + void * (*hs_object)(struct hlist_node *hnode); + /** get refcount of item, always called with holding bucket-lock */ + void (*hs_get)(struct cfs_hash *hs, struct hlist_node *hnode); + /** release refcount of item */ + void (*hs_put)(struct cfs_hash *hs, struct hlist_node *hnode); + /** release refcount of item, always called with holding bucket-lock */ + void (*hs_put_locked)(struct cfs_hash *hs, struct hlist_node *hnode); + /** it's called before removing of @hnode */ + void (*hs_exit)(struct cfs_hash *hs, struct hlist_node *hnode); +} cfs_hash_ops_t; + +/** total number of buckets in @hs */ +#define CFS_HASH_NBKT(hs) \ + (1U << ((hs)->hs_cur_bits - (hs)->hs_bkt_bits)) + +/** total number of buckets in @hs while rehashing */ +#define CFS_HASH_RH_NBKT(hs) \ + (1U << ((hs)->hs_rehash_bits - (hs)->hs_bkt_bits)) + +/** number of hlist for in bucket */ +#define CFS_HASH_BKT_NHLIST(hs) (1U << (hs)->hs_bkt_bits) + +/** total number of hlist in @hs */ +#define CFS_HASH_NHLIST(hs) (1U << (hs)->hs_cur_bits) + +/** total number of hlist in @hs while rehashing */ +#define CFS_HASH_RH_NHLIST(hs) (1U << (hs)->hs_rehash_bits) + +static inline int +cfs_hash_with_no_lock(struct cfs_hash *hs) +{ + /* caller will serialize all operations for this hash-table */ + return (hs->hs_flags & CFS_HASH_NO_LOCK) != 0; +} + +static inline int +cfs_hash_with_no_bktlock(struct cfs_hash *hs) +{ + /* no bucket lock, one single lock to protect the hash-table */ + return (hs->hs_flags & CFS_HASH_NO_BKTLOCK) != 0; +} + +static inline int +cfs_hash_with_rw_bktlock(struct cfs_hash *hs) +{ + /* rwlock to protect hash bucket */ + return (hs->hs_flags & CFS_HASH_RW_BKTLOCK) != 0; +} + +static inline int +cfs_hash_with_spin_bktlock(struct cfs_hash *hs) +{ + /* spinlock to protect hash bucket */ + return (hs->hs_flags & CFS_HASH_SPIN_BKTLOCK) != 0; +} + +static inline int +cfs_hash_with_add_tail(struct cfs_hash *hs) +{ + return (hs->hs_flags & CFS_HASH_ADD_TAIL) != 0; +} + +static inline int +cfs_hash_with_no_itemref(struct cfs_hash *hs) +{ + /* hash-table doesn't keep refcount on item, + * item can't be removed from hash unless it's + * ZERO refcount */ + return (hs->hs_flags & CFS_HASH_NO_ITEMREF) != 0; +} + +static inline int +cfs_hash_with_bigname(struct cfs_hash *hs) +{ + return 
(hs->hs_flags & CFS_HASH_BIGNAME) != 0; +} + +static inline int +cfs_hash_with_counter(struct cfs_hash *hs) +{ + return (hs->hs_flags & CFS_HASH_COUNTER) != 0; +} + +static inline int +cfs_hash_with_rehash(struct cfs_hash *hs) +{ + return (hs->hs_flags & CFS_HASH_REHASH) != 0; +} + +static inline int +cfs_hash_with_rehash_key(struct cfs_hash *hs) +{ + return (hs->hs_flags & CFS_HASH_REHASH_KEY) != 0; +} + +static inline int +cfs_hash_with_shrink(struct cfs_hash *hs) +{ + return (hs->hs_flags & CFS_HASH_SHRINK) != 0; +} + +static inline int +cfs_hash_with_assert_empty(struct cfs_hash *hs) +{ + return (hs->hs_flags & CFS_HASH_ASSERT_EMPTY) != 0; +} + +static inline int +cfs_hash_with_depth(struct cfs_hash *hs) +{ + return (hs->hs_flags & CFS_HASH_DEPTH) != 0; +} + +static inline int +cfs_hash_with_nblk_change(struct cfs_hash *hs) +{ + return (hs->hs_flags & CFS_HASH_NBLK_CHANGE) != 0; +} + +static inline int +cfs_hash_is_exiting(struct cfs_hash *hs) +{ /* cfs_hash_destroy is called */ + return hs->hs_exiting; +} + +static inline int +cfs_hash_is_rehashing(struct cfs_hash *hs) +{ /* rehash is launched */ + return hs->hs_rehash_bits != 0; +} + +static inline int +cfs_hash_is_iterating(struct cfs_hash *hs) +{ /* someone is calling cfs_hash_for_each_* */ + return hs->hs_iterating || hs->hs_iterators != 0; +} + +static inline int +cfs_hash_bkt_size(struct cfs_hash *hs) +{ + return offsetof(struct cfs_hash_bucket, hsb_head[0]) + + hs->hs_hops->hop_hhead_size(hs) * CFS_HASH_BKT_NHLIST(hs) + + hs->hs_extra_bytes; +} + +static inline unsigned +cfs_hash_id(struct cfs_hash *hs, const void *key, unsigned mask) +{ + return hs->hs_ops->hs_hash(hs, key, mask); +} + +static inline void * +cfs_hash_key(struct cfs_hash *hs, struct hlist_node *hnode) +{ + return hs->hs_ops->hs_key(hnode); +} + +static inline void +cfs_hash_keycpy(struct cfs_hash *hs, struct hlist_node *hnode, void *key) +{ + if (hs->hs_ops->hs_keycpy) + hs->hs_ops->hs_keycpy(hnode, key); +} + +/** + * Returns 1 on a match, + */ +static inline int +cfs_hash_keycmp(struct cfs_hash *hs, const void *key, struct hlist_node *hnode) +{ + return hs->hs_ops->hs_keycmp(key, hnode); +} + +static inline void * +cfs_hash_object(struct cfs_hash *hs, struct hlist_node *hnode) +{ + return hs->hs_ops->hs_object(hnode); +} + +static inline void +cfs_hash_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + return hs->hs_ops->hs_get(hs, hnode); +} + +static inline void +cfs_hash_put_locked(struct cfs_hash *hs, struct hlist_node *hnode) +{ + return hs->hs_ops->hs_put_locked(hs, hnode); +} + +static inline void +cfs_hash_put(struct cfs_hash *hs, struct hlist_node *hnode) +{ + return hs->hs_ops->hs_put(hs, hnode); +} + +static inline void +cfs_hash_exit(struct cfs_hash *hs, struct hlist_node *hnode) +{ + if (hs->hs_ops->hs_exit) + hs->hs_ops->hs_exit(hs, hnode); +} + +static inline void cfs_hash_lock(struct cfs_hash *hs, int excl) +{ + hs->hs_lops->hs_lock(&hs->hs_lock, excl); +} + +static inline void cfs_hash_unlock(struct cfs_hash *hs, int excl) +{ + hs->hs_lops->hs_unlock(&hs->hs_lock, excl); +} + +static inline int cfs_hash_dec_and_lock(struct cfs_hash *hs, + atomic_t *condition) +{ + LASSERT(cfs_hash_with_no_bktlock(hs)); + return atomic_dec_and_lock(condition, &hs->hs_lock.spin); +} + +static inline void cfs_hash_bd_lock(struct cfs_hash *hs, + struct cfs_hash_bd *bd, int excl) +{ + hs->hs_lops->hs_bkt_lock(&bd->bd_bucket->hsb_lock, excl); +} + +static inline void cfs_hash_bd_unlock(struct cfs_hash *hs, + struct cfs_hash_bd *bd, int excl) +{ + 
hs->hs_lops->hs_bkt_unlock(&bd->bd_bucket->hsb_lock, excl); +} + +/** + * operations on cfs_hash bucket (bd: bucket descriptor), + * they are normally for hash-table without rehash + */ +void cfs_hash_bd_get(struct cfs_hash *hs, const void *key, struct cfs_hash_bd *bd); + +static inline void cfs_hash_bd_get_and_lock(struct cfs_hash *hs, const void *key, + struct cfs_hash_bd *bd, int excl) +{ + cfs_hash_bd_get(hs, key, bd); + cfs_hash_bd_lock(hs, bd, excl); +} + +static inline unsigned cfs_hash_bd_index_get(struct cfs_hash *hs, struct cfs_hash_bd *bd) +{ + return bd->bd_offset | (bd->bd_bucket->hsb_index << hs->hs_bkt_bits); +} + +static inline void cfs_hash_bd_index_set(struct cfs_hash *hs, + unsigned index, struct cfs_hash_bd *bd) +{ + bd->bd_bucket = hs->hs_buckets[index >> hs->hs_bkt_bits]; + bd->bd_offset = index & (CFS_HASH_BKT_NHLIST(hs) - 1U); +} + +static inline void * +cfs_hash_bd_extra_get(struct cfs_hash *hs, struct cfs_hash_bd *bd) +{ + return (void *)bd->bd_bucket + + cfs_hash_bkt_size(hs) - hs->hs_extra_bytes; +} + +static inline __u32 +cfs_hash_bd_version_get(struct cfs_hash_bd *bd) +{ + /* need hold cfs_hash_bd_lock */ + return bd->bd_bucket->hsb_version; +} + +static inline __u32 +cfs_hash_bd_count_get(struct cfs_hash_bd *bd) +{ + /* need hold cfs_hash_bd_lock */ + return bd->bd_bucket->hsb_count; +} + +static inline int +cfs_hash_bd_depmax_get(struct cfs_hash_bd *bd) +{ + return bd->bd_bucket->hsb_depmax; +} + +static inline int +cfs_hash_bd_compare(struct cfs_hash_bd *bd1, struct cfs_hash_bd *bd2) +{ + if (bd1->bd_bucket->hsb_index != bd2->bd_bucket->hsb_index) + return bd1->bd_bucket->hsb_index - bd2->bd_bucket->hsb_index; + + if (bd1->bd_offset != bd2->bd_offset) + return bd1->bd_offset - bd2->bd_offset; + + return 0; +} + +void cfs_hash_bd_add_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode); +void cfs_hash_bd_del_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode); +void cfs_hash_bd_move_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd_old, + struct cfs_hash_bd *bd_new, struct hlist_node *hnode); + +static inline int cfs_hash_bd_dec_and_lock(struct cfs_hash *hs, struct cfs_hash_bd *bd, + atomic_t *condition) +{ + LASSERT(cfs_hash_with_spin_bktlock(hs)); + return atomic_dec_and_lock(condition, + &bd->bd_bucket->hsb_lock.spin); +} + +static inline struct hlist_head *cfs_hash_bd_hhead(struct cfs_hash *hs, + struct cfs_hash_bd *bd) +{ + return hs->hs_hops->hop_hhead(hs, bd); +} + +struct hlist_node *cfs_hash_bd_lookup_locked(struct cfs_hash *hs, + struct cfs_hash_bd *bd, const void *key); +struct hlist_node *cfs_hash_bd_peek_locked(struct cfs_hash *hs, + struct cfs_hash_bd *bd, const void *key); +struct hlist_node *cfs_hash_bd_findadd_locked(struct cfs_hash *hs, + struct cfs_hash_bd *bd, const void *key, + struct hlist_node *hnode, + int insist_add); +struct hlist_node *cfs_hash_bd_finddel_locked(struct cfs_hash *hs, + struct cfs_hash_bd *bd, const void *key, + struct hlist_node *hnode); + +/** + * operations on cfs_hash bucket (bd: bucket descriptor), + * they are safe for hash-table with rehash + */ +void cfs_hash_dual_bd_get(struct cfs_hash *hs, const void *key, struct cfs_hash_bd *bds); +void cfs_hash_dual_bd_lock(struct cfs_hash *hs, struct cfs_hash_bd *bds, int excl); +void cfs_hash_dual_bd_unlock(struct cfs_hash *hs, struct cfs_hash_bd *bds, int excl); + +static inline void cfs_hash_dual_bd_get_and_lock(struct cfs_hash *hs, const void *key, + struct cfs_hash_bd *bds, int excl) +{ + 
cfs_hash_dual_bd_get(hs, key, bds); + cfs_hash_dual_bd_lock(hs, bds, excl); +} + +struct hlist_node *cfs_hash_dual_bd_lookup_locked(struct cfs_hash *hs, + struct cfs_hash_bd *bds, + const void *key); +struct hlist_node *cfs_hash_dual_bd_findadd_locked(struct cfs_hash *hs, + struct cfs_hash_bd *bds, + const void *key, + struct hlist_node *hnode, + int insist_add); +struct hlist_node *cfs_hash_dual_bd_finddel_locked(struct cfs_hash *hs, + struct cfs_hash_bd *bds, + const void *key, + struct hlist_node *hnode); + +/* Hash init/cleanup functions */ +struct cfs_hash *cfs_hash_create(char *name, unsigned cur_bits, unsigned max_bits, + unsigned bkt_bits, unsigned extra_bytes, + unsigned min_theta, unsigned max_theta, + cfs_hash_ops_t *ops, unsigned flags); + +struct cfs_hash *cfs_hash_getref(struct cfs_hash *hs); +void cfs_hash_putref(struct cfs_hash *hs); + +/* Hash addition functions */ +void cfs_hash_add(struct cfs_hash *hs, const void *key, + struct hlist_node *hnode); +int cfs_hash_add_unique(struct cfs_hash *hs, const void *key, + struct hlist_node *hnode); +void *cfs_hash_findadd_unique(struct cfs_hash *hs, const void *key, + struct hlist_node *hnode); + +/* Hash deletion functions */ +void *cfs_hash_del(struct cfs_hash *hs, const void *key, struct hlist_node *hnode); +void *cfs_hash_del_key(struct cfs_hash *hs, const void *key); + +/* Hash lookup/for_each functions */ +#define CFS_HASH_LOOP_HOG 1024 + +typedef int (*cfs_hash_for_each_cb_t)(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *node, void *data); +void *cfs_hash_lookup(struct cfs_hash *hs, const void *key); +void cfs_hash_for_each(struct cfs_hash *hs, cfs_hash_for_each_cb_t, void *data); +void cfs_hash_for_each_safe(struct cfs_hash *hs, cfs_hash_for_each_cb_t, void *data); +int cfs_hash_for_each_nolock(struct cfs_hash *hs, + cfs_hash_for_each_cb_t, void *data); +int cfs_hash_for_each_empty(struct cfs_hash *hs, + cfs_hash_for_each_cb_t, void *data); +void cfs_hash_for_each_key(struct cfs_hash *hs, const void *key, + cfs_hash_for_each_cb_t, void *data); +typedef int (*cfs_hash_cond_opt_cb_t)(void *obj, void *data); +void cfs_hash_cond_del(struct cfs_hash *hs, cfs_hash_cond_opt_cb_t, void *data); + +void cfs_hash_hlist_for_each(struct cfs_hash *hs, unsigned hindex, + cfs_hash_for_each_cb_t, void *data); +int cfs_hash_is_empty(struct cfs_hash *hs); +__u64 cfs_hash_size_get(struct cfs_hash *hs); + +/* + * Rehash - Theta is calculated to be the average chained + * hash depth assuming a perfectly uniform hash function. 
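+ *
+ * For example (illustrative): a table with hs_cur_bits == 10 (1024 hash
+ * lists) holding 4096 items has, with CFS_HASH_THETA_BITS == 10,
+ * theta == (4096 << 10) >> 10 == 4096, which __cfs_hash_theta_int() and
+ * __cfs_hash_theta_frac() below decode as an average depth of 4.000; for
+ * tables created with CFS_HASH_REHASH, a rehash is scheduled when theta
+ * drifts outside the [hs_min_theta, hs_max_theta] window.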
+ */ +void cfs_hash_rehash_cancel_locked(struct cfs_hash *hs); +void cfs_hash_rehash_cancel(struct cfs_hash *hs); +int cfs_hash_rehash(struct cfs_hash *hs, int do_rehash); +void cfs_hash_rehash_key(struct cfs_hash *hs, const void *old_key, + void *new_key, struct hlist_node *hnode); + +#if CFS_HASH_DEBUG_LEVEL > CFS_HASH_DEBUG_1 +/* Validate hnode references the correct key */ +static inline void +cfs_hash_key_validate(struct cfs_hash *hs, const void *key, + struct hlist_node *hnode) +{ + LASSERT(cfs_hash_keycmp(hs, key, hnode)); +} + +/* Validate hnode is in the correct bucket */ +static inline void +cfs_hash_bucket_validate(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + struct cfs_hash_bd bds[2]; + + cfs_hash_dual_bd_get(hs, cfs_hash_key(hs, hnode), bds); + LASSERT(bds[0].bd_bucket == bd->bd_bucket || + bds[1].bd_bucket == bd->bd_bucket); +} + +#else /* CFS_HASH_DEBUG_LEVEL > CFS_HASH_DEBUG_1 */ + +static inline void +cfs_hash_key_validate(struct cfs_hash *hs, const void *key, + struct hlist_node *hnode) {} + +static inline void +cfs_hash_bucket_validate(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) {} + +#endif /* CFS_HASH_DEBUG_LEVEL */ + +#define CFS_HASH_THETA_BITS 10 +#define CFS_HASH_MIN_THETA (1U << (CFS_HASH_THETA_BITS - 1)) +#define CFS_HASH_MAX_THETA (1U << (CFS_HASH_THETA_BITS + 1)) + +/* Return integer component of theta */ +static inline int __cfs_hash_theta_int(int theta) +{ + return (theta >> CFS_HASH_THETA_BITS); +} + +/* Return a fractional value between 0 and 999 */ +static inline int __cfs_hash_theta_frac(int theta) +{ + return ((theta * 1000) >> CFS_HASH_THETA_BITS) - + (__cfs_hash_theta_int(theta) * 1000); +} + +static inline int __cfs_hash_theta(struct cfs_hash *hs) +{ + return (atomic_read(&hs->hs_count) << + CFS_HASH_THETA_BITS) >> hs->hs_cur_bits; +} + +static inline void __cfs_hash_set_theta(struct cfs_hash *hs, int min, int max) +{ + LASSERT(min < max); + hs->hs_min_theta = (__u16)min; + hs->hs_max_theta = (__u16)max; +} + +/* Generic debug formatting routines mainly for proc handler */ +struct seq_file; +void cfs_hash_debug_header(struct seq_file *m); +void cfs_hash_debug_str(struct cfs_hash *hs, struct seq_file *m); + +/* + * Generic djb2 hash algorithm for character arrays. + */ +static inline unsigned +cfs_hash_djb2_hash(const void *key, size_t size, unsigned mask) +{ + unsigned i, hash = 5381; + + LASSERT(key != NULL); + + for (i = 0; i < size; i++) + hash = hash * 33 + ((char *)key)[i]; + + return (hash & mask); +} + +/* + * Generic u32 hash algorithm. + */ +static inline unsigned +cfs_hash_u32_hash(const __u32 key, unsigned mask) +{ + return ((key * CFS_GOLDEN_RATIO_PRIME_32) & mask); +} + +/* + * Generic u64 hash algorithm. 
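+ *
+ * A cfs_hash_ops_t::hs_hash callback will typically just forward to one of
+ * these helpers; a minimal sketch (the callback name and the __u64 key type
+ * are assumptions for illustration):
+ *
+ *	static unsigned
+ *	obj_hash(struct cfs_hash *hs, const void *key, unsigned mask)
+ *	{
+ *		return cfs_hash_u64_hash(*(const __u64 *)key, mask);
+ *	}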
+ */ +static inline unsigned +cfs_hash_u64_hash(const __u64 key, unsigned mask) +{ + return ((unsigned)(key * CFS_GOLDEN_RATIO_PRIME_64) & mask); +} + +/** iterate over all buckets in @bds (array of struct cfs_hash_bd) */ +#define cfs_hash_for_each_bd(bds, n, i) \ + for (i = 0; i < n && (bds)[i].bd_bucket != NULL; i++) + +/** iterate over all buckets of @hs */ +#define cfs_hash_for_each_bucket(hs, bd, pos) \ + for (pos = 0; \ + pos < CFS_HASH_NBKT(hs) && \ + ((bd)->bd_bucket = (hs)->hs_buckets[pos]) != NULL; pos++) + +/** iterate over all hlist of bucket @bd */ +#define cfs_hash_bd_for_each_hlist(hs, bd, hlist) \ + for ((bd)->bd_offset = 0; \ + (bd)->bd_offset < CFS_HASH_BKT_NHLIST(hs) && \ + (hlist = cfs_hash_bd_hhead(hs, bd)) != NULL; \ + (bd)->bd_offset++) + +/* !__LIBCFS__HASH_H__ */ +#endif diff --git a/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_ioctl.h b/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_ioctl.h new file mode 100644 index 000000000..3ee38782a --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_ioctl.h @@ -0,0 +1,214 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/libcfs_ioctl.h + * + * Low-level ioctl data structures. Kernel ioctl functions declared here, + * and user space functions are in libcfsutil_ioctl.h. 
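+ *
+ * A typical user-space caller fills in the request with LIBCFS_IOC_INIT()
+ * and issues one of the IOC_LIBCFS_* commands defined below; a minimal
+ * sketch ("fd" is assumed to be an open descriptor on the LNet control
+ * device):
+ *
+ *	struct libcfs_ioctl_data data;
+ *
+ *	LIBCFS_IOC_INIT(data);
+ *	rc = ioctl(fd, IOC_LIBCFS_CLEAR_DEBUG, &data);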
+ * + */ + +#ifndef __LIBCFS_IOCTL_H__ +#define __LIBCFS_IOCTL_H__ + +#define LIBCFS_IOCTL_VERSION 0x0001000a + +struct libcfs_ioctl_data { + __u32 ioc_len; + __u32 ioc_version; + + __u64 ioc_nid; + __u64 ioc_u64[1]; + + __u32 ioc_flags; + __u32 ioc_count; + __u32 ioc_net; + __u32 ioc_u32[7]; + + __u32 ioc_inllen1; + char *ioc_inlbuf1; + __u32 ioc_inllen2; + char *ioc_inlbuf2; + + __u32 ioc_plen1; /* buffers in userspace */ + char *ioc_pbuf1; + __u32 ioc_plen2; /* buffers in userspace */ + char *ioc_pbuf2; + + char ioc_bulk[0]; +}; + +#define ioc_priority ioc_u32[0] + +struct libcfs_ioctl_hdr { + __u32 ioc_len; + __u32 ioc_version; +}; + +struct libcfs_debug_ioctl_data { + struct libcfs_ioctl_hdr hdr; + unsigned int subs; + unsigned int debug; +}; + +#define LIBCFS_IOC_INIT(data) \ +do { \ + memset(&data, 0, sizeof(data)); \ + data.ioc_version = LIBCFS_IOCTL_VERSION; \ + data.ioc_len = sizeof(data); \ +} while (0) + +struct libcfs_ioctl_handler { + struct list_head item; + int (*handle_ioctl)(unsigned int cmd, struct libcfs_ioctl_data *data); +}; + +#define DECLARE_IOCTL_HANDLER(ident, func) \ + struct libcfs_ioctl_handler ident = { \ + /* .item = */ LIST_HEAD_INIT(ident.item), \ + /* .handle_ioctl = */ func \ + } + +/* FIXME check conflict with lustre_lib.h */ +#define LIBCFS_IOC_DEBUG_MASK _IOWR('f', 250, long) + +/* ioctls for manipulating snapshots 30- */ +#define IOC_LIBCFS_TYPE 'e' +#define IOC_LIBCFS_MIN_NR 30 +/* libcfs ioctls */ +#define IOC_LIBCFS_PANIC _IOWR('e', 30, long) +#define IOC_LIBCFS_CLEAR_DEBUG _IOWR('e', 31, long) +#define IOC_LIBCFS_MARK_DEBUG _IOWR('e', 32, long) +#define IOC_LIBCFS_MEMHOG _IOWR('e', 36, long) +#define IOC_LIBCFS_PING_TEST _IOWR('e', 37, long) +/* lnet ioctls */ +#define IOC_LIBCFS_GET_NI _IOWR('e', 50, long) +#define IOC_LIBCFS_FAIL_NID _IOWR('e', 51, long) +#define IOC_LIBCFS_ADD_ROUTE _IOWR('e', 52, long) +#define IOC_LIBCFS_DEL_ROUTE _IOWR('e', 53, long) +#define IOC_LIBCFS_GET_ROUTE _IOWR('e', 54, long) +#define IOC_LIBCFS_NOTIFY_ROUTER _IOWR('e', 55, long) +#define IOC_LIBCFS_UNCONFIGURE _IOWR('e', 56, long) +#define IOC_LIBCFS_PORTALS_COMPATIBILITY _IOWR('e', 57, long) +#define IOC_LIBCFS_LNET_DIST _IOWR('e', 58, long) +#define IOC_LIBCFS_CONFIGURE _IOWR('e', 59, long) +#define IOC_LIBCFS_TESTPROTOCOMPAT _IOWR('e', 60, long) +#define IOC_LIBCFS_PING _IOWR('e', 61, long) +#define IOC_LIBCFS_DEBUG_PEER _IOWR('e', 62, long) +#define IOC_LIBCFS_LNETST _IOWR('e', 63, long) +/* lnd ioctls */ +#define IOC_LIBCFS_REGISTER_MYNID _IOWR('e', 70, long) +#define IOC_LIBCFS_CLOSE_CONNECTION _IOWR('e', 71, long) +#define IOC_LIBCFS_PUSH_CONNECTION _IOWR('e', 72, long) +#define IOC_LIBCFS_GET_CONN _IOWR('e', 73, long) +#define IOC_LIBCFS_DEL_PEER _IOWR('e', 74, long) +#define IOC_LIBCFS_ADD_PEER _IOWR('e', 75, long) +#define IOC_LIBCFS_GET_PEER _IOWR('e', 76, long) +#define IOC_LIBCFS_GET_TXDESC _IOWR('e', 77, long) +#define IOC_LIBCFS_ADD_INTERFACE _IOWR('e', 78, long) +#define IOC_LIBCFS_DEL_INTERFACE _IOWR('e', 79, long) +#define IOC_LIBCFS_GET_INTERFACE _IOWR('e', 80, long) + +#define IOC_LIBCFS_MAX_NR 80 + +static inline int libcfs_ioctl_packlen(struct libcfs_ioctl_data *data) +{ + int len = sizeof(*data); + + len += cfs_size_round(data->ioc_inllen1); + len += cfs_size_round(data->ioc_inllen2); + return len; +} + +static inline int libcfs_ioctl_is_invalid(struct libcfs_ioctl_data *data) +{ + if (data->ioc_len > (1<<30)) { + CERROR("LIBCFS ioctl: ioc_len larger than 1<<30\n"); + return 1; + } + if (data->ioc_inllen1 > (1<<30)) { + CERROR("LIBCFS ioctl: 
ioc_inllen1 larger than 1<<30\n"); + return 1; + } + if (data->ioc_inllen2 > (1<<30)) { + CERROR("LIBCFS ioctl: ioc_inllen2 larger than 1<<30\n"); + return 1; + } + if (data->ioc_inlbuf1 && !data->ioc_inllen1) { + CERROR("LIBCFS ioctl: inlbuf1 pointer but 0 length\n"); + return 1; + } + if (data->ioc_inlbuf2 && !data->ioc_inllen2) { + CERROR("LIBCFS ioctl: inlbuf2 pointer but 0 length\n"); + return 1; + } + if (data->ioc_pbuf1 && !data->ioc_plen1) { + CERROR("LIBCFS ioctl: pbuf1 pointer but 0 length\n"); + return 1; + } + if (data->ioc_pbuf2 && !data->ioc_plen2) { + CERROR("LIBCFS ioctl: pbuf2 pointer but 0 length\n"); + return 1; + } + if (data->ioc_plen1 && !data->ioc_pbuf1) { + CERROR("LIBCFS ioctl: plen1 nonzero but no pbuf1 pointer\n"); + return 1; + } + if (data->ioc_plen2 && !data->ioc_pbuf2) { + CERROR("LIBCFS ioctl: plen2 nonzero but no pbuf2 pointer\n"); + return 1; + } + if ((__u32)libcfs_ioctl_packlen(data) != data->ioc_len) { + CERROR("LIBCFS ioctl: packlen != ioc_len\n"); + return 1; + } + if (data->ioc_inllen1 && + data->ioc_bulk[data->ioc_inllen1 - 1] != '\0') { + CERROR("LIBCFS ioctl: inlbuf1 not 0 terminated\n"); + return 1; + } + if (data->ioc_inllen2 && + data->ioc_bulk[cfs_size_round(data->ioc_inllen1) + + data->ioc_inllen2 - 1] != '\0') { + CERROR("LIBCFS ioctl: inlbuf2 not 0 terminated\n"); + return 1; + } + return 0; +} + +int libcfs_register_ioctl(struct libcfs_ioctl_handler *hand); +int libcfs_deregister_ioctl(struct libcfs_ioctl_handler *hand); +int libcfs_ioctl_getdata(char *buf, char *end, void *arg); +int libcfs_ioctl_popdata(void *arg, void *buf, int size); + +#endif /* __LIBCFS_IOCTL_H__ */ diff --git a/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_kernelcomm.h b/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_kernelcomm.h new file mode 100644 index 000000000..a989d2666 --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_kernelcomm.h @@ -0,0 +1,118 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Author: Nathan Rutman + * + * libcfs/include/libcfs/libcfs_kernelcomm.h + * + * Kernel <-> userspace communication routines. + * The definitions below are used in the kernel and userspace. 
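/*
 * Illustrative sketch (editorial example; libcfs_ioctl_pack_example is a
 * hypothetical helper): inline buffers travel in ioc_bulk directly behind
 * the descriptor, so ioc_len must equal libcfs_ioctl_packlen() and inline
 * strings must stay NUL-terminated for libcfs_ioctl_is_invalid() to accept
 * the request.
 */
static inline int libcfs_ioctl_pack_example(const char *name)
{
	char buf[sizeof(struct libcfs_ioctl_data) + 64];
	struct libcfs_ioctl_data *data = (struct libcfs_ioctl_data *)buf;
	int nob = strlen(name) + 1;

	if (nob > 64)
		return -E2BIG;

	memset(buf, 0, sizeof(buf));
	data->ioc_version = LIBCFS_IOCTL_VERSION;
	data->ioc_inllen1 = nob;
	memcpy(data->ioc_bulk, name, nob);	/* NUL terminator included */
	data->ioc_len = libcfs_ioctl_packlen(data);

	return libcfs_ioctl_is_invalid(data) ? -EINVAL : 0;
}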
+ * + */ + +#ifndef __LIBCFS_KERNELCOMM_H__ +#define __LIBCFS_KERNELCOMM_H__ + +#ifndef __LIBCFS_LIBCFS_H__ +#error Do not #include this file directly. #include instead +#endif + +/* KUC message header. + * All current and future KUC messages should use this header. + * To avoid having to include Lustre headers from libcfs, define this here. + */ +struct kuc_hdr { + __u16 kuc_magic; + __u8 kuc_transport; /* Each new Lustre feature should use a different + transport */ + __u8 kuc_flags; + __u16 kuc_msgtype; /* Message type or opcode, transport-specific */ + __u16 kuc_msglen; /* Including header */ +} __aligned(sizeof(__u64)); + +#define KUC_CHANGELOG_MSG_MAXSIZE (sizeof(struct kuc_hdr)+CR_MAXSIZE) + +#define KUC_MAGIC 0x191C /*Lustre9etLinC */ +#define KUC_FL_BLOCK 0x01 /* Wait for send */ + +/* kuc_msgtype values are defined in each transport */ +enum kuc_transport_type { + KUC_TRANSPORT_GENERIC = 1, + KUC_TRANSPORT_HSM = 2, + KUC_TRANSPORT_CHANGELOG = 3, +}; + +enum kuc_generic_message_type { + KUC_MSG_SHUTDOWN = 1, +}; + +/* prototype for callback function on kuc groups */ +typedef int (*libcfs_kkuc_cb_t)(__u32 data, void *cb_arg); + +/* KUC Broadcast Groups. This determines which userspace process hears which + * messages. Mutliple transports may be used within a group, or multiple + * groups may use the same transport. Broadcast + * groups need not be used if e.g. a UID is specified instead; + * use group 0 to signify unicast. + */ +#define KUC_GRP_HSM 0x02 +#define KUC_GRP_MAX KUC_GRP_HSM + +/* Kernel methods */ +int libcfs_kkuc_msg_put(struct file *fp, void *payload); +int libcfs_kkuc_group_put(int group, void *payload); +int libcfs_kkuc_group_add(struct file *fp, int uid, int group, + __u32 data); +int libcfs_kkuc_group_rem(int uid, int group); +int libcfs_kkuc_group_foreach(int group, libcfs_kkuc_cb_t cb_func, + void *cb_arg); + +#define LK_FLG_STOP 0x01 + +/* kernelcomm control structure, passed from userspace to kernel */ +typedef struct lustre_kernelcomm { + __u32 lk_wfd; + __u32 lk_rfd; + __u32 lk_uid; + __u32 lk_group; + __u32 lk_data; + __u32 lk_flags; +} __packed lustre_kernelcomm; + +/* Userspace methods */ +int libcfs_ukuc_start(lustre_kernelcomm *l, int groups); +int libcfs_ukuc_stop(lustre_kernelcomm *l); +int libcfs_ukuc_msg_get(lustre_kernelcomm *l, char *buf, int maxsize, + int transport); + +#endif /* __LIBCFS_KERNELCOMM_H__ */ diff --git a/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_prim.h b/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_prim.h new file mode 100644 index 000000000..978d3e2f1 --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_prim.h @@ -0,0 +1,87 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
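/*
 * Illustrative sketch (editorial example; kuc_shutdown_example is a
 * hypothetical name): a minimal KUC message is just a kuc_hdr.  A sender
 * fills in the magic, transport, message type and total length, then hands
 * it to libcfs_kkuc_group_put() to broadcast it to every reader registered
 * in that group.
 */
static inline int kuc_shutdown_example(void)
{
	struct kuc_hdr hdr;

	memset(&hdr, 0, sizeof(hdr));
	hdr.kuc_magic = KUC_MAGIC;
	hdr.kuc_transport = KUC_TRANSPORT_GENERIC;
	hdr.kuc_msgtype = KUC_MSG_SHUTDOWN;
	hdr.kuc_msglen = sizeof(hdr);	/* header only, no payload */

	return libcfs_kkuc_group_put(KUC_GRP_HSM, &hdr);
}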
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/libcfs_prim.h + * + * General primitives. + * + */ + +#ifndef __LIBCFS_PRIM_H__ +#define __LIBCFS_PRIM_H__ + +/* + * Timer + */ +typedef void (cfs_timer_func_t)(ulong_ptr_t); + +void add_wait_queue_exclusive_head(wait_queue_head_t *, wait_queue_t *); + +void cfs_init_timer(struct timer_list *t); +void cfs_timer_init(struct timer_list *t, cfs_timer_func_t *func, void *arg); +void cfs_timer_done(struct timer_list *t); +void cfs_timer_arm(struct timer_list *t, unsigned long deadline); +void cfs_timer_disarm(struct timer_list *t); +int cfs_timer_is_armed(struct timer_list *t); +unsigned long cfs_timer_deadline(struct timer_list *t); + +/* + * Memory + */ +#ifndef memory_pressure_get +#define memory_pressure_get() (0) +#endif +#ifndef memory_pressure_set +#define memory_pressure_set() do {} while (0) +#endif +#ifndef memory_pressure_clr +#define memory_pressure_clr() do {} while (0) +#endif + +static inline int cfs_memory_pressure_get_and_set(void) +{ + int old = memory_pressure_get(); + + if (!old) + memory_pressure_set(); + return old; +} + +static inline void cfs_memory_pressure_restore(int old) +{ + if (old) + memory_pressure_set(); + else + memory_pressure_clr(); +} +#endif diff --git a/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h b/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h new file mode 100644 index 000000000..fef882530 --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h @@ -0,0 +1,556 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
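/*
 * Illustrative sketch (editorial example; cfs_memory_pressure_example is a
 * hypothetical name, and kmalloc/GFP_NOFS are shown only for illustration):
 * the two helpers above are intended to be used as a save/restore pair
 * around work that must not recurse into the filesystem.
 */
static inline void *cfs_memory_pressure_example(size_t size)
{
	int old = cfs_memory_pressure_get_and_set();	/* mark PF_MEMALLOC */
	void *ptr = kmalloc(size, GFP_NOFS);

	cfs_memory_pressure_restore(old);	/* undo only if we set it */
	return ptr;
}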
+ * + * libcfs/include/libcfs/libcfs_private.h + * + * Various defines for libcfs. + * + */ + +#ifndef __LIBCFS_PRIVATE_H__ +#define __LIBCFS_PRIVATE_H__ + +/* XXX this layering violation is for nidstrings */ +#include "../lnet/types.h" + +#ifndef DEBUG_SUBSYSTEM +# define DEBUG_SUBSYSTEM S_UNDEFINED +#endif + + +/* + * When this is on, LASSERT macro includes check for assignment used instead + * of equality check, but doesn't have unlikely(). Turn this on from time to + * time to make test-builds. This shouldn't be on for production release. + */ +#define LASSERT_CHECKED (0) + +#define LASSERTF(cond, fmt, ...) \ +do { \ + if (unlikely(!(cond))) { \ + LIBCFS_DEBUG_MSG_DATA_DECL(__msg_data, D_EMERG, NULL); \ + libcfs_debug_msg(&__msg_data, \ + "ASSERTION( %s ) failed: " fmt, #cond, \ + ## __VA_ARGS__); \ + lbug_with_loc(&__msg_data); \ + } \ +} while (0) + +#define LASSERT(cond) LASSERTF(cond, "\n") + +#ifdef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK +/** + * This is for more expensive checks that one doesn't want to be enabled all + * the time. LINVRNT() has to be explicitly enabled by + * CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK option. + */ +# define LINVRNT(exp) LASSERT(exp) +#else +# define LINVRNT(exp) ((void)sizeof !!(exp)) +#endif + +#define KLASSERT(e) LASSERT(e) + +void lbug_with_loc(struct libcfs_debug_msg_data *)__attribute__((noreturn)); + +#define LBUG() \ +do { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_EMERG, NULL); \ + lbug_with_loc(&msgdata); \ +} while (0) + +extern atomic_t libcfs_kmemory; +/* + * Memory + */ + +# define libcfs_kmem_inc(ptr, size) \ +do { \ + atomic_add(size, &libcfs_kmemory); \ +} while (0) + +# define libcfs_kmem_dec(ptr, size) \ +do { \ + atomic_sub(size, &libcfs_kmemory); \ +} while (0) + +# define libcfs_kmem_read() \ + atomic_read(&libcfs_kmemory) + +#ifndef LIBCFS_VMALLOC_SIZE +#define LIBCFS_VMALLOC_SIZE (2 << PAGE_CACHE_SHIFT) /* 2 pages */ +#endif + +#define LIBCFS_ALLOC_PRE(size, mask) \ +do { \ + LASSERT(!in_interrupt() || \ + ((size) <= LIBCFS_VMALLOC_SIZE && \ + ((mask) & __GFP_WAIT) == 0)); \ +} while (0) + +#define LIBCFS_ALLOC_POST(ptr, size) \ +do { \ + if (unlikely((ptr) == NULL)) { \ + CERROR("LNET: out of memory at %s:%d (tried to alloc '" \ + #ptr "' = %d)\n", __FILE__, __LINE__, (int)(size)); \ + CERROR("LNET: %d total bytes allocated by lnet\n", \ + libcfs_kmem_read()); \ + } else { \ + memset((ptr), 0, (size)); \ + libcfs_kmem_inc((ptr), (size)); \ + CDEBUG(D_MALLOC, "alloc '" #ptr "': %d at %p (tot %d).\n", \ + (int)(size), (ptr), libcfs_kmem_read()); \ + } \ +} while (0) + +/** + * allocate memory with GFP flags @mask + */ +#define LIBCFS_ALLOC_GFP(ptr, size, mask) \ +do { \ + LIBCFS_ALLOC_PRE((size), (mask)); \ + (ptr) = (size) <= LIBCFS_VMALLOC_SIZE ? \ + kmalloc((size), (mask)) : vmalloc(size); \ + LIBCFS_ALLOC_POST((ptr), (size)); \ +} while (0) + +/** + * default allocator + */ +#define LIBCFS_ALLOC(ptr, size) \ + LIBCFS_ALLOC_GFP(ptr, size, GFP_NOFS) + +/** + * non-sleeping allocator + */ +#define LIBCFS_ALLOC_ATOMIC(ptr, size) \ + LIBCFS_ALLOC_GFP(ptr, size, GFP_ATOMIC) + +/** + * allocate memory for specified CPU partition + * \a cptab != NULL, \a cpt is CPU partition id of \a cptab + * \a cptab == NULL, \a cpt is HW NUMA node id + */ +#define LIBCFS_CPT_ALLOC_GFP(ptr, cptab, cpt, size, mask) \ +do { \ + LIBCFS_ALLOC_PRE((size), (mask)); \ + (ptr) = (size) <= LIBCFS_VMALLOC_SIZE ? 
\ + kmalloc_node((size), (mask), cfs_cpt_spread_node(cptab, cpt)) :\ + vmalloc_node(size, cfs_cpt_spread_node(cptab, cpt)); \ + LIBCFS_ALLOC_POST((ptr), (size)); \ +} while (0) + +/** default numa allocator */ +#define LIBCFS_CPT_ALLOC(ptr, cptab, cpt, size) \ + LIBCFS_CPT_ALLOC_GFP(ptr, cptab, cpt, size, GFP_NOFS) + +#define LIBCFS_FREE(ptr, size) \ +do { \ + int s = (size); \ + if (unlikely((ptr) == NULL)) { \ + CERROR("LIBCFS: free NULL '" #ptr "' (%d bytes) at " \ + "%s:%d\n", s, __FILE__, __LINE__); \ + break; \ + } \ + libcfs_kmem_dec((ptr), s); \ + CDEBUG(D_MALLOC, "kfreed '" #ptr "': %d at %p (tot %d).\n", \ + s, (ptr), libcfs_kmem_read()); \ + if (unlikely(s > LIBCFS_VMALLOC_SIZE)) \ + vfree(ptr); \ + else \ + kfree(ptr); \ +} while (0) + +/******************************************************************************/ + +/* htonl hack - either this, or compile with -O2. Stupid byteorder/generic.h */ +#if defined(__GNUC__) && (__GNUC__ >= 2) && !defined(__OPTIMIZE__) +#define ___htonl(x) __cpu_to_be32(x) +#define ___htons(x) __cpu_to_be16(x) +#define ___ntohl(x) __be32_to_cpu(x) +#define ___ntohs(x) __be16_to_cpu(x) +#define htonl(x) ___htonl(x) +#define ntohl(x) ___ntohl(x) +#define htons(x) ___htons(x) +#define ntohs(x) ___ntohs(x) +#endif + +void libcfs_run_upcall(char **argv); +void libcfs_run_lbug_upcall(struct libcfs_debug_msg_data *); +void libcfs_debug_dumplog(void); +int libcfs_debug_init(unsigned long bufsize); +int libcfs_debug_cleanup(void); +int libcfs_debug_clear_buffer(void); +int libcfs_debug_mark_buffer(const char *text); + +void libcfs_debug_set_level(unsigned int debug_level); + +/* + * allocate per-cpu-partition data, returned value is an array of pointers, + * variable can be indexed by CPU ID. + * cptable != NULL: size of array is number of CPU partitions + * cptable == NULL: size of array is number of HW cores + */ +void *cfs_percpt_alloc(struct cfs_cpt_table *cptab, unsigned int size); +/* + * destroy per-cpu-partition variable + */ +void cfs_percpt_free(void *vars); +int cfs_percpt_number(void *vars); +void *cfs_percpt_current(void *vars); +void *cfs_percpt_index(void *vars, int idx); + +#define cfs_percpt_for_each(var, i, vars) \ + for (i = 0; i < cfs_percpt_number(vars) && \ + ((var) = (vars)[i]) != NULL; i++) + +/* + * allocate a variable array, returned value is an array of pointers. + * Caller can specify length of array by count. 
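/*
 * Illustrative sketch (editorial example; the *_example names are
 * hypothetical): cfs_percpt_alloc() hands back an array with one element
 * per CPU partition, and cfs_percpt_for_each() walks that array.  The
 * LIBCFS_ALLOC()/LIBCFS_FREE() pair must quote the same size so the
 * libcfs_kmemory accounting above stays balanced.
 */
struct percpt_example_counter {
	int	pec_count;
};

static inline void cfs_percpt_example(struct cfs_cpt_table *cptab)
{
	struct percpt_example_counter **cnts;
	struct percpt_example_counter *cnt;
	struct percpt_example_counter *one;
	int i;

	cnts = cfs_percpt_alloc(cptab, sizeof(*cnt));
	if (cnts != NULL) {
		cfs_percpt_for_each(cnt, i, cnts)
			cnt->pec_count = 0;	/* per-partition init */
		cfs_percpt_free(cnts);
	}

	LIBCFS_ALLOC(one, sizeof(*one));
	if (one != NULL)
		LIBCFS_FREE(one, sizeof(*one));
}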
+ */ +void *cfs_array_alloc(int count, unsigned int size); +void cfs_array_free(void *vars); + +#define LASSERT_ATOMIC_ENABLED (1) + +#if LASSERT_ATOMIC_ENABLED + +/** assert value of @a is equal to @v */ +#define LASSERT_ATOMIC_EQ(a, v) \ +do { \ + LASSERTF(atomic_read(a) == v, \ + "value: %d\n", atomic_read((a))); \ +} while (0) + +/** assert value of @a is unequal to @v */ +#define LASSERT_ATOMIC_NE(a, v) \ +do { \ + LASSERTF(atomic_read(a) != v, \ + "value: %d\n", atomic_read((a))); \ +} while (0) + +/** assert value of @a is little than @v */ +#define LASSERT_ATOMIC_LT(a, v) \ +do { \ + LASSERTF(atomic_read(a) < v, \ + "value: %d\n", atomic_read((a))); \ +} while (0) + +/** assert value of @a is little/equal to @v */ +#define LASSERT_ATOMIC_LE(a, v) \ +do { \ + LASSERTF(atomic_read(a) <= v, \ + "value: %d\n", atomic_read((a))); \ +} while (0) + +/** assert value of @a is great than @v */ +#define LASSERT_ATOMIC_GT(a, v) \ +do { \ + LASSERTF(atomic_read(a) > v, \ + "value: %d\n", atomic_read((a))); \ +} while (0) + +/** assert value of @a is great/equal to @v */ +#define LASSERT_ATOMIC_GE(a, v) \ +do { \ + LASSERTF(atomic_read(a) >= v, \ + "value: %d\n", atomic_read((a))); \ +} while (0) + +/** assert value of @a is great than @v1 and little than @v2 */ +#define LASSERT_ATOMIC_GT_LT(a, v1, v2) \ +do { \ + int __v = atomic_read(a); \ + LASSERTF(__v > v1 && __v < v2, "value: %d\n", __v); \ +} while (0) + +/** assert value of @a is great than @v1 and little/equal to @v2 */ +#define LASSERT_ATOMIC_GT_LE(a, v1, v2) \ +do { \ + int __v = atomic_read(a); \ + LASSERTF(__v > v1 && __v <= v2, "value: %d\n", __v); \ +} while (0) + +/** assert value of @a is great/equal to @v1 and little than @v2 */ +#define LASSERT_ATOMIC_GE_LT(a, v1, v2) \ +do { \ + int __v = atomic_read(a); \ + LASSERTF(__v >= v1 && __v < v2, "value: %d\n", __v); \ +} while (0) + +/** assert value of @a is great/equal to @v1 and little/equal to @v2 */ +#define LASSERT_ATOMIC_GE_LE(a, v1, v2) \ +do { \ + int __v = atomic_read(a); \ + LASSERTF(__v >= v1 && __v <= v2, "value: %d\n", __v); \ +} while (0) + +#else /* !LASSERT_ATOMIC_ENABLED */ + +#define LASSERT_ATOMIC_EQ(a, v) do {} while (0) +#define LASSERT_ATOMIC_NE(a, v) do {} while (0) +#define LASSERT_ATOMIC_LT(a, v) do {} while (0) +#define LASSERT_ATOMIC_LE(a, v) do {} while (0) +#define LASSERT_ATOMIC_GT(a, v) do {} while (0) +#define LASSERT_ATOMIC_GE(a, v) do {} while (0) +#define LASSERT_ATOMIC_GT_LT(a, v1, v2) do {} while (0) +#define LASSERT_ATOMIC_GT_LE(a, v1, v2) do {} while (0) +#define LASSERT_ATOMIC_GE_LT(a, v1, v2) do {} while (0) +#define LASSERT_ATOMIC_GE_LE(a, v1, v2) do {} while (0) + +#endif /* LASSERT_ATOMIC_ENABLED */ + +#define LASSERT_ATOMIC_ZERO(a) LASSERT_ATOMIC_EQ(a, 0) +#define LASSERT_ATOMIC_POS(a) LASSERT_ATOMIC_GT(a, 0) + +#define CFS_ALLOC_PTR(ptr) LIBCFS_ALLOC(ptr, sizeof(*(ptr))) +#define CFS_FREE_PTR(ptr) LIBCFS_FREE(ptr, sizeof(*(ptr))) + +/* + * percpu partition lock + * + * There are some use-cases like this in Lustre: + * . each CPU partition has it's own private data which is frequently changed, + * and mostly by the local CPU partition. + * . all CPU partitions share some global data, these data are rarely changed. + * + * LNet is typical example. + * CPU partition lock is designed for this kind of use-cases: + * . each CPU partition has it's own private lock + * . change on private data just needs to take the private lock + * . read on shared data just needs to take _any_ of private locks + * . 
change on shared data needs to take _all_ private locks, + * which is slow and should be really rare. + */ + +enum { + CFS_PERCPT_LOCK_EX = -1, /* negative */ +}; + +struct cfs_percpt_lock { + /* cpu-partition-table for this lock */ + struct cfs_cpt_table *pcl_cptab; + /* exclusively locked */ + unsigned int pcl_locked; + /* private lock table */ + spinlock_t **pcl_locks; +}; + +/* return number of private locks */ +static inline int +cfs_percpt_lock_num(struct cfs_percpt_lock *pcl) +{ + return cfs_cpt_number(pcl->pcl_cptab); +} + +/* + * create a cpu-partition lock based on CPU partition table \a cptab, + * each private lock has extra \a psize bytes padding data + */ +struct cfs_percpt_lock *cfs_percpt_lock_alloc(struct cfs_cpt_table *cptab); +/* destroy a cpu-partition lock */ +void cfs_percpt_lock_free(struct cfs_percpt_lock *pcl); + +/* lock private lock \a index of \a pcl */ +void cfs_percpt_lock(struct cfs_percpt_lock *pcl, int index); +/* unlock private lock \a index of \a pcl */ +void cfs_percpt_unlock(struct cfs_percpt_lock *pcl, int index); +/* create percpt (atomic) refcount based on @cptab */ +atomic_t **cfs_percpt_atomic_alloc(struct cfs_cpt_table *cptab, int val); +/* destroy percpt refcount */ +void cfs_percpt_atomic_free(atomic_t **refs); +/* return sum of all percpu refs */ +int cfs_percpt_atomic_summary(atomic_t **refs); + +/** Compile-time assertion. + + * Check an invariant described by a constant expression at compile time by + * forcing a compiler error if it does not hold. \a cond must be a constant + * expression as defined by the ISO C Standard: + * + * 6.8.4.2 The switch statement + * .... + * [#3] The expression of each case label shall be an integer + * constant expression and no two of the case constant + * expressions in the same switch statement shall have the same + * value after conversion... 
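/*
 * Illustrative sketch (editorial example; percpt_lock_example is a
 * hypothetical name): per the description above, an update to one
 * partition's private data takes just that partition's lock, while a change
 * to shared data takes every private lock by passing the exclusive index
 * CFS_PERCPT_LOCK_EX.
 */
static inline void percpt_lock_example(struct cfs_percpt_lock *pcl, int cpt)
{
	/* fast path: data private to partition @cpt */
	cfs_percpt_lock(pcl, cpt);
	cfs_percpt_unlock(pcl, cpt);

	/* slow path (rare): data shared by all partitions */
	cfs_percpt_lock(pcl, CFS_PERCPT_LOCK_EX);
	cfs_percpt_unlock(pcl, CFS_PERCPT_LOCK_EX);
}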
+ * + */ +#define CLASSERT(cond) do {switch (42) {case (cond): case 0: break; } } while (0) + +/* support decl needed both by kernel and liblustre */ +int libcfs_isknown_lnd(int type); +char *libcfs_lnd2modname(int type); +char *libcfs_lnd2str(int type); +int libcfs_str2lnd(const char *str); +char *libcfs_net2str(__u32 net); +char *libcfs_nid2str(lnet_nid_t nid); +__u32 libcfs_str2net(const char *str); +lnet_nid_t libcfs_str2nid(const char *str); +int libcfs_str2anynid(lnet_nid_t *nid, const char *str); +char *libcfs_id2str(lnet_process_id_t id); +void cfs_free_nidlist(struct list_head *list); +int cfs_parse_nidlist(char *str, int len, struct list_head *list); +int cfs_match_nid(lnet_nid_t nid, struct list_head *list); + +/** \addtogroup lnet_addr + * @{ */ +/* how an LNET NID encodes net:address */ +/** extract the address part of an lnet_nid_t */ +#define LNET_NIDADDR(nid) ((__u32)((nid) & 0xffffffff)) +/** extract the network part of an lnet_nid_t */ +#define LNET_NIDNET(nid) ((__u32)(((nid) >> 32)) & 0xffffffff) +/** make an lnet_nid_t from a network part and an address part */ +#define LNET_MKNID(net, addr) ((((__u64)(net))<<32)|((__u64)(addr))) +/* how net encodes type:number */ +#define LNET_NETNUM(net) ((net) & 0xffff) +#define LNET_NETTYP(net) (((net) >> 16) & 0xffff) +#define LNET_MKNET(typ, num) ((((__u32)(typ))<<16)|((__u32)(num))) +/** @} lnet_addr */ + +/* max value for numeric network address */ +#define MAX_NUMERIC_VALUE 0xffffffff + +/* implication */ +#define ergo(a, b) (!(a) || (b)) +/* logical equivalence */ +#define equi(a, b) (!!(a) == !!(b)) + +/* -------------------------------------------------------------------- + * Light-weight trace + * Support for temporary event tracing with minimal Heisenberg effect. + * -------------------------------------------------------------------- */ + +struct libcfs_device_userstate { + int ldu_memhog_pages; + struct page *ldu_memhog_root_page; +}; + +#define MKSTR(ptr) ((ptr)) ? (ptr) : "" + +static inline int cfs_size_round4(int val) +{ + return (val + 3) & (~0x3); +} + +#ifndef HAVE_CFS_SIZE_ROUND +static inline int cfs_size_round(int val) +{ + return (val + 7) & (~0x7); +} + +#define HAVE_CFS_SIZE_ROUND +#endif + +static inline int cfs_size_round16(int val) +{ + return (val + 0xf) & (~0xf); +} + +static inline int cfs_size_round32(int val) +{ + return (val + 0x1f) & (~0x1f); +} + +static inline int cfs_size_round0(int val) +{ + if (!val) + return 0; + return (val + 1 + 7) & (~0x7); +} + +static inline size_t cfs_round_strlen(char *fset) +{ + return (size_t)cfs_size_round((int)strlen(fset) + 1); +} + +/* roundup \a val to power2 */ +static inline unsigned int cfs_power2_roundup(unsigned int val) +{ + if (val != LOWEST_BIT_SET(val)) { /* not a power of 2 already */ + do { + val &= ~LOWEST_BIT_SET(val); + } while (val != LOWEST_BIT_SET(val)); + /* ...and round up */ + val <<= 1; + } + return val; +} + +#define LOGL(var, len, ptr) \ +do { \ + if (var) \ + memcpy((char *)ptr, (const char *)var, len); \ + ptr += cfs_size_round(len); \ +} while (0) + +#define LOGU(var, len, ptr) \ +do { \ + if (var) \ + memcpy((char *)var, (const char *)ptr, len); \ + ptr += cfs_size_round(len); \ +} while (0) + +#define LOGL0(var, len, ptr) \ +do { \ + if (!len) \ + break; \ + memcpy((char *)ptr, (const char *)var, len); \ + *((char *)(ptr) + len) = 0; \ + ptr += cfs_size_round(len + 1); \ +} while (0) + +/** + * Lustre Network Driver types. + */ +enum { + /* Only add to these values (i.e. 
don't ever change or redefine them): + * network addresses depend on them... */ + QSWLND = 1, + SOCKLND = 2, + GMLND = 3, /* obsolete, keep it so that libcfs_nid2str works */ + PTLLND = 4, + O2IBLND = 5, + CIBLND = 6, + OPENIBLND = 7, + IIBLND = 8, + LOLND = 9, + RALND = 10, + VIBLND = 11, + MXLND = 12, + GNILND = 13, +}; + +#endif diff --git a/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_string.h b/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_string.h new file mode 100644 index 000000000..509dc1e5c --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_string.h @@ -0,0 +1,107 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/libcfs_string.h + * + * Generic string manipulation functions. + * + * Author: Nathan Rutman + */ + +#ifndef __LIBCFS_STRING_H__ +#define __LIBCFS_STRING_H__ + +/* libcfs_string.c */ +/* string comparison ignoring case */ +int cfs_strncasecmp(const char *s1, const char *s2, size_t n); +/* Convert a text string to a bitmask */ +int cfs_str2mask(const char *str, const char *(*bit2str)(int bit), + int *oldmask, int minmask, int allmask); +/* trim leading and trailing space characters */ +char *cfs_firststr(char *str, size_t size); + +/** + * Structure to represent NULL-less strings. + */ +struct cfs_lstr { + char *ls_str; + int ls_len; +}; + +/* + * Structure to represent \ token of the syntax. + */ +struct cfs_range_expr { + /* + * Link to cfs_expr_list::el_exprs. 
+ */ + struct list_head re_link; + __u32 re_lo; + __u32 re_hi; + __u32 re_stride; +}; + +struct cfs_expr_list { + struct list_head el_link; + struct list_head el_exprs; +}; + +char *cfs_trimwhite(char *str); +int cfs_gettok(struct cfs_lstr *next, char delim, struct cfs_lstr *res); +int cfs_str2num_check(char *str, int nob, unsigned *num, + unsigned min, unsigned max); +int cfs_expr_list_match(__u32 value, struct cfs_expr_list *expr_list); +int cfs_expr_list_values(struct cfs_expr_list *expr_list, + int max, __u32 **values); +static inline void +cfs_expr_list_values_free(__u32 *values, int num) +{ + /* This array is allocated by LIBCFS_ALLOC(), so it shouldn't be freed + * by OBD_FREE() if it's called by module other than libcfs & LNet, + * otherwise we will see fake memory leak */ + LIBCFS_FREE(values, num * sizeof(values[0])); +} + +void cfs_expr_list_free(struct cfs_expr_list *expr_list); +int cfs_expr_list_parse(char *str, int len, unsigned min, unsigned max, + struct cfs_expr_list **elpp); +void cfs_expr_list_free_list(struct list_head *list); +int cfs_ip_addr_parse(char *str, int len, struct list_head *list); +int cfs_ip_addr_match(__u32 addr, struct list_head *list); +void cfs_ip_addr_free(struct list_head *list); + +#define strtoul(str, endp, base) simple_strtoul(str, endp, base) + +#endif diff --git a/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_time.h b/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_time.h new file mode 100644 index 000000000..5de6da085 --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_time.h @@ -0,0 +1,131 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/libcfs_time.h + * + * Time functions. + * + */ + +#ifndef __LIBCFS_TIME_H__ +#define __LIBCFS_TIME_H__ +/* + * generic time manipulation functions. 
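/*
 * Illustrative sketch (editorial example; expr_list_example is a
 * hypothetical name): cfs_expr_list_parse() turns a bracketed range
 * expression such as "[0-6/2]" (lo 0, hi 6, stride 2) into a cfs_expr_list
 * that cfs_expr_list_match() can test values against.
 */
static inline int expr_list_example(void)
{
	struct cfs_expr_list *el;
	char expr[] = "[0-6/2]";
	int matched;
	int rc;

	rc = cfs_expr_list_parse(expr, sizeof(expr) - 1, 0, 7, &el);
	if (rc != 0)
		return rc;

	matched = cfs_expr_list_match(4, el);	/* 4 is in {0, 2, 4, 6} */
	cfs_expr_list_free(el);

	return matched;
}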
+ */ + +static inline unsigned long cfs_time_add(unsigned long t, long d) +{ + return (unsigned long)(t + d); +} + +static inline unsigned long cfs_time_sub(unsigned long t1, unsigned long t2) +{ + return (unsigned long)(t1 - t2); +} + +static inline int cfs_time_after(unsigned long t1, unsigned long t2) +{ + return time_before(t2, t1); +} + +static inline int cfs_time_aftereq(unsigned long t1, unsigned long t2) +{ + return time_before_eq(t2, t1); +} + +static inline unsigned long cfs_time_shift(int seconds) +{ + return cfs_time_add(cfs_time_current(), cfs_time_seconds(seconds)); +} + +static inline long cfs_timeval_sub(struct timeval *large, struct timeval *small, + struct timeval *result) +{ + long r = (long)( + (large->tv_sec - small->tv_sec) * ONE_MILLION + + (large->tv_usec - small->tv_usec)); + if (result != NULL) { + result->tv_usec = r % ONE_MILLION; + result->tv_sec = r / ONE_MILLION; + } + return r; +} + +static inline void cfs_slow_warning(unsigned long now, int seconds, char *msg) +{ + if (cfs_time_after(cfs_time_current(), + cfs_time_add(now, cfs_time_seconds(15)))) + CERROR("slow %s "CFS_TIME_T" sec\n", msg, + cfs_duration_sec(cfs_time_sub(cfs_time_current(), now))); +} + +#define CFS_RATELIMIT(seconds) \ +({ \ + /* \ + * XXX nikita: non-portable initializer \ + */ \ + static time_t __next_message; \ + int result; \ + \ + if (cfs_time_after(cfs_time_current(), __next_message)) \ + result = 1; \ + else { \ + __next_message = cfs_time_shift(seconds); \ + result = 0; \ + } \ + result; \ +}) + +/* + * helper function similar to do_gettimeofday() of Linux kernel + */ +static inline void cfs_fs_timeval(struct timeval *tv) +{ + struct timespec time; + + cfs_fs_time_current(&time); + cfs_fs_time_usec(&time, tv); +} + +/* + * return valid time-out based on user supplied one. Currently we only check + * that time-out is not shorted than allowed. + */ +static inline long cfs_timeout_cap(long timeout) +{ + if (timeout < CFS_TICK) + timeout = CFS_TICK; + return timeout; +} + +#endif diff --git a/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_workitem.h b/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_workitem.h new file mode 100644 index 000000000..5cc64f327 --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/libcfs/libcfs_workitem.h @@ -0,0 +1,110 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. 
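/*
 * Illustrative sketch (editorial example; deadline_expired_example is a
 * hypothetical name): the usual pattern with the helpers above is to record
 * a deadline in jiffies with cfs_time_shift() and then poll it with
 * cfs_time_after(), which is safe across jiffies wrap.
 */
static inline int deadline_expired_example(unsigned long deadline)
{
	/* deadline was set earlier, e.g. deadline = cfs_time_shift(10); */
	return cfs_time_after(cfs_time_current(), deadline);
}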
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/libcfs_workitem.h + * + * Author: Isaac Huang + * Liang Zhen + * + * A workitems is deferred work with these semantics: + * - a workitem always runs in thread context. + * - a workitem can be concurrent with other workitems but is strictly + * serialized with respect to itself. + * - no CPU affinity, a workitem does not necessarily run on the same CPU + * that schedules it. However, this might change in the future. + * - if a workitem is scheduled again before it has a chance to run, it + * runs only once. + * - if a workitem is scheduled while it runs, it runs again after it + * completes; this ensures that events occurring while other events are + * being processed receive due attention. This behavior also allows a + * workitem to reschedule itself. + * + * Usage notes: + * - a workitem can sleep but it should be aware of how that sleep might + * affect others. + * - a workitem runs inside a kernel thread so there's no user space to access. + * - do not use a workitem if the scheduling latency can't be tolerated. + * + * When wi_action returns non-zero, it means the workitem has either been + * freed or reused and workitem scheduler won't touch it any more. + */ + +#ifndef __LIBCFS_WORKITEM_H__ +#define __LIBCFS_WORKITEM_H__ + +struct cfs_wi_sched; + +void cfs_wi_sched_destroy(struct cfs_wi_sched *); +int cfs_wi_sched_create(char *name, struct cfs_cpt_table *cptab, int cpt, + int nthrs, struct cfs_wi_sched **); + +struct cfs_workitem; + +typedef int (*cfs_wi_action_t) (struct cfs_workitem *); +typedef struct cfs_workitem { + /** chain on runq or rerunq */ + struct list_head wi_list; + /** working function */ + cfs_wi_action_t wi_action; + /** arg for working function */ + void *wi_data; + /** in running */ + unsigned short wi_running:1; + /** scheduled */ + unsigned short wi_scheduled:1; +} cfs_workitem_t; + +static inline void +cfs_wi_init(cfs_workitem_t *wi, void *data, cfs_wi_action_t action) +{ + INIT_LIST_HEAD(&wi->wi_list); + + wi->wi_running = 0; + wi->wi_scheduled = 0; + wi->wi_data = data; + wi->wi_action = action; +} + +void cfs_wi_schedule(struct cfs_wi_sched *sched, cfs_workitem_t *wi); +int cfs_wi_deschedule(struct cfs_wi_sched *sched, cfs_workitem_t *wi); +void cfs_wi_exit(struct cfs_wi_sched *sched, cfs_workitem_t *wi); + +int cfs_wi_startup(void); +void cfs_wi_shutdown(void); + +/** # workitem scheduler loops before reschedule */ +#define CFS_WI_RESCHED 128 + +#endif /* __LIBCFS_WORKITEM_H__ */ diff --git a/kernel/drivers/staging/lustre/include/linux/libcfs/linux/libcfs.h b/kernel/drivers/staging/lustre/include/linux/libcfs/linux/libcfs.h new file mode 100644 index 000000000..4fe50841e --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/libcfs/linux/libcfs.h @@ -0,0 +1,147 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
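/*
 * Illustrative sketch (editorial example; the wi_*_example names are
 * hypothetical): a caller binds an action to a workitem with cfs_wi_init()
 * and queues it on a scheduler with cfs_wi_schedule().  Returning 0 from
 * the action leaves the workitem usable for rescheduling; returning
 * non-zero tells the scheduler never to touch it again.
 */
static inline int wi_action_example(struct cfs_workitem *wi)
{
	/* do the deferred work using wi->wi_data ... */
	return 0;	/* workitem may be scheduled again later */
}

static inline void wi_schedule_example(struct cfs_wi_sched *sched,
				       cfs_workitem_t *wi, void *data)
{
	cfs_wi_init(wi, data, wi_action_example);
	cfs_wi_schedule(sched, wi);
}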
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LIBCFS_LINUX_LIBCFS_H__ +#define __LIBCFS_LINUX_LIBCFS_H__ + +#ifndef __LIBCFS_LIBCFS_H__ +#error Do not #include this file directly. #include instead +#endif + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "linux-cpu.h" +#include "linux-time.h" +#include "linux-mem.h" + + +#define LUSTRE_TRACE_SIZE (THREAD_SIZE >> 5) + +#if !defined(__x86_64__) +# ifdef __ia64__ +# define CDEBUG_STACK() (THREAD_SIZE - \ + ((unsigned long)__builtin_dwarf_cfa() & \ + (THREAD_SIZE - 1))) +# else +# define CDEBUG_STACK() (THREAD_SIZE - \ + ((unsigned long)__builtin_frame_address(0) & \ + (THREAD_SIZE - 1))) +# endif /* __ia64__ */ + +#define __CHECK_STACK(msgdata, mask, cdls) \ +do { \ + if (unlikely(CDEBUG_STACK() > libcfs_stack)) { \ + LIBCFS_DEBUG_MSG_DATA_INIT(msgdata, D_WARNING, NULL); \ + libcfs_stack = CDEBUG_STACK(); \ + libcfs_debug_msg(msgdata, \ + "maximum lustre stack %lu\n", \ + CDEBUG_STACK()); \ + (msgdata)->msg_mask = mask; \ + (msgdata)->msg_cdls = cdls; \ + dump_stack(); \ + /*panic("LBUG");*/ \ + } \ +} while (0) +#define CFS_CHECK_STACK(msgdata, mask, cdls) __CHECK_STACK(msgdata, mask, cdls) +#else /* __x86_64__ */ +#define CFS_CHECK_STACK(msgdata, mask, cdls) do {} while (0) +#define CDEBUG_STACK() (0L) +#endif /* __x86_64__ */ + +/* initial pid */ +#define LUSTRE_LNET_PID 12345 + +#define __current_nesting_level() (0) + +/** + * Platform specific declarations for cfs_curproc API (libcfs/curproc.h) + * + * Implementation is in linux-curproc.c + */ +#define CFS_CURPROC_COMM_MAX (sizeof((struct task_struct *)0)->comm) + +#include + +/* long integer with size equal to pointer */ +typedef unsigned long ulong_ptr_t; +typedef long long_ptr_t; + +#ifndef WITH_WATCHDOG +#define WITH_WATCHDOG +#endif + + +#endif /* _LINUX_LIBCFS_H */ diff --git a/kernel/drivers/staging/lustre/include/linux/libcfs/linux/linux-cpu.h b/kernel/drivers/staging/lustre/include/linux/libcfs/linux/linux-cpu.h new file mode 100644 index 000000000..520209f17 --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/libcfs/linux/linux-cpu.h @@ -0,0 +1,82 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/linux/linux-mem.h + * + * Basic library routines. + * + * Author: liang@whamcloud.com + */ + +#ifndef __LIBCFS_LINUX_CPU_H__ +#define __LIBCFS_LINUX_CPU_H__ + +#ifndef __LIBCFS_LIBCFS_H__ +#error Do not #include this file directly. #include instead +#endif + +#include +#include +#include + +#ifdef CONFIG_SMP + +#define HAVE_LIBCFS_CPT + +/** virtual processing unit */ +struct cfs_cpu_partition { + /* CPUs mask for this partition */ + cpumask_t *cpt_cpumask; + /* nodes mask for this partition */ + nodemask_t *cpt_nodemask; + /* spread rotor for NUMA allocator */ + unsigned cpt_spread_rotor; +}; + +/** descriptor for CPU partitions */ +struct cfs_cpt_table { + /* version, reserved for hotplug */ + unsigned ctb_version; + /* spread rotor for NUMA allocator */ + unsigned ctb_spread_rotor; + /* # of CPU partitions */ + unsigned ctb_nparts; + /* partitions tables */ + struct cfs_cpu_partition *ctb_parts; + /* shadow HW CPU to CPU partition ID */ + int *ctb_cpu2cpt; + /* all cpus in this partition table */ + cpumask_t *ctb_cpumask; + /* all nodes in this partition table */ + nodemask_t *ctb_nodemask; +}; + +#endif /* CONFIG_SMP */ +#endif /* __LIBCFS_LINUX_CPU_H__ */ diff --git a/kernel/drivers/staging/lustre/include/linux/libcfs/linux/linux-mem.h b/kernel/drivers/staging/lustre/include/linux/libcfs/linux/linux-mem.h new file mode 100644 index 000000000..0f2fd79e5 --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/libcfs/linux/linux-mem.h @@ -0,0 +1,80 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. 
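/*
 * Illustrative sketch (editorial example, CONFIG_SMP only; the helper name
 * and the assumption that an unmapped CPU is recorded as a negative index
 * in ctb_cpu2cpt are not stated in the original header): ctb_cpu2cpt is the
 * reverse map from a HW CPU number to its partition, and each partition
 * carries the cpumask it spans.
 */
static inline cpumask_t *
cfs_cpt_cpumask_of_cpu_example(struct cfs_cpt_table *cptab, int cpu)
{
	int cpt = cptab->ctb_cpu2cpt[cpu];	/* partition owning @cpu */

	if (cpt < 0)				/* CPU not in any partition */
		return NULL;

	return cptab->ctb_parts[cpt].cpt_cpumask;
}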
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/linux/linux-mem.h + * + * Basic library routines. + */ + +#ifndef __LIBCFS_LINUX_CFS_MEM_H__ +#define __LIBCFS_LINUX_CFS_MEM_H__ + +#ifndef __LIBCFS_LIBCFS_H__ +#error Do not #include this file directly. #include instead +#endif + +#include +#include +#include +#include +#include +#include + +#ifndef HAVE_LIBCFS_CPT +/* Need this for cfs_cpt_table */ +#include "../libcfs_cpu.h" +#endif + +#define CFS_PAGE_MASK (~((__u64)PAGE_CACHE_SIZE-1)) +#define page_index(p) ((p)->index) + +#define memory_pressure_get() (current->flags & PF_MEMALLOC) +#define memory_pressure_set() do { current->flags |= PF_MEMALLOC; } while (0) +#define memory_pressure_clr() do { current->flags &= ~PF_MEMALLOC; } while (0) + +#if BITS_PER_LONG == 32 +/* limit to lowmem on 32-bit systems */ +#define NUM_CACHEPAGES \ + min(totalram_pages, 1UL << (30 - PAGE_CACHE_SHIFT) * 3 / 4) +#else +#define NUM_CACHEPAGES totalram_pages +#endif + +#define DECL_MMSPACE mm_segment_t __oldfs +#define MMSPACE_OPEN \ + do { __oldfs = get_fs(); set_fs(get_ds()); } while (0) +#define MMSPACE_CLOSE set_fs(__oldfs) + +#endif /* __LINUX_CFS_MEM_H__ */ diff --git a/kernel/drivers/staging/lustre/include/linux/libcfs/linux/linux-time.h b/kernel/drivers/staging/lustre/include/linux/libcfs/linux/linux-time.h new file mode 100644 index 000000000..0fc490bac --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/libcfs/linux/linux-time.h @@ -0,0 +1,144 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/linux/linux-time.h + * + * Implementation of portable time API for Linux (kernel and user-level). + * + * Author: Nikita Danilov + */ + +#ifndef __LIBCFS_LINUX_LINUX_TIME_H__ +#define __LIBCFS_LINUX_LINUX_TIME_H__ + +#ifndef __LIBCFS_LIBCFS_H__ +#error Do not #include this file directly. #include instead +#endif + +#define ONE_BILLION ((u_int64_t)1000000000) +#define ONE_MILLION 1000000 + +#include +#include +#include +#include + +/* + * post 2.5 kernels. 
+ */ + +#include + + +static inline void cfs_fs_time_usec(struct timespec *t, struct timeval *v) +{ + v->tv_sec = t->tv_sec; + v->tv_usec = t->tv_nsec / 1000; +} + +/* + * Generic kernel stuff + */ + +static inline unsigned long cfs_time_current(void) +{ + return jiffies; +} + +static inline void cfs_fs_time_current(struct timespec *t) +{ + *t = CURRENT_TIME; +} + +static inline time_t cfs_fs_time_sec(struct timespec *t) +{ + return t->tv_sec; +} + +static inline long cfs_time_seconds(int seconds) +{ + return ((long)seconds) * HZ; +} + +static inline time_t cfs_duration_sec(long d) +{ + return d / HZ; +} + +static inline void cfs_duration_usec(long d, struct timeval *s) +{ +#if (BITS_PER_LONG == 32) && (HZ > 4096) + __u64 t; + + s->tv_sec = d / HZ; + t = (d - (long)s->tv_sec * HZ) * ONE_MILLION; + do_div(t, HZ); + s->tv_usec = t; +#else + s->tv_sec = d / HZ; + s->tv_usec = ((d - (long)s->tv_sec * HZ) * ONE_MILLION) / HZ; +#endif +} + +#define cfs_time_current_64 get_jiffies_64 + +static inline __u64 cfs_time_add_64(__u64 t, __u64 d) +{ + return t + d; +} + +static inline __u64 cfs_time_shift_64(int seconds) +{ + return cfs_time_add_64(cfs_time_current_64(), + cfs_time_seconds(seconds)); +} + +static inline int cfs_time_before_64(__u64 t1, __u64 t2) +{ + return (__s64)t2 - (__s64)t1 > 0; +} + +static inline int cfs_time_beforeq_64(__u64 t1, __u64 t2) +{ + return (__s64)t2 - (__s64)t1 >= 0; +} + +/* + * One jiffy + */ +#define CFS_TICK (1) + +#define CFS_TIME_T "%lu" +#define CFS_DURATION_T "%ld" + +#endif /* __LIBCFS_LINUX_LINUX_TIME_H__ */ diff --git a/kernel/drivers/staging/lustre/include/linux/lnet/api-support.h b/kernel/drivers/staging/lustre/include/linux/lnet/api-support.h new file mode 100644 index 000000000..8f7fa28b5 --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/lnet/api-support.h @@ -0,0 +1,44 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
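/*
 * Illustrative sketch (editorial example; deadline64_expired_example is a
 * hypothetical name): the 64-bit variants follow the same deadline pattern
 * as the jiffies helpers, and the subtraction-based comparison keeps the
 * test safe if the counter wraps.
 */
static inline int deadline64_expired_example(__u64 deadline)
{
	/* deadline was set earlier, e.g. deadline = cfs_time_shift_64(30); */
	return cfs_time_beforeq_64(deadline, cfs_time_current_64());
}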
+ */ + +#ifndef __LNET_API_SUPPORT_H__ +#define __LNET_API_SUPPORT_H__ + +#include "linux/api-support.h" + +#include "../libcfs/libcfs.h" +#include "types.h" +#include "lnet.h" + +#endif diff --git a/kernel/drivers/staging/lustre/include/linux/lnet/api.h b/kernel/drivers/staging/lustre/include/linux/lnet/api.h new file mode 100644 index 000000000..cd8651757 --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/lnet/api.h @@ -0,0 +1,217 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LNET_API_H__ +#define __LNET_API_H__ + +/** \defgroup lnet LNet + * + * The Lustre Networking subsystem. + * + * LNet is an asynchronous message-passing API, which provides an unreliable + * connectionless service that can't guarantee any order. It supports OFA IB, + * TCP/IP, and Cray Portals, and routes between heterogeneous networks. + * + * LNet can run both in OS kernel space and in userspace as a library. + * @{ + */ + +#include "../lnet/types.h" + +/** \defgroup lnet_init_fini Initialization and cleanup + * The LNet must be properly initialized before any LNet calls can be made. + * @{ */ +int LNetInit(void); +void LNetFini(void); + +int LNetNIInit(lnet_pid_t requested_pid); +int LNetNIFini(void); +/** @} lnet_init_fini */ + +/** \defgroup lnet_addr LNet addressing and basic types + * + * Addressing scheme and basic data types of LNet. + * + * The LNet API is memory-oriented, so LNet must be able to address not only + * end-points but also memory region within a process address space. + * An ::lnet_nid_t addresses an end-point. An ::lnet_pid_t identifies a process + * in a node. A portal represents an opening in the address space of a + * process. Match bits is criteria to identify a region of memory inside a + * portal, and offset specifies an offset within the memory region. + * + * LNet creates a table of portals for each process during initialization. + * This table has MAX_PORTALS entries and its size can't be dynamically + * changed. A portal stays empty until the owning process starts to add + * memory regions to it. A portal is sometimes called an index because + * it's an entry in the portals table of a process. 
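/*
 * Illustrative sketch (editorial example; lnet_startup_example is a
 * hypothetical name, and LUSTRE_LNET_PID from the linux libcfs header above
 * is used only as a sample pid): a kernel user initializes the API, brings
 * up a network interface, queries its own identity, and tears both down in
 * reverse order.
 */
static inline int lnet_startup_example(void)
{
	lnet_process_id_t id;
	int rc;

	rc = LNetInit();
	if (rc != 0)
		return rc;

	rc = LNetNIInit(LUSTRE_LNET_PID);
	if (rc != 0) {
		LNetFini();
		return rc;
	}

	rc = LNetGetId(0, &id);		/* NID/PID of the first local NI */

	LNetNIFini();
	LNetFini();
	return rc;
}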
+ * + * \see LNetMEAttach + * @{ */ +int LNetGetId(unsigned int index, lnet_process_id_t *id); +int LNetDist(lnet_nid_t nid, lnet_nid_t *srcnid, __u32 *order); +void LNetSnprintHandle(char *str, int str_len, lnet_handle_any_t handle); + +/** @} lnet_addr */ + +/** \defgroup lnet_me Match entries + * + * A match entry (abbreviated as ME) describes a set of criteria to accept + * incoming requests. + * + * A portal is essentially a match list plus a set of attributes. A match + * list is a chain of MEs. Each ME includes a pointer to a memory descriptor + * and a set of match criteria. The match criteria can be used to reject + * incoming requests based on process ID or the match bits provided in the + * request. MEs can be dynamically inserted into a match list by LNetMEAttach() + * and LNetMEInsert(), and removed from its list by LNetMEUnlink(). + * @{ */ +int LNetMEAttach(unsigned int portal, + lnet_process_id_t match_id_in, + __u64 match_bits_in, + __u64 ignore_bits_in, + lnet_unlink_t unlink_in, + lnet_ins_pos_t pos_in, + lnet_handle_me_t *handle_out); + +int LNetMEInsert(lnet_handle_me_t current_in, + lnet_process_id_t match_id_in, + __u64 match_bits_in, + __u64 ignore_bits_in, + lnet_unlink_t unlink_in, + lnet_ins_pos_t position_in, + lnet_handle_me_t *handle_out); + +int LNetMEUnlink(lnet_handle_me_t current_in); +/** @} lnet_me */ + +/** \defgroup lnet_md Memory descriptors + * + * A memory descriptor contains information about a region of a user's + * memory (either in kernel or user space) and optionally points to an + * event queue where information about the operations performed on the + * memory descriptor are recorded. Memory descriptor is abbreviated as + * MD and can be used interchangeably with the memory region it describes. + * + * The LNet API provides two operations to create MDs: LNetMDAttach() + * and LNetMDBind(); one operation to unlink and release the resources + * associated with a MD: LNetMDUnlink(). + * @{ */ +int LNetMDAttach(lnet_handle_me_t current_in, + lnet_md_t md_in, + lnet_unlink_t unlink_in, + lnet_handle_md_t *handle_out); + +int LNetMDBind(lnet_md_t md_in, + lnet_unlink_t unlink_in, + lnet_handle_md_t *handle_out); + +int LNetMDUnlink(lnet_handle_md_t md_in); +/** @} lnet_md */ + +/** \defgroup lnet_eq Events and event queues + * + * Event queues (abbreviated as EQ) are used to log operations performed on + * local MDs. In particular, they signal the completion of a data transmission + * into or out of a MD. They can also be used to hold acknowledgments for + * completed PUT operations and indicate when a MD has been unlinked. Multiple + * MDs can share a single EQ. An EQ may have an optional event handler + * associated with it. If an event handler exists, it will be run for each + * event that is deposited into the EQ. + * + * In addition to the lnet_handle_eq_t, the LNet API defines two types + * associated with events: The ::lnet_event_kind_t defines the kinds of events + * that can be stored in an EQ. The lnet_event_t defines a structure that + * holds the information about with an event. + * + * There are five functions for dealing with EQs: LNetEQAlloc() is used to + * create an EQ and allocate the resources needed, while LNetEQFree() + * releases these resources and free the EQ. LNetEQGet() retrieves the next + * event from an EQ, and LNetEQWait() can be used to block a process until + * an EQ has at least one event. LNetEQPoll() can be used to test or wait + * on multiple EQs. 
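+ *
+ * A minimal usage sketch (illustration only: the queue size of 64 is an
+ * arbitrary assumption, LNET_EQ_HANDLER_NONE is assumed to be defined in
+ * types.h, and error handling is elided):
+ * \code
+ *	lnet_handle_eq_t eqh;
+ *	lnet_event_t ev;
+ *
+ *	if (LNetEQAlloc(64, LNET_EQ_HANDLER_NONE, &eqh) == 0) {
+ *		LNetEQWait(eqh, &ev);
+ *		LNetEQFree(eqh);
+ *	}
+ * \endcode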
+ * @{ */ +int LNetEQAlloc(unsigned int count_in, + lnet_eq_handler_t handler, + lnet_handle_eq_t *handle_out); + +int LNetEQFree(lnet_handle_eq_t eventq_in); + +int LNetEQGet(lnet_handle_eq_t eventq_in, + lnet_event_t *event_out); + +int LNetEQWait(lnet_handle_eq_t eventq_in, + lnet_event_t *event_out); + +int LNetEQPoll(lnet_handle_eq_t *eventqs_in, + int neq_in, + int timeout_ms, + lnet_event_t *event_out, + int *which_eq_out); +/** @} lnet_eq */ + +/** \defgroup lnet_data Data movement operations + * + * The LNet API provides two data movement operations: LNetPut() + * and LNetGet(). + * @{ */ +int LNetPut(lnet_nid_t self, + lnet_handle_md_t md_in, + lnet_ack_req_t ack_req_in, + lnet_process_id_t target_in, + unsigned int portal_in, + __u64 match_bits_in, + unsigned int offset_in, + __u64 hdr_data_in); + +int LNetGet(lnet_nid_t self, + lnet_handle_md_t md_in, + lnet_process_id_t target_in, + unsigned int portal_in, + __u64 match_bits_in, + unsigned int offset_in); +/** @} lnet_data */ + +/** \defgroup lnet_misc Miscellaneous operations. + * Miscellaneous operations. + * @{ */ + +int LNetSetLazyPortal(int portal); +int LNetClearLazyPortal(int portal); +int LNetCtl(unsigned int cmd, void *arg); +int LNetSetAsync(lnet_process_id_t id, int nasync); + +/** @} lnet_misc */ + +/** @} lnet */ +#endif diff --git a/kernel/drivers/staging/lustre/include/linux/lnet/lib-lnet.h b/kernel/drivers/staging/lustre/include/linux/lnet/lib-lnet.h new file mode 100644 index 000000000..0038d29a3 --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/lnet/lib-lnet.h @@ -0,0 +1,883 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/include/lnet/lib-lnet.h + * + * Top level include for library side routines + */ + +#ifndef __LNET_LIB_LNET_H__ +#define __LNET_LIB_LNET_H__ + +#include "linux/lib-lnet.h" +#include "../libcfs/libcfs.h" +#include "types.h" +#include "lnet.h" +#include "lib-types.h" + +extern lnet_t the_lnet; /* THE network */ + +#if defined(LNET_USE_LIB_FREELIST) +/* 1 CPT, simplify implementation... 
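+ * (with LNET_CPT_MAX_BITS set to 0 just below, LNET_CPT_MAX works out to
+ * 1 << 0 == 1, so all freelist-backed resources live in a single global
+ * container)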
*/ +# define LNET_CPT_MAX_BITS 0 + +#else /* KERNEL and no freelist */ + +# if (BITS_PER_LONG == 32) +/* 2 CPTs, allowing more CPTs might make us under memory pressure */ +# define LNET_CPT_MAX_BITS 1 + +# else /* 64-bit system */ +/* + * 256 CPTs for thousands of CPUs, allowing more CPTs might make us + * under risk of consuming all lh_cookie. + */ +# define LNET_CPT_MAX_BITS 8 +# endif /* BITS_PER_LONG == 32 */ +#endif + +/* max allowed CPT number */ +#define LNET_CPT_MAX (1 << LNET_CPT_MAX_BITS) + +#define LNET_CPT_NUMBER (the_lnet.ln_cpt_number) +#define LNET_CPT_BITS (the_lnet.ln_cpt_bits) +#define LNET_CPT_MASK ((1ULL << LNET_CPT_BITS) - 1) + +/** exclusive lock */ +#define LNET_LOCK_EX CFS_PERCPT_LOCK_EX + +static inline int lnet_is_wire_handle_none(lnet_handle_wire_t *wh) +{ + return (wh->wh_interface_cookie == LNET_WIRE_HANDLE_COOKIE_NONE && + wh->wh_object_cookie == LNET_WIRE_HANDLE_COOKIE_NONE); +} + +static inline int lnet_md_exhausted(lnet_libmd_t *md) +{ + return (md->md_threshold == 0 || + ((md->md_options & LNET_MD_MAX_SIZE) != 0 && + md->md_offset + md->md_max_size > md->md_length)); +} + +static inline int lnet_md_unlinkable(lnet_libmd_t *md) +{ + /* Should unlink md when its refcount is 0 and either: + * - md has been flagged for deletion (by auto unlink or + * LNetM[DE]Unlink, in the latter case md may not be exhausted). + * - auto unlink is on and md is exhausted. + */ + if (md->md_refcount != 0) + return 0; + + if ((md->md_flags & LNET_MD_FLAG_ZOMBIE) != 0) + return 1; + + return ((md->md_flags & LNET_MD_FLAG_AUTO_UNLINK) != 0 && + lnet_md_exhausted(md)); +} + +#define lnet_cpt_table() (the_lnet.ln_cpt_table) +#define lnet_cpt_current() cfs_cpt_current(the_lnet.ln_cpt_table, 1) + +static inline int +lnet_cpt_of_cookie(__u64 cookie) +{ + unsigned int cpt = (cookie >> LNET_COOKIE_TYPE_BITS) & LNET_CPT_MASK; + + /* LNET_CPT_NUMBER doesn't have to be power2, which means we can + * get illegal cpt from it's invalid cookie */ + return cpt < LNET_CPT_NUMBER ? 
cpt : cpt % LNET_CPT_NUMBER; +} + +static inline void +lnet_res_lock(int cpt) +{ + cfs_percpt_lock(the_lnet.ln_res_lock, cpt); +} + +static inline void +lnet_res_unlock(int cpt) +{ + cfs_percpt_unlock(the_lnet.ln_res_lock, cpt); +} + +static inline int +lnet_res_lock_current(void) +{ + int cpt = lnet_cpt_current(); + + lnet_res_lock(cpt); + return cpt; +} + +static inline void +lnet_net_lock(int cpt) +{ + cfs_percpt_lock(the_lnet.ln_net_lock, cpt); +} + +static inline void +lnet_net_unlock(int cpt) +{ + cfs_percpt_unlock(the_lnet.ln_net_lock, cpt); +} + +static inline int +lnet_net_lock_current(void) +{ + int cpt = lnet_cpt_current(); + + lnet_net_lock(cpt); + return cpt; +} + +#define LNET_LOCK() lnet_net_lock(LNET_LOCK_EX) +#define LNET_UNLOCK() lnet_net_unlock(LNET_LOCK_EX) + +#define lnet_ptl_lock(ptl) spin_lock(&(ptl)->ptl_lock) +#define lnet_ptl_unlock(ptl) spin_unlock(&(ptl)->ptl_lock) +#define lnet_eq_wait_lock() spin_lock(&the_lnet.ln_eq_wait_lock) +#define lnet_eq_wait_unlock() spin_unlock(&the_lnet.ln_eq_wait_lock) +#define lnet_ni_lock(ni) spin_lock(&(ni)->ni_lock) +#define lnet_ni_unlock(ni) spin_unlock(&(ni)->ni_lock) +#define LNET_MUTEX_LOCK(m) mutex_lock(m) +#define LNET_MUTEX_UNLOCK(m) mutex_unlock(m) + +#define MAX_PORTALS 64 + +/* these are only used by code with LNET_USE_LIB_FREELIST, but we still + * exported them to !LNET_USE_LIB_FREELIST for easy implementation */ +#define LNET_FL_MAX_MES 2048 +#define LNET_FL_MAX_MDS 2048 +#define LNET_FL_MAX_EQS 512 +#define LNET_FL_MAX_MSGS 2048 /* Outstanding messages */ + +#ifdef LNET_USE_LIB_FREELIST + +int lnet_freelist_init(lnet_freelist_t *fl, int n, int size); +void lnet_freelist_fini(lnet_freelist_t *fl); + +static inline void * +lnet_freelist_alloc(lnet_freelist_t *fl) +{ + /* ALWAYS called with liblock held */ + lnet_freeobj_t *o; + + if (list_empty(&fl->fl_list)) + return NULL; + + o = list_entry(fl->fl_list.next, lnet_freeobj_t, fo_list); + list_del(&o->fo_list); + return (void *)&o->fo_contents; +} + +static inline void +lnet_freelist_free(lnet_freelist_t *fl, void *obj) +{ + /* ALWAYS called with liblock held */ + lnet_freeobj_t *o = list_entry(obj, lnet_freeobj_t, fo_contents); + + list_add(&o->fo_list, &fl->fl_list); +} + +static inline lnet_eq_t * +lnet_eq_alloc(void) +{ + /* NEVER called with resource lock held */ + struct lnet_res_container *rec = &the_lnet.ln_eq_container; + lnet_eq_t *eq; + + LASSERT(LNET_CPT_NUMBER == 1); + + lnet_res_lock(0); + eq = (lnet_eq_t *)lnet_freelist_alloc(&rec->rec_freelist); + lnet_res_unlock(0); + + return eq; +} + +static inline void +lnet_eq_free_locked(lnet_eq_t *eq) +{ + /* ALWAYS called with resource lock held */ + struct lnet_res_container *rec = &the_lnet.ln_eq_container; + + LASSERT(LNET_CPT_NUMBER == 1); + lnet_freelist_free(&rec->rec_freelist, eq); +} + +static inline void +lnet_eq_free(lnet_eq_t *eq) +{ + lnet_res_lock(0); + lnet_eq_free_locked(eq); + lnet_res_unlock(0); +} + +static inline lnet_libmd_t * +lnet_md_alloc(lnet_md_t *umd) +{ + /* NEVER called with resource lock held */ + struct lnet_res_container *rec = the_lnet.ln_md_containers[0]; + lnet_libmd_t *md; + + LASSERT(LNET_CPT_NUMBER == 1); + + lnet_res_lock(0); + md = (lnet_libmd_t *)lnet_freelist_alloc(&rec->rec_freelist); + lnet_res_unlock(0); + + if (md != NULL) + INIT_LIST_HEAD(&md->md_list); + + return md; +} + +static inline void +lnet_md_free_locked(lnet_libmd_t *md) +{ + /* ALWAYS called with resource lock held */ + struct lnet_res_container *rec = the_lnet.ln_md_containers[0]; + + 
LASSERT(LNET_CPT_NUMBER == 1); + lnet_freelist_free(&rec->rec_freelist, md); +} + +static inline void +lnet_md_free(lnet_libmd_t *md) +{ + lnet_res_lock(0); + lnet_md_free_locked(md); + lnet_res_unlock(0); +} + +static inline lnet_me_t * +lnet_me_alloc(void) +{ + /* NEVER called with resource lock held */ + struct lnet_res_container *rec = the_lnet.ln_me_containers[0]; + lnet_me_t *me; + + LASSERT(LNET_CPT_NUMBER == 1); + + lnet_res_lock(0); + me = (lnet_me_t *)lnet_freelist_alloc(&rec->rec_freelist); + lnet_res_unlock(0); + + return me; +} + +static inline void +lnet_me_free_locked(lnet_me_t *me) +{ + /* ALWAYS called with resource lock held */ + struct lnet_res_container *rec = the_lnet.ln_me_containers[0]; + + LASSERT(LNET_CPT_NUMBER == 1); + lnet_freelist_free(&rec->rec_freelist, me); +} + +static inline void +lnet_me_free(lnet_me_t *me) +{ + lnet_res_lock(0); + lnet_me_free_locked(me); + lnet_res_unlock(0); +} + +static inline lnet_msg_t * +lnet_msg_alloc(void) +{ + /* NEVER called with network lock held */ + struct lnet_msg_container *msc = the_lnet.ln_msg_containers[0]; + lnet_msg_t *msg; + + LASSERT(LNET_CPT_NUMBER == 1); + + lnet_net_lock(0); + msg = (lnet_msg_t *)lnet_freelist_alloc(&msc->msc_freelist); + lnet_net_unlock(0); + + if (msg != NULL) { + /* NULL pointers, clear flags etc */ + memset(msg, 0, sizeof(*msg)); + } + return msg; +} + +static inline void +lnet_msg_free_locked(lnet_msg_t *msg) +{ + /* ALWAYS called with network lock held */ + struct lnet_msg_container *msc = the_lnet.ln_msg_containers[0]; + + LASSERT(LNET_CPT_NUMBER == 1); + LASSERT(!msg->msg_onactivelist); + lnet_freelist_free(&msc->msc_freelist, msg); +} + +static inline void +lnet_msg_free(lnet_msg_t *msg) +{ + lnet_net_lock(0); + lnet_msg_free_locked(msg); + lnet_net_unlock(0); +} + +#else /* !LNET_USE_LIB_FREELIST */ + +static inline lnet_eq_t * +lnet_eq_alloc(void) +{ + /* NEVER called with liblock held */ + lnet_eq_t *eq; + + LIBCFS_ALLOC(eq, sizeof(*eq)); + return eq; +} + +static inline void +lnet_eq_free(lnet_eq_t *eq) +{ + /* ALWAYS called with resource lock held */ + LIBCFS_FREE(eq, sizeof(*eq)); +} + +static inline lnet_libmd_t * +lnet_md_alloc(lnet_md_t *umd) +{ + /* NEVER called with liblock held */ + lnet_libmd_t *md; + unsigned int size; + unsigned int niov; + + if ((umd->options & LNET_MD_KIOV) != 0) { + niov = umd->length; + size = offsetof(lnet_libmd_t, md_iov.kiov[niov]); + } else { + niov = ((umd->options & LNET_MD_IOVEC) != 0) ? 
+ umd->length : 1; + size = offsetof(lnet_libmd_t, md_iov.iov[niov]); + } + + LIBCFS_ALLOC(md, size); + + if (md != NULL) { + /* Set here in case of early free */ + md->md_options = umd->options; + md->md_niov = niov; + INIT_LIST_HEAD(&md->md_list); + } + + return md; +} + +static inline void +lnet_md_free(lnet_libmd_t *md) +{ + /* ALWAYS called with resource lock held */ + unsigned int size; + + if ((md->md_options & LNET_MD_KIOV) != 0) + size = offsetof(lnet_libmd_t, md_iov.kiov[md->md_niov]); + else + size = offsetof(lnet_libmd_t, md_iov.iov[md->md_niov]); + + LIBCFS_FREE(md, size); +} + +static inline lnet_me_t * +lnet_me_alloc(void) +{ + /* NEVER called with liblock held */ + lnet_me_t *me; + + LIBCFS_ALLOC(me, sizeof(*me)); + return me; +} + +static inline void +lnet_me_free(lnet_me_t *me) +{ + /* ALWAYS called with resource lock held */ + LIBCFS_FREE(me, sizeof(*me)); +} + +static inline lnet_msg_t * +lnet_msg_alloc(void) +{ + /* NEVER called with liblock held */ + lnet_msg_t *msg; + + LIBCFS_ALLOC(msg, sizeof(*msg)); + + /* no need to zero, LIBCFS_ALLOC does for us */ + return msg; +} + +static inline void +lnet_msg_free(lnet_msg_t *msg) +{ + /* ALWAYS called with network lock held */ + LASSERT(!msg->msg_onactivelist); + LIBCFS_FREE(msg, sizeof(*msg)); +} + +#define lnet_eq_free_locked(eq) lnet_eq_free(eq) +#define lnet_md_free_locked(md) lnet_md_free(md) +#define lnet_me_free_locked(me) lnet_me_free(me) +#define lnet_msg_free_locked(msg) lnet_msg_free(msg) + +#endif /* LNET_USE_LIB_FREELIST */ + +lnet_libhandle_t *lnet_res_lh_lookup(struct lnet_res_container *rec, + __u64 cookie); +void lnet_res_lh_initialize(struct lnet_res_container *rec, + lnet_libhandle_t *lh); +static inline void +lnet_res_lh_invalidate(lnet_libhandle_t *lh) +{ + /* ALWAYS called with resource lock held */ + /* NB: cookie is still useful, don't reset it */ + list_del(&lh->lh_hash_chain); +} + +static inline void +lnet_eq2handle(lnet_handle_eq_t *handle, lnet_eq_t *eq) +{ + if (eq == NULL) { + LNetInvalidateHandle(handle); + return; + } + + handle->cookie = eq->eq_lh.lh_cookie; +} + +static inline lnet_eq_t * +lnet_handle2eq(lnet_handle_eq_t *handle) +{ + /* ALWAYS called with resource lock held */ + lnet_libhandle_t *lh; + + lh = lnet_res_lh_lookup(&the_lnet.ln_eq_container, handle->cookie); + if (lh == NULL) + return NULL; + + return lh_entry(lh, lnet_eq_t, eq_lh); +} + +static inline void +lnet_md2handle(lnet_handle_md_t *handle, lnet_libmd_t *md) +{ + handle->cookie = md->md_lh.lh_cookie; +} + +static inline lnet_libmd_t * +lnet_handle2md(lnet_handle_md_t *handle) +{ + /* ALWAYS called with resource lock held */ + lnet_libhandle_t *lh; + int cpt; + + cpt = lnet_cpt_of_cookie(handle->cookie); + lh = lnet_res_lh_lookup(the_lnet.ln_md_containers[cpt], + handle->cookie); + if (lh == NULL) + return NULL; + + return lh_entry(lh, lnet_libmd_t, md_lh); +} + +static inline lnet_libmd_t * +lnet_wire_handle2md(lnet_handle_wire_t *wh) +{ + /* ALWAYS called with resource lock held */ + lnet_libhandle_t *lh; + int cpt; + + if (wh->wh_interface_cookie != the_lnet.ln_interface_cookie) + return NULL; + + cpt = lnet_cpt_of_cookie(wh->wh_object_cookie); + lh = lnet_res_lh_lookup(the_lnet.ln_md_containers[cpt], + wh->wh_object_cookie); + if (lh == NULL) + return NULL; + + return lh_entry(lh, lnet_libmd_t, md_lh); +} + +static inline void +lnet_me2handle(lnet_handle_me_t *handle, lnet_me_t *me) +{ + handle->cookie = me->me_lh.lh_cookie; +} + +static inline lnet_me_t * +lnet_handle2me(lnet_handle_me_t *handle) +{ + /* ALWAYS 
called with resource lock held */ + lnet_libhandle_t *lh; + int cpt; + + cpt = lnet_cpt_of_cookie(handle->cookie); + lh = lnet_res_lh_lookup(the_lnet.ln_me_containers[cpt], + handle->cookie); + if (lh == NULL) + return NULL; + + return lh_entry(lh, lnet_me_t, me_lh); +} + +static inline void +lnet_peer_addref_locked(lnet_peer_t *lp) +{ + LASSERT(lp->lp_refcount > 0); + lp->lp_refcount++; +} + +void lnet_destroy_peer_locked(lnet_peer_t *lp); + +static inline void +lnet_peer_decref_locked(lnet_peer_t *lp) +{ + LASSERT(lp->lp_refcount > 0); + lp->lp_refcount--; + if (lp->lp_refcount == 0) + lnet_destroy_peer_locked(lp); +} + +static inline int +lnet_isrouter(lnet_peer_t *lp) +{ + return lp->lp_rtr_refcount != 0; +} + +static inline void +lnet_ni_addref_locked(lnet_ni_t *ni, int cpt) +{ + LASSERT(cpt >= 0 && cpt < LNET_CPT_NUMBER); + LASSERT(*ni->ni_refs[cpt] >= 0); + + (*ni->ni_refs[cpt])++; +} + +static inline void +lnet_ni_addref(lnet_ni_t *ni) +{ + lnet_net_lock(0); + lnet_ni_addref_locked(ni, 0); + lnet_net_unlock(0); +} + +static inline void +lnet_ni_decref_locked(lnet_ni_t *ni, int cpt) +{ + LASSERT(cpt >= 0 && cpt < LNET_CPT_NUMBER); + LASSERT(*ni->ni_refs[cpt] > 0); + + (*ni->ni_refs[cpt])--; +} + +static inline void +lnet_ni_decref(lnet_ni_t *ni) +{ + lnet_net_lock(0); + lnet_ni_decref_locked(ni, 0); + lnet_net_unlock(0); +} + +void lnet_ni_free(lnet_ni_t *ni); + +static inline int +lnet_nid2peerhash(lnet_nid_t nid) +{ + return hash_long(nid, LNET_PEER_HASH_BITS); +} + +static inline struct list_head * +lnet_net2rnethash(__u32 net) +{ + return &the_lnet.ln_remote_nets_hash[(LNET_NETNUM(net) + + LNET_NETTYP(net)) & + ((1U << the_lnet.ln_remote_nets_hbits) - 1)]; +} + +extern lnd_t the_lolnd; +extern int avoid_asym_router_failure; + +int lnet_cpt_of_nid_locked(lnet_nid_t nid); +int lnet_cpt_of_nid(lnet_nid_t nid); +lnet_ni_t *lnet_nid2ni_locked(lnet_nid_t nid, int cpt); +lnet_ni_t *lnet_net2ni_locked(__u32 net, int cpt); +lnet_ni_t *lnet_net2ni(__u32 net); + +int lnet_notify(lnet_ni_t *ni, lnet_nid_t peer, int alive, unsigned long when); +void lnet_notify_locked(lnet_peer_t *lp, int notifylnd, int alive, + unsigned long when); +int lnet_add_route(__u32 net, unsigned int hops, lnet_nid_t gateway_nid, + unsigned int priority); +int lnet_check_routes(void); +int lnet_del_route(__u32 net, lnet_nid_t gw_nid); +void lnet_destroy_routes(void); +int lnet_get_route(int idx, __u32 *net, __u32 *hops, + lnet_nid_t *gateway, __u32 *alive, __u32 *priority); +void lnet_proc_init(void); +void lnet_proc_fini(void); +int lnet_rtrpools_alloc(int im_a_router); +void lnet_rtrpools_free(void); +lnet_remotenet_t *lnet_find_net_locked(__u32 net); + +int lnet_islocalnid(lnet_nid_t nid); +int lnet_islocalnet(__u32 net); + +void lnet_msg_attach_md(lnet_msg_t *msg, lnet_libmd_t *md, + unsigned int offset, unsigned int mlen); +void lnet_msg_detach_md(lnet_msg_t *msg, int status); +void lnet_build_unlink_event(lnet_libmd_t *md, lnet_event_t *ev); +void lnet_build_msg_event(lnet_msg_t *msg, lnet_event_kind_t ev_type); +void lnet_msg_commit(lnet_msg_t *msg, int cpt); +void lnet_msg_decommit(lnet_msg_t *msg, int cpt, int status); + +void lnet_eq_enqueue_event(lnet_eq_t *eq, lnet_event_t *ev); +void lnet_prep_send(lnet_msg_t *msg, int type, lnet_process_id_t target, + unsigned int offset, unsigned int len); +int lnet_send(lnet_nid_t nid, lnet_msg_t *msg, lnet_nid_t rtr_nid); +void lnet_return_tx_credits_locked(lnet_msg_t *msg); +void lnet_return_rx_credits_locked(lnet_msg_t *msg); + +/* portals functions */ +/* portals 
attributes */ +static inline int +lnet_ptl_is_lazy(lnet_portal_t *ptl) +{ + return !!(ptl->ptl_options & LNET_PTL_LAZY); +} + +static inline int +lnet_ptl_is_unique(lnet_portal_t *ptl) +{ + return !!(ptl->ptl_options & LNET_PTL_MATCH_UNIQUE); +} + +static inline int +lnet_ptl_is_wildcard(lnet_portal_t *ptl) +{ + return !!(ptl->ptl_options & LNET_PTL_MATCH_WILDCARD); +} + +static inline void +lnet_ptl_setopt(lnet_portal_t *ptl, int opt) +{ + ptl->ptl_options |= opt; +} + +static inline void +lnet_ptl_unsetopt(lnet_portal_t *ptl, int opt) +{ + ptl->ptl_options &= ~opt; +} + +/* match-table functions */ +struct list_head *lnet_mt_match_head(struct lnet_match_table *mtable, + lnet_process_id_t id, __u64 mbits); +struct lnet_match_table *lnet_mt_of_attach(unsigned int index, + lnet_process_id_t id, __u64 mbits, + __u64 ignore_bits, + lnet_ins_pos_t pos); +int lnet_mt_match_md(struct lnet_match_table *mtable, + struct lnet_match_info *info, struct lnet_msg *msg); + +/* portals match/attach functions */ +void lnet_ptl_attach_md(lnet_me_t *me, lnet_libmd_t *md, + struct list_head *matches, struct list_head *drops); +void lnet_ptl_detach_md(lnet_me_t *me, lnet_libmd_t *md); +int lnet_ptl_match_md(struct lnet_match_info *info, struct lnet_msg *msg); + +/* initialized and finalize portals */ +int lnet_portals_create(void); +void lnet_portals_destroy(void); + +/* message functions */ +int lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, + lnet_nid_t fromnid, void *private, int rdma_req); +void lnet_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed, + unsigned int offset, unsigned int mlen, unsigned int rlen); +lnet_msg_t *lnet_create_reply_msg(lnet_ni_t *ni, lnet_msg_t *get_msg); +void lnet_set_reply_msg_len(lnet_ni_t *ni, lnet_msg_t *msg, unsigned int len); +void lnet_finalize(lnet_ni_t *ni, lnet_msg_t *msg, int rc); +void lnet_drop_delayed_msg_list(struct list_head *head, char *reason); +void lnet_recv_delayed_msg_list(struct list_head *head); + +int lnet_msg_container_setup(struct lnet_msg_container *container, int cpt); +void lnet_msg_container_cleanup(struct lnet_msg_container *container); +void lnet_msg_containers_destroy(void); +int lnet_msg_containers_create(void); + +char *lnet_msgtyp2str(int type); +void lnet_print_hdr(lnet_hdr_t *hdr); +int lnet_fail_nid(lnet_nid_t nid, unsigned int threshold); + +void lnet_counters_get(lnet_counters_t *counters); +void lnet_counters_reset(void); + +unsigned int lnet_iov_nob(unsigned int niov, struct kvec *iov); +int lnet_extract_iov(int dst_niov, struct kvec *dst, + int src_niov, struct kvec *src, + unsigned int offset, unsigned int len); + +unsigned int lnet_kiov_nob(unsigned int niov, lnet_kiov_t *iov); +int lnet_extract_kiov(int dst_niov, lnet_kiov_t *dst, + int src_niov, lnet_kiov_t *src, + unsigned int offset, unsigned int len); + +void lnet_copy_iov2iov(unsigned int ndiov, struct kvec *diov, + unsigned int doffset, + unsigned int nsiov, struct kvec *siov, + unsigned int soffset, unsigned int nob); +void lnet_copy_kiov2iov(unsigned int niov, struct kvec *iov, + unsigned int iovoffset, + unsigned int nkiov, lnet_kiov_t *kiov, + unsigned int kiovoffset, unsigned int nob); +void lnet_copy_iov2kiov(unsigned int nkiov, lnet_kiov_t *kiov, + unsigned int kiovoffset, + unsigned int niov, struct kvec *iov, + unsigned int iovoffset, unsigned int nob); +void lnet_copy_kiov2kiov(unsigned int ndkiov, lnet_kiov_t *dkiov, + unsigned int doffset, + unsigned int nskiov, lnet_kiov_t *skiov, + unsigned int soffset, unsigned int nob); + +static inline void 
+lnet_copy_iov2flat(int dlen, void *dest, unsigned int doffset, + unsigned int nsiov, struct kvec *siov, unsigned int soffset, + unsigned int nob) +{ + struct kvec diov = {/*.iov_base = */ dest, /*.iov_len = */ dlen}; + + lnet_copy_iov2iov(1, &diov, doffset, + nsiov, siov, soffset, nob); +} + +static inline void +lnet_copy_kiov2flat(int dlen, void *dest, unsigned int doffset, + unsigned int nsiov, lnet_kiov_t *skiov, + unsigned int soffset, unsigned int nob) +{ + struct kvec diov = {/* .iov_base = */ dest, /* .iov_len = */ dlen}; + + lnet_copy_kiov2iov(1, &diov, doffset, + nsiov, skiov, soffset, nob); +} + +static inline void +lnet_copy_flat2iov(unsigned int ndiov, struct kvec *diov, unsigned int doffset, + int slen, void *src, unsigned int soffset, unsigned int nob) +{ + struct kvec siov = {/*.iov_base = */ src, /*.iov_len = */slen}; + + lnet_copy_iov2iov(ndiov, diov, doffset, + 1, &siov, soffset, nob); +} + +static inline void +lnet_copy_flat2kiov(unsigned int ndiov, lnet_kiov_t *dkiov, + unsigned int doffset, int slen, void *src, + unsigned int soffset, unsigned int nob) +{ + struct kvec siov = {/* .iov_base = */ src, /* .iov_len = */ slen}; + + lnet_copy_iov2kiov(ndiov, dkiov, doffset, + 1, &siov, soffset, nob); +} + +void lnet_me_unlink(lnet_me_t *me); + +void lnet_md_unlink(lnet_libmd_t *md); +void lnet_md_deconstruct(lnet_libmd_t *lmd, lnet_md_t *umd); + +void lnet_register_lnd(lnd_t *lnd); +void lnet_unregister_lnd(lnd_t *lnd); +int lnet_set_ip_niaddr(lnet_ni_t *ni); + +int lnet_connect(struct socket **sockp, lnet_nid_t peer_nid, + __u32 local_ip, __u32 peer_ip, int peer_port); +void lnet_connect_console_error(int rc, lnet_nid_t peer_nid, + __u32 peer_ip, int port); +int lnet_count_acceptor_nis(void); +int lnet_acceptor_timeout(void); +int lnet_acceptor_port(void); + +int lnet_count_acceptor_nis(void); +int lnet_acceptor_port(void); + +int lnet_acceptor_start(void); +void lnet_acceptor_stop(void); + +void lnet_get_tunables(void); +int lnet_peers_start_down(void); +int lnet_peer_buffer_credits(lnet_ni_t *ni); + +int lnet_router_checker_start(void); +void lnet_router_checker_stop(void); +void lnet_router_ni_update_locked(lnet_peer_t *gw, __u32 net); +void lnet_swap_pinginfo(lnet_ping_info_t *info); + +int lnet_ping_target_init(void); +void lnet_ping_target_fini(void); +int lnet_ping(lnet_process_id_t id, int timeout_ms, + lnet_process_id_t *ids, int n_ids); + +int lnet_parse_ip2nets(char **networksp, char *ip2nets); +int lnet_parse_routes(char *route_str, int *im_a_router); +int lnet_parse_networks(struct list_head *nilist, char *networks); + +int lnet_nid2peer_locked(lnet_peer_t **lpp, lnet_nid_t nid, int cpt); +lnet_peer_t *lnet_find_peer_locked(struct lnet_peer_table *ptable, + lnet_nid_t nid); +void lnet_peer_tables_cleanup(void); +void lnet_peer_tables_destroy(void); +int lnet_peer_tables_create(void); +void lnet_debug_peer(lnet_nid_t nid); + +static inline void lnet_peer_set_alive(lnet_peer_t *lp) +{ + lp->lp_last_alive = lp->lp_last_query = get_seconds(); + if (!lp->lp_alive) + lnet_notify_locked(lp, 0, 1, lp->lp_last_alive); +} + + +#endif diff --git a/kernel/drivers/staging/lustre/include/linux/lnet/lib-types.h b/kernel/drivers/staging/lustre/include/linux/lnet/lib-types.h new file mode 100644 index 000000000..50537668f --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/lnet/lib-types.h @@ -0,0 +1,760 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/include/lnet/lib-types.h + * + * Types used by the library side routines that do not need to be + * exposed to the user application + */ + +#ifndef __LNET_LIB_TYPES_H__ +#define __LNET_LIB_TYPES_H__ + +#include "linux/lib-types.h" + +#include "../libcfs/libcfs.h" +#include +#include "types.h" + +#define WIRE_ATTR __attribute__((packed)) + +/* Packed version of lnet_process_id_t to transfer via network */ +typedef struct { + lnet_nid_t nid; + lnet_pid_t pid; /* node id / process id */ +} WIRE_ATTR lnet_process_id_packed_t; + +/* The wire handle's interface cookie only matches one network interface in + * one epoch (i.e. new cookie when the interface restarts or the node + * reboots). The object cookie only matches one object on that interface + * during that object's lifetime (i.e. no cookie re-use). */ +typedef struct { + __u64 wh_interface_cookie; + __u64 wh_object_cookie; +} WIRE_ATTR lnet_handle_wire_t; + +typedef enum { + LNET_MSG_ACK = 0, + LNET_MSG_PUT, + LNET_MSG_GET, + LNET_MSG_REPLY, + LNET_MSG_HELLO, +} lnet_msg_type_t; + +/* The variant fields of the portals message header are aligned on an 8 + * byte boundary in the message header. Note that all types used in these + * wire structs MUST be fixed size and the smaller types are placed at the + * end. 
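+ * For example, in lnet_put_t below the 16-byte wire handle and the two
+ * __u64 fields come first, and the two __u32 fields (ptl_index, offset)
+ * are placed last, so every 64-bit member stays naturally aligned with
+ * no implicit padding.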
*/ +typedef struct lnet_ack { + lnet_handle_wire_t dst_wmd; + __u64 match_bits; + __u32 mlength; +} WIRE_ATTR lnet_ack_t; + +typedef struct lnet_put { + lnet_handle_wire_t ack_wmd; + __u64 match_bits; + __u64 hdr_data; + __u32 ptl_index; + __u32 offset; +} WIRE_ATTR lnet_put_t; + +typedef struct lnet_get { + lnet_handle_wire_t return_wmd; + __u64 match_bits; + __u32 ptl_index; + __u32 src_offset; + __u32 sink_length; +} WIRE_ATTR lnet_get_t; + +typedef struct lnet_reply { + lnet_handle_wire_t dst_wmd; +} WIRE_ATTR lnet_reply_t; + +typedef struct lnet_hello { + __u64 incarnation; + __u32 type; +} WIRE_ATTR lnet_hello_t; + +typedef struct { + lnet_nid_t dest_nid; + lnet_nid_t src_nid; + lnet_pid_t dest_pid; + lnet_pid_t src_pid; + __u32 type; /* lnet_msg_type_t */ + __u32 payload_length; /* payload data to follow */ + /*<------__u64 aligned------->*/ + union { + lnet_ack_t ack; + lnet_put_t put; + lnet_get_t get; + lnet_reply_t reply; + lnet_hello_t hello; + } msg; +} WIRE_ATTR lnet_hdr_t; + +/* A HELLO message contains a magic number and protocol version + * code in the header's dest_nid, the peer's NID in the src_nid, and + * LNET_MSG_HELLO in the type field. All other common fields are zero + * (including payload_size; i.e. no payload). + * This is for use by byte-stream LNDs (e.g. TCP/IP) to check the peer is + * running the same protocol and to find out its NID. These LNDs should + * exchange HELLO messages when a connection is first established. Individual + * LNDs can put whatever else they fancy in lnet_hdr_t::msg. + */ +typedef struct { + __u32 magic; /* LNET_PROTO_TCP_MAGIC */ + __u16 version_major; /* increment on incompatible change */ + __u16 version_minor; /* increment on compatible change */ +} WIRE_ATTR lnet_magicversion_t; + +/* PROTO MAGIC for LNDs */ +#define LNET_PROTO_IB_MAGIC 0x0be91b91 +#define LNET_PROTO_RA_MAGIC 0x0be91b92 +#define LNET_PROTO_QSW_MAGIC 0x0be91b93 +#define LNET_PROTO_GNI_MAGIC 0xb00fbabe /* ask Kim */ +#define LNET_PROTO_TCP_MAGIC 0xeebc0ded +#define LNET_PROTO_PTL_MAGIC 0x50746C4E /* 'PtlN' unique magic */ +#define LNET_PROTO_MX_MAGIC 0x4d583130 /* 'MX10'! */ +#define LNET_PROTO_ACCEPTOR_MAGIC 0xacce7100 +#define LNET_PROTO_PING_MAGIC 0x70696E67 /* 'ping' */ + +/* Placeholder for a future "unified" protocol across all LNDs */ +/* Current LNDs that receive a request with this magic will respond with a + * "stub" reply using their current protocol */ +#define LNET_PROTO_MAGIC 0x45726963 /* ! 
*/ + +#define LNET_PROTO_TCP_VERSION_MAJOR 1 +#define LNET_PROTO_TCP_VERSION_MINOR 0 + +/* Acceptor connection request */ +typedef struct { + __u32 acr_magic; /* PTL_ACCEPTOR_PROTO_MAGIC */ + __u32 acr_version; /* protocol version */ + __u64 acr_nid; /* target NID */ +} WIRE_ATTR lnet_acceptor_connreq_t; + +#define LNET_PROTO_ACCEPTOR_VERSION 1 + +/* forward refs */ +struct lnet_libmd; + +typedef struct lnet_msg { + struct list_head msg_activelist; + struct list_head msg_list; /* Q for credits/MD */ + + lnet_process_id_t msg_target; + /* where is it from, it's only for building event */ + lnet_nid_t msg_from; + __u32 msg_type; + + /* committed for sending */ + unsigned int msg_tx_committed:1; + /* CPT # this message committed for sending */ + unsigned int msg_tx_cpt:15; + /* committed for receiving */ + unsigned int msg_rx_committed:1; + /* CPT # this message committed for receiving */ + unsigned int msg_rx_cpt:15; + /* queued for tx credit */ + unsigned int msg_tx_delayed:1; + /* queued for RX buffer */ + unsigned int msg_rx_delayed:1; + /* ready for pending on RX delay list */ + unsigned int msg_rx_ready_delay:1; + + unsigned int msg_vmflush:1; /* VM trying to free memory */ + unsigned int msg_target_is_router:1; /* sending to a router */ + unsigned int msg_routing:1; /* being forwarded */ + unsigned int msg_ack:1; /* ack on finalize (PUT) */ + unsigned int msg_sending:1; /* outgoing message */ + unsigned int msg_receiving:1; /* being received */ + unsigned int msg_txcredit:1; /* taken an NI send credit */ + unsigned int msg_peertxcredit:1; /* taken a peer send credit */ + unsigned int msg_rtrcredit:1; /* taken a global router credit */ + unsigned int msg_peerrtrcredit:1; /* taken a peer router credit */ + unsigned int msg_onactivelist:1; /* on the activelist */ + + struct lnet_peer *msg_txpeer; /* peer I'm sending to */ + struct lnet_peer *msg_rxpeer; /* peer I received from */ + + void *msg_private; + struct lnet_libmd *msg_md; + + unsigned int msg_len; + unsigned int msg_wanted; + unsigned int msg_offset; + unsigned int msg_niov; + struct kvec *msg_iov; + lnet_kiov_t *msg_kiov; + + lnet_event_t msg_ev; + lnet_hdr_t msg_hdr; +} lnet_msg_t; + +typedef struct lnet_libhandle { + struct list_head lh_hash_chain; + __u64 lh_cookie; +} lnet_libhandle_t; + +#define lh_entry(ptr, type, member) \ + ((type *)((char *)(ptr)-(char *)(&((type *)0)->member))) + +typedef struct lnet_eq { + struct list_head eq_list; + lnet_libhandle_t eq_lh; + lnet_seq_t eq_enq_seq; + lnet_seq_t eq_deq_seq; + unsigned int eq_size; + lnet_eq_handler_t eq_callback; + lnet_event_t *eq_events; + int **eq_refs; /* percpt refcount for EQ */ +} lnet_eq_t; + +typedef struct lnet_me { + struct list_head me_list; + lnet_libhandle_t me_lh; + lnet_process_id_t me_match_id; + unsigned int me_portal; + unsigned int me_pos; /* hash offset in mt_hash */ + __u64 me_match_bits; + __u64 me_ignore_bits; + lnet_unlink_t me_unlink; + struct lnet_libmd *me_md; +} lnet_me_t; + +typedef struct lnet_libmd { + struct list_head md_list; + lnet_libhandle_t md_lh; + lnet_me_t *md_me; + char *md_start; + unsigned int md_offset; + unsigned int md_length; + unsigned int md_max_size; + int md_threshold; + int md_refcount; + unsigned int md_options; + unsigned int md_flags; + void *md_user_ptr; + lnet_eq_t *md_eq; + unsigned int md_niov; /* # frags */ + union { + struct kvec iov[LNET_MAX_IOV]; + lnet_kiov_t kiov[LNET_MAX_IOV]; + } md_iov; +} lnet_libmd_t; + +#define LNET_MD_FLAG_ZOMBIE (1 << 0) +#define LNET_MD_FLAG_AUTO_UNLINK (1 << 1) +#define 
LNET_MD_FLAG_ABORTED (1 << 2) + +#ifdef LNET_USE_LIB_FREELIST +typedef struct { + void *fl_objs; /* single contiguous array of objects */ + int fl_nobjs; /* the number of them */ + int fl_objsize; /* the size (including overhead) of each of them */ + struct list_head fl_list; /* where they are enqueued */ +} lnet_freelist_t; + +typedef struct { + struct list_head fo_list; /* enqueue on fl_list */ + void *fo_contents; /* aligned contents */ +} lnet_freeobj_t; +#endif + +typedef struct { + /* info about peers we are trying to fail */ + struct list_head tp_list; /* ln_test_peers */ + lnet_nid_t tp_nid; /* matching nid */ + unsigned int tp_threshold; /* # failures to simulate */ +} lnet_test_peer_t; + +#define LNET_COOKIE_TYPE_MD 1 +#define LNET_COOKIE_TYPE_ME 2 +#define LNET_COOKIE_TYPE_EQ 3 +#define LNET_COOKIE_TYPE_BITS 2 +#define LNET_COOKIE_MASK ((1ULL << LNET_COOKIE_TYPE_BITS) - 1ULL) + +struct lnet_ni; /* forward ref */ + +typedef struct lnet_lnd { + /* fields managed by portals */ + struct list_head lnd_list; /* stash in the LND table */ + int lnd_refcount; /* # active instances */ + + /* fields initialised by the LND */ + unsigned int lnd_type; + + int (*lnd_startup)(struct lnet_ni *ni); + void (*lnd_shutdown)(struct lnet_ni *ni); + int (*lnd_ctl)(struct lnet_ni *ni, unsigned int cmd, void *arg); + + /* In data movement APIs below, payload buffers are described as a set + * of 'niov' fragments which are... + * EITHER + * in virtual memory (struct iovec *iov != NULL) + * OR + * in pages (kernel only: plt_kiov_t *kiov != NULL). + * The LND may NOT overwrite these fragment descriptors. + * An 'offset' and may specify a byte offset within the set of + * fragments to start from + */ + + /* Start sending a preformatted message. 'private' is NULL for PUT and + * GET messages; otherwise this is a response to an incoming message + * and 'private' is the 'private' passed to lnet_parse(). Return + * non-zero for immediate failure, otherwise complete later with + * lnet_finalize() */ + int (*lnd_send)(struct lnet_ni *ni, void *private, lnet_msg_t *msg); + + /* Start receiving 'mlen' bytes of payload data, skipping the following + * 'rlen' - 'mlen' bytes. 'private' is the 'private' passed to + * lnet_parse(). Return non-zero for immediate failure, otherwise + * complete later with lnet_finalize(). This also gives back a receive + * credit if the LND does flow control. */ + int (*lnd_recv)(struct lnet_ni *ni, void *private, lnet_msg_t *msg, + int delayed, unsigned int niov, + struct kvec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen); + + /* lnet_parse() has had to delay processing of this message + * (e.g. waiting for a forwarding buffer or send credits). Give the + * LND a chance to free urgently needed resources. If called, return 0 + * for success and do NOT give back a receive credit; that has to wait + * until lnd_recv() gets called. On failure return < 0 and + * release resources; lnd_recv() will not be called. 
*/ + int (*lnd_eager_recv)(struct lnet_ni *ni, void *private, lnet_msg_t *msg, + void **new_privatep); + + /* notification of peer health */ + void (*lnd_notify)(struct lnet_ni *ni, lnet_nid_t peer, int alive); + + /* query of peer aliveness */ + void (*lnd_query)(struct lnet_ni *ni, lnet_nid_t peer, unsigned long *when); + + /* accept a new connection */ + int (*lnd_accept)(struct lnet_ni *ni, struct socket *sock); + +} lnd_t; + +#define LNET_NI_STATUS_UP 0x15aac0de +#define LNET_NI_STATUS_DOWN 0xdeadface +#define LNET_NI_STATUS_INVALID 0x00000000 +typedef struct { + lnet_nid_t ns_nid; + __u32 ns_status; + __u32 ns_unused; +} WIRE_ATTR lnet_ni_status_t; + +struct lnet_tx_queue { + int tq_credits; /* # tx credits free */ + int tq_credits_min; /* lowest it's been */ + int tq_credits_max; /* total # tx credits */ + struct list_head tq_delayed; /* delayed TXs */ +}; + +#define LNET_MAX_INTERFACES 16 + +typedef struct lnet_ni { + spinlock_t ni_lock; + struct list_head ni_list; /* chain on ln_nis */ + struct list_head ni_cptlist; /* chain on ln_nis_cpt */ + int ni_maxtxcredits; /* # tx credits */ + /* # per-peer send credits */ + int ni_peertxcredits; + /* # per-peer router buffer credits */ + int ni_peerrtrcredits; + /* seconds to consider peer dead */ + int ni_peertimeout; + int ni_ncpts; /* number of CPTs */ + __u32 *ni_cpts; /* bond NI on some CPTs */ + lnet_nid_t ni_nid; /* interface's NID */ + void *ni_data; /* instance-specific data */ + lnd_t *ni_lnd; /* procedural interface */ + struct lnet_tx_queue **ni_tx_queues; /* percpt TX queues */ + int **ni_refs; /* percpt reference count */ + long ni_last_alive; /* when I was last alive */ + lnet_ni_status_t *ni_status; /* my health status */ + /* equivalent interfaces to use */ + char *ni_interfaces[LNET_MAX_INTERFACES]; +} lnet_ni_t; + +#define LNET_PROTO_PING_MATCHBITS 0x8000000000000000LL + +/* NB: value of these features equal to LNET_PROTO_PING_VERSION_x + * of old LNet, so there shouldn't be any compatibility issue */ +#define LNET_PING_FEAT_INVAL (0) /* no feature */ +#define LNET_PING_FEAT_BASE (1 << 0) /* just a ping */ +#define LNET_PING_FEAT_NI_STATUS (1 << 1) /* return NI status */ + +#define LNET_PING_FEAT_MASK (LNET_PING_FEAT_BASE | \ + LNET_PING_FEAT_NI_STATUS) + +typedef struct { + __u32 pi_magic; + __u32 pi_features; + lnet_pid_t pi_pid; + __u32 pi_nnis; + lnet_ni_status_t pi_ni[0]; +} WIRE_ATTR lnet_ping_info_t; + +/* router checker data, per router */ +#define LNET_MAX_RTR_NIS 16 +#define LNET_PINGINFO_SIZE offsetof(lnet_ping_info_t, pi_ni[LNET_MAX_RTR_NIS]) +typedef struct { + /* chain on the_lnet.ln_zombie_rcd or ln_deathrow_rcd */ + struct list_head rcd_list; + lnet_handle_md_t rcd_mdh; /* ping buffer MD */ + struct lnet_peer *rcd_gateway; /* reference to gateway */ + lnet_ping_info_t *rcd_pinginfo; /* ping buffer */ +} lnet_rc_data_t; + +typedef struct lnet_peer { + struct list_head lp_hashlist; /* chain on peer hash */ + struct list_head lp_txq; /* messages blocking for tx credits */ + struct list_head lp_rtrq; /* messages blocking for router credits */ + struct list_head lp_rtr_list; /* chain on router list */ + int lp_txcredits; /* # tx credits available */ + int lp_mintxcredits; /* low water mark */ + int lp_rtrcredits; /* # router credits */ + int lp_minrtrcredits; /* low water mark */ + unsigned int lp_alive:1; /* alive/dead? */ + unsigned int lp_notify:1; /* notification outstanding? */ + unsigned int lp_notifylnd:1; /* outstanding notification for LND? 
*/ + unsigned int lp_notifying:1; /* some thread is handling notification */ + unsigned int lp_ping_notsent; /* SEND event outstanding from ping */ + int lp_alive_count; /* # times router went dead<->alive */ + long lp_txqnob; /* bytes queued for sending */ + unsigned long lp_timestamp; /* time of last aliveness news */ + unsigned long lp_ping_timestamp; /* time of last ping attempt */ + unsigned long lp_ping_deadline; /* != 0 if ping reply expected */ + unsigned long lp_last_alive; /* when I was last alive */ + unsigned long lp_last_query; /* when lp_ni was queried last time */ + lnet_ni_t *lp_ni; /* interface peer is on */ + lnet_nid_t lp_nid; /* peer's NID */ + int lp_refcount; /* # refs */ + int lp_cpt; /* CPT this peer attached on */ + /* # refs from lnet_route_t::lr_gateway */ + int lp_rtr_refcount; + /* returned RC ping features */ + unsigned int lp_ping_feats; + struct list_head lp_routes; /* routers on this peer */ + lnet_rc_data_t *lp_rcd; /* router checker state */ +} lnet_peer_t; + +/* peer hash size */ +#define LNET_PEER_HASH_BITS 9 +#define LNET_PEER_HASH_SIZE (1 << LNET_PEER_HASH_BITS) + +/* peer hash table */ +struct lnet_peer_table { + int pt_version; /* /proc validity stamp */ + int pt_number; /* # peers extant */ + struct list_head pt_deathrow; /* zombie peers */ + struct list_head *pt_hash; /* NID->peer hash */ +}; + +/* peer aliveness is enabled only on routers for peers in a network where the + * lnet_ni_t::ni_peertimeout has been set to a positive value */ +#define lnet_peer_aliveness_enabled(lp) (the_lnet.ln_routing != 0 && \ + (lp)->lp_ni->ni_peertimeout > 0) + +typedef struct { + struct list_head lr_list; /* chain on net */ + struct list_head lr_gwlist; /* chain on gateway */ + lnet_peer_t *lr_gateway; /* router node */ + __u32 lr_net; /* remote network number */ + int lr_seq; /* sequence for round-robin */ + unsigned int lr_downis; /* number of down NIs */ + unsigned int lr_hops; /* how far I am */ + unsigned int lr_priority; /* route priority */ +} lnet_route_t; + +#define LNET_REMOTE_NETS_HASH_DEFAULT (1U << 7) +#define LNET_REMOTE_NETS_HASH_MAX (1U << 16) +#define LNET_REMOTE_NETS_HASH_SIZE (1 << the_lnet.ln_remote_nets_hbits) + +typedef struct { + struct list_head lrn_list; /* chain on ln_remote_nets_hash */ + struct list_head lrn_routes; /* routes to me */ + __u32 lrn_net; /* my net number */ +} lnet_remotenet_t; + +typedef struct { + struct list_head rbp_bufs; /* my free buffer pool */ + struct list_head rbp_msgs; /* messages blocking for a buffer */ + int rbp_npages; /* # pages in each buffer */ + int rbp_nbuffers; /* # buffers */ + int rbp_credits; /* # free buffers / blocked messages */ + int rbp_mincredits; /* low water mark */ +} lnet_rtrbufpool_t; + +typedef struct { + struct list_head rb_list; /* chain on rbp_bufs */ + lnet_rtrbufpool_t *rb_pool; /* owning pool */ + lnet_kiov_t rb_kiov[0]; /* the buffer space */ +} lnet_rtrbuf_t; + +typedef struct { + __u32 msgs_alloc; + __u32 msgs_max; + __u32 errors; + __u32 send_count; + __u32 recv_count; + __u32 route_count; + __u32 drop_count; + __u64 send_length; + __u64 recv_length; + __u64 route_length; + __u64 drop_length; +} WIRE_ATTR lnet_counters_t; + +#define LNET_PEER_HASHSIZE 503 /* prime! 
*/ + +#define LNET_NRBPOOLS 3 /* # different router buffer pools */ + +enum { + /* Didn't match anything */ + LNET_MATCHMD_NONE = (1 << 0), + /* Matched OK */ + LNET_MATCHMD_OK = (1 << 1), + /* Must be discarded */ + LNET_MATCHMD_DROP = (1 << 2), + /* match and buffer is exhausted */ + LNET_MATCHMD_EXHAUSTED = (1 << 3), + /* match or drop */ + LNET_MATCHMD_FINISH = (LNET_MATCHMD_OK | LNET_MATCHMD_DROP), +}; + +/* Options for lnet_portal_t::ptl_options */ +#define LNET_PTL_LAZY (1 << 0) +#define LNET_PTL_MATCH_UNIQUE (1 << 1) /* unique match, for RDMA */ +#define LNET_PTL_MATCH_WILDCARD (1 << 2) /* wildcard match, request portal */ + +/* parameter for matching operations (GET, PUT) */ +struct lnet_match_info { + __u64 mi_mbits; + lnet_process_id_t mi_id; + unsigned int mi_opc; + unsigned int mi_portal; + unsigned int mi_rlength; + unsigned int mi_roffset; +}; + +/* ME hash of RDMA portal */ +#define LNET_MT_HASH_BITS 8 +#define LNET_MT_HASH_SIZE (1 << LNET_MT_HASH_BITS) +#define LNET_MT_HASH_MASK (LNET_MT_HASH_SIZE - 1) +/* we allocate (LNET_MT_HASH_SIZE + 1) entries for lnet_match_table::mt_hash, + * the last entry is reserved for MEs with ignore-bits */ +#define LNET_MT_HASH_IGNORE LNET_MT_HASH_SIZE +/* __u64 has 2^6 bits, so need 2^(LNET_MT_HASH_BITS - LNET_MT_BITS_U64) which + * is 4 __u64s as bit-map, and add an extra __u64 (only use one bit) for the + * ME-list with ignore-bits, which is mtable::mt_hash[LNET_MT_HASH_IGNORE] */ +#define LNET_MT_BITS_U64 6 /* 2^6 bits */ +#define LNET_MT_EXHAUSTED_BITS (LNET_MT_HASH_BITS - LNET_MT_BITS_U64) +#define LNET_MT_EXHAUSTED_BMAP ((1 << LNET_MT_EXHAUSTED_BITS) + 1) + +/* portal match table */ +struct lnet_match_table { + /* reserved for upcoming patches, CPU partition ID */ + unsigned int mt_cpt; + unsigned int mt_portal; /* portal index */ + /* match table is set as "enabled" if there's non-exhausted MD + * attached on mt_mhash, it's only valid for wildcard portal */ + unsigned int mt_enabled; + /* bitmap to flag whether MEs on mt_hash are exhausted or not */ + __u64 mt_exhausted[LNET_MT_EXHAUSTED_BMAP]; + struct list_head *mt_mhash; /* matching hash */ +}; + +/* these are only useful for wildcard portal */ +/* Turn off message rotor for wildcard portals */ +#define LNET_PTL_ROTOR_OFF 0 +/* round-robin dispatch all PUT messages for wildcard portals */ +#define LNET_PTL_ROTOR_ON 1 +/* round-robin dispatch routed PUT message for wildcard portals */ +#define LNET_PTL_ROTOR_RR_RT 2 +/* dispatch routed PUT message by hashing source NID for wildcard portals */ +#define LNET_PTL_ROTOR_HASH_RT 3 + +typedef struct lnet_portal { + spinlock_t ptl_lock; + unsigned int ptl_index; /* portal ID, reserved */ + /* flags on this portal: lazy, unique... 
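+ * (LNET_PTL_LAZY, LNET_PTL_MATCH_UNIQUE or LNET_PTL_MATCH_WILDCARD above)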
*/ + unsigned int ptl_options; + /* list of messages which are stealing buffer */ + struct list_head ptl_msg_stealing; + /* messages blocking for MD */ + struct list_head ptl_msg_delayed; + /* Match table for each CPT */ + struct lnet_match_table **ptl_mtables; + /* spread rotor of incoming "PUT" */ + unsigned int ptl_rotor; + /* # active entries for this portal */ + int ptl_mt_nmaps; + /* array of active entries' cpu-partition-id */ + int ptl_mt_maps[0]; +} lnet_portal_t; + +#define LNET_LH_HASH_BITS 12 +#define LNET_LH_HASH_SIZE (1ULL << LNET_LH_HASH_BITS) +#define LNET_LH_HASH_MASK (LNET_LH_HASH_SIZE - 1) + +/* resource container (ME, MD, EQ) */ +struct lnet_res_container { + unsigned int rec_type; /* container type */ + __u64 rec_lh_cookie; /* cookie generator */ + struct list_head rec_active; /* active resource list */ + struct list_head *rec_lh_hash; /* handle hash */ +#ifdef LNET_USE_LIB_FREELIST + lnet_freelist_t rec_freelist; /* freelist for resources */ +#endif +}; + +/* message container */ +struct lnet_msg_container { + int msc_init; /* initialized or not */ + /* max # threads finalizing */ + int msc_nfinalizers; + /* msgs waiting to complete finalizing */ + struct list_head msc_finalizing; + struct list_head msc_active; /* active message list */ + /* threads doing finalization */ + void **msc_finalizers; +#ifdef LNET_USE_LIB_FREELIST + lnet_freelist_t msc_freelist; /* freelist for messages */ +#endif +}; + +/* Router Checker states */ +#define LNET_RC_STATE_SHUTDOWN 0 /* not started */ +#define LNET_RC_STATE_RUNNING 1 /* started up OK */ +#define LNET_RC_STATE_STOPPING 2 /* telling thread to stop */ + +typedef struct { + /* CPU partition table of LNet */ + struct cfs_cpt_table *ln_cpt_table; + /* number of CPTs in ln_cpt_table */ + unsigned int ln_cpt_number; + unsigned int ln_cpt_bits; + + /* protect LNet resources (ME/MD/EQ) */ + struct cfs_percpt_lock *ln_res_lock; + /* # portals */ + int ln_nportals; + /* the vector of portals */ + lnet_portal_t **ln_portals; + /* percpt ME containers */ + struct lnet_res_container **ln_me_containers; + /* percpt MD container */ + struct lnet_res_container **ln_md_containers; + + /* Event Queue container */ + struct lnet_res_container ln_eq_container; + wait_queue_head_t ln_eq_waitq; + spinlock_t ln_eq_wait_lock; + unsigned int ln_remote_nets_hbits; + + /* protect NI, peer table, credits, routers, rtrbuf... 
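+ * (taken per-CPT via lnet_net_lock()/lnet_net_unlock() in lib-lnet.h)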
*/ + struct cfs_percpt_lock *ln_net_lock; + /* percpt message containers for active/finalizing/freed message */ + struct lnet_msg_container **ln_msg_containers; + lnet_counters_t **ln_counters; + struct lnet_peer_table **ln_peer_tables; + /* failure simulation */ + struct list_head ln_test_peers; + + struct list_head ln_nis; /* LND instances */ + /* NIs bond on specific CPT(s) */ + struct list_head ln_nis_cpt; + /* dying LND instances */ + struct list_head ln_nis_zombie; + lnet_ni_t *ln_loni; /* the loopback NI */ + /* NI to wait for events in */ + lnet_ni_t *ln_eq_waitni; + + /* remote networks with routes to them */ + struct list_head *ln_remote_nets_hash; + /* validity stamp */ + __u64 ln_remote_nets_version; + /* list of all known routers */ + struct list_head ln_routers; + /* validity stamp */ + __u64 ln_routers_version; + /* percpt router buffer pools */ + lnet_rtrbufpool_t **ln_rtrpools; + + lnet_handle_md_t ln_ping_target_md; + lnet_handle_eq_t ln_ping_target_eq; + lnet_ping_info_t *ln_ping_info; + + /* router checker startup/shutdown state */ + int ln_rc_state; + /* router checker's event queue */ + lnet_handle_eq_t ln_rc_eqh; + /* rcd still pending on net */ + struct list_head ln_rcd_deathrow; + /* rcd ready for free */ + struct list_head ln_rcd_zombie; + /* serialise startup/shutdown */ + struct semaphore ln_rc_signal; + + struct mutex ln_api_mutex; + struct mutex ln_lnd_mutex; + int ln_init; /* LNetInit() called? */ + /* Have I called LNetNIInit myself? */ + int ln_niinit_self; + /* LNetNIInit/LNetNIFini counter */ + int ln_refcount; + /* shutdown in progress */ + int ln_shutdown; + + int ln_routing; /* am I a router? */ + lnet_pid_t ln_pid; /* requested pid */ + /* uniquely identifies this ni in this epoch */ + __u64 ln_interface_cookie; + /* registered LNDs */ + struct list_head ln_lnds; + + /* space for network names */ + char *ln_network_tokens; + int ln_network_tokens_nob; + /* test protocol compatibility flags */ + int ln_testprotocompat; + +} lnet_t; + +#endif diff --git a/kernel/drivers/staging/lustre/include/linux/lnet/linux/api-support.h b/kernel/drivers/staging/lustre/include/linux/lnet/linux/api-support.h new file mode 100644 index 000000000..e237ad6af --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/lnet/linux/api-support.h @@ -0,0 +1,42 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LINUX_API_SUPPORT_H__ +#define __LINUX_API_SUPPORT_H__ + +#ifndef __LNET_API_SUPPORT_H__ +#error Do not #include this file directly. #include instead +#endif + +#endif diff --git a/kernel/drivers/staging/lustre/include/linux/lnet/linux/lib-lnet.h b/kernel/drivers/staging/lustre/include/linux/lnet/linux/lib-lnet.h new file mode 100644 index 000000000..0f8f04d1e --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/lnet/linux/lib-lnet.h @@ -0,0 +1,71 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LNET_LINUX_LIB_LNET_H__ +#define __LNET_LINUX_LIB_LNET_H__ + +#ifndef __LNET_LIB_LNET_H__ +#error Do not #include this file directly. #include instead +#endif + +# include +# include +# include +#include "../../libcfs/libcfs.h" + +static inline __u64 +lnet_page2phys(struct page *p) +{ + /* compiler optimizer will elide unused branches */ + + switch (sizeof(typeof(page_to_phys(p)))) { + case 4: + /* page_to_phys returns a 32 bit physical address. This must + * be a 32 bit machine with <= 4G memory and we must ensure we + * don't sign extend when converting to 64 bits. */ + return (unsigned long)page_to_phys(p); + + case 8: + /* page_to_phys returns a 64 bit physical address :) */ + return page_to_phys(p); + + default: + LBUG(); + return 0; + } +} + +#define LNET_ROUTER + +#endif /* __LNET_LINUX_LIB_LNET_H__ */ diff --git a/kernel/drivers/staging/lustre/include/linux/lnet/linux/lib-types.h b/kernel/drivers/staging/lustre/include/linux/lnet/linux/lib-types.h new file mode 100644 index 000000000..669e8c038 --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/lnet/linux/lib-types.h @@ -0,0 +1,45 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LNET_LINUX_LIB_TYPES_H__ +#define __LNET_LINUX_LIB_TYPES_H__ + +#ifndef __LNET_LIB_TYPES_H__ +#error Do not #include this file directly. #include instead +#endif + +# include +# include + +#endif diff --git a/kernel/drivers/staging/lustre/include/linux/lnet/linux/lnet.h b/kernel/drivers/staging/lustre/include/linux/lnet/linux/lnet.h new file mode 100644 index 000000000..1e888f1ef --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/lnet/linux/lnet.h @@ -0,0 +1,56 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LNET_LINUX_LNET_H__ +#define __LNET_LINUX_LNET_H__ + +#ifndef __LNET_H__ +#error Do not #include this file directly. #include instead +#endif + +/* + * lnet.h + * + * User application interface file + */ + +#include +#include + +#define cfs_tcp_sendpage(sk, page, offset, size, flags) \ + tcp_sendpage(sk, page, offset, size, flags) + +#endif diff --git a/kernel/drivers/staging/lustre/include/linux/lnet/lnet-sysctl.h b/kernel/drivers/staging/lustre/include/linux/lnet/lnet-sysctl.h new file mode 100644 index 000000000..2dee1b97f --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/lnet/lnet-sysctl.h @@ -0,0 +1,49 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LNET_SYSCTL_H__ +#define __LNET_SYSCTL_H__ + +#if defined(CONFIG_SYSCTL) + +#define CTL_KRANAL 201 +#define CTL_O2IBLND 205 +#define CTL_PTLLND 206 +#define CTL_QSWNAL 207 +#define CTL_SOCKLND 208 +#define CTL_GNILND 210 + +#endif + +#endif diff --git a/kernel/drivers/staging/lustre/include/linux/lnet/lnet.h b/kernel/drivers/staging/lustre/include/linux/lnet/lnet.h new file mode 100644 index 000000000..75c0ab919 --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/lnet/lnet.h @@ -0,0 +1,51 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#ifndef __LNET_H__ +#define __LNET_H__ + +/* + * lnet.h + * + * User application interface file + */ +#include "linux/lnet.h" + +#include "types.h" +#include "api.h" + +#define LNET_NIDSTR_COUNT 1024 /* # of nidstrings */ +#define LNET_NIDSTR_SIZE 32 /* size of each one (see below for usage) */ + +#endif diff --git a/kernel/drivers/staging/lustre/include/linux/lnet/lnetctl.h b/kernel/drivers/staging/lustre/include/linux/lnet/lnetctl.h new file mode 100644 index 000000000..98181d389 --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/lnet/lnetctl.h @@ -0,0 +1,80 @@ +/* + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * header for libptlctl.a + */ +#ifndef _PTLCTL_H_ +#define _PTLCTL_H_ + +#include "../libcfs/libcfs.h" +#include "types.h" + +#define LNET_DEV_ID 0 +#define LNET_DEV_PATH "/dev/lnet" +#define LNET_DEV_MAJOR 10 +#define LNET_DEV_MINOR 240 +#define OBD_DEV_ID 1 +#define OBD_DEV_NAME "obd" +#define OBD_DEV_PATH "/dev/" OBD_DEV_NAME +#define OBD_DEV_MAJOR 10 +#define OBD_DEV_MINOR 241 +#define SMFS_DEV_ID 2 +#define SMFS_DEV_PATH "/dev/snapdev" +#define SMFS_DEV_MAJOR 10 +#define SMFS_DEV_MINOR 242 + +int ptl_initialize(int argc, char **argv); +int jt_ptl_network(int argc, char **argv); +int jt_ptl_list_nids(int argc, char **argv); +int jt_ptl_which_nid(int argc, char **argv); +int jt_ptl_print_interfaces(int argc, char **argv); +int jt_ptl_add_interface(int argc, char **argv); +int jt_ptl_del_interface(int argc, char **argv); +int jt_ptl_print_peers(int argc, char **argv); +int jt_ptl_add_peer(int argc, char **argv); +int jt_ptl_del_peer(int argc, char **argv); +int jt_ptl_print_connections(int argc, char **argv); +int jt_ptl_disconnect(int argc, char **argv); +int jt_ptl_push_connection(int argc, char **argv); +int jt_ptl_print_active_txs(int argc, char **argv); +int jt_ptl_ping(int argc, char **argv); +int jt_ptl_mynid(int argc, char **argv); +int jt_ptl_add_uuid(int argc, char **argv); +int jt_ptl_add_uuid_old(int argc, char **argv); /* backwards compatibility */ +int jt_ptl_close_uuid(int argc, char **argv); +int jt_ptl_del_uuid(int argc, char **argv); +int jt_ptl_add_route(int argc, char **argv); +int jt_ptl_del_route(int argc, char **argv); +int jt_ptl_notify_router(int argc, char **argv); +int jt_ptl_print_routes(int argc, char **argv); +int jt_ptl_fail_nid(int argc, char **argv); +int jt_ptl_lwt(int argc, char **argv); +int jt_ptl_testprotocompat(int argc, char **argv); +int jt_ptl_memhog(int argc, char **argv); + +int dbg_initialize(int argc, char **argv); +int jt_dbg_filter(int argc, char **argv); +int jt_dbg_show(int argc, char **argv); +int jt_dbg_list(int argc, char **argv); +int jt_dbg_debug_kernel(int argc, char **argv); +int jt_dbg_debug_daemon(int argc, char **argv); +int jt_dbg_debug_file(int argc, char **argv); +int jt_dbg_clear_debug_buf(int argc, char **argv); +int jt_dbg_mark_debug_buf(int 
argc, char **argv); +int jt_dbg_modules(int argc, char **argv); +int jt_dbg_panic(int argc, char **argv); + +#endif diff --git a/kernel/drivers/staging/lustre/include/linux/lnet/lnetst.h b/kernel/drivers/staging/lustre/include/linux/lnet/lnetst.h new file mode 100644 index 000000000..885f708d4 --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/lnet/lnetst.h @@ -0,0 +1,491 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/include/lnet/lnetst.h + * + * Author: Liang Zhen + */ + +#ifndef __LNET_ST_H__ +#define __LNET_ST_H__ + +#include "../libcfs/libcfs.h" +#include "lnet.h" +#include "lib-types.h" + +#define LST_FEAT_NONE (0) +#define LST_FEAT_BULK_LEN (1 << 0) /* enable variable page size */ + +#define LST_FEATS_EMPTY (LST_FEAT_NONE) +#define LST_FEATS_MASK (LST_FEAT_NONE | LST_FEAT_BULK_LEN) + +#define LST_NAME_SIZE 32 /* max name buffer length */ + +#define LSTIO_DEBUG 0xC00 /* debug */ +#define LSTIO_SESSION_NEW 0xC01 /* create session */ +#define LSTIO_SESSION_END 0xC02 /* end session */ +#define LSTIO_SESSION_INFO 0xC03 /* query session */ +#define LSTIO_GROUP_ADD 0xC10 /* add group */ +#define LSTIO_GROUP_LIST 0xC11 /* list all groups in session */ +#define LSTIO_GROUP_INFO 0xC12 /* query default information of specified group */ +#define LSTIO_GROUP_DEL 0xC13 /* delete group */ +#define LSTIO_NODES_ADD 0xC14 /* add nodes to specified group */ +#define LSTIO_GROUP_UPDATE 0xC15 /* update group */ +#define LSTIO_BATCH_ADD 0xC20 /* add batch */ +#define LSTIO_BATCH_START 0xC21 /* start batch */ +#define LSTIO_BATCH_STOP 0xC22 /* stop batch */ +#define LSTIO_BATCH_DEL 0xC23 /* delete batch */ +#define LSTIO_BATCH_LIST 0xC24 /* show all batches in the session */ +#define LSTIO_BATCH_INFO 0xC25 /* show defail of specified batch */ +#define LSTIO_TEST_ADD 0xC26 /* add test (to batch) */ +#define LSTIO_BATCH_QUERY 0xC27 /* query batch status */ +#define LSTIO_STAT_QUERY 0xC30 /* get stats */ + +typedef struct { + lnet_nid_t ses_nid; /* nid of console node */ + __u64 ses_stamp; /* time stamp */ +} lst_sid_t; /*** session id */ + +extern lst_sid_t LST_INVALID_SID; + +typedef struct { + __u64 bat_id; /* unique id in session */ +} lst_bid_t; /*** batch id (group of tests) */ + +/* Status of test node */ +#define LST_NODE_ACTIVE 
0x1 /* node in this session */ +#define LST_NODE_BUSY 0x2 /* node is taken by other session */ +#define LST_NODE_DOWN 0x4 /* node is down */ +#define LST_NODE_UNKNOWN 0x8 /* node not in session */ + +typedef struct { + lnet_process_id_t nde_id; /* id of node */ + int nde_state; /* state of node */ +} lstcon_node_ent_t; /*** node entry, for list_group command */ + +typedef struct { + int nle_nnode; /* # of nodes */ + int nle_nactive; /* # of active nodes */ + int nle_nbusy; /* # of busy nodes */ + int nle_ndown; /* # of down nodes */ + int nle_nunknown; /* # of unknown nodes */ +} lstcon_ndlist_ent_t; /*** node_list entry, for list_batch command */ + +typedef struct { + int tse_type; /* test type */ + int tse_loop; /* loop count */ + int tse_concur; /* concurrency of test */ +} lstcon_test_ent_t; /*** test summary entry, for list_batch command */ + +typedef struct { + int bae_state; /* batch status */ + int bae_timeout; /* batch timeout */ + int bae_ntest; /* # of tests in the batch */ +} lstcon_batch_ent_t; /*** batch summary entry, for list_batch command */ + +typedef struct { + lstcon_ndlist_ent_t tbe_cli_nle; /* client (group) node_list entry */ + lstcon_ndlist_ent_t tbe_srv_nle; /* server (group) node_list entry */ + union { + lstcon_test_ent_t tbe_test; /* test entry */ + lstcon_batch_ent_t tbe_batch; /* batch entry */ + } u; +} lstcon_test_batch_ent_t; /*** test/batch verbose information entry, + *** for list_batch command */ + +typedef struct { + struct list_head rpe_link; /* link chain */ + lnet_process_id_t rpe_peer; /* peer's id */ + struct timeval rpe_stamp; /* time stamp of RPC */ + int rpe_state; /* peer's state */ + int rpe_rpc_errno; /* RPC errno */ + + lst_sid_t rpe_sid; /* peer's session id */ + int rpe_fwk_errno; /* framework errno */ + int rpe_priv[4]; /* private data */ + char rpe_payload[0]; /* private reply payload */ +} lstcon_rpc_ent_t; + +typedef struct { + int trs_rpc_stat[4]; /* RPCs stat (0: total, 1: failed, 2: finished, 4: reserved */ + int trs_rpc_errno; /* RPC errno */ + int trs_fwk_stat[8]; /* framework stat */ + int trs_fwk_errno; /* errno of the first remote error */ + void *trs_fwk_private; /* private framework stat */ +} lstcon_trans_stat_t; + +static inline int +lstcon_rpc_stat_total(lstcon_trans_stat_t *stat, int inc) +{ + return inc ? ++stat->trs_rpc_stat[0] : stat->trs_rpc_stat[0]; +} + +static inline int +lstcon_rpc_stat_success(lstcon_trans_stat_t *stat, int inc) +{ + return inc ? ++stat->trs_rpc_stat[1] : stat->trs_rpc_stat[1]; +} + +static inline int +lstcon_rpc_stat_failure(lstcon_trans_stat_t *stat, int inc) +{ + return inc ? ++stat->trs_rpc_stat[2] : stat->trs_rpc_stat[2]; +} + +static inline int +lstcon_sesop_stat_success(lstcon_trans_stat_t *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0]; +} + +static inline int +lstcon_sesop_stat_failure(lstcon_trans_stat_t *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1]; +} + +static inline int +lstcon_sesqry_stat_active(lstcon_trans_stat_t *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0]; +} + +static inline int +lstcon_sesqry_stat_busy(lstcon_trans_stat_t *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1]; +} + +static inline int +lstcon_sesqry_stat_unknown(lstcon_trans_stat_t *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[2] : stat->trs_fwk_stat[2]; +} + +static inline int +lstcon_tsbop_stat_success(lstcon_trans_stat_t *stat, int inc) +{ + return inc ? 
++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0]; +} + +static inline int +lstcon_tsbop_stat_failure(lstcon_trans_stat_t *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1]; +} + +static inline int +lstcon_tsbqry_stat_idle(lstcon_trans_stat_t *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0]; +} + +static inline int +lstcon_tsbqry_stat_run(lstcon_trans_stat_t *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1]; +} + +static inline int +lstcon_tsbqry_stat_failure(lstcon_trans_stat_t *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[2] : stat->trs_fwk_stat[2]; +} + +static inline int +lstcon_statqry_stat_success(lstcon_trans_stat_t *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0]; +} + +static inline int +lstcon_statqry_stat_failure(lstcon_trans_stat_t *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1]; +} + +/* create a session */ +typedef struct { + int lstio_ses_key; /* IN: local key */ + int lstio_ses_timeout; /* IN: session timeout */ + int lstio_ses_force; /* IN: force create ? */ + /** IN: session features */ + unsigned lstio_ses_feats; + lst_sid_t *lstio_ses_idp; /* OUT: session id */ + int lstio_ses_nmlen; /* IN: name length */ + char *lstio_ses_namep; /* IN: session name */ +} lstio_session_new_args_t; + +/* query current session */ +typedef struct { + lst_sid_t *lstio_ses_idp; /* OUT: session id */ + int *lstio_ses_keyp; /* OUT: local key */ + /** OUT: session features */ + unsigned *lstio_ses_featp; + lstcon_ndlist_ent_t *lstio_ses_ndinfo; /* OUT: */ + int lstio_ses_nmlen; /* IN: name length */ + char *lstio_ses_namep; /* OUT: session name */ +} lstio_session_info_args_t; + +/* delete a session */ +typedef struct { + int lstio_ses_key; /* IN: session key */ +} lstio_session_end_args_t; + +#define LST_OPC_SESSION 1 +#define LST_OPC_GROUP 2 +#define LST_OPC_NODES 3 +#define LST_OPC_BATCHCLI 4 +#define LST_OPC_BATCHSRV 5 + +typedef struct { + int lstio_dbg_key; /* IN: session key */ + int lstio_dbg_type; /* IN: debug sessin|batch|group|nodes list */ + int lstio_dbg_flags; /* IN: reserved debug flags */ + int lstio_dbg_timeout; /* IN: timeout of debug */ + + int lstio_dbg_nmlen; /* IN: len of name */ + char *lstio_dbg_namep; /* IN: name of group|batch */ + int lstio_dbg_count; /* IN: # of test nodes to debug */ + lnet_process_id_t *lstio_dbg_idsp; /* IN: id of test nodes */ + struct list_head *lstio_dbg_resultp; /* OUT: list head of result buffer */ +} lstio_debug_args_t; + +typedef struct { + int lstio_grp_key; /* IN: session key */ + int lstio_grp_nmlen; /* IN: name length */ + char *lstio_grp_namep; /* IN: group name */ +} lstio_group_add_args_t; + +typedef struct { + int lstio_grp_key; /* IN: session key */ + int lstio_grp_nmlen; /* IN: name length */ + char *lstio_grp_namep; /* IN: group name */ +} lstio_group_del_args_t; + +#define LST_GROUP_CLEAN 1 /* remove inactive nodes in the group */ +#define LST_GROUP_REFRESH 2 /* refresh inactive nodes in the group */ +#define LST_GROUP_RMND 3 /* delete nodes from the group */ + +typedef struct { + int lstio_grp_key; /* IN: session key */ + int lstio_grp_opc; /* IN: OPC */ + int lstio_grp_args; /* IN: arguments */ + int lstio_grp_nmlen; /* IN: name length */ + char *lstio_grp_namep; /* IN: group name */ + int lstio_grp_count; /* IN: # of nodes id */ + lnet_process_id_t *lstio_grp_idsp; /* IN: array of nodes */ + struct list_head *lstio_grp_resultp; /* OUT: list head of result 
buffer */ +} lstio_group_update_args_t; + +typedef struct { + int lstio_grp_key; /* IN: session key */ + int lstio_grp_nmlen; /* IN: name length */ + char *lstio_grp_namep; /* IN: group name */ + int lstio_grp_count; /* IN: # of nodes */ + /** OUT: session features */ + unsigned *lstio_grp_featp; + lnet_process_id_t *lstio_grp_idsp; /* IN: nodes */ + struct list_head *lstio_grp_resultp; /* OUT: list head of result buffer */ +} lstio_group_nodes_args_t; + +typedef struct { + int lstio_grp_key; /* IN: session key */ + int lstio_grp_idx; /* IN: group idx */ + int lstio_grp_nmlen; /* IN: name len */ + char *lstio_grp_namep; /* OUT: name */ +} lstio_group_list_args_t; + +typedef struct { + int lstio_grp_key; /* IN: session key */ + int lstio_grp_nmlen; /* IN: name len */ + char *lstio_grp_namep; /* IN: name */ + lstcon_ndlist_ent_t *lstio_grp_entp; /* OUT: description of group */ + + int *lstio_grp_idxp; /* IN/OUT: node index */ + int *lstio_grp_ndentp; /* IN/OUT: # of nodent */ + lstcon_node_ent_t *lstio_grp_dentsp; /* OUT: nodent array */ +} lstio_group_info_args_t; + +#define LST_DEFAULT_BATCH "batch" /* default batch name */ + +typedef struct { + int lstio_bat_key; /* IN: session key */ + int lstio_bat_nmlen; /* IN: name length */ + char *lstio_bat_namep; /* IN: batch name */ +} lstio_batch_add_args_t; + +typedef struct { + int lstio_bat_key; /* IN: session key */ + int lstio_bat_nmlen; /* IN: name length */ + char *lstio_bat_namep; /* IN: batch name */ +} lstio_batch_del_args_t; + +typedef struct { + int lstio_bat_key; /* IN: session key */ + int lstio_bat_timeout; /* IN: timeout for the batch */ + int lstio_bat_nmlen; /* IN: name length */ + char *lstio_bat_namep; /* IN: batch name */ + struct list_head *lstio_bat_resultp; /* OUT: list head of result buffer */ +} lstio_batch_run_args_t; + +typedef struct { + int lstio_bat_key; /* IN: session key */ + int lstio_bat_force; /* IN: abort unfinished test RPC */ + int lstio_bat_nmlen; /* IN: name length */ + char *lstio_bat_namep; /* IN: batch name */ + struct list_head *lstio_bat_resultp; /* OUT: list head of result buffer */ +} lstio_batch_stop_args_t; + +typedef struct { + int lstio_bat_key; /* IN: session key */ + int lstio_bat_testidx; /* IN: test index */ + int lstio_bat_client; /* IN: is test client? 
*/ + int lstio_bat_timeout; /* IN: timeout for waiting */ + int lstio_bat_nmlen; /* IN: name length */ + char *lstio_bat_namep; /* IN: batch name */ + struct list_head *lstio_bat_resultp; /* OUT: list head of result buffer */ +} lstio_batch_query_args_t; + +typedef struct { + int lstio_bat_key; /* IN: session key */ + int lstio_bat_idx; /* IN: index */ + int lstio_bat_nmlen; /* IN: name length */ + char *lstio_bat_namep; /* IN: batch name */ +} lstio_batch_list_args_t; + +typedef struct { + int lstio_bat_key; /* IN: session key */ + int lstio_bat_nmlen; /* IN: name length */ + char *lstio_bat_namep; /* IN: name */ + int lstio_bat_server; /* IN: query server or not */ + int lstio_bat_testidx; /* IN: test index */ + lstcon_test_batch_ent_t *lstio_bat_entp; /* OUT: batch ent */ + + int *lstio_bat_idxp; /* IN/OUT: index of node */ + int *lstio_bat_ndentp; /* IN/OUT: # of nodent */ + lstcon_node_ent_t *lstio_bat_dentsp; /* array of nodent */ +} lstio_batch_info_args_t; + +/* add stat in session */ +typedef struct { + int lstio_sta_key; /* IN: session key */ + int lstio_sta_timeout; /* IN: timeout for stat request */ + int lstio_sta_nmlen; /* IN: group name length */ + char *lstio_sta_namep; /* IN: group name */ + int lstio_sta_count; /* IN: # of pid */ + lnet_process_id_t *lstio_sta_idsp; /* IN: pid */ + struct list_head *lstio_sta_resultp; /* OUT: list head of result buffer */ +} lstio_stat_args_t; + +typedef enum { + LST_TEST_BULK = 1, + LST_TEST_PING = 2 +} lst_test_type_t; + +/* create a test in a batch */ +#define LST_MAX_CONCUR 1024 /* Max concurrency of test */ + +typedef struct { + int lstio_tes_key; /* IN: session key */ + int lstio_tes_bat_nmlen; /* IN: batch name len */ + char *lstio_tes_bat_name; /* IN: batch name */ + int lstio_tes_type; /* IN: test type */ + int lstio_tes_oneside; /* IN: one sided test */ + int lstio_tes_loop; /* IN: loop count */ + int lstio_tes_concur; /* IN: concurrency */ + + int lstio_tes_dist; /* IN: node distribution in destination groups */ + int lstio_tes_span; /* IN: node span in destination groups */ + int lstio_tes_sgrp_nmlen; /* IN: source group name length */ + char *lstio_tes_sgrp_name; /* IN: group name */ + int lstio_tes_dgrp_nmlen; /* IN: destination group name length */ + char *lstio_tes_dgrp_name; /* IN: group name */ + + int lstio_tes_param_len; /* IN: param buffer len */ + void *lstio_tes_param; /* IN: parameter for specified test: + lstio_bulk_param_t, + lstio_ping_param_t, + ... 
more */ + int *lstio_tes_retp; /* OUT: private returned value */ + struct list_head *lstio_tes_resultp; /* OUT: list head of result buffer */ +} lstio_test_args_t; + +typedef enum { + LST_BRW_READ = 1, + LST_BRW_WRITE = 2 +} lst_brw_type_t; + +typedef enum { + LST_BRW_CHECK_NONE = 1, + LST_BRW_CHECK_SIMPLE = 2, + LST_BRW_CHECK_FULL = 3 +} lst_brw_flags_t; + +typedef struct { + int blk_opc; /* bulk operation code */ + int blk_size; /* size (bytes) */ + int blk_time; /* time of running the test*/ + int blk_flags; /* reserved flags */ +} lst_test_bulk_param_t; + +typedef struct { + int png_size; /* size of ping message */ + int png_time; /* time */ + int png_loop; /* loop */ + int png_flags; /* reserved flags */ +} lst_test_ping_param_t; + +/* more tests */ +typedef struct { + __u32 errors; + __u32 rpcs_sent; + __u32 rpcs_rcvd; + __u32 rpcs_dropped; + __u32 rpcs_expired; + __u64 bulk_get; + __u64 bulk_put; +} WIRE_ATTR srpc_counters_t; + +typedef struct { + /** milliseconds since current session started */ + __u32 running_ms; + __u32 active_batches; + __u32 zombie_sessions; + __u32 brw_errors; + __u32 ping_errors; +} WIRE_ATTR sfw_counters_t; + +#endif diff --git a/kernel/drivers/staging/lustre/include/linux/lnet/ptllnd.h b/kernel/drivers/staging/lustre/include/linux/lnet/ptllnd.h new file mode 100644 index 000000000..c91d65329 --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/lnet/ptllnd.h @@ -0,0 +1,93 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/include/lnet/ptllnd.h + * + * Author: PJ Kirner + */ + +/* + * The PTLLND was designed to support Portals with + * Lustre and non-lustre UNLINK semantics. + * However for now the two targets are Cray Portals + * on the XT3 and Lustre Portals (for testing) both + * have Lustre UNLINK semantics, so this is defined + * by default. 
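+ *
+ * Illustrative note only (not part of the imported header): the FMT_*
+ * macros defined below let the same code print a portals identifier on
+ * either flavour, e.g.
+ *
+ *	CERROR("peer is "FMT_PTLID"\n", pid, nid);
+ *
+ * formats the NID with "%llu" on Lustre Portals and with "%u" on Cray
+ * Portals.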
+ */ +#define LUSTRE_PORTALS_UNLINK_SEMANTICS + +#ifdef _USING_LUSTRE_PORTALS_ + +/* NIDs are 64-bits on Lustre Portals */ +#define FMT_NID "%llu" +#define FMT_PID "%d" + +/* When using Lustre Portals Lustre completion semantics are imlicit*/ +#define PTL_MD_LUSTRE_COMPLETION_SEMANTICS 0 + +#else /* _USING_CRAY_PORTALS_ */ + +/* NIDs are integers on Cray Portals */ +#define FMT_NID "%u" +#define FMT_PID "%d" + +/* When using Cray Portals this is defined in the Cray Portals Header*/ +/*#define PTL_MD_LUSTRE_COMPLETION_SEMANTICS */ + +/* Can compare handles directly on Cray Portals */ +#define PtlHandleIsEqual(a, b) ((a) == (b)) + +/* Different error types on Cray Portals*/ +#define ptl_err_t ptl_ni_fail_t + +/* + * The Cray Portals has no maximum number of IOVs. The + * maximum is limited only by memory and size of the + * int parameters (2^31-1). + * Lustre only really require that the underyling + * implementation to support at least LNET_MAX_IOV, + * so for Cray portals we can safely just use that + * value here. + * + */ +#define PTL_MD_MAX_IOV LNET_MAX_IOV + +#endif + +#define FMT_PTLID "ptlid:"FMT_PID"-"FMT_NID + +/* Align incoming small request messages to an 8 byte boundary if this is + * supported to avoid alignment issues on some architectures */ +#ifndef PTL_MD_LOCAL_ALIGN8 +# define PTL_MD_LOCAL_ALIGN8 0 +#endif diff --git a/kernel/drivers/staging/lustre/include/linux/lnet/ptllnd_wire.h b/kernel/drivers/staging/lustre/include/linux/lnet/ptllnd_wire.h new file mode 100644 index 000000000..808f37b64 --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/lnet/ptllnd_wire.h @@ -0,0 +1,119 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/include/lnet/ptllnd_wire.h + * + * Author: PJ Kirner + */ + +/* Minimum buffer size that any peer will post to receive ptllnd messages */ +#define PTLLND_MIN_BUFFER_SIZE 256 + +/************************************************************************ + * Tunable defaults that {u,k}lnds/ptllnd should have in common. 
+ */ + +#define PTLLND_PORTAL 9 /* The same portal PTLPRC used when talking to cray portals */ +#define PTLLND_PID 9 /* The Portals PID */ +#define PTLLND_PEERCREDITS 8 /* concurrent sends to 1 peer */ + +/* Default buffer size for kernel ptllnds (guaranteed eager) */ +#define PTLLND_MAX_KLND_MSG_SIZE 512 + +/* Default buffer size for catamount ptllnds (not guaranteed eager) - large + * enough to avoid RDMA for anything sent while control is not in liblustre */ +#define PTLLND_MAX_ULND_MSG_SIZE 512 + +/************************************************************************ + * Portals LND Wire message format. + * These are sent in sender's byte order (i.e. receiver flips). + */ + +#define PTL_RESERVED_MATCHBITS 0x100 /* below this value is reserved + * above is for bulk data transfer */ +#define LNET_MSG_MATCHBITS 0 /* the value for the message channel */ + +typedef struct { + lnet_hdr_t kptlim_hdr; /* portals header */ + char kptlim_payload[0]; /* piggy-backed payload */ +} WIRE_ATTR kptl_immediate_msg_t; + +typedef struct { + lnet_hdr_t kptlrm_hdr; /* portals header */ + __u64 kptlrm_matchbits; /* matchbits */ +} WIRE_ATTR kptl_rdma_msg_t; + +typedef struct { + __u64 kptlhm_matchbits; /* matchbits */ + __u32 kptlhm_max_msg_size; /* max message size */ +} WIRE_ATTR kptl_hello_msg_t; + +typedef struct { + /* First 2 fields fixed FOR ALL TIME */ + __u32 ptlm_magic; /* I'm a Portals LND message */ + __u16 ptlm_version; /* this is my version number */ + __u8 ptlm_type; /* the message type */ + __u8 ptlm_credits; /* returned credits */ + __u32 ptlm_nob; /* # bytes in whole message */ + __u32 ptlm_cksum; /* checksum (0 == no checksum) */ + __u64 ptlm_srcnid; /* sender's NID */ + __u64 ptlm_srcstamp; /* sender's incarnation */ + __u64 ptlm_dstnid; /* destination's NID */ + __u64 ptlm_dststamp; /* destination's incarnation */ + __u32 ptlm_srcpid; /* sender's PID */ + __u32 ptlm_dstpid; /* destination's PID */ + + union { + kptl_immediate_msg_t immediate; + kptl_rdma_msg_t rdma; + kptl_hello_msg_t hello; + } WIRE_ATTR ptlm_u; + +} kptl_msg_t; + +/* kptl_msg_t::ptlm_credits is only a __u8 */ +#define PTLLND_MSG_MAX_CREDITS ((typeof(((kptl_msg_t *)0)->ptlm_credits)) - 1) + +#define PTLLND_MSG_MAGIC LNET_PROTO_PTL_MAGIC +#define PTLLND_MSG_VERSION 0x04 + +#define PTLLND_RDMA_OK 0x00 +#define PTLLND_RDMA_FAIL 0x01 + +#define PTLLND_MSG_TYPE_INVALID 0x00 +#define PTLLND_MSG_TYPE_PUT 0x01 +#define PTLLND_MSG_TYPE_GET 0x02 +#define PTLLND_MSG_TYPE_IMMEDIATE 0x03 /* No bulk data xfer*/ +#define PTLLND_MSG_TYPE_NOOP 0x04 +#define PTLLND_MSG_TYPE_HELLO 0x05 +#define PTLLND_MSG_TYPE_NAK 0x06 diff --git a/kernel/drivers/staging/lustre/include/linux/lnet/socklnd.h b/kernel/drivers/staging/lustre/include/linux/lnet/socklnd.h new file mode 100644 index 000000000..389038b12 --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/lnet/socklnd.h @@ -0,0 +1,103 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/include/lnet/socklnd.h + * + * #defines shared between socknal implementation and utilities + */ +#ifndef __LNET_LNET_SOCKLND_H__ +#define __LNET_LNET_SOCKLND_H__ + +#include "types.h" +#include "lib-types.h" + +#define SOCKLND_CONN_NONE (-1) +#define SOCKLND_CONN_ANY 0 +#define SOCKLND_CONN_CONTROL 1 +#define SOCKLND_CONN_BULK_IN 2 +#define SOCKLND_CONN_BULK_OUT 3 +#define SOCKLND_CONN_NTYPES 4 + +#define SOCKLND_CONN_ACK SOCKLND_CONN_BULK_IN + +typedef struct { + __u32 kshm_magic; /* magic number of socklnd message */ + __u32 kshm_version; /* version of socklnd message */ + lnet_nid_t kshm_src_nid; /* sender's nid */ + lnet_nid_t kshm_dst_nid; /* destination nid */ + lnet_pid_t kshm_src_pid; /* sender's pid */ + lnet_pid_t kshm_dst_pid; /* destination pid */ + __u64 kshm_src_incarnation; /* sender's incarnation */ + __u64 kshm_dst_incarnation; /* destination's incarnation */ + __u32 kshm_ctype; /* connection type */ + __u32 kshm_nips; /* # IP addrs */ + __u32 kshm_ips[0]; /* IP addrs */ +} WIRE_ATTR ksock_hello_msg_t; + +typedef struct { + lnet_hdr_t ksnm_hdr; /* lnet hdr */ + + /* + * ksnm_payload is removed because of winnt compiler's limitation: + * zero-sized array can only be placed at the tail of [nested] + * structure definitions. lnet payload will be stored just after + * the body of structure ksock_lnet_msg_t + */ +} WIRE_ATTR ksock_lnet_msg_t; + +typedef struct { + __u32 ksm_type; /* type of socklnd message */ + __u32 ksm_csum; /* checksum if != 0 */ + __u64 ksm_zc_cookies[2]; /* Zero-Copy request/ACK cookie */ + union { + ksock_lnet_msg_t lnetmsg; /* lnet message, it's empty if it's NOOP */ + } WIRE_ATTR ksm_u; +} WIRE_ATTR ksock_msg_t; + +static inline void +socklnd_init_msg(ksock_msg_t *msg, int type) +{ + msg->ksm_csum = 0; + msg->ksm_type = type; + msg->ksm_zc_cookies[0] = msg->ksm_zc_cookies[1] = 0; +} + +#define KSOCK_MSG_NOOP 0xc0 /* ksm_u empty */ +#define KSOCK_MSG_LNET 0xc1 /* lnet msg */ + +/* We need to know this number to parse hello msg from ksocklnd in + * other LND (usocklnd, for example) */ +#define KSOCK_PROTO_V2 2 +#define KSOCK_PROTO_V3 3 + +#endif diff --git a/kernel/drivers/staging/lustre/include/linux/lnet/types.h b/kernel/drivers/staging/lustre/include/linux/lnet/types.h new file mode 100644 index 000000000..68d8139a2 --- /dev/null +++ b/kernel/drivers/staging/lustre/include/linux/lnet/types.h @@ -0,0 +1,492 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LNET_TYPES_H__ +#define __LNET_TYPES_H__ + +/** \addtogroup lnet + * @{ */ + +#include "../libcfs/libcfs.h" + +/** \addtogroup lnet_addr + * @{ */ + +/** Portal reserved for LNet's own use. + * \see lustre/include/lustre/lustre_idl.h for Lustre portal assignments. + */ +#define LNET_RESERVED_PORTAL 0 + +/** + * Address of an end-point in an LNet network. + * + * A node can have multiple end-points and hence multiple addresses. + * An LNet network can be a simple network (e.g. tcp0) or a network of + * LNet networks connected by LNet routers. Therefore an end-point address + * has two parts: network ID, and address within a network. + * + * \see LNET_NIDNET, LNET_NIDADDR, and LNET_MKNID. + */ +typedef __u64 lnet_nid_t; +/** + * ID of a process in a node. Shortened as PID to distinguish from + * lnet_process_id_t, the global process ID. + */ +typedef __u32 lnet_pid_t; + +/** wildcard NID that matches any end-point address */ +#define LNET_NID_ANY ((lnet_nid_t) -1) +/** wildcard PID that matches any lnet_pid_t */ +#define LNET_PID_ANY ((lnet_pid_t) -1) + +#define LNET_PID_RESERVED 0xf0000000 /* reserved bits in PID */ +#define LNET_PID_USERFLAG 0x80000000 /* set in userspace peers */ + +#define LNET_TIME_FOREVER (-1) + +/** + * Objects maintained by the LNet are accessed through handles. Handle types + * have names of the form lnet_handle_xx_t, where xx is one of the two letter + * object type codes ('eq' for event queue, 'md' for memory descriptor, and + * 'me' for match entry). + * Each type of object is given a unique handle type to enhance type checking. + * The type lnet_handle_any_t can be used when a generic handle is needed. + * Every handle value can be converted into a value of type lnet_handle_any_t + * without loss of information. + */ +typedef struct { + __u64 cookie; +} lnet_handle_any_t; + +typedef lnet_handle_any_t lnet_handle_eq_t; +typedef lnet_handle_any_t lnet_handle_md_t; +typedef lnet_handle_any_t lnet_handle_me_t; + +#define LNET_WIRE_HANDLE_COOKIE_NONE (-1) + +/** + * Invalidate handle \a h. + */ +static inline void LNetInvalidateHandle(lnet_handle_any_t *h) +{ + h->cookie = LNET_WIRE_HANDLE_COOKIE_NONE; +} + +/** + * Compare handles \a h1 and \a h2. + * + * \return 1 if handles are equal, 0 if otherwise. + */ +static inline int LNetHandleIsEqual(lnet_handle_any_t h1, lnet_handle_any_t h2) +{ + return h1.cookie == h2.cookie; +} + +/** + * Check whether handle \a h is invalid. + * + * \return 1 if handle is invalid, 0 if valid. + */ +static inline int LNetHandleIsInvalid(lnet_handle_any_t h) +{ + return LNET_WIRE_HANDLE_COOKIE_NONE == h.cookie; +} + +/** + * Global process ID. 
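+ *
+ * Illustrative sketch only (not part of the imported header): the
+ * wildcard constants and handle helpers defined above might be
+ * combined as follows when preparing a match-any peer id and a
+ * not-yet-attached descriptor handle.
+ *
+ *	lnet_process_id_t any_peer;
+ *	lnet_handle_md_t  mdh;
+ *
+ *	any_peer.nid = LNET_NID_ANY;	(match any end-point address)
+ *	any_peer.pid = LNET_PID_ANY;	(match any process on that node)
+ *	LNetInvalidateHandle(&mdh);	(cookie set to "none" before use)
+ *	LASSERT(LNetHandleIsInvalid(mdh));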
+ */ +typedef struct { + /** node id */ + lnet_nid_t nid; + /** process id */ + lnet_pid_t pid; +} lnet_process_id_t; +/** @} lnet_addr */ + +/** \addtogroup lnet_me + * @{ */ + +/** + * Specifies whether the match entry or memory descriptor should be unlinked + * automatically (LNET_UNLINK) or not (LNET_RETAIN). + */ +typedef enum { + LNET_RETAIN = 0, + LNET_UNLINK +} lnet_unlink_t; + +/** + * Values of the type lnet_ins_pos_t are used to control where a new match + * entry is inserted. The value LNET_INS_BEFORE is used to insert the new + * entry before the current entry or before the head of the list. The value + * LNET_INS_AFTER is used to insert the new entry after the current entry + * or after the last item in the list. + */ +typedef enum { + /** insert ME before current position or head of the list */ + LNET_INS_BEFORE, + /** insert ME after current position or tail of the list */ + LNET_INS_AFTER, + /** attach ME at tail of local CPU partition ME list */ + LNET_INS_LOCAL +} lnet_ins_pos_t; + +/** @} lnet_me */ + +/** \addtogroup lnet_md + * @{ */ + +/** + * Defines the visible parts of a memory descriptor. Values of this type + * are used to initialize memory descriptors. + */ +typedef struct { + /** + * Specify the memory region associated with the memory descriptor. + * If the options field has: + * - LNET_MD_KIOV bit set: The start field points to the starting + * address of an array of lnet_kiov_t and the length field specifies + * the number of entries in the array. The length can't be bigger + * than LNET_MAX_IOV. The lnet_kiov_t is used to describe page-based + * fragments that are not necessarily mapped in virtual memory. + * - LNET_MD_IOVEC bit set: The start field points to the starting + * address of an array of struct iovec and the length field specifies + * the number of entries in the array. The length can't be bigger + * than LNET_MAX_IOV. The struct iovec is used to describe fragments + * that have virtual addresses. + * - Otherwise: The memory region is contiguous. The start field + * specifies the starting address for the memory region and the + * length field specifies its length. + * + * When the memory region is fragmented, all fragments but the first + * one must start on page boundary, and all but the last must end on + * page boundary. + */ + void *start; + unsigned int length; + /** + * Specifies the maximum number of operations that can be performed + * on the memory descriptor. An operation is any action that could + * possibly generate an event. In the usual case, the threshold value + * is decremented for each operation on the MD. When the threshold + * drops to zero, the MD becomes inactive and does not respond to + * operations. A threshold value of LNET_MD_THRESH_INF indicates that + * there is no bound on the number of operations that may be applied + * to a MD. + */ + int threshold; + /** + * Specifies the largest incoming request that the memory descriptor + * should respond to. When the unused portion of a MD (length - + * local offset) falls below this value, the MD becomes inactive and + * does not respond to further operations. This value is only used + * if the LNET_MD_MAX_SIZE option is set. + */ + int max_size; + /** + * Specifies the behavior of the memory descriptor. A bitwise OR + * of the following values can be used: + * - LNET_MD_OP_PUT: The LNet PUT operation is allowed on this MD. + * - LNET_MD_OP_GET: The LNet GET operation is allowed on this MD. 
+ * - LNET_MD_MANAGE_REMOTE: The offset used in accessing the memory + * region is provided by the incoming request. By default, the + * offset is maintained locally. When maintained locally, the + * offset is incremented by the length of the request so that + * the next operation (PUT or GET) will access the next part of + * the memory region. Note that only one offset variable exists + * per memory descriptor. If both PUT and GET operations are + * performed on a memory descriptor, the offset is updated each time. + * - LNET_MD_TRUNCATE: The length provided in the incoming request can + * be reduced to match the memory available in the region (determined + * by subtracting the offset from the length of the memory region). + * By default, if the length in the incoming operation is greater + * than the amount of memory available, the operation is rejected. + * - LNET_MD_ACK_DISABLE: An acknowledgment should not be sent for + * incoming PUT operations, even if requested. By default, + * acknowledgments are sent for PUT operations that request an + * acknowledgment. Acknowledgments are never sent for GET operations. + * The data sent in the REPLY serves as an implicit acknowledgment. + * - LNET_MD_KIOV: The start and length fields specify an array of + * lnet_kiov_t. + * - LNET_MD_IOVEC: The start and length fields specify an array of + * struct iovec. + * - LNET_MD_MAX_SIZE: The max_size field is valid. + * + * Note: + * - LNET_MD_KIOV or LNET_MD_IOVEC allows for a scatter/gather + * capability for memory descriptors. They can't be both set. + * - When LNET_MD_MAX_SIZE is set, the total length of the memory + * region (i.e. sum of all fragment lengths) must not be less than + * \a max_size. + */ + unsigned int options; + /** + * A user-specified value that is associated with the memory + * descriptor. The value does not need to be a pointer, but must fit + * in the space used by a pointer. This value is recorded in events + * associated with operations on this MD. + */ + void *user_ptr; + /** + * A handle for the event queue used to log the operations performed on + * the memory region. If this argument is a NULL handle (i.e. nullified + * by LNetInvalidateHandle()), operations performed on this memory + * descriptor are not logged. + */ + lnet_handle_eq_t eq_handle; +} lnet_md_t; + +/* Max Transfer Unit (minimum supported everywhere). + * CAVEAT EMPTOR, with multinet (i.e. routers forwarding between networks) + * these limits are system wide and not interface-local. */ +#define LNET_MTU_BITS 20 +#define LNET_MTU (1 << LNET_MTU_BITS) + +/** limit on the number of fragments in discontiguous MDs */ +#define LNET_MAX_IOV 256 + +/* Max payload size */ +# define LNET_MAX_PAYLOAD CONFIG_LNET_MAX_PAYLOAD +# if (LNET_MAX_PAYLOAD < LNET_MTU) +# error "LNET_MAX_PAYLOAD too small - error in configure --with-max-payload-mb" +# else +# if (LNET_MAX_PAYLOAD > (PAGE_SIZE * LNET_MAX_IOV)) +/* PAGE_SIZE is a constant: check with cpp! */ +# error "LNET_MAX_PAYLOAD too large - error in configure --with-max-payload-mb" +# endif +# endif + +/** + * Options for the MD structure. See lnet_md_t::options. + */ +#define LNET_MD_OP_PUT (1 << 0) +/** See lnet_md_t::options. */ +#define LNET_MD_OP_GET (1 << 1) +/** See lnet_md_t::options. */ +#define LNET_MD_MANAGE_REMOTE (1 << 2) +/* unused (1 << 3) */ +/** See lnet_md_t::options. */ +#define LNET_MD_TRUNCATE (1 << 4) +/** See lnet_md_t::options. */ +#define LNET_MD_ACK_DISABLE (1 << 5) +/** See lnet_md_t::options. 
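+ *
+ * Illustrative sketch only (not part of the imported header): a
+ * descriptor built over an array of struct iovec sets this bit and
+ * points lnet_md_t::start at the array, with lnet_md_t::length giving
+ * the number of entries rather than a byte count.
+ *
+ *	struct iovec iov[2];
+ *	lnet_md_t    md;
+ *
+ *	md.start     = iov;
+ *	md.length    = 2;		(iovec entries, not bytes)
+ *	md.threshold = 1;		(one operation, then inactive)
+ *	md.max_size  = 0;		(ignored: LNET_MD_MAX_SIZE not set)
+ *	md.options   = LNET_MD_OP_PUT | LNET_MD_IOVEC;
+ *	md.user_ptr  = NULL;
+ *	LNetInvalidateHandle(&md.eq_handle);	(events not logged)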
*/ +#define LNET_MD_IOVEC (1 << 6) +/** See lnet_md_t::options. */ +#define LNET_MD_MAX_SIZE (1 << 7) +/** See lnet_md_t::options. */ +#define LNET_MD_KIOV (1 << 8) + +/* For compatibility with Cray Portals */ +#define LNET_MD_PHYS 0 + +/** Infinite threshold on MD operations. See lnet_md_t::threshold */ +#define LNET_MD_THRESH_INF (-1) + +/* NB lustre portals uses struct iovec internally! */ +typedef struct iovec lnet_md_iovec_t; + +/** + * A page-based fragment of a MD. + */ +typedef struct { + /** Pointer to the page where the fragment resides */ + struct page *kiov_page; + /** Length in bytes of the fragment */ + unsigned int kiov_len; + /** + * Starting offset of the fragment within the page. Note that the + * end of the fragment must not pass the end of the page; i.e., + * kiov_len + kiov_offset <= PAGE_CACHE_SIZE. + */ + unsigned int kiov_offset; +} lnet_kiov_t; +/** @} lnet_md */ + +/** \addtogroup lnet_eq + * @{ */ + +/** + * Six types of events can be logged in an event queue. + */ +typedef enum { + /** An incoming GET operation has completed on the MD. */ + LNET_EVENT_GET = 1, + /** + * An incoming PUT operation has completed on the MD. The + * underlying layers will not alter the memory (on behalf of this + * operation) once this event has been logged. + */ + LNET_EVENT_PUT, + /** + * A REPLY operation has completed. This event is logged after the + * data (if any) from the REPLY has been written into the MD. + */ + LNET_EVENT_REPLY, + /** An acknowledgment has been received. */ + LNET_EVENT_ACK, + /** + * An outgoing send (PUT or GET) operation has completed. This event + * is logged after the entire buffer has been sent and it is safe for + * the caller to reuse the buffer. + * + * Note: + * - The LNET_EVENT_SEND doesn't guarantee message delivery. It can + * happen even when the message has not yet been put out on wire. + * - It's unsafe to assume that in an outgoing GET operation + * the LNET_EVENT_SEND event would happen before the + * LNET_EVENT_REPLY event. The same holds for LNET_EVENT_SEND and + * LNET_EVENT_ACK events in an outgoing PUT operation. + */ + LNET_EVENT_SEND, + /** + * A MD has been unlinked. Note that LNetMDUnlink() does not + * necessarily trigger an LNET_EVENT_UNLINK event. + * \see LNetMDUnlink + */ + LNET_EVENT_UNLINK, +} lnet_event_kind_t; + +#define LNET_SEQ_BASETYPE long +typedef unsigned LNET_SEQ_BASETYPE lnet_seq_t; +#define LNET_SEQ_GT(a, b) (((signed LNET_SEQ_BASETYPE)((a) - (b))) > 0) + +/** + * Information about an event on a MD. + */ +typedef struct { + /** The identifier (nid, pid) of the target. */ + lnet_process_id_t target; + /** The identifier (nid, pid) of the initiator. */ + lnet_process_id_t initiator; + /** + * The NID of the immediate sender. If the request has been forwarded + * by routers, this is the NID of the last hop; otherwise it's the + * same as the initiator. + */ + lnet_nid_t sender; + /** Indicates the type of the event. */ + lnet_event_kind_t type; + /** The portal table index specified in the request */ + unsigned int pt_index; + /** A copy of the match bits specified in the request. */ + __u64 match_bits; + /** The length (in bytes) specified in the request. */ + unsigned int rlength; + /** + * The length (in bytes) of the data that was manipulated by the + * operation. For truncated operations, the manipulated length will be + * the number of bytes specified by the MD (possibly with an offset, + * see lnet_md_t). For all other operations, the manipulated length + * will be the length of the requested operation, i.e. 
rlength. + */ + unsigned int mlength; + /** + * The handle to the MD associated with the event. The handle may be + * invalid if the MD has been unlinked. + */ + lnet_handle_md_t md_handle; + /** + * A snapshot of the state of the MD immediately after the event has + * been processed. In particular, the threshold field in md will + * reflect the value of the threshold after the operation occurred. + */ + lnet_md_t md; + /** + * 64 bits of out-of-band user data. Only valid for LNET_EVENT_PUT. + * \see LNetPut + */ + __u64 hdr_data; + /** + * Indicates the completion status of the operation. It's 0 for + * successful operations, otherwise it's an error code. + */ + int status; + /** + * Indicates whether the MD has been unlinked. Note that: + * - An event with unlinked set is the last event on the MD. + * - This field is also set for an explicit LNET_EVENT_UNLINK event. + * \see LNetMDUnlink + */ + int unlinked; + /** + * The displacement (in bytes) into the memory region that the + * operation used. The offset can be determined by the operation for + * a remote managed MD or by the local MD. + * \see lnet_md_t::options + */ + unsigned int offset; + /** + * The sequence number for this event. Sequence numbers are unique + * to each event. + */ + volatile lnet_seq_t sequence; +} lnet_event_t; + +/** + * Event queue handler function type. + * + * The EQ handler runs for each event that is deposited into the EQ. The + * handler is supplied with a pointer to the event that triggered the + * handler invocation. + * + * The handler must not block, must be reentrant, and must not call any LNet + * API functions. It should return as quickly as possible. + */ +typedef void (*lnet_eq_handler_t)(lnet_event_t *event); +#define LNET_EQ_HANDLER_NONE NULL +/** @} lnet_eq */ + +/** \addtogroup lnet_data + * @{ */ + +/** + * Specify whether an acknowledgment should be sent by target when the PUT + * operation completes (i.e., when the data has been written to a MD of the + * target process). + * + * \see lnet_md_t::options for the discussion on LNET_MD_ACK_DISABLE by which + * acknowledgments can be disabled for a MD. + */ +typedef enum { + /** Request an acknowledgment */ + LNET_ACK_REQ, + /** Request that no acknowledgment should be generated. */ + LNET_NOACK_REQ +} lnet_ack_req_t; +/** @} lnet_data */ + +/** @} lnet */ +#endif diff --git a/kernel/drivers/staging/lustre/lnet/Kconfig b/kernel/drivers/staging/lustre/lnet/Kconfig new file mode 100644 index 000000000..00850eeb6 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/Kconfig @@ -0,0 +1,40 @@ +config LNET + tristate "Lustre networking subsystem" + depends on LUSTRE_FS + +config LNET_MAX_PAYLOAD + int "Lustre lnet max transfer payload (default 2MB)" + depends on LUSTRE_FS + default "1048576" + help + This option defines the maximum size of payload in bytes that lnet + can put into its transport. + + If unsure, use default. + +config LNET_SELFTEST + tristate "Lustre networking self testing" + depends on LNET + help + Choose Y here if you want to do lnet self testing. To compile this + as a module, choose M here: the module will be called lnet_selftest. + + To compile this as a kernel modules, choose M here and it will be + called lnet_selftest. + + If unsure, say N. + + See also http://wiki.lustre.org/ + +config LNET_XPRT_IB + tristate "LNET infiniband support" + depends on LNET && INFINIBAND && INFINIBAND_ADDR_TRANS + default LNET && INFINIBAND + help + This option allows the LNET users to use infiniband as an + RDMA-enabled transport. 
+ + To compile this as a kernel module, choose M here and it will be + called ko2iblnd. + + If unsure, say N. diff --git a/kernel/drivers/staging/lustre/lnet/Makefile b/kernel/drivers/staging/lustre/lnet/Makefile new file mode 100644 index 000000000..f6f03e304 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_LNET) += lnet/ klnds/ selftest/ diff --git a/kernel/drivers/staging/lustre/lnet/klnds/Makefile b/kernel/drivers/staging/lustre/lnet/klnds/Makefile new file mode 100644 index 000000000..c23e4f67f --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/klnds/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_LNET) += o2iblnd/ socklnd/ diff --git a/kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/Makefile b/kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/Makefile new file mode 100644 index 000000000..e0a7aa72b --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_LNET_XPRT_IB) += ko2iblnd.o +ko2iblnd-y := o2iblnd.o o2iblnd_cb.o o2iblnd_modparams.o diff --git a/kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c b/kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c new file mode 100644 index 000000000..3bad441de --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c @@ -0,0 +1,3118 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/klnds/o2iblnd/o2iblnd.c + * + * Author: Eric Barton + */ + +#include "o2iblnd.h" +#include + +static lnd_t the_o2iblnd = { + .lnd_type = O2IBLND, + .lnd_startup = kiblnd_startup, + .lnd_shutdown = kiblnd_shutdown, + .lnd_ctl = kiblnd_ctl, + .lnd_query = kiblnd_query, + .lnd_send = kiblnd_send, + .lnd_recv = kiblnd_recv, +}; + +kib_data_t kiblnd_data; + +static __u32 kiblnd_cksum(void *ptr, int nob) +{ + char *c = ptr; + __u32 sum = 0; + + while (nob-- > 0) + sum = ((sum << 1) | (sum >> 31)) + *c++; + + /* ensure I don't return 0 (== no checksum) */ + return (sum == 0) ? 
1 : sum; +} + +static char *kiblnd_msgtype2str(int type) +{ + switch (type) { + case IBLND_MSG_CONNREQ: + return "CONNREQ"; + + case IBLND_MSG_CONNACK: + return "CONNACK"; + + case IBLND_MSG_NOOP: + return "NOOP"; + + case IBLND_MSG_IMMEDIATE: + return "IMMEDIATE"; + + case IBLND_MSG_PUT_REQ: + return "PUT_REQ"; + + case IBLND_MSG_PUT_NAK: + return "PUT_NAK"; + + case IBLND_MSG_PUT_ACK: + return "PUT_ACK"; + + case IBLND_MSG_PUT_DONE: + return "PUT_DONE"; + + case IBLND_MSG_GET_REQ: + return "GET_REQ"; + + case IBLND_MSG_GET_DONE: + return "GET_DONE"; + + default: + return "???"; + } +} + +static int kiblnd_msgtype2size(int type) +{ + const int hdr_size = offsetof(kib_msg_t, ibm_u); + + switch (type) { + case IBLND_MSG_CONNREQ: + case IBLND_MSG_CONNACK: + return hdr_size + sizeof(kib_connparams_t); + + case IBLND_MSG_NOOP: + return hdr_size; + + case IBLND_MSG_IMMEDIATE: + return offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0]); + + case IBLND_MSG_PUT_REQ: + return hdr_size + sizeof(kib_putreq_msg_t); + + case IBLND_MSG_PUT_ACK: + return hdr_size + sizeof(kib_putack_msg_t); + + case IBLND_MSG_GET_REQ: + return hdr_size + sizeof(kib_get_msg_t); + + case IBLND_MSG_PUT_NAK: + case IBLND_MSG_PUT_DONE: + case IBLND_MSG_GET_DONE: + return hdr_size + sizeof(kib_completion_msg_t); + default: + return -1; + } +} + +static int kiblnd_unpack_rd(kib_msg_t *msg, int flip) +{ + kib_rdma_desc_t *rd; + int nob; + int n; + int i; + + LASSERT(msg->ibm_type == IBLND_MSG_GET_REQ || + msg->ibm_type == IBLND_MSG_PUT_ACK); + + rd = msg->ibm_type == IBLND_MSG_GET_REQ ? + &msg->ibm_u.get.ibgm_rd : + &msg->ibm_u.putack.ibpam_rd; + + if (flip) { + __swab32s(&rd->rd_key); + __swab32s(&rd->rd_nfrags); + } + + n = rd->rd_nfrags; + + if (n <= 0 || n > IBLND_MAX_RDMA_FRAGS) { + CERROR("Bad nfrags: %d, should be 0 < n <= %d\n", + n, IBLND_MAX_RDMA_FRAGS); + return 1; + } + + nob = offsetof(kib_msg_t, ibm_u) + + kiblnd_rd_msg_size(rd, msg->ibm_type, n); + + if (msg->ibm_nob < nob) { + CERROR("Short %s: %d(%d)\n", + kiblnd_msgtype2str(msg->ibm_type), msg->ibm_nob, nob); + return 1; + } + + if (!flip) + return 0; + + for (i = 0; i < n; i++) { + __swab32s(&rd->rd_frags[i].rf_nob); + __swab64s(&rd->rd_frags[i].rf_addr); + } + + return 0; +} + +void kiblnd_pack_msg(lnet_ni_t *ni, kib_msg_t *msg, int version, + int credits, lnet_nid_t dstnid, __u64 dststamp) +{ + kib_net_t *net = ni->ni_data; + + /* CAVEAT EMPTOR! all message fields not set here should have been + * initialised previously. */ + msg->ibm_magic = IBLND_MSG_MAGIC; + msg->ibm_version = version; + /* ibm_type */ + msg->ibm_credits = credits; + /* ibm_nob */ + msg->ibm_cksum = 0; + msg->ibm_srcnid = ni->ni_nid; + msg->ibm_srcstamp = net->ibn_incarnation; + msg->ibm_dstnid = dstnid; + msg->ibm_dststamp = dststamp; + + if (*kiblnd_tunables.kib_cksum) { + /* NB ibm_cksum zero while computing cksum */ + msg->ibm_cksum = kiblnd_cksum(msg, msg->ibm_nob); + } +} + +int kiblnd_unpack_msg(kib_msg_t *msg, int nob) +{ + const int hdr_size = offsetof(kib_msg_t, ibm_u); + __u32 msg_cksum; + __u16 version; + int msg_nob; + int flip; + + /* 6 bytes are enough to have received magic + version */ + if (nob < 6) { + CERROR("Short message: %d\n", nob); + return -EPROTO; + } + + if (msg->ibm_magic == IBLND_MSG_MAGIC) { + flip = 0; + } else if (msg->ibm_magic == __swab32(IBLND_MSG_MAGIC)) { + flip = 1; + } else { + CERROR("Bad magic: %08x\n", msg->ibm_magic); + return -EPROTO; + } + + version = flip ? 
__swab16(msg->ibm_version) : msg->ibm_version; + if (version != IBLND_MSG_VERSION && + version != IBLND_MSG_VERSION_1) { + CERROR("Bad version: %x\n", version); + return -EPROTO; + } + + if (nob < hdr_size) { + CERROR("Short message: %d\n", nob); + return -EPROTO; + } + + msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob; + if (msg_nob > nob) { + CERROR("Short message: got %d, wanted %d\n", nob, msg_nob); + return -EPROTO; + } + + /* checksum must be computed with ibm_cksum zero and BEFORE anything + * gets flipped */ + msg_cksum = flip ? __swab32(msg->ibm_cksum) : msg->ibm_cksum; + msg->ibm_cksum = 0; + if (msg_cksum != 0 && + msg_cksum != kiblnd_cksum(msg, msg_nob)) { + CERROR("Bad checksum\n"); + return -EPROTO; + } + + msg->ibm_cksum = msg_cksum; + + if (flip) { + /* leave magic unflipped as a clue to peer endianness */ + msg->ibm_version = version; + CLASSERT(sizeof(msg->ibm_type) == 1); + CLASSERT(sizeof(msg->ibm_credits) == 1); + msg->ibm_nob = msg_nob; + __swab64s(&msg->ibm_srcnid); + __swab64s(&msg->ibm_srcstamp); + __swab64s(&msg->ibm_dstnid); + __swab64s(&msg->ibm_dststamp); + } + + if (msg->ibm_srcnid == LNET_NID_ANY) { + CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid)); + return -EPROTO; + } + + if (msg_nob < kiblnd_msgtype2size(msg->ibm_type)) { + CERROR("Short %s: %d(%d)\n", kiblnd_msgtype2str(msg->ibm_type), + msg_nob, kiblnd_msgtype2size(msg->ibm_type)); + return -EPROTO; + } + + switch (msg->ibm_type) { + default: + CERROR("Unknown message type %x\n", msg->ibm_type); + return -EPROTO; + + case IBLND_MSG_NOOP: + case IBLND_MSG_IMMEDIATE: + case IBLND_MSG_PUT_REQ: + break; + + case IBLND_MSG_PUT_ACK: + case IBLND_MSG_GET_REQ: + if (kiblnd_unpack_rd(msg, flip)) + return -EPROTO; + break; + + case IBLND_MSG_PUT_NAK: + case IBLND_MSG_PUT_DONE: + case IBLND_MSG_GET_DONE: + if (flip) + __swab32s(&msg->ibm_u.completion.ibcm_status); + break; + + case IBLND_MSG_CONNREQ: + case IBLND_MSG_CONNACK: + if (flip) { + __swab16s(&msg->ibm_u.connparams.ibcp_queue_depth); + __swab16s(&msg->ibm_u.connparams.ibcp_max_frags); + __swab32s(&msg->ibm_u.connparams.ibcp_max_msg_size); + } + break; + } + return 0; +} + +int kiblnd_create_peer(lnet_ni_t *ni, kib_peer_t **peerp, lnet_nid_t nid) +{ + kib_peer_t *peer; + kib_net_t *net = ni->ni_data; + int cpt = lnet_cpt_of_nid(nid); + unsigned long flags; + + LASSERT(net != NULL); + LASSERT(nid != LNET_NID_ANY); + + LIBCFS_CPT_ALLOC(peer, lnet_cpt_table(), cpt, sizeof(*peer)); + if (peer == NULL) { + CERROR("Cannot allocate peer\n"); + return -ENOMEM; + } + + memset(peer, 0, sizeof(*peer)); /* zero flags etc */ + + peer->ibp_ni = ni; + peer->ibp_nid = nid; + peer->ibp_error = 0; + peer->ibp_last_alive = 0; + atomic_set(&peer->ibp_refcount, 1); /* 1 ref for caller */ + + INIT_LIST_HEAD(&peer->ibp_list); /* not in the peer table yet */ + INIT_LIST_HEAD(&peer->ibp_conns); + INIT_LIST_HEAD(&peer->ibp_tx_queue); + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + /* always called with a ref on ni, which prevents ni being shutdown */ + LASSERT(net->ibn_shutdown == 0); + + /* npeers only grows with the global lock held */ + atomic_inc(&net->ibn_npeers); + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + *peerp = peer; + return 0; +} + +void kiblnd_destroy_peer(kib_peer_t *peer) +{ + kib_net_t *net = peer->ibp_ni->ni_data; + + LASSERT(net != NULL); + LASSERT(atomic_read(&peer->ibp_refcount) == 0); + LASSERT(!kiblnd_peer_active(peer)); + LASSERT(peer->ibp_connecting == 0); + LASSERT(peer->ibp_accepting == 0); + 
LASSERT(list_empty(&peer->ibp_conns)); + LASSERT(list_empty(&peer->ibp_tx_queue)); + + LIBCFS_FREE(peer, sizeof(*peer)); + + /* NB a peer's connections keep a reference on their peer until + * they are destroyed, so we can be assured that _all_ state to do + * with this peer has been cleaned up when its refcount drops to + * zero. */ + atomic_dec(&net->ibn_npeers); +} + +kib_peer_t *kiblnd_find_peer_locked(lnet_nid_t nid) +{ + /* the caller is responsible for accounting the additional reference + * that this creates */ + struct list_head *peer_list = kiblnd_nid2peerlist(nid); + struct list_head *tmp; + kib_peer_t *peer; + + list_for_each(tmp, peer_list) { + + peer = list_entry(tmp, kib_peer_t, ibp_list); + + LASSERT(peer->ibp_connecting > 0 || /* creating conns */ + peer->ibp_accepting > 0 || + !list_empty(&peer->ibp_conns)); /* active conn */ + + if (peer->ibp_nid != nid) + continue; + + CDEBUG(D_NET, "got peer [%p] -> %s (%d) version: %x\n", + peer, libcfs_nid2str(nid), + atomic_read(&peer->ibp_refcount), + peer->ibp_version); + return peer; + } + return NULL; +} + +void kiblnd_unlink_peer_locked(kib_peer_t *peer) +{ + LASSERT(list_empty(&peer->ibp_conns)); + + LASSERT(kiblnd_peer_active(peer)); + list_del_init(&peer->ibp_list); + /* lose peerlist's ref */ + kiblnd_peer_decref(peer); +} + +static int kiblnd_get_peer_info(lnet_ni_t *ni, int index, + lnet_nid_t *nidp, int *count) +{ + kib_peer_t *peer; + struct list_head *ptmp; + int i; + unsigned long flags; + + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) { + + list_for_each(ptmp, &kiblnd_data.kib_peers[i]) { + + peer = list_entry(ptmp, kib_peer_t, ibp_list); + LASSERT(peer->ibp_connecting > 0 || + peer->ibp_accepting > 0 || + !list_empty(&peer->ibp_conns)); + + if (peer->ibp_ni != ni) + continue; + + if (index-- > 0) + continue; + + *nidp = peer->ibp_nid; + *count = atomic_read(&peer->ibp_refcount); + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, + flags); + return 0; + } + } + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + return -ENOENT; +} + +static void kiblnd_del_peer_locked(kib_peer_t *peer) +{ + struct list_head *ctmp; + struct list_head *cnxt; + kib_conn_t *conn; + + if (list_empty(&peer->ibp_conns)) { + kiblnd_unlink_peer_locked(peer); + } else { + list_for_each_safe(ctmp, cnxt, &peer->ibp_conns) { + conn = list_entry(ctmp, kib_conn_t, ibc_list); + + kiblnd_close_conn_locked(conn, 0); + } + /* NB closing peer's last conn unlinked it. */ + } + /* NB peer now unlinked; might even be freed if the peer table had the + * last ref on it. 
*/ +} + +static int kiblnd_del_peer(lnet_ni_t *ni, lnet_nid_t nid) +{ + LIST_HEAD(zombies); + struct list_head *ptmp; + struct list_head *pnxt; + kib_peer_t *peer; + int lo; + int hi; + int i; + unsigned long flags; + int rc = -ENOENT; + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + if (nid != LNET_NID_ANY) { + lo = hi = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers; + } else { + lo = 0; + hi = kiblnd_data.kib_peer_hash_size - 1; + } + + for (i = lo; i <= hi; i++) { + list_for_each_safe(ptmp, pnxt, &kiblnd_data.kib_peers[i]) { + peer = list_entry(ptmp, kib_peer_t, ibp_list); + LASSERT(peer->ibp_connecting > 0 || + peer->ibp_accepting > 0 || + !list_empty(&peer->ibp_conns)); + + if (peer->ibp_ni != ni) + continue; + + if (!(nid == LNET_NID_ANY || peer->ibp_nid == nid)) + continue; + + if (!list_empty(&peer->ibp_tx_queue)) { + LASSERT(list_empty(&peer->ibp_conns)); + + list_splice_init(&peer->ibp_tx_queue, + &zombies); + } + + kiblnd_del_peer_locked(peer); + rc = 0; /* matched something */ + } + } + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + kiblnd_txlist_done(ni, &zombies, -EIO); + + return rc; +} + +static kib_conn_t *kiblnd_get_conn_by_idx(lnet_ni_t *ni, int index) +{ + kib_peer_t *peer; + struct list_head *ptmp; + kib_conn_t *conn; + struct list_head *ctmp; + int i; + unsigned long flags; + + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) { + list_for_each(ptmp, &kiblnd_data.kib_peers[i]) { + + peer = list_entry(ptmp, kib_peer_t, ibp_list); + LASSERT(peer->ibp_connecting > 0 || + peer->ibp_accepting > 0 || + !list_empty(&peer->ibp_conns)); + + if (peer->ibp_ni != ni) + continue; + + list_for_each(ctmp, &peer->ibp_conns) { + if (index-- > 0) + continue; + + conn = list_entry(ctmp, kib_conn_t, + ibc_list); + kiblnd_conn_addref(conn); + read_unlock_irqrestore( + &kiblnd_data.kib_global_lock, + flags); + return conn; + } + } + } + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + return NULL; +} + +int kiblnd_translate_mtu(int value) +{ + switch (value) { + default: + return -1; + case 0: + return 0; + case 256: + return IB_MTU_256; + case 512: + return IB_MTU_512; + case 1024: + return IB_MTU_1024; + case 2048: + return IB_MTU_2048; + case 4096: + return IB_MTU_4096; + } +} + +static void kiblnd_setup_mtu_locked(struct rdma_cm_id *cmid) +{ + int mtu; + + /* XXX There is no path record for iWARP, set by netdev->change_mtu? */ + if (cmid->route.path_rec == NULL) + return; + + mtu = kiblnd_translate_mtu(*kiblnd_tunables.kib_ib_mtu); + LASSERT(mtu >= 0); + if (mtu != 0) + cmid->route.path_rec->mtu = mtu; +} + +static int kiblnd_get_completion_vector(kib_conn_t *conn, int cpt) +{ + cpumask_t *mask; + int vectors; + int off; + int i; + lnet_nid_t nid = conn->ibc_peer->ibp_nid; + + vectors = conn->ibc_cmid->device->num_comp_vectors; + if (vectors <= 1) + return 0; + + mask = cfs_cpt_cpumask(lnet_cpt_table(), cpt); + if (mask == NULL) + return 0; + + /* hash NID to CPU id in this partition... */ + off = do_div(nid, cpumask_weight(mask)); + for_each_cpu(i, mask) { + if (off-- == 0) + return i % vectors; + } + + LBUG(); + return 1; +} + +kib_conn_t *kiblnd_create_conn(kib_peer_t *peer, struct rdma_cm_id *cmid, + int state, int version) +{ + /* CAVEAT EMPTOR: + * If the new conn is created successfully it takes over the caller's + * ref on 'peer'. It also "owns" 'cmid' and destroys it when it itself + * is destroyed. 
On failure, the caller's ref on 'peer' remains and + * she must dispose of 'cmid'. (Actually I'd block forever if I tried + * to destroy 'cmid' here since I'm called from the CM which still has + * its ref on 'cmid'). */ + rwlock_t *glock = &kiblnd_data.kib_global_lock; + kib_net_t *net = peer->ibp_ni->ni_data; + kib_dev_t *dev; + struct ib_qp_init_attr *init_qp_attr; + struct kib_sched_info *sched; + kib_conn_t *conn; + struct ib_cq *cq; + unsigned long flags; + int cpt; + int rc; + int i; + + LASSERT(net != NULL); + LASSERT(!in_interrupt()); + + dev = net->ibn_dev; + + cpt = lnet_cpt_of_nid(peer->ibp_nid); + sched = kiblnd_data.kib_scheds[cpt]; + + LASSERT(sched->ibs_nthreads > 0); + + LIBCFS_CPT_ALLOC(init_qp_attr, lnet_cpt_table(), cpt, + sizeof(*init_qp_attr)); + if (init_qp_attr == NULL) { + CERROR("Can't allocate qp_attr for %s\n", + libcfs_nid2str(peer->ibp_nid)); + goto failed_0; + } + + LIBCFS_CPT_ALLOC(conn, lnet_cpt_table(), cpt, sizeof(*conn)); + if (conn == NULL) { + CERROR("Can't allocate connection for %s\n", + libcfs_nid2str(peer->ibp_nid)); + goto failed_1; + } + + conn->ibc_state = IBLND_CONN_INIT; + conn->ibc_version = version; + conn->ibc_peer = peer; /* I take the caller's ref */ + cmid->context = conn; /* for future CM callbacks */ + conn->ibc_cmid = cmid; + + INIT_LIST_HEAD(&conn->ibc_early_rxs); + INIT_LIST_HEAD(&conn->ibc_tx_noops); + INIT_LIST_HEAD(&conn->ibc_tx_queue); + INIT_LIST_HEAD(&conn->ibc_tx_queue_rsrvd); + INIT_LIST_HEAD(&conn->ibc_tx_queue_nocred); + INIT_LIST_HEAD(&conn->ibc_active_txs); + spin_lock_init(&conn->ibc_lock); + + LIBCFS_CPT_ALLOC(conn->ibc_connvars, lnet_cpt_table(), cpt, + sizeof(*conn->ibc_connvars)); + if (conn->ibc_connvars == NULL) { + CERROR("Can't allocate in-progress connection state\n"); + goto failed_2; + } + + write_lock_irqsave(glock, flags); + if (dev->ibd_failover) { + write_unlock_irqrestore(glock, flags); + CERROR("%s: failover in progress\n", dev->ibd_ifname); + goto failed_2; + } + + if (dev->ibd_hdev->ibh_ibdev != cmid->device) { + /* wakeup failover thread and teardown connection */ + if (kiblnd_dev_can_failover(dev)) { + list_add_tail(&dev->ibd_fail_list, + &kiblnd_data.kib_failed_devs); + wake_up(&kiblnd_data.kib_failover_waitq); + } + + write_unlock_irqrestore(glock, flags); + CERROR("cmid HCA(%s), kib_dev(%s) need failover\n", + cmid->device->name, dev->ibd_ifname); + goto failed_2; + } + + kiblnd_hdev_addref_locked(dev->ibd_hdev); + conn->ibc_hdev = dev->ibd_hdev; + + kiblnd_setup_mtu_locked(cmid); + + write_unlock_irqrestore(glock, flags); + + LIBCFS_CPT_ALLOC(conn->ibc_rxs, lnet_cpt_table(), cpt, + IBLND_RX_MSGS(version) * sizeof(kib_rx_t)); + if (conn->ibc_rxs == NULL) { + CERROR("Cannot allocate RX buffers\n"); + goto failed_2; + } + + rc = kiblnd_alloc_pages(&conn->ibc_rx_pages, cpt, + IBLND_RX_MSG_PAGES(version)); + if (rc != 0) + goto failed_2; + + kiblnd_map_rx_descs(conn); + + cq = ib_create_cq(cmid->device, + kiblnd_cq_completion, kiblnd_cq_event, conn, + IBLND_CQ_ENTRIES(version), + kiblnd_get_completion_vector(conn, cpt)); + if (IS_ERR(cq)) { + CERROR("Can't create CQ: %ld, cqe: %d\n", + PTR_ERR(cq), IBLND_CQ_ENTRIES(version)); + goto failed_2; + } + + conn->ibc_cq = cq; + + rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + if (rc != 0) { + CERROR("Can't request completion notificiation: %d\n", rc); + goto failed_2; + } + + init_qp_attr->event_handler = kiblnd_qp_event; + init_qp_attr->qp_context = conn; + init_qp_attr->cap.max_send_wr = IBLND_SEND_WRS(version); + init_qp_attr->cap.max_recv_wr = 
IBLND_RECV_WRS(version); + init_qp_attr->cap.max_send_sge = 1; + init_qp_attr->cap.max_recv_sge = 1; + init_qp_attr->sq_sig_type = IB_SIGNAL_REQ_WR; + init_qp_attr->qp_type = IB_QPT_RC; + init_qp_attr->send_cq = cq; + init_qp_attr->recv_cq = cq; + + conn->ibc_sched = sched; + + rc = rdma_create_qp(cmid, conn->ibc_hdev->ibh_pd, init_qp_attr); + if (rc != 0) { + CERROR("Can't create QP: %d, send_wr: %d, recv_wr: %d\n", + rc, init_qp_attr->cap.max_send_wr, + init_qp_attr->cap.max_recv_wr); + goto failed_2; + } + + LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr)); + + /* 1 ref for caller and each rxmsg */ + atomic_set(&conn->ibc_refcount, 1 + IBLND_RX_MSGS(version)); + conn->ibc_nrx = IBLND_RX_MSGS(version); + + /* post receives */ + for (i = 0; i < IBLND_RX_MSGS(version); i++) { + rc = kiblnd_post_rx(&conn->ibc_rxs[i], + IBLND_POSTRX_NO_CREDIT); + if (rc != 0) { + CERROR("Can't post rxmsg: %d\n", rc); + + /* Make posted receives complete */ + kiblnd_abort_receives(conn); + + /* correct # of posted buffers + * NB locking needed now I'm racing with completion */ + spin_lock_irqsave(&sched->ibs_lock, flags); + conn->ibc_nrx -= IBLND_RX_MSGS(version) - i; + spin_unlock_irqrestore(&sched->ibs_lock, flags); + + /* cmid will be destroyed by CM(ofed) after cm_callback + * returned, so we can't refer it anymore + * (by kiblnd_connd()->kiblnd_destroy_conn) */ + rdma_destroy_qp(conn->ibc_cmid); + conn->ibc_cmid = NULL; + + /* Drop my own and unused rxbuffer refcounts */ + while (i++ <= IBLND_RX_MSGS(version)) + kiblnd_conn_decref(conn); + + return NULL; + } + } + + /* Init successful! */ + LASSERT(state == IBLND_CONN_ACTIVE_CONNECT || + state == IBLND_CONN_PASSIVE_WAIT); + conn->ibc_state = state; + + /* 1 more conn */ + atomic_inc(&net->ibn_nconns); + return conn; + + failed_2: + kiblnd_destroy_conn(conn); + failed_1: + LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr)); + failed_0: + return NULL; +} + +void kiblnd_destroy_conn(kib_conn_t *conn) +{ + struct rdma_cm_id *cmid = conn->ibc_cmid; + kib_peer_t *peer = conn->ibc_peer; + int rc; + + LASSERT(!in_interrupt()); + LASSERT(atomic_read(&conn->ibc_refcount) == 0); + LASSERT(list_empty(&conn->ibc_early_rxs)); + LASSERT(list_empty(&conn->ibc_tx_noops)); + LASSERT(list_empty(&conn->ibc_tx_queue)); + LASSERT(list_empty(&conn->ibc_tx_queue_rsrvd)); + LASSERT(list_empty(&conn->ibc_tx_queue_nocred)); + LASSERT(list_empty(&conn->ibc_active_txs)); + LASSERT(conn->ibc_noops_posted == 0); + LASSERT(conn->ibc_nsends_posted == 0); + + switch (conn->ibc_state) { + default: + /* conn must be completely disengaged from the network */ + LBUG(); + + case IBLND_CONN_DISCONNECTED: + /* connvars should have been freed already */ + LASSERT(conn->ibc_connvars == NULL); + break; + + case IBLND_CONN_INIT: + break; + } + + /* conn->ibc_cmid might be destroyed by CM already */ + if (cmid != NULL && cmid->qp != NULL) + rdma_destroy_qp(cmid); + + if (conn->ibc_cq != NULL) { + rc = ib_destroy_cq(conn->ibc_cq); + if (rc != 0) + CWARN("Error destroying CQ: %d\n", rc); + } + + if (conn->ibc_rx_pages != NULL) + kiblnd_unmap_rx_descs(conn); + + if (conn->ibc_rxs != NULL) { + LIBCFS_FREE(conn->ibc_rxs, + IBLND_RX_MSGS(conn->ibc_version) + * sizeof(kib_rx_t)); + } + + if (conn->ibc_connvars != NULL) + LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars)); + + if (conn->ibc_hdev != NULL) + kiblnd_hdev_decref(conn->ibc_hdev); + + /* See CAVEAT EMPTOR above in kiblnd_create_conn */ + if (conn->ibc_state != IBLND_CONN_INIT) { + kib_net_t *net = peer->ibp_ni->ni_data; + + 
kiblnd_peer_decref(peer); + rdma_destroy_id(cmid); + atomic_dec(&net->ibn_nconns); + } + + LIBCFS_FREE(conn, sizeof(*conn)); +} + +int kiblnd_close_peer_conns_locked(kib_peer_t *peer, int why) +{ + kib_conn_t *conn; + struct list_head *ctmp; + struct list_head *cnxt; + int count = 0; + + list_for_each_safe(ctmp, cnxt, &peer->ibp_conns) { + conn = list_entry(ctmp, kib_conn_t, ibc_list); + + CDEBUG(D_NET, "Closing conn -> %s, version: %x, reason: %d\n", + libcfs_nid2str(peer->ibp_nid), + conn->ibc_version, why); + + kiblnd_close_conn_locked(conn, why); + count++; + } + + return count; +} + +int kiblnd_close_stale_conns_locked(kib_peer_t *peer, + int version, __u64 incarnation) +{ + kib_conn_t *conn; + struct list_head *ctmp; + struct list_head *cnxt; + int count = 0; + + list_for_each_safe(ctmp, cnxt, &peer->ibp_conns) { + conn = list_entry(ctmp, kib_conn_t, ibc_list); + + if (conn->ibc_version == version && + conn->ibc_incarnation == incarnation) + continue; + + CDEBUG(D_NET, + "Closing stale conn -> %s version: %x, incarnation:%#llx(%x, %#llx)\n", + libcfs_nid2str(peer->ibp_nid), + conn->ibc_version, conn->ibc_incarnation, + version, incarnation); + + kiblnd_close_conn_locked(conn, -ESTALE); + count++; + } + + return count; +} + +static int kiblnd_close_matching_conns(lnet_ni_t *ni, lnet_nid_t nid) +{ + kib_peer_t *peer; + struct list_head *ptmp; + struct list_head *pnxt; + int lo; + int hi; + int i; + unsigned long flags; + int count = 0; + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + if (nid != LNET_NID_ANY) + lo = hi = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers; + else { + lo = 0; + hi = kiblnd_data.kib_peer_hash_size - 1; + } + + for (i = lo; i <= hi; i++) { + list_for_each_safe(ptmp, pnxt, &kiblnd_data.kib_peers[i]) { + + peer = list_entry(ptmp, kib_peer_t, ibp_list); + LASSERT(peer->ibp_connecting > 0 || + peer->ibp_accepting > 0 || + !list_empty(&peer->ibp_conns)); + + if (peer->ibp_ni != ni) + continue; + + if (!(nid == LNET_NID_ANY || nid == peer->ibp_nid)) + continue; + + count += kiblnd_close_peer_conns_locked(peer, 0); + } + } + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + /* wildcards always succeed */ + if (nid == LNET_NID_ANY) + return 0; + + return (count == 0) ? 
-ENOENT : 0; +} + +int kiblnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg) +{ + struct libcfs_ioctl_data *data = arg; + int rc = -EINVAL; + + switch (cmd) { + case IOC_LIBCFS_GET_PEER: { + lnet_nid_t nid = 0; + int count = 0; + + rc = kiblnd_get_peer_info(ni, data->ioc_count, + &nid, &count); + data->ioc_nid = nid; + data->ioc_count = count; + break; + } + + case IOC_LIBCFS_DEL_PEER: { + rc = kiblnd_del_peer(ni, data->ioc_nid); + break; + } + case IOC_LIBCFS_GET_CONN: { + kib_conn_t *conn; + + rc = 0; + conn = kiblnd_get_conn_by_idx(ni, data->ioc_count); + if (conn == NULL) { + rc = -ENOENT; + break; + } + + LASSERT(conn->ibc_cmid != NULL); + data->ioc_nid = conn->ibc_peer->ibp_nid; + if (conn->ibc_cmid->route.path_rec == NULL) + data->ioc_u32[0] = 0; /* iWarp has no path MTU */ + else + data->ioc_u32[0] = + ib_mtu_enum_to_int(conn->ibc_cmid->route.path_rec->mtu); + kiblnd_conn_decref(conn); + break; + } + case IOC_LIBCFS_CLOSE_CONNECTION: { + rc = kiblnd_close_matching_conns(ni, data->ioc_nid); + break; + } + + default: + break; + } + + return rc; +} + +void kiblnd_query(lnet_ni_t *ni, lnet_nid_t nid, unsigned long *when) +{ + unsigned long last_alive = 0; + unsigned long now = cfs_time_current(); + rwlock_t *glock = &kiblnd_data.kib_global_lock; + kib_peer_t *peer; + unsigned long flags; + + read_lock_irqsave(glock, flags); + + peer = kiblnd_find_peer_locked(nid); + if (peer != NULL) { + LASSERT(peer->ibp_connecting > 0 || /* creating conns */ + peer->ibp_accepting > 0 || + !list_empty(&peer->ibp_conns)); /* active conn */ + last_alive = peer->ibp_last_alive; + } + + read_unlock_irqrestore(glock, flags); + + if (last_alive != 0) + *when = last_alive; + + /* peer is not persistent in hash, trigger peer creation + * and connection establishment with a NULL tx */ + if (peer == NULL) + kiblnd_launch_tx(ni, NULL, nid); + + CDEBUG(D_NET, "Peer %s %p, alive %ld secs ago\n", + libcfs_nid2str(nid), peer, + last_alive ? 
cfs_duration_sec(now - last_alive) : -1); +} + +void kiblnd_free_pages(kib_pages_t *p) +{ + int npages = p->ibp_npages; + int i; + + for (i = 0; i < npages; i++) { + if (p->ibp_pages[i] != NULL) + __free_page(p->ibp_pages[i]); + } + + LIBCFS_FREE(p, offsetof(kib_pages_t, ibp_pages[npages])); +} + +int kiblnd_alloc_pages(kib_pages_t **pp, int cpt, int npages) +{ + kib_pages_t *p; + int i; + + LIBCFS_CPT_ALLOC(p, lnet_cpt_table(), cpt, + offsetof(kib_pages_t, ibp_pages[npages])); + if (p == NULL) { + CERROR("Can't allocate descriptor for %d pages\n", npages); + return -ENOMEM; + } + + memset(p, 0, offsetof(kib_pages_t, ibp_pages[npages])); + p->ibp_npages = npages; + + for (i = 0; i < npages; i++) { + p->ibp_pages[i] = alloc_pages_node( + cfs_cpt_spread_node(lnet_cpt_table(), cpt), + GFP_NOFS, 0); + if (p->ibp_pages[i] == NULL) { + CERROR("Can't allocate page %d of %d\n", i, npages); + kiblnd_free_pages(p); + return -ENOMEM; + } + } + + *pp = p; + return 0; +} + +void kiblnd_unmap_rx_descs(kib_conn_t *conn) +{ + kib_rx_t *rx; + int i; + + LASSERT(conn->ibc_rxs != NULL); + LASSERT(conn->ibc_hdev != NULL); + + for (i = 0; i < IBLND_RX_MSGS(conn->ibc_version); i++) { + rx = &conn->ibc_rxs[i]; + + LASSERT(rx->rx_nob >= 0); /* not posted */ + + kiblnd_dma_unmap_single(conn->ibc_hdev->ibh_ibdev, + KIBLND_UNMAP_ADDR(rx, rx_msgunmap, + rx->rx_msgaddr), + IBLND_MSG_SIZE, DMA_FROM_DEVICE); + } + + kiblnd_free_pages(conn->ibc_rx_pages); + + conn->ibc_rx_pages = NULL; +} + +void kiblnd_map_rx_descs(kib_conn_t *conn) +{ + kib_rx_t *rx; + struct page *pg; + int pg_off; + int ipg; + int i; + + for (pg_off = ipg = i = 0; + i < IBLND_RX_MSGS(conn->ibc_version); i++) { + pg = conn->ibc_rx_pages->ibp_pages[ipg]; + rx = &conn->ibc_rxs[i]; + + rx->rx_conn = conn; + rx->rx_msg = (kib_msg_t *)(((char *)page_address(pg)) + pg_off); + + rx->rx_msgaddr = kiblnd_dma_map_single(conn->ibc_hdev->ibh_ibdev, + rx->rx_msg, + IBLND_MSG_SIZE, + DMA_FROM_DEVICE); + LASSERT(!kiblnd_dma_mapping_error(conn->ibc_hdev->ibh_ibdev, + rx->rx_msgaddr)); + KIBLND_UNMAP_ADDR_SET(rx, rx_msgunmap, rx->rx_msgaddr); + + CDEBUG(D_NET, "rx %d: %p %#llx(%#llx)\n", + i, rx->rx_msg, rx->rx_msgaddr, + lnet_page2phys(pg) + pg_off); + + pg_off += IBLND_MSG_SIZE; + LASSERT(pg_off <= PAGE_SIZE); + + if (pg_off == PAGE_SIZE) { + pg_off = 0; + ipg++; + LASSERT(ipg <= IBLND_RX_MSG_PAGES(conn->ibc_version)); + } + } +} + +static void kiblnd_unmap_tx_pool(kib_tx_pool_t *tpo) +{ + kib_hca_dev_t *hdev = tpo->tpo_hdev; + kib_tx_t *tx; + int i; + + LASSERT(tpo->tpo_pool.po_allocated == 0); + + if (hdev == NULL) + return; + + for (i = 0; i < tpo->tpo_pool.po_size; i++) { + tx = &tpo->tpo_tx_descs[i]; + kiblnd_dma_unmap_single(hdev->ibh_ibdev, + KIBLND_UNMAP_ADDR(tx, tx_msgunmap, + tx->tx_msgaddr), + IBLND_MSG_SIZE, DMA_TO_DEVICE); + } + + kiblnd_hdev_decref(hdev); + tpo->tpo_hdev = NULL; +} + +static kib_hca_dev_t *kiblnd_current_hdev(kib_dev_t *dev) +{ + kib_hca_dev_t *hdev; + unsigned long flags; + int i = 0; + + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + while (dev->ibd_failover) { + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + if (i++ % 50 == 0) + CDEBUG(D_NET, "%s: Wait for failover\n", + dev->ibd_ifname); + schedule_timeout(cfs_time_seconds(1) / 100); + + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + } + + kiblnd_hdev_addref_locked(dev->ibd_hdev); + hdev = dev->ibd_hdev; + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + return hdev; +} + +static void kiblnd_map_tx_pool(kib_tx_pool_t *tpo) +{ 
+ kib_pages_t *txpgs = tpo->tpo_tx_pages; + kib_pool_t *pool = &tpo->tpo_pool; + kib_net_t *net = pool->po_owner->ps_net; + kib_dev_t *dev; + struct page *page; + kib_tx_t *tx; + int page_offset; + int ipage; + int i; + + LASSERT(net != NULL); + + dev = net->ibn_dev; + + /* pre-mapped messages are not bigger than 1 page */ + CLASSERT(IBLND_MSG_SIZE <= PAGE_SIZE); + + /* No fancy arithmetic when we do the buffer calculations */ + CLASSERT(PAGE_SIZE % IBLND_MSG_SIZE == 0); + + tpo->tpo_hdev = kiblnd_current_hdev(dev); + + for (ipage = page_offset = i = 0; i < pool->po_size; i++) { + page = txpgs->ibp_pages[ipage]; + tx = &tpo->tpo_tx_descs[i]; + + tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + + page_offset); + + tx->tx_msgaddr = kiblnd_dma_map_single( + tpo->tpo_hdev->ibh_ibdev, tx->tx_msg, + IBLND_MSG_SIZE, DMA_TO_DEVICE); + LASSERT(!kiblnd_dma_mapping_error(tpo->tpo_hdev->ibh_ibdev, + tx->tx_msgaddr)); + KIBLND_UNMAP_ADDR_SET(tx, tx_msgunmap, tx->tx_msgaddr); + + list_add(&tx->tx_list, &pool->po_free_list); + + page_offset += IBLND_MSG_SIZE; + LASSERT(page_offset <= PAGE_SIZE); + + if (page_offset == PAGE_SIZE) { + page_offset = 0; + ipage++; + LASSERT(ipage <= txpgs->ibp_npages); + } + } +} + +struct ib_mr *kiblnd_find_dma_mr(kib_hca_dev_t *hdev, __u64 addr, __u64 size) +{ + __u64 index; + + LASSERT(hdev->ibh_mrs[0] != NULL); + + if (hdev->ibh_nmrs == 1) + return hdev->ibh_mrs[0]; + + index = addr >> hdev->ibh_mr_shift; + + if (index < hdev->ibh_nmrs && + index == ((addr + size - 1) >> hdev->ibh_mr_shift)) + return hdev->ibh_mrs[index]; + + return NULL; +} + +struct ib_mr *kiblnd_find_rd_dma_mr(kib_hca_dev_t *hdev, kib_rdma_desc_t *rd) +{ + struct ib_mr *prev_mr; + struct ib_mr *mr; + int i; + + LASSERT(hdev->ibh_mrs[0] != NULL); + + if (*kiblnd_tunables.kib_map_on_demand > 0 && + *kiblnd_tunables.kib_map_on_demand <= rd->rd_nfrags) + return NULL; + + if (hdev->ibh_nmrs == 1) + return hdev->ibh_mrs[0]; + + for (i = 0, mr = prev_mr = NULL; + i < rd->rd_nfrags; i++) { + mr = kiblnd_find_dma_mr(hdev, + rd->rd_frags[i].rf_addr, + rd->rd_frags[i].rf_nob); + if (prev_mr == NULL) + prev_mr = mr; + + if (mr == NULL || prev_mr != mr) { + /* Can't covered by one single MR */ + mr = NULL; + break; + } + } + + return mr; +} + +static void kiblnd_destroy_fmr_pool(kib_fmr_pool_t *pool) +{ + LASSERT(pool->fpo_map_count == 0); + + if (pool->fpo_fmr_pool != NULL) + ib_destroy_fmr_pool(pool->fpo_fmr_pool); + + if (pool->fpo_hdev != NULL) + kiblnd_hdev_decref(pool->fpo_hdev); + + LIBCFS_FREE(pool, sizeof(kib_fmr_pool_t)); +} + +static void kiblnd_destroy_fmr_pool_list(struct list_head *head) +{ + kib_fmr_pool_t *pool; + + while (!list_empty(head)) { + pool = list_entry(head->next, kib_fmr_pool_t, fpo_list); + list_del(&pool->fpo_list); + kiblnd_destroy_fmr_pool(pool); + } +} + +static int kiblnd_fmr_pool_size(int ncpts) +{ + int size = *kiblnd_tunables.kib_fmr_pool_size / ncpts; + + return max(IBLND_FMR_POOL, size); +} + +static int kiblnd_fmr_flush_trigger(int ncpts) +{ + int size = *kiblnd_tunables.kib_fmr_flush_trigger / ncpts; + + return max(IBLND_FMR_POOL_FLUSH, size); +} + +static int kiblnd_create_fmr_pool(kib_fmr_poolset_t *fps, + kib_fmr_pool_t **pp_fpo) +{ + /* FMR pool for RDMA */ + kib_dev_t *dev = fps->fps_net->ibn_dev; + kib_fmr_pool_t *fpo; + struct ib_fmr_pool_param param = { + .max_pages_per_fmr = LNET_MAX_PAYLOAD/PAGE_SIZE, + .page_shift = PAGE_SHIFT, + .access = (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE), + .pool_size = fps->fps_pool_size, + .dirty_watermark = 
fps->fps_flush_trigger, + .flush_function = NULL, + .flush_arg = NULL, + .cache = !!*kiblnd_tunables.kib_fmr_cache}; + int rc; + + LIBCFS_CPT_ALLOC(fpo, lnet_cpt_table(), fps->fps_cpt, sizeof(*fpo)); + if (fpo == NULL) + return -ENOMEM; + + fpo->fpo_hdev = kiblnd_current_hdev(dev); + + fpo->fpo_fmr_pool = ib_create_fmr_pool(fpo->fpo_hdev->ibh_pd, &param); + if (IS_ERR(fpo->fpo_fmr_pool)) { + rc = PTR_ERR(fpo->fpo_fmr_pool); + CERROR("Failed to create FMR pool: %d\n", rc); + + kiblnd_hdev_decref(fpo->fpo_hdev); + LIBCFS_FREE(fpo, sizeof(kib_fmr_pool_t)); + return rc; + } + + fpo->fpo_deadline = cfs_time_shift(IBLND_POOL_DEADLINE); + fpo->fpo_owner = fps; + *pp_fpo = fpo; + + return 0; +} + +static void kiblnd_fail_fmr_poolset(kib_fmr_poolset_t *fps, + struct list_head *zombies) +{ + if (fps->fps_net == NULL) /* intialized? */ + return; + + spin_lock(&fps->fps_lock); + + while (!list_empty(&fps->fps_pool_list)) { + kib_fmr_pool_t *fpo = list_entry(fps->fps_pool_list.next, + kib_fmr_pool_t, fpo_list); + fpo->fpo_failed = 1; + list_del(&fpo->fpo_list); + if (fpo->fpo_map_count == 0) + list_add(&fpo->fpo_list, zombies); + else + list_add(&fpo->fpo_list, &fps->fps_failed_pool_list); + } + + spin_unlock(&fps->fps_lock); +} + +static void kiblnd_fini_fmr_poolset(kib_fmr_poolset_t *fps) +{ + if (fps->fps_net != NULL) { /* initialized? */ + kiblnd_destroy_fmr_pool_list(&fps->fps_failed_pool_list); + kiblnd_destroy_fmr_pool_list(&fps->fps_pool_list); + } +} + +static int kiblnd_init_fmr_poolset(kib_fmr_poolset_t *fps, int cpt, + kib_net_t *net, int pool_size, + int flush_trigger) +{ + kib_fmr_pool_t *fpo; + int rc; + + memset(fps, 0, sizeof(kib_fmr_poolset_t)); + + fps->fps_net = net; + fps->fps_cpt = cpt; + fps->fps_pool_size = pool_size; + fps->fps_flush_trigger = flush_trigger; + spin_lock_init(&fps->fps_lock); + INIT_LIST_HEAD(&fps->fps_pool_list); + INIT_LIST_HEAD(&fps->fps_failed_pool_list); + + rc = kiblnd_create_fmr_pool(fps, &fpo); + if (rc == 0) + list_add_tail(&fpo->fpo_list, &fps->fps_pool_list); + + return rc; +} + +static int kiblnd_fmr_pool_is_idle(kib_fmr_pool_t *fpo, unsigned long now) +{ + if (fpo->fpo_map_count != 0) /* still in use */ + return 0; + if (fpo->fpo_failed) + return 1; + return cfs_time_aftereq(now, fpo->fpo_deadline); +} + +void kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status) +{ + LIST_HEAD(zombies); + kib_fmr_pool_t *fpo = fmr->fmr_pool; + kib_fmr_poolset_t *fps = fpo->fpo_owner; + unsigned long now = cfs_time_current(); + kib_fmr_pool_t *tmp; + int rc; + + rc = ib_fmr_pool_unmap(fmr->fmr_pfmr); + LASSERT(rc == 0); + + if (status != 0) { + rc = ib_flush_fmr_pool(fpo->fpo_fmr_pool); + LASSERT(rc == 0); + } + + fmr->fmr_pool = NULL; + fmr->fmr_pfmr = NULL; + + spin_lock(&fps->fps_lock); + fpo->fpo_map_count--; /* decref the pool */ + + list_for_each_entry_safe(fpo, tmp, &fps->fps_pool_list, fpo_list) { + /* the first pool is persistent */ + if (fps->fps_pool_list.next == &fpo->fpo_list) + continue; + + if (kiblnd_fmr_pool_is_idle(fpo, now)) { + list_move(&fpo->fpo_list, &zombies); + fps->fps_version++; + } + } + spin_unlock(&fps->fps_lock); + + if (!list_empty(&zombies)) + kiblnd_destroy_fmr_pool_list(&zombies); +} + +int kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, __u64 *pages, int npages, + __u64 iov, kib_fmr_t *fmr) +{ + struct ib_pool_fmr *pfmr; + kib_fmr_pool_t *fpo; + __u64 version; + int rc; + + again: + spin_lock(&fps->fps_lock); + version = fps->fps_version; + list_for_each_entry(fpo, &fps->fps_pool_list, fpo_list) { + fpo->fpo_deadline =
cfs_time_shift(IBLND_POOL_DEADLINE); + fpo->fpo_map_count++; + spin_unlock(&fps->fps_lock); + + pfmr = ib_fmr_pool_map_phys(fpo->fpo_fmr_pool, + pages, npages, iov); + if (likely(!IS_ERR(pfmr))) { + fmr->fmr_pool = fpo; + fmr->fmr_pfmr = pfmr; + return 0; + } + + spin_lock(&fps->fps_lock); + fpo->fpo_map_count--; + if (PTR_ERR(pfmr) != -EAGAIN) { + spin_unlock(&fps->fps_lock); + return PTR_ERR(pfmr); + } + + /* EAGAIN and ... */ + if (version != fps->fps_version) { + spin_unlock(&fps->fps_lock); + goto again; + } + } + + if (fps->fps_increasing) { + spin_unlock(&fps->fps_lock); + CDEBUG(D_NET, + "Another thread is allocating new FMR pool, waiting for her to complete\n"); + schedule(); + goto again; + + } + + if (time_before(cfs_time_current(), fps->fps_next_retry)) { + /* someone failed recently */ + spin_unlock(&fps->fps_lock); + return -EAGAIN; + } + + fps->fps_increasing = 1; + spin_unlock(&fps->fps_lock); + + CDEBUG(D_NET, "Allocate new FMR pool\n"); + rc = kiblnd_create_fmr_pool(fps, &fpo); + spin_lock(&fps->fps_lock); + fps->fps_increasing = 0; + if (rc == 0) { + fps->fps_version++; + list_add_tail(&fpo->fpo_list, &fps->fps_pool_list); + } else { + fps->fps_next_retry = cfs_time_shift(IBLND_POOL_RETRY); + } + spin_unlock(&fps->fps_lock); + + goto again; +} + +static void kiblnd_fini_pool(kib_pool_t *pool) +{ + LASSERT(list_empty(&pool->po_free_list)); + LASSERT(pool->po_allocated == 0); + + CDEBUG(D_NET, "Finalize %s pool\n", pool->po_owner->ps_name); +} + +static void kiblnd_init_pool(kib_poolset_t *ps, kib_pool_t *pool, int size) +{ + CDEBUG(D_NET, "Initialize %s pool\n", ps->ps_name); + + memset(pool, 0, sizeof(kib_pool_t)); + INIT_LIST_HEAD(&pool->po_free_list); + pool->po_deadline = cfs_time_shift(IBLND_POOL_DEADLINE); + pool->po_owner = ps; + pool->po_size = size; +} + +static void kiblnd_destroy_pool_list(struct list_head *head) +{ + kib_pool_t *pool; + + while (!list_empty(head)) { + pool = list_entry(head->next, kib_pool_t, po_list); + list_del(&pool->po_list); + + LASSERT(pool->po_owner != NULL); + pool->po_owner->ps_pool_destroy(pool); + } +} + +static void kiblnd_fail_poolset(kib_poolset_t *ps, struct list_head *zombies) +{ + if (ps->ps_net == NULL) /* intialized? */ + return; + + spin_lock(&ps->ps_lock); + while (!list_empty(&ps->ps_pool_list)) { + kib_pool_t *po = list_entry(ps->ps_pool_list.next, + kib_pool_t, po_list); + po->po_failed = 1; + list_del(&po->po_list); + if (po->po_allocated == 0) + list_add(&po->po_list, zombies); + else + list_add(&po->po_list, &ps->ps_failed_pool_list); + } + spin_unlock(&ps->ps_lock); +} + +static void kiblnd_fini_poolset(kib_poolset_t *ps) +{ + if (ps->ps_net != NULL) { /* initialized? 
*/ + kiblnd_destroy_pool_list(&ps->ps_failed_pool_list); + kiblnd_destroy_pool_list(&ps->ps_pool_list); + } +} + +static int kiblnd_init_poolset(kib_poolset_t *ps, int cpt, + kib_net_t *net, char *name, int size, + kib_ps_pool_create_t po_create, + kib_ps_pool_destroy_t po_destroy, + kib_ps_node_init_t nd_init, + kib_ps_node_fini_t nd_fini) +{ + kib_pool_t *pool; + int rc; + + memset(ps, 0, sizeof(kib_poolset_t)); + + ps->ps_cpt = cpt; + ps->ps_net = net; + ps->ps_pool_create = po_create; + ps->ps_pool_destroy = po_destroy; + ps->ps_node_init = nd_init; + ps->ps_node_fini = nd_fini; + ps->ps_pool_size = size; + if (strlcpy(ps->ps_name, name, sizeof(ps->ps_name)) + >= sizeof(ps->ps_name)) + return -E2BIG; + spin_lock_init(&ps->ps_lock); + INIT_LIST_HEAD(&ps->ps_pool_list); + INIT_LIST_HEAD(&ps->ps_failed_pool_list); + + rc = ps->ps_pool_create(ps, size, &pool); + if (rc == 0) + list_add(&pool->po_list, &ps->ps_pool_list); + else + CERROR("Failed to create the first pool for %s\n", ps->ps_name); + + return rc; +} + +static int kiblnd_pool_is_idle(kib_pool_t *pool, unsigned long now) +{ + if (pool->po_allocated != 0) /* still in use */ + return 0; + if (pool->po_failed) + return 1; + return cfs_time_aftereq(now, pool->po_deadline); +} + +void kiblnd_pool_free_node(kib_pool_t *pool, struct list_head *node) +{ + LIST_HEAD(zombies); + kib_poolset_t *ps = pool->po_owner; + kib_pool_t *tmp; + unsigned long now = cfs_time_current(); + + spin_lock(&ps->ps_lock); + + if (ps->ps_node_fini != NULL) + ps->ps_node_fini(pool, node); + + LASSERT(pool->po_allocated > 0); + list_add(node, &pool->po_free_list); + pool->po_allocated--; + + list_for_each_entry_safe(pool, tmp, &ps->ps_pool_list, po_list) { + /* the first pool is persistent */ + if (ps->ps_pool_list.next == &pool->po_list) + continue; + + if (kiblnd_pool_is_idle(pool, now)) + list_move(&pool->po_list, &zombies); + } + spin_unlock(&ps->ps_lock); + + if (!list_empty(&zombies)) + kiblnd_destroy_pool_list(&zombies); +} + +struct list_head *kiblnd_pool_alloc_node(kib_poolset_t *ps) +{ + struct list_head *node; + kib_pool_t *pool; + int rc; + + again: + spin_lock(&ps->ps_lock); + list_for_each_entry(pool, &ps->ps_pool_list, po_list) { + if (list_empty(&pool->po_free_list)) + continue; + + pool->po_allocated++; + pool->po_deadline = cfs_time_shift(IBLND_POOL_DEADLINE); + node = pool->po_free_list.next; + list_del(node); + + if (ps->ps_node_init != NULL) { + /* still hold the lock */ + ps->ps_node_init(pool, node); + } + spin_unlock(&ps->ps_lock); + return node; + } + + /* no available tx pool and ... 
*/ + if (ps->ps_increasing) { + /* another thread is allocating a new pool */ + spin_unlock(&ps->ps_lock); + CDEBUG(D_NET, "Another thread is allocating new %s pool, waiting for her to complete\n", + ps->ps_name); + schedule(); + goto again; + } + + if (time_before(cfs_time_current(), ps->ps_next_retry)) { + /* someone failed recently */ + spin_unlock(&ps->ps_lock); + return NULL; + } + + ps->ps_increasing = 1; + spin_unlock(&ps->ps_lock); + + CDEBUG(D_NET, "%s pool exhausted, allocate new pool\n", ps->ps_name); + + rc = ps->ps_pool_create(ps, ps->ps_pool_size, &pool); + + spin_lock(&ps->ps_lock); + ps->ps_increasing = 0; + if (rc == 0) { + list_add_tail(&pool->po_list, &ps->ps_pool_list); + } else { + ps->ps_next_retry = cfs_time_shift(IBLND_POOL_RETRY); + CERROR("Can't allocate new %s pool because out of memory\n", + ps->ps_name); + } + spin_unlock(&ps->ps_lock); + + goto again; +} + +void kiblnd_pmr_pool_unmap(kib_phys_mr_t *pmr) +{ + kib_pmr_pool_t *ppo = pmr->pmr_pool; + struct ib_mr *mr = pmr->pmr_mr; + + pmr->pmr_mr = NULL; + kiblnd_pool_free_node(&ppo->ppo_pool, &pmr->pmr_list); + if (mr != NULL) + ib_dereg_mr(mr); +} + +int kiblnd_pmr_pool_map(kib_pmr_poolset_t *pps, kib_hca_dev_t *hdev, + kib_rdma_desc_t *rd, __u64 *iova, kib_phys_mr_t **pp_pmr) +{ + kib_phys_mr_t *pmr; + struct list_head *node; + int rc; + int i; + + node = kiblnd_pool_alloc_node(&pps->pps_poolset); + if (node == NULL) { + CERROR("Failed to allocate PMR descriptor\n"); + return -ENOMEM; + } + + pmr = container_of(node, kib_phys_mr_t, pmr_list); + if (pmr->pmr_pool->ppo_hdev != hdev) { + kiblnd_pool_free_node(&pmr->pmr_pool->ppo_pool, node); + return -EAGAIN; + } + + for (i = 0; i < rd->rd_nfrags; i++) { + pmr->pmr_ipb[i].addr = rd->rd_frags[i].rf_addr; + pmr->pmr_ipb[i].size = rd->rd_frags[i].rf_nob; + } + + pmr->pmr_mr = ib_reg_phys_mr(hdev->ibh_pd, + pmr->pmr_ipb, rd->rd_nfrags, + IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE, + iova); + if (!IS_ERR(pmr->pmr_mr)) { + pmr->pmr_iova = *iova; + *pp_pmr = pmr; + return 0; + } + + rc = PTR_ERR(pmr->pmr_mr); + CERROR("Failed ib_reg_phys_mr: %d\n", rc); + + pmr->pmr_mr = NULL; + kiblnd_pool_free_node(&pmr->pmr_pool->ppo_pool, node); + + return rc; +} + +static void kiblnd_destroy_pmr_pool(kib_pool_t *pool) +{ + kib_pmr_pool_t *ppo = container_of(pool, kib_pmr_pool_t, ppo_pool); + kib_phys_mr_t *pmr; + kib_phys_mr_t *tmp; + + LASSERT(pool->po_allocated == 0); + + list_for_each_entry_safe(pmr, tmp, &pool->po_free_list, pmr_list) { + LASSERT(pmr->pmr_mr == NULL); + list_del(&pmr->pmr_list); + + if (pmr->pmr_ipb != NULL) { + LIBCFS_FREE(pmr->pmr_ipb, + IBLND_MAX_RDMA_FRAGS * + sizeof(struct ib_phys_buf)); + } + + LIBCFS_FREE(pmr, sizeof(kib_phys_mr_t)); + } + + kiblnd_fini_pool(pool); + if (ppo->ppo_hdev != NULL) + kiblnd_hdev_decref(ppo->ppo_hdev); + + LIBCFS_FREE(ppo, sizeof(kib_pmr_pool_t)); +} + +static inline int kiblnd_pmr_pool_size(int ncpts) +{ + int size = *kiblnd_tunables.kib_pmr_pool_size / ncpts; + + return max(IBLND_PMR_POOL, size); +} + +static int kiblnd_create_pmr_pool(kib_poolset_t *ps, int size, + kib_pool_t **pp_po) +{ + struct kib_pmr_pool *ppo; + struct kib_pool *pool; + kib_phys_mr_t *pmr; + int i; + + LIBCFS_CPT_ALLOC(ppo, lnet_cpt_table(), + ps->ps_cpt, sizeof(kib_pmr_pool_t)); + if (ppo == NULL) { + CERROR("Failed to allocate PMR pool\n"); + return -ENOMEM; + } + + pool = &ppo->ppo_pool; + kiblnd_init_pool(ps, pool, size); + + for (i = 0; i < size; i++) { + LIBCFS_CPT_ALLOC(pmr, lnet_cpt_table(), + ps->ps_cpt, sizeof(kib_phys_mr_t)); + if (pmr 
== NULL) + break; + + pmr->pmr_pool = ppo; + LIBCFS_CPT_ALLOC(pmr->pmr_ipb, lnet_cpt_table(), ps->ps_cpt, + IBLND_MAX_RDMA_FRAGS * sizeof(*pmr->pmr_ipb)); + if (pmr->pmr_ipb == NULL) + break; + + list_add(&pmr->pmr_list, &pool->po_free_list); + } + + if (i < size) { + ps->ps_pool_destroy(pool); + return -ENOMEM; + } + + ppo->ppo_hdev = kiblnd_current_hdev(ps->ps_net->ibn_dev); + *pp_po = pool; + return 0; +} + +static void kiblnd_destroy_tx_pool(kib_pool_t *pool) +{ + kib_tx_pool_t *tpo = container_of(pool, kib_tx_pool_t, tpo_pool); + int i; + + LASSERT(pool->po_allocated == 0); + + if (tpo->tpo_tx_pages != NULL) { + kiblnd_unmap_tx_pool(tpo); + kiblnd_free_pages(tpo->tpo_tx_pages); + } + + if (tpo->tpo_tx_descs == NULL) + goto out; + + for (i = 0; i < pool->po_size; i++) { + kib_tx_t *tx = &tpo->tpo_tx_descs[i]; + + list_del(&tx->tx_list); + if (tx->tx_pages != NULL) + LIBCFS_FREE(tx->tx_pages, + LNET_MAX_IOV * + sizeof(*tx->tx_pages)); + if (tx->tx_frags != NULL) + LIBCFS_FREE(tx->tx_frags, + IBLND_MAX_RDMA_FRAGS * + sizeof(*tx->tx_frags)); + if (tx->tx_wrq != NULL) + LIBCFS_FREE(tx->tx_wrq, + (1 + IBLND_MAX_RDMA_FRAGS) * + sizeof(*tx->tx_wrq)); + if (tx->tx_sge != NULL) + LIBCFS_FREE(tx->tx_sge, + (1 + IBLND_MAX_RDMA_FRAGS) * + sizeof(*tx->tx_sge)); + if (tx->tx_rd != NULL) + LIBCFS_FREE(tx->tx_rd, + offsetof(kib_rdma_desc_t, + rd_frags[IBLND_MAX_RDMA_FRAGS])); + } + + LIBCFS_FREE(tpo->tpo_tx_descs, + pool->po_size * sizeof(kib_tx_t)); +out: + kiblnd_fini_pool(pool); + LIBCFS_FREE(tpo, sizeof(kib_tx_pool_t)); +} + +static int kiblnd_tx_pool_size(int ncpts) +{ + int ntx = *kiblnd_tunables.kib_ntx / ncpts; + + return max(IBLND_TX_POOL, ntx); +} + +static int kiblnd_create_tx_pool(kib_poolset_t *ps, int size, + kib_pool_t **pp_po) +{ + int i; + int npg; + kib_pool_t *pool; + kib_tx_pool_t *tpo; + + LIBCFS_CPT_ALLOC(tpo, lnet_cpt_table(), ps->ps_cpt, sizeof(*tpo)); + if (tpo == NULL) { + CERROR("Failed to allocate TX pool\n"); + return -ENOMEM; + } + + pool = &tpo->tpo_pool; + kiblnd_init_pool(ps, pool, size); + tpo->tpo_tx_descs = NULL; + tpo->tpo_tx_pages = NULL; + + npg = (size * IBLND_MSG_SIZE + PAGE_SIZE - 1) / PAGE_SIZE; + if (kiblnd_alloc_pages(&tpo->tpo_tx_pages, ps->ps_cpt, npg) != 0) { + CERROR("Can't allocate tx pages: %d\n", npg); + LIBCFS_FREE(tpo, sizeof(kib_tx_pool_t)); + return -ENOMEM; + } + + LIBCFS_CPT_ALLOC(tpo->tpo_tx_descs, lnet_cpt_table(), ps->ps_cpt, + size * sizeof(kib_tx_t)); + if (tpo->tpo_tx_descs == NULL) { + CERROR("Can't allocate %d tx descriptors\n", size); + ps->ps_pool_destroy(pool); + return -ENOMEM; + } + + memset(tpo->tpo_tx_descs, 0, size * sizeof(kib_tx_t)); + + for (i = 0; i < size; i++) { + kib_tx_t *tx = &tpo->tpo_tx_descs[i]; + + tx->tx_pool = tpo; + if (ps->ps_net->ibn_fmr_ps != NULL) { + LIBCFS_CPT_ALLOC(tx->tx_pages, + lnet_cpt_table(), ps->ps_cpt, + LNET_MAX_IOV * sizeof(*tx->tx_pages)); + if (tx->tx_pages == NULL) + break; + } + + LIBCFS_CPT_ALLOC(tx->tx_frags, lnet_cpt_table(), ps->ps_cpt, + IBLND_MAX_RDMA_FRAGS * sizeof(*tx->tx_frags)); + if (tx->tx_frags == NULL) + break; + + sg_init_table(tx->tx_frags, IBLND_MAX_RDMA_FRAGS); + + LIBCFS_CPT_ALLOC(tx->tx_wrq, lnet_cpt_table(), ps->ps_cpt, + (1 + IBLND_MAX_RDMA_FRAGS) * + sizeof(*tx->tx_wrq)); + if (tx->tx_wrq == NULL) + break; + + LIBCFS_CPT_ALLOC(tx->tx_sge, lnet_cpt_table(), ps->ps_cpt, + (1 + IBLND_MAX_RDMA_FRAGS) * + sizeof(*tx->tx_sge)); + if (tx->tx_sge == NULL) + break; + + LIBCFS_CPT_ALLOC(tx->tx_rd, lnet_cpt_table(), ps->ps_cpt, + offsetof(kib_rdma_desc_t, + 
rd_frags[IBLND_MAX_RDMA_FRAGS])); + if (tx->tx_rd == NULL) + break; + } + + if (i == size) { + kiblnd_map_tx_pool(tpo); + *pp_po = pool; + return 0; + } + + ps->ps_pool_destroy(pool); + return -ENOMEM; +} + +static void kiblnd_tx_init(kib_pool_t *pool, struct list_head *node) +{ + kib_tx_poolset_t *tps = container_of(pool->po_owner, kib_tx_poolset_t, + tps_poolset); + kib_tx_t *tx = list_entry(node, kib_tx_t, tx_list); + + tx->tx_cookie = tps->tps_next_tx_cookie++; +} + +static void kiblnd_net_fini_pools(kib_net_t *net) +{ + int i; + + cfs_cpt_for_each(i, lnet_cpt_table()) { + kib_tx_poolset_t *tps; + kib_fmr_poolset_t *fps; + kib_pmr_poolset_t *pps; + + if (net->ibn_tx_ps != NULL) { + tps = net->ibn_tx_ps[i]; + kiblnd_fini_poolset(&tps->tps_poolset); + } + + if (net->ibn_fmr_ps != NULL) { + fps = net->ibn_fmr_ps[i]; + kiblnd_fini_fmr_poolset(fps); + } + + if (net->ibn_pmr_ps != NULL) { + pps = net->ibn_pmr_ps[i]; + kiblnd_fini_poolset(&pps->pps_poolset); + } + } + + if (net->ibn_tx_ps != NULL) { + cfs_percpt_free(net->ibn_tx_ps); + net->ibn_tx_ps = NULL; + } + + if (net->ibn_fmr_ps != NULL) { + cfs_percpt_free(net->ibn_fmr_ps); + net->ibn_fmr_ps = NULL; + } + + if (net->ibn_pmr_ps != NULL) { + cfs_percpt_free(net->ibn_pmr_ps); + net->ibn_pmr_ps = NULL; + } +} + +static int kiblnd_net_init_pools(kib_net_t *net, __u32 *cpts, int ncpts) +{ + unsigned long flags; + int cpt; + int rc; + int i; + + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + if (*kiblnd_tunables.kib_map_on_demand == 0 && + net->ibn_dev->ibd_hdev->ibh_nmrs == 1) { + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, + flags); + goto create_tx_pool; + } + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + if (*kiblnd_tunables.kib_fmr_pool_size < + *kiblnd_tunables.kib_ntx / 4) { + CERROR("Can't set fmr pool size (%d) < ntx / 4(%d)\n", + *kiblnd_tunables.kib_fmr_pool_size, + *kiblnd_tunables.kib_ntx / 4); + rc = -EINVAL; + goto failed; + } + + /* TX pool must be created later than FMR/PMR, see LU-2268 + * for details */ + LASSERT(net->ibn_tx_ps == NULL); + + /* premapping can fail if ibd_nmr > 1, so we always create + * FMR/PMR pool and map-on-demand if premapping failed */ + + net->ibn_fmr_ps = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(kib_fmr_poolset_t)); + if (net->ibn_fmr_ps == NULL) { + CERROR("Failed to allocate FMR pool array\n"); + rc = -ENOMEM; + goto failed; + } + + for (i = 0; i < ncpts; i++) { + cpt = (cpts == NULL) ? i : cpts[i]; + rc = kiblnd_init_fmr_poolset(net->ibn_fmr_ps[cpt], cpt, net, + kiblnd_fmr_pool_size(ncpts), + kiblnd_fmr_flush_trigger(ncpts)); + if (rc == -ENOSYS && i == 0) /* no FMR */ + break; /* create PMR pool */ + + if (rc != 0) { /* a real error */ + CERROR("Can't initialize FMR pool for CPT %d: %d\n", + cpt, rc); + goto failed; + } + } + + if (i > 0) { + LASSERT(i == ncpts); + goto create_tx_pool; + } + + cfs_percpt_free(net->ibn_fmr_ps); + net->ibn_fmr_ps = NULL; + + CWARN("Device does not support FMR, failing back to PMR\n"); + + if (*kiblnd_tunables.kib_pmr_pool_size < + *kiblnd_tunables.kib_ntx / 4) { + CERROR("Can't set pmr pool size (%d) < ntx / 4(%d)\n", + *kiblnd_tunables.kib_pmr_pool_size, + *kiblnd_tunables.kib_ntx / 4); + rc = -EINVAL; + goto failed; + } + + net->ibn_pmr_ps = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(kib_pmr_poolset_t)); + if (net->ibn_pmr_ps == NULL) { + CERROR("Failed to allocate PMR pool array\n"); + rc = -ENOMEM; + goto failed; + } + + for (i = 0; i < ncpts; i++) { + cpt = (cpts == NULL) ? 
i : cpts[i]; + rc = kiblnd_init_poolset(&net->ibn_pmr_ps[cpt]->pps_poolset, + cpt, net, "PMR", + kiblnd_pmr_pool_size(ncpts), + kiblnd_create_pmr_pool, + kiblnd_destroy_pmr_pool, NULL, NULL); + if (rc != 0) { + CERROR("Can't initialize PMR pool for CPT %d: %d\n", + cpt, rc); + goto failed; + } + } + + create_tx_pool: + net->ibn_tx_ps = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(kib_tx_poolset_t)); + if (net->ibn_tx_ps == NULL) { + CERROR("Failed to allocate tx pool array\n"); + rc = -ENOMEM; + goto failed; + } + + for (i = 0; i < ncpts; i++) { + cpt = (cpts == NULL) ? i : cpts[i]; + rc = kiblnd_init_poolset(&net->ibn_tx_ps[cpt]->tps_poolset, + cpt, net, "TX", + kiblnd_tx_pool_size(ncpts), + kiblnd_create_tx_pool, + kiblnd_destroy_tx_pool, + kiblnd_tx_init, NULL); + if (rc != 0) { + CERROR("Can't initialize TX pool for CPT %d: %d\n", + cpt, rc); + goto failed; + } + } + + return 0; + failed: + kiblnd_net_fini_pools(net); + LASSERT(rc != 0); + return rc; +} + +static int kiblnd_hdev_get_attr(kib_hca_dev_t *hdev) +{ + struct ib_device_attr *attr; + int rc; + + /* It's safe to assume a HCA can handle a page size + * matching that of the native system */ + hdev->ibh_page_shift = PAGE_SHIFT; + hdev->ibh_page_size = 1 << PAGE_SHIFT; + hdev->ibh_page_mask = ~((__u64)hdev->ibh_page_size - 1); + + LIBCFS_ALLOC(attr, sizeof(*attr)); + if (attr == NULL) { + CERROR("Out of memory\n"); + return -ENOMEM; + } + + rc = ib_query_device(hdev->ibh_ibdev, attr); + if (rc == 0) + hdev->ibh_mr_size = attr->max_mr_size; + + LIBCFS_FREE(attr, sizeof(*attr)); + + if (rc != 0) { + CERROR("Failed to query IB device: %d\n", rc); + return rc; + } + + if (hdev->ibh_mr_size == ~0ULL) { + hdev->ibh_mr_shift = 64; + return 0; + } + + for (hdev->ibh_mr_shift = 0; + hdev->ibh_mr_shift < 64; hdev->ibh_mr_shift++) { + if (hdev->ibh_mr_size == (1ULL << hdev->ibh_mr_shift) || + hdev->ibh_mr_size == (1ULL << hdev->ibh_mr_shift) - 1) + return 0; + } + + CERROR("Invalid mr size: %#llx\n", hdev->ibh_mr_size); + return -EINVAL; +} + +static void kiblnd_hdev_cleanup_mrs(kib_hca_dev_t *hdev) +{ + int i; + + if (hdev->ibh_nmrs == 0 || hdev->ibh_mrs == NULL) + return; + + for (i = 0; i < hdev->ibh_nmrs; i++) { + if (hdev->ibh_mrs[i] == NULL) + break; + + ib_dereg_mr(hdev->ibh_mrs[i]); + } + + LIBCFS_FREE(hdev->ibh_mrs, sizeof(*hdev->ibh_mrs) * hdev->ibh_nmrs); + hdev->ibh_mrs = NULL; + hdev->ibh_nmrs = 0; +} + +void kiblnd_hdev_destroy(kib_hca_dev_t *hdev) +{ + kiblnd_hdev_cleanup_mrs(hdev); + + if (hdev->ibh_pd != NULL) + ib_dealloc_pd(hdev->ibh_pd); + + if (hdev->ibh_cmid != NULL) + rdma_destroy_id(hdev->ibh_cmid); + + LIBCFS_FREE(hdev, sizeof(*hdev)); +} + +static int kiblnd_hdev_setup_mrs(kib_hca_dev_t *hdev) +{ + struct ib_mr *mr; + int i; + int rc; + __u64 mm_size; + __u64 mr_size; + int acflags = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE; + + rc = kiblnd_hdev_get_attr(hdev); + if (rc != 0) + return rc; + + if (hdev->ibh_mr_shift == 64) { + LIBCFS_ALLOC(hdev->ibh_mrs, 1 * sizeof(*hdev->ibh_mrs)); + if (hdev->ibh_mrs == NULL) { + CERROR("Failed to allocate MRs table\n"); + return -ENOMEM; + } + + hdev->ibh_mrs[0] = NULL; + hdev->ibh_nmrs = 1; + + mr = ib_get_dma_mr(hdev->ibh_pd, acflags); + if (IS_ERR(mr)) { + CERROR("Failed ib_get_dma_mr : %ld\n", PTR_ERR(mr)); + kiblnd_hdev_cleanup_mrs(hdev); + return PTR_ERR(mr); + } + + hdev->ibh_mrs[0] = mr; + + goto out; + } + + mr_size = 1ULL << hdev->ibh_mr_shift; + mm_size = (unsigned long)high_memory - PAGE_OFFSET; + + hdev->ibh_nmrs = (int)((mm_size + mr_size - 1) >> 
hdev->ibh_mr_shift); + + if (hdev->ibh_mr_shift < 32 || hdev->ibh_nmrs > 1024) { + /* it's 4T..., assume we will re-code at that time */ + CERROR("Can't support memory size: x%#llx with MR size: x%#llx\n", + mm_size, mr_size); + return -EINVAL; + } + + /* create an array of MRs to cover all memory */ + LIBCFS_ALLOC(hdev->ibh_mrs, sizeof(*hdev->ibh_mrs) * hdev->ibh_nmrs); + if (hdev->ibh_mrs == NULL) { + CERROR("Failed to allocate MRs' table\n"); + return -ENOMEM; + } + + for (i = 0; i < hdev->ibh_nmrs; i++) { + struct ib_phys_buf ipb; + __u64 iova; + + ipb.size = hdev->ibh_mr_size; + ipb.addr = i * mr_size; + iova = ipb.addr; + + mr = ib_reg_phys_mr(hdev->ibh_pd, &ipb, 1, acflags, &iova); + if (IS_ERR(mr)) { + CERROR("Failed ib_reg_phys_mr addr %#llx size %#llx : %ld\n", + ipb.addr, ipb.size, PTR_ERR(mr)); + kiblnd_hdev_cleanup_mrs(hdev); + return PTR_ERR(mr); + } + + LASSERT(iova == ipb.addr); + + hdev->ibh_mrs[i] = mr; + } + +out: + if (hdev->ibh_mr_size != ~0ULL || hdev->ibh_nmrs != 1) + LCONSOLE_INFO("Register global MR array, MR size: %#llx, array size: %d\n", + hdev->ibh_mr_size, hdev->ibh_nmrs); + return 0; +} + +/* DUMMY */ +static int kiblnd_dummy_callback(struct rdma_cm_id *cmid, + struct rdma_cm_event *event) +{ + return 0; +} + +static int kiblnd_dev_need_failover(kib_dev_t *dev) +{ + struct rdma_cm_id *cmid; + struct sockaddr_in srcaddr; + struct sockaddr_in dstaddr; + int rc; + + if (dev->ibd_hdev == NULL || /* initializing */ + dev->ibd_hdev->ibh_cmid == NULL || /* listener is dead */ + *kiblnd_tunables.kib_dev_failover > 1) /* debugging */ + return 1; + + /* XXX: it's UGLY, but I don't have better way to find + * ib-bonding HCA failover because: + * + * a. no reliable CM event for HCA failover... + * b. no OFED API to get ib_device for current net_device... + * + * We have only two choices at this point: + * + * a. rdma_bind_addr(), it will conflict with listener cmid + * b. rdma_resolve_addr() to zero addr */ + cmid = kiblnd_rdma_create_id(kiblnd_dummy_callback, dev, RDMA_PS_TCP, + IB_QPT_RC); + if (IS_ERR(cmid)) { + rc = PTR_ERR(cmid); + CERROR("Failed to create cmid for failover: %d\n", rc); + return rc; + } + + memset(&srcaddr, 0, sizeof(srcaddr)); + srcaddr.sin_family = AF_INET; + srcaddr.sin_addr.s_addr = (__force u32)htonl(dev->ibd_ifip); + + memset(&dstaddr, 0, sizeof(dstaddr)); + dstaddr.sin_family = AF_INET; + rc = rdma_resolve_addr(cmid, (struct sockaddr *)&srcaddr, + (struct sockaddr *)&dstaddr, 1); + if (rc != 0 || cmid->device == NULL) { + CERROR("Failed to bind %s:%pI4h to device(%p): %d\n", + dev->ibd_ifname, &dev->ibd_ifip, + cmid->device, rc); + rdma_destroy_id(cmid); + return rc; + } + + if (dev->ibd_hdev->ibh_ibdev == cmid->device) { + /* don't need device failover */ + rdma_destroy_id(cmid); + return 0; + } + + return 1; +} + +int kiblnd_dev_failover(kib_dev_t *dev) +{ + LIST_HEAD(zombie_tpo); + LIST_HEAD(zombie_ppo); + LIST_HEAD(zombie_fpo); + struct rdma_cm_id *cmid = NULL; + kib_hca_dev_t *hdev = NULL; + kib_hca_dev_t *old; + struct ib_pd *pd; + kib_net_t *net; + struct sockaddr_in addr; + unsigned long flags; + int rc = 0; + int i; + + LASSERT(*kiblnd_tunables.kib_dev_failover > 1 || + dev->ibd_can_failover || + dev->ibd_hdev == NULL); + + rc = kiblnd_dev_need_failover(dev); + if (rc <= 0) + goto out; + + if (dev->ibd_hdev != NULL && + dev->ibd_hdev->ibh_cmid != NULL) { + /* XXX it's not good to close old listener at here, + * because we can fail to create new listener. 
+ * But we have to close it now, otherwise rdma_bind_addr + * will return EADDRINUSE... How crap! */ + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + cmid = dev->ibd_hdev->ibh_cmid; + /* make next schedule of kiblnd_dev_need_failover() + * return 1 for me */ + dev->ibd_hdev->ibh_cmid = NULL; + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + rdma_destroy_id(cmid); + } + + cmid = kiblnd_rdma_create_id(kiblnd_cm_callback, dev, RDMA_PS_TCP, + IB_QPT_RC); + if (IS_ERR(cmid)) { + rc = PTR_ERR(cmid); + CERROR("Failed to create cmid for failover: %d\n", rc); + goto out; + } + + memset(&addr, 0, sizeof(addr)); + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = (__force u32)htonl(dev->ibd_ifip); + addr.sin_port = htons(*kiblnd_tunables.kib_service); + + /* Bind to failover device or port */ + rc = rdma_bind_addr(cmid, (struct sockaddr *)&addr); + if (rc != 0 || cmid->device == NULL) { + CERROR("Failed to bind %s:%pI4h to device(%p): %d\n", + dev->ibd_ifname, &dev->ibd_ifip, + cmid->device, rc); + rdma_destroy_id(cmid); + goto out; + } + + LIBCFS_ALLOC(hdev, sizeof(*hdev)); + if (hdev == NULL) { + CERROR("Failed to allocate kib_hca_dev\n"); + rdma_destroy_id(cmid); + rc = -ENOMEM; + goto out; + } + + atomic_set(&hdev->ibh_ref, 1); + hdev->ibh_dev = dev; + hdev->ibh_cmid = cmid; + hdev->ibh_ibdev = cmid->device; + + pd = ib_alloc_pd(cmid->device); + if (IS_ERR(pd)) { + rc = PTR_ERR(pd); + CERROR("Can't allocate PD: %d\n", rc); + goto out; + } + + hdev->ibh_pd = pd; + + rc = rdma_listen(cmid, 0); + if (rc != 0) { + CERROR("Can't start new listener: %d\n", rc); + goto out; + } + + rc = kiblnd_hdev_setup_mrs(hdev); + if (rc != 0) { + CERROR("Can't setup device: %d\n", rc); + goto out; + } + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + old = dev->ibd_hdev; + dev->ibd_hdev = hdev; /* take over the refcount */ + hdev = old; + + list_for_each_entry(net, &dev->ibd_nets, ibn_list) { + cfs_cpt_for_each(i, lnet_cpt_table()) { + kiblnd_fail_poolset(&net->ibn_tx_ps[i]->tps_poolset, + &zombie_tpo); + + if (net->ibn_fmr_ps != NULL) { + kiblnd_fail_fmr_poolset(net->ibn_fmr_ps[i], + &zombie_fpo); + + } else if (net->ibn_pmr_ps != NULL) { + kiblnd_fail_poolset(&net->ibn_pmr_ps[i]-> + pps_poolset, &zombie_ppo); + } + } + } + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + out: + if (!list_empty(&zombie_tpo)) + kiblnd_destroy_pool_list(&zombie_tpo); + if (!list_empty(&zombie_ppo)) + kiblnd_destroy_pool_list(&zombie_ppo); + if (!list_empty(&zombie_fpo)) + kiblnd_destroy_fmr_pool_list(&zombie_fpo); + if (hdev != NULL) + kiblnd_hdev_decref(hdev); + + if (rc != 0) + dev->ibd_failed_failover++; + else + dev->ibd_failed_failover = 0; + + return rc; +} + +void kiblnd_destroy_dev(kib_dev_t *dev) +{ + LASSERT(dev->ibd_nnets == 0); + LASSERT(list_empty(&dev->ibd_nets)); + + list_del(&dev->ibd_fail_list); + list_del(&dev->ibd_list); + + if (dev->ibd_hdev != NULL) + kiblnd_hdev_decref(dev->ibd_hdev); + + LIBCFS_FREE(dev, sizeof(*dev)); +} + +static kib_dev_t *kiblnd_create_dev(char *ifname) +{ + struct net_device *netdev; + kib_dev_t *dev; + __u32 netmask; + __u32 ip; + int up; + int rc; + + rc = libcfs_ipif_query(ifname, &up, &ip, &netmask); + if (rc != 0) { + CERROR("Can't query IPoIB interface %s: %d\n", + ifname, rc); + return NULL; + } + + if (!up) { + CERROR("Can't query IPoIB interface %s: it's down\n", ifname); + return NULL; + } + + LIBCFS_ALLOC(dev, sizeof(*dev)); + if (dev == NULL) + return NULL; + + netdev = dev_get_by_name(&init_net, ifname); + if 
(netdev == NULL) { + dev->ibd_can_failover = 0; + } else { + dev->ibd_can_failover = !!(netdev->flags & IFF_MASTER); + dev_put(netdev); + } + + INIT_LIST_HEAD(&dev->ibd_nets); + INIT_LIST_HEAD(&dev->ibd_list); /* not yet in kib_devs */ + INIT_LIST_HEAD(&dev->ibd_fail_list); + dev->ibd_ifip = ip; + strcpy(&dev->ibd_ifname[0], ifname); + + /* initialize the device */ + rc = kiblnd_dev_failover(dev); + if (rc != 0) { + CERROR("Can't initialize device: %d\n", rc); + LIBCFS_FREE(dev, sizeof(*dev)); + return NULL; + } + + list_add_tail(&dev->ibd_list, + &kiblnd_data.kib_devs); + return dev; +} + +static void kiblnd_base_shutdown(void) +{ + struct kib_sched_info *sched; + int i; + + LASSERT(list_empty(&kiblnd_data.kib_devs)); + + CDEBUG(D_MALLOC, "before LND base cleanup: kmem %d\n", + atomic_read(&libcfs_kmemory)); + + switch (kiblnd_data.kib_init) { + default: + LBUG(); + + case IBLND_INIT_ALL: + case IBLND_INIT_DATA: + LASSERT(kiblnd_data.kib_peers != NULL); + for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) + LASSERT(list_empty(&kiblnd_data.kib_peers[i])); + LASSERT(list_empty(&kiblnd_data.kib_connd_zombies)); + LASSERT(list_empty(&kiblnd_data.kib_connd_conns)); + + /* flag threads to terminate; wake and wait for them to die */ + kiblnd_data.kib_shutdown = 1; + + /* NB: we really want to stop scheduler threads net by net + * instead of the whole module, this should be improved + * with dynamic configuration LNet */ + cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds) + wake_up_all(&sched->ibs_waitq); + + wake_up_all(&kiblnd_data.kib_connd_waitq); + wake_up_all(&kiblnd_data.kib_failover_waitq); + + i = 2; + while (atomic_read(&kiblnd_data.kib_nthreads) != 0) { + i++; + /* power of 2 ? */ + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, + "Waiting for %d threads to terminate\n", + atomic_read(&kiblnd_data.kib_nthreads)); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + } + + /* fall through */ + + case IBLND_INIT_NOTHING: + break; + } + + if (kiblnd_data.kib_peers != NULL) { + LIBCFS_FREE(kiblnd_data.kib_peers, + sizeof(struct list_head) * + kiblnd_data.kib_peer_hash_size); + } + + if (kiblnd_data.kib_scheds != NULL) + cfs_percpt_free(kiblnd_data.kib_scheds); + + CDEBUG(D_MALLOC, "after LND base cleanup: kmem %d\n", + atomic_read(&libcfs_kmemory)); + + kiblnd_data.kib_init = IBLND_INIT_NOTHING; + module_put(THIS_MODULE); +} + +void kiblnd_shutdown(lnet_ni_t *ni) +{ + kib_net_t *net = ni->ni_data; + rwlock_t *g_lock = &kiblnd_data.kib_global_lock; + int i; + unsigned long flags; + + LASSERT(kiblnd_data.kib_init == IBLND_INIT_ALL); + + if (net == NULL) + goto out; + + CDEBUG(D_MALLOC, "before LND net cleanup: kmem %d\n", + atomic_read(&libcfs_kmemory)); + + write_lock_irqsave(g_lock, flags); + net->ibn_shutdown = 1; + write_unlock_irqrestore(g_lock, flags); + + switch (net->ibn_init) { + default: + LBUG(); + + case IBLND_INIT_ALL: + /* nuke all existing peers within this net */ + kiblnd_del_peer(ni, LNET_NID_ANY); + + /* Wait for all peer state to clean up */ + i = 2; + while (atomic_read(&net->ibn_npeers) != 0) { + i++; + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* 2**n? 
*/ + "%s: waiting for %d peers to disconnect\n", + libcfs_nid2str(ni->ni_nid), + atomic_read(&net->ibn_npeers)); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + } + + kiblnd_net_fini_pools(net); + + write_lock_irqsave(g_lock, flags); + LASSERT(net->ibn_dev->ibd_nnets > 0); + net->ibn_dev->ibd_nnets--; + list_del(&net->ibn_list); + write_unlock_irqrestore(g_lock, flags); + + /* fall through */ + + case IBLND_INIT_NOTHING: + LASSERT(atomic_read(&net->ibn_nconns) == 0); + + if (net->ibn_dev != NULL && + net->ibn_dev->ibd_nnets == 0) + kiblnd_destroy_dev(net->ibn_dev); + + break; + } + + CDEBUG(D_MALLOC, "after LND net cleanup: kmem %d\n", + atomic_read(&libcfs_kmemory)); + + net->ibn_init = IBLND_INIT_NOTHING; + ni->ni_data = NULL; + + LIBCFS_FREE(net, sizeof(*net)); + +out: + if (list_empty(&kiblnd_data.kib_devs)) + kiblnd_base_shutdown(); +} + +static int kiblnd_base_startup(void) +{ + struct kib_sched_info *sched; + int rc; + int i; + + LASSERT(kiblnd_data.kib_init == IBLND_INIT_NOTHING); + + try_module_get(THIS_MODULE); + /* zero pointers, flags etc */ + memset(&kiblnd_data, 0, sizeof(kiblnd_data)); + + rwlock_init(&kiblnd_data.kib_global_lock); + + INIT_LIST_HEAD(&kiblnd_data.kib_devs); + INIT_LIST_HEAD(&kiblnd_data.kib_failed_devs); + + kiblnd_data.kib_peer_hash_size = IBLND_PEER_HASH_SIZE; + LIBCFS_ALLOC(kiblnd_data.kib_peers, + sizeof(struct list_head) * + kiblnd_data.kib_peer_hash_size); + if (kiblnd_data.kib_peers == NULL) + goto failed; + for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) + INIT_LIST_HEAD(&kiblnd_data.kib_peers[i]); + + spin_lock_init(&kiblnd_data.kib_connd_lock); + INIT_LIST_HEAD(&kiblnd_data.kib_connd_conns); + INIT_LIST_HEAD(&kiblnd_data.kib_connd_zombies); + init_waitqueue_head(&kiblnd_data.kib_connd_waitq); + init_waitqueue_head(&kiblnd_data.kib_failover_waitq); + + kiblnd_data.kib_scheds = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*sched)); + if (kiblnd_data.kib_scheds == NULL) + goto failed; + + cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds) { + int nthrs; + + spin_lock_init(&sched->ibs_lock); + INIT_LIST_HEAD(&sched->ibs_conns); + init_waitqueue_head(&sched->ibs_waitq); + + nthrs = cfs_cpt_weight(lnet_cpt_table(), i); + if (*kiblnd_tunables.kib_nscheds > 0) { + nthrs = min(nthrs, *kiblnd_tunables.kib_nscheds); + } else { + /* max to half of CPUs, another half is reserved for + * upper layer modules */ + nthrs = min(max(IBLND_N_SCHED, nthrs >> 1), nthrs); + } + + sched->ibs_nthreads_max = nthrs; + sched->ibs_cpt = i; + } + + kiblnd_data.kib_error_qpa.qp_state = IB_QPS_ERR; + + /* lists/ptrs/locks initialised */ + kiblnd_data.kib_init = IBLND_INIT_DATA; + /*****************************************************/ + + rc = kiblnd_thread_start(kiblnd_connd, NULL, "kiblnd_connd"); + if (rc != 0) { + CERROR("Can't spawn o2iblnd connd: %d\n", rc); + goto failed; + } + + if (*kiblnd_tunables.kib_dev_failover != 0) + rc = kiblnd_thread_start(kiblnd_failover_thread, NULL, + "kiblnd_failover"); + + if (rc != 0) { + CERROR("Can't spawn o2iblnd failover thread: %d\n", rc); + goto failed; + } + + /* flag everything initialised */ + kiblnd_data.kib_init = IBLND_INIT_ALL; + /*****************************************************/ + + return 0; + + failed: + kiblnd_base_shutdown(); + return -ENETDOWN; +} + +static int kiblnd_start_schedulers(struct kib_sched_info *sched) +{ + int rc = 0; + int nthrs; + int i; + + if (sched->ibs_nthreads == 0) { + if (*kiblnd_tunables.kib_nscheds > 0) { + nthrs = sched->ibs_nthreads_max; + } else 
{ + nthrs = cfs_cpt_weight(lnet_cpt_table(), + sched->ibs_cpt); + nthrs = min(max(IBLND_N_SCHED, nthrs >> 1), nthrs); + nthrs = min(IBLND_N_SCHED_HIGH, nthrs); + } + } else { + LASSERT(sched->ibs_nthreads <= sched->ibs_nthreads_max); + /* increase one thread if there is new interface */ + nthrs = sched->ibs_nthreads < sched->ibs_nthreads_max; + } + + for (i = 0; i < nthrs; i++) { + long id; + char name[20]; + + id = KIB_THREAD_ID(sched->ibs_cpt, sched->ibs_nthreads + i); + snprintf(name, sizeof(name), "kiblnd_sd_%02ld_%02ld", + KIB_THREAD_CPT(id), KIB_THREAD_TID(id)); + rc = kiblnd_thread_start(kiblnd_scheduler, (void *)id, name); + if (rc == 0) + continue; + + CERROR("Can't spawn thread %d for scheduler[%d]: %d\n", + sched->ibs_cpt, sched->ibs_nthreads + i, rc); + break; + } + + sched->ibs_nthreads += i; + return rc; +} + +static int kiblnd_dev_start_threads(kib_dev_t *dev, int newdev, __u32 *cpts, + int ncpts) +{ + int cpt; + int rc; + int i; + + for (i = 0; i < ncpts; i++) { + struct kib_sched_info *sched; + + cpt = (cpts == NULL) ? i : cpts[i]; + sched = kiblnd_data.kib_scheds[cpt]; + + if (!newdev && sched->ibs_nthreads > 0) + continue; + + rc = kiblnd_start_schedulers(kiblnd_data.kib_scheds[cpt]); + if (rc != 0) { + CERROR("Failed to start scheduler threads for %s\n", + dev->ibd_ifname); + return rc; + } + } + return 0; +} + +static kib_dev_t *kiblnd_dev_search(char *ifname) +{ + kib_dev_t *alias = NULL; + kib_dev_t *dev; + char *colon; + char *colon2; + + colon = strchr(ifname, ':'); + list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) { + if (strcmp(&dev->ibd_ifname[0], ifname) == 0) + return dev; + + if (alias != NULL) + continue; + + colon2 = strchr(dev->ibd_ifname, ':'); + if (colon != NULL) + *colon = 0; + if (colon2 != NULL) + *colon2 = 0; + + if (strcmp(&dev->ibd_ifname[0], ifname) == 0) + alias = dev; + + if (colon != NULL) + *colon = ':'; + if (colon2 != NULL) + *colon2 = ':'; + } + return alias; +} + +int kiblnd_startup(lnet_ni_t *ni) +{ + char *ifname; + kib_dev_t *ibdev = NULL; + kib_net_t *net; + struct timeval tv; + unsigned long flags; + int rc; + int newdev; + + LASSERT(ni->ni_lnd == &the_o2iblnd); + + if (kiblnd_data.kib_init == IBLND_INIT_NOTHING) { + rc = kiblnd_base_startup(); + if (rc != 0) + return rc; + } + + LIBCFS_ALLOC(net, sizeof(*net)); + ni->ni_data = net; + if (net == NULL) + goto net_failed; + + do_gettimeofday(&tv); + net->ibn_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; + + ni->ni_peertimeout = *kiblnd_tunables.kib_peertimeout; + ni->ni_maxtxcredits = *kiblnd_tunables.kib_credits; + ni->ni_peertxcredits = *kiblnd_tunables.kib_peertxcredits; + ni->ni_peerrtrcredits = *kiblnd_tunables.kib_peerrtrcredits; + + if (ni->ni_interfaces[0] != NULL) { + /* Use the IPoIB interface specified in 'networks=' */ + + CLASSERT(LNET_MAX_INTERFACES > 1); + if (ni->ni_interfaces[1] != NULL) { + CERROR("Multiple interfaces not supported\n"); + goto failed; + } + + ifname = ni->ni_interfaces[0]; + } else { + ifname = *kiblnd_tunables.kib_default_ipif; + } + + if (strlen(ifname) >= sizeof(ibdev->ibd_ifname)) { + CERROR("IPoIB interface name too long: %s\n", ifname); + goto failed; + } + + ibdev = kiblnd_dev_search(ifname); + + newdev = ibdev == NULL; + /* hmm...create kib_dev even for alias */ + if (ibdev == NULL || strcmp(&ibdev->ibd_ifname[0], ifname) != 0) + ibdev = kiblnd_create_dev(ifname); + + if (ibdev == NULL) + goto failed; + + net->ibn_dev = ibdev; + ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ibdev->ibd_ifip); + + rc = 
kiblnd_dev_start_threads(ibdev, newdev, + ni->ni_cpts, ni->ni_ncpts); + if (rc != 0) + goto failed; + + rc = kiblnd_net_init_pools(net, ni->ni_cpts, ni->ni_ncpts); + if (rc != 0) { + CERROR("Failed to initialize NI pools: %d\n", rc); + goto failed; + } + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + ibdev->ibd_nnets++; + list_add_tail(&net->ibn_list, &ibdev->ibd_nets); + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + net->ibn_init = IBLND_INIT_ALL; + + return 0; + +failed: + if (net->ibn_dev == NULL && ibdev != NULL) + kiblnd_destroy_dev(ibdev); + +net_failed: + kiblnd_shutdown(ni); + + CDEBUG(D_NET, "kiblnd_startup failed\n"); + return -ENETDOWN; +} + +static void __exit kiblnd_module_fini(void) +{ + lnet_unregister_lnd(&the_o2iblnd); +} + +static int __init kiblnd_module_init(void) +{ + int rc; + + CLASSERT(sizeof(kib_msg_t) <= IBLND_MSG_SIZE); + CLASSERT(offsetof(kib_msg_t, + ibm_u.get.ibgm_rd.rd_frags[IBLND_MAX_RDMA_FRAGS]) + <= IBLND_MSG_SIZE); + CLASSERT(offsetof(kib_msg_t, + ibm_u.putack.ibpam_rd.rd_frags[IBLND_MAX_RDMA_FRAGS]) + <= IBLND_MSG_SIZE); + + rc = kiblnd_tunables_init(); + if (rc != 0) + return rc; + + lnet_register_lnd(&the_o2iblnd); + + return 0; +} + +MODULE_AUTHOR("Sun Microsystems, Inc. "); +MODULE_DESCRIPTION("Kernel OpenIB gen2 LND v2.00"); +MODULE_LICENSE("GPL"); + +module_init(kiblnd_module_init); +module_exit(kiblnd_module_fini); diff --git a/kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h b/kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h new file mode 100644 index 000000000..cd664d025 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h @@ -0,0 +1,1030 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lnet/klnds/o2iblnd/o2iblnd.h + * + * Author: Eric Barton + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +#define DEBUG_SUBSYSTEM S_LND + +#include "../../../include/linux/libcfs/libcfs.h" +#include "../../../include/linux/lnet/lnet.h" +#include "../../../include/linux/lnet/lib-lnet.h" +#include "../../../include/linux/lnet/lnet-sysctl.h" + +#include +#include +#include +#include + +#define IBLND_PEER_HASH_SIZE 101 /* # peer lists */ +/* # scheduler loops before reschedule */ +#define IBLND_RESCHED 100 + +#define IBLND_N_SCHED 2 +#define IBLND_N_SCHED_HIGH 4 + +typedef struct { + int *kib_dev_failover; /* HCA failover */ + unsigned int *kib_service; /* IB service number */ + int *kib_min_reconnect_interval; /* first failed connection retry... */ + int *kib_max_reconnect_interval; /* ...exponentially increasing to this */ + int *kib_cksum; /* checksum kib_msg_t? */ + int *kib_timeout; /* comms timeout (seconds) */ + int *kib_keepalive; /* keepalive timeout (seconds) */ + int *kib_ntx; /* # tx descs */ + int *kib_credits; /* # concurrent sends */ + int *kib_peertxcredits; /* # concurrent sends to 1 peer */ + int *kib_peerrtrcredits; /* # per-peer router buffer credits */ + int *kib_peercredits_hiw; /* # when eagerly to return credits */ + int *kib_peertimeout; /* seconds to consider peer dead */ + char **kib_default_ipif; /* default IPoIB interface */ + int *kib_retry_count; + int *kib_rnr_retry_count; + int *kib_concurrent_sends; /* send work queue sizing */ + int *kib_ib_mtu; /* IB MTU */ + int *kib_map_on_demand; /* map-on-demand if RD has more fragments + * than this value, 0 disable map-on-demand */ + int *kib_pmr_pool_size; /* # physical MR in pool */ + int *kib_fmr_pool_size; /* # FMRs in pool */ + int *kib_fmr_flush_trigger; /* When to trigger FMR flush */ + int *kib_fmr_cache; /* enable FMR pool cache? */ + int *kib_require_priv_port;/* accept only privileged ports */ + int *kib_use_priv_port; /* use privileged port for active connect */ + /* # threads on each CPT */ + int *kib_nscheds; +} kib_tunables_t; + +extern kib_tunables_t kiblnd_tunables; + +#define IBLND_MSG_QUEUE_SIZE_V1 8 /* V1 only : # messages/RDMAs in-flight */ +#define IBLND_CREDIT_HIGHWATER_V1 7 /* V1 only : when eagerly to return credits */ + +#define IBLND_CREDITS_DEFAULT 8 /* default # of peer credits */ +#define IBLND_CREDITS_MAX ((typeof(((kib_msg_t*) 0)->ibm_credits)) - 1) /* Max # of peer credits */ + +#define IBLND_MSG_QUEUE_SIZE(v) ((v) == IBLND_MSG_VERSION_1 ? \ + IBLND_MSG_QUEUE_SIZE_V1 : \ + *kiblnd_tunables.kib_peertxcredits) /* # messages/RDMAs in-flight */ +#define IBLND_CREDITS_HIGHWATER(v) ((v) == IBLND_MSG_VERSION_1 ? \ + IBLND_CREDIT_HIGHWATER_V1 : \ + *kiblnd_tunables.kib_peercredits_hiw) /* when eagerly to return credits */ + +#define kiblnd_rdma_create_id(cb, dev, ps, qpt) rdma_create_id(cb, dev, ps, qpt) + +static inline int +kiblnd_concurrent_sends_v1(void) +{ + if (*kiblnd_tunables.kib_concurrent_sends > IBLND_MSG_QUEUE_SIZE_V1 * 2) + return IBLND_MSG_QUEUE_SIZE_V1 * 2; + + if (*kiblnd_tunables.kib_concurrent_sends < IBLND_MSG_QUEUE_SIZE_V1 / 2) + return IBLND_MSG_QUEUE_SIZE_V1 / 2; + + return *kiblnd_tunables.kib_concurrent_sends; +} + +#define IBLND_CONCURRENT_SENDS(v) ((v) == IBLND_MSG_VERSION_1 ? 
\ + kiblnd_concurrent_sends_v1() : \ + *kiblnd_tunables.kib_concurrent_sends) +/* 2 OOB shall suffice for 1 keepalive and 1 returning credits */ +#define IBLND_OOB_CAPABLE(v) ((v) != IBLND_MSG_VERSION_1) +#define IBLND_OOB_MSGS(v) (IBLND_OOB_CAPABLE(v) ? 2 : 0) + +#define IBLND_MSG_SIZE (4<<10) /* max size of queued messages (inc hdr) */ +#define IBLND_MAX_RDMA_FRAGS LNET_MAX_IOV /* max # of fragments supported */ +#define IBLND_CFG_RDMA_FRAGS (*kiblnd_tunables.kib_map_on_demand != 0 ? \ + *kiblnd_tunables.kib_map_on_demand : \ + IBLND_MAX_RDMA_FRAGS) /* max # of fragments configured by user */ +#define IBLND_RDMA_FRAGS(v) ((v) == IBLND_MSG_VERSION_1 ? \ + IBLND_MAX_RDMA_FRAGS : IBLND_CFG_RDMA_FRAGS) + +/************************/ +/* derived constants... */ +/* Pools (shared by connections on each CPT) */ +/* These pools can grow at runtime, so don't need give a very large value */ +#define IBLND_TX_POOL 256 +#define IBLND_PMR_POOL 256 +#define IBLND_FMR_POOL 256 +#define IBLND_FMR_POOL_FLUSH 192 + +/* TX messages (shared by all connections) */ +#define IBLND_TX_MSGS() (*kiblnd_tunables.kib_ntx) + +/* RX messages (per connection) */ +#define IBLND_RX_MSGS(v) (IBLND_MSG_QUEUE_SIZE(v) * 2 + IBLND_OOB_MSGS(v)) +#define IBLND_RX_MSG_BYTES(v) (IBLND_RX_MSGS(v) * IBLND_MSG_SIZE) +#define IBLND_RX_MSG_PAGES(v) ((IBLND_RX_MSG_BYTES(v) + PAGE_SIZE - 1) / PAGE_SIZE) + +/* WRs and CQEs (per connection) */ +#define IBLND_RECV_WRS(v) IBLND_RX_MSGS(v) +#define IBLND_SEND_WRS(v) ((IBLND_RDMA_FRAGS(v) + 1) * IBLND_CONCURRENT_SENDS(v)) +#define IBLND_CQ_ENTRIES(v) (IBLND_RECV_WRS(v) + IBLND_SEND_WRS(v)) + +struct kib_hca_dev; + +/* o2iblnd can run over aliased interface */ +#ifdef IFALIASZ +#define KIB_IFNAME_SIZE IFALIASZ +#else +#define KIB_IFNAME_SIZE 256 +#endif + +typedef struct { + struct list_head ibd_list; /* chain on kib_devs */ + struct list_head ibd_fail_list; /* chain on kib_failed_devs */ + __u32 ibd_ifip; /* IPoIB interface IP */ + /** IPoIB interface name */ + char ibd_ifname[KIB_IFNAME_SIZE]; + int ibd_nnets; /* # nets extant */ + + unsigned long ibd_next_failover; + int ibd_failed_failover; /* # failover failures */ + unsigned int ibd_failover; /* failover in progress */ + unsigned int ibd_can_failover; /* IPoIB interface is a bonding master */ + struct list_head ibd_nets; + struct kib_hca_dev *ibd_hdev; +} kib_dev_t; + +typedef struct kib_hca_dev { + struct rdma_cm_id *ibh_cmid; /* listener cmid */ + struct ib_device *ibh_ibdev; /* IB device */ + int ibh_page_shift; /* page shift of current HCA */ + int ibh_page_size; /* page size of current HCA */ + __u64 ibh_page_mask; /* page mask of current HCA */ + int ibh_mr_shift; /* bits shift of max MR size */ + __u64 ibh_mr_size; /* size of MR */ + int ibh_nmrs; /* # of global MRs */ + struct ib_mr **ibh_mrs; /* global MR */ + struct ib_pd *ibh_pd; /* PD */ + kib_dev_t *ibh_dev; /* owner */ + atomic_t ibh_ref; /* refcount */ +} kib_hca_dev_t; + +/** # of seconds to keep pool alive */ +#define IBLND_POOL_DEADLINE 300 +/** # of seconds to retry if allocation failed */ +#define IBLND_POOL_RETRY 1 + +typedef struct { + int ibp_npages; /* # pages */ + struct page *ibp_pages[0]; /* page array */ +} kib_pages_t; + +struct kib_pmr_pool; + +typedef struct { + struct list_head pmr_list; /* chain node */ + struct ib_phys_buf *pmr_ipb; /* physical buffer */ + struct ib_mr *pmr_mr; /* IB MR */ + struct kib_pmr_pool *pmr_pool; /* owner of this MR */ + __u64 pmr_iova; /* Virtual I/O address */ + int pmr_refcount; /* reference count */ +} kib_phys_mr_t; + 
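The per-connection work-request and CQ sizing macros above are plain arithmetic over the tunables, and it is easy to underestimate how quickly the send side dominates once map-on-demand is disabled. A minimal userspace sketch of that arithmetic follows (an editorial illustration, not part of this patch; the helper names and the default values used, 8 peer credits, 8 concurrent sends and 256 RDMA fragments, are assumptions rather than values taken from the code):

#include <stdio.h>

/* mirrors IBLND_RX_MSGS(): twice the message queue depth, plus 2 OOB slots
 * on version-2 connections */
static int rx_msgs(int queue_size, int oob_capable)
{
	return queue_size * 2 + (oob_capable ? 2 : 0);
}

/* mirrors IBLND_SEND_WRS(): one work request per RDMA fragment plus one for
 * the message itself, for every concurrent send */
static int send_wrs(int rdma_frags, int concurrent_sends)
{
	return (rdma_frags + 1) * concurrent_sends;
}

int main(void)
{
	int queue_size = 8;	/* assumed peer credits */
	int csends = 8;		/* assumed concurrent sends */
	int rdma_frags = 256;	/* assumed LNET_MAX_IOV, map-on-demand off */
	int recv = rx_msgs(queue_size, 1);
	int send = send_wrs(rdma_frags, csends);

	/* mirrors IBLND_CQ_ENTRIES(): the CQ must absorb every posted WR */
	printf("recv WRs %d, send WRs %d, CQ entries %d\n",
	       recv, send, recv + send);
	return 0;
}

With those assumed defaults the receive side needs only 18 WRs while the send side needs 2056, so the CQ depth is set almost entirely by (RDMA fragments + 1) * concurrent sends.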
+struct kib_pool; +struct kib_poolset; + +typedef int (*kib_ps_pool_create_t)(struct kib_poolset *ps, + int inc, struct kib_pool **pp_po); +typedef void (*kib_ps_pool_destroy_t)(struct kib_pool *po); +typedef void (*kib_ps_node_init_t)(struct kib_pool *po, struct list_head *node); +typedef void (*kib_ps_node_fini_t)(struct kib_pool *po, struct list_head *node); + +struct kib_net; + +#define IBLND_POOL_NAME_LEN 32 + +typedef struct kib_poolset { + spinlock_t ps_lock; /* serialize */ + struct kib_net *ps_net; /* network it belongs to */ + char ps_name[IBLND_POOL_NAME_LEN]; /* pool set name */ + struct list_head ps_pool_list; /* list of pools */ + struct list_head ps_failed_pool_list; /* failed pool list */ + unsigned long ps_next_retry; /* time stamp for retry if failed to allocate */ + int ps_increasing; /* is allocating new pool */ + int ps_pool_size; /* new pool size */ + int ps_cpt; /* CPT id */ + + kib_ps_pool_create_t ps_pool_create; /* create a new pool */ + kib_ps_pool_destroy_t ps_pool_destroy; /* destroy a pool */ + kib_ps_node_init_t ps_node_init; /* initialize new allocated node */ + kib_ps_node_fini_t ps_node_fini; /* finalize node */ +} kib_poolset_t; + +typedef struct kib_pool { + struct list_head po_list; /* chain on pool list */ + struct list_head po_free_list; /* pre-allocated node */ + kib_poolset_t *po_owner; /* pool_set of this pool */ + unsigned long po_deadline; /* deadline of this pool */ + int po_allocated; /* # of elements in use */ + int po_failed; /* pool is created on failed HCA */ + int po_size; /* # of pre-allocated elements */ +} kib_pool_t; + +typedef struct { + kib_poolset_t tps_poolset; /* pool-set */ + __u64 tps_next_tx_cookie; /* cookie of TX */ +} kib_tx_poolset_t; + +typedef struct { + kib_pool_t tpo_pool; /* pool */ + struct kib_hca_dev *tpo_hdev; /* device for this pool */ + struct kib_tx *tpo_tx_descs; /* all the tx descriptors */ + kib_pages_t *tpo_tx_pages; /* premapped tx msg pages */ +} kib_tx_pool_t; + +typedef struct { + kib_poolset_t pps_poolset; /* pool-set */ +} kib_pmr_poolset_t; + +typedef struct kib_pmr_pool { + struct kib_hca_dev *ppo_hdev; /* device for this pool */ + kib_pool_t ppo_pool; /* pool */ +} kib_pmr_pool_t; + +typedef struct { + spinlock_t fps_lock; /* serialize */ + struct kib_net *fps_net; /* IB network */ + struct list_head fps_pool_list; /* FMR pool list */ + struct list_head fps_failed_pool_list; /* FMR pool list */ + __u64 fps_version; /* validity stamp */ + int fps_cpt; /* CPT id */ + int fps_pool_size; + int fps_flush_trigger; + /* is allocating new pool */ + int fps_increasing; + /* time stamp for retry if failed to allocate */ + unsigned long fps_next_retry; +} kib_fmr_poolset_t; + +typedef struct { + struct list_head fpo_list; /* chain on pool list */ + struct kib_hca_dev *fpo_hdev; /* device for this pool */ + kib_fmr_poolset_t *fpo_owner; /* owner of this pool */ + struct ib_fmr_pool *fpo_fmr_pool; /* IB FMR pool */ + unsigned long fpo_deadline; /* deadline of this pool */ + int fpo_failed; /* fmr pool is failed */ + int fpo_map_count; /* # of mapped FMR */ +} kib_fmr_pool_t; + +typedef struct { + struct ib_pool_fmr *fmr_pfmr; /* IB pool fmr */ + kib_fmr_pool_t *fmr_pool; /* pool of FMR */ +} kib_fmr_t; + +typedef struct kib_net { + struct list_head ibn_list; /* chain on kib_dev_t::ibd_nets */ + __u64 ibn_incarnation; /* my epoch */ + int ibn_init; /* initialisation state */ + int ibn_shutdown; /* shutting down? 
*/ + + atomic_t ibn_npeers; /* # peers extant */ + atomic_t ibn_nconns; /* # connections extant */ + + kib_tx_poolset_t **ibn_tx_ps; /* tx pool-set */ + kib_fmr_poolset_t **ibn_fmr_ps; /* fmr pool-set */ + kib_pmr_poolset_t **ibn_pmr_ps; /* pmr pool-set */ + + kib_dev_t *ibn_dev; /* underlying IB device */ +} kib_net_t; + +#define KIB_THREAD_SHIFT 16 +#define KIB_THREAD_ID(cpt, tid) ((cpt) << KIB_THREAD_SHIFT | (tid)) +#define KIB_THREAD_CPT(id) ((id) >> KIB_THREAD_SHIFT) +#define KIB_THREAD_TID(id) ((id) & ((1UL << KIB_THREAD_SHIFT) - 1)) + +struct kib_sched_info { + /* serialise */ + spinlock_t ibs_lock; + /* schedulers sleep here */ + wait_queue_head_t ibs_waitq; + /* conns to check for rx completions */ + struct list_head ibs_conns; + /* number of scheduler threads */ + int ibs_nthreads; + /* max allowed scheduler threads */ + int ibs_nthreads_max; + int ibs_cpt; /* CPT id */ +}; + +typedef struct { + int kib_init; /* initialisation state */ + int kib_shutdown; /* shut down? */ + struct list_head kib_devs; /* IB devices extant */ + /* list head of failed devices */ + struct list_head kib_failed_devs; + /* schedulers sleep here */ + wait_queue_head_t kib_failover_waitq; + atomic_t kib_nthreads; /* # live threads */ + /* stabilize net/dev/peer/conn ops */ + rwlock_t kib_global_lock; + /* hash table of all my known peers */ + struct list_head *kib_peers; + /* size of kib_peers */ + int kib_peer_hash_size; + /* the connd task (serialisation assertions) */ + void *kib_connd; + /* connections to setup/teardown */ + struct list_head kib_connd_conns; + /* connections with zero refcount */ + struct list_head kib_connd_zombies; + /* connection daemon sleeps here */ + wait_queue_head_t kib_connd_waitq; + spinlock_t kib_connd_lock; /* serialise */ + struct ib_qp_attr kib_error_qpa; /* QP->ERROR */ + /* percpt data for schedulers */ + struct kib_sched_info **kib_scheds; +} kib_data_t; + +#define IBLND_INIT_NOTHING 0 +#define IBLND_INIT_DATA 1 +#define IBLND_INIT_ALL 2 + +/************************************************************************ + * IB Wire message format. + * These are sent in sender's byte order (i.e. receiver flips). + */ + +typedef struct kib_connparams { + __u16 ibcp_queue_depth; + __u16 ibcp_max_frags; + __u32 ibcp_max_msg_size; +} WIRE_ATTR kib_connparams_t; + +typedef struct { + lnet_hdr_t ibim_hdr; /* portals header */ + char ibim_payload[0]; /* piggy-backed payload */ +} WIRE_ATTR kib_immediate_msg_t; + +typedef struct { + __u32 rf_nob; /* # bytes this frag */ + __u64 rf_addr; /* CAVEAT EMPTOR: misaligned!! 
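+ * (in the WIRE_ATTR packed layout rf_addr starts at offset 4,
+ * immediately after the 32-bit rf_nob, hence the misalignment)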
*/ +} WIRE_ATTR kib_rdma_frag_t; + +typedef struct { + __u32 rd_key; /* local/remote key */ + __u32 rd_nfrags; /* # fragments */ + kib_rdma_frag_t rd_frags[0]; /* buffer frags */ +} WIRE_ATTR kib_rdma_desc_t; + +typedef struct { + lnet_hdr_t ibprm_hdr; /* portals header */ + __u64 ibprm_cookie; /* opaque completion cookie */ +} WIRE_ATTR kib_putreq_msg_t; + +typedef struct { + __u64 ibpam_src_cookie; /* reflected completion cookie */ + __u64 ibpam_dst_cookie; /* opaque completion cookie */ + kib_rdma_desc_t ibpam_rd; /* sender's sink buffer */ +} WIRE_ATTR kib_putack_msg_t; + +typedef struct { + lnet_hdr_t ibgm_hdr; /* portals header */ + __u64 ibgm_cookie; /* opaque completion cookie */ + kib_rdma_desc_t ibgm_rd; /* rdma descriptor */ +} WIRE_ATTR kib_get_msg_t; + +typedef struct { + __u64 ibcm_cookie; /* opaque completion cookie */ + __s32 ibcm_status; /* < 0 failure: >= 0 length */ +} WIRE_ATTR kib_completion_msg_t; + +typedef struct { + /* First 2 fields fixed FOR ALL TIME */ + __u32 ibm_magic; /* I'm an ibnal message */ + __u16 ibm_version; /* this is my version number */ + + __u8 ibm_type; /* msg type */ + __u8 ibm_credits; /* returned credits */ + __u32 ibm_nob; /* # bytes in whole message */ + __u32 ibm_cksum; /* checksum (0 == no checksum) */ + __u64 ibm_srcnid; /* sender's NID */ + __u64 ibm_srcstamp; /* sender's incarnation */ + __u64 ibm_dstnid; /* destination's NID */ + __u64 ibm_dststamp; /* destination's incarnation */ + + union { + kib_connparams_t connparams; + kib_immediate_msg_t immediate; + kib_putreq_msg_t putreq; + kib_putack_msg_t putack; + kib_get_msg_t get; + kib_completion_msg_t completion; + } WIRE_ATTR ibm_u; +} WIRE_ATTR kib_msg_t; + +#define IBLND_MSG_MAGIC LNET_PROTO_IB_MAGIC /* unique magic */ + +#define IBLND_MSG_VERSION_1 0x11 +#define IBLND_MSG_VERSION_2 0x12 +#define IBLND_MSG_VERSION IBLND_MSG_VERSION_2 + +#define IBLND_MSG_CONNREQ 0xc0 /* connection request */ +#define IBLND_MSG_CONNACK 0xc1 /* connection acknowledge */ +#define IBLND_MSG_NOOP 0xd0 /* nothing (just credits) */ +#define IBLND_MSG_IMMEDIATE 0xd1 /* immediate */ +#define IBLND_MSG_PUT_REQ 0xd2 /* putreq (src->sink) */ +#define IBLND_MSG_PUT_NAK 0xd3 /* completion (sink->src) */ +#define IBLND_MSG_PUT_ACK 0xd4 /* putack (sink->src) */ +#define IBLND_MSG_PUT_DONE 0xd5 /* completion (src->sink) */ +#define IBLND_MSG_GET_REQ 0xd6 /* getreq (sink->src) */ +#define IBLND_MSG_GET_DONE 0xd7 /* completion (src->sink: all OK) */ + +typedef struct { + __u32 ibr_magic; /* sender's magic */ + __u16 ibr_version; /* sender's version */ + __u8 ibr_why; /* reject reason */ + __u8 ibr_padding; /* padding */ + __u64 ibr_incarnation; /* incarnation of peer */ + kib_connparams_t ibr_cp; /* connection parameters */ +} WIRE_ATTR kib_rej_t; + +/* connection rejection reasons */ +#define IBLND_REJECT_CONN_RACE 1 /* You lost connection race */ +#define IBLND_REJECT_NO_RESOURCES 2 /* Out of memory/conns etc */ +#define IBLND_REJECT_FATAL 3 /* Anything else */ + +#define IBLND_REJECT_CONN_UNCOMPAT 4 /* incompatible version peer */ +#define IBLND_REJECT_CONN_STALE 5 /* stale peer */ + +#define IBLND_REJECT_RDMA_FRAGS 6 /* Fatal: peer's rdma frags can't match mine */ +#define IBLND_REJECT_MSG_QUEUE_SIZE 7 /* Fatal: peer's msg queue size can't match mine */ + +/***********************************************************************/ + +typedef struct kib_rx /* receive message */ +{ + struct list_head rx_list; /* queue for attention */ + struct kib_conn *rx_conn; /* owning conn */ + int rx_nob; /* # bytes received (-1 
while posted) */ + enum ib_wc_status rx_status; /* completion status */ + kib_msg_t *rx_msg; /* message buffer (host vaddr) */ + __u64 rx_msgaddr; /* message buffer (I/O addr) */ + DECLARE_PCI_UNMAP_ADDR (rx_msgunmap); /* for dma_unmap_single() */ + struct ib_recv_wr rx_wrq; /* receive work item... */ + struct ib_sge rx_sge; /* ...and its memory */ +} kib_rx_t; + +#define IBLND_POSTRX_DONT_POST 0 /* don't post */ +#define IBLND_POSTRX_NO_CREDIT 1 /* post: no credits */ +#define IBLND_POSTRX_PEER_CREDIT 2 /* post: give peer back 1 credit */ +#define IBLND_POSTRX_RSRVD_CREDIT 3 /* post: give myself back 1 reserved credit */ + +typedef struct kib_tx /* transmit message */ +{ + struct list_head tx_list; /* queue on idle_txs ibc_tx_queue etc. */ + kib_tx_pool_t *tx_pool; /* pool I'm from */ + struct kib_conn *tx_conn; /* owning conn */ + short tx_sending; /* # tx callbacks outstanding */ + short tx_queued; /* queued for sending */ + short tx_waiting; /* waiting for peer */ + int tx_status; /* LNET completion status */ + unsigned long tx_deadline; /* completion deadline */ + __u64 tx_cookie; /* completion cookie */ + lnet_msg_t *tx_lntmsg[2]; /* lnet msgs to finalize on completion */ + kib_msg_t *tx_msg; /* message buffer (host vaddr) */ + __u64 tx_msgaddr; /* message buffer (I/O addr) */ + DECLARE_PCI_UNMAP_ADDR (tx_msgunmap); /* for dma_unmap_single() */ + int tx_nwrq; /* # send work items */ + struct ib_send_wr *tx_wrq; /* send work items... */ + struct ib_sge *tx_sge; /* ...and their memory */ + kib_rdma_desc_t *tx_rd; /* rdma descriptor */ + int tx_nfrags; /* # entries in... */ + struct scatterlist *tx_frags; /* dma_map_sg descriptor */ + __u64 *tx_pages; /* rdma phys page addrs */ + union { + kib_phys_mr_t *pmr; /* MR for physical buffer */ + kib_fmr_t fmr; /* FMR */ + } tx_u; + int tx_dmadir; /* dma direction */ +} kib_tx_t; + +typedef struct kib_connvars { + /* connection-in-progress variables */ + kib_msg_t cv_msg; +} kib_connvars_t; + +typedef struct kib_conn { + struct kib_sched_info *ibc_sched; /* scheduler information */ + struct kib_peer *ibc_peer; /* owning peer */ + kib_hca_dev_t *ibc_hdev; /* HCA bound on */ + struct list_head ibc_list; /* stash on peer's conn list */ + struct list_head ibc_sched_list; /* schedule for attention */ + __u16 ibc_version; /* version of connection */ + __u64 ibc_incarnation; /* which instance of the peer */ + atomic_t ibc_refcount; /* # users */ + int ibc_state; /* what's happening */ + int ibc_nsends_posted; /* # uncompleted sends */ + int ibc_noops_posted; /* # uncompleted NOOPs */ + int ibc_credits; /* # credits I have */ + int ibc_outstanding_credits; /* # credits to return */ + int ibc_reserved_credits;/* # ACK/DONE msg credits */ + int ibc_comms_error; /* set on comms error */ + unsigned int ibc_nrx:16; /* receive buffers owned */ + unsigned int ibc_scheduled:1; /* scheduled for attention */ + unsigned int ibc_ready:1; /* CQ callback fired */ + /* time of last send */ + unsigned long ibc_last_send; + /** link chain for kiblnd_check_conns only */ + struct list_head ibc_connd_list; + /** rxs completed before ESTABLISHED */ + struct list_head ibc_early_rxs; + /** IBLND_MSG_NOOPs for IBLND_MSG_VERSION_1 */ + struct list_head ibc_tx_noops; + struct list_head ibc_tx_queue; /* sends that need a credit */ + struct list_head ibc_tx_queue_nocred;/* sends that don't need a credit */ + struct list_head ibc_tx_queue_rsrvd; /* sends that need to reserve an ACK/DONE msg */ + struct list_head ibc_active_txs; /* active tx awaiting completion */ + spinlock_t 
ibc_lock; /* serialise */ + kib_rx_t *ibc_rxs; /* the rx descs */ + kib_pages_t *ibc_rx_pages; /* premapped rx msg pages */ + + struct rdma_cm_id *ibc_cmid; /* CM id */ + struct ib_cq *ibc_cq; /* completion queue */ + + kib_connvars_t *ibc_connvars; /* in-progress connection state */ +} kib_conn_t; + +#define IBLND_CONN_INIT 0 /* being initialised */ +#define IBLND_CONN_ACTIVE_CONNECT 1 /* active sending req */ +#define IBLND_CONN_PASSIVE_WAIT 2 /* passive waiting for rtu */ +#define IBLND_CONN_ESTABLISHED 3 /* connection established */ +#define IBLND_CONN_CLOSING 4 /* being closed */ +#define IBLND_CONN_DISCONNECTED 5 /* disconnected */ + +typedef struct kib_peer { + struct list_head ibp_list; /* stash on global peer list */ + lnet_nid_t ibp_nid; /* who's on the other end(s) */ + lnet_ni_t *ibp_ni; /* LNet interface */ + atomic_t ibp_refcount; /* # users */ + struct list_head ibp_conns; /* all active connections */ + struct list_head ibp_tx_queue; /* msgs waiting for a conn */ + __u16 ibp_version; /* version of peer */ + __u64 ibp_incarnation; /* incarnation of peer */ + int ibp_connecting; /* current active connection attempts */ + int ibp_accepting; /* current passive connection attempts */ + int ibp_error; /* errno on closing this peer */ + unsigned long ibp_last_alive; /* when (in jiffies) I was last alive */ +} kib_peer_t; + +extern kib_data_t kiblnd_data; + +extern void kiblnd_hdev_destroy(kib_hca_dev_t *hdev); + +static inline void +kiblnd_hdev_addref_locked(kib_hca_dev_t *hdev) +{ + LASSERT (atomic_read(&hdev->ibh_ref) > 0); + atomic_inc(&hdev->ibh_ref); +} + +static inline void +kiblnd_hdev_decref(kib_hca_dev_t *hdev) +{ + LASSERT (atomic_read(&hdev->ibh_ref) > 0); + if (atomic_dec_and_test(&hdev->ibh_ref)) + kiblnd_hdev_destroy(hdev); +} + +static inline int +kiblnd_dev_can_failover(kib_dev_t *dev) +{ + if (!list_empty(&dev->ibd_fail_list)) /* already scheduled */ + return 0; + + if (*kiblnd_tunables.kib_dev_failover == 0) /* disabled */ + return 0; + + if (*kiblnd_tunables.kib_dev_failover > 1) /* force failover */ + return 1; + + return dev->ibd_can_failover; +} + +#define kiblnd_conn_addref(conn) \ +do { \ + CDEBUG(D_NET, "conn[%p] (%d)++\n", \ + (conn), atomic_read(&(conn)->ibc_refcount)); \ + atomic_inc(&(conn)->ibc_refcount); \ +} while (0) + +#define kiblnd_conn_decref(conn) \ +do { \ + unsigned long flags; \ + \ + CDEBUG(D_NET, "conn[%p] (%d)--\n", \ + (conn), atomic_read(&(conn)->ibc_refcount)); \ + LASSERT_ATOMIC_POS(&(conn)->ibc_refcount); \ + if (atomic_dec_and_test(&(conn)->ibc_refcount)) { \ + spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags); \ + list_add_tail(&(conn)->ibc_list, \ + &kiblnd_data.kib_connd_zombies); \ + wake_up(&kiblnd_data.kib_connd_waitq); \ + spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);\ + } \ +} while (0) + +#define kiblnd_peer_addref(peer) \ +do { \ + CDEBUG(D_NET, "peer[%p] -> %s (%d)++\n", \ + (peer), libcfs_nid2str((peer)->ibp_nid), \ + atomic_read (&(peer)->ibp_refcount)); \ + atomic_inc(&(peer)->ibp_refcount); \ +} while (0) + +#define kiblnd_peer_decref(peer) \ +do { \ + CDEBUG(D_NET, "peer[%p] -> %s (%d)--\n", \ + (peer), libcfs_nid2str((peer)->ibp_nid), \ + atomic_read (&(peer)->ibp_refcount)); \ + LASSERT_ATOMIC_POS(&(peer)->ibp_refcount); \ + if (atomic_dec_and_test(&(peer)->ibp_refcount)) \ + kiblnd_destroy_peer(peer); \ +} while (0) + +static inline struct list_head * +kiblnd_nid2peerlist (lnet_nid_t nid) +{ + unsigned int hash = + ((unsigned int)nid) % kiblnd_data.kib_peer_hash_size; + + return 
(&kiblnd_data.kib_peers [hash]); +} + +static inline int +kiblnd_peer_active (kib_peer_t *peer) +{ + /* Am I in the peer hash table? */ + return (!list_empty(&peer->ibp_list)); +} + +static inline kib_conn_t * +kiblnd_get_conn_locked (kib_peer_t *peer) +{ + LASSERT (!list_empty(&peer->ibp_conns)); + + /* just return the first connection */ + return list_entry(peer->ibp_conns.next, kib_conn_t, ibc_list); +} + +static inline int +kiblnd_send_keepalive(kib_conn_t *conn) +{ + return (*kiblnd_tunables.kib_keepalive > 0) && + cfs_time_after(jiffies, conn->ibc_last_send + + *kiblnd_tunables.kib_keepalive*HZ); +} + +static inline int +kiblnd_need_noop(kib_conn_t *conn) +{ + LASSERT (conn->ibc_state >= IBLND_CONN_ESTABLISHED); + + if (conn->ibc_outstanding_credits < + IBLND_CREDITS_HIGHWATER(conn->ibc_version) && + !kiblnd_send_keepalive(conn)) + return 0; /* No need to send NOOP */ + + if (IBLND_OOB_CAPABLE(conn->ibc_version)) { + if (!list_empty(&conn->ibc_tx_queue_nocred)) + return 0; /* NOOP can be piggybacked */ + + /* No tx to piggyback NOOP onto or no credit to send a tx */ + return (list_empty(&conn->ibc_tx_queue) || + conn->ibc_credits == 0); + } + + if (!list_empty(&conn->ibc_tx_noops) || /* NOOP already queued */ + !list_empty(&conn->ibc_tx_queue_nocred) || /* piggyback NOOP */ + conn->ibc_credits == 0) /* no credit */ + return 0; + + if (conn->ibc_credits == 1 && /* last credit reserved for */ + conn->ibc_outstanding_credits == 0) /* giving back credits */ + return 0; + + /* No tx to piggyback NOOP onto or no credit to send a tx */ + return (list_empty(&conn->ibc_tx_queue) || conn->ibc_credits == 1); +} + +static inline void +kiblnd_abort_receives(kib_conn_t *conn) +{ + ib_modify_qp(conn->ibc_cmid->qp, + &kiblnd_data.kib_error_qpa, IB_QP_STATE); +} + +static inline const char * +kiblnd_queue2str (kib_conn_t *conn, struct list_head *q) +{ + if (q == &conn->ibc_tx_queue) + return "tx_queue"; + + if (q == &conn->ibc_tx_queue_rsrvd) + return "tx_queue_rsrvd"; + + if (q == &conn->ibc_tx_queue_nocred) + return "tx_queue_nocred"; + + if (q == &conn->ibc_active_txs) + return "active_txs"; + + LBUG(); + return NULL; +} + +/* CAVEAT EMPTOR: We rely on descriptor alignment to allow us to use the + * lowest bits of the work request id to stash the work item type. 
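+ * IBLND_WID_MASK is 3, so any descriptor that is at least 4-byte aligned
+ * leaves its two low bits clear: kiblnd_ptr2wreqid() ORs the work item type
+ * into those bits, and kiblnd_wreqid2ptr()/kiblnd_wreqid2type() recover the
+ * pointer and the type again by masking.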
*/ + +#define IBLND_WID_TX 0 +#define IBLND_WID_RDMA 1 +#define IBLND_WID_RX 2 +#define IBLND_WID_MASK 3UL + +static inline __u64 +kiblnd_ptr2wreqid (void *ptr, int type) +{ + unsigned long lptr = (unsigned long)ptr; + + LASSERT ((lptr & IBLND_WID_MASK) == 0); + LASSERT ((type & ~IBLND_WID_MASK) == 0); + return (__u64)(lptr | type); +} + +static inline void * +kiblnd_wreqid2ptr (__u64 wreqid) +{ + return (void *)(((unsigned long)wreqid) & ~IBLND_WID_MASK); +} + +static inline int +kiblnd_wreqid2type (__u64 wreqid) +{ + return (wreqid & IBLND_WID_MASK); +} + +static inline void +kiblnd_set_conn_state (kib_conn_t *conn, int state) +{ + conn->ibc_state = state; + mb(); +} + +static inline void +kiblnd_init_msg (kib_msg_t *msg, int type, int body_nob) +{ + msg->ibm_type = type; + msg->ibm_nob = offsetof(kib_msg_t, ibm_u) + body_nob; +} + +static inline int +kiblnd_rd_size (kib_rdma_desc_t *rd) +{ + int i; + int size; + + for (i = size = 0; i < rd->rd_nfrags; i++) + size += rd->rd_frags[i].rf_nob; + + return size; +} + +static inline __u64 +kiblnd_rd_frag_addr(kib_rdma_desc_t *rd, int index) +{ + return rd->rd_frags[index].rf_addr; +} + +static inline __u32 +kiblnd_rd_frag_size(kib_rdma_desc_t *rd, int index) +{ + return rd->rd_frags[index].rf_nob; +} + +static inline __u32 +kiblnd_rd_frag_key(kib_rdma_desc_t *rd, int index) +{ + return rd->rd_key; +} + +static inline int +kiblnd_rd_consume_frag(kib_rdma_desc_t *rd, int index, __u32 nob) +{ + if (nob < rd->rd_frags[index].rf_nob) { + rd->rd_frags[index].rf_addr += nob; + rd->rd_frags[index].rf_nob -= nob; + } else { + index ++; + } + + return index; +} + +static inline int +kiblnd_rd_msg_size(kib_rdma_desc_t *rd, int msgtype, int n) +{ + LASSERT (msgtype == IBLND_MSG_GET_REQ || + msgtype == IBLND_MSG_PUT_ACK); + + return msgtype == IBLND_MSG_GET_REQ ? 
+ offsetof(kib_get_msg_t, ibgm_rd.rd_frags[n]) : + offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[n]); +} + + +static inline __u64 +kiblnd_dma_mapping_error(struct ib_device *dev, u64 dma_addr) +{ + return ib_dma_mapping_error(dev, dma_addr); +} + +static inline __u64 kiblnd_dma_map_single(struct ib_device *dev, + void *msg, size_t size, + enum dma_data_direction direction) +{ + return ib_dma_map_single(dev, msg, size, direction); +} + +static inline void kiblnd_dma_unmap_single(struct ib_device *dev, + __u64 addr, size_t size, + enum dma_data_direction direction) +{ + ib_dma_unmap_single(dev, addr, size, direction); +} + +#define KIBLND_UNMAP_ADDR_SET(p, m, a) do {} while (0) +#define KIBLND_UNMAP_ADDR(p, m, a) (a) + +static inline int kiblnd_dma_map_sg(struct ib_device *dev, + struct scatterlist *sg, int nents, + enum dma_data_direction direction) +{ + return ib_dma_map_sg(dev, sg, nents, direction); +} + +static inline void kiblnd_dma_unmap_sg(struct ib_device *dev, + struct scatterlist *sg, int nents, + enum dma_data_direction direction) +{ + ib_dma_unmap_sg(dev, sg, nents, direction); +} + +static inline __u64 kiblnd_sg_dma_address(struct ib_device *dev, + struct scatterlist *sg) +{ + return ib_sg_dma_address(dev, sg); +} + +static inline unsigned int kiblnd_sg_dma_len(struct ib_device *dev, + struct scatterlist *sg) +{ + return ib_sg_dma_len(dev, sg); +} + +/* XXX We use KIBLND_CONN_PARAM(e) as writable buffer, it's not strictly + * right because OFED1.2 defines it as const, to use it we have to add + * (void *) cast to overcome "const" */ + +#define KIBLND_CONN_PARAM(e) ((e)->param.conn.private_data) +#define KIBLND_CONN_PARAM_LEN(e) ((e)->param.conn.private_data_len) + + +struct ib_mr *kiblnd_find_rd_dma_mr(kib_hca_dev_t *hdev, + kib_rdma_desc_t *rd); +struct ib_mr *kiblnd_find_dma_mr(kib_hca_dev_t *hdev, + __u64 addr, __u64 size); +void kiblnd_map_rx_descs(kib_conn_t *conn); +void kiblnd_unmap_rx_descs(kib_conn_t *conn); +int kiblnd_map_tx(lnet_ni_t *ni, kib_tx_t *tx, + kib_rdma_desc_t *rd, int nfrags); +void kiblnd_unmap_tx(lnet_ni_t *ni, kib_tx_t *tx); +void kiblnd_pool_free_node(kib_pool_t *pool, struct list_head *node); +struct list_head *kiblnd_pool_alloc_node(kib_poolset_t *ps); + +int kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, __u64 *pages, + int npages, __u64 iov, kib_fmr_t *fmr); +void kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status); + +int kiblnd_pmr_pool_map(kib_pmr_poolset_t *pps, kib_hca_dev_t *hdev, + kib_rdma_desc_t *rd, __u64 *iova, kib_phys_mr_t **pp_pmr); +void kiblnd_pmr_pool_unmap(kib_phys_mr_t *pmr); + +int kiblnd_startup (lnet_ni_t *ni); +void kiblnd_shutdown (lnet_ni_t *ni); +int kiblnd_ctl (lnet_ni_t *ni, unsigned int cmd, void *arg); +void kiblnd_query (struct lnet_ni *ni, lnet_nid_t nid, unsigned long *when); + +int kiblnd_tunables_init(void); +void kiblnd_tunables_fini(void); + +int kiblnd_connd (void *arg); +int kiblnd_scheduler(void *arg); +int kiblnd_thread_start(int (*fn)(void *arg), void *arg, char *name); +int kiblnd_failover_thread (void *arg); + +int kiblnd_alloc_pages(kib_pages_t **pp, int cpt, int npages); +void kiblnd_free_pages (kib_pages_t *p); + +int kiblnd_cm_callback(struct rdma_cm_id *cmid, + struct rdma_cm_event *event); +int kiblnd_translate_mtu(int value); + +int kiblnd_dev_failover(kib_dev_t *dev); +int kiblnd_create_peer (lnet_ni_t *ni, kib_peer_t **peerp, lnet_nid_t nid); +void kiblnd_destroy_peer (kib_peer_t *peer); +void kiblnd_destroy_dev (kib_dev_t *dev); +void kiblnd_unlink_peer_locked (kib_peer_t *peer); +void 
kiblnd_peer_alive (kib_peer_t *peer); +kib_peer_t *kiblnd_find_peer_locked (lnet_nid_t nid); +void kiblnd_peer_connect_failed (kib_peer_t *peer, int active, int error); +int kiblnd_close_stale_conns_locked (kib_peer_t *peer, + int version, __u64 incarnation); +int kiblnd_close_peer_conns_locked (kib_peer_t *peer, int why); + +void kiblnd_connreq_done(kib_conn_t *conn, int status); +kib_conn_t *kiblnd_create_conn (kib_peer_t *peer, struct rdma_cm_id *cmid, + int state, int version); +void kiblnd_destroy_conn (kib_conn_t *conn); +void kiblnd_close_conn (kib_conn_t *conn, int error); +void kiblnd_close_conn_locked (kib_conn_t *conn, int error); + +int kiblnd_init_rdma (kib_conn_t *conn, kib_tx_t *tx, int type, + int nob, kib_rdma_desc_t *dstrd, __u64 dstcookie); + +void kiblnd_launch_tx (lnet_ni_t *ni, kib_tx_t *tx, lnet_nid_t nid); +void kiblnd_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn); +void kiblnd_queue_tx (kib_tx_t *tx, kib_conn_t *conn); +void kiblnd_init_tx_msg (lnet_ni_t *ni, kib_tx_t *tx, int type, int body_nob); +void kiblnd_txlist_done (lnet_ni_t *ni, struct list_head *txlist, + int status); +void kiblnd_check_sends (kib_conn_t *conn); + +void kiblnd_qp_event(struct ib_event *event, void *arg); +void kiblnd_cq_event(struct ib_event *event, void *arg); +void kiblnd_cq_completion(struct ib_cq *cq, void *arg); + +void kiblnd_pack_msg (lnet_ni_t *ni, kib_msg_t *msg, int version, + int credits, lnet_nid_t dstnid, __u64 dststamp); +int kiblnd_unpack_msg(kib_msg_t *msg, int nob); +int kiblnd_post_rx (kib_rx_t *rx, int credit); + +int kiblnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg); +int kiblnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed, + unsigned int niov, struct kvec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen); diff --git a/kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c b/kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c new file mode 100644 index 000000000..dbf374983 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c @@ -0,0 +1,3519 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lnet/klnds/o2iblnd/o2iblnd_cb.c + * + * Author: Eric Barton + */ + +#include "o2iblnd.h" + +static void +kiblnd_tx_done(lnet_ni_t *ni, kib_tx_t *tx) +{ + lnet_msg_t *lntmsg[2]; + kib_net_t *net = ni->ni_data; + int rc; + int i; + + LASSERT(net != NULL); + LASSERT(!in_interrupt()); + LASSERT(!tx->tx_queued); /* mustn't be queued for sending */ + LASSERT(tx->tx_sending == 0); /* mustn't be awaiting sent callback */ + LASSERT(!tx->tx_waiting); /* mustn't be awaiting peer response */ + LASSERT(tx->tx_pool != NULL); + + kiblnd_unmap_tx(ni, tx); + + /* tx may have up to 2 lnet msgs to finalise */ + lntmsg[0] = tx->tx_lntmsg[0]; tx->tx_lntmsg[0] = NULL; + lntmsg[1] = tx->tx_lntmsg[1]; tx->tx_lntmsg[1] = NULL; + rc = tx->tx_status; + + if (tx->tx_conn != NULL) { + LASSERT(ni == tx->tx_conn->ibc_peer->ibp_ni); + + kiblnd_conn_decref(tx->tx_conn); + tx->tx_conn = NULL; + } + + tx->tx_nwrq = 0; + tx->tx_status = 0; + + kiblnd_pool_free_node(&tx->tx_pool->tpo_pool, &tx->tx_list); + + /* delay finalize until my descs have been freed */ + for (i = 0; i < 2; i++) { + if (lntmsg[i] == NULL) + continue; + + lnet_finalize(ni, lntmsg[i], rc); + } +} + +void +kiblnd_txlist_done(lnet_ni_t *ni, struct list_head *txlist, int status) +{ + kib_tx_t *tx; + + while (!list_empty(txlist)) { + tx = list_entry(txlist->next, kib_tx_t, tx_list); + + list_del(&tx->tx_list); + /* complete now */ + tx->tx_waiting = 0; + tx->tx_status = status; + kiblnd_tx_done(ni, tx); + } +} + +static kib_tx_t * +kiblnd_get_idle_tx(lnet_ni_t *ni, lnet_nid_t target) +{ + kib_net_t *net = (kib_net_t *)ni->ni_data; + struct list_head *node; + kib_tx_t *tx; + kib_tx_poolset_t *tps; + + tps = net->ibn_tx_ps[lnet_cpt_of_nid(target)]; + node = kiblnd_pool_alloc_node(&tps->tps_poolset); + if (node == NULL) + return NULL; + tx = container_of(node, kib_tx_t, tx_list); + + LASSERT(tx->tx_nwrq == 0); + LASSERT(!tx->tx_queued); + LASSERT(tx->tx_sending == 0); + LASSERT(!tx->tx_waiting); + LASSERT(tx->tx_status == 0); + LASSERT(tx->tx_conn == NULL); + LASSERT(tx->tx_lntmsg[0] == NULL); + LASSERT(tx->tx_lntmsg[1] == NULL); + LASSERT(tx->tx_u.pmr == NULL); + LASSERT(tx->tx_nfrags == 0); + + return tx; +} + +static void +kiblnd_drop_rx(kib_rx_t *rx) +{ + kib_conn_t *conn = rx->rx_conn; + struct kib_sched_info *sched = conn->ibc_sched; + unsigned long flags; + + spin_lock_irqsave(&sched->ibs_lock, flags); + LASSERT(conn->ibc_nrx > 0); + conn->ibc_nrx--; + spin_unlock_irqrestore(&sched->ibs_lock, flags); + + kiblnd_conn_decref(conn); +} + +int +kiblnd_post_rx(kib_rx_t *rx, int credit) +{ + kib_conn_t *conn = rx->rx_conn; + kib_net_t *net = conn->ibc_peer->ibp_ni->ni_data; + struct ib_recv_wr *bad_wrq = NULL; + struct ib_mr *mr; + int rc; + + LASSERT(net != NULL); + LASSERT(!in_interrupt()); + LASSERT(credit == IBLND_POSTRX_NO_CREDIT || + credit == IBLND_POSTRX_PEER_CREDIT || + credit == IBLND_POSTRX_RSRVD_CREDIT); + + mr = kiblnd_find_dma_mr(conn->ibc_hdev, rx->rx_msgaddr, IBLND_MSG_SIZE); + LASSERT(mr != NULL); + + rx->rx_sge.lkey = mr->lkey; + rx->rx_sge.addr = rx->rx_msgaddr; + rx->rx_sge.length = IBLND_MSG_SIZE; + + rx->rx_wrq.next = NULL; + rx->rx_wrq.sg_list = &rx->rx_sge; + rx->rx_wrq.num_sge = 1; + rx->rx_wrq.wr_id = kiblnd_ptr2wreqid(rx, IBLND_WID_RX); + + LASSERT(conn->ibc_state >= IBLND_CONN_INIT); + LASSERT(rx->rx_nob >= 0); /* not posted */ + + if (conn->ibc_state > IBLND_CONN_ESTABLISHED) { + kiblnd_drop_rx(rx); /* No more posts for this rx */ + return 0; + } + + rx->rx_nob = -1; /* flag posted */ + + rc = 
ib_post_recv(conn->ibc_cmid->qp, &rx->rx_wrq, &bad_wrq); + if (rc != 0) { + CERROR("Can't post rx for %s: %d, bad_wrq: %p\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), rc, bad_wrq); + rx->rx_nob = 0; + } + + if (conn->ibc_state < IBLND_CONN_ESTABLISHED) /* Initial post */ + return rc; + + if (rc != 0) { + kiblnd_close_conn(conn, rc); + kiblnd_drop_rx(rx); /* No more posts for this rx */ + return rc; + } + + if (credit == IBLND_POSTRX_NO_CREDIT) + return 0; + + spin_lock(&conn->ibc_lock); + if (credit == IBLND_POSTRX_PEER_CREDIT) + conn->ibc_outstanding_credits++; + else + conn->ibc_reserved_credits++; + spin_unlock(&conn->ibc_lock); + + kiblnd_check_sends(conn); + return 0; +} + +static kib_tx_t * +kiblnd_find_waiting_tx_locked(kib_conn_t *conn, int txtype, __u64 cookie) +{ + struct list_head *tmp; + + list_for_each(tmp, &conn->ibc_active_txs) { + kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list); + + LASSERT(!tx->tx_queued); + LASSERT(tx->tx_sending != 0 || tx->tx_waiting); + + if (tx->tx_cookie != cookie) + continue; + + if (tx->tx_waiting && + tx->tx_msg->ibm_type == txtype) + return tx; + + CWARN("Bad completion: %swaiting, type %x (wanted %x)\n", + tx->tx_waiting ? "" : "NOT ", + tx->tx_msg->ibm_type, txtype); + } + return NULL; +} + +static void +kiblnd_handle_completion(kib_conn_t *conn, int txtype, int status, __u64 cookie) +{ + kib_tx_t *tx; + lnet_ni_t *ni = conn->ibc_peer->ibp_ni; + int idle; + + spin_lock(&conn->ibc_lock); + + tx = kiblnd_find_waiting_tx_locked(conn, txtype, cookie); + if (tx == NULL) { + spin_unlock(&conn->ibc_lock); + + CWARN("Unmatched completion type %x cookie %#llx from %s\n", + txtype, cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + kiblnd_close_conn(conn, -EPROTO); + return; + } + + if (tx->tx_status == 0) { /* success so far */ + if (status < 0) { /* failed? */ + tx->tx_status = status; + } else if (txtype == IBLND_MSG_GET_REQ) { + lnet_set_reply_msg_len(ni, tx->tx_lntmsg[1], status); + } + } + + tx->tx_waiting = 0; + + idle = !tx->tx_queued && (tx->tx_sending == 0); + if (idle) + list_del(&tx->tx_list); + + spin_unlock(&conn->ibc_lock); + + if (idle) + kiblnd_tx_done(ni, tx); +} + +static void +kiblnd_send_completion(kib_conn_t *conn, int type, int status, __u64 cookie) +{ + lnet_ni_t *ni = conn->ibc_peer->ibp_ni; + kib_tx_t *tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid); + + if (tx == NULL) { + CERROR("Can't get tx for completion %x for %s\n", + type, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + return; + } + + tx->tx_msg->ibm_u.completion.ibcm_status = status; + tx->tx_msg->ibm_u.completion.ibcm_cookie = cookie; + kiblnd_init_tx_msg(ni, tx, type, sizeof(kib_completion_msg_t)); + + kiblnd_queue_tx(tx, conn); +} + +static void +kiblnd_handle_rx(kib_rx_t *rx) +{ + kib_msg_t *msg = rx->rx_msg; + kib_conn_t *conn = rx->rx_conn; + lnet_ni_t *ni = conn->ibc_peer->ibp_ni; + int credits = msg->ibm_credits; + kib_tx_t *tx; + int rc = 0; + int rc2; + int post_credit; + + LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED); + + CDEBUG(D_NET, "Received %x[%d] from %s\n", + msg->ibm_type, credits, + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + + if (credits != 0) { + /* Have I received credits that will let me send? 
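+ * Credits returned by the peer mean it has re-posted that many receive
+ * buffers, so the running total must never exceed the negotiated
+ * IBLND_MSG_QUEUE_SIZE() for this connection version; a larger value is
+ * a protocol error and the check below closes the connection with -EPROTO.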
*/ + spin_lock(&conn->ibc_lock); + + if (conn->ibc_credits + credits > + IBLND_MSG_QUEUE_SIZE(conn->ibc_version)) { + rc2 = conn->ibc_credits; + spin_unlock(&conn->ibc_lock); + + CERROR("Bad credits from %s: %d + %d > %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + rc2, credits, + IBLND_MSG_QUEUE_SIZE(conn->ibc_version)); + + kiblnd_close_conn(conn, -EPROTO); + kiblnd_post_rx(rx, IBLND_POSTRX_NO_CREDIT); + return; + } + + conn->ibc_credits += credits; + + /* This ensures the credit taken by NOOP can be returned */ + if (msg->ibm_type == IBLND_MSG_NOOP && + !IBLND_OOB_CAPABLE(conn->ibc_version)) /* v1 only */ + conn->ibc_outstanding_credits++; + + spin_unlock(&conn->ibc_lock); + kiblnd_check_sends(conn); + } + + switch (msg->ibm_type) { + default: + CERROR("Bad IBLND message type %x from %s\n", + msg->ibm_type, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + post_credit = IBLND_POSTRX_NO_CREDIT; + rc = -EPROTO; + break; + + case IBLND_MSG_NOOP: + if (IBLND_OOB_CAPABLE(conn->ibc_version)) { + post_credit = IBLND_POSTRX_NO_CREDIT; + break; + } + + if (credits != 0) /* credit already posted */ + post_credit = IBLND_POSTRX_NO_CREDIT; + else /* a keepalive NOOP */ + post_credit = IBLND_POSTRX_PEER_CREDIT; + break; + + case IBLND_MSG_IMMEDIATE: + post_credit = IBLND_POSTRX_DONT_POST; + rc = lnet_parse(ni, &msg->ibm_u.immediate.ibim_hdr, + msg->ibm_srcnid, rx, 0); + if (rc < 0) /* repost on error */ + post_credit = IBLND_POSTRX_PEER_CREDIT; + break; + + case IBLND_MSG_PUT_REQ: + post_credit = IBLND_POSTRX_DONT_POST; + rc = lnet_parse(ni, &msg->ibm_u.putreq.ibprm_hdr, + msg->ibm_srcnid, rx, 1); + if (rc < 0) /* repost on error */ + post_credit = IBLND_POSTRX_PEER_CREDIT; + break; + + case IBLND_MSG_PUT_NAK: + CWARN("PUT_NACK from %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + post_credit = IBLND_POSTRX_RSRVD_CREDIT; + kiblnd_handle_completion(conn, IBLND_MSG_PUT_REQ, + msg->ibm_u.completion.ibcm_status, + msg->ibm_u.completion.ibcm_cookie); + break; + + case IBLND_MSG_PUT_ACK: + post_credit = IBLND_POSTRX_RSRVD_CREDIT; + + spin_lock(&conn->ibc_lock); + tx = kiblnd_find_waiting_tx_locked(conn, IBLND_MSG_PUT_REQ, + msg->ibm_u.putack.ibpam_src_cookie); + if (tx != NULL) + list_del(&tx->tx_list); + spin_unlock(&conn->ibc_lock); + + if (tx == NULL) { + CERROR("Unmatched PUT_ACK from %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + rc = -EPROTO; + break; + } + + LASSERT(tx->tx_waiting); + /* CAVEAT EMPTOR: I could be racing with tx_complete, but... + * (a) I can overwrite tx_msg since my peer has received it! + * (b) tx_waiting set tells tx_complete() it's not done. 
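+ * So the descriptor is simply reused in place: its work requests are
+ * reset and rebuilt below as the PUT_DONE RDMA.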
*/ + + tx->tx_nwrq = 0; /* overwrite PUT_REQ */ + + rc2 = kiblnd_init_rdma(conn, tx, IBLND_MSG_PUT_DONE, + kiblnd_rd_size(&msg->ibm_u.putack.ibpam_rd), + &msg->ibm_u.putack.ibpam_rd, + msg->ibm_u.putack.ibpam_dst_cookie); + if (rc2 < 0) + CERROR("Can't setup rdma for PUT to %s: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), rc2); + + spin_lock(&conn->ibc_lock); + tx->tx_waiting = 0; /* clear waiting and queue atomically */ + kiblnd_queue_tx_locked(tx, conn); + spin_unlock(&conn->ibc_lock); + break; + + case IBLND_MSG_PUT_DONE: + post_credit = IBLND_POSTRX_PEER_CREDIT; + kiblnd_handle_completion(conn, IBLND_MSG_PUT_ACK, + msg->ibm_u.completion.ibcm_status, + msg->ibm_u.completion.ibcm_cookie); + break; + + case IBLND_MSG_GET_REQ: + post_credit = IBLND_POSTRX_DONT_POST; + rc = lnet_parse(ni, &msg->ibm_u.get.ibgm_hdr, + msg->ibm_srcnid, rx, 1); + if (rc < 0) /* repost on error */ + post_credit = IBLND_POSTRX_PEER_CREDIT; + break; + + case IBLND_MSG_GET_DONE: + post_credit = IBLND_POSTRX_RSRVD_CREDIT; + kiblnd_handle_completion(conn, IBLND_MSG_GET_REQ, + msg->ibm_u.completion.ibcm_status, + msg->ibm_u.completion.ibcm_cookie); + break; + } + + if (rc < 0) /* protocol error */ + kiblnd_close_conn(conn, rc); + + if (post_credit != IBLND_POSTRX_DONT_POST) + kiblnd_post_rx(rx, post_credit); +} + +static void +kiblnd_rx_complete(kib_rx_t *rx, int status, int nob) +{ + kib_msg_t *msg = rx->rx_msg; + kib_conn_t *conn = rx->rx_conn; + lnet_ni_t *ni = conn->ibc_peer->ibp_ni; + kib_net_t *net = ni->ni_data; + int rc; + int err = -EIO; + + LASSERT(net != NULL); + LASSERT(rx->rx_nob < 0); /* was posted */ + rx->rx_nob = 0; /* isn't now */ + + if (conn->ibc_state > IBLND_CONN_ESTABLISHED) + goto ignore; + + if (status != IB_WC_SUCCESS) { + CNETERR("Rx from %s failed: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), status); + goto failed; + } + + LASSERT(nob >= 0); + rx->rx_nob = nob; + + rc = kiblnd_unpack_msg(msg, rx->rx_nob); + if (rc != 0) { + CERROR("Error %d unpacking rx from %s\n", + rc, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + goto failed; + } + + if (msg->ibm_srcnid != conn->ibc_peer->ibp_nid || + msg->ibm_dstnid != ni->ni_nid || + msg->ibm_srcstamp != conn->ibc_incarnation || + msg->ibm_dststamp != net->ibn_incarnation) { + CERROR("Stale rx from %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + err = -ESTALE; + goto failed; + } + + /* set time last known alive */ + kiblnd_peer_alive(conn->ibc_peer); + + /* racing with connection establishment/teardown! */ + + if (conn->ibc_state < IBLND_CONN_ESTABLISHED) { + rwlock_t *g_lock = &kiblnd_data.kib_global_lock; + unsigned long flags; + + write_lock_irqsave(g_lock, flags); + /* must check holding global lock to eliminate race */ + if (conn->ibc_state < IBLND_CONN_ESTABLISHED) { + list_add_tail(&rx->rx_list, &conn->ibc_early_rxs); + write_unlock_irqrestore(g_lock, flags); + return; + } + write_unlock_irqrestore(g_lock, flags); + } + kiblnd_handle_rx(rx); + return; + + failed: + CDEBUG(D_NET, "rx %p conn %p\n", rx, conn); + kiblnd_close_conn(conn, err); + ignore: + kiblnd_drop_rx(rx); /* Don't re-post rx. 
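+ * Either the receive failed (the conn was closed just above) or the
+ * conn is already shutting down; dropping the rx releases its ref on
+ * the conn instead of reposting it.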
*/ +} + +static struct page * +kiblnd_kvaddr_to_page(unsigned long vaddr) +{ + struct page *page; + + if (is_vmalloc_addr((void *)vaddr)) { + page = vmalloc_to_page((void *)vaddr); + LASSERT(page != NULL); + return page; + } +#ifdef CONFIG_HIGHMEM + if (vaddr >= PKMAP_BASE && + vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) { + /* No highmem pages only used for bulk (kiov) I/O */ + CERROR("find page for address in highmem\n"); + LBUG(); + } +#endif + page = virt_to_page(vaddr); + LASSERT(page != NULL); + return page; +} + +static int +kiblnd_fmr_map_tx(kib_net_t *net, kib_tx_t *tx, kib_rdma_desc_t *rd, int nob) +{ + kib_hca_dev_t *hdev; + __u64 *pages = tx->tx_pages; + kib_fmr_poolset_t *fps; + int npages; + int size; + int cpt; + int rc; + int i; + + LASSERT(tx->tx_pool != NULL); + LASSERT(tx->tx_pool->tpo_pool.po_owner != NULL); + + hdev = tx->tx_pool->tpo_hdev; + + for (i = 0, npages = 0; i < rd->rd_nfrags; i++) { + for (size = 0; size < rd->rd_frags[i].rf_nob; + size += hdev->ibh_page_size) { + pages[npages++] = (rd->rd_frags[i].rf_addr & + hdev->ibh_page_mask) + size; + } + } + + cpt = tx->tx_pool->tpo_pool.po_owner->ps_cpt; + + fps = net->ibn_fmr_ps[cpt]; + rc = kiblnd_fmr_pool_map(fps, pages, npages, 0, &tx->tx_u.fmr); + if (rc != 0) { + CERROR("Can't map %d pages: %d\n", npages, rc); + return rc; + } + + /* If rd is not tx_rd, it's going to get sent to a peer, who will need + * the rkey */ + rd->rd_key = (rd != tx->tx_rd) ? tx->tx_u.fmr.fmr_pfmr->fmr->rkey : + tx->tx_u.fmr.fmr_pfmr->fmr->lkey; + rd->rd_frags[0].rf_addr &= ~hdev->ibh_page_mask; + rd->rd_frags[0].rf_nob = nob; + rd->rd_nfrags = 1; + + return 0; +} + +static int +kiblnd_pmr_map_tx(kib_net_t *net, kib_tx_t *tx, kib_rdma_desc_t *rd, int nob) +{ + kib_hca_dev_t *hdev; + kib_pmr_poolset_t *pps; + __u64 iova; + int cpt; + int rc; + + LASSERT(tx->tx_pool != NULL); + LASSERT(tx->tx_pool->tpo_pool.po_owner != NULL); + + hdev = tx->tx_pool->tpo_hdev; + + iova = rd->rd_frags[0].rf_addr & ~hdev->ibh_page_mask; + + cpt = tx->tx_pool->tpo_pool.po_owner->ps_cpt; + + pps = net->ibn_pmr_ps[cpt]; + rc = kiblnd_pmr_pool_map(pps, hdev, rd, &iova, &tx->tx_u.pmr); + if (rc != 0) { + CERROR("Failed to create MR by phybuf: %d\n", rc); + return rc; + } + + /* If rd is not tx_rd, it's going to get sent to a peer, who will need + * the rkey */ + rd->rd_key = (rd != tx->tx_rd) ? tx->tx_u.pmr->pmr_mr->rkey : + tx->tx_u.pmr->pmr_mr->lkey; + rd->rd_nfrags = 1; + rd->rd_frags[0].rf_addr = iova; + rd->rd_frags[0].rf_nob = nob; + + return 0; +} + +void +kiblnd_unmap_tx(lnet_ni_t *ni, kib_tx_t *tx) +{ + kib_net_t *net = ni->ni_data; + + LASSERT(net != NULL); + + if (net->ibn_fmr_ps != NULL && tx->tx_u.fmr.fmr_pfmr != NULL) { + kiblnd_fmr_pool_unmap(&tx->tx_u.fmr, tx->tx_status); + tx->tx_u.fmr.fmr_pfmr = NULL; + + } else if (net->ibn_pmr_ps != NULL && tx->tx_u.pmr != NULL) { + kiblnd_pmr_pool_unmap(tx->tx_u.pmr); + tx->tx_u.pmr = NULL; + } + + if (tx->tx_nfrags != 0) { + kiblnd_dma_unmap_sg(tx->tx_pool->tpo_hdev->ibh_ibdev, + tx->tx_frags, tx->tx_nfrags, tx->tx_dmadir); + tx->tx_nfrags = 0; + } +} + +int +kiblnd_map_tx(lnet_ni_t *ni, kib_tx_t *tx, + kib_rdma_desc_t *rd, int nfrags) +{ + kib_hca_dev_t *hdev = tx->tx_pool->tpo_hdev; + kib_net_t *net = ni->ni_data; + struct ib_mr *mr = NULL; + __u32 nob; + int i; + + /* If rd is not tx_rd, it's going to get sent to a peer and I'm the + * RDMA sink */ + tx->tx_dmadir = (rd != tx->tx_rd) ? 
DMA_FROM_DEVICE : DMA_TO_DEVICE; + tx->tx_nfrags = nfrags; + + rd->rd_nfrags = + kiblnd_dma_map_sg(hdev->ibh_ibdev, + tx->tx_frags, tx->tx_nfrags, tx->tx_dmadir); + + for (i = 0, nob = 0; i < rd->rd_nfrags; i++) { + rd->rd_frags[i].rf_nob = kiblnd_sg_dma_len( + hdev->ibh_ibdev, &tx->tx_frags[i]); + rd->rd_frags[i].rf_addr = kiblnd_sg_dma_address( + hdev->ibh_ibdev, &tx->tx_frags[i]); + nob += rd->rd_frags[i].rf_nob; + } + + /* looking for pre-mapping MR */ + mr = kiblnd_find_rd_dma_mr(hdev, rd); + if (mr != NULL) { + /* found pre-mapping MR */ + rd->rd_key = (rd != tx->tx_rd) ? mr->rkey : mr->lkey; + return 0; + } + + if (net->ibn_fmr_ps != NULL) + return kiblnd_fmr_map_tx(net, tx, rd, nob); + else if (net->ibn_pmr_ps != NULL) + return kiblnd_pmr_map_tx(net, tx, rd, nob); + + return -EINVAL; +} + + +static int +kiblnd_setup_rd_iov(lnet_ni_t *ni, kib_tx_t *tx, kib_rdma_desc_t *rd, + unsigned int niov, struct kvec *iov, int offset, int nob) +{ + kib_net_t *net = ni->ni_data; + struct page *page; + struct scatterlist *sg; + unsigned long vaddr; + int fragnob; + int page_offset; + + LASSERT(nob > 0); + LASSERT(niov > 0); + LASSERT(net != NULL); + + while (offset >= iov->iov_len) { + offset -= iov->iov_len; + niov--; + iov++; + LASSERT(niov > 0); + } + + sg = tx->tx_frags; + do { + LASSERT(niov > 0); + + vaddr = ((unsigned long)iov->iov_base) + offset; + page_offset = vaddr & (PAGE_SIZE - 1); + page = kiblnd_kvaddr_to_page(vaddr); + if (page == NULL) { + CERROR("Can't find page\n"); + return -EFAULT; + } + + fragnob = min((int)(iov->iov_len - offset), nob); + fragnob = min(fragnob, (int)PAGE_SIZE - page_offset); + + sg_set_page(sg, page, fragnob, page_offset); + sg++; + + if (offset + fragnob < iov->iov_len) { + offset += fragnob; + } else { + offset = 0; + iov++; + niov--; + } + nob -= fragnob; + } while (nob > 0); + + return kiblnd_map_tx(ni, tx, rd, sg - tx->tx_frags); +} + +static int +kiblnd_setup_rd_kiov(lnet_ni_t *ni, kib_tx_t *tx, kib_rdma_desc_t *rd, + int nkiov, lnet_kiov_t *kiov, int offset, int nob) +{ + kib_net_t *net = ni->ni_data; + struct scatterlist *sg; + int fragnob; + + CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob); + + LASSERT(nob > 0); + LASSERT(nkiov > 0); + LASSERT(net != NULL); + + while (offset >= kiov->kiov_len) { + offset -= kiov->kiov_len; + nkiov--; + kiov++; + LASSERT(nkiov > 0); + } + + sg = tx->tx_frags; + do { + LASSERT(nkiov > 0); + + fragnob = min((int)(kiov->kiov_len - offset), nob); + + sg_set_page(sg, kiov->kiov_page, fragnob, + kiov->kiov_offset + offset); + sg++; + + offset = 0; + kiov++; + nkiov--; + nob -= fragnob; + } while (nob > 0); + + return kiblnd_map_tx(ni, tx, rd, sg - tx->tx_frags); +} + +static int +kiblnd_post_tx_locked(kib_conn_t *conn, kib_tx_t *tx, int credit) + __releases(conn->ibc_lock) + __acquires(conn->ibc_lock) +{ + kib_msg_t *msg = tx->tx_msg; + kib_peer_t *peer = conn->ibc_peer; + int ver = conn->ibc_version; + int rc; + int done; + struct ib_send_wr *bad_wrq; + + LASSERT(tx->tx_queued); + /* We rely on this for QP sizing */ + LASSERT(tx->tx_nwrq > 0); + LASSERT(tx->tx_nwrq <= 1 + IBLND_RDMA_FRAGS(ver)); + + LASSERT(credit == 0 || credit == 1); + LASSERT(conn->ibc_outstanding_credits >= 0); + LASSERT(conn->ibc_outstanding_credits <= IBLND_MSG_QUEUE_SIZE(ver)); + LASSERT(conn->ibc_credits >= 0); + LASSERT(conn->ibc_credits <= IBLND_MSG_QUEUE_SIZE(ver)); + + if (conn->ibc_nsends_posted == IBLND_CONCURRENT_SENDS(ver)) { + /* tx completions outstanding... 
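+ * ibc_nsends_posted has reached IBLND_CONCURRENT_SENDS(ver); returning
+ * -EAGAIN makes kiblnd_check_sends() stop posting until a completion
+ * frees a slot.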
*/ + CDEBUG(D_NET, "%s: posted enough\n", + libcfs_nid2str(peer->ibp_nid)); + return -EAGAIN; + } + + if (credit != 0 && conn->ibc_credits == 0) { /* no credits */ + CDEBUG(D_NET, "%s: no credits\n", + libcfs_nid2str(peer->ibp_nid)); + return -EAGAIN; + } + + if (credit != 0 && !IBLND_OOB_CAPABLE(ver) && + conn->ibc_credits == 1 && /* last credit reserved */ + msg->ibm_type != IBLND_MSG_NOOP) { /* for NOOP */ + CDEBUG(D_NET, "%s: not using last credit\n", + libcfs_nid2str(peer->ibp_nid)); + return -EAGAIN; + } + + /* NB don't drop ibc_lock before bumping tx_sending */ + list_del(&tx->tx_list); + tx->tx_queued = 0; + + if (msg->ibm_type == IBLND_MSG_NOOP && + (!kiblnd_need_noop(conn) || /* redundant NOOP */ + (IBLND_OOB_CAPABLE(ver) && /* posted enough NOOP */ + conn->ibc_noops_posted == IBLND_OOB_MSGS(ver)))) { + /* OK to drop when posted enough NOOPs, since + * kiblnd_check_sends will queue NOOP again when + * posted NOOPs complete */ + spin_unlock(&conn->ibc_lock); + kiblnd_tx_done(peer->ibp_ni, tx); + spin_lock(&conn->ibc_lock); + CDEBUG(D_NET, "%s(%d): redundant or enough NOOP\n", + libcfs_nid2str(peer->ibp_nid), + conn->ibc_noops_posted); + return 0; + } + + kiblnd_pack_msg(peer->ibp_ni, msg, ver, conn->ibc_outstanding_credits, + peer->ibp_nid, conn->ibc_incarnation); + + conn->ibc_credits -= credit; + conn->ibc_outstanding_credits = 0; + conn->ibc_nsends_posted++; + if (msg->ibm_type == IBLND_MSG_NOOP) + conn->ibc_noops_posted++; + + /* CAVEAT EMPTOR! This tx could be the PUT_DONE of an RDMA + * PUT. If so, it was first queued here as a PUT_REQ, sent and + * stashed on ibc_active_txs, matched by an incoming PUT_ACK, + * and then re-queued here. It's (just) possible that + * tx_sending is non-zero if we've not done the tx_complete() + * from the first send; hence the ++ rather than = below. */ + tx->tx_sending++; + list_add(&tx->tx_list, &conn->ibc_active_txs); + + /* I'm still holding ibc_lock! 
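+ * The work request is posted (or refused) without dropping the lock;
+ * on failure the credit and counter updates made above are undone
+ * below before the connection is closed.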
*/ + if (conn->ibc_state != IBLND_CONN_ESTABLISHED) { + rc = -ECONNABORTED; + } else if (tx->tx_pool->tpo_pool.po_failed || + conn->ibc_hdev != tx->tx_pool->tpo_hdev) { + /* close_conn will launch failover */ + rc = -ENETDOWN; + } else { + rc = ib_post_send(conn->ibc_cmid->qp, + tx->tx_wrq, &bad_wrq); + } + + conn->ibc_last_send = jiffies; + + if (rc == 0) + return 0; + + /* NB credits are transferred in the actual + * message, which can only be the last work item */ + conn->ibc_credits += credit; + conn->ibc_outstanding_credits += msg->ibm_credits; + conn->ibc_nsends_posted--; + if (msg->ibm_type == IBLND_MSG_NOOP) + conn->ibc_noops_posted--; + + tx->tx_status = rc; + tx->tx_waiting = 0; + tx->tx_sending--; + + done = (tx->tx_sending == 0); + if (done) + list_del(&tx->tx_list); + + spin_unlock(&conn->ibc_lock); + + if (conn->ibc_state == IBLND_CONN_ESTABLISHED) + CERROR("Error %d posting transmit to %s\n", + rc, libcfs_nid2str(peer->ibp_nid)); + else + CDEBUG(D_NET, "Error %d posting transmit to %s\n", + rc, libcfs_nid2str(peer->ibp_nid)); + + kiblnd_close_conn(conn, rc); + + if (done) + kiblnd_tx_done(peer->ibp_ni, tx); + + spin_lock(&conn->ibc_lock); + + return -EIO; +} + +void +kiblnd_check_sends(kib_conn_t *conn) +{ + int ver = conn->ibc_version; + lnet_ni_t *ni = conn->ibc_peer->ibp_ni; + kib_tx_t *tx; + + /* Don't send anything until after the connection is established */ + if (conn->ibc_state < IBLND_CONN_ESTABLISHED) { + CDEBUG(D_NET, "%s too soon\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + return; + } + + spin_lock(&conn->ibc_lock); + + LASSERT(conn->ibc_nsends_posted <= IBLND_CONCURRENT_SENDS(ver)); + LASSERT(!IBLND_OOB_CAPABLE(ver) || + conn->ibc_noops_posted <= IBLND_OOB_MSGS(ver)); + LASSERT(conn->ibc_reserved_credits >= 0); + + while (conn->ibc_reserved_credits > 0 && + !list_empty(&conn->ibc_tx_queue_rsrvd)) { + tx = list_entry(conn->ibc_tx_queue_rsrvd.next, + kib_tx_t, tx_list); + list_del(&tx->tx_list); + list_add_tail(&tx->tx_list, &conn->ibc_tx_queue); + conn->ibc_reserved_credits--; + } + + if (kiblnd_need_noop(conn)) { + spin_unlock(&conn->ibc_lock); + + tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid); + if (tx != NULL) + kiblnd_init_tx_msg(ni, tx, IBLND_MSG_NOOP, 0); + + spin_lock(&conn->ibc_lock); + if (tx != NULL) + kiblnd_queue_tx_locked(tx, conn); + } + + kiblnd_conn_addref(conn); /* 1 ref for me.... 
(see b21911) */ + + for (;;) { + int credit; + + if (!list_empty(&conn->ibc_tx_queue_nocred)) { + credit = 0; + tx = list_entry(conn->ibc_tx_queue_nocred.next, + kib_tx_t, tx_list); + } else if (!list_empty(&conn->ibc_tx_noops)) { + LASSERT(!IBLND_OOB_CAPABLE(ver)); + credit = 1; + tx = list_entry(conn->ibc_tx_noops.next, + kib_tx_t, tx_list); + } else if (!list_empty(&conn->ibc_tx_queue)) { + credit = 1; + tx = list_entry(conn->ibc_tx_queue.next, + kib_tx_t, tx_list); + } else + break; + + if (kiblnd_post_tx_locked(conn, tx, credit) != 0) + break; + } + + spin_unlock(&conn->ibc_lock); + + kiblnd_conn_decref(conn); /* ...until here */ +} + +static void +kiblnd_tx_complete(kib_tx_t *tx, int status) +{ + int failed = (status != IB_WC_SUCCESS); + kib_conn_t *conn = tx->tx_conn; + int idle; + + LASSERT(tx->tx_sending > 0); + + if (failed) { + if (conn->ibc_state == IBLND_CONN_ESTABLISHED) + CNETERR("Tx -> %s cookie %#llx sending %d waiting %d: failed %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + tx->tx_cookie, tx->tx_sending, tx->tx_waiting, + status); + + kiblnd_close_conn(conn, -EIO); + } else { + kiblnd_peer_alive(conn->ibc_peer); + } + + spin_lock(&conn->ibc_lock); + + /* I could be racing with rdma completion. Whoever makes 'tx' idle + * gets to free it, which also drops its ref on 'conn'. */ + + tx->tx_sending--; + conn->ibc_nsends_posted--; + if (tx->tx_msg->ibm_type == IBLND_MSG_NOOP) + conn->ibc_noops_posted--; + + if (failed) { + tx->tx_waiting = 0; /* don't wait for peer */ + tx->tx_status = -EIO; + } + + idle = (tx->tx_sending == 0) && /* This is the final callback */ + !tx->tx_waiting && /* Not waiting for peer */ + !tx->tx_queued; /* Not re-queued (PUT_DONE) */ + if (idle) + list_del(&tx->tx_list); + + kiblnd_conn_addref(conn); /* 1 ref for me.... 
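+ * The extra ref keeps the conn alive across the unlock while the tx is
+ * finalised and kiblnd_check_sends() runs; it is dropped just below.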
*/ + + spin_unlock(&conn->ibc_lock); + + if (idle) + kiblnd_tx_done(conn->ibc_peer->ibp_ni, tx); + + kiblnd_check_sends(conn); + + kiblnd_conn_decref(conn); /* ...until here */ +} + +void +kiblnd_init_tx_msg(lnet_ni_t *ni, kib_tx_t *tx, int type, int body_nob) +{ + kib_hca_dev_t *hdev = tx->tx_pool->tpo_hdev; + struct ib_sge *sge = &tx->tx_sge[tx->tx_nwrq]; + struct ib_send_wr *wrq = &tx->tx_wrq[tx->tx_nwrq]; + int nob = offsetof(kib_msg_t, ibm_u) + body_nob; + struct ib_mr *mr; + + LASSERT(tx->tx_nwrq >= 0); + LASSERT(tx->tx_nwrq < IBLND_MAX_RDMA_FRAGS + 1); + LASSERT(nob <= IBLND_MSG_SIZE); + + kiblnd_init_msg(tx->tx_msg, type, body_nob); + + mr = kiblnd_find_dma_mr(hdev, tx->tx_msgaddr, nob); + LASSERT(mr != NULL); + + sge->lkey = mr->lkey; + sge->addr = tx->tx_msgaddr; + sge->length = nob; + + memset(wrq, 0, sizeof(*wrq)); + + wrq->next = NULL; + wrq->wr_id = kiblnd_ptr2wreqid(tx, IBLND_WID_TX); + wrq->sg_list = sge; + wrq->num_sge = 1; + wrq->opcode = IB_WR_SEND; + wrq->send_flags = IB_SEND_SIGNALED; + + tx->tx_nwrq++; +} + +int +kiblnd_init_rdma(kib_conn_t *conn, kib_tx_t *tx, int type, + int resid, kib_rdma_desc_t *dstrd, __u64 dstcookie) +{ + kib_msg_t *ibmsg = tx->tx_msg; + kib_rdma_desc_t *srcrd = tx->tx_rd; + struct ib_sge *sge = &tx->tx_sge[0]; + struct ib_send_wr *wrq = &tx->tx_wrq[0]; + int rc = resid; + int srcidx; + int dstidx; + int wrknob; + + LASSERT(!in_interrupt()); + LASSERT(tx->tx_nwrq == 0); + LASSERT(type == IBLND_MSG_GET_DONE || + type == IBLND_MSG_PUT_DONE); + + srcidx = dstidx = 0; + + while (resid > 0) { + if (srcidx >= srcrd->rd_nfrags) { + CERROR("Src buffer exhausted: %d frags\n", srcidx); + rc = -EPROTO; + break; + } + + if (dstidx == dstrd->rd_nfrags) { + CERROR("Dst buffer exhausted: %d frags\n", dstidx); + rc = -EPROTO; + break; + } + + if (tx->tx_nwrq == IBLND_RDMA_FRAGS(conn->ibc_version)) { + CERROR("RDMA too fragmented for %s (%d): %d/%d src %d/%d dst frags\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + IBLND_RDMA_FRAGS(conn->ibc_version), + srcidx, srcrd->rd_nfrags, + dstidx, dstrd->rd_nfrags); + rc = -EMSGSIZE; + break; + } + + wrknob = min(min(kiblnd_rd_frag_size(srcrd, srcidx), + kiblnd_rd_frag_size(dstrd, dstidx)), + (__u32) resid); + + sge = &tx->tx_sge[tx->tx_nwrq]; + sge->addr = kiblnd_rd_frag_addr(srcrd, srcidx); + sge->lkey = kiblnd_rd_frag_key(srcrd, srcidx); + sge->length = wrknob; + + wrq = &tx->tx_wrq[tx->tx_nwrq]; + + wrq->next = wrq + 1; + wrq->wr_id = kiblnd_ptr2wreqid(tx, IBLND_WID_RDMA); + wrq->sg_list = sge; + wrq->num_sge = 1; + wrq->opcode = IB_WR_RDMA_WRITE; + wrq->send_flags = 0; + + wrq->wr.rdma.remote_addr = kiblnd_rd_frag_addr(dstrd, dstidx); + wrq->wr.rdma.rkey = kiblnd_rd_frag_key(dstrd, dstidx); + + srcidx = kiblnd_rd_consume_frag(srcrd, srcidx, wrknob); + dstidx = kiblnd_rd_consume_frag(dstrd, dstidx, wrknob); + + resid -= wrknob; + + tx->tx_nwrq++; + wrq++; + sge++; + } + + if (rc < 0) /* no RDMA if completing with failure */ + tx->tx_nwrq = 0; + + ibmsg->ibm_u.completion.ibcm_status = rc; + ibmsg->ibm_u.completion.ibcm_cookie = dstcookie; + kiblnd_init_tx_msg(conn->ibc_peer->ibp_ni, tx, + type, sizeof(kib_completion_msg_t)); + + return rc; +} + +void +kiblnd_queue_tx_locked(kib_tx_t *tx, kib_conn_t *conn) +{ + struct list_head *q; + + LASSERT(tx->tx_nwrq > 0); /* work items set up */ + LASSERT(!tx->tx_queued); /* not queued for sending already */ + LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED); + + tx->tx_queued = 1; + tx->tx_deadline = jiffies + (*kiblnd_tunables.kib_timeout * HZ); + + if (tx->tx_conn == NULL) 
{ + kiblnd_conn_addref(conn); + tx->tx_conn = conn; + LASSERT(tx->tx_msg->ibm_type != IBLND_MSG_PUT_DONE); + } else { + /* PUT_DONE first attached to conn as a PUT_REQ */ + LASSERT(tx->tx_conn == conn); + LASSERT(tx->tx_msg->ibm_type == IBLND_MSG_PUT_DONE); + } + + switch (tx->tx_msg->ibm_type) { + default: + LBUG(); + + case IBLND_MSG_PUT_REQ: + case IBLND_MSG_GET_REQ: + q = &conn->ibc_tx_queue_rsrvd; + break; + + case IBLND_MSG_PUT_NAK: + case IBLND_MSG_PUT_ACK: + case IBLND_MSG_PUT_DONE: + case IBLND_MSG_GET_DONE: + q = &conn->ibc_tx_queue_nocred; + break; + + case IBLND_MSG_NOOP: + if (IBLND_OOB_CAPABLE(conn->ibc_version)) + q = &conn->ibc_tx_queue_nocred; + else + q = &conn->ibc_tx_noops; + break; + + case IBLND_MSG_IMMEDIATE: + q = &conn->ibc_tx_queue; + break; + } + + list_add_tail(&tx->tx_list, q); +} + +void +kiblnd_queue_tx(kib_tx_t *tx, kib_conn_t *conn) +{ + spin_lock(&conn->ibc_lock); + kiblnd_queue_tx_locked(tx, conn); + spin_unlock(&conn->ibc_lock); + + kiblnd_check_sends(conn); +} + +static int kiblnd_resolve_addr(struct rdma_cm_id *cmid, + struct sockaddr_in *srcaddr, + struct sockaddr_in *dstaddr, + int timeout_ms) +{ + unsigned short port; + int rc; + + /* allow the port to be reused */ + rc = rdma_set_reuseaddr(cmid, 1); + if (rc != 0) { + CERROR("Unable to set reuse on cmid: %d\n", rc); + return rc; + } + + /* look for a free privileged port */ + for (port = PROT_SOCK-1; port > 0; port--) { + srcaddr->sin_port = htons(port); + rc = rdma_resolve_addr(cmid, + (struct sockaddr *)srcaddr, + (struct sockaddr *)dstaddr, + timeout_ms); + if (rc == 0) { + CDEBUG(D_NET, "bound to port %hu\n", port); + return 0; + } else if (rc == -EADDRINUSE || rc == -EADDRNOTAVAIL) { + CDEBUG(D_NET, "bind to port %hu failed: %d\n", + port, rc); + } else { + return rc; + } + } + + CERROR("Failed to bind to a free privileged port\n"); + return rc; +} + +static void +kiblnd_connect_peer(kib_peer_t *peer) +{ + struct rdma_cm_id *cmid; + kib_dev_t *dev; + kib_net_t *net = peer->ibp_ni->ni_data; + struct sockaddr_in srcaddr; + struct sockaddr_in dstaddr; + int rc; + + LASSERT(net != NULL); + LASSERT(peer->ibp_connecting > 0); + + cmid = kiblnd_rdma_create_id(kiblnd_cm_callback, peer, RDMA_PS_TCP, + IB_QPT_RC); + + if (IS_ERR(cmid)) { + CERROR("Can't create CMID for %s: %ld\n", + libcfs_nid2str(peer->ibp_nid), PTR_ERR(cmid)); + rc = PTR_ERR(cmid); + goto failed; + } + + dev = net->ibn_dev; + memset(&srcaddr, 0, sizeof(srcaddr)); + srcaddr.sin_family = AF_INET; + srcaddr.sin_addr.s_addr = htonl(dev->ibd_ifip); + + memset(&dstaddr, 0, sizeof(dstaddr)); + dstaddr.sin_family = AF_INET; + dstaddr.sin_port = htons(*kiblnd_tunables.kib_service); + dstaddr.sin_addr.s_addr = htonl(LNET_NIDADDR(peer->ibp_nid)); + + kiblnd_peer_addref(peer); /* cmid's ref */ + + if (*kiblnd_tunables.kib_use_priv_port) { + rc = kiblnd_resolve_addr(cmid, &srcaddr, &dstaddr, + *kiblnd_tunables.kib_timeout * 1000); + } else { + rc = rdma_resolve_addr(cmid, + (struct sockaddr *)&srcaddr, + (struct sockaddr *)&dstaddr, + *kiblnd_tunables.kib_timeout * 1000); + } + if (rc != 0) { + /* Can't initiate address resolution: */ + CERROR("Can't resolve addr for %s: %d\n", + libcfs_nid2str(peer->ibp_nid), rc); + goto failed2; + } + + LASSERT(cmid->device != NULL); + CDEBUG(D_NET, "%s: connection bound to %s:%pI4h:%s\n", + libcfs_nid2str(peer->ibp_nid), dev->ibd_ifname, + &dev->ibd_ifip, cmid->device->name); + + return; + + failed2: + kiblnd_peer_decref(peer); /* cmid's ref */ + rdma_destroy_id(cmid); + failed: + 
kiblnd_peer_connect_failed(peer, 1, rc); +} + +void +kiblnd_launch_tx(lnet_ni_t *ni, kib_tx_t *tx, lnet_nid_t nid) +{ + kib_peer_t *peer; + kib_peer_t *peer2; + kib_conn_t *conn; + rwlock_t *g_lock = &kiblnd_data.kib_global_lock; + unsigned long flags; + int rc; + + /* If I get here, I've committed to send, so I complete the tx with + * failure on any problems */ + + LASSERT(tx == NULL || tx->tx_conn == NULL); /* only set when assigned a conn */ + LASSERT(tx == NULL || tx->tx_nwrq > 0); /* work items have been set up */ + + /* First time, just use a read lock since I expect to find my peer + * connected */ + read_lock_irqsave(g_lock, flags); + + peer = kiblnd_find_peer_locked(nid); + if (peer != NULL && !list_empty(&peer->ibp_conns)) { + /* Found a peer with an established connection */ + conn = kiblnd_get_conn_locked(peer); + kiblnd_conn_addref(conn); /* 1 ref for me... */ + + read_unlock_irqrestore(g_lock, flags); + + if (tx != NULL) + kiblnd_queue_tx(tx, conn); + kiblnd_conn_decref(conn); /* ...to here */ + return; + } + + read_unlock(g_lock); + /* Re-try with a write lock */ + write_lock(g_lock); + + peer = kiblnd_find_peer_locked(nid); + if (peer != NULL) { + if (list_empty(&peer->ibp_conns)) { + /* found a peer, but it's still connecting... */ + LASSERT(peer->ibp_connecting != 0 || + peer->ibp_accepting != 0); + if (tx != NULL) + list_add_tail(&tx->tx_list, + &peer->ibp_tx_queue); + write_unlock_irqrestore(g_lock, flags); + } else { + conn = kiblnd_get_conn_locked(peer); + kiblnd_conn_addref(conn); /* 1 ref for me... */ + + write_unlock_irqrestore(g_lock, flags); + + if (tx != NULL) + kiblnd_queue_tx(tx, conn); + kiblnd_conn_decref(conn); /* ...to here */ + } + return; + } + + write_unlock_irqrestore(g_lock, flags); + + /* Allocate a peer ready to add to the peer table and retry */ + rc = kiblnd_create_peer(ni, &peer, nid); + if (rc != 0) { + CERROR("Can't create peer %s\n", libcfs_nid2str(nid)); + if (tx != NULL) { + tx->tx_status = -EHOSTUNREACH; + tx->tx_waiting = 0; + kiblnd_tx_done(ni, tx); + } + return; + } + + write_lock_irqsave(g_lock, flags); + + peer2 = kiblnd_find_peer_locked(nid); + if (peer2 != NULL) { + if (list_empty(&peer2->ibp_conns)) { + /* found a peer, but it's still connecting... */ + LASSERT(peer2->ibp_connecting != 0 || + peer2->ibp_accepting != 0); + if (tx != NULL) + list_add_tail(&tx->tx_list, + &peer2->ibp_tx_queue); + write_unlock_irqrestore(g_lock, flags); + } else { + conn = kiblnd_get_conn_locked(peer2); + kiblnd_conn_addref(conn); /* 1 ref for me... 
*/ + + write_unlock_irqrestore(g_lock, flags); + + if (tx != NULL) + kiblnd_queue_tx(tx, conn); + kiblnd_conn_decref(conn); /* ...to here */ + } + + kiblnd_peer_decref(peer); + return; + } + + /* Brand new peer */ + LASSERT(peer->ibp_connecting == 0); + peer->ibp_connecting = 1; + + /* always called with a ref on ni, which prevents ni being shutdown */ + LASSERT(((kib_net_t *)ni->ni_data)->ibn_shutdown == 0); + + if (tx != NULL) + list_add_tail(&tx->tx_list, &peer->ibp_tx_queue); + + kiblnd_peer_addref(peer); + list_add_tail(&peer->ibp_list, kiblnd_nid2peerlist(nid)); + + write_unlock_irqrestore(g_lock, flags); + + kiblnd_connect_peer(peer); + kiblnd_peer_decref(peer); +} + +int +kiblnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) +{ + lnet_hdr_t *hdr = &lntmsg->msg_hdr; + int type = lntmsg->msg_type; + lnet_process_id_t target = lntmsg->msg_target; + int target_is_router = lntmsg->msg_target_is_router; + int routing = lntmsg->msg_routing; + unsigned int payload_niov = lntmsg->msg_niov; + struct kvec *payload_iov = lntmsg->msg_iov; + lnet_kiov_t *payload_kiov = lntmsg->msg_kiov; + unsigned int payload_offset = lntmsg->msg_offset; + unsigned int payload_nob = lntmsg->msg_len; + kib_msg_t *ibmsg; + kib_tx_t *tx; + int nob; + int rc; + + /* NB 'private' is different depending on what we're sending.... */ + + CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n", + payload_nob, payload_niov, libcfs_id2str(target)); + + LASSERT(payload_nob == 0 || payload_niov > 0); + LASSERT(payload_niov <= LNET_MAX_IOV); + + /* Thread context */ + LASSERT(!in_interrupt()); + /* payload is either all vaddrs or all pages */ + LASSERT(!(payload_kiov != NULL && payload_iov != NULL)); + + switch (type) { + default: + LBUG(); + return -EIO; + + case LNET_MSG_ACK: + LASSERT(payload_nob == 0); + break; + + case LNET_MSG_GET: + if (routing || target_is_router) + break; /* send IMMEDIATE */ + + /* is the REPLY message too small for RDMA? */ + nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]); + if (nob <= IBLND_MSG_SIZE) + break; /* send IMMEDIATE */ + + tx = kiblnd_get_idle_tx(ni, target.nid); + if (tx == NULL) { + CERROR("Can't allocate txd for GET to %s\n", + libcfs_nid2str(target.nid)); + return -ENOMEM; + } + + ibmsg = tx->tx_msg; + + if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0) + rc = kiblnd_setup_rd_iov(ni, tx, + &ibmsg->ibm_u.get.ibgm_rd, + lntmsg->msg_md->md_niov, + lntmsg->msg_md->md_iov.iov, + 0, lntmsg->msg_md->md_length); + else + rc = kiblnd_setup_rd_kiov(ni, tx, + &ibmsg->ibm_u.get.ibgm_rd, + lntmsg->msg_md->md_niov, + lntmsg->msg_md->md_iov.kiov, + 0, lntmsg->msg_md->md_length); + if (rc != 0) { + CERROR("Can't setup GET sink for %s: %d\n", + libcfs_nid2str(target.nid), rc); + kiblnd_tx_done(ni, tx); + return -EIO; + } + + nob = offsetof(kib_get_msg_t, ibgm_rd.rd_frags[tx->tx_nfrags]); + ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie; + ibmsg->ibm_u.get.ibgm_hdr = *hdr; + + kiblnd_init_tx_msg(ni, tx, IBLND_MSG_GET_REQ, nob); + + tx->tx_lntmsg[1] = lnet_create_reply_msg(ni, lntmsg); + if (tx->tx_lntmsg[1] == NULL) { + CERROR("Can't create reply for GET -> %s\n", + libcfs_nid2str(target.nid)); + kiblnd_tx_done(ni, tx); + return -EIO; + } + + tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg[0,1] on completion */ + tx->tx_waiting = 1; /* waiting for GET_DONE */ + kiblnd_launch_tx(ni, tx, target.nid); + return 0; + + case LNET_MSG_REPLY: + case LNET_MSG_PUT: + /* Is the payload small enough not to need RDMA? 
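+ * If the header plus payload fits in a single IBLND_MSG_SIZE message,
+ * fall through and copy the payload inline as an IMMEDIATE instead of
+ * setting up a PUT.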
*/ + nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]); + if (nob <= IBLND_MSG_SIZE) + break; /* send IMMEDIATE */ + + tx = kiblnd_get_idle_tx(ni, target.nid); + if (tx == NULL) { + CERROR("Can't allocate %s txd for %s\n", + type == LNET_MSG_PUT ? "PUT" : "REPLY", + libcfs_nid2str(target.nid)); + return -ENOMEM; + } + + if (payload_kiov == NULL) + rc = kiblnd_setup_rd_iov(ni, tx, tx->tx_rd, + payload_niov, payload_iov, + payload_offset, payload_nob); + else + rc = kiblnd_setup_rd_kiov(ni, tx, tx->tx_rd, + payload_niov, payload_kiov, + payload_offset, payload_nob); + if (rc != 0) { + CERROR("Can't setup PUT src for %s: %d\n", + libcfs_nid2str(target.nid), rc); + kiblnd_tx_done(ni, tx); + return -EIO; + } + + ibmsg = tx->tx_msg; + ibmsg->ibm_u.putreq.ibprm_hdr = *hdr; + ibmsg->ibm_u.putreq.ibprm_cookie = tx->tx_cookie; + kiblnd_init_tx_msg(ni, tx, IBLND_MSG_PUT_REQ, sizeof(kib_putreq_msg_t)); + + tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */ + tx->tx_waiting = 1; /* waiting for PUT_{ACK,NAK} */ + kiblnd_launch_tx(ni, tx, target.nid); + return 0; + } + + /* send IMMEDIATE */ + + LASSERT(offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]) + <= IBLND_MSG_SIZE); + + tx = kiblnd_get_idle_tx(ni, target.nid); + if (tx == NULL) { + CERROR("Can't send %d to %s: tx descs exhausted\n", + type, libcfs_nid2str(target.nid)); + return -ENOMEM; + } + + ibmsg = tx->tx_msg; + ibmsg->ibm_u.immediate.ibim_hdr = *hdr; + + if (payload_kiov != NULL) + lnet_copy_kiov2flat(IBLND_MSG_SIZE, ibmsg, + offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), + payload_niov, payload_kiov, + payload_offset, payload_nob); + else + lnet_copy_iov2flat(IBLND_MSG_SIZE, ibmsg, + offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), + payload_niov, payload_iov, + payload_offset, payload_nob); + + nob = offsetof(kib_immediate_msg_t, ibim_payload[payload_nob]); + kiblnd_init_tx_msg(ni, tx, IBLND_MSG_IMMEDIATE, nob); + + tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */ + kiblnd_launch_tx(ni, tx, target.nid); + return 0; +} + +static void +kiblnd_reply(lnet_ni_t *ni, kib_rx_t *rx, lnet_msg_t *lntmsg) +{ + lnet_process_id_t target = lntmsg->msg_target; + unsigned int niov = lntmsg->msg_niov; + struct kvec *iov = lntmsg->msg_iov; + lnet_kiov_t *kiov = lntmsg->msg_kiov; + unsigned int offset = lntmsg->msg_offset; + unsigned int nob = lntmsg->msg_len; + kib_tx_t *tx; + int rc; + + tx = kiblnd_get_idle_tx(ni, rx->rx_conn->ibc_peer->ibp_nid); + if (tx == NULL) { + CERROR("Can't get tx for REPLY to %s\n", + libcfs_nid2str(target.nid)); + goto failed_0; + } + + if (nob == 0) + rc = 0; + else if (kiov == NULL) + rc = kiblnd_setup_rd_iov(ni, tx, tx->tx_rd, + niov, iov, offset, nob); + else + rc = kiblnd_setup_rd_kiov(ni, tx, tx->tx_rd, + niov, kiov, offset, nob); + + if (rc != 0) { + CERROR("Can't setup GET src for %s: %d\n", + libcfs_nid2str(target.nid), rc); + goto failed_1; + } + + rc = kiblnd_init_rdma(rx->rx_conn, tx, + IBLND_MSG_GET_DONE, nob, + &rx->rx_msg->ibm_u.get.ibgm_rd, + rx->rx_msg->ibm_u.get.ibgm_cookie); + if (rc < 0) { + CERROR("Can't setup rdma for GET from %s: %d\n", + libcfs_nid2str(target.nid), rc); + goto failed_1; + } + + if (nob == 0) { + /* No RDMA: local completion may happen now! 
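+ * A zero-length GET reply moves no data, so lntmsg is finalised here
+ * rather than being attached to the tx for the RDMA completion.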
*/ + lnet_finalize(ni, lntmsg, 0); + } else { + /* RDMA: lnet_finalize(lntmsg) when it + * completes */ + tx->tx_lntmsg[0] = lntmsg; + } + + kiblnd_queue_tx(tx, rx->rx_conn); + return; + + failed_1: + kiblnd_tx_done(ni, tx); + failed_0: + lnet_finalize(ni, lntmsg, -EIO); +} + +int +kiblnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed, + unsigned int niov, struct kvec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen) +{ + kib_rx_t *rx = private; + kib_msg_t *rxmsg = rx->rx_msg; + kib_conn_t *conn = rx->rx_conn; + kib_tx_t *tx; + kib_msg_t *txmsg; + int nob; + int post_credit = IBLND_POSTRX_PEER_CREDIT; + int rc = 0; + + LASSERT(mlen <= rlen); + LASSERT(!in_interrupt()); + /* Either all pages or all vaddrs */ + LASSERT(!(kiov != NULL && iov != NULL)); + + switch (rxmsg->ibm_type) { + default: + LBUG(); + + case IBLND_MSG_IMMEDIATE: + nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]); + if (nob > rx->rx_nob) { + CERROR("Immediate message from %s too big: %d(%d)\n", + libcfs_nid2str(rxmsg->ibm_u.immediate.ibim_hdr.src_nid), + nob, rx->rx_nob); + rc = -EPROTO; + break; + } + + if (kiov != NULL) + lnet_copy_flat2kiov(niov, kiov, offset, + IBLND_MSG_SIZE, rxmsg, + offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), + mlen); + else + lnet_copy_flat2iov(niov, iov, offset, + IBLND_MSG_SIZE, rxmsg, + offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), + mlen); + lnet_finalize(ni, lntmsg, 0); + break; + + case IBLND_MSG_PUT_REQ: + if (mlen == 0) { + lnet_finalize(ni, lntmsg, 0); + kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK, 0, + rxmsg->ibm_u.putreq.ibprm_cookie); + break; + } + + tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid); + if (tx == NULL) { + CERROR("Can't allocate tx for %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + /* Not replying will break the connection */ + rc = -ENOMEM; + break; + } + + txmsg = tx->tx_msg; + if (kiov == NULL) + rc = kiblnd_setup_rd_iov(ni, tx, + &txmsg->ibm_u.putack.ibpam_rd, + niov, iov, offset, mlen); + else + rc = kiblnd_setup_rd_kiov(ni, tx, + &txmsg->ibm_u.putack.ibpam_rd, + niov, kiov, offset, mlen); + if (rc != 0) { + CERROR("Can't setup PUT sink for %s: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), rc); + kiblnd_tx_done(ni, tx); + /* tell peer it's over */ + kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK, rc, + rxmsg->ibm_u.putreq.ibprm_cookie); + break; + } + + nob = offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[tx->tx_nfrags]); + txmsg->ibm_u.putack.ibpam_src_cookie = rxmsg->ibm_u.putreq.ibprm_cookie; + txmsg->ibm_u.putack.ibpam_dst_cookie = tx->tx_cookie; + + kiblnd_init_tx_msg(ni, tx, IBLND_MSG_PUT_ACK, nob); + + tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */ + tx->tx_waiting = 1; /* waiting for PUT_DONE */ + kiblnd_queue_tx(tx, conn); + + /* reposted buffer reserved for PUT_DONE */ + post_credit = IBLND_POSTRX_NO_CREDIT; + break; + + case IBLND_MSG_GET_REQ: + if (lntmsg != NULL) { + /* Optimized GET; RDMA lntmsg's payload */ + kiblnd_reply(ni, rx, lntmsg); + } else { + /* GET didn't match anything */ + kiblnd_send_completion(rx->rx_conn, IBLND_MSG_GET_DONE, + -ENODATA, + rxmsg->ibm_u.get.ibgm_cookie); + } + break; + } + + kiblnd_post_rx(rx, post_credit); + return rc; +} + +int +kiblnd_thread_start(int (*fn)(void *arg), void *arg, char *name) +{ + struct task_struct *task = kthread_run(fn, arg, "%s", name); + + if (IS_ERR(task)) + return PTR_ERR(task); + + atomic_inc(&kiblnd_data.kib_nthreads); + return 0; +} + +static void 
+kiblnd_thread_fini(void) +{ + atomic_dec(&kiblnd_data.kib_nthreads); +} + +void +kiblnd_peer_alive(kib_peer_t *peer) +{ + /* This is racy, but everyone's only writing cfs_time_current() */ + peer->ibp_last_alive = cfs_time_current(); + mb(); +} + +static void +kiblnd_peer_notify(kib_peer_t *peer) +{ + int error = 0; + unsigned long last_alive = 0; + unsigned long flags; + + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + if (list_empty(&peer->ibp_conns) && + peer->ibp_accepting == 0 && + peer->ibp_connecting == 0 && + peer->ibp_error != 0) { + error = peer->ibp_error; + peer->ibp_error = 0; + + last_alive = peer->ibp_last_alive; + } + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + if (error != 0) + lnet_notify(peer->ibp_ni, + peer->ibp_nid, 0, last_alive); +} + +void +kiblnd_close_conn_locked(kib_conn_t *conn, int error) +{ + /* This just does the immediate housekeeping. 'error' is zero for a + * normal shutdown which can happen only after the connection has been + * established. If the connection is established, schedule the + * connection to be finished off by the connd. Otherwise the connd is + * already dealing with it (either to set it up or tear it down). + * Caller holds kib_global_lock exclusively in irq context */ + kib_peer_t *peer = conn->ibc_peer; + kib_dev_t *dev; + unsigned long flags; + + LASSERT(error != 0 || conn->ibc_state >= IBLND_CONN_ESTABLISHED); + + if (error != 0 && conn->ibc_comms_error == 0) + conn->ibc_comms_error = error; + + if (conn->ibc_state != IBLND_CONN_ESTABLISHED) + return; /* already being handled */ + + if (error == 0 && + list_empty(&conn->ibc_tx_noops) && + list_empty(&conn->ibc_tx_queue) && + list_empty(&conn->ibc_tx_queue_rsrvd) && + list_empty(&conn->ibc_tx_queue_nocred) && + list_empty(&conn->ibc_active_txs)) { + CDEBUG(D_NET, "closing conn to %s\n", + libcfs_nid2str(peer->ibp_nid)); + } else { + CNETERR("Closing conn to %s: error %d%s%s%s%s%s\n", + libcfs_nid2str(peer->ibp_nid), error, + list_empty(&conn->ibc_tx_queue) ? "" : "(sending)", + list_empty(&conn->ibc_tx_noops) ? "" : "(sending_noops)", + list_empty(&conn->ibc_tx_queue_rsrvd) ? "" : "(sending_rsrvd)", + list_empty(&conn->ibc_tx_queue_nocred) ? "" : "(sending_nocred)", + list_empty(&conn->ibc_active_txs) ? 
"" : "(waiting)"); + } + + dev = ((kib_net_t *)peer->ibp_ni->ni_data)->ibn_dev; + list_del(&conn->ibc_list); + /* connd (see below) takes over ibc_list's ref */ + + if (list_empty(&peer->ibp_conns) && /* no more conns */ + kiblnd_peer_active(peer)) { /* still in peer table */ + kiblnd_unlink_peer_locked(peer); + + /* set/clear error on last conn */ + peer->ibp_error = conn->ibc_comms_error; + } + + kiblnd_set_conn_state(conn, IBLND_CONN_CLOSING); + + if (error != 0 && + kiblnd_dev_can_failover(dev)) { + list_add_tail(&dev->ibd_fail_list, + &kiblnd_data.kib_failed_devs); + wake_up(&kiblnd_data.kib_failover_waitq); + } + + spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags); + + list_add_tail(&conn->ibc_list, &kiblnd_data.kib_connd_conns); + wake_up(&kiblnd_data.kib_connd_waitq); + + spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags); +} + +void +kiblnd_close_conn(kib_conn_t *conn, int error) +{ + unsigned long flags; + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + kiblnd_close_conn_locked(conn, error); + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); +} + +static void +kiblnd_handle_early_rxs(kib_conn_t *conn) +{ + unsigned long flags; + kib_rx_t *rx; + kib_rx_t *tmp; + + LASSERT(!in_interrupt()); + LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED); + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + list_for_each_entry_safe(rx, tmp, &conn->ibc_early_rxs, rx_list) { + list_del(&rx->rx_list); + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + kiblnd_handle_rx(rx); + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + } + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); +} + +static void +kiblnd_abort_txs(kib_conn_t *conn, struct list_head *txs) +{ + LIST_HEAD(zombies); + struct list_head *tmp; + struct list_head *nxt; + kib_tx_t *tx; + + spin_lock(&conn->ibc_lock); + + list_for_each_safe(tmp, nxt, txs) { + tx = list_entry(tmp, kib_tx_t, tx_list); + + if (txs == &conn->ibc_active_txs) { + LASSERT(!tx->tx_queued); + LASSERT(tx->tx_waiting || + tx->tx_sending != 0); + } else { + LASSERT(tx->tx_queued); + } + + tx->tx_status = -ECONNABORTED; + tx->tx_waiting = 0; + + if (tx->tx_sending == 0) { + tx->tx_queued = 0; + list_del(&tx->tx_list); + list_add(&tx->tx_list, &zombies); + } + } + + spin_unlock(&conn->ibc_lock); + + kiblnd_txlist_done(conn->ibc_peer->ibp_ni, &zombies, -ECONNABORTED); +} + +static void +kiblnd_finalise_conn(kib_conn_t *conn) +{ + LASSERT(!in_interrupt()); + LASSERT(conn->ibc_state > IBLND_CONN_INIT); + + kiblnd_set_conn_state(conn, IBLND_CONN_DISCONNECTED); + + /* abort_receives moves QP state to IB_QPS_ERR. This is only required + * for connections that didn't get as far as being connected, because + * rdma_disconnect() does this for free. */ + kiblnd_abort_receives(conn); + + /* Complete all tx descs not waiting for sends to complete. 
+ * NB we should be safe from RDMA now that the QP has changed state */ + + kiblnd_abort_txs(conn, &conn->ibc_tx_noops); + kiblnd_abort_txs(conn, &conn->ibc_tx_queue); + kiblnd_abort_txs(conn, &conn->ibc_tx_queue_rsrvd); + kiblnd_abort_txs(conn, &conn->ibc_tx_queue_nocred); + kiblnd_abort_txs(conn, &conn->ibc_active_txs); + + kiblnd_handle_early_rxs(conn); +} + +void +kiblnd_peer_connect_failed(kib_peer_t *peer, int active, int error) +{ + LIST_HEAD(zombies); + unsigned long flags; + + LASSERT(error != 0); + LASSERT(!in_interrupt()); + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + if (active) { + LASSERT(peer->ibp_connecting > 0); + peer->ibp_connecting--; + } else { + LASSERT(peer->ibp_accepting > 0); + peer->ibp_accepting--; + } + + if (peer->ibp_connecting != 0 || + peer->ibp_accepting != 0) { + /* another connection attempt under way... */ + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, + flags); + return; + } + + if (list_empty(&peer->ibp_conns)) { + /* Take peer's blocked transmits to complete with error */ + list_add(&zombies, &peer->ibp_tx_queue); + list_del_init(&peer->ibp_tx_queue); + + if (kiblnd_peer_active(peer)) + kiblnd_unlink_peer_locked(peer); + + peer->ibp_error = error; + } else { + /* Can't have blocked transmits if there are connections */ + LASSERT(list_empty(&peer->ibp_tx_queue)); + } + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + kiblnd_peer_notify(peer); + + if (list_empty(&zombies)) + return; + + CNETERR("Deleting messages for %s: connection failed\n", + libcfs_nid2str(peer->ibp_nid)); + + kiblnd_txlist_done(peer->ibp_ni, &zombies, -EHOSTUNREACH); +} + +void +kiblnd_connreq_done(kib_conn_t *conn, int status) +{ + kib_peer_t *peer = conn->ibc_peer; + kib_tx_t *tx; + kib_tx_t *tmp; + struct list_head txs; + unsigned long flags; + int active; + + active = (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT); + + CDEBUG(D_NET, "%s: active(%d), version(%x), status(%d)\n", + libcfs_nid2str(peer->ibp_nid), active, + conn->ibc_version, status); + + LASSERT(!in_interrupt()); + LASSERT((conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT && + peer->ibp_connecting > 0) || + (conn->ibc_state == IBLND_CONN_PASSIVE_WAIT && + peer->ibp_accepting > 0)); + + LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars)); + conn->ibc_connvars = NULL; + + if (status != 0) { + /* failed to establish connection */ + kiblnd_peer_connect_failed(peer, active, status); + kiblnd_finalise_conn(conn); + return; + } + + /* connection established */ + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + conn->ibc_last_send = jiffies; + kiblnd_set_conn_state(conn, IBLND_CONN_ESTABLISHED); + kiblnd_peer_alive(peer); + + /* Add conn to peer's list and nuke any dangling conns from a different + * peer instance... 
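+ * A peer that has rebooted arrives with a new incarnation; any conns
+ * still carrying the old one are closed below via
+ * kiblnd_close_stale_conns_locked().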
*/ + kiblnd_conn_addref(conn); /* +1 ref for ibc_list */ + list_add(&conn->ibc_list, &peer->ibp_conns); + if (active) + peer->ibp_connecting--; + else + peer->ibp_accepting--; + + if (peer->ibp_version == 0) { + peer->ibp_version = conn->ibc_version; + peer->ibp_incarnation = conn->ibc_incarnation; + } + + if (peer->ibp_version != conn->ibc_version || + peer->ibp_incarnation != conn->ibc_incarnation) { + kiblnd_close_stale_conns_locked(peer, conn->ibc_version, + conn->ibc_incarnation); + peer->ibp_version = conn->ibc_version; + peer->ibp_incarnation = conn->ibc_incarnation; + } + + /* grab pending txs while I have the lock */ + list_add(&txs, &peer->ibp_tx_queue); + list_del_init(&peer->ibp_tx_queue); + + if (!kiblnd_peer_active(peer) || /* peer has been deleted */ + conn->ibc_comms_error != 0) { /* error has happened already */ + lnet_ni_t *ni = peer->ibp_ni; + + /* start to shut down connection */ + kiblnd_close_conn_locked(conn, -ECONNABORTED); + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + kiblnd_txlist_done(ni, &txs, -ECONNABORTED); + + return; + } + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + /* Schedule blocked txs */ + spin_lock(&conn->ibc_lock); + list_for_each_entry_safe(tx, tmp, &txs, tx_list) { + list_del(&tx->tx_list); + + kiblnd_queue_tx_locked(tx, conn); + } + spin_unlock(&conn->ibc_lock); + + kiblnd_check_sends(conn); + + /* schedule blocked rxs */ + kiblnd_handle_early_rxs(conn); +} + +static void +kiblnd_reject(struct rdma_cm_id *cmid, kib_rej_t *rej) +{ + int rc; + + rc = rdma_reject(cmid, rej, sizeof(*rej)); + + if (rc != 0) + CWARN("Error %d sending reject\n", rc); +} + +static int +kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) +{ + rwlock_t *g_lock = &kiblnd_data.kib_global_lock; + kib_msg_t *reqmsg = priv; + kib_msg_t *ackmsg; + kib_dev_t *ibdev; + kib_peer_t *peer; + kib_peer_t *peer2; + kib_conn_t *conn; + lnet_ni_t *ni = NULL; + kib_net_t *net = NULL; + lnet_nid_t nid; + struct rdma_conn_param cp; + kib_rej_t rej; + int version = IBLND_MSG_VERSION; + unsigned long flags; + int rc; + struct sockaddr_in *peer_addr; + LASSERT(!in_interrupt()); + + /* cmid inherits 'context' from the corresponding listener id */ + ibdev = (kib_dev_t *)cmid->context; + LASSERT(ibdev != NULL); + + memset(&rej, 0, sizeof(rej)); + rej.ibr_magic = IBLND_MSG_MAGIC; + rej.ibr_why = IBLND_REJECT_FATAL; + rej.ibr_cp.ibcp_max_msg_size = IBLND_MSG_SIZE; + + peer_addr = (struct sockaddr_in *)&(cmid->route.addr.dst_addr); + if (*kiblnd_tunables.kib_require_priv_port && + ntohs(peer_addr->sin_port) >= PROT_SOCK) { + __u32 ip = ntohl(peer_addr->sin_addr.s_addr); + CERROR("Peer's port (%pI4h:%hu) is not privileged\n", + &ip, ntohs(peer_addr->sin_port)); + goto failed; + } + + if (priv_nob < offsetof(kib_msg_t, ibm_type)) { + CERROR("Short connection request\n"); + goto failed; + } + + /* Future protocol version compatibility support! If the + * o2iblnd-specific protocol changes, or when LNET unifies + * protocols over all LNDs, the initial connection will + * negotiate a protocol version. I trap this here to avoid + * console errors; the reject tells the peer which protocol I + * speak. 
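+ * The failed: path below fills ibr_version and the connection
+ * parameters for the version I do speak before sending the reject.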
*/ + if (reqmsg->ibm_magic == LNET_PROTO_MAGIC || + reqmsg->ibm_magic == __swab32(LNET_PROTO_MAGIC)) + goto failed; + if (reqmsg->ibm_magic == IBLND_MSG_MAGIC && + reqmsg->ibm_version != IBLND_MSG_VERSION && + reqmsg->ibm_version != IBLND_MSG_VERSION_1) + goto failed; + if (reqmsg->ibm_magic == __swab32(IBLND_MSG_MAGIC) && + reqmsg->ibm_version != __swab16(IBLND_MSG_VERSION) && + reqmsg->ibm_version != __swab16(IBLND_MSG_VERSION_1)) + goto failed; + + rc = kiblnd_unpack_msg(reqmsg, priv_nob); + if (rc != 0) { + CERROR("Can't parse connection request: %d\n", rc); + goto failed; + } + + nid = reqmsg->ibm_srcnid; + ni = lnet_net2ni(LNET_NIDNET(reqmsg->ibm_dstnid)); + + if (ni != NULL) { + net = (kib_net_t *)ni->ni_data; + rej.ibr_incarnation = net->ibn_incarnation; + } + + if (ni == NULL || /* no matching net */ + ni->ni_nid != reqmsg->ibm_dstnid || /* right NET, wrong NID! */ + net->ibn_dev != ibdev) { /* wrong device */ + CERROR("Can't accept %s on %s (%s:%d:%pI4h): bad dst nid %s\n", + libcfs_nid2str(nid), + ni == NULL ? "NA" : libcfs_nid2str(ni->ni_nid), + ibdev->ibd_ifname, ibdev->ibd_nnets, + &ibdev->ibd_ifip, + libcfs_nid2str(reqmsg->ibm_dstnid)); + + goto failed; + } + + /* check time stamp as soon as possible */ + if (reqmsg->ibm_dststamp != 0 && + reqmsg->ibm_dststamp != net->ibn_incarnation) { + CWARN("Stale connection request\n"); + rej.ibr_why = IBLND_REJECT_CONN_STALE; + goto failed; + } + + /* I can accept peer's version */ + version = reqmsg->ibm_version; + + if (reqmsg->ibm_type != IBLND_MSG_CONNREQ) { + CERROR("Unexpected connreq msg type: %x from %s\n", + reqmsg->ibm_type, libcfs_nid2str(nid)); + goto failed; + } + + if (reqmsg->ibm_u.connparams.ibcp_queue_depth != + IBLND_MSG_QUEUE_SIZE(version)) { + CERROR("Can't accept %s: incompatible queue depth %d (%d wanted)\n", + libcfs_nid2str(nid), reqmsg->ibm_u.connparams.ibcp_queue_depth, + IBLND_MSG_QUEUE_SIZE(version)); + + if (version == IBLND_MSG_VERSION) + rej.ibr_why = IBLND_REJECT_MSG_QUEUE_SIZE; + + goto failed; + } + + if (reqmsg->ibm_u.connparams.ibcp_max_frags != + IBLND_RDMA_FRAGS(version)) { + CERROR("Can't accept %s(version %x): incompatible max_frags %d (%d wanted)\n", + libcfs_nid2str(nid), version, + reqmsg->ibm_u.connparams.ibcp_max_frags, + IBLND_RDMA_FRAGS(version)); + + if (version == IBLND_MSG_VERSION) + rej.ibr_why = IBLND_REJECT_RDMA_FRAGS; + + goto failed; + + } + + if (reqmsg->ibm_u.connparams.ibcp_max_msg_size > IBLND_MSG_SIZE) { + CERROR("Can't accept %s: message size %d too big (%d max)\n", + libcfs_nid2str(nid), + reqmsg->ibm_u.connparams.ibcp_max_msg_size, + IBLND_MSG_SIZE); + goto failed; + } + + /* assume 'nid' is a new peer; create */ + rc = kiblnd_create_peer(ni, &peer, nid); + if (rc != 0) { + CERROR("Can't create peer for %s\n", libcfs_nid2str(nid)); + rej.ibr_why = IBLND_REJECT_NO_RESOURCES; + goto failed; + } + + write_lock_irqsave(g_lock, flags); + + peer2 = kiblnd_find_peer_locked(nid); + if (peer2 != NULL) { + if (peer2->ibp_version == 0) { + peer2->ibp_version = version; + peer2->ibp_incarnation = reqmsg->ibm_srcstamp; + } + + /* not the guy I've talked with */ + if (peer2->ibp_incarnation != reqmsg->ibm_srcstamp || + peer2->ibp_version != version) { + kiblnd_close_peer_conns_locked(peer2, -ESTALE); + write_unlock_irqrestore(g_lock, flags); + + CWARN("Conn stale %s [old ver: %x, new ver: %x]\n", + libcfs_nid2str(nid), peer2->ibp_version, version); + + kiblnd_peer_decref(peer); + rej.ibr_why = IBLND_REJECT_CONN_STALE; + goto failed; + } + + /* tie-break connection race in favour of the 
higher NID */ + if (peer2->ibp_connecting != 0 && + nid < ni->ni_nid) { + write_unlock_irqrestore(g_lock, flags); + + CWARN("Conn race %s\n", libcfs_nid2str(peer2->ibp_nid)); + + kiblnd_peer_decref(peer); + rej.ibr_why = IBLND_REJECT_CONN_RACE; + goto failed; + } + + peer2->ibp_accepting++; + kiblnd_peer_addref(peer2); + + write_unlock_irqrestore(g_lock, flags); + kiblnd_peer_decref(peer); + peer = peer2; + } else { + /* Brand new peer */ + LASSERT(peer->ibp_accepting == 0); + LASSERT(peer->ibp_version == 0 && + peer->ibp_incarnation == 0); + + peer->ibp_accepting = 1; + peer->ibp_version = version; + peer->ibp_incarnation = reqmsg->ibm_srcstamp; + + /* I have a ref on ni that prevents it being shutdown */ + LASSERT(net->ibn_shutdown == 0); + + kiblnd_peer_addref(peer); + list_add_tail(&peer->ibp_list, kiblnd_nid2peerlist(nid)); + + write_unlock_irqrestore(g_lock, flags); + } + + conn = kiblnd_create_conn(peer, cmid, IBLND_CONN_PASSIVE_WAIT, version); + if (conn == NULL) { + kiblnd_peer_connect_failed(peer, 0, -ENOMEM); + kiblnd_peer_decref(peer); + rej.ibr_why = IBLND_REJECT_NO_RESOURCES; + goto failed; + } + + /* conn now "owns" cmid, so I return success from here on to ensure the + * CM callback doesn't destroy cmid. */ + + conn->ibc_incarnation = reqmsg->ibm_srcstamp; + conn->ibc_credits = IBLND_MSG_QUEUE_SIZE(version); + conn->ibc_reserved_credits = IBLND_MSG_QUEUE_SIZE(version); + LASSERT(conn->ibc_credits + conn->ibc_reserved_credits + IBLND_OOB_MSGS(version) + <= IBLND_RX_MSGS(version)); + + ackmsg = &conn->ibc_connvars->cv_msg; + memset(ackmsg, 0, sizeof(*ackmsg)); + + kiblnd_init_msg(ackmsg, IBLND_MSG_CONNACK, + sizeof(ackmsg->ibm_u.connparams)); + ackmsg->ibm_u.connparams.ibcp_queue_depth = IBLND_MSG_QUEUE_SIZE(version); + ackmsg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE; + ackmsg->ibm_u.connparams.ibcp_max_frags = IBLND_RDMA_FRAGS(version); + + kiblnd_pack_msg(ni, ackmsg, version, 0, nid, reqmsg->ibm_srcstamp); + + memset(&cp, 0, sizeof(cp)); + cp.private_data = ackmsg; + cp.private_data_len = ackmsg->ibm_nob; + cp.responder_resources = 0; /* No atomic ops or RDMA reads */ + cp.initiator_depth = 0; + cp.flow_control = 1; + cp.retry_count = *kiblnd_tunables.kib_retry_count; + cp.rnr_retry_count = *kiblnd_tunables.kib_rnr_retry_count; + + CDEBUG(D_NET, "Accept %s\n", libcfs_nid2str(nid)); + + rc = rdma_accept(cmid, &cp); + if (rc != 0) { + CERROR("Can't accept %s: %d\n", libcfs_nid2str(nid), rc); + rej.ibr_version = version; + rej.ibr_why = IBLND_REJECT_FATAL; + + kiblnd_reject(cmid, &rej); + kiblnd_connreq_done(conn, rc); + kiblnd_conn_decref(conn); + } + + lnet_ni_decref(ni); + return 0; + + failed: + if (ni != NULL) + lnet_ni_decref(ni); + + rej.ibr_version = version; + rej.ibr_cp.ibcp_queue_depth = IBLND_MSG_QUEUE_SIZE(version); + rej.ibr_cp.ibcp_max_frags = IBLND_RDMA_FRAGS(version); + kiblnd_reject(cmid, &rej); + + return -ECONNREFUSED; +} + +static void +kiblnd_reconnect(kib_conn_t *conn, int version, + __u64 incarnation, int why, kib_connparams_t *cp) +{ + kib_peer_t *peer = conn->ibc_peer; + char *reason; + int retry = 0; + unsigned long flags; + + LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT); + LASSERT(peer->ibp_connecting > 0); /* 'conn' at least */ + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + /* retry connection if it's still needed and no other connection + * attempts (active or passive) are in progress + * NB: reconnect is still needed even when ibp_tx_queue is + * empty if ibp_version != version because reconnect may be + * 
initiated by kiblnd_query() */ + if ((!list_empty(&peer->ibp_tx_queue) || + peer->ibp_version != version) && + peer->ibp_connecting == 1 && + peer->ibp_accepting == 0) { + retry = 1; + peer->ibp_connecting++; + + peer->ibp_version = version; + peer->ibp_incarnation = incarnation; + } + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + if (!retry) + return; + + switch (why) { + default: + reason = "Unknown"; + break; + + case IBLND_REJECT_CONN_STALE: + reason = "stale"; + break; + + case IBLND_REJECT_CONN_RACE: + reason = "conn race"; + break; + + case IBLND_REJECT_CONN_UNCOMPAT: + reason = "version negotiation"; + break; + } + + CNETERR("%s: retrying (%s), %x, %x, queue_dep: %d, max_frag: %d, msg_size: %d\n", + libcfs_nid2str(peer->ibp_nid), + reason, IBLND_MSG_VERSION, version, + cp != NULL ? cp->ibcp_queue_depth : IBLND_MSG_QUEUE_SIZE(version), + cp != NULL ? cp->ibcp_max_frags : IBLND_RDMA_FRAGS(version), + cp != NULL ? cp->ibcp_max_msg_size : IBLND_MSG_SIZE); + + kiblnd_connect_peer(peer); +} + +static void +kiblnd_rejected(kib_conn_t *conn, int reason, void *priv, int priv_nob) +{ + kib_peer_t *peer = conn->ibc_peer; + + LASSERT(!in_interrupt()); + LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT); + + switch (reason) { + case IB_CM_REJ_STALE_CONN: + kiblnd_reconnect(conn, IBLND_MSG_VERSION, 0, + IBLND_REJECT_CONN_STALE, NULL); + break; + + case IB_CM_REJ_INVALID_SERVICE_ID: + CNETERR("%s rejected: no listener at %d\n", + libcfs_nid2str(peer->ibp_nid), + *kiblnd_tunables.kib_service); + break; + + case IB_CM_REJ_CONSUMER_DEFINED: + if (priv_nob >= offsetof(kib_rej_t, ibr_padding)) { + kib_rej_t *rej = priv; + kib_connparams_t *cp = NULL; + int flip = 0; + __u64 incarnation = -1; + + /* NB. default incarnation is -1 because: + * a) V1 will ignore dst incarnation in connreq. + * b) V2 will provide incarnation while rejecting me, + * -1 will be overwrote. + * + * if I try to connect to a V1 peer with V2 protocol, + * it rejected me then upgrade to V2, I have no idea + * about the upgrading and try to reconnect with V1, + * in this case upgraded V2 can find out I'm trying to + * talk to the old guy and reject me(incarnation is -1). + */ + + if (rej->ibr_magic == __swab32(IBLND_MSG_MAGIC) || + rej->ibr_magic == __swab32(LNET_PROTO_MAGIC)) { + __swab32s(&rej->ibr_magic); + __swab16s(&rej->ibr_version); + flip = 1; + } + + if (priv_nob >= sizeof(kib_rej_t) && + rej->ibr_version > IBLND_MSG_VERSION_1) { + /* priv_nob is always 148 in current version + * of OFED, so we still need to check version. 
+ * (define of IB_CM_REJ_PRIVATE_DATA_SIZE) */ + cp = &rej->ibr_cp; + + if (flip) { + __swab64s(&rej->ibr_incarnation); + __swab16s(&cp->ibcp_queue_depth); + __swab16s(&cp->ibcp_max_frags); + __swab32s(&cp->ibcp_max_msg_size); + } + + incarnation = rej->ibr_incarnation; + } + + if (rej->ibr_magic != IBLND_MSG_MAGIC && + rej->ibr_magic != LNET_PROTO_MAGIC) { + CERROR("%s rejected: consumer defined fatal error\n", + libcfs_nid2str(peer->ibp_nid)); + break; + } + + if (rej->ibr_version != IBLND_MSG_VERSION && + rej->ibr_version != IBLND_MSG_VERSION_1) { + CERROR("%s rejected: o2iblnd version %x error\n", + libcfs_nid2str(peer->ibp_nid), + rej->ibr_version); + break; + } + + if (rej->ibr_why == IBLND_REJECT_FATAL && + rej->ibr_version == IBLND_MSG_VERSION_1) { + CDEBUG(D_NET, "rejected by old version peer %s: %x\n", + libcfs_nid2str(peer->ibp_nid), rej->ibr_version); + + if (conn->ibc_version != IBLND_MSG_VERSION_1) + rej->ibr_why = IBLND_REJECT_CONN_UNCOMPAT; + } + + switch (rej->ibr_why) { + case IBLND_REJECT_CONN_RACE: + case IBLND_REJECT_CONN_STALE: + case IBLND_REJECT_CONN_UNCOMPAT: + kiblnd_reconnect(conn, rej->ibr_version, + incarnation, rej->ibr_why, cp); + break; + + case IBLND_REJECT_MSG_QUEUE_SIZE: + CERROR("%s rejected: incompatible message queue depth %d, %d\n", + libcfs_nid2str(peer->ibp_nid), + cp != NULL ? cp->ibcp_queue_depth : + IBLND_MSG_QUEUE_SIZE(rej->ibr_version), + IBLND_MSG_QUEUE_SIZE(conn->ibc_version)); + break; + + case IBLND_REJECT_RDMA_FRAGS: + CERROR("%s rejected: incompatible # of RDMA fragments %d, %d\n", + libcfs_nid2str(peer->ibp_nid), + cp != NULL ? cp->ibcp_max_frags : + IBLND_RDMA_FRAGS(rej->ibr_version), + IBLND_RDMA_FRAGS(conn->ibc_version)); + break; + + case IBLND_REJECT_NO_RESOURCES: + CERROR("%s rejected: o2iblnd no resources\n", + libcfs_nid2str(peer->ibp_nid)); + break; + + case IBLND_REJECT_FATAL: + CERROR("%s rejected: o2iblnd fatal error\n", + libcfs_nid2str(peer->ibp_nid)); + break; + + default: + CERROR("%s rejected: o2iblnd reason %d\n", + libcfs_nid2str(peer->ibp_nid), + rej->ibr_why); + break; + } + break; + } + /* fall through */ + default: + CNETERR("%s rejected: reason %d, size %d\n", + libcfs_nid2str(peer->ibp_nid), reason, priv_nob); + break; + } + + kiblnd_connreq_done(conn, -ECONNREFUSED); +} + +static void +kiblnd_check_connreply(kib_conn_t *conn, void *priv, int priv_nob) +{ + kib_peer_t *peer = conn->ibc_peer; + lnet_ni_t *ni = peer->ibp_ni; + kib_net_t *net = ni->ni_data; + kib_msg_t *msg = priv; + int ver = conn->ibc_version; + int rc = kiblnd_unpack_msg(msg, priv_nob); + unsigned long flags; + + LASSERT(net != NULL); + + if (rc != 0) { + CERROR("Can't unpack connack from %s: %d\n", + libcfs_nid2str(peer->ibp_nid), rc); + goto failed; + } + + if (msg->ibm_type != IBLND_MSG_CONNACK) { + CERROR("Unexpected message %d from %s\n", + msg->ibm_type, libcfs_nid2str(peer->ibp_nid)); + rc = -EPROTO; + goto failed; + } + + if (ver != msg->ibm_version) { + CERROR("%s replied version %x is different with requested version %x\n", + libcfs_nid2str(peer->ibp_nid), msg->ibm_version, ver); + rc = -EPROTO; + goto failed; + } + + if (msg->ibm_u.connparams.ibcp_queue_depth != + IBLND_MSG_QUEUE_SIZE(ver)) { + CERROR("%s has incompatible queue depth %d(%d wanted)\n", + libcfs_nid2str(peer->ibp_nid), + msg->ibm_u.connparams.ibcp_queue_depth, + IBLND_MSG_QUEUE_SIZE(ver)); + rc = -EPROTO; + goto failed; + } + + if (msg->ibm_u.connparams.ibcp_max_frags != + IBLND_RDMA_FRAGS(ver)) { + CERROR("%s has incompatible max_frags %d (%d wanted)\n", + 
libcfs_nid2str(peer->ibp_nid), + msg->ibm_u.connparams.ibcp_max_frags, + IBLND_RDMA_FRAGS(ver)); + rc = -EPROTO; + goto failed; + } + + if (msg->ibm_u.connparams.ibcp_max_msg_size > IBLND_MSG_SIZE) { + CERROR("%s max message size %d too big (%d max)\n", + libcfs_nid2str(peer->ibp_nid), + msg->ibm_u.connparams.ibcp_max_msg_size, + IBLND_MSG_SIZE); + rc = -EPROTO; + goto failed; + } + + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + if (msg->ibm_dstnid == ni->ni_nid && + msg->ibm_dststamp == net->ibn_incarnation) + rc = 0; + else + rc = -ESTALE; + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + if (rc != 0) { + CERROR("Bad connection reply from %s, rc = %d, version: %x max_frags: %d\n", + libcfs_nid2str(peer->ibp_nid), rc, + msg->ibm_version, msg->ibm_u.connparams.ibcp_max_frags); + goto failed; + } + + conn->ibc_incarnation = msg->ibm_srcstamp; + conn->ibc_credits = + conn->ibc_reserved_credits = IBLND_MSG_QUEUE_SIZE(ver); + LASSERT(conn->ibc_credits + conn->ibc_reserved_credits + IBLND_OOB_MSGS(ver) + <= IBLND_RX_MSGS(ver)); + + kiblnd_connreq_done(conn, 0); + return; + + failed: + /* NB My QP has already established itself, so I handle anything going + * wrong here by setting ibc_comms_error. + * kiblnd_connreq_done(0) moves the conn state to ESTABLISHED, but then + * immediately tears it down. */ + + LASSERT(rc != 0); + conn->ibc_comms_error = rc; + kiblnd_connreq_done(conn, 0); +} + +static int +kiblnd_active_connect(struct rdma_cm_id *cmid) +{ + kib_peer_t *peer = (kib_peer_t *)cmid->context; + kib_conn_t *conn; + kib_msg_t *msg; + struct rdma_conn_param cp; + int version; + __u64 incarnation; + unsigned long flags; + int rc; + + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + incarnation = peer->ibp_incarnation; + version = (peer->ibp_version == 0) ? IBLND_MSG_VERSION : + peer->ibp_version; + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + conn = kiblnd_create_conn(peer, cmid, IBLND_CONN_ACTIVE_CONNECT, version); + if (conn == NULL) { + kiblnd_peer_connect_failed(peer, 1, -ENOMEM); + kiblnd_peer_decref(peer); /* lose cmid's ref */ + return -ENOMEM; + } + + /* conn "owns" cmid now, so I return success from here on to ensure the + * CM callback doesn't destroy cmid. 
conn also takes over cmid's ref + * on peer */ + + msg = &conn->ibc_connvars->cv_msg; + + memset(msg, 0, sizeof(*msg)); + kiblnd_init_msg(msg, IBLND_MSG_CONNREQ, sizeof(msg->ibm_u.connparams)); + msg->ibm_u.connparams.ibcp_queue_depth = IBLND_MSG_QUEUE_SIZE(version); + msg->ibm_u.connparams.ibcp_max_frags = IBLND_RDMA_FRAGS(version); + msg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE; + + kiblnd_pack_msg(peer->ibp_ni, msg, version, + 0, peer->ibp_nid, incarnation); + + memset(&cp, 0, sizeof(cp)); + cp.private_data = msg; + cp.private_data_len = msg->ibm_nob; + cp.responder_resources = 0; /* No atomic ops or RDMA reads */ + cp.initiator_depth = 0; + cp.flow_control = 1; + cp.retry_count = *kiblnd_tunables.kib_retry_count; + cp.rnr_retry_count = *kiblnd_tunables.kib_rnr_retry_count; + + LASSERT(cmid->context == (void *)conn); + LASSERT(conn->ibc_cmid == cmid); + + rc = rdma_connect(cmid, &cp); + if (rc != 0) { + CERROR("Can't connect to %s: %d\n", + libcfs_nid2str(peer->ibp_nid), rc); + kiblnd_connreq_done(conn, rc); + kiblnd_conn_decref(conn); + } + + return 0; +} + +int +kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) +{ + kib_peer_t *peer; + kib_conn_t *conn; + int rc; + + switch (event->event) { + default: + CERROR("Unexpected event: %d, status: %d\n", + event->event, event->status); + LBUG(); + + case RDMA_CM_EVENT_CONNECT_REQUEST: + /* destroy cmid on failure */ + rc = kiblnd_passive_connect(cmid, + (void *)KIBLND_CONN_PARAM(event), + KIBLND_CONN_PARAM_LEN(event)); + CDEBUG(D_NET, "connreq: %d\n", rc); + return rc; + + case RDMA_CM_EVENT_ADDR_ERROR: + peer = (kib_peer_t *)cmid->context; + CNETERR("%s: ADDR ERROR %d\n", + libcfs_nid2str(peer->ibp_nid), event->status); + kiblnd_peer_connect_failed(peer, 1, -EHOSTUNREACH); + kiblnd_peer_decref(peer); + return -EHOSTUNREACH; /* rc != 0 destroys cmid */ + + case RDMA_CM_EVENT_ADDR_RESOLVED: + peer = (kib_peer_t *)cmid->context; + + CDEBUG(D_NET, "%s Addr resolved: %d\n", + libcfs_nid2str(peer->ibp_nid), event->status); + + if (event->status != 0) { + CNETERR("Can't resolve address for %s: %d\n", + libcfs_nid2str(peer->ibp_nid), event->status); + rc = event->status; + } else { + rc = rdma_resolve_route( + cmid, *kiblnd_tunables.kib_timeout * 1000); + if (rc == 0) + return 0; + /* Can't initiate route resolution */ + CERROR("Can't resolve route for %s: %d\n", + libcfs_nid2str(peer->ibp_nid), rc); + } + kiblnd_peer_connect_failed(peer, 1, rc); + kiblnd_peer_decref(peer); + return rc; /* rc != 0 destroys cmid */ + + case RDMA_CM_EVENT_ROUTE_ERROR: + peer = (kib_peer_t *)cmid->context; + CNETERR("%s: ROUTE ERROR %d\n", + libcfs_nid2str(peer->ibp_nid), event->status); + kiblnd_peer_connect_failed(peer, 1, -EHOSTUNREACH); + kiblnd_peer_decref(peer); + return -EHOSTUNREACH; /* rc != 0 destroys cmid */ + + case RDMA_CM_EVENT_ROUTE_RESOLVED: + peer = (kib_peer_t *)cmid->context; + CDEBUG(D_NET, "%s Route resolved: %d\n", + libcfs_nid2str(peer->ibp_nid), event->status); + + if (event->status == 0) + return kiblnd_active_connect(cmid); + + CNETERR("Can't resolve route for %s: %d\n", + libcfs_nid2str(peer->ibp_nid), event->status); + kiblnd_peer_connect_failed(peer, 1, event->status); + kiblnd_peer_decref(peer); + return event->status; /* rc != 0 destroys cmid */ + + case RDMA_CM_EVENT_UNREACHABLE: + conn = (kib_conn_t *)cmid->context; + LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT || + conn->ibc_state == IBLND_CONN_PASSIVE_WAIT); + CNETERR("%s: UNREACHABLE %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), 
event->status); + kiblnd_connreq_done(conn, -ENETDOWN); + kiblnd_conn_decref(conn); + return 0; + + case RDMA_CM_EVENT_CONNECT_ERROR: + conn = (kib_conn_t *)cmid->context; + LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT || + conn->ibc_state == IBLND_CONN_PASSIVE_WAIT); + CNETERR("%s: CONNECT ERROR %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), event->status); + kiblnd_connreq_done(conn, -ENOTCONN); + kiblnd_conn_decref(conn); + return 0; + + case RDMA_CM_EVENT_REJECTED: + conn = (kib_conn_t *)cmid->context; + switch (conn->ibc_state) { + default: + LBUG(); + + case IBLND_CONN_PASSIVE_WAIT: + CERROR("%s: REJECTED %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + event->status); + kiblnd_connreq_done(conn, -ECONNRESET); + break; + + case IBLND_CONN_ACTIVE_CONNECT: + kiblnd_rejected(conn, event->status, + (void *)KIBLND_CONN_PARAM(event), + KIBLND_CONN_PARAM_LEN(event)); + break; + } + kiblnd_conn_decref(conn); + return 0; + + case RDMA_CM_EVENT_ESTABLISHED: + conn = (kib_conn_t *)cmid->context; + switch (conn->ibc_state) { + default: + LBUG(); + + case IBLND_CONN_PASSIVE_WAIT: + CDEBUG(D_NET, "ESTABLISHED (passive): %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + kiblnd_connreq_done(conn, 0); + break; + + case IBLND_CONN_ACTIVE_CONNECT: + CDEBUG(D_NET, "ESTABLISHED(active): %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + kiblnd_check_connreply(conn, + (void *)KIBLND_CONN_PARAM(event), + KIBLND_CONN_PARAM_LEN(event)); + break; + } + /* net keeps its ref on conn! */ + return 0; + + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + CDEBUG(D_NET, "Ignore TIMEWAIT_EXIT event\n"); + return 0; + case RDMA_CM_EVENT_DISCONNECTED: + conn = (kib_conn_t *)cmid->context; + if (conn->ibc_state < IBLND_CONN_ESTABLISHED) { + CERROR("%s DISCONNECTED\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + kiblnd_connreq_done(conn, -ECONNRESET); + } else { + kiblnd_close_conn(conn, 0); + } + kiblnd_conn_decref(conn); + cmid->context = NULL; + return 0; + + case RDMA_CM_EVENT_DEVICE_REMOVAL: + LCONSOLE_ERROR_MSG(0x131, + "Received notification of device removal\n" + "Please shutdown LNET to allow this to proceed\n"); + /* Can't remove network from underneath LNET for now, so I have + * to ignore this */ + return 0; + + case RDMA_CM_EVENT_ADDR_CHANGE: + LCONSOLE_INFO("Physical link changed (eg hca/port)\n"); + return 0; + } +} + +static int +kiblnd_check_txs_locked(kib_conn_t *conn, struct list_head *txs) +{ + kib_tx_t *tx; + struct list_head *ttmp; + + list_for_each(ttmp, txs) { + tx = list_entry(ttmp, kib_tx_t, tx_list); + + if (txs != &conn->ibc_active_txs) { + LASSERT(tx->tx_queued); + } else { + LASSERT(!tx->tx_queued); + LASSERT(tx->tx_waiting || tx->tx_sending != 0); + } + + if (cfs_time_aftereq(jiffies, tx->tx_deadline)) { + CERROR("Timed out tx: %s, %lu seconds\n", + kiblnd_queue2str(conn, txs), + cfs_duration_sec(jiffies - tx->tx_deadline)); + return 1; + } + } + + return 0; +} + +static int +kiblnd_conn_timed_out_locked(kib_conn_t *conn) +{ + return kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue) || + kiblnd_check_txs_locked(conn, &conn->ibc_tx_noops) || + kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue_rsrvd) || + kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue_nocred) || + kiblnd_check_txs_locked(conn, &conn->ibc_active_txs); +} + +static void +kiblnd_check_conns(int idx) +{ + LIST_HEAD(closes); + LIST_HEAD(checksends); + struct list_head *peers = &kiblnd_data.kib_peers[idx]; + struct list_head *ptmp; + kib_peer_t *peer; + kib_conn_t *conn; + kib_conn_t *tmp; + struct list_head *ctmp; + 
unsigned long flags; + + /* NB. We expect to have a look at all the peers and not find any + * RDMAs to time out, so we just use a shared lock while we + * take a look... */ + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + list_for_each(ptmp, peers) { + peer = list_entry(ptmp, kib_peer_t, ibp_list); + + list_for_each(ctmp, &peer->ibp_conns) { + int timedout; + int sendnoop; + + conn = list_entry(ctmp, kib_conn_t, ibc_list); + + LASSERT(conn->ibc_state == IBLND_CONN_ESTABLISHED); + + spin_lock(&conn->ibc_lock); + + sendnoop = kiblnd_need_noop(conn); + timedout = kiblnd_conn_timed_out_locked(conn); + if (!sendnoop && !timedout) { + spin_unlock(&conn->ibc_lock); + continue; + } + + if (timedout) { + CERROR("Timed out RDMA with %s (%lu): c: %u, oc: %u, rc: %u\n", + libcfs_nid2str(peer->ibp_nid), + cfs_duration_sec(cfs_time_current() - + peer->ibp_last_alive), + conn->ibc_credits, + conn->ibc_outstanding_credits, + conn->ibc_reserved_credits); + list_add(&conn->ibc_connd_list, &closes); + } else { + list_add(&conn->ibc_connd_list, + &checksends); + } + /* +ref for 'closes' or 'checksends' */ + kiblnd_conn_addref(conn); + + spin_unlock(&conn->ibc_lock); + } + } + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + /* Handle timeout by closing the whole + * connection. We can only be sure RDMA activity + * has ceased once the QP has been modified. */ + list_for_each_entry_safe(conn, tmp, &closes, ibc_connd_list) { + list_del(&conn->ibc_connd_list); + kiblnd_close_conn(conn, -ETIMEDOUT); + kiblnd_conn_decref(conn); + } + + /* In case we have enough credits to return via a + * NOOP, but there were no non-blocking tx descs + * free to do it last time... */ + while (!list_empty(&checksends)) { + conn = list_entry(checksends.next, + kib_conn_t, ibc_connd_list); + list_del(&conn->ibc_connd_list); + kiblnd_check_sends(conn); + kiblnd_conn_decref(conn); + } +} + +static void +kiblnd_disconnect_conn(kib_conn_t *conn) +{ + LASSERT(!in_interrupt()); + LASSERT(current == kiblnd_data.kib_connd); + LASSERT(conn->ibc_state == IBLND_CONN_CLOSING); + + rdma_disconnect(conn->ibc_cmid); + kiblnd_finalise_conn(conn); + + kiblnd_peer_notify(conn->ibc_peer); +} + +int +kiblnd_connd(void *arg) +{ + wait_queue_t wait; + unsigned long flags; + kib_conn_t *conn; + int timeout; + int i; + int dropped_lock; + int peer_index = 0; + unsigned long deadline = jiffies; + + cfs_block_allsigs(); + + init_waitqueue_entry(&wait, current); + kiblnd_data.kib_connd = current; + + spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags); + + while (!kiblnd_data.kib_shutdown) { + + dropped_lock = 0; + + if (!list_empty(&kiblnd_data.kib_connd_zombies)) { + conn = list_entry(kiblnd_data. \ + kib_connd_zombies.next, + kib_conn_t, ibc_list); + list_del(&conn->ibc_list); + + spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, + flags); + dropped_lock = 1; + + kiblnd_destroy_conn(conn); + + spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags); + } + + if (!list_empty(&kiblnd_data.kib_connd_conns)) { + conn = list_entry(kiblnd_data.kib_connd_conns.next, + kib_conn_t, ibc_list); + list_del(&conn->ibc_list); + + spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, + flags); + dropped_lock = 1; + + kiblnd_disconnect_conn(conn); + kiblnd_conn_decref(conn); + + spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags); + } + + /* careful with the jiffy wrap... 
*/ + timeout = (int)(deadline - jiffies); + if (timeout <= 0) { + const int n = 4; + const int p = 1; + int chunk = kiblnd_data.kib_peer_hash_size; + + spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags); + dropped_lock = 1; + + /* Time to check for RDMA timeouts on a few more + * peers: I do checks every 'p' seconds on a + * proportion of the peer table and I need to check + * every connection 'n' times within a timeout + * interval, to ensure I detect a timeout on any + * connection within (n+1)/n times the timeout + * interval. */ + + if (*kiblnd_tunables.kib_timeout > n * p) + chunk = (chunk * n * p) / + *kiblnd_tunables.kib_timeout; + if (chunk == 0) + chunk = 1; + + for (i = 0; i < chunk; i++) { + kiblnd_check_conns(peer_index); + peer_index = (peer_index + 1) % + kiblnd_data.kib_peer_hash_size; + } + + deadline += p * HZ; + spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags); + } + + if (dropped_lock) + continue; + + /* Nothing to do for 'timeout' */ + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&kiblnd_data.kib_connd_waitq, &wait); + spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags); + + schedule_timeout(timeout); + + remove_wait_queue(&kiblnd_data.kib_connd_waitq, &wait); + spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags); + } + + spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags); + + kiblnd_thread_fini(); + return 0; +} + +void +kiblnd_qp_event(struct ib_event *event, void *arg) +{ + kib_conn_t *conn = arg; + + switch (event->event) { + case IB_EVENT_COMM_EST: + CDEBUG(D_NET, "%s established\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + return; + + default: + CERROR("%s: Async QP event type %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event); + return; + } +} + +static void +kiblnd_complete(struct ib_wc *wc) +{ + switch (kiblnd_wreqid2type(wc->wr_id)) { + default: + LBUG(); + + case IBLND_WID_RDMA: + /* We only get RDMA completion notification if it fails. All + * subsequent work items, including the final SEND will fail + * too. However we can't print out any more info about the + * failing RDMA because 'tx' might be back on the idle list or + * even reused already if we didn't manage to post all our work + * items */ + CNETERR("RDMA (tx: %p) failed: %d\n", + kiblnd_wreqid2ptr(wc->wr_id), wc->status); + return; + + case IBLND_WID_TX: + kiblnd_tx_complete(kiblnd_wreqid2ptr(wc->wr_id), wc->status); + return; + + case IBLND_WID_RX: + kiblnd_rx_complete(kiblnd_wreqid2ptr(wc->wr_id), wc->status, + wc->byte_len); + return; + } +} + +void +kiblnd_cq_completion(struct ib_cq *cq, void *arg) +{ + /* NB I'm not allowed to schedule this conn once its refcount has + * reached 0. Since fundamentally I'm racing with scheduler threads + * consuming my CQ I could be called after all completions have + * occurred. But in this case, ibc_nrx == 0 && ibc_nsends_posted == 0 + * and this CQ is about to be destroyed so I NOOP. 
*/ + kib_conn_t *conn = (kib_conn_t *)arg; + struct kib_sched_info *sched = conn->ibc_sched; + unsigned long flags; + + LASSERT(cq == conn->ibc_cq); + + spin_lock_irqsave(&sched->ibs_lock, flags); + + conn->ibc_ready = 1; + + if (!conn->ibc_scheduled && + (conn->ibc_nrx > 0 || + conn->ibc_nsends_posted > 0)) { + kiblnd_conn_addref(conn); /* +1 ref for sched_conns */ + conn->ibc_scheduled = 1; + list_add_tail(&conn->ibc_sched_list, &sched->ibs_conns); + + if (waitqueue_active(&sched->ibs_waitq)) + wake_up(&sched->ibs_waitq); + } + + spin_unlock_irqrestore(&sched->ibs_lock, flags); +} + +void +kiblnd_cq_event(struct ib_event *event, void *arg) +{ + kib_conn_t *conn = arg; + + CERROR("%s: async CQ event type %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event); +} + +int +kiblnd_scheduler(void *arg) +{ + long id = (long)arg; + struct kib_sched_info *sched; + kib_conn_t *conn; + wait_queue_t wait; + unsigned long flags; + struct ib_wc wc; + int did_something; + int busy_loops = 0; + int rc; + + cfs_block_allsigs(); + + init_waitqueue_entry(&wait, current); + + sched = kiblnd_data.kib_scheds[KIB_THREAD_CPT(id)]; + + rc = cfs_cpt_bind(lnet_cpt_table(), sched->ibs_cpt); + if (rc != 0) { + CWARN("Failed to bind on CPT %d, please verify whether all CPUs are healthy and reload modules if necessary, otherwise your system might under risk of low performance\n", + sched->ibs_cpt); + } + + spin_lock_irqsave(&sched->ibs_lock, flags); + + while (!kiblnd_data.kib_shutdown) { + if (busy_loops++ >= IBLND_RESCHED) { + spin_unlock_irqrestore(&sched->ibs_lock, flags); + + cond_resched(); + busy_loops = 0; + + spin_lock_irqsave(&sched->ibs_lock, flags); + } + + did_something = 0; + + if (!list_empty(&sched->ibs_conns)) { + conn = list_entry(sched->ibs_conns.next, + kib_conn_t, ibc_sched_list); + /* take over kib_sched_conns' ref on conn... */ + LASSERT(conn->ibc_scheduled); + list_del(&conn->ibc_sched_list); + conn->ibc_ready = 0; + + spin_unlock_irqrestore(&sched->ibs_lock, flags); + + rc = ib_poll_cq(conn->ibc_cq, 1, &wc); + if (rc == 0) { + rc = ib_req_notify_cq(conn->ibc_cq, + IB_CQ_NEXT_COMP); + if (rc < 0) { + CWARN("%s: ib_req_notify_cq failed: %d, closing connection\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), rc); + kiblnd_close_conn(conn, -EIO); + kiblnd_conn_decref(conn); + spin_lock_irqsave(&sched->ibs_lock, + flags); + continue; + } + + rc = ib_poll_cq(conn->ibc_cq, 1, &wc); + } + + if (rc < 0) { + CWARN("%s: ib_poll_cq failed: %d, closing connection\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + rc); + kiblnd_close_conn(conn, -EIO); + kiblnd_conn_decref(conn); + spin_lock_irqsave(&sched->ibs_lock, flags); + continue; + } + + spin_lock_irqsave(&sched->ibs_lock, flags); + + if (rc != 0 || conn->ibc_ready) { + /* There may be another completion waiting; get + * another scheduler to check while I handle + * this one... 
*/ + /* +1 ref for sched_conns */ + kiblnd_conn_addref(conn); + list_add_tail(&conn->ibc_sched_list, + &sched->ibs_conns); + if (waitqueue_active(&sched->ibs_waitq)) + wake_up(&sched->ibs_waitq); + } else { + conn->ibc_scheduled = 0; + } + + if (rc != 0) { + spin_unlock_irqrestore(&sched->ibs_lock, flags); + kiblnd_complete(&wc); + + spin_lock_irqsave(&sched->ibs_lock, flags); + } + + kiblnd_conn_decref(conn); /* ...drop my ref from above */ + did_something = 1; + } + + if (did_something) + continue; + + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue_exclusive(&sched->ibs_waitq, &wait); + spin_unlock_irqrestore(&sched->ibs_lock, flags); + + schedule(); + busy_loops = 0; + + remove_wait_queue(&sched->ibs_waitq, &wait); + spin_lock_irqsave(&sched->ibs_lock, flags); + } + + spin_unlock_irqrestore(&sched->ibs_lock, flags); + + kiblnd_thread_fini(); + return 0; +} + +int +kiblnd_failover_thread(void *arg) +{ + rwlock_t *glock = &kiblnd_data.kib_global_lock; + kib_dev_t *dev; + wait_queue_t wait; + unsigned long flags; + int rc; + + LASSERT(*kiblnd_tunables.kib_dev_failover != 0); + + cfs_block_allsigs(); + + init_waitqueue_entry(&wait, current); + write_lock_irqsave(glock, flags); + + while (!kiblnd_data.kib_shutdown) { + int do_failover = 0; + int long_sleep; + + list_for_each_entry(dev, &kiblnd_data.kib_failed_devs, + ibd_fail_list) { + if (time_before(cfs_time_current(), + dev->ibd_next_failover)) + continue; + do_failover = 1; + break; + } + + if (do_failover) { + list_del_init(&dev->ibd_fail_list); + dev->ibd_failover = 1; + write_unlock_irqrestore(glock, flags); + + rc = kiblnd_dev_failover(dev); + + write_lock_irqsave(glock, flags); + + LASSERT(dev->ibd_failover); + dev->ibd_failover = 0; + if (rc >= 0) { /* Device is OK or failover succeed */ + dev->ibd_next_failover = cfs_time_shift(3); + continue; + } + + /* failed to failover, retry later */ + dev->ibd_next_failover = + cfs_time_shift(min(dev->ibd_failed_failover, 10)); + if (kiblnd_dev_can_failover(dev)) { + list_add_tail(&dev->ibd_fail_list, + &kiblnd_data.kib_failed_devs); + } + + continue; + } + + /* long sleep if no more pending failover */ + long_sleep = list_empty(&kiblnd_data.kib_failed_devs); + + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&kiblnd_data.kib_failover_waitq, &wait); + write_unlock_irqrestore(glock, flags); + + rc = schedule_timeout(long_sleep ? cfs_time_seconds(10) : + cfs_time_seconds(1)); + remove_wait_queue(&kiblnd_data.kib_failover_waitq, &wait); + write_lock_irqsave(glock, flags); + + if (!long_sleep || rc != 0) + continue; + + /* have a long sleep, routine check all active devices, + * we need checking like this because if there is not active + * connection on the dev and no SEND from local, we may listen + * on wrong HCA for ever while there is a bonding failover */ + list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) { + if (kiblnd_dev_can_failover(dev)) { + list_add_tail(&dev->ibd_fail_list, + &kiblnd_data.kib_failed_devs); + } + } + } + + write_unlock_irqrestore(glock, flags); + + kiblnd_thread_fini(); + return 0; +} diff --git a/kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c b/kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c new file mode 100644 index 000000000..eedf01afd --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c @@ -0,0 +1,230 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/klnds/o2iblnd/o2iblnd_modparams.c + * + * Author: Eric Barton + */ + +#include "o2iblnd.h" + +static int service = 987; +module_param(service, int, 0444); +MODULE_PARM_DESC(service, "service number (within RDMA_PS_TCP)"); + +static int cksum; +module_param(cksum, int, 0644); +MODULE_PARM_DESC(cksum, "set non-zero to enable message (not RDMA) checksums"); + +static int timeout = 50; +module_param(timeout, int, 0644); +MODULE_PARM_DESC(timeout, "timeout (seconds)"); + +/* Number of threads in each scheduler pool which is percpt, + * we will estimate reasonable value based on CPUs if it's set to zero. 
*/ +static int nscheds; +module_param(nscheds, int, 0444); +MODULE_PARM_DESC(nscheds, "number of threads in each scheduler pool"); + +/* NB: this value is shared by all CPTs, it can grow at runtime */ +static int ntx = 512; +module_param(ntx, int, 0444); +MODULE_PARM_DESC(ntx, "# of message descriptors allocated for each pool"); + +/* NB: this value is shared by all CPTs */ +static int credits = 256; +module_param(credits, int, 0444); +MODULE_PARM_DESC(credits, "# concurrent sends"); + +static int peer_credits = 8; +module_param(peer_credits, int, 0444); +MODULE_PARM_DESC(peer_credits, "# concurrent sends to 1 peer"); + +static int peer_credits_hiw; +module_param(peer_credits_hiw, int, 0444); +MODULE_PARM_DESC(peer_credits_hiw, "when eagerly to return credits"); + +static int peer_buffer_credits; +module_param(peer_buffer_credits, int, 0444); +MODULE_PARM_DESC(peer_buffer_credits, "# per-peer router buffer credits"); + +static int peer_timeout = 180; +module_param(peer_timeout, int, 0444); +MODULE_PARM_DESC(peer_timeout, "Seconds without aliveness news to declare peer dead (<=0 to disable)"); + +static char *ipif_name = "ib0"; +module_param(ipif_name, charp, 0444); +MODULE_PARM_DESC(ipif_name, "IPoIB interface name"); + +static int retry_count = 5; +module_param(retry_count, int, 0644); +MODULE_PARM_DESC(retry_count, "Retransmissions when no ACK received"); + +static int rnr_retry_count = 6; +module_param(rnr_retry_count, int, 0644); +MODULE_PARM_DESC(rnr_retry_count, "RNR retransmissions"); + +static int keepalive = 100; +module_param(keepalive, int, 0644); +MODULE_PARM_DESC(keepalive, "Idle time in seconds before sending a keepalive"); + +static int ib_mtu; +module_param(ib_mtu, int, 0444); +MODULE_PARM_DESC(ib_mtu, "IB MTU 256/512/1024/2048/4096"); + +static int concurrent_sends; +module_param(concurrent_sends, int, 0444); +MODULE_PARM_DESC(concurrent_sends, "send work-queue sizing"); + +static int map_on_demand; +module_param(map_on_demand, int, 0444); +MODULE_PARM_DESC(map_on_demand, "map on demand"); + +/* NB: this value is shared by all CPTs, it can grow at runtime */ +static int fmr_pool_size = 512; +module_param(fmr_pool_size, int, 0444); +MODULE_PARM_DESC(fmr_pool_size, "size of fmr pool on each CPT (>= ntx / 4)"); + +/* NB: this value is shared by all CPTs, it can grow at runtime */ +static int fmr_flush_trigger = 384; +module_param(fmr_flush_trigger, int, 0444); +MODULE_PARM_DESC(fmr_flush_trigger, "# dirty FMRs that triggers pool flush"); + +static int fmr_cache = 1; +module_param(fmr_cache, int, 0444); +MODULE_PARM_DESC(fmr_cache, "non-zero to enable FMR caching"); + +/* NB: this value is shared by all CPTs, it can grow at runtime */ +static int pmr_pool_size = 512; +module_param(pmr_pool_size, int, 0444); +MODULE_PARM_DESC(pmr_pool_size, "size of MR cache pmr pool on each CPT"); + +/* + * 0: disable failover + * 1: enable failover if necessary + * 2: force to failover (for debug) + */ +static int dev_failover; +module_param(dev_failover, int, 0444); +MODULE_PARM_DESC(dev_failover, "HCA failover for bonding (0 off, 1 on, other values reserved)"); + + +static int require_privileged_port; +module_param(require_privileged_port, int, 0644); +MODULE_PARM_DESC(require_privileged_port, "require privileged port when accepting connection"); + +static int use_privileged_port = 1; +module_param(use_privileged_port, int, 0644); +MODULE_PARM_DESC(use_privileged_port, "use privileged port when initiating connection"); + +kib_tunables_t kiblnd_tunables = { + .kib_dev_failover = 
&dev_failover, + .kib_service = &service, + .kib_cksum = &cksum, + .kib_timeout = &timeout, + .kib_keepalive = &keepalive, + .kib_ntx = &ntx, + .kib_credits = &credits, + .kib_peertxcredits = &peer_credits, + .kib_peercredits_hiw = &peer_credits_hiw, + .kib_peerrtrcredits = &peer_buffer_credits, + .kib_peertimeout = &peer_timeout, + .kib_default_ipif = &ipif_name, + .kib_retry_count = &retry_count, + .kib_rnr_retry_count = &rnr_retry_count, + .kib_concurrent_sends = &concurrent_sends, + .kib_ib_mtu = &ib_mtu, + .kib_map_on_demand = &map_on_demand, + .kib_fmr_pool_size = &fmr_pool_size, + .kib_fmr_flush_trigger = &fmr_flush_trigger, + .kib_fmr_cache = &fmr_cache, + .kib_pmr_pool_size = &pmr_pool_size, + .kib_require_priv_port = &require_privileged_port, + .kib_use_priv_port = &use_privileged_port, + .kib_nscheds = &nscheds +}; + +int +kiblnd_tunables_init(void) +{ + if (kiblnd_translate_mtu(*kiblnd_tunables.kib_ib_mtu) < 0) { + CERROR("Invalid ib_mtu %d, expected 256/512/1024/2048/4096\n", + *kiblnd_tunables.kib_ib_mtu); + return -EINVAL; + } + + if (*kiblnd_tunables.kib_peertxcredits < IBLND_CREDITS_DEFAULT) + *kiblnd_tunables.kib_peertxcredits = IBLND_CREDITS_DEFAULT; + + if (*kiblnd_tunables.kib_peertxcredits > IBLND_CREDITS_MAX) + *kiblnd_tunables.kib_peertxcredits = IBLND_CREDITS_MAX; + + if (*kiblnd_tunables.kib_peertxcredits > *kiblnd_tunables.kib_credits) + *kiblnd_tunables.kib_peertxcredits = *kiblnd_tunables.kib_credits; + + if (*kiblnd_tunables.kib_peercredits_hiw < *kiblnd_tunables.kib_peertxcredits / 2) + *kiblnd_tunables.kib_peercredits_hiw = *kiblnd_tunables.kib_peertxcredits / 2; + + if (*kiblnd_tunables.kib_peercredits_hiw >= *kiblnd_tunables.kib_peertxcredits) + *kiblnd_tunables.kib_peercredits_hiw = *kiblnd_tunables.kib_peertxcredits - 1; + + if (*kiblnd_tunables.kib_map_on_demand < 0 || + *kiblnd_tunables.kib_map_on_demand > IBLND_MAX_RDMA_FRAGS) + *kiblnd_tunables.kib_map_on_demand = 0; /* disable map-on-demand */ + + if (*kiblnd_tunables.kib_map_on_demand == 1) + *kiblnd_tunables.kib_map_on_demand = 2; /* don't make sense to create map if only one fragment */ + + if (*kiblnd_tunables.kib_concurrent_sends == 0) { + if (*kiblnd_tunables.kib_map_on_demand > 0 && + *kiblnd_tunables.kib_map_on_demand <= IBLND_MAX_RDMA_FRAGS / 8) + *kiblnd_tunables.kib_concurrent_sends = (*kiblnd_tunables.kib_peertxcredits) * 2; + else + *kiblnd_tunables.kib_concurrent_sends = (*kiblnd_tunables.kib_peertxcredits); + } + + if (*kiblnd_tunables.kib_concurrent_sends > *kiblnd_tunables.kib_peertxcredits * 2) + *kiblnd_tunables.kib_concurrent_sends = *kiblnd_tunables.kib_peertxcredits * 2; + + if (*kiblnd_tunables.kib_concurrent_sends < *kiblnd_tunables.kib_peertxcredits / 2) + *kiblnd_tunables.kib_concurrent_sends = *kiblnd_tunables.kib_peertxcredits / 2; + + if (*kiblnd_tunables.kib_concurrent_sends < *kiblnd_tunables.kib_peertxcredits) { + CWARN("Concurrent sends %d is lower than message queue size: %d, performance may drop slightly.\n", + *kiblnd_tunables.kib_concurrent_sends, *kiblnd_tunables.kib_peertxcredits); + } + + return 0; +} diff --git a/kernel/drivers/staging/lustre/lnet/klnds/socklnd/Makefile b/kernel/drivers/staging/lustre/lnet/klnds/socklnd/Makefile new file mode 100644 index 000000000..f3fb8778c --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/klnds/socklnd/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_LNET) += ksocklnd.o + +ksocklnd-y := socklnd.o socklnd_cb.o socklnd_proto.o socklnd_modparams.o socklnd_lib-linux.o diff --git 
a/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c b/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c new file mode 100644 index 000000000..7586b7e40 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c @@ -0,0 +1,2886 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/klnds/socklnd/socklnd.c + * + * Author: Zach Brown + * Author: Peter J. Braam + * Author: Phil Schwan + * Author: Eric Barton + */ + +#include "socklnd.h" + +static lnd_t the_ksocklnd; +ksock_nal_data_t ksocknal_data; + +static ksock_interface_t * +ksocknal_ip2iface(lnet_ni_t *ni, __u32 ip) +{ + ksock_net_t *net = ni->ni_data; + int i; + ksock_interface_t *iface; + + for (i = 0; i < net->ksnn_ninterfaces; i++) { + LASSERT(i < LNET_MAX_INTERFACES); + iface = &net->ksnn_interfaces[i]; + + if (iface->ksni_ipaddr == ip) + return iface; + } + + return NULL; +} + +static ksock_route_t * +ksocknal_create_route(__u32 ipaddr, int port) +{ + ksock_route_t *route; + + LIBCFS_ALLOC(route, sizeof(*route)); + if (route == NULL) + return NULL; + + atomic_set(&route->ksnr_refcount, 1); + route->ksnr_peer = NULL; + route->ksnr_retry_interval = 0; /* OK to connect at any time */ + route->ksnr_ipaddr = ipaddr; + route->ksnr_port = port; + route->ksnr_scheduled = 0; + route->ksnr_connecting = 0; + route->ksnr_connected = 0; + route->ksnr_deleted = 0; + route->ksnr_conn_count = 0; + route->ksnr_share_count = 0; + + return route; +} + +void +ksocknal_destroy_route(ksock_route_t *route) +{ + LASSERT(atomic_read(&route->ksnr_refcount) == 0); + + if (route->ksnr_peer != NULL) + ksocknal_peer_decref(route->ksnr_peer); + + LIBCFS_FREE(route, sizeof(*route)); +} + +static int +ksocknal_create_peer(ksock_peer_t **peerp, lnet_ni_t *ni, lnet_process_id_t id) +{ + ksock_net_t *net = ni->ni_data; + ksock_peer_t *peer; + + LASSERT(id.nid != LNET_NID_ANY); + LASSERT(id.pid != LNET_PID_ANY); + LASSERT(!in_interrupt()); + + LIBCFS_ALLOC(peer, sizeof(*peer)); + if (peer == NULL) + return -ENOMEM; + + peer->ksnp_ni = ni; + peer->ksnp_id = id; + atomic_set(&peer->ksnp_refcount, 1); /* 1 ref for caller */ + peer->ksnp_closing = 0; + peer->ksnp_accepting = 0; + peer->ksnp_proto = NULL; + peer->ksnp_last_alive = 0; + peer->ksnp_zc_next_cookie = 
SOCKNAL_KEEPALIVE_PING + 1; + + INIT_LIST_HEAD(&peer->ksnp_conns); + INIT_LIST_HEAD(&peer->ksnp_routes); + INIT_LIST_HEAD(&peer->ksnp_tx_queue); + INIT_LIST_HEAD(&peer->ksnp_zc_req_list); + spin_lock_init(&peer->ksnp_lock); + + spin_lock_bh(&net->ksnn_lock); + + if (net->ksnn_shutdown) { + spin_unlock_bh(&net->ksnn_lock); + + LIBCFS_FREE(peer, sizeof(*peer)); + CERROR("Can't create peer: network shutdown\n"); + return -ESHUTDOWN; + } + + net->ksnn_npeers++; + + spin_unlock_bh(&net->ksnn_lock); + + *peerp = peer; + return 0; +} + +void +ksocknal_destroy_peer(ksock_peer_t *peer) +{ + ksock_net_t *net = peer->ksnp_ni->ni_data; + + CDEBUG(D_NET, "peer %s %p deleted\n", + libcfs_id2str(peer->ksnp_id), peer); + + LASSERT(atomic_read(&peer->ksnp_refcount) == 0); + LASSERT(peer->ksnp_accepting == 0); + LASSERT(list_empty(&peer->ksnp_conns)); + LASSERT(list_empty(&peer->ksnp_routes)); + LASSERT(list_empty(&peer->ksnp_tx_queue)); + LASSERT(list_empty(&peer->ksnp_zc_req_list)); + + LIBCFS_FREE(peer, sizeof(*peer)); + + /* NB a peer's connections and routes keep a reference on their peer + * until they are destroyed, so we can be assured that _all_ state to + * do with this peer has been cleaned up when its refcount drops to + * zero. */ + spin_lock_bh(&net->ksnn_lock); + net->ksnn_npeers--; + spin_unlock_bh(&net->ksnn_lock); +} + +ksock_peer_t * +ksocknal_find_peer_locked(lnet_ni_t *ni, lnet_process_id_t id) +{ + struct list_head *peer_list = ksocknal_nid2peerlist(id.nid); + struct list_head *tmp; + ksock_peer_t *peer; + + list_for_each(tmp, peer_list) { + + peer = list_entry(tmp, ksock_peer_t, ksnp_list); + + LASSERT(!peer->ksnp_closing); + + if (peer->ksnp_ni != ni) + continue; + + if (peer->ksnp_id.nid != id.nid || + peer->ksnp_id.pid != id.pid) + continue; + + CDEBUG(D_NET, "got peer [%p] -> %s (%d)\n", + peer, libcfs_id2str(id), + atomic_read(&peer->ksnp_refcount)); + return peer; + } + return NULL; +} + +ksock_peer_t * +ksocknal_find_peer(lnet_ni_t *ni, lnet_process_id_t id) +{ + ksock_peer_t *peer; + + read_lock(&ksocknal_data.ksnd_global_lock); + peer = ksocknal_find_peer_locked(ni, id); + if (peer != NULL) /* +1 ref for caller? */ + ksocknal_peer_addref(peer); + read_unlock(&ksocknal_data.ksnd_global_lock); + + return peer; +} + +static void +ksocknal_unlink_peer_locked(ksock_peer_t *peer) +{ + int i; + __u32 ip; + ksock_interface_t *iface; + + for (i = 0; i < peer->ksnp_n_passive_ips; i++) { + LASSERT(i < LNET_MAX_INTERFACES); + ip = peer->ksnp_passive_ips[i]; + + iface = ksocknal_ip2iface(peer->ksnp_ni, ip); + /* All IPs in peer->ksnp_passive_ips[] come from the + * interface list, therefore the call must succeed. 
*/ + LASSERT(iface != NULL); + + CDEBUG(D_NET, "peer=%p iface=%p ksni_nroutes=%d\n", + peer, iface, iface->ksni_nroutes); + iface->ksni_npeers--; + } + + LASSERT(list_empty(&peer->ksnp_conns)); + LASSERT(list_empty(&peer->ksnp_routes)); + LASSERT(!peer->ksnp_closing); + peer->ksnp_closing = 1; + list_del(&peer->ksnp_list); + /* lose peerlist's ref */ + ksocknal_peer_decref(peer); +} + +static int +ksocknal_get_peer_info(lnet_ni_t *ni, int index, + lnet_process_id_t *id, __u32 *myip, __u32 *peer_ip, + int *port, int *conn_count, int *share_count) +{ + ksock_peer_t *peer; + struct list_head *ptmp; + ksock_route_t *route; + struct list_head *rtmp; + int i; + int j; + int rc = -ENOENT; + + read_lock(&ksocknal_data.ksnd_global_lock); + + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { + + list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) { + peer = list_entry(ptmp, ksock_peer_t, ksnp_list); + + if (peer->ksnp_ni != ni) + continue; + + if (peer->ksnp_n_passive_ips == 0 && + list_empty(&peer->ksnp_routes)) { + if (index-- > 0) + continue; + + *id = peer->ksnp_id; + *myip = 0; + *peer_ip = 0; + *port = 0; + *conn_count = 0; + *share_count = 0; + rc = 0; + goto out; + } + + for (j = 0; j < peer->ksnp_n_passive_ips; j++) { + if (index-- > 0) + continue; + + *id = peer->ksnp_id; + *myip = peer->ksnp_passive_ips[j]; + *peer_ip = 0; + *port = 0; + *conn_count = 0; + *share_count = 0; + rc = 0; + goto out; + } + + list_for_each(rtmp, &peer->ksnp_routes) { + if (index-- > 0) + continue; + + route = list_entry(rtmp, ksock_route_t, + ksnr_list); + + *id = peer->ksnp_id; + *myip = route->ksnr_myipaddr; + *peer_ip = route->ksnr_ipaddr; + *port = route->ksnr_port; + *conn_count = route->ksnr_conn_count; + *share_count = route->ksnr_share_count; + rc = 0; + goto out; + } + } + } + out: + read_unlock(&ksocknal_data.ksnd_global_lock); + return rc; +} + +static void +ksocknal_associate_route_conn_locked(ksock_route_t *route, ksock_conn_t *conn) +{ + ksock_peer_t *peer = route->ksnr_peer; + int type = conn->ksnc_type; + ksock_interface_t *iface; + + conn->ksnc_route = route; + ksocknal_route_addref(route); + + if (route->ksnr_myipaddr != conn->ksnc_myipaddr) { + if (route->ksnr_myipaddr == 0) { + /* route wasn't bound locally yet (the initial route) */ + CDEBUG(D_NET, "Binding %s %pI4h to %pI4h\n", + libcfs_id2str(peer->ksnp_id), + &route->ksnr_ipaddr, + &conn->ksnc_myipaddr); + } else { + CDEBUG(D_NET, "Rebinding %s %pI4h from %pI4h to %pI4h\n", + libcfs_id2str(peer->ksnp_id), + &route->ksnr_ipaddr, + &route->ksnr_myipaddr, + &conn->ksnc_myipaddr); + + iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni, + route->ksnr_myipaddr); + if (iface != NULL) + iface->ksni_nroutes--; + } + route->ksnr_myipaddr = conn->ksnc_myipaddr; + iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni, + route->ksnr_myipaddr); + if (iface != NULL) + iface->ksni_nroutes++; + } + + route->ksnr_connected |= (1<<type); + route->ksnr_conn_count++; + + /* Successful connection => further attempts can + * proceed immediately */ + route->ksnr_retry_interval = 0; +} + +static void +ksocknal_add_route_locked(ksock_peer_t *peer, ksock_route_t *route) +{ + struct list_head *tmp; + ksock_conn_t *conn; + ksock_route_t *route2; + + LASSERT(!peer->ksnp_closing); + LASSERT(route->ksnr_peer == NULL); + LASSERT(!route->ksnr_scheduled); + LASSERT(!route->ksnr_connecting); + LASSERT(route->ksnr_connected == 0); + + /* LASSERT(unique) */ + list_for_each(tmp, &peer->ksnp_routes) { + route2 = list_entry(tmp, ksock_route_t, ksnr_list); + + if (route2->ksnr_ipaddr ==
route->ksnr_ipaddr) { + CERROR("Duplicate route %s %pI4h\n", + libcfs_id2str(peer->ksnp_id), + &route->ksnr_ipaddr); + LBUG(); + } + } + + route->ksnr_peer = peer; + ksocknal_peer_addref(peer); + /* peer's routelist takes over my ref on 'route' */ + list_add_tail(&route->ksnr_list, &peer->ksnp_routes); + + list_for_each(tmp, &peer->ksnp_conns) { + conn = list_entry(tmp, ksock_conn_t, ksnc_list); + + if (conn->ksnc_ipaddr != route->ksnr_ipaddr) + continue; + + ksocknal_associate_route_conn_locked(route, conn); + /* keep going (typed routes) */ + } +} + +static void +ksocknal_del_route_locked(ksock_route_t *route) +{ + ksock_peer_t *peer = route->ksnr_peer; + ksock_interface_t *iface; + ksock_conn_t *conn; + struct list_head *ctmp; + struct list_head *cnxt; + + LASSERT(!route->ksnr_deleted); + + /* Close associated conns */ + list_for_each_safe(ctmp, cnxt, &peer->ksnp_conns) { + conn = list_entry(ctmp, ksock_conn_t, ksnc_list); + + if (conn->ksnc_route != route) + continue; + + ksocknal_close_conn_locked(conn, 0); + } + + if (route->ksnr_myipaddr != 0) { + iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni, + route->ksnr_myipaddr); + if (iface != NULL) + iface->ksni_nroutes--; + } + + route->ksnr_deleted = 1; + list_del(&route->ksnr_list); + ksocknal_route_decref(route); /* drop peer's ref */ + + if (list_empty(&peer->ksnp_routes) && + list_empty(&peer->ksnp_conns)) { + /* I've just removed the last route to a peer with no active + * connections */ + ksocknal_unlink_peer_locked(peer); + } +} + +int +ksocknal_add_peer(lnet_ni_t *ni, lnet_process_id_t id, __u32 ipaddr, int port) +{ + struct list_head *tmp; + ksock_peer_t *peer; + ksock_peer_t *peer2; + ksock_route_t *route; + ksock_route_t *route2; + int rc; + + if (id.nid == LNET_NID_ANY || + id.pid == LNET_PID_ANY) + return -EINVAL; + + /* Have a brand new peer ready... 
*/ + rc = ksocknal_create_peer(&peer, ni, id); + if (rc != 0) + return rc; + + route = ksocknal_create_route(ipaddr, port); + if (route == NULL) { + ksocknal_peer_decref(peer); + return -ENOMEM; + } + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + /* always called with a ref on ni, so shutdown can't have started */ + LASSERT(((ksock_net_t *) ni->ni_data)->ksnn_shutdown == 0); + + peer2 = ksocknal_find_peer_locked(ni, id); + if (peer2 != NULL) { + ksocknal_peer_decref(peer); + peer = peer2; + } else { + /* peer table takes my ref on peer */ + list_add_tail(&peer->ksnp_list, + ksocknal_nid2peerlist(id.nid)); + } + + route2 = NULL; + list_for_each(tmp, &peer->ksnp_routes) { + route2 = list_entry(tmp, ksock_route_t, ksnr_list); + + if (route2->ksnr_ipaddr == ipaddr) + break; + + route2 = NULL; + } + if (route2 == NULL) { + ksocknal_add_route_locked(peer, route); + route->ksnr_share_count++; + } else { + ksocknal_route_decref(route); + route2->ksnr_share_count++; + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + return 0; +} + +static void +ksocknal_del_peer_locked(ksock_peer_t *peer, __u32 ip) +{ + ksock_conn_t *conn; + ksock_route_t *route; + struct list_head *tmp; + struct list_head *nxt; + int nshared; + + LASSERT(!peer->ksnp_closing); + + /* Extra ref prevents peer disappearing until I'm done with it */ + ksocknal_peer_addref(peer); + + list_for_each_safe(tmp, nxt, &peer->ksnp_routes) { + route = list_entry(tmp, ksock_route_t, ksnr_list); + + /* no match */ + if (!(ip == 0 || route->ksnr_ipaddr == ip)) + continue; + + route->ksnr_share_count = 0; + /* This deletes associated conns too */ + ksocknal_del_route_locked(route); + } + + nshared = 0; + list_for_each_safe(tmp, nxt, &peer->ksnp_routes) { + route = list_entry(tmp, ksock_route_t, ksnr_list); + nshared += route->ksnr_share_count; + } + + if (nshared == 0) { + /* remove everything else if there are no explicit entries + * left */ + + list_for_each_safe(tmp, nxt, &peer->ksnp_routes) { + route = list_entry(tmp, ksock_route_t, ksnr_list); + + /* we should only be removing auto-entries */ + LASSERT(route->ksnr_share_count == 0); + ksocknal_del_route_locked(route); + } + + list_for_each_safe(tmp, nxt, &peer->ksnp_conns) { + conn = list_entry(tmp, ksock_conn_t, ksnc_list); + + ksocknal_close_conn_locked(conn, 0); + } + } + + ksocknal_peer_decref(peer); + /* NB peer unlinks itself when last conn/route is removed */ +} + +static int +ksocknal_del_peer(lnet_ni_t *ni, lnet_process_id_t id, __u32 ip) +{ + LIST_HEAD(zombies); + struct list_head *ptmp; + struct list_head *pnxt; + ksock_peer_t *peer; + int lo; + int hi; + int i; + int rc = -ENOENT; + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + if (id.nid != LNET_NID_ANY) + lo = hi = (int)(ksocknal_nid2peerlist(id.nid) - ksocknal_data.ksnd_peers); + else { + lo = 0; + hi = ksocknal_data.ksnd_peer_hash_size - 1; + } + + for (i = lo; i <= hi; i++) { + list_for_each_safe(ptmp, pnxt, + &ksocknal_data.ksnd_peers[i]) { + peer = list_entry(ptmp, ksock_peer_t, ksnp_list); + + if (peer->ksnp_ni != ni) + continue; + + if (!((id.nid == LNET_NID_ANY || peer->ksnp_id.nid == id.nid) && + (id.pid == LNET_PID_ANY || peer->ksnp_id.pid == id.pid))) + continue; + + ksocknal_peer_addref(peer); /* a ref for me... 
*/ + + ksocknal_del_peer_locked(peer, ip); + + if (peer->ksnp_closing && + !list_empty(&peer->ksnp_tx_queue)) { + LASSERT(list_empty(&peer->ksnp_conns)); + LASSERT(list_empty(&peer->ksnp_routes)); + + list_splice_init(&peer->ksnp_tx_queue, + &zombies); + } + + ksocknal_peer_decref(peer); /* ...till here */ + + rc = 0; /* matched! */ + } + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + ksocknal_txlist_done(ni, &zombies, 1); + + return rc; +} + +static ksock_conn_t * +ksocknal_get_conn_by_idx(lnet_ni_t *ni, int index) +{ + ksock_peer_t *peer; + struct list_head *ptmp; + ksock_conn_t *conn; + struct list_head *ctmp; + int i; + + read_lock(&ksocknal_data.ksnd_global_lock); + + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { + list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) { + peer = list_entry(ptmp, ksock_peer_t, ksnp_list); + + LASSERT(!peer->ksnp_closing); + + if (peer->ksnp_ni != ni) + continue; + + list_for_each(ctmp, &peer->ksnp_conns) { + if (index-- > 0) + continue; + + conn = list_entry(ctmp, ksock_conn_t, + ksnc_list); + ksocknal_conn_addref(conn); + read_unlock(&ksocknal_data.ksnd_global_lock); + return conn; + } + } + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + return NULL; +} + +static ksock_sched_t * +ksocknal_choose_scheduler_locked(unsigned int cpt) +{ + struct ksock_sched_info *info = ksocknal_data.ksnd_sched_info[cpt]; + ksock_sched_t *sched; + int i; + + LASSERT(info->ksi_nthreads > 0); + + sched = &info->ksi_scheds[0]; + /* + * NB: it's safe so far, but info->ksi_nthreads could be changed + * at runtime when we have dynamic LNet configuration, then we + * need to take care of this. + */ + for (i = 1; i < info->ksi_nthreads; i++) { + if (sched->kss_nconns > info->ksi_scheds[i].kss_nconns) + sched = &info->ksi_scheds[i]; + } + + return sched; +} + +static int +ksocknal_local_ipvec(lnet_ni_t *ni, __u32 *ipaddrs) +{ + ksock_net_t *net = ni->ni_data; + int i; + int nip; + + read_lock(&ksocknal_data.ksnd_global_lock); + + nip = net->ksnn_ninterfaces; + LASSERT(nip <= LNET_MAX_INTERFACES); + + /* Only offer interfaces for additional connections if I have + * more than one. */ + if (nip < 2) { + read_unlock(&ksocknal_data.ksnd_global_lock); + return 0; + } + + for (i = 0; i < nip; i++) { + ipaddrs[i] = net->ksnn_interfaces[i].ksni_ipaddr; + LASSERT(ipaddrs[i] != 0); + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + return nip; +} + +static int +ksocknal_match_peerip(ksock_interface_t *iface, __u32 *ips, int nips) +{ + int best_netmatch = 0; + int best_xor = 0; + int best = -1; + int this_xor; + int this_netmatch; + int i; + + for (i = 0; i < nips; i++) { + if (ips[i] == 0) + continue; + + this_xor = ips[i] ^ iface->ksni_ipaddr; + this_netmatch = ((this_xor & iface->ksni_netmask) == 0) ? 1 : 0; + + if (!(best < 0 || + best_netmatch < this_netmatch || + (best_netmatch == this_netmatch && + best_xor > this_xor))) + continue; + + best = i; + best_netmatch = this_netmatch; + best_xor = this_xor; + } + + LASSERT(best >= 0); + return best; +} + +static int +ksocknal_select_ips(ksock_peer_t *peer, __u32 *peerips, int n_peerips) +{ + rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; + ksock_net_t *net = peer->ksnp_ni->ni_data; + ksock_interface_t *iface; + ksock_interface_t *best_iface; + int n_ips; + int i; + int j; + int k; + __u32 ip; + __u32 xor; + int this_netmatch; + int best_netmatch; + int best_npeers; + + /* CAVEAT EMPTOR: We do all our interface matching with an + * exclusive hold of global lock at IRQ priority. 
We're only + * expecting to be dealing with small numbers of interfaces, so the + * O(n**3)-ness shouldn't matter */ + + /* Also note that I'm not going to return more than n_peerips + * interfaces, even if I have more myself */ + + write_lock_bh(global_lock); + + LASSERT(n_peerips <= LNET_MAX_INTERFACES); + LASSERT(net->ksnn_ninterfaces <= LNET_MAX_INTERFACES); + + /* Only match interfaces for additional connections + * if I have > 1 interface */ + n_ips = (net->ksnn_ninterfaces < 2) ? 0 : + min(n_peerips, net->ksnn_ninterfaces); + + for (i = 0; peer->ksnp_n_passive_ips < n_ips; i++) { + /* ^ yes really... */ + + /* If we have any new interfaces, first tick off all the + * peer IPs that match old interfaces, then choose new + * interfaces to match the remaining peer IPS. + * We don't forget interfaces we've stopped using; we might + * start using them again... */ + + if (i < peer->ksnp_n_passive_ips) { + /* Old interface. */ + ip = peer->ksnp_passive_ips[i]; + best_iface = ksocknal_ip2iface(peer->ksnp_ni, ip); + + } else { + /* choose a new interface */ + LASSERT(i == peer->ksnp_n_passive_ips); + + best_iface = NULL; + best_netmatch = 0; + best_npeers = 0; + + for (j = 0; j < net->ksnn_ninterfaces; j++) { + iface = &net->ksnn_interfaces[j]; + ip = iface->ksni_ipaddr; + + for (k = 0; k < peer->ksnp_n_passive_ips; k++) + if (peer->ksnp_passive_ips[k] == ip) + break; + + if (k < peer->ksnp_n_passive_ips) /* using it already */ + continue; + + k = ksocknal_match_peerip(iface, peerips, n_peerips); + xor = ip ^ peerips[k]; + this_netmatch = ((xor & iface->ksni_netmask) == 0) ? 1 : 0; + + if (!(best_iface == NULL || + best_netmatch < this_netmatch || + (best_netmatch == this_netmatch && + best_npeers > iface->ksni_npeers))) + continue; + + best_iface = iface; + best_netmatch = this_netmatch; + best_npeers = iface->ksni_npeers; + } + + best_iface->ksni_npeers++; + ip = best_iface->ksni_ipaddr; + peer->ksnp_passive_ips[i] = ip; + peer->ksnp_n_passive_ips = i+1; + } + + /* mark the best matching peer IP used */ + j = ksocknal_match_peerip(best_iface, peerips, n_peerips); + peerips[j] = 0; + } + + /* Overwrite input peer IP addresses */ + memcpy(peerips, peer->ksnp_passive_ips, n_ips * sizeof(*peerips)); + + write_unlock_bh(global_lock); + + return n_ips; +} + +static void +ksocknal_create_routes(ksock_peer_t *peer, int port, + __u32 *peer_ipaddrs, int npeer_ipaddrs) +{ + ksock_route_t *newroute = NULL; + rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; + lnet_ni_t *ni = peer->ksnp_ni; + ksock_net_t *net = ni->ni_data; + struct list_head *rtmp; + ksock_route_t *route; + ksock_interface_t *iface; + ksock_interface_t *best_iface; + int best_netmatch; + int this_netmatch; + int best_nroutes; + int i; + int j; + + /* CAVEAT EMPTOR: We do all our interface matching with an + * exclusive hold of global lock at IRQ priority. 
We're only + * expecting to be dealing with small numbers of interfaces, so the + * O(n**3)-ness here shouldn't matter */ + + write_lock_bh(global_lock); + + if (net->ksnn_ninterfaces < 2) { + /* Only create additional connections + * if I have > 1 interface */ + write_unlock_bh(global_lock); + return; + } + + LASSERT(npeer_ipaddrs <= LNET_MAX_INTERFACES); + + for (i = 0; i < npeer_ipaddrs; i++) { + if (newroute != NULL) { + newroute->ksnr_ipaddr = peer_ipaddrs[i]; + } else { + write_unlock_bh(global_lock); + + newroute = ksocknal_create_route(peer_ipaddrs[i], port); + if (newroute == NULL) + return; + + write_lock_bh(global_lock); + } + + if (peer->ksnp_closing) { + /* peer got closed under me */ + break; + } + + /* Already got a route? */ + route = NULL; + list_for_each(rtmp, &peer->ksnp_routes) { + route = list_entry(rtmp, ksock_route_t, ksnr_list); + + if (route->ksnr_ipaddr == newroute->ksnr_ipaddr) + break; + + route = NULL; + } + if (route != NULL) + continue; + + best_iface = NULL; + best_nroutes = 0; + best_netmatch = 0; + + LASSERT(net->ksnn_ninterfaces <= LNET_MAX_INTERFACES); + + /* Select interface to connect from */ + for (j = 0; j < net->ksnn_ninterfaces; j++) { + iface = &net->ksnn_interfaces[j]; + + /* Using this interface already? */ + list_for_each(rtmp, &peer->ksnp_routes) { + route = list_entry(rtmp, ksock_route_t, + ksnr_list); + + if (route->ksnr_myipaddr == iface->ksni_ipaddr) + break; + + route = NULL; + } + if (route != NULL) + continue; + + this_netmatch = (((iface->ksni_ipaddr ^ + newroute->ksnr_ipaddr) & + iface->ksni_netmask) == 0) ? 1 : 0; + + if (!(best_iface == NULL || + best_netmatch < this_netmatch || + (best_netmatch == this_netmatch && + best_nroutes > iface->ksni_nroutes))) + continue; + + best_iface = iface; + best_netmatch = this_netmatch; + best_nroutes = iface->ksni_nroutes; + } + + if (best_iface == NULL) + continue; + + newroute->ksnr_myipaddr = best_iface->ksni_ipaddr; + best_iface->ksni_nroutes++; + + ksocknal_add_route_locked(peer, newroute); + newroute = NULL; + } + + write_unlock_bh(global_lock); + if (newroute != NULL) + ksocknal_route_decref(newroute); +} + +int +ksocknal_accept(lnet_ni_t *ni, struct socket *sock) +{ + ksock_connreq_t *cr; + int rc; + __u32 peer_ip; + int peer_port; + + rc = libcfs_sock_getaddr(sock, 1, &peer_ip, &peer_port); + LASSERT(rc == 0); /* we succeeded before */ + + LIBCFS_ALLOC(cr, sizeof(*cr)); + if (cr == NULL) { + LCONSOLE_ERROR_MSG(0x12f, "Dropping connection request from %pI4h: memory exhausted\n", + &peer_ip); + return -ENOMEM; + } + + lnet_ni_addref(ni); + cr->ksncr_ni = ni; + cr->ksncr_sock = sock; + + spin_lock_bh(&ksocknal_data.ksnd_connd_lock); + + list_add_tail(&cr->ksncr_list, &ksocknal_data.ksnd_connd_connreqs); + wake_up(&ksocknal_data.ksnd_connd_waitq); + + spin_unlock_bh(&ksocknal_data.ksnd_connd_lock); + return 0; +} + +static int +ksocknal_connecting(ksock_peer_t *peer, __u32 ipaddr) +{ + ksock_route_t *route; + + list_for_each_entry(route, &peer->ksnp_routes, ksnr_list) { + + if (route->ksnr_ipaddr == ipaddr) + return route->ksnr_connecting; + } + return 0; +} + +int +ksocknal_create_conn(lnet_ni_t *ni, ksock_route_t *route, + struct socket *sock, int type) +{ + rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; + LIST_HEAD(zombies); + lnet_process_id_t peerid; + struct list_head *tmp; + __u64 incarnation; + ksock_conn_t *conn; + ksock_conn_t *conn2; + ksock_peer_t *peer = NULL; + ksock_peer_t *peer2; + ksock_sched_t *sched; + ksock_hello_msg_t *hello; + int cpt; + ksock_tx_t *tx; + 
ksock_tx_t *txtmp; + int rc; + int active; + char *warn = NULL; + + active = (route != NULL); + + LASSERT(active == (type != SOCKLND_CONN_NONE)); + + LIBCFS_ALLOC(conn, sizeof(*conn)); + if (conn == NULL) { + rc = -ENOMEM; + goto failed_0; + } + + conn->ksnc_peer = NULL; + conn->ksnc_route = NULL; + conn->ksnc_sock = sock; + /* 2 ref, 1 for conn, another extra ref prevents socket + * being closed before establishment of connection */ + atomic_set(&conn->ksnc_sock_refcount, 2); + conn->ksnc_type = type; + ksocknal_lib_save_callback(sock, conn); + atomic_set(&conn->ksnc_conn_refcount, 1); /* 1 ref for me */ + + conn->ksnc_rx_ready = 0; + conn->ksnc_rx_scheduled = 0; + + INIT_LIST_HEAD(&conn->ksnc_tx_queue); + conn->ksnc_tx_ready = 0; + conn->ksnc_tx_scheduled = 0; + conn->ksnc_tx_carrier = NULL; + atomic_set(&conn->ksnc_tx_nob, 0); + + LIBCFS_ALLOC(hello, offsetof(ksock_hello_msg_t, + kshm_ips[LNET_MAX_INTERFACES])); + if (hello == NULL) { + rc = -ENOMEM; + goto failed_1; + } + + /* stash conn's local and remote addrs */ + rc = ksocknal_lib_get_conn_addrs(conn); + if (rc != 0) + goto failed_1; + + /* Find out/confirm peer's NID and connection type and get the + * vector of interfaces she's willing to let me connect to. + * Passive connections use the listener timeout since the peer sends + * eagerly */ + + if (active) { + peer = route->ksnr_peer; + LASSERT(ni == peer->ksnp_ni); + + /* Active connection sends HELLO eagerly */ + hello->kshm_nips = ksocknal_local_ipvec(ni, hello->kshm_ips); + peerid = peer->ksnp_id; + + write_lock_bh(global_lock); + conn->ksnc_proto = peer->ksnp_proto; + write_unlock_bh(global_lock); + + if (conn->ksnc_proto == NULL) { + conn->ksnc_proto = &ksocknal_protocol_v3x; +#if SOCKNAL_VERSION_DEBUG + if (*ksocknal_tunables.ksnd_protocol == 2) + conn->ksnc_proto = &ksocknal_protocol_v2x; + else if (*ksocknal_tunables.ksnd_protocol == 1) + conn->ksnc_proto = &ksocknal_protocol_v1x; +#endif + } + + rc = ksocknal_send_hello(ni, conn, peerid.nid, hello); + if (rc != 0) + goto failed_1; + } else { + peerid.nid = LNET_NID_ANY; + peerid.pid = LNET_PID_ANY; + + /* Passive, get protocol from peer */ + conn->ksnc_proto = NULL; + } + + rc = ksocknal_recv_hello(ni, conn, hello, &peerid, &incarnation); + if (rc < 0) + goto failed_1; + + LASSERT(rc == 0 || active); + LASSERT(conn->ksnc_proto != NULL); + LASSERT(peerid.nid != LNET_NID_ANY); + + cpt = lnet_cpt_of_nid(peerid.nid); + + if (active) { + ksocknal_peer_addref(peer); + write_lock_bh(global_lock); + } else { + rc = ksocknal_create_peer(&peer, ni, peerid); + if (rc != 0) + goto failed_1; + + write_lock_bh(global_lock); + + /* called with a ref on ni, so shutdown can't have started */ + LASSERT(((ksock_net_t *) ni->ni_data)->ksnn_shutdown == 0); + + peer2 = ksocknal_find_peer_locked(ni, peerid); + if (peer2 == NULL) { + /* NB this puts an "empty" peer in the peer + * table (which takes my ref) */ + list_add_tail(&peer->ksnp_list, + ksocknal_nid2peerlist(peerid.nid)); + } else { + ksocknal_peer_decref(peer); + peer = peer2; + } + + /* +1 ref for me */ + ksocknal_peer_addref(peer); + peer->ksnp_accepting++; + + /* Am I already connecting to this guy? Resolve in + * favour of higher NID... 
*/ + if (peerid.nid < ni->ni_nid && + ksocknal_connecting(peer, conn->ksnc_ipaddr)) { + rc = EALREADY; + warn = "connection race resolution"; + goto failed_2; + } + } + + if (peer->ksnp_closing || + (active && route->ksnr_deleted)) { + /* peer/route got closed under me */ + rc = -ESTALE; + warn = "peer/route removed"; + goto failed_2; + } + + if (peer->ksnp_proto == NULL) { + /* Never connected before. + * NB recv_hello may have returned EPROTO to signal my peer + * wants a different protocol than the one I asked for. + */ + LASSERT(list_empty(&peer->ksnp_conns)); + + peer->ksnp_proto = conn->ksnc_proto; + peer->ksnp_incarnation = incarnation; + } + + if (peer->ksnp_proto != conn->ksnc_proto || + peer->ksnp_incarnation != incarnation) { + /* Peer rebooted or I've got the wrong protocol version */ + ksocknal_close_peer_conns_locked(peer, 0, 0); + + peer->ksnp_proto = NULL; + rc = ESTALE; + warn = peer->ksnp_incarnation != incarnation ? + "peer rebooted" : + "wrong proto version"; + goto failed_2; + } + + switch (rc) { + default: + LBUG(); + case 0: + break; + case EALREADY: + warn = "lost conn race"; + goto failed_2; + case EPROTO: + warn = "retry with different protocol version"; + goto failed_2; + } + + /* Refuse to duplicate an existing connection, unless this is a + * loopback connection */ + if (conn->ksnc_ipaddr != conn->ksnc_myipaddr) { + list_for_each(tmp, &peer->ksnp_conns) { + conn2 = list_entry(tmp, ksock_conn_t, ksnc_list); + + if (conn2->ksnc_ipaddr != conn->ksnc_ipaddr || + conn2->ksnc_myipaddr != conn->ksnc_myipaddr || + conn2->ksnc_type != conn->ksnc_type) + continue; + + /* Reply on a passive connection attempt so the peer + * realises we're connected. */ + LASSERT(rc == 0); + if (!active) + rc = EALREADY; + + warn = "duplicate"; + goto failed_2; + } + } + + /* If the connection created by this route didn't bind to the IP + * address the route connected to, the connection/route matching + * code below probably isn't going to work. */ + if (active && + route->ksnr_ipaddr != conn->ksnc_ipaddr) { + CERROR("Route %s %pI4h connected to %pI4h\n", + libcfs_id2str(peer->ksnp_id), + &route->ksnr_ipaddr, + &conn->ksnc_ipaddr); + } + + /* Search for a route corresponding to the new connection and + * create an association. This allows incoming connections created + * by routes in my peer to match my own route entries so I don't + * continually create duplicate routes. */ + list_for_each(tmp, &peer->ksnp_routes) { + route = list_entry(tmp, ksock_route_t, ksnr_list); + + if (route->ksnr_ipaddr != conn->ksnc_ipaddr) + continue; + + ksocknal_associate_route_conn_locked(route, conn); + break; + } + + conn->ksnc_peer = peer; /* conn takes my ref on peer */ + peer->ksnp_last_alive = cfs_time_current(); + peer->ksnp_send_keepalive = 0; + peer->ksnp_error = 0; + + sched = ksocknal_choose_scheduler_locked(cpt); + sched->kss_nconns++; + conn->ksnc_scheduler = sched; + + conn->ksnc_tx_last_post = cfs_time_current(); + /* Set the deadline for the outgoing HELLO to drain */ + conn->ksnc_tx_bufnob = sock->sk->sk_wmem_queued; + conn->ksnc_tx_deadline = cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + mb(); /* order with adding to peer's conn list */ + + list_add(&conn->ksnc_list, &peer->ksnp_conns); + ksocknal_conn_addref(conn); + + ksocknal_new_packet(conn, 0); + + conn->ksnc_zc_capable = ksocknal_lib_zc_capable(conn); + + /* Take packets blocking for this connection. 
*/ + list_for_each_entry_safe(tx, txtmp, &peer->ksnp_tx_queue, tx_list) { + if (conn->ksnc_proto->pro_match_tx(conn, tx, tx->tx_nonblk) == SOCKNAL_MATCH_NO) + continue; + + list_del(&tx->tx_list); + ksocknal_queue_tx_locked(tx, conn); + } + + write_unlock_bh(global_lock); + + /* We've now got a new connection. Any errors from here on are just + * like "normal" comms errors and we close the connection normally. + * NB (a) we still have to send the reply HELLO for passive + * connections, + * (b) normal I/O on the conn is blocked until I setup and call the + * socket callbacks. + */ + + CDEBUG(D_NET, "New conn %s p %d.x %pI4h -> %pI4h/%d incarnation:%lld sched[%d:%d]\n", + libcfs_id2str(peerid), conn->ksnc_proto->pro_version, + &conn->ksnc_myipaddr, &conn->ksnc_ipaddr, + conn->ksnc_port, incarnation, cpt, + (int)(sched - &sched->kss_info->ksi_scheds[0])); + + if (active) { + /* additional routes after interface exchange? */ + ksocknal_create_routes(peer, conn->ksnc_port, + hello->kshm_ips, hello->kshm_nips); + } else { + hello->kshm_nips = ksocknal_select_ips(peer, hello->kshm_ips, + hello->kshm_nips); + rc = ksocknal_send_hello(ni, conn, peerid.nid, hello); + } + + LIBCFS_FREE(hello, offsetof(ksock_hello_msg_t, + kshm_ips[LNET_MAX_INTERFACES])); + + /* setup the socket AFTER I've received hello (it disables + * SO_LINGER). I might call back to the acceptor who may want + * to send a protocol version response and then close the + * socket; this ensures the socket only tears down after the + * response has been sent. */ + if (rc == 0) + rc = ksocknal_lib_setup_sock(sock); + + write_lock_bh(global_lock); + + /* NB my callbacks block while I hold ksnd_global_lock */ + ksocknal_lib_set_callback(sock, conn); + + if (!active) + peer->ksnp_accepting--; + + write_unlock_bh(global_lock); + + if (rc != 0) { + write_lock_bh(global_lock); + if (!conn->ksnc_closing) { + /* could be closed by another thread */ + ksocknal_close_conn_locked(conn, rc); + } + write_unlock_bh(global_lock); + } else if (ksocknal_connsock_addref(conn) == 0) { + /* Allow I/O to proceed. 
*/ + ksocknal_read_callback(conn); + ksocknal_write_callback(conn); + ksocknal_connsock_decref(conn); + } + + ksocknal_connsock_decref(conn); + ksocknal_conn_decref(conn); + return rc; + + failed_2: + if (!peer->ksnp_closing && + list_empty(&peer->ksnp_conns) && + list_empty(&peer->ksnp_routes)) { + list_add(&zombies, &peer->ksnp_tx_queue); + list_del_init(&peer->ksnp_tx_queue); + ksocknal_unlink_peer_locked(peer); + } + + write_unlock_bh(global_lock); + + if (warn != NULL) { + if (rc < 0) + CERROR("Not creating conn %s type %d: %s\n", + libcfs_id2str(peerid), conn->ksnc_type, warn); + else + CDEBUG(D_NET, "Not creating conn %s type %d: %s\n", + libcfs_id2str(peerid), conn->ksnc_type, warn); + } + + if (!active) { + if (rc > 0) { + /* Request retry by replying with CONN_NONE + * ksnc_proto has been set already */ + conn->ksnc_type = SOCKLND_CONN_NONE; + hello->kshm_nips = 0; + ksocknal_send_hello(ni, conn, peerid.nid, hello); + } + + write_lock_bh(global_lock); + peer->ksnp_accepting--; + write_unlock_bh(global_lock); + } + + ksocknal_txlist_done(ni, &zombies, 1); + ksocknal_peer_decref(peer); + + failed_1: + if (hello != NULL) + LIBCFS_FREE(hello, offsetof(ksock_hello_msg_t, + kshm_ips[LNET_MAX_INTERFACES])); + + LIBCFS_FREE(conn, sizeof(*conn)); + + failed_0: + libcfs_sock_release(sock); + return rc; +} + +void +ksocknal_close_conn_locked(ksock_conn_t *conn, int error) +{ + /* This just does the immmediate housekeeping, and queues the + * connection for the reaper to terminate. + * Caller holds ksnd_global_lock exclusively in irq context */ + ksock_peer_t *peer = conn->ksnc_peer; + ksock_route_t *route; + ksock_conn_t *conn2; + struct list_head *tmp; + + LASSERT(peer->ksnp_error == 0); + LASSERT(!conn->ksnc_closing); + conn->ksnc_closing = 1; + + /* ksnd_deathrow_conns takes over peer's ref */ + list_del(&conn->ksnc_list); + + route = conn->ksnc_route; + if (route != NULL) { + /* dissociate conn from route... 
*/ + LASSERT(!route->ksnr_deleted); + LASSERT((route->ksnr_connected & (1 << conn->ksnc_type)) != 0); + + conn2 = NULL; + list_for_each(tmp, &peer->ksnp_conns) { + conn2 = list_entry(tmp, ksock_conn_t, ksnc_list); + + if (conn2->ksnc_route == route && + conn2->ksnc_type == conn->ksnc_type) + break; + + conn2 = NULL; + } + if (conn2 == NULL) + route->ksnr_connected &= ~(1 << conn->ksnc_type); + + conn->ksnc_route = NULL; + +#if 0 /* irrelevant with only eager routes */ + /* make route least favourite */ + list_del(&route->ksnr_list); + list_add_tail(&route->ksnr_list, &peer->ksnp_routes); +#endif + ksocknal_route_decref(route); /* drop conn's ref on route */ + } + + if (list_empty(&peer->ksnp_conns)) { + /* No more connections to this peer */ + + if (!list_empty(&peer->ksnp_tx_queue)) { + ksock_tx_t *tx; + + LASSERT(conn->ksnc_proto == &ksocknal_protocol_v3x); + + /* throw them to the last connection..., + * these TXs will be send to /dev/null by scheduler */ + list_for_each_entry(tx, &peer->ksnp_tx_queue, + tx_list) + ksocknal_tx_prep(conn, tx); + + spin_lock_bh(&conn->ksnc_scheduler->kss_lock); + list_splice_init(&peer->ksnp_tx_queue, + &conn->ksnc_tx_queue); + spin_unlock_bh(&conn->ksnc_scheduler->kss_lock); + } + + peer->ksnp_proto = NULL; /* renegotiate protocol version */ + peer->ksnp_error = error; /* stash last conn close reason */ + + if (list_empty(&peer->ksnp_routes)) { + /* I've just closed last conn belonging to a + * peer with no routes to it */ + ksocknal_unlink_peer_locked(peer); + } + } + + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + + list_add_tail(&conn->ksnc_list, + &ksocknal_data.ksnd_deathrow_conns); + wake_up(&ksocknal_data.ksnd_reaper_waitq); + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); +} + +void +ksocknal_peer_failed(ksock_peer_t *peer) +{ + int notify = 0; + unsigned long last_alive = 0; + + /* There has been a connection failure or comms error; but I'll only + * tell LNET I think the peer is dead if it's to another kernel and + * there are no connections or connection attempts in existence. */ + + read_lock(&ksocknal_data.ksnd_global_lock); + + if ((peer->ksnp_id.pid & LNET_PID_USERFLAG) == 0 && + list_empty(&peer->ksnp_conns) && + peer->ksnp_accepting == 0 && + ksocknal_find_connecting_route_locked(peer) == NULL) { + notify = 1; + last_alive = peer->ksnp_last_alive; + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + + if (notify) + lnet_notify(peer->ksnp_ni, peer->ksnp_id.nid, 0, + last_alive); +} + +void +ksocknal_finalize_zcreq(ksock_conn_t *conn) +{ + ksock_peer_t *peer = conn->ksnc_peer; + ksock_tx_t *tx; + ksock_tx_t *tmp; + LIST_HEAD(zlist); + + /* NB safe to finalize TXs because closing of socket will + * abort all buffered data */ + LASSERT(conn->ksnc_sock == NULL); + + spin_lock(&peer->ksnp_lock); + + list_for_each_entry_safe(tx, tmp, &peer->ksnp_zc_req_list, tx_zc_list) { + if (tx->tx_conn != conn) + continue; + + LASSERT(tx->tx_msg.ksm_zc_cookies[0] != 0); + + tx->tx_msg.ksm_zc_cookies[0] = 0; + tx->tx_zc_aborted = 1; /* mark it as not-acked */ + list_del(&tx->tx_zc_list); + list_add(&tx->tx_zc_list, &zlist); + } + + spin_unlock(&peer->ksnp_lock); + + while (!list_empty(&zlist)) { + tx = list_entry(zlist.next, ksock_tx_t, tx_zc_list); + + list_del(&tx->tx_zc_list); + ksocknal_tx_decref(tx); + } +} + +void +ksocknal_terminate_conn(ksock_conn_t *conn) +{ + /* This gets called by the reaper (guaranteed thread context) to + * disengage the socket from its callbacks and close it. 
+ * ksnc_refcount will eventually hit zero, and then the reaper will + * destroy it. */ + ksock_peer_t *peer = conn->ksnc_peer; + ksock_sched_t *sched = conn->ksnc_scheduler; + int failed = 0; + + LASSERT(conn->ksnc_closing); + + /* wake up the scheduler to "send" all remaining packets to /dev/null */ + spin_lock_bh(&sched->kss_lock); + + /* a closing conn is always ready to tx */ + conn->ksnc_tx_ready = 1; + + if (!conn->ksnc_tx_scheduled && + !list_empty(&conn->ksnc_tx_queue)) { + list_add_tail(&conn->ksnc_tx_list, + &sched->kss_tx_conns); + conn->ksnc_tx_scheduled = 1; + /* extra ref for scheduler */ + ksocknal_conn_addref(conn); + + wake_up(&sched->kss_waitq); + } + + spin_unlock_bh(&sched->kss_lock); + + /* serialise with callbacks */ + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + ksocknal_lib_reset_callback(conn->ksnc_sock, conn); + + /* OK, so this conn may not be completely disengaged from its + * scheduler yet, but it _has_ committed to terminate... */ + conn->ksnc_scheduler->kss_nconns--; + + if (peer->ksnp_error != 0) { + /* peer's last conn closed in error */ + LASSERT(list_empty(&peer->ksnp_conns)); + failed = 1; + peer->ksnp_error = 0; /* avoid multiple notifications */ + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + if (failed) + ksocknal_peer_failed(peer); + + /* The socket is closed on the final put; either here, or in + * ksocknal_{send,recv}msg(). Since we set up the linger2 option + * when the connection was established, this will close the socket + * immediately, aborting anything buffered in it. Any hung + * zero-copy transmits will therefore complete in finite time. */ + ksocknal_connsock_decref(conn); +} + +void +ksocknal_queue_zombie_conn(ksock_conn_t *conn) +{ + /* Queue the conn for the reaper to destroy */ + + LASSERT(atomic_read(&conn->ksnc_conn_refcount) == 0); + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + + list_add_tail(&conn->ksnc_list, &ksocknal_data.ksnd_zombie_conns); + wake_up(&ksocknal_data.ksnd_reaper_waitq); + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); +} + +void +ksocknal_destroy_conn(ksock_conn_t *conn) +{ + unsigned long last_rcv; + + /* Final coup-de-grace of the reaper */ + CDEBUG(D_NET, "connection %p\n", conn); + + LASSERT(atomic_read(&conn->ksnc_conn_refcount) == 0); + LASSERT(atomic_read(&conn->ksnc_sock_refcount) == 0); + LASSERT(conn->ksnc_sock == NULL); + LASSERT(conn->ksnc_route == NULL); + LASSERT(!conn->ksnc_tx_scheduled); + LASSERT(!conn->ksnc_rx_scheduled); + LASSERT(list_empty(&conn->ksnc_tx_queue)); + + /* complete current receive if any */ + switch (conn->ksnc_rx_state) { + case SOCKNAL_RX_LNET_PAYLOAD: + last_rcv = conn->ksnc_rx_deadline - + cfs_time_seconds(*ksocknal_tunables.ksnd_timeout); + CERROR("Completing partial receive from %s[%d], ip %pI4h:%d, with error, wanted: %d, left: %d, last alive is %ld secs ago\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), conn->ksnc_type, + &conn->ksnc_ipaddr, conn->ksnc_port, + conn->ksnc_rx_nob_wanted, conn->ksnc_rx_nob_left, + cfs_duration_sec(cfs_time_sub(cfs_time_current(), + last_rcv))); + lnet_finalize(conn->ksnc_peer->ksnp_ni, + conn->ksnc_cookie, -EIO); + break; + case SOCKNAL_RX_LNET_HEADER: + if (conn->ksnc_rx_started) + CERROR("Incomplete receive of lnet header from %s, ip %pI4h:%d, with error, protocol: %d.x.\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + &conn->ksnc_ipaddr, conn->ksnc_port, + conn->ksnc_proto->pro_version); + break; + case SOCKNAL_RX_KSM_HEADER: + if (conn->ksnc_rx_started) + CERROR("Incomplete receive of ksock message from 
%s, ip %pI4h:%d, with error, protocol: %d.x.\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + &conn->ksnc_ipaddr, conn->ksnc_port, + conn->ksnc_proto->pro_version); + break; + case SOCKNAL_RX_SLOP: + if (conn->ksnc_rx_started) + CERROR("Incomplete receive of slops from %s, ip %pI4h:%d, with error\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + &conn->ksnc_ipaddr, conn->ksnc_port); + break; + default: + LBUG(); + break; + } + + ksocknal_peer_decref(conn->ksnc_peer); + + LIBCFS_FREE(conn, sizeof(*conn)); +} + +int +ksocknal_close_peer_conns_locked(ksock_peer_t *peer, __u32 ipaddr, int why) +{ + ksock_conn_t *conn; + struct list_head *ctmp; + struct list_head *cnxt; + int count = 0; + + list_for_each_safe(ctmp, cnxt, &peer->ksnp_conns) { + conn = list_entry(ctmp, ksock_conn_t, ksnc_list); + + if (ipaddr == 0 || + conn->ksnc_ipaddr == ipaddr) { + count++; + ksocknal_close_conn_locked(conn, why); + } + } + + return count; +} + +int +ksocknal_close_conn_and_siblings(ksock_conn_t *conn, int why) +{ + ksock_peer_t *peer = conn->ksnc_peer; + __u32 ipaddr = conn->ksnc_ipaddr; + int count; + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + count = ksocknal_close_peer_conns_locked(peer, ipaddr, why); + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + return count; +} + +int +ksocknal_close_matching_conns(lnet_process_id_t id, __u32 ipaddr) +{ + ksock_peer_t *peer; + struct list_head *ptmp; + struct list_head *pnxt; + int lo; + int hi; + int i; + int count = 0; + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + if (id.nid != LNET_NID_ANY) + lo = hi = (int)(ksocknal_nid2peerlist(id.nid) - ksocknal_data.ksnd_peers); + else { + lo = 0; + hi = ksocknal_data.ksnd_peer_hash_size - 1; + } + + for (i = lo; i <= hi; i++) { + list_for_each_safe(ptmp, pnxt, + &ksocknal_data.ksnd_peers[i]) { + + peer = list_entry(ptmp, ksock_peer_t, ksnp_list); + + if (!((id.nid == LNET_NID_ANY || id.nid == peer->ksnp_id.nid) && + (id.pid == LNET_PID_ANY || id.pid == peer->ksnp_id.pid))) + continue; + + count += ksocknal_close_peer_conns_locked(peer, ipaddr, 0); + } + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + /* wildcards always succeed */ + if (id.nid == LNET_NID_ANY || id.pid == LNET_PID_ANY || ipaddr == 0) + return 0; + + if (count == 0) + return -ENOENT; + else + return 0; +} + +void +ksocknal_notify(lnet_ni_t *ni, lnet_nid_t gw_nid, int alive) +{ + /* The router is telling me she's been notified of a change in + * gateway state.... */ + lnet_process_id_t id = {0}; + + id.nid = gw_nid; + id.pid = LNET_PID_ANY; + + CDEBUG(D_NET, "gw %s %s\n", libcfs_nid2str(gw_nid), + alive ? "up" : "down"); + + if (!alive) { + /* If the gateway crashed, close all open connections... */ + ksocknal_close_matching_conns(id, 0); + return; + } + + /* ...otherwise do nothing. We can only establish new connections + * if we have autroutes, and these connect on demand. 
*/ +} + +void +ksocknal_query(lnet_ni_t *ni, lnet_nid_t nid, unsigned long *when) +{ + int connect = 1; + unsigned long last_alive = 0; + unsigned long now = cfs_time_current(); + ksock_peer_t *peer = NULL; + rwlock_t *glock = &ksocknal_data.ksnd_global_lock; + lnet_process_id_t id = {.nid = nid, .pid = LUSTRE_SRV_LNET_PID}; + + read_lock(glock); + + peer = ksocknal_find_peer_locked(ni, id); + if (peer != NULL) { + struct list_head *tmp; + ksock_conn_t *conn; + int bufnob; + + list_for_each(tmp, &peer->ksnp_conns) { + conn = list_entry(tmp, ksock_conn_t, ksnc_list); + bufnob = conn->ksnc_sock->sk->sk_wmem_queued; + + if (bufnob < conn->ksnc_tx_bufnob) { + /* something got ACKed */ + conn->ksnc_tx_deadline = + cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + peer->ksnp_last_alive = now; + conn->ksnc_tx_bufnob = bufnob; + } + } + + last_alive = peer->ksnp_last_alive; + if (ksocknal_find_connectable_route_locked(peer) == NULL) + connect = 0; + } + + read_unlock(glock); + + if (last_alive != 0) + *when = last_alive; + + CDEBUG(D_NET, "Peer %s %p, alive %ld secs ago, connect %d\n", + libcfs_nid2str(nid), peer, + last_alive ? cfs_duration_sec(now - last_alive) : -1, + connect); + + if (!connect) + return; + + ksocknal_add_peer(ni, id, LNET_NIDADDR(nid), lnet_acceptor_port()); + + write_lock_bh(glock); + + peer = ksocknal_find_peer_locked(ni, id); + if (peer != NULL) + ksocknal_launch_all_connections_locked(peer); + + write_unlock_bh(glock); + return; +} + +static void +ksocknal_push_peer(ksock_peer_t *peer) +{ + int index; + int i; + struct list_head *tmp; + ksock_conn_t *conn; + + for (index = 0; ; index++) { + read_lock(&ksocknal_data.ksnd_global_lock); + + i = 0; + conn = NULL; + + list_for_each(tmp, &peer->ksnp_conns) { + if (i++ == index) { + conn = list_entry(tmp, ksock_conn_t, + ksnc_list); + ksocknal_conn_addref(conn); + break; + } + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + + if (conn == NULL) + break; + + ksocknal_lib_push_conn(conn); + ksocknal_conn_decref(conn); + } +} + +static int +ksocknal_push(lnet_ni_t *ni, lnet_process_id_t id) +{ + ksock_peer_t *peer; + struct list_head *tmp; + int index; + int i; + int j; + int rc = -ENOENT; + + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { + for (j = 0; ; j++) { + read_lock(&ksocknal_data.ksnd_global_lock); + + index = 0; + peer = NULL; + + list_for_each(tmp, &ksocknal_data.ksnd_peers[i]) { + peer = list_entry(tmp, ksock_peer_t, + ksnp_list); + + if (!((id.nid == LNET_NID_ANY || + id.nid == peer->ksnp_id.nid) && + (id.pid == LNET_PID_ANY || + id.pid == peer->ksnp_id.pid))) { + peer = NULL; + continue; + } + + if (index++ == j) { + ksocknal_peer_addref(peer); + break; + } + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + + if (peer != NULL) { + rc = 0; + ksocknal_push_peer(peer); + ksocknal_peer_decref(peer); + } + } + + } + + return rc; +} + +static int +ksocknal_add_interface(lnet_ni_t *ni, __u32 ipaddress, __u32 netmask) +{ + ksock_net_t *net = ni->ni_data; + ksock_interface_t *iface; + int rc; + int i; + int j; + struct list_head *ptmp; + ksock_peer_t *peer; + struct list_head *rtmp; + ksock_route_t *route; + + if (ipaddress == 0 || + netmask == 0) + return -EINVAL; + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + iface = ksocknal_ip2iface(ni, ipaddress); + if (iface != NULL) { + /* silently ignore dups */ + rc = 0; + } else if (net->ksnn_ninterfaces == LNET_MAX_INTERFACES) { + rc = -ENOSPC; + } else { + iface = &net->ksnn_interfaces[net->ksnn_ninterfaces++]; + + iface->ksni_ipaddr = ipaddress; + 
iface->ksni_netmask = netmask; + iface->ksni_nroutes = 0; + iface->ksni_npeers = 0; + + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { + list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) { + peer = list_entry(ptmp, ksock_peer_t, + ksnp_list); + + for (j = 0; j < peer->ksnp_n_passive_ips; j++) + if (peer->ksnp_passive_ips[j] == ipaddress) + iface->ksni_npeers++; + + list_for_each(rtmp, &peer->ksnp_routes) { + route = list_entry(rtmp, + ksock_route_t, + ksnr_list); + + if (route->ksnr_myipaddr == ipaddress) + iface->ksni_nroutes++; + } + } + } + + rc = 0; + /* NB only new connections will pay attention to the new interface! */ + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + return rc; +} + +static void +ksocknal_peer_del_interface_locked(ksock_peer_t *peer, __u32 ipaddr) +{ + struct list_head *tmp; + struct list_head *nxt; + ksock_route_t *route; + ksock_conn_t *conn; + int i; + int j; + + for (i = 0; i < peer->ksnp_n_passive_ips; i++) + if (peer->ksnp_passive_ips[i] == ipaddr) { + for (j = i+1; j < peer->ksnp_n_passive_ips; j++) + peer->ksnp_passive_ips[j-1] = + peer->ksnp_passive_ips[j]; + peer->ksnp_n_passive_ips--; + break; + } + + list_for_each_safe(tmp, nxt, &peer->ksnp_routes) { + route = list_entry(tmp, ksock_route_t, ksnr_list); + + if (route->ksnr_myipaddr != ipaddr) + continue; + + if (route->ksnr_share_count != 0) { + /* Manually created; keep, but unbind */ + route->ksnr_myipaddr = 0; + } else { + ksocknal_del_route_locked(route); + } + } + + list_for_each_safe(tmp, nxt, &peer->ksnp_conns) { + conn = list_entry(tmp, ksock_conn_t, ksnc_list); + + if (conn->ksnc_myipaddr == ipaddr) + ksocknal_close_conn_locked(conn, 0); + } +} + +static int +ksocknal_del_interface(lnet_ni_t *ni, __u32 ipaddress) +{ + ksock_net_t *net = ni->ni_data; + int rc = -ENOENT; + struct list_head *tmp; + struct list_head *nxt; + ksock_peer_t *peer; + __u32 this_ip; + int i; + int j; + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + for (i = 0; i < net->ksnn_ninterfaces; i++) { + this_ip = net->ksnn_interfaces[i].ksni_ipaddr; + + if (!(ipaddress == 0 || + ipaddress == this_ip)) + continue; + + rc = 0; + + for (j = i+1; j < net->ksnn_ninterfaces; j++) + net->ksnn_interfaces[j-1] = + net->ksnn_interfaces[j]; + + net->ksnn_ninterfaces--; + + for (j = 0; j < ksocknal_data.ksnd_peer_hash_size; j++) { + list_for_each_safe(tmp, nxt, + &ksocknal_data.ksnd_peers[j]) { + peer = list_entry(tmp, ksock_peer_t, + ksnp_list); + + if (peer->ksnp_ni != ni) + continue; + + ksocknal_peer_del_interface_locked(peer, this_ip); + } + } + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + return rc; +} + +int +ksocknal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg) +{ + lnet_process_id_t id = {0}; + struct libcfs_ioctl_data *data = arg; + int rc; + + switch (cmd) { + case IOC_LIBCFS_GET_INTERFACE: { + ksock_net_t *net = ni->ni_data; + ksock_interface_t *iface; + + read_lock(&ksocknal_data.ksnd_global_lock); + + if (data->ioc_count >= (__u32)net->ksnn_ninterfaces) { + rc = -ENOENT; + } else { + rc = 0; + iface = &net->ksnn_interfaces[data->ioc_count]; + + data->ioc_u32[0] = iface->ksni_ipaddr; + data->ioc_u32[1] = iface->ksni_netmask; + data->ioc_u32[2] = iface->ksni_npeers; + data->ioc_u32[3] = iface->ksni_nroutes; + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + return rc; + } + + case IOC_LIBCFS_ADD_INTERFACE: + return ksocknal_add_interface(ni, + data->ioc_u32[0], /* IP address */ + data->ioc_u32[1]); /* net mask */ + + case IOC_LIBCFS_DEL_INTERFACE: + return 
ksocknal_del_interface(ni, + data->ioc_u32[0]); /* IP address */ + + case IOC_LIBCFS_GET_PEER: { + __u32 myip = 0; + __u32 ip = 0; + int port = 0; + int conn_count = 0; + int share_count = 0; + + rc = ksocknal_get_peer_info(ni, data->ioc_count, + &id, &myip, &ip, &port, + &conn_count, &share_count); + if (rc != 0) + return rc; + + data->ioc_nid = id.nid; + data->ioc_count = share_count; + data->ioc_u32[0] = ip; + data->ioc_u32[1] = port; + data->ioc_u32[2] = myip; + data->ioc_u32[3] = conn_count; + data->ioc_u32[4] = id.pid; + return 0; + } + + case IOC_LIBCFS_ADD_PEER: + id.nid = data->ioc_nid; + id.pid = LUSTRE_SRV_LNET_PID; + return ksocknal_add_peer(ni, id, + data->ioc_u32[0], /* IP */ + data->ioc_u32[1]); /* port */ + + case IOC_LIBCFS_DEL_PEER: + id.nid = data->ioc_nid; + id.pid = LNET_PID_ANY; + return ksocknal_del_peer(ni, id, + data->ioc_u32[0]); /* IP */ + + case IOC_LIBCFS_GET_CONN: { + int txmem; + int rxmem; + int nagle; + ksock_conn_t *conn = ksocknal_get_conn_by_idx(ni, data->ioc_count); + + if (conn == NULL) + return -ENOENT; + + ksocknal_lib_get_conn_tunables(conn, &txmem, &rxmem, &nagle); + + data->ioc_count = txmem; + data->ioc_nid = conn->ksnc_peer->ksnp_id.nid; + data->ioc_flags = nagle; + data->ioc_u32[0] = conn->ksnc_ipaddr; + data->ioc_u32[1] = conn->ksnc_port; + data->ioc_u32[2] = conn->ksnc_myipaddr; + data->ioc_u32[3] = conn->ksnc_type; + data->ioc_u32[4] = conn->ksnc_scheduler->kss_info->ksi_cpt; + data->ioc_u32[5] = rxmem; + data->ioc_u32[6] = conn->ksnc_peer->ksnp_id.pid; + ksocknal_conn_decref(conn); + return 0; + } + + case IOC_LIBCFS_CLOSE_CONNECTION: + id.nid = data->ioc_nid; + id.pid = LNET_PID_ANY; + return ksocknal_close_matching_conns(id, + data->ioc_u32[0]); + + case IOC_LIBCFS_REGISTER_MYNID: + /* Ignore if this is a noop */ + if (data->ioc_nid == ni->ni_nid) + return 0; + + CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n", + libcfs_nid2str(data->ioc_nid), + libcfs_nid2str(ni->ni_nid)); + return -EINVAL; + + case IOC_LIBCFS_PUSH_CONNECTION: + id.nid = data->ioc_nid; + id.pid = LNET_PID_ANY; + return ksocknal_push(ni, id); + + default: + return -EINVAL; + } + /* not reached */ +} + +static void +ksocknal_free_buffers(void) +{ + LASSERT(atomic_read(&ksocknal_data.ksnd_nactive_txs) == 0); + + if (ksocknal_data.ksnd_sched_info != NULL) { + struct ksock_sched_info *info; + int i; + + cfs_percpt_for_each(info, i, ksocknal_data.ksnd_sched_info) { + if (info->ksi_scheds != NULL) { + LIBCFS_FREE(info->ksi_scheds, + info->ksi_nthreads_max * + sizeof(info->ksi_scheds[0])); + } + } + cfs_percpt_free(ksocknal_data.ksnd_sched_info); + } + + LIBCFS_FREE(ksocknal_data.ksnd_peers, + sizeof(struct list_head) * + ksocknal_data.ksnd_peer_hash_size); + + spin_lock(&ksocknal_data.ksnd_tx_lock); + + if (!list_empty(&ksocknal_data.ksnd_idle_noop_txs)) { + struct list_head zlist; + ksock_tx_t *tx; + + list_add(&zlist, &ksocknal_data.ksnd_idle_noop_txs); + list_del_init(&ksocknal_data.ksnd_idle_noop_txs); + spin_unlock(&ksocknal_data.ksnd_tx_lock); + + while (!list_empty(&zlist)) { + tx = list_entry(zlist.next, ksock_tx_t, tx_list); + list_del(&tx->tx_list); + LIBCFS_FREE(tx, tx->tx_desc_size); + } + } else { + spin_unlock(&ksocknal_data.ksnd_tx_lock); + } +} + +static void +ksocknal_base_shutdown(void) +{ + struct ksock_sched_info *info; + ksock_sched_t *sched; + int i; + int j; + + CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n", + atomic_read(&libcfs_kmemory)); + LASSERT(ksocknal_data.ksnd_nnets == 0); + + switch (ksocknal_data.ksnd_init) { + default: + LASSERT(0); + 
+ case SOCKNAL_INIT_ALL: + case SOCKNAL_INIT_DATA: + LASSERT(ksocknal_data.ksnd_peers != NULL); + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { + LASSERT(list_empty(&ksocknal_data.ksnd_peers[i])); + } + + LASSERT(list_empty(&ksocknal_data.ksnd_nets)); + LASSERT(list_empty(&ksocknal_data.ksnd_enomem_conns)); + LASSERT(list_empty(&ksocknal_data.ksnd_zombie_conns)); + LASSERT(list_empty(&ksocknal_data.ksnd_connd_connreqs)); + LASSERT(list_empty(&ksocknal_data.ksnd_connd_routes)); + + if (ksocknal_data.ksnd_sched_info != NULL) { + cfs_percpt_for_each(info, i, + ksocknal_data.ksnd_sched_info) { + if (info->ksi_scheds == NULL) + continue; + + for (j = 0; j < info->ksi_nthreads_max; j++) { + + sched = &info->ksi_scheds[j]; + LASSERT(list_empty( + &sched->kss_tx_conns)); + LASSERT(list_empty( + &sched->kss_rx_conns)); + LASSERT(list_empty( + &sched->kss_zombie_noop_txs)); + LASSERT(sched->kss_nconns == 0); + } + } + } + + /* flag threads to terminate; wake and wait for them to die */ + ksocknal_data.ksnd_shuttingdown = 1; + wake_up_all(&ksocknal_data.ksnd_connd_waitq); + wake_up_all(&ksocknal_data.ksnd_reaper_waitq); + + if (ksocknal_data.ksnd_sched_info != NULL) { + cfs_percpt_for_each(info, i, + ksocknal_data.ksnd_sched_info) { + if (info->ksi_scheds == NULL) + continue; + + for (j = 0; j < info->ksi_nthreads_max; j++) { + sched = &info->ksi_scheds[j]; + wake_up_all(&sched->kss_waitq); + } + } + } + + i = 4; + read_lock(&ksocknal_data.ksnd_global_lock); + while (ksocknal_data.ksnd_nthreads != 0) { + i++; + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */ + "waiting for %d threads to terminate\n", + ksocknal_data.ksnd_nthreads); + read_unlock(&ksocknal_data.ksnd_global_lock); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + read_lock(&ksocknal_data.ksnd_global_lock); + } + read_unlock(&ksocknal_data.ksnd_global_lock); + + ksocknal_free_buffers(); + + ksocknal_data.ksnd_init = SOCKNAL_INIT_NOTHING; + break; + } + + CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n", + atomic_read(&libcfs_kmemory)); + + module_put(THIS_MODULE); +} + +static __u64 +ksocknal_new_incarnation(void) +{ + + /* The incarnation number is the time this module loaded and it + * identifies this particular instance of the socknal. 
+ */ + return ktime_get_ns(); +} + +static int +ksocknal_base_startup(void) +{ + struct ksock_sched_info *info; + int rc; + int i; + + LASSERT(ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING); + LASSERT(ksocknal_data.ksnd_nnets == 0); + + memset(&ksocknal_data, 0, sizeof(ksocknal_data)); /* zero pointers */ + + ksocknal_data.ksnd_peer_hash_size = SOCKNAL_PEER_HASH_SIZE; + LIBCFS_ALLOC(ksocknal_data.ksnd_peers, + sizeof(struct list_head) * + ksocknal_data.ksnd_peer_hash_size); + if (ksocknal_data.ksnd_peers == NULL) + return -ENOMEM; + + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) + INIT_LIST_HEAD(&ksocknal_data.ksnd_peers[i]); + + rwlock_init(&ksocknal_data.ksnd_global_lock); + INIT_LIST_HEAD(&ksocknal_data.ksnd_nets); + + spin_lock_init(&ksocknal_data.ksnd_reaper_lock); + INIT_LIST_HEAD(&ksocknal_data.ksnd_enomem_conns); + INIT_LIST_HEAD(&ksocknal_data.ksnd_zombie_conns); + INIT_LIST_HEAD(&ksocknal_data.ksnd_deathrow_conns); + init_waitqueue_head(&ksocknal_data.ksnd_reaper_waitq); + + spin_lock_init(&ksocknal_data.ksnd_connd_lock); + INIT_LIST_HEAD(&ksocknal_data.ksnd_connd_connreqs); + INIT_LIST_HEAD(&ksocknal_data.ksnd_connd_routes); + init_waitqueue_head(&ksocknal_data.ksnd_connd_waitq); + + spin_lock_init(&ksocknal_data.ksnd_tx_lock); + INIT_LIST_HEAD(&ksocknal_data.ksnd_idle_noop_txs); + + /* NB memset above zeros whole of ksocknal_data */ + + /* flag lists/ptrs/locks initialised */ + ksocknal_data.ksnd_init = SOCKNAL_INIT_DATA; + try_module_get(THIS_MODULE); + + ksocknal_data.ksnd_sched_info = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*info)); + if (ksocknal_data.ksnd_sched_info == NULL) + goto failed; + + cfs_percpt_for_each(info, i, ksocknal_data.ksnd_sched_info) { + ksock_sched_t *sched; + int nthrs; + + nthrs = cfs_cpt_weight(lnet_cpt_table(), i); + if (*ksocknal_tunables.ksnd_nscheds > 0) { + nthrs = min(nthrs, *ksocknal_tunables.ksnd_nscheds); + } else { + /* max to half of CPUs, assume another half should be + * reserved for upper layer modules */ + nthrs = min(max(SOCKNAL_NSCHEDS, nthrs >> 1), nthrs); + } + + info->ksi_nthreads_max = nthrs; + info->ksi_cpt = i; + + LIBCFS_CPT_ALLOC(info->ksi_scheds, lnet_cpt_table(), i, + info->ksi_nthreads_max * sizeof(*sched)); + if (info->ksi_scheds == NULL) + goto failed; + + for (; nthrs > 0; nthrs--) { + sched = &info->ksi_scheds[nthrs - 1]; + + sched->kss_info = info; + spin_lock_init(&sched->kss_lock); + INIT_LIST_HEAD(&sched->kss_rx_conns); + INIT_LIST_HEAD(&sched->kss_tx_conns); + INIT_LIST_HEAD(&sched->kss_zombie_noop_txs); + init_waitqueue_head(&sched->kss_waitq); + } + } + + ksocknal_data.ksnd_connd_starting = 0; + ksocknal_data.ksnd_connd_failed_stamp = 0; + ksocknal_data.ksnd_connd_starting_stamp = get_seconds(); + /* must have at least 2 connds to remain responsive to accepts while + * connecting */ + if (*ksocknal_tunables.ksnd_nconnds < SOCKNAL_CONND_RESV + 1) + *ksocknal_tunables.ksnd_nconnds = SOCKNAL_CONND_RESV + 1; + + if (*ksocknal_tunables.ksnd_nconnds_max < + *ksocknal_tunables.ksnd_nconnds) { + ksocknal_tunables.ksnd_nconnds_max = + ksocknal_tunables.ksnd_nconnds; + } + + for (i = 0; i < *ksocknal_tunables.ksnd_nconnds; i++) { + char name[16]; + spin_lock_bh(&ksocknal_data.ksnd_connd_lock); + ksocknal_data.ksnd_connd_starting++; + spin_unlock_bh(&ksocknal_data.ksnd_connd_lock); + + + snprintf(name, sizeof(name), "socknal_cd%02d", i); + rc = ksocknal_thread_start(ksocknal_connd, + (void *)((ulong_ptr_t)i), name); + if (rc != 0) { + spin_lock_bh(&ksocknal_data.ksnd_connd_lock); + 
ksocknal_data.ksnd_connd_starting--; + spin_unlock_bh(&ksocknal_data.ksnd_connd_lock); + CERROR("Can't spawn socknal connd: %d\n", rc); + goto failed; + } + } + + rc = ksocknal_thread_start(ksocknal_reaper, NULL, "socknal_reaper"); + if (rc != 0) { + CERROR("Can't spawn socknal reaper: %d\n", rc); + goto failed; + } + + /* flag everything initialised */ + ksocknal_data.ksnd_init = SOCKNAL_INIT_ALL; + + return 0; + + failed: + ksocknal_base_shutdown(); + return -ENETDOWN; +} + +static void +ksocknal_debug_peerhash(lnet_ni_t *ni) +{ + ksock_peer_t *peer = NULL; + struct list_head *tmp; + int i; + + read_lock(&ksocknal_data.ksnd_global_lock); + + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { + list_for_each(tmp, &ksocknal_data.ksnd_peers[i]) { + peer = list_entry(tmp, ksock_peer_t, ksnp_list); + + if (peer->ksnp_ni == ni) + break; + + peer = NULL; + } + } + + if (peer != NULL) { + ksock_route_t *route; + ksock_conn_t *conn; + + CWARN("Active peer on shutdown: %s, ref %d, scnt %d, closing %d, accepting %d, err %d, zcookie %llu, txq %d, zc_req %d\n", + libcfs_id2str(peer->ksnp_id), + atomic_read(&peer->ksnp_refcount), + peer->ksnp_sharecount, peer->ksnp_closing, + peer->ksnp_accepting, peer->ksnp_error, + peer->ksnp_zc_next_cookie, + !list_empty(&peer->ksnp_tx_queue), + !list_empty(&peer->ksnp_zc_req_list)); + + list_for_each(tmp, &peer->ksnp_routes) { + route = list_entry(tmp, ksock_route_t, ksnr_list); + CWARN("Route: ref %d, schd %d, conn %d, cnted %d, del %d\n", + atomic_read(&route->ksnr_refcount), + route->ksnr_scheduled, route->ksnr_connecting, + route->ksnr_connected, route->ksnr_deleted); + } + + list_for_each(tmp, &peer->ksnp_conns) { + conn = list_entry(tmp, ksock_conn_t, ksnc_list); + CWARN("Conn: ref %d, sref %d, t %d, c %d\n", + atomic_read(&conn->ksnc_conn_refcount), + atomic_read(&conn->ksnc_sock_refcount), + conn->ksnc_type, conn->ksnc_closing); + } + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + return; +} + +void +ksocknal_shutdown(lnet_ni_t *ni) +{ + ksock_net_t *net = ni->ni_data; + int i; + lnet_process_id_t anyid = {0}; + + anyid.nid = LNET_NID_ANY; + anyid.pid = LNET_PID_ANY; + + LASSERT(ksocknal_data.ksnd_init == SOCKNAL_INIT_ALL); + LASSERT(ksocknal_data.ksnd_nnets > 0); + + spin_lock_bh(&net->ksnn_lock); + net->ksnn_shutdown = 1; /* prevent new peers */ + spin_unlock_bh(&net->ksnn_lock); + + /* Delete all peers */ + ksocknal_del_peer(ni, anyid, 0); + + /* Wait for all peer state to clean up */ + i = 2; + spin_lock_bh(&net->ksnn_lock); + while (net->ksnn_npeers != 0) { + spin_unlock_bh(&net->ksnn_lock); + + i++; + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? 
*/ + "waiting for %d peers to disconnect\n", + net->ksnn_npeers); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + + ksocknal_debug_peerhash(ni); + + spin_lock_bh(&net->ksnn_lock); + } + spin_unlock_bh(&net->ksnn_lock); + + for (i = 0; i < net->ksnn_ninterfaces; i++) { + LASSERT(net->ksnn_interfaces[i].ksni_npeers == 0); + LASSERT(net->ksnn_interfaces[i].ksni_nroutes == 0); + } + + list_del(&net->ksnn_list); + LIBCFS_FREE(net, sizeof(*net)); + + ksocknal_data.ksnd_nnets--; + if (ksocknal_data.ksnd_nnets == 0) + ksocknal_base_shutdown(); +} + +static int +ksocknal_enumerate_interfaces(ksock_net_t *net) +{ + char **names; + int i; + int j; + int rc; + int n; + + n = libcfs_ipif_enumerate(&names); + if (n <= 0) { + CERROR("Can't enumerate interfaces: %d\n", n); + return n; + } + + for (i = j = 0; i < n; i++) { + int up; + __u32 ip; + __u32 mask; + + if (!strcmp(names[i], "lo")) /* skip the loopback IF */ + continue; + + rc = libcfs_ipif_query(names[i], &up, &ip, &mask); + if (rc != 0) { + CWARN("Can't get interface %s info: %d\n", + names[i], rc); + continue; + } + + if (!up) { + CWARN("Ignoring interface %s (down)\n", + names[i]); + continue; + } + + if (j == LNET_MAX_INTERFACES) { + CWARN("Ignoring interface %s (too many interfaces)\n", + names[i]); + continue; + } + + net->ksnn_interfaces[j].ksni_ipaddr = ip; + net->ksnn_interfaces[j].ksni_netmask = mask; + strncpy(&net->ksnn_interfaces[j].ksni_name[0], + names[i], IFNAMSIZ); + j++; + } + + libcfs_ipif_free_enumeration(names, n); + + if (j == 0) + CERROR("Can't find any usable interfaces\n"); + + return j; +} + +static int +ksocknal_search_new_ipif(ksock_net_t *net) +{ + int new_ipif = 0; + int i; + + for (i = 0; i < net->ksnn_ninterfaces; i++) { + char *ifnam = &net->ksnn_interfaces[i].ksni_name[0]; + char *colon = strchr(ifnam, ':'); + int found = 0; + ksock_net_t *tmp; + int j; + + if (colon != NULL) /* ignore alias device */ + *colon = 0; + + list_for_each_entry(tmp, &ksocknal_data.ksnd_nets, + ksnn_list) { + for (j = 0; !found && j < tmp->ksnn_ninterfaces; j++) { + char *ifnam2 = + &tmp->ksnn_interfaces[j].ksni_name[0]; + char *colon2 = strchr(ifnam2, ':'); + + if (colon2 != NULL) + *colon2 = 0; + + found = strcmp(ifnam, ifnam2) == 0; + if (colon2 != NULL) + *colon2 = ':'; + } + if (found) + break; + } + + new_ipif += !found; + if (colon != NULL) + *colon = ':'; + } + + return new_ipif; +} + +static int +ksocknal_start_schedulers(struct ksock_sched_info *info) +{ + int nthrs; + int rc = 0; + int i; + + if (info->ksi_nthreads == 0) { + if (*ksocknal_tunables.ksnd_nscheds > 0) { + nthrs = info->ksi_nthreads_max; + } else { + nthrs = cfs_cpt_weight(lnet_cpt_table(), + info->ksi_cpt); + nthrs = min(max(SOCKNAL_NSCHEDS, nthrs >> 1), nthrs); + nthrs = min(SOCKNAL_NSCHEDS_HIGH, nthrs); + } + nthrs = min(nthrs, info->ksi_nthreads_max); + } else { + LASSERT(info->ksi_nthreads <= info->ksi_nthreads_max); + /* increase two threads if there is new interface */ + nthrs = min(2, info->ksi_nthreads_max - info->ksi_nthreads); + } + + for (i = 0; i < nthrs; i++) { + long id; + char name[20]; + ksock_sched_t *sched; + id = KSOCK_THREAD_ID(info->ksi_cpt, info->ksi_nthreads + i); + sched = &info->ksi_scheds[KSOCK_THREAD_SID(id)]; + snprintf(name, sizeof(name), "socknal_sd%02d_%02d", + info->ksi_cpt, (int)(sched - &info->ksi_scheds[0])); + + rc = ksocknal_thread_start(ksocknal_scheduler, + (void *)id, name); + if (rc == 0) + continue; + + CERROR("Can't spawn thread %d for scheduler[%d]: %d\n", + info->ksi_cpt, 
info->ksi_nthreads + i, rc); + break; + } + + info->ksi_nthreads += i; + return rc; +} + +static int +ksocknal_net_start_threads(ksock_net_t *net, __u32 *cpts, int ncpts) +{ + int newif = ksocknal_search_new_ipif(net); + int rc; + int i; + + LASSERT(ncpts > 0 && ncpts <= cfs_cpt_number(lnet_cpt_table())); + + for (i = 0; i < ncpts; i++) { + struct ksock_sched_info *info; + int cpt = (cpts == NULL) ? i : cpts[i]; + + LASSERT(cpt < cfs_cpt_number(lnet_cpt_table())); + info = ksocknal_data.ksnd_sched_info[cpt]; + + if (!newif && info->ksi_nthreads > 0) + continue; + + rc = ksocknal_start_schedulers(info); + if (rc != 0) + return rc; + } + return 0; +} + +int +ksocknal_startup(lnet_ni_t *ni) +{ + ksock_net_t *net; + int rc; + int i; + + LASSERT(ni->ni_lnd == &the_ksocklnd); + + if (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING) { + rc = ksocknal_base_startup(); + if (rc != 0) + return rc; + } + + LIBCFS_ALLOC(net, sizeof(*net)); + if (net == NULL) + goto fail_0; + + spin_lock_init(&net->ksnn_lock); + net->ksnn_incarnation = ksocknal_new_incarnation(); + ni->ni_data = net; + ni->ni_peertimeout = *ksocknal_tunables.ksnd_peertimeout; + ni->ni_maxtxcredits = *ksocknal_tunables.ksnd_credits; + ni->ni_peertxcredits = *ksocknal_tunables.ksnd_peertxcredits; + ni->ni_peerrtrcredits = *ksocknal_tunables.ksnd_peerrtrcredits; + + if (ni->ni_interfaces[0] == NULL) { + rc = ksocknal_enumerate_interfaces(net); + if (rc <= 0) + goto fail_1; + + net->ksnn_ninterfaces = 1; + } else { + for (i = 0; i < LNET_MAX_INTERFACES; i++) { + int up; + + if (ni->ni_interfaces[i] == NULL) + break; + + rc = libcfs_ipif_query( + ni->ni_interfaces[i], &up, + &net->ksnn_interfaces[i].ksni_ipaddr, + &net->ksnn_interfaces[i].ksni_netmask); + + if (rc != 0) { + CERROR("Can't get interface %s info: %d\n", + ni->ni_interfaces[i], rc); + goto fail_1; + } + + if (!up) { + CERROR("Interface %s is down\n", + ni->ni_interfaces[i]); + goto fail_1; + } + + strncpy(&net->ksnn_interfaces[i].ksni_name[0], + ni->ni_interfaces[i], IFNAMSIZ); + } + net->ksnn_ninterfaces = i; + } + + /* call it before add it to ksocknal_data.ksnd_nets */ + rc = ksocknal_net_start_threads(net, ni->ni_cpts, ni->ni_ncpts); + if (rc != 0) + goto fail_1; + + ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), + net->ksnn_interfaces[0].ksni_ipaddr); + list_add(&net->ksnn_list, &ksocknal_data.ksnd_nets); + + ksocknal_data.ksnd_nnets++; + + return 0; + + fail_1: + LIBCFS_FREE(net, sizeof(*net)); + fail_0: + if (ksocknal_data.ksnd_nnets == 0) + ksocknal_base_shutdown(); + + return -ENETDOWN; +} + + +static void __exit +ksocknal_module_fini(void) +{ + lnet_unregister_lnd(&the_ksocklnd); +} + +static int __init +ksocknal_module_init(void) +{ + int rc; + + /* check ksnr_connected/connecting field large enough */ + CLASSERT(SOCKLND_CONN_NTYPES <= 4); + CLASSERT(SOCKLND_CONN_ACK == SOCKLND_CONN_BULK_IN); + + /* initialize the_ksocklnd */ + the_ksocklnd.lnd_type = SOCKLND; + the_ksocklnd.lnd_startup = ksocknal_startup; + the_ksocklnd.lnd_shutdown = ksocknal_shutdown; + the_ksocklnd.lnd_ctl = ksocknal_ctl; + the_ksocklnd.lnd_send = ksocknal_send; + the_ksocklnd.lnd_recv = ksocknal_recv; + the_ksocklnd.lnd_notify = ksocknal_notify; + the_ksocklnd.lnd_query = ksocknal_query; + the_ksocklnd.lnd_accept = ksocknal_accept; + + rc = ksocknal_tunables_init(); + if (rc != 0) + return rc; + + lnet_register_lnd(&the_ksocklnd); + + return 0; +} + +MODULE_AUTHOR("Sun Microsystems, Inc. 
"); +MODULE_DESCRIPTION("Kernel TCP Socket LND v3.0.0"); +MODULE_LICENSE("GPL"); +MODULE_VERSION("3.0.0"); + +module_init(ksocknal_module_init); +module_exit(ksocknal_module_fini); diff --git a/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h b/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h new file mode 100644 index 000000000..c54c99551 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h @@ -0,0 +1,588 @@ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2012, Intel Corporation. + * + * Author: Zach Brown + * Author: Peter J. Braam + * Author: Phil Schwan + * Author: Eric Barton + * + * This file is part of Lustre, http://www.lustre.org + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#define DEBUG_PORTAL_ALLOC +#define DEBUG_SUBSYSTEM S_LND + +#include "socklnd_lib-linux.h" + +#include "../../../include/linux/libcfs/libcfs.h" +#include "../../../include/linux/lnet/lnet.h" +#include "../../../include/linux/lnet/lib-lnet.h" +#include "../../../include/linux/lnet/socklnd.h" +#include "../../../include/linux/lnet/lnet-sysctl.h" + +#define SOCKNAL_PEER_HASH_SIZE 101 /* # peer lists */ +#define SOCKNAL_RESCHED 100 /* # scheduler loops before reschedule */ +#define SOCKNAL_INSANITY_RECONN 5000 /* connd is trying on reconn infinitely */ +#define SOCKNAL_ENOMEM_RETRY CFS_TICK /* jiffies between retries */ + +#define SOCKNAL_SINGLE_FRAG_TX 0 /* disable multi-fragment sends */ +#define SOCKNAL_SINGLE_FRAG_RX 0 /* disable multi-fragment receives */ + +#define SOCKNAL_VERSION_DEBUG 0 /* enable protocol version debugging */ + +/* risk kmap deadlock on multi-frag I/O (backs off to single-frag if disabled). + * no risk if we're not running on a CONFIG_HIGHMEM platform. 
*/ +#ifdef CONFIG_HIGHMEM +# define SOCKNAL_RISK_KMAP_DEADLOCK 0 +#else +# define SOCKNAL_RISK_KMAP_DEADLOCK 1 +#endif + +struct ksock_sched_info; + +typedef struct /* per scheduler state */ +{ + spinlock_t kss_lock; /* serialise */ + struct list_head kss_rx_conns; /* conn waiting to be read */ + /* conn waiting to be written */ + struct list_head kss_tx_conns; + /* zombie noop tx list */ + struct list_head kss_zombie_noop_txs; + wait_queue_head_t kss_waitq; /* where scheduler sleeps */ + /* # connections assigned to this scheduler */ + int kss_nconns; + struct ksock_sched_info *kss_info; /* owner of it */ + struct page *kss_rx_scratch_pgs[LNET_MAX_IOV]; + struct kvec kss_scratch_iov[LNET_MAX_IOV]; +} ksock_sched_t; + +struct ksock_sched_info { + int ksi_nthreads_max; /* max allowed threads */ + int ksi_nthreads; /* number of threads */ + int ksi_cpt; /* CPT id */ + ksock_sched_t *ksi_scheds; /* array of schedulers */ +}; + +#define KSOCK_CPT_SHIFT 16 +#define KSOCK_THREAD_ID(cpt, sid) (((cpt) << KSOCK_CPT_SHIFT) | (sid)) +#define KSOCK_THREAD_CPT(id) ((id) >> KSOCK_CPT_SHIFT) +#define KSOCK_THREAD_SID(id) ((id) & ((1UL << KSOCK_CPT_SHIFT) - 1)) + +typedef struct /* in-use interface */ +{ + __u32 ksni_ipaddr; /* interface's IP address */ + __u32 ksni_netmask; /* interface's network mask */ + int ksni_nroutes; /* # routes using (active) */ + int ksni_npeers; /* # peers using (passive) */ + char ksni_name[IFNAMSIZ]; /* interface name */ +} ksock_interface_t; + +typedef struct { + /* "stuck" socket timeout (seconds) */ + int *ksnd_timeout; + /* # scheduler threads in each pool while starting */ + int *ksnd_nscheds; + int *ksnd_nconnds; /* # connection daemons */ + int *ksnd_nconnds_max; /* max # connection daemons */ + int *ksnd_min_reconnectms; /* first connection retry after (ms)... */ + int *ksnd_max_reconnectms; /* ...exponentially increasing to this */ + int *ksnd_eager_ack; /* make TCP ack eagerly? */ + int *ksnd_typed_conns; /* drive sockets by type? */ + int *ksnd_min_bulk; /* smallest "large" message */ + int *ksnd_tx_buffer_size; /* socket tx buffer size */ + int *ksnd_rx_buffer_size; /* socket rx buffer size */ + int *ksnd_nagle; /* enable NAGLE? */ + int *ksnd_round_robin; /* round robin for multiple interfaces */ + int *ksnd_keepalive; /* # secs for sending keepalive NOOP */ + int *ksnd_keepalive_idle; /* # idle secs before 1st probe */ + int *ksnd_keepalive_count; /* # probes */ + int *ksnd_keepalive_intvl; /* time between probes */ + int *ksnd_credits; /* # concurrent sends */ + int *ksnd_peertxcredits; /* # concurrent sends to 1 peer */ + int *ksnd_peerrtrcredits; /* # per-peer router buffer credits */ + int *ksnd_peertimeout; /* seconds to consider peer dead */ + int *ksnd_enable_csum; /* enable check sum */ + int *ksnd_inject_csum_error; /* set non-zero to inject checksum error */ + int *ksnd_nonblk_zcack; /* always send zc-ack on non-blocking connection */ + unsigned int *ksnd_zc_min_payload; /* minimum zero copy payload size */ + int *ksnd_zc_recv; /* enable ZC receive (for Chelsio TOE) */ + int *ksnd_zc_recv_min_nfrags; /* minimum # of fragments to enable ZC receive */ +} ksock_tunables_t; + +typedef struct { + __u64 ksnn_incarnation; /* my epoch */ + spinlock_t ksnn_lock; /* serialise */ + struct list_head ksnn_list; /* chain on global list */ + int ksnn_npeers; /* # peers */ + int ksnn_shutdown; /* shutting down? 
*/ + int ksnn_ninterfaces; /* IP interfaces */ + ksock_interface_t ksnn_interfaces[LNET_MAX_INTERFACES]; +} ksock_net_t; + +/** connd timeout */ +#define SOCKNAL_CONND_TIMEOUT 120 +/** reserved thread for accepting & creating new connd */ +#define SOCKNAL_CONND_RESV 1 + +typedef struct { + int ksnd_init; /* initialisation state */ + int ksnd_nnets; /* # networks set up */ + struct list_head ksnd_nets; /* list of nets */ + /* stabilize peer/conn ops */ + rwlock_t ksnd_global_lock; + /* hash table of all my known peers */ + struct list_head *ksnd_peers; + int ksnd_peer_hash_size; /* size of ksnd_peers */ + + int ksnd_nthreads; /* # live threads */ + int ksnd_shuttingdown; /* tell threads to exit */ + /* schedulers information */ + struct ksock_sched_info **ksnd_sched_info; + + atomic_t ksnd_nactive_txs; /* #active txs */ + + struct list_head ksnd_deathrow_conns; /* conns to close: reaper_lock*/ + struct list_head ksnd_zombie_conns; /* conns to free: reaper_lock */ + struct list_head ksnd_enomem_conns; /* conns to retry: reaper_lock*/ + wait_queue_head_t ksnd_reaper_waitq; /* reaper sleeps here */ + unsigned long ksnd_reaper_waketime;/* when reaper will wake */ + spinlock_t ksnd_reaper_lock; /* serialise */ + + int ksnd_enomem_tx; /* test ENOMEM sender */ + int ksnd_stall_tx; /* test sluggish sender */ + int ksnd_stall_rx; /* test sluggish receiver */ + + struct list_head ksnd_connd_connreqs; /* incoming connection requests */ + struct list_head ksnd_connd_routes; /* routes waiting to be connected */ + wait_queue_head_t ksnd_connd_waitq; /* connds sleep here */ + int ksnd_connd_connecting;/* # connds connecting */ + /** time stamp of the last failed connecting attempt */ + long ksnd_connd_failed_stamp; + /** # starting connd */ + unsigned ksnd_connd_starting; + /** time stamp of the last starting connd */ + long ksnd_connd_starting_stamp; + /** # running connd */ + unsigned ksnd_connd_running; + spinlock_t ksnd_connd_lock; /* serialise */ + + struct list_head ksnd_idle_noop_txs; /* list head for freed noop tx */ + spinlock_t ksnd_tx_lock; /* serialise, g_lock unsafe */ + +} ksock_nal_data_t; + +#define SOCKNAL_INIT_NOTHING 0 +#define SOCKNAL_INIT_DATA 1 +#define SOCKNAL_INIT_ALL 2 + +/* A packet just assembled for transmission is represented by 1 or more + * struct iovec fragments (the first frag contains the portals header), + * followed by 0 or more lnet_kiov_t fragments. + * + * On the receive side, initially 1 struct iovec fragment is posted for + * receive (the header). Once the header has been received, the payload is + * received into either struct iovec or lnet_kiov_t fragments, depending on + * what the header matched or whether the message needs forwarding. */ + +struct ksock_conn; /* forward ref */ +struct ksock_peer; /* forward ref */ +struct ksock_route; /* forward ref */ +struct ksock_proto; /* forward ref */ + +typedef struct /* transmit packet */ +{ + struct list_head tx_list; /* queue on conn for transmission etc */ + struct list_head tx_zc_list; /* queue on peer for ZC request */ + atomic_t tx_refcount; /* tx reference count */ + int tx_nob; /* # packet bytes */ + int tx_resid; /* residual bytes */ + int tx_niov; /* # packet iovec frags */ + struct kvec *tx_iov; /* packet iovec frags */ + int tx_nkiov; /* # packet page frags */ + unsigned short tx_zc_aborted; /* aborted ZC request */ + unsigned short tx_zc_capable:1; /* payload is large enough for ZC */ + unsigned short tx_zc_checked:1; /* Have I checked if I should ZC? 
*/ + unsigned short tx_nonblk:1; /* it's a non-blocking ACK */ + lnet_kiov_t *tx_kiov; /* packet page frags */ + struct ksock_conn *tx_conn; /* owning conn */ + lnet_msg_t *tx_lnetmsg; /* lnet message for lnet_finalize() */ + unsigned long tx_deadline; /* when (in jiffies) tx times out */ + ksock_msg_t tx_msg; /* socklnd message buffer */ + int tx_desc_size; /* size of this descriptor */ + union { + struct { + struct kvec iov; /* virt hdr */ + lnet_kiov_t kiov[0]; /* paged payload */ + } paged; + struct { + struct kvec iov[1]; /* virt hdr + payload */ + } virt; + } tx_frags; +} ksock_tx_t; + +#define KSOCK_NOOP_TX_SIZE ((int)offsetof(ksock_tx_t, tx_frags.paged.kiov[0])) + +/* network zero copy callback descriptor embedded in ksock_tx_t */ + +/* space for the rx frag descriptors; we either read a single contiguous + * header, or up to LNET_MAX_IOV frags of payload of either type. */ +typedef union { + struct kvec iov[LNET_MAX_IOV]; + lnet_kiov_t kiov[LNET_MAX_IOV]; +} ksock_rxiovspace_t; + +#define SOCKNAL_RX_KSM_HEADER 1 /* reading ksock message header */ +#define SOCKNAL_RX_LNET_HEADER 2 /* reading lnet message header */ +#define SOCKNAL_RX_PARSE 3 /* Calling lnet_parse() */ +#define SOCKNAL_RX_PARSE_WAIT 4 /* waiting to be told to read the body */ +#define SOCKNAL_RX_LNET_PAYLOAD 5 /* reading lnet payload (to deliver here) */ +#define SOCKNAL_RX_SLOP 6 /* skipping body */ + +typedef struct ksock_conn { + struct ksock_peer *ksnc_peer; /* owning peer */ + struct ksock_route *ksnc_route; /* owning route */ + struct list_head ksnc_list; /* stash on peer's conn list */ + struct socket *ksnc_sock; /* actual socket */ + void *ksnc_saved_data_ready; /* socket's original data_ready() callback */ + void *ksnc_saved_write_space; /* socket's original write_space() callback */ + atomic_t ksnc_conn_refcount; /* conn refcount */ + atomic_t ksnc_sock_refcount; /* sock refcount */ + ksock_sched_t *ksnc_scheduler; /* who schedules this connection */ + __u32 ksnc_myipaddr; /* my IP */ + __u32 ksnc_ipaddr; /* peer's IP */ + int ksnc_port; /* peer's port */ + signed int ksnc_type:3; /* type of connection, + * should be signed value */ + unsigned int ksnc_closing:1; /* being shut down */ + unsigned int ksnc_flip:1; /* flip or not, only for V2.x */ + unsigned int ksnc_zc_capable:1; /* enable to ZC */ + struct ksock_proto *ksnc_proto; /* protocol for the connection */ + + /* reader */ + struct list_head ksnc_rx_list; /* where I enq waiting input or a forwarding descriptor */ + unsigned long ksnc_rx_deadline; /* when (in jiffies) receive times out */ + __u8 ksnc_rx_started; /* started receiving a message */ + __u8 ksnc_rx_ready; /* data ready to read */ + __u8 ksnc_rx_scheduled;/* being progressed */ + __u8 ksnc_rx_state; /* what is being read */ + int ksnc_rx_nob_left; /* # bytes to next hdr/body */ + int ksnc_rx_nob_wanted; /* bytes actually wanted */ + int ksnc_rx_niov; /* # iovec frags */ + struct kvec *ksnc_rx_iov; /* the iovec frags */ + int ksnc_rx_nkiov; /* # page frags */ + lnet_kiov_t *ksnc_rx_kiov; /* the page frags */ + ksock_rxiovspace_t ksnc_rx_iov_space;/* space for frag descriptors */ + __u32 ksnc_rx_csum; /* partial checksum for incoming data */ + void *ksnc_cookie; /* rx lnet_finalize passthru arg */ + ksock_msg_t ksnc_msg; /* incoming message buffer: + * V2.x message takes the + * whole struct + * V1.x message is a bare + * lnet_hdr_t, it's stored in + * ksnc_msg.ksm_u.lnetmsg */ + + /* WRITER */ + struct list_head ksnc_tx_list; /* where I enq waiting for output space */ + struct list_head 
ksnc_tx_queue; /* packets waiting to be sent */ + ksock_tx_t *ksnc_tx_carrier; /* next TX that can carry a LNet message or ZC-ACK */ + unsigned long ksnc_tx_deadline; /* when (in jiffies) tx times out */ + int ksnc_tx_bufnob; /* send buffer marker */ + atomic_t ksnc_tx_nob; /* # bytes queued */ + int ksnc_tx_ready; /* write space */ + int ksnc_tx_scheduled; /* being progressed */ + unsigned long ksnc_tx_last_post; /* time stamp of the last posted TX */ +} ksock_conn_t; + +typedef struct ksock_route { + struct list_head ksnr_list; /* chain on peer route list */ + struct list_head ksnr_connd_list; /* chain on ksnr_connd_routes */ + struct ksock_peer *ksnr_peer; /* owning peer */ + atomic_t ksnr_refcount; /* # users */ + unsigned long ksnr_timeout; /* when (in jiffies) reconnection can happen next */ + long ksnr_retry_interval; /* how long between retries */ + __u32 ksnr_myipaddr; /* my IP */ + __u32 ksnr_ipaddr; /* IP address to connect to */ + int ksnr_port; /* port to connect to */ + unsigned int ksnr_scheduled:1; /* scheduled for attention */ + unsigned int ksnr_connecting:1;/* connection establishment in progress */ + unsigned int ksnr_connected:4; /* connections established by type */ + unsigned int ksnr_deleted:1; /* been removed from peer? */ + unsigned int ksnr_share_count; /* created explicitly? */ + int ksnr_conn_count; /* # conns established by this route */ +} ksock_route_t; + +#define SOCKNAL_KEEPALIVE_PING 1 /* cookie for keepalive ping */ + +typedef struct ksock_peer { + struct list_head ksnp_list; /* stash on global peer list */ + unsigned long ksnp_last_alive; /* when (in jiffies) I was last alive */ + lnet_process_id_t ksnp_id; /* who's on the other end(s) */ + atomic_t ksnp_refcount; /* # users */ + int ksnp_sharecount; /* lconf usage counter */ + int ksnp_closing; /* being closed */ + int ksnp_accepting;/* # passive connections pending */ + int ksnp_error; /* errno on closing last conn */ + __u64 ksnp_zc_next_cookie;/* ZC completion cookie */ + __u64 ksnp_incarnation; /* latest known peer incarnation */ + struct ksock_proto *ksnp_proto; /* latest known peer protocol */ + struct list_head ksnp_conns; /* all active connections */ + struct list_head ksnp_routes; /* routes */ + struct list_head ksnp_tx_queue; /* waiting packets */ + spinlock_t ksnp_lock; /* serialize, g_lock unsafe */ + struct list_head ksnp_zc_req_list; /* zero copy requests wait for ACK */ + unsigned long ksnp_send_keepalive; /* time to send keepalive */ + lnet_ni_t *ksnp_ni; /* which network */ + int ksnp_n_passive_ips; /* # of... 
*/ + __u32 ksnp_passive_ips[LNET_MAX_INTERFACES]; /* preferred local interfaces */ +} ksock_peer_t; + +typedef struct ksock_connreq { + struct list_head ksncr_list; /* stash on ksnd_connd_connreqs */ + lnet_ni_t *ksncr_ni; /* chosen NI */ + struct socket *ksncr_sock; /* accepted socket */ +} ksock_connreq_t; + +extern ksock_nal_data_t ksocknal_data; +extern ksock_tunables_t ksocknal_tunables; + +#define SOCKNAL_MATCH_NO 0 /* TX can't match type of connection */ +#define SOCKNAL_MATCH_YES 1 /* TX matches type of connection */ +#define SOCKNAL_MATCH_MAY 2 /* TX can be sent on the connection, but not preferred */ + +typedef struct ksock_proto { + int pro_version; /* version number of protocol */ + int (*pro_send_hello)(ksock_conn_t *, ksock_hello_msg_t *); /* handshake function */ + int (*pro_recv_hello)(ksock_conn_t *, ksock_hello_msg_t *, int);/* handshake function */ + void (*pro_pack)(ksock_tx_t *); /* message pack */ + void (*pro_unpack)(ksock_msg_t *); /* message unpack */ + ksock_tx_t *(*pro_queue_tx_msg)(ksock_conn_t *, ksock_tx_t *); /* queue tx on the connection */ + int (*pro_queue_tx_zcack)(ksock_conn_t *, ksock_tx_t *, __u64); /* queue ZC ack on the connection */ + int (*pro_handle_zcreq)(ksock_conn_t *, __u64, int); /* handle ZC request */ + int (*pro_handle_zcack)(ksock_conn_t *, __u64, __u64); /* handle ZC ACK */ + int (*pro_match_tx)(ksock_conn_t *, ksock_tx_t *, int); /* msg type matches the connection type: + * return value: + * return MATCH_NO : no + * return MATCH_YES : matching type + * return MATCH_MAY : can be backup */ +} ksock_proto_t; + +extern ksock_proto_t ksocknal_protocol_v1x; +extern ksock_proto_t ksocknal_protocol_v2x; +extern ksock_proto_t ksocknal_protocol_v3x; + +#define KSOCK_PROTO_V1_MAJOR LNET_PROTO_TCP_VERSION_MAJOR +#define KSOCK_PROTO_V1_MINOR LNET_PROTO_TCP_VERSION_MINOR +#define KSOCK_PROTO_V1 KSOCK_PROTO_V1_MAJOR + +#ifndef CPU_MASK_NONE +#define CPU_MASK_NONE 0UL +#endif + +static inline int +ksocknal_route_mask(void) +{ + if (!*ksocknal_tunables.ksnd_typed_conns) + return (1 << SOCKLND_CONN_ANY); + + return ((1 << SOCKLND_CONN_CONTROL) | + (1 << SOCKLND_CONN_BULK_IN) | + (1 << SOCKLND_CONN_BULK_OUT)); +} + +static inline struct list_head * +ksocknal_nid2peerlist(lnet_nid_t nid) +{ + unsigned int hash = ((unsigned int)nid) % ksocknal_data.ksnd_peer_hash_size; + + return &ksocknal_data.ksnd_peers[hash]; +} + +static inline void +ksocknal_conn_addref(ksock_conn_t *conn) +{ + LASSERT(atomic_read(&conn->ksnc_conn_refcount) > 0); + atomic_inc(&conn->ksnc_conn_refcount); +} + +extern void ksocknal_queue_zombie_conn(ksock_conn_t *conn); +extern void ksocknal_finalize_zcreq(ksock_conn_t *conn); + +static inline void +ksocknal_conn_decref(ksock_conn_t *conn) +{ + LASSERT(atomic_read(&conn->ksnc_conn_refcount) > 0); + if (atomic_dec_and_test(&conn->ksnc_conn_refcount)) + ksocknal_queue_zombie_conn(conn); +} + +static inline int +ksocknal_connsock_addref(ksock_conn_t *conn) +{ + int rc = -ESHUTDOWN; + + read_lock(&ksocknal_data.ksnd_global_lock); + if (!conn->ksnc_closing) { + LASSERT(atomic_read(&conn->ksnc_sock_refcount) > 0); + atomic_inc(&conn->ksnc_sock_refcount); + rc = 0; + } + read_unlock(&ksocknal_data.ksnd_global_lock); + + return rc; +} + +static inline void +ksocknal_connsock_decref(ksock_conn_t *conn) +{ + LASSERT(atomic_read(&conn->ksnc_sock_refcount) > 0); + if (atomic_dec_and_test(&conn->ksnc_sock_refcount)) { + LASSERT(conn->ksnc_closing); + libcfs_sock_release(conn->ksnc_sock); + conn->ksnc_sock = NULL; + ksocknal_finalize_zcreq(conn); + } 
+} + +static inline void +ksocknal_tx_addref(ksock_tx_t *tx) +{ + LASSERT(atomic_read(&tx->tx_refcount) > 0); + atomic_inc(&tx->tx_refcount); +} + +extern void ksocknal_tx_prep(ksock_conn_t *, ksock_tx_t *tx); +extern void ksocknal_tx_done(lnet_ni_t *ni, ksock_tx_t *tx); + +static inline void +ksocknal_tx_decref(ksock_tx_t *tx) +{ + LASSERT(atomic_read(&tx->tx_refcount) > 0); + if (atomic_dec_and_test(&tx->tx_refcount)) + ksocknal_tx_done(NULL, tx); +} + +static inline void +ksocknal_route_addref(ksock_route_t *route) +{ + LASSERT(atomic_read(&route->ksnr_refcount) > 0); + atomic_inc(&route->ksnr_refcount); +} + +extern void ksocknal_destroy_route(ksock_route_t *route); + +static inline void +ksocknal_route_decref(ksock_route_t *route) +{ + LASSERT(atomic_read(&route->ksnr_refcount) > 0); + if (atomic_dec_and_test(&route->ksnr_refcount)) + ksocknal_destroy_route(route); +} + +static inline void +ksocknal_peer_addref(ksock_peer_t *peer) +{ + LASSERT(atomic_read(&peer->ksnp_refcount) > 0); + atomic_inc(&peer->ksnp_refcount); +} + +extern void ksocknal_destroy_peer(ksock_peer_t *peer); + +static inline void +ksocknal_peer_decref(ksock_peer_t *peer) +{ + LASSERT(atomic_read(&peer->ksnp_refcount) > 0); + if (atomic_dec_and_test(&peer->ksnp_refcount)) + ksocknal_destroy_peer(peer); +} + +int ksocknal_startup(lnet_ni_t *ni); +void ksocknal_shutdown(lnet_ni_t *ni); +int ksocknal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg); +int ksocknal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg); +int ksocknal_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, + int delayed, unsigned int niov, + struct kvec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen); +int ksocknal_accept(lnet_ni_t *ni, struct socket *sock); + +extern int ksocknal_add_peer(lnet_ni_t *ni, lnet_process_id_t id, __u32 ip, int port); +extern ksock_peer_t *ksocknal_find_peer_locked(lnet_ni_t *ni, lnet_process_id_t id); +extern ksock_peer_t *ksocknal_find_peer(lnet_ni_t *ni, lnet_process_id_t id); +extern void ksocknal_peer_failed(ksock_peer_t *peer); +extern int ksocknal_create_conn(lnet_ni_t *ni, ksock_route_t *route, + struct socket *sock, int type); +extern void ksocknal_close_conn_locked(ksock_conn_t *conn, int why); +extern void ksocknal_terminate_conn(ksock_conn_t *conn); +extern void ksocknal_destroy_conn(ksock_conn_t *conn); +extern int ksocknal_close_peer_conns_locked(ksock_peer_t *peer, + __u32 ipaddr, int why); +extern int ksocknal_close_conn_and_siblings(ksock_conn_t *conn, int why); +extern int ksocknal_close_matching_conns(lnet_process_id_t id, __u32 ipaddr); +extern ksock_conn_t *ksocknal_find_conn_locked(ksock_peer_t *peer, + ksock_tx_t *tx, int nonblk); + +extern int ksocknal_launch_packet(lnet_ni_t *ni, ksock_tx_t *tx, + lnet_process_id_t id); +extern ksock_tx_t *ksocknal_alloc_tx(int type, int size); +extern void ksocknal_free_tx(ksock_tx_t *tx); +extern ksock_tx_t *ksocknal_alloc_tx_noop(__u64 cookie, int nonblk); +extern void ksocknal_next_tx_carrier(ksock_conn_t *conn); +extern void ksocknal_queue_tx_locked(ksock_tx_t *tx, ksock_conn_t *conn); +extern void ksocknal_txlist_done(lnet_ni_t *ni, struct list_head *txlist, + int error); +extern void ksocknal_notify(lnet_ni_t *ni, lnet_nid_t gw_nid, int alive); +extern void ksocknal_query(struct lnet_ni *ni, lnet_nid_t nid, unsigned long *when); +extern int ksocknal_thread_start(int (*fn)(void *arg), void *arg, char *name); +extern void ksocknal_thread_fini(void); +extern void 
ksocknal_launch_all_connections_locked(ksock_peer_t *peer); +extern ksock_route_t *ksocknal_find_connectable_route_locked(ksock_peer_t *peer); +extern ksock_route_t *ksocknal_find_connecting_route_locked(ksock_peer_t *peer); +extern int ksocknal_new_packet(ksock_conn_t *conn, int skip); +extern int ksocknal_scheduler(void *arg); +extern int ksocknal_connd(void *arg); +extern int ksocknal_reaper(void *arg); +extern int ksocknal_send_hello(lnet_ni_t *ni, ksock_conn_t *conn, + lnet_nid_t peer_nid, ksock_hello_msg_t *hello); +extern int ksocknal_recv_hello(lnet_ni_t *ni, ksock_conn_t *conn, + ksock_hello_msg_t *hello, lnet_process_id_t *id, + __u64 *incarnation); +extern void ksocknal_read_callback(ksock_conn_t *conn); +extern void ksocknal_write_callback(ksock_conn_t *conn); + +extern int ksocknal_lib_zc_capable(ksock_conn_t *conn); +extern void ksocknal_lib_save_callback(struct socket *sock, ksock_conn_t *conn); +extern void ksocknal_lib_set_callback(struct socket *sock, ksock_conn_t *conn); +extern void ksocknal_lib_reset_callback(struct socket *sock, ksock_conn_t *conn); +extern void ksocknal_lib_push_conn(ksock_conn_t *conn); +extern int ksocknal_lib_get_conn_addrs(ksock_conn_t *conn); +extern int ksocknal_lib_setup_sock(struct socket *so); +extern int ksocknal_lib_send_iov(ksock_conn_t *conn, ksock_tx_t *tx); +extern int ksocknal_lib_send_kiov(ksock_conn_t *conn, ksock_tx_t *tx); +extern void ksocknal_lib_eager_ack(ksock_conn_t *conn); +extern int ksocknal_lib_recv_iov(ksock_conn_t *conn); +extern int ksocknal_lib_recv_kiov(ksock_conn_t *conn); +extern int ksocknal_lib_get_conn_tunables(ksock_conn_t *conn, int *txmem, + int *rxmem, int *nagle); + +extern int ksocknal_tunables_init(void); + +extern void ksocknal_lib_csum_tx(ksock_tx_t *tx); + +extern int ksocknal_lib_memory_pressure(ksock_conn_t *conn); +extern int ksocknal_lib_bind_thread_to_cpu(int id); diff --git a/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c b/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c new file mode 100644 index 000000000..fa7ad883b --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c @@ -0,0 +1,2634 @@ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2012, Intel Corporation. + * + * Author: Zach Brown + * Author: Peter J. Braam + * Author: Phil Schwan + * Author: Eric Barton + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include "socklnd.h" + +ksock_tx_t * +ksocknal_alloc_tx(int type, int size) +{ + ksock_tx_t *tx = NULL; + + if (type == KSOCK_MSG_NOOP) { + LASSERT(size == KSOCK_NOOP_TX_SIZE); + + /* searching for a noop tx in free list */ + spin_lock(&ksocknal_data.ksnd_tx_lock); + + if (!list_empty(&ksocknal_data.ksnd_idle_noop_txs)) { + tx = list_entry(ksocknal_data.ksnd_idle_noop_txs. 
\ + next, ksock_tx_t, tx_list); + LASSERT(tx->tx_desc_size == size); + list_del(&tx->tx_list); + } + + spin_unlock(&ksocknal_data.ksnd_tx_lock); + } + + if (tx == NULL) + LIBCFS_ALLOC(tx, size); + + if (tx == NULL) + return NULL; + + atomic_set(&tx->tx_refcount, 1); + tx->tx_zc_aborted = 0; + tx->tx_zc_capable = 0; + tx->tx_zc_checked = 0; + tx->tx_desc_size = size; + + atomic_inc(&ksocknal_data.ksnd_nactive_txs); + + return tx; +} + +ksock_tx_t * +ksocknal_alloc_tx_noop(__u64 cookie, int nonblk) +{ + ksock_tx_t *tx; + + tx = ksocknal_alloc_tx(KSOCK_MSG_NOOP, KSOCK_NOOP_TX_SIZE); + if (tx == NULL) { + CERROR("Can't allocate noop tx desc\n"); + return NULL; + } + + tx->tx_conn = NULL; + tx->tx_lnetmsg = NULL; + tx->tx_kiov = NULL; + tx->tx_nkiov = 0; + tx->tx_iov = tx->tx_frags.virt.iov; + tx->tx_niov = 1; + tx->tx_nonblk = nonblk; + + socklnd_init_msg(&tx->tx_msg, KSOCK_MSG_NOOP); + tx->tx_msg.ksm_zc_cookies[1] = cookie; + + return tx; +} + + +void +ksocknal_free_tx (ksock_tx_t *tx) +{ + atomic_dec(&ksocknal_data.ksnd_nactive_txs); + + if (tx->tx_lnetmsg == NULL && tx->tx_desc_size == KSOCK_NOOP_TX_SIZE) { + /* it's a noop tx */ + spin_lock(&ksocknal_data.ksnd_tx_lock); + + list_add(&tx->tx_list, &ksocknal_data.ksnd_idle_noop_txs); + + spin_unlock(&ksocknal_data.ksnd_tx_lock); + } else { + LIBCFS_FREE(tx, tx->tx_desc_size); + } +} + +static int +ksocknal_send_iov (ksock_conn_t *conn, ksock_tx_t *tx) +{ + struct kvec *iov = tx->tx_iov; + int nob; + int rc; + + LASSERT (tx->tx_niov > 0); + + /* Never touch tx->tx_iov inside ksocknal_lib_send_iov() */ + rc = ksocknal_lib_send_iov(conn, tx); + + if (rc <= 0) /* sent nothing? */ + return rc; + + nob = rc; + LASSERT (nob <= tx->tx_resid); + tx->tx_resid -= nob; + + /* "consume" iov */ + do { + LASSERT (tx->tx_niov > 0); + + if (nob < (int) iov->iov_len) { + iov->iov_base = (void *)((char *)iov->iov_base + nob); + iov->iov_len -= nob; + return rc; + } + + nob -= iov->iov_len; + tx->tx_iov = ++iov; + tx->tx_niov--; + } while (nob != 0); + + return rc; +} + +static int +ksocknal_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx) +{ + lnet_kiov_t *kiov = tx->tx_kiov; + int nob; + int rc; + + LASSERT (tx->tx_niov == 0); + LASSERT (tx->tx_nkiov > 0); + + /* Never touch tx->tx_kiov inside ksocknal_lib_send_kiov() */ + rc = ksocknal_lib_send_kiov(conn, tx); + + if (rc <= 0) /* sent nothing? */ + return rc; + + nob = rc; + LASSERT (nob <= tx->tx_resid); + tx->tx_resid -= nob; + + /* "consume" kiov */ + do { + LASSERT(tx->tx_nkiov > 0); + + if (nob < (int)kiov->kiov_len) { + kiov->kiov_offset += nob; + kiov->kiov_len -= nob; + return rc; + } + + nob -= (int)kiov->kiov_len; + tx->tx_kiov = ++kiov; + tx->tx_nkiov--; + } while (nob != 0); + + return rc; +} + +static int +ksocknal_transmit (ksock_conn_t *conn, ksock_tx_t *tx) +{ + int rc; + int bufnob; + + if (ksocknal_data.ksnd_stall_tx != 0) { + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(ksocknal_data.ksnd_stall_tx)); + } + + LASSERT (tx->tx_resid != 0); + + rc = ksocknal_connsock_addref(conn); + if (rc != 0) { + LASSERT (conn->ksnc_closing); + return -ESHUTDOWN; + } + + do { + if (ksocknal_data.ksnd_enomem_tx > 0) { + /* testing... */ + ksocknal_data.ksnd_enomem_tx--; + rc = -EAGAIN; + } else if (tx->tx_niov != 0) { + rc = ksocknal_send_iov (conn, tx); + } else { + rc = ksocknal_send_kiov (conn, tx); + } + + bufnob = conn->ksnc_sock->sk->sk_wmem_queued; + if (rc > 0) /* sent something? 
*/ + conn->ksnc_tx_bufnob += rc; /* account it */ + + if (bufnob < conn->ksnc_tx_bufnob) { + /* allocated send buffer bytes < computed; infer + * something got ACKed */ + conn->ksnc_tx_deadline = + cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + conn->ksnc_peer->ksnp_last_alive = cfs_time_current(); + conn->ksnc_tx_bufnob = bufnob; + mb(); + } + + if (rc <= 0) { /* Didn't write anything? */ + + if (rc == 0) /* some stacks return 0 instead of -EAGAIN */ + rc = -EAGAIN; + + /* Check if EAGAIN is due to memory pressure */ + if (rc == -EAGAIN && ksocknal_lib_memory_pressure(conn)) + rc = -ENOMEM; + + break; + } + + /* socket's wmem_queued now includes 'rc' bytes */ + atomic_sub (rc, &conn->ksnc_tx_nob); + rc = 0; + + } while (tx->tx_resid != 0); + + ksocknal_connsock_decref(conn); + return rc; +} + +static int +ksocknal_recv_iov (ksock_conn_t *conn) +{ + struct kvec *iov = conn->ksnc_rx_iov; + int nob; + int rc; + + LASSERT (conn->ksnc_rx_niov > 0); + + /* Never touch conn->ksnc_rx_iov or change connection + * status inside ksocknal_lib_recv_iov */ + rc = ksocknal_lib_recv_iov(conn); + + if (rc <= 0) + return rc; + + /* received something... */ + nob = rc; + + conn->ksnc_peer->ksnp_last_alive = cfs_time_current(); + conn->ksnc_rx_deadline = + cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + mb(); /* order with setting rx_started */ + conn->ksnc_rx_started = 1; + + conn->ksnc_rx_nob_wanted -= nob; + conn->ksnc_rx_nob_left -= nob; + + do { + LASSERT (conn->ksnc_rx_niov > 0); + + if (nob < (int)iov->iov_len) { + iov->iov_len -= nob; + iov->iov_base += nob; + return -EAGAIN; + } + + nob -= iov->iov_len; + conn->ksnc_rx_iov = ++iov; + conn->ksnc_rx_niov--; + } while (nob != 0); + + return rc; +} + +static int +ksocknal_recv_kiov (ksock_conn_t *conn) +{ + lnet_kiov_t *kiov = conn->ksnc_rx_kiov; + int nob; + int rc; + LASSERT (conn->ksnc_rx_nkiov > 0); + + /* Never touch conn->ksnc_rx_kiov or change connection + * status inside ksocknal_lib_recv_iov */ + rc = ksocknal_lib_recv_kiov(conn); + + if (rc <= 0) + return rc; + + /* received something... */ + nob = rc; + + conn->ksnc_peer->ksnp_last_alive = cfs_time_current(); + conn->ksnc_rx_deadline = + cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + mb(); /* order with setting rx_started */ + conn->ksnc_rx_started = 1; + + conn->ksnc_rx_nob_wanted -= nob; + conn->ksnc_rx_nob_left -= nob; + + do { + LASSERT (conn->ksnc_rx_nkiov > 0); + + if (nob < (int) kiov->kiov_len) { + kiov->kiov_offset += nob; + kiov->kiov_len -= nob; + return -EAGAIN; + } + + nob -= kiov->kiov_len; + conn->ksnc_rx_kiov = ++kiov; + conn->ksnc_rx_nkiov--; + } while (nob != 0); + + return 1; +} + +static int +ksocknal_receive (ksock_conn_t *conn) +{ + /* Return 1 on success, 0 on EOF, < 0 on error. + * Caller checks ksnc_rx_nob_wanted to determine + * progress/completion. 
*/ + int rc; + + if (ksocknal_data.ksnd_stall_rx != 0) { + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(ksocknal_data.ksnd_stall_rx)); + } + + rc = ksocknal_connsock_addref(conn); + if (rc != 0) { + LASSERT (conn->ksnc_closing); + return -ESHUTDOWN; + } + + for (;;) { + if (conn->ksnc_rx_niov != 0) + rc = ksocknal_recv_iov (conn); + else + rc = ksocknal_recv_kiov (conn); + + if (rc <= 0) { + /* error/EOF or partial receive */ + if (rc == -EAGAIN) { + rc = 1; + } else if (rc == 0 && conn->ksnc_rx_started) { + /* EOF in the middle of a message */ + rc = -EPROTO; + } + break; + } + + /* Completed a fragment */ + + if (conn->ksnc_rx_nob_wanted == 0) { + rc = 1; + break; + } + } + + ksocknal_connsock_decref(conn); + return rc; +} + +void +ksocknal_tx_done (lnet_ni_t *ni, ksock_tx_t *tx) +{ + lnet_msg_t *lnetmsg = tx->tx_lnetmsg; + int rc = (tx->tx_resid == 0 && !tx->tx_zc_aborted) ? 0 : -EIO; + + LASSERT(ni != NULL || tx->tx_conn != NULL); + + if (tx->tx_conn != NULL) + ksocknal_conn_decref(tx->tx_conn); + + if (ni == NULL && tx->tx_conn != NULL) + ni = tx->tx_conn->ksnc_peer->ksnp_ni; + + ksocknal_free_tx (tx); + if (lnetmsg != NULL) /* KSOCK_MSG_NOOP go without lnetmsg */ + lnet_finalize (ni, lnetmsg, rc); +} + +void +ksocknal_txlist_done (lnet_ni_t *ni, struct list_head *txlist, int error) +{ + ksock_tx_t *tx; + + while (!list_empty (txlist)) { + tx = list_entry (txlist->next, ksock_tx_t, tx_list); + + if (error && tx->tx_lnetmsg != NULL) { + CNETERR("Deleting packet type %d len %d %s->%s\n", + le32_to_cpu (tx->tx_lnetmsg->msg_hdr.type), + le32_to_cpu (tx->tx_lnetmsg->msg_hdr.payload_length), + libcfs_nid2str(le64_to_cpu(tx->tx_lnetmsg->msg_hdr.src_nid)), + libcfs_nid2str(le64_to_cpu(tx->tx_lnetmsg->msg_hdr.dest_nid))); + } else if (error) { + CNETERR("Deleting noop packet\n"); + } + + list_del (&tx->tx_list); + + LASSERT (atomic_read(&tx->tx_refcount) == 1); + ksocknal_tx_done (ni, tx); + } +} + +static void +ksocknal_check_zc_req(ksock_tx_t *tx) +{ + ksock_conn_t *conn = tx->tx_conn; + ksock_peer_t *peer = conn->ksnc_peer; + + /* Set tx_msg.ksm_zc_cookies[0] to a unique non-zero cookie and add tx + * to ksnp_zc_req_list if some fragment of this message should be sent + * zero-copy. Our peer will send an ACK containing this cookie when + * she has received this message to tell us we can signal completion. + * tx_msg.ksm_zc_cookies[0] remains non-zero while tx is on + * ksnp_zc_req_list. */ + LASSERT (tx->tx_msg.ksm_type != KSOCK_MSG_NOOP); + LASSERT (tx->tx_zc_capable); + + tx->tx_zc_checked = 1; + + if (conn->ksnc_proto == &ksocknal_protocol_v1x || + !conn->ksnc_zc_capable) + return; + + /* assign cookie and queue tx to pending list, it will be released when + * a matching ack is received. 
See ksocknal_handle_zcack() */ + + ksocknal_tx_addref(tx); + + spin_lock(&peer->ksnp_lock); + + /* ZC_REQ is going to be pinned to the peer */ + tx->tx_deadline = + cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + + LASSERT (tx->tx_msg.ksm_zc_cookies[0] == 0); + + tx->tx_msg.ksm_zc_cookies[0] = peer->ksnp_zc_next_cookie++; + + if (peer->ksnp_zc_next_cookie == 0) + peer->ksnp_zc_next_cookie = SOCKNAL_KEEPALIVE_PING + 1; + + list_add_tail(&tx->tx_zc_list, &peer->ksnp_zc_req_list); + + spin_unlock(&peer->ksnp_lock); +} + +static void +ksocknal_uncheck_zc_req(ksock_tx_t *tx) +{ + ksock_peer_t *peer = tx->tx_conn->ksnc_peer; + + LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP); + LASSERT(tx->tx_zc_capable); + + tx->tx_zc_checked = 0; + + spin_lock(&peer->ksnp_lock); + + if (tx->tx_msg.ksm_zc_cookies[0] == 0) { + /* Not waiting for an ACK */ + spin_unlock(&peer->ksnp_lock); + return; + } + + tx->tx_msg.ksm_zc_cookies[0] = 0; + list_del(&tx->tx_zc_list); + + spin_unlock(&peer->ksnp_lock); + + ksocknal_tx_decref(tx); +} + +static int +ksocknal_process_transmit (ksock_conn_t *conn, ksock_tx_t *tx) +{ + int rc; + + if (tx->tx_zc_capable && !tx->tx_zc_checked) + ksocknal_check_zc_req(tx); + + rc = ksocknal_transmit (conn, tx); + + CDEBUG (D_NET, "send(%d) %d\n", tx->tx_resid, rc); + + if (tx->tx_resid == 0) { + /* Sent everything OK */ + LASSERT (rc == 0); + + return 0; + } + + if (rc == -EAGAIN) + return rc; + + if (rc == -ENOMEM) { + static int counter; + + counter++; /* exponential backoff warnings */ + if ((counter & (-counter)) == counter) + CWARN("%u ENOMEM tx %p (%u allocated)\n", + counter, conn, atomic_read(&libcfs_kmemory)); + + /* Queue on ksnd_enomem_conns for retry after a timeout */ + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + + /* enomem list takes over scheduler's ref... */ + LASSERT (conn->ksnc_tx_scheduled); + list_add_tail(&conn->ksnc_tx_list, + &ksocknal_data.ksnd_enomem_conns); + if (!cfs_time_aftereq(cfs_time_add(cfs_time_current(), + SOCKNAL_ENOMEM_RETRY), + ksocknal_data.ksnd_reaper_waketime)) + wake_up (&ksocknal_data.ksnd_reaper_waitq); + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); + return rc; + } + + /* Actual error */ + LASSERT (rc < 0); + + if (!conn->ksnc_closing) { + switch (rc) { + case -ECONNRESET: + LCONSOLE_WARN("Host %pI4h reset our connection while we were sending data; it may have rebooted.\n", + &conn->ksnc_ipaddr); + break; + default: + LCONSOLE_WARN("There was an unexpected network error while writing to %pI4h: %d.\n", + &conn->ksnc_ipaddr, rc); + break; + } + CDEBUG(D_NET, "[%p] Error %d on write to %s ip %pI4h:%d\n", + conn, rc, + libcfs_id2str(conn->ksnc_peer->ksnp_id), + &conn->ksnc_ipaddr, + conn->ksnc_port); + } + + if (tx->tx_zc_checked) + ksocknal_uncheck_zc_req(tx); + + /* it's not an error if conn is being closed */ + ksocknal_close_conn_and_siblings (conn, + (conn->ksnc_closing) ? 
0 : rc); + + return rc; +} + +static void +ksocknal_launch_connection_locked (ksock_route_t *route) +{ + + /* called holding write lock on ksnd_global_lock */ + + LASSERT (!route->ksnr_scheduled); + LASSERT (!route->ksnr_connecting); + LASSERT ((ksocknal_route_mask() & ~route->ksnr_connected) != 0); + + route->ksnr_scheduled = 1; /* scheduling conn for connd */ + ksocknal_route_addref(route); /* extra ref for connd */ + + spin_lock_bh(&ksocknal_data.ksnd_connd_lock); + + list_add_tail(&route->ksnr_connd_list, + &ksocknal_data.ksnd_connd_routes); + wake_up(&ksocknal_data.ksnd_connd_waitq); + + spin_unlock_bh(&ksocknal_data.ksnd_connd_lock); +} + +void +ksocknal_launch_all_connections_locked (ksock_peer_t *peer) +{ + ksock_route_t *route; + + /* called holding write lock on ksnd_global_lock */ + for (;;) { + /* launch any/all connections that need it */ + route = ksocknal_find_connectable_route_locked(peer); + if (route == NULL) + return; + + ksocknal_launch_connection_locked(route); + } +} + +ksock_conn_t * +ksocknal_find_conn_locked(ksock_peer_t *peer, ksock_tx_t *tx, int nonblk) +{ + struct list_head *tmp; + ksock_conn_t *conn; + ksock_conn_t *typed = NULL; + ksock_conn_t *fallback = NULL; + int tnob = 0; + int fnob = 0; + + list_for_each (tmp, &peer->ksnp_conns) { + ksock_conn_t *c = list_entry(tmp, ksock_conn_t, ksnc_list); + int nob = atomic_read(&c->ksnc_tx_nob) + + c->ksnc_sock->sk->sk_wmem_queued; + int rc; + + LASSERT (!c->ksnc_closing); + LASSERT (c->ksnc_proto != NULL && + c->ksnc_proto->pro_match_tx != NULL); + + rc = c->ksnc_proto->pro_match_tx(c, tx, nonblk); + + switch (rc) { + default: + LBUG(); + case SOCKNAL_MATCH_NO: /* protocol rejected the tx */ + continue; + + case SOCKNAL_MATCH_YES: /* typed connection */ + if (typed == NULL || tnob > nob || + (tnob == nob && *ksocknal_tunables.ksnd_round_robin && + cfs_time_after(typed->ksnc_tx_last_post, c->ksnc_tx_last_post))) { + typed = c; + tnob = nob; + } + break; + + case SOCKNAL_MATCH_MAY: /* fallback connection */ + if (fallback == NULL || fnob > nob || + (fnob == nob && *ksocknal_tunables.ksnd_round_robin && + cfs_time_after(fallback->ksnc_tx_last_post, c->ksnc_tx_last_post))) { + fallback = c; + fnob = nob; + } + break; + } + } + + /* prefer the typed selection */ + conn = (typed != NULL) ? typed : fallback; + + if (conn != NULL) + conn->ksnc_tx_last_post = cfs_time_current(); + + return conn; +} + +void +ksocknal_tx_prep(ksock_conn_t *conn, ksock_tx_t *tx) +{ + conn->ksnc_proto->pro_pack(tx); + + atomic_add (tx->tx_nob, &conn->ksnc_tx_nob); + ksocknal_conn_addref(conn); /* +1 ref for tx */ + tx->tx_conn = conn; +} + +void +ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn) +{ + ksock_sched_t *sched = conn->ksnc_scheduler; + ksock_msg_t *msg = &tx->tx_msg; + ksock_tx_t *ztx = NULL; + int bufnob = 0; + + /* called holding global lock (read or irq-write) and caller may + * not have dropped this lock between finding conn and calling me, + * so we don't need the {get,put}connsock dance to deref + * ksnc_sock... */ + LASSERT(!conn->ksnc_closing); + + CDEBUG(D_NET, "Sending to %s ip %pI4h:%d\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + &conn->ksnc_ipaddr, + conn->ksnc_port); + + ksocknal_tx_prep(conn, tx); + + /* Ensure the frags we've been given EXACTLY match the number of + * bytes we want to send. Many TCP/IP stacks disregard any total + * size parameters passed to them and just look at the frags. + * + * We always expect at least 1 mapped fragment containing the + * complete ksocknal message header. 
*/ + LASSERT (lnet_iov_nob (tx->tx_niov, tx->tx_iov) + + lnet_kiov_nob(tx->tx_nkiov, tx->tx_kiov) == + (unsigned int)tx->tx_nob); + LASSERT (tx->tx_niov >= 1); + LASSERT (tx->tx_resid == tx->tx_nob); + + CDEBUG (D_NET, "Packet %p type %d, nob %d niov %d nkiov %d\n", + tx, (tx->tx_lnetmsg != NULL) ? tx->tx_lnetmsg->msg_hdr.type: + KSOCK_MSG_NOOP, + tx->tx_nob, tx->tx_niov, tx->tx_nkiov); + + /* + * FIXME: SOCK_WMEM_QUEUED and SOCK_ERROR could block in __DARWIN8__ + * but they're used inside spinlocks a lot. + */ + bufnob = conn->ksnc_sock->sk->sk_wmem_queued; + spin_lock_bh(&sched->kss_lock); + + if (list_empty(&conn->ksnc_tx_queue) && bufnob == 0) { + /* First packet starts the timeout */ + conn->ksnc_tx_deadline = + cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + if (conn->ksnc_tx_bufnob > 0) /* something got ACKed */ + conn->ksnc_peer->ksnp_last_alive = cfs_time_current(); + conn->ksnc_tx_bufnob = 0; + mb(); /* order with adding to tx_queue */ + } + + if (msg->ksm_type == KSOCK_MSG_NOOP) { + /* The packet is noop ZC ACK, try to piggyback the ack_cookie + * on a normal packet so I don't need to send it */ + LASSERT (msg->ksm_zc_cookies[1] != 0); + LASSERT (conn->ksnc_proto->pro_queue_tx_zcack != NULL); + + if (conn->ksnc_proto->pro_queue_tx_zcack(conn, tx, 0)) + ztx = tx; /* ZC ACK piggybacked on ztx release tx later */ + + } else { + /* It's a normal packet - can it piggback a noop zc-ack that + * has been queued already? */ + LASSERT (msg->ksm_zc_cookies[1] == 0); + LASSERT (conn->ksnc_proto->pro_queue_tx_msg != NULL); + + ztx = conn->ksnc_proto->pro_queue_tx_msg(conn, tx); + /* ztx will be released later */ + } + + if (ztx != NULL) { + atomic_sub (ztx->tx_nob, &conn->ksnc_tx_nob); + list_add_tail(&ztx->tx_list, &sched->kss_zombie_noop_txs); + } + + if (conn->ksnc_tx_ready && /* able to send */ + !conn->ksnc_tx_scheduled) { /* not scheduled to send */ + /* +1 ref for scheduler */ + ksocknal_conn_addref(conn); + list_add_tail (&conn->ksnc_tx_list, + &sched->kss_tx_conns); + conn->ksnc_tx_scheduled = 1; + wake_up (&sched->kss_waitq); + } + + spin_unlock_bh(&sched->kss_lock); +} + + +ksock_route_t * +ksocknal_find_connectable_route_locked (ksock_peer_t *peer) +{ + unsigned long now = cfs_time_current(); + struct list_head *tmp; + ksock_route_t *route; + + list_for_each (tmp, &peer->ksnp_routes) { + route = list_entry (tmp, ksock_route_t, ksnr_list); + + LASSERT (!route->ksnr_connecting || route->ksnr_scheduled); + + if (route->ksnr_scheduled) /* connections being established */ + continue; + + /* all route types connected ? 
*/ + if ((ksocknal_route_mask() & ~route->ksnr_connected) == 0) + continue; + + if (!(route->ksnr_retry_interval == 0 || /* first attempt */ + cfs_time_aftereq(now, route->ksnr_timeout))) { + CDEBUG(D_NET, + "Too soon to retry route %pI4h (cnted %d, interval %ld, %ld secs later)\n", + &route->ksnr_ipaddr, + route->ksnr_connected, + route->ksnr_retry_interval, + cfs_duration_sec(route->ksnr_timeout - now)); + continue; + } + + return route; + } + + return NULL; +} + +ksock_route_t * +ksocknal_find_connecting_route_locked (ksock_peer_t *peer) +{ + struct list_head *tmp; + ksock_route_t *route; + + list_for_each (tmp, &peer->ksnp_routes) { + route = list_entry (tmp, ksock_route_t, ksnr_list); + + LASSERT (!route->ksnr_connecting || route->ksnr_scheduled); + + if (route->ksnr_scheduled) + return route; + } + + return NULL; +} + +int +ksocknal_launch_packet (lnet_ni_t *ni, ksock_tx_t *tx, lnet_process_id_t id) +{ + ksock_peer_t *peer; + ksock_conn_t *conn; + rwlock_t *g_lock; + int retry; + int rc; + + LASSERT (tx->tx_conn == NULL); + + g_lock = &ksocknal_data.ksnd_global_lock; + + for (retry = 0;; retry = 1) { + read_lock(g_lock); + peer = ksocknal_find_peer_locked(ni, id); + if (peer != NULL) { + if (ksocknal_find_connectable_route_locked(peer) == NULL) { + conn = ksocknal_find_conn_locked(peer, tx, tx->tx_nonblk); + if (conn != NULL) { + /* I've got no routes that need to be + * connecting and I do have an actual + * connection... */ + ksocknal_queue_tx_locked (tx, conn); + read_unlock(g_lock); + return 0; + } + } + } + + /* I'll need a write lock... */ + read_unlock(g_lock); + + write_lock_bh(g_lock); + + peer = ksocknal_find_peer_locked(ni, id); + if (peer != NULL) + break; + + write_unlock_bh(g_lock); + + if ((id.pid & LNET_PID_USERFLAG) != 0) { + CERROR("Refusing to create a connection to userspace process %s\n", + libcfs_id2str(id)); + return -EHOSTUNREACH; + } + + if (retry) { + CERROR("Can't find peer %s\n", libcfs_id2str(id)); + return -EHOSTUNREACH; + } + + rc = ksocknal_add_peer(ni, id, + LNET_NIDADDR(id.nid), + lnet_acceptor_port()); + if (rc != 0) { + CERROR("Can't add peer %s: %d\n", + libcfs_id2str(id), rc); + return rc; + } + } + + ksocknal_launch_all_connections_locked(peer); + + conn = ksocknal_find_conn_locked(peer, tx, tx->tx_nonblk); + if (conn != NULL) { + /* Connection exists; queue message on it */ + ksocknal_queue_tx_locked (tx, conn); + write_unlock_bh(g_lock); + return 0; + } + + if (peer->ksnp_accepting > 0 || + ksocknal_find_connecting_route_locked (peer) != NULL) { + /* the message is going to be pinned to the peer */ + tx->tx_deadline = + cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + + /* Queue the message until a connection is established */ + list_add_tail (&tx->tx_list, &peer->ksnp_tx_queue); + write_unlock_bh(g_lock); + return 0; + } + + write_unlock_bh(g_lock); + + /* NB Routes may be ignored if connections to them failed recently */ + CNETERR("No usable routes to %s\n", libcfs_id2str(id)); + return -EHOSTUNREACH; +} + +int +ksocknal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) +{ + int mpflag = 1; + int type = lntmsg->msg_type; + lnet_process_id_t target = lntmsg->msg_target; + unsigned int payload_niov = lntmsg->msg_niov; + struct kvec *payload_iov = lntmsg->msg_iov; + lnet_kiov_t *payload_kiov = lntmsg->msg_kiov; + unsigned int payload_offset = lntmsg->msg_offset; + unsigned int payload_nob = lntmsg->msg_len; + ksock_tx_t *tx; + int desc_size; + int rc; + + /* NB 'private' is different depending on what we're sending. + * Just ignore it... 
*/ + + CDEBUG(D_NET, "sending %u bytes in %d frags to %s\n", + payload_nob, payload_niov, libcfs_id2str(target)); + + LASSERT (payload_nob == 0 || payload_niov > 0); + LASSERT (payload_niov <= LNET_MAX_IOV); + /* payload is either all vaddrs or all pages */ + LASSERT (!(payload_kiov != NULL && payload_iov != NULL)); + LASSERT (!in_interrupt ()); + + if (payload_iov != NULL) + desc_size = offsetof(ksock_tx_t, + tx_frags.virt.iov[1 + payload_niov]); + else + desc_size = offsetof(ksock_tx_t, + tx_frags.paged.kiov[payload_niov]); + + if (lntmsg->msg_vmflush) + mpflag = cfs_memory_pressure_get_and_set(); + tx = ksocknal_alloc_tx(KSOCK_MSG_LNET, desc_size); + if (tx == NULL) { + CERROR("Can't allocate tx desc type %d size %d\n", + type, desc_size); + if (lntmsg->msg_vmflush) + cfs_memory_pressure_restore(mpflag); + return -ENOMEM; + } + + tx->tx_conn = NULL; /* set when assigned a conn */ + tx->tx_lnetmsg = lntmsg; + + if (payload_iov != NULL) { + tx->tx_kiov = NULL; + tx->tx_nkiov = 0; + tx->tx_iov = tx->tx_frags.virt.iov; + tx->tx_niov = 1 + + lnet_extract_iov(payload_niov, &tx->tx_iov[1], + payload_niov, payload_iov, + payload_offset, payload_nob); + } else { + tx->tx_niov = 1; + tx->tx_iov = &tx->tx_frags.paged.iov; + tx->tx_kiov = tx->tx_frags.paged.kiov; + tx->tx_nkiov = lnet_extract_kiov(payload_niov, tx->tx_kiov, + payload_niov, payload_kiov, + payload_offset, payload_nob); + + if (payload_nob >= *ksocknal_tunables.ksnd_zc_min_payload) + tx->tx_zc_capable = 1; + } + + socklnd_init_msg(&tx->tx_msg, KSOCK_MSG_LNET); + + /* The first fragment will be set later in pro_pack */ + rc = ksocknal_launch_packet(ni, tx, target); + if (!mpflag) + cfs_memory_pressure_restore(mpflag); + + if (rc == 0) + return 0; + + ksocknal_free_tx(tx); + return -EIO; +} + +int +ksocknal_thread_start(int (*fn)(void *arg), void *arg, char *name) +{ + struct task_struct *task = kthread_run(fn, arg, "%s", name); + + if (IS_ERR(task)) + return PTR_ERR(task); + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + ksocknal_data.ksnd_nthreads++; + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + return 0; +} + +void +ksocknal_thread_fini (void) +{ + write_lock_bh(&ksocknal_data.ksnd_global_lock); + ksocknal_data.ksnd_nthreads--; + write_unlock_bh(&ksocknal_data.ksnd_global_lock); +} + +int +ksocknal_new_packet (ksock_conn_t *conn, int nob_to_skip) +{ + static char ksocknal_slop_buffer[4096]; + + int nob; + unsigned int niov; + int skipped; + + LASSERT(conn->ksnc_proto != NULL); + + if ((*ksocknal_tunables.ksnd_eager_ack & conn->ksnc_type) != 0) { + /* Remind the socket to ack eagerly... 
*/ + ksocknal_lib_eager_ack(conn); + } + + if (nob_to_skip == 0) { /* right at next packet boundary now */ + conn->ksnc_rx_started = 0; + mb(); /* racing with timeout thread */ + + switch (conn->ksnc_proto->pro_version) { + case KSOCK_PROTO_V2: + case KSOCK_PROTO_V3: + conn->ksnc_rx_state = SOCKNAL_RX_KSM_HEADER; + conn->ksnc_rx_iov = (struct kvec *)&conn->ksnc_rx_iov_space; + conn->ksnc_rx_iov[0].iov_base = &conn->ksnc_msg; + + conn->ksnc_rx_nob_wanted = offsetof(ksock_msg_t, ksm_u); + conn->ksnc_rx_nob_left = offsetof(ksock_msg_t, ksm_u); + conn->ksnc_rx_iov[0].iov_len = offsetof(ksock_msg_t, ksm_u); + break; + + case KSOCK_PROTO_V1: + /* Receiving bare lnet_hdr_t */ + conn->ksnc_rx_state = SOCKNAL_RX_LNET_HEADER; + conn->ksnc_rx_nob_wanted = sizeof(lnet_hdr_t); + conn->ksnc_rx_nob_left = sizeof(lnet_hdr_t); + + conn->ksnc_rx_iov = (struct kvec *)&conn->ksnc_rx_iov_space; + conn->ksnc_rx_iov[0].iov_base = &conn->ksnc_msg.ksm_u.lnetmsg; + conn->ksnc_rx_iov[0].iov_len = sizeof (lnet_hdr_t); + break; + + default: + LBUG (); + } + conn->ksnc_rx_niov = 1; + + conn->ksnc_rx_kiov = NULL; + conn->ksnc_rx_nkiov = 0; + conn->ksnc_rx_csum = ~0; + return 1; + } + + /* Set up to skip as much as possible now. If there's more left + * (ran out of iov entries) we'll get called again */ + + conn->ksnc_rx_state = SOCKNAL_RX_SLOP; + conn->ksnc_rx_nob_left = nob_to_skip; + conn->ksnc_rx_iov = (struct kvec *)&conn->ksnc_rx_iov_space; + skipped = 0; + niov = 0; + + do { + nob = min_t(int, nob_to_skip, sizeof(ksocknal_slop_buffer)); + + conn->ksnc_rx_iov[niov].iov_base = ksocknal_slop_buffer; + conn->ksnc_rx_iov[niov].iov_len = nob; + niov++; + skipped += nob; + nob_to_skip -=nob; + + } while (nob_to_skip != 0 && /* mustn't overflow conn's rx iov */ + niov < sizeof(conn->ksnc_rx_iov_space) / sizeof (struct iovec)); + + conn->ksnc_rx_niov = niov; + conn->ksnc_rx_kiov = NULL; + conn->ksnc_rx_nkiov = 0; + conn->ksnc_rx_nob_wanted = skipped; + return 0; +} + +static int +ksocknal_process_receive (ksock_conn_t *conn) +{ + lnet_hdr_t *lhdr; + lnet_process_id_t *id; + int rc; + + LASSERT (atomic_read(&conn->ksnc_conn_refcount) > 0); + + /* NB: sched lock NOT held */ + /* SOCKNAL_RX_LNET_HEADER is here for backward compatibility */ + LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_KSM_HEADER || + conn->ksnc_rx_state == SOCKNAL_RX_LNET_PAYLOAD || + conn->ksnc_rx_state == SOCKNAL_RX_LNET_HEADER || + conn->ksnc_rx_state == SOCKNAL_RX_SLOP); + again: + if (conn->ksnc_rx_nob_wanted != 0) { + rc = ksocknal_receive(conn); + + if (rc <= 0) { + LASSERT (rc != -EAGAIN); + + if (rc == 0) + CDEBUG(D_NET, "[%p] EOF from %s ip %pI4h:%d\n", + conn, + libcfs_id2str(conn->ksnc_peer->ksnp_id), + &conn->ksnc_ipaddr, + conn->ksnc_port); + else if (!conn->ksnc_closing) + CERROR("[%p] Error %d on read from %s ip %pI4h:%d\n", + conn, rc, + libcfs_id2str(conn->ksnc_peer->ksnp_id), + &conn->ksnc_ipaddr, + conn->ksnc_port); + + /* it's not an error if conn is being closed */ + ksocknal_close_conn_and_siblings (conn, + (conn->ksnc_closing) ? 0 : rc); + return (rc == 0 ? 
-ESHUTDOWN : rc); + } + + if (conn->ksnc_rx_nob_wanted != 0) { + /* short read */ + return -EAGAIN; + } + } + switch (conn->ksnc_rx_state) { + case SOCKNAL_RX_KSM_HEADER: + if (conn->ksnc_flip) { + __swab32s(&conn->ksnc_msg.ksm_type); + __swab32s(&conn->ksnc_msg.ksm_csum); + __swab64s(&conn->ksnc_msg.ksm_zc_cookies[0]); + __swab64s(&conn->ksnc_msg.ksm_zc_cookies[1]); + } + + if (conn->ksnc_msg.ksm_type != KSOCK_MSG_NOOP && + conn->ksnc_msg.ksm_type != KSOCK_MSG_LNET) { + CERROR("%s: Unknown message type: %x\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + conn->ksnc_msg.ksm_type); + ksocknal_new_packet(conn, 0); + ksocknal_close_conn_and_siblings(conn, -EPROTO); + return -EPROTO; + } + + if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP && + conn->ksnc_msg.ksm_csum != 0 && /* has checksum */ + conn->ksnc_msg.ksm_csum != conn->ksnc_rx_csum) { + /* NOOP Checksum error */ + CERROR("%s: Checksum error, wire:0x%08X data:0x%08X\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + conn->ksnc_msg.ksm_csum, conn->ksnc_rx_csum); + ksocknal_new_packet(conn, 0); + ksocknal_close_conn_and_siblings(conn, -EPROTO); + return -EIO; + } + + if (conn->ksnc_msg.ksm_zc_cookies[1] != 0) { + __u64 cookie = 0; + + LASSERT (conn->ksnc_proto != &ksocknal_protocol_v1x); + + if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP) + cookie = conn->ksnc_msg.ksm_zc_cookies[0]; + + rc = conn->ksnc_proto->pro_handle_zcack(conn, cookie, + conn->ksnc_msg.ksm_zc_cookies[1]); + + if (rc != 0) { + CERROR("%s: Unknown ZC-ACK cookie: %llu, %llu\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + cookie, conn->ksnc_msg.ksm_zc_cookies[1]); + ksocknal_new_packet(conn, 0); + ksocknal_close_conn_and_siblings(conn, -EPROTO); + return rc; + } + } + + if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP) { + ksocknal_new_packet (conn, 0); + return 0; /* NOOP is done and just return */ + } + + conn->ksnc_rx_state = SOCKNAL_RX_LNET_HEADER; + conn->ksnc_rx_nob_wanted = sizeof(ksock_lnet_msg_t); + conn->ksnc_rx_nob_left = sizeof(ksock_lnet_msg_t); + + conn->ksnc_rx_iov = (struct kvec *)&conn->ksnc_rx_iov_space; + conn->ksnc_rx_iov[0].iov_base = &conn->ksnc_msg.ksm_u.lnetmsg; + conn->ksnc_rx_iov[0].iov_len = sizeof(ksock_lnet_msg_t); + + conn->ksnc_rx_niov = 1; + conn->ksnc_rx_kiov = NULL; + conn->ksnc_rx_nkiov = 0; + + goto again; /* read lnet header now */ + + case SOCKNAL_RX_LNET_HEADER: + /* unpack message header */ + conn->ksnc_proto->pro_unpack(&conn->ksnc_msg); + + if ((conn->ksnc_peer->ksnp_id.pid & LNET_PID_USERFLAG) != 0) { + /* Userspace peer */ + lhdr = &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr; + id = &conn->ksnc_peer->ksnp_id; + + /* Substitute process ID assigned at connection time */ + lhdr->src_pid = cpu_to_le32(id->pid); + lhdr->src_nid = cpu_to_le64(id->nid); + } + + conn->ksnc_rx_state = SOCKNAL_RX_PARSE; + ksocknal_conn_addref(conn); /* ++ref while parsing */ + + rc = lnet_parse(conn->ksnc_peer->ksnp_ni, + &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr, + conn->ksnc_peer->ksnp_id.nid, conn, 0); + if (rc < 0) { + /* I just received garbage: give up on this conn */ + ksocknal_new_packet(conn, 0); + ksocknal_close_conn_and_siblings (conn, rc); + ksocknal_conn_decref(conn); + return -EPROTO; + } + + /* I'm racing with ksocknal_recv() */ + LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_PARSE || + conn->ksnc_rx_state == SOCKNAL_RX_LNET_PAYLOAD); + + if (conn->ksnc_rx_state != SOCKNAL_RX_LNET_PAYLOAD) + return 0; + + /* ksocknal_recv() got called */ + goto again; + + case SOCKNAL_RX_LNET_PAYLOAD: + /* payload all received */ + rc = 0; + + if (conn->ksnc_rx_nob_left == 
0 && /* not truncating */ + conn->ksnc_msg.ksm_csum != 0 && /* has checksum */ + conn->ksnc_msg.ksm_csum != conn->ksnc_rx_csum) { + CERROR("%s: Checksum error, wire:0x%08X data:0x%08X\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + conn->ksnc_msg.ksm_csum, conn->ksnc_rx_csum); + rc = -EIO; + } + + if (rc == 0 && conn->ksnc_msg.ksm_zc_cookies[0] != 0) { + LASSERT(conn->ksnc_proto != &ksocknal_protocol_v1x); + + lhdr = &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr; + id = &conn->ksnc_peer->ksnp_id; + + rc = conn->ksnc_proto->pro_handle_zcreq(conn, + conn->ksnc_msg.ksm_zc_cookies[0], + *ksocknal_tunables.ksnd_nonblk_zcack || + le64_to_cpu(lhdr->src_nid) != id->nid); + } + + lnet_finalize(conn->ksnc_peer->ksnp_ni, conn->ksnc_cookie, rc); + + if (rc != 0) { + ksocknal_new_packet(conn, 0); + ksocknal_close_conn_and_siblings (conn, rc); + return -EPROTO; + } + /* Fall through */ + + case SOCKNAL_RX_SLOP: + /* starting new packet? */ + if (ksocknal_new_packet (conn, conn->ksnc_rx_nob_left)) + return 0; /* come back later */ + goto again; /* try to finish reading slop now */ + + default: + break; + } + + /* Not Reached */ + LBUG (); + return -EINVAL; /* keep gcc happy */ +} + +int +ksocknal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed, + unsigned int niov, struct kvec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen) +{ + ksock_conn_t *conn = (ksock_conn_t *)private; + ksock_sched_t *sched = conn->ksnc_scheduler; + + LASSERT (mlen <= rlen); + LASSERT (niov <= LNET_MAX_IOV); + + conn->ksnc_cookie = msg; + conn->ksnc_rx_nob_wanted = mlen; + conn->ksnc_rx_nob_left = rlen; + + if (mlen == 0 || iov != NULL) { + conn->ksnc_rx_nkiov = 0; + conn->ksnc_rx_kiov = NULL; + conn->ksnc_rx_iov = conn->ksnc_rx_iov_space.iov; + conn->ksnc_rx_niov = + lnet_extract_iov(LNET_MAX_IOV, conn->ksnc_rx_iov, + niov, iov, offset, mlen); + } else { + conn->ksnc_rx_niov = 0; + conn->ksnc_rx_iov = NULL; + conn->ksnc_rx_kiov = conn->ksnc_rx_iov_space.kiov; + conn->ksnc_rx_nkiov = + lnet_extract_kiov(LNET_MAX_IOV, conn->ksnc_rx_kiov, + niov, kiov, offset, mlen); + } + + LASSERT (mlen == + lnet_iov_nob (conn->ksnc_rx_niov, conn->ksnc_rx_iov) + + lnet_kiov_nob (conn->ksnc_rx_nkiov, conn->ksnc_rx_kiov)); + + LASSERT (conn->ksnc_rx_scheduled); + + spin_lock_bh(&sched->kss_lock); + + switch (conn->ksnc_rx_state) { + case SOCKNAL_RX_PARSE_WAIT: + list_add_tail(&conn->ksnc_rx_list, &sched->kss_rx_conns); + wake_up (&sched->kss_waitq); + LASSERT (conn->ksnc_rx_ready); + break; + + case SOCKNAL_RX_PARSE: + /* scheduler hasn't noticed I'm parsing yet */ + break; + } + + conn->ksnc_rx_state = SOCKNAL_RX_LNET_PAYLOAD; + + spin_unlock_bh(&sched->kss_lock); + ksocknal_conn_decref(conn); + return 0; +} + +static inline int +ksocknal_sched_cansleep(ksock_sched_t *sched) +{ + int rc; + + spin_lock_bh(&sched->kss_lock); + + rc = !ksocknal_data.ksnd_shuttingdown && + list_empty(&sched->kss_rx_conns) && + list_empty(&sched->kss_tx_conns); + + spin_unlock_bh(&sched->kss_lock); + return rc; +} + +int ksocknal_scheduler(void *arg) +{ + struct ksock_sched_info *info; + ksock_sched_t *sched; + ksock_conn_t *conn; + ksock_tx_t *tx; + int rc; + int nloops = 0; + long id = (long)arg; + + info = ksocknal_data.ksnd_sched_info[KSOCK_THREAD_CPT(id)]; + sched = &info->ksi_scheds[KSOCK_THREAD_SID(id)]; + + cfs_block_allsigs(); + + rc = cfs_cpt_bind(lnet_cpt_table(), info->ksi_cpt); + if (rc != 0) { + CERROR("Can't set CPT affinity to %d: %d\n", + info->ksi_cpt, rc); + } + + spin_lock_bh(&sched->kss_lock); + + 
while (!ksocknal_data.ksnd_shuttingdown) { + int did_something = 0; + + /* Ensure I progress everything semi-fairly */ + + if (!list_empty (&sched->kss_rx_conns)) { + conn = list_entry(sched->kss_rx_conns.next, + ksock_conn_t, ksnc_rx_list); + list_del(&conn->ksnc_rx_list); + + LASSERT(conn->ksnc_rx_scheduled); + LASSERT(conn->ksnc_rx_ready); + + /* clear rx_ready in case receive isn't complete. + * Do it BEFORE we call process_recv, since + * data_ready can set it any time after we release + * kss_lock. */ + conn->ksnc_rx_ready = 0; + spin_unlock_bh(&sched->kss_lock); + + rc = ksocknal_process_receive(conn); + + spin_lock_bh(&sched->kss_lock); + + /* I'm the only one that can clear this flag */ + LASSERT(conn->ksnc_rx_scheduled); + + /* Did process_receive get everything it wanted? */ + if (rc == 0) + conn->ksnc_rx_ready = 1; + + if (conn->ksnc_rx_state == SOCKNAL_RX_PARSE) { + /* Conn blocked waiting for ksocknal_recv() + * I change its state (under lock) to signal + * it can be rescheduled */ + conn->ksnc_rx_state = SOCKNAL_RX_PARSE_WAIT; + } else if (conn->ksnc_rx_ready) { + /* reschedule for rx */ + list_add_tail (&conn->ksnc_rx_list, + &sched->kss_rx_conns); + } else { + conn->ksnc_rx_scheduled = 0; + /* drop my ref */ + ksocknal_conn_decref(conn); + } + + did_something = 1; + } + + if (!list_empty (&sched->kss_tx_conns)) { + LIST_HEAD (zlist); + + if (!list_empty(&sched->kss_zombie_noop_txs)) { + list_add(&zlist, + &sched->kss_zombie_noop_txs); + list_del_init(&sched->kss_zombie_noop_txs); + } + + conn = list_entry(sched->kss_tx_conns.next, + ksock_conn_t, ksnc_tx_list); + list_del (&conn->ksnc_tx_list); + + LASSERT(conn->ksnc_tx_scheduled); + LASSERT(conn->ksnc_tx_ready); + LASSERT(!list_empty(&conn->ksnc_tx_queue)); + + tx = list_entry(conn->ksnc_tx_queue.next, + ksock_tx_t, tx_list); + + if (conn->ksnc_tx_carrier == tx) + ksocknal_next_tx_carrier(conn); + + /* dequeue now so empty list => more to send */ + list_del(&tx->tx_list); + + /* Clear tx_ready in case send isn't complete. Do + * it BEFORE we call process_transmit, since + * write_space can set it any time after we release + * kss_lock. */ + conn->ksnc_tx_ready = 0; + spin_unlock_bh(&sched->kss_lock); + + if (!list_empty(&zlist)) { + /* free zombie noop txs, it's fast because + * noop txs are just put in freelist */ + ksocknal_txlist_done(NULL, &zlist, 0); + } + + rc = ksocknal_process_transmit(conn, tx); + + if (rc == -ENOMEM || rc == -EAGAIN) { + /* Incomplete send: replace tx on HEAD of tx_queue */ + spin_lock_bh(&sched->kss_lock); + list_add(&tx->tx_list, + &conn->ksnc_tx_queue); + } else { + /* Complete send; tx -ref */ + ksocknal_tx_decref(tx); + + spin_lock_bh(&sched->kss_lock); + /* assume space for more */ + conn->ksnc_tx_ready = 1; + } + + if (rc == -ENOMEM) { + /* Do nothing; after a short timeout, this + * conn will be reposted on kss_tx_conns. */ + } else if (conn->ksnc_tx_ready && + !list_empty (&conn->ksnc_tx_queue)) { + /* reschedule for tx */ + list_add_tail (&conn->ksnc_tx_list, + &sched->kss_tx_conns); + } else { + conn->ksnc_tx_scheduled = 0; + /* drop my ref */ + ksocknal_conn_decref(conn); + } + + did_something = 1; + } + if (!did_something || /* nothing to do */ + ++nloops == SOCKNAL_RESCHED) { /* hogging CPU? 
*/ + spin_unlock_bh(&sched->kss_lock); + + nloops = 0; + + if (!did_something) { /* wait for something to do */ + rc = wait_event_interruptible_exclusive( + sched->kss_waitq, + !ksocknal_sched_cansleep(sched)); + LASSERT (rc == 0); + } else { + cond_resched(); + } + + spin_lock_bh(&sched->kss_lock); + } + } + + spin_unlock_bh(&sched->kss_lock); + ksocknal_thread_fini(); + return 0; +} + +/* + * Add connection to kss_rx_conns of scheduler + * and wakeup the scheduler. + */ +void ksocknal_read_callback (ksock_conn_t *conn) +{ + ksock_sched_t *sched; + + sched = conn->ksnc_scheduler; + + spin_lock_bh(&sched->kss_lock); + + conn->ksnc_rx_ready = 1; + + if (!conn->ksnc_rx_scheduled) { /* not being progressed */ + list_add_tail(&conn->ksnc_rx_list, + &sched->kss_rx_conns); + conn->ksnc_rx_scheduled = 1; + /* extra ref for scheduler */ + ksocknal_conn_addref(conn); + + wake_up (&sched->kss_waitq); + } + spin_unlock_bh(&sched->kss_lock); +} + +/* + * Add connection to kss_tx_conns of scheduler + * and wakeup the scheduler. + */ +void ksocknal_write_callback (ksock_conn_t *conn) +{ + ksock_sched_t *sched; + + sched = conn->ksnc_scheduler; + + spin_lock_bh(&sched->kss_lock); + + conn->ksnc_tx_ready = 1; + + if (!conn->ksnc_tx_scheduled && /* not being progressed */ + !list_empty(&conn->ksnc_tx_queue)) { /* packets to send */ + list_add_tail (&conn->ksnc_tx_list, + &sched->kss_tx_conns); + conn->ksnc_tx_scheduled = 1; + /* extra ref for scheduler */ + ksocknal_conn_addref(conn); + + wake_up (&sched->kss_waitq); + } + + spin_unlock_bh(&sched->kss_lock); +} + +static ksock_proto_t * +ksocknal_parse_proto_version (ksock_hello_msg_t *hello) +{ + __u32 version = 0; + + if (hello->kshm_magic == LNET_PROTO_MAGIC) + version = hello->kshm_version; + else if (hello->kshm_magic == __swab32(LNET_PROTO_MAGIC)) + version = __swab32(hello->kshm_version); + + if (version != 0) { +#if SOCKNAL_VERSION_DEBUG + if (*ksocknal_tunables.ksnd_protocol == 1) + return NULL; + + if (*ksocknal_tunables.ksnd_protocol == 2 && + version == KSOCK_PROTO_V3) + return NULL; +#endif + if (version == KSOCK_PROTO_V2) + return &ksocknal_protocol_v2x; + + if (version == KSOCK_PROTO_V3) + return &ksocknal_protocol_v3x; + + return NULL; + } + + if (hello->kshm_magic == le32_to_cpu(LNET_PROTO_TCP_MAGIC)) { + lnet_magicversion_t *hmv = (lnet_magicversion_t *)hello; + + CLASSERT (sizeof (lnet_magicversion_t) == + offsetof (ksock_hello_msg_t, kshm_src_nid)); + + if (hmv->version_major == cpu_to_le16 (KSOCK_PROTO_V1_MAJOR) && + hmv->version_minor == cpu_to_le16 (KSOCK_PROTO_V1_MINOR)) + return &ksocknal_protocol_v1x; + } + + return NULL; +} + +int +ksocknal_send_hello (lnet_ni_t *ni, ksock_conn_t *conn, + lnet_nid_t peer_nid, ksock_hello_msg_t *hello) +{ + /* CAVEAT EMPTOR: this byte flips 'ipaddrs' */ + ksock_net_t *net = (ksock_net_t *)ni->ni_data; + + LASSERT (hello->kshm_nips <= LNET_MAX_INTERFACES); + + /* rely on caller to hold a ref on socket so it wouldn't disappear */ + LASSERT (conn->ksnc_proto != NULL); + + hello->kshm_src_nid = ni->ni_nid; + hello->kshm_dst_nid = peer_nid; + hello->kshm_src_pid = the_lnet.ln_pid; + + hello->kshm_src_incarnation = net->ksnn_incarnation; + hello->kshm_ctype = conn->ksnc_type; + + return conn->ksnc_proto->pro_send_hello(conn, hello); +} + +static int +ksocknal_invert_type(int type) +{ + switch (type) { + case SOCKLND_CONN_ANY: + case SOCKLND_CONN_CONTROL: + return type; + case SOCKLND_CONN_BULK_IN: + return SOCKLND_CONN_BULK_OUT; + case SOCKLND_CONN_BULK_OUT: + return SOCKLND_CONN_BULK_IN; + default: + 
return SOCKLND_CONN_NONE; + } +} + +int +ksocknal_recv_hello (lnet_ni_t *ni, ksock_conn_t *conn, + ksock_hello_msg_t *hello, lnet_process_id_t *peerid, + __u64 *incarnation) +{ + /* Return < 0 fatal error + * 0 success + * EALREADY lost connection race + * EPROTO protocol version mismatch + */ + struct socket *sock = conn->ksnc_sock; + int active = (conn->ksnc_proto != NULL); + int timeout; + int proto_match; + int rc; + ksock_proto_t *proto; + lnet_process_id_t recv_id; + + /* socket type set on active connections - not set on passive */ + LASSERT (!active == !(conn->ksnc_type != SOCKLND_CONN_NONE)); + + timeout = active ? *ksocknal_tunables.ksnd_timeout : + lnet_acceptor_timeout(); + + rc = libcfs_sock_read(sock, &hello->kshm_magic, sizeof (hello->kshm_magic), timeout); + if (rc != 0) { + CERROR("Error %d reading HELLO from %pI4h\n", + rc, &conn->ksnc_ipaddr); + LASSERT (rc < 0); + return rc; + } + + if (hello->kshm_magic != LNET_PROTO_MAGIC && + hello->kshm_magic != __swab32(LNET_PROTO_MAGIC) && + hello->kshm_magic != le32_to_cpu (LNET_PROTO_TCP_MAGIC)) { + /* Unexpected magic! */ + CERROR("Bad magic(1) %#08x (%#08x expected) from %pI4h\n", + __cpu_to_le32 (hello->kshm_magic), + LNET_PROTO_TCP_MAGIC, + &conn->ksnc_ipaddr); + return -EPROTO; + } + + rc = libcfs_sock_read(sock, &hello->kshm_version, + sizeof(hello->kshm_version), timeout); + if (rc != 0) { + CERROR("Error %d reading HELLO from %pI4h\n", + rc, &conn->ksnc_ipaddr); + LASSERT (rc < 0); + return rc; + } + + proto = ksocknal_parse_proto_version(hello); + if (proto == NULL) { + if (!active) { + /* unknown protocol from peer, tell peer my protocol */ + conn->ksnc_proto = &ksocknal_protocol_v3x; +#if SOCKNAL_VERSION_DEBUG + if (*ksocknal_tunables.ksnd_protocol == 2) + conn->ksnc_proto = &ksocknal_protocol_v2x; + else if (*ksocknal_tunables.ksnd_protocol == 1) + conn->ksnc_proto = &ksocknal_protocol_v1x; +#endif + hello->kshm_nips = 0; + ksocknal_send_hello(ni, conn, ni->ni_nid, hello); + } + + CERROR("Unknown protocol version (%d.x expected) from %pI4h\n", + conn->ksnc_proto->pro_version, + &conn->ksnc_ipaddr); + + return -EPROTO; + } + + proto_match = (conn->ksnc_proto == proto); + conn->ksnc_proto = proto; + + /* receive the rest of hello message anyway */ + rc = conn->ksnc_proto->pro_recv_hello(conn, hello, timeout); + if (rc != 0) { + CERROR("Error %d reading or checking hello from from %pI4h\n", + rc, &conn->ksnc_ipaddr); + LASSERT (rc < 0); + return rc; + } + + *incarnation = hello->kshm_src_incarnation; + + if (hello->kshm_src_nid == LNET_NID_ANY) { + CERROR("Expecting a HELLO hdr with a NID, but got LNET_NID_ANY from %pI4h\n", + &conn->ksnc_ipaddr); + return -EPROTO; + } + + if (!active && + conn->ksnc_port > LNET_ACCEPTOR_MAX_RESERVED_PORT) { + /* Userspace NAL assigns peer process ID from socket */ + recv_id.pid = conn->ksnc_port | LNET_PID_USERFLAG; + recv_id.nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), conn->ksnc_ipaddr); + } else { + recv_id.nid = hello->kshm_src_nid; + recv_id.pid = hello->kshm_src_pid; + } + + if (!active) { + *peerid = recv_id; + + /* peer determines type */ + conn->ksnc_type = ksocknal_invert_type(hello->kshm_ctype); + if (conn->ksnc_type == SOCKLND_CONN_NONE) { + CERROR("Unexpected type %d from %s ip %pI4h\n", + hello->kshm_ctype, libcfs_id2str(*peerid), + &conn->ksnc_ipaddr); + return -EPROTO; + } + + return 0; + } + + if (peerid->pid != recv_id.pid || + peerid->nid != recv_id.nid) { + LCONSOLE_ERROR_MSG(0x130, "Connected successfully to %s on host %pI4h, but they claimed they were %s; please 
check your Lustre configuration.\n", + libcfs_id2str(*peerid), + &conn->ksnc_ipaddr, + libcfs_id2str(recv_id)); + return -EPROTO; + } + + if (hello->kshm_ctype == SOCKLND_CONN_NONE) { + /* Possible protocol mismatch or I lost the connection race */ + return proto_match ? EALREADY : EPROTO; + } + + if (ksocknal_invert_type(hello->kshm_ctype) != conn->ksnc_type) { + CERROR("Mismatched types: me %d, %s ip %pI4h %d\n", + conn->ksnc_type, libcfs_id2str(*peerid), + &conn->ksnc_ipaddr, + hello->kshm_ctype); + return -EPROTO; + } + + return 0; +} + +static int +ksocknal_connect (ksock_route_t *route) +{ + LIST_HEAD (zombies); + ksock_peer_t *peer = route->ksnr_peer; + int type; + int wanted; + struct socket *sock; + unsigned long deadline; + int retry_later = 0; + int rc = 0; + + deadline = cfs_time_add(cfs_time_current(), + cfs_time_seconds(*ksocknal_tunables.ksnd_timeout)); + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + LASSERT (route->ksnr_scheduled); + LASSERT (!route->ksnr_connecting); + + route->ksnr_connecting = 1; + + for (;;) { + wanted = ksocknal_route_mask() & ~route->ksnr_connected; + + /* stop connecting if peer/route got closed under me, or + * route got connected while queued */ + if (peer->ksnp_closing || route->ksnr_deleted || + wanted == 0) { + retry_later = 0; + break; + } + + /* reschedule if peer is connecting to me */ + if (peer->ksnp_accepting > 0) { + CDEBUG(D_NET, + "peer %s(%d) already connecting to me, retry later.\n", + libcfs_nid2str(peer->ksnp_id.nid), peer->ksnp_accepting); + retry_later = 1; + } + + if (retry_later) /* needs reschedule */ + break; + + if ((wanted & (1 << SOCKLND_CONN_ANY)) != 0) { + type = SOCKLND_CONN_ANY; + } else if ((wanted & (1 << SOCKLND_CONN_CONTROL)) != 0) { + type = SOCKLND_CONN_CONTROL; + } else if ((wanted & (1 << SOCKLND_CONN_BULK_IN)) != 0) { + type = SOCKLND_CONN_BULK_IN; + } else { + LASSERT ((wanted & (1 << SOCKLND_CONN_BULK_OUT)) != 0); + type = SOCKLND_CONN_BULK_OUT; + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + if (cfs_time_aftereq(cfs_time_current(), deadline)) { + rc = -ETIMEDOUT; + lnet_connect_console_error(rc, peer->ksnp_id.nid, + route->ksnr_ipaddr, + route->ksnr_port); + goto failed; + } + + rc = lnet_connect(&sock, peer->ksnp_id.nid, + route->ksnr_myipaddr, + route->ksnr_ipaddr, route->ksnr_port); + if (rc != 0) + goto failed; + + rc = ksocknal_create_conn(peer->ksnp_ni, route, sock, type); + if (rc < 0) { + lnet_connect_console_error(rc, peer->ksnp_id.nid, + route->ksnr_ipaddr, + route->ksnr_port); + goto failed; + } + + /* A +ve RC means I have to retry because I lost the connection + * race or I have to renegotiate protocol version */ + retry_later = (rc != 0); + if (retry_later) + CDEBUG(D_NET, "peer %s: conn race, retry later.\n", + libcfs_nid2str(peer->ksnp_id.nid)); + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + } + + route->ksnr_scheduled = 0; + route->ksnr_connecting = 0; + + if (retry_later) { + /* re-queue for attention; this frees me up to handle + * the peer's incoming connection request */ + + if (rc == EALREADY || + (rc == 0 && peer->ksnp_accepting > 0)) { + /* We want to introduce a delay before next + * attempt to connect if we lost conn race, + * but the race is resolved quickly usually, + * so min_reconnectms should be good heuristic */ + route->ksnr_retry_interval = + cfs_time_seconds(*ksocknal_tunables.ksnd_min_reconnectms)/1000; + route->ksnr_timeout = cfs_time_add(cfs_time_current(), + route->ksnr_retry_interval); + } + + ksocknal_launch_connection_locked(route); + } + 
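The failed: path just below doubles ksnr_retry_interval and clamps it between the min_reconnectms and max_reconnectms tunables (milliseconds converted to ticks). A minimal userspace sketch of that back-off policy, using the 1000 ms / 60000 ms defaults from socklnd_modparams.c in place of the ksocknal_tunables pointers:

/* Bounded exponential back-off, as applied on connect failure:
 * double the interval, then clamp to [min_ms, max_ms]. */
#include <stdio.h>

static long backoff(long interval_ms, long min_ms, long max_ms)
{
	interval_ms *= 2;
	if (interval_ms < min_ms)
		interval_ms = min_ms;	/* first failure starts at the minimum */
	if (interval_ms > max_ms)
		interval_ms = max_ms;	/* never wait longer than the ceiling */
	return interval_ms;
}

int main(void)
{
	long ms = 0;
	int i;

	for (i = 0; i < 8; i++) {
		ms = backoff(ms, 1000, 60000);
		printf("failure %d: retry in %ld ms\n", i + 1, ms);
	}
	return 0;
}

Starting from zero, the waits run 1, 2, 4, 8, 16, 32 seconds and then stay pinned at the 60 s ceiling.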
+ write_unlock_bh(&ksocknal_data.ksnd_global_lock); + return retry_later; + + failed: + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + route->ksnr_scheduled = 0; + route->ksnr_connecting = 0; + + /* This is a retry rather than a new connection */ + route->ksnr_retry_interval *= 2; + route->ksnr_retry_interval = + max(route->ksnr_retry_interval, + cfs_time_seconds(*ksocknal_tunables.ksnd_min_reconnectms)/1000); + route->ksnr_retry_interval = + min(route->ksnr_retry_interval, + cfs_time_seconds(*ksocknal_tunables.ksnd_max_reconnectms)/1000); + + LASSERT (route->ksnr_retry_interval != 0); + route->ksnr_timeout = cfs_time_add(cfs_time_current(), + route->ksnr_retry_interval); + + if (!list_empty(&peer->ksnp_tx_queue) && + peer->ksnp_accepting == 0 && + ksocknal_find_connecting_route_locked(peer) == NULL) { + ksock_conn_t *conn; + + /* ksnp_tx_queue is queued on a conn on successful + * connection for V1.x and V2.x */ + if (!list_empty (&peer->ksnp_conns)) { + conn = list_entry(peer->ksnp_conns.next, + ksock_conn_t, ksnc_list); + LASSERT (conn->ksnc_proto == &ksocknal_protocol_v3x); + } + + /* take all the blocked packets while I've got the lock and + * complete below... */ + list_splice_init(&peer->ksnp_tx_queue, &zombies); + } + +#if 0 /* irrelevant with only eager routes */ + if (!route->ksnr_deleted) { + /* make this route least-favourite for re-selection */ + list_del(&route->ksnr_list); + list_add_tail(&route->ksnr_list, &peer->ksnp_routes); + } +#endif + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + ksocknal_peer_failed(peer); + ksocknal_txlist_done(peer->ksnp_ni, &zombies, 1); + return 0; +} + +/* + * check whether we need to create more connds. + * It will try to create new thread if it's necessary, @timeout can + * be updated if failed to create, so caller wouldn't keep try while + * running out of resource. + */ +static int +ksocknal_connd_check_start(long sec, long *timeout) +{ + char name[16]; + int rc; + int total = ksocknal_data.ksnd_connd_starting + + ksocknal_data.ksnd_connd_running; + + if (unlikely(ksocknal_data.ksnd_init < SOCKNAL_INIT_ALL)) { + /* still in initializing */ + return 0; + } + + if (total >= *ksocknal_tunables.ksnd_nconnds_max || + total > ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV) { + /* can't create more connd, or still have enough + * threads to handle more connecting */ + return 0; + } + + if (list_empty(&ksocknal_data.ksnd_connd_routes)) { + /* no pending connecting request */ + return 0; + } + + if (sec - ksocknal_data.ksnd_connd_failed_stamp <= 1) { + /* may run out of resource, retry later */ + *timeout = cfs_time_seconds(1); + return 0; + } + + if (ksocknal_data.ksnd_connd_starting > 0) { + /* serialize starting to avoid flood */ + return 0; + } + + ksocknal_data.ksnd_connd_starting_stamp = sec; + ksocknal_data.ksnd_connd_starting++; + spin_unlock_bh(&ksocknal_data.ksnd_connd_lock); + + /* NB: total is the next id */ + snprintf(name, sizeof(name), "socknal_cd%02d", total); + rc = ksocknal_thread_start(ksocknal_connd, NULL, name); + + spin_lock_bh(&ksocknal_data.ksnd_connd_lock); + if (rc == 0) + return 1; + + /* we tried ... */ + LASSERT(ksocknal_data.ksnd_connd_starting > 0); + ksocknal_data.ksnd_connd_starting--; + ksocknal_data.ksnd_connd_failed_stamp = get_seconds(); + + return 1; +} + +/* + * check whether current thread can exit, it will return 1 if there are too + * many threads and no creating in past 120 seconds. 
+ * Also, this function may update @timeout to make caller come back + * again to recheck these conditions. + */ +static int +ksocknal_connd_check_stop(long sec, long *timeout) +{ + int val; + + if (unlikely(ksocknal_data.ksnd_init < SOCKNAL_INIT_ALL)) { + /* still in initializing */ + return 0; + } + + if (ksocknal_data.ksnd_connd_starting > 0) { + /* in progress of starting new thread */ + return 0; + } + + if (ksocknal_data.ksnd_connd_running <= + *ksocknal_tunables.ksnd_nconnds) { /* can't shrink */ + return 0; + } + + /* created thread in past 120 seconds? */ + val = (int)(ksocknal_data.ksnd_connd_starting_stamp + + SOCKNAL_CONND_TIMEOUT - sec); + + *timeout = (val > 0) ? cfs_time_seconds(val) : + cfs_time_seconds(SOCKNAL_CONND_TIMEOUT); + if (val > 0) + return 0; + + /* no creating in past 120 seconds */ + + return ksocknal_data.ksnd_connd_running > + ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV; +} + +/* Go through connd_routes queue looking for a route that we can process + * right now, @timeout_p can be updated if we need to come back later */ +static ksock_route_t * +ksocknal_connd_get_route_locked(signed long *timeout_p) +{ + ksock_route_t *route; + unsigned long now; + + now = cfs_time_current(); + + /* connd_routes can contain both pending and ordinary routes */ + list_for_each_entry (route, &ksocknal_data.ksnd_connd_routes, + ksnr_connd_list) { + + if (route->ksnr_retry_interval == 0 || + cfs_time_aftereq(now, route->ksnr_timeout)) + return route; + + if (*timeout_p == MAX_SCHEDULE_TIMEOUT || + (int)*timeout_p > (int)(route->ksnr_timeout - now)) + *timeout_p = (int)(route->ksnr_timeout - now); + } + + return NULL; +} + +int +ksocknal_connd (void *arg) +{ + spinlock_t *connd_lock = &ksocknal_data.ksnd_connd_lock; + ksock_connreq_t *cr; + wait_queue_t wait; + int nloops = 0; + int cons_retry = 0; + + cfs_block_allsigs (); + + init_waitqueue_entry(&wait, current); + + spin_lock_bh(connd_lock); + + LASSERT(ksocknal_data.ksnd_connd_starting > 0); + ksocknal_data.ksnd_connd_starting--; + ksocknal_data.ksnd_connd_running++; + + while (!ksocknal_data.ksnd_shuttingdown) { + ksock_route_t *route = NULL; + long sec = get_seconds(); + long timeout = MAX_SCHEDULE_TIMEOUT; + int dropped_lock = 0; + + if (ksocknal_connd_check_stop(sec, &timeout)) { + /* wakeup another one to check stop */ + wake_up(&ksocknal_data.ksnd_connd_waitq); + break; + } + + if (ksocknal_connd_check_start(sec, &timeout)) { + /* created new thread */ + dropped_lock = 1; + } + + if (!list_empty(&ksocknal_data.ksnd_connd_connreqs)) { + /* Connection accepted by the listener */ + cr = list_entry(ksocknal_data.ksnd_connd_connreqs. 
\ + next, ksock_connreq_t, ksncr_list); + + list_del(&cr->ksncr_list); + spin_unlock_bh(connd_lock); + dropped_lock = 1; + + ksocknal_create_conn(cr->ksncr_ni, NULL, + cr->ksncr_sock, SOCKLND_CONN_NONE); + lnet_ni_decref(cr->ksncr_ni); + LIBCFS_FREE(cr, sizeof(*cr)); + + spin_lock_bh(connd_lock); + } + + /* Only handle an outgoing connection request if there + * is a thread left to handle incoming connections and + * create new connd */ + if (ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV < + ksocknal_data.ksnd_connd_running) { + route = ksocknal_connd_get_route_locked(&timeout); + } + if (route != NULL) { + list_del (&route->ksnr_connd_list); + ksocknal_data.ksnd_connd_connecting++; + spin_unlock_bh(connd_lock); + dropped_lock = 1; + + if (ksocknal_connect(route)) { + /* consecutive retry */ + if (cons_retry++ > SOCKNAL_INSANITY_RECONN) { + CWARN("massive consecutive re-connecting to %pI4h\n", + &route->ksnr_ipaddr); + cons_retry = 0; + } + } else { + cons_retry = 0; + } + + ksocknal_route_decref(route); + + spin_lock_bh(connd_lock); + ksocknal_data.ksnd_connd_connecting--; + } + + if (dropped_lock) { + if (++nloops < SOCKNAL_RESCHED) + continue; + spin_unlock_bh(connd_lock); + nloops = 0; + cond_resched(); + spin_lock_bh(connd_lock); + continue; + } + + /* Nothing to do for 'timeout' */ + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue_exclusive(&ksocknal_data.ksnd_connd_waitq, &wait); + spin_unlock_bh(connd_lock); + + nloops = 0; + schedule_timeout(timeout); + + remove_wait_queue(&ksocknal_data.ksnd_connd_waitq, &wait); + spin_lock_bh(connd_lock); + } + ksocknal_data.ksnd_connd_running--; + spin_unlock_bh(connd_lock); + + ksocknal_thread_fini(); + return 0; +} + +static ksock_conn_t * +ksocknal_find_timed_out_conn (ksock_peer_t *peer) +{ + /* We're called with a shared lock on ksnd_global_lock */ + ksock_conn_t *conn; + struct list_head *ctmp; + + list_for_each (ctmp, &peer->ksnp_conns) { + int error; + conn = list_entry (ctmp, ksock_conn_t, ksnc_list); + + /* Don't need the {get,put}connsock dance to deref ksnc_sock */ + LASSERT (!conn->ksnc_closing); + + /* SOCK_ERROR will reset error code of socket in + * some platform (like Darwin8.x) */ + error = conn->ksnc_sock->sk->sk_err; + if (error != 0) { + ksocknal_conn_addref(conn); + + switch (error) { + case ECONNRESET: + CNETERR("A connection with %s (%pI4h:%d) was reset; it may have rebooted.\n", + libcfs_id2str(peer->ksnp_id), + &conn->ksnc_ipaddr, + conn->ksnc_port); + break; + case ETIMEDOUT: + CNETERR("A connection with %s (%pI4h:%d) timed out; the network or node may be down.\n", + libcfs_id2str(peer->ksnp_id), + &conn->ksnc_ipaddr, + conn->ksnc_port); + break; + default: + CNETERR("An unexpected network error %d occurred with %s (%pI4h:%d\n", + error, + libcfs_id2str(peer->ksnp_id), + &conn->ksnc_ipaddr, + conn->ksnc_port); + break; + } + + return conn; + } + + if (conn->ksnc_rx_started && + cfs_time_aftereq(cfs_time_current(), + conn->ksnc_rx_deadline)) { + /* Timed out incomplete incoming message */ + ksocknal_conn_addref(conn); + CNETERR("Timeout receiving from %s (%pI4h:%d), state %d wanted %d left %d\n", + libcfs_id2str(peer->ksnp_id), + &conn->ksnc_ipaddr, + conn->ksnc_port, + conn->ksnc_rx_state, + conn->ksnc_rx_nob_wanted, + conn->ksnc_rx_nob_left); + return conn; + } + + if ((!list_empty(&conn->ksnc_tx_queue) || + conn->ksnc_sock->sk->sk_wmem_queued != 0) && + cfs_time_aftereq(cfs_time_current(), + conn->ksnc_tx_deadline)) { + /* Timed out messages queued for sending or + * buffered in the socket's send 
buffer */ + ksocknal_conn_addref(conn); + CNETERR("Timeout sending data to %s (%pI4h:%d) the network or that node may be down.\n", + libcfs_id2str(peer->ksnp_id), + &conn->ksnc_ipaddr, + conn->ksnc_port); + return conn; + } + } + + return NULL; +} + +static inline void +ksocknal_flush_stale_txs(ksock_peer_t *peer) +{ + ksock_tx_t *tx; + LIST_HEAD (stale_txs); + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + while (!list_empty (&peer->ksnp_tx_queue)) { + tx = list_entry (peer->ksnp_tx_queue.next, + ksock_tx_t, tx_list); + + if (!cfs_time_aftereq(cfs_time_current(), + tx->tx_deadline)) + break; + + list_del (&tx->tx_list); + list_add_tail (&tx->tx_list, &stale_txs); + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + ksocknal_txlist_done(peer->ksnp_ni, &stale_txs, 1); +} + +static int +ksocknal_send_keepalive_locked(ksock_peer_t *peer) +{ + ksock_sched_t *sched; + ksock_conn_t *conn; + ksock_tx_t *tx; + + if (list_empty(&peer->ksnp_conns)) /* last_alive will be updated by create_conn */ + return 0; + + if (peer->ksnp_proto != &ksocknal_protocol_v3x) + return 0; + + if (*ksocknal_tunables.ksnd_keepalive <= 0 || + time_before(cfs_time_current(), + cfs_time_add(peer->ksnp_last_alive, + cfs_time_seconds(*ksocknal_tunables.ksnd_keepalive)))) + return 0; + + if (time_before(cfs_time_current(), peer->ksnp_send_keepalive)) + return 0; + + /* retry 10 secs later, so we wouldn't put pressure + * on this peer if we failed to send keepalive this time */ + peer->ksnp_send_keepalive = cfs_time_shift(10); + + conn = ksocknal_find_conn_locked(peer, NULL, 1); + if (conn != NULL) { + sched = conn->ksnc_scheduler; + + spin_lock_bh(&sched->kss_lock); + if (!list_empty(&conn->ksnc_tx_queue)) { + spin_unlock_bh(&sched->kss_lock); + /* there is an queued ACK, don't need keepalive */ + return 0; + } + + spin_unlock_bh(&sched->kss_lock); + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + + /* cookie = 1 is reserved for keepalive PING */ + tx = ksocknal_alloc_tx_noop(1, 1); + if (tx == NULL) { + read_lock(&ksocknal_data.ksnd_global_lock); + return -ENOMEM; + } + + if (ksocknal_launch_packet(peer->ksnp_ni, tx, peer->ksnp_id) == 0) { + read_lock(&ksocknal_data.ksnd_global_lock); + return 1; + } + + ksocknal_free_tx(tx); + read_lock(&ksocknal_data.ksnd_global_lock); + + return -EIO; +} + + +static void +ksocknal_check_peer_timeouts (int idx) +{ + struct list_head *peers = &ksocknal_data.ksnd_peers[idx]; + ksock_peer_t *peer; + ksock_conn_t *conn; + ksock_tx_t *tx; + + again: + /* NB. We expect to have a look at all the peers and not find any + * connections to time out, so we just use a shared lock while we + * take a look... */ + read_lock(&ksocknal_data.ksnd_global_lock); + + list_for_each_entry(peer, peers, ksnp_list) { + unsigned long deadline = 0; + int resid = 0; + int n = 0; + + if (ksocknal_send_keepalive_locked(peer) != 0) { + read_unlock(&ksocknal_data.ksnd_global_lock); + goto again; + } + + conn = ksocknal_find_timed_out_conn (peer); + + if (conn != NULL) { + read_unlock(&ksocknal_data.ksnd_global_lock); + + ksocknal_close_conn_and_siblings (conn, -ETIMEDOUT); + + /* NB we won't find this one again, but we can't + * just proceed with the next peer, since we dropped + * ksnd_global_lock and it might be dead already! 
*/ + ksocknal_conn_decref(conn); + goto again; + } + + /* we can't process stale txs right here because we're + * holding only shared lock */ + if (!list_empty (&peer->ksnp_tx_queue)) { + ksock_tx_t *tx = + list_entry (peer->ksnp_tx_queue.next, + ksock_tx_t, tx_list); + + if (cfs_time_aftereq(cfs_time_current(), + tx->tx_deadline)) { + + ksocknal_peer_addref(peer); + read_unlock(&ksocknal_data.ksnd_global_lock); + + ksocknal_flush_stale_txs(peer); + + ksocknal_peer_decref(peer); + goto again; + } + } + + if (list_empty(&peer->ksnp_zc_req_list)) + continue; + + spin_lock(&peer->ksnp_lock); + list_for_each_entry(tx, &peer->ksnp_zc_req_list, tx_zc_list) { + if (!cfs_time_aftereq(cfs_time_current(), + tx->tx_deadline)) + break; + /* ignore the TX if connection is being closed */ + if (tx->tx_conn->ksnc_closing) + continue; + n++; + } + + if (n == 0) { + spin_unlock(&peer->ksnp_lock); + continue; + } + + tx = list_entry(peer->ksnp_zc_req_list.next, + ksock_tx_t, tx_zc_list); + deadline = tx->tx_deadline; + resid = tx->tx_resid; + conn = tx->tx_conn; + ksocknal_conn_addref(conn); + + spin_unlock(&peer->ksnp_lock); + read_unlock(&ksocknal_data.ksnd_global_lock); + + CERROR("Total %d stale ZC_REQs for peer %s detected; the oldest(%p) timed out %ld secs ago, resid: %d, wmem: %d\n", + n, libcfs_nid2str(peer->ksnp_id.nid), tx, + cfs_duration_sec(cfs_time_current() - deadline), + resid, conn->ksnc_sock->sk->sk_wmem_queued); + + ksocknal_close_conn_and_siblings (conn, -ETIMEDOUT); + ksocknal_conn_decref(conn); + goto again; + } + + read_unlock(&ksocknal_data.ksnd_global_lock); +} + +int +ksocknal_reaper (void *arg) +{ + wait_queue_t wait; + ksock_conn_t *conn; + ksock_sched_t *sched; + struct list_head enomem_conns; + int nenomem_conns; + long timeout; + int i; + int peer_index = 0; + unsigned long deadline = cfs_time_current(); + + cfs_block_allsigs (); + + INIT_LIST_HEAD(&enomem_conns); + init_waitqueue_entry(&wait, current); + + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + + while (!ksocknal_data.ksnd_shuttingdown) { + + if (!list_empty (&ksocknal_data.ksnd_deathrow_conns)) { + conn = list_entry (ksocknal_data. \ + ksnd_deathrow_conns.next, + ksock_conn_t, ksnc_list); + list_del (&conn->ksnc_list); + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); + + ksocknal_terminate_conn(conn); + ksocknal_conn_decref(conn); + + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + continue; + } + + if (!list_empty (&ksocknal_data.ksnd_zombie_conns)) { + conn = list_entry (ksocknal_data.ksnd_zombie_conns.\ + next, ksock_conn_t, ksnc_list); + list_del (&conn->ksnc_list); + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); + + ksocknal_destroy_conn(conn); + + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + continue; + } + + if (!list_empty (&ksocknal_data.ksnd_enomem_conns)) { + list_add(&enomem_conns, + &ksocknal_data.ksnd_enomem_conns); + list_del_init(&ksocknal_data.ksnd_enomem_conns); + } + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); + + /* reschedule all the connections that stalled with ENOMEM... */ + nenomem_conns = 0; + while (!list_empty (&enomem_conns)) { + conn = list_entry (enomem_conns.next, + ksock_conn_t, ksnc_tx_list); + list_del (&conn->ksnc_tx_list); + + sched = conn->ksnc_scheduler; + + spin_lock_bh(&sched->kss_lock); + + LASSERT(conn->ksnc_tx_scheduled); + conn->ksnc_tx_ready = 1; + list_add_tail(&conn->ksnc_tx_list, + &sched->kss_tx_conns); + wake_up(&sched->kss_waitq); + + spin_unlock_bh(&sched->kss_lock); + nenomem_conns++; + } + + /* careful with the jiffy wrap... 
*/ + while ((timeout = cfs_time_sub(deadline, + cfs_time_current())) <= 0) { + const int n = 4; + const int p = 1; + int chunk = ksocknal_data.ksnd_peer_hash_size; + + /* Time to check for timeouts on a few more peers: I do + * checks every 'p' seconds on a proportion of the peer + * table and I need to check every connection 'n' times + * within a timeout interval, to ensure I detect a + * timeout on any connection within (n+1)/n times the + * timeout interval. */ + + if (*ksocknal_tunables.ksnd_timeout > n * p) + chunk = (chunk * n * p) / + *ksocknal_tunables.ksnd_timeout; + if (chunk == 0) + chunk = 1; + + for (i = 0; i < chunk; i++) { + ksocknal_check_peer_timeouts (peer_index); + peer_index = (peer_index + 1) % + ksocknal_data.ksnd_peer_hash_size; + } + + deadline = cfs_time_add(deadline, cfs_time_seconds(p)); + } + + if (nenomem_conns != 0) { + /* Reduce my timeout if I rescheduled ENOMEM conns. + * This also prevents me getting woken immediately + * if any go back on my enomem list. */ + timeout = SOCKNAL_ENOMEM_RETRY; + } + ksocknal_data.ksnd_reaper_waketime = + cfs_time_add(cfs_time_current(), timeout); + + set_current_state (TASK_INTERRUPTIBLE); + add_wait_queue (&ksocknal_data.ksnd_reaper_waitq, &wait); + + if (!ksocknal_data.ksnd_shuttingdown && + list_empty (&ksocknal_data.ksnd_deathrow_conns) && + list_empty (&ksocknal_data.ksnd_zombie_conns)) + schedule_timeout(timeout); + + set_current_state (TASK_RUNNING); + remove_wait_queue (&ksocknal_data.ksnd_reaper_waitq, &wait); + + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + } + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); + + ksocknal_thread_fini(); + return 0; +} diff --git a/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c b/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c new file mode 100644 index 000000000..f5e8ab060 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c @@ -0,0 +1,714 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#include "socklnd.h" + +int +ksocknal_lib_get_conn_addrs(ksock_conn_t *conn) +{ + int rc = libcfs_sock_getaddr(conn->ksnc_sock, 1, + &conn->ksnc_ipaddr, + &conn->ksnc_port); + + /* Didn't need the {get,put}connsock dance to deref ksnc_sock... 
*/ + LASSERT(!conn->ksnc_closing); + + if (rc != 0) { + CERROR("Error %d getting sock peer IP\n", rc); + return rc; + } + + rc = libcfs_sock_getaddr(conn->ksnc_sock, 0, + &conn->ksnc_myipaddr, NULL); + if (rc != 0) { + CERROR("Error %d getting sock local IP\n", rc); + return rc; + } + + return 0; +} + +int +ksocknal_lib_zc_capable(ksock_conn_t *conn) +{ + int caps = conn->ksnc_sock->sk->sk_route_caps; + + if (conn->ksnc_proto == &ksocknal_protocol_v1x) + return 0; + + /* ZC if the socket supports scatter/gather and doesn't need software + * checksums */ + return ((caps & NETIF_F_SG) != 0 && (caps & NETIF_F_ALL_CSUM) != 0); +} + +int +ksocknal_lib_send_iov(ksock_conn_t *conn, ksock_tx_t *tx) +{ + struct socket *sock = conn->ksnc_sock; + int nob; + int rc; + + if (*ksocknal_tunables.ksnd_enable_csum && /* checksum enabled */ + conn->ksnc_proto == &ksocknal_protocol_v2x && /* V2.x connection */ + tx->tx_nob == tx->tx_resid && /* frist sending */ + tx->tx_msg.ksm_csum == 0) /* not checksummed */ + ksocknal_lib_csum_tx(tx); + + /* NB we can't trust socket ops to either consume our iovs + * or leave them alone. */ + + { +#if SOCKNAL_SINGLE_FRAG_TX + struct kvec scratch; + struct kvec *scratchiov = &scratch; + unsigned int niov = 1; +#else + struct kvec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; + unsigned int niov = tx->tx_niov; +#endif + struct msghdr msg = {.msg_flags = MSG_DONTWAIT}; + int i; + + for (nob = i = 0; i < niov; i++) { + scratchiov[i] = tx->tx_iov[i]; + nob += scratchiov[i].iov_len; + } + + if (!list_empty(&conn->ksnc_tx_queue) || + nob < tx->tx_resid) + msg.msg_flags |= MSG_MORE; + + rc = kernel_sendmsg(sock, &msg, scratchiov, niov, nob); + } + return rc; +} + +int +ksocknal_lib_send_kiov(ksock_conn_t *conn, ksock_tx_t *tx) +{ + struct socket *sock = conn->ksnc_sock; + lnet_kiov_t *kiov = tx->tx_kiov; + int rc; + int nob; + + /* Not NOOP message */ + LASSERT(tx->tx_lnetmsg != NULL); + + /* NB we can't trust socket ops to either consume our iovs + * or leave them alone. */ + if (tx->tx_msg.ksm_zc_cookies[0] != 0) { + /* Zero copy is enabled */ + struct sock *sk = sock->sk; + struct page *page = kiov->kiov_page; + int offset = kiov->kiov_offset; + int fragsize = kiov->kiov_len; + int msgflg = MSG_DONTWAIT; + + CDEBUG(D_NET, "page %p + offset %x for %d\n", + page, offset, kiov->kiov_len); + + if (!list_empty(&conn->ksnc_tx_queue) || + fragsize < tx->tx_resid) + msgflg |= MSG_MORE; + + if (sk->sk_prot->sendpage != NULL) { + rc = sk->sk_prot->sendpage(sk, page, + offset, fragsize, msgflg); + } else { + rc = cfs_tcp_sendpage(sk, page, offset, fragsize, + msgflg); + } + } else { +#if SOCKNAL_SINGLE_FRAG_TX || !SOCKNAL_RISK_KMAP_DEADLOCK + struct kvec scratch; + struct kvec *scratchiov = &scratch; + unsigned int niov = 1; +#else +#ifdef CONFIG_HIGHMEM +#warning "XXX risk of kmap deadlock on multiple frags..." 
+#endif + struct kvec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; + unsigned int niov = tx->tx_nkiov; +#endif + struct msghdr msg = {.msg_flags = MSG_DONTWAIT}; + int i; + + for (nob = i = 0; i < niov; i++) { + scratchiov[i].iov_base = kmap(kiov[i].kiov_page) + + kiov[i].kiov_offset; + nob += scratchiov[i].iov_len = kiov[i].kiov_len; + } + + if (!list_empty(&conn->ksnc_tx_queue) || + nob < tx->tx_resid) + msg.msg_flags |= MSG_MORE; + + rc = kernel_sendmsg(sock, &msg, (struct kvec *)scratchiov, niov, nob); + + for (i = 0; i < niov; i++) + kunmap(kiov[i].kiov_page); + } + return rc; +} + +void +ksocknal_lib_eager_ack(ksock_conn_t *conn) +{ + int opt = 1; + struct socket *sock = conn->ksnc_sock; + + /* Remind the socket to ACK eagerly. If I don't, the socket might + * think I'm about to send something it could piggy-back the ACK + * on, introducing delay in completing zero-copy sends in my + * peer. */ + + kernel_setsockopt(sock, SOL_TCP, TCP_QUICKACK, + (char *)&opt, sizeof(opt)); +} + +int +ksocknal_lib_recv_iov(ksock_conn_t *conn) +{ +#if SOCKNAL_SINGLE_FRAG_RX + struct kvec scratch; + struct kvec *scratchiov = &scratch; + unsigned int niov = 1; +#else + struct kvec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; + unsigned int niov = conn->ksnc_rx_niov; +#endif + struct kvec *iov = conn->ksnc_rx_iov; + struct msghdr msg = { + .msg_flags = 0 + }; + int nob; + int i; + int rc; + int fragnob; + int sum; + __u32 saved_csum; + + /* NB we can't trust socket ops to either consume our iovs + * or leave them alone. */ + LASSERT(niov > 0); + + for (nob = i = 0; i < niov; i++) { + scratchiov[i] = iov[i]; + nob += scratchiov[i].iov_len; + } + LASSERT(nob <= conn->ksnc_rx_nob_wanted); + + rc = kernel_recvmsg(conn->ksnc_sock, &msg, + scratchiov, niov, nob, MSG_DONTWAIT); + + saved_csum = 0; + if (conn->ksnc_proto == &ksocknal_protocol_v2x) { + saved_csum = conn->ksnc_msg.ksm_csum; + conn->ksnc_msg.ksm_csum = 0; + } + + if (saved_csum != 0) { + /* accumulate checksum */ + for (i = 0, sum = rc; sum > 0; i++, sum -= fragnob) { + LASSERT(i < niov); + + fragnob = iov[i].iov_len; + if (fragnob > sum) + fragnob = sum; + + conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum, + iov[i].iov_base, fragnob); + } + conn->ksnc_msg.ksm_csum = saved_csum; + } + + return rc; +} + +static void +ksocknal_lib_kiov_vunmap(void *addr) +{ + if (addr == NULL) + return; + + vunmap(addr); +} + +static void * +ksocknal_lib_kiov_vmap(lnet_kiov_t *kiov, int niov, + struct kvec *iov, struct page **pages) +{ + void *addr; + int nob; + int i; + + if (!*ksocknal_tunables.ksnd_zc_recv || pages == NULL) + return NULL; + + LASSERT(niov <= LNET_MAX_IOV); + + if (niov < 2 || + niov < *ksocknal_tunables.ksnd_zc_recv_min_nfrags) + return NULL; + + for (nob = i = 0; i < niov; i++) { + if ((kiov[i].kiov_offset != 0 && i > 0) || + (kiov[i].kiov_offset + kiov[i].kiov_len != PAGE_CACHE_SIZE && i < niov - 1)) + return NULL; + + pages[i] = kiov[i].kiov_page; + nob += kiov[i].kiov_len; + } + + addr = vmap(pages, niov, VM_MAP, PAGE_KERNEL); + if (addr == NULL) + return NULL; + + iov->iov_base = addr + kiov[0].kiov_offset; + iov->iov_len = nob; + + return addr; +} + +int +ksocknal_lib_recv_kiov(ksock_conn_t *conn) +{ +#if SOCKNAL_SINGLE_FRAG_RX || !SOCKNAL_RISK_KMAP_DEADLOCK + struct kvec scratch; + struct kvec *scratchiov = &scratch; + struct page **pages = NULL; + unsigned int niov = 1; +#else +#ifdef CONFIG_HIGHMEM +#warning "XXX risk of kmap deadlock on multiple frags..." 
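The receive paths here fold the message checksum in fragment by fragment, counting only the bytes the last recvmsg() actually returned and clamping the final fragment to the residue. A self-contained userspace sketch of that accumulation, with a toy additive sum standing in for ksocknal_csum() (crc32_le on Linux) and struct iovec standing in for the kernel kvec/kiov types:

#include <stdio.h>
#include <sys/uio.h>

static unsigned int toy_csum(unsigned int csum, const void *p, size_t len)
{
	const unsigned char *b = p;

	while (len-- > 0)
		csum += *b++;
	return csum;
}

/* 'rc' is what recvmsg() returned; never checksum past it. */
static unsigned int csum_received(const struct iovec *iov, int niov, int rc)
{
	unsigned int csum = ~0U;
	int sum = rc;
	int i;

	for (i = 0; i < niov && sum > 0; i++) {
		int fragnob = iov[i].iov_len;

		if (fragnob > sum)
			fragnob = sum;		/* partial final fragment */
		csum = toy_csum(csum, iov[i].iov_base, fragnob);
		sum -= fragnob;
	}
	return csum;
}

int main(void)
{
	char a[] = "hello", b[] = "world";
	struct iovec iov[2] = { { a, 5 }, { b, 5 } };

	/* pretend recvmsg() delivered 7 bytes: all of a, 2 bytes of b */
	printf("csum = %#x\n", csum_received(iov, 2, 7));
	return 0;
}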
+#endif + struct kvec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; + struct page **pages = conn->ksnc_scheduler->kss_rx_scratch_pgs; + unsigned int niov = conn->ksnc_rx_nkiov; +#endif + lnet_kiov_t *kiov = conn->ksnc_rx_kiov; + struct msghdr msg = { + .msg_flags = 0 + }; + int nob; + int i; + int rc; + void *base; + void *addr; + int sum; + int fragnob; + int n; + + /* NB we can't trust socket ops to either consume our iovs + * or leave them alone. */ + addr = ksocknal_lib_kiov_vmap(kiov, niov, scratchiov, pages); + if (addr != NULL) { + nob = scratchiov[0].iov_len; + n = 1; + + } else { + for (nob = i = 0; i < niov; i++) { + nob += scratchiov[i].iov_len = kiov[i].kiov_len; + scratchiov[i].iov_base = kmap(kiov[i].kiov_page) + + kiov[i].kiov_offset; + } + n = niov; + } + + LASSERT(nob <= conn->ksnc_rx_nob_wanted); + + rc = kernel_recvmsg(conn->ksnc_sock, &msg, + (struct kvec *)scratchiov, n, nob, MSG_DONTWAIT); + + if (conn->ksnc_msg.ksm_csum != 0) { + for (i = 0, sum = rc; sum > 0; i++, sum -= fragnob) { + LASSERT(i < niov); + + /* Dang! have to kmap again because I have nowhere to stash the + * mapped address. But by doing it while the page is still + * mapped, the kernel just bumps the map count and returns me + * the address it stashed. */ + base = kmap(kiov[i].kiov_page) + kiov[i].kiov_offset; + fragnob = kiov[i].kiov_len; + if (fragnob > sum) + fragnob = sum; + + conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum, + base, fragnob); + + kunmap(kiov[i].kiov_page); + } + } + + if (addr != NULL) { + ksocknal_lib_kiov_vunmap(addr); + } else { + for (i = 0; i < niov; i++) + kunmap(kiov[i].kiov_page); + } + + return rc; +} + +void +ksocknal_lib_csum_tx(ksock_tx_t *tx) +{ + int i; + __u32 csum; + void *base; + + LASSERT(tx->tx_iov[0].iov_base == &tx->tx_msg); + LASSERT(tx->tx_conn != NULL); + LASSERT(tx->tx_conn->ksnc_proto == &ksocknal_protocol_v2x); + + tx->tx_msg.ksm_csum = 0; + + csum = ksocknal_csum(~0, tx->tx_iov[0].iov_base, + tx->tx_iov[0].iov_len); + + if (tx->tx_kiov != NULL) { + for (i = 0; i < tx->tx_nkiov; i++) { + base = kmap(tx->tx_kiov[i].kiov_page) + + tx->tx_kiov[i].kiov_offset; + + csum = ksocknal_csum(csum, base, tx->tx_kiov[i].kiov_len); + + kunmap(tx->tx_kiov[i].kiov_page); + } + } else { + for (i = 1; i < tx->tx_niov; i++) + csum = ksocknal_csum(csum, tx->tx_iov[i].iov_base, + tx->tx_iov[i].iov_len); + } + + if (*ksocknal_tunables.ksnd_inject_csum_error) { + csum++; + *ksocknal_tunables.ksnd_inject_csum_error = 0; + } + + tx->tx_msg.ksm_csum = csum; +} + +int +ksocknal_lib_get_conn_tunables(ksock_conn_t *conn, int *txmem, int *rxmem, int *nagle) +{ + struct socket *sock = conn->ksnc_sock; + int len; + int rc; + + rc = ksocknal_connsock_addref(conn); + if (rc != 0) { + LASSERT(conn->ksnc_closing); + *txmem = *rxmem = *nagle = 0; + return -ESHUTDOWN; + } + + rc = libcfs_sock_getbuf(sock, txmem, rxmem); + if (rc == 0) { + len = sizeof(*nagle); + rc = kernel_getsockopt(sock, SOL_TCP, TCP_NODELAY, + (char *)nagle, &len); + } + + ksocknal_connsock_decref(conn); + + if (rc == 0) + *nagle = !*nagle; + else + *txmem = *rxmem = *nagle = 0; + + return rc; +} + +int +ksocknal_lib_setup_sock(struct socket *sock) +{ + int rc; + int option; + int keep_idle; + int keep_intvl; + int keep_count; + int do_keepalive; + struct linger linger; + + sock->sk->sk_allocation = GFP_NOFS; + + /* Ensure this socket aborts active sends immediately when we close + * it. 
*/ + + linger.l_onoff = 0; + linger.l_linger = 0; + + rc = kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER, + (char *)&linger, sizeof(linger)); + if (rc != 0) { + CERROR("Can't set SO_LINGER: %d\n", rc); + return rc; + } + + option = -1; + rc = kernel_setsockopt(sock, SOL_TCP, TCP_LINGER2, + (char *)&option, sizeof(option)); + if (rc != 0) { + CERROR("Can't set SO_LINGER2: %d\n", rc); + return rc; + } + + if (!*ksocknal_tunables.ksnd_nagle) { + option = 1; + + rc = kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, + (char *)&option, sizeof(option)); + if (rc != 0) { + CERROR("Can't disable nagle: %d\n", rc); + return rc; + } + } + + rc = libcfs_sock_setbuf(sock, + *ksocknal_tunables.ksnd_tx_buffer_size, + *ksocknal_tunables.ksnd_rx_buffer_size); + if (rc != 0) { + CERROR("Can't set buffer tx %d, rx %d buffers: %d\n", + *ksocknal_tunables.ksnd_tx_buffer_size, + *ksocknal_tunables.ksnd_rx_buffer_size, rc); + return rc; + } + +/* TCP_BACKOFF_* sockopt tunables unsupported in stock kernels */ + + /* snapshot tunables */ + keep_idle = *ksocknal_tunables.ksnd_keepalive_idle; + keep_count = *ksocknal_tunables.ksnd_keepalive_count; + keep_intvl = *ksocknal_tunables.ksnd_keepalive_intvl; + + do_keepalive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0); + + option = (do_keepalive ? 1 : 0); + rc = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, + (char *)&option, sizeof(option)); + if (rc != 0) { + CERROR("Can't set SO_KEEPALIVE: %d\n", rc); + return rc; + } + + if (!do_keepalive) + return 0; + + rc = kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE, + (char *)&keep_idle, sizeof(keep_idle)); + if (rc != 0) { + CERROR("Can't set TCP_KEEPIDLE: %d\n", rc); + return rc; + } + + rc = kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL, + (char *)&keep_intvl, sizeof(keep_intvl)); + if (rc != 0) { + CERROR("Can't set TCP_KEEPINTVL: %d\n", rc); + return rc; + } + + rc = kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT, + (char *)&keep_count, sizeof(keep_count)); + if (rc != 0) { + CERROR("Can't set TCP_KEEPCNT: %d\n", rc); + return rc; + } + + return 0; +} + +void +ksocknal_lib_push_conn(ksock_conn_t *conn) +{ + struct sock *sk; + struct tcp_sock *tp; + int nonagle; + int val = 1; + int rc; + + rc = ksocknal_connsock_addref(conn); + if (rc != 0) /* being shut down */ + return; + + sk = conn->ksnc_sock->sk; + tp = tcp_sk(sk); + + lock_sock(sk); + nonagle = tp->nonagle; + tp->nonagle = 1; + release_sock(sk); + + rc = kernel_setsockopt(conn->ksnc_sock, SOL_TCP, TCP_NODELAY, + (char *)&val, sizeof(val)); + LASSERT(rc == 0); + + lock_sock(sk); + tp->nonagle = nonagle; + release_sock(sk); + + ksocknal_connsock_decref(conn); +} + +extern void ksocknal_read_callback(ksock_conn_t *conn); +extern void ksocknal_write_callback(ksock_conn_t *conn); +/* + * socket call back in Linux + */ +static void +ksocknal_data_ready(struct sock *sk) +{ + ksock_conn_t *conn; + + /* interleave correctly with closing sockets... */ + LASSERT(!in_irq()); + read_lock(&ksocknal_data.ksnd_global_lock); + + conn = sk->sk_user_data; + if (conn == NULL) { /* raced with ksocknal_terminate_conn */ + LASSERT(sk->sk_data_ready != &ksocknal_data_ready); + sk->sk_data_ready(sk); + } else + ksocknal_read_callback(conn); + + read_unlock(&ksocknal_data.ksnd_global_lock); +} + +static void +ksocknal_write_space(struct sock *sk) +{ + ksock_conn_t *conn; + int wspace; + int min_wpace; + + /* interleave correctly with closing sockets... 
*/ + LASSERT(!in_irq()); + read_lock(&ksocknal_data.ksnd_global_lock); + + conn = sk->sk_user_data; + wspace = SOCKNAL_WSPACE(sk); + min_wpace = SOCKNAL_MIN_WSPACE(sk); + + CDEBUG(D_NET, "sk %p wspace %d low water %d conn %p%s%s%s\n", + sk, wspace, min_wpace, conn, + (conn == NULL) ? "" : (conn->ksnc_tx_ready ? + " ready" : " blocked"), + (conn == NULL) ? "" : (conn->ksnc_tx_scheduled ? + " scheduled" : " idle"), + (conn == NULL) ? "" : (list_empty(&conn->ksnc_tx_queue) ? + " empty" : " queued")); + + if (conn == NULL) { /* raced with ksocknal_terminate_conn */ + LASSERT(sk->sk_write_space != &ksocknal_write_space); + sk->sk_write_space(sk); + + read_unlock(&ksocknal_data.ksnd_global_lock); + return; + } + + if (wspace >= min_wpace) { /* got enough space */ + ksocknal_write_callback(conn); + + /* Clear SOCK_NOSPACE _after_ ksocknal_write_callback so the + * ENOMEM check in ksocknal_transmit is race-free (think about + * it). */ + + clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + } + + read_unlock(&ksocknal_data.ksnd_global_lock); +} + +void +ksocknal_lib_save_callback(struct socket *sock, ksock_conn_t *conn) +{ + conn->ksnc_saved_data_ready = sock->sk->sk_data_ready; + conn->ksnc_saved_write_space = sock->sk->sk_write_space; +} + +void +ksocknal_lib_set_callback(struct socket *sock, ksock_conn_t *conn) +{ + sock->sk->sk_user_data = conn; + sock->sk->sk_data_ready = ksocknal_data_ready; + sock->sk->sk_write_space = ksocknal_write_space; + return; +} + +void +ksocknal_lib_reset_callback(struct socket *sock, ksock_conn_t *conn) +{ + /* Remove conn's network callbacks. + * NB I _have_ to restore the callback, rather than storing a noop, + * since the socket could survive past this module being unloaded!! */ + sock->sk->sk_data_ready = conn->ksnc_saved_data_ready; + sock->sk->sk_write_space = conn->ksnc_saved_write_space; + + /* A callback could be in progress already; they hold a read lock + * on ksnd_global_lock (to serialise with me) and NOOP if + * sk_user_data is NULL. */ + sock->sk->sk_user_data = NULL; + + return ; +} + +int +ksocknal_lib_memory_pressure(ksock_conn_t *conn) +{ + int rc = 0; + ksock_sched_t *sched; + + sched = conn->ksnc_scheduler; + spin_lock_bh(&sched->kss_lock); + + if (!test_bit(SOCK_NOSPACE, &conn->ksnc_sock->flags) && + !conn->ksnc_tx_ready) { + /* SOCK_NOSPACE is set when the socket fills + * and cleared in the write_space callback + * (which also sets ksnc_tx_ready). If + * SOCK_NOSPACE and ksnc_tx_ready are BOTH + * zero, I didn't fill the socket and + * write_space won't reschedule me, so I + * return -ENOMEM to get my caller to retry + * after a timeout */ + rc = -ENOMEM; + } + + spin_unlock_bh(&sched->kss_lock); + + return rc; +} diff --git a/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.h b/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.h new file mode 100644 index 000000000..f5563881b --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.h @@ -0,0 +1,86 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_PORTAL_ALLOC + +#ifndef __LINUX_SOCKNAL_LIB_H__ +#define __LINUX_SOCKNAL_LIB_H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "../../../include/linux/libcfs/libcfs.h" + +#include +static inline __u32 ksocknal_csum(__u32 crc, unsigned char const *p, size_t len) +{ +#if 1 + return crc32_le(crc, p, len); +#else + while (len-- > 0) + crc = ((crc + 0x100) & ~0xff) | ((crc + *p++) & 0xff) ; + return crc; +#endif +} + +#define SOCKNAL_WSPACE(sk) sk_stream_wspace(sk) +#define SOCKNAL_MIN_WSPACE(sk) sk_stream_min_wspace(sk) + +/* assume one thread for each connection type */ +#define SOCKNAL_NSCHEDS 3 +#define SOCKNAL_NSCHEDS_HIGH (SOCKNAL_NSCHEDS << 1) + +#endif diff --git a/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c b/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c new file mode 100644 index 000000000..86b88db1c --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2012, Intel Corporation. + * + * Author: Eric Barton + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#include "socklnd.h" + +static int sock_timeout = 50; +module_param(sock_timeout, int, 0644); +MODULE_PARM_DESC(sock_timeout, "dead socket timeout (seconds)"); + +static int credits = 256; +module_param(credits, int, 0444); +MODULE_PARM_DESC(credits, "# concurrent sends"); + +static int peer_credits = 8; +module_param(peer_credits, int, 0444); +MODULE_PARM_DESC(peer_credits, "# concurrent sends to 1 peer"); + +static int peer_buffer_credits; +module_param(peer_buffer_credits, int, 0444); +MODULE_PARM_DESC(peer_buffer_credits, "# per-peer router buffer credits"); + +static int peer_timeout = 180; +module_param(peer_timeout, int, 0444); +MODULE_PARM_DESC(peer_timeout, "Seconds without aliveness news to declare peer dead (<=0 to disable)"); + +/* Number of daemons in each thread pool which is percpt, + * we will estimate reasonable value based on CPUs if it's not set. */ +static unsigned int nscheds; +module_param(nscheds, int, 0444); +MODULE_PARM_DESC(nscheds, "# scheduler daemons in each pool while starting"); + +static int nconnds = 4; +module_param(nconnds, int, 0444); +MODULE_PARM_DESC(nconnds, "# connection daemons while starting"); + +static int nconnds_max = 64; +module_param(nconnds_max, int, 0444); +MODULE_PARM_DESC(nconnds_max, "max # connection daemons"); + +static int min_reconnectms = 1000; +module_param(min_reconnectms, int, 0644); +MODULE_PARM_DESC(min_reconnectms, "min connection retry interval (mS)"); + +static int max_reconnectms = 60000; +module_param(max_reconnectms, int, 0644); +MODULE_PARM_DESC(max_reconnectms, "max connection retry interval (mS)"); + +# define DEFAULT_EAGER_ACK 0 +static int eager_ack = DEFAULT_EAGER_ACK; +module_param(eager_ack, int, 0644); +MODULE_PARM_DESC(eager_ack, "send tcp ack packets eagerly"); + +static int typed_conns = 1; +module_param(typed_conns, int, 0444); +MODULE_PARM_DESC(typed_conns, "use different sockets for bulk"); + +static int min_bulk = 1<<10; +module_param(min_bulk, int, 0644); +MODULE_PARM_DESC(min_bulk, "smallest 'large' message"); + +# define DEFAULT_BUFFER_SIZE 0 +static int tx_buffer_size = DEFAULT_BUFFER_SIZE; +module_param(tx_buffer_size, int, 0644); +MODULE_PARM_DESC(tx_buffer_size, "socket tx buffer size (0 for system default)"); + +static int rx_buffer_size = DEFAULT_BUFFER_SIZE; +module_param(rx_buffer_size, int, 0644); +MODULE_PARM_DESC(rx_buffer_size, "socket rx buffer size (0 for system default)"); + +static int nagle; +module_param(nagle, int, 0644); +MODULE_PARM_DESC(nagle, "enable NAGLE?"); + +static int round_robin = 1; +module_param(round_robin, int, 0644); +MODULE_PARM_DESC(round_robin, "Round robin for multiple interfaces"); + +static int keepalive = 30; +module_param(keepalive, int, 0644); +MODULE_PARM_DESC(keepalive, "# seconds before send keepalive"); + +static int keepalive_idle = 30; +module_param(keepalive_idle, int, 0644); +MODULE_PARM_DESC(keepalive_idle, "# idle seconds before probe"); + +#define DEFAULT_KEEPALIVE_COUNT 5 +static int keepalive_count = DEFAULT_KEEPALIVE_COUNT; +module_param(keepalive_count, int, 0644); +MODULE_PARM_DESC(keepalive_count, "# missed probes == dead"); + +static int keepalive_intvl = 5; +module_param(keepalive_intvl, int, 0644); +MODULE_PARM_DESC(keepalive_intvl, "seconds between probes"); + +static int enable_csum; +module_param(enable_csum, int, 0644); +MODULE_PARM_DESC(enable_csum, "enable check sum"); + +static int inject_csum_error; +module_param(inject_csum_error, int, 0644); +MODULE_PARM_DESC(inject_csum_error, "set non-zero to inject a checksum error"); + 
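ksocknal_tunables_init() below stores pointers to these module parameters rather than copies, so a value rewritten at runtime through sysfs (the 0644 parameters such as sock_timeout or keepalive) is picked up on the next dereference, while the 0444 parameters are effectively load-time only; loading with explicit options (e.g. modprobe ksocklnd sock_timeout=100, assuming the usual ksocklnd.ko module name) sets the same variables before the pointers are wired. A small userspace sketch of that pointer-table pattern, with hypothetical names:

#include <stdio.h>

static int sock_timeout = 50;		/* stands in for the module_param */

struct tunables {
	int *timeout;			/* pointer, not a snapshot */
};

static struct tunables tun;

static void tunables_init(void)
{
	tun.timeout = &sock_timeout;	/* wired once at init */
}

int main(void)
{
	tunables_init();
	printf("timeout = %d\n", *tun.timeout);	/* 50 */
	sock_timeout = 100;			/* models a sysfs write */
	printf("timeout = %d\n", *tun.timeout);	/* 100, no re-init needed */
	return 0;
}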
+static int nonblk_zcack = 1; +module_param(nonblk_zcack, int, 0644); +MODULE_PARM_DESC(nonblk_zcack, "always send ZC-ACK on non-blocking connection"); + +static unsigned int zc_min_payload = 16 << 10; +module_param(zc_min_payload, int, 0644); +MODULE_PARM_DESC(zc_min_payload, "minimum payload size to zero copy"); + +static unsigned int zc_recv; +module_param(zc_recv, int, 0644); +MODULE_PARM_DESC(zc_recv, "enable ZC recv for Chelsio driver"); + +static unsigned int zc_recv_min_nfrags = 16; +module_param(zc_recv_min_nfrags, int, 0644); +MODULE_PARM_DESC(zc_recv_min_nfrags, "minimum # of fragments to enable ZC recv"); + + +#if SOCKNAL_VERSION_DEBUG +static int protocol = 3; +module_param(protocol, int, 0644); +MODULE_PARM_DESC(protocol, "protocol version"); +#endif + +ksock_tunables_t ksocknal_tunables; + +int ksocknal_tunables_init(void) +{ + + /* initialize ksocknal_tunables structure */ + ksocknal_tunables.ksnd_timeout = &sock_timeout; + ksocknal_tunables.ksnd_nscheds = &nscheds; + ksocknal_tunables.ksnd_nconnds = &nconnds; + ksocknal_tunables.ksnd_nconnds_max = &nconnds_max; + ksocknal_tunables.ksnd_min_reconnectms = &min_reconnectms; + ksocknal_tunables.ksnd_max_reconnectms = &max_reconnectms; + ksocknal_tunables.ksnd_eager_ack = &eager_ack; + ksocknal_tunables.ksnd_typed_conns = &typed_conns; + ksocknal_tunables.ksnd_min_bulk = &min_bulk; + ksocknal_tunables.ksnd_tx_buffer_size = &tx_buffer_size; + ksocknal_tunables.ksnd_rx_buffer_size = &rx_buffer_size; + ksocknal_tunables.ksnd_nagle = &nagle; + ksocknal_tunables.ksnd_round_robin = &round_robin; + ksocknal_tunables.ksnd_keepalive = &keepalive; + ksocknal_tunables.ksnd_keepalive_idle = &keepalive_idle; + ksocknal_tunables.ksnd_keepalive_count = &keepalive_count; + ksocknal_tunables.ksnd_keepalive_intvl = &keepalive_intvl; + ksocknal_tunables.ksnd_credits = &credits; + ksocknal_tunables.ksnd_peertxcredits = &peer_credits; + ksocknal_tunables.ksnd_peerrtrcredits = &peer_buffer_credits; + ksocknal_tunables.ksnd_peertimeout = &peer_timeout; + ksocknal_tunables.ksnd_enable_csum = &enable_csum; + ksocknal_tunables.ksnd_inject_csum_error = &inject_csum_error; + ksocknal_tunables.ksnd_nonblk_zcack = &nonblk_zcack; + ksocknal_tunables.ksnd_zc_min_payload = &zc_min_payload; + ksocknal_tunables.ksnd_zc_recv = &zc_recv; + ksocknal_tunables.ksnd_zc_recv_min_nfrags = &zc_recv_min_nfrags; + + + +#if SOCKNAL_VERSION_DEBUG + ksocknal_tunables.ksnd_protocol = &protocol; +#endif + + if (*ksocknal_tunables.ksnd_zc_min_payload < (2 << 10)) + *ksocknal_tunables.ksnd_zc_min_payload = 2 << 10; + + return 0; +}; diff --git a/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c b/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c new file mode 100644 index 000000000..8596581f5 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c @@ -0,0 +1,797 @@ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2012, Intel Corporation. + * + * Author: Zach Brown + * Author: Peter J. Braam + * Author: Phil Schwan + * Author: Eric Barton + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include "socklnd.h" + +/* + * Protocol entries : + * pro_send_hello : send hello message + * pro_recv_hello : receive hello message + * pro_pack : pack message header + * pro_unpack : unpack message header + * pro_queue_tx_zcack() : Called holding BH lock: kss_lock + * return 1 if ACK is piggybacked, otherwise return 0 + * pro_queue_tx_msg() : Called holding BH lock: kss_lock + * return the ACK that piggybacked by my message, or NULL + * pro_handle_zcreq() : handler of incoming ZC-REQ + * pro_handle_zcack() : handler of incoming ZC-ACK + * pro_match_tx() : Called holding glock + */ + +static ksock_tx_t * +ksocknal_queue_tx_msg_v1(ksock_conn_t *conn, ksock_tx_t *tx_msg) +{ + /* V1.x, just enqueue it */ + list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue); + return NULL; +} + +void +ksocknal_next_tx_carrier(ksock_conn_t *conn) +{ + ksock_tx_t *tx = conn->ksnc_tx_carrier; + + /* Called holding BH lock: conn->ksnc_scheduler->kss_lock */ + LASSERT(!list_empty(&conn->ksnc_tx_queue)); + LASSERT(tx != NULL); + + /* Next TX that can carry ZC-ACK or LNet message */ + if (tx->tx_list.next == &conn->ksnc_tx_queue) { + /* no more packets queued */ + conn->ksnc_tx_carrier = NULL; + } else { + conn->ksnc_tx_carrier = list_entry(tx->tx_list.next, + ksock_tx_t, tx_list); + LASSERT(conn->ksnc_tx_carrier->tx_msg.ksm_type == tx->tx_msg.ksm_type); + } +} + +static int +ksocknal_queue_tx_zcack_v2(ksock_conn_t *conn, + ksock_tx_t *tx_ack, __u64 cookie) +{ + ksock_tx_t *tx = conn->ksnc_tx_carrier; + + LASSERT(tx_ack == NULL || + tx_ack->tx_msg.ksm_type == KSOCK_MSG_NOOP); + + /* + * Enqueue or piggyback tx_ack / cookie + * . no tx can piggyback cookie of tx_ack (or cookie), just + * enqueue the tx_ack (if tx_ack != NUL) and return NULL. + * . There is tx can piggyback cookie of tx_ack (or cookie), + * piggyback the cookie and return the tx. + */ + if (tx == NULL) { + if (tx_ack != NULL) { + list_add_tail(&tx_ack->tx_list, + &conn->ksnc_tx_queue); + conn->ksnc_tx_carrier = tx_ack; + } + return 0; + } + + if (tx->tx_msg.ksm_type == KSOCK_MSG_NOOP) { + /* tx is noop zc-ack, can't piggyback zc-ack cookie */ + if (tx_ack != NULL) + list_add_tail(&tx_ack->tx_list, + &conn->ksnc_tx_queue); + return 0; + } + + LASSERT(tx->tx_msg.ksm_type == KSOCK_MSG_LNET); + LASSERT(tx->tx_msg.ksm_zc_cookies[1] == 0); + + if (tx_ack != NULL) + cookie = tx_ack->tx_msg.ksm_zc_cookies[1]; + + /* piggyback the zc-ack cookie */ + tx->tx_msg.ksm_zc_cookies[1] = cookie; + /* move on to the next TX which can carry cookie */ + ksocknal_next_tx_carrier(conn); + + return 1; +} + +static ksock_tx_t * +ksocknal_queue_tx_msg_v2(ksock_conn_t *conn, ksock_tx_t *tx_msg) +{ + ksock_tx_t *tx = conn->ksnc_tx_carrier; + + /* + * Enqueue tx_msg: + * . If there is no NOOP on the connection, just enqueue + * tx_msg and return NULL + * . If there is NOOP on the connection, piggyback the cookie + * and replace the NOOP tx, and return the NOOP tx. 
+ */ + if (tx == NULL) { /* nothing on queue */ + list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue); + conn->ksnc_tx_carrier = tx_msg; + return NULL; + } + + if (tx->tx_msg.ksm_type == KSOCK_MSG_LNET) { /* nothing to carry */ + list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue); + return NULL; + } + + LASSERT(tx->tx_msg.ksm_type == KSOCK_MSG_NOOP); + + /* There is a noop zc-ack can be piggybacked */ + tx_msg->tx_msg.ksm_zc_cookies[1] = tx->tx_msg.ksm_zc_cookies[1]; + ksocknal_next_tx_carrier(conn); + + /* use new_tx to replace the noop zc-ack packet */ + list_add(&tx_msg->tx_list, &tx->tx_list); + list_del(&tx->tx_list); + + return tx; +} + +static int +ksocknal_queue_tx_zcack_v3(ksock_conn_t *conn, + ksock_tx_t *tx_ack, __u64 cookie) +{ + ksock_tx_t *tx; + + if (conn->ksnc_type != SOCKLND_CONN_ACK) + return ksocknal_queue_tx_zcack_v2(conn, tx_ack, cookie); + + /* non-blocking ZC-ACK (to router) */ + LASSERT(tx_ack == NULL || + tx_ack->tx_msg.ksm_type == KSOCK_MSG_NOOP); + + tx = conn->ksnc_tx_carrier; + if (tx == NULL) { + if (tx_ack != NULL) { + list_add_tail(&tx_ack->tx_list, + &conn->ksnc_tx_queue); + conn->ksnc_tx_carrier = tx_ack; + } + return 0; + } + + /* conn->ksnc_tx_carrier != NULL */ + + if (tx_ack != NULL) + cookie = tx_ack->tx_msg.ksm_zc_cookies[1]; + + if (cookie == SOCKNAL_KEEPALIVE_PING) /* ignore keepalive PING */ + return 1; + + if (tx->tx_msg.ksm_zc_cookies[1] == SOCKNAL_KEEPALIVE_PING) { + /* replace the keepalive PING with a real ACK */ + LASSERT(tx->tx_msg.ksm_zc_cookies[0] == 0); + tx->tx_msg.ksm_zc_cookies[1] = cookie; + return 1; + } + + if (cookie == tx->tx_msg.ksm_zc_cookies[0] || + cookie == tx->tx_msg.ksm_zc_cookies[1]) { + CWARN("%s: duplicated ZC cookie: %llu\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), cookie); + return 1; /* XXX return error in the future */ + } + + if (tx->tx_msg.ksm_zc_cookies[0] == 0) { + /* NOOP tx has only one ZC-ACK cookie, can carry at least one more */ + if (tx->tx_msg.ksm_zc_cookies[1] > cookie) { + tx->tx_msg.ksm_zc_cookies[0] = tx->tx_msg.ksm_zc_cookies[1]; + tx->tx_msg.ksm_zc_cookies[1] = cookie; + } else { + tx->tx_msg.ksm_zc_cookies[0] = cookie; + } + + if (tx->tx_msg.ksm_zc_cookies[0] - tx->tx_msg.ksm_zc_cookies[1] > 2) { + /* not likely to carry more ACKs, skip it to simplify logic */ + ksocknal_next_tx_carrier(conn); + } + + return 1; + } + + /* takes two or more cookies already */ + + if (tx->tx_msg.ksm_zc_cookies[0] > tx->tx_msg.ksm_zc_cookies[1]) { + __u64 tmp = 0; + + /* two separated cookies: (a+2, a) or (a+1, a) */ + LASSERT(tx->tx_msg.ksm_zc_cookies[0] - + tx->tx_msg.ksm_zc_cookies[1] <= 2); + + if (tx->tx_msg.ksm_zc_cookies[0] - + tx->tx_msg.ksm_zc_cookies[1] == 2) { + if (cookie == tx->tx_msg.ksm_zc_cookies[1] + 1) + tmp = cookie; + } else if (cookie == tx->tx_msg.ksm_zc_cookies[1] - 1) { + tmp = tx->tx_msg.ksm_zc_cookies[1]; + } else if (cookie == tx->tx_msg.ksm_zc_cookies[0] + 1) { + tmp = tx->tx_msg.ksm_zc_cookies[0]; + } + + if (tmp != 0) { + /* range of cookies */ + tx->tx_msg.ksm_zc_cookies[0] = tmp - 1; + tx->tx_msg.ksm_zc_cookies[1] = tmp + 1; + return 1; + } + + } else { + /* ksm_zc_cookies[0] < ksm_zc_cookies[1], it is range of cookies */ + if (cookie >= tx->tx_msg.ksm_zc_cookies[0] && + cookie <= tx->tx_msg.ksm_zc_cookies[1]) { + CWARN("%s: duplicated ZC cookie: %llu\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), cookie); + return 1; /* XXX: return error in the future */ + } + + if (cookie == tx->tx_msg.ksm_zc_cookies[1] + 1) { + tx->tx_msg.ksm_zc_cookies[1] = cookie; + return 1; + } + + if 
(cookie == tx->tx_msg.ksm_zc_cookies[0] - 1) { + tx->tx_msg.ksm_zc_cookies[0] = cookie; + return 1; + } + } + + /* failed to piggyback ZC-ACK */ + if (tx_ack != NULL) { + list_add_tail(&tx_ack->tx_list, &conn->ksnc_tx_queue); + /* the next tx can piggyback at least 1 ACK */ + ksocknal_next_tx_carrier(conn); + } + + return 0; +} + +static int +ksocknal_match_tx(ksock_conn_t *conn, ksock_tx_t *tx, int nonblk) +{ + int nob; + +#if SOCKNAL_VERSION_DEBUG + if (!*ksocknal_tunables.ksnd_typed_conns) + return SOCKNAL_MATCH_YES; +#endif + + if (tx == NULL || tx->tx_lnetmsg == NULL) { + /* noop packet */ + nob = offsetof(ksock_msg_t, ksm_u); + } else { + nob = tx->tx_lnetmsg->msg_len + + ((conn->ksnc_proto == &ksocknal_protocol_v1x) ? + sizeof(lnet_hdr_t) : sizeof(ksock_msg_t)); + } + + /* default checking for typed connection */ + switch (conn->ksnc_type) { + default: + CERROR("ksnc_type bad: %u\n", conn->ksnc_type); + LBUG(); + case SOCKLND_CONN_ANY: + return SOCKNAL_MATCH_YES; + + case SOCKLND_CONN_BULK_IN: + return SOCKNAL_MATCH_MAY; + + case SOCKLND_CONN_BULK_OUT: + if (nob < *ksocknal_tunables.ksnd_min_bulk) + return SOCKNAL_MATCH_MAY; + else + return SOCKNAL_MATCH_YES; + + case SOCKLND_CONN_CONTROL: + if (nob >= *ksocknal_tunables.ksnd_min_bulk) + return SOCKNAL_MATCH_MAY; + else + return SOCKNAL_MATCH_YES; + } +} + +static int +ksocknal_match_tx_v3(ksock_conn_t *conn, ksock_tx_t *tx, int nonblk) +{ + int nob; + + if (tx == NULL || tx->tx_lnetmsg == NULL) + nob = offsetof(ksock_msg_t, ksm_u); + else + nob = tx->tx_lnetmsg->msg_len + sizeof(ksock_msg_t); + + switch (conn->ksnc_type) { + default: + CERROR("ksnc_type bad: %u\n", conn->ksnc_type); + LBUG(); + case SOCKLND_CONN_ANY: + return SOCKNAL_MATCH_NO; + + case SOCKLND_CONN_ACK: + if (nonblk) + return SOCKNAL_MATCH_YES; + else if (tx == NULL || tx->tx_lnetmsg == NULL) + return SOCKNAL_MATCH_MAY; + else + return SOCKNAL_MATCH_NO; + + case SOCKLND_CONN_BULK_OUT: + if (nonblk) + return SOCKNAL_MATCH_NO; + else if (nob < *ksocknal_tunables.ksnd_min_bulk) + return SOCKNAL_MATCH_MAY; + else + return SOCKNAL_MATCH_YES; + + case SOCKLND_CONN_CONTROL: + if (nonblk) + return SOCKNAL_MATCH_NO; + else if (nob >= *ksocknal_tunables.ksnd_min_bulk) + return SOCKNAL_MATCH_MAY; + else + return SOCKNAL_MATCH_YES; + } +} + +/* (Sink) handle incoming ZC request from sender */ +static int +ksocknal_handle_zcreq(ksock_conn_t *c, __u64 cookie, int remote) +{ + ksock_peer_t *peer = c->ksnc_peer; + ksock_conn_t *conn; + ksock_tx_t *tx; + int rc; + + read_lock(&ksocknal_data.ksnd_global_lock); + + conn = ksocknal_find_conn_locked(peer, NULL, !!remote); + if (conn != NULL) { + ksock_sched_t *sched = conn->ksnc_scheduler; + + LASSERT(conn->ksnc_proto->pro_queue_tx_zcack != NULL); + + spin_lock_bh(&sched->kss_lock); + + rc = conn->ksnc_proto->pro_queue_tx_zcack(conn, NULL, cookie); + + spin_unlock_bh(&sched->kss_lock); + + if (rc) { /* piggybacked */ + read_unlock(&ksocknal_data.ksnd_global_lock); + return 0; + } + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + + /* ACK connection is not ready, or can't piggyback the ACK */ + tx = ksocknal_alloc_tx_noop(cookie, !!remote); + if (tx == NULL) + return -ENOMEM; + + rc = ksocknal_launch_packet(peer->ksnp_ni, tx, peer->ksnp_id); + if (rc == 0) + return 0; + + ksocknal_free_tx(tx); + return rc; +} + +/* (Sender) handle ZC_ACK from sink */ +static int +ksocknal_handle_zcack(ksock_conn_t *conn, __u64 cookie1, __u64 cookie2) +{ + ksock_peer_t *peer = conn->ksnc_peer; + ksock_tx_t *tx; + ksock_tx_t *tmp; + 
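/*
 * A minimal illustrative sketch, not part of the imported Lustre source:
 * the V3 code above packs zero-copy acknowledgements into the two
 * ksm_zc_cookies[] slots of a carrier NOOP as follows --
 *   cookies[0] == 0           a single cookie, stored in cookies[1]
 *   cookies[0] >  cookies[1]  two separate cookies (at most 2 apart)
 *   cookies[0] <  cookies[1]  the inclusive range cookies[0]..cookies[1]
 * zc_ack_count() is a hypothetical helper (not defined by the patch) that
 * mirrors the arithmetic ksocknal_handle_zcack() applies to such a pair.
 */
static inline __u64
zc_ack_count(__u64 cookie1, __u64 cookie2)
{
        if (cookie1 == 0)               /* single cookie carried in cookie2 */
                return 1;
        if (cookie1 > cookie2)          /* two separate cookies */
                return 2;
        return cookie2 - cookie1 + 1;   /* inclusive range */
}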
LIST_HEAD(zlist); + int count; + + if (cookie1 == 0) + cookie1 = cookie2; + + count = (cookie1 > cookie2) ? 2 : (cookie2 - cookie1 + 1); + + if (cookie2 == SOCKNAL_KEEPALIVE_PING && + conn->ksnc_proto == &ksocknal_protocol_v3x) { + /* keepalive PING for V3.x, just ignore it */ + return count == 1 ? 0 : -EPROTO; + } + + spin_lock(&peer->ksnp_lock); + + list_for_each_entry_safe(tx, tmp, + &peer->ksnp_zc_req_list, tx_zc_list) { + __u64 c = tx->tx_msg.ksm_zc_cookies[0]; + + if (c == cookie1 || c == cookie2 || (cookie1 < c && c < cookie2)) { + tx->tx_msg.ksm_zc_cookies[0] = 0; + list_del(&tx->tx_zc_list); + list_add(&tx->tx_zc_list, &zlist); + + if (--count == 0) + break; + } + } + + spin_unlock(&peer->ksnp_lock); + + while (!list_empty(&zlist)) { + tx = list_entry(zlist.next, ksock_tx_t, tx_zc_list); + list_del(&tx->tx_zc_list); + ksocknal_tx_decref(tx); + } + + return count == 0 ? 0 : -EPROTO; +} + +static int +ksocknal_send_hello_v1(ksock_conn_t *conn, ksock_hello_msg_t *hello) +{ + struct socket *sock = conn->ksnc_sock; + lnet_hdr_t *hdr; + lnet_magicversion_t *hmv; + int rc; + int i; + + CLASSERT(sizeof(lnet_magicversion_t) == offsetof(lnet_hdr_t, src_nid)); + + LIBCFS_ALLOC(hdr, sizeof(*hdr)); + if (hdr == NULL) { + CERROR("Can't allocate lnet_hdr_t\n"); + return -ENOMEM; + } + + hmv = (lnet_magicversion_t *)&hdr->dest_nid; + + /* Re-organize V2.x message header to V1.x (lnet_hdr_t) + * header and send out */ + hmv->magic = cpu_to_le32 (LNET_PROTO_TCP_MAGIC); + hmv->version_major = cpu_to_le16 (KSOCK_PROTO_V1_MAJOR); + hmv->version_minor = cpu_to_le16 (KSOCK_PROTO_V1_MINOR); + + if (the_lnet.ln_testprotocompat != 0) { + /* single-shot proto check */ + LNET_LOCK(); + if ((the_lnet.ln_testprotocompat & 1) != 0) { + hmv->version_major++; /* just different! */ + the_lnet.ln_testprotocompat &= ~1; + } + if ((the_lnet.ln_testprotocompat & 2) != 0) { + hmv->magic = LNET_PROTO_MAGIC; + the_lnet.ln_testprotocompat &= ~2; + } + LNET_UNLOCK(); + } + + hdr->src_nid = cpu_to_le64 (hello->kshm_src_nid); + hdr->src_pid = cpu_to_le32 (hello->kshm_src_pid); + hdr->type = cpu_to_le32 (LNET_MSG_HELLO); + hdr->payload_length = cpu_to_le32 (hello->kshm_nips * sizeof(__u32)); + hdr->msg.hello.type = cpu_to_le32 (hello->kshm_ctype); + hdr->msg.hello.incarnation = cpu_to_le64 (hello->kshm_src_incarnation); + + rc = libcfs_sock_write(sock, hdr, sizeof(*hdr), + lnet_acceptor_timeout()); + + if (rc != 0) { + CNETERR("Error %d sending HELLO hdr to %pI4h/%d\n", + rc, &conn->ksnc_ipaddr, conn->ksnc_port); + goto out; + } + + if (hello->kshm_nips == 0) + goto out; + + for (i = 0; i < (int) hello->kshm_nips; i++) { + hello->kshm_ips[i] = __cpu_to_le32 (hello->kshm_ips[i]); + } + + rc = libcfs_sock_write(sock, hello->kshm_ips, + hello->kshm_nips * sizeof(__u32), + lnet_acceptor_timeout()); + if (rc != 0) { + CNETERR("Error %d sending HELLO payload (%d) to %pI4h/%d\n", + rc, hello->kshm_nips, + &conn->ksnc_ipaddr, conn->ksnc_port); + } +out: + LIBCFS_FREE(hdr, sizeof(*hdr)); + + return rc; +} + +static int +ksocknal_send_hello_v2(ksock_conn_t *conn, ksock_hello_msg_t *hello) +{ + struct socket *sock = conn->ksnc_sock; + int rc; + + hello->kshm_magic = LNET_PROTO_MAGIC; + hello->kshm_version = conn->ksnc_proto->pro_version; + + if (the_lnet.ln_testprotocompat != 0) { + /* single-shot proto check */ + LNET_LOCK(); + if ((the_lnet.ln_testprotocompat & 1) != 0) { + hello->kshm_version++; /* just different! 
*/ + the_lnet.ln_testprotocompat &= ~1; + } + LNET_UNLOCK(); + } + + rc = libcfs_sock_write(sock, hello, offsetof(ksock_hello_msg_t, kshm_ips), + lnet_acceptor_timeout()); + + if (rc != 0) { + CNETERR("Error %d sending HELLO hdr to %pI4h/%d\n", + rc, &conn->ksnc_ipaddr, conn->ksnc_port); + return rc; + } + + if (hello->kshm_nips == 0) + return 0; + + rc = libcfs_sock_write(sock, hello->kshm_ips, + hello->kshm_nips * sizeof(__u32), + lnet_acceptor_timeout()); + if (rc != 0) { + CNETERR("Error %d sending HELLO payload (%d) to %pI4h/%d\n", + rc, hello->kshm_nips, + &conn->ksnc_ipaddr, conn->ksnc_port); + } + + return rc; +} + +static int +ksocknal_recv_hello_v1(ksock_conn_t *conn, ksock_hello_msg_t *hello, + int timeout) +{ + struct socket *sock = conn->ksnc_sock; + lnet_hdr_t *hdr; + int rc; + int i; + + LIBCFS_ALLOC(hdr, sizeof(*hdr)); + if (hdr == NULL) { + CERROR("Can't allocate lnet_hdr_t\n"); + return -ENOMEM; + } + + rc = libcfs_sock_read(sock, &hdr->src_nid, + sizeof(*hdr) - offsetof(lnet_hdr_t, src_nid), + timeout); + if (rc != 0) { + CERROR("Error %d reading rest of HELLO hdr from %pI4h\n", + rc, &conn->ksnc_ipaddr); + LASSERT(rc < 0 && rc != -EALREADY); + goto out; + } + + /* ...and check we got what we expected */ + if (hdr->type != cpu_to_le32 (LNET_MSG_HELLO)) { + CERROR("Expecting a HELLO hdr, but got type %d from %pI4h\n", + le32_to_cpu(hdr->type), + &conn->ksnc_ipaddr); + rc = -EPROTO; + goto out; + } + + hello->kshm_src_nid = le64_to_cpu(hdr->src_nid); + hello->kshm_src_pid = le32_to_cpu(hdr->src_pid); + hello->kshm_src_incarnation = le64_to_cpu(hdr->msg.hello.incarnation); + hello->kshm_ctype = le32_to_cpu(hdr->msg.hello.type); + hello->kshm_nips = le32_to_cpu(hdr->payload_length) / + sizeof(__u32); + + if (hello->kshm_nips > LNET_MAX_INTERFACES) { + CERROR("Bad nips %d from ip %pI4h\n", + hello->kshm_nips, &conn->ksnc_ipaddr); + rc = -EPROTO; + goto out; + } + + if (hello->kshm_nips == 0) + goto out; + + rc = libcfs_sock_read(sock, hello->kshm_ips, + hello->kshm_nips * sizeof(__u32), timeout); + if (rc != 0) { + CERROR("Error %d reading IPs from ip %pI4h\n", + rc, &conn->ksnc_ipaddr); + LASSERT(rc < 0 && rc != -EALREADY); + goto out; + } + + for (i = 0; i < (int) hello->kshm_nips; i++) { + hello->kshm_ips[i] = __le32_to_cpu(hello->kshm_ips[i]); + + if (hello->kshm_ips[i] == 0) { + CERROR("Zero IP[%d] from ip %pI4h\n", + i, &conn->ksnc_ipaddr); + rc = -EPROTO; + break; + } + } +out: + LIBCFS_FREE(hdr, sizeof(*hdr)); + + return rc; +} + +static int +ksocknal_recv_hello_v2(ksock_conn_t *conn, ksock_hello_msg_t *hello, int timeout) +{ + struct socket *sock = conn->ksnc_sock; + int rc; + int i; + + if (hello->kshm_magic == LNET_PROTO_MAGIC) + conn->ksnc_flip = 0; + else + conn->ksnc_flip = 1; + + rc = libcfs_sock_read(sock, &hello->kshm_src_nid, + offsetof(ksock_hello_msg_t, kshm_ips) - + offsetof(ksock_hello_msg_t, kshm_src_nid), + timeout); + if (rc != 0) { + CERROR("Error %d reading HELLO from %pI4h\n", + rc, &conn->ksnc_ipaddr); + LASSERT(rc < 0 && rc != -EALREADY); + return rc; + } + + if (conn->ksnc_flip) { + __swab32s(&hello->kshm_src_pid); + __swab64s(&hello->kshm_src_nid); + __swab32s(&hello->kshm_dst_pid); + __swab64s(&hello->kshm_dst_nid); + __swab64s(&hello->kshm_src_incarnation); + __swab64s(&hello->kshm_dst_incarnation); + __swab32s(&hello->kshm_ctype); + __swab32s(&hello->kshm_nips); + } + + if (hello->kshm_nips > LNET_MAX_INTERFACES) { + CERROR("Bad nips %d from ip %pI4h\n", + hello->kshm_nips, &conn->ksnc_ipaddr); + return -EPROTO; + } + + if (hello->kshm_nips 
== 0) + return 0; + + rc = libcfs_sock_read(sock, hello->kshm_ips, + hello->kshm_nips * sizeof(__u32), timeout); + if (rc != 0) { + CERROR("Error %d reading IPs from ip %pI4h\n", + rc, &conn->ksnc_ipaddr); + LASSERT(rc < 0 && rc != -EALREADY); + return rc; + } + + for (i = 0; i < (int) hello->kshm_nips; i++) { + if (conn->ksnc_flip) + __swab32s(&hello->kshm_ips[i]); + + if (hello->kshm_ips[i] == 0) { + CERROR("Zero IP[%d] from ip %pI4h\n", + i, &conn->ksnc_ipaddr); + return -EPROTO; + } + } + + return 0; +} + +static void +ksocknal_pack_msg_v1(ksock_tx_t *tx) +{ + /* V1.x has no KSOCK_MSG_NOOP */ + LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP); + LASSERT(tx->tx_lnetmsg != NULL); + + tx->tx_iov[0].iov_base = &tx->tx_lnetmsg->msg_hdr; + tx->tx_iov[0].iov_len = sizeof(lnet_hdr_t); + + tx->tx_resid = tx->tx_nob = tx->tx_lnetmsg->msg_len + sizeof(lnet_hdr_t); +} + +static void +ksocknal_pack_msg_v2(ksock_tx_t *tx) +{ + tx->tx_iov[0].iov_base = &tx->tx_msg; + + if (tx->tx_lnetmsg != NULL) { + LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP); + + tx->tx_msg.ksm_u.lnetmsg.ksnm_hdr = tx->tx_lnetmsg->msg_hdr; + tx->tx_iov[0].iov_len = sizeof(ksock_msg_t); + tx->tx_resid = tx->tx_nob = sizeof(ksock_msg_t) + tx->tx_lnetmsg->msg_len; + } else { + LASSERT(tx->tx_msg.ksm_type == KSOCK_MSG_NOOP); + + tx->tx_iov[0].iov_len = offsetof(ksock_msg_t, ksm_u.lnetmsg.ksnm_hdr); + tx->tx_resid = tx->tx_nob = offsetof(ksock_msg_t, ksm_u.lnetmsg.ksnm_hdr); + } + /* Don't checksum before start sending, because packet can be piggybacked with ACK */ +} + +static void +ksocknal_unpack_msg_v1(ksock_msg_t *msg) +{ + msg->ksm_csum = 0; + msg->ksm_type = KSOCK_MSG_LNET; + msg->ksm_zc_cookies[0] = msg->ksm_zc_cookies[1] = 0; +} + +static void +ksocknal_unpack_msg_v2(ksock_msg_t *msg) +{ + return; /* Do nothing */ +} + +ksock_proto_t ksocknal_protocol_v1x = { + .pro_version = KSOCK_PROTO_V1, + .pro_send_hello = ksocknal_send_hello_v1, + .pro_recv_hello = ksocknal_recv_hello_v1, + .pro_pack = ksocknal_pack_msg_v1, + .pro_unpack = ksocknal_unpack_msg_v1, + .pro_queue_tx_msg = ksocknal_queue_tx_msg_v1, + .pro_handle_zcreq = NULL, + .pro_handle_zcack = NULL, + .pro_queue_tx_zcack = NULL, + .pro_match_tx = ksocknal_match_tx +}; + +ksock_proto_t ksocknal_protocol_v2x = { + .pro_version = KSOCK_PROTO_V2, + .pro_send_hello = ksocknal_send_hello_v2, + .pro_recv_hello = ksocknal_recv_hello_v2, + .pro_pack = ksocknal_pack_msg_v2, + .pro_unpack = ksocknal_unpack_msg_v2, + .pro_queue_tx_msg = ksocknal_queue_tx_msg_v2, + .pro_queue_tx_zcack = ksocknal_queue_tx_zcack_v2, + .pro_handle_zcreq = ksocknal_handle_zcreq, + .pro_handle_zcack = ksocknal_handle_zcack, + .pro_match_tx = ksocknal_match_tx +}; + +ksock_proto_t ksocknal_protocol_v3x = { + .pro_version = KSOCK_PROTO_V3, + .pro_send_hello = ksocknal_send_hello_v2, + .pro_recv_hello = ksocknal_recv_hello_v2, + .pro_pack = ksocknal_pack_msg_v2, + .pro_unpack = ksocknal_unpack_msg_v2, + .pro_queue_tx_msg = ksocknal_queue_tx_msg_v2, + .pro_queue_tx_zcack = ksocknal_queue_tx_zcack_v3, + .pro_handle_zcreq = ksocknal_handle_zcreq, + .pro_handle_zcack = ksocknal_handle_zcack, + .pro_match_tx = ksocknal_match_tx_v3 +}; diff --git a/kernel/drivers/staging/lustre/lnet/lnet/Makefile b/kernel/drivers/staging/lustre/lnet/lnet/Makefile new file mode 100644 index 000000000..336b8ea4f --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/lnet/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_LNET) += lnet.o + +lnet-y := api-ni.o config.o lib-me.o lib-msg.o lib-eq.o \ + lib-md.o lib-ptl.o lib-move.o module.o lo.o 
router.o \ + router_proc.o acceptor.o peer.o diff --git a/kernel/drivers/staging/lustre/lnet/lnet/acceptor.c b/kernel/drivers/staging/lustre/lnet/lnet/acceptor.c new file mode 100644 index 000000000..72fd1bf70 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/lnet/acceptor.c @@ -0,0 +1,500 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LNET +#include "../../include/linux/lnet/lib-lnet.h" + + +static int accept_port = 988; +static int accept_backlog = 127; +static int accept_timeout = 5; + +static struct { + int pta_shutdown; + struct socket *pta_sock; + struct completion pta_signal; +} lnet_acceptor_state; + +int +lnet_acceptor_port(void) +{ + return accept_port; +} +EXPORT_SYMBOL(lnet_acceptor_port); + +static inline int +lnet_accept_magic(__u32 magic, __u32 constant) +{ + return (magic == constant || + magic == __swab32(constant)); +} + +static char *accept = "secure"; + +module_param(accept, charp, 0444); +MODULE_PARM_DESC(accept, "Accept connections (secure|all|none)"); +module_param(accept_port, int, 0444); +MODULE_PARM_DESC(accept_port, "Acceptor's port (same on all nodes)"); +module_param(accept_backlog, int, 0444); +MODULE_PARM_DESC(accept_backlog, "Acceptor's listen backlog"); +module_param(accept_timeout, int, 0644); +MODULE_PARM_DESC(accept_timeout, "Acceptor's timeout (seconds)"); + +static char *accept_type; + +static int +lnet_acceptor_get_tunables(void) +{ + /* Userland acceptor uses 'accept_type' instead of 'accept', due to + * conflict with 'accept(2)', but kernel acceptor still uses 'accept' + * for compatibility. Hence the trick. 
*/ + accept_type = accept; + return 0; +} + +int +lnet_acceptor_timeout(void) +{ + return accept_timeout; +} +EXPORT_SYMBOL(lnet_acceptor_timeout); + +void +lnet_connect_console_error(int rc, lnet_nid_t peer_nid, + __u32 peer_ip, int peer_port) +{ + switch (rc) { + /* "normal" errors */ + case -ECONNREFUSED: + CNETERR("Connection to %s at host %pI4h on port %d was refused: check that Lustre is running on that node.\n", + libcfs_nid2str(peer_nid), + &peer_ip, peer_port); + break; + case -EHOSTUNREACH: + case -ENETUNREACH: + CNETERR("Connection to %s at host %pI4h was unreachable: the network or that node may be down, or Lustre may be misconfigured.\n", + libcfs_nid2str(peer_nid), &peer_ip); + break; + case -ETIMEDOUT: + CNETERR("Connection to %s at host %pI4h on port %d took too long: that node may be hung or experiencing high load.\n", + libcfs_nid2str(peer_nid), + &peer_ip, peer_port); + break; + case -ECONNRESET: + LCONSOLE_ERROR_MSG(0x11b, "Connection to %s at host %pI4h on port %d was reset: is it running a compatible version of Lustre and is %s one of its NIDs?\n", + libcfs_nid2str(peer_nid), + &peer_ip, peer_port, + libcfs_nid2str(peer_nid)); + break; + case -EPROTO: + LCONSOLE_ERROR_MSG(0x11c, "Protocol error connecting to %s at host %pI4h on port %d: is it running a compatible version of Lustre?\n", + libcfs_nid2str(peer_nid), + &peer_ip, peer_port); + break; + case -EADDRINUSE: + LCONSOLE_ERROR_MSG(0x11d, "No privileged ports available to connect to %s at host %pI4h on port %d\n", + libcfs_nid2str(peer_nid), + &peer_ip, peer_port); + break; + default: + LCONSOLE_ERROR_MSG(0x11e, "Unexpected error %d connecting to %s at host %pI4h on port %d\n", + rc, libcfs_nid2str(peer_nid), + &peer_ip, peer_port); + break; + } +} +EXPORT_SYMBOL(lnet_connect_console_error); + +int +lnet_connect(struct socket **sockp, lnet_nid_t peer_nid, + __u32 local_ip, __u32 peer_ip, int peer_port) +{ + lnet_acceptor_connreq_t cr; + struct socket *sock; + int rc; + int port; + int fatal; + + CLASSERT(sizeof(cr) <= 16); /* not too big to be on the stack */ + + for (port = LNET_ACCEPTOR_MAX_RESERVED_PORT; + port >= LNET_ACCEPTOR_MIN_RESERVED_PORT; + --port) { + /* Iterate through reserved ports. 
*/ + + rc = libcfs_sock_connect(&sock, &fatal, + local_ip, port, + peer_ip, peer_port); + if (rc != 0) { + if (fatal) + goto failed; + continue; + } + + CLASSERT(LNET_PROTO_ACCEPTOR_VERSION == 1); + + cr.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC; + cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION; + cr.acr_nid = peer_nid; + + if (the_lnet.ln_testprotocompat != 0) { + /* single-shot proto check */ + lnet_net_lock(LNET_LOCK_EX); + if ((the_lnet.ln_testprotocompat & 4) != 0) { + cr.acr_version++; + the_lnet.ln_testprotocompat &= ~4; + } + if ((the_lnet.ln_testprotocompat & 8) != 0) { + cr.acr_magic = LNET_PROTO_MAGIC; + the_lnet.ln_testprotocompat &= ~8; + } + lnet_net_unlock(LNET_LOCK_EX); + } + + rc = libcfs_sock_write(sock, &cr, sizeof(cr), + accept_timeout); + if (rc != 0) + goto failed_sock; + + *sockp = sock; + return 0; + } + + rc = -EADDRINUSE; + goto failed; + + failed_sock: + libcfs_sock_release(sock); + failed: + lnet_connect_console_error(rc, peer_nid, peer_ip, peer_port); + return rc; +} +EXPORT_SYMBOL(lnet_connect); + + +/* Below is the code common for both kernel and MT user-space */ + +static int +lnet_accept(struct socket *sock, __u32 magic) +{ + lnet_acceptor_connreq_t cr; + __u32 peer_ip; + int peer_port; + int rc; + int flip; + lnet_ni_t *ni; + char *str; + + LASSERT(sizeof(cr) <= 16); /* not too big for the stack */ + + rc = libcfs_sock_getaddr(sock, 1, &peer_ip, &peer_port); + LASSERT(rc == 0); /* we succeeded before */ + + if (!lnet_accept_magic(magic, LNET_PROTO_ACCEPTOR_MAGIC)) { + + if (lnet_accept_magic(magic, LNET_PROTO_MAGIC)) { + /* future version compatibility! + * When LNET unifies protocols over all LNDs, the first + * thing sent will be a version query. I send back + * LNET_PROTO_ACCEPTOR_MAGIC to tell her I'm "old" */ + + memset(&cr, 0, sizeof(cr)); + cr.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC; + cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION; + rc = libcfs_sock_write(sock, &cr, sizeof(cr), + accept_timeout); + + if (rc != 0) + CERROR("Error sending magic+version in response to LNET magic from %pI4h: %d\n", + &peer_ip, rc); + return -EPROTO; + } + + if (magic == le32_to_cpu(LNET_PROTO_TCP_MAGIC)) + str = "'old' socknal/tcpnal"; + else if (lnet_accept_magic(magic, LNET_PROTO_RA_MAGIC)) + str = "'old' ranal"; + else + str = "unrecognised"; + + LCONSOLE_ERROR_MSG(0x11f, "Refusing connection from %pI4h magic %08x: %s acceptor protocol\n", + &peer_ip, magic, str); + return -EPROTO; + } + + flip = (magic != LNET_PROTO_ACCEPTOR_MAGIC); + + rc = libcfs_sock_read(sock, &cr.acr_version, + sizeof(cr.acr_version), + accept_timeout); + if (rc != 0) { + CERROR("Error %d reading connection request version from %pI4h\n", + rc, &peer_ip); + return -EIO; + } + + if (flip) + __swab32s(&cr.acr_version); + + if (cr.acr_version != LNET_PROTO_ACCEPTOR_VERSION) { + /* future version compatibility! + * An acceptor-specific protocol rev will first send a version + * query. I send back my current version to tell her I'm + * "old". 
*/ + int peer_version = cr.acr_version; + + memset(&cr, 0, sizeof(cr)); + cr.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC; + cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION; + + rc = libcfs_sock_write(sock, &cr, sizeof(cr), + accept_timeout); + + if (rc != 0) + CERROR("Error sending magic+version in response to version %d from %pI4h: %d\n", + peer_version, &peer_ip, rc); + return -EPROTO; + } + + rc = libcfs_sock_read(sock, &cr.acr_nid, + sizeof(cr) - + offsetof(lnet_acceptor_connreq_t, acr_nid), + accept_timeout); + if (rc != 0) { + CERROR("Error %d reading connection request from %pI4h\n", + rc, &peer_ip); + return -EIO; + } + + if (flip) + __swab64s(&cr.acr_nid); + + ni = lnet_net2ni(LNET_NIDNET(cr.acr_nid)); + if (ni == NULL || /* no matching net */ + ni->ni_nid != cr.acr_nid) { /* right NET, wrong NID! */ + if (ni != NULL) + lnet_ni_decref(ni); + LCONSOLE_ERROR_MSG(0x120, "Refusing connection from %pI4h for %s: No matching NI\n", + &peer_ip, libcfs_nid2str(cr.acr_nid)); + return -EPERM; + } + + if (ni->ni_lnd->lnd_accept == NULL) { + /* This catches a request for the loopback LND */ + lnet_ni_decref(ni); + LCONSOLE_ERROR_MSG(0x121, "Refusing connection from %pI4h for %s: NI doesn not accept IP connections\n", + &peer_ip, libcfs_nid2str(cr.acr_nid)); + return -EPERM; + } + + CDEBUG(D_NET, "Accept %s from %pI4h\n", + libcfs_nid2str(cr.acr_nid), &peer_ip); + + rc = ni->ni_lnd->lnd_accept(ni, sock); + + lnet_ni_decref(ni); + return rc; +} + +static int +lnet_acceptor(void *arg) +{ + struct socket *newsock; + int rc; + __u32 magic; + __u32 peer_ip; + int peer_port; + int secure = (int)((long_ptr_t)arg); + + LASSERT(lnet_acceptor_state.pta_sock == NULL); + + cfs_block_allsigs(); + + rc = libcfs_sock_listen(&lnet_acceptor_state.pta_sock, + 0, accept_port, accept_backlog); + if (rc != 0) { + if (rc == -EADDRINUSE) + LCONSOLE_ERROR_MSG(0x122, "Can't start acceptor on port %d: port already in use\n", + accept_port); + else + LCONSOLE_ERROR_MSG(0x123, "Can't start acceptor on port %d: unexpected error %d\n", + accept_port, rc); + + lnet_acceptor_state.pta_sock = NULL; + } else { + LCONSOLE(0, "Accept %s, port %d\n", accept_type, accept_port); + } + + /* set init status and unblock parent */ + lnet_acceptor_state.pta_shutdown = rc; + complete(&lnet_acceptor_state.pta_signal); + + if (rc != 0) + return rc; + + while (!lnet_acceptor_state.pta_shutdown) { + + rc = libcfs_sock_accept(&newsock, lnet_acceptor_state.pta_sock); + if (rc != 0) { + if (rc != -EAGAIN) { + CWARN("Accept error %d: pausing...\n", rc); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + } + continue; + } + + /* maybe we're waken up with libcfs_sock_abort_accept() */ + if (lnet_acceptor_state.pta_shutdown) { + libcfs_sock_release(newsock); + break; + } + + rc = libcfs_sock_getaddr(newsock, 1, &peer_ip, &peer_port); + if (rc != 0) { + CERROR("Can't determine new connection's address\n"); + goto failed; + } + + if (secure && peer_port > LNET_ACCEPTOR_MAX_RESERVED_PORT) { + CERROR("Refusing connection from %pI4h: insecure port %d\n", + &peer_ip, peer_port); + goto failed; + } + + rc = libcfs_sock_read(newsock, &magic, sizeof(magic), + accept_timeout); + if (rc != 0) { + CERROR("Error %d reading connection request from %pI4h\n", + rc, &peer_ip); + goto failed; + } + + rc = lnet_accept(newsock, magic); + if (rc != 0) + goto failed; + + continue; + +failed: + libcfs_sock_release(newsock); + } + + libcfs_sock_release(lnet_acceptor_state.pta_sock); + lnet_acceptor_state.pta_sock = NULL; + + CDEBUG(D_NET, 
"Acceptor stopping\n"); + + /* unblock lnet_acceptor_stop() */ + complete(&lnet_acceptor_state.pta_signal); + return 0; +} + +static inline int +accept2secure(const char *acc, long *sec) +{ + if (!strcmp(acc, "secure")) { + *sec = 1; + return 1; + } else if (!strcmp(acc, "all")) { + *sec = 0; + return 1; + } else if (!strcmp(acc, "none")) { + return 0; + } + + LCONSOLE_ERROR_MSG(0x124, "Can't parse 'accept=\"%s\"'\n", + acc); + return -EINVAL; +} + +int +lnet_acceptor_start(void) +{ + int rc; + long rc2; + long secure; + + LASSERT(lnet_acceptor_state.pta_sock == NULL); + + rc = lnet_acceptor_get_tunables(); + if (rc != 0) + return rc; + + + init_completion(&lnet_acceptor_state.pta_signal); + rc = accept2secure(accept_type, &secure); + if (rc <= 0) + return rc; + + if (lnet_count_acceptor_nis() == 0) /* not required */ + return 0; + + rc2 = PTR_ERR(kthread_run(lnet_acceptor, + (void *)(ulong_ptr_t)secure, + "acceptor_%03ld", secure)); + if (IS_ERR_VALUE(rc2)) { + CERROR("Can't start acceptor thread: %ld\n", rc2); + + return -ESRCH; + } + + /* wait for acceptor to startup */ + wait_for_completion(&lnet_acceptor_state.pta_signal); + + if (!lnet_acceptor_state.pta_shutdown) { + /* started OK */ + LASSERT(lnet_acceptor_state.pta_sock != NULL); + return 0; + } + + LASSERT(lnet_acceptor_state.pta_sock == NULL); + + return -ENETDOWN; +} + +void +lnet_acceptor_stop(void) +{ + if (lnet_acceptor_state.pta_sock == NULL) /* not running */ + return; + + lnet_acceptor_state.pta_shutdown = 1; + libcfs_sock_abort_accept(lnet_acceptor_state.pta_sock); + + /* block until acceptor signals exit */ + wait_for_completion(&lnet_acceptor_state.pta_signal); +} diff --git a/kernel/drivers/staging/lustre/lnet/lnet/api-ni.c b/kernel/drivers/staging/lustre/lnet/lnet/api-ni.c new file mode 100644 index 000000000..4a14e5109 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/lnet/api-ni.c @@ -0,0 +1,1940 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#define DEBUG_SUBSYSTEM S_LNET +#include "../../include/linux/lnet/lib-lnet.h" +#include +#include + +#define D_LNI D_CONSOLE + +lnet_t the_lnet; /* THE state of the network */ +EXPORT_SYMBOL(the_lnet); + + +static char *ip2nets = ""; +module_param(ip2nets, charp, 0444); +MODULE_PARM_DESC(ip2nets, "LNET network <- IP table"); + +static char *networks = ""; +module_param(networks, charp, 0444); +MODULE_PARM_DESC(networks, "local networks"); + +static char *routes = ""; +module_param(routes, charp, 0444); +MODULE_PARM_DESC(routes, "routes to non-local networks"); + +static int rnet_htable_size = LNET_REMOTE_NETS_HASH_DEFAULT; +module_param(rnet_htable_size, int, 0444); +MODULE_PARM_DESC(rnet_htable_size, "size of remote network hash table"); + +static char * +lnet_get_routes(void) +{ + return routes; +} + +static char * +lnet_get_networks(void) +{ + char *nets; + int rc; + + if (*networks != 0 && *ip2nets != 0) { + LCONSOLE_ERROR_MSG(0x101, "Please specify EITHER 'networks' or 'ip2nets' but not both at once\n"); + return NULL; + } + + if (*ip2nets != 0) { + rc = lnet_parse_ip2nets(&nets, ip2nets); + return (rc == 0) ? nets : NULL; + } + + if (*networks != 0) + return networks; + + return "tcp"; +} + +static void +lnet_init_locks(void) +{ + spin_lock_init(&the_lnet.ln_eq_wait_lock); + init_waitqueue_head(&the_lnet.ln_eq_waitq); + mutex_init(&the_lnet.ln_lnd_mutex); + mutex_init(&the_lnet.ln_api_mutex); +} + +static void +lnet_fini_locks(void) +{ +} + + +static int +lnet_create_remote_nets_table(void) +{ + int i; + struct list_head *hash; + + LASSERT(the_lnet.ln_remote_nets_hash == NULL); + LASSERT(the_lnet.ln_remote_nets_hbits > 0); + LIBCFS_ALLOC(hash, LNET_REMOTE_NETS_HASH_SIZE * sizeof(*hash)); + if (hash == NULL) { + CERROR("Failed to create remote nets hash table\n"); + return -ENOMEM; + } + + for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) + INIT_LIST_HEAD(&hash[i]); + the_lnet.ln_remote_nets_hash = hash; + return 0; +} + +static void +lnet_destroy_remote_nets_table(void) +{ + int i; + + if (the_lnet.ln_remote_nets_hash == NULL) + return; + + for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) + LASSERT(list_empty(&the_lnet.ln_remote_nets_hash[i])); + + LIBCFS_FREE(the_lnet.ln_remote_nets_hash, + LNET_REMOTE_NETS_HASH_SIZE * + sizeof(the_lnet.ln_remote_nets_hash[0])); + the_lnet.ln_remote_nets_hash = NULL; +} + +static void +lnet_destroy_locks(void) +{ + if (the_lnet.ln_res_lock != NULL) { + cfs_percpt_lock_free(the_lnet.ln_res_lock); + the_lnet.ln_res_lock = NULL; + } + + if (the_lnet.ln_net_lock != NULL) { + cfs_percpt_lock_free(the_lnet.ln_net_lock); + the_lnet.ln_net_lock = NULL; + } + + lnet_fini_locks(); +} + +static int +lnet_create_locks(void) +{ + lnet_init_locks(); + + the_lnet.ln_res_lock = cfs_percpt_lock_alloc(lnet_cpt_table()); + if (the_lnet.ln_res_lock == NULL) + goto failed; + + the_lnet.ln_net_lock = cfs_percpt_lock_alloc(lnet_cpt_table()); + if (the_lnet.ln_net_lock == NULL) + goto failed; + + return 0; + + failed: + lnet_destroy_locks(); + return -ENOMEM; +} + +static void lnet_assert_wire_constants(void) +{ + /* Wire protocol assertions generated by 'wirecheck' + * running on Linux robert.bartonsoftware.com 2.6.8-1.521 + * #1 Mon Aug 16 09:01:18 EDT 2004 i686 athlon i386 GNU/Linux + * with gcc version 3.3.3 20040412 (Red Hat Linux 3.3.3-7) */ + + /* Constants... 
*/ + CLASSERT(LNET_PROTO_TCP_MAGIC == 0xeebc0ded); + CLASSERT(LNET_PROTO_TCP_VERSION_MAJOR == 1); + CLASSERT(LNET_PROTO_TCP_VERSION_MINOR == 0); + CLASSERT(LNET_MSG_ACK == 0); + CLASSERT(LNET_MSG_PUT == 1); + CLASSERT(LNET_MSG_GET == 2); + CLASSERT(LNET_MSG_REPLY == 3); + CLASSERT(LNET_MSG_HELLO == 4); + + /* Checks for struct ptl_handle_wire_t */ + CLASSERT((int)sizeof(lnet_handle_wire_t) == 16); + CLASSERT((int)offsetof(lnet_handle_wire_t, wh_interface_cookie) == 0); + CLASSERT((int)sizeof(((lnet_handle_wire_t *)0)->wh_interface_cookie) == 8); + CLASSERT((int)offsetof(lnet_handle_wire_t, wh_object_cookie) == 8); + CLASSERT((int)sizeof(((lnet_handle_wire_t *)0)->wh_object_cookie) == 8); + + /* Checks for struct lnet_magicversion_t */ + CLASSERT((int)sizeof(lnet_magicversion_t) == 8); + CLASSERT((int)offsetof(lnet_magicversion_t, magic) == 0); + CLASSERT((int)sizeof(((lnet_magicversion_t *)0)->magic) == 4); + CLASSERT((int)offsetof(lnet_magicversion_t, version_major) == 4); + CLASSERT((int)sizeof(((lnet_magicversion_t *)0)->version_major) == 2); + CLASSERT((int)offsetof(lnet_magicversion_t, version_minor) == 6); + CLASSERT((int)sizeof(((lnet_magicversion_t *)0)->version_minor) == 2); + + /* Checks for struct lnet_hdr_t */ + CLASSERT((int)sizeof(lnet_hdr_t) == 72); + CLASSERT((int)offsetof(lnet_hdr_t, dest_nid) == 0); + CLASSERT((int)sizeof(((lnet_hdr_t *)0)->dest_nid) == 8); + CLASSERT((int)offsetof(lnet_hdr_t, src_nid) == 8); + CLASSERT((int)sizeof(((lnet_hdr_t *)0)->src_nid) == 8); + CLASSERT((int)offsetof(lnet_hdr_t, dest_pid) == 16); + CLASSERT((int)sizeof(((lnet_hdr_t *)0)->dest_pid) == 4); + CLASSERT((int)offsetof(lnet_hdr_t, src_pid) == 20); + CLASSERT((int)sizeof(((lnet_hdr_t *)0)->src_pid) == 4); + CLASSERT((int)offsetof(lnet_hdr_t, type) == 24); + CLASSERT((int)sizeof(((lnet_hdr_t *)0)->type) == 4); + CLASSERT((int)offsetof(lnet_hdr_t, payload_length) == 28); + CLASSERT((int)sizeof(((lnet_hdr_t *)0)->payload_length) == 4); + CLASSERT((int)offsetof(lnet_hdr_t, msg) == 32); + CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg) == 40); + + /* Ack */ + CLASSERT((int)offsetof(lnet_hdr_t, msg.ack.dst_wmd) == 32); + CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.ack.dst_wmd) == 16); + CLASSERT((int)offsetof(lnet_hdr_t, msg.ack.match_bits) == 48); + CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.ack.match_bits) == 8); + CLASSERT((int)offsetof(lnet_hdr_t, msg.ack.mlength) == 56); + CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.ack.mlength) == 4); + + /* Put */ + CLASSERT((int)offsetof(lnet_hdr_t, msg.put.ack_wmd) == 32); + CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.put.ack_wmd) == 16); + CLASSERT((int)offsetof(lnet_hdr_t, msg.put.match_bits) == 48); + CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.put.match_bits) == 8); + CLASSERT((int)offsetof(lnet_hdr_t, msg.put.hdr_data) == 56); + CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.put.hdr_data) == 8); + CLASSERT((int)offsetof(lnet_hdr_t, msg.put.ptl_index) == 64); + CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.put.ptl_index) == 4); + CLASSERT((int)offsetof(lnet_hdr_t, msg.put.offset) == 68); + CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.put.offset) == 4); + + /* Get */ + CLASSERT((int)offsetof(lnet_hdr_t, msg.get.return_wmd) == 32); + CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.get.return_wmd) == 16); + CLASSERT((int)offsetof(lnet_hdr_t, msg.get.match_bits) == 48); + CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.get.match_bits) == 8); + CLASSERT((int)offsetof(lnet_hdr_t, msg.get.ptl_index) == 56); + CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.get.ptl_index) == 
4); + CLASSERT((int)offsetof(lnet_hdr_t, msg.get.src_offset) == 60); + CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.get.src_offset) == 4); + CLASSERT((int)offsetof(lnet_hdr_t, msg.get.sink_length) == 64); + CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.get.sink_length) == 4); + + /* Reply */ + CLASSERT((int)offsetof(lnet_hdr_t, msg.reply.dst_wmd) == 32); + CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.reply.dst_wmd) == 16); + + /* Hello */ + CLASSERT((int)offsetof(lnet_hdr_t, msg.hello.incarnation) == 32); + CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.hello.incarnation) == 8); + CLASSERT((int)offsetof(lnet_hdr_t, msg.hello.type) == 40); + CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.hello.type) == 4); +} + +static lnd_t * +lnet_find_lnd_by_type(int type) +{ + lnd_t *lnd; + struct list_head *tmp; + + /* holding lnd mutex */ + list_for_each(tmp, &the_lnet.ln_lnds) { + lnd = list_entry(tmp, lnd_t, lnd_list); + + if ((int)lnd->lnd_type == type) + return lnd; + } + + return NULL; +} + +void +lnet_register_lnd(lnd_t *lnd) +{ + LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex); + + LASSERT(the_lnet.ln_init); + LASSERT(libcfs_isknown_lnd(lnd->lnd_type)); + LASSERT(lnet_find_lnd_by_type(lnd->lnd_type) == NULL); + + list_add_tail(&lnd->lnd_list, &the_lnet.ln_lnds); + lnd->lnd_refcount = 0; + + CDEBUG(D_NET, "%s LND registered\n", libcfs_lnd2str(lnd->lnd_type)); + + LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex); +} +EXPORT_SYMBOL(lnet_register_lnd); + +void +lnet_unregister_lnd(lnd_t *lnd) +{ + LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex); + + LASSERT(the_lnet.ln_init); + LASSERT(lnet_find_lnd_by_type(lnd->lnd_type) == lnd); + LASSERT(lnd->lnd_refcount == 0); + + list_del(&lnd->lnd_list); + CDEBUG(D_NET, "%s LND unregistered\n", libcfs_lnd2str(lnd->lnd_type)); + + LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex); +} +EXPORT_SYMBOL(lnet_unregister_lnd); + +void +lnet_counters_get(lnet_counters_t *counters) +{ + lnet_counters_t *ctr; + int i; + + memset(counters, 0, sizeof(*counters)); + + lnet_net_lock(LNET_LOCK_EX); + + cfs_percpt_for_each(ctr, i, the_lnet.ln_counters) { + counters->msgs_max += ctr->msgs_max; + counters->msgs_alloc += ctr->msgs_alloc; + counters->errors += ctr->errors; + counters->send_count += ctr->send_count; + counters->recv_count += ctr->recv_count; + counters->route_count += ctr->route_count; + counters->drop_count += ctr->drop_count; + counters->send_length += ctr->send_length; + counters->recv_length += ctr->recv_length; + counters->route_length += ctr->route_length; + counters->drop_length += ctr->drop_length; + + } + lnet_net_unlock(LNET_LOCK_EX); +} +EXPORT_SYMBOL(lnet_counters_get); + +void +lnet_counters_reset(void) +{ + lnet_counters_t *counters; + int i; + + lnet_net_lock(LNET_LOCK_EX); + + cfs_percpt_for_each(counters, i, the_lnet.ln_counters) + memset(counters, 0, sizeof(lnet_counters_t)); + + lnet_net_unlock(LNET_LOCK_EX); +} +EXPORT_SYMBOL(lnet_counters_reset); + +#ifdef LNET_USE_LIB_FREELIST + +int +lnet_freelist_init(lnet_freelist_t *fl, int n, int size) +{ + char *space; + + LASSERT(n > 0); + + size += offsetof(lnet_freeobj_t, fo_contents); + + LIBCFS_ALLOC(space, n * size); + if (space == NULL) + return -ENOMEM; + + INIT_LIST_HEAD(&fl->fl_list); + fl->fl_objs = space; + fl->fl_nobjs = n; + fl->fl_objsize = size; + + do { + memset(space, 0, size); + list_add((struct list_head *)space, &fl->fl_list); + space += size; + } while (--n != 0); + + return 0; +} + +void +lnet_freelist_fini(lnet_freelist_t *fl) +{ + struct list_head *el; + int count; + + if (fl->fl_nobjs == 0) + return; + + count = 0; + 
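/*
 * A minimal illustrative sketch, not part of the imported Lustre source:
 * lnet_freelist_init() above enlarges every object by
 * offsetof(lnet_freeobj_t, fo_contents) because each free slot is a small
 * list header followed by the caller's payload.  struct free_obj below is
 * a hypothetical stand-in for lnet_freeobj_t (whose full definition is not
 * shown in this hunk), and obj_of()/payload_of() show how one pointer is
 * recovered from the other.  Assumes <linux/list.h> and <linux/stddef.h>.
 */
struct free_obj {
        struct list_head fo_list;       /* links the object into fl_list */
        char             fo_payload[0]; /* caller's object starts here */
};

static inline void *payload_of(struct free_obj *obj)
{
        return obj->fo_payload;
}

static inline struct free_obj *obj_of(void *payload)
{
        return (struct free_obj *)((char *)payload -
                                   offsetof(struct free_obj, fo_payload));
}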
for (el = fl->fl_list.next; el != &fl->fl_list; el = el->next) + count++; + + LASSERT(count == fl->fl_nobjs); + + LIBCFS_FREE(fl->fl_objs, fl->fl_nobjs * fl->fl_objsize); + memset(fl, 0, sizeof(*fl)); +} + +#endif /* LNET_USE_LIB_FREELIST */ + +static __u64 +lnet_create_interface_cookie(void) +{ + /* NB the interface cookie in wire handles guards against delayed + * replies and ACKs appearing valid after reboot. + */ + return ktime_get_ns(); +} + +static char * +lnet_res_type2str(int type) +{ + switch (type) { + default: + LBUG(); + case LNET_COOKIE_TYPE_MD: + return "MD"; + case LNET_COOKIE_TYPE_ME: + return "ME"; + case LNET_COOKIE_TYPE_EQ: + return "EQ"; + } +} + +static void +lnet_res_container_cleanup(struct lnet_res_container *rec) +{ + int count = 0; + + if (rec->rec_type == 0) /* not set yet, it's uninitialized */ + return; + + while (!list_empty(&rec->rec_active)) { + struct list_head *e = rec->rec_active.next; + + list_del_init(e); + if (rec->rec_type == LNET_COOKIE_TYPE_EQ) { + lnet_eq_free(list_entry(e, lnet_eq_t, eq_list)); + + } else if (rec->rec_type == LNET_COOKIE_TYPE_MD) { + lnet_md_free(list_entry(e, lnet_libmd_t, md_list)); + + } else { /* NB: Active MEs should be attached on portals */ + LBUG(); + } + count++; + } + + if (count > 0) { + /* Found alive MD/ME/EQ, user really should unlink/free + * all of them before finalize LNet, but if someone didn't, + * we have to recycle garbage for him */ + CERROR("%d active elements on exit of %s container\n", + count, lnet_res_type2str(rec->rec_type)); + } + +#ifdef LNET_USE_LIB_FREELIST + lnet_freelist_fini(&rec->rec_freelist); +#endif + if (rec->rec_lh_hash != NULL) { + LIBCFS_FREE(rec->rec_lh_hash, + LNET_LH_HASH_SIZE * sizeof(rec->rec_lh_hash[0])); + rec->rec_lh_hash = NULL; + } + + rec->rec_type = 0; /* mark it as finalized */ +} + +static int +lnet_res_container_setup(struct lnet_res_container *rec, + int cpt, int type, int objnum, int objsz) +{ + int rc = 0; + int i; + + LASSERT(rec->rec_type == 0); + + rec->rec_type = type; + INIT_LIST_HEAD(&rec->rec_active); + +#ifdef LNET_USE_LIB_FREELIST + memset(&rec->rec_freelist, 0, sizeof(rec->rec_freelist)); + rc = lnet_freelist_init(&rec->rec_freelist, objnum, objsz); + if (rc != 0) + goto out; +#endif + rec->rec_lh_cookie = (cpt << LNET_COOKIE_TYPE_BITS) | type; + + /* Arbitrary choice of hash table size */ + LIBCFS_CPT_ALLOC(rec->rec_lh_hash, lnet_cpt_table(), cpt, + LNET_LH_HASH_SIZE * sizeof(rec->rec_lh_hash[0])); + if (rec->rec_lh_hash == NULL) { + rc = -ENOMEM; + goto out; + } + + for (i = 0; i < LNET_LH_HASH_SIZE; i++) + INIT_LIST_HEAD(&rec->rec_lh_hash[i]); + + return 0; + +out: + CERROR("Failed to setup %s resource container\n", + lnet_res_type2str(type)); + lnet_res_container_cleanup(rec); + return rc; +} + +static void +lnet_res_containers_destroy(struct lnet_res_container **recs) +{ + struct lnet_res_container *rec; + int i; + + cfs_percpt_for_each(rec, i, recs) + lnet_res_container_cleanup(rec); + + cfs_percpt_free(recs); +} + +static struct lnet_res_container ** +lnet_res_containers_create(int type, int objnum, int objsz) +{ + struct lnet_res_container **recs; + struct lnet_res_container *rec; + int rc; + int i; + + recs = cfs_percpt_alloc(lnet_cpt_table(), sizeof(*rec)); + if (recs == NULL) { + CERROR("Failed to allocate %s resource containers\n", + lnet_res_type2str(type)); + return NULL; + } + + cfs_percpt_for_each(rec, i, recs) { + rc = lnet_res_container_setup(rec, i, type, objnum, objsz); + if (rc != 0) { + lnet_res_containers_destroy(recs); + return NULL; + } 
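/*
 * A minimal illustrative sketch, not part of the imported Lustre source:
 * lnet_res_container_setup() above seeds each per-CPT container's cookie
 * counter as (cpt << LNET_COOKIE_TYPE_BITS) | type, so every handle cookie
 * drawn from that container keeps the resource type (MD/ME/EQ) in its
 * lowest bits, the owning CPT just above them, and a per-container
 * sequence number in the remaining high bits.  decode_res_cookie() is a
 * hypothetical helper showing how those fields can be recovered; it
 * assumes LNET_COOKIE_MASK covers exactly the type bits, which is how the
 * lookup code uses it.
 */
static void
decode_res_cookie(__u64 cookie, unsigned int *type, unsigned int *cpt,
                  __u64 *seq)
{
        *type = cookie & LNET_COOKIE_MASK;               /* MD/ME/EQ */
        *cpt  = (cookie >> LNET_COOKIE_TYPE_BITS) &
                ((1ULL << LNET_CPT_BITS) - 1);           /* owning CPT */
        *seq  = cookie >> (LNET_COOKIE_TYPE_BITS + LNET_CPT_BITS);
}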
+ } + + return recs; +} + +lnet_libhandle_t * +lnet_res_lh_lookup(struct lnet_res_container *rec, __u64 cookie) +{ + /* ALWAYS called with lnet_res_lock held */ + struct list_head *head; + lnet_libhandle_t *lh; + unsigned int hash; + + if ((cookie & LNET_COOKIE_MASK) != rec->rec_type) + return NULL; + + hash = cookie >> (LNET_COOKIE_TYPE_BITS + LNET_CPT_BITS); + head = &rec->rec_lh_hash[hash & LNET_LH_HASH_MASK]; + + list_for_each_entry(lh, head, lh_hash_chain) { + if (lh->lh_cookie == cookie) + return lh; + } + + return NULL; +} + +void +lnet_res_lh_initialize(struct lnet_res_container *rec, lnet_libhandle_t *lh) +{ + /* ALWAYS called with lnet_res_lock held */ + unsigned int ibits = LNET_COOKIE_TYPE_BITS + LNET_CPT_BITS; + unsigned int hash; + + lh->lh_cookie = rec->rec_lh_cookie; + rec->rec_lh_cookie += 1 << ibits; + + hash = (lh->lh_cookie >> ibits) & LNET_LH_HASH_MASK; + + list_add(&lh->lh_hash_chain, &rec->rec_lh_hash[hash]); +} + + +int lnet_unprepare(void); + +static int +lnet_prepare(lnet_pid_t requested_pid) +{ + /* Prepare to bring up the network */ + struct lnet_res_container **recs; + int rc = 0; + + LASSERT(the_lnet.ln_refcount == 0); + + the_lnet.ln_routing = 0; + + LASSERT((requested_pid & LNET_PID_USERFLAG) == 0); + the_lnet.ln_pid = requested_pid; + + INIT_LIST_HEAD(&the_lnet.ln_test_peers); + INIT_LIST_HEAD(&the_lnet.ln_nis); + INIT_LIST_HEAD(&the_lnet.ln_nis_cpt); + INIT_LIST_HEAD(&the_lnet.ln_nis_zombie); + INIT_LIST_HEAD(&the_lnet.ln_routers); + + rc = lnet_create_remote_nets_table(); + if (rc != 0) + goto failed; + + the_lnet.ln_interface_cookie = lnet_create_interface_cookie(); + + the_lnet.ln_counters = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(lnet_counters_t)); + if (the_lnet.ln_counters == NULL) { + CERROR("Failed to allocate counters for LNet\n"); + rc = -ENOMEM; + goto failed; + } + + rc = lnet_peer_tables_create(); + if (rc != 0) + goto failed; + + rc = lnet_msg_containers_create(); + if (rc != 0) + goto failed; + + rc = lnet_res_container_setup(&the_lnet.ln_eq_container, 0, + LNET_COOKIE_TYPE_EQ, LNET_FL_MAX_EQS, + sizeof(lnet_eq_t)); + if (rc != 0) + goto failed; + + recs = lnet_res_containers_create(LNET_COOKIE_TYPE_ME, LNET_FL_MAX_MES, + sizeof(lnet_me_t)); + if (recs == NULL) { + rc = -ENOMEM; + goto failed; + } + + the_lnet.ln_me_containers = recs; + + recs = lnet_res_containers_create(LNET_COOKIE_TYPE_MD, LNET_FL_MAX_MDS, + sizeof(lnet_libmd_t)); + if (recs == NULL) { + rc = -ENOMEM; + goto failed; + } + + the_lnet.ln_md_containers = recs; + + rc = lnet_portals_create(); + if (rc != 0) { + CERROR("Failed to create portals for LNet: %d\n", rc); + goto failed; + } + + return 0; + + failed: + lnet_unprepare(); + return rc; +} + +int +lnet_unprepare(void) +{ + /* NB no LNET_LOCK since this is the last reference. 
All LND instances + * have shut down already, so it is safe to unlink and free all + * descriptors, even those that appear committed to a network op (eg MD + * with non-zero pending count) */ + + lnet_fail_nid(LNET_NID_ANY, 0); + + LASSERT(the_lnet.ln_refcount == 0); + LASSERT(list_empty(&the_lnet.ln_test_peers)); + LASSERT(list_empty(&the_lnet.ln_nis)); + LASSERT(list_empty(&the_lnet.ln_nis_cpt)); + LASSERT(list_empty(&the_lnet.ln_nis_zombie)); + + lnet_portals_destroy(); + + if (the_lnet.ln_md_containers != NULL) { + lnet_res_containers_destroy(the_lnet.ln_md_containers); + the_lnet.ln_md_containers = NULL; + } + + if (the_lnet.ln_me_containers != NULL) { + lnet_res_containers_destroy(the_lnet.ln_me_containers); + the_lnet.ln_me_containers = NULL; + } + + lnet_res_container_cleanup(&the_lnet.ln_eq_container); + + lnet_msg_containers_destroy(); + lnet_peer_tables_destroy(); + lnet_rtrpools_free(); + + if (the_lnet.ln_counters != NULL) { + cfs_percpt_free(the_lnet.ln_counters); + the_lnet.ln_counters = NULL; + } + lnet_destroy_remote_nets_table(); + + return 0; +} + +lnet_ni_t * +lnet_net2ni_locked(__u32 net, int cpt) +{ + struct list_head *tmp; + lnet_ni_t *ni; + + LASSERT(cpt != LNET_LOCK_EX); + + list_for_each(tmp, &the_lnet.ln_nis) { + ni = list_entry(tmp, lnet_ni_t, ni_list); + + if (LNET_NIDNET(ni->ni_nid) == net) { + lnet_ni_addref_locked(ni, cpt); + return ni; + } + } + + return NULL; +} + +lnet_ni_t * +lnet_net2ni(__u32 net) +{ + lnet_ni_t *ni; + + lnet_net_lock(0); + ni = lnet_net2ni_locked(net, 0); + lnet_net_unlock(0); + + return ni; +} +EXPORT_SYMBOL(lnet_net2ni); + +static unsigned int +lnet_nid_cpt_hash(lnet_nid_t nid, unsigned int number) +{ + __u64 key = nid; + unsigned int val; + + LASSERT(number >= 1 && number <= LNET_CPT_NUMBER); + + if (number == 1) + return 0; + + val = hash_long(key, LNET_CPT_BITS); + /* NB: LNET_CP_NUMBER doesn't have to be PO2 */ + if (val < number) + return val; + + return (unsigned int)(key + val + (val >> 1)) % number; +} + +int +lnet_cpt_of_nid_locked(lnet_nid_t nid) +{ + struct lnet_ni *ni; + + /* must called with hold of lnet_net_lock */ + if (LNET_CPT_NUMBER == 1) + return 0; /* the only one */ + + /* take lnet_net_lock(any) would be OK */ + if (!list_empty(&the_lnet.ln_nis_cpt)) { + list_for_each_entry(ni, &the_lnet.ln_nis_cpt, ni_cptlist) { + if (LNET_NIDNET(ni->ni_nid) != LNET_NIDNET(nid)) + continue; + + LASSERT(ni->ni_cpts != NULL); + return ni->ni_cpts[lnet_nid_cpt_hash + (nid, ni->ni_ncpts)]; + } + } + + return lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER); +} + +int +lnet_cpt_of_nid(lnet_nid_t nid) +{ + int cpt; + int cpt2; + + if (LNET_CPT_NUMBER == 1) + return 0; /* the only one */ + + if (list_empty(&the_lnet.ln_nis_cpt)) + return lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER); + + cpt = lnet_net_lock_current(); + cpt2 = lnet_cpt_of_nid_locked(nid); + lnet_net_unlock(cpt); + + return cpt2; +} +EXPORT_SYMBOL(lnet_cpt_of_nid); + +int +lnet_islocalnet(__u32 net) +{ + struct lnet_ni *ni; + int cpt; + + cpt = lnet_net_lock_current(); + + ni = lnet_net2ni_locked(net, cpt); + if (ni != NULL) + lnet_ni_decref_locked(ni, cpt); + + lnet_net_unlock(cpt); + + return ni != NULL; +} + +lnet_ni_t * +lnet_nid2ni_locked(lnet_nid_t nid, int cpt) +{ + struct lnet_ni *ni; + struct list_head *tmp; + + LASSERT(cpt != LNET_LOCK_EX); + + list_for_each(tmp, &the_lnet.ln_nis) { + ni = list_entry(tmp, lnet_ni_t, ni_list); + + if (ni->ni_nid == nid) { + lnet_ni_addref_locked(ni, cpt); + return ni; + } + } + + return NULL; +} + +int +lnet_islocalnid(lnet_nid_t nid) +{ + 
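/*
 * A minimal illustrative sketch, not part of the imported Lustre source:
 * lnet_nid_cpt_hash() above maps a NID onto one of 'number' CPTs in two
 * steps -- hash the NID into LNET_CPT_BITS bits with hash_long() from
 * <linux/hash.h>, then, because the CPT count need not be a power of two,
 * fold any out-of-range value back with a modulo.  nid_to_bucket() is a
 * hypothetical stand-alone copy of that same mapping.
 */
static unsigned int
nid_to_bucket(__u64 key, unsigned int nbuckets, unsigned int bits)
{
        unsigned int val = hash_long(key, bits);    /* 0 .. 2^bits - 1 */

        if (val < nbuckets)                         /* already in range */
                return val;
        /* fold out-of-range hash values back onto [0, nbuckets) */
        return (unsigned int)(key + val + (val >> 1)) % nbuckets;
}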
struct lnet_ni *ni; + int cpt; + + cpt = lnet_net_lock_current(); + ni = lnet_nid2ni_locked(nid, cpt); + if (ni != NULL) + lnet_ni_decref_locked(ni, cpt); + lnet_net_unlock(cpt); + + return ni != NULL; +} + +int +lnet_count_acceptor_nis(void) +{ + /* Return the # of NIs that need the acceptor. */ + int count = 0; + struct list_head *tmp; + struct lnet_ni *ni; + int cpt; + + cpt = lnet_net_lock_current(); + list_for_each(tmp, &the_lnet.ln_nis) { + ni = list_entry(tmp, lnet_ni_t, ni_list); + + if (ni->ni_lnd->lnd_accept != NULL) + count++; + } + + lnet_net_unlock(cpt); + + return count; +} + +static int +lnet_ni_tq_credits(lnet_ni_t *ni) +{ + int credits; + + LASSERT(ni->ni_ncpts >= 1); + + if (ni->ni_ncpts == 1) + return ni->ni_maxtxcredits; + + credits = ni->ni_maxtxcredits / ni->ni_ncpts; + credits = max(credits, 8 * ni->ni_peertxcredits); + credits = min(credits, ni->ni_maxtxcredits); + + return credits; +} + +static void +lnet_shutdown_lndnis(void) +{ + int i; + int islo; + lnet_ni_t *ni; + + /* NB called holding the global mutex */ + + /* All quiet on the API front */ + LASSERT(!the_lnet.ln_shutdown); + LASSERT(the_lnet.ln_refcount == 0); + LASSERT(list_empty(&the_lnet.ln_nis_zombie)); + + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_shutdown = 1; /* flag shutdown */ + + /* Unlink NIs from the global table */ + while (!list_empty(&the_lnet.ln_nis)) { + ni = list_entry(the_lnet.ln_nis.next, + lnet_ni_t, ni_list); + /* move it to zombie list and nobody can find it anymore */ + list_move(&ni->ni_list, &the_lnet.ln_nis_zombie); + lnet_ni_decref_locked(ni, 0); /* drop ln_nis' ref */ + + if (!list_empty(&ni->ni_cptlist)) { + list_del_init(&ni->ni_cptlist); + lnet_ni_decref_locked(ni, 0); + } + } + + /* Drop the cached eqwait NI. */ + if (the_lnet.ln_eq_waitni != NULL) { + lnet_ni_decref_locked(the_lnet.ln_eq_waitni, 0); + the_lnet.ln_eq_waitni = NULL; + } + + /* Drop the cached loopback NI. */ + if (the_lnet.ln_loni != NULL) { + lnet_ni_decref_locked(the_lnet.ln_loni, 0); + the_lnet.ln_loni = NULL; + } + + lnet_net_unlock(LNET_LOCK_EX); + + /* Clear lazy portals and drop delayed messages which hold refs + * on their lnet_msg_t::msg_rxpeer */ + for (i = 0; i < the_lnet.ln_nportals; i++) + LNetClearLazyPortal(i); + + /* Clear the peer table and wait for all peers to go (they hold refs on + * their NIs) */ + lnet_peer_tables_cleanup(); + + lnet_net_lock(LNET_LOCK_EX); + /* Now wait for the NI's I just nuked to show up on ln_zombie_nis + * and shut them down in guaranteed thread context */ + i = 2; + while (!list_empty(&the_lnet.ln_nis_zombie)) { + int *ref; + int j; + + ni = list_entry(the_lnet.ln_nis_zombie.next, + lnet_ni_t, ni_list); + list_del_init(&ni->ni_list); + cfs_percpt_for_each(ref, j, ni->ni_refs) { + if (*ref == 0) + continue; + /* still busy, add it back to zombie list */ + list_add(&ni->ni_list, &the_lnet.ln_nis_zombie); + break; + } + + if (!list_empty(&ni->ni_list)) { + lnet_net_unlock(LNET_LOCK_EX); + ++i; + if ((i & (-i)) == i) { + CDEBUG(D_WARNING, "Waiting for zombie LNI %s\n", + libcfs_nid2str(ni->ni_nid)); + } + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + lnet_net_lock(LNET_LOCK_EX); + continue; + } + + ni->ni_lnd->lnd_refcount--; + lnet_net_unlock(LNET_LOCK_EX); + + islo = ni->ni_lnd->lnd_type == LOLND; + + LASSERT(!in_interrupt()); + (ni->ni_lnd->lnd_shutdown)(ni); + + /* can't deref lnd anymore now; it might have unregistered + * itself... 
*/ + + if (!islo) + CDEBUG(D_LNI, "Removed LNI %s\n", + libcfs_nid2str(ni->ni_nid)); + + lnet_ni_free(ni); + i = 2; + + lnet_net_lock(LNET_LOCK_EX); + } + + the_lnet.ln_shutdown = 0; + lnet_net_unlock(LNET_LOCK_EX); + + if (the_lnet.ln_network_tokens != NULL) { + LIBCFS_FREE(the_lnet.ln_network_tokens, + the_lnet.ln_network_tokens_nob); + the_lnet.ln_network_tokens = NULL; + } +} + +static int +lnet_startup_lndnis(void) +{ + lnd_t *lnd; + struct lnet_ni *ni; + struct lnet_tx_queue *tq; + struct list_head nilist; + int i; + int rc = 0; + int lnd_type; + int nicount = 0; + char *nets = lnet_get_networks(); + + INIT_LIST_HEAD(&nilist); + + if (nets == NULL) + goto failed; + + rc = lnet_parse_networks(&nilist, nets); + if (rc != 0) + goto failed; + + while (!list_empty(&nilist)) { + ni = list_entry(nilist.next, lnet_ni_t, ni_list); + lnd_type = LNET_NETTYP(LNET_NIDNET(ni->ni_nid)); + + LASSERT(libcfs_isknown_lnd(lnd_type)); + + if (lnd_type == CIBLND || + lnd_type == OPENIBLND || + lnd_type == IIBLND || + lnd_type == VIBLND) { + CERROR("LND %s obsoleted\n", + libcfs_lnd2str(lnd_type)); + goto failed; + } + + LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex); + lnd = lnet_find_lnd_by_type(lnd_type); + + if (lnd == NULL) { + LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex); + rc = request_module("%s", + libcfs_lnd2modname(lnd_type)); + LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex); + + lnd = lnet_find_lnd_by_type(lnd_type); + if (lnd == NULL) { + LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex); + CERROR("Can't load LND %s, module %s, rc=%d\n", + libcfs_lnd2str(lnd_type), + libcfs_lnd2modname(lnd_type), rc); + goto failed; + } + } + + lnet_net_lock(LNET_LOCK_EX); + lnd->lnd_refcount++; + lnet_net_unlock(LNET_LOCK_EX); + + ni->ni_lnd = lnd; + + rc = (lnd->lnd_startup)(ni); + + LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex); + + if (rc != 0) { + LCONSOLE_ERROR_MSG(0x105, "Error %d starting up LNI %s\n", + rc, libcfs_lnd2str(lnd->lnd_type)); + lnet_net_lock(LNET_LOCK_EX); + lnd->lnd_refcount--; + lnet_net_unlock(LNET_LOCK_EX); + goto failed; + } + + LASSERT(ni->ni_peertimeout <= 0 || lnd->lnd_query != NULL); + + list_del(&ni->ni_list); + + lnet_net_lock(LNET_LOCK_EX); + /* refcount for ln_nis */ + lnet_ni_addref_locked(ni, 0); + list_add_tail(&ni->ni_list, &the_lnet.ln_nis); + if (ni->ni_cpts != NULL) { + list_add_tail(&ni->ni_cptlist, + &the_lnet.ln_nis_cpt); + lnet_ni_addref_locked(ni, 0); + } + + lnet_net_unlock(LNET_LOCK_EX); + + if (lnd->lnd_type == LOLND) { + lnet_ni_addref(ni); + LASSERT(the_lnet.ln_loni == NULL); + the_lnet.ln_loni = ni; + continue; + } + + if (ni->ni_peertxcredits == 0 || + ni->ni_maxtxcredits == 0) { + LCONSOLE_ERROR_MSG(0x107, "LNI %s has no %scredits\n", + libcfs_lnd2str(lnd->lnd_type), + ni->ni_peertxcredits == 0 ? 
+ "" : "per-peer "); + goto failed; + } + + cfs_percpt_for_each(tq, i, ni->ni_tx_queues) { + tq->tq_credits_min = + tq->tq_credits_max = + tq->tq_credits = lnet_ni_tq_credits(ni); + } + + CDEBUG(D_LNI, "Added LNI %s [%d/%d/%d/%d]\n", + libcfs_nid2str(ni->ni_nid), ni->ni_peertxcredits, + lnet_ni_tq_credits(ni) * LNET_CPT_NUMBER, + ni->ni_peerrtrcredits, ni->ni_peertimeout); + + nicount++; + } + + if (the_lnet.ln_eq_waitni != NULL && nicount > 1) { + lnd_type = the_lnet.ln_eq_waitni->ni_lnd->lnd_type; + LCONSOLE_ERROR_MSG(0x109, "LND %s can only run single-network\n", + libcfs_lnd2str(lnd_type)); + goto failed; + } + + return 0; + + failed: + lnet_shutdown_lndnis(); + + while (!list_empty(&nilist)) { + ni = list_entry(nilist.next, lnet_ni_t, ni_list); + list_del(&ni->ni_list); + lnet_ni_free(ni); + } + + return -ENETDOWN; +} + +/** + * Initialize LNet library. + * + * Only userspace program needs to call this function - it's automatically + * called in the kernel at module loading time. Caller has to call LNetFini() + * after a call to LNetInit(), if and only if the latter returned 0. It must + * be called exactly once. + * + * \return 0 on success, and -ve on failures. + */ +int +LNetInit(void) +{ + int rc; + + lnet_assert_wire_constants(); + LASSERT(!the_lnet.ln_init); + + memset(&the_lnet, 0, sizeof(the_lnet)); + + /* refer to global cfs_cpt_table for now */ + the_lnet.ln_cpt_table = cfs_cpt_table; + the_lnet.ln_cpt_number = cfs_cpt_number(cfs_cpt_table); + + LASSERT(the_lnet.ln_cpt_number > 0); + if (the_lnet.ln_cpt_number > LNET_CPT_MAX) { + /* we are under risk of consuming all lh_cookie */ + CERROR("Can't have %d CPTs for LNet (max allowed is %d), please change setting of CPT-table and retry\n", + the_lnet.ln_cpt_number, LNET_CPT_MAX); + return -1; + } + + while ((1 << the_lnet.ln_cpt_bits) < the_lnet.ln_cpt_number) + the_lnet.ln_cpt_bits++; + + rc = lnet_create_locks(); + if (rc != 0) { + CERROR("Can't create LNet global locks: %d\n", rc); + return -1; + } + + the_lnet.ln_refcount = 0; + the_lnet.ln_init = 1; + LNetInvalidateHandle(&the_lnet.ln_rc_eqh); + INIT_LIST_HEAD(&the_lnet.ln_lnds); + INIT_LIST_HEAD(&the_lnet.ln_rcd_zombie); + INIT_LIST_HEAD(&the_lnet.ln_rcd_deathrow); + + /* The hash table size is the number of bits it takes to express the set + * ln_num_routes, minus 1 (better to under estimate than over so we + * don't waste memory). */ + if (rnet_htable_size <= 0) + rnet_htable_size = LNET_REMOTE_NETS_HASH_DEFAULT; + else if (rnet_htable_size > LNET_REMOTE_NETS_HASH_MAX) + rnet_htable_size = LNET_REMOTE_NETS_HASH_MAX; + the_lnet.ln_remote_nets_hbits = max_t(int, 1, + order_base_2(rnet_htable_size) - 1); + + /* All LNDs apart from the LOLND are in separate modules. They + * register themselves when their module loads, and unregister + * themselves when their module is unloaded. */ + lnet_register_lnd(&the_lolnd); + return 0; +} +EXPORT_SYMBOL(LNetInit); + +/** + * Finalize LNet library. + * + * Only userspace program needs to call this function. It can be called + * at most once. + * + * \pre LNetInit() called with success. + * \pre All LNet users called LNetNIFini() for matching LNetNIInit() calls. + */ +void +LNetFini(void) +{ + LASSERT(the_lnet.ln_init); + LASSERT(the_lnet.ln_refcount == 0); + + while (!list_empty(&the_lnet.ln_lnds)) + lnet_unregister_lnd(list_entry(the_lnet.ln_lnds.next, + lnd_t, lnd_list)); + lnet_destroy_locks(); + + the_lnet.ln_init = 0; +} +EXPORT_SYMBOL(LNetFini); + +/** + * Set LNet PID and start LNet interfaces, routing, and forwarding. 
+ * + * Userspace program should call this after a successful call to LNetInit(). + * Users must call this function at least once before any other functions. + * For each successful call there must be a corresponding call to + * LNetNIFini(). For subsequent calls to LNetNIInit(), \a requested_pid is + * ignored. + * + * The PID used by LNet may be different from the one requested. + * See LNetGetId(). + * + * \param requested_pid PID requested by the caller. + * + * \return >= 0 on success, and < 0 error code on failures. + */ +int +LNetNIInit(lnet_pid_t requested_pid) +{ + int im_a_router = 0; + int rc; + + LNET_MUTEX_LOCK(&the_lnet.ln_api_mutex); + + LASSERT(the_lnet.ln_init); + CDEBUG(D_OTHER, "refs %d\n", the_lnet.ln_refcount); + + if (the_lnet.ln_refcount > 0) { + rc = the_lnet.ln_refcount++; + goto out; + } + + lnet_get_tunables(); + + if (requested_pid == LNET_PID_ANY) { + /* Don't instantiate LNET just for me */ + rc = -ENETDOWN; + goto failed0; + } + + rc = lnet_prepare(requested_pid); + if (rc != 0) + goto failed0; + + rc = lnet_startup_lndnis(); + if (rc != 0) + goto failed1; + + rc = lnet_parse_routes(lnet_get_routes(), &im_a_router); + if (rc != 0) + goto failed2; + + rc = lnet_check_routes(); + if (rc != 0) + goto failed2; + + rc = lnet_rtrpools_alloc(im_a_router); + if (rc != 0) + goto failed2; + + rc = lnet_acceptor_start(); + if (rc != 0) + goto failed2; + + the_lnet.ln_refcount = 1; + /* Now I may use my own API functions... */ + + /* NB router checker needs the_lnet.ln_ping_info in + * lnet_router_checker -> lnet_update_ni_status_locked */ + rc = lnet_ping_target_init(); + if (rc != 0) + goto failed3; + + rc = lnet_router_checker_start(); + if (rc != 0) + goto failed4; + + lnet_proc_init(); + goto out; + + failed4: + lnet_ping_target_fini(); + failed3: + the_lnet.ln_refcount = 0; + lnet_acceptor_stop(); + failed2: + lnet_destroy_routes(); + lnet_shutdown_lndnis(); + failed1: + lnet_unprepare(); + failed0: + LASSERT(rc < 0); + out: + LNET_MUTEX_UNLOCK(&the_lnet.ln_api_mutex); + return rc; +} +EXPORT_SYMBOL(LNetNIInit); + +/** + * Stop LNet interfaces, routing, and forwarding. + * + * Users must call this function once for each successful call to LNetNIInit(). + * Once the LNetNIFini() operation has been started, the results of pending + * API operations are undefined. + * + * \return always 0 for current implementation. + */ +int +LNetNIFini(void) +{ + LNET_MUTEX_LOCK(&the_lnet.ln_api_mutex); + + LASSERT(the_lnet.ln_init); + LASSERT(the_lnet.ln_refcount > 0); + + if (the_lnet.ln_refcount != 1) { + the_lnet.ln_refcount--; + } else { + LASSERT(!the_lnet.ln_niinit_self); + + lnet_proc_fini(); + lnet_router_checker_stop(); + lnet_ping_target_fini(); + + /* Teardown fns that use my own API functions BEFORE here */ + the_lnet.ln_refcount = 0; + + lnet_acceptor_stop(); + lnet_destroy_routes(); + lnet_shutdown_lndnis(); + lnet_unprepare(); + } + + LNET_MUTEX_UNLOCK(&the_lnet.ln_api_mutex); + return 0; +} +EXPORT_SYMBOL(LNetNIFini); + +/** + * This is an ugly hack to export IOC_LIBCFS_DEBUG_PEER and + * IOC_LIBCFS_PORTALS_COMPATIBILITY commands to users, by tweaking the LNet + * internal ioctl handler. + * + * IOC_LIBCFS_PORTALS_COMPATIBILITY is now deprecated, don't use it. + * + * \param cmd IOC_LIBCFS_DEBUG_PEER to print debugging data about a peer. + * The data will be printed to system console. Don't use it excessively. + * \param arg A pointer to lnet_process_id_t, process ID of the peer. + * + * \return Always return 0 when called by users directly (i.e., not via ioctl). 
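+ *
+ * A minimal direct-call sketch for IOC_LIBCFS_DEBUG_PEER (editor's addition,
+ * not part of this patch; the peer NID string is hypothetical):
+ *
+ *	lnet_process_id_t peer;
+ *
+ *	peer.nid = libcfs_str2nid("192.168.0.2@tcp");
+ *	peer.pid = LUSTRE_SRV_LNET_PID;
+ *	(void)LNetCtl(IOC_LIBCFS_DEBUG_PEER, &peer);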
+ */ +int +LNetCtl(unsigned int cmd, void *arg) +{ + struct libcfs_ioctl_data *data = arg; + lnet_process_id_t id = {0}; + lnet_ni_t *ni; + int rc; + + LASSERT(the_lnet.ln_init); + LASSERT(the_lnet.ln_refcount > 0); + + switch (cmd) { + case IOC_LIBCFS_GET_NI: + rc = LNetGetId(data->ioc_count, &id); + data->ioc_nid = id.nid; + return rc; + + case IOC_LIBCFS_FAIL_NID: + return lnet_fail_nid(data->ioc_nid, data->ioc_count); + + case IOC_LIBCFS_ADD_ROUTE: + rc = lnet_add_route(data->ioc_net, data->ioc_count, + data->ioc_nid, data->ioc_priority); + return (rc != 0) ? rc : lnet_check_routes(); + + case IOC_LIBCFS_DEL_ROUTE: + return lnet_del_route(data->ioc_net, data->ioc_nid); + + case IOC_LIBCFS_GET_ROUTE: + return lnet_get_route(data->ioc_count, + &data->ioc_net, &data->ioc_count, + &data->ioc_nid, &data->ioc_flags, + &data->ioc_priority); + case IOC_LIBCFS_NOTIFY_ROUTER: + return lnet_notify(NULL, data->ioc_nid, data->ioc_flags, + cfs_time_current() - + cfs_time_seconds(get_seconds() - + (time_t)data->ioc_u64[0])); + + case IOC_LIBCFS_PORTALS_COMPATIBILITY: + /* This can be removed once lustre stops calling it */ + return 0; + + case IOC_LIBCFS_LNET_DIST: + rc = LNetDist(data->ioc_nid, &data->ioc_nid, &data->ioc_u32[1]); + if (rc < 0 && rc != -EHOSTUNREACH) + return rc; + + data->ioc_u32[0] = rc; + return 0; + + case IOC_LIBCFS_TESTPROTOCOMPAT: + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_testprotocompat = data->ioc_flags; + lnet_net_unlock(LNET_LOCK_EX); + return 0; + + case IOC_LIBCFS_PING: + id.nid = data->ioc_nid; + id.pid = data->ioc_u32[0]; + rc = lnet_ping(id, data->ioc_u32[1], /* timeout */ + (lnet_process_id_t *)data->ioc_pbuf1, + data->ioc_plen1/sizeof(lnet_process_id_t)); + if (rc < 0) + return rc; + data->ioc_count = rc; + return 0; + + case IOC_LIBCFS_DEBUG_PEER: { + /* CAVEAT EMPTOR: this one designed for calling directly; not + * via an ioctl */ + id = *((lnet_process_id_t *) arg); + + lnet_debug_peer(id.nid); + + ni = lnet_net2ni(LNET_NIDNET(id.nid)); + if (ni == NULL) { + CDEBUG(D_WARNING, "No NI for %s\n", libcfs_id2str(id)); + } else { + if (ni->ni_lnd->lnd_ctl == NULL) { + CDEBUG(D_WARNING, "No ctl for %s\n", + libcfs_id2str(id)); + } else { + (void)ni->ni_lnd->lnd_ctl(ni, cmd, arg); + } + + lnet_ni_decref(ni); + } + return 0; + } + + default: + ni = lnet_net2ni(data->ioc_net); + if (ni == NULL) + return -EINVAL; + + if (ni->ni_lnd->lnd_ctl == NULL) + rc = -EINVAL; + else + rc = ni->ni_lnd->lnd_ctl(ni, cmd, arg); + + lnet_ni_decref(ni); + return rc; + } + /* not reached */ +} +EXPORT_SYMBOL(LNetCtl); + +/** + * Retrieve the lnet_process_id_t ID of LNet interface at \a index. Note that + * all interfaces share a same PID, as requested by LNetNIInit(). + * + * \param index Index of the interface to look up. + * \param id On successful return, this location will hold the + * lnet_process_id_t ID of the interface. + * + * \retval 0 If an interface exists at \a index. + * \retval -ENOENT If no interface has been found. + */ +int +LNetGetId(unsigned int index, lnet_process_id_t *id) +{ + struct lnet_ni *ni; + struct list_head *tmp; + int cpt; + int rc = -ENOENT; + + LASSERT(the_lnet.ln_init); + + /* LNetNI initilization failed? 
*/ + if (the_lnet.ln_refcount == 0) + return rc; + + cpt = lnet_net_lock_current(); + + list_for_each(tmp, &the_lnet.ln_nis) { + if (index-- != 0) + continue; + + ni = list_entry(tmp, lnet_ni_t, ni_list); + + id->nid = ni->ni_nid; + id->pid = the_lnet.ln_pid; + rc = 0; + break; + } + + lnet_net_unlock(cpt); + return rc; +} +EXPORT_SYMBOL(LNetGetId); + +/** + * Print a string representation of handle \a h into buffer \a str of + * \a len bytes. + */ +void +LNetSnprintHandle(char *str, int len, lnet_handle_any_t h) +{ + snprintf(str, len, "%#llx", h.cookie); +} +EXPORT_SYMBOL(LNetSnprintHandle); + +static int +lnet_create_ping_info(void) +{ + int i; + int n; + int rc; + unsigned int infosz; + lnet_ni_t *ni; + lnet_process_id_t id; + lnet_ping_info_t *pinfo; + + for (n = 0; ; n++) { + rc = LNetGetId(n, &id); + if (rc == -ENOENT) + break; + + LASSERT(rc == 0); + } + + infosz = offsetof(lnet_ping_info_t, pi_ni[n]); + LIBCFS_ALLOC(pinfo, infosz); + if (pinfo == NULL) { + CERROR("Can't allocate ping info[%d]\n", n); + return -ENOMEM; + } + + pinfo->pi_nnis = n; + pinfo->pi_pid = the_lnet.ln_pid; + pinfo->pi_magic = LNET_PROTO_PING_MAGIC; + pinfo->pi_features = LNET_PING_FEAT_NI_STATUS; + + for (i = 0; i < n; i++) { + lnet_ni_status_t *ns = &pinfo->pi_ni[i]; + + rc = LNetGetId(i, &id); + LASSERT(rc == 0); + + ns->ns_nid = id.nid; + ns->ns_status = LNET_NI_STATUS_UP; + + lnet_net_lock(0); + + ni = lnet_nid2ni_locked(id.nid, 0); + LASSERT(ni != NULL); + + lnet_ni_lock(ni); + LASSERT(ni->ni_status == NULL); + ni->ni_status = ns; + lnet_ni_unlock(ni); + + lnet_ni_decref_locked(ni, 0); + lnet_net_unlock(0); + } + + the_lnet.ln_ping_info = pinfo; + return 0; +} + +static void +lnet_destroy_ping_info(void) +{ + struct lnet_ni *ni; + + lnet_net_lock(0); + + list_for_each_entry(ni, &the_lnet.ln_nis, ni_list) { + lnet_ni_lock(ni); + ni->ni_status = NULL; + lnet_ni_unlock(ni); + } + + lnet_net_unlock(0); + + LIBCFS_FREE(the_lnet.ln_ping_info, + offsetof(lnet_ping_info_t, + pi_ni[the_lnet.ln_ping_info->pi_nnis])); + the_lnet.ln_ping_info = NULL; +} + +int +lnet_ping_target_init(void) +{ + lnet_md_t md = { NULL }; + lnet_handle_me_t meh; + lnet_process_id_t id; + int rc; + int rc2; + int infosz; + + rc = lnet_create_ping_info(); + if (rc != 0) + return rc; + + /* We can have a tiny EQ since we only need to see the unlink event on + * teardown, which by definition is the last one! 
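+	 * (Editor's note: this is why LNetEQAlloc() below asks for only two
+	 * slots and no handler; lnet_ping_target_fini() polls, tolerating
+	 * -EOVERFLOW, until it dequeues the event with ev.unlinked set, and
+	 * nothing can be deposited after that event.)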
*/ + rc = LNetEQAlloc(2, LNET_EQ_HANDLER_NONE, &the_lnet.ln_ping_target_eq); + if (rc != 0) { + CERROR("Can't allocate ping EQ: %d\n", rc); + goto failed_0; + } + + memset(&id, 0, sizeof(lnet_process_id_t)); + id.nid = LNET_NID_ANY; + id.pid = LNET_PID_ANY; + + rc = LNetMEAttach(LNET_RESERVED_PORTAL, id, + LNET_PROTO_PING_MATCHBITS, 0, + LNET_UNLINK, LNET_INS_AFTER, + &meh); + if (rc != 0) { + CERROR("Can't create ping ME: %d\n", rc); + goto failed_1; + } + + /* initialize md content */ + infosz = offsetof(lnet_ping_info_t, + pi_ni[the_lnet.ln_ping_info->pi_nnis]); + md.start = the_lnet.ln_ping_info; + md.length = infosz; + md.threshold = LNET_MD_THRESH_INF; + md.max_size = 0; + md.options = LNET_MD_OP_GET | LNET_MD_TRUNCATE | + LNET_MD_MANAGE_REMOTE; + md.user_ptr = NULL; + md.eq_handle = the_lnet.ln_ping_target_eq; + + rc = LNetMDAttach(meh, md, + LNET_RETAIN, + &the_lnet.ln_ping_target_md); + if (rc != 0) { + CERROR("Can't attach ping MD: %d\n", rc); + goto failed_2; + } + + return 0; + + failed_2: + rc2 = LNetMEUnlink(meh); + LASSERT(rc2 == 0); + failed_1: + rc2 = LNetEQFree(the_lnet.ln_ping_target_eq); + LASSERT(rc2 == 0); + failed_0: + lnet_destroy_ping_info(); + return rc; +} + +void +lnet_ping_target_fini(void) +{ + lnet_event_t event; + int rc; + int which; + int timeout_ms = 1000; + sigset_t blocked = cfs_block_allsigs(); + + LNetMDUnlink(the_lnet.ln_ping_target_md); + /* NB md could be busy; this just starts the unlink */ + + for (;;) { + rc = LNetEQPoll(&the_lnet.ln_ping_target_eq, 1, + timeout_ms, &event, &which); + + /* I expect overflow... */ + LASSERT(rc >= 0 || rc == -EOVERFLOW); + + if (rc == 0) { + /* timed out: provide a diagnostic */ + CWARN("Still waiting for ping MD to unlink\n"); + timeout_ms *= 2; + continue; + } + + /* Got a valid event */ + if (event.unlinked) + break; + } + + rc = LNetEQFree(the_lnet.ln_ping_target_eq); + LASSERT(rc == 0); + lnet_destroy_ping_info(); + cfs_restore_sigs(blocked); +} + +int +lnet_ping(lnet_process_id_t id, int timeout_ms, lnet_process_id_t *ids, int n_ids) +{ + lnet_handle_eq_t eqh; + lnet_handle_md_t mdh; + lnet_event_t event; + lnet_md_t md = { NULL }; + int which; + int unlinked = 0; + int replied = 0; + const int a_long_time = 60000; /* mS */ + int infosz = offsetof(lnet_ping_info_t, pi_ni[n_ids]); + lnet_ping_info_t *info; + lnet_process_id_t tmpid; + int i; + int nob; + int rc; + int rc2; + sigset_t blocked; + + if (n_ids <= 0 || + id.nid == LNET_NID_ANY || + timeout_ms > 500000 || /* arbitrary limit! */ + n_ids > 20) /* arbitrary limit! */ + return -EINVAL; + + if (id.pid == LNET_PID_ANY) + id.pid = LUSTRE_SRV_LNET_PID; + + LIBCFS_ALLOC(info, infosz); + if (info == NULL) + return -ENOMEM; + + /* NB 2 events max (including any unlink event) */ + rc = LNetEQAlloc(2, LNET_EQ_HANDLER_NONE, &eqh); + if (rc != 0) { + CERROR("Can't allocate EQ: %d\n", rc); + goto out_0; + } + + /* initialize md content */ + md.start = info; + md.length = infosz; + md.threshold = 2; /*GET/REPLY*/ + md.max_size = 0; + md.options = LNET_MD_TRUNCATE; + md.user_ptr = NULL; + md.eq_handle = eqh; + + rc = LNetMDBind(md, LNET_UNLINK, &mdh); + if (rc != 0) { + CERROR("Can't bind MD: %d\n", rc); + goto out_1; + } + + rc = LNetGet(LNET_NID_ANY, mdh, id, + LNET_RESERVED_PORTAL, + LNET_PROTO_PING_MATCHBITS, 0); + + if (rc != 0) { + /* Don't CERROR; this could be deliberate! */ + + rc2 = LNetMDUnlink(mdh); + LASSERT(rc2 == 0); + + /* NB must wait for the UNLINK event below... 
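+		 * (Editor's note: the do/while loop that follows keeps calling
+		 * LNetEQPoll() until event.unlinked is set, blocking signals
+		 * while it waits so the unlink is guaranteed to complete.)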
*/ + unlinked = 1; + timeout_ms = a_long_time; + } + + do { + /* MUST block for unlink to complete */ + if (unlinked) + blocked = cfs_block_allsigs(); + + rc2 = LNetEQPoll(&eqh, 1, timeout_ms, &event, &which); + + if (unlinked) + cfs_restore_sigs(blocked); + + CDEBUG(D_NET, "poll %d(%d %d)%s\n", rc2, + (rc2 <= 0) ? -1 : event.type, + (rc2 <= 0) ? -1 : event.status, + (rc2 > 0 && event.unlinked) ? " unlinked" : ""); + + LASSERT(rc2 != -EOVERFLOW); /* can't miss anything */ + + if (rc2 <= 0 || event.status != 0) { + /* timeout or error */ + if (!replied && rc == 0) + rc = (rc2 < 0) ? rc2 : + (rc2 == 0) ? -ETIMEDOUT : + event.status; + + if (!unlinked) { + /* Ensure completion in finite time... */ + LNetMDUnlink(mdh); + /* No assertion (racing with network) */ + unlinked = 1; + timeout_ms = a_long_time; + } else if (rc2 == 0) { + /* timed out waiting for unlink */ + CWARN("ping %s: late network completion\n", + libcfs_id2str(id)); + } + } else if (event.type == LNET_EVENT_REPLY) { + replied = 1; + rc = event.mlength; + } + + } while (rc2 <= 0 || !event.unlinked); + + if (!replied) { + if (rc >= 0) + CWARN("%s: Unexpected rc >= 0 but no reply!\n", + libcfs_id2str(id)); + rc = -EIO; + goto out_1; + } + + nob = rc; + LASSERT(nob >= 0 && nob <= infosz); + + rc = -EPROTO; /* if I can't parse... */ + + if (nob < 8) { + /* can't check magic/version */ + CERROR("%s: ping info too short %d\n", + libcfs_id2str(id), nob); + goto out_1; + } + + if (info->pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) { + lnet_swap_pinginfo(info); + } else if (info->pi_magic != LNET_PROTO_PING_MAGIC) { + CERROR("%s: Unexpected magic %08x\n", + libcfs_id2str(id), info->pi_magic); + goto out_1; + } + + if ((info->pi_features & LNET_PING_FEAT_NI_STATUS) == 0) { + CERROR("%s: ping w/o NI status: 0x%x\n", + libcfs_id2str(id), info->pi_features); + goto out_1; + } + + if (nob < offsetof(lnet_ping_info_t, pi_ni[0])) { + CERROR("%s: Short reply %d(%d min)\n", libcfs_id2str(id), + nob, (int)offsetof(lnet_ping_info_t, pi_ni[0])); + goto out_1; + } + + if (info->pi_nnis < n_ids) + n_ids = info->pi_nnis; + + if (nob < offsetof(lnet_ping_info_t, pi_ni[n_ids])) { + CERROR("%s: Short reply %d(%d expected)\n", libcfs_id2str(id), + nob, (int)offsetof(lnet_ping_info_t, pi_ni[n_ids])); + goto out_1; + } + + rc = -EFAULT; /* If I SEGV... */ + + memset(&tmpid, 0, sizeof(tmpid)); + for (i = 0; i < n_ids; i++) { + tmpid.pid = info->pi_pid; + tmpid.nid = info->pi_ni[i].ns_nid; + if (copy_to_user(&ids[i], &tmpid, sizeof(tmpid))) + goto out_1; + } + rc = info->pi_nnis; + + out_1: + rc2 = LNetEQFree(eqh); + if (rc2 != 0) + CERROR("rc2 %d\n", rc2); + LASSERT(rc2 == 0); + + out_0: + LIBCFS_FREE(info, infosz); + return rc; +} diff --git a/kernel/drivers/staging/lustre/lnet/lnet/config.c b/kernel/drivers/staging/lustre/lnet/lnet/config.c new file mode 100644 index 000000000..2dc4c4a1a --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/lnet/config.c @@ -0,0 +1,1292 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LNET +#include "../../include/linux/lnet/lib-lnet.h" + +struct lnet_text_buf_t { /* tmp struct for parsing routes */ + struct list_head ltb_list; /* stash on lists */ + int ltb_size; /* allocated size */ + char ltb_text[0]; /* text buffer */ +}; + +static int lnet_tbnob; /* track text buf allocation */ +#define LNET_MAX_TEXTBUF_NOB (64<<10) /* bound allocation */ +#define LNET_SINGLE_TEXTBUF_NOB (4<<10) + +static void +lnet_syntax(char *name, char *str, int offset, int width) +{ + static char dots[LNET_SINGLE_TEXTBUF_NOB]; + static char dashes[LNET_SINGLE_TEXTBUF_NOB]; + + memset(dots, '.', sizeof(dots)); + dots[sizeof(dots)-1] = 0; + memset(dashes, '-', sizeof(dashes)); + dashes[sizeof(dashes)-1] = 0; + + LCONSOLE_ERROR_MSG(0x10f, "Error parsing '%s=\"%s\"'\n", name, str); + LCONSOLE_ERROR_MSG(0x110, "here...........%.*s..%.*s|%.*s|\n", + (int)strlen(name), dots, offset, dots, + (width < 1) ? 0 : width - 1, dashes); +} + +static int +lnet_issep(char c) +{ + switch (c) { + case '\n': + case '\r': + case ';': + return 1; + default: + return 0; + } +} + +static int +lnet_net_unique(__u32 net, struct list_head *nilist) +{ + struct list_head *tmp; + lnet_ni_t *ni; + + list_for_each(tmp, nilist) { + ni = list_entry(tmp, lnet_ni_t, ni_list); + + if (LNET_NIDNET(ni->ni_nid) == net) + return 0; + } + + return 1; +} + +void +lnet_ni_free(struct lnet_ni *ni) +{ + if (ni->ni_refs != NULL) + cfs_percpt_free(ni->ni_refs); + + if (ni->ni_tx_queues != NULL) + cfs_percpt_free(ni->ni_tx_queues); + + if (ni->ni_cpts != NULL) + cfs_expr_list_values_free(ni->ni_cpts, ni->ni_ncpts); + + LIBCFS_FREE(ni, sizeof(*ni)); +} + +static lnet_ni_t * +lnet_ni_alloc(__u32 net, struct cfs_expr_list *el, struct list_head *nilist) +{ + struct lnet_tx_queue *tq; + struct lnet_ni *ni; + int rc; + int i; + + if (!lnet_net_unique(net, nilist)) { + LCONSOLE_ERROR_MSG(0x111, "Duplicate network specified: %s\n", + libcfs_net2str(net)); + return NULL; + } + + LIBCFS_ALLOC(ni, sizeof(*ni)); + if (ni == NULL) { + CERROR("Out of memory creating network %s\n", + libcfs_net2str(net)); + return NULL; + } + + spin_lock_init(&ni->ni_lock); + INIT_LIST_HEAD(&ni->ni_cptlist); + ni->ni_refs = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*ni->ni_refs[0])); + if (ni->ni_refs == NULL) + goto failed; + + ni->ni_tx_queues = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*ni->ni_tx_queues[0])); + if (ni->ni_tx_queues == NULL) + goto failed; + + cfs_percpt_for_each(tq, i, ni->ni_tx_queues) + INIT_LIST_HEAD(&tq->tq_delayed); + + if (el == NULL) { + ni->ni_cpts = NULL; + ni->ni_ncpts = LNET_CPT_NUMBER; + } else { + rc = cfs_expr_list_values(el, LNET_CPT_NUMBER, &ni->ni_cpts); + if (rc <= 0) { + CERROR("Failed to set 
CPTs for NI %s: %d\n", + libcfs_net2str(net), rc); + goto failed; + } + + LASSERT(rc <= LNET_CPT_NUMBER); + if (rc == LNET_CPT_NUMBER) { + LIBCFS_FREE(ni->ni_cpts, rc * sizeof(ni->ni_cpts[0])); + ni->ni_cpts = NULL; + } + + ni->ni_ncpts = rc; + } + + /* LND will fill in the address part of the NID */ + ni->ni_nid = LNET_MKNID(net, 0); + ni->ni_last_alive = get_seconds(); + list_add_tail(&ni->ni_list, nilist); + return ni; + failed: + lnet_ni_free(ni); + return NULL; +} + +int +lnet_parse_networks(struct list_head *nilist, char *networks) +{ + struct cfs_expr_list *el = NULL; + int tokensize = strlen(networks) + 1; + char *tokens; + char *str; + char *tmp; + struct lnet_ni *ni; + __u32 net; + int nnets = 0; + + if (strlen(networks) > LNET_SINGLE_TEXTBUF_NOB) { + /* _WAY_ conservative */ + LCONSOLE_ERROR_MSG(0x112, + "Can't parse networks: string too long\n"); + return -EINVAL; + } + + LIBCFS_ALLOC(tokens, tokensize); + if (tokens == NULL) { + CERROR("Can't allocate net tokens\n"); + return -ENOMEM; + } + + the_lnet.ln_network_tokens = tokens; + the_lnet.ln_network_tokens_nob = tokensize; + memcpy(tokens, networks, tokensize); + str = tmp = tokens; + + /* Add in the loopback network */ + ni = lnet_ni_alloc(LNET_MKNET(LOLND, 0), NULL, nilist); + if (ni == NULL) + goto failed; + + while (str != NULL && *str != 0) { + char *comma = strchr(str, ','); + char *bracket = strchr(str, '('); + char *square = strchr(str, '['); + char *iface; + int niface; + int rc; + + /* NB we don't check interface conflicts here; it's the LNDs + * responsibility (if it cares at all) */ + + if (square != NULL && (comma == NULL || square < comma)) { + /* i.e: o2ib0(ib0)[1,2], number between square + * brackets are CPTs this NI needs to be bond */ + if (bracket != NULL && bracket > square) { + tmp = square; + goto failed_syntax; + } + + tmp = strchr(square, ']'); + if (tmp == NULL) { + tmp = square; + goto failed_syntax; + } + + rc = cfs_expr_list_parse(square, tmp - square + 1, + 0, LNET_CPT_NUMBER - 1, &el); + if (rc != 0) { + tmp = square; + goto failed_syntax; + } + + while (square <= tmp) + *square++ = ' '; + } + + if (bracket == NULL || + (comma != NULL && comma < bracket)) { + + /* no interface list specified */ + + if (comma != NULL) + *comma++ = 0; + net = libcfs_str2net(cfs_trimwhite(str)); + + if (net == LNET_NIDNET(LNET_NID_ANY)) { + LCONSOLE_ERROR_MSG(0x113, + "Unrecognised network type\n"); + tmp = str; + goto failed_syntax; + } + + if (LNET_NETTYP(net) != LOLND && /* LO is implicit */ + lnet_ni_alloc(net, el, nilist) == NULL) + goto failed; + + if (el != NULL) { + cfs_expr_list_free(el); + el = NULL; + } + + str = comma; + continue; + } + + *bracket = 0; + net = libcfs_str2net(cfs_trimwhite(str)); + if (net == LNET_NIDNET(LNET_NID_ANY)) { + tmp = str; + goto failed_syntax; + } + + nnets++; + ni = lnet_ni_alloc(net, el, nilist); + if (ni == NULL) + goto failed; + + if (el != NULL) { + cfs_expr_list_free(el); + el = NULL; + } + + niface = 0; + iface = bracket + 1; + + bracket = strchr(iface, ')'); + if (bracket == NULL) { + tmp = iface; + goto failed_syntax; + } + + *bracket = 0; + do { + comma = strchr(iface, ','); + if (comma != NULL) + *comma++ = 0; + + iface = cfs_trimwhite(iface); + if (*iface == 0) { + tmp = iface; + goto failed_syntax; + } + + if (niface == LNET_MAX_INTERFACES) { + LCONSOLE_ERROR_MSG(0x115, + "Too many interfaces for net %s\n", + libcfs_net2str(net)); + goto failed; + } + + ni->ni_interfaces[niface++] = iface; + iface = comma; + } while (iface != NULL); + + str = bracket + 1; + comma 
= strchr(bracket + 1, ','); + if (comma != NULL) { + *comma = 0; + str = cfs_trimwhite(str); + if (*str != 0) { + tmp = str; + goto failed_syntax; + } + str = comma + 1; + continue; + } + + str = cfs_trimwhite(str); + if (*str != 0) { + tmp = str; + goto failed_syntax; + } + } + + LASSERT(!list_empty(nilist)); + return 0; + + failed_syntax: + lnet_syntax("networks", networks, (int)(tmp - tokens), strlen(tmp)); + failed: + while (!list_empty(nilist)) { + ni = list_entry(nilist->next, lnet_ni_t, ni_list); + + list_del(&ni->ni_list); + lnet_ni_free(ni); + } + + if (el != NULL) + cfs_expr_list_free(el); + + LIBCFS_FREE(tokens, tokensize); + the_lnet.ln_network_tokens = NULL; + + return -EINVAL; +} + +static struct lnet_text_buf_t * +lnet_new_text_buf(int str_len) +{ + struct lnet_text_buf_t *ltb; + int nob; + + /* NB allocate space for the terminating 0 */ + nob = offsetof(struct lnet_text_buf_t, ltb_text[str_len + 1]); + if (nob > LNET_SINGLE_TEXTBUF_NOB) { + /* _way_ conservative for "route net gateway..." */ + CERROR("text buffer too big\n"); + return NULL; + } + + if (lnet_tbnob + nob > LNET_MAX_TEXTBUF_NOB) { + CERROR("Too many text buffers\n"); + return NULL; + } + + LIBCFS_ALLOC(ltb, nob); + if (ltb == NULL) + return NULL; + + ltb->ltb_size = nob; + ltb->ltb_text[0] = 0; + lnet_tbnob += nob; + return ltb; +} + +static void +lnet_free_text_buf(struct lnet_text_buf_t *ltb) +{ + lnet_tbnob -= ltb->ltb_size; + LIBCFS_FREE(ltb, ltb->ltb_size); +} + +static void +lnet_free_text_bufs(struct list_head *tbs) +{ + struct lnet_text_buf_t *ltb; + + while (!list_empty(tbs)) { + ltb = list_entry(tbs->next, struct lnet_text_buf_t, ltb_list); + + list_del(<b->ltb_list); + lnet_free_text_buf(ltb); + } +} + +static int +lnet_str2tbs_sep(struct list_head *tbs, char *str) +{ + struct list_head pending; + char *sep; + int nob; + int i; + struct lnet_text_buf_t *ltb; + + INIT_LIST_HEAD(&pending); + + /* Split 'str' into separate commands */ + for (;;) { + /* skip leading whitespace */ + while (isspace(*str)) + str++; + + /* scan for separator or comment */ + for (sep = str; *sep != 0; sep++) + if (lnet_issep(*sep) || *sep == '#') + break; + + nob = (int)(sep - str); + if (nob > 0) { + ltb = lnet_new_text_buf(nob); + if (ltb == NULL) { + lnet_free_text_bufs(&pending); + return -1; + } + + for (i = 0; i < nob; i++) + if (isspace(str[i])) + ltb->ltb_text[i] = ' '; + else + ltb->ltb_text[i] = str[i]; + + ltb->ltb_text[nob] = 0; + + list_add_tail(<b->ltb_list, &pending); + } + + if (*sep == '#') { + /* scan for separator */ + do { + sep++; + } while (*sep != 0 && !lnet_issep(*sep)); + } + + if (*sep == 0) + break; + + str = sep + 1; + } + + list_splice(&pending, tbs->prev); + return 0; +} + +static int +lnet_expand1tb(struct list_head *list, + char *str, char *sep1, char *sep2, + char *item, int itemlen) +{ + int len1 = (int)(sep1 - str); + int len2 = strlen(sep2 + 1); + struct lnet_text_buf_t *ltb; + + LASSERT(*sep1 == '['); + LASSERT(*sep2 == ']'); + + ltb = lnet_new_text_buf(len1 + itemlen + len2); + if (ltb == NULL) + return -ENOMEM; + + memcpy(ltb->ltb_text, str, len1); + memcpy(<b->ltb_text[len1], item, itemlen); + memcpy(<b->ltb_text[len1+itemlen], sep2 + 1, len2); + ltb->ltb_text[len1 + itemlen + len2] = 0; + + list_add_tail(<b->ltb_list, list); + return 0; +} + +static int +lnet_str2tbs_expand(struct list_head *tbs, char *str) +{ + char num[16]; + struct list_head pending; + char *sep; + char *sep2; + char *parsed; + char *enditem; + int lo; + int hi; + int stride; + int i; + int nob; + int scanned; + + 
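+	/*
+	 * (Editor's note) Expand one bracketed expression in 'str': each
+	 * comma-separated item between '[' and ']' is either a plain string
+	 * or a numeric range "lo-hi" with an optional "/stride"; e.g. a
+	 * hypothetical "tcp[0-3]" yields tcp0, tcp1, tcp2 and tcp3.  Returns
+	 * 1 if expansions were queued on 'tbs', 0 if there was nothing to
+	 * expand, and -1 on a syntax error.
+	 */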
INIT_LIST_HEAD(&pending); + + sep = strchr(str, '['); + if (sep == NULL) /* nothing to expand */ + return 0; + + sep2 = strchr(sep, ']'); + if (sep2 == NULL) + goto failed; + + for (parsed = sep; parsed < sep2; parsed = enditem) { + + enditem = ++parsed; + while (enditem < sep2 && *enditem != ',') + enditem++; + + if (enditem == parsed) /* no empty items */ + goto failed; + + if (sscanf(parsed, "%d-%d/%d%n", &lo, &hi, + &stride, &scanned) < 3) { + + if (sscanf(parsed, "%d-%d%n", &lo, &hi, &scanned) < 2) { + + /* simple string enumeration */ + if (lnet_expand1tb( + &pending, str, sep, sep2, + parsed, + (int)(enditem - parsed)) != 0) { + goto failed; + } + + continue; + } + + stride = 1; + } + + /* range expansion */ + + if (enditem != parsed + scanned) /* no trailing junk */ + goto failed; + + if (hi < 0 || lo < 0 || stride < 0 || hi < lo || + (hi - lo) % stride != 0) + goto failed; + + for (i = lo; i <= hi; i += stride) { + + snprintf(num, sizeof(num), "%d", i); + nob = strlen(num); + if (nob + 1 == sizeof(num)) + goto failed; + + if (lnet_expand1tb(&pending, str, sep, sep2, + num, nob) != 0) + goto failed; + } + } + + list_splice(&pending, tbs->prev); + return 1; + + failed: + lnet_free_text_bufs(&pending); + return -1; +} + +static int +lnet_parse_hops(char *str, unsigned int *hops) +{ + int len = strlen(str); + int nob = len; + + return (sscanf(str, "%u%n", hops, &nob) >= 1 && + nob == len && + *hops > 0 && *hops < 256); +} + +#define LNET_PRIORITY_SEPARATOR (':') + +static int +lnet_parse_priority(char *str, unsigned int *priority, char **token) +{ + int nob; + char *sep; + int len; + + sep = strchr(str, LNET_PRIORITY_SEPARATOR); + if (sep == NULL) { + *priority = 0; + return 0; + } + len = strlen(sep + 1); + + if ((sscanf((sep+1), "%u%n", priority, &nob) < 1) || (len != nob)) { + /* Update the caller's token pointer so it treats the found + priority as the token to report in the error message. */ + *token += sep - str + 1; + return -1; + } + + CDEBUG(D_NET, "gateway %s, priority %d, nob %d\n", str, *priority, nob); + + /* + * Change priority separator to \0 to be able to parse NID + */ + *sep = '\0'; + return 0; +} + +static int +lnet_parse_route(char *str, int *im_a_router) +{ + /* static scratch buffer OK (single threaded) */ + static char cmd[LNET_SINGLE_TEXTBUF_NOB]; + + struct list_head nets; + struct list_head gateways; + struct list_head *tmp1; + struct list_head *tmp2; + __u32 net; + lnet_nid_t nid; + struct lnet_text_buf_t *ltb; + int rc; + char *sep; + char *token = str; + int ntokens = 0; + int myrc = -1; + unsigned int hops; + int got_hops = 0; + unsigned int priority = 0; + + INIT_LIST_HEAD(&gateways); + INIT_LIST_HEAD(&nets); + + /* save a copy of the string for error messages */ + strncpy(cmd, str, sizeof(cmd) - 1); + cmd[sizeof(cmd) - 1] = 0; + + sep = str; + for (;;) { + /* scan for token start */ + while (isspace(*sep)) + sep++; + if (*sep == 0) { + if (ntokens < (got_hops ? 
3 : 2)) + goto token_error; + break; + } + + ntokens++; + token = sep++; + + /* scan for token end */ + while (*sep != 0 && !isspace(*sep)) + sep++; + if (*sep != 0) + *sep++ = 0; + + if (ntokens == 1) { + tmp2 = &nets; /* expanding nets */ + } else if (ntokens == 2 && + lnet_parse_hops(token, &hops)) { + got_hops = 1; /* got a hop count */ + continue; + } else { + tmp2 = &gateways; /* expanding gateways */ + } + + ltb = lnet_new_text_buf(strlen(token)); + if (ltb == NULL) + goto out; + + strcpy(ltb->ltb_text, token); + tmp1 = <b->ltb_list; + list_add_tail(tmp1, tmp2); + + while (tmp1 != tmp2) { + ltb = list_entry(tmp1, struct lnet_text_buf_t, + ltb_list); + + rc = lnet_str2tbs_expand(tmp1->next, ltb->ltb_text); + if (rc < 0) + goto token_error; + + tmp1 = tmp1->next; + + if (rc > 0) { /* expanded! */ + list_del(<b->ltb_list); + lnet_free_text_buf(ltb); + continue; + } + + if (ntokens == 1) { + net = libcfs_str2net(ltb->ltb_text); + if (net == LNET_NIDNET(LNET_NID_ANY) || + LNET_NETTYP(net) == LOLND) + goto token_error; + } else { + rc = lnet_parse_priority(ltb->ltb_text, + &priority, &token); + if (rc < 0) + goto token_error; + + nid = libcfs_str2nid(ltb->ltb_text); + if (nid == LNET_NID_ANY || + LNET_NETTYP(LNET_NIDNET(nid)) == LOLND) + goto token_error; + } + } + } + + if (!got_hops) + hops = 1; + + LASSERT(!list_empty(&nets)); + LASSERT(!list_empty(&gateways)); + + list_for_each(tmp1, &nets) { + ltb = list_entry(tmp1, struct lnet_text_buf_t, ltb_list); + net = libcfs_str2net(ltb->ltb_text); + LASSERT(net != LNET_NIDNET(LNET_NID_ANY)); + + list_for_each(tmp2, &gateways) { + ltb = list_entry(tmp2, struct lnet_text_buf_t, + ltb_list); + nid = libcfs_str2nid(ltb->ltb_text); + LASSERT(nid != LNET_NID_ANY); + + if (lnet_islocalnid(nid)) { + *im_a_router = 1; + continue; + } + + rc = lnet_add_route(net, hops, nid, priority); + if (rc != 0) { + CERROR("Can't create route to %s via %s\n", + libcfs_net2str(net), + libcfs_nid2str(nid)); + goto out; + } + } + } + + myrc = 0; + goto out; + + token_error: + lnet_syntax("routes", cmd, (int)(token - str), strlen(token)); + out: + lnet_free_text_bufs(&nets); + lnet_free_text_bufs(&gateways); + return myrc; +} + +static int +lnet_parse_route_tbs(struct list_head *tbs, int *im_a_router) +{ + struct lnet_text_buf_t *ltb; + + while (!list_empty(tbs)) { + ltb = list_entry(tbs->next, struct lnet_text_buf_t, ltb_list); + + if (lnet_parse_route(ltb->ltb_text, im_a_router) < 0) { + lnet_free_text_bufs(tbs); + return -EINVAL; + } + + list_del(<b->ltb_list); + lnet_free_text_buf(ltb); + } + + return 0; +} + +int +lnet_parse_routes(char *routes, int *im_a_router) +{ + struct list_head tbs; + int rc = 0; + + *im_a_router = 0; + + INIT_LIST_HEAD(&tbs); + + if (lnet_str2tbs_sep(&tbs, routes) < 0) { + CERROR("Error parsing routes\n"); + rc = -EINVAL; + } else { + rc = lnet_parse_route_tbs(&tbs, im_a_router); + } + + LASSERT(lnet_tbnob == 0); + return rc; +} + +static int +lnet_match_network_token(char *token, int len, __u32 *ipaddrs, int nip) +{ + LIST_HEAD(list); + int rc; + int i; + + rc = cfs_ip_addr_parse(token, len, &list); + if (rc != 0) + return rc; + + for (rc = i = 0; !rc && i < nip; i++) + rc = cfs_ip_addr_match(ipaddrs[i], &list); + + cfs_ip_addr_free(&list); + + return rc; +} + +static int +lnet_match_network_tokens(char *net_entry, __u32 *ipaddrs, int nip) +{ + static char tokens[LNET_SINGLE_TEXTBUF_NOB]; + + int matched = 0; + int ntokens = 0; + int len; + char *net = NULL; + char *sep; + char *token; + int rc; + + LASSERT(strlen(net_entry) < 
sizeof(tokens)); + + /* work on a copy of the string */ + strcpy(tokens, net_entry); + sep = tokens; + for (;;) { + /* scan for token start */ + while (isspace(*sep)) + sep++; + if (*sep == 0) + break; + + token = sep++; + + /* scan for token end */ + while (*sep != 0 && !isspace(*sep)) + sep++; + if (*sep != 0) + *sep++ = 0; + + if (ntokens++ == 0) { + net = token; + continue; + } + + len = strlen(token); + + rc = lnet_match_network_token(token, len, ipaddrs, nip); + if (rc < 0) { + lnet_syntax("ip2nets", net_entry, + (int)(token - tokens), len); + return rc; + } + + matched |= (rc != 0); + } + + if (!matched) + return 0; + + strcpy(net_entry, net); /* replace with matched net */ + return 1; +} + +static __u32 +lnet_netspec2net(char *netspec) +{ + char *bracket = strchr(netspec, '('); + __u32 net; + + if (bracket != NULL) + *bracket = 0; + + net = libcfs_str2net(netspec); + + if (bracket != NULL) + *bracket = '('; + + return net; +} + +static int +lnet_splitnets(char *source, struct list_head *nets) +{ + int offset = 0; + int offset2; + int len; + struct lnet_text_buf_t *tb; + struct lnet_text_buf_t *tb2; + struct list_head *t; + char *sep; + char *bracket; + __u32 net; + + LASSERT(!list_empty(nets)); + LASSERT(nets->next == nets->prev); /* single entry */ + + tb = list_entry(nets->next, struct lnet_text_buf_t, ltb_list); + + for (;;) { + sep = strchr(tb->ltb_text, ','); + bracket = strchr(tb->ltb_text, '('); + + if (sep != NULL && + bracket != NULL && + bracket < sep) { + /* netspec lists interfaces... */ + + offset2 = offset + (int)(bracket - tb->ltb_text); + len = strlen(bracket); + + bracket = strchr(bracket + 1, ')'); + + if (bracket == NULL || + !(bracket[1] == ',' || bracket[1] == 0)) { + lnet_syntax("ip2nets", source, offset2, len); + return -EINVAL; + } + + sep = (bracket[1] == 0) ? 
NULL : bracket + 1; + } + + if (sep != NULL) + *sep++ = 0; + + net = lnet_netspec2net(tb->ltb_text); + if (net == LNET_NIDNET(LNET_NID_ANY)) { + lnet_syntax("ip2nets", source, offset, + strlen(tb->ltb_text)); + return -EINVAL; + } + + list_for_each(t, nets) { + tb2 = list_entry(t, struct lnet_text_buf_t, ltb_list); + + if (tb2 == tb) + continue; + + if (net == lnet_netspec2net(tb2->ltb_text)) { + /* duplicate network */ + lnet_syntax("ip2nets", source, offset, + strlen(tb->ltb_text)); + return -EINVAL; + } + } + + if (sep == NULL) + return 0; + + offset += (int)(sep - tb->ltb_text); + tb2 = lnet_new_text_buf(strlen(sep)); + if (tb2 == NULL) + return -ENOMEM; + + strcpy(tb2->ltb_text, sep); + list_add_tail(&tb2->ltb_list, nets); + + tb = tb2; + } +} + +static int +lnet_match_networks(char **networksp, char *ip2nets, __u32 *ipaddrs, int nip) +{ + static char networks[LNET_SINGLE_TEXTBUF_NOB]; + static char source[LNET_SINGLE_TEXTBUF_NOB]; + + struct list_head raw_entries; + struct list_head matched_nets; + struct list_head current_nets; + struct list_head *t; + struct list_head *t2; + struct lnet_text_buf_t *tb; + struct lnet_text_buf_t *tb2; + __u32 net1; + __u32 net2; + int len; + int count; + int dup; + int rc; + + INIT_LIST_HEAD(&raw_entries); + if (lnet_str2tbs_sep(&raw_entries, ip2nets) < 0) { + CERROR("Error parsing ip2nets\n"); + LASSERT(lnet_tbnob == 0); + return -EINVAL; + } + + INIT_LIST_HEAD(&matched_nets); + INIT_LIST_HEAD(¤t_nets); + networks[0] = 0; + count = 0; + len = 0; + rc = 0; + + while (!list_empty(&raw_entries)) { + tb = list_entry(raw_entries.next, struct lnet_text_buf_t, + ltb_list); + + strncpy(source, tb->ltb_text, sizeof(source)-1); + source[sizeof(source)-1] = 0; + + /* replace ltb_text with the network(s) add on match */ + rc = lnet_match_network_tokens(tb->ltb_text, ipaddrs, nip); + if (rc < 0) + break; + + list_del(&tb->ltb_list); + + if (rc == 0) { /* no match */ + lnet_free_text_buf(tb); + continue; + } + + /* split into separate networks */ + INIT_LIST_HEAD(¤t_nets); + list_add(&tb->ltb_list, ¤t_nets); + rc = lnet_splitnets(source, ¤t_nets); + if (rc < 0) + break; + + dup = 0; + list_for_each(t, ¤t_nets) { + tb = list_entry(t, struct lnet_text_buf_t, ltb_list); + net1 = lnet_netspec2net(tb->ltb_text); + LASSERT(net1 != LNET_NIDNET(LNET_NID_ANY)); + + list_for_each(t2, &matched_nets) { + tb2 = list_entry(t2, struct lnet_text_buf_t, + ltb_list); + net2 = lnet_netspec2net(tb2->ltb_text); + LASSERT(net2 != LNET_NIDNET(LNET_NID_ANY)); + + if (net1 == net2) { + dup = 1; + break; + } + } + + if (dup) + break; + } + + if (dup) { + lnet_free_text_bufs(¤t_nets); + continue; + } + + list_for_each_safe(t, t2, ¤t_nets) { + tb = list_entry(t, struct lnet_text_buf_t, ltb_list); + + list_del(&tb->ltb_list); + list_add_tail(&tb->ltb_list, &matched_nets); + + len += snprintf(networks + len, sizeof(networks) - len, + "%s%s", (len == 0) ? 
"" : ",", + tb->ltb_text); + + if (len >= sizeof(networks)) { + CERROR("Too many matched networks\n"); + rc = -E2BIG; + goto out; + } + } + + count++; + } + + out: + lnet_free_text_bufs(&raw_entries); + lnet_free_text_bufs(&matched_nets); + lnet_free_text_bufs(¤t_nets); + LASSERT(lnet_tbnob == 0); + + if (rc < 0) + return rc; + + *networksp = networks; + return count; +} + +static void +lnet_ipaddr_free_enumeration(__u32 *ipaddrs, int nip) +{ + LIBCFS_FREE(ipaddrs, nip * sizeof(*ipaddrs)); +} + +static int +lnet_ipaddr_enumerate(__u32 **ipaddrsp) +{ + int up; + __u32 netmask; + __u32 *ipaddrs; + __u32 *ipaddrs2; + int nip; + char **ifnames; + int nif = libcfs_ipif_enumerate(&ifnames); + int i; + int rc; + + if (nif <= 0) + return nif; + + LIBCFS_ALLOC(ipaddrs, nif * sizeof(*ipaddrs)); + if (ipaddrs == NULL) { + CERROR("Can't allocate ipaddrs[%d]\n", nif); + libcfs_ipif_free_enumeration(ifnames, nif); + return -ENOMEM; + } + + for (i = nip = 0; i < nif; i++) { + if (!strcmp(ifnames[i], "lo")) + continue; + + rc = libcfs_ipif_query(ifnames[i], &up, + &ipaddrs[nip], &netmask); + if (rc != 0) { + CWARN("Can't query interface %s: %d\n", + ifnames[i], rc); + continue; + } + + if (!up) { + CWARN("Ignoring interface %s: it's down\n", + ifnames[i]); + continue; + } + + nip++; + } + + libcfs_ipif_free_enumeration(ifnames, nif); + + if (nip == nif) { + *ipaddrsp = ipaddrs; + } else { + if (nip > 0) { + LIBCFS_ALLOC(ipaddrs2, nip * sizeof(*ipaddrs2)); + if (ipaddrs2 == NULL) { + CERROR("Can't allocate ipaddrs[%d]\n", nip); + nip = -ENOMEM; + } else { + memcpy(ipaddrs2, ipaddrs, + nip * sizeof(*ipaddrs)); + *ipaddrsp = ipaddrs2; + rc = nip; + } + } + lnet_ipaddr_free_enumeration(ipaddrs, nif); + } + return nip; +} + +int +lnet_parse_ip2nets(char **networksp, char *ip2nets) +{ + __u32 *ipaddrs = NULL; + int nip = lnet_ipaddr_enumerate(&ipaddrs); + int rc; + + if (nip < 0) { + LCONSOLE_ERROR_MSG(0x117, + "Error %d enumerating local IP interfaces for ip2nets to match\n", + nip); + return nip; + } + + if (nip == 0) { + LCONSOLE_ERROR_MSG(0x118, + "No local IP interfaces for ip2nets to match\n"); + return -ENOENT; + } + + rc = lnet_match_networks(networksp, ip2nets, ipaddrs, nip); + lnet_ipaddr_free_enumeration(ipaddrs, nip); + + if (rc < 0) { + LCONSOLE_ERROR_MSG(0x119, "Error %d parsing ip2nets\n", rc); + return rc; + } + + if (rc == 0) { + LCONSOLE_ERROR_MSG(0x11a, + "ip2nets does not match any local IP interfaces\n"); + return -ENOENT; + } + + return 0; +} + +int +lnet_set_ip_niaddr(lnet_ni_t *ni) +{ + __u32 net = LNET_NIDNET(ni->ni_nid); + char **names; + int n; + __u32 ip; + __u32 netmask; + int up; + int i; + int rc; + + /* Convenience for LNDs that use the IP address of a local interface as + * the local address part of their NID */ + + if (ni->ni_interfaces[0] != NULL) { + + CLASSERT(LNET_MAX_INTERFACES > 1); + + if (ni->ni_interfaces[1] != NULL) { + CERROR("Net %s doesn't support multiple interfaces\n", + libcfs_net2str(net)); + return -EPERM; + } + + rc = libcfs_ipif_query(ni->ni_interfaces[0], + &up, &ip, &netmask); + if (rc != 0) { + CERROR("Net %s can't query interface %s: %d\n", + libcfs_net2str(net), ni->ni_interfaces[0], rc); + return -EPERM; + } + + if (!up) { + CERROR("Net %s can't use interface %s: it's down\n", + libcfs_net2str(net), ni->ni_interfaces[0]); + return -ENETDOWN; + } + + ni->ni_nid = LNET_MKNID(net, ip); + return 0; + } + + n = libcfs_ipif_enumerate(&names); + if (n <= 0) { + CERROR("Net %s can't enumerate interfaces: %d\n", + libcfs_net2str(net), n); + return 0; + } + + 
for (i = 0; i < n; i++) { + if (!strcmp(names[i], "lo")) /* skip the loopback IF */ + continue; + + rc = libcfs_ipif_query(names[i], &up, &ip, &netmask); + + if (rc != 0) { + CWARN("Net %s can't query interface %s: %d\n", + libcfs_net2str(net), names[i], rc); + continue; + } + + if (!up) { + CWARN("Net %s ignoring interface %s (down)\n", + libcfs_net2str(net), names[i]); + continue; + } + + libcfs_ipif_free_enumeration(names, n); + ni->ni_nid = LNET_MKNID(net, ip); + return 0; + } + + CERROR("Net %s can't find any interfaces\n", libcfs_net2str(net)); + libcfs_ipif_free_enumeration(names, n); + return -ENOENT; +} +EXPORT_SYMBOL(lnet_set_ip_niaddr); diff --git a/kernel/drivers/staging/lustre/lnet/lnet/lib-eq.c b/kernel/drivers/staging/lustre/lnet/lnet/lib-eq.c new file mode 100644 index 000000000..5470148f5 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/lnet/lib-eq.c @@ -0,0 +1,441 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/lnet/lib-eq.c + * + * Library level Event queue management routines + */ + +#define DEBUG_SUBSYSTEM S_LNET +#include "../../include/linux/lnet/lib-lnet.h" + +/** + * Create an event queue that has room for \a count number of events. + * + * The event queue is circular and older events will be overwritten by new + * ones if they are not removed in time by the user using the functions + * LNetEQGet(), LNetEQWait(), or LNetEQPoll(). It is up to the user to + * determine the appropriate size of the event queue to prevent this loss + * of events. Note that when EQ handler is specified in \a callback, no + * event loss can happen, since the handler is run for each event deposited + * into the EQ. + * + * \param count The number of events to be stored in the event queue. It + * will be rounded up to the next power of two. + * \param callback A handler function that runs when an event is deposited + * into the EQ. The constant value LNET_EQ_HANDLER_NONE can be used to + * indicate that no event handler is desired. + * \param handle On successful return, this location will hold a handle for + * the newly created EQ. + * + * \retval 0 On success. + * \retval -EINVAL If an parameter is not valid. + * \retval -ENOMEM If memory for the EQ can't be allocated. 
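+ *
+ * A minimal polling-mode sketch (editor's addition, not part of this patch;
+ * error handling is omitted):
+ *
+ *	lnet_handle_eq_t eqh;
+ *	int rc;
+ *
+ *	rc = LNetEQAlloc(64, LNET_EQ_HANDLER_NONE, &eqh);
+ *	if (rc == 0) {
+ *		... bind MDs with md.eq_handle = eqh, consume events with
+ *		LNetEQPoll() or LNetEQWait(), then release with LNetEQFree() ...
+ *	}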
+ * + * \see lnet_eq_handler_t for the discussion on EQ handler semantics. + */ +int +LNetEQAlloc(unsigned int count, lnet_eq_handler_t callback, + lnet_handle_eq_t *handle) +{ + lnet_eq_t *eq; + + LASSERT(the_lnet.ln_init); + LASSERT(the_lnet.ln_refcount > 0); + + /* We need count to be a power of 2 so that when eq_{enq,deq}_seq + * overflow, they don't skip entries, so the queue has the same + * apparent capacity at all times */ + + count = cfs_power2_roundup(count); + + if (callback != LNET_EQ_HANDLER_NONE && count != 0) + CWARN("EQ callback is guaranteed to get every event, do you still want to set eqcount %d for polling event which will have locking overhead? Please contact with developer to confirm\n", count); + + /* count can be 0 if only need callback, we can eliminate + * overhead of enqueue event */ + if (count == 0 && callback == LNET_EQ_HANDLER_NONE) + return -EINVAL; + + eq = lnet_eq_alloc(); + if (eq == NULL) + return -ENOMEM; + + if (count != 0) { + LIBCFS_ALLOC(eq->eq_events, count * sizeof(lnet_event_t)); + if (eq->eq_events == NULL) + goto failed; + /* NB allocator has set all event sequence numbers to 0, + * so all them should be earlier than eq_deq_seq */ + } + + eq->eq_deq_seq = 1; + eq->eq_enq_seq = 1; + eq->eq_size = count; + eq->eq_callback = callback; + + eq->eq_refs = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*eq->eq_refs[0])); + if (eq->eq_refs == NULL) + goto failed; + + /* MUST hold both exclusive lnet_res_lock */ + lnet_res_lock(LNET_LOCK_EX); + /* NB: hold lnet_eq_wait_lock for EQ link/unlink, so we can do + * both EQ lookup and poll event with only lnet_eq_wait_lock */ + lnet_eq_wait_lock(); + + lnet_res_lh_initialize(&the_lnet.ln_eq_container, &eq->eq_lh); + list_add(&eq->eq_list, &the_lnet.ln_eq_container.rec_active); + + lnet_eq_wait_unlock(); + lnet_res_unlock(LNET_LOCK_EX); + + lnet_eq2handle(handle, eq); + return 0; + +failed: + if (eq->eq_events != NULL) + LIBCFS_FREE(eq->eq_events, count * sizeof(lnet_event_t)); + + if (eq->eq_refs != NULL) + cfs_percpt_free(eq->eq_refs); + + lnet_eq_free(eq); + return -ENOMEM; +} +EXPORT_SYMBOL(LNetEQAlloc); + +/** + * Release the resources associated with an event queue if it's idle; + * otherwise do nothing and it's up to the user to try again. + * + * \param eqh A handle for the event queue to be released. + * + * \retval 0 If the EQ is not in use and freed. + * \retval -ENOENT If \a eqh does not point to a valid EQ. + * \retval -EBUSY If the EQ is still in use by some MDs. 
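+ *
+ * (Editor's note) A typical teardown first unlinks every MD bound to the EQ,
+ * waits for the corresponding unlink events, and only then calls
+ * LNetEQFree(); lnet_ping_target_fini() in this file follows that pattern.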
+ */ +int +LNetEQFree(lnet_handle_eq_t eqh) +{ + struct lnet_eq *eq; + lnet_event_t *events = NULL; + int **refs = NULL; + int *ref; + int rc = 0; + int size = 0; + int i; + + LASSERT(the_lnet.ln_init); + LASSERT(the_lnet.ln_refcount > 0); + + lnet_res_lock(LNET_LOCK_EX); + /* NB: hold lnet_eq_wait_lock for EQ link/unlink, so we can do + * both EQ lookup and poll event with only lnet_eq_wait_lock */ + lnet_eq_wait_lock(); + + eq = lnet_handle2eq(&eqh); + if (eq == NULL) { + rc = -ENOENT; + goto out; + } + + cfs_percpt_for_each(ref, i, eq->eq_refs) { + LASSERT(*ref >= 0); + if (*ref == 0) + continue; + + CDEBUG(D_NET, "Event equeue (%d: %d) busy on destroy.\n", + i, *ref); + rc = -EBUSY; + goto out; + } + + /* stash for free after lock dropped */ + events = eq->eq_events; + size = eq->eq_size; + refs = eq->eq_refs; + + lnet_res_lh_invalidate(&eq->eq_lh); + list_del(&eq->eq_list); + lnet_eq_free_locked(eq); + out: + lnet_eq_wait_unlock(); + lnet_res_unlock(LNET_LOCK_EX); + + if (events != NULL) + LIBCFS_FREE(events, size * sizeof(lnet_event_t)); + if (refs != NULL) + cfs_percpt_free(refs); + + return rc; +} +EXPORT_SYMBOL(LNetEQFree); + +void +lnet_eq_enqueue_event(lnet_eq_t *eq, lnet_event_t *ev) +{ + /* MUST called with resource lock hold but w/o lnet_eq_wait_lock */ + int index; + + if (eq->eq_size == 0) { + LASSERT(eq->eq_callback != LNET_EQ_HANDLER_NONE); + eq->eq_callback(ev); + return; + } + + lnet_eq_wait_lock(); + ev->sequence = eq->eq_enq_seq++; + + LASSERT(eq->eq_size == LOWEST_BIT_SET(eq->eq_size)); + index = ev->sequence & (eq->eq_size - 1); + + eq->eq_events[index] = *ev; + + if (eq->eq_callback != LNET_EQ_HANDLER_NONE) + eq->eq_callback(ev); + + /* Wake anyone waiting in LNetEQPoll() */ + if (waitqueue_active(&the_lnet.ln_eq_waitq)) + wake_up_all(&the_lnet.ln_eq_waitq); + lnet_eq_wait_unlock(); +} + +static int +lnet_eq_dequeue_event(lnet_eq_t *eq, lnet_event_t *ev) +{ + int new_index = eq->eq_deq_seq & (eq->eq_size - 1); + lnet_event_t *new_event = &eq->eq_events[new_index]; + int rc; + + /* must called with lnet_eq_wait_lock hold */ + if (LNET_SEQ_GT(eq->eq_deq_seq, new_event->sequence)) + return 0; + + /* We've got a new event... */ + *ev = *new_event; + + CDEBUG(D_INFO, "event: %p, sequence: %lu, eq->size: %u\n", + new_event, eq->eq_deq_seq, eq->eq_size); + + /* ...but did it overwrite an event we've not seen yet? */ + if (eq->eq_deq_seq == new_event->sequence) { + rc = 1; + } else { + /* don't complain with CERROR: some EQs are sized small + * anyway; if it's important, the caller should complain */ + CDEBUG(D_NET, "Event Queue Overflow: eq seq %lu ev seq %lu\n", + eq->eq_deq_seq, new_event->sequence); + rc = -EOVERFLOW; + } + + eq->eq_deq_seq = new_event->sequence + 1; + return rc; +} + +/** + * A nonblocking function that can be used to get the next event in an EQ. + * If an event handler is associated with the EQ, the handler will run before + * this function returns successfully. The event is removed from the queue. + * + * \param eventq A handle for the event queue. + * \param event On successful return (1 or -EOVERFLOW), this location will + * hold the next event in the EQ. + * + * \retval 0 No pending event in the EQ. + * \retval 1 Indicates success. + * \retval -ENOENT If \a eventq does not point to a valid EQ. + * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that + * at least one event between this event and the last event obtained from the + * EQ has been dropped due to limited space in the EQ. 
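+ *
+ * A minimal sketch (editor's addition) that drains pending events, assuming
+ * 'eqh' came from a successful LNetEQAlloc() and 'handle_event' is a
+ * hypothetical consumer:
+ *
+ *	lnet_event_t ev;
+ *	int rc;
+ *
+ *	while ((rc = LNetEQGet(eqh, &ev)) == 1 || rc == -EOVERFLOW)
+ *		handle_event(&ev);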
+ */ +int +LNetEQGet(lnet_handle_eq_t eventq, lnet_event_t *event) +{ + int which; + + return LNetEQPoll(&eventq, 1, 0, + event, &which); +} +EXPORT_SYMBOL(LNetEQGet); + +/** + * Block the calling process until there is an event in the EQ. + * If an event handler is associated with the EQ, the handler will run before + * this function returns successfully. This function returns the next event + * in the EQ and removes it from the EQ. + * + * \param eventq A handle for the event queue. + * \param event On successful return (1 or -EOVERFLOW), this location will + * hold the next event in the EQ. + * + * \retval 1 Indicates success. + * \retval -ENOENT If \a eventq does not point to a valid EQ. + * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that + * at least one event between this event and the last event obtained from the + * EQ has been dropped due to limited space in the EQ. + */ +int +LNetEQWait(lnet_handle_eq_t eventq, lnet_event_t *event) +{ + int which; + + return LNetEQPoll(&eventq, 1, LNET_TIME_FOREVER, + event, &which); +} +EXPORT_SYMBOL(LNetEQWait); + + +static int +lnet_eq_wait_locked(int *timeout_ms) +__must_hold(&the_lnet.ln_eq_wait_lock) +{ + int tms = *timeout_ms; + int wait; + wait_queue_t wl; + unsigned long now; + + if (tms == 0) + return -1; /* don't want to wait and no new event */ + + init_waitqueue_entry(&wl, current); + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&the_lnet.ln_eq_waitq, &wl); + + lnet_eq_wait_unlock(); + + if (tms < 0) { + schedule(); + + } else { + struct timeval tv; + + now = cfs_time_current(); + schedule_timeout(cfs_time_seconds(tms) / 1000); + cfs_duration_usec(cfs_time_sub(cfs_time_current(), now), &tv); + tms -= (int)(tv.tv_sec * 1000 + tv.tv_usec / 1000); + if (tms < 0) /* no more wait but may have new event */ + tms = 0; + } + + wait = tms != 0; /* might need to call here again */ + *timeout_ms = tms; + + lnet_eq_wait_lock(); + remove_wait_queue(&the_lnet.ln_eq_waitq, &wl); + + return wait; +} + + + +/** + * Block the calling process until there's an event from a set of EQs or + * timeout happens. + * + * If an event handler is associated with the EQ, the handler will run before + * this function returns successfully, in which case the corresponding event + * is consumed. + * + * LNetEQPoll() provides a timeout to allow applications to poll, block for a + * fixed period, or block indefinitely. + * + * \param eventqs,neq An array of EQ handles, and size of the array. + * \param timeout_ms Time in milliseconds to wait for an event to occur on + * one of the EQs. The constant LNET_TIME_FOREVER can be used to indicate an + * infinite timeout. + * \param event,which On successful return (1 or -EOVERFLOW), \a event will + * hold the next event in the EQs, and \a which will contain the index of the + * EQ from which the event was taken. + * + * \retval 0 No pending event in the EQs after timeout. + * \retval 1 Indicates success. + * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that + * at least one event between this event and the last event obtained from the + * EQ indicated by \a which has been dropped due to limited space in the EQ. + * \retval -ENOENT If there's an invalid handle in \a eventqs. 
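+ *
+ * A minimal sketch (editor's addition), assuming 'eqh' came from a successful
+ * LNetEQAlloc():
+ *
+ *	lnet_event_t ev;
+ *	int which;
+ *	int rc;
+ *
+ *	rc = LNetEQPoll(&eqh, 1, 1000, &ev, &which);
+ *	if (rc == 1 || rc == -EOVERFLOW)
+ *		... 'ev' holds the next event; 'which' is 0 here ...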
+ */ +int +LNetEQPoll(lnet_handle_eq_t *eventqs, int neq, int timeout_ms, + lnet_event_t *event, int *which) +{ + int wait = 1; + int rc; + int i; + + LASSERT(the_lnet.ln_init); + LASSERT(the_lnet.ln_refcount > 0); + + if (neq < 1) + return -ENOENT; + + lnet_eq_wait_lock(); + + for (;;) { + for (i = 0; i < neq; i++) { + lnet_eq_t *eq = lnet_handle2eq(&eventqs[i]); + + if (eq == NULL) { + lnet_eq_wait_unlock(); + return -ENOENT; + } + + rc = lnet_eq_dequeue_event(eq, event); + if (rc != 0) { + lnet_eq_wait_unlock(); + *which = i; + return rc; + } + } + + if (wait == 0) + break; + + /* + * return value of lnet_eq_wait_locked: + * -1 : did nothing and it's sure no new event + * 1 : sleep inside and wait until new event + * 0 : don't want to wait anymore, but might have new event + * so need to call dequeue again + */ + wait = lnet_eq_wait_locked(&timeout_ms); + if (wait < 0) /* no new event */ + break; + } + + lnet_eq_wait_unlock(); + return 0; +} diff --git a/kernel/drivers/staging/lustre/lnet/lnet/lib-md.c b/kernel/drivers/staging/lustre/lnet/lnet/lib-md.c new file mode 100644 index 000000000..89d660fef --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/lnet/lib-md.c @@ -0,0 +1,454 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/lnet/lib-md.c + * + * Memory Descriptor management routines + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include "../../include/linux/lnet/lib-lnet.h" + +/* must be called with lnet_res_lock held */ +void +lnet_md_unlink(lnet_libmd_t *md) +{ + if ((md->md_flags & LNET_MD_FLAG_ZOMBIE) == 0) { + /* first unlink attempt... 
*/ + lnet_me_t *me = md->md_me; + + md->md_flags |= LNET_MD_FLAG_ZOMBIE; + + /* Disassociate from ME (if any), + * and unlink it if it was created + * with LNET_UNLINK */ + if (me != NULL) { + /* detach MD from portal */ + lnet_ptl_detach_md(me, md); + if (me->me_unlink == LNET_UNLINK) + lnet_me_unlink(me); + } + + /* ensure all future handle lookups fail */ + lnet_res_lh_invalidate(&md->md_lh); + } + + if (md->md_refcount != 0) { + CDEBUG(D_NET, "Queueing unlink of md %p\n", md); + return; + } + + CDEBUG(D_NET, "Unlinking md %p\n", md); + + if (md->md_eq != NULL) { + int cpt = lnet_cpt_of_cookie(md->md_lh.lh_cookie); + + LASSERT(*md->md_eq->eq_refs[cpt] > 0); + (*md->md_eq->eq_refs[cpt])--; + } + + LASSERT(!list_empty(&md->md_list)); + list_del_init(&md->md_list); + lnet_md_free_locked(md); +} + +static int +lnet_md_build(lnet_libmd_t *lmd, lnet_md_t *umd, int unlink) +{ + int i; + unsigned int niov; + int total_length = 0; + + lmd->md_me = NULL; + lmd->md_start = umd->start; + lmd->md_offset = 0; + lmd->md_max_size = umd->max_size; + lmd->md_options = umd->options; + lmd->md_user_ptr = umd->user_ptr; + lmd->md_eq = NULL; + lmd->md_threshold = umd->threshold; + lmd->md_refcount = 0; + lmd->md_flags = (unlink == LNET_UNLINK) ? LNET_MD_FLAG_AUTO_UNLINK : 0; + + if ((umd->options & LNET_MD_IOVEC) != 0) { + + if ((umd->options & LNET_MD_KIOV) != 0) /* Can't specify both */ + return -EINVAL; + + lmd->md_niov = niov = umd->length; + memcpy(lmd->md_iov.iov, umd->start, + niov * sizeof(lmd->md_iov.iov[0])); + + for (i = 0; i < (int)niov; i++) { + /* We take the base address on trust */ + /* invalid length */ + if (lmd->md_iov.iov[i].iov_len <= 0) + return -EINVAL; + + total_length += lmd->md_iov.iov[i].iov_len; + } + + lmd->md_length = total_length; + + if ((umd->options & LNET_MD_MAX_SIZE) != 0 && /* use max size */ + (umd->max_size < 0 || + umd->max_size > total_length)) /* illegal max_size */ + return -EINVAL; + + } else if ((umd->options & LNET_MD_KIOV) != 0) { + lmd->md_niov = niov = umd->length; + memcpy(lmd->md_iov.kiov, umd->start, + niov * sizeof(lmd->md_iov.kiov[0])); + + for (i = 0; i < (int)niov; i++) { + /* We take the page pointer on trust */ + if (lmd->md_iov.kiov[i].kiov_offset + + lmd->md_iov.kiov[i].kiov_len > PAGE_CACHE_SIZE) + return -EINVAL; /* invalid length */ + + total_length += lmd->md_iov.kiov[i].kiov_len; + } + + lmd->md_length = total_length; + + if ((umd->options & LNET_MD_MAX_SIZE) != 0 && /* max size used */ + (umd->max_size < 0 || + umd->max_size > total_length)) /* illegal max_size */ + return -EINVAL; + } else { /* contiguous */ + lmd->md_length = umd->length; + lmd->md_niov = niov = 1; + lmd->md_iov.iov[0].iov_base = umd->start; + lmd->md_iov.iov[0].iov_len = umd->length; + + if ((umd->options & LNET_MD_MAX_SIZE) != 0 && /* max size used */ + (umd->max_size < 0 || + umd->max_size > (int)umd->length)) /* illegal max_size */ + return -EINVAL; + } + + return 0; +} + +/* must be called with resource lock held */ +static int +lnet_md_link(lnet_libmd_t *md, lnet_handle_eq_t eq_handle, int cpt) +{ + struct lnet_res_container *container = the_lnet.ln_md_containers[cpt]; + + /* NB we are passed an allocated, but inactive md. + * if we return success, caller may lnet_md_unlink() it. + * otherwise caller may only lnet_md_free() it. + */ + /* This implementation doesn't know how to create START events or + * disable END events. Best to LASSERT our caller is compliant so + * we find out quickly... 
*/ + /* TODO - reevaluate what should be here in light of + * the removal of the start and end events + * maybe there we shouldn't even allow LNET_EQ_NONE!) + * LASSERT (eq == NULL); + */ + if (!LNetHandleIsInvalid(eq_handle)) { + md->md_eq = lnet_handle2eq(&eq_handle); + + if (md->md_eq == NULL) + return -ENOENT; + + (*md->md_eq->eq_refs[cpt])++; + } + + lnet_res_lh_initialize(container, &md->md_lh); + + LASSERT(list_empty(&md->md_list)); + list_add(&md->md_list, &container->rec_active); + + return 0; +} + +/* must be called with lnet_res_lock held */ +void +lnet_md_deconstruct(lnet_libmd_t *lmd, lnet_md_t *umd) +{ + /* NB this doesn't copy out all the iov entries so when a + * discontiguous MD is copied out, the target gets to know the + * original iov pointer (in start) and the number of entries it had + * and that's all. + */ + umd->start = lmd->md_start; + umd->length = ((lmd->md_options & + (LNET_MD_IOVEC | LNET_MD_KIOV)) == 0) ? + lmd->md_length : lmd->md_niov; + umd->threshold = lmd->md_threshold; + umd->max_size = lmd->md_max_size; + umd->options = lmd->md_options; + umd->user_ptr = lmd->md_user_ptr; + lnet_eq2handle(&umd->eq_handle, lmd->md_eq); +} + +static int +lnet_md_validate(lnet_md_t *umd) +{ + if (umd->start == NULL && umd->length != 0) { + CERROR("MD start pointer can not be NULL with length %u\n", + umd->length); + return -EINVAL; + } + + if ((umd->options & (LNET_MD_KIOV | LNET_MD_IOVEC)) != 0 && + umd->length > LNET_MAX_IOV) { + CERROR("Invalid option: too many fragments %u, %d max\n", + umd->length, LNET_MAX_IOV); + return -EINVAL; + } + + return 0; +} + +/** + * Create a memory descriptor and attach it to a ME + * + * \param meh A handle for a ME to associate the new MD with. + * \param umd Provides initial values for the user-visible parts of a MD. + * Other than its use for initialization, there is no linkage between this + * structure and the MD maintained by the LNet. + * \param unlink A flag to indicate whether the MD is automatically unlinked + * when it becomes inactive, either because the operation threshold drops to + * zero or because the available memory becomes less than \a umd.max_size. + * (Note that the check for unlinking a MD only occurs after the completion + * of a successful operation on the MD.) The value LNET_UNLINK enables auto + * unlinking; the value LNET_RETAIN disables it. + * \param handle On successful returns, a handle to the newly created MD is + * saved here. This handle can be used later in LNetMDUnlink(). + * + * \retval 0 On success. + * \retval -EINVAL If \a umd is not valid. + * \retval -ENOMEM If new MD cannot be allocated. + * \retval -ENOENT Either \a meh or \a umd.eq_handle does not point to a + * valid object. Note that it's OK to supply a NULL \a umd.eq_handle by + * calling LNetInvalidateHandle() on it. + * \retval -EBUSY If the ME pointed to by \a meh is already associated with + * a MD. 
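+ *
+ * A minimal usage sketch, assuming \a meh came from LNetMEAttach(),
+ * \a eqh from LNetEQAlloc(), and \a buf / \a buflen describe a
+ * caller-owned buffer that should accept incoming PUTs:
+ *
+ *	lnet_md_t umd;
+ *	lnet_handle_md_t mdh;
+ *	int rc;
+ *
+ *	memset(&umd, 0, sizeof(umd));
+ *	umd.start = buf;
+ *	umd.length = buflen;
+ *	umd.threshold = LNET_MD_THRESH_INF;
+ *	umd.options = LNET_MD_OP_PUT;
+ *	umd.eq_handle = eqh;
+ *	rc = LNetMDAttach(meh, umd, LNET_UNLINK, &mdh);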
+ */ +int +LNetMDAttach(lnet_handle_me_t meh, lnet_md_t umd, + lnet_unlink_t unlink, lnet_handle_md_t *handle) +{ + LIST_HEAD(matches); + LIST_HEAD(drops); + struct lnet_me *me; + struct lnet_libmd *md; + int cpt; + int rc; + + LASSERT(the_lnet.ln_init); + LASSERT(the_lnet.ln_refcount > 0); + + if (lnet_md_validate(&umd) != 0) + return -EINVAL; + + if ((umd.options & (LNET_MD_OP_GET | LNET_MD_OP_PUT)) == 0) { + CERROR("Invalid option: no MD_OP set\n"); + return -EINVAL; + } + + md = lnet_md_alloc(&umd); + if (md == NULL) + return -ENOMEM; + + rc = lnet_md_build(md, &umd, unlink); + cpt = lnet_cpt_of_cookie(meh.cookie); + + lnet_res_lock(cpt); + if (rc != 0) + goto failed; + + me = lnet_handle2me(&meh); + if (me == NULL) + rc = -ENOENT; + else if (me->me_md != NULL) + rc = -EBUSY; + else + rc = lnet_md_link(md, umd.eq_handle, cpt); + + if (rc != 0) + goto failed; + + /* attach this MD to portal of ME and check if it matches any + * blocked msgs on this portal */ + lnet_ptl_attach_md(me, md, &matches, &drops); + + lnet_md2handle(handle, md); + + lnet_res_unlock(cpt); + + lnet_drop_delayed_msg_list(&drops, "Bad match"); + lnet_recv_delayed_msg_list(&matches); + + return 0; + + failed: + lnet_md_free_locked(md); + + lnet_res_unlock(cpt); + return rc; +} +EXPORT_SYMBOL(LNetMDAttach); + +/** + * Create a "free floating" memory descriptor - a MD that is not associated + * with a ME. Such MDs are usually used in LNetPut() and LNetGet() operations. + * + * \param umd,unlink See the discussion for LNetMDAttach(). + * \param handle On successful returns, a handle to the newly created MD is + * saved here. This handle can be used later in LNetMDUnlink(), LNetPut(), + * and LNetGet() operations. + * + * \retval 0 On success. + * \retval -EINVAL If \a umd is not valid. + * \retval -ENOMEM If new MD cannot be allocated. + * \retval -ENOENT \a umd.eq_handle does not point to a valid EQ. Note that + * it's OK to supply a NULL \a umd.eq_handle by calling + * LNetInvalidateHandle() on it. + */ +int +LNetMDBind(lnet_md_t umd, lnet_unlink_t unlink, lnet_handle_md_t *handle) +{ + lnet_libmd_t *md; + int cpt; + int rc; + + LASSERT(the_lnet.ln_init); + LASSERT(the_lnet.ln_refcount > 0); + + if (lnet_md_validate(&umd) != 0) + return -EINVAL; + + if ((umd.options & (LNET_MD_OP_GET | LNET_MD_OP_PUT)) != 0) { + CERROR("Invalid option: GET|PUT illegal on active MDs\n"); + return -EINVAL; + } + + md = lnet_md_alloc(&umd); + if (md == NULL) + return -ENOMEM; + + rc = lnet_md_build(md, &umd, unlink); + + cpt = lnet_res_lock_current(); + if (rc != 0) + goto failed; + + rc = lnet_md_link(md, umd.eq_handle, cpt); + if (rc != 0) + goto failed; + + lnet_md2handle(handle, md); + + lnet_res_unlock(cpt); + return 0; + + failed: + lnet_md_free_locked(md); + + lnet_res_unlock(cpt); + return rc; +} +EXPORT_SYMBOL(LNetMDBind); + +/** + * Unlink the memory descriptor from any ME it may be linked to and release + * the internal resources associated with it. As a result, active messages + * associated with the MD may get aborted. + * + * This function does not free the memory region associated with the MD; + * i.e., the memory the user allocated for this MD. If the ME associated with + * this MD is not NULL and was created with auto unlink enabled, the ME is + * unlinked as well (see LNetMEAttach()). + * + * Explicitly unlinking a MD via this function call has the same behavior as + * a MD that has been automatically unlinked, except that no LNET_EVENT_UNLINK + * is generated in the latter case. 
+ * + * An unlinked event can be reported in two ways: + * - If there's no pending operations on the MD, it's unlinked immediately + * and an LNET_EVENT_UNLINK event is logged before this function returns. + * - Otherwise, the MD is only marked for deletion when this function + * returns, and the unlinked event will be piggybacked on the event of + * the completion of the last operation by setting the unlinked field of + * the event. No dedicated LNET_EVENT_UNLINK event is generated. + * + * Note that in both cases the unlinked field of the event is always set; no + * more event will happen on the MD after such an event is logged. + * + * \param mdh A handle for the MD to be unlinked. + * + * \retval 0 On success. + * \retval -ENOENT If \a mdh does not point to a valid MD object. + */ +int +LNetMDUnlink(lnet_handle_md_t mdh) +{ + lnet_event_t ev; + lnet_libmd_t *md; + int cpt; + + LASSERT(the_lnet.ln_init); + LASSERT(the_lnet.ln_refcount > 0); + + cpt = lnet_cpt_of_cookie(mdh.cookie); + lnet_res_lock(cpt); + + md = lnet_handle2md(&mdh); + if (md == NULL) { + lnet_res_unlock(cpt); + return -ENOENT; + } + + md->md_flags |= LNET_MD_FLAG_ABORTED; + /* If the MD is busy, lnet_md_unlink just marks it for deletion, and + * when the LND is done, the completion event flags that the MD was + * unlinked. Otherwise, we enqueue an event now... */ + if (md->md_eq != NULL && md->md_refcount == 0) { + lnet_build_unlink_event(md, &ev); + lnet_eq_enqueue_event(md->md_eq, &ev); + } + + lnet_md_unlink(md); + + lnet_res_unlock(cpt); + return 0; +} +EXPORT_SYMBOL(LNetMDUnlink); diff --git a/kernel/drivers/staging/lustre/lnet/lnet/lib-me.c b/kernel/drivers/staging/lustre/lnet/lnet/lib-me.c new file mode 100644 index 000000000..a3f929244 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/lnet/lib-me.c @@ -0,0 +1,298 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/lnet/lib-me.c + * + * Match Entry management routines + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include "../../include/linux/lnet/lib-lnet.h" + +/** + * Create and attach a match entry to the match list of \a portal. The new + * ME is empty, i.e. not associated with a memory descriptor. LNetMDAttach() + * can be used to attach a MD to an empty ME. 
+ * + * \param portal The portal table index where the ME should be attached. + * \param match_id Specifies the match criteria for the process ID of + * the requester. The constants LNET_PID_ANY and LNET_NID_ANY can be + * used to wildcard either of the identifiers in the lnet_process_id_t + * structure. + * \param match_bits,ignore_bits Specify the match criteria to apply + * to the match bits in the incoming request. The ignore bits are used + * to mask out insignificant bits in the incoming match bits. The resulting + * bits are then compared to the ME's match bits to determine if the + * incoming request meets the match criteria. + * \param unlink Indicates whether the ME should be unlinked when the memory + * descriptor associated with it is unlinked (Note that the check for + * unlinking a ME only occurs when the memory descriptor is unlinked.). + * Valid values are LNET_RETAIN and LNET_UNLINK. + * \param pos Indicates whether the new ME should be prepended or + * appended to the match list. Allowed constants: LNET_INS_BEFORE, + * LNET_INS_AFTER. + * \param handle On successful returns, a handle to the newly created ME + * object is saved here. This handle can be used later in LNetMEInsert(), + * LNetMEUnlink(), or LNetMDAttach() functions. + * + * \retval 0 On success. + * \retval -EINVAL If \a portal is invalid. + * \retval -ENOMEM If new ME object cannot be allocated. + */ +int +LNetMEAttach(unsigned int portal, + lnet_process_id_t match_id, + __u64 match_bits, __u64 ignore_bits, + lnet_unlink_t unlink, lnet_ins_pos_t pos, + lnet_handle_me_t *handle) +{ + struct lnet_match_table *mtable; + struct lnet_me *me; + struct list_head *head; + + LASSERT(the_lnet.ln_init); + LASSERT(the_lnet.ln_refcount > 0); + + if ((int)portal >= the_lnet.ln_nportals) + return -EINVAL; + + mtable = lnet_mt_of_attach(portal, match_id, + match_bits, ignore_bits, pos); + if (mtable == NULL) /* can't match portal type */ + return -EPERM; + + me = lnet_me_alloc(); + if (me == NULL) + return -ENOMEM; + + lnet_res_lock(mtable->mt_cpt); + + me->me_portal = portal; + me->me_match_id = match_id; + me->me_match_bits = match_bits; + me->me_ignore_bits = ignore_bits; + me->me_unlink = unlink; + me->me_md = NULL; + + lnet_res_lh_initialize(the_lnet.ln_me_containers[mtable->mt_cpt], + &me->me_lh); + if (ignore_bits != 0) + head = &mtable->mt_mhash[LNET_MT_HASH_IGNORE]; + else + head = lnet_mt_match_head(mtable, match_id, match_bits); + + me->me_pos = head - &mtable->mt_mhash[0]; + if (pos == LNET_INS_AFTER || pos == LNET_INS_LOCAL) + list_add_tail(&me->me_list, head); + else + list_add(&me->me_list, head); + + lnet_me2handle(handle, me); + + lnet_res_unlock(mtable->mt_cpt); + return 0; +} +EXPORT_SYMBOL(LNetMEAttach); + +/** + * Create and a match entry and insert it before or after the ME pointed to by + * \a current_meh. The new ME is empty, i.e. not associated with a memory + * descriptor. LNetMDAttach() can be used to attach a MD to an empty ME. + * + * This function is identical to LNetMEAttach() except for the position + * where the new ME is inserted. + * + * \param current_meh A handle for a ME. The new ME will be inserted + * immediately before or immediately after this ME. + * \param match_id,match_bits,ignore_bits,unlink,pos,handle See the discussion + * for LNetMEAttach(). + * + * \retval 0 On success. + * \retval -ENOMEM If new ME object cannot be allocated. + * \retval -ENOENT If \a current_meh does not point to a valid match entry. 
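+ *
+ * A minimal usage sketch, assuming \a meh0 was returned by an earlier
+ * LNetMEAttach() call on a non-unique portal; the new wildcard ME is
+ * inserted immediately after the existing one:
+ *
+ *	lnet_process_id_t any = { .nid = LNET_NID_ANY, .pid = LNET_PID_ANY };
+ *	lnet_handle_me_t meh1;
+ *	int rc;
+ *
+ *	rc = LNetMEInsert(meh0, any, 0, ~0ULL, LNET_UNLINK,
+ *			  LNET_INS_AFTER, &meh1);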
+ */ +int +LNetMEInsert(lnet_handle_me_t current_meh, + lnet_process_id_t match_id, + __u64 match_bits, __u64 ignore_bits, + lnet_unlink_t unlink, lnet_ins_pos_t pos, + lnet_handle_me_t *handle) +{ + struct lnet_me *current_me; + struct lnet_me *new_me; + struct lnet_portal *ptl; + int cpt; + + LASSERT(the_lnet.ln_init); + LASSERT(the_lnet.ln_refcount > 0); + + if (pos == LNET_INS_LOCAL) + return -EPERM; + + new_me = lnet_me_alloc(); + if (new_me == NULL) + return -ENOMEM; + + cpt = lnet_cpt_of_cookie(current_meh.cookie); + + lnet_res_lock(cpt); + + current_me = lnet_handle2me(¤t_meh); + if (current_me == NULL) { + lnet_me_free_locked(new_me); + + lnet_res_unlock(cpt); + return -ENOENT; + } + + LASSERT(current_me->me_portal < the_lnet.ln_nportals); + + ptl = the_lnet.ln_portals[current_me->me_portal]; + if (lnet_ptl_is_unique(ptl)) { + /* nosense to insertion on unique portal */ + lnet_me_free_locked(new_me); + lnet_res_unlock(cpt); + return -EPERM; + } + + new_me->me_pos = current_me->me_pos; + new_me->me_portal = current_me->me_portal; + new_me->me_match_id = match_id; + new_me->me_match_bits = match_bits; + new_me->me_ignore_bits = ignore_bits; + new_me->me_unlink = unlink; + new_me->me_md = NULL; + + lnet_res_lh_initialize(the_lnet.ln_me_containers[cpt], &new_me->me_lh); + + if (pos == LNET_INS_AFTER) + list_add(&new_me->me_list, ¤t_me->me_list); + else + list_add_tail(&new_me->me_list, ¤t_me->me_list); + + lnet_me2handle(handle, new_me); + + lnet_res_unlock(cpt); + + return 0; +} +EXPORT_SYMBOL(LNetMEInsert); + +/** + * Unlink a match entry from its match list. + * + * This operation also releases any resources associated with the ME. If a + * memory descriptor is attached to the ME, then it will be unlinked as well + * and an unlink event will be generated. It is an error to use the ME handle + * after calling LNetMEUnlink(). + * + * \param meh A handle for the ME to be unlinked. + * + * \retval 0 On success. + * \retval -ENOENT If \a meh does not point to a valid ME. + * \see LNetMDUnlink() for the discussion on delivering unlink event. 
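+ *
+ * A minimal teardown sketch, assuming \a meh is a valid ME handle; if
+ * an MD is still attached it is aborted and its unlink event is
+ * delivered as described for LNetMDUnlink():
+ *
+ *	rc = LNetMEUnlink(meh);
+ *	if (rc == -ENOENT)
+ *		CDEBUG(D_NET, "ME handle already invalid\n");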
+ */ +int +LNetMEUnlink(lnet_handle_me_t meh) +{ + lnet_me_t *me; + lnet_libmd_t *md; + lnet_event_t ev; + int cpt; + + LASSERT(the_lnet.ln_init); + LASSERT(the_lnet.ln_refcount > 0); + + cpt = lnet_cpt_of_cookie(meh.cookie); + lnet_res_lock(cpt); + + me = lnet_handle2me(&meh); + if (me == NULL) { + lnet_res_unlock(cpt); + return -ENOENT; + } + + md = me->me_md; + if (md != NULL) { + md->md_flags |= LNET_MD_FLAG_ABORTED; + if (md->md_eq != NULL && md->md_refcount == 0) { + lnet_build_unlink_event(md, &ev); + lnet_eq_enqueue_event(md->md_eq, &ev); + } + } + + lnet_me_unlink(me); + + lnet_res_unlock(cpt); + return 0; +} +EXPORT_SYMBOL(LNetMEUnlink); + +/* call with lnet_res_lock please */ +void +lnet_me_unlink(lnet_me_t *me) +{ + list_del(&me->me_list); + + if (me->me_md != NULL) { + lnet_libmd_t *md = me->me_md; + + /* detach MD from portal of this ME */ + lnet_ptl_detach_md(me, md); + lnet_md_unlink(md); + } + + lnet_res_lh_invalidate(&me->me_lh); + lnet_me_free_locked(me); +} + +#if 0 +static void +lib_me_dump(lnet_me_t *me) +{ + CWARN("Match Entry %p (%#llx)\n", me, + me->me_lh.lh_cookie); + + CWARN("\tMatch/Ignore\t= %016lx / %016lx\n", + me->me_match_bits, me->me_ignore_bits); + + CWARN("\tMD\t= %p\n", me->md); + CWARN("\tprev\t= %p\n", + list_entry(me->me_list.prev, lnet_me_t, me_list)); + CWARN("\tnext\t= %p\n", + list_entry(me->me_list.next, lnet_me_t, me_list)); +} +#endif diff --git a/kernel/drivers/staging/lustre/lnet/lnet/lib-move.c b/kernel/drivers/staging/lustre/lnet/lnet/lib-move.c new file mode 100644 index 000000000..c2fb70e5f --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/lnet/lib-move.c @@ -0,0 +1,2460 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lnet/lnet/lib-move.c + * + * Data movement routines + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include "../../include/linux/lnet/lib-lnet.h" + +static int local_nid_dist_zero = 1; +module_param(local_nid_dist_zero, int, 0444); +MODULE_PARM_DESC(local_nid_dist_zero, "Reserved"); + +int +lnet_fail_nid(lnet_nid_t nid, unsigned int threshold) +{ + lnet_test_peer_t *tp; + struct list_head *el; + struct list_head *next; + struct list_head cull; + + LASSERT(the_lnet.ln_init); + + /* NB: use lnet_net_lock(0) to serialize operations on test peers */ + if (threshold != 0) { + /* Adding a new entry */ + LIBCFS_ALLOC(tp, sizeof(*tp)); + if (tp == NULL) + return -ENOMEM; + + tp->tp_nid = nid; + tp->tp_threshold = threshold; + + lnet_net_lock(0); + list_add_tail(&tp->tp_list, &the_lnet.ln_test_peers); + lnet_net_unlock(0); + return 0; + } + + /* removing entries */ + INIT_LIST_HEAD(&cull); + + lnet_net_lock(0); + + list_for_each_safe(el, next, &the_lnet.ln_test_peers) { + tp = list_entry(el, lnet_test_peer_t, tp_list); + + if (tp->tp_threshold == 0 || /* needs culling anyway */ + nid == LNET_NID_ANY || /* removing all entries */ + tp->tp_nid == nid) { /* matched this one */ + list_del(&tp->tp_list); + list_add(&tp->tp_list, &cull); + } + } + + lnet_net_unlock(0); + + while (!list_empty(&cull)) { + tp = list_entry(cull.next, lnet_test_peer_t, tp_list); + + list_del(&tp->tp_list); + LIBCFS_FREE(tp, sizeof(*tp)); + } + return 0; +} + +static int +fail_peer(lnet_nid_t nid, int outgoing) +{ + lnet_test_peer_t *tp; + struct list_head *el; + struct list_head *next; + struct list_head cull; + int fail = 0; + + INIT_LIST_HEAD(&cull); + + /* NB: use lnet_net_lock(0) to serialize operations on test peers */ + lnet_net_lock(0); + + list_for_each_safe(el, next, &the_lnet.ln_test_peers) { + tp = list_entry(el, lnet_test_peer_t, tp_list); + + if (tp->tp_threshold == 0) { + /* zombie entry */ + if (outgoing) { + /* only cull zombies on outgoing tests, + * since we may be at interrupt priority on + * incoming messages. 
*/ + list_del(&tp->tp_list); + list_add(&tp->tp_list, &cull); + } + continue; + } + + if (tp->tp_nid == LNET_NID_ANY || /* fail every peer */ + nid == tp->tp_nid) { /* fail this peer */ + fail = 1; + + if (tp->tp_threshold != LNET_MD_THRESH_INF) { + tp->tp_threshold--; + if (outgoing && + tp->tp_threshold == 0) { + /* see above */ + list_del(&tp->tp_list); + list_add(&tp->tp_list, &cull); + } + } + break; + } + } + + lnet_net_unlock(0); + + while (!list_empty(&cull)) { + tp = list_entry(cull.next, lnet_test_peer_t, tp_list); + list_del(&tp->tp_list); + + LIBCFS_FREE(tp, sizeof(*tp)); + } + + return fail; +} + +unsigned int +lnet_iov_nob(unsigned int niov, struct kvec *iov) +{ + unsigned int nob = 0; + + while (niov-- > 0) + nob += (iov++)->iov_len; + + return nob; +} +EXPORT_SYMBOL(lnet_iov_nob); + +void +lnet_copy_iov2iov(unsigned int ndiov, struct kvec *diov, unsigned int doffset, + unsigned int nsiov, struct kvec *siov, unsigned int soffset, + unsigned int nob) +{ + /* NB diov, siov are READ-ONLY */ + unsigned int this_nob; + + if (nob == 0) + return; + + /* skip complete frags before 'doffset' */ + LASSERT(ndiov > 0); + while (doffset >= diov->iov_len) { + doffset -= diov->iov_len; + diov++; + ndiov--; + LASSERT(ndiov > 0); + } + + /* skip complete frags before 'soffset' */ + LASSERT(nsiov > 0); + while (soffset >= siov->iov_len) { + soffset -= siov->iov_len; + siov++; + nsiov--; + LASSERT(nsiov > 0); + } + + do { + LASSERT(ndiov > 0); + LASSERT(nsiov > 0); + this_nob = min(diov->iov_len - doffset, + siov->iov_len - soffset); + this_nob = min(this_nob, nob); + + memcpy((char *)diov->iov_base + doffset, + (char *)siov->iov_base + soffset, this_nob); + nob -= this_nob; + + if (diov->iov_len > doffset + this_nob) { + doffset += this_nob; + } else { + diov++; + ndiov--; + doffset = 0; + } + + if (siov->iov_len > soffset + this_nob) { + soffset += this_nob; + } else { + siov++; + nsiov--; + soffset = 0; + } + } while (nob > 0); +} +EXPORT_SYMBOL(lnet_copy_iov2iov); + +int +lnet_extract_iov(int dst_niov, struct kvec *dst, + int src_niov, struct kvec *src, + unsigned int offset, unsigned int len) +{ + /* Initialise 'dst' to the subset of 'src' starting at 'offset', + * for exactly 'len' bytes, and return the number of entries. 
+ * NB not destructive to 'src' */ + unsigned int frag_len; + unsigned int niov; + + if (len == 0) /* no data => */ + return 0; /* no frags */ + + LASSERT(src_niov > 0); + while (offset >= src->iov_len) { /* skip initial frags */ + offset -= src->iov_len; + src_niov--; + src++; + LASSERT(src_niov > 0); + } + + niov = 1; + for (;;) { + LASSERT(src_niov > 0); + LASSERT((int)niov <= dst_niov); + + frag_len = src->iov_len - offset; + dst->iov_base = ((char *)src->iov_base) + offset; + + if (len <= frag_len) { + dst->iov_len = len; + return niov; + } + + dst->iov_len = frag_len; + + len -= frag_len; + dst++; + src++; + niov++; + src_niov--; + offset = 0; + } +} +EXPORT_SYMBOL(lnet_extract_iov); + + +unsigned int +lnet_kiov_nob(unsigned int niov, lnet_kiov_t *kiov) +{ + unsigned int nob = 0; + + while (niov-- > 0) + nob += (kiov++)->kiov_len; + + return nob; +} +EXPORT_SYMBOL(lnet_kiov_nob); + +void +lnet_copy_kiov2kiov(unsigned int ndiov, lnet_kiov_t *diov, unsigned int doffset, + unsigned int nsiov, lnet_kiov_t *siov, unsigned int soffset, + unsigned int nob) +{ + /* NB diov, siov are READ-ONLY */ + unsigned int this_nob; + char *daddr = NULL; + char *saddr = NULL; + + if (nob == 0) + return; + + LASSERT(!in_interrupt()); + + LASSERT(ndiov > 0); + while (doffset >= diov->kiov_len) { + doffset -= diov->kiov_len; + diov++; + ndiov--; + LASSERT(ndiov > 0); + } + + LASSERT(nsiov > 0); + while (soffset >= siov->kiov_len) { + soffset -= siov->kiov_len; + siov++; + nsiov--; + LASSERT(nsiov > 0); + } + + do { + LASSERT(ndiov > 0); + LASSERT(nsiov > 0); + this_nob = min(diov->kiov_len - doffset, + siov->kiov_len - soffset); + this_nob = min(this_nob, nob); + + if (daddr == NULL) + daddr = ((char *)kmap(diov->kiov_page)) + + diov->kiov_offset + doffset; + if (saddr == NULL) + saddr = ((char *)kmap(siov->kiov_page)) + + siov->kiov_offset + soffset; + + /* Vanishing risk of kmap deadlock when mapping 2 pages. 
+ * However in practice at least one of the kiovs will be mapped + * kernel pages and the map/unmap will be NOOPs */ + + memcpy(daddr, saddr, this_nob); + nob -= this_nob; + + if (diov->kiov_len > doffset + this_nob) { + daddr += this_nob; + doffset += this_nob; + } else { + kunmap(diov->kiov_page); + daddr = NULL; + diov++; + ndiov--; + doffset = 0; + } + + if (siov->kiov_len > soffset + this_nob) { + saddr += this_nob; + soffset += this_nob; + } else { + kunmap(siov->kiov_page); + saddr = NULL; + siov++; + nsiov--; + soffset = 0; + } + } while (nob > 0); + + if (daddr != NULL) + kunmap(diov->kiov_page); + if (saddr != NULL) + kunmap(siov->kiov_page); +} +EXPORT_SYMBOL(lnet_copy_kiov2kiov); + +void +lnet_copy_kiov2iov(unsigned int niov, struct kvec *iov, unsigned int iovoffset, + unsigned int nkiov, lnet_kiov_t *kiov, + unsigned int kiovoffset, unsigned int nob) +{ + /* NB iov, kiov are READ-ONLY */ + unsigned int this_nob; + char *addr = NULL; + + if (nob == 0) + return; + + LASSERT(!in_interrupt()); + + LASSERT(niov > 0); + while (iovoffset >= iov->iov_len) { + iovoffset -= iov->iov_len; + iov++; + niov--; + LASSERT(niov > 0); + } + + LASSERT(nkiov > 0); + while (kiovoffset >= kiov->kiov_len) { + kiovoffset -= kiov->kiov_len; + kiov++; + nkiov--; + LASSERT(nkiov > 0); + } + + do { + LASSERT(niov > 0); + LASSERT(nkiov > 0); + this_nob = min(iov->iov_len - iovoffset, + (__kernel_size_t) kiov->kiov_len - kiovoffset); + this_nob = min(this_nob, nob); + + if (addr == NULL) + addr = ((char *)kmap(kiov->kiov_page)) + + kiov->kiov_offset + kiovoffset; + + memcpy((char *)iov->iov_base + iovoffset, addr, this_nob); + nob -= this_nob; + + if (iov->iov_len > iovoffset + this_nob) { + iovoffset += this_nob; + } else { + iov++; + niov--; + iovoffset = 0; + } + + if (kiov->kiov_len > kiovoffset + this_nob) { + addr += this_nob; + kiovoffset += this_nob; + } else { + kunmap(kiov->kiov_page); + addr = NULL; + kiov++; + nkiov--; + kiovoffset = 0; + } + + } while (nob > 0); + + if (addr != NULL) + kunmap(kiov->kiov_page); +} +EXPORT_SYMBOL(lnet_copy_kiov2iov); + +void +lnet_copy_iov2kiov(unsigned int nkiov, lnet_kiov_t *kiov, + unsigned int kiovoffset, unsigned int niov, + struct kvec *iov, unsigned int iovoffset, + unsigned int nob) +{ + /* NB kiov, iov are READ-ONLY */ + unsigned int this_nob; + char *addr = NULL; + + if (nob == 0) + return; + + LASSERT(!in_interrupt()); + + LASSERT(nkiov > 0); + while (kiovoffset >= kiov->kiov_len) { + kiovoffset -= kiov->kiov_len; + kiov++; + nkiov--; + LASSERT(nkiov > 0); + } + + LASSERT(niov > 0); + while (iovoffset >= iov->iov_len) { + iovoffset -= iov->iov_len; + iov++; + niov--; + LASSERT(niov > 0); + } + + do { + LASSERT(nkiov > 0); + LASSERT(niov > 0); + this_nob = min((__kernel_size_t) kiov->kiov_len - kiovoffset, + iov->iov_len - iovoffset); + this_nob = min(this_nob, nob); + + if (addr == NULL) + addr = ((char *)kmap(kiov->kiov_page)) + + kiov->kiov_offset + kiovoffset; + + memcpy(addr, (char *)iov->iov_base + iovoffset, this_nob); + nob -= this_nob; + + if (kiov->kiov_len > kiovoffset + this_nob) { + addr += this_nob; + kiovoffset += this_nob; + } else { + kunmap(kiov->kiov_page); + addr = NULL; + kiov++; + nkiov--; + kiovoffset = 0; + } + + if (iov->iov_len > iovoffset + this_nob) { + iovoffset += this_nob; + } else { + iov++; + niov--; + iovoffset = 0; + } + } while (nob > 0); + + if (addr != NULL) + kunmap(kiov->kiov_page); +} +EXPORT_SYMBOL(lnet_copy_iov2kiov); + +int +lnet_extract_kiov(int dst_niov, lnet_kiov_t *dst, + int src_niov, lnet_kiov_t *src, + 
unsigned int offset, unsigned int len) +{ + /* Initialise 'dst' to the subset of 'src' starting at 'offset', + * for exactly 'len' bytes, and return the number of entries. + * NB not destructive to 'src' */ + unsigned int frag_len; + unsigned int niov; + + if (len == 0) /* no data => */ + return 0; /* no frags */ + + LASSERT(src_niov > 0); + while (offset >= src->kiov_len) { /* skip initial frags */ + offset -= src->kiov_len; + src_niov--; + src++; + LASSERT(src_niov > 0); + } + + niov = 1; + for (;;) { + LASSERT(src_niov > 0); + LASSERT((int)niov <= dst_niov); + + frag_len = src->kiov_len - offset; + dst->kiov_page = src->kiov_page; + dst->kiov_offset = src->kiov_offset + offset; + + if (len <= frag_len) { + dst->kiov_len = len; + LASSERT(dst->kiov_offset + dst->kiov_len + <= PAGE_CACHE_SIZE); + return niov; + } + + dst->kiov_len = frag_len; + LASSERT(dst->kiov_offset + dst->kiov_len <= PAGE_CACHE_SIZE); + + len -= frag_len; + dst++; + src++; + niov++; + src_niov--; + offset = 0; + } +} +EXPORT_SYMBOL(lnet_extract_kiov); + +static void +lnet_ni_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed, + unsigned int offset, unsigned int mlen, unsigned int rlen) +{ + unsigned int niov = 0; + struct kvec *iov = NULL; + lnet_kiov_t *kiov = NULL; + int rc; + + LASSERT(!in_interrupt()); + LASSERT(mlen == 0 || msg != NULL); + + if (msg != NULL) { + LASSERT(msg->msg_receiving); + LASSERT(!msg->msg_sending); + LASSERT(rlen == msg->msg_len); + LASSERT(mlen <= msg->msg_len); + LASSERT(msg->msg_offset == offset); + LASSERT(msg->msg_wanted == mlen); + + msg->msg_receiving = 0; + + if (mlen != 0) { + niov = msg->msg_niov; + iov = msg->msg_iov; + kiov = msg->msg_kiov; + + LASSERT(niov > 0); + LASSERT((iov == NULL) != (kiov == NULL)); + } + } + + rc = (ni->ni_lnd->lnd_recv)(ni, private, msg, delayed, + niov, iov, kiov, offset, mlen, rlen); + if (rc < 0) + lnet_finalize(ni, msg, rc); +} + +static void +lnet_setpayloadbuffer(lnet_msg_t *msg) +{ + lnet_libmd_t *md = msg->msg_md; + + LASSERT(msg->msg_len > 0); + LASSERT(!msg->msg_routing); + LASSERT(md != NULL); + LASSERT(msg->msg_niov == 0); + LASSERT(msg->msg_iov == NULL); + LASSERT(msg->msg_kiov == NULL); + + msg->msg_niov = md->md_niov; + if ((md->md_options & LNET_MD_KIOV) != 0) + msg->msg_kiov = md->md_iov.kiov; + else + msg->msg_iov = md->md_iov.iov; +} + +void +lnet_prep_send(lnet_msg_t *msg, int type, lnet_process_id_t target, + unsigned int offset, unsigned int len) +{ + msg->msg_type = type; + msg->msg_target = target; + msg->msg_len = len; + msg->msg_offset = offset; + + if (len != 0) + lnet_setpayloadbuffer(msg); + + memset(&msg->msg_hdr, 0, sizeof(msg->msg_hdr)); + msg->msg_hdr.type = cpu_to_le32(type); + msg->msg_hdr.dest_nid = cpu_to_le64(target.nid); + msg->msg_hdr.dest_pid = cpu_to_le32(target.pid); + /* src_nid will be set later */ + msg->msg_hdr.src_pid = cpu_to_le32(the_lnet.ln_pid); + msg->msg_hdr.payload_length = cpu_to_le32(len); +} + +static void +lnet_ni_send(lnet_ni_t *ni, lnet_msg_t *msg) +{ + void *priv = msg->msg_private; + int rc; + + LASSERT(!in_interrupt()); + LASSERT(LNET_NETTYP(LNET_NIDNET(ni->ni_nid)) == LOLND || + (msg->msg_txcredit && msg->msg_peertxcredit)); + + rc = (ni->ni_lnd->lnd_send)(ni, priv, msg); + if (rc < 0) + lnet_finalize(ni, msg, rc); +} + +static int +lnet_ni_eager_recv(lnet_ni_t *ni, lnet_msg_t *msg) +{ + int rc; + + LASSERT(!msg->msg_sending); + LASSERT(msg->msg_receiving); + LASSERT(!msg->msg_rx_ready_delay); + LASSERT(ni->ni_lnd->lnd_eager_recv != NULL); + + msg->msg_rx_ready_delay = 1; + rc = 
(ni->ni_lnd->lnd_eager_recv)(ni, msg->msg_private, msg, + &msg->msg_private); + if (rc != 0) { + CERROR("recv from %s / send to %s aborted: eager_recv failed %d\n", + libcfs_nid2str(msg->msg_rxpeer->lp_nid), + libcfs_id2str(msg->msg_target), rc); + LASSERT(rc < 0); /* required by my callers */ + } + + return rc; +} + +/* NB: caller shall hold a ref on 'lp' as I'd drop lnet_net_lock */ +static void +lnet_ni_query_locked(lnet_ni_t *ni, lnet_peer_t *lp) +{ + unsigned long last_alive = 0; + + LASSERT(lnet_peer_aliveness_enabled(lp)); + LASSERT(ni->ni_lnd->lnd_query != NULL); + + lnet_net_unlock(lp->lp_cpt); + (ni->ni_lnd->lnd_query)(ni, lp->lp_nid, &last_alive); + lnet_net_lock(lp->lp_cpt); + + lp->lp_last_query = cfs_time_current(); + + if (last_alive != 0) /* NI has updated timestamp */ + lp->lp_last_alive = last_alive; +} + +/* NB: always called with lnet_net_lock held */ +static inline int +lnet_peer_is_alive(lnet_peer_t *lp, unsigned long now) +{ + int alive; + unsigned long deadline; + + LASSERT(lnet_peer_aliveness_enabled(lp)); + + /* Trust lnet_notify() if it has more recent aliveness news, but + * ignore the initial assumed death (see lnet_peers_start_down()). + */ + if (!lp->lp_alive && lp->lp_alive_count > 0 && + cfs_time_aftereq(lp->lp_timestamp, lp->lp_last_alive)) + return 0; + + deadline = cfs_time_add(lp->lp_last_alive, + cfs_time_seconds(lp->lp_ni->ni_peertimeout)); + alive = cfs_time_after(deadline, now); + + /* Update obsolete lp_alive except for routers assumed to be dead + * initially, because router checker would update aliveness in this + * case, and moreover lp_last_alive at peer creation is assumed. + */ + if (alive && !lp->lp_alive && + !(lnet_isrouter(lp) && lp->lp_alive_count == 0)) + lnet_notify_locked(lp, 0, 1, lp->lp_last_alive); + + return alive; +} + + +/* NB: returns 1 when alive, 0 when dead, negative when error; + * may drop the lnet_net_lock */ +static int +lnet_peer_alive_locked(lnet_peer_t *lp) +{ + unsigned long now = cfs_time_current(); + + if (!lnet_peer_aliveness_enabled(lp)) + return -ENODEV; + + if (lnet_peer_is_alive(lp, now)) + return 1; + + /* Peer appears dead, but we should avoid frequent NI queries (at + * most once per lnet_queryinterval seconds). */ + if (lp->lp_last_query != 0) { + static const int lnet_queryinterval = 1; + + unsigned long next_query = + cfs_time_add(lp->lp_last_query, + cfs_time_seconds(lnet_queryinterval)); + + if (time_before(now, next_query)) { + if (lp->lp_alive) + CWARN("Unexpected aliveness of peer %s: %d < %d (%d/%d)\n", + libcfs_nid2str(lp->lp_nid), + (int)now, (int)next_query, + lnet_queryinterval, + lp->lp_ni->ni_peertimeout); + return 0; + } + } + + /* query NI for latest aliveness news */ + lnet_ni_query_locked(lp->lp_ni, lp); + + if (lnet_peer_is_alive(lp, now)) + return 1; + + lnet_notify_locked(lp, 0, 0, lp->lp_last_alive); + return 0; +} + +/** + * \param msg The message to be sent. + * \param do_send True if lnet_ni_send() should be called in this function. + * lnet_send() is going to lnet_net_unlock immediately after this, so + * it sets do_send FALSE and I don't do the unlock/send/lock bit. + * + * \retval 0 If \a msg sent or OK to send. + * \retval EAGAIN If \a msg blocked for credit. + * \retval EHOSTUNREACH If the next hop of the message appears dead. + * \retval ECANCELED If the MD of the message has been unlinked. 
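+ *
+ * Caller-side sketch of how the positive return codes are consumed
+ * (this mirrors lnet_send() later in this file); EAGAIN means the
+ * message was queued waiting for credits and needs no further action:
+ *
+ *	rc = lnet_post_send_locked(msg, 0);
+ *	if (rc == EHOSTUNREACH || rc == ECANCELED)
+ *		return -rc;
+ *	if (rc == 0)
+ *		lnet_ni_send(src_ni, msg);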
+ */ +static int +lnet_post_send_locked(lnet_msg_t *msg, int do_send) +{ + lnet_peer_t *lp = msg->msg_txpeer; + lnet_ni_t *ni = lp->lp_ni; + int cpt = msg->msg_tx_cpt; + struct lnet_tx_queue *tq = ni->ni_tx_queues[cpt]; + + /* non-lnet_send() callers have checked before */ + LASSERT(!do_send || msg->msg_tx_delayed); + LASSERT(!msg->msg_receiving); + LASSERT(msg->msg_tx_committed); + + /* NB 'lp' is always the next hop */ + if ((msg->msg_target.pid & LNET_PID_USERFLAG) == 0 && + lnet_peer_alive_locked(lp) == 0) { + the_lnet.ln_counters[cpt]->drop_count++; + the_lnet.ln_counters[cpt]->drop_length += msg->msg_len; + lnet_net_unlock(cpt); + + CNETERR("Dropping message for %s: peer not alive\n", + libcfs_id2str(msg->msg_target)); + if (do_send) + lnet_finalize(ni, msg, -EHOSTUNREACH); + + lnet_net_lock(cpt); + return EHOSTUNREACH; + } + + if (msg->msg_md != NULL && + (msg->msg_md->md_flags & LNET_MD_FLAG_ABORTED) != 0) { + lnet_net_unlock(cpt); + + CNETERR("Aborting message for %s: LNetM[DE]Unlink() already called on the MD/ME.\n", + libcfs_id2str(msg->msg_target)); + if (do_send) + lnet_finalize(ni, msg, -ECANCELED); + + lnet_net_lock(cpt); + return ECANCELED; + } + + if (!msg->msg_peertxcredit) { + LASSERT((lp->lp_txcredits < 0) == + !list_empty(&lp->lp_txq)); + + msg->msg_peertxcredit = 1; + lp->lp_txqnob += msg->msg_len + sizeof(lnet_hdr_t); + lp->lp_txcredits--; + + if (lp->lp_txcredits < lp->lp_mintxcredits) + lp->lp_mintxcredits = lp->lp_txcredits; + + if (lp->lp_txcredits < 0) { + msg->msg_tx_delayed = 1; + list_add_tail(&msg->msg_list, &lp->lp_txq); + return EAGAIN; + } + } + + if (!msg->msg_txcredit) { + LASSERT((tq->tq_credits < 0) == + !list_empty(&tq->tq_delayed)); + + msg->msg_txcredit = 1; + tq->tq_credits--; + + if (tq->tq_credits < tq->tq_credits_min) + tq->tq_credits_min = tq->tq_credits; + + if (tq->tq_credits < 0) { + msg->msg_tx_delayed = 1; + list_add_tail(&msg->msg_list, &tq->tq_delayed); + return EAGAIN; + } + } + + if (do_send) { + lnet_net_unlock(cpt); + lnet_ni_send(ni, msg); + lnet_net_lock(cpt); + } + return 0; +} + + +static lnet_rtrbufpool_t * +lnet_msg2bufpool(lnet_msg_t *msg) +{ + lnet_rtrbufpool_t *rbp; + int cpt; + + LASSERT(msg->msg_rx_committed); + + cpt = msg->msg_rx_cpt; + rbp = &the_lnet.ln_rtrpools[cpt][0]; + + LASSERT(msg->msg_len <= LNET_MTU); + while (msg->msg_len > (unsigned int)rbp->rbp_npages * PAGE_CACHE_SIZE) { + rbp++; + LASSERT(rbp < &the_lnet.ln_rtrpools[cpt][LNET_NRBPOOLS]); + } + + return rbp; +} + +static int +lnet_post_routed_recv_locked(lnet_msg_t *msg, int do_recv) +{ + /* lnet_parse is going to lnet_net_unlock immediately after this, so it + * sets do_recv FALSE and I don't do the unlock/send/lock bit. 
I + * return EAGAIN if msg blocked and 0 if received or OK to receive */ + lnet_peer_t *lp = msg->msg_rxpeer; + lnet_rtrbufpool_t *rbp; + lnet_rtrbuf_t *rb; + + LASSERT(msg->msg_iov == NULL); + LASSERT(msg->msg_kiov == NULL); + LASSERT(msg->msg_niov == 0); + LASSERT(msg->msg_routing); + LASSERT(msg->msg_receiving); + LASSERT(!msg->msg_sending); + + /* non-lnet_parse callers only receive delayed messages */ + LASSERT(!do_recv || msg->msg_rx_delayed); + + if (!msg->msg_peerrtrcredit) { + LASSERT((lp->lp_rtrcredits < 0) == + !list_empty(&lp->lp_rtrq)); + + msg->msg_peerrtrcredit = 1; + lp->lp_rtrcredits--; + if (lp->lp_rtrcredits < lp->lp_minrtrcredits) + lp->lp_minrtrcredits = lp->lp_rtrcredits; + + if (lp->lp_rtrcredits < 0) { + /* must have checked eager_recv before here */ + LASSERT(msg->msg_rx_ready_delay); + msg->msg_rx_delayed = 1; + list_add_tail(&msg->msg_list, &lp->lp_rtrq); + return EAGAIN; + } + } + + rbp = lnet_msg2bufpool(msg); + + if (!msg->msg_rtrcredit) { + LASSERT((rbp->rbp_credits < 0) == + !list_empty(&rbp->rbp_msgs)); + + msg->msg_rtrcredit = 1; + rbp->rbp_credits--; + if (rbp->rbp_credits < rbp->rbp_mincredits) + rbp->rbp_mincredits = rbp->rbp_credits; + + if (rbp->rbp_credits < 0) { + /* must have checked eager_recv before here */ + LASSERT(msg->msg_rx_ready_delay); + msg->msg_rx_delayed = 1; + list_add_tail(&msg->msg_list, &rbp->rbp_msgs); + return EAGAIN; + } + } + + LASSERT(!list_empty(&rbp->rbp_bufs)); + rb = list_entry(rbp->rbp_bufs.next, lnet_rtrbuf_t, rb_list); + list_del(&rb->rb_list); + + msg->msg_niov = rbp->rbp_npages; + msg->msg_kiov = &rb->rb_kiov[0]; + + if (do_recv) { + int cpt = msg->msg_rx_cpt; + + lnet_net_unlock(cpt); + lnet_ni_recv(lp->lp_ni, msg->msg_private, msg, 1, + 0, msg->msg_len, msg->msg_len); + lnet_net_lock(cpt); + } + return 0; +} + +void +lnet_return_tx_credits_locked(lnet_msg_t *msg) +{ + lnet_peer_t *txpeer = msg->msg_txpeer; + lnet_msg_t *msg2; + + if (msg->msg_txcredit) { + struct lnet_ni *ni = txpeer->lp_ni; + struct lnet_tx_queue *tq = ni->ni_tx_queues[msg->msg_tx_cpt]; + + /* give back NI txcredits */ + msg->msg_txcredit = 0; + + LASSERT((tq->tq_credits < 0) == + !list_empty(&tq->tq_delayed)); + + tq->tq_credits++; + if (tq->tq_credits <= 0) { + msg2 = list_entry(tq->tq_delayed.next, + lnet_msg_t, msg_list); + list_del(&msg2->msg_list); + + LASSERT(msg2->msg_txpeer->lp_ni == ni); + LASSERT(msg2->msg_tx_delayed); + + (void) lnet_post_send_locked(msg2, 1); + } + } + + if (msg->msg_peertxcredit) { + /* give back peer txcredits */ + msg->msg_peertxcredit = 0; + + LASSERT((txpeer->lp_txcredits < 0) == + !list_empty(&txpeer->lp_txq)); + + txpeer->lp_txqnob -= msg->msg_len + sizeof(lnet_hdr_t); + LASSERT(txpeer->lp_txqnob >= 0); + + txpeer->lp_txcredits++; + if (txpeer->lp_txcredits <= 0) { + msg2 = list_entry(txpeer->lp_txq.next, + lnet_msg_t, msg_list); + list_del(&msg2->msg_list); + + LASSERT(msg2->msg_txpeer == txpeer); + LASSERT(msg2->msg_tx_delayed); + + (void) lnet_post_send_locked(msg2, 1); + } + } + + if (txpeer != NULL) { + msg->msg_txpeer = NULL; + lnet_peer_decref_locked(txpeer); + } +} + +void +lnet_return_rx_credits_locked(lnet_msg_t *msg) +{ + lnet_peer_t *rxpeer = msg->msg_rxpeer; + lnet_msg_t *msg2; + + if (msg->msg_rtrcredit) { + /* give back global router credits */ + lnet_rtrbuf_t *rb; + lnet_rtrbufpool_t *rbp; + + /* NB If a msg ever blocks for a buffer in rbp_msgs, it stays + * there until it gets one allocated, or aborts the wait + * itself */ + LASSERT(msg->msg_kiov != NULL); + + rb = list_entry(msg->msg_kiov, 
lnet_rtrbuf_t, rb_kiov[0]); + rbp = rb->rb_pool; + LASSERT(rbp == lnet_msg2bufpool(msg)); + + msg->msg_kiov = NULL; + msg->msg_rtrcredit = 0; + + LASSERT((rbp->rbp_credits < 0) == + !list_empty(&rbp->rbp_msgs)); + LASSERT((rbp->rbp_credits > 0) == + !list_empty(&rbp->rbp_bufs)); + + list_add(&rb->rb_list, &rbp->rbp_bufs); + rbp->rbp_credits++; + if (rbp->rbp_credits <= 0) { + msg2 = list_entry(rbp->rbp_msgs.next, + lnet_msg_t, msg_list); + list_del(&msg2->msg_list); + + (void) lnet_post_routed_recv_locked(msg2, 1); + } + } + + if (msg->msg_peerrtrcredit) { + /* give back peer router credits */ + msg->msg_peerrtrcredit = 0; + + LASSERT((rxpeer->lp_rtrcredits < 0) == + !list_empty(&rxpeer->lp_rtrq)); + + rxpeer->lp_rtrcredits++; + if (rxpeer->lp_rtrcredits <= 0) { + msg2 = list_entry(rxpeer->lp_rtrq.next, + lnet_msg_t, msg_list); + list_del(&msg2->msg_list); + + (void) lnet_post_routed_recv_locked(msg2, 1); + } + } + if (rxpeer != NULL) { + msg->msg_rxpeer = NULL; + lnet_peer_decref_locked(rxpeer); + } +} + +static int +lnet_compare_routes(lnet_route_t *r1, lnet_route_t *r2) +{ + lnet_peer_t *p1 = r1->lr_gateway; + lnet_peer_t *p2 = r2->lr_gateway; + + if (r1->lr_priority < r2->lr_priority) + return 1; + + if (r1->lr_priority > r2->lr_priority) + return -1; + + if (r1->lr_hops < r2->lr_hops) + return 1; + + if (r1->lr_hops > r2->lr_hops) + return -1; + + if (p1->lp_txqnob < p2->lp_txqnob) + return 1; + + if (p1->lp_txqnob > p2->lp_txqnob) + return -1; + + if (p1->lp_txcredits > p2->lp_txcredits) + return 1; + + if (p1->lp_txcredits < p2->lp_txcredits) + return -1; + + if (r1->lr_seq - r2->lr_seq <= 0) + return 1; + + return -1; +} + +static lnet_peer_t * +lnet_find_route_locked(lnet_ni_t *ni, lnet_nid_t target, lnet_nid_t rtr_nid) +{ + lnet_remotenet_t *rnet; + lnet_route_t *rtr; + lnet_route_t *rtr_best; + lnet_route_t *rtr_last; + struct lnet_peer *lp_best; + struct lnet_peer *lp; + int rc; + + /* If @rtr_nid is not LNET_NID_ANY, return the gateway with + * rtr_nid nid, otherwise find the best gateway I can use */ + + rnet = lnet_find_net_locked(LNET_NIDNET(target)); + if (rnet == NULL) + return NULL; + + lp_best = NULL; + rtr_best = rtr_last = NULL; + list_for_each_entry(rtr, &rnet->lrn_routes, lr_list) { + lp = rtr->lr_gateway; + + if (!lp->lp_alive || /* gateway is down */ + ((lp->lp_ping_feats & LNET_PING_FEAT_NI_STATUS) != 0 && + rtr->lr_downis != 0)) /* NI to target is down */ + continue; + + if (ni != NULL && lp->lp_ni != ni) + continue; + + if (lp->lp_nid == rtr_nid) /* it's pre-determined router */ + return lp; + + if (lp_best == NULL) { + rtr_best = rtr_last = rtr; + lp_best = lp; + continue; + } + + /* no protection on below fields, but it's harmless */ + if (rtr_last->lr_seq - rtr->lr_seq < 0) + rtr_last = rtr; + + rc = lnet_compare_routes(rtr, rtr_best); + if (rc < 0) + continue; + + rtr_best = rtr; + lp_best = lp; + } + + /* set sequence number on the best router to the latest sequence + 1 + * so we can round-robin all routers, it's race and inaccurate but + * harmless and functional */ + if (rtr_best != NULL) + rtr_best->lr_seq = rtr_last->lr_seq + 1; + return lp_best; +} + +int +lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg, lnet_nid_t rtr_nid) +{ + lnet_nid_t dst_nid = msg->msg_target.nid; + struct lnet_ni *src_ni; + struct lnet_ni *local_ni; + struct lnet_peer *lp; + int cpt; + int cpt2; + int rc; + + /* NB: rtr_nid is set to LNET_NID_ANY for all current use-cases, + * but we might want to use pre-determined router for ACK/REPLY + * in the future */ + /* NB: ni != NULL == 
interface pre-determined (ACK/REPLY) */ + LASSERT(msg->msg_txpeer == NULL); + LASSERT(!msg->msg_sending); + LASSERT(!msg->msg_target_is_router); + LASSERT(!msg->msg_receiving); + + msg->msg_sending = 1; + + LASSERT(!msg->msg_tx_committed); + cpt = lnet_cpt_of_nid(rtr_nid == LNET_NID_ANY ? dst_nid : rtr_nid); + again: + lnet_net_lock(cpt); + + if (the_lnet.ln_shutdown) { + lnet_net_unlock(cpt); + return -ESHUTDOWN; + } + + if (src_nid == LNET_NID_ANY) { + src_ni = NULL; + } else { + src_ni = lnet_nid2ni_locked(src_nid, cpt); + if (src_ni == NULL) { + lnet_net_unlock(cpt); + LCONSOLE_WARN("Can't send to %s: src %s is not a local nid\n", + libcfs_nid2str(dst_nid), + libcfs_nid2str(src_nid)); + return -EINVAL; + } + LASSERT(!msg->msg_routing); + } + + /* Is this for someone on a local network? */ + local_ni = lnet_net2ni_locked(LNET_NIDNET(dst_nid), cpt); + + if (local_ni != NULL) { + if (src_ni == NULL) { + src_ni = local_ni; + src_nid = src_ni->ni_nid; + } else if (src_ni == local_ni) { + lnet_ni_decref_locked(local_ni, cpt); + } else { + lnet_ni_decref_locked(local_ni, cpt); + lnet_ni_decref_locked(src_ni, cpt); + lnet_net_unlock(cpt); + LCONSOLE_WARN("No route to %s via from %s\n", + libcfs_nid2str(dst_nid), + libcfs_nid2str(src_nid)); + return -EINVAL; + } + + LASSERT(src_nid != LNET_NID_ANY); + lnet_msg_commit(msg, cpt); + + if (!msg->msg_routing) + msg->msg_hdr.src_nid = cpu_to_le64(src_nid); + + if (src_ni == the_lnet.ln_loni) { + /* No send credit hassles with LOLND */ + lnet_net_unlock(cpt); + lnet_ni_send(src_ni, msg); + + lnet_net_lock(cpt); + lnet_ni_decref_locked(src_ni, cpt); + lnet_net_unlock(cpt); + return 0; + } + + rc = lnet_nid2peer_locked(&lp, dst_nid, cpt); + /* lp has ref on src_ni; lose mine */ + lnet_ni_decref_locked(src_ni, cpt); + if (rc != 0) { + lnet_net_unlock(cpt); + LCONSOLE_WARN("Error %d finding peer %s\n", rc, + libcfs_nid2str(dst_nid)); + /* ENOMEM or shutting down */ + return rc; + } + LASSERT(lp->lp_ni == src_ni); + } else { + /* sending to a remote network */ + lp = lnet_find_route_locked(src_ni, dst_nid, rtr_nid); + if (lp == NULL) { + if (src_ni != NULL) + lnet_ni_decref_locked(src_ni, cpt); + lnet_net_unlock(cpt); + + LCONSOLE_WARN("No route to %s via %s (all routers down)\n", + libcfs_id2str(msg->msg_target), + libcfs_nid2str(src_nid)); + return -EHOSTUNREACH; + } + + /* rtr_nid is LNET_NID_ANY or NID of pre-determined router, + * it's possible that rtr_nid isn't LNET_NID_ANY and lp isn't + * pre-determined router, this can happen if router table + * was changed when we release the lock */ + if (rtr_nid != lp->lp_nid) { + cpt2 = lnet_cpt_of_nid_locked(lp->lp_nid); + if (cpt2 != cpt) { + if (src_ni != NULL) + lnet_ni_decref_locked(src_ni, cpt); + lnet_net_unlock(cpt); + + rtr_nid = lp->lp_nid; + cpt = cpt2; + goto again; + } + } + + CDEBUG(D_NET, "Best route to %s via %s for %s %d\n", + libcfs_nid2str(dst_nid), libcfs_nid2str(lp->lp_nid), + lnet_msgtyp2str(msg->msg_type), msg->msg_len); + + if (src_ni == NULL) { + src_ni = lp->lp_ni; + src_nid = src_ni->ni_nid; + } else { + LASSERT(src_ni == lp->lp_ni); + lnet_ni_decref_locked(src_ni, cpt); + } + + lnet_peer_addref_locked(lp); + + LASSERT(src_nid != LNET_NID_ANY); + lnet_msg_commit(msg, cpt); + + if (!msg->msg_routing) { + /* I'm the source and now I know which NI to send on */ + msg->msg_hdr.src_nid = cpu_to_le64(src_nid); + } + + msg->msg_target_is_router = 1; + msg->msg_target.nid = lp->lp_nid; + msg->msg_target.pid = LUSTRE_SRV_LNET_PID; + } + + /* 'lp' is our best choice of peer */ + + 
LASSERT(!msg->msg_peertxcredit); + LASSERT(!msg->msg_txcredit); + LASSERT(msg->msg_txpeer == NULL); + + msg->msg_txpeer = lp; /* msg takes my ref on lp */ + + rc = lnet_post_send_locked(msg, 0); + lnet_net_unlock(cpt); + + if (rc == EHOSTUNREACH || rc == ECANCELED) + return -rc; + + if (rc == 0) + lnet_ni_send(src_ni, msg); + + return 0; /* rc == 0 or EAGAIN */ +} + +static void +lnet_drop_message(lnet_ni_t *ni, int cpt, void *private, unsigned int nob) +{ + lnet_net_lock(cpt); + the_lnet.ln_counters[cpt]->drop_count++; + the_lnet.ln_counters[cpt]->drop_length += nob; + lnet_net_unlock(cpt); + + lnet_ni_recv(ni, private, NULL, 0, 0, 0, nob); +} + +static void +lnet_recv_put(lnet_ni_t *ni, lnet_msg_t *msg) +{ + lnet_hdr_t *hdr = &msg->msg_hdr; + + if (msg->msg_wanted != 0) + lnet_setpayloadbuffer(msg); + + lnet_build_msg_event(msg, LNET_EVENT_PUT); + + /* Must I ACK? If so I'll grab the ack_wmd out of the header and put + * it back into the ACK during lnet_finalize() */ + msg->msg_ack = (!lnet_is_wire_handle_none(&hdr->msg.put.ack_wmd) && + (msg->msg_md->md_options & LNET_MD_ACK_DISABLE) == 0); + + lnet_ni_recv(ni, msg->msg_private, msg, msg->msg_rx_delayed, + msg->msg_offset, msg->msg_wanted, hdr->payload_length); +} + +static int +lnet_parse_put(lnet_ni_t *ni, lnet_msg_t *msg) +{ + lnet_hdr_t *hdr = &msg->msg_hdr; + struct lnet_match_info info; + int rc; + + /* Convert put fields to host byte order */ + hdr->msg.put.match_bits = le64_to_cpu(hdr->msg.put.match_bits); + hdr->msg.put.ptl_index = le32_to_cpu(hdr->msg.put.ptl_index); + hdr->msg.put.offset = le32_to_cpu(hdr->msg.put.offset); + + info.mi_id.nid = hdr->src_nid; + info.mi_id.pid = hdr->src_pid; + info.mi_opc = LNET_MD_OP_PUT; + info.mi_portal = hdr->msg.put.ptl_index; + info.mi_rlength = hdr->payload_length; + info.mi_roffset = hdr->msg.put.offset; + info.mi_mbits = hdr->msg.put.match_bits; + + msg->msg_rx_ready_delay = ni->ni_lnd->lnd_eager_recv == NULL; + + again: + rc = lnet_ptl_match_md(&info, msg); + switch (rc) { + default: + LBUG(); + + case LNET_MATCHMD_OK: + lnet_recv_put(ni, msg); + return 0; + + case LNET_MATCHMD_NONE: + if (msg->msg_rx_delayed) /* attached on delayed list */ + return 0; + + rc = lnet_ni_eager_recv(ni, msg); + if (rc == 0) + goto again; + /* fall through */ + + case LNET_MATCHMD_DROP: + CNETERR("Dropping PUT from %s portal %d match %llu offset %d length %d: %d\n", + libcfs_id2str(info.mi_id), info.mi_portal, + info.mi_mbits, info.mi_roffset, info.mi_rlength, rc); + + return ENOENT; /* +ve: OK but no match */ + } +} + +static int +lnet_parse_get(lnet_ni_t *ni, lnet_msg_t *msg, int rdma_get) +{ + struct lnet_match_info info; + lnet_hdr_t *hdr = &msg->msg_hdr; + lnet_handle_wire_t reply_wmd; + int rc; + + /* Convert get fields to host byte order */ + hdr->msg.get.match_bits = le64_to_cpu(hdr->msg.get.match_bits); + hdr->msg.get.ptl_index = le32_to_cpu(hdr->msg.get.ptl_index); + hdr->msg.get.sink_length = le32_to_cpu(hdr->msg.get.sink_length); + hdr->msg.get.src_offset = le32_to_cpu(hdr->msg.get.src_offset); + + info.mi_id.nid = hdr->src_nid; + info.mi_id.pid = hdr->src_pid; + info.mi_opc = LNET_MD_OP_GET; + info.mi_portal = hdr->msg.get.ptl_index; + info.mi_rlength = hdr->msg.get.sink_length; + info.mi_roffset = hdr->msg.get.src_offset; + info.mi_mbits = hdr->msg.get.match_bits; + + rc = lnet_ptl_match_md(&info, msg); + if (rc == LNET_MATCHMD_DROP) { + CNETERR("Dropping GET from %s portal %d match %llu offset %d length %d\n", + libcfs_id2str(info.mi_id), info.mi_portal, + info.mi_mbits, info.mi_roffset, 
info.mi_rlength); + return ENOENT; /* +ve: OK but no match */ + } + + LASSERT(rc == LNET_MATCHMD_OK); + + lnet_build_msg_event(msg, LNET_EVENT_GET); + + reply_wmd = hdr->msg.get.return_wmd; + + lnet_prep_send(msg, LNET_MSG_REPLY, info.mi_id, + msg->msg_offset, msg->msg_wanted); + + msg->msg_hdr.msg.reply.dst_wmd = reply_wmd; + + if (rdma_get) { + /* The LND completes the REPLY from her recv procedure */ + lnet_ni_recv(ni, msg->msg_private, msg, 0, + msg->msg_offset, msg->msg_len, msg->msg_len); + return 0; + } + + lnet_ni_recv(ni, msg->msg_private, NULL, 0, 0, 0, 0); + msg->msg_receiving = 0; + + rc = lnet_send(ni->ni_nid, msg, LNET_NID_ANY); + if (rc < 0) { + /* didn't get as far as lnet_ni_send() */ + CERROR("%s: Unable to send REPLY for GET from %s: %d\n", + libcfs_nid2str(ni->ni_nid), + libcfs_id2str(info.mi_id), rc); + + lnet_finalize(ni, msg, rc); + } + + return 0; +} + +static int +lnet_parse_reply(lnet_ni_t *ni, lnet_msg_t *msg) +{ + void *private = msg->msg_private; + lnet_hdr_t *hdr = &msg->msg_hdr; + lnet_process_id_t src = {0}; + lnet_libmd_t *md; + int rlength; + int mlength; + int cpt; + + cpt = lnet_cpt_of_cookie(hdr->msg.reply.dst_wmd.wh_object_cookie); + lnet_res_lock(cpt); + + src.nid = hdr->src_nid; + src.pid = hdr->src_pid; + + /* NB handles only looked up by creator (no flips) */ + md = lnet_wire_handle2md(&hdr->msg.reply.dst_wmd); + if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) { + CNETERR("%s: Dropping REPLY from %s for %s MD %#llx.%#llx\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), + (md == NULL) ? "invalid" : "inactive", + hdr->msg.reply.dst_wmd.wh_interface_cookie, + hdr->msg.reply.dst_wmd.wh_object_cookie); + if (md != NULL && md->md_me != NULL) + CERROR("REPLY MD also attached to portal %d\n", + md->md_me->me_portal); + + lnet_res_unlock(cpt); + return ENOENT; /* +ve: OK but no match */ + } + + LASSERT(md->md_offset == 0); + + rlength = hdr->payload_length; + mlength = min_t(uint, rlength, md->md_length); + + if (mlength < rlength && + (md->md_options & LNET_MD_TRUNCATE) == 0) { + CNETERR("%s: Dropping REPLY from %s length %d for MD %#llx would overflow (%d)\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), + rlength, hdr->msg.reply.dst_wmd.wh_object_cookie, + mlength); + lnet_res_unlock(cpt); + return ENOENT; /* +ve: OK but no match */ + } + + CDEBUG(D_NET, "%s: Reply from %s of length %d/%d into md %#llx\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), + mlength, rlength, hdr->msg.reply.dst_wmd.wh_object_cookie); + + lnet_msg_attach_md(msg, md, 0, mlength); + + if (mlength != 0) + lnet_setpayloadbuffer(msg); + + lnet_res_unlock(cpt); + + lnet_build_msg_event(msg, LNET_EVENT_REPLY); + + lnet_ni_recv(ni, private, msg, 0, 0, mlength, rlength); + return 0; +} + +static int +lnet_parse_ack(lnet_ni_t *ni, lnet_msg_t *msg) +{ + lnet_hdr_t *hdr = &msg->msg_hdr; + lnet_process_id_t src = {0}; + lnet_libmd_t *md; + int cpt; + + src.nid = hdr->src_nid; + src.pid = hdr->src_pid; + + /* Convert ack fields to host byte order */ + hdr->msg.ack.match_bits = le64_to_cpu(hdr->msg.ack.match_bits); + hdr->msg.ack.mlength = le32_to_cpu(hdr->msg.ack.mlength); + + cpt = lnet_cpt_of_cookie(hdr->msg.ack.dst_wmd.wh_object_cookie); + lnet_res_lock(cpt); + + /* NB handles only looked up by creator (no flips) */ + md = lnet_wire_handle2md(&hdr->msg.ack.dst_wmd); + if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) { + /* Don't moan; this is expected */ + CDEBUG(D_NET, + "%s: Dropping ACK from %s to %s MD %#llx.%#llx\n", + 
libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), + (md == NULL) ? "invalid" : "inactive", + hdr->msg.ack.dst_wmd.wh_interface_cookie, + hdr->msg.ack.dst_wmd.wh_object_cookie); + if (md != NULL && md->md_me != NULL) + CERROR("Source MD also attached to portal %d\n", + md->md_me->me_portal); + + lnet_res_unlock(cpt); + return ENOENT; /* +ve! */ + } + + CDEBUG(D_NET, "%s: ACK from %s into md %#llx\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), + hdr->msg.ack.dst_wmd.wh_object_cookie); + + lnet_msg_attach_md(msg, md, 0, 0); + + lnet_res_unlock(cpt); + + lnet_build_msg_event(msg, LNET_EVENT_ACK); + + lnet_ni_recv(ni, msg->msg_private, msg, 0, 0, 0, msg->msg_len); + return 0; +} + +static int +lnet_parse_forward_locked(lnet_ni_t *ni, lnet_msg_t *msg) +{ + int rc = 0; + + if (msg->msg_rxpeer->lp_rtrcredits <= 0 || + lnet_msg2bufpool(msg)->rbp_credits <= 0) { + if (ni->ni_lnd->lnd_eager_recv == NULL) { + msg->msg_rx_ready_delay = 1; + } else { + lnet_net_unlock(msg->msg_rx_cpt); + rc = lnet_ni_eager_recv(ni, msg); + lnet_net_lock(msg->msg_rx_cpt); + } + } + + if (rc == 0) + rc = lnet_post_routed_recv_locked(msg, 0); + return rc; +} + +char * +lnet_msgtyp2str(int type) +{ + switch (type) { + case LNET_MSG_ACK: + return "ACK"; + case LNET_MSG_PUT: + return "PUT"; + case LNET_MSG_GET: + return "GET"; + case LNET_MSG_REPLY: + return "REPLY"; + case LNET_MSG_HELLO: + return "HELLO"; + default: + return ""; + } +} +EXPORT_SYMBOL(lnet_msgtyp2str); + +void +lnet_print_hdr(lnet_hdr_t *hdr) +{ + lnet_process_id_t src = {0}; + lnet_process_id_t dst = {0}; + char *type_str = lnet_msgtyp2str(hdr->type); + + src.nid = hdr->src_nid; + src.pid = hdr->src_pid; + + dst.nid = hdr->dest_nid; + dst.pid = hdr->dest_pid; + + CWARN("P3 Header at %p of type %s\n", hdr, type_str); + CWARN(" From %s\n", libcfs_id2str(src)); + CWARN(" To %s\n", libcfs_id2str(dst)); + + switch (hdr->type) { + default: + break; + + case LNET_MSG_PUT: + CWARN(" Ptl index %d, ack md %#llx.%#llx, match bits %llu\n", + hdr->msg.put.ptl_index, + hdr->msg.put.ack_wmd.wh_interface_cookie, + hdr->msg.put.ack_wmd.wh_object_cookie, + hdr->msg.put.match_bits); + CWARN(" Length %d, offset %d, hdr data %#llx\n", + hdr->payload_length, hdr->msg.put.offset, + hdr->msg.put.hdr_data); + break; + + case LNET_MSG_GET: + CWARN(" Ptl index %d, return md %#llx.%#llx, match bits %llu\n", + hdr->msg.get.ptl_index, + hdr->msg.get.return_wmd.wh_interface_cookie, + hdr->msg.get.return_wmd.wh_object_cookie, + hdr->msg.get.match_bits); + CWARN(" Length %d, src offset %d\n", + hdr->msg.get.sink_length, + hdr->msg.get.src_offset); + break; + + case LNET_MSG_ACK: + CWARN(" dst md %#llx.%#llx, manipulated length %d\n", + hdr->msg.ack.dst_wmd.wh_interface_cookie, + hdr->msg.ack.dst_wmd.wh_object_cookie, + hdr->msg.ack.mlength); + break; + + case LNET_MSG_REPLY: + CWARN(" dst md %#llx.%#llx, length %d\n", + hdr->msg.reply.dst_wmd.wh_interface_cookie, + hdr->msg.reply.dst_wmd.wh_object_cookie, + hdr->payload_length); + } + +} + +int +lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid, + void *private, int rdma_req) +{ + int rc = 0; + int cpt; + int for_me; + struct lnet_msg *msg; + lnet_pid_t dest_pid; + lnet_nid_t dest_nid; + lnet_nid_t src_nid; + __u32 payload_length; + __u32 type; + + LASSERT(!in_interrupt()); + + type = le32_to_cpu(hdr->type); + src_nid = le64_to_cpu(hdr->src_nid); + dest_nid = le64_to_cpu(hdr->dest_nid); + dest_pid = le32_to_cpu(hdr->dest_pid); + payload_length = le32_to_cpu(hdr->payload_length); + + for_me = (ni->ni_nid == dest_nid); + 
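+ /* The wire header is little-endian; the fields above have just been
+  * converted to host order.  The switch below enforces the protocol:
+  * ACK and GET carry no payload, while PUT and REPLY payloads are
+  * capped at LNET_MAX_PAYLOAD when we are the final destination and
+  * at LNET_MTU when the message still has to be routed. */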
cpt = lnet_cpt_of_nid(from_nid); + + switch (type) { + case LNET_MSG_ACK: + case LNET_MSG_GET: + if (payload_length > 0) { + CERROR("%s, src %s: bad %s payload %d (0 expected)\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), + lnet_msgtyp2str(type), payload_length); + return -EPROTO; + } + break; + + case LNET_MSG_PUT: + case LNET_MSG_REPLY: + if (payload_length > + (__u32)(for_me ? LNET_MAX_PAYLOAD : LNET_MTU)) { + CERROR("%s, src %s: bad %s payload %d (%d max expected)\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), + lnet_msgtyp2str(type), + payload_length, + for_me ? LNET_MAX_PAYLOAD : LNET_MTU); + return -EPROTO; + } + break; + + default: + CERROR("%s, src %s: Bad message type 0x%x\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), type); + return -EPROTO; + } + + if (the_lnet.ln_routing && + ni->ni_last_alive != get_seconds()) { + lnet_ni_lock(ni); + + /* NB: so far here is the only place to set NI status to "up */ + ni->ni_last_alive = get_seconds(); + if (ni->ni_status != NULL && + ni->ni_status->ns_status == LNET_NI_STATUS_DOWN) + ni->ni_status->ns_status = LNET_NI_STATUS_UP; + lnet_ni_unlock(ni); + } + + /* Regard a bad destination NID as a protocol error. Senders should + * know what they're doing; if they don't they're misconfigured, buggy + * or malicious so we chop them off at the knees :) */ + + if (!for_me) { + if (LNET_NIDNET(dest_nid) == LNET_NIDNET(ni->ni_nid)) { + /* should have gone direct */ + CERROR("%s, src %s: Bad dest nid %s (should have been sent direct)\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), + libcfs_nid2str(dest_nid)); + return -EPROTO; + } + + if (lnet_islocalnid(dest_nid)) { + /* dest is another local NI; sender should have used + * this node's NID on its own network */ + CERROR("%s, src %s: Bad dest nid %s (it's my nid but on a different network)\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), + libcfs_nid2str(dest_nid)); + return -EPROTO; + } + + if (rdma_req && type == LNET_MSG_GET) { + CERROR("%s, src %s: Bad optimized GET for %s (final destination must be me)\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), + libcfs_nid2str(dest_nid)); + return -EPROTO; + } + + if (!the_lnet.ln_routing) { + CERROR("%s, src %s: Dropping message for %s (routing not enabled)\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), + libcfs_nid2str(dest_nid)); + goto drop; + } + } + + /* Message looks OK; we're not going to return an error, so we MUST + * call back lnd_recv() come what may... */ + + if (!list_empty(&the_lnet.ln_test_peers) && /* normally we don't */ + fail_peer(src_nid, 0)) { /* shall we now? */ + CERROR("%s, src %s: Dropping %s to simulate failure\n", + libcfs_nid2str(from_nid), libcfs_nid2str(src_nid), + lnet_msgtyp2str(type)); + goto drop; + } + + msg = lnet_msg_alloc(); + if (msg == NULL) { + CERROR("%s, src %s: Dropping %s (out of memory)\n", + libcfs_nid2str(from_nid), libcfs_nid2str(src_nid), + lnet_msgtyp2str(type)); + goto drop; + } + + /* msg zeroed in lnet_msg_alloc; + * i.e. 
flags all clear, pointers NULL etc + */ + + msg->msg_type = type; + msg->msg_private = private; + msg->msg_receiving = 1; + msg->msg_len = msg->msg_wanted = payload_length; + msg->msg_offset = 0; + msg->msg_hdr = *hdr; + /* for building message event */ + msg->msg_from = from_nid; + if (!for_me) { + msg->msg_target.pid = dest_pid; + msg->msg_target.nid = dest_nid; + msg->msg_routing = 1; + + } else { + /* convert common msg->hdr fields to host byteorder */ + msg->msg_hdr.type = type; + msg->msg_hdr.src_nid = src_nid; + msg->msg_hdr.src_pid = le32_to_cpu(msg->msg_hdr.src_pid); + msg->msg_hdr.dest_nid = dest_nid; + msg->msg_hdr.dest_pid = dest_pid; + msg->msg_hdr.payload_length = payload_length; + } + + lnet_net_lock(cpt); + rc = lnet_nid2peer_locked(&msg->msg_rxpeer, from_nid, cpt); + if (rc != 0) { + lnet_net_unlock(cpt); + CERROR("%s, src %s: Dropping %s (error %d looking up sender)\n", + libcfs_nid2str(from_nid), libcfs_nid2str(src_nid), + lnet_msgtyp2str(type), rc); + lnet_msg_free(msg); + goto drop; + } + + if (lnet_isrouter(msg->msg_rxpeer)) { + lnet_peer_set_alive(msg->msg_rxpeer); + if (avoid_asym_router_failure && + LNET_NIDNET(src_nid) != LNET_NIDNET(from_nid)) { + /* received a remote message from router, update + * remote NI status on this router. + * NB: multi-hop routed message will be ignored. + */ + lnet_router_ni_update_locked(msg->msg_rxpeer, + LNET_NIDNET(src_nid)); + } + } + + lnet_msg_commit(msg, cpt); + + if (!for_me) { + rc = lnet_parse_forward_locked(ni, msg); + lnet_net_unlock(cpt); + + if (rc < 0) + goto free_drop; + if (rc == 0) { + lnet_ni_recv(ni, msg->msg_private, msg, 0, + 0, payload_length, payload_length); + } + return 0; + } + + lnet_net_unlock(cpt); + + switch (type) { + case LNET_MSG_ACK: + rc = lnet_parse_ack(ni, msg); + break; + case LNET_MSG_PUT: + rc = lnet_parse_put(ni, msg); + break; + case LNET_MSG_GET: + rc = lnet_parse_get(ni, msg, rdma_req); + break; + case LNET_MSG_REPLY: + rc = lnet_parse_reply(ni, msg); + break; + default: + LASSERT(0); + rc = -EPROTO; + goto free_drop; /* prevent an unused label if !kernel */ + } + + if (rc == 0) + return 0; + + LASSERT(rc == ENOENT); + + free_drop: + LASSERT(msg->msg_md == NULL); + lnet_finalize(ni, msg, rc); + + drop: + lnet_drop_message(ni, cpt, private, payload_length); + return 0; +} +EXPORT_SYMBOL(lnet_parse); + +void +lnet_drop_delayed_msg_list(struct list_head *head, char *reason) +{ + while (!list_empty(head)) { + lnet_process_id_t id = {0}; + lnet_msg_t *msg; + + msg = list_entry(head->next, lnet_msg_t, msg_list); + list_del(&msg->msg_list); + + id.nid = msg->msg_hdr.src_nid; + id.pid = msg->msg_hdr.src_pid; + + LASSERT(msg->msg_md == NULL); + LASSERT(msg->msg_rx_delayed); + LASSERT(msg->msg_rxpeer != NULL); + LASSERT(msg->msg_hdr.type == LNET_MSG_PUT); + + CWARN("Dropping delayed PUT from %s portal %d match %llu offset %d length %d: %s\n", + libcfs_id2str(id), + msg->msg_hdr.msg.put.ptl_index, + msg->msg_hdr.msg.put.match_bits, + msg->msg_hdr.msg.put.offset, + msg->msg_hdr.payload_length, reason); + + /* NB I can't drop msg's ref on msg_rxpeer until after I've + * called lnet_drop_message(), so I just hang onto msg as well + * until that's done */ + + lnet_drop_message(msg->msg_rxpeer->lp_ni, + msg->msg_rxpeer->lp_cpt, + msg->msg_private, msg->msg_len); + /* + * NB: message will not generate event because w/o attached MD, + * but we still should give error code so lnet_msg_decommit() + * can skip counters operations and other checks. 
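+ * (The drop itself has already been charged to the per-CPT drop
+ * counters by lnet_drop_message() above.)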
+ */ + lnet_finalize(msg->msg_rxpeer->lp_ni, msg, -ENOENT); + } +} + +void +lnet_recv_delayed_msg_list(struct list_head *head) +{ + while (!list_empty(head)) { + lnet_msg_t *msg; + lnet_process_id_t id; + + msg = list_entry(head->next, lnet_msg_t, msg_list); + list_del(&msg->msg_list); + + /* md won't disappear under me, since each msg + * holds a ref on it */ + + id.nid = msg->msg_hdr.src_nid; + id.pid = msg->msg_hdr.src_pid; + + LASSERT(msg->msg_rx_delayed); + LASSERT(msg->msg_md != NULL); + LASSERT(msg->msg_rxpeer != NULL); + LASSERT(msg->msg_hdr.type == LNET_MSG_PUT); + + CDEBUG(D_NET, "Resuming delayed PUT from %s portal %d match %llu offset %d length %d.\n", + libcfs_id2str(id), msg->msg_hdr.msg.put.ptl_index, + msg->msg_hdr.msg.put.match_bits, + msg->msg_hdr.msg.put.offset, + msg->msg_hdr.payload_length); + + lnet_recv_put(msg->msg_rxpeer->lp_ni, msg); + } +} + +/** + * Initiate an asynchronous PUT operation. + * + * There are several events associated with a PUT: completion of the send on + * the initiator node (LNET_EVENT_SEND), and when the send completes + * successfully, the receipt of an acknowledgment (LNET_EVENT_ACK) indicating + * that the operation was accepted by the target. The event LNET_EVENT_PUT is + * used at the target node to indicate the completion of incoming data + * delivery. + * + * The local events will be logged in the EQ associated with the MD pointed to + * by \a mdh handle. Using a MD without an associated EQ results in these + * events being discarded. In this case, the caller must have another + * mechanism (e.g., a higher level protocol) for determining when it is safe + * to modify the memory region associated with the MD. + * + * Note that LNet does not guarantee the order of LNET_EVENT_SEND and + * LNET_EVENT_ACK, though intuitively ACK should happen after SEND. + * + * \param self Indicates the NID of a local interface through which to send + * the PUT request. Use LNET_NID_ANY to let LNet choose one by itself. + * \param mdh A handle for the MD that describes the memory to be sent. The MD + * must be "free floating" (See LNetMDBind()). + * \param ack Controls whether an acknowledgment is requested. + * Acknowledgments are only sent when they are requested by the initiating + * process and the target MD enables them. + * \param target A process identifier for the target process. + * \param portal The index in the \a target's portal table. + * \param match_bits The match bits to use for MD selection at the target + * process. + * \param offset The offset into the target MD (only used when the target + * MD has the LNET_MD_MANAGE_REMOTE option set). + * \param hdr_data 64 bits of user data that can be included in the message + * header. This data is written to an event queue entry at the target if an + * EQ is present on the matching MD. + * + * \retval 0 Success, and only in this case events will be generated + * and logged to EQ (if it exists). + * \retval -EIO Simulated failure. + * \retval -ENOMEM Memory allocation failure. + * \retval -ENOENT Invalid MD object. + * + * \see lnet_event_t::hdr_data and lnet_event_kind_t. 
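+ *
+ * A minimal illustrative call (not part of this patch; \a mdh and
+ * \a target are assumed to have been prepared already, e.g. the MD
+ * bound with LNetMDBind()):
+ *
+ *	rc = LNetPut(LNET_NID_ANY, mdh, LNET_ACK_REQ, target,
+ *		     portal, match_bits, 0, 0);
+ *	if (rc != 0)
+ *		CERROR("LNetPut to %s failed: %d\n",
+ *		       libcfs_id2str(target), rc);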
+ */ +int +LNetPut(lnet_nid_t self, lnet_handle_md_t mdh, lnet_ack_req_t ack, + lnet_process_id_t target, unsigned int portal, + __u64 match_bits, unsigned int offset, + __u64 hdr_data) +{ + struct lnet_msg *msg; + struct lnet_libmd *md; + int cpt; + int rc; + + LASSERT(the_lnet.ln_init); + LASSERT(the_lnet.ln_refcount > 0); + + if (!list_empty(&the_lnet.ln_test_peers) && /* normally we don't */ + fail_peer(target.nid, 1)) { /* shall we now? */ + CERROR("Dropping PUT to %s: simulated failure\n", + libcfs_id2str(target)); + return -EIO; + } + + msg = lnet_msg_alloc(); + if (msg == NULL) { + CERROR("Dropping PUT to %s: ENOMEM on lnet_msg_t\n", + libcfs_id2str(target)); + return -ENOMEM; + } + msg->msg_vmflush = !!memory_pressure_get(); + + cpt = lnet_cpt_of_cookie(mdh.cookie); + lnet_res_lock(cpt); + + md = lnet_handle2md(&mdh); + if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) { + CERROR("Dropping PUT (%llu:%d:%s): MD (%d) invalid\n", + match_bits, portal, libcfs_id2str(target), + md == NULL ? -1 : md->md_threshold); + if (md != NULL && md->md_me != NULL) + CERROR("Source MD also attached to portal %d\n", + md->md_me->me_portal); + lnet_res_unlock(cpt); + + lnet_msg_free(msg); + return -ENOENT; + } + + CDEBUG(D_NET, "LNetPut -> %s\n", libcfs_id2str(target)); + + lnet_msg_attach_md(msg, md, 0, 0); + + lnet_prep_send(msg, LNET_MSG_PUT, target, 0, md->md_length); + + msg->msg_hdr.msg.put.match_bits = cpu_to_le64(match_bits); + msg->msg_hdr.msg.put.ptl_index = cpu_to_le32(portal); + msg->msg_hdr.msg.put.offset = cpu_to_le32(offset); + msg->msg_hdr.msg.put.hdr_data = hdr_data; + + /* NB handles only looked up by creator (no flips) */ + if (ack == LNET_ACK_REQ) { + msg->msg_hdr.msg.put.ack_wmd.wh_interface_cookie = + the_lnet.ln_interface_cookie; + msg->msg_hdr.msg.put.ack_wmd.wh_object_cookie = + md->md_lh.lh_cookie; + } else { + msg->msg_hdr.msg.put.ack_wmd.wh_interface_cookie = + LNET_WIRE_HANDLE_COOKIE_NONE; + msg->msg_hdr.msg.put.ack_wmd.wh_object_cookie = + LNET_WIRE_HANDLE_COOKIE_NONE; + } + + lnet_res_unlock(cpt); + + lnet_build_msg_event(msg, LNET_EVENT_SEND); + + rc = lnet_send(self, msg, LNET_NID_ANY); + if (rc != 0) { + CNETERR("Error sending PUT to %s: %d\n", + libcfs_id2str(target), rc); + lnet_finalize(NULL, msg, rc); + } + + /* completion will be signalled by an event */ + return 0; +} +EXPORT_SYMBOL(LNetPut); + +lnet_msg_t * +lnet_create_reply_msg(lnet_ni_t *ni, lnet_msg_t *getmsg) +{ + /* The LND can DMA direct to the GET md (i.e. no REPLY msg). This + * returns a msg for the LND to pass to lnet_finalize() when the sink + * data has been received. 
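+ *
+ * Illustrative LND-side usage (a sketch only; names are placeholders):
+ *
+ *	reply = lnet_create_reply_msg(ni, getmsg);
+ *	... RDMA the sink data straight into the GET MD ...
+ *	lnet_set_reply_msg_len(ni, reply, nob);
+ *	lnet_finalize(ni, getmsg, status);
+ *	lnet_finalize(ni, reply, status);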
+ * + * CAVEAT EMPTOR: 'getmsg' is the original GET, which is freed when + * lnet_finalize() is called on it, so the LND must call this first */ + + struct lnet_msg *msg = lnet_msg_alloc(); + struct lnet_libmd *getmd = getmsg->msg_md; + lnet_process_id_t peer_id = getmsg->msg_target; + int cpt; + + LASSERT(!getmsg->msg_target_is_router); + LASSERT(!getmsg->msg_routing); + + cpt = lnet_cpt_of_cookie(getmd->md_lh.lh_cookie); + lnet_res_lock(cpt); + + LASSERT(getmd->md_refcount > 0); + + if (msg == NULL) { + CERROR("%s: Dropping REPLY from %s: can't allocate msg\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id)); + goto drop; + } + + if (getmd->md_threshold == 0) { + CERROR("%s: Dropping REPLY from %s for inactive MD %p\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id), + getmd); + lnet_res_unlock(cpt); + goto drop; + } + + LASSERT(getmd->md_offset == 0); + + CDEBUG(D_NET, "%s: Reply from %s md %p\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id), getmd); + + /* setup information for lnet_build_msg_event */ + msg->msg_from = peer_id.nid; + msg->msg_type = LNET_MSG_GET; /* flag this msg as an "optimized" GET */ + msg->msg_hdr.src_nid = peer_id.nid; + msg->msg_hdr.payload_length = getmd->md_length; + msg->msg_receiving = 1; /* required by lnet_msg_attach_md */ + + lnet_msg_attach_md(msg, getmd, getmd->md_offset, getmd->md_length); + lnet_res_unlock(cpt); + + cpt = lnet_cpt_of_nid(peer_id.nid); + + lnet_net_lock(cpt); + lnet_msg_commit(msg, cpt); + lnet_net_unlock(cpt); + + lnet_build_msg_event(msg, LNET_EVENT_REPLY); + + return msg; + + drop: + cpt = lnet_cpt_of_nid(peer_id.nid); + + lnet_net_lock(cpt); + the_lnet.ln_counters[cpt]->drop_count++; + the_lnet.ln_counters[cpt]->drop_length += getmd->md_length; + lnet_net_unlock(cpt); + + if (msg != NULL) + lnet_msg_free(msg); + + return NULL; +} +EXPORT_SYMBOL(lnet_create_reply_msg); + +void +lnet_set_reply_msg_len(lnet_ni_t *ni, lnet_msg_t *reply, unsigned int len) +{ + /* Set the REPLY length, now the RDMA that elides the REPLY message has + * completed and I know it. */ + LASSERT(reply != NULL); + LASSERT(reply->msg_type == LNET_MSG_GET); + LASSERT(reply->msg_ev.type == LNET_EVENT_REPLY); + + /* NB I trusted my peer to RDMA. If she tells me she's written beyond + * the end of my buffer, I might as well be dead. */ + LASSERT(len <= reply->msg_ev.mlength); + + reply->msg_ev.mlength = len; +} +EXPORT_SYMBOL(lnet_set_reply_msg_len); + +/** + * Initiate an asynchronous GET operation. + * + * On the initiator node, an LNET_EVENT_SEND is logged when the GET request + * is sent, and an LNET_EVENT_REPLY is logged when the data returned from + * the target node in the REPLY has been written to local MD. + * + * On the target node, an LNET_EVENT_GET is logged when the GET request + * arrives and is accepted into a MD. + * + * \param self,target,portal,match_bits,offset See the discussion in LNetPut(). + * \param mdh A handle for the MD that describes the memory into which the + * requested data will be received. The MD must be "free floating" + * (See LNetMDBind()). + * + * \retval 0 Success, and only in this case events will be generated + * and logged to EQ (if it exists) of the MD. + * \retval -EIO Simulated failure. + * \retval -ENOMEM Memory allocation failure. + * \retval -ENOENT Invalid MD object. 
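+ *
+ * A minimal illustrative call (not part of this patch; \a mdh is
+ * assumed to describe the sink buffer, bound with LNetMDBind()):
+ *
+ *	rc = LNetGet(LNET_NID_ANY, mdh, target, portal, match_bits, 0);
+ *	if (rc != 0)
+ *		CERROR("LNetGet from %s failed: %d\n",
+ *		       libcfs_id2str(target), rc);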
+ */ +int +LNetGet(lnet_nid_t self, lnet_handle_md_t mdh, + lnet_process_id_t target, unsigned int portal, + __u64 match_bits, unsigned int offset) +{ + struct lnet_msg *msg; + struct lnet_libmd *md; + int cpt; + int rc; + + LASSERT(the_lnet.ln_init); + LASSERT(the_lnet.ln_refcount > 0); + + if (!list_empty(&the_lnet.ln_test_peers) && /* normally we don't */ + fail_peer(target.nid, 1)) { /* shall we now? */ + CERROR("Dropping GET to %s: simulated failure\n", + libcfs_id2str(target)); + return -EIO; + } + + msg = lnet_msg_alloc(); + if (msg == NULL) { + CERROR("Dropping GET to %s: ENOMEM on lnet_msg_t\n", + libcfs_id2str(target)); + return -ENOMEM; + } + + cpt = lnet_cpt_of_cookie(mdh.cookie); + lnet_res_lock(cpt); + + md = lnet_handle2md(&mdh); + if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) { + CERROR("Dropping GET (%llu:%d:%s): MD (%d) invalid\n", + match_bits, portal, libcfs_id2str(target), + md == NULL ? -1 : md->md_threshold); + if (md != NULL && md->md_me != NULL) + CERROR("REPLY MD also attached to portal %d\n", + md->md_me->me_portal); + + lnet_res_unlock(cpt); + + lnet_msg_free(msg); + return -ENOENT; + } + + CDEBUG(D_NET, "LNetGet -> %s\n", libcfs_id2str(target)); + + lnet_msg_attach_md(msg, md, 0, 0); + + lnet_prep_send(msg, LNET_MSG_GET, target, 0, 0); + + msg->msg_hdr.msg.get.match_bits = cpu_to_le64(match_bits); + msg->msg_hdr.msg.get.ptl_index = cpu_to_le32(portal); + msg->msg_hdr.msg.get.src_offset = cpu_to_le32(offset); + msg->msg_hdr.msg.get.sink_length = cpu_to_le32(md->md_length); + + /* NB handles only looked up by creator (no flips) */ + msg->msg_hdr.msg.get.return_wmd.wh_interface_cookie = + the_lnet.ln_interface_cookie; + msg->msg_hdr.msg.get.return_wmd.wh_object_cookie = + md->md_lh.lh_cookie; + + lnet_res_unlock(cpt); + + lnet_build_msg_event(msg, LNET_EVENT_SEND); + + rc = lnet_send(self, msg, LNET_NID_ANY); + if (rc < 0) { + CNETERR("Error sending GET to %s: %d\n", + libcfs_id2str(target), rc); + lnet_finalize(NULL, msg, rc); + } + + /* completion will be signalled by an event */ + return 0; +} +EXPORT_SYMBOL(LNetGet); + +/** + * Calculate distance to node at \a dstnid. + * + * \param dstnid Target NID. + * \param srcnidp If not NULL, NID of the local interface to reach \a dstnid + * is saved here. + * \param orderp If not NULL, order of the route to reach \a dstnid is saved + * here. + * + * \retval 0 If \a dstnid belongs to a local interface, and reserved option + * local_nid_dist_zero is set, which is the default. + * \retval positives Distance to target NID, i.e. number of hops plus one. + * \retval -EHOSTUNREACH If \a dstnid is not reachable. + */ +int +LNetDist(lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp) +{ + struct list_head *e; + struct lnet_ni *ni; + lnet_remotenet_t *rnet; + __u32 dstnet = LNET_NIDNET(dstnid); + int hops; + int cpt; + __u32 order = 2; + struct list_head *rn_list; + + /* if !local_nid_dist_zero, I don't return a distance of 0 ever + * (when lustre sees a distance of 0, it substitutes 0@lo), so I + * keep order 0 free for 0@lo and order 1 free for a local NID + * match */ + + LASSERT(the_lnet.ln_init); + LASSERT(the_lnet.ln_refcount > 0); + + cpt = lnet_net_lock_current(); + + list_for_each(e, &the_lnet.ln_nis) { + ni = list_entry(e, lnet_ni_t, ni_list); + + if (ni->ni_nid == dstnid) { + if (srcnidp != NULL) + *srcnidp = dstnid; + if (orderp != NULL) { + if (LNET_NETTYP(LNET_NIDNET(dstnid)) == LOLND) + *orderp = 0; + else + *orderp = 1; + } + lnet_net_unlock(cpt); + + return local_nid_dist_zero ? 
0 : 1; + } + + if (LNET_NIDNET(ni->ni_nid) == dstnet) { + if (srcnidp != NULL) + *srcnidp = ni->ni_nid; + if (orderp != NULL) + *orderp = order; + lnet_net_unlock(cpt); + return 1; + } + + order++; + } + + rn_list = lnet_net2rnethash(dstnet); + list_for_each(e, rn_list) { + rnet = list_entry(e, lnet_remotenet_t, lrn_list); + + if (rnet->lrn_net == dstnet) { + lnet_route_t *route; + lnet_route_t *shortest = NULL; + + LASSERT(!list_empty(&rnet->lrn_routes)); + + list_for_each_entry(route, &rnet->lrn_routes, + lr_list) { + if (shortest == NULL || + route->lr_hops < shortest->lr_hops) + shortest = route; + } + + LASSERT(shortest != NULL); + hops = shortest->lr_hops; + if (srcnidp != NULL) + *srcnidp = shortest->lr_gateway->lp_ni->ni_nid; + if (orderp != NULL) + *orderp = order; + lnet_net_unlock(cpt); + return hops + 1; + } + order++; + } + + lnet_net_unlock(cpt); + return -EHOSTUNREACH; +} +EXPORT_SYMBOL(LNetDist); + +/** + * Set the number of asynchronous messages expected from a target process. + * + * This function is only meaningful for userspace callers. It's a no-op when + * called from kernel. + * + * Asynchronous messages are those that can come from a target when the + * userspace process is not waiting for IO to complete; e.g., AST callbacks + * from Lustre servers. Specifying the expected number of such messages + * allows them to be eagerly received when user process is not running in + * LNet; otherwise network errors may occur. + * + * \param id Process ID of the target process. + * \param nasync Number of asynchronous messages expected from the target. + * + * \return 0 on success, and an error code otherwise. + */ +int +LNetSetAsync(lnet_process_id_t id, int nasync) +{ + return 0; +} +EXPORT_SYMBOL(LNetSetAsync); diff --git a/kernel/drivers/staging/lustre/lnet/lnet/lib-msg.c b/kernel/drivers/staging/lustre/lnet/lnet/lib-msg.c new file mode 100644 index 000000000..a46ccbf66 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/lnet/lib-msg.c @@ -0,0 +1,647 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lnet/lnet/lib-msg.c + * + * Message decoding, parsing and finalizing routines + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include "../../include/linux/lnet/lib-lnet.h" + +void +lnet_build_unlink_event(lnet_libmd_t *md, lnet_event_t *ev) +{ + memset(ev, 0, sizeof(*ev)); + + ev->status = 0; + ev->unlinked = 1; + ev->type = LNET_EVENT_UNLINK; + lnet_md_deconstruct(md, &ev->md); + lnet_md2handle(&ev->md_handle, md); +} + +/* + * Don't need any lock, must be called after lnet_commit_md + */ +void +lnet_build_msg_event(lnet_msg_t *msg, lnet_event_kind_t ev_type) +{ + lnet_hdr_t *hdr = &msg->msg_hdr; + lnet_event_t *ev = &msg->msg_ev; + + LASSERT(!msg->msg_routing); + + ev->type = ev_type; + + if (ev_type == LNET_EVENT_SEND) { + /* event for active message */ + ev->target.nid = le64_to_cpu(hdr->dest_nid); + ev->target.pid = le32_to_cpu(hdr->dest_pid); + ev->initiator.nid = LNET_NID_ANY; + ev->initiator.pid = the_lnet.ln_pid; + ev->sender = LNET_NID_ANY; + + } else { + /* event for passive message */ + ev->target.pid = hdr->dest_pid; + ev->target.nid = hdr->dest_nid; + ev->initiator.pid = hdr->src_pid; + ev->initiator.nid = hdr->src_nid; + ev->rlength = hdr->payload_length; + ev->sender = msg->msg_from; + ev->mlength = msg->msg_wanted; + ev->offset = msg->msg_offset; + } + + switch (ev_type) { + default: + LBUG(); + + case LNET_EVENT_PUT: /* passive PUT */ + ev->pt_index = hdr->msg.put.ptl_index; + ev->match_bits = hdr->msg.put.match_bits; + ev->hdr_data = hdr->msg.put.hdr_data; + return; + + case LNET_EVENT_GET: /* passive GET */ + ev->pt_index = hdr->msg.get.ptl_index; + ev->match_bits = hdr->msg.get.match_bits; + ev->hdr_data = 0; + return; + + case LNET_EVENT_ACK: /* ACK */ + ev->match_bits = hdr->msg.ack.match_bits; + ev->mlength = hdr->msg.ack.mlength; + return; + + case LNET_EVENT_REPLY: /* REPLY */ + return; + + case LNET_EVENT_SEND: /* active message */ + if (msg->msg_type == LNET_MSG_PUT) { + ev->pt_index = le32_to_cpu(hdr->msg.put.ptl_index); + ev->match_bits = le64_to_cpu(hdr->msg.put.match_bits); + ev->offset = le32_to_cpu(hdr->msg.put.offset); + ev->mlength = + ev->rlength = le32_to_cpu(hdr->payload_length); + ev->hdr_data = le64_to_cpu(hdr->msg.put.hdr_data); + + } else { + LASSERT(msg->msg_type == LNET_MSG_GET); + ev->pt_index = le32_to_cpu(hdr->msg.get.ptl_index); + ev->match_bits = le64_to_cpu(hdr->msg.get.match_bits); + ev->mlength = + ev->rlength = le32_to_cpu(hdr->msg.get.sink_length); + ev->offset = le32_to_cpu(hdr->msg.get.src_offset); + ev->hdr_data = 0; + } + return; + } +} + +void +lnet_msg_commit(lnet_msg_t *msg, int cpt) +{ + struct lnet_msg_container *container = the_lnet.ln_msg_containers[cpt]; + lnet_counters_t *counters = the_lnet.ln_counters[cpt]; + + /* routed message can be committed for both receiving and sending */ + LASSERT(!msg->msg_tx_committed); + + if (msg->msg_sending) { + LASSERT(!msg->msg_receiving); + + msg->msg_tx_cpt = cpt; + msg->msg_tx_committed = 1; + if (msg->msg_rx_committed) { /* routed message REPLY */ + LASSERT(msg->msg_onactivelist); + return; + } + } else { + LASSERT(!msg->msg_sending); + msg->msg_rx_cpt = cpt; + msg->msg_rx_committed = 1; + } + + LASSERT(!msg->msg_onactivelist); + msg->msg_onactivelist = 1; + list_add(&msg->msg_activelist, &container->msc_active); + + counters->msgs_alloc++; + if (counters->msgs_alloc > counters->msgs_max) + counters->msgs_max = counters->msgs_alloc; +} + +static void +lnet_msg_decommit_tx(lnet_msg_t *msg, int status) +{ + lnet_counters_t *counters; + lnet_event_t *ev = &msg->msg_ev; + + 
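+ /* Undo the send-side commit: on success, account the message in the
+  * per-CPT counters keyed by event type (restoring msg_type where it
+  * was overwritten while sending an ACK or REPLY); in all cases return
+  * the tx credits and clear msg_tx_committed. */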
LASSERT(msg->msg_tx_committed); + if (status != 0) + goto out; + + counters = the_lnet.ln_counters[msg->msg_tx_cpt]; + switch (ev->type) { + default: /* routed message */ + LASSERT(msg->msg_routing); + LASSERT(msg->msg_rx_committed); + LASSERT(ev->type == 0); + + counters->route_length += msg->msg_len; + counters->route_count++; + goto out; + + case LNET_EVENT_PUT: + /* should have been decommitted */ + LASSERT(!msg->msg_rx_committed); + /* overwritten while sending ACK */ + LASSERT(msg->msg_type == LNET_MSG_ACK); + msg->msg_type = LNET_MSG_PUT; /* fix type */ + break; + + case LNET_EVENT_SEND: + LASSERT(!msg->msg_rx_committed); + if (msg->msg_type == LNET_MSG_PUT) + counters->send_length += msg->msg_len; + break; + + case LNET_EVENT_GET: + LASSERT(msg->msg_rx_committed); + /* overwritten while sending reply, we should never be + * here for optimized GET */ + LASSERT(msg->msg_type == LNET_MSG_REPLY); + msg->msg_type = LNET_MSG_GET; /* fix type */ + break; + } + + counters->send_count++; + out: + lnet_return_tx_credits_locked(msg); + msg->msg_tx_committed = 0; +} + +static void +lnet_msg_decommit_rx(lnet_msg_t *msg, int status) +{ + lnet_counters_t *counters; + lnet_event_t *ev = &msg->msg_ev; + + LASSERT(!msg->msg_tx_committed); /* decommitted or never committed */ + LASSERT(msg->msg_rx_committed); + + if (status != 0) + goto out; + + counters = the_lnet.ln_counters[msg->msg_rx_cpt]; + switch (ev->type) { + default: + LASSERT(ev->type == 0); + LASSERT(msg->msg_routing); + goto out; + + case LNET_EVENT_ACK: + LASSERT(msg->msg_type == LNET_MSG_ACK); + break; + + case LNET_EVENT_GET: + /* type is "REPLY" if it's an optimized GET on passive side, + * because optimized GET will never be committed for sending, + * so message type wouldn't be changed back to "GET" by + * lnet_msg_decommit_tx(), see details in lnet_parse_get() */ + LASSERT(msg->msg_type == LNET_MSG_REPLY || + msg->msg_type == LNET_MSG_GET); + counters->send_length += msg->msg_wanted; + break; + + case LNET_EVENT_PUT: + LASSERT(msg->msg_type == LNET_MSG_PUT); + break; + + case LNET_EVENT_REPLY: + /* type is "GET" if it's an optimized GET on active side, + * see details in lnet_create_reply_msg() */ + LASSERT(msg->msg_type == LNET_MSG_GET || + msg->msg_type == LNET_MSG_REPLY); + break; + } + + counters->recv_count++; + if (ev->type == LNET_EVENT_PUT || ev->type == LNET_EVENT_REPLY) + counters->recv_length += msg->msg_wanted; + + out: + lnet_return_rx_credits_locked(msg); + msg->msg_rx_committed = 0; +} + +void +lnet_msg_decommit(lnet_msg_t *msg, int cpt, int status) +{ + int cpt2 = cpt; + + LASSERT(msg->msg_tx_committed || msg->msg_rx_committed); + LASSERT(msg->msg_onactivelist); + + if (msg->msg_tx_committed) { /* always decommit for sending first */ + LASSERT(cpt == msg->msg_tx_cpt); + lnet_msg_decommit_tx(msg, status); + } + + if (msg->msg_rx_committed) { + /* forwarding msg committed for both receiving and sending */ + if (cpt != msg->msg_rx_cpt) { + lnet_net_unlock(cpt); + cpt2 = msg->msg_rx_cpt; + lnet_net_lock(cpt2); + } + lnet_msg_decommit_rx(msg, status); + } + + list_del(&msg->msg_activelist); + msg->msg_onactivelist = 0; + + the_lnet.ln_counters[cpt2]->msgs_alloc--; + + if (cpt2 != cpt) { + lnet_net_unlock(cpt2); + lnet_net_lock(cpt); + } +} + +void +lnet_msg_attach_md(lnet_msg_t *msg, lnet_libmd_t *md, + unsigned int offset, unsigned int mlen) +{ + /* NB: @offset and @len are only useful for receiving */ + /* Here, we attach the MD on lnet_msg and mark it busy and + * decrementing its threshold. 
Come what may, the lnet_msg "owns" + * the MD until a call to lnet_msg_detach_md or lnet_finalize() + * signals completion. */ + LASSERT(!msg->msg_routing); + + msg->msg_md = md; + if (msg->msg_receiving) { /* committed for receiving */ + msg->msg_offset = offset; + msg->msg_wanted = mlen; + } + + md->md_refcount++; + if (md->md_threshold != LNET_MD_THRESH_INF) { + LASSERT(md->md_threshold > 0); + md->md_threshold--; + } + + /* build umd in event */ + lnet_md2handle(&msg->msg_ev.md_handle, md); + lnet_md_deconstruct(md, &msg->msg_ev.md); +} + +void +lnet_msg_detach_md(lnet_msg_t *msg, int status) +{ + lnet_libmd_t *md = msg->msg_md; + int unlink; + + /* Now it's safe to drop my caller's ref */ + md->md_refcount--; + LASSERT(md->md_refcount >= 0); + + unlink = lnet_md_unlinkable(md); + if (md->md_eq != NULL) { + msg->msg_ev.status = status; + msg->msg_ev.unlinked = unlink; + lnet_eq_enqueue_event(md->md_eq, &msg->msg_ev); + } + + if (unlink) + lnet_md_unlink(md); + + msg->msg_md = NULL; +} + +static int +lnet_complete_msg_locked(lnet_msg_t *msg, int cpt) +{ + lnet_handle_wire_t ack_wmd; + int rc; + int status = msg->msg_ev.status; + + LASSERT(msg->msg_onactivelist); + + if (status == 0 && msg->msg_ack) { + /* Only send an ACK if the PUT completed successfully */ + + lnet_msg_decommit(msg, cpt, 0); + + msg->msg_ack = 0; + lnet_net_unlock(cpt); + + LASSERT(msg->msg_ev.type == LNET_EVENT_PUT); + LASSERT(!msg->msg_routing); + + ack_wmd = msg->msg_hdr.msg.put.ack_wmd; + + lnet_prep_send(msg, LNET_MSG_ACK, msg->msg_ev.initiator, 0, 0); + + msg->msg_hdr.msg.ack.dst_wmd = ack_wmd; + msg->msg_hdr.msg.ack.match_bits = msg->msg_ev.match_bits; + msg->msg_hdr.msg.ack.mlength = cpu_to_le32(msg->msg_ev.mlength); + + /* NB: we probably want to use NID of msg::msg_from as 3rd + * parameter (router NID) if it's routed message */ + rc = lnet_send(msg->msg_ev.target.nid, msg, LNET_NID_ANY); + + lnet_net_lock(cpt); + /* + * NB: message is committed for sending, we should return + * on success because LND will finalize this message later. + * + * Also, there is possibility that message is committed for + * sending and also failed before delivering to LND, + * i.e: ENOMEM, in that case we can't fall through either + * because CPT for sending can be different with CPT for + * receiving, so we should return back to lnet_finalize() + * to make sure we are locking the correct partition. + */ + return rc; + + } else if (status == 0 && /* OK so far */ + (msg->msg_routing && !msg->msg_sending)) { + /* not forwarded */ + LASSERT(!msg->msg_receiving); /* called back recv already */ + lnet_net_unlock(cpt); + + rc = lnet_send(LNET_NID_ANY, msg, LNET_NID_ANY); + + lnet_net_lock(cpt); + /* + * NB: message is committed for sending, we should return + * on success because LND will finalize this message later. + * + * Also, there is possibility that message is committed for + * sending and also failed before delivering to LND, + * i.e: ENOMEM, in that case we can't fall through either: + * - The rule is message must decommit for sending first if + * the it's committed for both sending and receiving + * - CPT for sending can be different with CPT for receiving, + * so we should return back to lnet_finalize() to make + * sure we are locking the correct partition. 
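+ * lnet_finalize() reacts to the non-zero return by looping back to
+ * its "again" label and re-finalizing this message on the right CPT.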
+ */ + return rc; + } + + lnet_msg_decommit(msg, cpt, status); + lnet_msg_free_locked(msg); + return 0; +} + +void +lnet_finalize(lnet_ni_t *ni, lnet_msg_t *msg, int status) +{ + struct lnet_msg_container *container; + int my_slot; + int cpt; + int rc; + int i; + + LASSERT(!in_interrupt()); + + if (msg == NULL) + return; +#if 0 + CDEBUG(D_WARNING, "%s msg->%s Flags:%s%s%s%s%s%s%s%s%s%s%s txp %s rxp %s\n", + lnet_msgtyp2str(msg->msg_type), libcfs_id2str(msg->msg_target), + msg->msg_target_is_router ? "t" : "", + msg->msg_routing ? "X" : "", + msg->msg_ack ? "A" : "", + msg->msg_sending ? "S" : "", + msg->msg_receiving ? "R" : "", + msg->msg_delayed ? "d" : "", + msg->msg_txcredit ? "C" : "", + msg->msg_peertxcredit ? "c" : "", + msg->msg_rtrcredit ? "F" : "", + msg->msg_peerrtrcredit ? "f" : "", + msg->msg_onactivelist ? "!" : "", + msg->msg_txpeer == NULL ? "" : libcfs_nid2str(msg->msg_txpeer->lp_nid), + msg->msg_rxpeer == NULL ? "" : libcfs_nid2str(msg->msg_rxpeer->lp_nid)); +#endif + msg->msg_ev.status = status; + + if (msg->msg_md != NULL) { + cpt = lnet_cpt_of_cookie(msg->msg_md->md_lh.lh_cookie); + + lnet_res_lock(cpt); + lnet_msg_detach_md(msg, status); + lnet_res_unlock(cpt); + } + + again: + rc = 0; + if (!msg->msg_tx_committed && !msg->msg_rx_committed) { + /* not committed to network yet */ + LASSERT(!msg->msg_onactivelist); + lnet_msg_free(msg); + return; + } + + /* + * NB: routed message can be committed for both receiving and sending, + * we should finalize in LIFO order and keep counters correct. + * (finalize sending first then finalize receiving) + */ + cpt = msg->msg_tx_committed ? msg->msg_tx_cpt : msg->msg_rx_cpt; + lnet_net_lock(cpt); + + container = the_lnet.ln_msg_containers[cpt]; + list_add_tail(&msg->msg_list, &container->msc_finalizing); + + /* Recursion breaker. 
Don't complete the message here if I am (or + * enough other threads are) already completing messages */ + + my_slot = -1; + for (i = 0; i < container->msc_nfinalizers; i++) { + if (container->msc_finalizers[i] == current) + break; + + if (my_slot < 0 && container->msc_finalizers[i] == NULL) + my_slot = i; + } + + if (i < container->msc_nfinalizers || my_slot < 0) { + lnet_net_unlock(cpt); + return; + } + + container->msc_finalizers[my_slot] = current; + + while (!list_empty(&container->msc_finalizing)) { + msg = list_entry(container->msc_finalizing.next, + lnet_msg_t, msg_list); + + list_del(&msg->msg_list); + + /* NB drops and regains the lnet lock if it actually does + * anything, so my finalizing friends can chomp along too */ + rc = lnet_complete_msg_locked(msg, cpt); + if (rc != 0) + break; + } + + container->msc_finalizers[my_slot] = NULL; + lnet_net_unlock(cpt); + + if (rc != 0) + goto again; +} +EXPORT_SYMBOL(lnet_finalize); + +void +lnet_msg_container_cleanup(struct lnet_msg_container *container) +{ + int count = 0; + + if (container->msc_init == 0) + return; + + while (!list_empty(&container->msc_active)) { + lnet_msg_t *msg = list_entry(container->msc_active.next, + lnet_msg_t, msg_activelist); + + LASSERT(msg->msg_onactivelist); + msg->msg_onactivelist = 0; + list_del(&msg->msg_activelist); + lnet_msg_free(msg); + count++; + } + + if (count > 0) + CERROR("%d active msg on exit\n", count); + + if (container->msc_finalizers != NULL) { + LIBCFS_FREE(container->msc_finalizers, + container->msc_nfinalizers * + sizeof(*container->msc_finalizers)); + container->msc_finalizers = NULL; + } +#ifdef LNET_USE_LIB_FREELIST + lnet_freelist_fini(&container->msc_freelist); +#endif + container->msc_init = 0; +} + +int +lnet_msg_container_setup(struct lnet_msg_container *container, int cpt) +{ + int rc; + + container->msc_init = 1; + + INIT_LIST_HEAD(&container->msc_active); + INIT_LIST_HEAD(&container->msc_finalizing); + +#ifdef LNET_USE_LIB_FREELIST + memset(&container->msc_freelist, 0, sizeof(lnet_freelist_t)); + + rc = lnet_freelist_init(&container->msc_freelist, + LNET_FL_MAX_MSGS, sizeof(lnet_msg_t)); + if (rc != 0) { + CERROR("Failed to init freelist for message container\n"); + lnet_msg_container_cleanup(container); + return rc; + } +#else + rc = 0; +#endif + /* number of CPUs */ + container->msc_nfinalizers = cfs_cpt_weight(lnet_cpt_table(), cpt); + + LIBCFS_CPT_ALLOC(container->msc_finalizers, lnet_cpt_table(), cpt, + container->msc_nfinalizers * + sizeof(*container->msc_finalizers)); + + if (container->msc_finalizers == NULL) { + CERROR("Failed to allocate message finalizers\n"); + lnet_msg_container_cleanup(container); + return -ENOMEM; + } + + return rc; +} + +void +lnet_msg_containers_destroy(void) +{ + struct lnet_msg_container *container; + int i; + + if (the_lnet.ln_msg_containers == NULL) + return; + + cfs_percpt_for_each(container, i, the_lnet.ln_msg_containers) + lnet_msg_container_cleanup(container); + + cfs_percpt_free(the_lnet.ln_msg_containers); + the_lnet.ln_msg_containers = NULL; +} + +int +lnet_msg_containers_create(void) +{ + struct lnet_msg_container *container; + int rc; + int i; + + the_lnet.ln_msg_containers = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*container)); + + if (the_lnet.ln_msg_containers == NULL) { + CERROR("Failed to allocate cpu-partition data for network\n"); + return -ENOMEM; + } + + cfs_percpt_for_each(container, i, the_lnet.ln_msg_containers) { + rc = lnet_msg_container_setup(container, i); + if (rc != 0) { + lnet_msg_containers_destroy(); + 
return rc; + } + } + + return 0; +} diff --git a/kernel/drivers/staging/lustre/lnet/lnet/lib-ptl.c b/kernel/drivers/staging/lustre/lnet/lnet/lib-ptl.c new file mode 100644 index 000000000..3ba0da919 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/lnet/lib-ptl.c @@ -0,0 +1,935 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/lnet/lib-ptl.c + * + * portal & match routines + * + * Author: liang@whamcloud.com + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include "../../include/linux/lnet/lib-lnet.h" + +/* NB: add /proc interfaces in upcoming patches */ +int portal_rotor = LNET_PTL_ROTOR_HASH_RT; +module_param(portal_rotor, int, 0644); +MODULE_PARM_DESC(portal_rotor, "redirect PUTs to different cpu-partitions"); + +static int +lnet_ptl_match_type(unsigned int index, lnet_process_id_t match_id, + __u64 mbits, __u64 ignore_bits) +{ + struct lnet_portal *ptl = the_lnet.ln_portals[index]; + int unique; + + unique = ignore_bits == 0 && + match_id.nid != LNET_NID_ANY && + match_id.pid != LNET_PID_ANY; + + LASSERT(!lnet_ptl_is_unique(ptl) || !lnet_ptl_is_wildcard(ptl)); + + /* prefer to check w/o any lock */ + if (likely(lnet_ptl_is_unique(ptl) || lnet_ptl_is_wildcard(ptl))) + goto match; + + /* unset, new portal */ + lnet_ptl_lock(ptl); + /* check again with lock */ + if (unlikely(lnet_ptl_is_unique(ptl) || lnet_ptl_is_wildcard(ptl))) { + lnet_ptl_unlock(ptl); + goto match; + } + + /* still not set */ + if (unique) + lnet_ptl_setopt(ptl, LNET_PTL_MATCH_UNIQUE); + else + lnet_ptl_setopt(ptl, LNET_PTL_MATCH_WILDCARD); + + lnet_ptl_unlock(ptl); + + return 1; + + match: + if ((lnet_ptl_is_unique(ptl) && !unique) || + (lnet_ptl_is_wildcard(ptl) && unique)) + return 0; + return 1; +} + +static void +lnet_ptl_enable_mt(struct lnet_portal *ptl, int cpt) +{ + struct lnet_match_table *mtable = ptl->ptl_mtables[cpt]; + int i; + + /* with hold of both lnet_res_lock(cpt) and lnet_ptl_lock */ + LASSERT(lnet_ptl_is_wildcard(ptl)); + + mtable->mt_enabled = 1; + + ptl->ptl_mt_maps[ptl->ptl_mt_nmaps] = cpt; + for (i = ptl->ptl_mt_nmaps - 1; i >= 0; i--) { + LASSERT(ptl->ptl_mt_maps[i] != cpt); + if (ptl->ptl_mt_maps[i] < cpt) + break; + + /* swap to order */ + ptl->ptl_mt_maps[i + 1] = ptl->ptl_mt_maps[i]; + ptl->ptl_mt_maps[i] = cpt; + } + + ptl->ptl_mt_nmaps++; +} + +static void +lnet_ptl_disable_mt(struct lnet_portal *ptl, int cpt) +{ + struct lnet_match_table *mtable = ptl->ptl_mtables[cpt]; + int i; + + /* with hold of both lnet_res_lock(cpt) and lnet_ptl_lock */ + LASSERT(lnet_ptl_is_wildcard(ptl)); + + if 
(LNET_CPT_NUMBER == 1) + return; /* never disable the only match-table */ + + mtable->mt_enabled = 0; + + LASSERT(ptl->ptl_mt_nmaps > 0 && + ptl->ptl_mt_nmaps <= LNET_CPT_NUMBER); + + /* remove it from mt_maps */ + ptl->ptl_mt_nmaps--; + for (i = 0; i < ptl->ptl_mt_nmaps; i++) { + if (ptl->ptl_mt_maps[i] >= cpt) /* overwrite it */ + ptl->ptl_mt_maps[i] = ptl->ptl_mt_maps[i + 1]; + } +} + +static int +lnet_try_match_md(lnet_libmd_t *md, + struct lnet_match_info *info, struct lnet_msg *msg) +{ + /* ALWAYS called holding the lnet_res_lock, and can't lnet_res_unlock; + * lnet_match_blocked_msg() relies on this to avoid races */ + unsigned int offset; + unsigned int mlength; + lnet_me_t *me = md->md_me; + + /* MD exhausted */ + if (lnet_md_exhausted(md)) + return LNET_MATCHMD_NONE | LNET_MATCHMD_EXHAUSTED; + + /* mismatched MD op */ + if ((md->md_options & info->mi_opc) == 0) + return LNET_MATCHMD_NONE; + + /* mismatched ME nid/pid? */ + if (me->me_match_id.nid != LNET_NID_ANY && + me->me_match_id.nid != info->mi_id.nid) + return LNET_MATCHMD_NONE; + + if (me->me_match_id.pid != LNET_PID_ANY && + me->me_match_id.pid != info->mi_id.pid) + return LNET_MATCHMD_NONE; + + /* mismatched ME matchbits? */ + if (((me->me_match_bits ^ info->mi_mbits) & ~me->me_ignore_bits) != 0) + return LNET_MATCHMD_NONE; + + /* Hurrah! This _is_ a match; check it out... */ + + if ((md->md_options & LNET_MD_MANAGE_REMOTE) == 0) + offset = md->md_offset; + else + offset = info->mi_roffset; + + if ((md->md_options & LNET_MD_MAX_SIZE) != 0) { + mlength = md->md_max_size; + LASSERT(md->md_offset + mlength <= md->md_length); + } else { + mlength = md->md_length - offset; + } + + if (info->mi_rlength <= mlength) { /* fits in allowed space */ + mlength = info->mi_rlength; + } else if ((md->md_options & LNET_MD_TRUNCATE) == 0) { + /* this packet _really_ is too big */ + CERROR("Matching packet from %s, match %llu length %d too big: %d left, %d allowed\n", + libcfs_id2str(info->mi_id), info->mi_mbits, + info->mi_rlength, md->md_length - offset, mlength); + + return LNET_MATCHMD_DROP; + } + + /* Commit to this ME/MD */ + CDEBUG(D_NET, "Incoming %s index %x from %s of length %d/%d into md %#llx [%d] + %d\n", + (info->mi_opc == LNET_MD_OP_PUT) ? "put" : "get", + info->mi_portal, libcfs_id2str(info->mi_id), mlength, + info->mi_rlength, md->md_lh.lh_cookie, md->md_niov, offset); + + lnet_msg_attach_md(msg, md, offset, mlength); + md->md_offset = offset + mlength; + + if (!lnet_md_exhausted(md)) + return LNET_MATCHMD_OK; + + /* Auto-unlink NOW, so the ME gets unlinked if required. + * We bumped md->md_refcount above so the MD just gets flagged + * for unlink when it is finalized. */ + if ((md->md_flags & LNET_MD_FLAG_AUTO_UNLINK) != 0) + lnet_md_unlink(md); + + return LNET_MATCHMD_OK | LNET_MATCHMD_EXHAUSTED; +} + +static struct lnet_match_table * +lnet_match2mt(struct lnet_portal *ptl, lnet_process_id_t id, __u64 mbits) +{ + if (LNET_CPT_NUMBER == 1) + return ptl->ptl_mtables[0]; /* the only one */ + + /* if it's a unique portal, return match-table hashed by NID */ + return lnet_ptl_is_unique(ptl) ? 
+ ptl->ptl_mtables[lnet_cpt_of_nid(id.nid)] : NULL; +} + +struct lnet_match_table * +lnet_mt_of_attach(unsigned int index, lnet_process_id_t id, + __u64 mbits, __u64 ignore_bits, lnet_ins_pos_t pos) +{ + struct lnet_portal *ptl; + struct lnet_match_table *mtable; + + /* NB: called w/o lock */ + LASSERT(index < the_lnet.ln_nportals); + + if (!lnet_ptl_match_type(index, id, mbits, ignore_bits)) + return NULL; + + ptl = the_lnet.ln_portals[index]; + + mtable = lnet_match2mt(ptl, id, mbits); + if (mtable != NULL) /* unique portal or only one match-table */ + return mtable; + + /* it's a wildcard portal */ + switch (pos) { + default: + return NULL; + case LNET_INS_BEFORE: + case LNET_INS_AFTER: + /* posted by no affinity thread, always hash to specific + * match-table to avoid buffer stealing which is heavy */ + return ptl->ptl_mtables[ptl->ptl_index % LNET_CPT_NUMBER]; + case LNET_INS_LOCAL: + /* posted by cpu-affinity thread */ + return ptl->ptl_mtables[lnet_cpt_current()]; + } +} + +static struct lnet_match_table * +lnet_mt_of_match(struct lnet_match_info *info, struct lnet_msg *msg) +{ + struct lnet_match_table *mtable; + struct lnet_portal *ptl; + unsigned int nmaps; + unsigned int rotor; + unsigned int cpt; + bool routed; + + /* NB: called w/o lock */ + LASSERT(info->mi_portal < the_lnet.ln_nportals); + ptl = the_lnet.ln_portals[info->mi_portal]; + + LASSERT(lnet_ptl_is_wildcard(ptl) || lnet_ptl_is_unique(ptl)); + + mtable = lnet_match2mt(ptl, info->mi_id, info->mi_mbits); + if (mtable != NULL) + return mtable; + + /* it's a wildcard portal */ + routed = LNET_NIDNET(msg->msg_hdr.src_nid) != + LNET_NIDNET(msg->msg_hdr.dest_nid); + + if (portal_rotor == LNET_PTL_ROTOR_OFF || + (portal_rotor != LNET_PTL_ROTOR_ON && !routed)) { + cpt = lnet_cpt_current(); + if (ptl->ptl_mtables[cpt]->mt_enabled) + return ptl->ptl_mtables[cpt]; + } + + rotor = ptl->ptl_rotor++; /* get round-robin factor */ + if (portal_rotor == LNET_PTL_ROTOR_HASH_RT && routed) + cpt = lnet_cpt_of_nid(msg->msg_hdr.src_nid); + else + cpt = rotor % LNET_CPT_NUMBER; + + if (!ptl->ptl_mtables[cpt]->mt_enabled) { + /* is there any active entry for this portal? 
*/ + nmaps = ptl->ptl_mt_nmaps; + /* map to an active mtable to avoid heavy "stealing" */ + if (nmaps != 0) { + /* NB: there is possibility that ptl_mt_maps is being + * changed because we are not under protection of + * lnet_ptl_lock, but it shouldn't hurt anything */ + cpt = ptl->ptl_mt_maps[rotor % nmaps]; + } + } + + return ptl->ptl_mtables[cpt]; +} + +static int +lnet_mt_test_exhausted(struct lnet_match_table *mtable, int pos) +{ + __u64 *bmap; + int i; + + if (!lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal])) + return 0; + + if (pos < 0) { /* check all bits */ + for (i = 0; i < LNET_MT_EXHAUSTED_BMAP; i++) { + if (mtable->mt_exhausted[i] != (__u64)(-1)) + return 0; + } + return 1; + } + + LASSERT(pos <= LNET_MT_HASH_IGNORE); + /* mtable::mt_mhash[pos] is marked as exhausted or not */ + bmap = &mtable->mt_exhausted[pos >> LNET_MT_BITS_U64]; + pos &= (1 << LNET_MT_BITS_U64) - 1; + + return ((*bmap) & (1ULL << pos)) != 0; +} + +static void +lnet_mt_set_exhausted(struct lnet_match_table *mtable, int pos, int exhausted) +{ + __u64 *bmap; + + LASSERT(lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal])); + LASSERT(pos <= LNET_MT_HASH_IGNORE); + + /* set mtable::mt_mhash[pos] as exhausted/non-exhausted */ + bmap = &mtable->mt_exhausted[pos >> LNET_MT_BITS_U64]; + pos &= (1 << LNET_MT_BITS_U64) - 1; + + if (!exhausted) + *bmap &= ~(1ULL << pos); + else + *bmap |= 1ULL << pos; +} + +struct list_head * +lnet_mt_match_head(struct lnet_match_table *mtable, + lnet_process_id_t id, __u64 mbits) +{ + struct lnet_portal *ptl = the_lnet.ln_portals[mtable->mt_portal]; + + if (lnet_ptl_is_wildcard(ptl)) { + return &mtable->mt_mhash[mbits & LNET_MT_HASH_MASK]; + } else { + unsigned long hash = mbits + id.nid + id.pid; + + LASSERT(lnet_ptl_is_unique(ptl)); + hash = hash_long(hash, LNET_MT_HASH_BITS); + return &mtable->mt_mhash[hash]; + } +} + +int +lnet_mt_match_md(struct lnet_match_table *mtable, + struct lnet_match_info *info, struct lnet_msg *msg) +{ + struct list_head *head; + lnet_me_t *me; + lnet_me_t *tmp; + int exhausted = 0; + int rc; + + /* any ME with ignore bits? 
*/ + if (!list_empty(&mtable->mt_mhash[LNET_MT_HASH_IGNORE])) + head = &mtable->mt_mhash[LNET_MT_HASH_IGNORE]; + else + head = lnet_mt_match_head(mtable, info->mi_id, info->mi_mbits); + again: + /* NB: only wildcard portal needs to return LNET_MATCHMD_EXHAUSTED */ + if (lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal])) + exhausted = LNET_MATCHMD_EXHAUSTED; + + list_for_each_entry_safe(me, tmp, head, me_list) { + /* ME attached but MD not attached yet */ + if (me->me_md == NULL) + continue; + + LASSERT(me == me->me_md->md_me); + + rc = lnet_try_match_md(me->me_md, info, msg); + if ((rc & LNET_MATCHMD_EXHAUSTED) == 0) + exhausted = 0; /* mlist is not empty */ + + if ((rc & LNET_MATCHMD_FINISH) != 0) { + /* don't return EXHAUSTED bit because we don't know + * whether the mlist is empty or not */ + return rc & ~LNET_MATCHMD_EXHAUSTED; + } + } + + if (exhausted == LNET_MATCHMD_EXHAUSTED) { /* @head is exhausted */ + lnet_mt_set_exhausted(mtable, head - mtable->mt_mhash, 1); + if (!lnet_mt_test_exhausted(mtable, -1)) + exhausted = 0; + } + + if (exhausted == 0 && head == &mtable->mt_mhash[LNET_MT_HASH_IGNORE]) { + head = lnet_mt_match_head(mtable, info->mi_id, info->mi_mbits); + goto again; /* re-check MEs w/o ignore-bits */ + } + + if (info->mi_opc == LNET_MD_OP_GET || + !lnet_ptl_is_lazy(the_lnet.ln_portals[info->mi_portal])) + return LNET_MATCHMD_DROP | exhausted; + + return LNET_MATCHMD_NONE | exhausted; +} + +static int +lnet_ptl_match_early(struct lnet_portal *ptl, struct lnet_msg *msg) +{ + int rc; + + /* message arrived before any buffer posting on this portal, + * simply delay or drop this message */ + if (likely(lnet_ptl_is_wildcard(ptl) || lnet_ptl_is_unique(ptl))) + return 0; + + lnet_ptl_lock(ptl); + /* check it again with hold of lock */ + if (lnet_ptl_is_wildcard(ptl) || lnet_ptl_is_unique(ptl)) { + lnet_ptl_unlock(ptl); + return 0; + } + + if (lnet_ptl_is_lazy(ptl)) { + if (msg->msg_rx_ready_delay) { + msg->msg_rx_delayed = 1; + list_add_tail(&msg->msg_list, + &ptl->ptl_msg_delayed); + } + rc = LNET_MATCHMD_NONE; + } else { + rc = LNET_MATCHMD_DROP; + } + + lnet_ptl_unlock(ptl); + return rc; +} + +static int +lnet_ptl_match_delay(struct lnet_portal *ptl, + struct lnet_match_info *info, struct lnet_msg *msg) +{ + int first = ptl->ptl_mt_maps[0]; /* read w/o lock */ + int rc = 0; + int i; + + /* steal buffer from other CPTs, and delay it if nothing to steal, + * this function is more expensive than a regular match, but we + * don't expect it can happen a lot */ + LASSERT(lnet_ptl_is_wildcard(ptl)); + + for (i = 0; i < LNET_CPT_NUMBER; i++) { + struct lnet_match_table *mtable; + int cpt; + + cpt = (first + i) % LNET_CPT_NUMBER; + mtable = ptl->ptl_mtables[cpt]; + if (i != 0 && i != LNET_CPT_NUMBER - 1 && !mtable->mt_enabled) + continue; + + lnet_res_lock(cpt); + lnet_ptl_lock(ptl); + + if (i == 0) { /* the first try, attach on stealing list */ + list_add_tail(&msg->msg_list, + &ptl->ptl_msg_stealing); + } + + if (!list_empty(&msg->msg_list)) { /* on stealing list */ + rc = lnet_mt_match_md(mtable, info, msg); + + if ((rc & LNET_MATCHMD_EXHAUSTED) != 0 && + mtable->mt_enabled) + lnet_ptl_disable_mt(ptl, cpt); + + if ((rc & LNET_MATCHMD_FINISH) != 0) + list_del_init(&msg->msg_list); + + } else { + /* could be matched by lnet_ptl_attach_md() + * which is called by another thread */ + rc = msg->msg_md == NULL ? 
+ LNET_MATCHMD_DROP : LNET_MATCHMD_OK; + } + + if (!list_empty(&msg->msg_list) && /* not matched yet */ + (i == LNET_CPT_NUMBER - 1 || /* the last CPT */ + ptl->ptl_mt_nmaps == 0 || /* no active CPT */ + (ptl->ptl_mt_nmaps == 1 && /* the only active CPT */ + ptl->ptl_mt_maps[0] == cpt))) { + /* nothing to steal, delay or drop */ + list_del_init(&msg->msg_list); + + if (lnet_ptl_is_lazy(ptl)) { + msg->msg_rx_delayed = 1; + list_add_tail(&msg->msg_list, + &ptl->ptl_msg_delayed); + rc = LNET_MATCHMD_NONE; + } else { + rc = LNET_MATCHMD_DROP; + } + } + + lnet_ptl_unlock(ptl); + lnet_res_unlock(cpt); + + if ((rc & LNET_MATCHMD_FINISH) != 0 || msg->msg_rx_delayed) + break; + } + + return rc; +} + +int +lnet_ptl_match_md(struct lnet_match_info *info, struct lnet_msg *msg) +{ + struct lnet_match_table *mtable; + struct lnet_portal *ptl; + int rc; + + CDEBUG(D_NET, "Request from %s of length %d into portal %d MB=%#llx\n", + libcfs_id2str(info->mi_id), info->mi_rlength, info->mi_portal, + info->mi_mbits); + + if (info->mi_portal >= the_lnet.ln_nportals) { + CERROR("Invalid portal %d not in [0-%d]\n", + info->mi_portal, the_lnet.ln_nportals); + return LNET_MATCHMD_DROP; + } + + ptl = the_lnet.ln_portals[info->mi_portal]; + rc = lnet_ptl_match_early(ptl, msg); + if (rc != 0) /* matched or delayed early message */ + return rc; + + mtable = lnet_mt_of_match(info, msg); + lnet_res_lock(mtable->mt_cpt); + + if (the_lnet.ln_shutdown) { + rc = LNET_MATCHMD_DROP; + goto out1; + } + + rc = lnet_mt_match_md(mtable, info, msg); + if ((rc & LNET_MATCHMD_EXHAUSTED) != 0 && mtable->mt_enabled) { + lnet_ptl_lock(ptl); + lnet_ptl_disable_mt(ptl, mtable->mt_cpt); + lnet_ptl_unlock(ptl); + } + + if ((rc & LNET_MATCHMD_FINISH) != 0) /* matched or dropping */ + goto out1; + + if (!msg->msg_rx_ready_delay) + goto out1; + + LASSERT(lnet_ptl_is_lazy(ptl)); + LASSERT(!msg->msg_rx_delayed); + + /* NB: we don't expect "delay" can happen a lot */ + if (lnet_ptl_is_unique(ptl) || LNET_CPT_NUMBER == 1) { + lnet_ptl_lock(ptl); + + msg->msg_rx_delayed = 1; + list_add_tail(&msg->msg_list, &ptl->ptl_msg_delayed); + + lnet_ptl_unlock(ptl); + lnet_res_unlock(mtable->mt_cpt); + + } else { + lnet_res_unlock(mtable->mt_cpt); + rc = lnet_ptl_match_delay(ptl, info, msg); + } + + if (msg->msg_rx_delayed) { + CDEBUG(D_NET, + "Delaying %s from %s ptl %d MB %#llx off %d len %d\n", + info->mi_opc == LNET_MD_OP_PUT ? 
"PUT" : "GET", + libcfs_id2str(info->mi_id), info->mi_portal, + info->mi_mbits, info->mi_roffset, info->mi_rlength); + } + goto out0; + out1: + lnet_res_unlock(mtable->mt_cpt); + out0: + /* EXHAUSTED bit is only meaningful for internal functions */ + return rc & ~LNET_MATCHMD_EXHAUSTED; +} + +void +lnet_ptl_detach_md(lnet_me_t *me, lnet_libmd_t *md) +{ + LASSERT(me->me_md == md && md->md_me == me); + + me->me_md = NULL; + md->md_me = NULL; +} + +/* called with lnet_res_lock held */ +void +lnet_ptl_attach_md(lnet_me_t *me, lnet_libmd_t *md, + struct list_head *matches, struct list_head *drops) +{ + struct lnet_portal *ptl = the_lnet.ln_portals[me->me_portal]; + struct lnet_match_table *mtable; + struct list_head *head; + lnet_msg_t *tmp; + lnet_msg_t *msg; + int exhausted = 0; + int cpt; + + LASSERT(md->md_refcount == 0); /* a brand new MD */ + + me->me_md = md; + md->md_me = me; + + cpt = lnet_cpt_of_cookie(md->md_lh.lh_cookie); + mtable = ptl->ptl_mtables[cpt]; + + if (list_empty(&ptl->ptl_msg_stealing) && + list_empty(&ptl->ptl_msg_delayed) && + !lnet_mt_test_exhausted(mtable, me->me_pos)) + return; + + lnet_ptl_lock(ptl); + head = &ptl->ptl_msg_stealing; + again: + list_for_each_entry_safe(msg, tmp, head, msg_list) { + struct lnet_match_info info; + lnet_hdr_t *hdr; + int rc; + + LASSERT(msg->msg_rx_delayed || head == &ptl->ptl_msg_stealing); + + hdr = &msg->msg_hdr; + info.mi_id.nid = hdr->src_nid; + info.mi_id.pid = hdr->src_pid; + info.mi_opc = LNET_MD_OP_PUT; + info.mi_portal = hdr->msg.put.ptl_index; + info.mi_rlength = hdr->payload_length; + info.mi_roffset = hdr->msg.put.offset; + info.mi_mbits = hdr->msg.put.match_bits; + + rc = lnet_try_match_md(md, &info, msg); + + exhausted = (rc & LNET_MATCHMD_EXHAUSTED) != 0; + if ((rc & LNET_MATCHMD_NONE) != 0) { + if (exhausted) + break; + continue; + } + + /* Hurrah! 
This _is_ a match */ + LASSERT((rc & LNET_MATCHMD_FINISH) != 0); + list_del_init(&msg->msg_list); + + if (head == &ptl->ptl_msg_stealing) { + if (exhausted) + break; + /* stealing thread will handle the message */ + continue; + } + + if ((rc & LNET_MATCHMD_OK) != 0) { + list_add_tail(&msg->msg_list, matches); + + CDEBUG(D_NET, "Resuming delayed PUT from %s portal %d match %llu offset %d length %d.\n", + libcfs_id2str(info.mi_id), + info.mi_portal, info.mi_mbits, + info.mi_roffset, info.mi_rlength); + } else { + list_add_tail(&msg->msg_list, drops); + } + + if (exhausted) + break; + } + + if (!exhausted && head == &ptl->ptl_msg_stealing) { + head = &ptl->ptl_msg_delayed; + goto again; + } + + if (lnet_ptl_is_wildcard(ptl) && !exhausted) { + lnet_mt_set_exhausted(mtable, me->me_pos, 0); + if (!mtable->mt_enabled) + lnet_ptl_enable_mt(ptl, cpt); + } + + lnet_ptl_unlock(ptl); +} + +static void +lnet_ptl_cleanup(struct lnet_portal *ptl) +{ + struct lnet_match_table *mtable; + int i; + + if (ptl->ptl_mtables == NULL) /* uninitialized portal */ + return; + + LASSERT(list_empty(&ptl->ptl_msg_delayed)); + LASSERT(list_empty(&ptl->ptl_msg_stealing)); + cfs_percpt_for_each(mtable, i, ptl->ptl_mtables) { + struct list_head *mhash; + lnet_me_t *me; + int j; + + if (mtable->mt_mhash == NULL) /* uninitialized match-table */ + continue; + + mhash = mtable->mt_mhash; + /* cleanup ME */ + for (j = 0; j < LNET_MT_HASH_SIZE + 1; j++) { + while (!list_empty(&mhash[j])) { + me = list_entry(mhash[j].next, + lnet_me_t, me_list); + CERROR("Active ME %p on exit\n", me); + list_del(&me->me_list); + lnet_me_free(me); + } + } + /* the extra entry is for MEs with ignore bits */ + LIBCFS_FREE(mhash, sizeof(*mhash) * (LNET_MT_HASH_SIZE + 1)); + } + + cfs_percpt_free(ptl->ptl_mtables); + ptl->ptl_mtables = NULL; +} + +static int +lnet_ptl_setup(struct lnet_portal *ptl, int index) +{ + struct lnet_match_table *mtable; + struct list_head *mhash; + int i; + int j; + + ptl->ptl_mtables = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(struct lnet_match_table)); + if (ptl->ptl_mtables == NULL) { + CERROR("Failed to create match table for portal %d\n", index); + return -ENOMEM; + } + + ptl->ptl_index = index; + INIT_LIST_HEAD(&ptl->ptl_msg_delayed); + INIT_LIST_HEAD(&ptl->ptl_msg_stealing); + spin_lock_init(&ptl->ptl_lock); + cfs_percpt_for_each(mtable, i, ptl->ptl_mtables) { + /* the extra entry is for MEs with ignore bits */ + LIBCFS_CPT_ALLOC(mhash, lnet_cpt_table(), i, + sizeof(*mhash) * (LNET_MT_HASH_SIZE + 1)); + if (mhash == NULL) { + CERROR("Failed to create match hash for portal %d\n", + index); + goto failed; + } + + memset(&mtable->mt_exhausted[0], -1, + sizeof(mtable->mt_exhausted[0]) * + LNET_MT_EXHAUSTED_BMAP); + mtable->mt_mhash = mhash; + for (j = 0; j < LNET_MT_HASH_SIZE + 1; j++) + INIT_LIST_HEAD(&mhash[j]); + + mtable->mt_portal = index; + mtable->mt_cpt = i; + } + + return 0; + failed: + lnet_ptl_cleanup(ptl); + return -ENOMEM; +} + +void +lnet_portals_destroy(void) +{ + int i; + + if (the_lnet.ln_portals == NULL) + return; + + for (i = 0; i < the_lnet.ln_nportals; i++) + lnet_ptl_cleanup(the_lnet.ln_portals[i]); + + cfs_array_free(the_lnet.ln_portals); + the_lnet.ln_portals = NULL; +} + +int +lnet_portals_create(void) +{ + int size; + int i; + + size = offsetof(struct lnet_portal, ptl_mt_maps[LNET_CPT_NUMBER]); + + the_lnet.ln_nportals = MAX_PORTALS; + the_lnet.ln_portals = cfs_array_alloc(the_lnet.ln_nportals, size); + if (the_lnet.ln_portals == NULL) { + CERROR("Failed to allocate portals table\n"); + return 
-ENOMEM; + } + + for (i = 0; i < the_lnet.ln_nportals; i++) { + if (lnet_ptl_setup(the_lnet.ln_portals[i], i)) { + lnet_portals_destroy(); + return -ENOMEM; + } + } + + return 0; +} + +/** + * Turn on the lazy portal attribute. Use with caution! + * + * This portal attribute only affects incoming PUT requests to the portal, + * and is off by default. By default, if there's no matching MD for an + * incoming PUT request, it is simply dropped. With the lazy attribute on, + * such requests are queued indefinitely until either a matching MD is + * posted to the portal or the lazy attribute is turned off. + * + * This prevents dropped requests, but it should be regarded as the last + * line of defense - i.e. users must keep a close watch on the number of + * active buffers on a lazy portal and post more buffers as soon as that + * number becomes too low. This is because delayed requests usually have detrimental + * effects on underlying network connections. A few delayed requests often + * suffice to bring an underlying connection to a complete halt, due to flow + * control mechanisms. + * + * There's also a DOS attack risk. If users don't post match-all MDs on a + * lazy portal, a malicious peer can easily stop a service by sending some + * PUT requests with match bits that won't match any MD. A routed server is + * especially vulnerable since the connections to its neighbor routers are + * shared among all clients. + * + * \param portal Index of the portal to enable the lazy attribute on. + * + * \retval 0 On success. + * \retval -EINVAL If \a portal is not a valid index. + */ +int +LNetSetLazyPortal(int portal) +{ + struct lnet_portal *ptl; + + if (portal < 0 || portal >= the_lnet.ln_nportals) + return -EINVAL; + + CDEBUG(D_NET, "Setting portal %d lazy\n", portal); + ptl = the_lnet.ln_portals[portal]; + + lnet_res_lock(LNET_LOCK_EX); + lnet_ptl_lock(ptl); + + lnet_ptl_setopt(ptl, LNET_PTL_LAZY); + + lnet_ptl_unlock(ptl); + lnet_res_unlock(LNET_LOCK_EX); + + return 0; +} +EXPORT_SYMBOL(LNetSetLazyPortal); + +/** + * Turn off the lazy portal attribute. Delayed requests on the portal, + * if any, will all be dropped when this function returns. + * + * \param portal Index of the portal to disable the lazy attribute on. + * + * \retval 0 On success. + * \retval -EINVAL If \a portal is not a valid index. + */ +int +LNetClearLazyPortal(int portal) +{ + struct lnet_portal *ptl; + LIST_HEAD (zombies); + + if (portal < 0 || portal >= the_lnet.ln_nportals) + return -EINVAL; + + ptl = the_lnet.ln_portals[portal]; + + lnet_res_lock(LNET_LOCK_EX); + lnet_ptl_lock(ptl); + + if (!lnet_ptl_is_lazy(ptl)) { + lnet_ptl_unlock(ptl); + lnet_res_unlock(LNET_LOCK_EX); + return 0; + } + + if (the_lnet.ln_shutdown) + CWARN("Active lazy portal %d on exit\n", portal); + else + CDEBUG(D_NET, "clearing portal %d lazy\n", portal); + + /* grab all the blocked messages atomically */ + list_splice_init(&ptl->ptl_msg_delayed, &zombies); + + lnet_ptl_unsetopt(ptl, LNET_PTL_LAZY); + + lnet_ptl_unlock(ptl); + lnet_res_unlock(LNET_LOCK_EX); + + lnet_drop_delayed_msg_list(&zombies, "Clearing lazy portal attr"); + + return 0; +} +EXPORT_SYMBOL(LNetClearLazyPortal); diff --git a/kernel/drivers/staging/lustre/lnet/lnet/lo.c b/kernel/drivers/staging/lustre/lnet/lnet/lo.c new file mode 100644 index 000000000..f708c2e64 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/lnet/lo.c @@ -0,0 +1,120 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LNET +#include "../../include/linux/lnet/lib-lnet.h" + +static int +lolnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) +{ + LASSERT(!lntmsg->msg_routing); + LASSERT(!lntmsg->msg_target_is_router); + + return lnet_parse(ni, &lntmsg->msg_hdr, ni->ni_nid, lntmsg, 0); +} + +static int +lolnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, + int delayed, unsigned int niov, + struct kvec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen) +{ + lnet_msg_t *sendmsg = private; + + if (lntmsg != NULL) { /* not discarding */ + if (sendmsg->msg_iov != NULL) { + if (iov != NULL) + lnet_copy_iov2iov(niov, iov, offset, + sendmsg->msg_niov, + sendmsg->msg_iov, + sendmsg->msg_offset, mlen); + else + lnet_copy_iov2kiov(niov, kiov, offset, + sendmsg->msg_niov, + sendmsg->msg_iov, + sendmsg->msg_offset, mlen); + } else { + if (iov != NULL) + lnet_copy_kiov2iov(niov, iov, offset, + sendmsg->msg_niov, + sendmsg->msg_kiov, + sendmsg->msg_offset, mlen); + else + lnet_copy_kiov2kiov(niov, kiov, offset, + sendmsg->msg_niov, + sendmsg->msg_kiov, + sendmsg->msg_offset, mlen); + } + + lnet_finalize(ni, lntmsg, 0); + } + + lnet_finalize(ni, sendmsg, 0); + return 0; +} + +static int lolnd_instanced; + +static void +lolnd_shutdown(lnet_ni_t *ni) +{ + CDEBUG(D_NET, "shutdown\n"); + LASSERT(lolnd_instanced); + + lolnd_instanced = 0; +} + +static int +lolnd_startup(lnet_ni_t *ni) +{ + LASSERT(ni->ni_lnd == &the_lolnd); + LASSERT(!lolnd_instanced); + lolnd_instanced = 1; + + return 0; +} + +lnd_t the_lolnd = { + /* .lnd_list = */ {&the_lolnd.lnd_list, &the_lolnd.lnd_list}, + /* .lnd_refcount = */ 0, + /* .lnd_type = */ LOLND, + /* .lnd_startup = */ lolnd_startup, + /* .lnd_shutdown = */ lolnd_shutdown, + /* .lnt_ctl = */ NULL, + /* .lnd_send = */ lolnd_send, + /* .lnd_recv = */ lolnd_recv, + /* .lnd_eager_recv = */ NULL, + /* .lnd_notify = */ NULL, + /* .lnd_accept = */ NULL +}; diff --git a/kernel/drivers/staging/lustre/lnet/lnet/module.c b/kernel/drivers/staging/lustre/lnet/lnet/module.c new file mode 100644 index 000000000..72b7fbc83 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/lnet/module.c @@ -0,0 +1,155 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LNET +#include "../../include/linux/lnet/lib-lnet.h" + +static int config_on_load; +module_param(config_on_load, int, 0444); +MODULE_PARM_DESC(config_on_load, "configure network at module load"); + +static struct mutex lnet_config_mutex; + +static int +lnet_configure(void *arg) +{ + /* 'arg' only there so I can be passed to cfs_create_thread() */ + int rc = 0; + + LNET_MUTEX_LOCK(&lnet_config_mutex); + + if (!the_lnet.ln_niinit_self) { + rc = LNetNIInit(LUSTRE_SRV_LNET_PID); + if (rc >= 0) { + the_lnet.ln_niinit_self = 1; + rc = 0; + } + } + + LNET_MUTEX_UNLOCK(&lnet_config_mutex); + return rc; +} + +static int +lnet_unconfigure(void) +{ + int refcount; + + LNET_MUTEX_LOCK(&lnet_config_mutex); + + if (the_lnet.ln_niinit_self) { + the_lnet.ln_niinit_self = 0; + LNetNIFini(); + } + + LNET_MUTEX_LOCK(&the_lnet.ln_api_mutex); + refcount = the_lnet.ln_refcount; + LNET_MUTEX_UNLOCK(&the_lnet.ln_api_mutex); + + LNET_MUTEX_UNLOCK(&lnet_config_mutex); + return (refcount == 0) ? 0 : -EBUSY; +} + +static int +lnet_ioctl(unsigned int cmd, struct libcfs_ioctl_data *data) +{ + int rc; + + switch (cmd) { + case IOC_LIBCFS_CONFIGURE: + return lnet_configure(NULL); + + case IOC_LIBCFS_UNCONFIGURE: + return lnet_unconfigure(); + + default: + /* Passing LNET_PID_ANY only gives me a ref if the net is up + * already; I'll need it to ensure the net can't go down while + * I'm called into it */ + rc = LNetNIInit(LNET_PID_ANY); + if (rc >= 0) { + rc = LNetCtl(cmd, data); + LNetNIFini(); + } + return rc; + } +} + +static DECLARE_IOCTL_HANDLER(lnet_ioctl_handler, lnet_ioctl); + +static int __init +init_lnet(void) +{ + int rc; + + mutex_init(&lnet_config_mutex); + + rc = LNetInit(); + if (rc != 0) { + CERROR("LNetInit: error %d\n", rc); + return rc; + } + + rc = libcfs_register_ioctl(&lnet_ioctl_handler); + LASSERT(rc == 0); + + if (config_on_load) { + /* Have to schedule a separate thread to avoid deadlocking + * in modload */ + (void) kthread_run(lnet_configure, NULL, "lnet_initd"); + } + + return 0; +} + +static void __exit +fini_lnet(void) +{ + int rc; + + rc = libcfs_deregister_ioctl(&lnet_ioctl_handler); + LASSERT(rc == 0); + + LNetFini(); +} + +MODULE_AUTHOR("Peter J. 
Braam "); +MODULE_DESCRIPTION("Portals v3.1"); +MODULE_LICENSE("GPL"); +MODULE_VERSION("1.0.0"); + +module_init(init_lnet); +module_exit(fini_lnet); diff --git a/kernel/drivers/staging/lustre/lnet/lnet/peer.c b/kernel/drivers/staging/lustre/lnet/lnet/peer.c new file mode 100644 index 000000000..45b5742f1 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/lnet/peer.c @@ -0,0 +1,338 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/lnet/peer.c + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include "../../include/linux/lnet/lib-lnet.h" + +int +lnet_peer_tables_create(void) +{ + struct lnet_peer_table *ptable; + struct list_head *hash; + int i; + int j; + + the_lnet.ln_peer_tables = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*ptable)); + if (the_lnet.ln_peer_tables == NULL) { + CERROR("Failed to allocate cpu-partition peer tables\n"); + return -ENOMEM; + } + + cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) { + INIT_LIST_HEAD(&ptable->pt_deathrow); + + LIBCFS_CPT_ALLOC(hash, lnet_cpt_table(), i, + LNET_PEER_HASH_SIZE * sizeof(*hash)); + if (hash == NULL) { + CERROR("Failed to create peer hash table\n"); + lnet_peer_tables_destroy(); + return -ENOMEM; + } + + for (j = 0; j < LNET_PEER_HASH_SIZE; j++) + INIT_LIST_HEAD(&hash[j]); + ptable->pt_hash = hash; /* sign of initialization */ + } + + return 0; +} + +void +lnet_peer_tables_destroy(void) +{ + struct lnet_peer_table *ptable; + struct list_head *hash; + int i; + int j; + + if (the_lnet.ln_peer_tables == NULL) + return; + + cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) { + hash = ptable->pt_hash; + if (hash == NULL) /* not initialized */ + break; + + LASSERT(list_empty(&ptable->pt_deathrow)); + + ptable->pt_hash = NULL; + for (j = 0; j < LNET_PEER_HASH_SIZE; j++) + LASSERT(list_empty(&hash[j])); + + LIBCFS_FREE(hash, LNET_PEER_HASH_SIZE * sizeof(*hash)); + } + + cfs_percpt_free(the_lnet.ln_peer_tables); + the_lnet.ln_peer_tables = NULL; +} + +void +lnet_peer_tables_cleanup(void) +{ + struct lnet_peer_table *ptable; + int i; + int j; + + LASSERT(the_lnet.ln_shutdown); /* i.e. 
no new peers */ + + cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) { + lnet_net_lock(i); + + for (j = 0; j < LNET_PEER_HASH_SIZE; j++) { + struct list_head *peers = &ptable->pt_hash[j]; + + while (!list_empty(peers)) { + lnet_peer_t *lp = list_entry(peers->next, + lnet_peer_t, + lp_hashlist); + list_del_init(&lp->lp_hashlist); + /* lose hash table's ref */ + lnet_peer_decref_locked(lp); + } + } + + lnet_net_unlock(i); + } + + cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) { + LIST_HEAD(deathrow); + lnet_peer_t *lp; + + lnet_net_lock(i); + + for (j = 3; ptable->pt_number != 0; j++) { + lnet_net_unlock(i); + + if ((j & (j - 1)) == 0) { + CDEBUG(D_WARNING, + "Waiting for %d peers on peer table\n", + ptable->pt_number); + } + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1) / 2); + lnet_net_lock(i); + } + list_splice_init(&ptable->pt_deathrow, &deathrow); + + lnet_net_unlock(i); + + while (!list_empty(&deathrow)) { + lp = list_entry(deathrow.next, + lnet_peer_t, lp_hashlist); + list_del(&lp->lp_hashlist); + LIBCFS_FREE(lp, sizeof(*lp)); + } + } +} + +void +lnet_destroy_peer_locked(lnet_peer_t *lp) +{ + struct lnet_peer_table *ptable; + + LASSERT(lp->lp_refcount == 0); + LASSERT(lp->lp_rtr_refcount == 0); + LASSERT(list_empty(&lp->lp_txq)); + LASSERT(list_empty(&lp->lp_hashlist)); + LASSERT(lp->lp_txqnob == 0); + + ptable = the_lnet.ln_peer_tables[lp->lp_cpt]; + LASSERT(ptable->pt_number > 0); + ptable->pt_number--; + + lnet_ni_decref_locked(lp->lp_ni, lp->lp_cpt); + lp->lp_ni = NULL; + + list_add(&lp->lp_hashlist, &ptable->pt_deathrow); +} + +lnet_peer_t * +lnet_find_peer_locked(struct lnet_peer_table *ptable, lnet_nid_t nid) +{ + struct list_head *peers; + lnet_peer_t *lp; + + LASSERT(!the_lnet.ln_shutdown); + + peers = &ptable->pt_hash[lnet_nid2peerhash(nid)]; + list_for_each_entry(lp, peers, lp_hashlist) { + if (lp->lp_nid == nid) { + lnet_peer_addref_locked(lp); + return lp; + } + } + + return NULL; +} + +int +lnet_nid2peer_locked(lnet_peer_t **lpp, lnet_nid_t nid, int cpt) +{ + struct lnet_peer_table *ptable; + lnet_peer_t *lp = NULL; + lnet_peer_t *lp2; + int cpt2; + int rc = 0; + + *lpp = NULL; + if (the_lnet.ln_shutdown) /* it's shutting down */ + return -ESHUTDOWN; + + /* cpt can be LNET_LOCK_EX if it's called from router functions */ + cpt2 = cpt != LNET_LOCK_EX ? cpt : lnet_cpt_of_nid_locked(nid); + + ptable = the_lnet.ln_peer_tables[cpt2]; + lp = lnet_find_peer_locked(ptable, nid); + if (lp != NULL) { + *lpp = lp; + return 0; + } + + if (!list_empty(&ptable->pt_deathrow)) { + lp = list_entry(ptable->pt_deathrow.next, + lnet_peer_t, lp_hashlist); + list_del(&lp->lp_hashlist); + } + + /* + * take extra refcount in case another thread has shutdown LNet + * and destroyed locks and peer-table before I finish the allocation + */ + ptable->pt_number++; + lnet_net_unlock(cpt); + + if (lp != NULL) + memset(lp, 0, sizeof(*lp)); + else + LIBCFS_CPT_ALLOC(lp, lnet_cpt_table(), cpt2, sizeof(*lp)); + + if (lp == NULL) { + rc = -ENOMEM; + lnet_net_lock(cpt); + goto out; + } + + INIT_LIST_HEAD(&lp->lp_txq); + INIT_LIST_HEAD(&lp->lp_rtrq); + INIT_LIST_HEAD(&lp->lp_routes); + + lp->lp_notify = 0; + lp->lp_notifylnd = 0; + lp->lp_notifying = 0; + lp->lp_alive_count = 0; + lp->lp_timestamp = 0; + lp->lp_alive = !lnet_peers_start_down(); /* 1 bit!! 
*/ + lp->lp_last_alive = cfs_time_current(); /* assumes alive */ + lp->lp_last_query = 0; /* haven't asked NI yet */ + lp->lp_ping_timestamp = 0; + lp->lp_ping_feats = LNET_PING_FEAT_INVAL; + lp->lp_nid = nid; + lp->lp_cpt = cpt2; + lp->lp_refcount = 2; /* 1 for caller; 1 for hash */ + lp->lp_rtr_refcount = 0; + + lnet_net_lock(cpt); + + if (the_lnet.ln_shutdown) { + rc = -ESHUTDOWN; + goto out; + } + + lp2 = lnet_find_peer_locked(ptable, nid); + if (lp2 != NULL) { + *lpp = lp2; + goto out; + } + + lp->lp_ni = lnet_net2ni_locked(LNET_NIDNET(nid), cpt2); + if (lp->lp_ni == NULL) { + rc = -EHOSTUNREACH; + goto out; + } + + lp->lp_txcredits = + lp->lp_mintxcredits = lp->lp_ni->ni_peertxcredits; + lp->lp_rtrcredits = + lp->lp_minrtrcredits = lnet_peer_buffer_credits(lp->lp_ni); + + list_add_tail(&lp->lp_hashlist, + &ptable->pt_hash[lnet_nid2peerhash(nid)]); + ptable->pt_version++; + *lpp = lp; + + return 0; +out: + if (lp != NULL) + list_add(&lp->lp_hashlist, &ptable->pt_deathrow); + ptable->pt_number--; + return rc; +} + +void +lnet_debug_peer(lnet_nid_t nid) +{ + char *aliveness = "NA"; + lnet_peer_t *lp; + int rc; + int cpt; + + cpt = lnet_cpt_of_nid(nid); + lnet_net_lock(cpt); + + rc = lnet_nid2peer_locked(&lp, nid, cpt); + if (rc != 0) { + lnet_net_unlock(cpt); + CDEBUG(D_WARNING, "No peer %s\n", libcfs_nid2str(nid)); + return; + } + + if (lnet_isrouter(lp) || lnet_peer_aliveness_enabled(lp)) + aliveness = lp->lp_alive ? "up" : "down"; + + CDEBUG(D_WARNING, "%-24s %4d %5s %5d %5d %5d %5d %5d %ld\n", + libcfs_nid2str(lp->lp_nid), lp->lp_refcount, + aliveness, lp->lp_ni->ni_peertxcredits, + lp->lp_rtrcredits, lp->lp_minrtrcredits, + lp->lp_txcredits, lp->lp_mintxcredits, lp->lp_txqnob); + + lnet_peer_decref_locked(lp); + + lnet_net_unlock(cpt); +} diff --git a/kernel/drivers/staging/lustre/lnet/lnet/router.c b/kernel/drivers/staging/lustre/lnet/lnet/router.c new file mode 100644 index 000000000..8510bae48 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/lnet/router.c @@ -0,0 +1,1706 @@ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2012, Intel Corporation. + * + * This file is part of Portals + * http://sourceforge.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ + +#define DEBUG_SUBSYSTEM S_LNET +#include "../../include/linux/lnet/lib-lnet.h" + +#if defined(LNET_ROUTER) + +#define LNET_NRB_TINY_MIN 512 /* min value for each CPT */ +#define LNET_NRB_TINY (LNET_NRB_TINY_MIN * 4) +#define LNET_NRB_SMALL_MIN 4096 /* min value for each CPT */ +#define LNET_NRB_SMALL (LNET_NRB_SMALL_MIN * 4) +#define LNET_NRB_LARGE_MIN 256 /* min value for each CPT */ +#define LNET_NRB_LARGE (LNET_NRB_LARGE_MIN * 4) + +static char *forwarding = ""; +module_param(forwarding, charp, 0444); +MODULE_PARM_DESC(forwarding, "Explicitly enable/disable forwarding between networks"); + +static int tiny_router_buffers; +module_param(tiny_router_buffers, int, 0444); +MODULE_PARM_DESC(tiny_router_buffers, "# of 0 payload messages to buffer in the router"); +static int small_router_buffers; +module_param(small_router_buffers, int, 0444); +MODULE_PARM_DESC(small_router_buffers, "# of small (1 page) messages to buffer in the router"); +static int large_router_buffers; +module_param(large_router_buffers, int, 0444); +MODULE_PARM_DESC(large_router_buffers, "# of large messages to buffer in the router"); +static int peer_buffer_credits; +module_param(peer_buffer_credits, int, 0444); +MODULE_PARM_DESC(peer_buffer_credits, "# router buffer credits per peer"); + +static int auto_down = 1; +module_param(auto_down, int, 0444); +MODULE_PARM_DESC(auto_down, "Automatically mark peers down on comms error"); + +int +lnet_peer_buffer_credits(lnet_ni_t *ni) +{ + /* NI option overrides LNet default */ + if (ni->ni_peerrtrcredits > 0) + return ni->ni_peerrtrcredits; + if (peer_buffer_credits > 0) + return peer_buffer_credits; + + /* As an approximation, allow this peer the same number of router + * buffers as it is allowed outstanding sends */ + return ni->ni_peertxcredits; +} + +/* forward ref's */ +static int lnet_router_checker(void *); +#else + +int +lnet_peer_buffer_credits(lnet_ni_t *ni) +{ + return 0; +} + +#endif + +static int check_routers_before_use; +module_param(check_routers_before_use, int, 0444); +MODULE_PARM_DESC(check_routers_before_use, "Assume routers are down and ping them before use"); + +int avoid_asym_router_failure = 1; +module_param(avoid_asym_router_failure, int, 0644); +MODULE_PARM_DESC(avoid_asym_router_failure, "Avoid asymmetrical router failures (0 to disable)"); + +static int dead_router_check_interval = 60; +module_param(dead_router_check_interval, int, 0644); +MODULE_PARM_DESC(dead_router_check_interval, "Seconds between dead router health checks (<= 0 to disable)"); + +static int live_router_check_interval = 60; +module_param(live_router_check_interval, int, 0644); +MODULE_PARM_DESC(live_router_check_interval, "Seconds between live router health checks (<= 0 to disable)"); + +static int router_ping_timeout = 50; +module_param(router_ping_timeout, int, 0644); +MODULE_PARM_DESC(router_ping_timeout, "Seconds to wait for the reply to a router health query"); + +int +lnet_peers_start_down(void) +{ + return check_routers_before_use; +} + +void +lnet_notify_locked(lnet_peer_t *lp, int notifylnd, int alive, + unsigned long when) +{ + if (time_before(when, lp->lp_timestamp)) { /* out of date information */ + CDEBUG(D_NET, "Out of date\n"); + return; + } + + lp->lp_timestamp = when; /* update timestamp */ + lp->lp_ping_deadline = 0; /* disable ping timeout */ + + if (lp->lp_alive_count != 0 && /* got old news */ + (!lp->lp_alive) == (!alive)) { /* new date for old news */ + CDEBUG(D_NET, "Old news\n"); + return; + } + + /* Flag that notification is outstanding */ + + 
lp->lp_alive_count++; + lp->lp_alive = !(!alive); /* 1 bit! */ + lp->lp_notify = 1; + lp->lp_notifylnd |= notifylnd; + if (lp->lp_alive) + lp->lp_ping_feats = LNET_PING_FEAT_INVAL; /* reset */ + + CDEBUG(D_NET, "set %s %d\n", libcfs_nid2str(lp->lp_nid), alive); +} + +static void +lnet_ni_notify_locked(lnet_ni_t *ni, lnet_peer_t *lp) +{ + int alive; + int notifylnd; + + /* Notify only in 1 thread at any time to ensure ordered notification. + * NB individual events can be missed; the only guarantee is that you + * always get the most recent news */ + + if (lp->lp_notifying || ni == NULL) + return; + + lp->lp_notifying = 1; + + while (lp->lp_notify) { + alive = lp->lp_alive; + notifylnd = lp->lp_notifylnd; + + lp->lp_notifylnd = 0; + lp->lp_notify = 0; + + if (notifylnd && ni->ni_lnd->lnd_notify != NULL) { + lnet_net_unlock(lp->lp_cpt); + + /* A new notification could happen now; I'll handle it + * when control returns to me */ + + (ni->ni_lnd->lnd_notify)(ni, lp->lp_nid, alive); + + lnet_net_lock(lp->lp_cpt); + } + } + + lp->lp_notifying = 0; +} + + +static void +lnet_rtr_addref_locked(lnet_peer_t *lp) +{ + LASSERT(lp->lp_refcount > 0); + LASSERT(lp->lp_rtr_refcount >= 0); + + /* lnet_net_lock must be exclusively locked */ + lp->lp_rtr_refcount++; + if (lp->lp_rtr_refcount == 1) { + struct list_head *pos; + + /* a simple insertion sort */ + list_for_each_prev(pos, &the_lnet.ln_routers) { + lnet_peer_t *rtr = list_entry(pos, lnet_peer_t, + lp_rtr_list); + + if (rtr->lp_nid < lp->lp_nid) + break; + } + + list_add(&lp->lp_rtr_list, pos); + /* addref for the_lnet.ln_routers */ + lnet_peer_addref_locked(lp); + the_lnet.ln_routers_version++; + } +} + +static void +lnet_rtr_decref_locked(lnet_peer_t *lp) +{ + LASSERT(lp->lp_refcount > 0); + LASSERT(lp->lp_rtr_refcount > 0); + + /* lnet_net_lock must be exclusively locked */ + lp->lp_rtr_refcount--; + if (lp->lp_rtr_refcount == 0) { + LASSERT(list_empty(&lp->lp_routes)); + + if (lp->lp_rcd != NULL) { + list_add(&lp->lp_rcd->rcd_list, + &the_lnet.ln_rcd_deathrow); + lp->lp_rcd = NULL; + } + + list_del(&lp->lp_rtr_list); + /* decref for the_lnet.ln_routers */ + lnet_peer_decref_locked(lp); + the_lnet.ln_routers_version++; + } +} + +lnet_remotenet_t * +lnet_find_net_locked(__u32 net) +{ + lnet_remotenet_t *rnet; + struct list_head *tmp; + struct list_head *rn_list; + + LASSERT(!the_lnet.ln_shutdown); + + rn_list = lnet_net2rnethash(net); + list_for_each(tmp, rn_list) { + rnet = list_entry(tmp, lnet_remotenet_t, lrn_list); + + if (rnet->lrn_net == net) + return rnet; + } + return NULL; +} + +static void lnet_shuffle_seed(void) +{ + static int seeded; + int lnd_type, seed[2]; + struct timeval tv; + lnet_ni_t *ni; + struct list_head *tmp; + + if (seeded) + return; + + cfs_get_random_bytes(seed, sizeof(seed)); + + /* Nodes with small feet have little entropy + * the NID for this node gives the most entropy in the low bits */ + list_for_each(tmp, &the_lnet.ln_nis) { + ni = list_entry(tmp, lnet_ni_t, ni_list); + lnd_type = LNET_NETTYP(LNET_NIDNET(ni->ni_nid)); + + if (lnd_type != LOLND) + seed[0] ^= (LNET_NIDADDR(ni->ni_nid) | lnd_type); + } + + do_gettimeofday(&tv); + cfs_srand(tv.tv_sec ^ seed[0], tv.tv_usec ^ seed[1]); + seeded = 1; +} + +/* NB expects LNET_LOCK held */ +static void +lnet_add_route_to_rnet(lnet_remotenet_t *rnet, lnet_route_t *route) +{ + unsigned int len = 0; + unsigned int offset = 0; + struct list_head *e; + + lnet_shuffle_seed(); + + list_for_each(e, &rnet->lrn_routes) { + len++; + } + + /* len+1 positions to add a new entry, also 
prevents division by 0 */ + offset = cfs_rand() % (len + 1); + list_for_each(e, &rnet->lrn_routes) { + if (offset == 0) + break; + offset--; + } + list_add(&route->lr_list, e); + list_add(&route->lr_gwlist, &route->lr_gateway->lp_routes); + + the_lnet.ln_remote_nets_version++; + lnet_rtr_addref_locked(route->lr_gateway); +} + +int +lnet_add_route(__u32 net, unsigned int hops, lnet_nid_t gateway, + unsigned int priority) +{ + struct list_head *e; + lnet_remotenet_t *rnet; + lnet_remotenet_t *rnet2; + lnet_route_t *route; + lnet_ni_t *ni; + int add_route; + int rc; + + CDEBUG(D_NET, "Add route: net %s hops %u priority %u gw %s\n", + libcfs_net2str(net), hops, priority, libcfs_nid2str(gateway)); + + if (gateway == LNET_NID_ANY || + LNET_NETTYP(LNET_NIDNET(gateway)) == LOLND || + net == LNET_NIDNET(LNET_NID_ANY) || + LNET_NETTYP(net) == LOLND || + LNET_NIDNET(gateway) == net || + hops < 1 || hops > 255) + return -EINVAL; + + if (lnet_islocalnet(net)) /* it's a local network */ + return 0; /* ignore the route entry */ + + /* Assume net, route, all new */ + LIBCFS_ALLOC(route, sizeof(*route)); + LIBCFS_ALLOC(rnet, sizeof(*rnet)); + if (route == NULL || rnet == NULL) { + CERROR("Out of memory creating route %s %d %s\n", + libcfs_net2str(net), hops, libcfs_nid2str(gateway)); + if (route != NULL) + LIBCFS_FREE(route, sizeof(*route)); + if (rnet != NULL) + LIBCFS_FREE(rnet, sizeof(*rnet)); + return -ENOMEM; + } + + INIT_LIST_HEAD(&rnet->lrn_routes); + rnet->lrn_net = net; + route->lr_hops = hops; + route->lr_net = net; + route->lr_priority = priority; + + lnet_net_lock(LNET_LOCK_EX); + + rc = lnet_nid2peer_locked(&route->lr_gateway, gateway, LNET_LOCK_EX); + if (rc != 0) { + lnet_net_unlock(LNET_LOCK_EX); + + LIBCFS_FREE(route, sizeof(*route)); + LIBCFS_FREE(rnet, sizeof(*rnet)); + + if (rc == -EHOSTUNREACH) /* gateway is not on a local net */ + return 0; /* ignore the route entry */ + CERROR("Error %d creating route %s %d %s\n", rc, + libcfs_net2str(net), hops, + libcfs_nid2str(gateway)); + + return rc; + } + + LASSERT(!the_lnet.ln_shutdown); + + rnet2 = lnet_find_net_locked(net); + if (rnet2 == NULL) { + /* new network */ + list_add_tail(&rnet->lrn_list, lnet_net2rnethash(net)); + rnet2 = rnet; + } + + /* Search for a duplicate route (it's a NOOP if it is) */ + add_route = 1; + list_for_each(e, &rnet2->lrn_routes) { + lnet_route_t *route2 = list_entry(e, lnet_route_t, lr_list); + + if (route2->lr_gateway == route->lr_gateway) { + add_route = 0; + break; + } + + /* our lookups must be true */ + LASSERT(route2->lr_gateway->lp_nid != gateway); + } + + if (add_route) { + lnet_peer_addref_locked(route->lr_gateway); /* +1 for notify */ + lnet_add_route_to_rnet(rnet2, route); + + ni = route->lr_gateway->lp_ni; + lnet_net_unlock(LNET_LOCK_EX); + + /* XXX Assume alive */ + if (ni->ni_lnd->lnd_notify != NULL) + (ni->ni_lnd->lnd_notify)(ni, gateway, 1); + + lnet_net_lock(LNET_LOCK_EX); + } + + /* -1 for notify or !add_route */ + lnet_peer_decref_locked(route->lr_gateway); + lnet_net_unlock(LNET_LOCK_EX); + + if (!add_route) + LIBCFS_FREE(route, sizeof(*route)); + + if (rnet != rnet2) + LIBCFS_FREE(rnet, sizeof(*rnet)); + + return 0; +} + +int +lnet_check_routes(void) +{ + lnet_remotenet_t *rnet; + lnet_route_t *route; + lnet_route_t *route2; + struct list_head *e1; + struct list_head *e2; + int cpt; + struct list_head *rn_list; + int i; + + cpt = lnet_net_lock_current(); + + for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) { + rn_list = &the_lnet.ln_remote_nets_hash[i]; + list_for_each(e1, rn_list) { + rnet 
= list_entry(e1, lnet_remotenet_t, lrn_list); + + route2 = NULL; + list_for_each(e2, &rnet->lrn_routes) { + lnet_nid_t nid1; + lnet_nid_t nid2; + int net; + + route = list_entry(e2, lnet_route_t, + lr_list); + + if (route2 == NULL) { + route2 = route; + continue; + } + + if (route->lr_gateway->lp_ni == + route2->lr_gateway->lp_ni) + continue; + + nid1 = route->lr_gateway->lp_nid; + nid2 = route2->lr_gateway->lp_nid; + net = rnet->lrn_net; + + lnet_net_unlock(cpt); + + CERROR("Routes to %s via %s and %s not supported\n", + libcfs_net2str(net), + libcfs_nid2str(nid1), + libcfs_nid2str(nid2)); + return -EINVAL; + } + } + } + + lnet_net_unlock(cpt); + return 0; +} + +int +lnet_del_route(__u32 net, lnet_nid_t gw_nid) +{ + struct lnet_peer *gateway; + lnet_remotenet_t *rnet; + lnet_route_t *route; + struct list_head *e1; + struct list_head *e2; + int rc = -ENOENT; + struct list_head *rn_list; + int idx = 0; + + CDEBUG(D_NET, "Del route: net %s : gw %s\n", + libcfs_net2str(net), libcfs_nid2str(gw_nid)); + + /* NB Caller may specify either all routes via the given gateway + * or a specific route entry (actual NIDs) */ + + lnet_net_lock(LNET_LOCK_EX); + if (net == LNET_NIDNET(LNET_NID_ANY)) + rn_list = &the_lnet.ln_remote_nets_hash[0]; + else + rn_list = lnet_net2rnethash(net); + + again: + list_for_each(e1, rn_list) { + rnet = list_entry(e1, lnet_remotenet_t, lrn_list); + + if (!(net == LNET_NIDNET(LNET_NID_ANY) || + net == rnet->lrn_net)) + continue; + + list_for_each(e2, &rnet->lrn_routes) { + route = list_entry(e2, lnet_route_t, lr_list); + + gateway = route->lr_gateway; + if (!(gw_nid == LNET_NID_ANY || + gw_nid == gateway->lp_nid)) + continue; + + list_del(&route->lr_list); + list_del(&route->lr_gwlist); + the_lnet.ln_remote_nets_version++; + + if (list_empty(&rnet->lrn_routes)) + list_del(&rnet->lrn_list); + else + rnet = NULL; + + lnet_rtr_decref_locked(gateway); + lnet_peer_decref_locked(gateway); + + lnet_net_unlock(LNET_LOCK_EX); + + LIBCFS_FREE(route, sizeof(*route)); + + if (rnet != NULL) + LIBCFS_FREE(rnet, sizeof(*rnet)); + + rc = 0; + lnet_net_lock(LNET_LOCK_EX); + goto again; + } + } + + if (net == LNET_NIDNET(LNET_NID_ANY) && + ++idx < LNET_REMOTE_NETS_HASH_SIZE) { + rn_list = &the_lnet.ln_remote_nets_hash[idx]; + goto again; + } + lnet_net_unlock(LNET_LOCK_EX); + + return rc; +} + +void +lnet_destroy_routes(void) +{ + lnet_del_route(LNET_NIDNET(LNET_NID_ANY), LNET_NID_ANY); +} + +int +lnet_get_route(int idx, __u32 *net, __u32 *hops, + lnet_nid_t *gateway, __u32 *alive, __u32 *priority) +{ + struct list_head *e1; + struct list_head *e2; + lnet_remotenet_t *rnet; + lnet_route_t *route; + int cpt; + int i; + struct list_head *rn_list; + + cpt = lnet_net_lock_current(); + + for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) { + rn_list = &the_lnet.ln_remote_nets_hash[i]; + list_for_each(e1, rn_list) { + rnet = list_entry(e1, lnet_remotenet_t, lrn_list); + + list_for_each(e2, &rnet->lrn_routes) { + route = list_entry(e2, lnet_route_t, + lr_list); + + if (idx-- == 0) { + *net = rnet->lrn_net; + *hops = route->lr_hops; + *priority = route->lr_priority; + *gateway = route->lr_gateway->lp_nid; + *alive = route->lr_gateway->lp_alive; + lnet_net_unlock(cpt); + return 0; + } + } + } + } + + lnet_net_unlock(cpt); + return -ENOENT; +} + +void +lnet_swap_pinginfo(lnet_ping_info_t *info) +{ + int i; + lnet_ni_status_t *stat; + + __swab32s(&info->pi_magic); + __swab32s(&info->pi_features); + __swab32s(&info->pi_pid); + __swab32s(&info->pi_nnis); + for (i = 0; i < info->pi_nnis && i <
LNET_MAX_RTR_NIS; i++) { + stat = &info->pi_ni[i]; + __swab64s(&stat->ns_nid); + __swab32s(&stat->ns_status); + } +} + +/** + * parse router-checker pinginfo, record number of down NIs for remote + * networks on that router. + */ +static void +lnet_parse_rc_info(lnet_rc_data_t *rcd) +{ + lnet_ping_info_t *info = rcd->rcd_pinginfo; + struct lnet_peer *gw = rcd->rcd_gateway; + lnet_route_t *rtr; + + if (!gw->lp_alive) + return; + + if (info->pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) + lnet_swap_pinginfo(info); + + /* NB always racing with network! */ + if (info->pi_magic != LNET_PROTO_PING_MAGIC) { + CDEBUG(D_NET, "%s: Unexpected magic %08x\n", + libcfs_nid2str(gw->lp_nid), info->pi_magic); + gw->lp_ping_feats = LNET_PING_FEAT_INVAL; + return; + } + + gw->lp_ping_feats = info->pi_features; + if ((gw->lp_ping_feats & LNET_PING_FEAT_MASK) == 0) { + CDEBUG(D_NET, "%s: Unexpected features 0x%x\n", + libcfs_nid2str(gw->lp_nid), gw->lp_ping_feats); + return; /* nothing I can understand */ + } + + if ((gw->lp_ping_feats & LNET_PING_FEAT_NI_STATUS) == 0) + return; /* can't carry NI status info */ + + list_for_each_entry(rtr, &gw->lp_routes, lr_gwlist) { + int ptl_status = LNET_NI_STATUS_INVALID; + int down = 0; + int up = 0; + int i; + + for (i = 0; i < info->pi_nnis && i < LNET_MAX_RTR_NIS; i++) { + lnet_ni_status_t *stat = &info->pi_ni[i]; + lnet_nid_t nid = stat->ns_nid; + + if (nid == LNET_NID_ANY) { + CDEBUG(D_NET, "%s: unexpected LNET_NID_ANY\n", + libcfs_nid2str(gw->lp_nid)); + gw->lp_ping_feats = LNET_PING_FEAT_INVAL; + return; + } + + if (LNET_NETTYP(LNET_NIDNET(nid)) == LOLND) + continue; + + if (stat->ns_status == LNET_NI_STATUS_DOWN) { + if (LNET_NETTYP(LNET_NIDNET(nid)) != PTLLND) + down++; + else if (ptl_status != LNET_NI_STATUS_UP) + ptl_status = LNET_NI_STATUS_DOWN; + continue; + } + + if (stat->ns_status == LNET_NI_STATUS_UP) { + if (LNET_NIDNET(nid) == rtr->lr_net) { + up = 1; + break; + } + /* ptl NIs are considered down only when + * they're all down */ + if (LNET_NETTYP(LNET_NIDNET(nid)) == PTLLND) + ptl_status = LNET_NI_STATUS_UP; + continue; + } + + CDEBUG(D_NET, "%s: Unexpected status 0x%x\n", + libcfs_nid2str(gw->lp_nid), stat->ns_status); + gw->lp_ping_feats = LNET_PING_FEAT_INVAL; + return; + } + + if (up) { /* ignore downed NIs if NI for dest network is up */ + rtr->lr_downis = 0; + continue; + } + rtr->lr_downis = down + (ptl_status == LNET_NI_STATUS_DOWN); + } +} + +static void +lnet_router_checker_event(lnet_event_t *event) +{ + lnet_rc_data_t *rcd = event->md.user_ptr; + struct lnet_peer *lp; + + LASSERT(rcd != NULL); + + if (event->unlinked) { + LNetInvalidateHandle(&rcd->rcd_mdh); + return; + } + + LASSERT(event->type == LNET_EVENT_SEND || + event->type == LNET_EVENT_REPLY); + + lp = rcd->rcd_gateway; + LASSERT(lp != NULL); + + /* NB: it's called with holding lnet_res_lock, we have a few + * places need to hold both locks at the same time, please take + * care of lock ordering */ + lnet_net_lock(lp->lp_cpt); + if (!lnet_isrouter(lp) || lp->lp_rcd != rcd) { + /* ignore if no longer a router or rcd is replaced */ + goto out; + } + + if (event->type == LNET_EVENT_SEND) { + lp->lp_ping_notsent = 0; + if (event->status == 0) + goto out; + } + + /* LNET_EVENT_REPLY */ + /* A successful REPLY means the router is up. If _any_ comms + * to the router fail I assume it's down (this will happen if + * we ping alive routers to try to detect router death before + * apps get burned). 
*/ + + lnet_notify_locked(lp, 1, (event->status == 0), cfs_time_current()); + /* The router checker will wake up very shortly and do the + * actual notification. + * XXX If 'lp' stops being a router before then, it will still + * have the notification pending!!! */ + + if (avoid_asym_router_failure && event->status == 0) + lnet_parse_rc_info(rcd); + + out: + lnet_net_unlock(lp->lp_cpt); +} + +static void +lnet_wait_known_routerstate(void) +{ + lnet_peer_t *rtr; + struct list_head *entry; + int all_known; + + LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING); + + for (;;) { + int cpt = lnet_net_lock_current(); + + all_known = 1; + list_for_each(entry, &the_lnet.ln_routers) { + rtr = list_entry(entry, lnet_peer_t, lp_rtr_list); + + if (rtr->lp_alive_count == 0) { + all_known = 0; + break; + } + } + + lnet_net_unlock(cpt); + + if (all_known) + return; + + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + } +} + +void +lnet_router_ni_update_locked(lnet_peer_t *gw, __u32 net) +{ + lnet_route_t *rte; + + if ((gw->lp_ping_feats & LNET_PING_FEAT_NI_STATUS) != 0) { + list_for_each_entry(rte, &gw->lp_routes, lr_gwlist) { + if (rte->lr_net == net) { + rte->lr_downis = 0; + break; + } + } + } +} + +static void +lnet_update_ni_status_locked(void) +{ + lnet_ni_t *ni; + long now; + int timeout; + + LASSERT(the_lnet.ln_routing); + + timeout = router_ping_timeout + + max(live_router_check_interval, dead_router_check_interval); + + now = get_seconds(); + list_for_each_entry(ni, &the_lnet.ln_nis, ni_list) { + if (ni->ni_lnd->lnd_type == LOLND) + continue; + + if (now < ni->ni_last_alive + timeout) + continue; + + lnet_ni_lock(ni); + /* re-check with lock */ + if (now < ni->ni_last_alive + timeout) { + lnet_ni_unlock(ni); + continue; + } + + LASSERT(ni->ni_status != NULL); + + if (ni->ni_status->ns_status != LNET_NI_STATUS_DOWN) { + CDEBUG(D_NET, "NI(%s:%d) status changed to down\n", + libcfs_nid2str(ni->ni_nid), timeout); + /* NB: so far, this is the only place to set + * NI status to "down" */ + ni->ni_status->ns_status = LNET_NI_STATUS_DOWN; + } + lnet_ni_unlock(ni); + } +} + +static void +lnet_destroy_rc_data(lnet_rc_data_t *rcd) +{ + LASSERT(list_empty(&rcd->rcd_list)); + /* detached from network */ + LASSERT(LNetHandleIsInvalid(rcd->rcd_mdh)); + + if (rcd->rcd_gateway != NULL) { + int cpt = rcd->rcd_gateway->lp_cpt; + + lnet_net_lock(cpt); + lnet_peer_decref_locked(rcd->rcd_gateway); + lnet_net_unlock(cpt); + } + + if (rcd->rcd_pinginfo != NULL) + LIBCFS_FREE(rcd->rcd_pinginfo, LNET_PINGINFO_SIZE); + + LIBCFS_FREE(rcd, sizeof(*rcd)); +} + +static lnet_rc_data_t * +lnet_create_rc_data_locked(lnet_peer_t *gateway) +{ + lnet_rc_data_t *rcd = NULL; + lnet_ping_info_t *pi; + int rc; + int i; + + lnet_net_unlock(gateway->lp_cpt); + + LIBCFS_ALLOC(rcd, sizeof(*rcd)); + if (rcd == NULL) + goto out; + + LNetInvalidateHandle(&rcd->rcd_mdh); + INIT_LIST_HEAD(&rcd->rcd_list); + + LIBCFS_ALLOC(pi, LNET_PINGINFO_SIZE); + if (pi == NULL) + goto out; + + for (i = 0; i < LNET_MAX_RTR_NIS; i++) { + pi->pi_ni[i].ns_nid = LNET_NID_ANY; + pi->pi_ni[i].ns_status = LNET_NI_STATUS_INVALID; + } + rcd->rcd_pinginfo = pi; + + LASSERT(!LNetHandleIsInvalid(the_lnet.ln_rc_eqh)); + rc = LNetMDBind((lnet_md_t){.start = pi, + .user_ptr = rcd, + .length = LNET_PINGINFO_SIZE, + .threshold = LNET_MD_THRESH_INF, + .options = LNET_MD_TRUNCATE, + .eq_handle = the_lnet.ln_rc_eqh}, + LNET_UNLINK, + &rcd->rcd_mdh); + if (rc < 0) { + CERROR("Can't bind MD: %d\n", rc); + goto out; + } + LASSERT(rc == 0); + + 
lnet_net_lock(gateway->lp_cpt); + /* router table changed or someone has created rcd for this gateway */ + if (!lnet_isrouter(gateway) || gateway->lp_rcd != NULL) { + lnet_net_unlock(gateway->lp_cpt); + goto out; + } + + lnet_peer_addref_locked(gateway); + rcd->rcd_gateway = gateway; + gateway->lp_rcd = rcd; + gateway->lp_ping_notsent = 0; + + return rcd; + + out: + if (rcd != NULL) { + if (!LNetHandleIsInvalid(rcd->rcd_mdh)) { + rc = LNetMDUnlink(rcd->rcd_mdh); + LASSERT(rc == 0); + } + lnet_destroy_rc_data(rcd); + } + + lnet_net_lock(gateway->lp_cpt); + return gateway->lp_rcd; +} + +static int +lnet_router_check_interval(lnet_peer_t *rtr) +{ + int secs; + + secs = rtr->lp_alive ? live_router_check_interval : + dead_router_check_interval; + if (secs < 0) + secs = 0; + + return secs; +} + +static void +lnet_ping_router_locked(lnet_peer_t *rtr) +{ + lnet_rc_data_t *rcd = NULL; + unsigned long now = cfs_time_current(); + int secs; + + lnet_peer_addref_locked(rtr); + + if (rtr->lp_ping_deadline != 0 && /* ping timed out? */ + cfs_time_after(now, rtr->lp_ping_deadline)) + lnet_notify_locked(rtr, 1, 0, now); + + /* Run any outstanding notifications */ + lnet_ni_notify_locked(rtr->lp_ni, rtr); + + if (!lnet_isrouter(rtr) || + the_lnet.ln_rc_state != LNET_RC_STATE_RUNNING) { + /* router table changed or router checker is shutting down */ + lnet_peer_decref_locked(rtr); + return; + } + + rcd = rtr->lp_rcd != NULL ? + rtr->lp_rcd : lnet_create_rc_data_locked(rtr); + + if (rcd == NULL) + return; + + secs = lnet_router_check_interval(rtr); + + CDEBUG(D_NET, + "rtr %s %d: deadline %lu ping_notsent %d alive %d alive_count %d lp_ping_timestamp %lu\n", + libcfs_nid2str(rtr->lp_nid), secs, + rtr->lp_ping_deadline, rtr->lp_ping_notsent, + rtr->lp_alive, rtr->lp_alive_count, rtr->lp_ping_timestamp); + + if (secs != 0 && !rtr->lp_ping_notsent && + cfs_time_after(now, cfs_time_add(rtr->lp_ping_timestamp, + cfs_time_seconds(secs)))) { + int rc; + lnet_process_id_t id; + lnet_handle_md_t mdh; + + id.nid = rtr->lp_nid; + id.pid = LUSTRE_SRV_LNET_PID; + CDEBUG(D_NET, "Check: %s\n", libcfs_id2str(id)); + + rtr->lp_ping_notsent = 1; + rtr->lp_ping_timestamp = now; + + mdh = rcd->rcd_mdh; + + if (rtr->lp_ping_deadline == 0) { + rtr->lp_ping_deadline = + cfs_time_shift(router_ping_timeout); + } + + lnet_net_unlock(rtr->lp_cpt); + + rc = LNetGet(LNET_NID_ANY, mdh, id, LNET_RESERVED_PORTAL, + LNET_PROTO_PING_MATCHBITS, 0); + + lnet_net_lock(rtr->lp_cpt); + if (rc != 0) + rtr->lp_ping_notsent = 0; /* no event pending */ + } + + lnet_peer_decref_locked(rtr); +} + +int +lnet_router_checker_start(void) +{ + int rc; + int eqsz; + + LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN); + + if (check_routers_before_use && + dead_router_check_interval <= 0) { + LCONSOLE_ERROR_MSG(0x10a, "'dead_router_check_interval' must be set if 'check_routers_before_use' is set\n"); + return -EINVAL; + } + + if (!the_lnet.ln_routing && + live_router_check_interval <= 0 && + dead_router_check_interval <= 0) + return 0; + + sema_init(&the_lnet.ln_rc_signal, 0); + /* EQ size doesn't matter; the callback is guaranteed to get every + * event */ + eqsz = 0; + rc = LNetEQAlloc(eqsz, lnet_router_checker_event, + &the_lnet.ln_rc_eqh); + if (rc != 0) { + CERROR("Can't allocate EQ(%d): %d\n", eqsz, rc); + return -ENOMEM; + } + + the_lnet.ln_rc_state = LNET_RC_STATE_RUNNING; + rc = PTR_ERR(kthread_run(lnet_router_checker, + NULL, "router_checker")); + if (IS_ERR_VALUE(rc)) { + CERROR("Can't start router checker thread: %d\n", rc); + /* block until 
event callback signals exit */ + down(&the_lnet.ln_rc_signal); + rc = LNetEQFree(the_lnet.ln_rc_eqh); + LASSERT(rc == 0); + the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN; + return -ENOMEM; + } + + if (check_routers_before_use) { + /* Note that a helpful side-effect of pinging all known routers + * at startup is that it makes them drop stale connections they + * may have to a previous instance of me. */ + lnet_wait_known_routerstate(); + } + + return 0; +} + +void +lnet_router_checker_stop(void) +{ + int rc; + + if (the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN) + return; + + LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING); + the_lnet.ln_rc_state = LNET_RC_STATE_STOPPING; + + /* block until event callback signals exit */ + down(&the_lnet.ln_rc_signal); + LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN); + + rc = LNetEQFree(the_lnet.ln_rc_eqh); + LASSERT(rc == 0); +} + +static void +lnet_prune_rc_data(int wait_unlink) +{ + lnet_rc_data_t *rcd; + lnet_rc_data_t *tmp; + lnet_peer_t *lp; + struct list_head head; + int i = 2; + + if (likely(the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING && + list_empty(&the_lnet.ln_rcd_deathrow) && + list_empty(&the_lnet.ln_rcd_zombie))) + return; + + INIT_LIST_HEAD(&head); + + lnet_net_lock(LNET_LOCK_EX); + + if (the_lnet.ln_rc_state != LNET_RC_STATE_RUNNING) { + /* router checker is stopping, prune all */ + list_for_each_entry(lp, &the_lnet.ln_routers, + lp_rtr_list) { + if (lp->lp_rcd == NULL) + continue; + + LASSERT(list_empty(&lp->lp_rcd->rcd_list)); + list_add(&lp->lp_rcd->rcd_list, + &the_lnet.ln_rcd_deathrow); + lp->lp_rcd = NULL; + } + } + + /* unlink all RCDs on deathrow list */ + list_splice_init(&the_lnet.ln_rcd_deathrow, &head); + + if (!list_empty(&head)) { + lnet_net_unlock(LNET_LOCK_EX); + + list_for_each_entry(rcd, &head, rcd_list) + LNetMDUnlink(rcd->rcd_mdh); + + lnet_net_lock(LNET_LOCK_EX); + } + + list_splice_init(&head, &the_lnet.ln_rcd_zombie); + + /* release all zombie RCDs */ + while (!list_empty(&the_lnet.ln_rcd_zombie)) { + list_for_each_entry_safe(rcd, tmp, &the_lnet.ln_rcd_zombie, + rcd_list) { + if (LNetHandleIsInvalid(rcd->rcd_mdh)) + list_move(&rcd->rcd_list, &head); + } + + wait_unlink = wait_unlink && + !list_empty(&the_lnet.ln_rcd_zombie); + + lnet_net_unlock(LNET_LOCK_EX); + + while (!list_empty(&head)) { + rcd = list_entry(head.next, + lnet_rc_data_t, rcd_list); + list_del_init(&rcd->rcd_list); + lnet_destroy_rc_data(rcd); + } + + if (!wait_unlink) + return; + + i++; + CDEBUG(((i & (-i)) == i) ? 
D_WARNING : D_NET, + "Waiting for rc buffers to unlink\n"); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1) / 4); + + lnet_net_lock(LNET_LOCK_EX); + } + + lnet_net_unlock(LNET_LOCK_EX); +} + + +#if defined(LNET_ROUTER) + +static int +lnet_router_checker(void *arg) +{ + lnet_peer_t *rtr; + struct list_head *entry; + + cfs_block_allsigs(); + + LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING); + + while (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING) { + __u64 version; + int cpt; + int cpt2; + + cpt = lnet_net_lock_current(); +rescan: + version = the_lnet.ln_routers_version; + + list_for_each(entry, &the_lnet.ln_routers) { + rtr = list_entry(entry, lnet_peer_t, lp_rtr_list); + + cpt2 = lnet_cpt_of_nid_locked(rtr->lp_nid); + if (cpt != cpt2) { + lnet_net_unlock(cpt); + cpt = cpt2; + lnet_net_lock(cpt); + /* the routers list has changed */ + if (version != the_lnet.ln_routers_version) + goto rescan; + } + + lnet_ping_router_locked(rtr); + + /* NB dropped lock */ + if (version != the_lnet.ln_routers_version) { + /* the routers list has changed */ + goto rescan; + } + } + + if (the_lnet.ln_routing) + lnet_update_ni_status_locked(); + + lnet_net_unlock(cpt); + + lnet_prune_rc_data(0); /* don't wait for UNLINK */ + + /* Call schedule_timeout() here always adds 1 to load average + * because kernel counts # active tasks as nr_running + * + nr_uninterruptible. */ + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + } + + LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_STOPPING); + + lnet_prune_rc_data(1); /* wait for UNLINK */ + + the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN; + up(&the_lnet.ln_rc_signal); + /* The unlink event callback will signal final completion */ + return 0; +} + +static void +lnet_destroy_rtrbuf(lnet_rtrbuf_t *rb, int npages) +{ + int sz = offsetof(lnet_rtrbuf_t, rb_kiov[npages]); + + while (--npages >= 0) + __free_page(rb->rb_kiov[npages].kiov_page); + + LIBCFS_FREE(rb, sz); +} + +static lnet_rtrbuf_t * +lnet_new_rtrbuf(lnet_rtrbufpool_t *rbp, int cpt) +{ + int npages = rbp->rbp_npages; + int sz = offsetof(lnet_rtrbuf_t, rb_kiov[npages]); + struct page *page; + lnet_rtrbuf_t *rb; + int i; + + LIBCFS_CPT_ALLOC(rb, lnet_cpt_table(), cpt, sz); + if (rb == NULL) + return NULL; + + rb->rb_pool = rbp; + + for (i = 0; i < npages; i++) { + page = alloc_pages_node( + cfs_cpt_spread_node(lnet_cpt_table(), cpt), + __GFP_ZERO | GFP_IOFS, 0); + if (page == NULL) { + while (--i >= 0) + __free_page(rb->rb_kiov[i].kiov_page); + + LIBCFS_FREE(rb, sz); + return NULL; + } + + rb->rb_kiov[i].kiov_len = PAGE_CACHE_SIZE; + rb->rb_kiov[i].kiov_offset = 0; + rb->rb_kiov[i].kiov_page = page; + } + + return rb; +} + +static void +lnet_rtrpool_free_bufs(lnet_rtrbufpool_t *rbp) +{ + int npages = rbp->rbp_npages; + int nbuffers = 0; + lnet_rtrbuf_t *rb; + + if (rbp->rbp_nbuffers == 0) /* not initialized or already freed */ + return; + + LASSERT(list_empty(&rbp->rbp_msgs)); + LASSERT(rbp->rbp_credits == rbp->rbp_nbuffers); + + while (!list_empty(&rbp->rbp_bufs)) { + LASSERT(rbp->rbp_credits > 0); + + rb = list_entry(rbp->rbp_bufs.next, + lnet_rtrbuf_t, rb_list); + list_del(&rb->rb_list); + lnet_destroy_rtrbuf(rb, npages); + nbuffers++; + } + + LASSERT(rbp->rbp_nbuffers == nbuffers); + LASSERT(rbp->rbp_credits == nbuffers); + + rbp->rbp_nbuffers = rbp->rbp_credits = 0; +} + +static int +lnet_rtrpool_alloc_bufs(lnet_rtrbufpool_t *rbp, int nbufs, int cpt) +{ + lnet_rtrbuf_t *rb; + int i; + + if (rbp->rbp_nbuffers != 0) { + 
LASSERT(rbp->rbp_nbuffers == nbufs); + return 0; + } + + for (i = 0; i < nbufs; i++) { + rb = lnet_new_rtrbuf(rbp, cpt); + + if (rb == NULL) { + CERROR("Failed to allocate %d router bufs of %d pages\n", + nbufs, rbp->rbp_npages); + return -ENOMEM; + } + + rbp->rbp_nbuffers++; + rbp->rbp_credits++; + rbp->rbp_mincredits++; + list_add(&rb->rb_list, &rbp->rbp_bufs); + + /* No allocation "under fire" */ + /* Otherwise we'd need code to schedule blocked msgs etc */ + LASSERT(!the_lnet.ln_routing); + } + + LASSERT(rbp->rbp_credits == nbufs); + return 0; +} + +static void +lnet_rtrpool_init(lnet_rtrbufpool_t *rbp, int npages) +{ + INIT_LIST_HEAD(&rbp->rbp_msgs); + INIT_LIST_HEAD(&rbp->rbp_bufs); + + rbp->rbp_npages = npages; + rbp->rbp_credits = 0; + rbp->rbp_mincredits = 0; +} + +void +lnet_rtrpools_free(void) +{ + lnet_rtrbufpool_t *rtrp; + int i; + + if (the_lnet.ln_rtrpools == NULL) /* uninitialized or freed */ + return; + + cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) { + lnet_rtrpool_free_bufs(&rtrp[0]); + lnet_rtrpool_free_bufs(&rtrp[1]); + lnet_rtrpool_free_bufs(&rtrp[2]); + } + + cfs_percpt_free(the_lnet.ln_rtrpools); + the_lnet.ln_rtrpools = NULL; +} + +static int +lnet_nrb_tiny_calculate(int npages) +{ + int nrbs = LNET_NRB_TINY; + + if (tiny_router_buffers < 0) { + LCONSOLE_ERROR_MSG(0x10c, + "tiny_router_buffers=%d invalid when routing enabled\n", + tiny_router_buffers); + return -1; + } + + if (tiny_router_buffers > 0) + nrbs = tiny_router_buffers; + + nrbs /= LNET_CPT_NUMBER; + return max(nrbs, LNET_NRB_TINY_MIN); +} + +static int +lnet_nrb_small_calculate(int npages) +{ + int nrbs = LNET_NRB_SMALL; + + if (small_router_buffers < 0) { + LCONSOLE_ERROR_MSG(0x10c, + "small_router_buffers=%d invalid when routing enabled\n", + small_router_buffers); + return -1; + } + + if (small_router_buffers > 0) + nrbs = small_router_buffers; + + nrbs /= LNET_CPT_NUMBER; + return max(nrbs, LNET_NRB_SMALL_MIN); +} + +static int +lnet_nrb_large_calculate(int npages) +{ + int nrbs = LNET_NRB_LARGE; + + if (large_router_buffers < 0) { + LCONSOLE_ERROR_MSG(0x10c, + "large_router_buffers=%d invalid when routing enabled\n", + large_router_buffers); + return -1; + } + + if (large_router_buffers > 0) + nrbs = large_router_buffers; + + nrbs /= LNET_CPT_NUMBER; + return max(nrbs, LNET_NRB_LARGE_MIN); +} + +int +lnet_rtrpools_alloc(int im_a_router) +{ + lnet_rtrbufpool_t *rtrp; + int large_pages; + int small_pages = 1; + int nrb_tiny; + int nrb_small; + int nrb_large; + int rc; + int i; + + large_pages = (LNET_MTU + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + + if (!strcmp(forwarding, "")) { + /* not set either way */ + if (!im_a_router) + return 0; + } else if (!strcmp(forwarding, "disabled")) { + /* explicitly disabled */ + return 0; + } else if (!strcmp(forwarding, "enabled")) { + /* explicitly enabled */ + } else { + LCONSOLE_ERROR_MSG(0x10b, "'forwarding' not set to either 'enabled' or 'disabled'\n"); + return -EINVAL; + } + + nrb_tiny = lnet_nrb_tiny_calculate(0); + if (nrb_tiny < 0) + return -EINVAL; + + nrb_small = lnet_nrb_small_calculate(small_pages); + if (nrb_small < 0) + return -EINVAL; + + nrb_large = lnet_nrb_large_calculate(large_pages); + if (nrb_large < 0) + return -EINVAL; + + the_lnet.ln_rtrpools = cfs_percpt_alloc(lnet_cpt_table(), + LNET_NRBPOOLS * + sizeof(lnet_rtrbufpool_t)); + if (the_lnet.ln_rtrpools == NULL) { + LCONSOLE_ERROR_MSG(0x10c, + "Failed to initialize router buffe pool\n"); + return -ENOMEM; + } + + cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) { + 
lnet_rtrpool_init(&rtrp[0], 0); + rc = lnet_rtrpool_alloc_bufs(&rtrp[0], nrb_tiny, i); + if (rc != 0) + goto failed; + + lnet_rtrpool_init(&rtrp[1], small_pages); + rc = lnet_rtrpool_alloc_bufs(&rtrp[1], nrb_small, i); + if (rc != 0) + goto failed; + + lnet_rtrpool_init(&rtrp[2], large_pages); + rc = lnet_rtrpool_alloc_bufs(&rtrp[2], nrb_large, i); + if (rc != 0) + goto failed; + } + + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_routing = 1; + lnet_net_unlock(LNET_LOCK_EX); + + return 0; + + failed: + lnet_rtrpools_free(); + return rc; +} + +int +lnet_notify(lnet_ni_t *ni, lnet_nid_t nid, int alive, unsigned long when) +{ + struct lnet_peer *lp = NULL; + unsigned long now = cfs_time_current(); + int cpt = lnet_cpt_of_nid(nid); + + LASSERT(!in_interrupt ()); + + CDEBUG(D_NET, "%s notifying %s: %s\n", + (ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid), + libcfs_nid2str(nid), + alive ? "up" : "down"); + + if (ni != NULL && + LNET_NIDNET(ni->ni_nid) != LNET_NIDNET(nid)) { + CWARN("Ignoring notification of %s %s by %s (different net)\n", + libcfs_nid2str(nid), alive ? "birth" : "death", + libcfs_nid2str(ni->ni_nid)); + return -EINVAL; + } + + /* can't do predictions... */ + if (cfs_time_after(when, now)) { + CWARN("Ignoring prediction from %s of %s %s %ld seconds in the future\n", + (ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid), + libcfs_nid2str(nid), alive ? "up" : "down", + cfs_duration_sec(cfs_time_sub(when, now))); + return -EINVAL; + } + + if (ni != NULL && !alive && /* LND telling me she's down */ + !auto_down) { /* auto-down disabled */ + CDEBUG(D_NET, "Auto-down disabled\n"); + return 0; + } + + lnet_net_lock(cpt); + + if (the_lnet.ln_shutdown) { + lnet_net_unlock(cpt); + return -ESHUTDOWN; + } + + lp = lnet_find_peer_locked(the_lnet.ln_peer_tables[cpt], nid); + if (lp == NULL) { + /* nid not found */ + lnet_net_unlock(cpt); + CDEBUG(D_NET, "%s not found\n", libcfs_nid2str(nid)); + return 0; + } + + /* We can't fully trust LND on reporting exact peer last_alive + * if he notifies us about dead peer. 
For example ksocklnd can + * call us with when == _time_when_the_node_was_booted_ if + * no connections were successfully established */ + if (ni != NULL && !alive && when < lp->lp_last_alive) + when = lp->lp_last_alive; + + lnet_notify_locked(lp, ni == NULL, alive, when); + + lnet_ni_notify_locked(ni, lp); + + lnet_peer_decref_locked(lp); + + lnet_net_unlock(cpt); + return 0; +} +EXPORT_SYMBOL(lnet_notify); + +void +lnet_get_tunables(void) +{ +} + +#else + +int +lnet_notify(lnet_ni_t *ni, lnet_nid_t nid, int alive, unsigned long when) +{ + return -EOPNOTSUPP; +} + +void +lnet_router_checker(void) +{ + static time_t last; + static int running; + + time_t now = get_seconds(); + int interval = now - last; + int rc; + __u64 version; + lnet_peer_t *rtr; + + /* It's no use to call me again within a sec - all intervals and + * timeouts are measured in seconds */ + if (last != 0 && interval < 2) + return; + + if (last != 0 && + interval > max(live_router_check_interval, + dead_router_check_interval)) + CNETERR("Checker(%d/%d) not called for %d seconds\n", + live_router_check_interval, dead_router_check_interval, + interval); + + LASSERT(LNET_CPT_NUMBER == 1); + + lnet_net_lock(0); + LASSERT(!running); /* recursion check */ + running = 1; + lnet_net_unlock(0); + + last = now; + + if (the_lnet.ln_rc_state == LNET_RC_STATE_STOPPING) + lnet_prune_rc_data(0); /* unlink all rcd and nowait */ + + /* consume all pending events */ + while (1) { + int i; + lnet_event_t ev; + + /* NB ln_rc_eqh must be the 1st in 'eventqs' otherwise the + * recursion breaker in LNetEQPoll would fail */ + rc = LNetEQPoll(&the_lnet.ln_rc_eqh, 1, 0, &ev, &i); + if (rc == 0) /* no event pending */ + break; + + /* NB a lost SENT prevents me from pinging a router again */ + if (rc == -EOVERFLOW) { + CERROR("Dropped an event!!!\n"); + abort(); + } + + LASSERT(rc == 1); + + lnet_router_checker_event(&ev); + } + + if (the_lnet.ln_rc_state == LNET_RC_STATE_STOPPING) { + lnet_prune_rc_data(1); /* release rcd */ + the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN; + running = 0; + return; + } + + LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING); + + lnet_net_lock(0); + + version = the_lnet.ln_routers_version; + list_for_each_entry(rtr, &the_lnet.ln_routers, lp_rtr_list) { + lnet_ping_router_locked(rtr); + LASSERT(version == the_lnet.ln_routers_version); + } + + lnet_net_unlock(0); + + running = 0; /* lock only needed for the recursion check */ +} + +/* NB lnet_peers_start_down depends on me, + * so must be called before any peer creation */ +void +lnet_get_tunables(void) +{ + char *s; + + s = getenv("LNET_ROUTER_PING_TIMEOUT"); + if (s != NULL) + router_ping_timeout = atoi(s); + + s = getenv("LNET_LIVE_ROUTER_CHECK_INTERVAL"); + if (s != NULL) + live_router_check_interval = atoi(s); + + s = getenv("LNET_DEAD_ROUTER_CHECK_INTERVAL"); + if (s != NULL) + dead_router_check_interval = atoi(s); + + /* This replaces old lnd_notify mechanism */ + check_routers_before_use = 1; + if (dead_router_check_interval <= 0) + dead_router_check_interval = 30; +} + +void +lnet_rtrpools_free(void) +{ +} + +int +lnet_rtrpools_alloc(int im_a_arouter) +{ + return 0; +} + +#endif diff --git a/kernel/drivers/staging/lustre/lnet/lnet/router_proc.c b/kernel/drivers/staging/lustre/lnet/lnet/router_proc.c new file mode 100644 index 000000000..c055afc86 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/lnet/router_proc.c @@ -0,0 +1,968 @@ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. 
+ * + * Copyright (c) 2011, 2012, Intel Corporation. + * + * This file is part of Portals + * http://sourceforge.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#define DEBUG_SUBSYSTEM S_LNET +#include "../../include/linux/libcfs/libcfs.h" +#include "../../include/linux/lnet/lib-lnet.h" + +#if defined(LNET_ROUTER) + +/* This is really lnet_proc.c. You might need to update sanity test 215 + * if any file format is changed. */ + +static struct ctl_table_header *lnet_table_header; + +#define CTL_LNET (0x100) +enum { + PSDEV_LNET_STATS = 100, + PSDEV_LNET_ROUTES, + PSDEV_LNET_ROUTERS, + PSDEV_LNET_PEERS, + PSDEV_LNET_BUFFERS, + PSDEV_LNET_NIS, + PSDEV_LNET_PTL_ROTOR, +}; + +#define LNET_LOFFT_BITS (sizeof(loff_t) * 8) +/* + * NB: max allowed LNET_CPT_BITS is 8 on 64-bit system and 2 on 32-bit system + */ +#define LNET_PROC_CPT_BITS (LNET_CPT_BITS + 1) +/* change version, 16 bits or 8 bits */ +#define LNET_PROC_VER_BITS max_t(size_t, min_t(size_t, LNET_LOFFT_BITS, 64) / 4, 8) + +#define LNET_PROC_HASH_BITS LNET_PEER_HASH_BITS +/* + * bits for peer hash offset + * NB: we don't use the highest bit of *ppos because it's signed + */ +#define LNET_PROC_HOFF_BITS (LNET_LOFFT_BITS - \ + LNET_PROC_CPT_BITS - \ + LNET_PROC_VER_BITS - \ + LNET_PROC_HASH_BITS - 1) +/* bits for hash index + position */ +#define LNET_PROC_HPOS_BITS (LNET_PROC_HASH_BITS + LNET_PROC_HOFF_BITS) +/* bits for peer hash table + hash version */ +#define LNET_PROC_VPOS_BITS (LNET_PROC_HPOS_BITS + LNET_PROC_VER_BITS) + +#define LNET_PROC_CPT_MASK ((1ULL << LNET_PROC_CPT_BITS) - 1) +#define LNET_PROC_VER_MASK ((1ULL << LNET_PROC_VER_BITS) - 1) +#define LNET_PROC_HASH_MASK ((1ULL << LNET_PROC_HASH_BITS) - 1) +#define LNET_PROC_HOFF_MASK ((1ULL << LNET_PROC_HOFF_BITS) - 1) + +#define LNET_PROC_CPT_GET(pos) \ + (int)(((pos) >> LNET_PROC_VPOS_BITS) & LNET_PROC_CPT_MASK) + +#define LNET_PROC_VER_GET(pos) \ + (int)(((pos) >> LNET_PROC_HPOS_BITS) & LNET_PROC_VER_MASK) + +#define LNET_PROC_HASH_GET(pos) \ + (int)(((pos) >> LNET_PROC_HOFF_BITS) & LNET_PROC_HASH_MASK) + +#define LNET_PROC_HOFF_GET(pos) \ + (int)((pos) & LNET_PROC_HOFF_MASK) + +#define LNET_PROC_POS_MAKE(cpt, ver, hash, off) \ + (((((loff_t)(cpt)) & LNET_PROC_CPT_MASK) << LNET_PROC_VPOS_BITS) | \ + ((((loff_t)(ver)) & LNET_PROC_VER_MASK) << LNET_PROC_HPOS_BITS) | \ + ((((loff_t)(hash)) & LNET_PROC_HASH_MASK) << LNET_PROC_HOFF_BITS) | \ + ((off) & LNET_PROC_HOFF_MASK)) + +#define LNET_PROC_VERSION(v) ((unsigned int)((v) & LNET_PROC_VER_MASK)) + +static int proc_call_handler(void *data, int write, loff_t *ppos, + void __user *buffer, size_t *lenp, + int (*handler)(void *data, int write, + loff_t pos, void __user *buffer, int len)) +{ + int rc = handler(data, write, *ppos, buffer, *lenp); + + if (rc < 0) + return rc; + + if (write) { + *ppos += *lenp; + } else { + *lenp = rc; + *ppos += rc; + } + return 0; +} + +static int __proc_lnet_stats(void *data, int write, + 
loff_t pos, void __user *buffer, int nob) +{ + int rc; + lnet_counters_t *ctrs; + int len; + char *tmpstr; + const int tmpsiz = 256; /* 7 %u and 4 %llu */ + + if (write) { + lnet_counters_reset(); + return 0; + } + + /* read */ + + LIBCFS_ALLOC(ctrs, sizeof(*ctrs)); + if (ctrs == NULL) + return -ENOMEM; + + LIBCFS_ALLOC(tmpstr, tmpsiz); + if (tmpstr == NULL) { + LIBCFS_FREE(ctrs, sizeof(*ctrs)); + return -ENOMEM; + } + + lnet_counters_get(ctrs); + + len = snprintf(tmpstr, tmpsiz, + "%u %u %u %u %u %u %u %llu %llu %llu %llu", + ctrs->msgs_alloc, ctrs->msgs_max, + ctrs->errors, + ctrs->send_count, ctrs->recv_count, + ctrs->route_count, ctrs->drop_count, + ctrs->send_length, ctrs->recv_length, + ctrs->route_length, ctrs->drop_length); + + if (pos >= min_t(int, len, strlen(tmpstr))) + rc = 0; + else + rc = cfs_trace_copyout_string(buffer, nob, + tmpstr + pos, "\n"); + + LIBCFS_FREE(tmpstr, tmpsiz); + LIBCFS_FREE(ctrs, sizeof(*ctrs)); + return rc; +} + +static int proc_lnet_stats(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return proc_call_handler(table->data, write, ppos, buffer, lenp, + __proc_lnet_stats); +} + +static int proc_lnet_routes(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + const int tmpsiz = 256; + char *tmpstr; + char *s; + int rc = 0; + int len; + int ver; + int off; + + CLASSERT(sizeof(loff_t) >= 4); + + off = LNET_PROC_HOFF_GET(*ppos); + ver = LNET_PROC_VER_GET(*ppos); + + LASSERT(!write); + + if (*lenp == 0) + return 0; + + LIBCFS_ALLOC(tmpstr, tmpsiz); + if (tmpstr == NULL) + return -ENOMEM; + + s = tmpstr; /* points to current position in tmpstr[] */ + + if (*ppos == 0) { + s += snprintf(s, tmpstr + tmpsiz - s, "Routing %s\n", + the_lnet.ln_routing ? "enabled" : "disabled"); + LASSERT(tmpstr + tmpsiz - s > 0); + + s += snprintf(s, tmpstr + tmpsiz - s, "%-8s %4s %8s %7s %s\n", + "net", "hops", "priority", "state", "router"); + LASSERT(tmpstr + tmpsiz - s > 0); + + lnet_net_lock(0); + ver = (unsigned int)the_lnet.ln_remote_nets_version; + lnet_net_unlock(0); + *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off); + } else { + struct list_head *n; + struct list_head *r; + lnet_route_t *route = NULL; + lnet_remotenet_t *rnet = NULL; + int skip = off - 1; + struct list_head *rn_list; + int i; + + lnet_net_lock(0); + + if (ver != LNET_PROC_VERSION(the_lnet.ln_remote_nets_version)) { + lnet_net_unlock(0); + LIBCFS_FREE(tmpstr, tmpsiz); + return -ESTALE; + } + + for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE && route == NULL; + i++) { + rn_list = &the_lnet.ln_remote_nets_hash[i]; + + n = rn_list->next; + + while (n != rn_list && route == NULL) { + rnet = list_entry(n, lnet_remotenet_t, + lrn_list); + + r = rnet->lrn_routes.next; + + while (r != &rnet->lrn_routes) { + lnet_route_t *re = + list_entry(r, lnet_route_t, + lr_list); + if (skip == 0) { + route = re; + break; + } + + skip--; + r = r->next; + } + + n = n->next; + } + } + + if (route != NULL) { + __u32 net = rnet->lrn_net; + unsigned int hops = route->lr_hops; + unsigned int priority = route->lr_priority; + lnet_nid_t nid = route->lr_gateway->lp_nid; + int alive = route->lr_gateway->lp_alive; + + s += snprintf(s, tmpstr + tmpsiz - s, + "%-8s %4u %8u %7s %s\n", + libcfs_net2str(net), hops, + priority, + alive ? 
"up" : "down", + libcfs_nid2str(nid)); + LASSERT(tmpstr + tmpsiz - s > 0); + } + + lnet_net_unlock(0); + } + + len = s - tmpstr; /* how many bytes was written */ + + if (len > *lenp) { /* linux-supplied buffer is too small */ + rc = -EINVAL; + } else if (len > 0) { /* wrote something */ + if (copy_to_user(buffer, tmpstr, len)) + rc = -EFAULT; + else { + off += 1; + *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off); + } + } + + LIBCFS_FREE(tmpstr, tmpsiz); + + if (rc == 0) + *lenp = len; + + return rc; +} + +static int proc_lnet_routers(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int rc = 0; + char *tmpstr; + char *s; + const int tmpsiz = 256; + int len; + int ver; + int off; + + off = LNET_PROC_HOFF_GET(*ppos); + ver = LNET_PROC_VER_GET(*ppos); + + LASSERT(!write); + + if (*lenp == 0) + return 0; + + LIBCFS_ALLOC(tmpstr, tmpsiz); + if (tmpstr == NULL) + return -ENOMEM; + + s = tmpstr; /* points to current position in tmpstr[] */ + + if (*ppos == 0) { + s += snprintf(s, tmpstr + tmpsiz - s, + "%-4s %7s %9s %6s %12s %9s %8s %7s %s\n", + "ref", "rtr_ref", "alive_cnt", "state", + "last_ping", "ping_sent", "deadline", + "down_ni", "router"); + LASSERT(tmpstr + tmpsiz - s > 0); + + lnet_net_lock(0); + ver = (unsigned int)the_lnet.ln_routers_version; + lnet_net_unlock(0); + *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off); + } else { + struct list_head *r; + struct lnet_peer *peer = NULL; + int skip = off - 1; + + lnet_net_lock(0); + + if (ver != LNET_PROC_VERSION(the_lnet.ln_routers_version)) { + lnet_net_unlock(0); + + LIBCFS_FREE(tmpstr, tmpsiz); + return -ESTALE; + } + + r = the_lnet.ln_routers.next; + + while (r != &the_lnet.ln_routers) { + lnet_peer_t *lp = list_entry(r, lnet_peer_t, + lp_rtr_list); + + if (skip == 0) { + peer = lp; + break; + } + + skip--; + r = r->next; + } + + if (peer != NULL) { + lnet_nid_t nid = peer->lp_nid; + unsigned long now = cfs_time_current(); + unsigned long deadline = peer->lp_ping_deadline; + int nrefs = peer->lp_refcount; + int nrtrrefs = peer->lp_rtr_refcount; + int alive_cnt = peer->lp_alive_count; + int alive = peer->lp_alive; + int pingsent = !peer->lp_ping_notsent; + int last_ping = cfs_duration_sec(cfs_time_sub(now, + peer->lp_ping_timestamp)); + int down_ni = 0; + lnet_route_t *rtr; + + if ((peer->lp_ping_feats & + LNET_PING_FEAT_NI_STATUS) != 0) { + list_for_each_entry(rtr, &peer->lp_routes, + lr_gwlist) { + /* downis on any route should be the + * number of downis on the gateway */ + if (rtr->lr_downis != 0) { + down_ni = rtr->lr_downis; + break; + } + } + } + + if (deadline == 0) + s += snprintf(s, tmpstr + tmpsiz - s, + "%-4d %7d %9d %6s %12d %9d %8s %7d %s\n", + nrefs, nrtrrefs, alive_cnt, + alive ? "up" : "down", last_ping, + pingsent, "NA", down_ni, + libcfs_nid2str(nid)); + else + s += snprintf(s, tmpstr + tmpsiz - s, + "%-4d %7d %9d %6s %12d %9d %8lu %7d %s\n", + nrefs, nrtrrefs, alive_cnt, + alive ? 
"up" : "down", last_ping, + pingsent, + cfs_duration_sec(cfs_time_sub(deadline, now)), + down_ni, libcfs_nid2str(nid)); + LASSERT(tmpstr + tmpsiz - s > 0); + } + + lnet_net_unlock(0); + } + + len = s - tmpstr; /* how many bytes was written */ + + if (len > *lenp) { /* linux-supplied buffer is too small */ + rc = -EINVAL; + } else if (len > 0) { /* wrote something */ + if (copy_to_user(buffer, tmpstr, len)) + rc = -EFAULT; + else { + off += 1; + *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off); + } + } + + LIBCFS_FREE(tmpstr, tmpsiz); + + if (rc == 0) + *lenp = len; + + return rc; +} + +static int proc_lnet_peers(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + const int tmpsiz = 256; + struct lnet_peer_table *ptable; + char *tmpstr; + char *s; + int cpt = LNET_PROC_CPT_GET(*ppos); + int ver = LNET_PROC_VER_GET(*ppos); + int hash = LNET_PROC_HASH_GET(*ppos); + int hoff = LNET_PROC_HOFF_GET(*ppos); + int rc = 0; + int len; + + CLASSERT(LNET_PROC_HASH_BITS >= LNET_PEER_HASH_BITS); + LASSERT(!write); + + if (*lenp == 0) + return 0; + + if (cpt >= LNET_CPT_NUMBER) { + *lenp = 0; + return 0; + } + + LIBCFS_ALLOC(tmpstr, tmpsiz); + if (tmpstr == NULL) + return -ENOMEM; + + s = tmpstr; /* points to current position in tmpstr[] */ + + if (*ppos == 0) { + s += snprintf(s, tmpstr + tmpsiz - s, + "%-24s %4s %5s %5s %5s %5s %5s %5s %5s %s\n", + "nid", "refs", "state", "last", "max", + "rtr", "min", "tx", "min", "queue"); + LASSERT(tmpstr + tmpsiz - s > 0); + + hoff++; + } else { + struct lnet_peer *peer; + struct list_head *p; + int skip; + again: + p = NULL; + peer = NULL; + skip = hoff - 1; + + lnet_net_lock(cpt); + ptable = the_lnet.ln_peer_tables[cpt]; + if (hoff == 1) + ver = LNET_PROC_VERSION(ptable->pt_version); + + if (ver != LNET_PROC_VERSION(ptable->pt_version)) { + lnet_net_unlock(cpt); + LIBCFS_FREE(tmpstr, tmpsiz); + return -ESTALE; + } + + while (hash < LNET_PEER_HASH_SIZE) { + if (p == NULL) + p = ptable->pt_hash[hash].next; + + while (p != &ptable->pt_hash[hash]) { + lnet_peer_t *lp = list_entry(p, lnet_peer_t, + lp_hashlist); + if (skip == 0) { + peer = lp; + + /* minor optimization: start from idx+1 + * on next iteration if we've just + * drained lp_hashlist */ + if (lp->lp_hashlist.next == + &ptable->pt_hash[hash]) { + hoff = 1; + hash++; + } else { + hoff++; + } + + break; + } + + skip--; + p = lp->lp_hashlist.next; + } + + if (peer != NULL) + break; + + p = NULL; + hoff = 1; + hash++; + } + + if (peer != NULL) { + lnet_nid_t nid = peer->lp_nid; + int nrefs = peer->lp_refcount; + int lastalive = -1; + char *aliveness = "NA"; + int maxcr = peer->lp_ni->ni_peertxcredits; + int txcr = peer->lp_txcredits; + int mintxcr = peer->lp_mintxcredits; + int rtrcr = peer->lp_rtrcredits; + int minrtrcr = peer->lp_minrtrcredits; + int txqnob = peer->lp_txqnob; + + if (lnet_isrouter(peer) || + lnet_peer_aliveness_enabled(peer)) + aliveness = peer->lp_alive ? 
"up" : "down"; + + if (lnet_peer_aliveness_enabled(peer)) { + unsigned long now = cfs_time_current(); + long delta; + + delta = cfs_time_sub(now, peer->lp_last_alive); + lastalive = cfs_duration_sec(delta); + + /* No need to mess up peers contents with + * arbitrarily long integers - it suffices to + * know that lastalive is more than 10000s old + */ + if (lastalive >= 10000) + lastalive = 9999; + } + + lnet_net_unlock(cpt); + + s += snprintf(s, tmpstr + tmpsiz - s, + "%-24s %4d %5s %5d %5d %5d %5d %5d %5d %d\n", + libcfs_nid2str(nid), nrefs, aliveness, + lastalive, maxcr, rtrcr, minrtrcr, txcr, + mintxcr, txqnob); + LASSERT(tmpstr + tmpsiz - s > 0); + + } else { /* peer is NULL */ + lnet_net_unlock(cpt); + } + + if (hash == LNET_PEER_HASH_SIZE) { + cpt++; + hash = 0; + hoff = 1; + if (peer == NULL && cpt < LNET_CPT_NUMBER) + goto again; + } + } + + len = s - tmpstr; /* how many bytes was written */ + + if (len > *lenp) { /* linux-supplied buffer is too small */ + rc = -EINVAL; + } else if (len > 0) { /* wrote something */ + if (copy_to_user(buffer, tmpstr, len)) + rc = -EFAULT; + else + *ppos = LNET_PROC_POS_MAKE(cpt, ver, hash, hoff); + } + + LIBCFS_FREE(tmpstr, tmpsiz); + + if (rc == 0) + *lenp = len; + + return rc; +} + +static int __proc_lnet_buffers(void *data, int write, + loff_t pos, void __user *buffer, int nob) +{ + char *s; + char *tmpstr; + int tmpsiz; + int idx; + int len; + int rc; + int i; + + LASSERT(!write); + + /* (4 %d) * 4 * LNET_CPT_NUMBER */ + tmpsiz = 64 * (LNET_NRBPOOLS + 1) * LNET_CPT_NUMBER; + LIBCFS_ALLOC(tmpstr, tmpsiz); + if (tmpstr == NULL) + return -ENOMEM; + + s = tmpstr; /* points to current position in tmpstr[] */ + + s += snprintf(s, tmpstr + tmpsiz - s, + "%5s %5s %7s %7s\n", + "pages", "count", "credits", "min"); + LASSERT(tmpstr + tmpsiz - s > 0); + + if (the_lnet.ln_rtrpools == NULL) + goto out; /* I'm not a router */ + + for (idx = 0; idx < LNET_NRBPOOLS; idx++) { + lnet_rtrbufpool_t *rbp; + + lnet_net_lock(LNET_LOCK_EX); + cfs_percpt_for_each(rbp, i, the_lnet.ln_rtrpools) { + s += snprintf(s, tmpstr + tmpsiz - s, + "%5d %5d %7d %7d\n", + rbp[idx].rbp_npages, + rbp[idx].rbp_nbuffers, + rbp[idx].rbp_credits, + rbp[idx].rbp_mincredits); + LASSERT(tmpstr + tmpsiz - s > 0); + } + lnet_net_unlock(LNET_LOCK_EX); + } + + out: + len = s - tmpstr; + + if (pos >= min_t(int, len, strlen(tmpstr))) + rc = 0; + else + rc = cfs_trace_copyout_string(buffer, nob, + tmpstr + pos, NULL); + + LIBCFS_FREE(tmpstr, tmpsiz); + return rc; +} + +static int proc_lnet_buffers(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return proc_call_handler(table->data, write, ppos, buffer, lenp, + __proc_lnet_buffers); +} + +static int proc_lnet_nis(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int tmpsiz = 128 * LNET_CPT_NUMBER; + int rc = 0; + char *tmpstr; + char *s; + int len; + + LASSERT(!write); + + if (*lenp == 0) + return 0; + + LIBCFS_ALLOC(tmpstr, tmpsiz); + if (tmpstr == NULL) + return -ENOMEM; + + s = tmpstr; /* points to current position in tmpstr[] */ + + if (*ppos == 0) { + s += snprintf(s, tmpstr + tmpsiz - s, + "%-24s %6s %5s %4s %4s %4s %5s %5s %5s\n", + "nid", "status", "alive", "refs", "peer", + "rtr", "max", "tx", "min"); + LASSERT(tmpstr + tmpsiz - s > 0); + } else { + struct list_head *n; + lnet_ni_t *ni = NULL; + int skip = *ppos - 1; + + lnet_net_lock(0); + + n = the_lnet.ln_nis.next; + + while (n != &the_lnet.ln_nis) { + lnet_ni_t *a_ni = list_entry(n, lnet_ni_t, ni_list); 
+ + if (skip == 0) { + ni = a_ni; + break; + } + + skip--; + n = n->next; + } + + if (ni != NULL) { + struct lnet_tx_queue *tq; + char *stat; + long now = get_seconds(); + int last_alive = -1; + int i; + int j; + + if (the_lnet.ln_routing) + last_alive = now - ni->ni_last_alive; + + /* @lo forever alive */ + if (ni->ni_lnd->lnd_type == LOLND) + last_alive = 0; + + lnet_ni_lock(ni); + LASSERT(ni->ni_status != NULL); + stat = (ni->ni_status->ns_status == + LNET_NI_STATUS_UP) ? "up" : "down"; + lnet_ni_unlock(ni); + + /* we actually output credits information for + * TX queue of each partition */ + cfs_percpt_for_each(tq, i, ni->ni_tx_queues) { + for (j = 0; ni->ni_cpts != NULL && + j < ni->ni_ncpts; j++) { + if (i == ni->ni_cpts[j]) + break; + } + + if (j == ni->ni_ncpts) + continue; + + if (i != 0) + lnet_net_lock(i); + + s += snprintf(s, tmpstr + tmpsiz - s, + "%-24s %6s %5d %4d %4d %4d %5d %5d %5d\n", + libcfs_nid2str(ni->ni_nid), stat, + last_alive, *ni->ni_refs[i], + ni->ni_peertxcredits, + ni->ni_peerrtrcredits, + tq->tq_credits_max, + tq->tq_credits, tq->tq_credits_min); + if (i != 0) + lnet_net_unlock(i); + } + LASSERT(tmpstr + tmpsiz - s > 0); + } + + lnet_net_unlock(0); + } + + len = s - tmpstr; /* how many bytes was written */ + + if (len > *lenp) { /* linux-supplied buffer is too small */ + rc = -EINVAL; + } else if (len > 0) { /* wrote something */ + if (copy_to_user(buffer, tmpstr, len)) + rc = -EFAULT; + else + *ppos += 1; + } + + LIBCFS_FREE(tmpstr, tmpsiz); + + if (rc == 0) + *lenp = len; + + return rc; +} + +struct lnet_portal_rotors { + int pr_value; + const char *pr_name; + const char *pr_desc; +}; + +static struct lnet_portal_rotors portal_rotors[] = { + { + .pr_value = LNET_PTL_ROTOR_OFF, + .pr_name = "OFF", + .pr_desc = "Turn off message rotor for wildcard portals" + }, + { + .pr_value = LNET_PTL_ROTOR_ON, + .pr_name = "ON", + .pr_desc = "round-robin dispatch all PUT messages for wildcard portals" + }, + { + .pr_value = LNET_PTL_ROTOR_RR_RT, + .pr_name = "RR_RT", + .pr_desc = "round-robin dispatch routed PUT message for wildcard portals" + }, + { + .pr_value = LNET_PTL_ROTOR_HASH_RT, + .pr_name = "HASH_RT", + .pr_desc = "dispatch routed PUT message by hashing source NID for wildcard portals" + }, + { + .pr_value = -1, + .pr_name = NULL, + .pr_desc = NULL + }, +}; + +extern int portal_rotor; + +static int __proc_lnet_portal_rotor(void *data, int write, + loff_t pos, void __user *buffer, int nob) +{ + const int buf_len = 128; + char *buf; + char *tmp; + int rc; + int i; + + LIBCFS_ALLOC(buf, buf_len); + if (buf == NULL) + return -ENOMEM; + + if (!write) { + lnet_res_lock(0); + + for (i = 0; portal_rotors[i].pr_value >= 0; i++) { + if (portal_rotors[i].pr_value == portal_rotor) + break; + } + + LASSERT(portal_rotors[i].pr_value == portal_rotor); + lnet_res_unlock(0); + + rc = snprintf(buf, buf_len, + "{\n\tportals: all\n" + "\trotor: %s\n\tdescription: %s\n}", + portal_rotors[i].pr_name, + portal_rotors[i].pr_desc); + + if (pos >= min_t(int, rc, buf_len)) { + rc = 0; + } else { + rc = cfs_trace_copyout_string(buffer, nob, + buf + pos, "\n"); + } + goto out; + } + + rc = cfs_trace_copyin_string(buf, buf_len, buffer, nob); + if (rc < 0) + goto out; + + tmp = cfs_trimwhite(buf); + + rc = -EINVAL; + lnet_res_lock(0); + for (i = 0; portal_rotors[i].pr_name != NULL; i++) { + if (strncasecmp(portal_rotors[i].pr_name, tmp, + strlen(portal_rotors[i].pr_name)) == 0) { + portal_rotor = portal_rotors[i].pr_value; + rc = 0; + break; + } + } + lnet_res_unlock(0); +out: + 
LIBCFS_FREE(buf, buf_len); + return rc; +} + +static int proc_lnet_portal_rotor(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + return proc_call_handler(table->data, write, ppos, buffer, lenp, + __proc_lnet_portal_rotor); +} + +static struct ctl_table lnet_table[] = { + /* + * NB No .strategy entries have been provided since sysctl(8) prefers + * to go via /proc for portability. + */ + { + .procname = "stats", + .mode = 0644, + .proc_handler = &proc_lnet_stats, + }, + { + .procname = "routes", + .mode = 0444, + .proc_handler = &proc_lnet_routes, + }, + { + .procname = "routers", + .mode = 0444, + .proc_handler = &proc_lnet_routers, + }, + { + .procname = "peers", + .mode = 0444, + .proc_handler = &proc_lnet_peers, + }, + { + .procname = "buffers", + .mode = 0444, + .proc_handler = &proc_lnet_buffers, + }, + { + .procname = "nis", + .mode = 0444, + .proc_handler = &proc_lnet_nis, + }, + { + .procname = "portal_rotor", + .mode = 0644, + .proc_handler = &proc_lnet_portal_rotor, + }, + { + } +}; + +static struct ctl_table top_table[] = { + { + .procname = "lnet", + .mode = 0555, + .data = NULL, + .maxlen = 0, + .child = lnet_table, + }, + { + } +}; + +void +lnet_proc_init(void) +{ + if (lnet_table_header == NULL) + lnet_table_header = register_sysctl_table(top_table); +} + +void +lnet_proc_fini(void) +{ + if (lnet_table_header != NULL) + unregister_sysctl_table(lnet_table_header); + + lnet_table_header = NULL; +} + +#else + +void +lnet_proc_init(void) +{ +} + +void +lnet_proc_fini(void) +{ +} + +#endif diff --git a/kernel/drivers/staging/lustre/lnet/selftest/Makefile b/kernel/drivers/staging/lustre/lnet/selftest/Makefile new file mode 100644 index 000000000..c0de6e2d9 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/selftest/Makefile @@ -0,0 +1,4 @@ +obj-$(CONFIG_LNET_SELFTEST) := lnet_selftest.o + +lnet_selftest-y := console.o conrpc.o conctl.o framework.o timer.o rpc.o \ + module.o ping_test.o brw_test.o diff --git a/kernel/drivers/staging/lustre/lnet/selftest/brw_test.c b/kernel/drivers/staging/lustre/lnet/selftest/brw_test.c new file mode 100644 index 000000000..658f4584f --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/selftest/brw_test.c @@ -0,0 +1,508 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/selftest/brw_test.c + * + * Author: Isaac Huang + */ + +#include "selftest.h" + +static int brw_srv_workitems = SFW_TEST_WI_MAX; +module_param(brw_srv_workitems, int, 0644); +MODULE_PARM_DESC(brw_srv_workitems, "# BRW server workitems"); + +static int brw_inject_errors; +module_param(brw_inject_errors, int, 0644); +MODULE_PARM_DESC(brw_inject_errors, "# data errors to inject randomly, zero by default"); + +static void +brw_client_fini(sfw_test_instance_t *tsi) +{ + srpc_bulk_t *bulk; + sfw_test_unit_t *tsu; + + LASSERT(tsi->tsi_is_client); + + list_for_each_entry(tsu, &tsi->tsi_units, tsu_list) { + bulk = tsu->tsu_private; + if (bulk == NULL) + continue; + + srpc_free_bulk(bulk); + tsu->tsu_private = NULL; + } +} + +static int +brw_client_init(sfw_test_instance_t *tsi) +{ + sfw_session_t *sn = tsi->tsi_batch->bat_session; + int flags; + int npg; + int len; + int opc; + srpc_bulk_t *bulk; + sfw_test_unit_t *tsu; + + LASSERT(sn != NULL); + LASSERT(tsi->tsi_is_client); + + if ((sn->sn_features & LST_FEAT_BULK_LEN) == 0) { + test_bulk_req_t *breq = &tsi->tsi_u.bulk_v0; + + opc = breq->blk_opc; + flags = breq->blk_flags; + npg = breq->blk_npg; + /* NB: this is not going to work for variable page size, + * but we have to keep it for compatibility */ + len = npg * PAGE_CACHE_SIZE; + + } else { + test_bulk_req_v1_t *breq = &tsi->tsi_u.bulk_v1; + + /* I should never get this step if it's unknown feature + * because make_session will reject unknown feature */ + LASSERT((sn->sn_features & ~LST_FEATS_MASK) == 0); + + opc = breq->blk_opc; + flags = breq->blk_flags; + len = breq->blk_len; + npg = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + } + + if (npg > LNET_MAX_IOV || npg <= 0) + return -EINVAL; + + if (opc != LST_BRW_READ && opc != LST_BRW_WRITE) + return -EINVAL; + + if (flags != LST_BRW_CHECK_NONE && + flags != LST_BRW_CHECK_FULL && flags != LST_BRW_CHECK_SIMPLE) + return -EINVAL; + + list_for_each_entry(tsu, &tsi->tsi_units, tsu_list) { + bulk = srpc_alloc_bulk(lnet_cpt_of_nid(tsu->tsu_dest.nid), + npg, len, opc == LST_BRW_READ); + if (bulk == NULL) { + brw_client_fini(tsi); + return -ENOMEM; + } + + tsu->tsu_private = bulk; + } + + return 0; +} + +#define BRW_POISON 0xbeefbeefbeefbeefULL +#define BRW_MAGIC 0xeeb0eeb1eeb2eeb3ULL +#define BRW_MSIZE sizeof(__u64) + +static int +brw_inject_one_error(void) +{ + struct timeval tv; + + if (brw_inject_errors <= 0) + return 0; + + do_gettimeofday(&tv); + + if ((tv.tv_usec & 1) == 0) + return 0; + + return brw_inject_errors--; +} + +static void +brw_fill_page(struct page *pg, int pattern, __u64 magic) +{ + char *addr = page_address(pg); + int i; + + LASSERT(addr != NULL); + + if (pattern == LST_BRW_CHECK_NONE) + return; + + if (magic == BRW_MAGIC) + magic += brw_inject_one_error(); + + if (pattern == LST_BRW_CHECK_SIMPLE) { + memcpy(addr, &magic, BRW_MSIZE); + addr += PAGE_CACHE_SIZE - BRW_MSIZE; + memcpy(addr, &magic, BRW_MSIZE); + return; + } + + if (pattern == LST_BRW_CHECK_FULL) { + for (i = 0; i < PAGE_CACHE_SIZE / BRW_MSIZE; i++) + memcpy(addr + i * BRW_MSIZE, &magic, BRW_MSIZE); + return; + } + + LBUG(); +} + +static int +brw_check_page(struct page *pg, int pattern, __u64 magic) +{ + char *addr = page_address(pg); + __u64 data = 0; /* make compiler happy */ + int i; + + LASSERT(addr != NULL); + + if (pattern == LST_BRW_CHECK_NONE) + return 0; + + if (pattern == LST_BRW_CHECK_SIMPLE) { + data = *((__u64 *) addr); + 
if (data != magic) + goto bad_data; + + addr += PAGE_CACHE_SIZE - BRW_MSIZE; + data = *((__u64 *) addr); + if (data != magic) + goto bad_data; + + return 0; + } + + if (pattern == LST_BRW_CHECK_FULL) { + for (i = 0; i < PAGE_CACHE_SIZE / BRW_MSIZE; i++) { + data = *(((__u64 *) addr) + i); + if (data != magic) + goto bad_data; + } + + return 0; + } + + LBUG(); + +bad_data: + CERROR("Bad data in page %p: %#llx, %#llx expected\n", + pg, data, magic); + return 1; +} + +static void +brw_fill_bulk(srpc_bulk_t *bk, int pattern, __u64 magic) +{ + int i; + struct page *pg; + + for (i = 0; i < bk->bk_niov; i++) { + pg = bk->bk_iovs[i].kiov_page; + brw_fill_page(pg, pattern, magic); + } +} + +static int +brw_check_bulk(srpc_bulk_t *bk, int pattern, __u64 magic) +{ + int i; + struct page *pg; + + for (i = 0; i < bk->bk_niov; i++) { + pg = bk->bk_iovs[i].kiov_page; + if (brw_check_page(pg, pattern, magic) != 0) { + CERROR("Bulk page %p (%d/%d) is corrupted!\n", + pg, i, bk->bk_niov); + return 1; + } + } + + return 0; +} + +static int +brw_client_prep_rpc(sfw_test_unit_t *tsu, + lnet_process_id_t dest, srpc_client_rpc_t **rpcpp) +{ + srpc_bulk_t *bulk = tsu->tsu_private; + sfw_test_instance_t *tsi = tsu->tsu_instance; + sfw_session_t *sn = tsi->tsi_batch->bat_session; + srpc_client_rpc_t *rpc; + srpc_brw_reqst_t *req; + int flags; + int npg; + int len; + int opc; + int rc; + + LASSERT(sn != NULL); + LASSERT(bulk != NULL); + + if ((sn->sn_features & LST_FEAT_BULK_LEN) == 0) { + test_bulk_req_t *breq = &tsi->tsi_u.bulk_v0; + + opc = breq->blk_opc; + flags = breq->blk_flags; + npg = breq->blk_npg; + len = npg * PAGE_CACHE_SIZE; + + } else { + test_bulk_req_v1_t *breq = &tsi->tsi_u.bulk_v1; + + /* I should never get this step if it's unknown feature + * because make_session will reject unknown feature */ + LASSERT((sn->sn_features & ~LST_FEATS_MASK) == 0); + + opc = breq->blk_opc; + flags = breq->blk_flags; + len = breq->blk_len; + npg = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + } + + rc = sfw_create_test_rpc(tsu, dest, sn->sn_features, npg, len, &rpc); + if (rc != 0) + return rc; + + memcpy(&rpc->crpc_bulk, bulk, offsetof(srpc_bulk_t, bk_iovs[npg])); + if (opc == LST_BRW_WRITE) + brw_fill_bulk(&rpc->crpc_bulk, flags, BRW_MAGIC); + else + brw_fill_bulk(&rpc->crpc_bulk, flags, BRW_POISON); + + req = &rpc->crpc_reqstmsg.msg_body.brw_reqst; + req->brw_flags = flags; + req->brw_rw = opc; + req->brw_len = len; + + *rpcpp = rpc; + return 0; +} + +static void +brw_client_done_rpc(sfw_test_unit_t *tsu, srpc_client_rpc_t *rpc) +{ + __u64 magic = BRW_MAGIC; + sfw_test_instance_t *tsi = tsu->tsu_instance; + sfw_session_t *sn = tsi->tsi_batch->bat_session; + srpc_msg_t *msg = &rpc->crpc_replymsg; + srpc_brw_reply_t *reply = &msg->msg_body.brw_reply; + srpc_brw_reqst_t *reqst = &rpc->crpc_reqstmsg.msg_body.brw_reqst; + + LASSERT(sn != NULL); + + if (rpc->crpc_status != 0) { + CERROR("BRW RPC to %s failed with %d\n", + libcfs_id2str(rpc->crpc_dest), rpc->crpc_status); + if (!tsi->tsi_stopping) /* rpc could have been aborted */ + atomic_inc(&sn->sn_brw_errors); + goto out; + } + + if (msg->msg_magic != SRPC_MSG_MAGIC) { + __swab64s(&magic); + __swab32s(&reply->brw_status); + } + + CDEBUG(reply->brw_status ? 
D_WARNING : D_NET, + "BRW RPC to %s finished with brw_status: %d\n", + libcfs_id2str(rpc->crpc_dest), reply->brw_status); + + if (reply->brw_status != 0) { + atomic_inc(&sn->sn_brw_errors); + rpc->crpc_status = -(int)reply->brw_status; + goto out; + } + + if (reqst->brw_rw == LST_BRW_WRITE) + goto out; + + if (brw_check_bulk(&rpc->crpc_bulk, reqst->brw_flags, magic) != 0) { + CERROR("Bulk data from %s is corrupted!\n", + libcfs_id2str(rpc->crpc_dest)); + atomic_inc(&sn->sn_brw_errors); + rpc->crpc_status = -EBADMSG; + } + +out: + return; +} + +static void +brw_server_rpc_done(srpc_server_rpc_t *rpc) +{ + srpc_bulk_t *blk = rpc->srpc_bulk; + + if (blk == NULL) + return; + + if (rpc->srpc_status != 0) + CERROR("Bulk transfer %s %s has failed: %d\n", + blk->bk_sink ? "from" : "to", + libcfs_id2str(rpc->srpc_peer), rpc->srpc_status); + else + CDEBUG(D_NET, "Transferred %d pages bulk data %s %s\n", + blk->bk_niov, blk->bk_sink ? "from" : "to", + libcfs_id2str(rpc->srpc_peer)); + + sfw_free_pages(rpc); +} + +static int +brw_bulk_ready(srpc_server_rpc_t *rpc, int status) +{ + __u64 magic = BRW_MAGIC; + srpc_brw_reply_t *reply = &rpc->srpc_replymsg.msg_body.brw_reply; + srpc_brw_reqst_t *reqst; + srpc_msg_t *reqstmsg; + + LASSERT(rpc->srpc_bulk != NULL); + LASSERT(rpc->srpc_reqstbuf != NULL); + + reqstmsg = &rpc->srpc_reqstbuf->buf_msg; + reqst = &reqstmsg->msg_body.brw_reqst; + + if (status != 0) { + CERROR("BRW bulk %s failed for RPC from %s: %d\n", + reqst->brw_rw == LST_BRW_READ ? "READ" : "WRITE", + libcfs_id2str(rpc->srpc_peer), status); + return -EIO; + } + + if (reqst->brw_rw == LST_BRW_READ) + return 0; + + if (reqstmsg->msg_magic != SRPC_MSG_MAGIC) + __swab64s(&magic); + + if (brw_check_bulk(rpc->srpc_bulk, reqst->brw_flags, magic) != 0) { + CERROR("Bulk data from %s is corrupted!\n", + libcfs_id2str(rpc->srpc_peer)); + reply->brw_status = EBADMSG; + } + + return 0; +} + +static int +brw_server_handle(struct srpc_server_rpc *rpc) +{ + struct srpc_service *sv = rpc->srpc_scd->scd_svc; + srpc_msg_t *replymsg = &rpc->srpc_replymsg; + srpc_msg_t *reqstmsg = &rpc->srpc_reqstbuf->buf_msg; + srpc_brw_reply_t *reply = &replymsg->msg_body.brw_reply; + srpc_brw_reqst_t *reqst = &reqstmsg->msg_body.brw_reqst; + int npg; + int rc; + + LASSERT(sv->sv_id == SRPC_SERVICE_BRW); + + if (reqstmsg->msg_magic != SRPC_MSG_MAGIC) { + LASSERT(reqstmsg->msg_magic == __swab32(SRPC_MSG_MAGIC)); + + __swab32s(&reqst->brw_rw); + __swab32s(&reqst->brw_len); + __swab32s(&reqst->brw_flags); + __swab64s(&reqst->brw_rpyid); + __swab64s(&reqst->brw_bulkid); + } + LASSERT(reqstmsg->msg_type == (__u32)srpc_service2request(sv->sv_id)); + + reply->brw_status = 0; + rpc->srpc_done = brw_server_rpc_done; + + if ((reqst->brw_rw != LST_BRW_READ && reqst->brw_rw != LST_BRW_WRITE) || + (reqst->brw_flags != LST_BRW_CHECK_NONE && + reqst->brw_flags != LST_BRW_CHECK_FULL && + reqst->brw_flags != LST_BRW_CHECK_SIMPLE)) { + reply->brw_status = EINVAL; + return 0; + } + + if ((reqstmsg->msg_ses_feats & ~LST_FEATS_MASK) != 0) { + replymsg->msg_ses_feats = LST_FEATS_MASK; + reply->brw_status = EPROTO; + return 0; + } + + if ((reqstmsg->msg_ses_feats & LST_FEAT_BULK_LEN) == 0) { + /* compat with old version */ + if ((reqst->brw_len & ~CFS_PAGE_MASK) != 0) { + reply->brw_status = EINVAL; + return 0; + } + npg = reqst->brw_len >> PAGE_CACHE_SHIFT; + + } else { + npg = (reqst->brw_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + } + + replymsg->msg_ses_feats = reqstmsg->msg_ses_feats; + + if (reqst->brw_len == 0 || npg > LNET_MAX_IOV) { + 
reply->brw_status = EINVAL; + return 0; + } + + rc = sfw_alloc_pages(rpc, rpc->srpc_scd->scd_cpt, npg, + reqst->brw_len, + reqst->brw_rw == LST_BRW_WRITE); + if (rc != 0) + return rc; + + if (reqst->brw_rw == LST_BRW_READ) + brw_fill_bulk(rpc->srpc_bulk, reqst->brw_flags, BRW_MAGIC); + else + brw_fill_bulk(rpc->srpc_bulk, reqst->brw_flags, BRW_POISON); + + return 0; +} + +sfw_test_client_ops_t brw_test_client; +void brw_init_test_client(void) +{ + brw_test_client.tso_init = brw_client_init; + brw_test_client.tso_fini = brw_client_fini; + brw_test_client.tso_prep_rpc = brw_client_prep_rpc; + brw_test_client.tso_done_rpc = brw_client_done_rpc; +}; + +srpc_service_t brw_test_service; +void brw_init_test_service(void) +{ + + brw_test_service.sv_id = SRPC_SERVICE_BRW; + brw_test_service.sv_name = "brw_test"; + brw_test_service.sv_handler = brw_server_handle; + brw_test_service.sv_bulk_ready = brw_bulk_ready; + brw_test_service.sv_wi_total = brw_srv_workitems; +} diff --git a/kernel/drivers/staging/lustre/lnet/selftest/conctl.c b/kernel/drivers/staging/lustre/lnet/selftest/conctl.c new file mode 100644 index 000000000..045fe295a --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/selftest/conctl.c @@ -0,0 +1,929 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lnet/selftest/conctl.c + * + * IOC handle in kernel + * + * Author: Liang Zhen + */ + +#include "../../include/linux/libcfs/libcfs.h" +#include "../../include/linux/lnet/lib-lnet.h" +#include "../../include/linux/lnet/lnetst.h" +#include "console.h" + +static int +lst_session_new_ioctl(lstio_session_new_args_t *args) +{ + char *name; + int rc; + + if (args->lstio_ses_idp == NULL || /* address for output sid */ + args->lstio_ses_key == 0 || /* no key is specified */ + args->lstio_ses_namep == NULL || /* session name */ + args->lstio_ses_nmlen <= 0 || + args->lstio_ses_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_ses_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, + args->lstio_ses_namep, + args->lstio_ses_nmlen)) { + LIBCFS_FREE(name, args->lstio_ses_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_ses_nmlen] = 0; + + rc = lstcon_session_new(name, + args->lstio_ses_key, + args->lstio_ses_feats, + args->lstio_ses_force, + args->lstio_ses_timeout, + args->lstio_ses_idp); + + LIBCFS_FREE(name, args->lstio_ses_nmlen + 1); + return rc; +} + +static int +lst_session_end_ioctl(lstio_session_end_args_t *args) +{ + if (args->lstio_ses_key != console_session.ses_key) + return -EACCES; + + return lstcon_session_end(); +} + +static int +lst_session_info_ioctl(lstio_session_info_args_t *args) +{ + /* no checking of key */ + + if (args->lstio_ses_idp == NULL || /* address for output sid */ + args->lstio_ses_keyp == NULL || /* address for output key */ + args->lstio_ses_featp == NULL || /* address for output features */ + args->lstio_ses_ndinfo == NULL || /* address for output ndinfo */ + args->lstio_ses_namep == NULL || /* address for output name */ + args->lstio_ses_nmlen <= 0 || + args->lstio_ses_nmlen > LST_NAME_SIZE) + return -EINVAL; + + return lstcon_session_info(args->lstio_ses_idp, + args->lstio_ses_keyp, + args->lstio_ses_featp, + args->lstio_ses_ndinfo, + args->lstio_ses_namep, + args->lstio_ses_nmlen); +} + +static int +lst_debug_ioctl(lstio_debug_args_t *args) +{ + char *name = NULL; + int client = 1; + int rc; + + if (args->lstio_dbg_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_dbg_resultp == NULL) + return -EINVAL; + + if (args->lstio_dbg_namep != NULL && /* name of batch/group */ + (args->lstio_dbg_nmlen <= 0 || + args->lstio_dbg_nmlen > LST_NAME_SIZE)) + return -EINVAL; + + if (args->lstio_dbg_namep != NULL) { + LIBCFS_ALLOC(name, args->lstio_dbg_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, args->lstio_dbg_namep, + args->lstio_dbg_nmlen)) { + LIBCFS_FREE(name, args->lstio_dbg_nmlen + 1); + + return -EFAULT; + } + + name[args->lstio_dbg_nmlen] = 0; + } + + rc = -EINVAL; + + switch (args->lstio_dbg_type) { + case LST_OPC_SESSION: + rc = lstcon_session_debug(args->lstio_dbg_timeout, + args->lstio_dbg_resultp); + break; + + case LST_OPC_BATCHSRV: + client = 0; + case LST_OPC_BATCHCLI: + if (name == NULL) + goto out; + + rc = lstcon_batch_debug(args->lstio_dbg_timeout, + name, client, args->lstio_dbg_resultp); + break; + + case LST_OPC_GROUP: + if (name == NULL) + goto out; + + rc = lstcon_group_debug(args->lstio_dbg_timeout, + name, args->lstio_dbg_resultp); + break; + + case LST_OPC_NODES: + if (args->lstio_dbg_count <= 0 || + args->lstio_dbg_idsp == NULL) + goto out; + + rc = lstcon_nodes_debug(args->lstio_dbg_timeout, + args->lstio_dbg_count, + args->lstio_dbg_idsp, + args->lstio_dbg_resultp); + break; + + default: + break; + } + +out: + if (name != NULL) + 
LIBCFS_FREE(name, args->lstio_dbg_nmlen + 1); + + return rc; +} + +static int +lst_group_add_ioctl(lstio_group_add_args_t *args) +{ + char *name; + int rc; + + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_grp_namep == NULL || + args->lstio_grp_nmlen <= 0 || + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, + args->lstio_grp_namep, + args->lstio_grp_nmlen)) { + LIBCFS_FREE(name, args->lstio_grp_nmlen); + return -EFAULT; + } + + name[args->lstio_grp_nmlen] = 0; + + rc = lstcon_group_add(name); + + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + + return rc; +} + +static int +lst_group_del_ioctl(lstio_group_del_args_t *args) +{ + int rc; + char *name; + + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_grp_namep == NULL || + args->lstio_grp_nmlen <= 0 || + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, + args->lstio_grp_namep, + args->lstio_grp_nmlen)) { + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_grp_nmlen] = 0; + + rc = lstcon_group_del(name); + + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + + return rc; +} + +static int +lst_group_update_ioctl(lstio_group_update_args_t *args) +{ + int rc; + char *name; + + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_grp_resultp == NULL || + args->lstio_grp_namep == NULL || + args->lstio_grp_nmlen <= 0 || + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, + args->lstio_grp_namep, + args->lstio_grp_nmlen)) { + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_grp_nmlen] = 0; + + switch (args->lstio_grp_opc) { + case LST_GROUP_CLEAN: + rc = lstcon_group_clean(name, args->lstio_grp_args); + break; + + case LST_GROUP_REFRESH: + rc = lstcon_group_refresh(name, args->lstio_grp_resultp); + break; + + case LST_GROUP_RMND: + if (args->lstio_grp_count <= 0 || + args->lstio_grp_idsp == NULL) { + rc = -EINVAL; + break; + } + rc = lstcon_nodes_remove(name, args->lstio_grp_count, + args->lstio_grp_idsp, + args->lstio_grp_resultp); + break; + + default: + rc = -EINVAL; + break; + } + + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + + return rc; +} + +static int +lst_nodes_add_ioctl(lstio_group_nodes_args_t *args) +{ + unsigned feats; + int rc; + char *name; + + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_grp_idsp == NULL || /* array of ids */ + args->lstio_grp_count <= 0 || + args->lstio_grp_resultp == NULL || + args->lstio_grp_featp == NULL || + args->lstio_grp_namep == NULL || + args->lstio_grp_nmlen <= 0 || + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, args->lstio_grp_namep, + args->lstio_grp_nmlen)) { + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + + return -EFAULT; + } + + name[args->lstio_grp_nmlen] = 0; + + rc = lstcon_nodes_add(name, args->lstio_grp_count, + args->lstio_grp_idsp, &feats, + args->lstio_grp_resultp); + + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + if (rc == 0 && + 
copy_to_user(args->lstio_grp_featp, &feats, sizeof(feats))) { + return -EINVAL; + } + + return rc; +} + +static int +lst_group_list_ioctl(lstio_group_list_args_t *args) +{ + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_grp_idx < 0 || + args->lstio_grp_namep == NULL || + args->lstio_grp_nmlen <= 0 || + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; + + return lstcon_group_list(args->lstio_grp_idx, + args->lstio_grp_nmlen, + args->lstio_grp_namep); +} + +static int +lst_group_info_ioctl(lstio_group_info_args_t *args) +{ + char *name; + int ndent; + int index; + int rc; + + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_grp_namep == NULL || + args->lstio_grp_nmlen <= 0 || + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; + + if (args->lstio_grp_entp == NULL && /* output: group entry */ + args->lstio_grp_dentsp == NULL) /* output: node entry */ + return -EINVAL; + + if (args->lstio_grp_dentsp != NULL) { /* have node entry */ + if (args->lstio_grp_idxp == NULL || /* node index */ + args->lstio_grp_ndentp == NULL) /* # of node entry */ + return -EINVAL; + + if (copy_from_user(&ndent, args->lstio_grp_ndentp, + sizeof(ndent)) || + copy_from_user(&index, args->lstio_grp_idxp, + sizeof(index))) + return -EFAULT; + + if (ndent <= 0 || index < 0) + return -EINVAL; + } + + LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, + args->lstio_grp_namep, + args->lstio_grp_nmlen)) { + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_grp_nmlen] = 0; + + rc = lstcon_group_info(name, args->lstio_grp_entp, + &index, &ndent, args->lstio_grp_dentsp); + + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + + if (rc != 0) + return rc; + + if (args->lstio_grp_dentsp != NULL && + (copy_to_user(args->lstio_grp_idxp, &index, sizeof(index)) || + copy_to_user(args->lstio_grp_ndentp, &ndent, sizeof(ndent)))) + rc = -EFAULT; + + return 0; +} + +static int +lst_batch_add_ioctl(lstio_batch_add_args_t *args) +{ + int rc; + char *name; + + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_bat_namep == NULL || + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, + args->lstio_bat_namep, + args->lstio_bat_nmlen)) { + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_bat_nmlen] = 0; + + rc = lstcon_batch_add(name); + + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + + return rc; +} + +static int +lst_batch_run_ioctl(lstio_batch_run_args_t *args) +{ + int rc; + char *name; + + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_bat_namep == NULL || + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, + args->lstio_bat_namep, + args->lstio_bat_nmlen)) { + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_bat_nmlen] = 0; + + rc = lstcon_batch_run(name, args->lstio_bat_timeout, + args->lstio_bat_resultp); + + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + + return rc; +} + +static int +lst_batch_stop_ioctl(lstio_batch_stop_args_t *args) +{ + int rc; + char *name; + + if 
(args->lstio_bat_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_bat_resultp == NULL || + args->lstio_bat_namep == NULL || + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, + args->lstio_bat_namep, + args->lstio_bat_nmlen)) { + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_bat_nmlen] = 0; + + rc = lstcon_batch_stop(name, args->lstio_bat_force, + args->lstio_bat_resultp); + + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + + return rc; +} + +static int +lst_batch_query_ioctl(lstio_batch_query_args_t *args) +{ + char *name; + int rc; + + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_bat_resultp == NULL || + args->lstio_bat_namep == NULL || + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; + + if (args->lstio_bat_testidx < 0) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, + args->lstio_bat_namep, + args->lstio_bat_nmlen)) { + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_bat_nmlen] = 0; + + rc = lstcon_test_batch_query(name, + args->lstio_bat_testidx, + args->lstio_bat_client, + args->lstio_bat_timeout, + args->lstio_bat_resultp); + + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + + return rc; +} + +static int +lst_batch_list_ioctl(lstio_batch_list_args_t *args) +{ + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_bat_idx < 0 || + args->lstio_bat_namep == NULL || + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; + + return lstcon_batch_list(args->lstio_bat_idx, + args->lstio_bat_nmlen, + args->lstio_bat_namep); +} + +static int +lst_batch_info_ioctl(lstio_batch_info_args_t *args) +{ + char *name; + int rc; + int index; + int ndent; + + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_bat_namep == NULL || /* batch name */ + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; + + if (args->lstio_bat_entp == NULL && /* output: batch entry */ + args->lstio_bat_dentsp == NULL) /* output: node entry */ + return -EINVAL; + + if (args->lstio_bat_dentsp != NULL) { /* have node entry */ + if (args->lstio_bat_idxp == NULL || /* node index */ + args->lstio_bat_ndentp == NULL) /* # of node entry */ + return -EINVAL; + + if (copy_from_user(&index, args->lstio_bat_idxp, + sizeof(index)) || + copy_from_user(&ndent, args->lstio_bat_ndentp, + sizeof(ndent))) + return -EFAULT; + + if (ndent <= 0 || index < 0) + return -EINVAL; + } + + LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, + args->lstio_bat_namep, args->lstio_bat_nmlen)) { + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_bat_nmlen] = 0; + + rc = lstcon_batch_info(name, + args->lstio_bat_entp, args->lstio_bat_server, + args->lstio_bat_testidx, &index, &ndent, + args->lstio_bat_dentsp); + + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + + if (rc != 0) + return rc; + + if (args->lstio_bat_dentsp != NULL && + (copy_to_user(args->lstio_bat_idxp, &index, sizeof(index)) || + copy_to_user(args->lstio_bat_ndentp, &ndent, sizeof(ndent)))) + rc = -EFAULT; + + 
return rc; +} + +static int +lst_stat_query_ioctl(lstio_stat_args_t *args) +{ + int rc; + char *name; + + /* TODO: not finished */ + if (args->lstio_sta_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_sta_resultp == NULL || + (args->lstio_sta_namep == NULL && + args->lstio_sta_idsp == NULL) || + args->lstio_sta_nmlen <= 0 || + args->lstio_sta_nmlen > LST_NAME_SIZE) + return -EINVAL; + + if (args->lstio_sta_idsp != NULL && + args->lstio_sta_count <= 0) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_sta_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, args->lstio_sta_namep, + args->lstio_sta_nmlen)) { + LIBCFS_FREE(name, args->lstio_sta_nmlen + 1); + return -EFAULT; + } + + if (args->lstio_sta_idsp == NULL) { + rc = lstcon_group_stat(name, args->lstio_sta_timeout, + args->lstio_sta_resultp); + } else { + rc = lstcon_nodes_stat(args->lstio_sta_count, + args->lstio_sta_idsp, + args->lstio_sta_timeout, + args->lstio_sta_resultp); + } + + LIBCFS_FREE(name, args->lstio_sta_nmlen + 1); + + return rc; +} + +static int lst_test_add_ioctl(lstio_test_args_t *args) +{ + char *batch_name; + char *src_name = NULL; + char *dst_name = NULL; + void *param = NULL; + int ret = 0; + int rc = -ENOMEM; + + if (args->lstio_tes_resultp == NULL || + args->lstio_tes_retp == NULL || + args->lstio_tes_bat_name == NULL || /* no specified batch */ + args->lstio_tes_bat_nmlen <= 0 || + args->lstio_tes_bat_nmlen > LST_NAME_SIZE || + args->lstio_tes_sgrp_name == NULL || /* no source group */ + args->lstio_tes_sgrp_nmlen <= 0 || + args->lstio_tes_sgrp_nmlen > LST_NAME_SIZE || + args->lstio_tes_dgrp_name == NULL || /* no target group */ + args->lstio_tes_dgrp_nmlen <= 0 || + args->lstio_tes_dgrp_nmlen > LST_NAME_SIZE) + return -EINVAL; + + if (args->lstio_tes_loop == 0 || /* negative is infinite */ + args->lstio_tes_concur <= 0 || + args->lstio_tes_dist <= 0 || + args->lstio_tes_span <= 0) + return -EINVAL; + + /* have parameter, check if parameter length is valid */ + if (args->lstio_tes_param != NULL && + (args->lstio_tes_param_len <= 0 || + args->lstio_tes_param_len > PAGE_CACHE_SIZE - sizeof(lstcon_test_t))) + return -EINVAL; + + LIBCFS_ALLOC(batch_name, args->lstio_tes_bat_nmlen + 1); + if (batch_name == NULL) + return rc; + + LIBCFS_ALLOC(src_name, args->lstio_tes_sgrp_nmlen + 1); + if (src_name == NULL) + goto out; + + LIBCFS_ALLOC(dst_name, args->lstio_tes_dgrp_nmlen + 1); + if (dst_name == NULL) + goto out; + + if (args->lstio_tes_param != NULL) { + LIBCFS_ALLOC(param, args->lstio_tes_param_len); + if (param == NULL) + goto out; + } + + rc = -EFAULT; + if (copy_from_user(batch_name, args->lstio_tes_bat_name, + args->lstio_tes_bat_nmlen) || + copy_from_user(src_name, args->lstio_tes_sgrp_name, + args->lstio_tes_sgrp_nmlen) || + copy_from_user(dst_name, args->lstio_tes_dgrp_name, + args->lstio_tes_dgrp_nmlen) || + copy_from_user(param, args->lstio_tes_param, + args->lstio_tes_param_len)) + goto out; + + rc = lstcon_test_add(batch_name, + args->lstio_tes_type, + args->lstio_tes_loop, + args->lstio_tes_concur, + args->lstio_tes_dist, args->lstio_tes_span, + src_name, dst_name, param, + args->lstio_tes_param_len, + &ret, args->lstio_tes_resultp); + + if (ret != 0) + rc = (copy_to_user(args->lstio_tes_retp, &ret, + sizeof(ret))) ? 
-EFAULT : 0; +out: + if (batch_name != NULL) + LIBCFS_FREE(batch_name, args->lstio_tes_bat_nmlen + 1); + + if (src_name != NULL) + LIBCFS_FREE(src_name, args->lstio_tes_sgrp_nmlen + 1); + + if (dst_name != NULL) + LIBCFS_FREE(dst_name, args->lstio_tes_dgrp_nmlen + 1); + + if (param != NULL) + LIBCFS_FREE(param, args->lstio_tes_param_len); + + return rc; +} + +int +lstcon_ioctl_entry(unsigned int cmd, struct libcfs_ioctl_data *data) +{ + char *buf; + int opc = data->ioc_u32[0]; + int rc; + + if (cmd != IOC_LIBCFS_LNETST) + return -EINVAL; + + if (data->ioc_plen1 > PAGE_CACHE_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(buf, data->ioc_plen1); + if (buf == NULL) + return -ENOMEM; + + /* copy in parameter */ + if (copy_from_user(buf, data->ioc_pbuf1, data->ioc_plen1)) { + LIBCFS_FREE(buf, data->ioc_plen1); + return -EFAULT; + } + + mutex_lock(&console_session.ses_mutex); + + console_session.ses_laststamp = get_seconds(); + + if (console_session.ses_shutdown) { + rc = -ESHUTDOWN; + goto out; + } + + if (console_session.ses_expired) + lstcon_session_end(); + + if (opc != LSTIO_SESSION_NEW && + console_session.ses_state == LST_SESSION_NONE) { + CDEBUG(D_NET, "LST no active session\n"); + rc = -ESRCH; + goto out; + } + + memset(&console_session.ses_trans_stat, 0, sizeof(lstcon_trans_stat_t)); + + switch (opc) { + case LSTIO_SESSION_NEW: + rc = lst_session_new_ioctl((lstio_session_new_args_t *)buf); + break; + case LSTIO_SESSION_END: + rc = lst_session_end_ioctl((lstio_session_end_args_t *)buf); + break; + case LSTIO_SESSION_INFO: + rc = lst_session_info_ioctl((lstio_session_info_args_t *)buf); + break; + case LSTIO_DEBUG: + rc = lst_debug_ioctl((lstio_debug_args_t *)buf); + break; + case LSTIO_GROUP_ADD: + rc = lst_group_add_ioctl((lstio_group_add_args_t *)buf); + break; + case LSTIO_GROUP_DEL: + rc = lst_group_del_ioctl((lstio_group_del_args_t *)buf); + break; + case LSTIO_GROUP_UPDATE: + rc = lst_group_update_ioctl((lstio_group_update_args_t *)buf); + break; + case LSTIO_NODES_ADD: + rc = lst_nodes_add_ioctl((lstio_group_nodes_args_t *)buf); + break; + case LSTIO_GROUP_LIST: + rc = lst_group_list_ioctl((lstio_group_list_args_t *)buf); + break; + case LSTIO_GROUP_INFO: + rc = lst_group_info_ioctl((lstio_group_info_args_t *)buf); + break; + case LSTIO_BATCH_ADD: + rc = lst_batch_add_ioctl((lstio_batch_add_args_t *)buf); + break; + case LSTIO_BATCH_START: + rc = lst_batch_run_ioctl((lstio_batch_run_args_t *)buf); + break; + case LSTIO_BATCH_STOP: + rc = lst_batch_stop_ioctl((lstio_batch_stop_args_t *)buf); + break; + case LSTIO_BATCH_QUERY: + rc = lst_batch_query_ioctl((lstio_batch_query_args_t *)buf); + break; + case LSTIO_BATCH_LIST: + rc = lst_batch_list_ioctl((lstio_batch_list_args_t *)buf); + break; + case LSTIO_BATCH_INFO: + rc = lst_batch_info_ioctl((lstio_batch_info_args_t *)buf); + break; + case LSTIO_TEST_ADD: + rc = lst_test_add_ioctl((lstio_test_args_t *)buf); + break; + case LSTIO_STAT_QUERY: + rc = lst_stat_query_ioctl((lstio_stat_args_t *)buf); + break; + default: + rc = -EINVAL; + } + + if (copy_to_user(data->ioc_pbuf2, &console_session.ses_trans_stat, + sizeof(lstcon_trans_stat_t))) + rc = -EFAULT; +out: + mutex_unlock(&console_session.ses_mutex); + + LIBCFS_FREE(buf, data->ioc_plen1); + + return rc; +} + +EXPORT_SYMBOL(lstcon_ioctl_entry); diff --git a/kernel/drivers/staging/lustre/lnet/selftest/conrpc.c b/kernel/drivers/staging/lustre/lnet/selftest/conrpc.c new file mode 100644 index 000000000..77f02b761 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/selftest/conrpc.c @@ -0,0 
+1,1396 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/selftest/conctl.c + * + * Console framework rpcs + * + * Author: Liang Zhen + */ + + +#include "../../include/linux/libcfs/libcfs.h" +#include "../../include/linux/lnet/lib-lnet.h" +#include "timer.h" +#include "conrpc.h" +#include "console.h" + +void lstcon_rpc_stat_reply(lstcon_rpc_trans_t *, srpc_msg_t *, + lstcon_node_t *, lstcon_trans_stat_t *); + +static void +lstcon_rpc_done(srpc_client_rpc_t *rpc) +{ + lstcon_rpc_t *crpc = (lstcon_rpc_t *)rpc->crpc_priv; + + LASSERT(crpc != NULL && rpc == crpc->crp_rpc); + LASSERT(crpc->crp_posted && !crpc->crp_finished); + + spin_lock(&rpc->crpc_lock); + + if (crpc->crp_trans == NULL) { + /* Orphan RPC is not in any transaction, + * I'm just a poor body and nobody loves me */ + spin_unlock(&rpc->crpc_lock); + + /* release it */ + lstcon_rpc_put(crpc); + return; + } + + /* not an orphan RPC */ + crpc->crp_finished = 1; + + if (crpc->crp_stamp == 0) { + /* not aborted */ + LASSERT(crpc->crp_status == 0); + + crpc->crp_stamp = cfs_time_current(); + crpc->crp_status = rpc->crpc_status; + } + + /* wakeup (transaction)thread if I'm the last RPC in the transaction */ + if (atomic_dec_and_test(&crpc->crp_trans->tas_remaining)) + wake_up(&crpc->crp_trans->tas_waitq); + + spin_unlock(&rpc->crpc_lock); +} + +static int +lstcon_rpc_init(lstcon_node_t *nd, int service, unsigned feats, + int bulk_npg, int bulk_len, int embedded, lstcon_rpc_t *crpc) +{ + crpc->crp_rpc = sfw_create_rpc(nd->nd_id, service, + feats, bulk_npg, bulk_len, + lstcon_rpc_done, (void *)crpc); + if (crpc->crp_rpc == NULL) + return -ENOMEM; + + crpc->crp_trans = NULL; + crpc->crp_node = nd; + crpc->crp_posted = 0; + crpc->crp_finished = 0; + crpc->crp_unpacked = 0; + crpc->crp_status = 0; + crpc->crp_stamp = 0; + crpc->crp_embedded = embedded; + INIT_LIST_HEAD(&crpc->crp_link); + + atomic_inc(&console_session.ses_rpc_counter); + + return 0; +} + +static int +lstcon_rpc_prep(lstcon_node_t *nd, int service, unsigned feats, + int bulk_npg, int bulk_len, lstcon_rpc_t **crpcpp) +{ + lstcon_rpc_t *crpc = NULL; + int rc; + + spin_lock(&console_session.ses_rpc_lock); + + if (!list_empty(&console_session.ses_rpc_freelist)) { + crpc = list_entry(console_session.ses_rpc_freelist.next, 
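+ /* Console RPC descriptors are recycled: lstcon_rpc_prep() first tries to
+ * reuse one from ses_rpc_freelist (under ses_rpc_lock) and only falls
+ * back to LIBCFS_ALLOC() when the list is empty.  lstcon_rpc_put() later
+ * returns ordinary descriptors to that freelist, while embedded ones
+ * (the per-node ping RPC) are simply zeroed in place; ses_rpc_counter
+ * tracks how many are alive so shutdown can wait for them. */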
+ lstcon_rpc_t, crp_link); + list_del_init(&crpc->crp_link); + } + + spin_unlock(&console_session.ses_rpc_lock); + + if (crpc == NULL) { + LIBCFS_ALLOC(crpc, sizeof(*crpc)); + if (crpc == NULL) + return -ENOMEM; + } + + rc = lstcon_rpc_init(nd, service, feats, bulk_npg, bulk_len, 0, crpc); + if (rc == 0) { + *crpcpp = crpc; + return 0; + } + + LIBCFS_FREE(crpc, sizeof(*crpc)); + + return rc; +} + +void +lstcon_rpc_put(lstcon_rpc_t *crpc) +{ + srpc_bulk_t *bulk = &crpc->crp_rpc->crpc_bulk; + int i; + + LASSERT(list_empty(&crpc->crp_link)); + + for (i = 0; i < bulk->bk_niov; i++) { + if (bulk->bk_iovs[i].kiov_page == NULL) + continue; + + __free_page(bulk->bk_iovs[i].kiov_page); + } + + srpc_client_rpc_decref(crpc->crp_rpc); + + if (crpc->crp_embedded) { + /* embedded RPC, don't recycle it */ + memset(crpc, 0, sizeof(*crpc)); + crpc->crp_embedded = 1; + + } else { + spin_lock(&console_session.ses_rpc_lock); + + list_add(&crpc->crp_link, + &console_session.ses_rpc_freelist); + + spin_unlock(&console_session.ses_rpc_lock); + } + + /* RPC is not alive now */ + atomic_dec(&console_session.ses_rpc_counter); +} + +static void +lstcon_rpc_post(lstcon_rpc_t *crpc) +{ + lstcon_rpc_trans_t *trans = crpc->crp_trans; + + LASSERT(trans != NULL); + + atomic_inc(&trans->tas_remaining); + crpc->crp_posted = 1; + + sfw_post_rpc(crpc->crp_rpc); +} + +static char * +lstcon_rpc_trans_name(int transop) +{ + if (transop == LST_TRANS_SESNEW) + return "SESNEW"; + + if (transop == LST_TRANS_SESEND) + return "SESEND"; + + if (transop == LST_TRANS_SESQRY) + return "SESQRY"; + + if (transop == LST_TRANS_SESPING) + return "SESPING"; + + if (transop == LST_TRANS_TSBCLIADD) + return "TSBCLIADD"; + + if (transop == LST_TRANS_TSBSRVADD) + return "TSBSRVADD"; + + if (transop == LST_TRANS_TSBRUN) + return "TSBRUN"; + + if (transop == LST_TRANS_TSBSTOP) + return "TSBSTOP"; + + if (transop == LST_TRANS_TSBCLIQRY) + return "TSBCLIQRY"; + + if (transop == LST_TRANS_TSBSRVQRY) + return "TSBSRVQRY"; + + if (transop == LST_TRANS_STATQRY) + return "STATQRY"; + + return "Unknown"; +} + +int +lstcon_rpc_trans_prep(struct list_head *translist, + int transop, lstcon_rpc_trans_t **transpp) +{ + lstcon_rpc_trans_t *trans; + + if (translist != NULL) { + list_for_each_entry(trans, translist, tas_link) { + /* Can't enqueue two private transaction on + * the same object */ + if ((trans->tas_opc & transop) == LST_TRANS_PRIVATE) + return -EPERM; + } + } + + /* create a trans group */ + LIBCFS_ALLOC(trans, sizeof(*trans)); + if (trans == NULL) + return -ENOMEM; + + trans->tas_opc = transop; + + if (translist == NULL) + INIT_LIST_HEAD(&trans->tas_olink); + else + list_add_tail(&trans->tas_olink, translist); + + list_add_tail(&trans->tas_link, &console_session.ses_trans_list); + + INIT_LIST_HEAD(&trans->tas_rpcs_list); + atomic_set(&trans->tas_remaining, 0); + init_waitqueue_head(&trans->tas_waitq); + + spin_lock(&console_session.ses_rpc_lock); + trans->tas_features = console_session.ses_features; + spin_unlock(&console_session.ses_rpc_lock); + + *transpp = trans; + return 0; +} + +void +lstcon_rpc_trans_addreq(lstcon_rpc_trans_t *trans, lstcon_rpc_t *crpc) +{ + list_add_tail(&crpc->crp_link, &trans->tas_rpcs_list); + crpc->crp_trans = trans; +} + +void +lstcon_rpc_trans_abort(lstcon_rpc_trans_t *trans, int error) +{ + srpc_client_rpc_t *rpc; + lstcon_rpc_t *crpc; + lstcon_node_t *nd; + + list_for_each_entry(crpc, &trans->tas_rpcs_list, crp_link) { + rpc = crpc->crp_rpc; + + spin_lock(&rpc->crpc_lock); + + if (!crpc->crp_posted || /* not posted */ 
+ crpc->crp_stamp != 0) { /* rpc done or aborted already */ + if (crpc->crp_stamp == 0) { + crpc->crp_stamp = cfs_time_current(); + crpc->crp_status = -EINTR; + } + spin_unlock(&rpc->crpc_lock); + continue; + } + + crpc->crp_stamp = cfs_time_current(); + crpc->crp_status = error; + + spin_unlock(&rpc->crpc_lock); + + sfw_abort_rpc(rpc); + + if (error != ETIMEDOUT) + continue; + + nd = crpc->crp_node; + if (cfs_time_after(nd->nd_stamp, crpc->crp_stamp)) + continue; + + nd->nd_stamp = crpc->crp_stamp; + nd->nd_state = LST_NODE_DOWN; + } +} + +static int +lstcon_rpc_trans_check(lstcon_rpc_trans_t *trans) +{ + if (console_session.ses_shutdown && + !list_empty(&trans->tas_olink)) /* Not an end session RPC */ + return 1; + + return (atomic_read(&trans->tas_remaining) == 0) ? 1 : 0; +} + +int +lstcon_rpc_trans_postwait(lstcon_rpc_trans_t *trans, int timeout) +{ + lstcon_rpc_t *crpc; + int rc; + + if (list_empty(&trans->tas_rpcs_list)) + return 0; + + if (timeout < LST_TRANS_MIN_TIMEOUT) + timeout = LST_TRANS_MIN_TIMEOUT; + + CDEBUG(D_NET, "Transaction %s started\n", + lstcon_rpc_trans_name(trans->tas_opc)); + + /* post all requests */ + list_for_each_entry(crpc, &trans->tas_rpcs_list, crp_link) { + LASSERT(!crpc->crp_posted); + + lstcon_rpc_post(crpc); + } + + mutex_unlock(&console_session.ses_mutex); + + rc = wait_event_interruptible_timeout(trans->tas_waitq, + lstcon_rpc_trans_check(trans), + cfs_time_seconds(timeout)); + rc = (rc > 0) ? 0 : ((rc < 0) ? -EINTR : -ETIMEDOUT); + + mutex_lock(&console_session.ses_mutex); + + if (console_session.ses_shutdown) + rc = -ESHUTDOWN; + + if (rc != 0 || atomic_read(&trans->tas_remaining) != 0) { + /* treat short timeout as canceled */ + if (rc == -ETIMEDOUT && timeout < LST_TRANS_MIN_TIMEOUT * 2) + rc = -EINTR; + + lstcon_rpc_trans_abort(trans, rc); + } + + CDEBUG(D_NET, "Transaction %s stopped: %d\n", + lstcon_rpc_trans_name(trans->tas_opc), rc); + + lstcon_rpc_trans_stat(trans, lstcon_trans_stat()); + + return rc; +} + +static int +lstcon_rpc_get_reply(lstcon_rpc_t *crpc, srpc_msg_t **msgpp) +{ + lstcon_node_t *nd = crpc->crp_node; + srpc_client_rpc_t *rpc = crpc->crp_rpc; + srpc_generic_reply_t *rep; + + LASSERT(nd != NULL && rpc != NULL); + LASSERT(crpc->crp_stamp != 0); + + if (crpc->crp_status != 0) { + *msgpp = NULL; + return crpc->crp_status; + } + + *msgpp = &rpc->crpc_replymsg; + if (!crpc->crp_unpacked) { + sfw_unpack_message(*msgpp); + crpc->crp_unpacked = 1; + } + + if (cfs_time_after(nd->nd_stamp, crpc->crp_stamp)) + return 0; + + nd->nd_stamp = crpc->crp_stamp; + rep = &(*msgpp)->msg_body.reply; + + if (rep->sid.ses_nid == LNET_NID_ANY) + nd->nd_state = LST_NODE_UNKNOWN; + else if (lstcon_session_match(rep->sid)) + nd->nd_state = LST_NODE_ACTIVE; + else + nd->nd_state = LST_NODE_BUSY; + + return 0; +} + +void +lstcon_rpc_trans_stat(lstcon_rpc_trans_t *trans, lstcon_trans_stat_t *stat) +{ + lstcon_rpc_t *crpc; + srpc_msg_t *rep; + int error; + + LASSERT(stat != NULL); + + memset(stat, 0, sizeof(*stat)); + + list_for_each_entry(crpc, &trans->tas_rpcs_list, crp_link) { + lstcon_rpc_stat_total(stat, 1); + + LASSERT(crpc->crp_stamp != 0); + + error = lstcon_rpc_get_reply(crpc, &rep); + if (error != 0) { + lstcon_rpc_stat_failure(stat, 1); + if (stat->trs_rpc_errno == 0) + stat->trs_rpc_errno = -error; + + continue; + } + + lstcon_rpc_stat_success(stat, 1); + + lstcon_rpc_stat_reply(trans, rep, crpc->crp_node, stat); + } + + if (trans->tas_opc == LST_TRANS_SESNEW && stat->trs_fwk_errno == 0) { + stat->trs_fwk_errno = + 
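+ /* lstcon_rpc_trans_postwait() above posts every queued RPC, drops
+ * ses_mutex while it sleeps in wait_event_interruptible_timeout(), and
+ * re-takes it before returning; a session shutdown is reported as
+ * -ESHUTDOWN, an interrupted wait (or a timeout shorter than twice
+ * LST_TRANS_MIN_TIMEOUT) as -EINTR, and any RPCs still outstanding are
+ * handed to lstcon_rpc_trans_abort().  Note that the abort path compares
+ * against the positive ETIMEDOUT while its callers here pass negative
+ * errnos, so the LST_NODE_DOWN marking there does not appear to be
+ * reachable from these call sites. */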
lstcon_session_feats_check(trans->tas_features); + } + + CDEBUG(D_NET, "transaction %s : success %d, failure %d, total %d, RPC error(%d), Framework error(%d)\n", + lstcon_rpc_trans_name(trans->tas_opc), + lstcon_rpc_stat_success(stat, 0), + lstcon_rpc_stat_failure(stat, 0), + lstcon_rpc_stat_total(stat, 0), + stat->trs_rpc_errno, stat->trs_fwk_errno); + + return; +} + +int +lstcon_rpc_trans_interpreter(lstcon_rpc_trans_t *trans, + struct list_head *head_up, + lstcon_rpc_readent_func_t readent) +{ + struct list_head tmp; + struct list_head *next; + lstcon_rpc_ent_t *ent; + srpc_generic_reply_t *rep; + lstcon_rpc_t *crpc; + srpc_msg_t *msg; + lstcon_node_t *nd; + long dur; + struct timeval tv; + int error; + + LASSERT(head_up != NULL); + + next = head_up; + + list_for_each_entry(crpc, &trans->tas_rpcs_list, crp_link) { + if (copy_from_user(&tmp, next, + sizeof(struct list_head))) + return -EFAULT; + + if (tmp.next == head_up) + return 0; + + next = tmp.next; + + ent = list_entry(next, lstcon_rpc_ent_t, rpe_link); + + LASSERT(crpc->crp_stamp != 0); + + error = lstcon_rpc_get_reply(crpc, &msg); + + nd = crpc->crp_node; + + dur = (long)cfs_time_sub(crpc->crp_stamp, + (unsigned long)console_session.ses_id.ses_stamp); + cfs_duration_usec(dur, &tv); + + if (copy_to_user(&ent->rpe_peer, + &nd->nd_id, sizeof(lnet_process_id_t)) || + copy_to_user(&ent->rpe_stamp, &tv, sizeof(tv)) || + copy_to_user(&ent->rpe_state, + &nd->nd_state, sizeof(nd->nd_state)) || + copy_to_user(&ent->rpe_rpc_errno, &error, + sizeof(error))) + return -EFAULT; + + if (error != 0) + continue; + + /* RPC is done */ + rep = (srpc_generic_reply_t *)&msg->msg_body.reply; + + if (copy_to_user(&ent->rpe_sid, + &rep->sid, sizeof(lst_sid_t)) || + copy_to_user(&ent->rpe_fwk_errno, + &rep->status, sizeof(rep->status))) + return -EFAULT; + + if (readent == NULL) + continue; + + error = readent(trans->tas_opc, msg, ent); + + if (error != 0) + return error; + } + + return 0; +} + +void +lstcon_rpc_trans_destroy(lstcon_rpc_trans_t *trans) +{ + srpc_client_rpc_t *rpc; + lstcon_rpc_t *crpc; + lstcon_rpc_t *tmp; + int count = 0; + + list_for_each_entry_safe(crpc, tmp, &trans->tas_rpcs_list, + crp_link) { + rpc = crpc->crp_rpc; + + spin_lock(&rpc->crpc_lock); + + /* free it if not posted or finished already */ + if (!crpc->crp_posted || crpc->crp_finished) { + spin_unlock(&rpc->crpc_lock); + + list_del_init(&crpc->crp_link); + lstcon_rpc_put(crpc); + + continue; + } + + /* rpcs can be still not callbacked (even LNetMDUnlink is called) + * because huge timeout for inaccessible network, don't make + * user wait for them, just abandon them, they will be recycled + * in callback */ + + LASSERT(crpc->crp_status != 0); + + crpc->crp_node = NULL; + crpc->crp_trans = NULL; + list_del_init(&crpc->crp_link); + count++; + + spin_unlock(&rpc->crpc_lock); + + atomic_dec(&trans->tas_remaining); + } + + LASSERT(atomic_read(&trans->tas_remaining) == 0); + + list_del(&trans->tas_link); + if (!list_empty(&trans->tas_olink)) + list_del(&trans->tas_olink); + + CDEBUG(D_NET, "Transaction %s destroyed with %d pending RPCs\n", + lstcon_rpc_trans_name(trans->tas_opc), count); + + LIBCFS_FREE(trans, sizeof(*trans)); + + return; +} + +int +lstcon_sesrpc_prep(lstcon_node_t *nd, int transop, + unsigned feats, lstcon_rpc_t **crpc) +{ + srpc_mksn_reqst_t *msrq; + srpc_rmsn_reqst_t *rsrq; + int rc; + + switch (transop) { + case LST_TRANS_SESNEW: + rc = lstcon_rpc_prep(nd, SRPC_SERVICE_MAKE_SESSION, + feats, 0, 0, crpc); + if (rc != 0) + return rc; + + msrq = 
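+ /* The strncpy() below is bounded by strlen(ses_name), so it never copies
+ * a terminating NUL into mksn_name; the name only arrives terminated
+ * because the freshly built request message appears to be zero-filled
+ * before it is packed. */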
&(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.mksn_reqst; + msrq->mksn_sid = console_session.ses_id; + msrq->mksn_force = console_session.ses_force; + strncpy(msrq->mksn_name, console_session.ses_name, + strlen(console_session.ses_name)); + break; + + case LST_TRANS_SESEND: + rc = lstcon_rpc_prep(nd, SRPC_SERVICE_REMOVE_SESSION, + feats, 0, 0, crpc); + if (rc != 0) + return rc; + + rsrq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.rmsn_reqst; + rsrq->rmsn_sid = console_session.ses_id; + break; + + default: + LBUG(); + } + + return 0; +} + +int +lstcon_dbgrpc_prep(lstcon_node_t *nd, unsigned feats, lstcon_rpc_t **crpc) +{ + srpc_debug_reqst_t *drq; + int rc; + + rc = lstcon_rpc_prep(nd, SRPC_SERVICE_DEBUG, feats, 0, 0, crpc); + if (rc != 0) + return rc; + + drq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.dbg_reqst; + + drq->dbg_sid = console_session.ses_id; + drq->dbg_flags = 0; + + return rc; +} + +int +lstcon_batrpc_prep(lstcon_node_t *nd, int transop, unsigned feats, + lstcon_tsb_hdr_t *tsb, lstcon_rpc_t **crpc) +{ + lstcon_batch_t *batch; + srpc_batch_reqst_t *brq; + int rc; + + rc = lstcon_rpc_prep(nd, SRPC_SERVICE_BATCH, feats, 0, 0, crpc); + if (rc != 0) + return rc; + + brq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.bat_reqst; + + brq->bar_sid = console_session.ses_id; + brq->bar_bid = tsb->tsb_id; + brq->bar_testidx = tsb->tsb_index; + brq->bar_opc = transop == LST_TRANS_TSBRUN ? SRPC_BATCH_OPC_RUN : + (transop == LST_TRANS_TSBSTOP ? SRPC_BATCH_OPC_STOP : + SRPC_BATCH_OPC_QUERY); + + if (transop != LST_TRANS_TSBRUN && + transop != LST_TRANS_TSBSTOP) + return 0; + + LASSERT(tsb->tsb_index == 0); + + batch = (lstcon_batch_t *)tsb; + brq->bar_arg = batch->bat_arg; + + return 0; +} + +int +lstcon_statrpc_prep(lstcon_node_t *nd, unsigned feats, lstcon_rpc_t **crpc) +{ + srpc_stat_reqst_t *srq; + int rc; + + rc = lstcon_rpc_prep(nd, SRPC_SERVICE_QUERY_STAT, feats, 0, 0, crpc); + if (rc != 0) + return rc; + + srq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.stat_reqst; + + srq->str_sid = console_session.ses_id; + srq->str_type = 0; /* XXX remove it */ + + return 0; +} + +static lnet_process_id_packed_t * +lstcon_next_id(int idx, int nkiov, lnet_kiov_t *kiov) +{ + lnet_process_id_packed_t *pid; + int i; + + i = idx / SFW_ID_PER_PAGE; + + LASSERT(i < nkiov); + + pid = (lnet_process_id_packed_t *)page_address(kiov[i].kiov_page); + + return &pid[idx % SFW_ID_PER_PAGE]; +} + +static int +lstcon_dstnodes_prep(lstcon_group_t *grp, int idx, + int dist, int span, int nkiov, lnet_kiov_t *kiov) +{ + lnet_process_id_packed_t *pid; + lstcon_ndlink_t *ndl; + lstcon_node_t *nd; + int start; + int end; + int i = 0; + + LASSERT(dist >= 1); + LASSERT(span >= 1); + LASSERT(grp->grp_nnode >= 1); + + if (span > grp->grp_nnode) + return -EINVAL; + + start = ((idx / dist) * span) % grp->grp_nnode; + end = ((idx / dist) * span + span - 1) % grp->grp_nnode; + + list_for_each_entry(ndl, &grp->grp_ndl_list, ndl_link) { + nd = ndl->ndl_node; + if (i < start) { + i++; + continue; + } + + if (i > (end >= start ? 
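+ /* Worked example of the distribution arithmetic above, with
+ * grp_nnode = 6, dist = 2, span = 3: clients 0 and 1 get start = 0,
+ * end = 2 (destinations 0..2), clients 2 and 3 get 3..5, and clients 4
+ * and 5 wrap back to 0..2.  When the span itself wraps past the end of
+ * the node list (start > end after the modulo), this first loop stops at
+ * the tail of the list and the second loop below fills in the wrapped
+ * head. */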
end : grp->grp_nnode)) + break; + + pid = lstcon_next_id((i - start), nkiov, kiov); + pid->nid = nd->nd_id.nid; + pid->pid = nd->nd_id.pid; + i++; + } + + if (start <= end) /* done */ + return 0; + + list_for_each_entry(ndl, &grp->grp_ndl_list, ndl_link) { + if (i > grp->grp_nnode + end) + break; + + nd = ndl->ndl_node; + pid = lstcon_next_id((i - start), nkiov, kiov); + pid->nid = nd->nd_id.nid; + pid->pid = nd->nd_id.pid; + i++; + } + + return 0; +} + +static int +lstcon_pingrpc_prep(lst_test_ping_param_t *param, srpc_test_reqst_t *req) +{ + test_ping_req_t *prq = &req->tsr_u.ping; + + prq->png_size = param->png_size; + prq->png_flags = param->png_flags; + /* TODO dest */ + return 0; +} + +static int +lstcon_bulkrpc_v0_prep(lst_test_bulk_param_t *param, srpc_test_reqst_t *req) +{ + test_bulk_req_t *brq = &req->tsr_u.bulk_v0; + + brq->blk_opc = param->blk_opc; + brq->blk_npg = (param->blk_size + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE; + brq->blk_flags = param->blk_flags; + + return 0; +} + +static int +lstcon_bulkrpc_v1_prep(lst_test_bulk_param_t *param, srpc_test_reqst_t *req) +{ + test_bulk_req_v1_t *brq = &req->tsr_u.bulk_v1; + + brq->blk_opc = param->blk_opc; + brq->blk_flags = param->blk_flags; + brq->blk_len = param->blk_size; + brq->blk_offset = 0; /* reserved */ + + return 0; +} + +int +lstcon_testrpc_prep(lstcon_node_t *nd, int transop, unsigned feats, + lstcon_test_t *test, lstcon_rpc_t **crpc) +{ + lstcon_group_t *sgrp = test->tes_src_grp; + lstcon_group_t *dgrp = test->tes_dst_grp; + srpc_test_reqst_t *trq; + srpc_bulk_t *bulk; + int i; + int npg = 0; + int nob = 0; + int rc = 0; + + if (transop == LST_TRANS_TSBCLIADD) { + npg = sfw_id_pages(test->tes_span); + nob = (feats & LST_FEAT_BULK_LEN) == 0 ? + npg * PAGE_CACHE_SIZE : + sizeof(lnet_process_id_packed_t) * test->tes_span; + } + + rc = lstcon_rpc_prep(nd, SRPC_SERVICE_TEST, feats, npg, nob, crpc); + if (rc != 0) + return rc; + + trq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.tes_reqst; + + if (transop == LST_TRANS_TSBSRVADD) { + int ndist = (sgrp->grp_nnode + test->tes_dist - 1) / test->tes_dist; + int nspan = (dgrp->grp_nnode + test->tes_span - 1) / test->tes_span; + int nmax = (ndist + nspan - 1) / nspan; + + trq->tsr_ndest = 0; + trq->tsr_loop = nmax * test->tes_dist * test->tes_concur; + + } else { + bulk = &(*crpc)->crp_rpc->crpc_bulk; + + for (i = 0; i < npg; i++) { + int len; + + LASSERT(nob > 0); + + len = (feats & LST_FEAT_BULK_LEN) == 0 ? + PAGE_CACHE_SIZE : min_t(int, nob, PAGE_CACHE_SIZE); + nob -= len; + + bulk->bk_iovs[i].kiov_offset = 0; + bulk->bk_iovs[i].kiov_len = len; + bulk->bk_iovs[i].kiov_page = + alloc_page(GFP_IOFS); + + if (bulk->bk_iovs[i].kiov_page == NULL) { + lstcon_rpc_put(*crpc); + return -ENOMEM; + } + } + + bulk->bk_sink = 0; + + LASSERT(transop == LST_TRANS_TSBCLIADD); + + rc = lstcon_dstnodes_prep(test->tes_dst_grp, + test->tes_cliidx++, + test->tes_dist, + test->tes_span, + npg, &bulk->bk_iovs[0]); + if (rc != 0) { + lstcon_rpc_put(*crpc); + return rc; + } + + trq->tsr_ndest = test->tes_span; + trq->tsr_loop = test->tes_loop; + } + + trq->tsr_sid = console_session.ses_id; + trq->tsr_bid = test->tes_hdr.tsb_id; + trq->tsr_concur = test->tes_concur; + trq->tsr_is_client = (transop == LST_TRANS_TSBCLIADD) ? 
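+ /* Summary of the two branches above: a server-side add (TSBSRVADD)
+ * sends no destination list at all -- tsr_ndest is 0 and tsr_loop is
+ * derived from the source/target group sizes and the test's dist, span
+ * and concur parameters -- while a client-side add (TSBCLIADD) ships the
+ * packed destination IDs produced by lstcon_dstnodes_prep() in the bulk
+ * pages, sized per page for old peers and to exactly tes_span packed IDs
+ * when LST_FEAT_BULK_LEN is negotiated. */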
1 : 0; + trq->tsr_stop_onerr = !!test->tes_stop_onerr; + + switch (test->tes_type) { + case LST_TEST_PING: + trq->tsr_service = SRPC_SERVICE_PING; + rc = lstcon_pingrpc_prep((lst_test_ping_param_t *) + &test->tes_param[0], trq); + break; + + case LST_TEST_BULK: + trq->tsr_service = SRPC_SERVICE_BRW; + if ((feats & LST_FEAT_BULK_LEN) == 0) { + rc = lstcon_bulkrpc_v0_prep((lst_test_bulk_param_t *) + &test->tes_param[0], trq); + } else { + rc = lstcon_bulkrpc_v1_prep((lst_test_bulk_param_t *) + &test->tes_param[0], trq); + } + + break; + default: + LBUG(); + break; + } + + return rc; +} + +static int +lstcon_sesnew_stat_reply(lstcon_rpc_trans_t *trans, + lstcon_node_t *nd, srpc_msg_t *reply) +{ + srpc_mksn_reply_t *mksn_rep = &reply->msg_body.mksn_reply; + int status = mksn_rep->mksn_status; + + if (status == 0 && + (reply->msg_ses_feats & ~LST_FEATS_MASK) != 0) { + mksn_rep->mksn_status = EPROTO; + status = EPROTO; + } + + if (status == EPROTO) { + CNETERR("session protocol error from %s: %u\n", + libcfs_nid2str(nd->nd_id.nid), + reply->msg_ses_feats); + } + + if (status != 0) + return status; + + if (!trans->tas_feats_updated) { + trans->tas_feats_updated = 1; + trans->tas_features = reply->msg_ses_feats; + } + + if (reply->msg_ses_feats != trans->tas_features) { + CNETERR("Framework features %x from %s is different with features on this transaction: %x\n", + reply->msg_ses_feats, libcfs_nid2str(nd->nd_id.nid), + trans->tas_features); + status = mksn_rep->mksn_status = EPROTO; + } + + if (status == 0) { + /* session timeout on remote node */ + nd->nd_timeout = mksn_rep->mksn_timeout; + } + + return status; +} + +void +lstcon_rpc_stat_reply(lstcon_rpc_trans_t *trans, srpc_msg_t *msg, + lstcon_node_t *nd, lstcon_trans_stat_t *stat) +{ + srpc_rmsn_reply_t *rmsn_rep; + srpc_debug_reply_t *dbg_rep; + srpc_batch_reply_t *bat_rep; + srpc_test_reply_t *test_rep; + srpc_stat_reply_t *stat_rep; + int rc = 0; + + switch (trans->tas_opc) { + case LST_TRANS_SESNEW: + rc = lstcon_sesnew_stat_reply(trans, nd, msg); + if (rc == 0) { + lstcon_sesop_stat_success(stat, 1); + return; + } + + lstcon_sesop_stat_failure(stat, 1); + break; + + case LST_TRANS_SESEND: + rmsn_rep = &msg->msg_body.rmsn_reply; + /* ESRCH is not an error for end session */ + if (rmsn_rep->rmsn_status == 0 || + rmsn_rep->rmsn_status == ESRCH) { + lstcon_sesop_stat_success(stat, 1); + return; + } + + lstcon_sesop_stat_failure(stat, 1); + rc = rmsn_rep->rmsn_status; + break; + + case LST_TRANS_SESQRY: + case LST_TRANS_SESPING: + dbg_rep = &msg->msg_body.dbg_reply; + + if (dbg_rep->dbg_status == ESRCH) { + lstcon_sesqry_stat_unknown(stat, 1); + return; + } + + if (lstcon_session_match(dbg_rep->dbg_sid)) + lstcon_sesqry_stat_active(stat, 1); + else + lstcon_sesqry_stat_busy(stat, 1); + return; + + case LST_TRANS_TSBRUN: + case LST_TRANS_TSBSTOP: + bat_rep = &msg->msg_body.bat_reply; + + if (bat_rep->bar_status == 0) { + lstcon_tsbop_stat_success(stat, 1); + return; + } + + if (bat_rep->bar_status == EPERM && + trans->tas_opc == LST_TRANS_TSBSTOP) { + lstcon_tsbop_stat_success(stat, 1); + return; + } + + lstcon_tsbop_stat_failure(stat, 1); + rc = bat_rep->bar_status; + break; + + case LST_TRANS_TSBCLIQRY: + case LST_TRANS_TSBSRVQRY: + bat_rep = &msg->msg_body.bat_reply; + + if (bat_rep->bar_active != 0) + lstcon_tsbqry_stat_run(stat, 1); + else + lstcon_tsbqry_stat_idle(stat, 1); + + if (bat_rep->bar_status == 0) + return; + + lstcon_tsbqry_stat_failure(stat, 1); + rc = bat_rep->bar_status; + break; + + case LST_TRANS_TSBCLIADD: + case 
LST_TRANS_TSBSRVADD: + test_rep = &msg->msg_body.tes_reply; + + if (test_rep->tsr_status == 0) { + lstcon_tsbop_stat_success(stat, 1); + return; + } + + lstcon_tsbop_stat_failure(stat, 1); + rc = test_rep->tsr_status; + break; + + case LST_TRANS_STATQRY: + stat_rep = &msg->msg_body.stat_reply; + + if (stat_rep->str_status == 0) { + lstcon_statqry_stat_success(stat, 1); + return; + } + + lstcon_statqry_stat_failure(stat, 1); + rc = stat_rep->str_status; + break; + + default: + LBUG(); + } + + if (stat->trs_fwk_errno == 0) + stat->trs_fwk_errno = rc; + + return; +} + +int +lstcon_rpc_trans_ndlist(struct list_head *ndlist, + struct list_head *translist, int transop, + void *arg, lstcon_rpc_cond_func_t condition, + lstcon_rpc_trans_t **transpp) +{ + lstcon_rpc_trans_t *trans; + lstcon_ndlink_t *ndl; + lstcon_node_t *nd; + lstcon_rpc_t *rpc; + unsigned feats; + int rc; + + /* Creating session RPG for list of nodes */ + + rc = lstcon_rpc_trans_prep(translist, transop, &trans); + if (rc != 0) { + CERROR("Can't create transaction %d: %d\n", transop, rc); + return rc; + } + + feats = trans->tas_features; + list_for_each_entry(ndl, ndlist, ndl_link) { + rc = condition == NULL ? 1 : + condition(transop, ndl->ndl_node, arg); + + if (rc == 0) + continue; + + if (rc < 0) { + CDEBUG(D_NET, "Condition error while creating RPC for transaction %d: %d\n", + transop, rc); + break; + } + + nd = ndl->ndl_node; + + switch (transop) { + case LST_TRANS_SESNEW: + case LST_TRANS_SESEND: + rc = lstcon_sesrpc_prep(nd, transop, feats, &rpc); + break; + case LST_TRANS_SESQRY: + case LST_TRANS_SESPING: + rc = lstcon_dbgrpc_prep(nd, feats, &rpc); + break; + case LST_TRANS_TSBCLIADD: + case LST_TRANS_TSBSRVADD: + rc = lstcon_testrpc_prep(nd, transop, feats, + (lstcon_test_t *)arg, &rpc); + break; + case LST_TRANS_TSBRUN: + case LST_TRANS_TSBSTOP: + case LST_TRANS_TSBCLIQRY: + case LST_TRANS_TSBSRVQRY: + rc = lstcon_batrpc_prep(nd, transop, feats, + (lstcon_tsb_hdr_t *)arg, &rpc); + break; + case LST_TRANS_STATQRY: + rc = lstcon_statrpc_prep(nd, feats, &rpc); + break; + default: + rc = -EINVAL; + break; + } + + if (rc != 0) { + CERROR("Failed to create RPC for transaction %s: %d\n", + lstcon_rpc_trans_name(transop), rc); + break; + } + + lstcon_rpc_trans_addreq(trans, rpc); + } + + if (rc == 0) { + *transpp = trans; + return 0; + } + + lstcon_rpc_trans_destroy(trans); + + return rc; +} + +static void +lstcon_rpc_pinger(void *arg) +{ + stt_timer_t *ptimer = (stt_timer_t *)arg; + lstcon_rpc_trans_t *trans; + lstcon_rpc_t *crpc; + srpc_msg_t *rep; + srpc_debug_reqst_t *drq; + lstcon_ndlink_t *ndl; + lstcon_node_t *nd; + time_t intv; + int count = 0; + int rc; + + /* RPC pinger is a special case of transaction, + * it's called by timer at 8 seconds interval. 
+ */ + mutex_lock(&console_session.ses_mutex); + + if (console_session.ses_shutdown || console_session.ses_expired) { + mutex_unlock(&console_session.ses_mutex); + return; + } + + if (!console_session.ses_expired && + get_seconds() - console_session.ses_laststamp > + (time_t)console_session.ses_timeout) + console_session.ses_expired = 1; + + trans = console_session.ses_ping; + + LASSERT(trans != NULL); + + list_for_each_entry(ndl, &console_session.ses_ndl_list, ndl_link) { + nd = ndl->ndl_node; + + if (console_session.ses_expired) { + /* idle console, end session on all nodes */ + if (nd->nd_state != LST_NODE_ACTIVE) + continue; + + rc = lstcon_sesrpc_prep(nd, LST_TRANS_SESEND, + trans->tas_features, &crpc); + if (rc != 0) { + CERROR("Out of memory\n"); + break; + } + + lstcon_rpc_trans_addreq(trans, crpc); + lstcon_rpc_post(crpc); + + continue; + } + + crpc = &nd->nd_ping; + + if (crpc->crp_rpc != NULL) { + LASSERT(crpc->crp_trans == trans); + LASSERT(!list_empty(&crpc->crp_link)); + + spin_lock(&crpc->crp_rpc->crpc_lock); + + LASSERT(crpc->crp_posted); + + if (!crpc->crp_finished) { + /* in flight */ + spin_unlock(&crpc->crp_rpc->crpc_lock); + continue; + } + + spin_unlock(&crpc->crp_rpc->crpc_lock); + + lstcon_rpc_get_reply(crpc, &rep); + + list_del_init(&crpc->crp_link); + + lstcon_rpc_put(crpc); + } + + if (nd->nd_state != LST_NODE_ACTIVE) + continue; + + intv = cfs_duration_sec(cfs_time_sub(cfs_time_current(), + nd->nd_stamp)); + if (intv < (time_t)nd->nd_timeout / 2) + continue; + + rc = lstcon_rpc_init(nd, SRPC_SERVICE_DEBUG, + trans->tas_features, 0, 0, 1, crpc); + if (rc != 0) { + CERROR("Out of memory\n"); + break; + } + + drq = &crpc->crp_rpc->crpc_reqstmsg.msg_body.dbg_reqst; + + drq->dbg_sid = console_session.ses_id; + drq->dbg_flags = 0; + + lstcon_rpc_trans_addreq(trans, crpc); + lstcon_rpc_post(crpc); + + count++; + } + + if (console_session.ses_expired) { + mutex_unlock(&console_session.ses_mutex); + return; + } + + CDEBUG(D_NET, "Ping %d nodes in session\n", count); + + ptimer->stt_expires = (unsigned long)(get_seconds() + LST_PING_INTERVAL); + stt_add_timer(ptimer); + + mutex_unlock(&console_session.ses_mutex); +} + +int +lstcon_rpc_pinger_start(void) +{ + stt_timer_t *ptimer; + int rc; + + LASSERT(list_empty(&console_session.ses_rpc_freelist)); + LASSERT(atomic_read(&console_session.ses_rpc_counter) == 0); + + rc = lstcon_rpc_trans_prep(NULL, LST_TRANS_SESPING, + &console_session.ses_ping); + if (rc != 0) { + CERROR("Failed to create console pinger\n"); + return rc; + } + + ptimer = &console_session.ses_ping_timer; + ptimer->stt_expires = (unsigned long)(get_seconds() + LST_PING_INTERVAL); + + stt_add_timer(ptimer); + + return 0; +} + +void +lstcon_rpc_pinger_stop(void) +{ + LASSERT(console_session.ses_shutdown); + + stt_del_timer(&console_session.ses_ping_timer); + + lstcon_rpc_trans_abort(console_session.ses_ping, -ESHUTDOWN); + lstcon_rpc_trans_stat(console_session.ses_ping, lstcon_trans_stat()); + lstcon_rpc_trans_destroy(console_session.ses_ping); + + memset(lstcon_trans_stat(), 0, sizeof(lstcon_trans_stat_t)); + + console_session.ses_ping = NULL; +} + +void +lstcon_rpc_cleanup_wait(void) +{ + lstcon_rpc_trans_t *trans; + lstcon_rpc_t *crpc; + struct list_head *pacer; + struct list_head zlist; + + /* Called with hold of global mutex */ + + LASSERT(console_session.ses_shutdown); + + while (!list_empty(&console_session.ses_trans_list)) { + list_for_each(pacer, &console_session.ses_trans_list) { + trans = list_entry(pacer, lstcon_rpc_trans_t, + tas_link); + + 
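+ /* Shutdown drains in two steps: each pass of this loop wakes every
+ * transaction still queued on ses_trans_list (their waiters see
+ * ses_shutdown via lstcon_rpc_trans_check() and bail out), then the
+ * mutex is dropped for a second so the owning threads can finish and
+ * destroy them; once the list is empty, the code below waits for
+ * ses_rpc_counter to reach zero and frees whatever is left on the RPC
+ * freelist. */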
CDEBUG(D_NET, "Session closed, wakeup transaction %s\n", + lstcon_rpc_trans_name(trans->tas_opc)); + + wake_up(&trans->tas_waitq); + } + + mutex_unlock(&console_session.ses_mutex); + + CWARN("Session is shutting down, waiting for termination of transactions\n"); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + + mutex_lock(&console_session.ses_mutex); + } + + spin_lock(&console_session.ses_rpc_lock); + + lst_wait_until((atomic_read(&console_session.ses_rpc_counter) == 0), + console_session.ses_rpc_lock, + "Network is not accessible or target is down, waiting for %d console RPCs to being recycled\n", + atomic_read(&console_session.ses_rpc_counter)); + + list_add(&zlist, &console_session.ses_rpc_freelist); + list_del_init(&console_session.ses_rpc_freelist); + + spin_unlock(&console_session.ses_rpc_lock); + + while (!list_empty(&zlist)) { + crpc = list_entry(zlist.next, lstcon_rpc_t, crp_link); + + list_del(&crpc->crp_link); + LIBCFS_FREE(crpc, sizeof(lstcon_rpc_t)); + } +} + +int +lstcon_rpc_module_init(void) +{ + INIT_LIST_HEAD(&console_session.ses_ping_timer.stt_list); + console_session.ses_ping_timer.stt_func = lstcon_rpc_pinger; + console_session.ses_ping_timer.stt_data = &console_session.ses_ping_timer; + + console_session.ses_ping = NULL; + + spin_lock_init(&console_session.ses_rpc_lock); + atomic_set(&console_session.ses_rpc_counter, 0); + INIT_LIST_HEAD(&console_session.ses_rpc_freelist); + + return 0; +} + +void +lstcon_rpc_module_fini(void) +{ + LASSERT(list_empty(&console_session.ses_rpc_freelist)); + LASSERT(atomic_read(&console_session.ses_rpc_counter) == 0); +} diff --git a/kernel/drivers/staging/lustre/lnet/selftest/conrpc.h b/kernel/drivers/staging/lustre/lnet/selftest/conrpc.h new file mode 100644 index 000000000..2353889c6 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/selftest/conrpc.h @@ -0,0 +1,146 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * /lnet/selftest/conrpc.h + * + * Console rpc + * + * Author: Liang Zhen + */ + +#ifndef __LST_CONRPC_H__ +#define __LST_CONRPC_H__ + +#include "../../include/linux/libcfs/libcfs.h" +#include "../../include/linux/lnet/lnet.h" +#include "../../include/linux/lnet/lib-types.h" +#include "../../include/linux/lnet/lnetst.h" +#include "rpc.h" +#include "selftest.h" + +/* Console rpc and rpc transaction */ +#define LST_TRANS_TIMEOUT 30 +#define LST_TRANS_MIN_TIMEOUT 3 + +#define LST_VALIDATE_TIMEOUT(t) min(max(t, LST_TRANS_MIN_TIMEOUT), LST_TRANS_TIMEOUT) + +#define LST_PING_INTERVAL 8 + +struct lstcon_rpc_trans; +struct lstcon_tsb_hdr; +struct lstcon_test; +struct lstcon_node; + +typedef struct lstcon_rpc { + struct list_head crp_link; /* chain on rpc transaction */ + srpc_client_rpc_t *crp_rpc; /* client rpc */ + struct lstcon_node *crp_node; /* destination node */ + struct lstcon_rpc_trans *crp_trans; /* conrpc transaction */ + + unsigned int crp_posted:1; /* rpc is posted */ + unsigned int crp_finished:1; /* rpc is finished */ + unsigned int crp_unpacked:1; /* reply is unpacked */ + /** RPC is embedded in other structure and can't free it */ + unsigned int crp_embedded:1; + int crp_status; /* console rpc errors */ + unsigned long crp_stamp; /* replied time stamp */ +} lstcon_rpc_t; + +typedef struct lstcon_rpc_trans { + struct list_head tas_olink; /* link chain on owner list */ + struct list_head tas_link; /* link chain on global list */ + int tas_opc; /* operation code of transaction */ + /* features mask is uptodate */ + unsigned tas_feats_updated; + /* test features mask */ + unsigned tas_features; + wait_queue_head_t tas_waitq; /* wait queue head */ + atomic_t tas_remaining; /* # of un-scheduled rpcs */ + struct list_head tas_rpcs_list; /* queued requests */ +} lstcon_rpc_trans_t; + +#define LST_TRANS_PRIVATE 0x1000 + +#define LST_TRANS_SESNEW (LST_TRANS_PRIVATE | 0x01) +#define LST_TRANS_SESEND (LST_TRANS_PRIVATE | 0x02) +#define LST_TRANS_SESQRY 0x03 +#define LST_TRANS_SESPING 0x04 + +#define LST_TRANS_TSBCLIADD (LST_TRANS_PRIVATE | 0x11) +#define LST_TRANS_TSBSRVADD (LST_TRANS_PRIVATE | 0x12) +#define LST_TRANS_TSBRUN (LST_TRANS_PRIVATE | 0x13) +#define LST_TRANS_TSBSTOP (LST_TRANS_PRIVATE | 0x14) +#define LST_TRANS_TSBCLIQRY 0x15 +#define LST_TRANS_TSBSRVQRY 0x16 + +#define LST_TRANS_STATQRY 0x21 + +typedef int (* lstcon_rpc_cond_func_t)(int, struct lstcon_node *, void *); +typedef int (* lstcon_rpc_readent_func_t)(int, srpc_msg_t *, lstcon_rpc_ent_t *); + +int lstcon_sesrpc_prep(struct lstcon_node *nd, int transop, + unsigned version, lstcon_rpc_t **crpc); +int lstcon_dbgrpc_prep(struct lstcon_node *nd, + unsigned version, lstcon_rpc_t **crpc); +int lstcon_batrpc_prep(struct lstcon_node *nd, int transop, unsigned version, + struct lstcon_tsb_hdr *tsb, lstcon_rpc_t **crpc); +int lstcon_testrpc_prep(struct lstcon_node *nd, int transop, unsigned version, + struct lstcon_test *test, lstcon_rpc_t **crpc); +int lstcon_statrpc_prep(struct lstcon_node *nd, unsigned version, + lstcon_rpc_t **crpc); +void lstcon_rpc_put(lstcon_rpc_t *crpc); +int lstcon_rpc_trans_prep(struct list_head *translist, + int transop, lstcon_rpc_trans_t **transpp); +int lstcon_rpc_trans_ndlist(struct list_head *ndlist, + struct list_head *translist, int transop, + void *arg, lstcon_rpc_cond_func_t condition, + lstcon_rpc_trans_t **transpp); +void lstcon_rpc_trans_stat(lstcon_rpc_trans_t *trans, + lstcon_trans_stat_t *stat); +int lstcon_rpc_trans_interpreter(lstcon_rpc_trans_t *trans, + struct list_head *head_up, 
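+/* The condition callback handed to lstcon_rpc_trans_ndlist() is consulted
+ * once per node: a positive return queues an RPC for that node, zero skips
+ * it, and a negative value aborts building the transaction.  A minimal
+ * sketch of such a filter (hypothetical, not a callback defined by this
+ * patch):
+ *
+ *	static int
+ *	only_active_nodes(int transop, struct lstcon_node *nd, void *arg)
+ *	{
+ *		return nd->nd_state == LST_NODE_ACTIVE ? 1 : 0;
+ *	}
+ */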
+ lstcon_rpc_readent_func_t readent); +void lstcon_rpc_trans_abort(lstcon_rpc_trans_t *trans, int error); +void lstcon_rpc_trans_destroy(lstcon_rpc_trans_t *trans); +void lstcon_rpc_trans_addreq(lstcon_rpc_trans_t *trans, lstcon_rpc_t *req); +int lstcon_rpc_trans_postwait(lstcon_rpc_trans_t *trans, int timeout); +int lstcon_rpc_pinger_start(void); +void lstcon_rpc_pinger_stop(void); +void lstcon_rpc_cleanup_wait(void); +int lstcon_rpc_module_init(void); +void lstcon_rpc_module_fini(void); + + +#endif diff --git a/kernel/drivers/staging/lustre/lnet/selftest/console.c b/kernel/drivers/staging/lustre/lnet/selftest/console.c new file mode 100644 index 000000000..2b5f53c7a --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/selftest/console.c @@ -0,0 +1,2096 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lnet/selftest/conctl.c + * + * Infrastructure of LST console + * + * Author: Liang Zhen + */ + + +#include "../../include/linux/libcfs/libcfs.h" +#include "../../include/linux/lnet/lib-lnet.h" +#include "console.h" +#include "conrpc.h" + +#define LST_NODE_STATE_COUNTER(nd, p) \ +do { \ + if ((nd)->nd_state == LST_NODE_ACTIVE) \ + (p)->nle_nactive++; \ + else if ((nd)->nd_state == LST_NODE_BUSY) \ + (p)->nle_nbusy++; \ + else if ((nd)->nd_state == LST_NODE_DOWN) \ + (p)->nle_ndown++; \ + else \ + (p)->nle_nunknown++; \ + (p)->nle_nnode++; \ +} while (0) + +lstcon_session_t console_session; + +static void +lstcon_node_get(lstcon_node_t *nd) +{ + LASSERT(nd->nd_ref >= 1); + + nd->nd_ref++; +} + +static int +lstcon_node_find(lnet_process_id_t id, lstcon_node_t **ndpp, int create) +{ + lstcon_ndlink_t *ndl; + unsigned int idx = LNET_NIDADDR(id.nid) % LST_GLOBAL_HASHSIZE; + + LASSERT(id.nid != LNET_NID_ANY); + + list_for_each_entry(ndl, &console_session.ses_ndl_hash[idx], ndl_hlink) { + if (ndl->ndl_node->nd_id.nid != id.nid || + ndl->ndl_node->nd_id.pid != id.pid) + continue; + + lstcon_node_get(ndl->ndl_node); + *ndpp = ndl->ndl_node; + return 0; + } + + if (!create) + return -ENOENT; + + LIBCFS_ALLOC(*ndpp, sizeof(lstcon_node_t) + sizeof(lstcon_ndlink_t)); + if (*ndpp == NULL) + return -ENOMEM; + + ndl = (lstcon_ndlink_t *)(*ndpp + 1); + + ndl->ndl_node = *ndpp; + + ndl->ndl_node->nd_ref = 1; + ndl->ndl_node->nd_id = id; + ndl->ndl_node->nd_stamp = cfs_time_current(); + ndl->ndl_node->nd_state = LST_NODE_UNKNOWN; + ndl->ndl_node->nd_timeout = 0; + memset(&ndl->ndl_node->nd_ping, 0, sizeof(lstcon_rpc_t)); + + /* queued in global hash & list, no refcount is taken by + * global hash & list, if caller release his refcount, + * node will be released */ + list_add_tail(&ndl->ndl_hlink, &console_session.ses_ndl_hash[idx]); + list_add_tail(&ndl->ndl_link, &console_session.ses_ndl_list); + + return 0; +} + +static void +lstcon_node_put(lstcon_node_t *nd) +{ + lstcon_ndlink_t *ndl; + + LASSERT(nd->nd_ref > 0); + + if (--nd->nd_ref > 0) + return; + + ndl = (lstcon_ndlink_t *)(nd + 1); + + LASSERT(!list_empty(&ndl->ndl_link)); + LASSERT(!list_empty(&ndl->ndl_hlink)); + + /* remove from session */ + list_del(&ndl->ndl_link); + list_del(&ndl->ndl_hlink); + + LIBCFS_FREE(nd, sizeof(lstcon_node_t) + sizeof(lstcon_ndlink_t)); +} + +static int +lstcon_ndlink_find(struct list_head *hash, + lnet_process_id_t id, lstcon_ndlink_t **ndlpp, int create) +{ + unsigned int idx = LNET_NIDADDR(id.nid) % LST_NODE_HASHSIZE; + lstcon_ndlink_t *ndl; + lstcon_node_t *nd; + int rc; + + if (id.nid == LNET_NID_ANY) + return -EINVAL; + + /* search in hash */ + list_for_each_entry(ndl, &hash[idx], ndl_hlink) { + if (ndl->ndl_node->nd_id.nid != id.nid || + ndl->ndl_node->nd_id.pid != id.pid) + continue; + + *ndlpp = ndl; + return 0; + } + + if (create == 0) + return -ENOENT; + + /* find or create in session hash */ + rc = lstcon_node_find(id, &nd, (create == 1) ? 
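+ /* Node bookkeeping above: lstcon_node_find() allocates the lstcon_node_t
+ * and its session-global lstcon_ndlink_t in a single LIBCFS_ALLOC() (the
+ * link sits directly behind the node, hence the "*ndpp + 1" / "nd + 1"
+ * arithmetic), and the global hash and list take no reference of their
+ * own, so lstcon_node_put() unhooks and frees both once nd_ref drops to
+ * zero.  Per-group links, by contrast, are allocated separately right
+ * here and each one holds its own node reference. */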
1 : 0); + if (rc != 0) + return rc; + + LIBCFS_ALLOC(ndl, sizeof(lstcon_ndlink_t)); + if (ndl == NULL) { + lstcon_node_put(nd); + return -ENOMEM; + } + + *ndlpp = ndl; + + ndl->ndl_node = nd; + INIT_LIST_HEAD(&ndl->ndl_link); + list_add_tail(&ndl->ndl_hlink, &hash[idx]); + + return 0; +} + +static void +lstcon_ndlink_release(lstcon_ndlink_t *ndl) +{ + LASSERT(list_empty(&ndl->ndl_link)); + LASSERT(!list_empty(&ndl->ndl_hlink)); + + list_del(&ndl->ndl_hlink); /* delete from hash */ + lstcon_node_put(ndl->ndl_node); + + LIBCFS_FREE(ndl, sizeof(*ndl)); +} + +static int +lstcon_group_alloc(char *name, lstcon_group_t **grpp) +{ + lstcon_group_t *grp; + int i; + + LIBCFS_ALLOC(grp, offsetof(lstcon_group_t, + grp_ndl_hash[LST_NODE_HASHSIZE])); + if (grp == NULL) + return -ENOMEM; + + grp->grp_ref = 1; + if (name != NULL) + strcpy(grp->grp_name, name); + + INIT_LIST_HEAD(&grp->grp_link); + INIT_LIST_HEAD(&grp->grp_ndl_list); + INIT_LIST_HEAD(&grp->grp_trans_list); + + for (i = 0; i < LST_NODE_HASHSIZE; i++) + INIT_LIST_HEAD(&grp->grp_ndl_hash[i]); + + *grpp = grp; + + return 0; +} + +static void +lstcon_group_addref(lstcon_group_t *grp) +{ + grp->grp_ref++; +} + +static void lstcon_group_ndlink_release(lstcon_group_t *, lstcon_ndlink_t *); + +static void +lstcon_group_drain(lstcon_group_t *grp, int keep) +{ + lstcon_ndlink_t *ndl; + lstcon_ndlink_t *tmp; + + list_for_each_entry_safe(ndl, tmp, &grp->grp_ndl_list, ndl_link) { + if ((ndl->ndl_node->nd_state & keep) == 0) + lstcon_group_ndlink_release(grp, ndl); + } +} + +static void +lstcon_group_decref(lstcon_group_t *grp) +{ + int i; + + if (--grp->grp_ref > 0) + return; + + if (!list_empty(&grp->grp_link)) + list_del(&grp->grp_link); + + lstcon_group_drain(grp, 0); + + for (i = 0; i < LST_NODE_HASHSIZE; i++) { + LASSERT(list_empty(&grp->grp_ndl_hash[i])); + } + + LIBCFS_FREE(grp, offsetof(lstcon_group_t, + grp_ndl_hash[LST_NODE_HASHSIZE])); +} + +static int +lstcon_group_find(const char *name, lstcon_group_t **grpp) +{ + lstcon_group_t *grp; + + list_for_each_entry(grp, &console_session.ses_grp_list, grp_link) { + if (strncmp(grp->grp_name, name, LST_NAME_SIZE) != 0) + continue; + + lstcon_group_addref(grp); /* +1 ref for caller */ + *grpp = grp; + return 0; + } + + return -ENOENT; +} + +static void +lstcon_group_put(lstcon_group_t *grp) +{ + lstcon_group_decref(grp); +} + +static int +lstcon_group_ndlink_find(lstcon_group_t *grp, lnet_process_id_t id, + lstcon_ndlink_t **ndlpp, int create) +{ + int rc; + + rc = lstcon_ndlink_find(&grp->grp_ndl_hash[0], id, ndlpp, create); + if (rc != 0) + return rc; + + if (!list_empty(&(*ndlpp)->ndl_link)) + return 0; + + list_add_tail(&(*ndlpp)->ndl_link, &grp->grp_ndl_list); + grp->grp_nnode++; + + return 0; +} + +static void +lstcon_group_ndlink_release(lstcon_group_t *grp, lstcon_ndlink_t *ndl) +{ + list_del_init(&ndl->ndl_link); + lstcon_ndlink_release(ndl); + grp->grp_nnode --; +} + +static void +lstcon_group_ndlink_move(lstcon_group_t *old, + lstcon_group_t *new, lstcon_ndlink_t *ndl) +{ + unsigned int idx = LNET_NIDADDR(ndl->ndl_node->nd_id.nid) % + LST_NODE_HASHSIZE; + + list_del(&ndl->ndl_hlink); + list_del(&ndl->ndl_link); + old->grp_nnode --; + + list_add_tail(&ndl->ndl_hlink, &new->grp_ndl_hash[idx]); + list_add_tail(&ndl->ndl_link, &new->grp_ndl_list); + new->grp_nnode++; + + return; +} + +static void +lstcon_group_move(lstcon_group_t *old, lstcon_group_t *new) +{ + lstcon_ndlink_t *ndl; + + while (!list_empty(&old->grp_ndl_list)) { + ndl = list_entry(old->grp_ndl_list.next, + lstcon_ndlink_t, 
ndl_link); + lstcon_group_ndlink_move(old, new, ndl); + } +} + +static int +lstcon_sesrpc_condition(int transop, lstcon_node_t *nd, void *arg) +{ + lstcon_group_t *grp = (lstcon_group_t *)arg; + + switch (transop) { + case LST_TRANS_SESNEW: + if (nd->nd_state == LST_NODE_ACTIVE) + return 0; + break; + + case LST_TRANS_SESEND: + if (nd->nd_state != LST_NODE_ACTIVE) + return 0; + + if (grp != NULL && nd->nd_ref > 1) + return 0; + break; + + case LST_TRANS_SESQRY: + break; + + default: + LBUG(); + } + + return 1; +} + +static int +lstcon_sesrpc_readent(int transop, srpc_msg_t *msg, + lstcon_rpc_ent_t *ent_up) +{ + srpc_debug_reply_t *rep; + + switch (transop) { + case LST_TRANS_SESNEW: + case LST_TRANS_SESEND: + return 0; + + case LST_TRANS_SESQRY: + rep = &msg->msg_body.dbg_reply; + + if (copy_to_user(&ent_up->rpe_priv[0], + &rep->dbg_timeout, sizeof(int)) || + copy_to_user(&ent_up->rpe_payload[0], + &rep->dbg_name, LST_NAME_SIZE)) + return -EFAULT; + + return 0; + + default: + LBUG(); + } + + return 0; +} + +static int +lstcon_group_nodes_add(lstcon_group_t *grp, + int count, lnet_process_id_t *ids_up, + unsigned *featp, struct list_head *result_up) +{ + lstcon_rpc_trans_t *trans; + lstcon_ndlink_t *ndl; + lstcon_group_t *tmp; + lnet_process_id_t id; + int i; + int rc; + + rc = lstcon_group_alloc(NULL, &tmp); + if (rc != 0) { + CERROR("Out of memory\n"); + return -ENOMEM; + } + + for (i = 0 ; i < count; i++) { + if (copy_from_user(&id, &ids_up[i], sizeof(id))) { + rc = -EFAULT; + break; + } + + /* skip if it's in this group already */ + rc = lstcon_group_ndlink_find(grp, id, &ndl, 0); + if (rc == 0) + continue; + + /* add to tmp group */ + rc = lstcon_group_ndlink_find(tmp, id, &ndl, 1); + if (rc != 0) { + CERROR("Can't create ndlink, out of memory\n"); + break; + } + } + + if (rc != 0) { + lstcon_group_put(tmp); + return rc; + } + + rc = lstcon_rpc_trans_ndlist(&tmp->grp_ndl_list, + &tmp->grp_trans_list, LST_TRANS_SESNEW, + tmp, lstcon_sesrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + lstcon_group_put(tmp); + return rc; + } + + /* post all RPCs */ + lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT); + + rc = lstcon_rpc_trans_interpreter(trans, result_up, + lstcon_sesrpc_readent); + *featp = trans->tas_features; + + /* destroy all RPGs */ + lstcon_rpc_trans_destroy(trans); + + lstcon_group_move(tmp, grp); + lstcon_group_put(tmp); + + return rc; +} + +static int +lstcon_group_nodes_remove(lstcon_group_t *grp, + int count, lnet_process_id_t *ids_up, + struct list_head *result_up) +{ + lstcon_rpc_trans_t *trans; + lstcon_ndlink_t *ndl; + lstcon_group_t *tmp; + lnet_process_id_t id; + int rc; + int i; + + /* End session and remove node from the group */ + + rc = lstcon_group_alloc(NULL, &tmp); + if (rc != 0) { + CERROR("Out of memory\n"); + return -ENOMEM; + } + + for (i = 0; i < count; i++) { + if (copy_from_user(&id, &ids_up[i], sizeof(id))) { + rc = -EFAULT; + goto error; + } + + /* move node to tmp group */ + if (lstcon_group_ndlink_find(grp, id, &ndl, 0) == 0) + lstcon_group_ndlink_move(grp, tmp, ndl); + } + + rc = lstcon_rpc_trans_ndlist(&tmp->grp_ndl_list, + &tmp->grp_trans_list, LST_TRANS_SESEND, + tmp, lstcon_sesrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + goto error; + } + + lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT); + + rc = lstcon_rpc_trans_interpreter(trans, result_up, NULL); + + lstcon_rpc_trans_destroy(trans); + /* release nodes anyway, because we can't rollback status */ + 
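+ /* lstcon_group_nodes_add() and this function show the canonical console
+ * transaction sequence; a minimal sketch of the same pattern for a new
+ * operation (error handling elided, SESQRY picked only as an example of
+ * a non-private opcode) would look like:
+ *
+ *	lstcon_rpc_trans_t *trans;
+ *
+ *	rc = lstcon_rpc_trans_ndlist(&grp->grp_ndl_list,
+ *				     &grp->grp_trans_list, LST_TRANS_SESQRY,
+ *				     grp, lstcon_sesrpc_condition, &trans);
+ *	if (rc == 0) {
+ *		lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+ *		rc = lstcon_rpc_trans_interpreter(trans, result_up, NULL);
+ *		lstcon_rpc_trans_destroy(trans);
+ *	}
+ */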
lstcon_group_put(tmp); + + return rc; +error: + lstcon_group_move(tmp, grp); + lstcon_group_put(tmp); + + return rc; +} + +int +lstcon_group_add(char *name) +{ + lstcon_group_t *grp; + int rc; + + rc = (lstcon_group_find(name, &grp) == 0)? -EEXIST: 0; + if (rc != 0) { + /* find a group with same name */ + lstcon_group_put(grp); + return rc; + } + + rc = lstcon_group_alloc(name, &grp); + if (rc != 0) { + CERROR("Can't allocate descriptor for group %s\n", name); + return -ENOMEM; + } + + list_add_tail(&grp->grp_link, &console_session.ses_grp_list); + + return rc; +} + +int +lstcon_nodes_add(char *name, int count, lnet_process_id_t *ids_up, + unsigned *featp, struct list_head *result_up) +{ + lstcon_group_t *grp; + int rc; + + LASSERT(count > 0); + LASSERT(ids_up != NULL); + + rc = lstcon_group_find(name, &grp); + if (rc != 0) { + CDEBUG(D_NET, "Can't find group %s\n", name); + return rc; + } + + if (grp->grp_ref > 2) { + /* referred by other threads or test */ + CDEBUG(D_NET, "Group %s is busy\n", name); + lstcon_group_put(grp); + + return -EBUSY; + } + + rc = lstcon_group_nodes_add(grp, count, ids_up, featp, result_up); + + lstcon_group_put(grp); + + return rc; +} + +int +lstcon_group_del(char *name) +{ + lstcon_rpc_trans_t *trans; + lstcon_group_t *grp; + int rc; + + rc = lstcon_group_find(name, &grp); + if (rc != 0) { + CDEBUG(D_NET, "Can't find group: %s\n", name); + return rc; + } + + if (grp->grp_ref > 2) { + /* referred by others threads or test */ + CDEBUG(D_NET, "Group %s is busy\n", name); + lstcon_group_put(grp); + return -EBUSY; + } + + rc = lstcon_rpc_trans_ndlist(&grp->grp_ndl_list, + &grp->grp_trans_list, LST_TRANS_SESEND, + grp, lstcon_sesrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + lstcon_group_put(grp); + return rc; + } + + lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT); + + lstcon_rpc_trans_destroy(trans); + + lstcon_group_put(grp); + /* -ref for session, it's destroyed, + * status can't be rolled back, destroy group anyway */ + lstcon_group_put(grp); + + return rc; +} + +int +lstcon_group_clean(char *name, int args) +{ + lstcon_group_t *grp = NULL; + int rc; + + rc = lstcon_group_find(name, &grp); + if (rc != 0) { + CDEBUG(D_NET, "Can't find group %s\n", name); + return rc; + } + + if (grp->grp_ref > 2) { + /* referred by test */ + CDEBUG(D_NET, "Group %s is busy\n", name); + lstcon_group_put(grp); + return -EBUSY; + } + + args = (LST_NODE_ACTIVE | LST_NODE_BUSY | + LST_NODE_DOWN | LST_NODE_UNKNOWN) & ~args; + + lstcon_group_drain(grp, args); + + lstcon_group_put(grp); + /* release empty group */ + if (list_empty(&grp->grp_ndl_list)) + lstcon_group_put(grp); + + return 0; +} + +int +lstcon_nodes_remove(char *name, int count, + lnet_process_id_t *ids_up, struct list_head *result_up) +{ + lstcon_group_t *grp = NULL; + int rc; + + rc = lstcon_group_find(name, &grp); + if (rc != 0) { + CDEBUG(D_NET, "Can't find group: %s\n", name); + return rc; + } + + if (grp->grp_ref > 2) { + /* referred by test */ + CDEBUG(D_NET, "Group %s is busy\n", name); + lstcon_group_put(grp); + return -EBUSY; + } + + rc = lstcon_group_nodes_remove(grp, count, ids_up, result_up); + + lstcon_group_put(grp); + /* release empty group */ + if (list_empty(&grp->grp_ndl_list)) + lstcon_group_put(grp); + + return rc; +} + +int +lstcon_group_refresh(char *name, struct list_head *result_up) +{ + lstcon_rpc_trans_t *trans; + lstcon_group_t *grp; + int rc; + + rc = lstcon_group_find(name, &grp); + if (rc != 0) { + CDEBUG(D_NET, "Can't find group: %s\n", name); 
+ return rc; + } + + if (grp->grp_ref > 2) { + /* referred by test */ + CDEBUG(D_NET, "Group %s is busy\n", name); + lstcon_group_put(grp); + return -EBUSY; + } + + /* re-invite all inactive nodes into the group */ + rc = lstcon_rpc_trans_ndlist(&grp->grp_ndl_list, + &grp->grp_trans_list, LST_TRANS_SESNEW, + grp, lstcon_sesrpc_condition, &trans); + if (rc != 0) { + /* local error, return */ + CDEBUG(D_NET, "Can't create transaction: %d\n", rc); + lstcon_group_put(grp); + return rc; + } + + lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT); + + rc = lstcon_rpc_trans_interpreter(trans, result_up, NULL); + + lstcon_rpc_trans_destroy(trans); + /* -ref for me */ + lstcon_group_put(grp); + + return rc; +} + +int +lstcon_group_list(int index, int len, char *name_up) +{ + lstcon_group_t *grp; + + LASSERT(index >= 0); + LASSERT(name_up != NULL); + + list_for_each_entry(grp, &console_session.ses_grp_list, grp_link) { + if (index-- == 0) { + return copy_to_user(name_up, grp->grp_name, len) ? + -EFAULT : 0; + } + } + + return -ENOENT; +} + +static int +lstcon_nodes_getent(struct list_head *head, int *index_p, + int *count_p, lstcon_node_ent_t *dents_up) +{ + lstcon_ndlink_t *ndl; + lstcon_node_t *nd; + int count = 0; + int index = 0; + + LASSERT(index_p != NULL && count_p != NULL); + LASSERT(dents_up != NULL); + LASSERT(*index_p >= 0); + LASSERT(*count_p > 0); + + list_for_each_entry(ndl, head, ndl_link) { + if (index++ < *index_p) + continue; + + if (count >= *count_p) + break; + + nd = ndl->ndl_node; + if (copy_to_user(&dents_up[count].nde_id, + &nd->nd_id, sizeof(nd->nd_id)) || + copy_to_user(&dents_up[count].nde_state, + &nd->nd_state, sizeof(nd->nd_state))) + return -EFAULT; + + count++; + } + + if (index <= *index_p) + return -ENOENT; + + *count_p = count; + *index_p = index; + + return 0; +} + +int +lstcon_group_info(char *name, lstcon_ndlist_ent_t *gents_p, + int *index_p, int *count_p, lstcon_node_ent_t *dents_up) +{ + lstcon_ndlist_ent_t *gentp; + lstcon_group_t *grp; + lstcon_ndlink_t *ndl; + int rc; + + rc = lstcon_group_find(name, &grp); + if (rc != 0) { + CDEBUG(D_NET, "Can't find group %s\n", name); + return rc; + } + + if (dents_up) { + /* verbose query */ + rc = lstcon_nodes_getent(&grp->grp_ndl_list, + index_p, count_p, dents_up); + lstcon_group_put(grp); + + return rc; + } + + /* non-verbose query */ + LIBCFS_ALLOC(gentp, sizeof(lstcon_ndlist_ent_t)); + if (gentp == NULL) { + CERROR("Can't allocate ndlist_ent\n"); + lstcon_group_put(grp); + + return -ENOMEM; + } + + list_for_each_entry(ndl, &grp->grp_ndl_list, ndl_link) + LST_NODE_STATE_COUNTER(ndl->ndl_node, gentp); + + rc = copy_to_user(gents_p, gentp, + sizeof(lstcon_ndlist_ent_t)) ? -EFAULT: 0; + + LIBCFS_FREE(gentp, sizeof(lstcon_ndlist_ent_t)); + + lstcon_group_put(grp); + + return 0; +} + +static int +lstcon_batch_find(const char *name, lstcon_batch_t **batpp) +{ + lstcon_batch_t *bat; + + list_for_each_entry(bat, &console_session.ses_bat_list, bat_link) { + if (strncmp(bat->bat_name, name, LST_NAME_SIZE) == 0) { + *batpp = bat; + return 0; + } + } + + return -ENOENT; +} + +int +lstcon_batch_add(char *name) +{ + lstcon_batch_t *bat; + int i; + int rc; + + rc = (lstcon_batch_find(name, &bat) == 0)? 
-EEXIST: 0; + if (rc != 0) { + CDEBUG(D_NET, "Batch %s already exists\n", name); + return rc; + } + + LIBCFS_ALLOC(bat, sizeof(lstcon_batch_t)); + if (bat == NULL) { + CERROR("Can't allocate descriptor for batch %s\n", name); + return -ENOMEM; + } + + LIBCFS_ALLOC(bat->bat_cli_hash, + sizeof(struct list_head) * LST_NODE_HASHSIZE); + if (bat->bat_cli_hash == NULL) { + CERROR("Can't allocate hash for batch %s\n", name); + LIBCFS_FREE(bat, sizeof(lstcon_batch_t)); + + return -ENOMEM; + } + + LIBCFS_ALLOC(bat->bat_srv_hash, + sizeof(struct list_head) * LST_NODE_HASHSIZE); + if (bat->bat_srv_hash == NULL) { + CERROR("Can't allocate hash for batch %s\n", name); + LIBCFS_FREE(bat->bat_cli_hash, sizeof(struct list_head) * LST_NODE_HASHSIZE); + LIBCFS_FREE(bat, sizeof(lstcon_batch_t)); + + return -ENOMEM; + } + + strcpy(bat->bat_name, name); + bat->bat_hdr.tsb_index = 0; + bat->bat_hdr.tsb_id.bat_id = ++console_session.ses_id_cookie; + + bat->bat_ntest = 0; + bat->bat_state = LST_BATCH_IDLE; + + INIT_LIST_HEAD(&bat->bat_cli_list); + INIT_LIST_HEAD(&bat->bat_srv_list); + INIT_LIST_HEAD(&bat->bat_test_list); + INIT_LIST_HEAD(&bat->bat_trans_list); + + for (i = 0; i < LST_NODE_HASHSIZE; i++) { + INIT_LIST_HEAD(&bat->bat_cli_hash[i]); + INIT_LIST_HEAD(&bat->bat_srv_hash[i]); + } + + list_add_tail(&bat->bat_link, &console_session.ses_bat_list); + + return rc; +} + +int +lstcon_batch_list(int index, int len, char *name_up) +{ + lstcon_batch_t *bat; + + LASSERT(name_up != NULL); + LASSERT(index >= 0); + + list_for_each_entry(bat, &console_session.ses_bat_list, bat_link) { + if (index-- == 0) { + return copy_to_user(name_up, bat->bat_name, len) ? + -EFAULT: 0; + } + } + + return -ENOENT; +} + +int +lstcon_batch_info(char *name, lstcon_test_batch_ent_t *ent_up, int server, + int testidx, int *index_p, int *ndent_p, + lstcon_node_ent_t *dents_up) +{ + lstcon_test_batch_ent_t *entp; + struct list_head *clilst; + struct list_head *srvlst; + lstcon_test_t *test = NULL; + lstcon_batch_t *bat; + lstcon_ndlink_t *ndl; + int rc; + + rc = lstcon_batch_find(name, &bat); + if (rc != 0) { + CDEBUG(D_NET, "Can't find batch %s\n", name); + return -ENOENT; + } + + if (testidx > 0) { + /* query test, test index starts from 1 */ + list_for_each_entry(test, &bat->bat_test_list, tes_link) { + if (testidx-- == 1) + break; + } + + if (testidx > 0) { + CDEBUG(D_NET, "Can't find specified test in batch\n"); + return -ENOENT; + } + } + + clilst = (test == NULL) ? &bat->bat_cli_list : + &test->tes_src_grp->grp_ndl_list; + srvlst = (test == NULL) ? &bat->bat_srv_list : + &test->tes_dst_grp->grp_ndl_list; + + if (dents_up != NULL) { + rc = lstcon_nodes_getent((server ? srvlst: clilst), + index_p, ndent_p, dents_up); + return rc; + } + + /* non-verbose query */ + LIBCFS_ALLOC(entp, sizeof(lstcon_test_batch_ent_t)); + if (entp == NULL) + return -ENOMEM; + + if (test == NULL) { + entp->u.tbe_batch.bae_ntest = bat->bat_ntest; + entp->u.tbe_batch.bae_state = bat->bat_state; + + } else { + + entp->u.tbe_test.tse_type = test->tes_type; + entp->u.tbe_test.tse_loop = test->tes_loop; + entp->u.tbe_test.tse_concur = test->tes_concur; + } + + list_for_each_entry(ndl, clilst, ndl_link) + LST_NODE_STATE_COUNTER(ndl->ndl_node, &entp->tbe_cli_nle); + + list_for_each_entry(ndl, srvlst, ndl_link) + LST_NODE_STATE_COUNTER(ndl->ndl_node, &entp->tbe_srv_nle); + + rc = copy_to_user(ent_up, entp, + sizeof(lstcon_test_batch_ent_t)) ? 
-EFAULT : 0; + + LIBCFS_FREE(entp, sizeof(lstcon_test_batch_ent_t)); + + return rc; +} + +static int +lstcon_batrpc_condition(int transop, lstcon_node_t *nd, void *arg) +{ + switch (transop) { + case LST_TRANS_TSBRUN: + if (nd->nd_state != LST_NODE_ACTIVE) + return -ENETDOWN; + break; + + case LST_TRANS_TSBSTOP: + if (nd->nd_state != LST_NODE_ACTIVE) + return 0; + break; + + case LST_TRANS_TSBCLIQRY: + case LST_TRANS_TSBSRVQRY: + break; + } + + return 1; +} + +static int +lstcon_batch_op(lstcon_batch_t *bat, int transop, + struct list_head *result_up) +{ + lstcon_rpc_trans_t *trans; + int rc; + + rc = lstcon_rpc_trans_ndlist(&bat->bat_cli_list, + &bat->bat_trans_list, transop, + bat, lstcon_batrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + return rc; + } + + lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT); + + rc = lstcon_rpc_trans_interpreter(trans, result_up, NULL); + + lstcon_rpc_trans_destroy(trans); + + return rc; +} + +int +lstcon_batch_run(char *name, int timeout, struct list_head *result_up) +{ + lstcon_batch_t *bat; + int rc; + + if (lstcon_batch_find(name, &bat) != 0) { + CDEBUG(D_NET, "Can't find batch %s\n", name); + return -ENOENT; + } + + bat->bat_arg = timeout; + + rc = lstcon_batch_op(bat, LST_TRANS_TSBRUN, result_up); + + /* mark batch as running if it's started in any node */ + if (lstcon_tsbop_stat_success(lstcon_trans_stat(), 0) != 0) + bat->bat_state = LST_BATCH_RUNNING; + + return rc; +} + +int +lstcon_batch_stop(char *name, int force, struct list_head *result_up) +{ + lstcon_batch_t *bat; + int rc; + + if (lstcon_batch_find(name, &bat) != 0) { + CDEBUG(D_NET, "Can't find batch %s\n", name); + return -ENOENT; + } + + bat->bat_arg = force; + + rc = lstcon_batch_op(bat, LST_TRANS_TSBSTOP, result_up); + + /* mark batch as stopped if all RPCs finished */ + if (lstcon_tsbop_stat_failure(lstcon_trans_stat(), 0) == 0) + bat->bat_state = LST_BATCH_IDLE; + + return rc; +} + +static void +lstcon_batch_destroy(lstcon_batch_t *bat) +{ + lstcon_ndlink_t *ndl; + lstcon_test_t *test; + int i; + + list_del(&bat->bat_link); + + while (!list_empty(&bat->bat_test_list)) { + test = list_entry(bat->bat_test_list.next, + lstcon_test_t, tes_link); + LASSERT(list_empty(&test->tes_trans_list)); + + list_del(&test->tes_link); + + lstcon_group_put(test->tes_src_grp); + lstcon_group_put(test->tes_dst_grp); + + LIBCFS_FREE(test, offsetof(lstcon_test_t, + tes_param[test->tes_paramlen])); + } + + LASSERT(list_empty(&bat->bat_trans_list)); + + while (!list_empty(&bat->bat_cli_list)) { + ndl = list_entry(bat->bat_cli_list.next, + lstcon_ndlink_t, ndl_link); + list_del_init(&ndl->ndl_link); + + lstcon_ndlink_release(ndl); + } + + while (!list_empty(&bat->bat_srv_list)) { + ndl = list_entry(bat->bat_srv_list.next, + lstcon_ndlink_t, ndl_link); + list_del_init(&ndl->ndl_link); + + lstcon_ndlink_release(ndl); + } + + for (i = 0; i < LST_NODE_HASHSIZE; i++) { + LASSERT(list_empty(&bat->bat_cli_hash[i])); + LASSERT(list_empty(&bat->bat_srv_hash[i])); + } + + LIBCFS_FREE(bat->bat_cli_hash, + sizeof(struct list_head) * LST_NODE_HASHSIZE); + LIBCFS_FREE(bat->bat_srv_hash, + sizeof(struct list_head) * LST_NODE_HASHSIZE); + LIBCFS_FREE(bat, sizeof(lstcon_batch_t)); +} + +static int +lstcon_testrpc_condition(int transop, lstcon_node_t *nd, void *arg) +{ + lstcon_test_t *test; + lstcon_batch_t *batch; + lstcon_ndlink_t *ndl; + struct list_head *hash; + struct list_head *head; + + test = (lstcon_test_t *)arg; + LASSERT(test != NULL); + + batch = test->tes_batch; + 
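/* Decide whether this node takes part in the test: skip the server-side add for one-sided tests, require the node to be active, then link it into the batch's client or server list (creating the ndlink if necessary). */ +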
LASSERT(batch != NULL); + + if (test->tes_oneside && + transop == LST_TRANS_TSBSRVADD) + return 0; + + if (nd->nd_state != LST_NODE_ACTIVE) + return -ENETDOWN; + + if (transop == LST_TRANS_TSBCLIADD) { + hash = batch->bat_cli_hash; + head = &batch->bat_cli_list; + + } else { + LASSERT(transop == LST_TRANS_TSBSRVADD); + + hash = batch->bat_srv_hash; + head = &batch->bat_srv_list; + } + + LASSERT(nd->nd_id.nid != LNET_NID_ANY); + + if (lstcon_ndlink_find(hash, nd->nd_id, &ndl, 1) != 0) + return -ENOMEM; + + if (list_empty(&ndl->ndl_link)) + list_add_tail(&ndl->ndl_link, head); + + return 1; +} + +static int +lstcon_test_nodes_add(lstcon_test_t *test, struct list_head *result_up) +{ + lstcon_rpc_trans_t *trans; + lstcon_group_t *grp; + int transop; + int rc; + + LASSERT(test->tes_src_grp != NULL); + LASSERT(test->tes_dst_grp != NULL); + + transop = LST_TRANS_TSBSRVADD; + grp = test->tes_dst_grp; +again: + rc = lstcon_rpc_trans_ndlist(&grp->grp_ndl_list, + &test->tes_trans_list, transop, + test, lstcon_testrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + return rc; + } + + lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT); + + if (lstcon_trans_stat()->trs_rpc_errno != 0 || + lstcon_trans_stat()->trs_fwk_errno != 0) { + lstcon_rpc_trans_interpreter(trans, result_up, NULL); + + lstcon_rpc_trans_destroy(trans); + /* return if any error */ + CDEBUG(D_NET, "Failed to add test %s, RPC error %d, framework error %d\n", + transop == LST_TRANS_TSBCLIADD ? "client" : "server", + lstcon_trans_stat()->trs_rpc_errno, + lstcon_trans_stat()->trs_fwk_errno); + + return rc; + } + + lstcon_rpc_trans_destroy(trans); + + if (transop == LST_TRANS_TSBCLIADD) + return rc; + + transop = LST_TRANS_TSBCLIADD; + grp = test->tes_src_grp; + test->tes_cliidx = 0; + + /* requests to test clients */ + goto again; +} + +static int +lstcon_verify_batch(const char *name, lstcon_batch_t **batch) +{ + int rc; + + rc = lstcon_batch_find(name, batch); + if (rc != 0) { + CDEBUG(D_NET, "Can't find batch %s\n", name); + return rc; + } + + if ((*batch)->bat_state != LST_BATCH_IDLE) { + CDEBUG(D_NET, "Can't change running batch %s\n", name); + return -EINVAL; + } + + return 0; +} + +static int +lstcon_verify_group(const char *name, lstcon_group_t **grp) +{ + int rc; + lstcon_ndlink_t *ndl; + + rc = lstcon_group_find(name, grp); + if (rc != 0) { + CDEBUG(D_NET, "can't find group %s\n", name); + return rc; + } + + list_for_each_entry(ndl, &(*grp)->grp_ndl_list, ndl_link) { + if (ndl->ndl_node->nd_state == LST_NODE_ACTIVE) + return 0; + } + + CDEBUG(D_NET, "Group %s has no ACTIVE nodes\n", name); + + return -EINVAL; +} + +int +lstcon_test_add(char *batch_name, int type, int loop, + int concur, int dist, int span, + char *src_name, char *dst_name, + void *param, int paramlen, int *retp, + struct list_head *result_up) +{ + lstcon_test_t *test = NULL; + int rc; + lstcon_group_t *src_grp = NULL; + lstcon_group_t *dst_grp = NULL; + lstcon_batch_t *batch = NULL; + + /* + * verify that a batch of the given name exists, and the groups + * that will be part of the batch exist and have at least one + * active node + */ + rc = lstcon_verify_batch(batch_name, &batch); + if (rc != 0) + goto out; + + rc = lstcon_verify_group(src_name, &src_grp); + if (rc != 0) + goto out; + + rc = lstcon_verify_group(dst_name, &dst_grp); + if (rc != 0) + goto out; + + if (dst_grp->grp_userland) + *retp = 1; + + LIBCFS_ALLOC(test, offsetof(lstcon_test_t, tes_param[paramlen])); + if (!test) { + CERROR("Can't allocate test 
descriptor\n"); + rc = -ENOMEM; + + goto out; + } + + test->tes_hdr.tsb_id = batch->bat_hdr.tsb_id; + test->tes_batch = batch; + test->tes_type = type; + test->tes_oneside = 0; /* TODO */ + test->tes_loop = loop; + test->tes_concur = concur; + test->tes_stop_onerr = 1; /* TODO */ + test->tes_span = span; + test->tes_dist = dist; + test->tes_cliidx = 0; /* just used for creating RPC */ + test->tes_src_grp = src_grp; + test->tes_dst_grp = dst_grp; + INIT_LIST_HEAD(&test->tes_trans_list); + + if (param != NULL) { + test->tes_paramlen = paramlen; + memcpy(&test->tes_param[0], param, paramlen); + } + + rc = lstcon_test_nodes_add(test, result_up); + + if (rc != 0) + goto out; + + if (lstcon_trans_stat()->trs_rpc_errno != 0 || + lstcon_trans_stat()->trs_fwk_errno != 0) + CDEBUG(D_NET, "Failed to add test %d to batch %s\n", type, + batch_name); + + /* add to test list anyway, so user can check what's going on */ + list_add_tail(&test->tes_link, &batch->bat_test_list); + + batch->bat_ntest++; + test->tes_hdr.tsb_index = batch->bat_ntest; + + /* hold groups so nobody can change them */ + return rc; +out: + if (test != NULL) + LIBCFS_FREE(test, offsetof(lstcon_test_t, tes_param[paramlen])); + + if (dst_grp != NULL) + lstcon_group_put(dst_grp); + + if (src_grp != NULL) + lstcon_group_put(src_grp); + + return rc; +} + +static int +lstcon_test_find(lstcon_batch_t *batch, int idx, lstcon_test_t **testpp) +{ + lstcon_test_t *test; + + list_for_each_entry(test, &batch->bat_test_list, tes_link) { + if (idx == test->tes_hdr.tsb_index) { + *testpp = test; + return 0; + } + } + + return -ENOENT; +} + +static int +lstcon_tsbrpc_readent(int transop, srpc_msg_t *msg, + lstcon_rpc_ent_t *ent_up) +{ + srpc_batch_reply_t *rep = &msg->msg_body.bat_reply; + + LASSERT(transop == LST_TRANS_TSBCLIQRY || + transop == LST_TRANS_TSBSRVQRY); + + /* positive errno, framework error code */ + if (copy_to_user(&ent_up->rpe_priv[0], + &rep->bar_active, sizeof(rep->bar_active))) + return -EFAULT; + + return 0; +} + +int +lstcon_test_batch_query(char *name, int testidx, int client, + int timeout, struct list_head *result_up) +{ + lstcon_rpc_trans_t *trans; + struct list_head *translist; + struct list_head *ndlist; + lstcon_tsb_hdr_t *hdr; + lstcon_batch_t *batch; + lstcon_test_t *test = NULL; + int transop; + int rc; + + rc = lstcon_batch_find(name, &batch); + if (rc != 0) { + CDEBUG(D_NET, "Can't find batch: %s\n", name); + return rc; + } + + if (testidx == 0) { + translist = &batch->bat_trans_list; + ndlist = &batch->bat_cli_list; + hdr = &batch->bat_hdr; + + } else { + /* query specified test only */ + rc = lstcon_test_find(batch, testidx, &test); + if (rc != 0) { + CDEBUG(D_NET, "Can't find test: %d\n", testidx); + return rc; + } + + translist = &test->tes_trans_list; + ndlist = &test->tes_src_grp->grp_ndl_list; + hdr = &test->tes_hdr; + } + + transop = client ? 
LST_TRANS_TSBCLIQRY : LST_TRANS_TSBSRVQRY; + + rc = lstcon_rpc_trans_ndlist(ndlist, translist, transop, hdr, + lstcon_batrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + return rc; + } + + lstcon_rpc_trans_postwait(trans, timeout); + + if (testidx == 0 && /* query a batch, not a test */ + lstcon_rpc_stat_failure(lstcon_trans_stat(), 0) == 0 && + lstcon_tsbqry_stat_run(lstcon_trans_stat(), 0) == 0) { + /* all RPCs finished, and no active test */ + batch->bat_state = LST_BATCH_IDLE; + } + + rc = lstcon_rpc_trans_interpreter(trans, result_up, + lstcon_tsbrpc_readent); + lstcon_rpc_trans_destroy(trans); + + return rc; +} + +static int +lstcon_statrpc_readent(int transop, srpc_msg_t *msg, + lstcon_rpc_ent_t *ent_up) +{ + srpc_stat_reply_t *rep = &msg->msg_body.stat_reply; + sfw_counters_t *sfwk_stat; + srpc_counters_t *srpc_stat; + lnet_counters_t *lnet_stat; + + if (rep->str_status != 0) + return 0; + + sfwk_stat = (sfw_counters_t *)&ent_up->rpe_payload[0]; + srpc_stat = (srpc_counters_t *)((char *)sfwk_stat + sizeof(*sfwk_stat)); + lnet_stat = (lnet_counters_t *)((char *)srpc_stat + sizeof(*srpc_stat)); + + if (copy_to_user(sfwk_stat, &rep->str_fw, sizeof(*sfwk_stat)) || + copy_to_user(srpc_stat, &rep->str_rpc, sizeof(*srpc_stat)) || + copy_to_user(lnet_stat, &rep->str_lnet, sizeof(*lnet_stat))) + return -EFAULT; + + return 0; +} + +static int +lstcon_ndlist_stat(struct list_head *ndlist, + int timeout, struct list_head *result_up) +{ + struct list_head head; + lstcon_rpc_trans_t *trans; + int rc; + + INIT_LIST_HEAD(&head); + + rc = lstcon_rpc_trans_ndlist(ndlist, &head, + LST_TRANS_STATQRY, NULL, NULL, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + return rc; + } + + lstcon_rpc_trans_postwait(trans, LST_VALIDATE_TIMEOUT(timeout)); + + rc = lstcon_rpc_trans_interpreter(trans, result_up, + lstcon_statrpc_readent); + lstcon_rpc_trans_destroy(trans); + + return rc; +} + +int +lstcon_group_stat(char *grp_name, int timeout, struct list_head *result_up) +{ + lstcon_group_t *grp; + int rc; + + rc = lstcon_group_find(grp_name, &grp); + if (rc != 0) { + CDEBUG(D_NET, "Can't find group %s\n", grp_name); + return rc; + } + + rc = lstcon_ndlist_stat(&grp->grp_ndl_list, timeout, result_up); + + lstcon_group_put(grp); + + return rc; +} + +int +lstcon_nodes_stat(int count, lnet_process_id_t *ids_up, + int timeout, struct list_head *result_up) +{ + lstcon_ndlink_t *ndl; + lstcon_group_t *tmp; + lnet_process_id_t id; + int i; + int rc; + + rc = lstcon_group_alloc(NULL, &tmp); + if (rc != 0) { + CERROR("Out of memory\n"); + return -ENOMEM; + } + + for (i = 0 ; i < count; i++) { + if (copy_from_user(&id, &ids_up[i], sizeof(id))) { + rc = -EFAULT; + break; + } + + /* add to tmp group */ + rc = lstcon_group_ndlink_find(tmp, id, &ndl, 2); + if (rc != 0) { + CDEBUG((rc == -ENOMEM) ? 
D_ERROR : D_NET, + "Failed to find or create %s: %d\n", + libcfs_id2str(id), rc); + break; + } + } + + if (rc != 0) { + lstcon_group_put(tmp); + return rc; + } + + rc = lstcon_ndlist_stat(&tmp->grp_ndl_list, timeout, result_up); + + lstcon_group_put(tmp); + + return rc; +} + +static int +lstcon_debug_ndlist(struct list_head *ndlist, + struct list_head *translist, + int timeout, struct list_head *result_up) +{ + lstcon_rpc_trans_t *trans; + int rc; + + rc = lstcon_rpc_trans_ndlist(ndlist, translist, LST_TRANS_SESQRY, + NULL, lstcon_sesrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + return rc; + } + + lstcon_rpc_trans_postwait(trans, LST_VALIDATE_TIMEOUT(timeout)); + + rc = lstcon_rpc_trans_interpreter(trans, result_up, + lstcon_sesrpc_readent); + lstcon_rpc_trans_destroy(trans); + + return rc; +} + +int +lstcon_session_debug(int timeout, struct list_head *result_up) +{ + return lstcon_debug_ndlist(&console_session.ses_ndl_list, + NULL, timeout, result_up); +} + +int +lstcon_batch_debug(int timeout, char *name, + int client, struct list_head *result_up) +{ + lstcon_batch_t *bat; + int rc; + + rc = lstcon_batch_find(name, &bat); + if (rc != 0) + return -ENOENT; + + rc = lstcon_debug_ndlist(client ? &bat->bat_cli_list : + &bat->bat_srv_list, + NULL, timeout, result_up); + + return rc; +} + +int +lstcon_group_debug(int timeout, char *name, + struct list_head *result_up) +{ + lstcon_group_t *grp; + int rc; + + rc = lstcon_group_find(name, &grp); + if (rc != 0) + return -ENOENT; + + rc = lstcon_debug_ndlist(&grp->grp_ndl_list, NULL, + timeout, result_up); + lstcon_group_put(grp); + + return rc; +} + +int +lstcon_nodes_debug(int timeout, + int count, lnet_process_id_t *ids_up, + struct list_head *result_up) +{ + lnet_process_id_t id; + lstcon_ndlink_t *ndl; + lstcon_group_t *grp; + int i; + int rc; + + rc = lstcon_group_alloc(NULL, &grp); + if (rc != 0) { + CDEBUG(D_NET, "Out of memory\n"); + return rc; + } + + for (i = 0; i < count; i++) { + if (copy_from_user(&id, &ids_up[i], sizeof(id))) { + rc = -EFAULT; + break; + } + + /* node is added to tmp group */ + rc = lstcon_group_ndlink_find(grp, id, &ndl, 1); + if (rc != 0) { + CERROR("Can't create node link\n"); + break; + } + } + + if (rc != 0) { + lstcon_group_put(grp); + return rc; + } + + rc = lstcon_debug_ndlist(&grp->grp_ndl_list, NULL, + timeout, result_up); + + lstcon_group_put(grp); + + return rc; +} + +int +lstcon_session_match(lst_sid_t sid) +{ + return (console_session.ses_id.ses_nid == sid.ses_nid && + console_session.ses_id.ses_stamp == sid.ses_stamp) ? 
1: 0; +} + +static void +lstcon_new_session_id(lst_sid_t *sid) +{ + lnet_process_id_t id; + + LASSERT(console_session.ses_state == LST_SESSION_NONE); + + LNetGetId(1, &id); + sid->ses_nid = id.nid; + sid->ses_stamp = cfs_time_current(); +} + +extern srpc_service_t lstcon_acceptor_service; + +int +lstcon_session_new(char *name, int key, unsigned feats, + int timeout, int force, lst_sid_t *sid_up) +{ + int rc = 0; + int i; + + if (console_session.ses_state != LST_SESSION_NONE) { + /* session exists */ + if (!force) { + CNETERR("Session %s already exists\n", + console_session.ses_name); + return -EEXIST; + } + + rc = lstcon_session_end(); + + /* lstcon_session_end() only return local error */ + if (rc != 0) + return rc; + } + + if ((feats & ~LST_FEATS_MASK) != 0) { + CNETERR("Unknown session features %x\n", + (feats & ~LST_FEATS_MASK)); + return -EINVAL; + } + + for (i = 0; i < LST_GLOBAL_HASHSIZE; i++) + LASSERT(list_empty(&console_session.ses_ndl_hash[i])); + + lstcon_new_session_id(&console_session.ses_id); + + console_session.ses_key = key; + console_session.ses_state = LST_SESSION_ACTIVE; + console_session.ses_force = !!force; + console_session.ses_features = feats; + console_session.ses_feats_updated = 0; + console_session.ses_timeout = (timeout <= 0) ? + LST_CONSOLE_TIMEOUT : timeout; + strcpy(console_session.ses_name, name); + + rc = lstcon_batch_add(LST_DEFAULT_BATCH); + if (rc != 0) + return rc; + + rc = lstcon_rpc_pinger_start(); + if (rc != 0) { + lstcon_batch_t *bat = NULL; + + lstcon_batch_find(LST_DEFAULT_BATCH, &bat); + lstcon_batch_destroy(bat); + + return rc; + } + + if (copy_to_user(sid_up, &console_session.ses_id, + sizeof(lst_sid_t)) == 0) + return rc; + + lstcon_session_end(); + + return -EFAULT; +} + +int +lstcon_session_info(lst_sid_t *sid_up, int *key_up, unsigned *featp, + lstcon_ndlist_ent_t *ndinfo_up, char *name_up, int len) +{ + lstcon_ndlist_ent_t *entp; + lstcon_ndlink_t *ndl; + int rc = 0; + + if (console_session.ses_state != LST_SESSION_ACTIVE) + return -ESRCH; + + LIBCFS_ALLOC(entp, sizeof(*entp)); + if (entp == NULL) + return -ENOMEM; + + list_for_each_entry(ndl, &console_session.ses_ndl_list, ndl_link) + LST_NODE_STATE_COUNTER(ndl->ndl_node, entp); + + if (copy_to_user(sid_up, &console_session.ses_id, + sizeof(lst_sid_t)) || + copy_to_user(key_up, &console_session.ses_key, + sizeof(*key_up)) || + copy_to_user(featp, &console_session.ses_features, + sizeof(*featp)) || + copy_to_user(ndinfo_up, entp, sizeof(*entp)) || + copy_to_user(name_up, console_session.ses_name, len)) + rc = -EFAULT; + + LIBCFS_FREE(entp, sizeof(*entp)); + + return rc; +} + +int +lstcon_session_end(void) +{ + lstcon_rpc_trans_t *trans; + lstcon_group_t *grp; + lstcon_batch_t *bat; + int rc = 0; + + LASSERT(console_session.ses_state == LST_SESSION_ACTIVE); + + rc = lstcon_rpc_trans_ndlist(&console_session.ses_ndl_list, + NULL, LST_TRANS_SESEND, NULL, + lstcon_sesrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + return rc; + } + + console_session.ses_shutdown = 1; + + lstcon_rpc_pinger_stop(); + + lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT); + + lstcon_rpc_trans_destroy(trans); + /* User can do nothing even rpc failed, so go on */ + + /* waiting for orphan rpcs to die */ + lstcon_rpc_cleanup_wait(); + + console_session.ses_id = LST_INVALID_SID; + console_session.ses_state = LST_SESSION_NONE; + console_session.ses_key = 0; + console_session.ses_force = 0; + console_session.ses_feats_updated = 0; + + /* destroy all batches */ + while 
(!list_empty(&console_session.ses_bat_list)) { + bat = list_entry(console_session.ses_bat_list.next, + lstcon_batch_t, bat_link); + + lstcon_batch_destroy(bat); + } + + /* destroy all groups */ + while (!list_empty(&console_session.ses_grp_list)) { + grp = list_entry(console_session.ses_grp_list.next, + lstcon_group_t, grp_link); + LASSERT(grp->grp_ref == 1); + + lstcon_group_put(grp); + } + + /* all nodes should be released */ + LASSERT(list_empty(&console_session.ses_ndl_list)); + + console_session.ses_shutdown = 0; + console_session.ses_expired = 0; + + return rc; +} + +int +lstcon_session_feats_check(unsigned feats) +{ + int rc = 0; + + if ((feats & ~LST_FEATS_MASK) != 0) { + CERROR("Can't support these features: %x\n", + (feats & ~LST_FEATS_MASK)); + return -EPROTO; + } + + spin_lock(&console_session.ses_rpc_lock); + + if (!console_session.ses_feats_updated) { + console_session.ses_feats_updated = 1; + console_session.ses_features = feats; + } + + if (console_session.ses_features != feats) + rc = -EPROTO; + + spin_unlock(&console_session.ses_rpc_lock); + + if (rc != 0) { + CERROR("remote features %x do not match the console's session features %x\n", + feats, console_session.ses_features); + } + + return rc; +} + +static int +lstcon_acceptor_handle(srpc_server_rpc_t *rpc) +{ + srpc_msg_t *rep = &rpc->srpc_replymsg; + srpc_msg_t *req = &rpc->srpc_reqstbuf->buf_msg; + srpc_join_reqst_t *jreq = &req->msg_body.join_reqst; + srpc_join_reply_t *jrep = &rep->msg_body.join_reply; + lstcon_group_t *grp = NULL; + lstcon_ndlink_t *ndl; + int rc = 0; + + sfw_unpack_message(req); + + mutex_lock(&console_session.ses_mutex); + + jrep->join_sid = console_session.ses_id; + + if (console_session.ses_id.ses_nid == LNET_NID_ANY) { + jrep->join_status = ESRCH; + goto out; + } + + if (lstcon_session_feats_check(req->msg_ses_feats) != 0) { + jrep->join_status = EPROTO; + goto out; + } + + if (jreq->join_sid.ses_nid != LNET_NID_ANY && + !lstcon_session_match(jreq->join_sid)) { + jrep->join_status = EBUSY; + goto out; + } + + if (lstcon_group_find(jreq->join_group, &grp) != 0) { + rc = lstcon_group_alloc(jreq->join_group, &grp); + if (rc != 0) { + CERROR("Out of memory\n"); + goto out; + } + + list_add_tail(&grp->grp_link, + &console_session.ses_grp_list); + lstcon_group_addref(grp); + } + + if (grp->grp_ref > 2) { + /* group is in use */ + jrep->join_status = EBUSY; + goto out; + } + + rc = lstcon_group_ndlink_find(grp, rpc->srpc_peer, &ndl, 0); + if (rc == 0) { + jrep->join_status = EEXIST; + goto out; + } + + rc = lstcon_group_ndlink_find(grp, rpc->srpc_peer, &ndl, 1); + if (rc != 0) { + CERROR("Out of memory\n"); + goto out; + } + + ndl->ndl_node->nd_state = LST_NODE_ACTIVE; + ndl->ndl_node->nd_timeout = console_session.ses_timeout; + + if (grp->grp_userland == 0) + grp->grp_userland = 1; + + strcpy(jrep->join_session, console_session.ses_name); + jrep->join_timeout = console_session.ses_timeout; + jrep->join_status = 0; + +out: + rep->msg_ses_feats = console_session.ses_features; + if (grp != NULL) + lstcon_group_put(grp); + + mutex_unlock(&console_session.ses_mutex); + + return rc; +} + +srpc_service_t lstcon_acceptor_service; +static void lstcon_init_acceptor_service(void) +{ + /* initialize selftest console acceptor service table */ + lstcon_acceptor_service.sv_name = "join session"; + lstcon_acceptor_service.sv_handler = lstcon_acceptor_handle; + lstcon_acceptor_service.sv_id = SRPC_SERVICE_JOIN; + lstcon_acceptor_service.sv_wi_total = SFW_FRWK_WI_MAX; +} + +extern int lstcon_ioctl_entry(unsigned int 
cmd, struct libcfs_ioctl_data *data); + +static DECLARE_IOCTL_HANDLER(lstcon_ioctl_handler, lstcon_ioctl_entry); + +/* initialize console */ +int +lstcon_console_init(void) +{ + int i; + int rc; + + memset(&console_session, 0, sizeof(lstcon_session_t)); + + console_session.ses_id = LST_INVALID_SID; + console_session.ses_state = LST_SESSION_NONE; + console_session.ses_timeout = 0; + console_session.ses_force = 0; + console_session.ses_expired = 0; + console_session.ses_feats_updated = 0; + console_session.ses_features = LST_FEATS_MASK; + console_session.ses_laststamp = get_seconds(); + + mutex_init(&console_session.ses_mutex); + + INIT_LIST_HEAD(&console_session.ses_ndl_list); + INIT_LIST_HEAD(&console_session.ses_grp_list); + INIT_LIST_HEAD(&console_session.ses_bat_list); + INIT_LIST_HEAD(&console_session.ses_trans_list); + + LIBCFS_ALLOC(console_session.ses_ndl_hash, + sizeof(struct list_head) * LST_GLOBAL_HASHSIZE); + if (console_session.ses_ndl_hash == NULL) + return -ENOMEM; + + for (i = 0; i < LST_GLOBAL_HASHSIZE; i++) + INIT_LIST_HEAD(&console_session.ses_ndl_hash[i]); + + + /* initialize acceptor service table */ + lstcon_init_acceptor_service(); + + rc = srpc_add_service(&lstcon_acceptor_service); + LASSERT(rc != -EBUSY); + if (rc != 0) { + LIBCFS_FREE(console_session.ses_ndl_hash, + sizeof(struct list_head) * LST_GLOBAL_HASHSIZE); + return rc; + } + + rc = srpc_service_add_buffers(&lstcon_acceptor_service, + lstcon_acceptor_service.sv_wi_total); + if (rc != 0) { + rc = -ENOMEM; + goto out; + } + + rc = libcfs_register_ioctl(&lstcon_ioctl_handler); + + if (rc == 0) { + lstcon_rpc_module_init(); + return 0; + } + +out: + srpc_shutdown_service(&lstcon_acceptor_service); + srpc_remove_service(&lstcon_acceptor_service); + + LIBCFS_FREE(console_session.ses_ndl_hash, + sizeof(struct list_head) * LST_GLOBAL_HASHSIZE); + + srpc_wait_service_shutdown(&lstcon_acceptor_service); + + return rc; +} + +int +lstcon_console_fini(void) +{ + int i; + + libcfs_deregister_ioctl(&lstcon_ioctl_handler); + + mutex_lock(&console_session.ses_mutex); + + srpc_shutdown_service(&lstcon_acceptor_service); + srpc_remove_service(&lstcon_acceptor_service); + + if (console_session.ses_state != LST_SESSION_NONE) + lstcon_session_end(); + + lstcon_rpc_module_fini(); + + mutex_unlock(&console_session.ses_mutex); + + LASSERT(list_empty(&console_session.ses_ndl_list)); + LASSERT(list_empty(&console_session.ses_grp_list)); + LASSERT(list_empty(&console_session.ses_bat_list)); + LASSERT(list_empty(&console_session.ses_trans_list)); + + for (i = 0; i < LST_NODE_HASHSIZE; i++) { + LASSERT(list_empty(&console_session.ses_ndl_hash[i])); + } + + LIBCFS_FREE(console_session.ses_ndl_hash, + sizeof(struct list_head) * LST_GLOBAL_HASHSIZE); + + srpc_wait_service_shutdown(&lstcon_acceptor_service); + + return 0; +} diff --git a/kernel/drivers/staging/lustre/lnet/selftest/console.h b/kernel/drivers/staging/lustre/lnet/selftest/console.h new file mode 100644 index 000000000..e41ca89f1 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/selftest/console.h @@ -0,0 +1,235 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/selftest/console.h + * + * kernel structure for LST console + * + * Author: Liang Zhen + */ + +#ifndef __LST_CONSOLE_H__ +#define __LST_CONSOLE_H__ + + +#include "../../include/linux/libcfs/libcfs.h" +#include "../../include/linux/lnet/lnet.h" +#include "../../include/linux/lnet/lib-types.h" +#include "../../include/linux/lnet/lnetst.h" +#include "selftest.h" +#include "conrpc.h" + +typedef struct lstcon_node { + lnet_process_id_t nd_id; /* id of the node */ + int nd_ref; /* reference count */ + int nd_state; /* state of the node */ + int nd_timeout; /* session timeout */ + unsigned long nd_stamp; /* timestamp of last replied RPC */ + struct lstcon_rpc nd_ping; /* ping rpc */ +} lstcon_node_t; /*** node descriptor */ + +typedef struct { + struct list_head ndl_link; /* chain on list */ + struct list_head ndl_hlink; /* chain on hash */ + lstcon_node_t *ndl_node; /* pointer to node */ +} lstcon_ndlink_t; /*** node link descriptor */ + +typedef struct { + struct list_head grp_link; /* chain on global group list */ + int grp_ref; /* reference count */ + int grp_userland; /* has userland nodes */ + int grp_nnode; /* # of nodes */ + char grp_name[LST_NAME_SIZE]; /* group name */ + + struct list_head grp_trans_list; /* transaction list */ + struct list_head grp_ndl_list; /* nodes list */ + struct list_head grp_ndl_hash[0];/* hash table for nodes */ +} lstcon_group_t; /*** (alias of nodes) group descriptor */ + +#define LST_BATCH_IDLE 0xB0 /* idle batch */ +#define LST_BATCH_RUNNING 0xB1 /* running batch */ + +typedef struct lstcon_tsb_hdr { + lst_bid_t tsb_id; /* batch ID */ + int tsb_index; /* test index */ +} lstcon_tsb_hdr_t; + +typedef struct { + lstcon_tsb_hdr_t bat_hdr; /* test_batch header */ + struct list_head bat_link; /* chain on session's batches list */ + int bat_ntest; /* # of test */ + int bat_state; /* state of the batch */ + int bat_arg; /* parameter for run|stop, timeout for run, force for stop */ + char bat_name[LST_NAME_SIZE]; /* name of batch */ + + struct list_head bat_test_list; /* list head of tests (lstcon_test_t) */ + struct list_head bat_trans_list; /* list head of transaction */ + struct list_head bat_cli_list; /* list head of client nodes (lstcon_node_t) */ + struct list_head *bat_cli_hash; /* hash table of client nodes */ + struct list_head bat_srv_list; /* list head of server nodes */ + struct list_head *bat_srv_hash; /* hash table of server nodes */ +} lstcon_batch_t; /*** (tests ) batch descriptor */ + +typedef struct lstcon_test { + 
lstcon_tsb_hdr_t tes_hdr; /* test batch header */ + struct list_head tes_link; /* chain on batch's tests list */ + lstcon_batch_t *tes_batch; /* pointer to batch */ + + int tes_type; /* type of the test, i.e: bulk, ping */ + int tes_stop_onerr; /* stop on error */ + int tes_oneside; /* one-sided test */ + int tes_concur; /* concurrency */ + int tes_loop; /* loop count */ + int tes_dist; /* nodes distribution of target group */ + int tes_span; /* nodes span of target group */ + int tes_cliidx; /* client index, used for RPC creating */ + + struct list_head tes_trans_list; /* transaction list */ + lstcon_group_t *tes_src_grp; /* group run the test */ + lstcon_group_t *tes_dst_grp; /* target group */ + + int tes_paramlen; /* test parameter length */ + char tes_param[0]; /* test parameter */ +} lstcon_test_t; /*** a single test descriptor */ + +#define LST_GLOBAL_HASHSIZE 503 /* global nodes hash table size */ +#define LST_NODE_HASHSIZE 239 /* node hash table (for batch or group) */ + +#define LST_SESSION_NONE 0x0 /* no session */ +#define LST_SESSION_ACTIVE 0x1 /* working session */ + +#define LST_CONSOLE_TIMEOUT 300 /* default console timeout */ + +typedef struct { + struct mutex ses_mutex; /* only 1 thread in session */ + lst_sid_t ses_id; /* global session id */ + int ses_key; /* local session key */ + int ses_state; /* state of session */ + int ses_timeout; /* timeout in seconds */ + time_t ses_laststamp; /* last operation stamp (seconds) */ + /** tests features of the session */ + unsigned ses_features; + /** features are synced with remote test nodes */ + unsigned ses_feats_updated:1; + /** force creating */ + unsigned ses_force:1; + /** session is shutting down */ + unsigned ses_shutdown:1; + /** console is timedout */ + unsigned ses_expired:1; + __u64 ses_id_cookie; /* batch id cookie */ + char ses_name[LST_NAME_SIZE]; /* session name */ + lstcon_rpc_trans_t *ses_ping; /* session pinger */ + stt_timer_t ses_ping_timer; /* timer for pinger */ + lstcon_trans_stat_t ses_trans_stat; /* transaction stats */ + + struct list_head ses_trans_list; /* global list of transaction */ + struct list_head ses_grp_list; /* global list of groups */ + struct list_head ses_bat_list; /* global list of batches */ + struct list_head ses_ndl_list; /* global list of nodes */ + struct list_head *ses_ndl_hash; /* hash table of nodes */ + + spinlock_t ses_rpc_lock; /* serialize */ + atomic_t ses_rpc_counter;/* # of initialized RPCs */ + struct list_head ses_rpc_freelist; /* idle console rpc */ +} lstcon_session_t; /*** session descriptor */ + +extern lstcon_session_t console_session; + +static inline lstcon_trans_stat_t * +lstcon_trans_stat(void) +{ + return &console_session.ses_trans_stat; +} + +static inline struct list_head * +lstcon_id2hash (lnet_process_id_t id, struct list_head *hash) +{ + unsigned int idx = LNET_NIDADDR(id.nid) % LST_NODE_HASHSIZE; + + return &hash[idx]; +} + +int lstcon_console_init(void); +int lstcon_ioctl_entry(unsigned int cmd, struct libcfs_ioctl_data *data); +int lstcon_console_fini(void); +extern int lstcon_session_match(lst_sid_t sid); +extern int lstcon_session_new(char *name, int key, unsigned version, + int timeout, int flags, lst_sid_t *sid_up); +extern int lstcon_session_info(lst_sid_t *sid_up, int *key, unsigned *verp, + lstcon_ndlist_ent_t *entp, char *name_up, int len); +extern int lstcon_session_end(void); +extern int lstcon_session_debug(int timeout, struct list_head *result_up); +extern int lstcon_session_feats_check(unsigned feats); +extern int lstcon_batch_debug(int 
timeout, char *name, + int client, struct list_head *result_up); +extern int lstcon_group_debug(int timeout, char *name, + struct list_head *result_up); +extern int lstcon_nodes_debug(int timeout, int nnd, lnet_process_id_t *nds_up, + struct list_head *result_up); +extern int lstcon_group_add(char *name); +extern int lstcon_group_del(char *name); +extern int lstcon_group_clean(char *name, int args); +extern int lstcon_group_refresh(char *name, struct list_head *result_up); +extern int lstcon_nodes_add(char *name, int nnd, lnet_process_id_t *nds_up, + unsigned *featp, struct list_head *result_up); +extern int lstcon_nodes_remove(char *name, int nnd, lnet_process_id_t *nds_up, + struct list_head *result_up); +extern int lstcon_group_info(char *name, lstcon_ndlist_ent_t *gent_up, + int *index_p, int *ndent_p, lstcon_node_ent_t *ndents_up); +extern int lstcon_group_list(int idx, int len, char *name_up); +extern int lstcon_batch_add(char *name); +extern int lstcon_batch_run(char *name, int timeout, + struct list_head *result_up); +extern int lstcon_batch_stop(char *name, int force, + struct list_head *result_up); +extern int lstcon_test_batch_query(char *name, int testidx, + int client, int timeout, + struct list_head *result_up); +extern int lstcon_batch_del(char *name); +extern int lstcon_batch_list(int idx, int namelen, char *name_up); +extern int lstcon_batch_info(char *name, lstcon_test_batch_ent_t *ent_up, + int server, int testidx, int *index_p, + int *ndent_p, lstcon_node_ent_t *dents_up); +extern int lstcon_group_stat(char *grp_name, int timeout, + struct list_head *result_up); +extern int lstcon_nodes_stat(int count, lnet_process_id_t *ids_up, + int timeout, struct list_head *result_up); +extern int lstcon_test_add(char *batch_name, int type, int loop, + int concur, int dist, int span, + char *src_name, char *dst_name, + void *param, int paramlen, int *retp, + struct list_head *result_up); +#endif diff --git a/kernel/drivers/staging/lustre/lnet/selftest/framework.c b/kernel/drivers/staging/lustre/lnet/selftest/framework.c new file mode 100644 index 000000000..a93a90de0 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/selftest/framework.c @@ -0,0 +1,1804 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lnet/selftest/framework.c + * + * Author: Isaac Huang + * Author: Liang Zhen + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include "selftest.h" + +lst_sid_t LST_INVALID_SID = {LNET_NID_ANY, -1}; + +static int session_timeout = 100; +module_param(session_timeout, int, 0444); +MODULE_PARM_DESC(session_timeout, "test session timeout in seconds (100 by default, 0 == never)"); + +static int rpc_timeout = 64; +module_param(rpc_timeout, int, 0644); +MODULE_PARM_DESC(rpc_timeout, "rpc timeout in seconds (64 by default, 0 == never)"); + +#define sfw_unpack_id(id) \ +do { \ + __swab64s(&(id).nid); \ + __swab32s(&(id).pid); \ +} while (0) + +#define sfw_unpack_sid(sid) \ +do { \ + __swab64s(&(sid).ses_nid); \ + __swab64s(&(sid).ses_stamp); \ +} while (0) + +#define sfw_unpack_fw_counters(fc) \ +do { \ + __swab32s(&(fc).running_ms); \ + __swab32s(&(fc).active_batches); \ + __swab32s(&(fc).zombie_sessions); \ + __swab32s(&(fc).brw_errors); \ + __swab32s(&(fc).ping_errors); \ +} while (0) + +#define sfw_unpack_rpc_counters(rc) \ +do { \ + __swab32s(&(rc).errors); \ + __swab32s(&(rc).rpcs_sent); \ + __swab32s(&(rc).rpcs_rcvd); \ + __swab32s(&(rc).rpcs_dropped); \ + __swab32s(&(rc).rpcs_expired); \ + __swab64s(&(rc).bulk_get); \ + __swab64s(&(rc).bulk_put); \ +} while (0) + +#define sfw_unpack_lnet_counters(lc) \ +do { \ + __swab32s(&(lc).errors); \ + __swab32s(&(lc).msgs_max); \ + __swab32s(&(lc).msgs_alloc); \ + __swab32s(&(lc).send_count); \ + __swab32s(&(lc).recv_count); \ + __swab32s(&(lc).drop_count); \ + __swab32s(&(lc).route_count); \ + __swab64s(&(lc).send_length); \ + __swab64s(&(lc).recv_length); \ + __swab64s(&(lc).drop_length); \ + __swab64s(&(lc).route_length); \ +} while (0) + +#define sfw_test_active(t) (atomic_read(&(t)->tsi_nactive) != 0) +#define sfw_batch_active(b) (atomic_read(&(b)->bat_nactive) != 0) + +static struct smoketest_framework { + struct list_head fw_zombie_rpcs; /* RPCs to be recycled */ + struct list_head fw_zombie_sessions; /* stopping sessions */ + struct list_head fw_tests; /* registered test cases */ + atomic_t fw_nzombies; /* # zombie sessions */ + spinlock_t fw_lock; /* serialise */ + sfw_session_t *fw_session; /* _the_ session */ + int fw_shuttingdown; /* shutdown in progress */ + srpc_server_rpc_t *fw_active_srpc; /* running RPC */ +} sfw_data; + +/* forward ref's */ +int sfw_stop_batch(sfw_batch_t *tsb, int force); +void sfw_destroy_session(sfw_session_t *sn); + +static inline sfw_test_case_t * +sfw_find_test_case(int id) +{ + sfw_test_case_t *tsc; + + LASSERT(id <= SRPC_SERVICE_MAX_ID); + LASSERT(id > SRPC_FRAMEWORK_SERVICE_MAX_ID); + + list_for_each_entry(tsc, &sfw_data.fw_tests, tsc_list) { + if (tsc->tsc_srv_service->sv_id == id) + return tsc; + } + + return NULL; +} + +static int +sfw_register_test(srpc_service_t *service, sfw_test_client_ops_t *cliops) +{ + sfw_test_case_t *tsc; + + if (sfw_find_test_case(service->sv_id) != NULL) { + CERROR("Failed to register test %s (%d)\n", + service->sv_name, service->sv_id); + return -EEXIST; + } + + LIBCFS_ALLOC(tsc, sizeof(sfw_test_case_t)); + if (tsc == NULL) + return -ENOMEM; + + tsc->tsc_cli_ops = cliops; + tsc->tsc_srv_service = service; + + list_add_tail(&tsc->tsc_list, &sfw_data.fw_tests); + return 0; +} + +static void +sfw_add_session_timer(void) +{ + sfw_session_t *sn = sfw_data.fw_session; + stt_timer_t *timer = &sn->sn_timer; + + LASSERT(!sfw_data.fw_shuttingdown); + + if (sn == NULL || sn->sn_timeout == 0) + return; + + LASSERT(!sn->sn_timer_active); + + sn->sn_timer_active = 1; + timer->stt_expires = 
cfs_time_add(sn->sn_timeout, + get_seconds()); + stt_add_timer(timer); + return; +} + +static int +sfw_del_session_timer(void) +{ + sfw_session_t *sn = sfw_data.fw_session; + + if (sn == NULL || !sn->sn_timer_active) + return 0; + + LASSERT(sn->sn_timeout != 0); + + if (stt_del_timer(&sn->sn_timer)) { /* timer defused */ + sn->sn_timer_active = 0; + return 0; + } + + return EBUSY; /* racing with sfw_session_expired() */ +} + +static void +sfw_deactivate_session(void) + __must_hold(&sfw_data.fw_lock) +{ + sfw_session_t *sn = sfw_data.fw_session; + int nactive = 0; + sfw_batch_t *tsb; + sfw_test_case_t *tsc; + + if (sn == NULL) return; + + LASSERT(!sn->sn_timer_active); + + sfw_data.fw_session = NULL; + atomic_inc(&sfw_data.fw_nzombies); + list_add(&sn->sn_list, &sfw_data.fw_zombie_sessions); + + spin_unlock(&sfw_data.fw_lock); + + list_for_each_entry(tsc, &sfw_data.fw_tests, tsc_list) { + srpc_abort_service(tsc->tsc_srv_service); + } + + spin_lock(&sfw_data.fw_lock); + + list_for_each_entry(tsb, &sn->sn_batches, bat_list) { + if (sfw_batch_active(tsb)) { + nactive++; + sfw_stop_batch(tsb, 1); + } + } + + if (nactive != 0) + return; /* wait for active batches to stop */ + + list_del_init(&sn->sn_list); + spin_unlock(&sfw_data.fw_lock); + + sfw_destroy_session(sn); + + spin_lock(&sfw_data.fw_lock); +} + + +static void +sfw_session_expired(void *data) +{ + sfw_session_t *sn = data; + + spin_lock(&sfw_data.fw_lock); + + LASSERT(sn->sn_timer_active); + LASSERT(sn == sfw_data.fw_session); + + CWARN("Session expired! sid: %s-%llu, name: %s\n", + libcfs_nid2str(sn->sn_id.ses_nid), + sn->sn_id.ses_stamp, &sn->sn_name[0]); + + sn->sn_timer_active = 0; + sfw_deactivate_session(); + + spin_unlock(&sfw_data.fw_lock); +} + +static inline void +sfw_init_session(sfw_session_t *sn, lst_sid_t sid, + unsigned features, const char *name) +{ + stt_timer_t *timer = &sn->sn_timer; + + memset(sn, 0, sizeof(sfw_session_t)); + INIT_LIST_HEAD(&sn->sn_list); + INIT_LIST_HEAD(&sn->sn_batches); + atomic_set(&sn->sn_refcount, 1); /* +1 for caller */ + atomic_set(&sn->sn_brw_errors, 0); + atomic_set(&sn->sn_ping_errors, 0); + strlcpy(&sn->sn_name[0], name, sizeof(sn->sn_name)); + + sn->sn_timer_active = 0; + sn->sn_id = sid; + sn->sn_features = features; + sn->sn_timeout = session_timeout; + sn->sn_started = cfs_time_current(); + + timer->stt_data = sn; + timer->stt_func = sfw_session_expired; + INIT_LIST_HEAD(&timer->stt_list); +} + +/* completion handler for incoming framework RPCs */ +static void +sfw_server_rpc_done(struct srpc_server_rpc *rpc) +{ + struct srpc_service *sv = rpc->srpc_scd->scd_svc; + int status = rpc->srpc_status; + + CDEBUG(D_NET, + "Incoming framework RPC done: service %s, peer %s, status %s:%d\n", + sv->sv_name, libcfs_id2str(rpc->srpc_peer), + swi_state2str(rpc->srpc_wi.swi_state), + status); + + if (rpc->srpc_bulk != NULL) + sfw_free_pages(rpc); + return; +} + +static void +sfw_client_rpc_fini(srpc_client_rpc_t *rpc) +{ + LASSERT(rpc->crpc_bulk.bk_niov == 0); + LASSERT(list_empty(&rpc->crpc_list)); + LASSERT(atomic_read(&rpc->crpc_refcount) == 0); + + CDEBUG(D_NET, + "Outgoing framework RPC done: service %d, peer %s, status %s:%d:%d\n", + rpc->crpc_service, libcfs_id2str(rpc->crpc_dest), + swi_state2str(rpc->crpc_wi.swi_state), + rpc->crpc_aborted, rpc->crpc_status); + + spin_lock(&sfw_data.fw_lock); + + /* my callers must finish all RPCs before shutting me down */ + LASSERT(!sfw_data.fw_shuttingdown); + list_add(&rpc->crpc_list, &sfw_data.fw_zombie_rpcs); + + spin_unlock(&sfw_data.fw_lock); +} + 
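+/* Batch lookup helpers: sfw_find_batch() searches the current session's batch list for a matching batch id, and sfw_bid2batch() returns the existing batch or allocates, initializes and registers a new one with the session. */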
+static sfw_batch_t * +sfw_find_batch(lst_bid_t bid) +{ + sfw_session_t *sn = sfw_data.fw_session; + sfw_batch_t *bat; + + LASSERT(sn != NULL); + + list_for_each_entry(bat, &sn->sn_batches, bat_list) { + if (bat->bat_id.bat_id == bid.bat_id) + return bat; + } + + return NULL; +} + +static sfw_batch_t * +sfw_bid2batch(lst_bid_t bid) +{ + sfw_session_t *sn = sfw_data.fw_session; + sfw_batch_t *bat; + + LASSERT(sn != NULL); + + bat = sfw_find_batch(bid); + if (bat != NULL) + return bat; + + LIBCFS_ALLOC(bat, sizeof(sfw_batch_t)); + if (bat == NULL) + return NULL; + + bat->bat_error = 0; + bat->bat_session = sn; + bat->bat_id = bid; + atomic_set(&bat->bat_nactive, 0); + INIT_LIST_HEAD(&bat->bat_tests); + + list_add_tail(&bat->bat_list, &sn->sn_batches); + return bat; +} + +static int +sfw_get_stats(srpc_stat_reqst_t *request, srpc_stat_reply_t *reply) +{ + sfw_session_t *sn = sfw_data.fw_session; + sfw_counters_t *cnt = &reply->str_fw; + sfw_batch_t *bat; + struct timeval tv; + + reply->str_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id; + + if (request->str_sid.ses_nid == LNET_NID_ANY) { + reply->str_status = EINVAL; + return 0; + } + + if (sn == NULL || !sfw_sid_equal(request->str_sid, sn->sn_id)) { + reply->str_status = ESRCH; + return 0; + } + + lnet_counters_get(&reply->str_lnet); + srpc_get_counters(&reply->str_rpc); + + /* send over the msecs since the session was started + - with 32 bits to send, this is ~49 days */ + cfs_duration_usec(cfs_time_sub(cfs_time_current(), + sn->sn_started), &tv); + + cnt->running_ms = (__u32)(tv.tv_sec * 1000 + tv.tv_usec / 1000); + cnt->brw_errors = atomic_read(&sn->sn_brw_errors); + cnt->ping_errors = atomic_read(&sn->sn_ping_errors); + cnt->zombie_sessions = atomic_read(&sfw_data.fw_nzombies); + + cnt->active_batches = 0; + list_for_each_entry(bat, &sn->sn_batches, bat_list) { + if (atomic_read(&bat->bat_nactive) > 0) + cnt->active_batches++; + } + + reply->str_status = 0; + return 0; +} + +int +sfw_make_session(srpc_mksn_reqst_t *request, srpc_mksn_reply_t *reply) +{ + sfw_session_t *sn = sfw_data.fw_session; + srpc_msg_t *msg = container_of(request, srpc_msg_t, + msg_body.mksn_reqst); + int cplen = 0; + + if (request->mksn_sid.ses_nid == LNET_NID_ANY) { + reply->mksn_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id; + reply->mksn_status = EINVAL; + return 0; + } + + if (sn != NULL) { + reply->mksn_status = 0; + reply->mksn_sid = sn->sn_id; + reply->mksn_timeout = sn->sn_timeout; + + if (sfw_sid_equal(request->mksn_sid, sn->sn_id)) { + atomic_inc(&sn->sn_refcount); + return 0; + } + + if (!request->mksn_force) { + reply->mksn_status = EBUSY; + cplen = strlcpy(&reply->mksn_name[0], &sn->sn_name[0], + sizeof(reply->mksn_name)); + if (cplen >= sizeof(reply->mksn_name)) + return -E2BIG; + return 0; + } + } + + /* reject the request if it requires unknown features + * NB: old version will always accept all features because it's not + * aware of srpc_msg_t::msg_ses_feats, it's a defect but it's also + * harmless because it will return zero feature to console, and it's + * console's responsibility to make sure all nodes in a session have + * same feature mask. 
*/ + if ((msg->msg_ses_feats & ~LST_FEATS_MASK) != 0) { + reply->mksn_status = EPROTO; + return 0; + } + + /* brand new or create by force */ + LIBCFS_ALLOC(sn, sizeof(sfw_session_t)); + if (sn == NULL) { + CERROR("Dropping RPC (mksn) under memory pressure.\n"); + return -ENOMEM; + } + + sfw_init_session(sn, request->mksn_sid, + msg->msg_ses_feats, &request->mksn_name[0]); + + spin_lock(&sfw_data.fw_lock); + + sfw_deactivate_session(); + LASSERT(sfw_data.fw_session == NULL); + sfw_data.fw_session = sn; + + spin_unlock(&sfw_data.fw_lock); + + reply->mksn_status = 0; + reply->mksn_sid = sn->sn_id; + reply->mksn_timeout = sn->sn_timeout; + return 0; +} + +static int +sfw_remove_session(srpc_rmsn_reqst_t *request, srpc_rmsn_reply_t *reply) +{ + sfw_session_t *sn = sfw_data.fw_session; + + reply->rmsn_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id; + + if (request->rmsn_sid.ses_nid == LNET_NID_ANY) { + reply->rmsn_status = EINVAL; + return 0; + } + + if (sn == NULL || !sfw_sid_equal(request->rmsn_sid, sn->sn_id)) { + reply->rmsn_status = (sn == NULL) ? ESRCH : EBUSY; + return 0; + } + + if (!atomic_dec_and_test(&sn->sn_refcount)) { + reply->rmsn_status = 0; + return 0; + } + + spin_lock(&sfw_data.fw_lock); + sfw_deactivate_session(); + spin_unlock(&sfw_data.fw_lock); + + reply->rmsn_status = 0; + reply->rmsn_sid = LST_INVALID_SID; + LASSERT(sfw_data.fw_session == NULL); + return 0; +} + +static int +sfw_debug_session(srpc_debug_reqst_t *request, srpc_debug_reply_t *reply) +{ + sfw_session_t *sn = sfw_data.fw_session; + + if (sn == NULL) { + reply->dbg_status = ESRCH; + reply->dbg_sid = LST_INVALID_SID; + return 0; + } + + reply->dbg_status = 0; + reply->dbg_sid = sn->sn_id; + reply->dbg_timeout = sn->sn_timeout; + if (strlcpy(reply->dbg_name, &sn->sn_name[0], sizeof(reply->dbg_name)) + >= sizeof(reply->dbg_name)) + return -E2BIG; + + return 0; +} + +static void +sfw_test_rpc_fini(srpc_client_rpc_t *rpc) +{ + sfw_test_unit_t *tsu = rpc->crpc_priv; + sfw_test_instance_t *tsi = tsu->tsu_instance; + + /* Called with hold of tsi->tsi_lock */ + LASSERT(list_empty(&rpc->crpc_list)); + list_add(&rpc->crpc_list, &tsi->tsi_free_rpcs); +} + +static inline int +sfw_test_buffers(sfw_test_instance_t *tsi) +{ + struct sfw_test_case *tsc = sfw_find_test_case(tsi->tsi_service); + struct srpc_service *svc = tsc->tsc_srv_service; + int nbuf; + + nbuf = min(svc->sv_wi_total, tsi->tsi_loop) / svc->sv_ncpts; + return max(SFW_TEST_WI_MIN, nbuf + SFW_TEST_WI_EXTRA); +} + +static int +sfw_load_test(struct sfw_test_instance *tsi) +{ + struct sfw_test_case *tsc; + struct srpc_service *svc; + int nbuf; + int rc; + + LASSERT(tsi != NULL); + tsc = sfw_find_test_case(tsi->tsi_service); + nbuf = sfw_test_buffers(tsi); + LASSERT(tsc != NULL); + svc = tsc->tsc_srv_service; + + if (tsi->tsi_is_client) { + tsi->tsi_ops = tsc->tsc_cli_ops; + return 0; + } + + rc = srpc_service_add_buffers(svc, nbuf); + if (rc != 0) { + CWARN("Failed to reserve enough buffers: service %s, %d needed: %d\n", + svc->sv_name, nbuf, rc); + /* NB: this error handler is not strictly correct, because + * it may release more buffers than already allocated, + * but it doesn't matter because request portal should + * be lazy portal and will grow buffers if necessary. */ + srpc_service_remove_buffers(svc, nbuf); + return -ENOMEM; + } + + CDEBUG(D_NET, "Reserved %d buffers for test %s\n", + nbuf * (srpc_serv_is_framework(svc) ? 
+ 1 : cfs_cpt_number(cfs_cpt_table)), svc->sv_name); + return 0; +} + +static void +sfw_unload_test(struct sfw_test_instance *tsi) +{ + struct sfw_test_case *tsc = sfw_find_test_case(tsi->tsi_service); + + LASSERT(tsc != NULL); + + if (tsi->tsi_is_client) + return; + + /* shrink buffers, because request portal is lazy portal + * which can grow buffers at runtime so we may leave + * some buffers behind, but never mind... */ + srpc_service_remove_buffers(tsc->tsc_srv_service, + sfw_test_buffers(tsi)); + return; +} + +static void +sfw_destroy_test_instance(sfw_test_instance_t *tsi) +{ + srpc_client_rpc_t *rpc; + sfw_test_unit_t *tsu; + + if (!tsi->tsi_is_client) goto clean; + + tsi->tsi_ops->tso_fini(tsi); + + LASSERT(!tsi->tsi_stopping); + LASSERT(list_empty(&tsi->tsi_active_rpcs)); + LASSERT(!sfw_test_active(tsi)); + + while (!list_empty(&tsi->tsi_units)) { + tsu = list_entry(tsi->tsi_units.next, + sfw_test_unit_t, tsu_list); + list_del(&tsu->tsu_list); + LIBCFS_FREE(tsu, sizeof(*tsu)); + } + + while (!list_empty(&tsi->tsi_free_rpcs)) { + rpc = list_entry(tsi->tsi_free_rpcs.next, + srpc_client_rpc_t, crpc_list); + list_del(&rpc->crpc_list); + LIBCFS_FREE(rpc, srpc_client_rpc_size(rpc)); + } + +clean: + sfw_unload_test(tsi); + LIBCFS_FREE(tsi, sizeof(*tsi)); + return; +} + +static void +sfw_destroy_batch(sfw_batch_t *tsb) +{ + sfw_test_instance_t *tsi; + + LASSERT(!sfw_batch_active(tsb)); + LASSERT(list_empty(&tsb->bat_list)); + + while (!list_empty(&tsb->bat_tests)) { + tsi = list_entry(tsb->bat_tests.next, + sfw_test_instance_t, tsi_list); + list_del_init(&tsi->tsi_list); + sfw_destroy_test_instance(tsi); + } + + LIBCFS_FREE(tsb, sizeof(sfw_batch_t)); + return; +} + +void +sfw_destroy_session(sfw_session_t *sn) +{ + sfw_batch_t *batch; + + LASSERT(list_empty(&sn->sn_list)); + LASSERT(sn != sfw_data.fw_session); + + while (!list_empty(&sn->sn_batches)) { + batch = list_entry(sn->sn_batches.next, + sfw_batch_t, bat_list); + list_del_init(&batch->bat_list); + sfw_destroy_batch(batch); + } + + LIBCFS_FREE(sn, sizeof(*sn)); + atomic_dec(&sfw_data.fw_nzombies); + return; +} + +static void +sfw_unpack_addtest_req(srpc_msg_t *msg) +{ + srpc_test_reqst_t *req = &msg->msg_body.tes_reqst; + + LASSERT(msg->msg_type == SRPC_MSG_TEST_REQST); + LASSERT(req->tsr_is_client); + + if (msg->msg_magic == SRPC_MSG_MAGIC) + return; /* no flipping needed */ + + LASSERT(msg->msg_magic == __swab32(SRPC_MSG_MAGIC)); + + if (req->tsr_service == SRPC_SERVICE_BRW) { + if ((msg->msg_ses_feats & LST_FEAT_BULK_LEN) == 0) { + test_bulk_req_t *bulk = &req->tsr_u.bulk_v0; + + __swab32s(&bulk->blk_opc); + __swab32s(&bulk->blk_npg); + __swab32s(&bulk->blk_flags); + + } else { + test_bulk_req_v1_t *bulk = &req->tsr_u.bulk_v1; + + __swab16s(&bulk->blk_opc); + __swab16s(&bulk->blk_flags); + __swab32s(&bulk->blk_offset); + __swab32s(&bulk->blk_len); + } + + return; + } + + if (req->tsr_service == SRPC_SERVICE_PING) { + test_ping_req_t *ping = &req->tsr_u.ping; + + __swab32s(&ping->png_size); + __swab32s(&ping->png_flags); + return; + } + + LBUG(); + return; +} + +static int +sfw_add_test_instance(sfw_batch_t *tsb, srpc_server_rpc_t *rpc) +{ + srpc_msg_t *msg = &rpc->srpc_reqstbuf->buf_msg; + srpc_test_reqst_t *req = &msg->msg_body.tes_reqst; + srpc_bulk_t *bk = rpc->srpc_bulk; + int ndest = req->tsr_ndest; + sfw_test_unit_t *tsu; + sfw_test_instance_t *tsi; + int i; + int rc; + + LIBCFS_ALLOC(tsi, sizeof(*tsi)); + if (tsi == NULL) { + CERROR("Can't allocate test instance for batch: %llu\n", + tsb->bat_id.bat_id); + return -ENOMEM; 
+ } + + spin_lock_init(&tsi->tsi_lock); + atomic_set(&tsi->tsi_nactive, 0); + INIT_LIST_HEAD(&tsi->tsi_units); + INIT_LIST_HEAD(&tsi->tsi_free_rpcs); + INIT_LIST_HEAD(&tsi->tsi_active_rpcs); + + tsi->tsi_stopping = 0; + tsi->tsi_batch = tsb; + tsi->tsi_loop = req->tsr_loop; + tsi->tsi_concur = req->tsr_concur; + tsi->tsi_service = req->tsr_service; + tsi->tsi_is_client = !!(req->tsr_is_client); + tsi->tsi_stoptsu_onerr = !!(req->tsr_stop_onerr); + + rc = sfw_load_test(tsi); + if (rc != 0) { + LIBCFS_FREE(tsi, sizeof(*tsi)); + return rc; + } + + LASSERT(!sfw_batch_active(tsb)); + + if (!tsi->tsi_is_client) { + /* it's test server, just add it to tsb */ + list_add_tail(&tsi->tsi_list, &tsb->bat_tests); + return 0; + } + + LASSERT(bk != NULL); + LASSERT(bk->bk_niov * SFW_ID_PER_PAGE >= (unsigned int)ndest); + LASSERT((unsigned int)bk->bk_len >= + sizeof(lnet_process_id_packed_t) * ndest); + + sfw_unpack_addtest_req(msg); + memcpy(&tsi->tsi_u, &req->tsr_u, sizeof(tsi->tsi_u)); + + for (i = 0; i < ndest; i++) { + lnet_process_id_packed_t *dests; + lnet_process_id_packed_t id; + int j; + + dests = page_address(bk->bk_iovs[i / SFW_ID_PER_PAGE].kiov_page); + LASSERT(dests != NULL); /* my pages are within KVM always */ + id = dests[i % SFW_ID_PER_PAGE]; + if (msg->msg_magic != SRPC_MSG_MAGIC) + sfw_unpack_id(id); + + for (j = 0; j < tsi->tsi_concur; j++) { + LIBCFS_ALLOC(tsu, sizeof(sfw_test_unit_t)); + if (tsu == NULL) { + rc = -ENOMEM; + CERROR("Can't allocate tsu for %d\n", + tsi->tsi_service); + goto error; + } + + tsu->tsu_dest.nid = id.nid; + tsu->tsu_dest.pid = id.pid; + tsu->tsu_instance = tsi; + tsu->tsu_private = NULL; + list_add_tail(&tsu->tsu_list, &tsi->tsi_units); + } + } + + rc = tsi->tsi_ops->tso_init(tsi); + if (rc == 0) { + list_add_tail(&tsi->tsi_list, &tsb->bat_tests); + return 0; + } + +error: + LASSERT(rc != 0); + sfw_destroy_test_instance(tsi); + return rc; +} + +static void +sfw_test_unit_done(sfw_test_unit_t *tsu) +{ + sfw_test_instance_t *tsi = tsu->tsu_instance; + sfw_batch_t *tsb = tsi->tsi_batch; + sfw_session_t *sn = tsb->bat_session; + + LASSERT(sfw_test_active(tsi)); + + if (!atomic_dec_and_test(&tsi->tsi_nactive)) + return; + + /* the test instance is done */ + spin_lock(&tsi->tsi_lock); + + tsi->tsi_stopping = 0; + + spin_unlock(&tsi->tsi_lock); + + spin_lock(&sfw_data.fw_lock); + + if (!atomic_dec_and_test(&tsb->bat_nactive) ||/* tsb still active */ + sn == sfw_data.fw_session) { /* sn also active */ + spin_unlock(&sfw_data.fw_lock); + return; + } + + LASSERT(!list_empty(&sn->sn_list)); /* I'm a zombie! 
*/ + + list_for_each_entry(tsb, &sn->sn_batches, bat_list) { + if (sfw_batch_active(tsb)) { + spin_unlock(&sfw_data.fw_lock); + return; + } + } + + list_del_init(&sn->sn_list); + spin_unlock(&sfw_data.fw_lock); + + sfw_destroy_session(sn); + return; +} + +static void +sfw_test_rpc_done(srpc_client_rpc_t *rpc) +{ + sfw_test_unit_t *tsu = rpc->crpc_priv; + sfw_test_instance_t *tsi = tsu->tsu_instance; + int done = 0; + + tsi->tsi_ops->tso_done_rpc(tsu, rpc); + + spin_lock(&tsi->tsi_lock); + + LASSERT(sfw_test_active(tsi)); + LASSERT(!list_empty(&rpc->crpc_list)); + + list_del_init(&rpc->crpc_list); + + /* batch is stopping or loop is done or get error */ + if (tsi->tsi_stopping || + tsu->tsu_loop == 0 || + (rpc->crpc_status != 0 && tsi->tsi_stoptsu_onerr)) + done = 1; + + /* dec ref for poster */ + srpc_client_rpc_decref(rpc); + + spin_unlock(&tsi->tsi_lock); + + if (!done) { + swi_schedule_workitem(&tsu->tsu_worker); + return; + } + + sfw_test_unit_done(tsu); + return; +} + +int +sfw_create_test_rpc(sfw_test_unit_t *tsu, lnet_process_id_t peer, + unsigned features, int nblk, int blklen, + srpc_client_rpc_t **rpcpp) +{ + srpc_client_rpc_t *rpc = NULL; + sfw_test_instance_t *tsi = tsu->tsu_instance; + + spin_lock(&tsi->tsi_lock); + + LASSERT(sfw_test_active(tsi)); + + if (!list_empty(&tsi->tsi_free_rpcs)) { + /* pick request from buffer */ + rpc = list_entry(tsi->tsi_free_rpcs.next, + srpc_client_rpc_t, crpc_list); + LASSERT(nblk == rpc->crpc_bulk.bk_niov); + list_del_init(&rpc->crpc_list); + } + + spin_unlock(&tsi->tsi_lock); + + if (rpc == NULL) { + rpc = srpc_create_client_rpc(peer, tsi->tsi_service, nblk, + blklen, sfw_test_rpc_done, + sfw_test_rpc_fini, tsu); + } else { + srpc_init_client_rpc(rpc, peer, tsi->tsi_service, nblk, + blklen, sfw_test_rpc_done, + sfw_test_rpc_fini, tsu); + } + + if (rpc == NULL) { + CERROR("Can't create rpc for test %d\n", tsi->tsi_service); + return -ENOMEM; + } + + rpc->crpc_reqstmsg.msg_ses_feats = features; + *rpcpp = rpc; + + return 0; +} + +static int +sfw_run_test(swi_workitem_t *wi) +{ + sfw_test_unit_t *tsu = wi->swi_workitem.wi_data; + sfw_test_instance_t *tsi = tsu->tsu_instance; + srpc_client_rpc_t *rpc = NULL; + + LASSERT(wi == &tsu->tsu_worker); + + if (tsi->tsi_ops->tso_prep_rpc(tsu, tsu->tsu_dest, &rpc) != 0) { + LASSERT(rpc == NULL); + goto test_done; + } + + LASSERT(rpc != NULL); + + spin_lock(&tsi->tsi_lock); + + if (tsi->tsi_stopping) { + list_add(&rpc->crpc_list, &tsi->tsi_free_rpcs); + spin_unlock(&tsi->tsi_lock); + goto test_done; + } + + if (tsu->tsu_loop > 0) + tsu->tsu_loop--; + + list_add_tail(&rpc->crpc_list, &tsi->tsi_active_rpcs); + spin_unlock(&tsi->tsi_lock); + + rpc->crpc_timeout = rpc_timeout; + + spin_lock(&rpc->crpc_lock); + srpc_post_rpc(rpc); + spin_unlock(&rpc->crpc_lock); + return 0; + +test_done: + /* + * No one can schedule me now since: + * - previous RPC, if any, has done and + * - no new RPC is initiated. + * - my batch is still active; no one can run it again now. 
+ * Cancel pending schedules and prevent future schedule attempts: + */ + swi_exit_workitem(wi); + sfw_test_unit_done(tsu); + return 1; +} + +static int +sfw_run_batch(sfw_batch_t *tsb) +{ + swi_workitem_t *wi; + sfw_test_unit_t *tsu; + sfw_test_instance_t *tsi; + + if (sfw_batch_active(tsb)) { + CDEBUG(D_NET, "Batch already active: %llu (%d)\n", + tsb->bat_id.bat_id, atomic_read(&tsb->bat_nactive)); + return 0; + } + + list_for_each_entry(tsi, &tsb->bat_tests, tsi_list) { + if (!tsi->tsi_is_client) /* skip server instances */ + continue; + + LASSERT(!tsi->tsi_stopping); + LASSERT(!sfw_test_active(tsi)); + + atomic_inc(&tsb->bat_nactive); + + list_for_each_entry(tsu, &tsi->tsi_units, tsu_list) { + atomic_inc(&tsi->tsi_nactive); + tsu->tsu_loop = tsi->tsi_loop; + wi = &tsu->tsu_worker; + swi_init_workitem(wi, tsu, sfw_run_test, + lst_sched_test[\ + lnet_cpt_of_nid(tsu->tsu_dest.nid)]); + swi_schedule_workitem(wi); + } + } + + return 0; +} + +int +sfw_stop_batch(sfw_batch_t *tsb, int force) +{ + sfw_test_instance_t *tsi; + srpc_client_rpc_t *rpc; + + if (!sfw_batch_active(tsb)) { + CDEBUG(D_NET, "Batch %llu inactive\n", tsb->bat_id.bat_id); + return 0; + } + + list_for_each_entry(tsi, &tsb->bat_tests, tsi_list) { + spin_lock(&tsi->tsi_lock); + + if (!tsi->tsi_is_client || + !sfw_test_active(tsi) || tsi->tsi_stopping) { + spin_unlock(&tsi->tsi_lock); + continue; + } + + tsi->tsi_stopping = 1; + + if (!force) { + spin_unlock(&tsi->tsi_lock); + continue; + } + + /* abort launched rpcs in the test */ + list_for_each_entry(rpc, &tsi->tsi_active_rpcs, crpc_list) { + spin_lock(&rpc->crpc_lock); + + srpc_abort_rpc(rpc, -EINTR); + + spin_unlock(&rpc->crpc_lock); + } + + spin_unlock(&tsi->tsi_lock); + } + + return 0; +} + +static int +sfw_query_batch(sfw_batch_t *tsb, int testidx, srpc_batch_reply_t *reply) +{ + sfw_test_instance_t *tsi; + + if (testidx < 0) + return -EINVAL; + + if (testidx == 0) { + reply->bar_active = atomic_read(&tsb->bat_nactive); + return 0; + } + + list_for_each_entry(tsi, &tsb->bat_tests, tsi_list) { + if (testidx-- > 1) + continue; + + reply->bar_active = atomic_read(&tsi->tsi_nactive); + return 0; + } + + return -ENOENT; +} + +void +sfw_free_pages(srpc_server_rpc_t *rpc) +{ + srpc_free_bulk(rpc->srpc_bulk); + rpc->srpc_bulk = NULL; +} + +int +sfw_alloc_pages(struct srpc_server_rpc *rpc, int cpt, int npages, int len, + int sink) +{ + LASSERT(rpc->srpc_bulk == NULL); + LASSERT(npages > 0 && npages <= LNET_MAX_IOV); + + rpc->srpc_bulk = srpc_alloc_bulk(cpt, npages, len, sink); + if (rpc->srpc_bulk == NULL) + return -ENOMEM; + + return 0; +} + +static int +sfw_add_test(srpc_server_rpc_t *rpc) +{ + sfw_session_t *sn = sfw_data.fw_session; + srpc_test_reply_t *reply = &rpc->srpc_replymsg.msg_body.tes_reply; + srpc_test_reqst_t *request; + int rc; + sfw_batch_t *bat; + + request = &rpc->srpc_reqstbuf->buf_msg.msg_body.tes_reqst; + reply->tsr_sid = (sn == NULL) ? 
LST_INVALID_SID : sn->sn_id; + + if (request->tsr_loop == 0 || + request->tsr_concur == 0 || + request->tsr_sid.ses_nid == LNET_NID_ANY || + request->tsr_ndest > SFW_MAX_NDESTS || + (request->tsr_is_client && request->tsr_ndest == 0) || + request->tsr_concur > SFW_MAX_CONCUR || + request->tsr_service > SRPC_SERVICE_MAX_ID || + request->tsr_service <= SRPC_FRAMEWORK_SERVICE_MAX_ID) { + reply->tsr_status = EINVAL; + return 0; + } + + if (sn == NULL || !sfw_sid_equal(request->tsr_sid, sn->sn_id) || + sfw_find_test_case(request->tsr_service) == NULL) { + reply->tsr_status = ENOENT; + return 0; + } + + bat = sfw_bid2batch(request->tsr_bid); + if (bat == NULL) { + CERROR("Dropping RPC (%s) from %s under memory pressure.\n", + rpc->srpc_scd->scd_svc->sv_name, + libcfs_id2str(rpc->srpc_peer)); + return -ENOMEM; + } + + if (sfw_batch_active(bat)) { + reply->tsr_status = EBUSY; + return 0; + } + + if (request->tsr_is_client && rpc->srpc_bulk == NULL) { + /* rpc will be resumed later in sfw_bulk_ready */ + int npg = sfw_id_pages(request->tsr_ndest); + int len; + + if ((sn->sn_features & LST_FEAT_BULK_LEN) == 0) { + len = npg * PAGE_CACHE_SIZE; + + } else { + len = sizeof(lnet_process_id_packed_t) * + request->tsr_ndest; + } + + return sfw_alloc_pages(rpc, CFS_CPT_ANY, npg, len, 1); + } + + rc = sfw_add_test_instance(bat, rpc); + CDEBUG(rc == 0 ? D_NET : D_WARNING, + "%s test: sv %d %s, loop %d, concur %d, ndest %d\n", + rc == 0 ? "Added" : "Failed to add", request->tsr_service, + request->tsr_is_client ? "client" : "server", + request->tsr_loop, request->tsr_concur, request->tsr_ndest); + + reply->tsr_status = (rc < 0) ? -rc : rc; + return 0; +} + +static int +sfw_control_batch(srpc_batch_reqst_t *request, srpc_batch_reply_t *reply) +{ + sfw_session_t *sn = sfw_data.fw_session; + int rc = 0; + sfw_batch_t *bat; + + reply->bar_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id; + + if (sn == NULL || !sfw_sid_equal(request->bar_sid, sn->sn_id)) { + reply->bar_status = ESRCH; + return 0; + } + + bat = sfw_find_batch(request->bar_bid); + if (bat == NULL) { + reply->bar_status = ENOENT; + return 0; + } + + switch (request->bar_opc) { + case SRPC_BATCH_OPC_RUN: + rc = sfw_run_batch(bat); + break; + + case SRPC_BATCH_OPC_STOP: + rc = sfw_stop_batch(bat, request->bar_arg); + break; + + case SRPC_BATCH_OPC_QUERY: + rc = sfw_query_batch(bat, request->bar_testidx, reply); + break; + + default: + return -EINVAL; /* drop it */ + } + + reply->bar_status = (rc < 0) ? 
-rc : rc; + return 0; +} + +static int +sfw_handle_server_rpc(struct srpc_server_rpc *rpc) +{ + struct srpc_service *sv = rpc->srpc_scd->scd_svc; + srpc_msg_t *reply = &rpc->srpc_replymsg; + srpc_msg_t *request = &rpc->srpc_reqstbuf->buf_msg; + unsigned features = LST_FEATS_MASK; + int rc = 0; + + LASSERT(sfw_data.fw_active_srpc == NULL); + LASSERT(sv->sv_id <= SRPC_FRAMEWORK_SERVICE_MAX_ID); + + spin_lock(&sfw_data.fw_lock); + + if (sfw_data.fw_shuttingdown) { + spin_unlock(&sfw_data.fw_lock); + return -ESHUTDOWN; + } + + /* Remove timer to avoid racing with it or expiring active session */ + if (sfw_del_session_timer() != 0) { + CERROR("Dropping RPC (%s) from %s: racing with expiry timer.", + sv->sv_name, libcfs_id2str(rpc->srpc_peer)); + spin_unlock(&sfw_data.fw_lock); + return -EAGAIN; + } + + sfw_data.fw_active_srpc = rpc; + spin_unlock(&sfw_data.fw_lock); + + sfw_unpack_message(request); + LASSERT(request->msg_type == srpc_service2request(sv->sv_id)); + + /* rpc module should have checked this */ + LASSERT(request->msg_version == SRPC_MSG_VERSION); + + if (sv->sv_id != SRPC_SERVICE_MAKE_SESSION && + sv->sv_id != SRPC_SERVICE_DEBUG) { + sfw_session_t *sn = sfw_data.fw_session; + + if (sn != NULL && + sn->sn_features != request->msg_ses_feats) { + CNETERR("Features of framework RPC don't match features of current session: %x/%x\n", + request->msg_ses_feats, sn->sn_features); + reply->msg_body.reply.status = EPROTO; + reply->msg_body.reply.sid = sn->sn_id; + goto out; + } + + } else if ((request->msg_ses_feats & ~LST_FEATS_MASK) != 0) { + /* NB: at this point, old version will ignore features and + * create new session anyway, so console should be able + * to handle this */ + reply->msg_body.reply.status = EPROTO; + goto out; + } + + switch (sv->sv_id) { + default: + LBUG(); + case SRPC_SERVICE_TEST: + rc = sfw_add_test(rpc); + break; + + case SRPC_SERVICE_BATCH: + rc = sfw_control_batch(&request->msg_body.bat_reqst, + &reply->msg_body.bat_reply); + break; + + case SRPC_SERVICE_QUERY_STAT: + rc = sfw_get_stats(&request->msg_body.stat_reqst, + &reply->msg_body.stat_reply); + break; + + case SRPC_SERVICE_DEBUG: + rc = sfw_debug_session(&request->msg_body.dbg_reqst, + &reply->msg_body.dbg_reply); + break; + + case SRPC_SERVICE_MAKE_SESSION: + rc = sfw_make_session(&request->msg_body.mksn_reqst, + &reply->msg_body.mksn_reply); + break; + + case SRPC_SERVICE_REMOVE_SESSION: + rc = sfw_remove_session(&request->msg_body.rmsn_reqst, + &reply->msg_body.rmsn_reply); + break; + } + + if (sfw_data.fw_session != NULL) + features = sfw_data.fw_session->sn_features; + out: + reply->msg_ses_feats = features; + rpc->srpc_done = sfw_server_rpc_done; + spin_lock(&sfw_data.fw_lock); + + if (!sfw_data.fw_shuttingdown) + sfw_add_session_timer(); + + sfw_data.fw_active_srpc = NULL; + spin_unlock(&sfw_data.fw_lock); + return rc; +} + +static int +sfw_bulk_ready(struct srpc_server_rpc *rpc, int status) +{ + struct srpc_service *sv = rpc->srpc_scd->scd_svc; + int rc; + + LASSERT(rpc->srpc_bulk != NULL); + LASSERT(sv->sv_id == SRPC_SERVICE_TEST); + LASSERT(sfw_data.fw_active_srpc == NULL); + LASSERT(rpc->srpc_reqstbuf->buf_msg.msg_body.tes_reqst.tsr_is_client); + + spin_lock(&sfw_data.fw_lock); + + if (status != 0) { + CERROR("Bulk transfer failed for RPC: service %s, peer %s, status %d\n", + sv->sv_name, libcfs_id2str(rpc->srpc_peer), status); + spin_unlock(&sfw_data.fw_lock); + return -EIO; + } + + if (sfw_data.fw_shuttingdown) { + spin_unlock(&sfw_data.fw_lock); + return -ESHUTDOWN; + } + + if 
(sfw_del_session_timer() != 0) { + CERROR("Dropping RPC (%s) from %s: racing with expiry timer", + sv->sv_name, libcfs_id2str(rpc->srpc_peer)); + spin_unlock(&sfw_data.fw_lock); + return -EAGAIN; + } + + sfw_data.fw_active_srpc = rpc; + spin_unlock(&sfw_data.fw_lock); + + rc = sfw_add_test(rpc); + + spin_lock(&sfw_data.fw_lock); + + if (!sfw_data.fw_shuttingdown) + sfw_add_session_timer(); + + sfw_data.fw_active_srpc = NULL; + spin_unlock(&sfw_data.fw_lock); + return rc; +} + +srpc_client_rpc_t * +sfw_create_rpc(lnet_process_id_t peer, int service, + unsigned features, int nbulkiov, int bulklen, + void (*done)(srpc_client_rpc_t *), void *priv) +{ + srpc_client_rpc_t *rpc = NULL; + + spin_lock(&sfw_data.fw_lock); + + LASSERT(!sfw_data.fw_shuttingdown); + LASSERT(service <= SRPC_FRAMEWORK_SERVICE_MAX_ID); + + if (nbulkiov == 0 && !list_empty(&sfw_data.fw_zombie_rpcs)) { + rpc = list_entry(sfw_data.fw_zombie_rpcs.next, + srpc_client_rpc_t, crpc_list); + list_del(&rpc->crpc_list); + + srpc_init_client_rpc(rpc, peer, service, 0, 0, + done, sfw_client_rpc_fini, priv); + } + + spin_unlock(&sfw_data.fw_lock); + + if (rpc == NULL) { + rpc = srpc_create_client_rpc(peer, service, + nbulkiov, bulklen, done, + nbulkiov != 0 ? NULL : + sfw_client_rpc_fini, + priv); + } + + if (rpc != NULL) /* "session" is concept in framework */ + rpc->crpc_reqstmsg.msg_ses_feats = features; + + return rpc; +} + +void +sfw_unpack_message(srpc_msg_t *msg) +{ + if (msg->msg_magic == SRPC_MSG_MAGIC) + return; /* no flipping needed */ + + /* srpc module should guarantee I wouldn't get crap */ + LASSERT(msg->msg_magic == __swab32(SRPC_MSG_MAGIC)); + + if (msg->msg_type == SRPC_MSG_STAT_REQST) { + srpc_stat_reqst_t *req = &msg->msg_body.stat_reqst; + + __swab32s(&req->str_type); + __swab64s(&req->str_rpyid); + sfw_unpack_sid(req->str_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_STAT_REPLY) { + srpc_stat_reply_t *rep = &msg->msg_body.stat_reply; + + __swab32s(&rep->str_status); + sfw_unpack_sid(rep->str_sid); + sfw_unpack_fw_counters(rep->str_fw); + sfw_unpack_rpc_counters(rep->str_rpc); + sfw_unpack_lnet_counters(rep->str_lnet); + return; + } + + if (msg->msg_type == SRPC_MSG_MKSN_REQST) { + srpc_mksn_reqst_t *req = &msg->msg_body.mksn_reqst; + + __swab64s(&req->mksn_rpyid); + __swab32s(&req->mksn_force); + sfw_unpack_sid(req->mksn_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_MKSN_REPLY) { + srpc_mksn_reply_t *rep = &msg->msg_body.mksn_reply; + + __swab32s(&rep->mksn_status); + __swab32s(&rep->mksn_timeout); + sfw_unpack_sid(rep->mksn_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_RMSN_REQST) { + srpc_rmsn_reqst_t *req = &msg->msg_body.rmsn_reqst; + + __swab64s(&req->rmsn_rpyid); + sfw_unpack_sid(req->rmsn_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_RMSN_REPLY) { + srpc_rmsn_reply_t *rep = &msg->msg_body.rmsn_reply; + + __swab32s(&rep->rmsn_status); + sfw_unpack_sid(rep->rmsn_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_DEBUG_REQST) { + srpc_debug_reqst_t *req = &msg->msg_body.dbg_reqst; + + __swab64s(&req->dbg_rpyid); + __swab32s(&req->dbg_flags); + sfw_unpack_sid(req->dbg_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_DEBUG_REPLY) { + srpc_debug_reply_t *rep = &msg->msg_body.dbg_reply; + + __swab32s(&rep->dbg_nbatch); + __swab32s(&rep->dbg_timeout); + sfw_unpack_sid(rep->dbg_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_BATCH_REQST) { + srpc_batch_reqst_t *req = &msg->msg_body.bat_reqst; + + __swab32s(&req->bar_opc); + __swab64s(&req->bar_rpyid); + 
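[Editor's sketch, not part of the patch: sfw_create_rpc() above first tries to reuse an RPC descriptor parked on the fw_zombie_rpcs free list before falling back to a fresh allocation. The standalone userspace C below illustrates that recycle-before-allocate pattern with invented names and a pthread mutex standing in for fw_lock.]

	/* Recycle-before-allocate sketch: take a descriptor from a locked
	 * free list if one is available, otherwise malloc() a new one.
	 * All names are illustrative only. */
	#include <pthread.h>
	#include <stdlib.h>

	struct toy_rpc {
		struct toy_rpc *next;	/* free-list linkage */
		int service;
	};

	static struct toy_rpc *free_list;
	static pthread_mutex_t free_lock = PTHREAD_MUTEX_INITIALIZER;

	static struct toy_rpc *toy_rpc_get(int service)
	{
		struct toy_rpc *rpc = NULL;

		pthread_mutex_lock(&free_lock);
		if (free_list != NULL) {	/* reuse a parked descriptor */
			rpc = free_list;
			free_list = rpc->next;
		}
		pthread_mutex_unlock(&free_lock);

		if (rpc == NULL)		/* nothing cached: allocate */
			rpc = malloc(sizeof(*rpc));
		if (rpc != NULL)
			rpc->service = service;
		return rpc;
	}

	static void toy_rpc_put(struct toy_rpc *rpc)
	{
		pthread_mutex_lock(&free_lock);
		rpc->next = free_list;		/* park it for the next caller */
		free_list = rpc;
		pthread_mutex_unlock(&free_lock);
	}

	int main(void)
	{
		struct toy_rpc *rpc = toy_rpc_get(1);

		if (rpc != NULL)
			toy_rpc_put(rpc);	/* the next get() reuses this one */
		rpc = toy_rpc_get(2);
		if (rpc != NULL)
			toy_rpc_put(rpc);
		return 0;
	}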
__swab32s(&req->bar_testidx); + __swab32s(&req->bar_arg); + sfw_unpack_sid(req->bar_sid); + __swab64s(&req->bar_bid.bat_id); + return; + } + + if (msg->msg_type == SRPC_MSG_BATCH_REPLY) { + srpc_batch_reply_t *rep = &msg->msg_body.bat_reply; + + __swab32s(&rep->bar_status); + sfw_unpack_sid(rep->bar_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_TEST_REQST) { + srpc_test_reqst_t *req = &msg->msg_body.tes_reqst; + + __swab64s(&req->tsr_rpyid); + __swab64s(&req->tsr_bulkid); + __swab32s(&req->tsr_loop); + __swab32s(&req->tsr_ndest); + __swab32s(&req->tsr_concur); + __swab32s(&req->tsr_service); + sfw_unpack_sid(req->tsr_sid); + __swab64s(&req->tsr_bid.bat_id); + return; + } + + if (msg->msg_type == SRPC_MSG_TEST_REPLY) { + srpc_test_reply_t *rep = &msg->msg_body.tes_reply; + + __swab32s(&rep->tsr_status); + sfw_unpack_sid(rep->tsr_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_JOIN_REQST) { + srpc_join_reqst_t *req = &msg->msg_body.join_reqst; + + __swab64s(&req->join_rpyid); + sfw_unpack_sid(req->join_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_JOIN_REPLY) { + srpc_join_reply_t *rep = &msg->msg_body.join_reply; + + __swab32s(&rep->join_status); + __swab32s(&rep->join_timeout); + sfw_unpack_sid(rep->join_sid); + return; + } + + LBUG(); + return; +} + +void +sfw_abort_rpc(srpc_client_rpc_t *rpc) +{ + LASSERT(atomic_read(&rpc->crpc_refcount) > 0); + LASSERT(rpc->crpc_service <= SRPC_FRAMEWORK_SERVICE_MAX_ID); + + spin_lock(&rpc->crpc_lock); + srpc_abort_rpc(rpc, -EINTR); + spin_unlock(&rpc->crpc_lock); + return; +} + +void +sfw_post_rpc(srpc_client_rpc_t *rpc) +{ + spin_lock(&rpc->crpc_lock); + + LASSERT(!rpc->crpc_closed); + LASSERT(!rpc->crpc_aborted); + LASSERT(list_empty(&rpc->crpc_list)); + LASSERT(!sfw_data.fw_shuttingdown); + + rpc->crpc_timeout = rpc_timeout; + srpc_post_rpc(rpc); + + spin_unlock(&rpc->crpc_lock); + return; +} + +static srpc_service_t sfw_services[] = { + { + /* sv_id */ SRPC_SERVICE_DEBUG, + /* sv_name */ "debug", + 0 + }, + { + /* sv_id */ SRPC_SERVICE_QUERY_STAT, + /* sv_name */ "query stats", + 0 + }, + { + /* sv_id */ SRPC_SERVICE_MAKE_SESSION, + /* sv_name */ "make session", + 0 + }, + { + /* sv_id */ SRPC_SERVICE_REMOVE_SESSION, + /* sv_name */ "remove session", + 0 + }, + { + /* sv_id */ SRPC_SERVICE_BATCH, + /* sv_name */ "batch service", + 0 + }, + { + /* sv_id */ SRPC_SERVICE_TEST, + /* sv_name */ "test service", + 0 + }, + { + /* sv_id */ 0, + /* sv_name */ NULL, + 0 + } +}; + +extern sfw_test_client_ops_t ping_test_client; +extern srpc_service_t ping_test_service; +extern void ping_init_test_client(void); +extern void ping_init_test_service(void); + +extern sfw_test_client_ops_t brw_test_client; +extern srpc_service_t brw_test_service; +extern void brw_init_test_client(void); +extern void brw_init_test_service(void); + + +int +sfw_startup(void) +{ + int i; + int rc; + int error; + srpc_service_t *sv; + sfw_test_case_t *tsc; + + + if (session_timeout < 0) { + CERROR("Session timeout must be non-negative: %d\n", + session_timeout); + return -EINVAL; + } + + if (rpc_timeout < 0) { + CERROR("RPC timeout must be non-negative: %d\n", + rpc_timeout); + return -EINVAL; + } + + if (session_timeout == 0) + CWARN("Zero session_timeout specified - test sessions never expire.\n"); + + if (rpc_timeout == 0) + CWARN("Zero rpc_timeout specified - test RPC never expire.\n"); + + memset(&sfw_data, 0, sizeof(struct smoketest_framework)); + + sfw_data.fw_session = NULL; + sfw_data.fw_active_srpc = NULL; + spin_lock_init(&sfw_data.fw_lock); + 
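[Editor's sketch, not part of the patch: sfw_unpack_message() above only byte-swaps a message when its magic arrives as __swab32(SRPC_MSG_MAGIC), i.e. when the sender ran on an opposite-endian host. The self-contained userspace C below shows that magic-based fixup with a made-up message layout and constant.]

	/* Byte-order fixup keyed on a magic field: if the magic matches the
	 * swapped constant, every multi-byte field must be swabbed.
	 * Struct layout and constant are invented for illustration. */
	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	#define TOY_MSG_MAGIC 0xbabefaceU

	static uint32_t swab32(uint32_t v)
	{
		return ((v & 0x000000ffU) << 24) | ((v & 0x0000ff00U) << 8) |
		       ((v & 0x00ff0000U) >> 8)  | ((v & 0xff000000U) >> 24);
	}

	struct toy_msg {
		uint32_t magic;
		uint32_t seq;
		uint32_t status;
	};

	static void toy_unpack(struct toy_msg *msg)
	{
		if (msg->magic == TOY_MSG_MAGIC)
			return;			/* same byte order, nothing to do */

		assert(msg->magic == swab32(TOY_MSG_MAGIC));
		msg->magic  = swab32(msg->magic);
		msg->seq    = swab32(msg->seq);
		msg->status = swab32(msg->status);
	}

	int main(void)
	{
		struct toy_msg msg = {
			.magic = swab32(TOY_MSG_MAGIC),	/* as if from an opposite-endian peer */
			.seq   = swab32(7),
			.status = 0,
		};

		toy_unpack(&msg);
		printf("seq %u\n", msg.seq);		/* prints 7 */
		return 0;
	}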
atomic_set(&sfw_data.fw_nzombies, 0); + INIT_LIST_HEAD(&sfw_data.fw_tests); + INIT_LIST_HEAD(&sfw_data.fw_zombie_rpcs); + INIT_LIST_HEAD(&sfw_data.fw_zombie_sessions); + + brw_init_test_client(); + brw_init_test_service(); + rc = sfw_register_test(&brw_test_service, &brw_test_client); + LASSERT(rc == 0); + + ping_init_test_client(); + ping_init_test_service(); + rc = sfw_register_test(&ping_test_service, &ping_test_client); + LASSERT(rc == 0); + + error = 0; + list_for_each_entry(tsc, &sfw_data.fw_tests, tsc_list) { + sv = tsc->tsc_srv_service; + + rc = srpc_add_service(sv); + LASSERT(rc != -EBUSY); + if (rc != 0) { + CWARN("Failed to add %s service: %d\n", + sv->sv_name, rc); + error = rc; + } + } + + for (i = 0; ; i++) { + sv = &sfw_services[i]; + if (sv->sv_name == NULL) break; + + sv->sv_bulk_ready = NULL; + sv->sv_handler = sfw_handle_server_rpc; + sv->sv_wi_total = SFW_FRWK_WI_MAX; + if (sv->sv_id == SRPC_SERVICE_TEST) + sv->sv_bulk_ready = sfw_bulk_ready; + + rc = srpc_add_service(sv); + LASSERT(rc != -EBUSY); + if (rc != 0) { + CWARN("Failed to add %s service: %d\n", + sv->sv_name, rc); + error = rc; + } + + /* about to sfw_shutdown, no need to add buffer */ + if (error) continue; + + rc = srpc_service_add_buffers(sv, sv->sv_wi_total); + if (rc != 0) { + CWARN("Failed to reserve enough buffers: service %s, %d needed: %d\n", + sv->sv_name, sv->sv_wi_total, rc); + error = -ENOMEM; + } + } + + if (error != 0) + sfw_shutdown(); + return error; +} + +void +sfw_shutdown(void) +{ + srpc_service_t *sv; + sfw_test_case_t *tsc; + int i; + + spin_lock(&sfw_data.fw_lock); + + sfw_data.fw_shuttingdown = 1; + lst_wait_until(sfw_data.fw_active_srpc == NULL, sfw_data.fw_lock, + "waiting for active RPC to finish.\n"); + + if (sfw_del_session_timer() != 0) + lst_wait_until(sfw_data.fw_session == NULL, sfw_data.fw_lock, + "waiting for session timer to explode.\n"); + + sfw_deactivate_session(); + lst_wait_until(atomic_read(&sfw_data.fw_nzombies) == 0, + sfw_data.fw_lock, + "waiting for %d zombie sessions to die.\n", + atomic_read(&sfw_data.fw_nzombies)); + + spin_unlock(&sfw_data.fw_lock); + + for (i = 0; ; i++) { + sv = &sfw_services[i]; + if (sv->sv_name == NULL) + break; + + srpc_shutdown_service(sv); + srpc_remove_service(sv); + } + + list_for_each_entry(tsc, &sfw_data.fw_tests, tsc_list) { + sv = tsc->tsc_srv_service; + srpc_shutdown_service(sv); + srpc_remove_service(sv); + } + + while (!list_empty(&sfw_data.fw_zombie_rpcs)) { + srpc_client_rpc_t *rpc; + + rpc = list_entry(sfw_data.fw_zombie_rpcs.next, + srpc_client_rpc_t, crpc_list); + list_del(&rpc->crpc_list); + + LIBCFS_FREE(rpc, srpc_client_rpc_size(rpc)); + } + + for (i = 0; ; i++) { + sv = &sfw_services[i]; + if (sv->sv_name == NULL) + break; + + srpc_wait_service_shutdown(sv); + } + + while (!list_empty(&sfw_data.fw_tests)) { + tsc = list_entry(sfw_data.fw_tests.next, + sfw_test_case_t, tsc_list); + + srpc_wait_service_shutdown(tsc->tsc_srv_service); + + list_del(&tsc->tsc_list); + LIBCFS_FREE(tsc, sizeof(*tsc)); + } + + return; +} diff --git a/kernel/drivers/staging/lustre/lnet/selftest/module.c b/kernel/drivers/staging/lustre/lnet/selftest/module.c new file mode 100644 index 000000000..7ad62f167 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/selftest/module.c @@ -0,0 +1,159 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include "selftest.h" + +enum { + LST_INIT_NONE = 0, + LST_INIT_WI_SERIAL, + LST_INIT_WI_TEST, + LST_INIT_RPC, + LST_INIT_FW, + LST_INIT_CONSOLE +}; + +extern int lstcon_console_init(void); +extern int lstcon_console_fini(void); + +static int lst_init_step = LST_INIT_NONE; + +struct cfs_wi_sched *lst_sched_serial; +struct cfs_wi_sched **lst_sched_test; + +static void +lnet_selftest_fini(void) +{ + int i; + + switch (lst_init_step) { + case LST_INIT_CONSOLE: + lstcon_console_fini(); + case LST_INIT_FW: + sfw_shutdown(); + case LST_INIT_RPC: + srpc_shutdown(); + case LST_INIT_WI_TEST: + for (i = 0; + i < cfs_cpt_number(lnet_cpt_table()); i++) { + if (lst_sched_test[i] == NULL) + continue; + cfs_wi_sched_destroy(lst_sched_test[i]); + } + LIBCFS_FREE(lst_sched_test, + sizeof(lst_sched_test[0]) * + cfs_cpt_number(lnet_cpt_table())); + lst_sched_test = NULL; + + case LST_INIT_WI_SERIAL: + cfs_wi_sched_destroy(lst_sched_serial); + lst_sched_serial = NULL; + case LST_INIT_NONE: + break; + default: + LBUG(); + } +} + +static int +lnet_selftest_init(void) +{ + int nscheds; + int rc; + int i; + + rc = cfs_wi_sched_create("lst_s", lnet_cpt_table(), CFS_CPT_ANY, + 1, &lst_sched_serial); + if (rc != 0) { + CERROR("Failed to create serial WI scheduler for LST\n"); + return rc; + } + lst_init_step = LST_INIT_WI_SERIAL; + + nscheds = cfs_cpt_number(lnet_cpt_table()); + LIBCFS_ALLOC(lst_sched_test, sizeof(lst_sched_test[0]) * nscheds); + if (lst_sched_test == NULL) + goto error; + + lst_init_step = LST_INIT_WI_TEST; + for (i = 0; i < nscheds; i++) { + int nthrs = cfs_cpt_weight(lnet_cpt_table(), i); + + /* reserve at least one CPU for LND */ + nthrs = max(nthrs - 1, 1); + rc = cfs_wi_sched_create("lst_t", lnet_cpt_table(), i, + nthrs, &lst_sched_test[i]); + if (rc != 0) { + CERROR("Failed to create CPT affinity WI scheduler %d for LST\n", + i); + goto error; + } + } + + rc = srpc_startup(); + if (rc != 0) { + CERROR("LST can't startup rpc\n"); + goto error; + } + lst_init_step = LST_INIT_RPC; + + rc = sfw_startup(); + if (rc != 0) { + CERROR("LST can't startup framework\n"); + goto error; + } + lst_init_step = LST_INIT_FW; + + rc = lstcon_console_init(); + if (rc != 0) { + CERROR("LST can't startup console\n"); + goto error; + } + lst_init_step = LST_INIT_CONSOLE; + 
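[Editor's sketch, not part of the patch: lnet_selftest_init() records the last stage that succeeded in lst_init_step, and lnet_selftest_fini() unwinds with a fall-through switch so only completed stages are torn down. The standalone C below reproduces that staged-init/LIFO-teardown pattern with invented stage names.]

	/* Staged init with fall-through teardown: remember how far setup got
	 * and unwind from exactly there on failure or module exit. */
	#include <stdio.h>

	enum { INIT_NONE = 0, INIT_SCHED, INIT_RPC, INIT_FRAMEWORK };

	static int init_step = INIT_NONE;

	static int  sched_setup(void)    { puts("sched up");   return 0; }
	static void sched_teardown(void) { puts("sched down"); }
	static int  rpc_setup(void)      { puts("rpc up");     return 0; }
	static void rpc_teardown(void)   { puts("rpc down");   }
	static int  fw_setup(void)       { puts("fw up");      return -1; } /* pretend this fails */
	static void fw_teardown(void)    { puts("fw down");    }

	static void toy_fini(void)
	{
		switch (init_step) {
		case INIT_FRAMEWORK:
			fw_teardown();
			/* fall through */
		case INIT_RPC:
			rpc_teardown();
			/* fall through */
		case INIT_SCHED:
			sched_teardown();
			/* fall through */
		case INIT_NONE:
			break;
		}
	}

	static int toy_init(void)
	{
		if (sched_setup() != 0)
			goto error;
		init_step = INIT_SCHED;

		if (rpc_setup() != 0)
			goto error;
		init_step = INIT_RPC;

		if (fw_setup() != 0)
			goto error;
		init_step = INIT_FRAMEWORK;
		return 0;

	error:
		toy_fini();	/* unwind only the stages that completed */
		return -1;
	}

	int main(void)
	{
		return toy_init() == 0 ? 0 : 1;
	}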
return 0; +error: + lnet_selftest_fini(); + return rc; +} + + +MODULE_DESCRIPTION("LNet Selftest"); +MODULE_LICENSE("GPL"); +MODULE_VERSION("0.9.0"); + +module_init(lnet_selftest_init); +module_exit(lnet_selftest_fini); diff --git a/kernel/drivers/staging/lustre/lnet/selftest/ping_test.c b/kernel/drivers/staging/lustre/lnet/selftest/ping_test.c new file mode 100644 index 000000000..644069a9f --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/selftest/ping_test.c @@ -0,0 +1,230 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lnet/selftest/conctl.c + * + * Test client & Server + * + * Author: Liang Zhen + */ + +#include "selftest.h" + +#define LST_PING_TEST_MAGIC 0xbabeface + +static int ping_srv_workitems = SFW_TEST_WI_MAX; +module_param(ping_srv_workitems, int, 0644); +MODULE_PARM_DESC(ping_srv_workitems, "# PING server workitems"); + +typedef struct { + spinlock_t pnd_lock; /* serialize */ + int pnd_counter; /* sequence counter */ +} lst_ping_data_t; + +static lst_ping_data_t lst_ping_data; + +static int +ping_client_init(sfw_test_instance_t *tsi) +{ + sfw_session_t *sn = tsi->tsi_batch->bat_session; + + LASSERT(tsi->tsi_is_client); + LASSERT(sn != NULL && (sn->sn_features & ~LST_FEATS_MASK) == 0); + + spin_lock_init(&lst_ping_data.pnd_lock); + lst_ping_data.pnd_counter = 0; + + return 0; +} + +static void +ping_client_fini(sfw_test_instance_t *tsi) +{ + sfw_session_t *sn = tsi->tsi_batch->bat_session; + int errors; + + LASSERT(sn != NULL); + LASSERT(tsi->tsi_is_client); + + errors = atomic_read(&sn->sn_ping_errors); + if (errors) + CWARN("%d pings have failed.\n", errors); + else + CDEBUG(D_NET, "Ping test finished OK.\n"); +} + +static int +ping_client_prep_rpc(sfw_test_unit_t *tsu, + lnet_process_id_t dest, srpc_client_rpc_t **rpc) +{ + srpc_ping_reqst_t *req; + sfw_test_instance_t *tsi = tsu->tsu_instance; + sfw_session_t *sn = tsi->tsi_batch->bat_session; + struct timeval tv; + int rc; + + LASSERT(sn != NULL); + LASSERT((sn->sn_features & ~LST_FEATS_MASK) == 0); + + rc = sfw_create_test_rpc(tsu, dest, sn->sn_features, 0, 0, rpc); + if (rc != 0) + return rc; + + req = &(*rpc)->crpc_reqstmsg.msg_body.ping_reqst; + + req->pnr_magic = LST_PING_TEST_MAGIC; + + spin_lock(&lst_ping_data.pnd_lock); + req->pnr_seq = lst_ping_data.pnd_counter++; + spin_unlock(&lst_ping_data.pnd_lock); + + cfs_fs_timeval(&tv); + req->pnr_time_sec = tv.tv_sec; + req->pnr_time_usec = tv.tv_usec; + + return rc; +} + +static void +ping_client_done_rpc(sfw_test_unit_t *tsu, srpc_client_rpc_t *rpc) +{ + sfw_test_instance_t *tsi = tsu->tsu_instance; + sfw_session_t *sn = tsi->tsi_batch->bat_session; + srpc_ping_reqst_t *reqst = &rpc->crpc_reqstmsg.msg_body.ping_reqst; + srpc_ping_reply_t *reply = &rpc->crpc_replymsg.msg_body.ping_reply; + struct timeval tv; + + LASSERT(sn != NULL); + + if (rpc->crpc_status != 0) { + if (!tsi->tsi_stopping) /* rpc could have been aborted */ + atomic_inc(&sn->sn_ping_errors); + CERROR("Unable to ping %s (%d): %d\n", + libcfs_id2str(rpc->crpc_dest), + reqst->pnr_seq, rpc->crpc_status); + return; + } + + if (rpc->crpc_replymsg.msg_magic != SRPC_MSG_MAGIC) { + __swab32s(&reply->pnr_seq); + __swab32s(&reply->pnr_magic); + __swab32s(&reply->pnr_status); + } + + if (reply->pnr_magic != LST_PING_TEST_MAGIC) { + rpc->crpc_status = -EBADMSG; + atomic_inc(&sn->sn_ping_errors); + CERROR("Bad magic %u from %s, %u expected.\n", + reply->pnr_magic, libcfs_id2str(rpc->crpc_dest), + LST_PING_TEST_MAGIC); + return; + } + + if (reply->pnr_seq != reqst->pnr_seq) { + rpc->crpc_status = -EBADMSG; + atomic_inc(&sn->sn_ping_errors); + CERROR("Bad seq %u from %s, %u expected.\n", + reply->pnr_seq, libcfs_id2str(rpc->crpc_dest), + reqst->pnr_seq); + return; + } + + cfs_fs_timeval(&tv); + CDEBUG(D_NET, "%d reply in %u usec\n", reply->pnr_seq, + (unsigned)((tv.tv_sec - (unsigned)reqst->pnr_time_sec) * 1000000 + + (tv.tv_usec - reqst->pnr_time_usec))); + return; +} + +static int +ping_server_handle(struct srpc_server_rpc *rpc) +{ + struct srpc_service *sv = rpc->srpc_scd->scd_svc; + srpc_msg_t *reqstmsg = 
&rpc->srpc_reqstbuf->buf_msg; + srpc_msg_t *replymsg = &rpc->srpc_replymsg; + srpc_ping_reqst_t *req = &reqstmsg->msg_body.ping_reqst; + srpc_ping_reply_t *rep = &rpc->srpc_replymsg.msg_body.ping_reply; + + LASSERT(sv->sv_id == SRPC_SERVICE_PING); + + if (reqstmsg->msg_magic != SRPC_MSG_MAGIC) { + LASSERT(reqstmsg->msg_magic == __swab32(SRPC_MSG_MAGIC)); + + __swab32s(&req->pnr_seq); + __swab32s(&req->pnr_magic); + __swab64s(&req->pnr_time_sec); + __swab64s(&req->pnr_time_usec); + } + LASSERT(reqstmsg->msg_type == srpc_service2request(sv->sv_id)); + + if (req->pnr_magic != LST_PING_TEST_MAGIC) { + CERROR("Unexpected magic %08x from %s\n", + req->pnr_magic, libcfs_id2str(rpc->srpc_peer)); + return -EINVAL; + } + + rep->pnr_seq = req->pnr_seq; + rep->pnr_magic = LST_PING_TEST_MAGIC; + + if ((reqstmsg->msg_ses_feats & ~LST_FEATS_MASK) != 0) { + replymsg->msg_ses_feats = LST_FEATS_MASK; + rep->pnr_status = EPROTO; + return 0; + } + + replymsg->msg_ses_feats = reqstmsg->msg_ses_feats; + + CDEBUG(D_NET, "Get ping %d from %s\n", + req->pnr_seq, libcfs_id2str(rpc->srpc_peer)); + return 0; +} + +sfw_test_client_ops_t ping_test_client; +void ping_init_test_client(void) +{ + ping_test_client.tso_init = ping_client_init; + ping_test_client.tso_fini = ping_client_fini; + ping_test_client.tso_prep_rpc = ping_client_prep_rpc; + ping_test_client.tso_done_rpc = ping_client_done_rpc; +} + +srpc_service_t ping_test_service; +void ping_init_test_service(void) +{ + ping_test_service.sv_id = SRPC_SERVICE_PING; + ping_test_service.sv_name = "ping_test"; + ping_test_service.sv_handler = ping_server_handle; + ping_test_service.sv_wi_total = ping_srv_workitems; +} diff --git a/kernel/drivers/staging/lustre/lnet/selftest/rpc.c b/kernel/drivers/staging/lustre/lnet/selftest/rpc.c new file mode 100644 index 000000000..080788ab7 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/selftest/rpc.c @@ -0,0 +1,1673 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lnet/selftest/rpc.c + * + * Author: Isaac Huang + * + * 2012-05-13: Liang Zhen + * - percpt data for service to improve smp performance + * - code cleanup + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include "selftest.h" + +typedef enum { + SRPC_STATE_NONE, + SRPC_STATE_NI_INIT, + SRPC_STATE_EQ_INIT, + SRPC_STATE_RUNNING, + SRPC_STATE_STOPPING, +} srpc_state_t; + +static struct smoketest_rpc { + spinlock_t rpc_glock; /* global lock */ + srpc_service_t *rpc_services[SRPC_SERVICE_MAX_ID + 1]; + lnet_handle_eq_t rpc_lnet_eq; /* _the_ LNet event queue */ + srpc_state_t rpc_state; + srpc_counters_t rpc_counters; + __u64 rpc_matchbits; /* matchbits counter */ +} srpc_data; + +static inline int +srpc_serv_portal(int svc_id) +{ + return svc_id < SRPC_FRAMEWORK_SERVICE_MAX_ID ? + SRPC_FRAMEWORK_REQUEST_PORTAL : SRPC_REQUEST_PORTAL; +} + +/* forward ref's */ +int srpc_handle_rpc(swi_workitem_t *wi); + +void srpc_get_counters(srpc_counters_t *cnt) +{ + spin_lock(&srpc_data.rpc_glock); + *cnt = srpc_data.rpc_counters; + spin_unlock(&srpc_data.rpc_glock); +} + +void srpc_set_counters(const srpc_counters_t *cnt) +{ + spin_lock(&srpc_data.rpc_glock); + srpc_data.rpc_counters = *cnt; + spin_unlock(&srpc_data.rpc_glock); +} + +static int +srpc_add_bulk_page(srpc_bulk_t *bk, struct page *pg, int i, int nob) +{ + nob = min(nob, (int)PAGE_CACHE_SIZE); + + LASSERT(nob > 0); + LASSERT(i >= 0 && i < bk->bk_niov); + + bk->bk_iovs[i].kiov_offset = 0; + bk->bk_iovs[i].kiov_page = pg; + bk->bk_iovs[i].kiov_len = nob; + return nob; +} + +void +srpc_free_bulk(srpc_bulk_t *bk) +{ + int i; + struct page *pg; + + LASSERT(bk != NULL); + + for (i = 0; i < bk->bk_niov; i++) { + pg = bk->bk_iovs[i].kiov_page; + if (pg == NULL) + break; + + __free_page(pg); + } + + LIBCFS_FREE(bk, offsetof(srpc_bulk_t, bk_iovs[bk->bk_niov])); + return; +} + +srpc_bulk_t * +srpc_alloc_bulk(int cpt, unsigned bulk_npg, unsigned bulk_len, int sink) +{ + srpc_bulk_t *bk; + int i; + + LASSERT(bulk_npg > 0 && bulk_npg <= LNET_MAX_IOV); + + LIBCFS_CPT_ALLOC(bk, lnet_cpt_table(), cpt, + offsetof(srpc_bulk_t, bk_iovs[bulk_npg])); + if (bk == NULL) { + CERROR("Can't allocate descriptor for %d pages\n", bulk_npg); + return NULL; + } + + memset(bk, 0, offsetof(srpc_bulk_t, bk_iovs[bulk_npg])); + bk->bk_sink = sink; + bk->bk_len = bulk_len; + bk->bk_niov = bulk_npg; + + for (i = 0; i < bulk_npg; i++) { + struct page *pg; + int nob; + + pg = alloc_pages_node(cfs_cpt_spread_node(lnet_cpt_table(), cpt), + GFP_IOFS, 0); + if (pg == NULL) { + CERROR("Can't allocate page %d of %d\n", i, bulk_npg); + srpc_free_bulk(bk); + return NULL; + } + + nob = srpc_add_bulk_page(bk, pg, i, bulk_len); + bulk_len -= nob; + } + + return bk; +} + +static inline __u64 +srpc_next_id(void) +{ + __u64 id; + + spin_lock(&srpc_data.rpc_glock); + id = srpc_data.rpc_matchbits++; + spin_unlock(&srpc_data.rpc_glock); + return id; +} + +static void +srpc_init_server_rpc(struct srpc_server_rpc *rpc, + struct srpc_service_cd *scd, + struct srpc_buffer *buffer) +{ + memset(rpc, 0, sizeof(*rpc)); + swi_init_workitem(&rpc->srpc_wi, rpc, srpc_handle_rpc, + srpc_serv_is_framework(scd->scd_svc) ? 
+ lst_sched_serial : lst_sched_test[scd->scd_cpt]); + + rpc->srpc_ev.ev_fired = 1; /* no event expected now */ + + rpc->srpc_scd = scd; + rpc->srpc_reqstbuf = buffer; + rpc->srpc_peer = buffer->buf_peer; + rpc->srpc_self = buffer->buf_self; + LNetInvalidateHandle(&rpc->srpc_replymdh); +} + +static void +srpc_service_fini(struct srpc_service *svc) +{ + struct srpc_service_cd *scd; + struct srpc_server_rpc *rpc; + struct srpc_buffer *buf; + struct list_head *q; + int i; + + if (svc->sv_cpt_data == NULL) + return; + + cfs_percpt_for_each(scd, i, svc->sv_cpt_data) { + while (1) { + if (!list_empty(&scd->scd_buf_posted)) + q = &scd->scd_buf_posted; + else if (!list_empty(&scd->scd_buf_blocked)) + q = &scd->scd_buf_blocked; + else + break; + + while (!list_empty(q)) { + buf = list_entry(q->next, + struct srpc_buffer, + buf_list); + list_del(&buf->buf_list); + LIBCFS_FREE(buf, sizeof(*buf)); + } + } + + LASSERT(list_empty(&scd->scd_rpc_active)); + + while (!list_empty(&scd->scd_rpc_free)) { + rpc = list_entry(scd->scd_rpc_free.next, + struct srpc_server_rpc, + srpc_list); + list_del(&rpc->srpc_list); + LIBCFS_FREE(rpc, sizeof(*rpc)); + } + } + + cfs_percpt_free(svc->sv_cpt_data); + svc->sv_cpt_data = NULL; +} + +static int +srpc_service_nrpcs(struct srpc_service *svc) +{ + int nrpcs = svc->sv_wi_total / svc->sv_ncpts; + + return srpc_serv_is_framework(svc) ? + max(nrpcs, SFW_FRWK_WI_MIN) : max(nrpcs, SFW_TEST_WI_MIN); +} + +int srpc_add_buffer(struct swi_workitem *wi); + +static int +srpc_service_init(struct srpc_service *svc) +{ + struct srpc_service_cd *scd; + struct srpc_server_rpc *rpc; + int nrpcs; + int i; + int j; + + svc->sv_shuttingdown = 0; + + svc->sv_cpt_data = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(struct srpc_service_cd)); + if (svc->sv_cpt_data == NULL) + return -ENOMEM; + + svc->sv_ncpts = srpc_serv_is_framework(svc) ? 
+ 1 : cfs_cpt_number(lnet_cpt_table()); + nrpcs = srpc_service_nrpcs(svc); + + cfs_percpt_for_each(scd, i, svc->sv_cpt_data) { + scd->scd_cpt = i; + scd->scd_svc = svc; + spin_lock_init(&scd->scd_lock); + INIT_LIST_HEAD(&scd->scd_rpc_free); + INIT_LIST_HEAD(&scd->scd_rpc_active); + INIT_LIST_HEAD(&scd->scd_buf_posted); + INIT_LIST_HEAD(&scd->scd_buf_blocked); + + scd->scd_ev.ev_data = scd; + scd->scd_ev.ev_type = SRPC_REQUEST_RCVD; + + /* NB: don't use lst_sched_serial for adding buffer, + * see details in srpc_service_add_buffers() */ + swi_init_workitem(&scd->scd_buf_wi, scd, + srpc_add_buffer, lst_sched_test[i]); + + if (i != 0 && srpc_serv_is_framework(svc)) { + /* NB: framework service only needs srpc_service_cd for + * one partition, but we allocate for all to make + * it easier to implement, it will waste a little + * memory but nobody should care about this */ + continue; + } + + for (j = 0; j < nrpcs; j++) { + LIBCFS_CPT_ALLOC(rpc, lnet_cpt_table(), + i, sizeof(*rpc)); + if (rpc == NULL) { + srpc_service_fini(svc); + return -ENOMEM; + } + list_add(&rpc->srpc_list, &scd->scd_rpc_free); + } + } + + return 0; +} + +int +srpc_add_service(struct srpc_service *sv) +{ + int id = sv->sv_id; + + LASSERT(0 <= id && id <= SRPC_SERVICE_MAX_ID); + + if (srpc_service_init(sv) != 0) + return -ENOMEM; + + spin_lock(&srpc_data.rpc_glock); + + LASSERT(srpc_data.rpc_state == SRPC_STATE_RUNNING); + + if (srpc_data.rpc_services[id] != NULL) { + spin_unlock(&srpc_data.rpc_glock); + goto failed; + } + + srpc_data.rpc_services[id] = sv; + spin_unlock(&srpc_data.rpc_glock); + + CDEBUG(D_NET, "Adding service: id %d, name %s\n", id, sv->sv_name); + return 0; + + failed: + srpc_service_fini(sv); + return -EBUSY; +} + +int +srpc_remove_service(srpc_service_t *sv) +{ + int id = sv->sv_id; + + spin_lock(&srpc_data.rpc_glock); + + if (srpc_data.rpc_services[id] != sv) { + spin_unlock(&srpc_data.rpc_glock); + return -ENOENT; + } + + srpc_data.rpc_services[id] = NULL; + spin_unlock(&srpc_data.rpc_glock); + return 0; +} + +static int +srpc_post_passive_rdma(int portal, int local, __u64 matchbits, void *buf, + int len, int options, lnet_process_id_t peer, + lnet_handle_md_t *mdh, srpc_event_t *ev) +{ + int rc; + lnet_md_t md; + lnet_handle_me_t meh; + + rc = LNetMEAttach(portal, peer, matchbits, 0, LNET_UNLINK, + local ? LNET_INS_LOCAL : LNET_INS_AFTER, &meh); + if (rc != 0) { + CERROR("LNetMEAttach failed: %d\n", rc); + LASSERT(rc == -ENOMEM); + return -ENOMEM; + } + + md.threshold = 1; + md.user_ptr = ev; + md.start = buf; + md.length = len; + md.options = options; + md.eq_handle = srpc_data.rpc_lnet_eq; + + rc = LNetMDAttach(meh, md, LNET_UNLINK, mdh); + if (rc != 0) { + CERROR("LNetMDAttach failed: %d\n", rc); + LASSERT(rc == -ENOMEM); + + rc = LNetMEUnlink(meh); + LASSERT(rc == 0); + return -ENOMEM; + } + + CDEBUG(D_NET, + "Posted passive RDMA: peer %s, portal %d, matchbits %#llx\n", + libcfs_id2str(peer), portal, matchbits); + return 0; +} + +static int +srpc_post_active_rdma(int portal, __u64 matchbits, void *buf, int len, + int options, lnet_process_id_t peer, lnet_nid_t self, + lnet_handle_md_t *mdh, srpc_event_t *ev) +{ + int rc; + lnet_md_t md; + + md.user_ptr = ev; + md.start = buf; + md.length = len; + md.eq_handle = srpc_data.rpc_lnet_eq; + md.threshold = ((options & LNET_MD_OP_GET) != 0) ? 
2 : 1; + md.options = options & ~(LNET_MD_OP_PUT | LNET_MD_OP_GET); + + rc = LNetMDBind(md, LNET_UNLINK, mdh); + if (rc != 0) { + CERROR("LNetMDBind failed: %d\n", rc); + LASSERT(rc == -ENOMEM); + return -ENOMEM; + } + + /* this is kind of an abuse of the LNET_MD_OP_{PUT,GET} options. + * they're only meaningful for MDs attached to an ME (i.e. passive + * buffers... */ + if ((options & LNET_MD_OP_PUT) != 0) { + rc = LNetPut(self, *mdh, LNET_NOACK_REQ, peer, + portal, matchbits, 0, 0); + } else { + LASSERT((options & LNET_MD_OP_GET) != 0); + + rc = LNetGet(self, *mdh, peer, portal, matchbits, 0); + } + + if (rc != 0) { + CERROR("LNet%s(%s, %d, %lld) failed: %d\n", + ((options & LNET_MD_OP_PUT) != 0) ? "Put" : "Get", + libcfs_id2str(peer), portal, matchbits, rc); + + /* The forthcoming unlink event will complete this operation + * with failure, so fall through and return success here. + */ + rc = LNetMDUnlink(*mdh); + LASSERT(rc == 0); + } else { + CDEBUG(D_NET, + "Posted active RDMA: peer %s, portal %u, matchbits %#llx\n", + libcfs_id2str(peer), portal, matchbits); + } + return 0; +} + +static int +srpc_post_active_rqtbuf(lnet_process_id_t peer, int service, void *buf, + int len, lnet_handle_md_t *mdh, srpc_event_t *ev) +{ + return srpc_post_active_rdma(srpc_serv_portal(service), service, + buf, len, LNET_MD_OP_PUT, peer, + LNET_NID_ANY, mdh, ev); +} + +static int +srpc_post_passive_rqtbuf(int service, int local, void *buf, int len, + lnet_handle_md_t *mdh, srpc_event_t *ev) +{ + lnet_process_id_t any = {0}; + + any.nid = LNET_NID_ANY; + any.pid = LNET_PID_ANY; + + return srpc_post_passive_rdma(srpc_serv_portal(service), + local, service, buf, len, + LNET_MD_OP_PUT, any, mdh, ev); +} + +static int +srpc_service_post_buffer(struct srpc_service_cd *scd, struct srpc_buffer *buf) + __must_hold(&scd->scd_lock) +{ + struct srpc_service *sv = scd->scd_svc; + struct srpc_msg *msg = &buf->buf_msg; + int rc; + + LNetInvalidateHandle(&buf->buf_mdh); + list_add(&buf->buf_list, &scd->scd_buf_posted); + scd->scd_buf_nposted++; + spin_unlock(&scd->scd_lock); + + rc = srpc_post_passive_rqtbuf(sv->sv_id, + !srpc_serv_is_framework(sv), + msg, sizeof(*msg), &buf->buf_mdh, + &scd->scd_ev); + + /* At this point, a RPC (new or delayed) may have arrived in + * msg and its event handler has been called. 
So we must add + * buf to scd_buf_posted _before_ dropping scd_lock */ + + spin_lock(&scd->scd_lock); + + if (rc == 0) { + if (!sv->sv_shuttingdown) + return 0; + + spin_unlock(&scd->scd_lock); + /* srpc_shutdown_service might have tried to unlink me + * when my buf_mdh was still invalid */ + LNetMDUnlink(buf->buf_mdh); + spin_lock(&scd->scd_lock); + return 0; + } + + scd->scd_buf_nposted--; + if (sv->sv_shuttingdown) + return rc; /* don't allow to change scd_buf_posted */ + + list_del(&buf->buf_list); + spin_unlock(&scd->scd_lock); + + LIBCFS_FREE(buf, sizeof(*buf)); + + spin_lock(&scd->scd_lock); + return rc; +} + +int +srpc_add_buffer(struct swi_workitem *wi) +{ + struct srpc_service_cd *scd = wi->swi_workitem.wi_data; + struct srpc_buffer *buf; + int rc = 0; + + /* it's called by workitem scheduler threads, these threads + * should have been set CPT affinity, so buffers will be posted + * on CPT local list of Portal */ + spin_lock(&scd->scd_lock); + + while (scd->scd_buf_adjust > 0 && + !scd->scd_svc->sv_shuttingdown) { + scd->scd_buf_adjust--; /* consume it */ + scd->scd_buf_posting++; + + spin_unlock(&scd->scd_lock); + + LIBCFS_ALLOC(buf, sizeof(*buf)); + if (buf == NULL) { + CERROR("Failed to add new buf to service: %s\n", + scd->scd_svc->sv_name); + spin_lock(&scd->scd_lock); + rc = -ENOMEM; + break; + } + + spin_lock(&scd->scd_lock); + if (scd->scd_svc->sv_shuttingdown) { + spin_unlock(&scd->scd_lock); + LIBCFS_FREE(buf, sizeof(*buf)); + + spin_lock(&scd->scd_lock); + rc = -ESHUTDOWN; + break; + } + + rc = srpc_service_post_buffer(scd, buf); + if (rc != 0) + break; /* buf has been freed inside */ + + LASSERT(scd->scd_buf_posting > 0); + scd->scd_buf_posting--; + scd->scd_buf_total++; + scd->scd_buf_low = max(2, scd->scd_buf_total / 4); + } + + if (rc != 0) { + scd->scd_buf_err_stamp = get_seconds(); + scd->scd_buf_err = rc; + + LASSERT(scd->scd_buf_posting > 0); + scd->scd_buf_posting--; + } + + spin_unlock(&scd->scd_lock); + return 0; +} + +int +srpc_service_add_buffers(struct srpc_service *sv, int nbuffer) +{ + struct srpc_service_cd *scd; + int rc = 0; + int i; + + LASSERTF(nbuffer > 0, "nbuffer must be positive: %d\n", nbuffer); + + cfs_percpt_for_each(scd, i, sv->sv_cpt_data) { + spin_lock(&scd->scd_lock); + + scd->scd_buf_err = 0; + scd->scd_buf_err_stamp = 0; + scd->scd_buf_posting = 0; + scd->scd_buf_adjust = nbuffer; + /* start to post buffers */ + swi_schedule_workitem(&scd->scd_buf_wi); + spin_unlock(&scd->scd_lock); + + /* framework service only post buffer for one partition */ + if (srpc_serv_is_framework(sv)) + break; + } + + cfs_percpt_for_each(scd, i, sv->sv_cpt_data) { + spin_lock(&scd->scd_lock); + /* + * NB: srpc_service_add_buffers() can be called inside + * thread context of lst_sched_serial, and we don't normally + * allow to sleep inside thread context of WI scheduler + * because it will block current scheduler thread from doing + * anything else, even worse, it could deadlock if it's + * waiting on result from another WI of the same scheduler. + * However, it's safe at here because scd_buf_wi is scheduled + * by thread in a different WI scheduler (lst_sched_test), + * so we don't have any risk of deadlock, though this could + * block all WIs pending on lst_sched_serial for a moment + * which is not good but not fatal. 
+ */ + lst_wait_until(scd->scd_buf_err != 0 || + (scd->scd_buf_adjust == 0 && + scd->scd_buf_posting == 0), + scd->scd_lock, "waiting for adding buffer\n"); + + if (scd->scd_buf_err != 0 && rc == 0) + rc = scd->scd_buf_err; + + spin_unlock(&scd->scd_lock); + } + + return rc; +} + +void +srpc_service_remove_buffers(struct srpc_service *sv, int nbuffer) +{ + struct srpc_service_cd *scd; + int num; + int i; + + LASSERT(!sv->sv_shuttingdown); + + cfs_percpt_for_each(scd, i, sv->sv_cpt_data) { + spin_lock(&scd->scd_lock); + + num = scd->scd_buf_total + scd->scd_buf_posting; + scd->scd_buf_adjust -= min(nbuffer, num); + + spin_unlock(&scd->scd_lock); + } +} + +/* returns 1 if sv has finished, otherwise 0 */ +int +srpc_finish_service(struct srpc_service *sv) +{ + struct srpc_service_cd *scd; + struct srpc_server_rpc *rpc; + int i; + + LASSERT(sv->sv_shuttingdown); /* srpc_shutdown_service called */ + + cfs_percpt_for_each(scd, i, sv->sv_cpt_data) { + spin_lock(&scd->scd_lock); + if (!swi_deschedule_workitem(&scd->scd_buf_wi)) { + spin_unlock(&scd->scd_lock); + return 0; + } + + if (scd->scd_buf_nposted > 0) { + CDEBUG(D_NET, "waiting for %d posted buffers to unlink", + scd->scd_buf_nposted); + spin_unlock(&scd->scd_lock); + return 0; + } + + if (list_empty(&scd->scd_rpc_active)) { + spin_unlock(&scd->scd_lock); + continue; + } + + rpc = list_entry(scd->scd_rpc_active.next, + struct srpc_server_rpc, srpc_list); + CNETERR("Active RPC %p on shutdown: sv %s, peer %s, wi %s scheduled %d running %d, ev fired %d type %d status %d lnet %d\n", + rpc, sv->sv_name, libcfs_id2str(rpc->srpc_peer), + swi_state2str(rpc->srpc_wi.swi_state), + rpc->srpc_wi.swi_workitem.wi_scheduled, + rpc->srpc_wi.swi_workitem.wi_running, + rpc->srpc_ev.ev_fired, rpc->srpc_ev.ev_type, + rpc->srpc_ev.ev_status, rpc->srpc_ev.ev_lnet); + spin_unlock(&scd->scd_lock); + return 0; + } + + /* no lock needed from now on */ + srpc_service_fini(sv); + return 1; +} + +/* called with sv->sv_lock held */ +static void +srpc_service_recycle_buffer(struct srpc_service_cd *scd, srpc_buffer_t *buf) + __must_hold(&scd->scd_lock) +{ + if (!scd->scd_svc->sv_shuttingdown && scd->scd_buf_adjust >= 0) { + if (srpc_service_post_buffer(scd, buf) != 0) { + CWARN("Failed to post %s buffer\n", + scd->scd_svc->sv_name); + } + return; + } + + /* service is shutting down, or we want to recycle some buffers */ + scd->scd_buf_total--; + + if (scd->scd_buf_adjust < 0) { + scd->scd_buf_adjust++; + if (scd->scd_buf_adjust < 0 && + scd->scd_buf_total == 0 && scd->scd_buf_posting == 0) { + CDEBUG(D_INFO, + "Try to recycle %d buffers but nothing left\n", + scd->scd_buf_adjust); + scd->scd_buf_adjust = 0; + } + } + + spin_unlock(&scd->scd_lock); + LIBCFS_FREE(buf, sizeof(*buf)); + spin_lock(&scd->scd_lock); +} + +void +srpc_abort_service(struct srpc_service *sv) +{ + struct srpc_service_cd *scd; + struct srpc_server_rpc *rpc; + int i; + + CDEBUG(D_NET, "Aborting service: id %d, name %s\n", + sv->sv_id, sv->sv_name); + + cfs_percpt_for_each(scd, i, sv->sv_cpt_data) { + spin_lock(&scd->scd_lock); + + /* schedule in-flight RPCs to notice the abort, NB: + * racing with incoming RPCs; complete fix should make test + * RPCs carry session ID in its headers */ + list_for_each_entry(rpc, &scd->scd_rpc_active, srpc_list) { + rpc->srpc_aborted = 1; + swi_schedule_workitem(&rpc->srpc_wi); + } + + spin_unlock(&scd->scd_lock); + } +} + +void +srpc_shutdown_service(srpc_service_t *sv) +{ + struct srpc_service_cd *scd; + struct srpc_server_rpc *rpc; + srpc_buffer_t *buf; + int i; + + 
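[Editor's sketch, not part of the patch: srpc_service_add_buffers() above kicks off buffer posting on the workitem schedulers and then blocks in lst_wait_until() until either the posting counters drain or an error is recorded. The userspace C below shows the same "start work asynchronously, wait for a count or an error" handshake using a pthread condition variable instead of lst_wait_until() over scd_lock; names and the target count are invented.]

	/* Post asynchronously, then wait for completion or failure. */
	#include <pthread.h>
	#include <stdio.h>

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
	static int nposted;
	static int err;

	static void *poster(void *arg)
	{
		int target = *(int *)arg;
		int i;

		for (i = 0; i < target; i++) {
			pthread_mutex_lock(&lock);
			nposted++;			/* one more buffer posted */
			pthread_cond_signal(&cond);
			pthread_mutex_unlock(&lock);
		}
		return NULL;
	}

	int main(void)
	{
		int target = 16;
		pthread_t tid;

		pthread_create(&tid, NULL, poster, &target);

		pthread_mutex_lock(&lock);
		while (err == 0 && nposted < target)	/* wait for all posts or an error */
			pthread_cond_wait(&cond, &lock);
		pthread_mutex_unlock(&lock);

		pthread_join(tid, NULL);
		printf("posted %d buffers, err %d\n", nposted, err);
		return err != 0;
	}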
CDEBUG(D_NET, "Shutting down service: id %d, name %s\n", + sv->sv_id, sv->sv_name); + + cfs_percpt_for_each(scd, i, sv->sv_cpt_data) + spin_lock(&scd->scd_lock); + + sv->sv_shuttingdown = 1; /* i.e. no new active RPC */ + + cfs_percpt_for_each(scd, i, sv->sv_cpt_data) + spin_unlock(&scd->scd_lock); + + cfs_percpt_for_each(scd, i, sv->sv_cpt_data) { + spin_lock(&scd->scd_lock); + + /* schedule in-flight RPCs to notice the shutdown */ + list_for_each_entry(rpc, &scd->scd_rpc_active, srpc_list) + swi_schedule_workitem(&rpc->srpc_wi); + + spin_unlock(&scd->scd_lock); + + /* OK to traverse scd_buf_posted without lock, since no one + * touches scd_buf_posted now */ + list_for_each_entry(buf, &scd->scd_buf_posted, buf_list) + LNetMDUnlink(buf->buf_mdh); + } +} + +static int +srpc_send_request(srpc_client_rpc_t *rpc) +{ + srpc_event_t *ev = &rpc->crpc_reqstev; + int rc; + + ev->ev_fired = 0; + ev->ev_data = rpc; + ev->ev_type = SRPC_REQUEST_SENT; + + rc = srpc_post_active_rqtbuf(rpc->crpc_dest, rpc->crpc_service, + &rpc->crpc_reqstmsg, sizeof(srpc_msg_t), + &rpc->crpc_reqstmdh, ev); + if (rc != 0) { + LASSERT(rc == -ENOMEM); + ev->ev_fired = 1; /* no more event expected */ + } + return rc; +} + +static int +srpc_prepare_reply(srpc_client_rpc_t *rpc) +{ + srpc_event_t *ev = &rpc->crpc_replyev; + __u64 *id = &rpc->crpc_reqstmsg.msg_body.reqst.rpyid; + int rc; + + ev->ev_fired = 0; + ev->ev_data = rpc; + ev->ev_type = SRPC_REPLY_RCVD; + + *id = srpc_next_id(); + + rc = srpc_post_passive_rdma(SRPC_RDMA_PORTAL, 0, *id, + &rpc->crpc_replymsg, sizeof(srpc_msg_t), + LNET_MD_OP_PUT, rpc->crpc_dest, + &rpc->crpc_replymdh, ev); + if (rc != 0) { + LASSERT(rc == -ENOMEM); + ev->ev_fired = 1; /* no more event expected */ + } + return rc; +} + +static int +srpc_prepare_bulk(srpc_client_rpc_t *rpc) +{ + srpc_bulk_t *bk = &rpc->crpc_bulk; + srpc_event_t *ev = &rpc->crpc_bulkev; + __u64 *id = &rpc->crpc_reqstmsg.msg_body.reqst.bulkid; + int rc; + int opt; + + LASSERT(bk->bk_niov <= LNET_MAX_IOV); + + if (bk->bk_niov == 0) + return 0; /* nothing to do */ + + opt = bk->bk_sink ? LNET_MD_OP_PUT : LNET_MD_OP_GET; + opt |= LNET_MD_KIOV; + + ev->ev_fired = 0; + ev->ev_data = rpc; + ev->ev_type = SRPC_BULK_REQ_RCVD; + + *id = srpc_next_id(); + + rc = srpc_post_passive_rdma(SRPC_RDMA_PORTAL, 0, *id, + &bk->bk_iovs[0], bk->bk_niov, opt, + rpc->crpc_dest, &bk->bk_mdh, ev); + if (rc != 0) { + LASSERT(rc == -ENOMEM); + ev->ev_fired = 1; /* no more event expected */ + } + return rc; +} + +static int +srpc_do_bulk(srpc_server_rpc_t *rpc) +{ + srpc_event_t *ev = &rpc->srpc_ev; + srpc_bulk_t *bk = rpc->srpc_bulk; + __u64 id = rpc->srpc_reqstbuf->buf_msg.msg_body.reqst.bulkid; + int rc; + int opt; + + LASSERT(bk != NULL); + + opt = bk->bk_sink ? LNET_MD_OP_GET : LNET_MD_OP_PUT; + opt |= LNET_MD_KIOV; + + ev->ev_fired = 0; + ev->ev_data = rpc; + ev->ev_type = bk->bk_sink ? SRPC_BULK_GET_RPLD : SRPC_BULK_PUT_SENT; + + rc = srpc_post_active_rdma(SRPC_RDMA_PORTAL, id, + &bk->bk_iovs[0], bk->bk_niov, opt, + rpc->srpc_peer, rpc->srpc_self, + &bk->bk_mdh, ev); + if (rc != 0) + ev->ev_fired = 1; /* no more event expected */ + return rc; +} + +/* only called from srpc_handle_rpc */ +static void +srpc_server_rpc_done(srpc_server_rpc_t *rpc, int status) +{ + struct srpc_service_cd *scd = rpc->srpc_scd; + struct srpc_service *sv = scd->scd_svc; + srpc_buffer_t *buffer; + + LASSERT(status != 0 || rpc->srpc_wi.swi_state == SWI_STATE_DONE); + + rpc->srpc_status = status; + + CDEBUG_LIMIT(status == 0 ? 
D_NET : D_NETERROR, + "Server RPC %p done: service %s, peer %s, status %s:%d\n", + rpc, sv->sv_name, libcfs_id2str(rpc->srpc_peer), + swi_state2str(rpc->srpc_wi.swi_state), status); + + if (status != 0) { + spin_lock(&srpc_data.rpc_glock); + srpc_data.rpc_counters.rpcs_dropped++; + spin_unlock(&srpc_data.rpc_glock); + } + + if (rpc->srpc_done != NULL) + (*rpc->srpc_done) (rpc); + LASSERT(rpc->srpc_bulk == NULL); + + spin_lock(&scd->scd_lock); + + if (rpc->srpc_reqstbuf != NULL) { + /* NB might drop sv_lock in srpc_service_recycle_buffer, but + * sv won't go away for scd_rpc_active must not be empty */ + srpc_service_recycle_buffer(scd, rpc->srpc_reqstbuf); + rpc->srpc_reqstbuf = NULL; + } + + list_del(&rpc->srpc_list); /* from scd->scd_rpc_active */ + + /* + * No one can schedule me now since: + * - I'm not on scd_rpc_active. + * - all LNet events have been fired. + * Cancel pending schedules and prevent future schedule attempts: + */ + LASSERT(rpc->srpc_ev.ev_fired); + swi_exit_workitem(&rpc->srpc_wi); + + if (!sv->sv_shuttingdown && !list_empty(&scd->scd_buf_blocked)) { + buffer = list_entry(scd->scd_buf_blocked.next, + srpc_buffer_t, buf_list); + list_del(&buffer->buf_list); + + srpc_init_server_rpc(rpc, scd, buffer); + list_add_tail(&rpc->srpc_list, &scd->scd_rpc_active); + swi_schedule_workitem(&rpc->srpc_wi); + } else { + list_add(&rpc->srpc_list, &scd->scd_rpc_free); + } + + spin_unlock(&scd->scd_lock); + return; +} + +/* handles an incoming RPC */ +int +srpc_handle_rpc(swi_workitem_t *wi) +{ + struct srpc_server_rpc *rpc = wi->swi_workitem.wi_data; + struct srpc_service_cd *scd = rpc->srpc_scd; + struct srpc_service *sv = scd->scd_svc; + srpc_event_t *ev = &rpc->srpc_ev; + int rc = 0; + + LASSERT(wi == &rpc->srpc_wi); + + spin_lock(&scd->scd_lock); + + if (sv->sv_shuttingdown || rpc->srpc_aborted) { + spin_unlock(&scd->scd_lock); + + if (rpc->srpc_bulk != NULL) + LNetMDUnlink(rpc->srpc_bulk->bk_mdh); + LNetMDUnlink(rpc->srpc_replymdh); + + if (ev->ev_fired) { /* no more event, OK to finish */ + srpc_server_rpc_done(rpc, -ESHUTDOWN); + return 1; + } + return 0; + } + + spin_unlock(&scd->scd_lock); + + switch (wi->swi_state) { + default: + LBUG(); + case SWI_STATE_NEWBORN: { + srpc_msg_t *msg; + srpc_generic_reply_t *reply; + + msg = &rpc->srpc_reqstbuf->buf_msg; + reply = &rpc->srpc_replymsg.msg_body.reply; + + if (msg->msg_magic == 0) { + /* moaned already in srpc_lnet_ev_handler */ + srpc_server_rpc_done(rpc, EBADMSG); + return 1; + } + + srpc_unpack_msg_hdr(msg); + if (msg->msg_version != SRPC_MSG_VERSION) { + CWARN("Version mismatch: %u, %u expected, from %s\n", + msg->msg_version, SRPC_MSG_VERSION, + libcfs_id2str(rpc->srpc_peer)); + reply->status = EPROTO; + /* drop through and send reply */ + } else { + reply->status = 0; + rc = (*sv->sv_handler)(rpc); + LASSERT(reply->status == 0 || !rpc->srpc_bulk); + if (rc != 0) { + srpc_server_rpc_done(rpc, rc); + return 1; + } + } + + wi->swi_state = SWI_STATE_BULK_STARTED; + + if (rpc->srpc_bulk != NULL) { + rc = srpc_do_bulk(rpc); + if (rc == 0) + return 0; /* wait for bulk */ + + LASSERT(ev->ev_fired); + ev->ev_status = rc; + } + } + case SWI_STATE_BULK_STARTED: + LASSERT(rpc->srpc_bulk == NULL || ev->ev_fired); + + if (rpc->srpc_bulk != NULL) { + rc = ev->ev_status; + + if (sv->sv_bulk_ready != NULL) + rc = (*sv->sv_bulk_ready) (rpc, rc); + + if (rc != 0) { + srpc_server_rpc_done(rpc, rc); + return 1; + } + } + + wi->swi_state = SWI_STATE_REPLY_SUBMITTED; + rc = srpc_send_reply(rpc); + if (rc == 0) + return 0; /* wait for reply */ + 
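	/*
	 * Reaching the line below means srpc_send_reply() failed, so the RPC
	 * is completed immediately instead of waiting for the SRPC_REPLY_SENT
	 * event.  For orientation, the server-side state machine this switch
	 * implements is roughly (a sketch inferred from the swi_state
	 * assignments in this function, not an authoritative diagram):
	 *
	 *	SWI_STATE_NEWBORN
	 *	    -> sv_handler(), then srpc_do_bulk() if there is bulk data
	 *	SWI_STATE_BULK_STARTED
	 *	    -> sv_bulk_ready() (if set), then srpc_send_reply()
	 *	SWI_STATE_REPLY_SUBMITTED
	 *	    -> reply event fired, srpc_server_rpc_done()
	 *	SWI_STATE_DONE
	 *
	 * srpc_handle_rpc() is re-entered (via swi_schedule_workitem() from
	 * srpc_lnet_ev_handler()) each time a bulk or reply event fires,
	 * which is why the later states can assert that ev_fired is set.
	 */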
srpc_server_rpc_done(rpc, rc); + return 1; + + case SWI_STATE_REPLY_SUBMITTED: + if (!ev->ev_fired) { + CERROR("RPC %p: bulk %p, service %d\n", + rpc, rpc->srpc_bulk, sv->sv_id); + CERROR("Event: status %d, type %d, lnet %d\n", + ev->ev_status, ev->ev_type, ev->ev_lnet); + LASSERT(ev->ev_fired); + } + + wi->swi_state = SWI_STATE_DONE; + srpc_server_rpc_done(rpc, ev->ev_status); + return 1; + } + + return 0; +} + +static void +srpc_client_rpc_expired(void *data) +{ + srpc_client_rpc_t *rpc = data; + + CWARN("Client RPC expired: service %d, peer %s, timeout %d.\n", + rpc->crpc_service, libcfs_id2str(rpc->crpc_dest), + rpc->crpc_timeout); + + spin_lock(&rpc->crpc_lock); + + rpc->crpc_timeout = 0; + srpc_abort_rpc(rpc, -ETIMEDOUT); + + spin_unlock(&rpc->crpc_lock); + + spin_lock(&srpc_data.rpc_glock); + srpc_data.rpc_counters.rpcs_expired++; + spin_unlock(&srpc_data.rpc_glock); +} + +inline void +srpc_add_client_rpc_timer(srpc_client_rpc_t *rpc) +{ + stt_timer_t *timer = &rpc->crpc_timer; + + if (rpc->crpc_timeout == 0) + return; + + INIT_LIST_HEAD(&timer->stt_list); + timer->stt_data = rpc; + timer->stt_func = srpc_client_rpc_expired; + timer->stt_expires = cfs_time_add(rpc->crpc_timeout, + get_seconds()); + stt_add_timer(timer); + return; +} + +/* + * Called with rpc->crpc_lock held. + * + * Upon exit the RPC expiry timer is not queued and the handler is not + * running on any CPU. */ +static void +srpc_del_client_rpc_timer(srpc_client_rpc_t *rpc) +{ + /* timer not planted or already exploded */ + if (rpc->crpc_timeout == 0) + return; + + /* timer successfully defused */ + if (stt_del_timer(&rpc->crpc_timer)) + return; + + /* timer detonated, wait for it to explode */ + while (rpc->crpc_timeout != 0) { + spin_unlock(&rpc->crpc_lock); + + schedule(); + + spin_lock(&rpc->crpc_lock); + } +} + +static void +srpc_client_rpc_done(srpc_client_rpc_t *rpc, int status) +{ + swi_workitem_t *wi = &rpc->crpc_wi; + + LASSERT(status != 0 || wi->swi_state == SWI_STATE_DONE); + + spin_lock(&rpc->crpc_lock); + + rpc->crpc_closed = 1; + if (rpc->crpc_status == 0) + rpc->crpc_status = status; + + srpc_del_client_rpc_timer(rpc); + + CDEBUG_LIMIT((status == 0) ? D_NET : D_NETERROR, + "Client RPC done: service %d, peer %s, status %s:%d:%d\n", + rpc->crpc_service, libcfs_id2str(rpc->crpc_dest), + swi_state2str(wi->swi_state), rpc->crpc_aborted, status); + + /* + * No one can schedule me now since: + * - RPC timer has been defused. + * - all LNet events have been fired. + * - crpc_closed has been set, preventing srpc_abort_rpc from + * scheduling me. 
+ * Cancel pending schedules and prevent future schedule attempts: + */ + LASSERT(!srpc_event_pending(rpc)); + swi_exit_workitem(wi); + + spin_unlock(&rpc->crpc_lock); + + (*rpc->crpc_done)(rpc); + return; +} + +/* sends an outgoing RPC */ +int +srpc_send_rpc(swi_workitem_t *wi) +{ + int rc = 0; + srpc_client_rpc_t *rpc; + srpc_msg_t *reply; + int do_bulk; + + LASSERT(wi != NULL); + + rpc = wi->swi_workitem.wi_data; + + LASSERT(rpc != NULL); + LASSERT(wi == &rpc->crpc_wi); + + reply = &rpc->crpc_replymsg; + do_bulk = rpc->crpc_bulk.bk_niov > 0; + + spin_lock(&rpc->crpc_lock); + + if (rpc->crpc_aborted) { + spin_unlock(&rpc->crpc_lock); + goto abort; + } + + spin_unlock(&rpc->crpc_lock); + + switch (wi->swi_state) { + default: + LBUG(); + case SWI_STATE_NEWBORN: + LASSERT(!srpc_event_pending(rpc)); + + rc = srpc_prepare_reply(rpc); + if (rc != 0) { + srpc_client_rpc_done(rpc, rc); + return 1; + } + + rc = srpc_prepare_bulk(rpc); + if (rc != 0) + break; + + wi->swi_state = SWI_STATE_REQUEST_SUBMITTED; + rc = srpc_send_request(rpc); + break; + + case SWI_STATE_REQUEST_SUBMITTED: + /* CAVEAT EMPTOR: rqtev, rpyev, and bulkev may come in any + * order; however, they're processed in a strict order: + * rqt, rpy, and bulk. */ + if (!rpc->crpc_reqstev.ev_fired) + break; + + rc = rpc->crpc_reqstev.ev_status; + if (rc != 0) + break; + + wi->swi_state = SWI_STATE_REQUEST_SENT; + /* perhaps more events, fall thru */ + case SWI_STATE_REQUEST_SENT: { + srpc_msg_type_t type = srpc_service2reply(rpc->crpc_service); + + if (!rpc->crpc_replyev.ev_fired) + break; + + rc = rpc->crpc_replyev.ev_status; + if (rc != 0) + break; + + srpc_unpack_msg_hdr(reply); + if (reply->msg_type != type || + (reply->msg_magic != SRPC_MSG_MAGIC && + reply->msg_magic != __swab32(SRPC_MSG_MAGIC))) { + CWARN("Bad message from %s: type %u (%d expected), magic %u (%d expected).\n", + libcfs_id2str(rpc->crpc_dest), + reply->msg_type, type, + reply->msg_magic, SRPC_MSG_MAGIC); + rc = -EBADMSG; + break; + } + + if (do_bulk && reply->msg_body.reply.status != 0) { + CWARN("Remote error %d at %s, unlink bulk buffer in case peer didn't initiate bulk transfer\n", + reply->msg_body.reply.status, + libcfs_id2str(rpc->crpc_dest)); + LNetMDUnlink(rpc->crpc_bulk.bk_mdh); + } + + wi->swi_state = SWI_STATE_REPLY_RECEIVED; + } + case SWI_STATE_REPLY_RECEIVED: + if (do_bulk && !rpc->crpc_bulkev.ev_fired) + break; + + rc = do_bulk ? rpc->crpc_bulkev.ev_status : 0; + + /* Bulk buffer was unlinked due to remote error. Clear error + * since reply buffer still contains valid data. + * NB rpc->crpc_done shouldn't look into bulk data in case of + * remote error. 
*/ + if (do_bulk && rpc->crpc_bulkev.ev_lnet == LNET_EVENT_UNLINK && + rpc->crpc_status == 0 && reply->msg_body.reply.status != 0) + rc = 0; + + wi->swi_state = SWI_STATE_DONE; + srpc_client_rpc_done(rpc, rc); + return 1; + } + + if (rc != 0) { + spin_lock(&rpc->crpc_lock); + srpc_abort_rpc(rpc, rc); + spin_unlock(&rpc->crpc_lock); + } + +abort: + if (rpc->crpc_aborted) { + LNetMDUnlink(rpc->crpc_reqstmdh); + LNetMDUnlink(rpc->crpc_replymdh); + LNetMDUnlink(rpc->crpc_bulk.bk_mdh); + + if (!srpc_event_pending(rpc)) { + srpc_client_rpc_done(rpc, -EINTR); + return 1; + } + } + return 0; +} + +srpc_client_rpc_t * +srpc_create_client_rpc(lnet_process_id_t peer, int service, + int nbulkiov, int bulklen, + void (*rpc_done)(srpc_client_rpc_t *), + void (*rpc_fini)(srpc_client_rpc_t *), void *priv) +{ + srpc_client_rpc_t *rpc; + + LIBCFS_ALLOC(rpc, offsetof(srpc_client_rpc_t, + crpc_bulk.bk_iovs[nbulkiov])); + if (rpc == NULL) + return NULL; + + srpc_init_client_rpc(rpc, peer, service, nbulkiov, + bulklen, rpc_done, rpc_fini, priv); + return rpc; +} + +/* called with rpc->crpc_lock held */ +void +srpc_abort_rpc(srpc_client_rpc_t *rpc, int why) +{ + LASSERT(why != 0); + + if (rpc->crpc_aborted || /* already aborted */ + rpc->crpc_closed) /* callback imminent */ + return; + + CDEBUG(D_NET, + "Aborting RPC: service %d, peer %s, state %s, why %d\n", + rpc->crpc_service, libcfs_id2str(rpc->crpc_dest), + swi_state2str(rpc->crpc_wi.swi_state), why); + + rpc->crpc_aborted = 1; + rpc->crpc_status = why; + swi_schedule_workitem(&rpc->crpc_wi); + return; +} + +/* called with rpc->crpc_lock held */ +void +srpc_post_rpc(srpc_client_rpc_t *rpc) +{ + LASSERT(!rpc->crpc_aborted); + LASSERT(srpc_data.rpc_state == SRPC_STATE_RUNNING); + + CDEBUG(D_NET, "Posting RPC: peer %s, service %d, timeout %d\n", + libcfs_id2str(rpc->crpc_dest), rpc->crpc_service, + rpc->crpc_timeout); + + srpc_add_client_rpc_timer(rpc); + swi_schedule_workitem(&rpc->crpc_wi); + return; +} + + +int +srpc_send_reply(struct srpc_server_rpc *rpc) +{ + srpc_event_t *ev = &rpc->srpc_ev; + struct srpc_msg *msg = &rpc->srpc_replymsg; + struct srpc_buffer *buffer = rpc->srpc_reqstbuf; + struct srpc_service_cd *scd = rpc->srpc_scd; + struct srpc_service *sv = scd->scd_svc; + __u64 rpyid; + int rc; + + LASSERT(buffer != NULL); + rpyid = buffer->buf_msg.msg_body.reqst.rpyid; + + spin_lock(&scd->scd_lock); + + if (!sv->sv_shuttingdown && !srpc_serv_is_framework(sv)) { + /* Repost buffer before replying since test client + * might send me another RPC once it gets the reply */ + if (srpc_service_post_buffer(scd, buffer) != 0) + CWARN("Failed to repost %s buffer\n", sv->sv_name); + rpc->srpc_reqstbuf = NULL; + } + + spin_unlock(&scd->scd_lock); + + ev->ev_fired = 0; + ev->ev_data = rpc; + ev->ev_type = SRPC_REPLY_SENT; + + msg->msg_magic = SRPC_MSG_MAGIC; + msg->msg_version = SRPC_MSG_VERSION; + msg->msg_type = srpc_service2reply(sv->sv_id); + + rc = srpc_post_active_rdma(SRPC_RDMA_PORTAL, rpyid, msg, + sizeof(*msg), LNET_MD_OP_PUT, + rpc->srpc_peer, rpc->srpc_self, + &rpc->srpc_replymdh, ev); + if (rc != 0) + ev->ev_fired = 1; /* no more event expected */ + return rc; +} + +/* when in kernel always called with LNET_LOCK() held, and in thread context */ +static void +srpc_lnet_ev_handler(lnet_event_t *ev) +{ + struct srpc_service_cd *scd; + srpc_event_t *rpcev = ev->md.user_ptr; + srpc_client_rpc_t *crpc; + srpc_server_rpc_t *srpc; + srpc_buffer_t *buffer; + srpc_service_t *sv; + srpc_msg_t *msg; + srpc_msg_type_t type; + + LASSERT(!in_interrupt()); + + if 
(ev->status != 0) { + spin_lock(&srpc_data.rpc_glock); + srpc_data.rpc_counters.errors++; + spin_unlock(&srpc_data.rpc_glock); + } + + rpcev->ev_lnet = ev->type; + + switch (rpcev->ev_type) { + default: + CERROR("Unknown event: status %d, type %d, lnet %d\n", + rpcev->ev_status, rpcev->ev_type, rpcev->ev_lnet); + LBUG(); + case SRPC_REQUEST_SENT: + if (ev->status == 0 && ev->type != LNET_EVENT_UNLINK) { + spin_lock(&srpc_data.rpc_glock); + srpc_data.rpc_counters.rpcs_sent++; + spin_unlock(&srpc_data.rpc_glock); + } + case SRPC_REPLY_RCVD: + case SRPC_BULK_REQ_RCVD: + crpc = rpcev->ev_data; + + if (rpcev != &crpc->crpc_reqstev && + rpcev != &crpc->crpc_replyev && + rpcev != &crpc->crpc_bulkev) { + CERROR("rpcev %p, crpc %p, reqstev %p, replyev %p, bulkev %p\n", + rpcev, crpc, &crpc->crpc_reqstev, + &crpc->crpc_replyev, &crpc->crpc_bulkev); + CERROR("Bad event: status %d, type %d, lnet %d\n", + rpcev->ev_status, rpcev->ev_type, rpcev->ev_lnet); + LBUG(); + } + + spin_lock(&crpc->crpc_lock); + + LASSERT(rpcev->ev_fired == 0); + rpcev->ev_fired = 1; + rpcev->ev_status = (ev->type == LNET_EVENT_UNLINK) ? + -EINTR : ev->status; + swi_schedule_workitem(&crpc->crpc_wi); + + spin_unlock(&crpc->crpc_lock); + break; + + case SRPC_REQUEST_RCVD: + scd = rpcev->ev_data; + sv = scd->scd_svc; + + LASSERT(rpcev == &scd->scd_ev); + + spin_lock(&scd->scd_lock); + + LASSERT(ev->unlinked); + LASSERT(ev->type == LNET_EVENT_PUT || + ev->type == LNET_EVENT_UNLINK); + LASSERT(ev->type != LNET_EVENT_UNLINK || + sv->sv_shuttingdown); + + buffer = container_of(ev->md.start, srpc_buffer_t, buf_msg); + buffer->buf_peer = ev->initiator; + buffer->buf_self = ev->target.nid; + + LASSERT(scd->scd_buf_nposted > 0); + scd->scd_buf_nposted--; + + if (sv->sv_shuttingdown) { + /* Leave buffer on scd->scd_buf_nposted since + * srpc_finish_service needs to traverse it. */ + spin_unlock(&scd->scd_lock); + break; + } + + if (scd->scd_buf_err_stamp != 0 && + scd->scd_buf_err_stamp < get_seconds()) { + /* re-enable adding buffer */ + scd->scd_buf_err_stamp = 0; + scd->scd_buf_err = 0; + } + + if (scd->scd_buf_err == 0 && /* adding buffer is enabled */ + scd->scd_buf_adjust == 0 && + scd->scd_buf_nposted < scd->scd_buf_low) { + scd->scd_buf_adjust = max(scd->scd_buf_total / 2, + SFW_TEST_WI_MIN); + swi_schedule_workitem(&scd->scd_buf_wi); + } + + list_del(&buffer->buf_list); /* from scd->scd_buf_posted */ + msg = &buffer->buf_msg; + type = srpc_service2request(sv->sv_id); + + if (ev->status != 0 || ev->mlength != sizeof(*msg) || + (msg->msg_type != type && + msg->msg_type != __swab32(type)) || + (msg->msg_magic != SRPC_MSG_MAGIC && + msg->msg_magic != __swab32(SRPC_MSG_MAGIC))) { + CERROR("Dropping RPC (%s) from %s: status %d mlength %d type %u magic %u.\n", + sv->sv_name, libcfs_id2str(ev->initiator), + ev->status, ev->mlength, + msg->msg_type, msg->msg_magic); + + /* NB can't call srpc_service_recycle_buffer here since + * it may call LNetM[DE]Attach. 
The invalid magic tells + * srpc_handle_rpc to drop this RPC */ + msg->msg_magic = 0; + } + + if (!list_empty(&scd->scd_rpc_free)) { + srpc = list_entry(scd->scd_rpc_free.next, + struct srpc_server_rpc, + srpc_list); + list_del(&srpc->srpc_list); + + srpc_init_server_rpc(srpc, scd, buffer); + list_add_tail(&srpc->srpc_list, + &scd->scd_rpc_active); + swi_schedule_workitem(&srpc->srpc_wi); + } else { + list_add_tail(&buffer->buf_list, + &scd->scd_buf_blocked); + } + + spin_unlock(&scd->scd_lock); + + spin_lock(&srpc_data.rpc_glock); + srpc_data.rpc_counters.rpcs_rcvd++; + spin_unlock(&srpc_data.rpc_glock); + break; + + case SRPC_BULK_GET_RPLD: + LASSERT(ev->type == LNET_EVENT_SEND || + ev->type == LNET_EVENT_REPLY || + ev->type == LNET_EVENT_UNLINK); + + if (!ev->unlinked) + break; /* wait for final event */ + + case SRPC_BULK_PUT_SENT: + if (ev->status == 0 && ev->type != LNET_EVENT_UNLINK) { + spin_lock(&srpc_data.rpc_glock); + + if (rpcev->ev_type == SRPC_BULK_GET_RPLD) + srpc_data.rpc_counters.bulk_get += ev->mlength; + else + srpc_data.rpc_counters.bulk_put += ev->mlength; + + spin_unlock(&srpc_data.rpc_glock); + } + case SRPC_REPLY_SENT: + srpc = rpcev->ev_data; + scd = srpc->srpc_scd; + + LASSERT(rpcev == &srpc->srpc_ev); + + spin_lock(&scd->scd_lock); + + rpcev->ev_fired = 1; + rpcev->ev_status = (ev->type == LNET_EVENT_UNLINK) ? + -EINTR : ev->status; + swi_schedule_workitem(&srpc->srpc_wi); + + spin_unlock(&scd->scd_lock); + break; + } +} + + +int +srpc_startup(void) +{ + int rc; + + memset(&srpc_data, 0, sizeof(struct smoketest_rpc)); + spin_lock_init(&srpc_data.rpc_glock); + + /* 1 second pause to avoid timestamp reuse */ + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + srpc_data.rpc_matchbits = ((__u64) get_seconds()) << 48; + + srpc_data.rpc_state = SRPC_STATE_NONE; + + rc = LNetNIInit(LUSTRE_SRV_LNET_PID); + if (rc < 0) { + CERROR("LNetNIInit() has failed: %d\n", rc); + return rc; + } + + srpc_data.rpc_state = SRPC_STATE_NI_INIT; + + LNetInvalidateHandle(&srpc_data.rpc_lnet_eq); + rc = LNetEQAlloc(0, srpc_lnet_ev_handler, &srpc_data.rpc_lnet_eq); + if (rc != 0) { + CERROR("LNetEQAlloc() has failed: %d\n", rc); + goto bail; + } + + rc = LNetSetLazyPortal(SRPC_FRAMEWORK_REQUEST_PORTAL); + LASSERT(rc == 0); + rc = LNetSetLazyPortal(SRPC_REQUEST_PORTAL); + LASSERT(rc == 0); + + srpc_data.rpc_state = SRPC_STATE_EQ_INIT; + + rc = stt_startup(); + +bail: + if (rc != 0) + srpc_shutdown(); + else + srpc_data.rpc_state = SRPC_STATE_RUNNING; + + return rc; +} + +void +srpc_shutdown(void) +{ + int i; + int rc; + int state; + + state = srpc_data.rpc_state; + srpc_data.rpc_state = SRPC_STATE_STOPPING; + + switch (state) { + default: + LBUG(); + case SRPC_STATE_RUNNING: + spin_lock(&srpc_data.rpc_glock); + + for (i = 0; i <= SRPC_SERVICE_MAX_ID; i++) { + srpc_service_t *sv = srpc_data.rpc_services[i]; + + LASSERTF(sv == NULL, + "service not empty: id %d, name %s\n", + i, sv->sv_name); + } + + spin_unlock(&srpc_data.rpc_glock); + + stt_shutdown(); + + case SRPC_STATE_EQ_INIT: + rc = LNetClearLazyPortal(SRPC_FRAMEWORK_REQUEST_PORTAL); + rc = LNetClearLazyPortal(SRPC_REQUEST_PORTAL); + LASSERT(rc == 0); + rc = LNetEQFree(srpc_data.rpc_lnet_eq); + LASSERT(rc == 0); /* the EQ should have no user by now */ + + case SRPC_STATE_NI_INIT: + LNetNIFini(); + } + + return; +} diff --git a/kernel/drivers/staging/lustre/lnet/selftest/rpc.h b/kernel/drivers/staging/lustre/lnet/selftest/rpc.h new file mode 100644 index 000000000..fbeb75fe5 --- /dev/null +++ 
b/kernel/drivers/staging/lustre/lnet/selftest/rpc.h @@ -0,0 +1,302 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __SELFTEST_RPC_H__ +#define __SELFTEST_RPC_H__ + +#include "../../include/linux/lnet/lnetst.h" + +/* + * LST wired structures + * + * XXX: *REPLY == *REQST + 1 + */ +typedef enum { + SRPC_MSG_MKSN_REQST = 0, + SRPC_MSG_MKSN_REPLY = 1, + SRPC_MSG_RMSN_REQST = 2, + SRPC_MSG_RMSN_REPLY = 3, + SRPC_MSG_BATCH_REQST = 4, + SRPC_MSG_BATCH_REPLY = 5, + SRPC_MSG_STAT_REQST = 6, + SRPC_MSG_STAT_REPLY = 7, + SRPC_MSG_TEST_REQST = 8, + SRPC_MSG_TEST_REPLY = 9, + SRPC_MSG_DEBUG_REQST = 10, + SRPC_MSG_DEBUG_REPLY = 11, + SRPC_MSG_BRW_REQST = 12, + SRPC_MSG_BRW_REPLY = 13, + SRPC_MSG_PING_REQST = 14, + SRPC_MSG_PING_REPLY = 15, + SRPC_MSG_JOIN_REQST = 16, + SRPC_MSG_JOIN_REPLY = 17, +} srpc_msg_type_t; + + +/* CAVEAT EMPTOR: + * All srpc_*_reqst_t's 1st field must be matchbits of reply buffer, + * and 2nd field matchbits of bulk buffer if any. + * + * All srpc_*_reply_t's 1st field must be a __u32 status, and 2nd field + * session id if needed. 
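 *
 * Concretely, with the definitions below:
 *
 *   srpc_brw_reqst_t   { __u64 brw_rpyid;  __u64 brw_bulkid;  ... }
 *   srpc_brw_reply_t   { __u32 brw_status; ... }
 *
 * so the generic srpc_generic_reqst_t / srpc_generic_reply_t members of
 * the srpc_msg body union can be used to read the reply/bulk matchbits
 * and the status without knowing the specific message type (this is how
 * srpc_prepare_reply() and srpc_prepare_bulk() in rpc.c fill in rpyid
 * and bulkid).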
+ */ +typedef struct { + __u64 rpyid; /* reply buffer matchbits */ + __u64 bulkid; /* bulk buffer matchbits */ +} WIRE_ATTR srpc_generic_reqst_t; + +typedef struct { + __u32 status; + lst_sid_t sid; +} WIRE_ATTR srpc_generic_reply_t; + +/* FRAMEWORK RPCs */ +typedef struct { + __u64 mksn_rpyid; /* reply buffer matchbits */ + lst_sid_t mksn_sid; /* session id */ + __u32 mksn_force; /* use brute force */ + char mksn_name[LST_NAME_SIZE]; +} WIRE_ATTR srpc_mksn_reqst_t; /* make session request */ + +typedef struct { + __u32 mksn_status; /* session status */ + lst_sid_t mksn_sid; /* session id */ + __u32 mksn_timeout; /* session timeout */ + char mksn_name[LST_NAME_SIZE]; +} WIRE_ATTR srpc_mksn_reply_t; /* make session reply */ + +typedef struct { + __u64 rmsn_rpyid; /* reply buffer matchbits */ + lst_sid_t rmsn_sid; /* session id */ +} WIRE_ATTR srpc_rmsn_reqst_t; /* remove session request */ + +typedef struct { + __u32 rmsn_status; + lst_sid_t rmsn_sid; /* session id */ +} WIRE_ATTR srpc_rmsn_reply_t; /* remove session reply */ + +typedef struct { + __u64 join_rpyid; /* reply buffer matchbits */ + lst_sid_t join_sid; /* session id to join */ + char join_group[LST_NAME_SIZE]; /* group name */ +} WIRE_ATTR srpc_join_reqst_t; + +typedef struct { + __u32 join_status; /* returned status */ + lst_sid_t join_sid; /* session id */ + __u32 join_timeout; /* # seconds' inactivity to expire */ + char join_session[LST_NAME_SIZE]; /* session name */ +} WIRE_ATTR srpc_join_reply_t; + +typedef struct { + __u64 dbg_rpyid; /* reply buffer matchbits */ + lst_sid_t dbg_sid; /* session id */ + __u32 dbg_flags; /* bitmap of debug */ +} WIRE_ATTR srpc_debug_reqst_t; + +typedef struct { + __u32 dbg_status; /* returned code */ + lst_sid_t dbg_sid; /* session id */ + __u32 dbg_timeout; /* session timeout */ + __u32 dbg_nbatch; /* # of batches in the node */ + char dbg_name[LST_NAME_SIZE]; /* session name */ +} WIRE_ATTR srpc_debug_reply_t; + +#define SRPC_BATCH_OPC_RUN 1 +#define SRPC_BATCH_OPC_STOP 2 +#define SRPC_BATCH_OPC_QUERY 3 + +typedef struct { + __u64 bar_rpyid; /* reply buffer matchbits */ + lst_sid_t bar_sid; /* session id */ + lst_bid_t bar_bid; /* batch id */ + __u32 bar_opc; /* create/start/stop batch */ + __u32 bar_testidx; /* index of test */ + __u32 bar_arg; /* parameters */ +} WIRE_ATTR srpc_batch_reqst_t; + +typedef struct { + __u32 bar_status; /* status of request */ + lst_sid_t bar_sid; /* session id */ + __u32 bar_active; /* # of active tests in batch/test */ + __u32 bar_time; /* remained time */ +} WIRE_ATTR srpc_batch_reply_t; + +typedef struct { + __u64 str_rpyid; /* reply buffer matchbits */ + lst_sid_t str_sid; /* session id */ + __u32 str_type; /* type of stat */ +} WIRE_ATTR srpc_stat_reqst_t; + +typedef struct { + __u32 str_status; + lst_sid_t str_sid; + sfw_counters_t str_fw; + srpc_counters_t str_rpc; + lnet_counters_t str_lnet; +} WIRE_ATTR srpc_stat_reply_t; + +typedef struct { + __u32 blk_opc; /* bulk operation code */ + __u32 blk_npg; /* # of pages */ + __u32 blk_flags; /* reserved flags */ +} WIRE_ATTR test_bulk_req_t; + +typedef struct { + /** bulk operation code */ + __u16 blk_opc; + /** data check flags */ + __u16 blk_flags; + /** data length */ + __u32 blk_len; + /** reserved: offset */ + __u32 blk_offset; +} WIRE_ATTR test_bulk_req_v1_t; + +typedef struct { + __u32 png_size; /* size of ping message */ + __u32 png_flags; /* reserved flags */ +} WIRE_ATTR test_ping_req_t; + +typedef struct { + __u64 tsr_rpyid; /* reply buffer matchbits */ + __u64 tsr_bulkid; /* bulk buffer 
matchbits */ + lst_sid_t tsr_sid; /* session id */ + lst_bid_t tsr_bid; /* batch id */ + __u32 tsr_service; /* test type: bulk|ping|... */ + /* test client loop count or # server buffers needed */ + __u32 tsr_loop; + __u32 tsr_concur; /* concurrency of test */ + __u8 tsr_is_client; /* is test client or not */ + __u8 tsr_stop_onerr; /* stop on error */ + __u32 tsr_ndest; /* # of dest nodes */ + + union { + test_ping_req_t ping; + test_bulk_req_t bulk_v0; + test_bulk_req_v1_t bulk_v1; + } tsr_u; +} WIRE_ATTR srpc_test_reqst_t; + +typedef struct { + __u32 tsr_status; /* returned code */ + lst_sid_t tsr_sid; +} WIRE_ATTR srpc_test_reply_t; + +/* TEST RPCs */ +typedef struct { + __u64 pnr_rpyid; + __u32 pnr_magic; + __u32 pnr_seq; + __u64 pnr_time_sec; + __u64 pnr_time_usec; +} WIRE_ATTR srpc_ping_reqst_t; + +typedef struct { + __u32 pnr_status; + __u32 pnr_magic; + __u32 pnr_seq; +} WIRE_ATTR srpc_ping_reply_t; + +typedef struct { + __u64 brw_rpyid; /* reply buffer matchbits */ + __u64 brw_bulkid; /* bulk buffer matchbits */ + __u32 brw_rw; /* read or write */ + __u32 brw_len; /* bulk data len */ + __u32 brw_flags; /* bulk data patterns */ +} WIRE_ATTR srpc_brw_reqst_t; /* bulk r/w request */ + +typedef struct { + __u32 brw_status; +} WIRE_ATTR srpc_brw_reply_t; /* bulk r/w reply */ + +#define SRPC_MSG_MAGIC 0xeeb0f00d +#define SRPC_MSG_VERSION 1 + +typedef struct srpc_msg { + /** magic number */ + __u32 msg_magic; + /** message version number */ + __u32 msg_version; + /** type of message body: srpc_msg_type_t */ + __u32 msg_type; + __u32 msg_reserved0; + __u32 msg_reserved1; + /** test session features */ + __u32 msg_ses_feats; + union { + srpc_generic_reqst_t reqst; + srpc_generic_reply_t reply; + + srpc_mksn_reqst_t mksn_reqst; + srpc_mksn_reply_t mksn_reply; + srpc_rmsn_reqst_t rmsn_reqst; + srpc_rmsn_reply_t rmsn_reply; + srpc_debug_reqst_t dbg_reqst; + srpc_debug_reply_t dbg_reply; + srpc_batch_reqst_t bat_reqst; + srpc_batch_reply_t bat_reply; + srpc_stat_reqst_t stat_reqst; + srpc_stat_reply_t stat_reply; + srpc_test_reqst_t tes_reqst; + srpc_test_reply_t tes_reply; + srpc_join_reqst_t join_reqst; + srpc_join_reply_t join_reply; + + srpc_ping_reqst_t ping_reqst; + srpc_ping_reply_t ping_reply; + srpc_brw_reqst_t brw_reqst; + srpc_brw_reply_t brw_reply; + } msg_body; +} WIRE_ATTR srpc_msg_t; + +static inline void +srpc_unpack_msg_hdr(srpc_msg_t *msg) +{ + if (msg->msg_magic == SRPC_MSG_MAGIC) + return; /* no flipping needed */ + + /* We do not swap the magic number here as it is needed to + determine whether the body needs to be swapped. */ + /* __swab32s(&msg->msg_magic); */ + __swab32s(&msg->msg_type); + __swab32s(&msg->msg_version); + __swab32s(&msg->msg_ses_feats); + __swab32s(&msg->msg_reserved0); + __swab32s(&msg->msg_reserved1); +} + +#endif /* __SELFTEST_RPC_H__ */ diff --git a/kernel/drivers/staging/lustre/lnet/selftest/selftest.h b/kernel/drivers/staging/lustre/lnet/selftest/selftest.h new file mode 100644 index 000000000..d48701834 --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/selftest/selftest.h @@ -0,0 +1,624 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * copy of GPLv2]. + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/selftest/selftest.h + * + * Author: Isaac Huang + */ +#ifndef __SELFTEST_SELFTEST_H__ +#define __SELFTEST_SELFTEST_H__ + +#define LNET_ONLY + +#include "../../include/linux/libcfs/libcfs.h" +#include "../../include/linux/lnet/lnet.h" +#include "../../include/linux/lnet/lib-lnet.h" +#include "../../include/linux/lnet/lib-types.h" +#include "../../include/linux/lnet/lnetst.h" + +#include "rpc.h" +#include "timer.h" + +#ifndef MADE_WITHOUT_COMPROMISE +#define MADE_WITHOUT_COMPROMISE +#endif + + +#define SWI_STATE_NEWBORN 0 +#define SWI_STATE_REPLY_SUBMITTED 1 +#define SWI_STATE_REPLY_SENT 2 +#define SWI_STATE_REQUEST_SUBMITTED 3 +#define SWI_STATE_REQUEST_SENT 4 +#define SWI_STATE_REPLY_RECEIVED 5 +#define SWI_STATE_BULK_STARTED 6 +#define SWI_STATE_DONE 10 + +/* forward refs */ +struct srpc_service; +struct srpc_service_cd; +struct sfw_test_unit; +struct sfw_test_instance; + +/* services below SRPC_FRAMEWORK_SERVICE_MAX_ID are framework + * services, e.g. create/modify session. 
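 *
 * Concretely (see the ids defined just below): SRPC_SERVICE_DEBUG through
 * SRPC_SERVICE_JOIN (0..6) are framework services, while SRPC_SERVICE_BRW
 * (11) and SRPC_SERVICE_PING (12) are test services; the helper
 * srpc_serv_is_framework() further down in this header tests exactly
 * sv_id < SRPC_FRAMEWORK_SERVICE_MAX_ID.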
+ */ +#define SRPC_SERVICE_DEBUG 0 +#define SRPC_SERVICE_MAKE_SESSION 1 +#define SRPC_SERVICE_REMOVE_SESSION 2 +#define SRPC_SERVICE_BATCH 3 +#define SRPC_SERVICE_TEST 4 +#define SRPC_SERVICE_QUERY_STAT 5 +#define SRPC_SERVICE_JOIN 6 +#define SRPC_FRAMEWORK_SERVICE_MAX_ID 10 +/* other services start from SRPC_FRAMEWORK_SERVICE_MAX_ID+1 */ +#define SRPC_SERVICE_BRW 11 +#define SRPC_SERVICE_PING 12 +#define SRPC_SERVICE_MAX_ID 12 + +#define SRPC_REQUEST_PORTAL 50 +/* a lazy portal for framework RPC requests */ +#define SRPC_FRAMEWORK_REQUEST_PORTAL 51 +/* all reply/bulk RDMAs go to this portal */ +#define SRPC_RDMA_PORTAL 52 + +static inline srpc_msg_type_t +srpc_service2request (int service) +{ + switch (service) { + default: + LBUG (); + case SRPC_SERVICE_DEBUG: + return SRPC_MSG_DEBUG_REQST; + + case SRPC_SERVICE_MAKE_SESSION: + return SRPC_MSG_MKSN_REQST; + + case SRPC_SERVICE_REMOVE_SESSION: + return SRPC_MSG_RMSN_REQST; + + case SRPC_SERVICE_BATCH: + return SRPC_MSG_BATCH_REQST; + + case SRPC_SERVICE_TEST: + return SRPC_MSG_TEST_REQST; + + case SRPC_SERVICE_QUERY_STAT: + return SRPC_MSG_STAT_REQST; + + case SRPC_SERVICE_BRW: + return SRPC_MSG_BRW_REQST; + + case SRPC_SERVICE_PING: + return SRPC_MSG_PING_REQST; + + case SRPC_SERVICE_JOIN: + return SRPC_MSG_JOIN_REQST; + } +} + +static inline srpc_msg_type_t +srpc_service2reply (int service) +{ + return srpc_service2request(service) + 1; +} + +typedef enum { + SRPC_BULK_REQ_RCVD = 1, /* passive bulk request(PUT sink/GET source) received */ + SRPC_BULK_PUT_SENT = 2, /* active bulk PUT sent (source) */ + SRPC_BULK_GET_RPLD = 3, /* active bulk GET replied (sink) */ + SRPC_REPLY_RCVD = 4, /* incoming reply received */ + SRPC_REPLY_SENT = 5, /* outgoing reply sent */ + SRPC_REQUEST_RCVD = 6, /* incoming request received */ + SRPC_REQUEST_SENT = 7, /* outgoing request sent */ +} srpc_event_type_t; + +/* RPC event */ +typedef struct { + srpc_event_type_t ev_type; /* what's up */ + lnet_event_kind_t ev_lnet; /* LNet event type */ + int ev_fired; /* LNet event fired? 
*/ + int ev_status; /* LNet event status */ + void *ev_data; /* owning server/client RPC */ +} srpc_event_t; + +typedef struct { + int bk_len; /* len of bulk data */ + lnet_handle_md_t bk_mdh; + int bk_sink; /* sink/source */ + int bk_niov; /* # iov in bk_iovs */ + lnet_kiov_t bk_iovs[0]; +} srpc_bulk_t; /* bulk descriptor */ + +/* message buffer descriptor */ +typedef struct srpc_buffer { + struct list_head buf_list; /* chain on srpc_service::*_msgq */ + srpc_msg_t buf_msg; + lnet_handle_md_t buf_mdh; + lnet_nid_t buf_self; + lnet_process_id_t buf_peer; +} srpc_buffer_t; + +struct swi_workitem; +typedef int (*swi_action_t) (struct swi_workitem *); + +typedef struct swi_workitem { + struct cfs_wi_sched *swi_sched; + cfs_workitem_t swi_workitem; + swi_action_t swi_action; + int swi_state; +} swi_workitem_t; + +/* server-side state of a RPC */ +typedef struct srpc_server_rpc { + /* chain on srpc_service::*_rpcq */ + struct list_head srpc_list; + struct srpc_service_cd *srpc_scd; + swi_workitem_t srpc_wi; + srpc_event_t srpc_ev; /* bulk/reply event */ + lnet_nid_t srpc_self; + lnet_process_id_t srpc_peer; + srpc_msg_t srpc_replymsg; + lnet_handle_md_t srpc_replymdh; + srpc_buffer_t *srpc_reqstbuf; + srpc_bulk_t *srpc_bulk; + + unsigned int srpc_aborted; /* being given up */ + int srpc_status; + void (*srpc_done)(struct srpc_server_rpc *); +} srpc_server_rpc_t; + +/* client-side state of a RPC */ +typedef struct srpc_client_rpc { + struct list_head crpc_list; /* chain on user's lists */ + spinlock_t crpc_lock; /* serialize */ + int crpc_service; + atomic_t crpc_refcount; + int crpc_timeout; /* # seconds to wait for reply */ + stt_timer_t crpc_timer; + swi_workitem_t crpc_wi; + lnet_process_id_t crpc_dest; + + void (*crpc_done)(struct srpc_client_rpc *); + void (*crpc_fini)(struct srpc_client_rpc *); + int crpc_status; /* completion status */ + void *crpc_priv; /* caller data */ + + /* state flags */ + unsigned int crpc_aborted:1; /* being given up */ + unsigned int crpc_closed:1; /* completed */ + + /* RPC events */ + srpc_event_t crpc_bulkev; /* bulk event */ + srpc_event_t crpc_reqstev; /* request event */ + srpc_event_t crpc_replyev; /* reply event */ + + /* bulk, request(reqst), and reply exchanged on wire */ + srpc_msg_t crpc_reqstmsg; + srpc_msg_t crpc_replymsg; + lnet_handle_md_t crpc_reqstmdh; + lnet_handle_md_t crpc_replymdh; + srpc_bulk_t crpc_bulk; +} srpc_client_rpc_t; + +#define srpc_client_rpc_size(rpc) \ +offsetof(srpc_client_rpc_t, crpc_bulk.bk_iovs[(rpc)->crpc_bulk.bk_niov]) + +#define srpc_client_rpc_addref(rpc) \ +do { \ + CDEBUG(D_NET, "RPC[%p] -> %s (%d)++\n", \ + (rpc), libcfs_id2str((rpc)->crpc_dest), \ + atomic_read(&(rpc)->crpc_refcount)); \ + LASSERT(atomic_read(&(rpc)->crpc_refcount) > 0); \ + atomic_inc(&(rpc)->crpc_refcount); \ +} while (0) + +#define srpc_client_rpc_decref(rpc) \ +do { \ + CDEBUG(D_NET, "RPC[%p] -> %s (%d)--\n", \ + (rpc), libcfs_id2str((rpc)->crpc_dest), \ + atomic_read(&(rpc)->crpc_refcount)); \ + LASSERT(atomic_read(&(rpc)->crpc_refcount) > 0); \ + if (atomic_dec_and_test(&(rpc)->crpc_refcount)) \ + srpc_destroy_client_rpc(rpc); \ +} while (0) + +#define srpc_event_pending(rpc) ((rpc)->crpc_bulkev.ev_fired == 0 || \ + (rpc)->crpc_reqstev.ev_fired == 0 || \ + (rpc)->crpc_replyev.ev_fired == 0) + +/* CPU partition data of srpc service */ +struct srpc_service_cd { + /** serialize */ + spinlock_t scd_lock; + /** backref to service */ + struct srpc_service *scd_svc; + /** event buffer */ + srpc_event_t scd_ev; + /** free RPC descriptors */ + struct 
list_head scd_rpc_free; + /** in-flight RPCs */ + struct list_head scd_rpc_active; + /** workitem for posting buffer */ + swi_workitem_t scd_buf_wi; + /** CPT id */ + int scd_cpt; + /** error code for scd_buf_wi */ + int scd_buf_err; + /** timestamp for scd_buf_err */ + unsigned long scd_buf_err_stamp; + /** total # request buffers */ + int scd_buf_total; + /** # posted request buffers */ + int scd_buf_nposted; + /** in progress of buffer posting */ + int scd_buf_posting; + /** allocate more buffers if scd_buf_nposted < scd_buf_low */ + int scd_buf_low; + /** increase/decrease some buffers */ + int scd_buf_adjust; + /** posted message buffers */ + struct list_head scd_buf_posted; + /** blocked for RPC descriptor */ + struct list_head scd_buf_blocked; +}; + +/* number of server workitems (mini-thread) for testing service */ +#define SFW_TEST_WI_MIN 256 +#define SFW_TEST_WI_MAX 2048 +/* extra buffers for tolerating buggy peers, or unbalanced number + * of peers between partitions */ +#define SFW_TEST_WI_EXTRA 64 + +/* number of server workitems (mini-thread) for framework service */ +#define SFW_FRWK_WI_MIN 16 +#define SFW_FRWK_WI_MAX 256 + +typedef struct srpc_service { + int sv_id; /* service id */ + const char *sv_name; /* human readable name */ + int sv_wi_total; /* total server workitems */ + int sv_shuttingdown; + int sv_ncpts; + /* percpt data for srpc_service */ + struct srpc_service_cd **sv_cpt_data; + /* Service callbacks: + * - sv_handler: process incoming RPC request + * - sv_bulk_ready: notify bulk data + */ + int (*sv_handler) (srpc_server_rpc_t *); + int (*sv_bulk_ready) (srpc_server_rpc_t *, int); +} srpc_service_t; + +typedef struct { + struct list_head sn_list; /* chain on fw_zombie_sessions */ + lst_sid_t sn_id; /* unique identifier */ + unsigned int sn_timeout; /* # seconds' inactivity to expire */ + int sn_timer_active; + unsigned int sn_features; + stt_timer_t sn_timer; + struct list_head sn_batches; /* list of batches */ + char sn_name[LST_NAME_SIZE]; + atomic_t sn_refcount; + atomic_t sn_brw_errors; + atomic_t sn_ping_errors; + unsigned long sn_started; +} sfw_session_t; + +#define sfw_sid_equal(sid0, sid1) ((sid0).ses_nid == (sid1).ses_nid && \ + (sid0).ses_stamp == (sid1).ses_stamp) + +typedef struct { + struct list_head bat_list; /* chain on sn_batches */ + lst_bid_t bat_id; /* batch id */ + int bat_error; /* error code of batch */ + sfw_session_t *bat_session; /* batch's session */ + atomic_t bat_nactive; /* # of active tests */ + struct list_head bat_tests; /* test instances */ +} sfw_batch_t; + +typedef struct { + int (*tso_init)(struct sfw_test_instance *tsi); /* initialize test client */ + void (*tso_fini)(struct sfw_test_instance *tsi); /* finalize test client */ + int (*tso_prep_rpc)(struct sfw_test_unit *tsu, + lnet_process_id_t dest, + srpc_client_rpc_t **rpc); /* prep a tests rpc */ + void (*tso_done_rpc)(struct sfw_test_unit *tsu, + srpc_client_rpc_t *rpc); /* done a test rpc */ +} sfw_test_client_ops_t; + +typedef struct sfw_test_instance { + struct list_head tsi_list; /* chain on batch */ + int tsi_service; /* test type */ + sfw_batch_t *tsi_batch; /* batch */ + sfw_test_client_ops_t *tsi_ops; /* test client operations */ + + /* public parameter for all test units */ + unsigned int tsi_is_client:1; /* is test client */ + unsigned int tsi_stoptsu_onerr:1; /* stop tsu on error */ + int tsi_concur; /* concurrency */ + int tsi_loop; /* loop count */ + + /* status of test instance */ + spinlock_t tsi_lock; /* serialize */ + unsigned int tsi_stopping:1; /* 
test is stopping */ + atomic_t tsi_nactive; /* # of active test unit */ + struct list_head tsi_units; /* test units */ + struct list_head tsi_free_rpcs; /* free rpcs */ + struct list_head tsi_active_rpcs; /* active rpcs */ + + union { + test_ping_req_t ping; /* ping parameter */ + test_bulk_req_t bulk_v0; /* bulk parameter */ + test_bulk_req_v1_t bulk_v1; /* bulk v1 parameter */ + } tsi_u; +} sfw_test_instance_t; + +/* XXX: trailing (PAGE_CACHE_SIZE % sizeof(lnet_process_id_t)) bytes at + * the end of pages are not used */ +#define SFW_MAX_CONCUR LST_MAX_CONCUR +#define SFW_ID_PER_PAGE (PAGE_CACHE_SIZE / sizeof(lnet_process_id_packed_t)) +#define SFW_MAX_NDESTS (LNET_MAX_IOV * SFW_ID_PER_PAGE) +#define sfw_id_pages(n) (((n) + SFW_ID_PER_PAGE - 1) / SFW_ID_PER_PAGE) + +typedef struct sfw_test_unit { + struct list_head tsu_list; /* chain on lst_test_instance */ + lnet_process_id_t tsu_dest; /* id of dest node */ + int tsu_loop; /* loop count of the test */ + sfw_test_instance_t *tsu_instance; /* pointer to test instance */ + void *tsu_private; /* private data */ + swi_workitem_t tsu_worker; /* workitem of the test unit */ +} sfw_test_unit_t; + +typedef struct sfw_test_case { + struct list_head tsc_list; /* chain on fw_tests */ + srpc_service_t *tsc_srv_service; /* test service */ + sfw_test_client_ops_t *tsc_cli_ops; /* ops of test client */ +} sfw_test_case_t; + +srpc_client_rpc_t * +sfw_create_rpc(lnet_process_id_t peer, int service, + unsigned features, int nbulkiov, int bulklen, + void (*done) (srpc_client_rpc_t *), void *priv); +int sfw_create_test_rpc(sfw_test_unit_t *tsu, + lnet_process_id_t peer, unsigned features, + int nblk, int blklen, srpc_client_rpc_t **rpc); +void sfw_abort_rpc(srpc_client_rpc_t *rpc); +void sfw_post_rpc(srpc_client_rpc_t *rpc); +void sfw_client_rpc_done(srpc_client_rpc_t *rpc); +void sfw_unpack_message(srpc_msg_t *msg); +void sfw_free_pages(srpc_server_rpc_t *rpc); +void sfw_add_bulk_page(srpc_bulk_t *bk, struct page *pg, int i); +int sfw_alloc_pages(srpc_server_rpc_t *rpc, int cpt, int npages, int len, + int sink); +int sfw_make_session (srpc_mksn_reqst_t *request, srpc_mksn_reply_t *reply); + +srpc_client_rpc_t * +srpc_create_client_rpc(lnet_process_id_t peer, int service, + int nbulkiov, int bulklen, + void (*rpc_done)(srpc_client_rpc_t *), + void (*rpc_fini)(srpc_client_rpc_t *), void *priv); +void srpc_post_rpc(srpc_client_rpc_t *rpc); +void srpc_abort_rpc(srpc_client_rpc_t *rpc, int why); +void srpc_free_bulk(srpc_bulk_t *bk); +srpc_bulk_t *srpc_alloc_bulk(int cpt, unsigned bulk_npg, unsigned bulk_len, + int sink); +int srpc_send_rpc(swi_workitem_t *wi); +int srpc_send_reply(srpc_server_rpc_t *rpc); +int srpc_add_service(srpc_service_t *sv); +int srpc_remove_service(srpc_service_t *sv); +void srpc_shutdown_service(srpc_service_t *sv); +void srpc_abort_service(srpc_service_t *sv); +int srpc_finish_service(srpc_service_t *sv); +int srpc_service_add_buffers(srpc_service_t *sv, int nbuffer); +void srpc_service_remove_buffers(srpc_service_t *sv, int nbuffer); +void srpc_get_counters(srpc_counters_t *cnt); +void srpc_set_counters(const srpc_counters_t *cnt); + +extern struct cfs_wi_sched *lst_sched_serial; +extern struct cfs_wi_sched **lst_sched_test; + +static inline int +srpc_serv_is_framework(struct srpc_service *svc) +{ + return svc->sv_id < SRPC_FRAMEWORK_SERVICE_MAX_ID; +} + +static inline int +swi_wi_action(cfs_workitem_t *wi) +{ + swi_workitem_t *swi = container_of(wi, swi_workitem_t, swi_workitem); + + return swi->swi_action(swi); +} + +static inline 
void +swi_init_workitem(swi_workitem_t *swi, void *data, + swi_action_t action, struct cfs_wi_sched *sched) +{ + swi->swi_sched = sched; + swi->swi_action = action; + swi->swi_state = SWI_STATE_NEWBORN; + cfs_wi_init(&swi->swi_workitem, data, swi_wi_action); +} + +static inline void +swi_schedule_workitem(swi_workitem_t *wi) +{ + cfs_wi_schedule(wi->swi_sched, &wi->swi_workitem); +} + +static inline void +swi_exit_workitem(swi_workitem_t *swi) +{ + cfs_wi_exit(swi->swi_sched, &swi->swi_workitem); +} + +static inline int +swi_deschedule_workitem(swi_workitem_t *swi) +{ + return cfs_wi_deschedule(swi->swi_sched, &swi->swi_workitem); +} + + +int sfw_startup(void); +int srpc_startup(void); +void sfw_shutdown(void); +void srpc_shutdown(void); + +static inline void +srpc_destroy_client_rpc (srpc_client_rpc_t *rpc) +{ + LASSERT (rpc != NULL); + LASSERT (!srpc_event_pending(rpc)); + LASSERT (atomic_read(&rpc->crpc_refcount) == 0); + + if (rpc->crpc_fini == NULL) { + LIBCFS_FREE(rpc, srpc_client_rpc_size(rpc)); + } else { + (*rpc->crpc_fini) (rpc); + } + + return; +} + +static inline void +srpc_init_client_rpc (srpc_client_rpc_t *rpc, lnet_process_id_t peer, + int service, int nbulkiov, int bulklen, + void (*rpc_done)(srpc_client_rpc_t *), + void (*rpc_fini)(srpc_client_rpc_t *), void *priv) +{ + LASSERT (nbulkiov <= LNET_MAX_IOV); + + memset(rpc, 0, offsetof(srpc_client_rpc_t, + crpc_bulk.bk_iovs[nbulkiov])); + + INIT_LIST_HEAD(&rpc->crpc_list); + swi_init_workitem(&rpc->crpc_wi, rpc, srpc_send_rpc, + lst_sched_test[lnet_cpt_of_nid(peer.nid)]); + spin_lock_init(&rpc->crpc_lock); + atomic_set(&rpc->crpc_refcount, 1); /* 1 ref for caller */ + + rpc->crpc_dest = peer; + rpc->crpc_priv = priv; + rpc->crpc_service = service; + rpc->crpc_bulk.bk_len = bulklen; + rpc->crpc_bulk.bk_niov = nbulkiov; + rpc->crpc_done = rpc_done; + rpc->crpc_fini = rpc_fini; + LNetInvalidateHandle(&rpc->crpc_reqstmdh); + LNetInvalidateHandle(&rpc->crpc_replymdh); + LNetInvalidateHandle(&rpc->crpc_bulk.bk_mdh); + + /* no event is expected at this point */ + rpc->crpc_bulkev.ev_fired = + rpc->crpc_reqstev.ev_fired = + rpc->crpc_replyev.ev_fired = 1; + + rpc->crpc_reqstmsg.msg_magic = SRPC_MSG_MAGIC; + rpc->crpc_reqstmsg.msg_version = SRPC_MSG_VERSION; + rpc->crpc_reqstmsg.msg_type = srpc_service2request(service); + return; +} + +static inline const char * +swi_state2str (int state) +{ +#define STATE2STR(x) case x: return #x + switch(state) { + default: + LBUG(); + STATE2STR(SWI_STATE_NEWBORN); + STATE2STR(SWI_STATE_REPLY_SUBMITTED); + STATE2STR(SWI_STATE_REPLY_SENT); + STATE2STR(SWI_STATE_REQUEST_SUBMITTED); + STATE2STR(SWI_STATE_REQUEST_SENT); + STATE2STR(SWI_STATE_REPLY_RECEIVED); + STATE2STR(SWI_STATE_BULK_STARTED); + STATE2STR(SWI_STATE_DONE); + } +#undef STATE2STR +} + +#define selftest_wait_events() \ + do { \ + set_current_state(TASK_UNINTERRUPTIBLE); \ + schedule_timeout(cfs_time_seconds(1) / 10); \ + } while (0) + + +#define lst_wait_until(cond, lock, fmt, ...) \ +do { \ + int __I = 2; \ + while (!(cond)) { \ + CDEBUG(IS_PO2(++__I) ? D_WARNING : D_NET, \ + fmt, ## __VA_ARGS__); \ + spin_unlock(&(lock)); \ + \ + selftest_wait_events(); \ + \ + spin_lock(&(lock)); \ + } \ +} while (0) + +static inline void +srpc_wait_service_shutdown(srpc_service_t *sv) +{ + int i = 2; + + LASSERT(sv->sv_shuttingdown); + + while (srpc_finish_service(sv) == 0) { + i++; + CDEBUG (((i & -i) == i) ? 
D_WARNING : D_NET, + "Waiting for %s service to shutdown...\n", + sv->sv_name); + selftest_wait_events(); + } +} + +extern sfw_test_client_ops_t brw_test_client; +void brw_init_test_client(void); + +extern srpc_service_t brw_test_service; +void brw_init_test_service(void); + +extern sfw_test_client_ops_t ping_test_client; +void ping_init_test_client(void); + +extern srpc_service_t ping_test_service; +void ping_init_test_service(void); + +#endif /* __SELFTEST_SELFTEST_H__ */ diff --git a/kernel/drivers/staging/lustre/lnet/selftest/timer.c b/kernel/drivers/staging/lustre/lnet/selftest/timer.c new file mode 100644 index 000000000..441f9472a --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/selftest/timer.c @@ -0,0 +1,248 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/selftest/timer.c + * + * Author: Isaac Huang + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include "selftest.h" + + +/* + * Timers are implemented as a sorted queue of expiry times. The queue + * is slotted, with each slot holding timers which expire in a + * 2**STTIMER_MINPOLL (8) second period. The timers in each slot are + * sorted by increasing expiry time. The number of slots is 2**7 (128), + * to cover a time period of 1024 seconds into the future before wrapping. 
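 *
 * Worked example, using the STTIMER_* constants defined just below: a
 * timer expiring at t = 1000s hashes to slot (1000 >> 3) & 127 = 125,
 * one expiring at t = 1003s lands in the same slot, and t = 1008s goes
 * to slot 126.  This slotting is also why the timer thread only needs to
 * wake up once every STTIMER_SLOTTIME (8) seconds; see stt_timer_main()
 * below.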
+ */ +#define STTIMER_MINPOLL 3 /* log2 min poll interval (8 s) */ +#define STTIMER_SLOTTIME (1 << STTIMER_MINPOLL) +#define STTIMER_SLOTTIMEMASK (~(STTIMER_SLOTTIME - 1)) +#define STTIMER_NSLOTS (1 << 7) +#define STTIMER_SLOT(t) (&stt_data.stt_hash[(((t) >> STTIMER_MINPOLL) & \ + (STTIMER_NSLOTS - 1))]) + +static struct st_timer_data { + spinlock_t stt_lock; + /* start time of the slot processed previously */ + unsigned long stt_prev_slot; + struct list_head stt_hash[STTIMER_NSLOTS]; + int stt_shuttingdown; + wait_queue_head_t stt_waitq; + int stt_nthreads; +} stt_data; + +void +stt_add_timer(stt_timer_t *timer) +{ + struct list_head *pos; + + spin_lock(&stt_data.stt_lock); + + LASSERT(stt_data.stt_nthreads > 0); + LASSERT(!stt_data.stt_shuttingdown); + LASSERT(timer->stt_func != NULL); + LASSERT(list_empty(&timer->stt_list)); + LASSERT(cfs_time_after(timer->stt_expires, get_seconds())); + + /* a simple insertion sort */ + list_for_each_prev(pos, STTIMER_SLOT(timer->stt_expires)) { + stt_timer_t *old = list_entry(pos, stt_timer_t, stt_list); + + if (cfs_time_aftereq(timer->stt_expires, old->stt_expires)) + break; + } + list_add(&timer->stt_list, pos); + + spin_unlock(&stt_data.stt_lock); +} + +/* + * The function returns whether it has deactivated a pending timer or not. + * (ie. del_timer() of an inactive timer returns 0, del_timer() of an + * active timer returns 1.) + * + * CAVEAT EMPTOR: + * When 0 is returned, it is possible that timer->stt_func _is_ running on + * another CPU. + */ +int +stt_del_timer(stt_timer_t *timer) +{ + int ret = 0; + + spin_lock(&stt_data.stt_lock); + + LASSERT(stt_data.stt_nthreads > 0); + LASSERT(!stt_data.stt_shuttingdown); + + if (!list_empty(&timer->stt_list)) { + ret = 1; + list_del_init(&timer->stt_list); + } + + spin_unlock(&stt_data.stt_lock); + return ret; +} + +/* called with stt_data.stt_lock held */ +static int +stt_expire_list(struct list_head *slot, unsigned long now) +{ + int expired = 0; + stt_timer_t *timer; + + while (!list_empty(slot)) { + timer = list_entry(slot->next, stt_timer_t, stt_list); + + if (cfs_time_after(timer->stt_expires, now)) + break; + + list_del_init(&timer->stt_list); + spin_unlock(&stt_data.stt_lock); + + expired++; + (*timer->stt_func) (timer->stt_data); + + spin_lock(&stt_data.stt_lock); + } + + return expired; +} + +static int +stt_check_timers(unsigned long *last) +{ + int expired = 0; + unsigned long now; + unsigned long this_slot; + + now = get_seconds(); + this_slot = now & STTIMER_SLOTTIMEMASK; + + spin_lock(&stt_data.stt_lock); + + while (cfs_time_aftereq(this_slot, *last)) { + expired += stt_expire_list(STTIMER_SLOT(this_slot), now); + this_slot = cfs_time_sub(this_slot, STTIMER_SLOTTIME); + } + + *last = now & STTIMER_SLOTTIMEMASK; + spin_unlock(&stt_data.stt_lock); + return expired; +} + + +static int +stt_timer_main(void *arg) +{ + cfs_block_allsigs(); + + while (!stt_data.stt_shuttingdown) { + stt_check_timers(&stt_data.stt_prev_slot); + + wait_event_timeout(stt_data.stt_waitq, + stt_data.stt_shuttingdown, + cfs_time_seconds(STTIMER_SLOTTIME)); + } + + spin_lock(&stt_data.stt_lock); + stt_data.stt_nthreads--; + spin_unlock(&stt_data.stt_lock); + return 0; +} + +static int +stt_start_timer_thread(void) +{ + struct task_struct *task; + + LASSERT(!stt_data.stt_shuttingdown); + + task = kthread_run(stt_timer_main, NULL, "st_timer"); + if (IS_ERR(task)) + return PTR_ERR(task); + + spin_lock(&stt_data.stt_lock); + stt_data.stt_nthreads++; + spin_unlock(&stt_data.stt_lock); + return 0; +} + + +int 
+stt_startup(void) +{ + int rc = 0; + int i; + + stt_data.stt_shuttingdown = 0; + stt_data.stt_prev_slot = get_seconds() & STTIMER_SLOTTIMEMASK; + + spin_lock_init(&stt_data.stt_lock); + for (i = 0; i < STTIMER_NSLOTS; i++) + INIT_LIST_HEAD(&stt_data.stt_hash[i]); + + stt_data.stt_nthreads = 0; + init_waitqueue_head(&stt_data.stt_waitq); + rc = stt_start_timer_thread(); + if (rc != 0) + CERROR("Can't spawn timer thread: %d\n", rc); + + return rc; +} + +void +stt_shutdown(void) +{ + int i; + + spin_lock(&stt_data.stt_lock); + + for (i = 0; i < STTIMER_NSLOTS; i++) + LASSERT(list_empty(&stt_data.stt_hash[i])); + + stt_data.stt_shuttingdown = 1; + + wake_up(&stt_data.stt_waitq); + lst_wait_until(stt_data.stt_nthreads == 0, stt_data.stt_lock, + "waiting for %d threads to terminate\n", + stt_data.stt_nthreads); + + spin_unlock(&stt_data.stt_lock); +} diff --git a/kernel/drivers/staging/lustre/lnet/selftest/timer.h b/kernel/drivers/staging/lustre/lnet/selftest/timer.h new file mode 100644 index 000000000..d727c1e2b --- /dev/null +++ b/kernel/drivers/staging/lustre/lnet/selftest/timer.h @@ -0,0 +1,53 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/selftest/timer.h + * + * Author: Isaac Huang + */ +#ifndef __SELFTEST_TIMER_H__ +#define __SELFTEST_TIMER_H__ + +typedef struct { + struct list_head stt_list; + unsigned long stt_expires; + void (*stt_func) (void *); + void *stt_data; +} stt_timer_t; + +void stt_add_timer (stt_timer_t *timer); +int stt_del_timer (stt_timer_t *timer); +int stt_startup (void); +void stt_shutdown (void); + +#endif /* __SELFTEST_TIMER_H__ */ diff --git a/kernel/drivers/staging/lustre/lustre/Kconfig b/kernel/drivers/staging/lustre/lustre/Kconfig new file mode 100644 index 000000000..62c7bba75 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/Kconfig @@ -0,0 +1,62 @@ +config LUSTRE_FS + tristate "Lustre file system client support" + depends on INET && m && !MIPS && !XTENSA && !SUPERH + select LNET + select CRYPTO + select CRYPTO_CRC32 + select CRYPTO_CRC32_PCLMUL if X86 + select CRYPTO_CRC32C + select CRYPTO_MD5 + select CRYPTO_SHA1 + select CRYPTO_SHA256 + select CRYPTO_SHA512 + depends on MULTIUSER + help + This option enables Lustre file system client support. Choose Y + here if you want to access a Lustre file system cluster. 
To compile + this file system support as a module, choose M here: the module will + be called lustre. + + To mount Lustre file systems, you also need to install the user space + mount.lustre and other user space commands which can be found in the + lustre-client package, available from + http://downloads.whamcloud.com/public/lustre/ + + Lustre file system is the most popular cluster file system in high + performance computing. Source code of both kernel space and user space + Lustre components can also be found at + http://git.whamcloud.com/?p=fs/lustre-release.git;a=summary + + If unsure, say N. + + See also http://wiki.lustre.org/ + +config LUSTRE_OBD_MAX_IOCTL_BUFFER + int "Lustre obd max ioctl buffer bytes (default 8KB)" + depends on LUSTRE_FS + default 8192 + help + This option defines the maximum size of buffer in bytes that user space + applications can pass to Lustre kernel module through ioctl interface. + + If unsure, use default. + +config LUSTRE_DEBUG_EXPENSIVE_CHECK + bool "Enable Lustre DEBUG checks" + depends on LUSTRE_FS + help + This option is mainly for debug purpose. It enables Lustre code to do + expensive checks that may have a performance impact. + + Use with caution. If unsure, say N. + +config LUSTRE_TRANSLATE_ERRNOS + bool + depends on LUSTRE_FS && !X86 + default y + +config LUSTRE_LLITE_LLOOP + tristate "Lustre virtual block device" + depends on LUSTRE_FS && BLOCK + depends on !PPC_64K_PAGES && !ARM64_64K_PAGES && !MICROBLAZE_64K_PAGES && !PAGE_SIZE_64KB && !IA64_PAGE_SIZE_64KB && !PARISC_PAGE_SIZE_64KB + default m diff --git a/kernel/drivers/staging/lustre/lustre/Makefile b/kernel/drivers/staging/lustre/lustre/Makefile new file mode 100644 index 000000000..35d8b0b2d --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_LUSTRE_FS) += libcfs/ obdclass/ ptlrpc/ fld/ osc/ mgc/ \ + fid/ lov/ mdc/ lmv/ llite/ obdecho/ diff --git a/kernel/drivers/staging/lustre/lustre/fid/Makefile b/kernel/drivers/staging/lustre/lustre/fid/Makefile new file mode 100644 index 000000000..5513ce416 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/fid/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_LUSTRE_FS) += fid.o +fid-y := fid_request.o fid_lib.o +fid-$(CONFIG_PROC_FS) += lproc_fid.o diff --git a/kernel/drivers/staging/lustre/lustre/fid/fid_internal.h b/kernel/drivers/staging/lustre/lustre/fid/fid_internal.h new file mode 100644 index 000000000..b5e8da895 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/fid/fid_internal.h @@ -0,0 +1,56 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. 
+ * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/fid/fid_internal.h + * + * Author: Yury Umanets + */ +#ifndef __FID_INTERNAL_H +#define __FID_INTERNAL_H + +#include "../include/lustre/lustre_idl.h" +#include "../../include/linux/libcfs/libcfs.h" + +/* Functions used internally in module. */ +int seq_client_alloc_super(struct lu_client_seq *seq, + const struct lu_env *env); + +#if defined(CONFIG_PROC_FS) +extern struct lprocfs_vars seq_client_proc_list[]; +#endif + +extern struct proc_dir_entry *seq_type_proc_dir; + +#endif /* __FID_INTERNAL_H */ diff --git a/kernel/drivers/staging/lustre/lustre/fid/fid_lib.c b/kernel/drivers/staging/lustre/lustre/fid/fid_lib.c new file mode 100644 index 000000000..dd65159eb --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/fid/fid_lib.c @@ -0,0 +1,95 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/fid/fid_lib.c + * + * Miscellaneous fid functions. + * + * Author: Nikita Danilov + * Author: Yury Umanets + */ + +#define DEBUG_SUBSYSTEM S_FID + +#include "../../include/linux/libcfs/libcfs.h" +#include +#include "../include/lustre/lustre_idl.h" +#include "../include/lustre_fid.h" + +/** + * A cluster-wide range from which fid-sequences are granted to servers and + * then clients. + * + * Fid namespace: + *
+ * Normal FID:        seq:64 [2^33,2^64-1]      oid:32          ver:32
+ * IGIF      :        0:32, ino:32              gen:32          0:32
+ * IDIF      :        0:31, 1:1, ost-index:16,  objid:48        0:32
+ * 
+ * + * The first 0x400 sequences of normal FID are reserved for special purpose. + * FID_SEQ_START + 1 is for local file id generation. + * FID_SEQ_START + 2 is for .lustre directory and its objects + */ +const struct lu_seq_range LUSTRE_SEQ_SPACE_RANGE = { + FID_SEQ_NORMAL, + (__u64)~0ULL +}; +EXPORT_SYMBOL(LUSTRE_SEQ_SPACE_RANGE); + +/* Zero range, used for init and other purposes. */ +const struct lu_seq_range LUSTRE_SEQ_ZERO_RANGE = { + 0, + 0 +}; +EXPORT_SYMBOL(LUSTRE_SEQ_ZERO_RANGE); + +/* Lustre Big Fs Lock fid. */ +const struct lu_fid LUSTRE_BFL_FID = { .f_seq = FID_SEQ_SPECIAL, + .f_oid = FID_OID_SPECIAL_BFL, + .f_ver = 0x0000000000000000 }; +EXPORT_SYMBOL(LUSTRE_BFL_FID); + +/** Special fid for ".lustre" directory */ +const struct lu_fid LU_DOT_LUSTRE_FID = { .f_seq = FID_SEQ_DOT_LUSTRE, + .f_oid = FID_OID_DOT_LUSTRE, + .f_ver = 0x0000000000000000 }; +EXPORT_SYMBOL(LU_DOT_LUSTRE_FID); + +/** Special fid for "fid" special object in .lustre */ +const struct lu_fid LU_OBF_FID = { .f_seq = FID_SEQ_DOT_LUSTRE, + .f_oid = FID_OID_DOT_LUSTRE_OBF, + .f_ver = 0x0000000000000000 }; +EXPORT_SYMBOL(LU_OBF_FID); diff --git a/kernel/drivers/staging/lustre/lustre/fid/fid_request.c b/kernel/drivers/staging/lustre/lustre/fid/fid_request.c new file mode 100644 index 000000000..063441abf --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/fid/fid_request.c @@ -0,0 +1,572 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
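To make the namespace table above concrete, the sketch below spells out the three-field FID triple with a stand-alone mirror struct (not the real struct lu_fid) and prints it in the familiar [seq:oid:ver] style; the lower bound used for the example sequence follows the table and the 0x400 reservation noted above.

#include <stdint.h>
#include <stdio.h>

/* stand-alone mirror of the FID triple described in the table above */
struct fid_example {
	uint64_t f_seq;   /* sequence: selects the range the object belongs to */
	uint32_t f_oid;   /* object id within that sequence */
	uint32_t f_ver;   /* version; 0 for every FID defined in this file */
};

int main(void)
{
	/* a "normal" FID: first sequence past the reserved 0x400 block above 2^33 */
	struct fid_example fid = { .f_seq = (1ULL << 33) + 0x400, .f_oid = 1, .f_ver = 0 };

	printf("[%#jx:0x%x:0x%x]\n", (uintmax_t)fid.f_seq, fid.f_oid, fid.f_ver);
	return 0;
}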
+ * + * lustre/fid/fid_request.c + * + * Lustre Sequence Manager + * + * Author: Yury Umanets + */ + +#define DEBUG_SUBSYSTEM S_FID + +#include "../../include/linux/libcfs/libcfs.h" +#include + +#include "../include/obd.h" +#include "../include/obd_class.h" +#include "../include/obd_support.h" +#include "../include/lustre_fid.h" +/* mdc RPC locks */ +#include "../include/lustre_mdc.h" +#include "fid_internal.h" + +static int seq_client_rpc(struct lu_client_seq *seq, + struct lu_seq_range *output, __u32 opc, + const char *opcname) +{ + struct obd_export *exp = seq->lcs_exp; + struct ptlrpc_request *req; + struct lu_seq_range *out, *in; + __u32 *op; + unsigned int debug_mask; + int rc; + + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), &RQF_SEQ_QUERY, + LUSTRE_MDS_VERSION, SEQ_QUERY); + if (req == NULL) + return -ENOMEM; + + /* Init operation code */ + op = req_capsule_client_get(&req->rq_pill, &RMF_SEQ_OPC); + *op = opc; + + /* Zero out input range, this is not recovery yet. */ + in = req_capsule_client_get(&req->rq_pill, &RMF_SEQ_RANGE); + range_init(in); + + ptlrpc_request_set_replen(req); + + in->lsr_index = seq->lcs_space.lsr_index; + if (seq->lcs_type == LUSTRE_SEQ_METADATA) + fld_range_set_mdt(in); + else + fld_range_set_ost(in); + + if (opc == SEQ_ALLOC_SUPER) { + req->rq_request_portal = SEQ_CONTROLLER_PORTAL; + req->rq_reply_portal = MDC_REPLY_PORTAL; + /* During allocating super sequence for data object, + * the current thread might hold the export of MDT0(MDT0 + * precreating objects on this OST), and it will send the + * request to MDT0 here, so we can not keep resending the + * request here, otherwise if MDT0 is failed(umounted), + * it can not release the export of MDT0 */ + if (seq->lcs_type == LUSTRE_SEQ_DATA) + req->rq_no_delay = req->rq_no_resend = 1; + debug_mask = D_CONSOLE; + } else { + if (seq->lcs_type == LUSTRE_SEQ_METADATA) + req->rq_request_portal = SEQ_METADATA_PORTAL; + else + req->rq_request_portal = SEQ_DATA_PORTAL; + debug_mask = D_INFO; + } + + ptlrpc_at_set_req_timeout(req); + + if (seq->lcs_type == LUSTRE_SEQ_METADATA) + mdc_get_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL); + rc = ptlrpc_queue_wait(req); + if (seq->lcs_type == LUSTRE_SEQ_METADATA) + mdc_put_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL); + if (rc) + goto out_req; + + out = req_capsule_server_get(&req->rq_pill, &RMF_SEQ_RANGE); + *output = *out; + + if (!range_is_sane(output)) { + CERROR("%s: Invalid range received from server: " + DRANGE"\n", seq->lcs_name, PRANGE(output)); + rc = -EINVAL; + goto out_req; + } + + if (range_is_exhausted(output)) { + CERROR("%s: Range received from server is exhausted: " + DRANGE"]\n", seq->lcs_name, PRANGE(output)); + rc = -EINVAL; + goto out_req; + } + + CDEBUG_LIMIT(debug_mask, "%s: Allocated %s-sequence "DRANGE"]\n", + seq->lcs_name, opcname, PRANGE(output)); + +out_req: + ptlrpc_req_finished(req); + return rc; +} + +/* Request sequence-controller node to allocate new super-sequence. */ +int seq_client_alloc_super(struct lu_client_seq *seq, + const struct lu_env *env) +{ + int rc; + + mutex_lock(&seq->lcs_mutex); + + if (seq->lcs_srv) { + rc = 0; + } else { + /* Check whether the connection to seq controller has been + * setup (lcs_exp != NULL) */ + if (seq->lcs_exp == NULL) { + mutex_unlock(&seq->lcs_mutex); + return -EINPROGRESS; + } + + rc = seq_client_rpc(seq, &seq->lcs_space, + SEQ_ALLOC_SUPER, "super"); + } + mutex_unlock(&seq->lcs_mutex); + return rc; +} + +/* Request sequence-controller node to allocate new meta-sequence. 
*/ +static int seq_client_alloc_meta(const struct lu_env *env, + struct lu_client_seq *seq) +{ + int rc; + + if (seq->lcs_srv) { + rc = 0; + } else { + do { + /* If meta server return -EINPROGRESS or EAGAIN, + * it means meta server might not be ready to + * allocate super sequence from sequence controller + * (MDT0)yet */ + rc = seq_client_rpc(seq, &seq->lcs_space, + SEQ_ALLOC_META, "meta"); + } while (rc == -EINPROGRESS || rc == -EAGAIN); + } + + return rc; +} + +/* Allocate new sequence for client. */ +static int seq_client_alloc_seq(const struct lu_env *env, + struct lu_client_seq *seq, u64 *seqnr) +{ + int rc; + + LASSERT(range_is_sane(&seq->lcs_space)); + + if (range_is_exhausted(&seq->lcs_space)) { + rc = seq_client_alloc_meta(env, seq); + if (rc) { + CERROR("%s: Can't allocate new meta-sequence, rc %d\n", + seq->lcs_name, rc); + return rc; + } else { + CDEBUG(D_INFO, "%s: New range - "DRANGE"\n", + seq->lcs_name, PRANGE(&seq->lcs_space)); + } + } else { + rc = 0; + } + + LASSERT(!range_is_exhausted(&seq->lcs_space)); + *seqnr = seq->lcs_space.lsr_start; + seq->lcs_space.lsr_start += 1; + + CDEBUG(D_INFO, "%s: Allocated sequence [%#llx]\n", seq->lcs_name, + *seqnr); + + return rc; +} + +static int seq_fid_alloc_prep(struct lu_client_seq *seq, + wait_queue_t *link) +{ + if (seq->lcs_update) { + add_wait_queue(&seq->lcs_waitq, link); + set_current_state(TASK_UNINTERRUPTIBLE); + mutex_unlock(&seq->lcs_mutex); + + schedule(); + + mutex_lock(&seq->lcs_mutex); + remove_wait_queue(&seq->lcs_waitq, link); + set_current_state(TASK_RUNNING); + return -EAGAIN; + } + ++seq->lcs_update; + mutex_unlock(&seq->lcs_mutex); + return 0; +} + +static void seq_fid_alloc_fini(struct lu_client_seq *seq) +{ + LASSERT(seq->lcs_update == 1); + mutex_lock(&seq->lcs_mutex); + --seq->lcs_update; + wake_up(&seq->lcs_waitq); +} + +/** + * Allocate the whole seq to the caller. + **/ +int seq_client_get_seq(const struct lu_env *env, + struct lu_client_seq *seq, u64 *seqnr) +{ + wait_queue_t link; + int rc; + + LASSERT(seqnr != NULL); + mutex_lock(&seq->lcs_mutex); + init_waitqueue_entry(&link, current); + + while (1) { + rc = seq_fid_alloc_prep(seq, &link); + if (rc == 0) + break; + } + + rc = seq_client_alloc_seq(env, seq, seqnr); + if (rc) { + CERROR("%s: Can't allocate new sequence, rc %d\n", + seq->lcs_name, rc); + seq_fid_alloc_fini(seq); + mutex_unlock(&seq->lcs_mutex); + return rc; + } + + CDEBUG(D_INFO, "%s: allocate sequence [0x%16.16Lx]\n", + seq->lcs_name, *seqnr); + + /* Since the caller require the whole seq, + * so marked this seq to be used */ + if (seq->lcs_type == LUSTRE_SEQ_METADATA) + seq->lcs_fid.f_oid = LUSTRE_METADATA_SEQ_MAX_WIDTH; + else + seq->lcs_fid.f_oid = LUSTRE_DATA_SEQ_MAX_WIDTH; + + seq->lcs_fid.f_seq = *seqnr; + seq->lcs_fid.f_ver = 0; + /* + * Inform caller that sequence switch is performed to allow it + * to setup FLD for it. + */ + seq_fid_alloc_fini(seq); + mutex_unlock(&seq->lcs_mutex); + + return rc; +} +EXPORT_SYMBOL(seq_client_get_seq); + +/* Allocate new fid on passed client @seq and save it to @fid. 
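Stripped of the RPC and locking details, the allocation path above hands out lsr_start from the locally granted range and bumps it, refilling the range from the sequence controller only on exhaustion. A stand-alone model of that bookkeeping, with a simplified range type in place of lu_seq_range and a stub (with an arbitrary grant size) in place of seq_client_rpc():

#include <stdint.h>
#include <stdio.h>

struct range_example {       /* simplified stand-in for lu_seq_range */
	uint64_t start;      /* next unallocated sequence */
	uint64_t end;        /* first sequence past the granted range */
};

static int range_exhausted(const struct range_example *r)
{
	return r->start >= r->end;
}

/* stub for the "ask the server for a new range" RPC; 128 is arbitrary here */
static int refill_from_server(struct range_example *r)
{
	static uint64_t next = (1ULL << 33) + 0x400;  /* pretend server cursor */

	r->start = next;
	r->end = next + 128;
	next = r->end;
	return 0;
}

static int alloc_seq(struct range_example *space, uint64_t *seqnr)
{
	if (range_exhausted(space) && refill_from_server(space) != 0)
		return -1;

	*seqnr = space->start++;   /* same "take lsr_start, bump it" step as above */
	return 0;
}

int main(void)
{
	struct range_example space = { 0, 0 };
	uint64_t seq;
	int i;

	for (i = 0; i < 3; i++) {
		if (alloc_seq(&space, &seq) == 0)
			printf("allocated sequence %#jx\n", (uintmax_t)seq);
	}
	return 0;
}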
*/ +int seq_client_alloc_fid(const struct lu_env *env, + struct lu_client_seq *seq, struct lu_fid *fid) +{ + wait_queue_t link; + int rc; + + LASSERT(seq != NULL); + LASSERT(fid != NULL); + + init_waitqueue_entry(&link, current); + mutex_lock(&seq->lcs_mutex); + + if (OBD_FAIL_CHECK(OBD_FAIL_SEQ_EXHAUST)) + seq->lcs_fid.f_oid = seq->lcs_width; + + while (1) { + u64 seqnr; + + if (!fid_is_zero(&seq->lcs_fid) && + fid_oid(&seq->lcs_fid) < seq->lcs_width) { + /* Just bump last allocated fid and return to caller. */ + seq->lcs_fid.f_oid += 1; + rc = 0; + break; + } + + rc = seq_fid_alloc_prep(seq, &link); + if (rc) + continue; + + rc = seq_client_alloc_seq(env, seq, &seqnr); + if (rc) { + CERROR("%s: Can't allocate new sequence, rc %d\n", + seq->lcs_name, rc); + seq_fid_alloc_fini(seq); + mutex_unlock(&seq->lcs_mutex); + return rc; + } + + CDEBUG(D_INFO, "%s: Switch to sequence [0x%16.16Lx]\n", + seq->lcs_name, seqnr); + + seq->lcs_fid.f_oid = LUSTRE_FID_INIT_OID; + seq->lcs_fid.f_seq = seqnr; + seq->lcs_fid.f_ver = 0; + + /* + * Inform caller that sequence switch is performed to allow it + * to setup FLD for it. + */ + rc = 1; + + seq_fid_alloc_fini(seq); + break; + } + + *fid = seq->lcs_fid; + mutex_unlock(&seq->lcs_mutex); + + CDEBUG(D_INFO, "%s: Allocated FID "DFID"\n", seq->lcs_name, PFID(fid)); + return rc; +} +EXPORT_SYMBOL(seq_client_alloc_fid); + +/* + * Finish the current sequence due to disconnect. + * See mdc_import_event() + */ +void seq_client_flush(struct lu_client_seq *seq) +{ + wait_queue_t link; + + LASSERT(seq != NULL); + init_waitqueue_entry(&link, current); + mutex_lock(&seq->lcs_mutex); + + while (seq->lcs_update) { + add_wait_queue(&seq->lcs_waitq, &link); + set_current_state(TASK_UNINTERRUPTIBLE); + mutex_unlock(&seq->lcs_mutex); + + schedule(); + + mutex_lock(&seq->lcs_mutex); + remove_wait_queue(&seq->lcs_waitq, &link); + set_current_state(TASK_RUNNING); + } + + fid_zero(&seq->lcs_fid); + /** + * this id shld not be used for seq range allocation. + * set to -1 for dgb check. 
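The loop in seq_client_alloc_fid() above reduces to: while the current sequence still has room (f_oid below lcs_width), a new FID is simply the last one with f_oid bumped; only when the width is used up does the client switch to a fresh sequence and return 1 so the caller can set up the FLD for it. A compact stand-alone model of that policy (simplified types, no waitqueue or mutex handling, tiny width so the switch is visible):

#include <stdint.h>
#include <stdio.h>

struct fid_model { uint64_t seq; uint32_t oid; };   /* stand-in for lu_fid */

struct seq_model {
	struct fid_model fid;      /* last FID handed out */
	uint32_t width;            /* stand-in for lcs_width */
	uint64_t next_seq;         /* stand-in for the granted sequence range */
};

/* returns 0 when only the oid was bumped, 1 when a sequence switch happened */
static int alloc_fid(struct seq_model *s, struct fid_model *out)
{
	if (s->fid.seq != 0 && s->fid.oid < s->width) {
		s->fid.oid++;              /* common case: bump the object id */
		*out = s->fid;
		return 0;
	}

	s->fid.seq = s->next_seq++;        /* stand-in for seq_client_alloc_seq() */
	s->fid.oid = 1;                    /* first usable oid in the new sequence */
	*out = s->fid;
	return 1;                          /* caller must update the FLD */
}

int main(void)
{
	struct seq_model s = { .fid = { 0, 0 }, .width = 3, .next_seq = 0x200000400ULL };
	struct fid_model fid;
	int i;

	for (i = 0; i < 5; i++) {
		int switched = alloc_fid(&s, &fid);

		printf("fid [%#jx:0x%x] switched=%d\n",
		       (uintmax_t)fid.seq, fid.oid, switched);
	}
	return 0;
}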
+ */ + + seq->lcs_space.lsr_index = -1; + + range_init(&seq->lcs_space); + mutex_unlock(&seq->lcs_mutex); +} +EXPORT_SYMBOL(seq_client_flush); + +static void seq_client_proc_fini(struct lu_client_seq *seq) +{ +#if defined(CONFIG_PROC_FS) + if (seq->lcs_proc_dir) { + if (!IS_ERR(seq->lcs_proc_dir)) + lprocfs_remove(&seq->lcs_proc_dir); + seq->lcs_proc_dir = NULL; + } +#endif /* CONFIG_PROC_FS */ +} + +static int seq_client_proc_init(struct lu_client_seq *seq) +{ +#if defined(CONFIG_PROC_FS) + int rc; + + seq->lcs_proc_dir = lprocfs_register(seq->lcs_name, + seq_type_proc_dir, + NULL, NULL); + + if (IS_ERR(seq->lcs_proc_dir)) { + CERROR("%s: LProcFS failed in seq-init\n", + seq->lcs_name); + rc = PTR_ERR(seq->lcs_proc_dir); + return rc; + } + + rc = lprocfs_add_vars(seq->lcs_proc_dir, + seq_client_proc_list, seq); + if (rc) { + CERROR("%s: Can't init sequence manager proc, rc %d\n", + seq->lcs_name, rc); + goto out_cleanup; + } + + return 0; + +out_cleanup: + seq_client_proc_fini(seq); + return rc; + +#else /* CONFIG_PROC_FS */ + return 0; +#endif +} + +int seq_client_init(struct lu_client_seq *seq, + struct obd_export *exp, + enum lu_cli_type type, + const char *prefix, + struct lu_server_seq *srv) +{ + int rc; + + LASSERT(seq != NULL); + LASSERT(prefix != NULL); + + seq->lcs_srv = srv; + seq->lcs_type = type; + + mutex_init(&seq->lcs_mutex); + if (type == LUSTRE_SEQ_METADATA) + seq->lcs_width = LUSTRE_METADATA_SEQ_MAX_WIDTH; + else + seq->lcs_width = LUSTRE_DATA_SEQ_MAX_WIDTH; + + init_waitqueue_head(&seq->lcs_waitq); + /* Make sure that things are clear before work is started. */ + seq_client_flush(seq); + + if (exp != NULL) + seq->lcs_exp = class_export_get(exp); + else if (type == LUSTRE_SEQ_METADATA) + LASSERT(seq->lcs_srv != NULL); + + snprintf(seq->lcs_name, sizeof(seq->lcs_name), + "cli-%s", prefix); + + rc = seq_client_proc_init(seq); + if (rc) + seq_client_fini(seq); + return rc; +} +EXPORT_SYMBOL(seq_client_init); + +void seq_client_fini(struct lu_client_seq *seq) +{ + seq_client_proc_fini(seq); + + if (seq->lcs_exp != NULL) { + class_export_put(seq->lcs_exp); + seq->lcs_exp = NULL; + } + + seq->lcs_srv = NULL; +} +EXPORT_SYMBOL(seq_client_fini); + +int client_fid_init(struct obd_device *obd, + struct obd_export *exp, enum lu_cli_type type) +{ + struct client_obd *cli = &obd->u.cli; + char *prefix; + int rc; + + OBD_ALLOC_PTR(cli->cl_seq); + if (cli->cl_seq == NULL) + return -ENOMEM; + + OBD_ALLOC(prefix, MAX_OBD_NAME + 5); + if (prefix == NULL) { + rc = -ENOMEM; + goto out_free_seq; + } + + snprintf(prefix, MAX_OBD_NAME + 5, "cli-%s", obd->obd_name); + + /* Init client side sequence-manager */ + rc = seq_client_init(cli->cl_seq, exp, type, prefix, NULL); + OBD_FREE(prefix, MAX_OBD_NAME + 5); + if (rc) + goto out_free_seq; + + return rc; +out_free_seq: + OBD_FREE_PTR(cli->cl_seq); + cli->cl_seq = NULL; + return rc; +} +EXPORT_SYMBOL(client_fid_init); + +int client_fid_fini(struct obd_device *obd) +{ + struct client_obd *cli = &obd->u.cli; + + if (cli->cl_seq != NULL) { + seq_client_fini(cli->cl_seq); + OBD_FREE_PTR(cli->cl_seq); + cli->cl_seq = NULL; + } + + return 0; +} +EXPORT_SYMBOL(client_fid_fini); + +struct proc_dir_entry *seq_type_proc_dir; + +static int __init fid_mod_init(void) +{ + seq_type_proc_dir = lprocfs_register(LUSTRE_SEQ_NAME, + proc_lustre_root, + NULL, NULL); + return PTR_ERR_OR_ZERO(seq_type_proc_dir); +} + +static void __exit fid_mod_exit(void) +{ + if (seq_type_proc_dir != NULL && !IS_ERR(seq_type_proc_dir)) { + lprocfs_remove(&seq_type_proc_dir); + 
seq_type_proc_dir = NULL; + } +} + +MODULE_AUTHOR("Sun Microsystems, Inc. "); +MODULE_DESCRIPTION("Lustre FID Module"); +MODULE_LICENSE("GPL"); +MODULE_VERSION("0.1.0"); + +module_init(fid_mod_init); +module_exit(fid_mod_exit); diff --git a/kernel/drivers/staging/lustre/lustre/fid/lproc_fid.c b/kernel/drivers/staging/lustre/lustre/fid/lproc_fid.c new file mode 100644 index 000000000..783939dbd --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/fid/lproc_fid.c @@ -0,0 +1,225 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/fid/lproc_fid.c + * + * Lustre Sequence Manager + * + * Author: Yury Umanets + */ + +#define DEBUG_SUBSYSTEM S_FID + +#include "../../include/linux/libcfs/libcfs.h" +#include + +#include "../include/obd.h" +#include "../include/obd_class.h" +#include "../include/dt_object.h" +#include "../include/obd_support.h" +#include "../include/lustre_req_layout.h" +#include "../include/lustre_fid.h" +#include "fid_internal.h" + +/* Format: [0x64BIT_INT - 0x64BIT_INT] + 32 bytes just in case */ +#define MAX_FID_RANGE_STRLEN (32 + 2 * 2 * sizeof(__u64)) +/* + * Note: this function is only used for testing, it is no safe for production + * use. 
+ */ +static int lprocfs_fid_write_common(const char __user *buffer, size_t count, + struct lu_seq_range *range) +{ + struct lu_seq_range tmp; + int rc; + char kernbuf[MAX_FID_RANGE_STRLEN]; + + LASSERT(range != NULL); + + if (count >= sizeof(kernbuf)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + + kernbuf[count] = 0; + + if (count == 5 && strcmp(kernbuf, "clear") == 0) { + memset(range, 0, sizeof(*range)); + return count; + } + + /* of the form "[0x0000000240000400 - 0x000000028000400]" */ + rc = sscanf(kernbuf, "[%llx - %llx]\n", + (unsigned long long *)&tmp.lsr_start, + (unsigned long long *)&tmp.lsr_end); + if (!range_is_sane(&tmp) || range_is_zero(&tmp) || + tmp.lsr_start < range->lsr_start || tmp.lsr_end > range->lsr_end) + return -EINVAL; + *range = tmp; + return count; +} + +/* Client side procfs stuff */ +static ssize_t lprocfs_fid_space_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct lu_client_seq *seq; + int rc; + + seq = ((struct seq_file *)file->private_data)->private; + LASSERT(seq != NULL); + + mutex_lock(&seq->lcs_mutex); + rc = lprocfs_fid_write_common(buffer, count, &seq->lcs_space); + + if (rc == 0) { + CDEBUG(D_INFO, "%s: Space: "DRANGE"\n", + seq->lcs_name, PRANGE(&seq->lcs_space)); + } + + mutex_unlock(&seq->lcs_mutex); + + return count; +} + +static int +lprocfs_fid_space_seq_show(struct seq_file *m, void *unused) +{ + struct lu_client_seq *seq = (struct lu_client_seq *)m->private; + + LASSERT(seq != NULL); + + mutex_lock(&seq->lcs_mutex); + seq_printf(m, "[%#llx - %#llx]:%x:%s\n", PRANGE(&seq->lcs_space)); + mutex_unlock(&seq->lcs_mutex); + + return 0; +} + +static ssize_t lprocfs_fid_width_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct lu_client_seq *seq; + __u64 max; + int rc, val; + + seq = ((struct seq_file *)file->private_data)->private; + LASSERT(seq != NULL); + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + mutex_lock(&seq->lcs_mutex); + if (seq->lcs_type == LUSTRE_SEQ_DATA) + max = LUSTRE_DATA_SEQ_MAX_WIDTH; + else + max = LUSTRE_METADATA_SEQ_MAX_WIDTH; + + if (val <= max && val > 0) { + seq->lcs_width = val; + + if (rc == 0) { + CDEBUG(D_INFO, "%s: Sequence size: %llu\n", + seq->lcs_name, seq->lcs_width); + } + } + + mutex_unlock(&seq->lcs_mutex); + + return count; +} + +static int +lprocfs_fid_width_seq_show(struct seq_file *m, void *unused) +{ + struct lu_client_seq *seq = (struct lu_client_seq *)m->private; + + LASSERT(seq != NULL); + + mutex_lock(&seq->lcs_mutex); + seq_printf(m, "%llu\n", seq->lcs_width); + mutex_unlock(&seq->lcs_mutex); + + return 0; +} + +static int +lprocfs_fid_fid_seq_show(struct seq_file *m, void *unused) +{ + struct lu_client_seq *seq = (struct lu_client_seq *)m->private; + + LASSERT(seq != NULL); + + mutex_lock(&seq->lcs_mutex); + seq_printf(m, DFID "\n", PFID(&seq->lcs_fid)); + mutex_unlock(&seq->lcs_mutex); + + return 0; +} + +static int +lprocfs_fid_server_seq_show(struct seq_file *m, void *unused) +{ + struct lu_client_seq *seq = (struct lu_client_seq *)m->private; + struct client_obd *cli; + + LASSERT(seq != NULL); + + if (seq->lcs_exp != NULL) { + cli = &seq->lcs_exp->exp_obd->u.cli; + seq_printf(m, "%s\n", cli->cl_target_uuid.uuid); + } else { + seq_printf(m, "%s\n", seq->lcs_srv->lss_name); + } + + return 0; +} + +LPROC_SEQ_FOPS(lprocfs_fid_space); +LPROC_SEQ_FOPS(lprocfs_fid_width); +LPROC_SEQ_FOPS_RO(lprocfs_fid_server); 
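The write helper above accepts either the literal string "clear" or a range written as "[0xSTART - 0xEND]". A small user-space approximation of the same parse, handy for checking what the sscanf format tolerates (it skips the kernel-side copy_from_user and range_is_sane checks):

#include <stdio.h>
#include <string.h>

/* mimic lprocfs_fid_write_common(): "clear" or "[0x... - 0x...]" */
static int parse_range(const char *buf, unsigned long long *start,
		       unsigned long long *end)
{
	if (strcmp(buf, "clear") == 0) {
		*start = 0;
		*end = 0;
		return 0;
	}
	if (sscanf(buf, "[%llx - %llx]", start, end) != 2)
		return -1;
	return (*start < *end) ? 0 : -1;   /* a sane range has start < end */
}

int main(void)
{
	unsigned long long s, e;

	if (parse_range("[0x0000000240000400 - 0x0000000280000400]", &s, &e) == 0)
		printf("parsed range [%#llx - %#llx]\n", s, e);
	return 0;
}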
+LPROC_SEQ_FOPS_RO(lprocfs_fid_fid); + +struct lprocfs_vars seq_client_proc_list[] = { + { "space", &lprocfs_fid_space_fops }, + { "width", &lprocfs_fid_width_fops }, + { "server", &lprocfs_fid_server_fops }, + { "fid", &lprocfs_fid_fid_fops }, + { NULL } +}; diff --git a/kernel/drivers/staging/lustre/lustre/fld/Makefile b/kernel/drivers/staging/lustre/lustre/fld/Makefile new file mode 100644 index 000000000..2bbf08433 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/fld/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_LUSTRE_FS) += fld.o +fld-y := fld_request.o fld_cache.o +fld-$(CONFIG_PROC_FS) += lproc_fld.o diff --git a/kernel/drivers/staging/lustre/lustre/fld/fld_cache.c b/kernel/drivers/staging/lustre/lustre/fld/fld_cache.c new file mode 100644 index 000000000..0d0a73745 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/fld/fld_cache.c @@ -0,0 +1,546 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/fld/fld_cache.c + * + * FLD (Fids Location Database) + * + * Author: Pravin Shelar + * Author: Yury Umanets + */ + +#define DEBUG_SUBSYSTEM S_FLD + +#include "../../include/linux/libcfs/libcfs.h" +#include +#include + +#include "../include/obd.h" +#include "../include/obd_class.h" +#include "../include/lustre_ver.h" +#include "../include/obd_support.h" +#include "../include/lprocfs_status.h" + +#include "../include/dt_object.h" +#include "../include/lustre_req_layout.h" +#include "../include/lustre_fld.h" +#include "fld_internal.h" + +/** + * create fld cache. + */ +struct fld_cache *fld_cache_init(const char *name, + int cache_size, int cache_threshold) +{ + struct fld_cache *cache; + + LASSERT(name != NULL); + LASSERT(cache_threshold < cache_size); + + OBD_ALLOC_PTR(cache); + if (cache == NULL) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&cache->fci_entries_head); + INIT_LIST_HEAD(&cache->fci_lru); + + cache->fci_cache_count = 0; + rwlock_init(&cache->fci_lock); + + strlcpy(cache->fci_name, name, + sizeof(cache->fci_name)); + + cache->fci_cache_size = cache_size; + cache->fci_threshold = cache_threshold; + + /* Init fld cache info. 
*/ + memset(&cache->fci_stat, 0, sizeof(cache->fci_stat)); + + CDEBUG(D_INFO, "%s: FLD cache - Size: %d, Threshold: %d\n", + cache->fci_name, cache_size, cache_threshold); + + return cache; +} + +/** + * destroy fld cache. + */ +void fld_cache_fini(struct fld_cache *cache) +{ + __u64 pct; + + LASSERT(cache != NULL); + fld_cache_flush(cache); + + if (cache->fci_stat.fst_count > 0) { + pct = cache->fci_stat.fst_cache * 100; + do_div(pct, cache->fci_stat.fst_count); + } else { + pct = 0; + } + + CDEBUG(D_INFO, "FLD cache statistics (%s):\n", cache->fci_name); + CDEBUG(D_INFO, " Total reqs: %llu\n", cache->fci_stat.fst_count); + CDEBUG(D_INFO, " Cache reqs: %llu\n", cache->fci_stat.fst_cache); + CDEBUG(D_INFO, " Cache hits: %llu%%\n", pct); + + OBD_FREE_PTR(cache); +} + +/** + * delete given node from list. + */ +void fld_cache_entry_delete(struct fld_cache *cache, + struct fld_cache_entry *node) +{ + list_del(&node->fce_list); + list_del(&node->fce_lru); + cache->fci_cache_count--; + OBD_FREE_PTR(node); +} + +/** + * fix list by checking new entry with NEXT entry in order. + */ +static void fld_fix_new_list(struct fld_cache *cache) +{ + struct fld_cache_entry *f_curr; + struct fld_cache_entry *f_next; + struct lu_seq_range *c_range; + struct lu_seq_range *n_range; + struct list_head *head = &cache->fci_entries_head; + +restart_fixup: + + list_for_each_entry_safe(f_curr, f_next, head, fce_list) { + c_range = &f_curr->fce_range; + n_range = &f_next->fce_range; + + LASSERT(range_is_sane(c_range)); + if (&f_next->fce_list == head) + break; + + if (c_range->lsr_flags != n_range->lsr_flags) + continue; + + LASSERTF(c_range->lsr_start <= n_range->lsr_start, + "cur lsr_start "DRANGE" next lsr_start "DRANGE"\n", + PRANGE(c_range), PRANGE(n_range)); + + /* check merge possibility with next range */ + if (c_range->lsr_end == n_range->lsr_start) { + if (c_range->lsr_index != n_range->lsr_index) + continue; + n_range->lsr_start = c_range->lsr_start; + fld_cache_entry_delete(cache, f_curr); + continue; + } + + /* check if current range overlaps with next range. */ + if (n_range->lsr_start < c_range->lsr_end) { + if (c_range->lsr_index == n_range->lsr_index) { + n_range->lsr_start = c_range->lsr_start; + n_range->lsr_end = max(c_range->lsr_end, + n_range->lsr_end); + fld_cache_entry_delete(cache, f_curr); + } else { + if (n_range->lsr_end <= c_range->lsr_end) { + *n_range = *c_range; + fld_cache_entry_delete(cache, f_curr); + } else + n_range->lsr_start = c_range->lsr_end; + } + + /* we could have overlap over next + * range too. better restart. */ + goto restart_fixup; + } + + /* kill duplicates */ + if (c_range->lsr_start == n_range->lsr_start && + c_range->lsr_end == n_range->lsr_end) + fld_cache_entry_delete(cache, f_curr); + } +} + +/** + * add node to fld cache + */ +static inline void fld_cache_entry_add(struct fld_cache *cache, + struct fld_cache_entry *f_new, + struct list_head *pos) +{ + list_add(&f_new->fce_list, pos); + list_add(&f_new->fce_lru, &cache->fci_lru); + + cache->fci_cache_count++; + fld_fix_new_list(cache); +} + +/** + * Check if cache needs to be shrunk. If so - do it. + * Remove one entry in list and so on until cache is shrunk enough. 
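fld_fix_new_list() above keeps the entries sorted by lsr_start and repeatedly reconciles neighbours: touching ranges on the same index are coalesced, overlapping ranges are merged or trimmed, and exact duplicates are dropped. The core rule, two touching ranges on the same index collapsing into one, looks like this in isolation (simplified type, single pair, no list or LRU bookkeeping):

#include <stdint.h>
#include <stdio.h>

struct range_ex { uint64_t start, end; uint32_t index; };

/* merge b into a when they touch and live on the same index; returns 1 on merge */
static int try_merge(struct range_ex *a, const struct range_ex *b)
{
	if (a->index != b->index || a->end != b->start)
		return 0;
	a->end = b->end;     /* [a.start, a.end) now covers both ranges */
	return 1;
}

int main(void)
{
	struct range_ex cur = { 0x400, 0x500, 0 };
	struct range_ex next = { 0x500, 0x600, 0 };

	if (try_merge(&cur, &next))
		printf("merged into [%#jx, %#jx) on index %u\n",
		       (uintmax_t)cur.start, (uintmax_t)cur.end, cur.index);
	return 0;
}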
+ */ +static int fld_cache_shrink(struct fld_cache *cache) +{ + struct fld_cache_entry *flde; + struct list_head *curr; + int num = 0; + + LASSERT(cache != NULL); + + if (cache->fci_cache_count < cache->fci_cache_size) + return 0; + + curr = cache->fci_lru.prev; + + while (cache->fci_cache_count + cache->fci_threshold > + cache->fci_cache_size && curr != &cache->fci_lru) { + + flde = list_entry(curr, struct fld_cache_entry, fce_lru); + curr = curr->prev; + fld_cache_entry_delete(cache, flde); + num++; + } + + CDEBUG(D_INFO, "%s: FLD cache - Shrunk by %d entries\n", + cache->fci_name, num); + + return 0; +} + +/** + * kill all fld cache entries. + */ +void fld_cache_flush(struct fld_cache *cache) +{ + write_lock(&cache->fci_lock); + cache->fci_cache_size = 0; + fld_cache_shrink(cache); + write_unlock(&cache->fci_lock); +} + +/** + * punch hole in existing range. divide this range and add new + * entry accordingly. + */ + +static void fld_cache_punch_hole(struct fld_cache *cache, + struct fld_cache_entry *f_curr, + struct fld_cache_entry *f_new) +{ + const struct lu_seq_range *range = &f_new->fce_range; + const u64 new_start = range->lsr_start; + const u64 new_end = range->lsr_end; + struct fld_cache_entry *fldt; + + OBD_ALLOC_GFP(fldt, sizeof(*fldt), GFP_ATOMIC); + if (!fldt) { + OBD_FREE_PTR(f_new); + /* overlap is not allowed, so dont mess up list. */ + return; + } + /* break f_curr RANGE into three RANGES: + * f_curr, f_new , fldt + */ + + /* f_new = *range */ + + /* fldt */ + fldt->fce_range.lsr_start = new_end; + fldt->fce_range.lsr_end = f_curr->fce_range.lsr_end; + fldt->fce_range.lsr_index = f_curr->fce_range.lsr_index; + + /* f_curr */ + f_curr->fce_range.lsr_end = new_start; + + /* add these two entries to list */ + fld_cache_entry_add(cache, f_new, &f_curr->fce_list); + fld_cache_entry_add(cache, fldt, &f_new->fce_list); + + /* no need to fixup */ +} + +/** + * handle range overlap in fld cache. + */ +static void fld_cache_overlap_handle(struct fld_cache *cache, + struct fld_cache_entry *f_curr, + struct fld_cache_entry *f_new) +{ + const struct lu_seq_range *range = &f_new->fce_range; + const u64 new_start = range->lsr_start; + const u64 new_end = range->lsr_end; + const u32 mdt = range->lsr_index; + + /* this is overlap case, these case are checking overlapping with + * prev range only. fixup will handle overlapping with next range. */ + + if (f_curr->fce_range.lsr_index == mdt) { + f_curr->fce_range.lsr_start = min(f_curr->fce_range.lsr_start, + new_start); + + f_curr->fce_range.lsr_end = max(f_curr->fce_range.lsr_end, + new_end); + + OBD_FREE_PTR(f_new); + fld_fix_new_list(cache); + + } else if (new_start <= f_curr->fce_range.lsr_start && + f_curr->fce_range.lsr_end <= new_end) { + /* case 1: new range completely overshadowed existing range. + * e.g. whole range migrated. update fld cache entry */ + + f_curr->fce_range = *range; + OBD_FREE_PTR(f_new); + fld_fix_new_list(cache); + + } else if (f_curr->fce_range.lsr_start < new_start && + new_end < f_curr->fce_range.lsr_end) { + /* case 2: new range fit within existing range. 
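fld_cache_punch_hole() above handles a new range landing strictly inside a cached one by splitting the old entry in three: a left remainder that keeps the old index, the new middle entry, and a freshly allocated right remainder that also keeps the old index. The arithmetic of that split, in isolation (simplified type, no allocation failure path):

#include <stdint.h>
#include <stdio.h>

struct range_ex2 { uint64_t start, end; uint32_t index; };

/*
 * Split "old" around "hole" (assumed to lie strictly inside it):
 * left keeps old's index up to hole.start, right covers hole.end .. old.end.
 */
static void punch_hole(const struct range_ex2 *old, const struct range_ex2 *hole,
		       struct range_ex2 *left, struct range_ex2 *right)
{
	*left  = (struct range_ex2){ old->start, hole->start, old->index };
	*right = (struct range_ex2){ hole->end,  old->end,    old->index };
}

int main(void)
{
	struct range_ex2 old  = { 0x100, 0x900, 7 };
	struct range_ex2 hole = { 0x300, 0x500, 9 };   /* newly learned range */
	struct range_ex2 left, right;

	punch_hole(&old, &hole, &left, &right);
	printf("left  [%#jx, %#jx) idx %u\n", (uintmax_t)left.start,
	       (uintmax_t)left.end, left.index);
	printf("hole  [%#jx, %#jx) idx %u\n", (uintmax_t)hole.start,
	       (uintmax_t)hole.end, hole.index);
	printf("right [%#jx, %#jx) idx %u\n", (uintmax_t)right.start,
	       (uintmax_t)right.end, right.index);
	return 0;
}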
*/ + + fld_cache_punch_hole(cache, f_curr, f_new); + + } else if (new_end <= f_curr->fce_range.lsr_end) { + /* case 3: overlap: + * [new_start [c_start new_end) c_end) + */ + + LASSERT(new_start <= f_curr->fce_range.lsr_start); + + f_curr->fce_range.lsr_start = new_end; + fld_cache_entry_add(cache, f_new, f_curr->fce_list.prev); + + } else if (f_curr->fce_range.lsr_start <= new_start) { + /* case 4: overlap: + * [c_start [new_start c_end) new_end) + */ + + LASSERT(f_curr->fce_range.lsr_end <= new_end); + + f_curr->fce_range.lsr_end = new_start; + fld_cache_entry_add(cache, f_new, &f_curr->fce_list); + } else + CERROR("NEW range ="DRANGE" curr = "DRANGE"\n", + PRANGE(range), PRANGE(&f_curr->fce_range)); +} + +struct fld_cache_entry +*fld_cache_entry_create(const struct lu_seq_range *range) +{ + struct fld_cache_entry *f_new; + + LASSERT(range_is_sane(range)); + + OBD_ALLOC_PTR(f_new); + if (!f_new) + return ERR_PTR(-ENOMEM); + + f_new->fce_range = *range; + return f_new; +} + +/** + * Insert FLD entry in FLD cache. + * + * This function handles all cases of merging and breaking up of + * ranges. + */ +int fld_cache_insert_nolock(struct fld_cache *cache, + struct fld_cache_entry *f_new) +{ + struct fld_cache_entry *f_curr; + struct fld_cache_entry *n; + struct list_head *head; + struct list_head *prev = NULL; + const u64 new_start = f_new->fce_range.lsr_start; + const u64 new_end = f_new->fce_range.lsr_end; + __u32 new_flags = f_new->fce_range.lsr_flags; + + /* + * Duplicate entries are eliminated in insert op. + * So we don't need to search new entry before starting + * insertion loop. + */ + + if (!cache->fci_no_shrink) + fld_cache_shrink(cache); + + head = &cache->fci_entries_head; + + list_for_each_entry_safe(f_curr, n, head, fce_list) { + /* add list if next is end of list */ + if (new_end < f_curr->fce_range.lsr_start || + (new_end == f_curr->fce_range.lsr_start && + new_flags != f_curr->fce_range.lsr_flags)) + break; + + prev = &f_curr->fce_list; + /* check if this range is to left of new range. */ + if (new_start < f_curr->fce_range.lsr_end && + new_flags == f_curr->fce_range.lsr_flags) { + fld_cache_overlap_handle(cache, f_curr, f_new); + goto out; + } + } + + if (prev == NULL) + prev = head; + + CDEBUG(D_INFO, "insert range "DRANGE"\n", PRANGE(&f_new->fce_range)); + /* Add new entry to cache and lru list. */ + fld_cache_entry_add(cache, f_new, prev); +out: + return 0; +} + +int fld_cache_insert(struct fld_cache *cache, + const struct lu_seq_range *range) +{ + struct fld_cache_entry *flde; + int rc; + + flde = fld_cache_entry_create(range); + if (IS_ERR(flde)) + return PTR_ERR(flde); + + write_lock(&cache->fci_lock); + rc = fld_cache_insert_nolock(cache, flde); + write_unlock(&cache->fci_lock); + if (rc) + OBD_FREE_PTR(flde); + + return rc; +} + +void fld_cache_delete_nolock(struct fld_cache *cache, + const struct lu_seq_range *range) +{ + struct fld_cache_entry *flde; + struct fld_cache_entry *tmp; + struct list_head *head; + + head = &cache->fci_entries_head; + list_for_each_entry_safe(flde, tmp, head, fce_list) { + /* add list if next is end of list */ + if (range->lsr_start == flde->fce_range.lsr_start || + (range->lsr_end == flde->fce_range.lsr_end && + range->lsr_flags == flde->fce_range.lsr_flags)) { + fld_cache_entry_delete(cache, flde); + break; + } + } +} + +/** + * Delete FLD entry in FLD cache. 
+ * + */ +void fld_cache_delete(struct fld_cache *cache, + const struct lu_seq_range *range) +{ + write_lock(&cache->fci_lock); + fld_cache_delete_nolock(cache, range); + write_unlock(&cache->fci_lock); +} + +struct fld_cache_entry +*fld_cache_entry_lookup_nolock(struct fld_cache *cache, + struct lu_seq_range *range) +{ + struct fld_cache_entry *flde; + struct fld_cache_entry *got = NULL; + struct list_head *head; + + head = &cache->fci_entries_head; + list_for_each_entry(flde, head, fce_list) { + if (range->lsr_start == flde->fce_range.lsr_start || + (range->lsr_end == flde->fce_range.lsr_end && + range->lsr_flags == flde->fce_range.lsr_flags)) { + got = flde; + break; + } + } + + return got; +} + +/** + * lookup \a seq sequence for range in fld cache. + */ +struct fld_cache_entry +*fld_cache_entry_lookup(struct fld_cache *cache, struct lu_seq_range *range) +{ + struct fld_cache_entry *got = NULL; + + read_lock(&cache->fci_lock); + got = fld_cache_entry_lookup_nolock(cache, range); + read_unlock(&cache->fci_lock); + return got; +} + +/** + * lookup \a seq sequence for range in fld cache. + */ +int fld_cache_lookup(struct fld_cache *cache, + const u64 seq, struct lu_seq_range *range) +{ + struct fld_cache_entry *flde; + struct fld_cache_entry *prev = NULL; + struct list_head *head; + + read_lock(&cache->fci_lock); + head = &cache->fci_entries_head; + + cache->fci_stat.fst_count++; + list_for_each_entry(flde, head, fce_list) { + if (flde->fce_range.lsr_start > seq) { + if (prev != NULL) + *range = prev->fce_range; + break; + } + + prev = flde; + if (range_within(&flde->fce_range, seq)) { + *range = flde->fce_range; + + cache->fci_stat.fst_cache++; + read_unlock(&cache->fci_lock); + return 0; + } + } + read_unlock(&cache->fci_lock); + return -ENOENT; +} diff --git a/kernel/drivers/staging/lustre/lustre/fld/fld_internal.h b/kernel/drivers/staging/lustre/lustre/fld/fld_internal.h new file mode 100644 index 000000000..68bec7658 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/fld/fld_internal.h @@ -0,0 +1,193 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
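fld_cache_lookup() above walks the start-sorted entry list and reports a hit as soon as the sequence falls inside a cached range; once it reaches an entry whose lsr_start exceeds the sequence it can stop, and a miss sends the caller to the server. The same scan over a start-sorted array, as a stand-alone sketch:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

struct rng { uint64_t start, end; uint32_t index; };

/* linear scan of a start-sorted table; mirrors the list walk above */
static int lookup(const struct rng *tbl, int n, uint64_t seq, uint32_t *index)
{
	int i;

	for (i = 0; i < n; i++) {
		if (tbl[i].start > seq)
			break;                  /* sorted: no later entry can match */
		if (seq < tbl[i].end) {
			*index = tbl[i].index;  /* cache hit */
			return 0;
		}
	}
	return -ENOENT;                         /* miss: caller falls back to an RPC */
}

int main(void)
{
	const struct rng cache[] = {
		{ 0x200000400, 0x200000800, 0 },
		{ 0x200000800, 0x200000c00, 1 },
	};
	uint32_t idx;

	if (lookup(cache, 2, 0x200000a00, &idx) == 0)
		printf("seq 0x200000a00 lives on index %u\n", idx);
	return 0;
}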
+ * + * lustre/fld/fld_internal.h + * + * Author: Yury Umanets + * Author: Tom WangDi + */ +#ifndef __FLD_INTERNAL_H +#define __FLD_INTERNAL_H + +#include "../include/lustre/lustre_idl.h" +#include "../include/dt_object.h" + +#include "../../include/linux/libcfs/libcfs.h" +#include "../include/lustre_req_layout.h" +#include "../include/lustre_fld.h" + +enum { + LUSTRE_FLD_INIT = 1 << 0, + LUSTRE_FLD_RUN = 1 << 1 +}; + +struct fld_stats { + __u64 fst_count; + __u64 fst_cache; + __u64 fst_inflight; +}; + +typedef int (*fld_hash_func_t) (struct lu_client_fld *, __u64); + +typedef struct lu_fld_target * +(*fld_scan_func_t) (struct lu_client_fld *, __u64); + +struct lu_fld_hash { + const char *fh_name; + fld_hash_func_t fh_hash_func; + fld_scan_func_t fh_scan_func; +}; + +struct fld_cache_entry { + struct list_head fce_lru; + struct list_head fce_list; + /** + * fld cache entries are sorted on range->lsr_start field. */ + struct lu_seq_range fce_range; +}; + +struct fld_cache { + /** + * Cache guard, protects fci_hash mostly because others immutable after + * init is finished. + */ + rwlock_t fci_lock; + + /** + * Cache shrink threshold */ + int fci_threshold; + + /** + * Preferred number of cached entries */ + int fci_cache_size; + + /** + * Current number of cached entries. Protected by \a fci_lock */ + int fci_cache_count; + + /** + * LRU list fld entries. */ + struct list_head fci_lru; + + /** + * sorted fld entries. */ + struct list_head fci_entries_head; + + /** + * Cache statistics. */ + struct fld_stats fci_stat; + + /** + * Cache name used for debug and messages. */ + char fci_name[LUSTRE_MDT_MAXNAMELEN]; + unsigned int fci_no_shrink:1; +}; + +enum fld_op { + FLD_CREATE = 0, + FLD_DELETE = 1, + FLD_LOOKUP = 2 +}; + +enum { + /* 4M of FLD cache will not hurt client a lot. */ + FLD_SERVER_CACHE_SIZE = (4 * 0x100000), + + /* 1M of FLD cache will not hurt client a lot. */ + FLD_CLIENT_CACHE_SIZE = (1 * 0x100000) +}; + +enum { + /* Cache threshold is 10 percent of size. */ + FLD_SERVER_CACHE_THRESHOLD = 10, + + /* Cache threshold is 10 percent of size. 
*/ + FLD_CLIENT_CACHE_THRESHOLD = 10 +}; + +extern struct lu_fld_hash fld_hash[]; + +int fld_client_rpc(struct obd_export *exp, + struct lu_seq_range *range, __u32 fld_op); + +#if defined(CONFIG_PROC_FS) +extern struct lprocfs_vars fld_client_proc_list[]; +#endif + + +struct fld_cache *fld_cache_init(const char *name, + int cache_size, int cache_threshold); + +void fld_cache_fini(struct fld_cache *cache); + +void fld_cache_flush(struct fld_cache *cache); + +int fld_cache_insert(struct fld_cache *cache, + const struct lu_seq_range *range); + +struct fld_cache_entry +*fld_cache_entry_create(const struct lu_seq_range *range); + +int fld_cache_insert_nolock(struct fld_cache *cache, + struct fld_cache_entry *f_new); +void fld_cache_delete(struct fld_cache *cache, + const struct lu_seq_range *range); +void fld_cache_delete_nolock(struct fld_cache *cache, + const struct lu_seq_range *range); +int fld_cache_lookup(struct fld_cache *cache, + const u64 seq, struct lu_seq_range *range); + +struct fld_cache_entry* +fld_cache_entry_lookup(struct fld_cache *cache, struct lu_seq_range *range); +void fld_cache_entry_delete(struct fld_cache *cache, + struct fld_cache_entry *node); +void fld_dump_cache_entries(struct fld_cache *cache); + +struct fld_cache_entry +*fld_cache_entry_lookup_nolock(struct fld_cache *cache, + struct lu_seq_range *range); +int fld_write_range(const struct lu_env *env, struct dt_object *dt, + const struct lu_seq_range *range, struct thandle *th); + +static inline const char * +fld_target_name(struct lu_fld_target *tar) +{ + if (tar->ft_srv != NULL) + return tar->ft_srv->lsf_name; + + return (const char *)tar->ft_exp->exp_obd->obd_name; +} + +#endif /* __FLD_INTERNAL_H */ diff --git a/kernel/drivers/staging/lustre/lustre/fld/fld_request.c b/kernel/drivers/staging/lustre/lustre/fld/fld_request.c new file mode 100644 index 000000000..6ac225e90 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/fld/fld_request.c @@ -0,0 +1,526 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/fld/fld_request.c + * + * FLD (Fids Location Database) + * + * Author: Yury Umanets + */ + +#define DEBUG_SUBSYSTEM S_FLD + +#include "../../include/linux/libcfs/libcfs.h" +#include +#include + +#include "../include/obd.h" +#include "../include/obd_class.h" +#include "../include/lustre_ver.h" +#include "../include/obd_support.h" +#include "../include/lprocfs_status.h" + +#include "../include/dt_object.h" +#include "../include/lustre_req_layout.h" +#include "../include/lustre_fld.h" +#include "../include/lustre_mdc.h" +#include "fld_internal.h" + +/* TODO: these 3 functions are copies of flow-control code from mdc_lib.c + * It should be common thing. The same about mdc RPC lock */ +static int fld_req_avail(struct client_obd *cli, struct mdc_cache_waiter *mcw) +{ + int rc; + + client_obd_list_lock(&cli->cl_loi_list_lock); + rc = list_empty(&mcw->mcw_entry); + client_obd_list_unlock(&cli->cl_loi_list_lock); + return rc; +}; + +static void fld_enter_request(struct client_obd *cli) +{ + struct mdc_cache_waiter mcw; + struct l_wait_info lwi = { 0 }; + + client_obd_list_lock(&cli->cl_loi_list_lock); + if (cli->cl_r_in_flight >= cli->cl_max_rpcs_in_flight) { + list_add_tail(&mcw.mcw_entry, &cli->cl_cache_waiters); + init_waitqueue_head(&mcw.mcw_waitq); + client_obd_list_unlock(&cli->cl_loi_list_lock); + l_wait_event(mcw.mcw_waitq, fld_req_avail(cli, &mcw), &lwi); + } else { + cli->cl_r_in_flight++; + client_obd_list_unlock(&cli->cl_loi_list_lock); + } +} + +static void fld_exit_request(struct client_obd *cli) +{ + struct list_head *l, *tmp; + struct mdc_cache_waiter *mcw; + + client_obd_list_lock(&cli->cl_loi_list_lock); + cli->cl_r_in_flight--; + list_for_each_safe(l, tmp, &cli->cl_cache_waiters) { + + if (cli->cl_r_in_flight >= cli->cl_max_rpcs_in_flight) { + /* No free request slots anymore */ + break; + } + + mcw = list_entry(l, struct mdc_cache_waiter, mcw_entry); + list_del_init(&mcw->mcw_entry); + cli->cl_r_in_flight++; + wake_up(&mcw->mcw_waitq); + } + client_obd_list_unlock(&cli->cl_loi_list_lock); +} + +static int fld_rrb_hash(struct lu_client_fld *fld, u64 seq) +{ + LASSERT(fld->lcf_count > 0); + return do_div(seq, fld->lcf_count); +} + +static struct lu_fld_target * +fld_rrb_scan(struct lu_client_fld *fld, u64 seq) +{ + struct lu_fld_target *target; + int hash; + + /* Because almost all of special sequence located in MDT0, + * it should go to index 0 directly, instead of calculating + * hash again, and also if other MDTs is not being connected, + * the fld lookup requests(for seq on MDT0) should not be + * blocked because of other MDTs */ + if (fid_seq_is_norm(seq)) + hash = fld_rrb_hash(fld, seq); + else + hash = 0; + +again: + list_for_each_entry(target, &fld->lcf_targets, ft_chain) { + if (target->ft_idx == hash) + return target; + } + + if (hash != 0) { + /* It is possible the remote target(MDT) are not connected to + * with client yet, so we will refer this to MDT0, which should + * be connected during mount */ + hash = 0; + goto again; + } + + CERROR("%s: Can't find target by hash %d (seq %#llx). Targets (%d):\n", + fld->lcf_name, hash, seq, fld->lcf_count); + + list_for_each_entry(target, &fld->lcf_targets, ft_chain) { + const char *srv_name = target->ft_srv != NULL ? + target->ft_srv->lsf_name : ""; + const char *exp_name = target->ft_exp != NULL ? 
+ (char *)target->ft_exp->exp_obd->obd_uuid.uuid : + ""; + + CERROR(" exp: 0x%p (%s), srv: 0x%p (%s), idx: %llu\n", + target->ft_exp, exp_name, target->ft_srv, + srv_name, target->ft_idx); + } + + /* + * If target is not found, there is logical error anyway, so here is + * LBUG() to catch this situation. + */ + LBUG(); + return NULL; +} + +struct lu_fld_hash fld_hash[] = { + { + .fh_name = "RRB", + .fh_hash_func = fld_rrb_hash, + .fh_scan_func = fld_rrb_scan + }, + { + NULL, + } +}; + +static struct lu_fld_target * +fld_client_get_target(struct lu_client_fld *fld, u64 seq) +{ + struct lu_fld_target *target; + + LASSERT(fld->lcf_hash != NULL); + + spin_lock(&fld->lcf_lock); + target = fld->lcf_hash->fh_scan_func(fld, seq); + spin_unlock(&fld->lcf_lock); + + if (target != NULL) { + CDEBUG(D_INFO, "%s: Found target (idx %llu) by seq %#llx\n", + fld->lcf_name, target->ft_idx, seq); + } + + return target; +} + +/* + * Add export to FLD. This is usually done by CMM and LMV as they are main users + * of FLD module. + */ +int fld_client_add_target(struct lu_client_fld *fld, + struct lu_fld_target *tar) +{ + const char *name; + struct lu_fld_target *target, *tmp; + + LASSERT(tar != NULL); + name = fld_target_name(tar); + LASSERT(name != NULL); + LASSERT(tar->ft_srv != NULL || tar->ft_exp != NULL); + + if (fld->lcf_flags != LUSTRE_FLD_INIT) { + CERROR("%s: Attempt to add target %s (idx %llu) on fly - skip it\n", + fld->lcf_name, name, tar->ft_idx); + return 0; + } + CDEBUG(D_INFO, "%s: Adding target %s (idx %llu)\n", + fld->lcf_name, name, tar->ft_idx); + + OBD_ALLOC_PTR(target); + if (target == NULL) + return -ENOMEM; + + spin_lock(&fld->lcf_lock); + list_for_each_entry(tmp, &fld->lcf_targets, ft_chain) { + if (tmp->ft_idx == tar->ft_idx) { + spin_unlock(&fld->lcf_lock); + OBD_FREE_PTR(target); + CERROR("Target %s exists in FLD and known as %s:#%llu\n", + name, fld_target_name(tmp), tmp->ft_idx); + return -EEXIST; + } + } + + target->ft_exp = tar->ft_exp; + if (target->ft_exp != NULL) + class_export_get(target->ft_exp); + target->ft_srv = tar->ft_srv; + target->ft_idx = tar->ft_idx; + + list_add_tail(&target->ft_chain, + &fld->lcf_targets); + + fld->lcf_count++; + spin_unlock(&fld->lcf_lock); + + return 0; +} +EXPORT_SYMBOL(fld_client_add_target); + +/* Remove export from FLD */ +int fld_client_del_target(struct lu_client_fld *fld, __u64 idx) +{ + struct lu_fld_target *target, *tmp; + + spin_lock(&fld->lcf_lock); + list_for_each_entry_safe(target, tmp, + &fld->lcf_targets, ft_chain) { + if (target->ft_idx == idx) { + fld->lcf_count--; + list_del(&target->ft_chain); + spin_unlock(&fld->lcf_lock); + + if (target->ft_exp != NULL) + class_export_put(target->ft_exp); + + OBD_FREE_PTR(target); + return 0; + } + } + spin_unlock(&fld->lcf_lock); + return -ENOENT; +} +EXPORT_SYMBOL(fld_client_del_target); + +static struct proc_dir_entry *fld_type_proc_dir; + +#if defined(CONFIG_PROC_FS) +static int fld_client_proc_init(struct lu_client_fld *fld) +{ + int rc; + + fld->lcf_proc_dir = lprocfs_register(fld->lcf_name, + fld_type_proc_dir, + NULL, NULL); + + if (IS_ERR(fld->lcf_proc_dir)) { + CERROR("%s: LProcFS failed in fld-init\n", + fld->lcf_name); + rc = PTR_ERR(fld->lcf_proc_dir); + return rc; + } + + rc = lprocfs_add_vars(fld->lcf_proc_dir, + fld_client_proc_list, fld); + if (rc) { + CERROR("%s: Can't init FLD proc, rc %d\n", + fld->lcf_name, rc); + goto out_cleanup; + } + + return 0; + +out_cleanup: + fld_client_proc_fini(fld); + return rc; +} + +void fld_client_proc_fini(struct lu_client_fld *fld) +{ + 
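The round-robin (RRB) policy above is plain modulo arithmetic: a normal sequence is mapped to one of lcf_count targets as seq % lcf_count, while reserved and special sequences are pinned to index 0 (MDT0), which is also the fallback when the hashed target is not connected yet. A stand-alone version of the selection rule; the lower bound used for a "normal" sequence stands in for fid_seq_is_norm() and follows the 0x400 reservation comment in fid_lib.c earlier in this patch:

#include <stdint.h>
#include <stdio.h>

/* stand-in for fid_seq_is_norm(): normal sequences start past the
 * reserved 0x400 block above 2^33 */
static int seq_is_normal(uint64_t seq)
{
	return seq >= ((1ULL << 33) + 0x400);
}

/* mirrors fld_rrb_hash()/fld_rrb_scan(): special sequences always go to MDT0 */
static unsigned int rrb_target(uint64_t seq, unsigned int target_count)
{
	if (!seq_is_normal(seq) || target_count == 0)
		return 0;
	return (unsigned int)(seq % target_count);
}

int main(void)
{
	printf("seq 0x200000401 -> MDT%u (of 4)\n", rrb_target(0x200000401ULL, 4));
	printf("seq 0x2         -> MDT%u (of 4)\n", rrb_target(0x2ULL, 4));
	return 0;
}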
if (fld->lcf_proc_dir) { + if (!IS_ERR(fld->lcf_proc_dir)) + lprocfs_remove(&fld->lcf_proc_dir); + fld->lcf_proc_dir = NULL; + } +} +#else +static int fld_client_proc_init(struct lu_client_fld *fld) +{ + return 0; +} + +void fld_client_proc_fini(struct lu_client_fld *fld) +{ +} +#endif +EXPORT_SYMBOL(fld_client_proc_fini); + +static inline int hash_is_sane(int hash) +{ + return (hash >= 0 && hash < ARRAY_SIZE(fld_hash)); +} + +int fld_client_init(struct lu_client_fld *fld, + const char *prefix, int hash) +{ + int cache_size, cache_threshold; + int rc; + + LASSERT(fld != NULL); + + snprintf(fld->lcf_name, sizeof(fld->lcf_name), + "cli-%s", prefix); + + if (!hash_is_sane(hash)) { + CERROR("%s: Wrong hash function %#x\n", + fld->lcf_name, hash); + return -EINVAL; + } + + fld->lcf_count = 0; + spin_lock_init(&fld->lcf_lock); + fld->lcf_hash = &fld_hash[hash]; + fld->lcf_flags = LUSTRE_FLD_INIT; + INIT_LIST_HEAD(&fld->lcf_targets); + + cache_size = FLD_CLIENT_CACHE_SIZE / + sizeof(struct fld_cache_entry); + + cache_threshold = cache_size * + FLD_CLIENT_CACHE_THRESHOLD / 100; + + fld->lcf_cache = fld_cache_init(fld->lcf_name, + cache_size, cache_threshold); + if (IS_ERR(fld->lcf_cache)) { + rc = PTR_ERR(fld->lcf_cache); + fld->lcf_cache = NULL; + goto out; + } + + rc = fld_client_proc_init(fld); + if (rc) + goto out; +out: + if (rc) + fld_client_fini(fld); + else + CDEBUG(D_INFO, "%s: Using \"%s\" hash\n", + fld->lcf_name, fld->lcf_hash->fh_name); + return rc; +} +EXPORT_SYMBOL(fld_client_init); + +void fld_client_fini(struct lu_client_fld *fld) +{ + struct lu_fld_target *target, *tmp; + + spin_lock(&fld->lcf_lock); + list_for_each_entry_safe(target, tmp, + &fld->lcf_targets, ft_chain) { + fld->lcf_count--; + list_del(&target->ft_chain); + if (target->ft_exp != NULL) + class_export_put(target->ft_exp); + OBD_FREE_PTR(target); + } + spin_unlock(&fld->lcf_lock); + + if (fld->lcf_cache != NULL) { + if (!IS_ERR(fld->lcf_cache)) + fld_cache_fini(fld->lcf_cache); + fld->lcf_cache = NULL; + } +} +EXPORT_SYMBOL(fld_client_fini); + +int fld_client_rpc(struct obd_export *exp, + struct lu_seq_range *range, __u32 fld_op) +{ + struct ptlrpc_request *req; + struct lu_seq_range *prange; + __u32 *op; + int rc; + struct obd_import *imp; + + LASSERT(exp != NULL); + + imp = class_exp2cliimp(exp); + req = ptlrpc_request_alloc_pack(imp, &RQF_FLD_QUERY, LUSTRE_MDS_VERSION, + FLD_QUERY); + if (req == NULL) + return -ENOMEM; + + op = req_capsule_client_get(&req->rq_pill, &RMF_FLD_OPC); + *op = fld_op; + + prange = req_capsule_client_get(&req->rq_pill, &RMF_FLD_MDFLD); + *prange = *range; + + ptlrpc_request_set_replen(req); + req->rq_request_portal = FLD_REQUEST_PORTAL; + req->rq_reply_portal = MDC_REPLY_PORTAL; + ptlrpc_at_set_req_timeout(req); + + if (fld_op == FLD_LOOKUP && + imp->imp_connect_flags_orig & OBD_CONNECT_MDS_MDS) + req->rq_allow_replay = 1; + + if (fld_op != FLD_LOOKUP) + mdc_get_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL); + fld_enter_request(&exp->exp_obd->u.cli); + rc = ptlrpc_queue_wait(req); + fld_exit_request(&exp->exp_obd->u.cli); + if (fld_op != FLD_LOOKUP) + mdc_put_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL); + if (rc) + goto out_req; + + prange = req_capsule_server_get(&req->rq_pill, &RMF_FLD_MDFLD); + if (prange == NULL) { + rc = -EFAULT; + goto out_req; + } + *range = *prange; +out_req: + ptlrpc_req_finished(req); + return rc; +} + +int fld_client_lookup(struct lu_client_fld *fld, u64 seq, u32 *mds, + __u32 flags, const struct lu_env *env) +{ + struct lu_seq_range res = { 0 }; + 
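fld_client_init() above sizes the client cache by dividing the 1 MiB FLD_CLIENT_CACHE_SIZE budget by the size of one cache entry and then taking 10% of that as the shrink threshold. The same arithmetic, using stand-in structs whose layout only approximates the kernel ones, so the printed numbers are indicative rather than exact:

#include <stdint.h>
#include <stdio.h>

/* stand-ins, sized roughly like the kernel structs (two list heads + a range) */
struct list_head_ex { void *next, *prev; };
struct seq_range_ex { uint64_t start, end; uint32_t index, flags; };

struct cache_entry_ex {
	struct list_head_ex lru, list;
	struct seq_range_ex range;
};

int main(void)
{
	const unsigned long budget = 1 * 0x100000;   /* FLD_CLIENT_CACHE_SIZE: 1 MiB */
	unsigned long cache_size = budget / sizeof(struct cache_entry_ex);
	unsigned long threshold = cache_size * 10 / 100;   /* 10% shrink threshold */

	printf("entry size %zu -> %lu cached entries, shrink threshold %lu\n",
	       sizeof(struct cache_entry_ex), cache_size, threshold);
	return 0;
}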
struct lu_fld_target *target; + int rc; + + fld->lcf_flags |= LUSTRE_FLD_RUN; + + rc = fld_cache_lookup(fld->lcf_cache, seq, &res); + if (rc == 0) { + *mds = res.lsr_index; + return 0; + } + + /* Can not find it in the cache */ + target = fld_client_get_target(fld, seq); + LASSERT(target != NULL); + + CDEBUG(D_INFO, "%s: Lookup fld entry (seq: %#llx) on target %s (idx %llu)\n", + fld->lcf_name, seq, fld_target_name(target), target->ft_idx); + + res.lsr_start = seq; + fld_range_set_type(&res, flags); + rc = fld_client_rpc(target->ft_exp, &res, FLD_LOOKUP); + + if (rc == 0) { + *mds = res.lsr_index; + + fld_cache_insert(fld->lcf_cache, &res); + } + return rc; +} +EXPORT_SYMBOL(fld_client_lookup); + +void fld_client_flush(struct lu_client_fld *fld) +{ + fld_cache_flush(fld->lcf_cache); +} +EXPORT_SYMBOL(fld_client_flush); + +static int __init fld_mod_init(void) +{ + fld_type_proc_dir = lprocfs_register(LUSTRE_FLD_NAME, + proc_lustre_root, + NULL, NULL); + return PTR_ERR_OR_ZERO(fld_type_proc_dir); +} + +static void __exit fld_mod_exit(void) +{ + if (fld_type_proc_dir != NULL && !IS_ERR(fld_type_proc_dir)) { + lprocfs_remove(&fld_type_proc_dir); + fld_type_proc_dir = NULL; + } +} + +MODULE_AUTHOR("Sun Microsystems, Inc. "); +MODULE_DESCRIPTION("Lustre FLD"); +MODULE_LICENSE("GPL"); + +module_init(fld_mod_init) +module_exit(fld_mod_exit) diff --git a/kernel/drivers/staging/lustre/lustre/fld/lproc_fld.c b/kernel/drivers/staging/lustre/lustre/fld/lproc_fld.c new file mode 100644 index 000000000..f53fdcfae --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/fld/lproc_fld.c @@ -0,0 +1,172 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/fld/lproc_fld.c + * + * FLD (FIDs Location Database) + * + * Author: Yury Umanets + * Di Wang + */ + +#define DEBUG_SUBSYSTEM S_FLD + +#include "../../include/linux/libcfs/libcfs.h" +#include + +#include "../include/obd.h" +#include "../include/obd_class.h" +#include "../include/dt_object.h" +#include "../include/obd_support.h" +#include "../include/lustre_req_layout.h" +#include "../include/lustre_fld.h" +#include "../include/lustre_fid.h" +#include "fld_internal.h" + +static int +fld_proc_targets_seq_show(struct seq_file *m, void *unused) +{ + struct lu_client_fld *fld = (struct lu_client_fld *)m->private; + struct lu_fld_target *target; + + LASSERT(fld != NULL); + + spin_lock(&fld->lcf_lock); + list_for_each_entry(target, + &fld->lcf_targets, ft_chain) + seq_printf(m, "%s\n", fld_target_name(target)); + spin_unlock(&fld->lcf_lock); + + return 0; +} + +static int +fld_proc_hash_seq_show(struct seq_file *m, void *unused) +{ + struct lu_client_fld *fld = (struct lu_client_fld *)m->private; + + LASSERT(fld != NULL); + + spin_lock(&fld->lcf_lock); + seq_printf(m, "%s\n", fld->lcf_hash->fh_name); + spin_unlock(&fld->lcf_lock); + + return 0; +} + +static ssize_t +fld_proc_hash_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct lu_client_fld *fld; + struct lu_fld_hash *hash = NULL; + char fh_name[8]; + int i; + + if (count > sizeof(fh_name)) + return -ENAMETOOLONG; + + if (copy_from_user(fh_name, buffer, count) != 0) + return -EFAULT; + + fld = ((struct seq_file *)file->private_data)->private; + LASSERT(fld != NULL); + + for (i = 0; fld_hash[i].fh_name != NULL; i++) { + if (count != strlen(fld_hash[i].fh_name)) + continue; + + if (!strncmp(fld_hash[i].fh_name, fh_name, count)) { + hash = &fld_hash[i]; + break; + } + } + + if (hash != NULL) { + spin_lock(&fld->lcf_lock); + fld->lcf_hash = hash; + spin_unlock(&fld->lcf_lock); + + CDEBUG(D_INFO, "%s: Changed hash to \"%s\"\n", + fld->lcf_name, hash->fh_name); + } + + return count; +} + +static ssize_t +fld_proc_cache_flush_write(struct file *file, const char __user *buffer, + size_t count, loff_t *pos) +{ + struct lu_client_fld *fld = file->private_data; + + LASSERT(fld != NULL); + + fld_cache_flush(fld->lcf_cache); + + CDEBUG(D_INFO, "%s: Lookup cache is flushed\n", fld->lcf_name); + + return count; +} + +static int fld_proc_cache_flush_open(struct inode *inode, struct file *file) +{ + file->private_data = PDE_DATA(inode); + return 0; +} + +static int fld_proc_cache_flush_release(struct inode *inode, struct file *file) +{ + file->private_data = NULL; + return 0; +} + +static struct file_operations fld_proc_cache_flush_fops = { + .owner = THIS_MODULE, + .open = fld_proc_cache_flush_open, + .write = fld_proc_cache_flush_write, + .release = fld_proc_cache_flush_release, +}; + +LPROC_SEQ_FOPS_RO(fld_proc_targets); +LPROC_SEQ_FOPS(fld_proc_hash); + +struct lprocfs_vars fld_client_proc_list[] = { + { "targets", &fld_proc_targets_fops }, + { "hash", &fld_proc_hash_fops }, + { "cache_flush", &fld_proc_cache_flush_fops }, + { NULL } +}; diff --git a/kernel/drivers/staging/lustre/lustre/include/cl_object.h b/kernel/drivers/staging/lustre/lustre/include/cl_object.h new file mode 100644 index 000000000..d56c8bea8 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/cl_object.h @@ -0,0 +1,3287 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +#ifndef _LUSTRE_CL_OBJECT_H +#define _LUSTRE_CL_OBJECT_H + +/** \defgroup clio clio + * + * Client objects implement io operations and cache pages. + * + * Examples: lov and osc are implementations of cl interface. + * + * Big Theory Statement. + * + * Layered objects. + * + * Client implementation is based on the following data-types: + * + * - cl_object + * + * - cl_page + * + * - cl_lock represents an extent lock on an object. + * + * - cl_io represents high-level i/o activity such as whole read/write + * system call, or write-out of pages from under the lock being + * canceled. cl_io has sub-ios that can be stopped and resumed + * independently, thus achieving high degree of transfer + * parallelism. Single cl_io can be advanced forward by + * the multiple threads (although in the most usual case of + * read/write system call it is associated with the single user + * thread, that issued the system call). + * + * - cl_req represents a collection of pages for a transfer. cl_req is + * constructed by req-forming engine that tries to saturate + * transport with large and continuous transfers. + * + * Terminology + * + * - to avoid confusion high-level I/O operation like read or write system + * call is referred to as "an io", whereas low-level I/O operation, like + * RPC, is referred to as "a transfer" + * + * - "generic code" means generic (not file system specific) code in the + * hosting environment. "cl-code" means code (mostly in cl_*.c files) that + * is not layer specific. + * + * Locking. + * + * - i_mutex + * - PG_locked + * - cl_object_header::coh_page_guard + * - cl_object_header::coh_lock_guard + * - lu_site::ls_guard + * + * See the top comment in cl_object.c for the description of overall locking and + * reference-counting design. + * + * See comments below for the description of i/o, page, and dlm-locking + * design. + * + * @{ + */ + +/* + * super-class definitions. 
+ */ +#include "lu_object.h" +#include "linux/lustre_compat25.h" +#include +#include + +struct inode; + +struct cl_device; +struct cl_device_operations; + +struct cl_object; +struct cl_object_page_operations; +struct cl_object_lock_operations; + +struct cl_page; +struct cl_page_slice; +struct cl_lock; +struct cl_lock_slice; + +struct cl_lock_operations; +struct cl_page_operations; + +struct cl_io; +struct cl_io_slice; + +struct cl_req; +struct cl_req_slice; + +/** + * Operations for each data device in the client stack. + * + * \see vvp_cl_ops, lov_cl_ops, lovsub_cl_ops, osc_cl_ops + */ +struct cl_device_operations { + /** + * Initialize cl_req. This method is called top-to-bottom on all + * devices in the stack to get them a chance to allocate layer-private + * data, and to attach them to the cl_req by calling + * cl_req_slice_add(). + * + * \see osc_req_init(), lov_req_init(), lovsub_req_init() + * \see ccc_req_init() + */ + int (*cdo_req_init)(const struct lu_env *env, struct cl_device *dev, + struct cl_req *req); +}; + +/** + * Device in the client stack. + * + * \see ccc_device, lov_device, lovsub_device, osc_device + */ +struct cl_device { + /** Super-class. */ + struct lu_device cd_lu_dev; + /** Per-layer operation vector. */ + const struct cl_device_operations *cd_ops; +}; + +/** \addtogroup cl_object cl_object + * @{ */ +/** + * "Data attributes" of cl_object. Data attributes can be updated + * independently for a sub-object, and top-object's attributes are calculated + * from sub-objects' ones. + */ +struct cl_attr { + /** Object size, in bytes */ + loff_t cat_size; + /** + * Known minimal size, in bytes. + * + * This is only valid when at least one DLM lock is held. + */ + loff_t cat_kms; + /** Modification time. Measured in seconds since epoch. */ + time_t cat_mtime; + /** Access time. Measured in seconds since epoch. */ + time_t cat_atime; + /** Change time. Measured in seconds since epoch. */ + time_t cat_ctime; + /** + * Blocks allocated to this cl_object on the server file system. + * + * \todo XXX An interface for block size is needed. + */ + __u64 cat_blocks; + /** + * User identifier for quota purposes. + */ + uid_t cat_uid; + /** + * Group identifier for quota purposes. + */ + gid_t cat_gid; +}; + +/** + * Fields in cl_attr that are being set. + */ +enum cl_attr_valid { + CAT_SIZE = 1 << 0, + CAT_KMS = 1 << 1, + CAT_MTIME = 1 << 3, + CAT_ATIME = 1 << 4, + CAT_CTIME = 1 << 5, + CAT_BLOCKS = 1 << 6, + CAT_UID = 1 << 7, + CAT_GID = 1 << 8 +}; + +/** + * Sub-class of lu_object with methods common for objects on the client + * stacks. + * + * cl_object: represents a regular file system object, both a file and a + * stripe. cl_object is based on lu_object: it is identified by a fid, + * layered, cached, hashed, and lrued. Important distinction with the server + * side, where md_object and dt_object are used, is that cl_object "fans out" + * at the lov/sns level: depending on the file layout, single file is + * represented as a set of "sub-objects" (stripes). At the implementation + * level, struct lov_object contains an array of cl_objects. Each sub-object + * is a full-fledged cl_object, having its fid, living in the lru and hash + * table. + * + * This leads to the next important difference with the server side: on the + * client, it's quite usual to have objects with the different sequence of + * layers. 
For example, typical top-object is composed of the following + * layers: + * + * - vvp + * - lov + * + * whereas its sub-objects are composed of + * + * - lovsub + * - osc + * + * layers. Here "lovsub" is a mostly dummy layer, whose purpose is to keep + * track of the object-subobject relationship. + * + * Sub-objects are not cached independently: when top-object is about to + * be discarded from the memory, all its sub-objects are torn-down and + * destroyed too. + * + * \see ccc_object, lov_object, lovsub_object, osc_object + */ +struct cl_object { + /** super class */ + struct lu_object co_lu; + /** per-object-layer operations */ + const struct cl_object_operations *co_ops; + /** offset of page slice in cl_page buffer */ + int co_slice_off; +}; + +/** + * Description of the client object configuration. This is used for the + * creation of a new client object that is identified by a more state than + * fid. + */ +struct cl_object_conf { + /** Super-class. */ + struct lu_object_conf coc_lu; + union { + /** + * Object layout. This is consumed by lov. + */ + struct lustre_md *coc_md; + /** + * Description of particular stripe location in the + * cluster. This is consumed by osc. + */ + struct lov_oinfo *coc_oinfo; + } u; + /** + * VFS inode. This is consumed by vvp. + */ + struct inode *coc_inode; + /** + * Layout lock handle. + */ + struct ldlm_lock *coc_lock; + /** + * Operation to handle layout, OBJECT_CONF_XYZ. + */ + int coc_opc; +}; + +enum { + /** configure layout, set up a new stripe, must be called while + * holding layout lock. */ + OBJECT_CONF_SET = 0, + /** invalidate the current stripe configuration due to losing + * layout lock. */ + OBJECT_CONF_INVALIDATE = 1, + /** wait for old layout to go away so that new layout can be + * set up. */ + OBJECT_CONF_WAIT = 2 +}; + +/** + * Operations implemented for each cl object layer. + * + * \see vvp_ops, lov_ops, lovsub_ops, osc_ops + */ +struct cl_object_operations { + /** + * Initialize page slice for this layer. Called top-to-bottom through + * every object layer when a new cl_page is instantiated. Layer + * keeping private per-page data, or requiring its own page operations + * vector should allocate these data here, and attach then to the page + * by calling cl_page_slice_add(). \a vmpage is locked (in the VM + * sense). Optional. + * + * \retval NULL success. + * + * \retval ERR_PTR(errno) failure code. + * + * \retval valid-pointer pointer to already existing referenced page + * to be used instead of newly created. + */ + int (*coo_page_init)(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, struct page *vmpage); + /** + * Initialize lock slice for this layer. Called top-to-bottom through + * every object layer when a new cl_lock is instantiated. Layer + * keeping private per-lock data, or requiring its own lock operations + * vector should allocate these data here, and attach then to the lock + * by calling cl_lock_slice_add(). Mandatory. + */ + int (*coo_lock_init)(const struct lu_env *env, + struct cl_object *obj, struct cl_lock *lock, + const struct cl_io *io); + /** + * Initialize io state for a given layer. + * + * called top-to-bottom once per io existence to initialize io + * state. If layer wants to keep some state for this type of io, it + * has to embed struct cl_io_slice in lu_env::le_ses, and register + * slice with cl_io_slice_add(). It is guaranteed that all threads + * participating in this io share the same session. 
+ */ + int (*coo_io_init)(const struct lu_env *env, + struct cl_object *obj, struct cl_io *io); + /** + * Fill portion of \a attr that this layer controls. This method is + * called top-to-bottom through all object layers. + * + * \pre cl_object_header::coh_attr_guard of the top-object is locked. + * + * \return 0: to continue + * \return +ve: to stop iterating through layers (but 0 is returned + * from enclosing cl_object_attr_get()) + * \return -ve: to signal error + */ + int (*coo_attr_get)(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr); + /** + * Update attributes. + * + * \a valid is a bitmask composed from enum #cl_attr_valid, and + * indicating what attributes are to be set. + * + * \pre cl_object_header::coh_attr_guard of the top-object is locked. + * + * \return the same convention as for + * cl_object_operations::coo_attr_get() is used. + */ + int (*coo_attr_set)(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid); + /** + * Update object configuration. Called top-to-bottom to modify object + * configuration. + * + * XXX error conditions and handling. + */ + int (*coo_conf_set)(const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf); + /** + * Glimpse ast. Executed when glimpse ast arrives for a lock on this + * object. Layers are supposed to fill parts of \a lvb that will be + * shipped to the glimpse originator as a glimpse result. + * + * \see ccc_object_glimpse(), lovsub_object_glimpse(), + * \see osc_object_glimpse() + */ + int (*coo_glimpse)(const struct lu_env *env, + const struct cl_object *obj, struct ost_lvb *lvb); +}; + +/** + * Extended header for client object. + */ +struct cl_object_header { + /** Standard lu_object_header. cl_object::co_lu::lo_header points + * here. */ + struct lu_object_header coh_lu; + /** \name locks + * \todo XXX move locks below to the separate cache-lines, they are + * mostly useless otherwise. + */ + /** @{ */ + /** Lock protecting page tree. */ + spinlock_t coh_page_guard; + /** Lock protecting lock list. */ + spinlock_t coh_lock_guard; + /** @} locks */ + /** Radix tree of cl_page's, cached for this object. */ + struct radix_tree_root coh_tree; + /** # of pages in radix tree. */ + unsigned long coh_pages; + /** List of cl_lock's granted for this object. */ + struct list_head coh_locks; + + /** + * Parent object. It is assumed that an object has a well-defined + * parent, but not a well-defined child (there may be multiple + * sub-objects, for the same top-object). cl_object_header::coh_parent + * field allows certain code to be written generically, without + * limiting possible cl_object layouts unduly. + */ + struct cl_object_header *coh_parent; + /** + * Protects consistency between cl_attr of parent object and + * attributes of sub-objects, that the former is calculated ("merged") + * from. + * + * \todo XXX this can be read/write lock if needed. + */ + spinlock_t coh_attr_guard; + /** + * Size of cl_page + page slices + */ + unsigned short coh_page_bufsize; + /** + * Number of objects above this one: 0 for a top-object, 1 for its + * sub-object, etc. + */ + unsigned char coh_nesting; +}; + +/** + * Helper macro: iterate over all layers of the object \a obj, assigning every + * layer top-to-bottom to \a slice. 
+ */ +#define cl_object_for_each(slice, obj) \ + list_for_each_entry((slice), \ + &(obj)->co_lu.lo_header->loh_layers, \ + co_lu.lo_linkage) +/** + * Helper macro: iterate over all layers of the object \a obj, assigning every + * layer bottom-to-top to \a slice. + */ +#define cl_object_for_each_reverse(slice, obj) \ + list_for_each_entry_reverse((slice), \ + &(obj)->co_lu.lo_header->loh_layers, \ + co_lu.lo_linkage) +/** @} cl_object */ + +#ifndef pgoff_t +#define pgoff_t unsigned long +#endif + +#define CL_PAGE_EOF ((pgoff_t)~0ull) + +/** \addtogroup cl_page cl_page + * @{ */ + +/** \struct cl_page + * Layered client page. + * + * cl_page: represents a portion of a file, cached in the memory. All pages + * of the given file are of the same size, and are kept in the radix tree + * hanging off the cl_object. cl_page doesn't fan out, but as sub-objects + * of the top-level file object are first class cl_objects, they have their + * own radix trees of pages and hence page is implemented as a sequence of + * struct cl_pages's, linked into double-linked list through + * cl_page::cp_parent and cl_page::cp_child pointers, each residing in the + * corresponding radix tree at the corresponding logical offset. + * + * cl_page is associated with VM page of the hosting environment (struct + * page in Linux kernel, for example), struct page. It is assumed, that this + * association is implemented by one of cl_page layers (top layer in the + * current design) that + * + * - intercepts per-VM-page call-backs made by the environment (e.g., + * memory pressure), + * + * - translates state (page flag bits) and locking between lustre and + * environment. + * + * The association between cl_page and struct page is immutable and + * established when cl_page is created. + * + * cl_page can be "owned" by a particular cl_io (see below), guaranteeing + * this io an exclusive access to this page w.r.t. other io attempts and + * various events changing page state (such as transfer completion, or + * eviction of the page from the memory). Note, that in general cl_io + * cannot be identified with a particular thread, and page ownership is not + * exactly equal to the current thread holding a lock on the page. Layer + * implementing association between cl_page and struct page has to implement + * ownership on top of available synchronization mechanisms. + * + * While lustre client maintains the notion of an page ownership by io, + * hosting MM/VM usually has its own page concurrency control + * mechanisms. For example, in Linux, page access is synchronized by the + * per-page PG_locked bit-lock, and generic kernel code (generic_file_*()) + * takes care to acquire and release such locks as necessary around the + * calls to the file system methods (->readpage(), ->prepare_write(), + * ->commit_write(), etc.). This leads to the situation when there are two + * different ways to own a page in the client: + * + * - client code explicitly and voluntary owns the page (cl_page_own()); + * + * - VM locks a page and then calls the client, that has "to assume" + * the ownership from the VM (cl_page_assume()). + * + * Dual methods to release ownership are cl_page_disown() and + * cl_page_unassume(). + * + * cl_page is reference counted (cl_page::cp_ref). When reference counter + * drops to 0, the page is returned to the cache, unless it is in + * cl_page_state::CPS_FREEING state, in which case it is immediately + * destroyed. 
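The reference-counting rule described above (a page whose last reference is dropped goes back to the cache, unless it is already in cl_page_state::CPS_FREEING, in which case it is destroyed at once) can be illustrated with a small, self-contained userspace sketch. The toy_page type and the toy_get()/toy_put() helpers below are hypothetical stand-ins for cl_page::cp_ref and cl_page::cp_state; they are not the actual cl_page_get()/cl_page_put() code and they ignore the VM-page-lock protection the real implementation relies on.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

enum toy_page_state { TOY_CACHED, TOY_OWNED, TOY_FREEING };

struct toy_page {
	atomic_int ref;              /* stands in for cl_page::cp_ref */
	enum toy_page_state state;   /* stands in for cl_page::cp_state */
};

static bool toy_get(struct toy_page *pg)
{
	/* No new references once the page has entered the FREEING state. */
	if (pg->state == TOY_FREEING)
		return false;
	atomic_fetch_add(&pg->ref, 1);
	return true;
}

static void toy_put(struct toy_page *pg)
{
	/* atomic_fetch_sub() returns the previous value: 1 means last ref. */
	if (atomic_fetch_sub(&pg->ref, 1) != 1)
		return;
	if (pg->state == TOY_FREEING) {
		free(pg);            /* last reference of a doomed page */
		return;
	}
	pg->state = TOY_CACHED;      /* back to the cache, un-owned */
}

int main(void)
{
	struct toy_page *pg = malloc(sizeof(*pg));

	atomic_init(&pg->ref, 1);
	pg->state = TOY_OWNED;
	toy_put(pg);                 /* last put: OWNED -> CACHED */
	printf("state after last put: %d (0 == TOY_CACHED)\n", pg->state);

	pg->state = TOY_FREEING;     /* doomed page: further gets must fail */
	printf("get on FREEING page: %d\n", toy_get(pg));
	free(pg);
	return 0;
}

In the real code the state field is changed only under the VM page lock and the cache itself keeps the page alive; the sketch compresses all of that into a single thread purely to show the put-time decision.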
+ * + * The general logic guaranteeing the absence of "existential races" for + * pages is the following: + * + * - there are fixed known ways for a thread to obtain a new reference + * to a page: + * + * - by doing a lookup in the cl_object radix tree, protected by the + * spin-lock; + * + * - by starting from VM-locked struct page and following some + * hosting environment method (e.g., following ->private pointer in + * the case of Linux kernel), see cl_vmpage_page(); + * + * - when the page enters cl_page_state::CPS_FREEING state, all these + * ways are severed with the proper synchronization + * (cl_page_delete()); + * + * - entry into cl_page_state::CPS_FREEING is serialized by the VM page + * lock; + * + * - no new references to the page in cl_page_state::CPS_FREEING state + * are allowed (checked in cl_page_get()). + * + * Together this guarantees that when last reference to a + * cl_page_state::CPS_FREEING page is released, it is safe to destroy the + * page, as neither references to it can be acquired at that point, nor + * ones exist. + * + * cl_page is a state machine. States are enumerated in enum + * cl_page_state. Possible state transitions are enumerated in + * cl_page_state_set(). State transition process (i.e., actual changing of + * cl_page::cp_state field) is protected by the lock on the underlying VM + * page. + * + * Linux Kernel implementation. + * + * Binding between cl_page and struct page (which is a typedef for + * struct page) is implemented in the vvp layer. cl_page is attached to the + * ->private pointer of the struct page, together with the setting of + * PG_private bit in page->flags, and acquiring additional reference on the + * struct page (much like struct buffer_head, or any similar file system + * private data structures). + * + * PG_locked lock is used to implement both ownership and transfer + * synchronization, that is, page is VM-locked in CPS_{OWNED,PAGE{IN,OUT}} + * states. No additional references are acquired for the duration of the + * transfer. + * + * \warning *THIS IS NOT* the behavior expected by the Linux kernel, where + * write-out is "protected" by the special PG_writeback bit. + */ + +/** + * States of cl_page. cl_page.c assumes particular order here. + * + * The page state machine is rather crude, as it doesn't recognize finer page + * states like "dirty" or "up to date". This is because such states are not + * always well defined for the whole stack (see, for example, the + * implementation of the read-ahead, that hides page up-to-dateness to track + * cache hits accurately). Such sub-states are maintained by the layers that + * are interested in them. + */ +enum cl_page_state { + /** + * Page is in the cache, un-owned. Page leaves cached state in the + * following cases: + * + * - [cl_page_state::CPS_OWNED] io comes across the page and + * owns it; + * + * - [cl_page_state::CPS_PAGEOUT] page is dirty, the + * req-formation engine decides that it wants to include this page + * into an cl_req being constructed, and yanks it from the cache; + * + * - [cl_page_state::CPS_FREEING] VM callback is executed to + * evict the page form the memory; + * + * \invariant cl_page::cp_owner == NULL && cl_page::cp_req == NULL + */ + CPS_CACHED, + /** + * Page is exclusively owned by some cl_io. 
Page may end up in this + * state as a result of + * + * - io creating new page and immediately owning it; + * + * - [cl_page_state::CPS_CACHED] io finding existing cached page + * and owning it; + * + * - [cl_page_state::CPS_OWNED] io finding existing owned page + * and waiting for owner to release the page; + * + * Page leaves owned state in the following cases: + * + * - [cl_page_state::CPS_CACHED] io decides to leave the page in + * the cache, doing nothing; + * + * - [cl_page_state::CPS_PAGEIN] io starts read transfer for + * this page; + * + * - [cl_page_state::CPS_PAGEOUT] io starts immediate write + * transfer for this page; + * + * - [cl_page_state::CPS_FREEING] io decides to destroy this + * page (e.g., as part of truncate or extent lock cancellation). + * + * \invariant cl_page::cp_owner != NULL && cl_page::cp_req == NULL + */ + CPS_OWNED, + /** + * Page is being written out, as a part of a transfer. This state is + * entered when req-formation logic decided that it wants this page to + * be sent through the wire _now_. Specifically, it means that once + * this state is achieved, transfer completion handler (with either + * success or failure indication) is guaranteed to be executed against + * this page independently of any locks and any scheduling decisions + * made by the hosting environment (that effectively means that the + * page is never put into cl_page_state::CPS_PAGEOUT state "in + * advance". This property is mentioned, because it is important when + * reasoning about possible dead-locks in the system). The page can + * enter this state as a result of + * + * - [cl_page_state::CPS_OWNED] an io requesting an immediate + * write-out of this page, or + * + * - [cl_page_state::CPS_CACHED] req-forming engine deciding + * that it has enough dirty pages cached to issue a "good" + * transfer. + * + * The page leaves cl_page_state::CPS_PAGEOUT state when the transfer + * is completed---it is moved into cl_page_state::CPS_CACHED state. + * + * Underlying VM page is locked for the duration of transfer. + * + * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req != NULL + */ + CPS_PAGEOUT, + /** + * Page is being read in, as a part of a transfer. This is quite + * similar to the cl_page_state::CPS_PAGEOUT state, except that + * read-in is always "immediate"---there is no such thing a sudden + * construction of read cl_req from cached, presumably not up to date, + * pages. + * + * Underlying VM page is locked for the duration of transfer. + * + * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req != NULL + */ + CPS_PAGEIN, + /** + * Page is being destroyed. This state is entered when client decides + * that page has to be deleted from its host object, as, e.g., a part + * of truncate. + * + * Once this state is reached, there is no way to escape it. + * + * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req == NULL + */ + CPS_FREEING, + CPS_NR +}; + +enum cl_page_type { + /** Host page, the page is from the host inode which the cl_page + * belongs to. */ + CPT_CACHEABLE = 1, + + /** Transient page, the transient cl_page is used to bind a cl_page + * to vmpage which is not belonging to the same object of cl_page. + * it is used in DirectIO, lockless IO and liblustre. */ + CPT_TRANSIENT, +}; + +/** + * Flags maintained for every cl_page. + */ +enum cl_page_flags { + /** + * Set when pagein completes. Used for debugging (read completes at + * most once for a page). 
+ */ + CPF_READ_COMPLETED = 1 << 0 +}; + +/** + * Fields are protected by the lock on struct page, except for atomics and + * immutables. + * + * \invariant Data type invariants are in cl_page_invariant(). Basically: + * cl_page::cp_parent and cl_page::cp_child are a well-formed double-linked + * list, consistent with the parent/child pointers in the cl_page::cp_obj and + * cl_page::cp_owner (when set). + */ +struct cl_page { + /** Reference counter. */ + atomic_t cp_ref; + /** An object this page is a part of. Immutable after creation. */ + struct cl_object *cp_obj; + /** Logical page index within the object. Immutable after creation. */ + pgoff_t cp_index; + /** List of slices. Immutable after creation. */ + struct list_head cp_layers; + /** Parent page, NULL for top-level page. Immutable after creation. */ + struct cl_page *cp_parent; + /** Lower-layer page. NULL for bottommost page. Immutable after + * creation. */ + struct cl_page *cp_child; + /** + * Page state. This field is const to avoid accidental update, it is + * modified only internally within cl_page.c. Protected by a VM lock. + */ + const enum cl_page_state cp_state; + /** Linkage of pages within group. Protected by cl_page::cp_mutex. */ + struct list_head cp_batch; + /** Mutex serializing membership of a page in a batch. */ + struct mutex cp_mutex; + /** Linkage of pages within cl_req. */ + struct list_head cp_flight; + /** Transfer error. */ + int cp_error; + + /** + * Page type. Only CPT_TRANSIENT is used so far. Immutable after + * creation. + */ + enum cl_page_type cp_type; + + /** + * Owning IO in cl_page_state::CPS_OWNED state. Sub-page can be owned + * by sub-io. Protected by a VM lock. + */ + struct cl_io *cp_owner; + /** + * Debug information, the task is owning the page. + */ + struct task_struct *cp_task; + /** + * Owning IO request in cl_page_state::CPS_PAGEOUT and + * cl_page_state::CPS_PAGEIN states. This field is maintained only in + * the top-level pages. Protected by a VM lock. + */ + struct cl_req *cp_req; + /** List of references to this page, for debugging. */ + struct lu_ref cp_reference; + /** Link to an object, for debugging. */ + struct lu_ref_link cp_obj_ref; + /** Link to a queue, for debugging. */ + struct lu_ref_link cp_queue_ref; + /** Per-page flags from enum cl_page_flags. Protected by a VM lock. */ + unsigned cp_flags; + /** Assigned if doing a sync_io */ + struct cl_sync_io *cp_sync_io; +}; + +/** + * Per-layer part of cl_page. + * + * \see ccc_page, lov_page, osc_page + */ +struct cl_page_slice { + struct cl_page *cpl_page; + /** + * Object slice corresponding to this page slice. Immutable after + * creation. + */ + struct cl_object *cpl_obj; + const struct cl_page_operations *cpl_ops; + /** Linkage into cl_page::cp_layers. Immutable after creation. */ + struct list_head cpl_linkage; +}; + +/** + * Lock mode. For the client extent locks. + * + * \warning: cl_lock_mode_match() assumes particular ordering here. + * \ingroup cl_lock + */ +enum cl_lock_mode { + /** + * Mode of a lock that protects no data, and exists only as a + * placeholder. This is used for `glimpse' requests. A phantom lock + * might get promoted to real lock at some point. + */ + CLM_PHANTOM, + CLM_READ, + CLM_WRITE, + CLM_GROUP +}; + +/** + * Requested transfer type. + * \ingroup cl_req + */ +enum cl_req_type { + CRT_READ, + CRT_WRITE, + CRT_NR +}; + +/** + * Per-layer page operations. + * + * Methods taking an \a io argument are for the activity happening in the + * context of given \a io. 
Page is assumed to be owned by that io, except for + * the obvious cases (like cl_page_operations::cpo_own()). + * + * \see vvp_page_ops, lov_page_ops, osc_page_ops + */ +struct cl_page_operations { + /** + * cl_page<->struct page methods. Only one layer in the stack has to + * implement these. Current code assumes that this functionality is + * provided by the topmost layer, see cl_page_disown0() as an example. + */ + + /** + * \return the underlying VM page. Optional. + */ + struct page *(*cpo_vmpage)(const struct lu_env *env, + const struct cl_page_slice *slice); + /** + * Called when \a io acquires this page into the exclusive + * ownership. When this method returns, it is guaranteed that the is + * not owned by other io, and no transfer is going on against + * it. Optional. + * + * \see cl_page_own() + * \see vvp_page_own(), lov_page_own() + */ + int (*cpo_own)(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io, int nonblock); + /** Called when ownership it yielded. Optional. + * + * \see cl_page_disown() + * \see vvp_page_disown() + */ + void (*cpo_disown)(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *io); + /** + * Called for a page that is already "owned" by \a io from VM point of + * view. Optional. + * + * \see cl_page_assume() + * \see vvp_page_assume(), lov_page_assume() + */ + void (*cpo_assume)(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *io); + /** Dual to cl_page_operations::cpo_assume(). Optional. Called + * bottom-to-top when IO releases a page without actually unlocking + * it. + * + * \see cl_page_unassume() + * \see vvp_page_unassume() + */ + void (*cpo_unassume)(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); + /** + * Announces whether the page contains valid data or not by \a uptodate. + * + * \see cl_page_export() + * \see vvp_page_export() + */ + void (*cpo_export)(const struct lu_env *env, + const struct cl_page_slice *slice, int uptodate); + /** + * Unmaps page from the user space (if it is mapped). + * + * \see cl_page_unmap() + * \see vvp_page_unmap() + */ + int (*cpo_unmap)(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *io); + /** + * Checks whether underlying VM page is locked (in the suitable + * sense). Used for assertions. + * + * \retval -EBUSY: page is protected by a lock of a given mode; + * \retval -ENODATA: page is not protected by a lock; + * \retval 0: this layer cannot decide. (Should never happen.) + */ + int (*cpo_is_vmlocked)(const struct lu_env *env, + const struct cl_page_slice *slice); + /** + * Page destruction. + */ + + /** + * Called when page is truncated from the object. Optional. + * + * \see cl_page_discard() + * \see vvp_page_discard(), osc_page_discard() + */ + void (*cpo_discard)(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); + /** + * Called when page is removed from the cache, and is about to being + * destroyed. Optional. + * + * \see cl_page_delete() + * \see vvp_page_delete(), osc_page_delete() + */ + void (*cpo_delete)(const struct lu_env *env, + const struct cl_page_slice *slice); + /** Destructor. Frees resources and slice itself. */ + void (*cpo_fini)(const struct lu_env *env, + struct cl_page_slice *slice); + + /** + * Checks whether the page is protected by a cl_lock. 
This is a + * per-layer method, because certain layers have ways to check for the + * lock much more efficiently than through the generic locks scan, or + * implement locking mechanisms separate from cl_lock, e.g., + * LL_FILE_GROUP_LOCKED in vvp. If \a pending is true, check for locks + * being canceled, or scheduled for cancellation as soon as the last + * user goes away, too. + * + * \retval -EBUSY: page is protected by a lock of a given mode; + * \retval -ENODATA: page is not protected by a lock; + * \retval 0: this layer cannot decide. + * + * \see cl_page_is_under_lock() + */ + int (*cpo_is_under_lock)(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); + + /** + * Optional debugging helper. Prints given page slice. + * + * \see cl_page_print() + */ + int (*cpo_print)(const struct lu_env *env, + const struct cl_page_slice *slice, + void *cookie, lu_printer_t p); + /** + * \name transfer + * + * Transfer methods. See comment on cl_req for a description of + * transfer formation and life-cycle. + * + * @{ + */ + /** + * Request type dependent vector of operations. + * + * Transfer operations depend on transfer mode (cl_req_type). To avoid + * passing transfer mode to each and every of these methods, and to + * avoid branching on request type inside of the methods, separate + * methods for cl_req_type:CRT_READ and cl_req_type:CRT_WRITE are + * provided. That is, method invocation usually looks like + * + * slice->cp_ops.io[req->crq_type].cpo_method(env, slice, ...); + */ + struct { + /** + * Called when a page is submitted for a transfer as a part of + * cl_page_list. + * + * \return 0 : page is eligible for submission; + * \return -EALREADY : skip this page; + * \return -ve : error. + * + * \see cl_page_prep() + */ + int (*cpo_prep)(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); + /** + * Completion handler. This is guaranteed to be eventually + * fired after cl_page_operations::cpo_prep() or + * cl_page_operations::cpo_make_ready() call. + * + * This method can be called in a non-blocking context. It is + * guaranteed however, that the page involved and its object + * are pinned in memory (and, hence, calling cl_page_put() is + * safe). + * + * \see cl_page_completion() + */ + void (*cpo_completion)(const struct lu_env *env, + const struct cl_page_slice *slice, + int ioret); + /** + * Called when cached page is about to be added to the + * cl_req as a part of req formation. + * + * \return 0 : proceed with this page; + * \return -EAGAIN : skip this page; + * \return -ve : error. + * + * \see cl_page_make_ready() + */ + int (*cpo_make_ready)(const struct lu_env *env, + const struct cl_page_slice *slice); + /** + * Announce that this page is to be written out + * opportunistically, that is, page is dirty, it is not + * necessary to start write-out transfer right now, but + * eventually page has to be written out. + * + * Main caller of this is the write path (see + * vvp_io_commit_write()), using this method to build a + * "transfer cache" from which large transfers are then + * constructed by the req-formation engine. + * + * \todo XXX it would make sense to add page-age tracking + * semantics here, and to oblige the req-formation engine to + * send the page out not later than it is too old. 
+ * + * \see cl_page_cache_add() + */ + int (*cpo_cache_add)(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); + } io[CRT_NR]; + /** + * Tell transfer engine that only [to, from] part of a page should be + * transmitted. + * + * This is used for immediate transfers. + * + * \todo XXX this is not very good interface. It would be much better + * if all transfer parameters were supplied as arguments to + * cl_io_operations::cio_submit() call, but it is not clear how to do + * this for page queues. + * + * \see cl_page_clip() + */ + void (*cpo_clip)(const struct lu_env *env, + const struct cl_page_slice *slice, + int from, int to); + /** + * \pre the page was queued for transferring. + * \post page is removed from client's pending list, or -EBUSY + * is returned if it has already been in transferring. + * + * This is one of seldom page operation which is: + * 0. called from top level; + * 1. don't have vmpage locked; + * 2. every layer should synchronize execution of its ->cpo_cancel() + * with completion handlers. Osc uses client obd lock for this + * purpose. Based on there is no vvp_page_cancel and + * lov_page_cancel(), cpo_cancel is defacto protected by client lock. + * + * \see osc_page_cancel(). + */ + int (*cpo_cancel)(const struct lu_env *env, + const struct cl_page_slice *slice); + /** + * Write out a page by kernel. This is only called by ll_writepage + * right now. + * + * \see cl_page_flush() + */ + int (*cpo_flush)(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); + /** @} transfer */ +}; + +/** + * Helper macro, dumping detailed information about \a page into a log. + */ +#define CL_PAGE_DEBUG(mask, env, page, format, ...) \ +do { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \ + \ + if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \ + cl_page_print(env, &msgdata, lu_cdebug_printer, page); \ + CDEBUG(mask, format , ## __VA_ARGS__); \ + } \ +} while (0) + +/** + * Helper macro, dumping shorter information about \a page into a log. + */ +#define CL_PAGE_HEADER(mask, env, page, format, ...) \ +do { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \ + \ + if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \ + cl_page_header_print(env, &msgdata, lu_cdebug_printer, page); \ + CDEBUG(mask, format , ## __VA_ARGS__); \ + } \ +} while (0) + +static inline int __page_in_use(const struct cl_page *page, int refc) +{ + if (page->cp_type == CPT_CACHEABLE) + ++refc; + LASSERT(atomic_read(&page->cp_ref) > 0); + return (atomic_read(&page->cp_ref) > refc); +} +#define cl_page_in_use(pg) __page_in_use(pg, 1) +#define cl_page_in_use_noref(pg) __page_in_use(pg, 0) + +/** @} cl_page */ + +/** \addtogroup cl_lock cl_lock + * @{ */ +/** \struct cl_lock + * + * Extent locking on the client. + * + * LAYERING + * + * The locking model of the new client code is built around + * + * struct cl_lock + * + * data-type representing an extent lock on a regular file. cl_lock is a + * layered object (much like cl_object and cl_page), it consists of a header + * (struct cl_lock) and a list of layers (struct cl_lock_slice), linked to + * cl_lock::cll_layers list through cl_lock_slice::cls_linkage. + * + * All locks for a given object are linked into cl_object_header::coh_locks + * list (protected by cl_object_header::coh_lock_guard spin-lock) through + * cl_lock::cll_linkage. Currently this list is not sorted in any way. We can + * sort it in starting lock offset, or use altogether different data structure + * like a tree. 
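The per-object lock list mentioned above is walked linearly when looking for an existing lock that already covers a requested extent; sorting by start offset, or keeping a tree, would only change the cost of that walk. The sketch below is a standalone illustration under assumed toy types (toy_lock, toy_lock_match) of matching a page extent [cld_start, cld_end] with a compatible mode; it is not the actual cl_lock_lookup() or lock-matching code.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

typedef unsigned long pgoff_t;

enum toy_lock_mode { TOY_READ, TOY_WRITE };

struct toy_lock {
	pgoff_t start;               /* like cl_lock_descr::cld_start */
	pgoff_t end;                 /* inclusive, like cld_end */
	enum toy_lock_mode mode;     /* like cld_mode */
	struct toy_lock *next;       /* stands in for the coh_locks linkage */
};

static bool mode_covers(enum toy_lock_mode have, enum toy_lock_mode need)
{
	/* A write lock satisfies a read request, not the other way around. */
	return have == TOY_WRITE || need == TOY_READ;
}

static struct toy_lock *toy_lock_match(struct toy_lock *head,
				       pgoff_t start, pgoff_t end,
				       enum toy_lock_mode need)
{
	struct toy_lock *lk;

	/* Linear scan of the unsorted list: the first lock whose extent
	 * fully covers [start, end] with a compatible mode wins.  Sorting
	 * by start offset (or using a tree) would change the cost of this
	 * loop, not the matching rule. */
	for (lk = head; lk != NULL; lk = lk->next)
		if (lk->start <= start && end <= lk->end &&
		    mode_covers(lk->mode, need))
			return lk;
	return NULL;
}

int main(void)
{
	struct toy_lock l1 = { .start = 0,  .end = 15, .mode = TOY_READ };
	struct toy_lock l0 = { .start = 10, .end = 40, .mode = TOY_WRITE,
			       .next = &l1 };

	/* [12, 20] for read is covered by the write lock on [10, 40]. */
	printf("match: %s\n",
	       toy_lock_match(&l0, 12, 20, TOY_READ) ? "found" : "none");
	return 0;
}

Treating a write lock as covering read requests is only for illustration; the real matching additionally consults per-layer ->clo_fits_into() methods and the enqueue flags, as described further below.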
+ * + * Typical cl_lock consists of the two layers: + * + * - vvp_lock (vvp specific data), and + * - lov_lock (lov specific data). + * + * lov_lock contains an array of sub-locks. Each of these sub-locks is a + * normal cl_lock: it has a header (struct cl_lock) and a list of layers: + * + * - lovsub_lock, and + * - osc_lock + * + * Each sub-lock is associated with a cl_object (representing stripe + * sub-object or the file to which top-level cl_lock is associated to), and is + * linked into that cl_object::coh_locks. In this respect cl_lock is similar to + * cl_object (that at lov layer also fans out into multiple sub-objects), and + * is different from cl_page, that doesn't fan out (there is usually exactly + * one osc_page for every vvp_page). We shall call vvp-lov portion of the lock + * a "top-lock" and its lovsub-osc portion a "sub-lock". + * + * LIFE CYCLE + * + * cl_lock is reference counted. When reference counter drops to 0, lock is + * placed in the cache, except when lock is in CLS_FREEING state. CLS_FREEING + * lock is destroyed when last reference is released. Referencing between + * top-lock and its sub-locks is described in the lov documentation module. + * + * STATE MACHINE + * + * Also, cl_lock is a state machine. This requires some clarification. One of + * the goals of client IO re-write was to make IO path non-blocking, or at + * least to make it easier to make it non-blocking in the future. Here + * `non-blocking' means that when a system call (read, write, truncate) + * reaches a situation where it has to wait for a communication with the + * server, it should --instead of waiting-- remember its current state and + * switch to some other work. E.g,. instead of waiting for a lock enqueue, + * client should proceed doing IO on the next stripe, etc. Obviously this is + * rather radical redesign, and it is not planned to be fully implemented at + * this time, instead we are putting some infrastructure in place, that would + * make it easier to do asynchronous non-blocking IO easier in the + * future. Specifically, where old locking code goes to sleep (waiting for + * enqueue, for example), new code returns cl_lock_transition::CLO_WAIT. When + * enqueue reply comes, its completion handler signals that lock state-machine + * is ready to transit to the next state. There is some generic code in + * cl_lock.c that sleeps, waiting for these signals. As a result, for users of + * this cl_lock.c code, it looks like locking is done in normal blocking + * fashion, and it the same time it is possible to switch to the non-blocking + * locking (simply by returning cl_lock_transition::CLO_WAIT from cl_lock.c + * functions). + * + * For a description of state machine states and transitions see enum + * cl_lock_state. + * + * There are two ways to restrict a set of states which lock might move to: + * + * - placing a "hold" on a lock guarantees that lock will not be moved + * into cl_lock_state::CLS_FREEING state until hold is released. Hold + * can be only acquired on a lock that is not in + * cl_lock_state::CLS_FREEING. All holds on a lock are counted in + * cl_lock::cll_holds. Hold protects lock from cancellation and + * destruction. Requests to cancel and destroy a lock on hold will be + * recorded, but only honored when last hold on a lock is released; + * + * - placing a "user" on a lock guarantees that lock will not leave + * cl_lock_state::CLS_NEW, cl_lock_state::CLS_QUEUING, + * cl_lock_state::CLS_ENQUEUED and cl_lock_state::CLS_HELD set of + * states, once it enters this set. 
That is, if a user is added onto a + * lock in a state not from this set, it doesn't immediately enforce + * lock to move to this set, but once lock enters this set it will + * remain there until all users are removed. Lock users are counted in + * cl_lock::cll_users. + * + * User is used to assure that lock is not canceled or destroyed while + * it is being enqueued, or actively used by some IO. + * + * Currently, a user always comes with a hold (cl_lock_invariant() + * checks that a number of holds is not less than a number of users). + * + * CONCURRENCY + * + * This is how lock state-machine operates. struct cl_lock contains a mutex + * cl_lock::cll_guard that protects struct fields. + * + * - mutex is taken, and cl_lock::cll_state is examined. + * + * - for every state there are possible target states where lock can move + * into. They are tried in order. Attempts to move into next state are + * done by _try() functions in cl_lock.c:cl_{enqueue,unlock,wait}_try(). + * + * - if the transition can be performed immediately, state is changed, + * and mutex is released. + * + * - if the transition requires blocking, _try() function returns + * cl_lock_transition::CLO_WAIT. Caller unlocks mutex and goes to + * sleep, waiting for possibility of lock state change. It is woken + * up when some event occurs, that makes lock state change possible + * (e.g., the reception of the reply from the server), and repeats + * the loop. + * + * Top-lock and sub-lock has separate mutexes and the latter has to be taken + * first to avoid dead-lock. + * + * To see an example of interaction of all these issues, take a look at the + * lov_cl.c:lov_lock_enqueue() function. It is called as a part of + * cl_enqueue_try(), and tries to advance top-lock to ENQUEUED state, by + * advancing state-machines of its sub-locks (lov_lock_enqueue_one()). Note + * also, that it uses trylock to grab sub-lock mutex to avoid dead-lock. It + * also has to handle CEF_ASYNC enqueue, when sub-locks enqueues have to be + * done in parallel, rather than one after another (this is used for glimpse + * locks, that cannot dead-lock). + * + * INTERFACE AND USAGE + * + * struct cl_lock_operations provide a number of call-backs that are invoked + * when events of interest occurs. Layers can intercept and handle glimpse, + * blocking, cancel ASTs and a reception of the reply from the server. + * + * One important difference with the old client locking model is that new + * client has a representation for the top-lock, whereas in the old code only + * sub-locks existed as real data structures and file-level locks are + * represented by "request sets" that are created and destroyed on each and + * every lock creation. + * + * Top-locks are cached, and can be found in the cache by the system calls. It + * is possible that top-lock is in cache, but some of its sub-locks were + * canceled and destroyed. In that case top-lock has to be enqueued again + * before it can be used. + * + * Overall process of the locking during IO operation is as following: + * + * - once parameters for IO are setup in cl_io, cl_io_operations::cio_lock() + * is called on each layer. Responsibility of this method is to add locks, + * needed by a given layer into cl_io.ci_lockset. + * + * - once locks for all layers were collected, they are sorted to avoid + * dead-locks (cl_io_locks_sort()), and enqueued. + * + * - when all locks are acquired, IO is performed; + * + * - locks are released into cache. + * + * Striping introduces major additional complexity into locking. 
The + * fundamental problem is that it is generally unsafe to actively use (hold) + * two locks on the different OST servers at the same time, as this introduces + * inter-server dependency and can lead to cascading evictions. + * + * Basic solution is to sub-divide large read/write IOs into smaller pieces so + * that no multi-stripe locks are taken (note that this design abandons POSIX + * read/write semantics). Such pieces ideally can be executed concurrently. At + * the same time, certain types of IO cannot be sub-divived, without + * sacrificing correctness. This includes: + * + * - O_APPEND write, where [0, EOF] lock has to be taken, to guarantee + * atomicity; + * + * - ftruncate(fd, offset), where [offset, EOF] lock has to be taken. + * + * Also, in the case of read(fd, buf, count) or write(fd, buf, count), where + * buf is a part of memory mapped Lustre file, a lock or locks protecting buf + * has to be held together with the usual lock on [offset, offset + count]. + * + * As multi-stripe locks have to be allowed, it makes sense to cache them, so + * that, for example, a sequence of O_APPEND writes can proceed quickly + * without going down to the individual stripes to do lock matching. On the + * other hand, multi-stripe locks shouldn't be used by normal read/write + * calls. To achieve this, every layer can implement ->clo_fits_into() method, + * that is called by lock matching code (cl_lock_lookup()), and that can be + * used to selectively disable matching of certain locks for certain IOs. For + * example, lov layer implements lov_lock_fits_into() that allow multi-stripe + * locks to be matched only for truncates and O_APPEND writes. + * + * Interaction with DLM + * + * In the expected setup, cl_lock is ultimately backed up by a collection of + * DLM locks (struct ldlm_lock). Association between cl_lock and DLM lock is + * implemented in osc layer, that also matches DLM events (ASTs, cancellation, + * etc.) into cl_lock_operation calls. See struct osc_lock for a more detailed + * description of interaction with DLM. + */ + +/** + * Lock description. + */ +struct cl_lock_descr { + /** Object this lock is granted for. */ + struct cl_object *cld_obj; + /** Index of the first page protected by this lock. */ + pgoff_t cld_start; + /** Index of the last page (inclusive) protected by this lock. */ + pgoff_t cld_end; + /** Group ID, for group lock */ + __u64 cld_gid; + /** Lock mode. */ + enum cl_lock_mode cld_mode; + /** + * flags to enqueue lock. A combination of bit-flags from + * enum cl_enq_flags. + */ + __u32 cld_enq_flags; +}; + +#define DDESCR "%s(%d):[%lu, %lu]" +#define PDESCR(descr) \ + cl_lock_mode_name((descr)->cld_mode), (descr)->cld_mode, \ + (descr)->cld_start, (descr)->cld_end + +const char *cl_lock_mode_name(const enum cl_lock_mode mode); + +/** + * Lock state-machine states. + * + * \htmlonly + *
+ * <pre>
+ * Possible state transitions:
+ *
+ *	      +------------------>NEW
+ *	      |		    |
+ *	      |		    | cl_enqueue_try()
+ *	      |		    |
+ *	      |    cl_unuse_try()  V
+ *	      |  +--------------QUEUING (*)
+ *	      |  |		 |
+ *	      |  |		 | cl_enqueue_try()
+ *	      |  |		 |
+ *	      |  | cl_unuse_try()  V
+ *    sub-lock  |  +-------------ENQUEUED (*)
+ *    canceled  |  |		 |
+ *	      |  |		 | cl_wait_try()
+ *	      |  |		 |
+ *	      |  |		(R)
+ *	      |  |		 |
+ *	      |  |		 V
+ *	      |  |		HELD<---------+
+ *	      |  |		 |	    |
+ *	      |  |		 |	    | cl_use_try()
+ *	      |  |  cl_unuse_try() |	    |
+ *	      |  |		 |	    |
+ *	      |  |		 V	 ---+
+ *	      |  +------------>INTRANSIT (D) <--+
+ *	      |		    |	    |
+ *	      |     cl_unuse_try() |	    | cached lock found
+ *	      |		    |	    | cl_use_try()
+ *	      |		    |	    |
+ *	      |		    V	    |
+ *	      +------------------CACHED---------+
+ *				   |
+ *				  (C)
+ *				   |
+ *				   V
+ *				FREEING
+ *
+ * Legend:
+ *
+ *	 In states marked with (*) transition to the same state (i.e., a loop
+ *	 in the diagram) is possible.
+ *
+ *	 (R) is the point where Receive call-back is invoked: it allows layers
+ *	 to handle arrival of lock reply.
+ *
+ *	 (C) is the point where Cancellation call-back is invoked.
+ *
+ *	 (D) is the transit state which means the lock is changing.
+ *
+ *	 Transition to FREEING state is possible from any other state in the
+ *	 diagram in case of unrecoverable error.
+ * </pre>
+ * \endhtmlonly + * + * These states are for individual cl_lock object. Top-lock and its sub-locks + * can be in the different states. Another way to say this is that we have + * nested state-machines. + * + * Separate QUEUING and ENQUEUED states are needed to support non-blocking + * operation for locks with multiple sub-locks. Imagine lock on a file F, that + * intersects 3 stripes S0, S1, and S2. To enqueue F client has to send + * enqueue to S0, wait for its completion, then send enqueue for S1, wait for + * its completion and at last enqueue lock for S2, and wait for its + * completion. In that case, top-lock is in QUEUING state while S0, S1 are + * handled, and is in ENQUEUED state after enqueue to S2 has been sent (note + * that in this case, sub-locks move from state to state, and top-lock remains + * in the same state). + */ +enum cl_lock_state { + /** + * Lock that wasn't yet enqueued + */ + CLS_NEW, + /** + * Enqueue is in progress, blocking for some intermediate interaction + * with the other side. + */ + CLS_QUEUING, + /** + * Lock is fully enqueued, waiting for server to reply when it is + * granted. + */ + CLS_ENQUEUED, + /** + * Lock granted, actively used by some IO. + */ + CLS_HELD, + /** + * This state is used to mark the lock is being used, or unused. + * We need this state because the lock may have several sublocks, + * so it's impossible to have an atomic way to bring all sublocks + * into CLS_HELD state at use case, or all sublocks to CLS_CACHED + * at unuse case. + * If a thread is referring to a lock, and it sees the lock is in this + * state, it must wait for the lock. + * See state diagram for details. + */ + CLS_INTRANSIT, + /** + * Lock granted, not used. + */ + CLS_CACHED, + /** + * Lock is being destroyed. + */ + CLS_FREEING, + CLS_NR +}; + +enum cl_lock_flags { + /** + * lock has been cancelled. This flag is never cleared once set (by + * cl_lock_cancel0()). + */ + CLF_CANCELLED = 1 << 0, + /** cancellation is pending for this lock. */ + CLF_CANCELPEND = 1 << 1, + /** destruction is pending for this lock. */ + CLF_DOOMED = 1 << 2, + /** from enqueue RPC reply upcall. */ + CLF_FROM_UPCALL= 1 << 3, +}; + +/** + * Lock closure. + * + * Lock closure is a collection of locks (both top-locks and sub-locks) that + * might be updated in a result of an operation on a certain lock (which lock + * this is a closure of). + * + * Closures are needed to guarantee dead-lock freedom in the presence of + * + * - nested state-machines (top-lock state-machine composed of sub-lock + * state-machines), and + * + * - shared sub-locks. + * + * Specifically, many operations, such as lock enqueue, wait, unlock, + * etc. start from a top-lock, and then operate on a sub-locks of this + * top-lock, holding a top-lock mutex. When sub-lock state changes as a result + * of such operation, this change has to be propagated to all top-locks that + * share this sub-lock. Obviously, no natural lock ordering (e.g., + * top-to-bottom or bottom-to-top) captures this scenario, so try-locking has + * to be used. Lock closure systematizes this try-and-repeat logic. + */ +struct cl_lock_closure { + /** + * Lock that is mutexed when closure construction is started. When + * closure in is `wait' mode (cl_lock_closure::clc_wait), mutex on + * origin is released before waiting. + */ + struct cl_lock *clc_origin; + /** + * List of enclosed locks, so far. Locks are linked here through + * cl_lock::cll_inclosure. + */ + struct list_head clc_list; + /** + * True iff closure is in a `wait' mode. 
This determines what + * cl_lock_enclosure() does when a lock L to be added to the closure + * is currently mutexed by some other thread. + * + * If cl_lock_closure::clc_wait is not set, then closure construction + * fails with CLO_REPEAT immediately. + * + * In wait mode, cl_lock_enclosure() waits until next attempt to build + * a closure might succeed. To this end it releases an origin mutex + * (cl_lock_closure::clc_origin), that has to be the only lock mutex + * owned by the current thread, and then waits on L mutex (by grabbing + * it and immediately releasing), before returning CLO_REPEAT to the + * caller. + */ + int clc_wait; + /** Number of locks in the closure. */ + int clc_nr; +}; + +/** + * Layered client lock. + */ +struct cl_lock { + /** Reference counter. */ + atomic_t cll_ref; + /** List of slices. Immutable after creation. */ + struct list_head cll_layers; + /** + * Linkage into cl_lock::cll_descr::cld_obj::coh_locks list. Protected + * by cl_lock::cll_descr::cld_obj::coh_lock_guard. + */ + struct list_head cll_linkage; + /** + * Parameters of this lock. Protected by + * cl_lock::cll_descr::cld_obj::coh_lock_guard nested within + * cl_lock::cll_guard. Modified only on lock creation and in + * cl_lock_modify(). + */ + struct cl_lock_descr cll_descr; + /** Protected by cl_lock::cll_guard. */ + enum cl_lock_state cll_state; + /** signals state changes. */ + wait_queue_head_t cll_wq; + /** + * Recursive lock, most fields in cl_lock{} are protected by this. + * + * Locking rules: this mutex is never held across network + * communication, except when lock is being canceled. + * + * Lock ordering: a mutex of a sub-lock is taken first, then a mutex + * on a top-lock. Other direction is implemented through a + * try-lock-repeat loop. Mutices of unrelated locks can be taken only + * by try-locking. + * + * \see osc_lock_enqueue_wait(), lov_lock_cancel(), lov_sublock_wait(). + */ + struct mutex cll_guard; + struct task_struct *cll_guarder; + int cll_depth; + + /** + * the owner for INTRANSIT state + */ + struct task_struct *cll_intransit_owner; + int cll_error; + /** + * Number of holds on a lock. A hold prevents a lock from being + * canceled and destroyed. Protected by cl_lock::cll_guard. + * + * \see cl_lock_hold(), cl_lock_unhold(), cl_lock_release() + */ + int cll_holds; + /** + * Number of lock users. Valid in cl_lock_state::CLS_HELD state + * only. Lock user pins lock in CLS_HELD state. Protected by + * cl_lock::cll_guard. + * + * \see cl_wait(), cl_unuse(). + */ + int cll_users; + /** + * Flag bit-mask. Values from enum cl_lock_flags. Updates are + * protected by cl_lock::cll_guard. + */ + unsigned long cll_flags; + /** + * A linkage into a list of locks in a closure. + * + * \see cl_lock_closure + */ + struct list_head cll_inclosure; + /** + * Confict lock at queuing time. + */ + struct cl_lock *cll_conflict; + /** + * A list of references to this lock, for debugging. + */ + struct lu_ref cll_reference; + /** + * A list of holds on this lock, for debugging. + */ + struct lu_ref cll_holders; + /** + * A reference for cl_lock::cll_descr::cld_obj. For debugging. + */ + struct lu_ref_link cll_obj_ref; +#ifdef CONFIG_LOCKDEP + /* "dep_map" name is assumed by lockdep.h macros. */ + struct lockdep_map dep_map; +#endif +}; + +/** + * Per-layer part of cl_lock + * + * \see ccc_lock, lov_lock, lovsub_lock, osc_lock + */ +struct cl_lock_slice { + struct cl_lock *cls_lock; + /** Object slice corresponding to this lock slice. Immutable after + * creation. 
*/ + struct cl_object *cls_obj; + const struct cl_lock_operations *cls_ops; + /** Linkage into cl_lock::cll_layers. Immutable after creation. */ + struct list_head cls_linkage; +}; + +/** + * Possible (non-error) return values of ->clo_{enqueue,wait,unlock}(). + * + * NOTE: lov_subresult() depends on ordering here. + */ +enum cl_lock_transition { + /** operation cannot be completed immediately. Wait for state change. */ + CLO_WAIT = 1, + /** operation had to release lock mutex, restart. */ + CLO_REPEAT = 2, + /** lower layer re-enqueued. */ + CLO_REENQUEUED = 3, +}; + +/** + * + * \see vvp_lock_ops, lov_lock_ops, lovsub_lock_ops, osc_lock_ops + */ +struct cl_lock_operations { + /** + * \name statemachine + * + * State machine transitions. These 3 methods are called to transfer + * lock from one state to another, as described in the commentary + * above enum #cl_lock_state. + * + * \retval 0 this layer has nothing more to do to before + * transition to the target state happens; + * + * \retval CLO_REPEAT method had to release and re-acquire cl_lock + * mutex, repeat invocation of transition method + * across all layers; + * + * \retval CLO_WAIT this layer cannot move to the target state + * immediately, as it has to wait for certain event + * (e.g., the communication with the server). It + * is guaranteed, that when the state transfer + * becomes possible, cl_lock::cll_wq wait-queue + * is signaled. Caller can wait for this event by + * calling cl_lock_state_wait(); + * + * \retval -ve failure, abort state transition, move the lock + * into cl_lock_state::CLS_FREEING state, and set + * cl_lock::cll_error. + * + * Once all layers voted to agree to transition (by returning 0), lock + * is moved into corresponding target state. All state transition + * methods are optional. + */ + /** @{ */ + /** + * Attempts to enqueue the lock. Called top-to-bottom. + * + * \see ccc_lock_enqueue(), lov_lock_enqueue(), lovsub_lock_enqueue(), + * \see osc_lock_enqueue() + */ + int (*clo_enqueue)(const struct lu_env *env, + const struct cl_lock_slice *slice, + struct cl_io *io, __u32 enqflags); + /** + * Attempts to wait for enqueue result. Called top-to-bottom. + * + * \see ccc_lock_wait(), lov_lock_wait(), osc_lock_wait() + */ + int (*clo_wait)(const struct lu_env *env, + const struct cl_lock_slice *slice); + /** + * Attempts to unlock the lock. Called bottom-to-top. In addition to + * usual return values of lock state-machine methods, this can return + * -ESTALE to indicate that lock cannot be returned to the cache, and + * has to be re-initialized. + * unuse is a one-shot operation, so it must NOT return CLO_WAIT. + * + * \see ccc_lock_unuse(), lov_lock_unuse(), osc_lock_unuse() + */ + int (*clo_unuse)(const struct lu_env *env, + const struct cl_lock_slice *slice); + /** + * Notifies layer that cached lock is started being used. + * + * \pre lock->cll_state == CLS_CACHED + * + * \see lov_lock_use(), osc_lock_use() + */ + int (*clo_use)(const struct lu_env *env, + const struct cl_lock_slice *slice); + /** @} statemachine */ + /** + * A method invoked when lock state is changed (as a result of state + * transition). This is used, for example, to track when the state of + * a sub-lock changes, to propagate this change to the corresponding + * top-lock. 
Optional + * + * \see lovsub_lock_state() + */ + void (*clo_state)(const struct lu_env *env, + const struct cl_lock_slice *slice, + enum cl_lock_state st); + /** + * Returns true, iff given lock is suitable for the given io, idea + * being, that there are certain "unsafe" locks, e.g., ones acquired + * for O_APPEND writes, that we don't want to re-use for a normal + * write, to avoid the danger of cascading evictions. Optional. Runs + * under cl_object_header::coh_lock_guard. + * + * XXX this should take more information about lock needed by + * io. Probably lock description or something similar. + * + * \see lov_fits_into() + */ + int (*clo_fits_into)(const struct lu_env *env, + const struct cl_lock_slice *slice, + const struct cl_lock_descr *need, + const struct cl_io *io); + /** + * \name ast + * Asynchronous System Traps. All of then are optional, all are + * executed bottom-to-top. + */ + /** @{ */ + + /** + * Cancellation callback. Cancel a lock voluntarily, or under + * the request of server. + */ + void (*clo_cancel)(const struct lu_env *env, + const struct cl_lock_slice *slice); + /** + * Lock weighting ast. Executed to estimate how precious this lock + * is. The sum of results across all layers is used to determine + * whether lock worth keeping in cache given present memory usage. + * + * \see osc_lock_weigh(), vvp_lock_weigh(), lovsub_lock_weigh(). + */ + unsigned long (*clo_weigh)(const struct lu_env *env, + const struct cl_lock_slice *slice); + /** @} ast */ + + /** + * \see lovsub_lock_closure() + */ + int (*clo_closure)(const struct lu_env *env, + const struct cl_lock_slice *slice, + struct cl_lock_closure *closure); + /** + * Executed bottom-to-top when lock description changes (e.g., as a + * result of server granting more generous lock than was requested). + * + * \see lovsub_lock_modify() + */ + int (*clo_modify)(const struct lu_env *env, + const struct cl_lock_slice *slice, + const struct cl_lock_descr *updated); + /** + * Notifies layers (bottom-to-top) that lock is going to be + * destroyed. Responsibility of layers is to prevent new references on + * this lock from being acquired once this method returns. + * + * This can be called multiple times due to the races. + * + * \see cl_lock_delete() + * \see osc_lock_delete(), lovsub_lock_delete() + */ + void (*clo_delete)(const struct lu_env *env, + const struct cl_lock_slice *slice); + /** + * Destructor. Frees resources and the slice. + * + * \see ccc_lock_fini(), lov_lock_fini(), lovsub_lock_fini(), + * \see osc_lock_fini() + */ + void (*clo_fini)(const struct lu_env *env, struct cl_lock_slice *slice); + /** + * Optional debugging helper. Prints given lock slice. + */ + int (*clo_print)(const struct lu_env *env, + void *cookie, lu_printer_t p, + const struct cl_lock_slice *slice); +}; + +#define CL_LOCK_DEBUG(mask, env, lock, format, ...) \ +do { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \ + \ + if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \ + cl_lock_print(env, &msgdata, lu_cdebug_printer, lock); \ + CDEBUG(mask, format , ## __VA_ARGS__); \ + } \ +} while (0) + +#define CL_LOCK_ASSERT(expr, env, lock) do { \ + if (likely(expr)) \ + break; \ + \ + CL_LOCK_DEBUG(D_ERROR, env, lock, "failed at %s.\n", #expr); \ + LBUG(); \ +} while (0) + +/** @} cl_lock */ + +/** \addtogroup cl_page_list cl_page_list + * Page list used to perform collective operations on a group of pages. + * + * Pages are added to the list one by one. cl_page_list acquires a reference + * for every page in it. 
Page list is used to perform collective operations on + * pages: + * + * - submit pages for an immediate transfer, + * + * - own pages on behalf of certain io (waiting for each page in turn), + * + * - discard pages. + * + * When list is finalized, it releases references on all pages it still has. + * + * \todo XXX concurrency control. + * + * @{ + */ +struct cl_page_list { + unsigned pl_nr; + struct list_head pl_pages; + struct task_struct *pl_owner; +}; + +/** + * A 2-queue of pages. A convenience data-type for common use case, 2-queue + * contains an incoming page list and an outgoing page list. + */ +struct cl_2queue { + struct cl_page_list c2_qin; + struct cl_page_list c2_qout; +}; + +/** @} cl_page_list */ + +/** \addtogroup cl_io cl_io + * @{ */ +/** \struct cl_io + * I/O + * + * cl_io represents a high level I/O activity like + * read(2)/write(2)/truncate(2) system call, or cancellation of an extent + * lock. + * + * cl_io is a layered object, much like cl_{object,page,lock} but with one + * important distinction. We want to minimize number of calls to the allocator + * in the fast path, e.g., in the case of read(2) when everything is cached: + * client already owns the lock over region being read, and data are cached + * due to read-ahead. To avoid allocation of cl_io layers in such situations, + * per-layer io state is stored in the session, associated with the io, see + * struct {vvp,lov,osc}_io for example. Sessions allocation is amortized + * by using free-lists, see cl_env_get(). + * + * There is a small predefined number of possible io types, enumerated in enum + * cl_io_type. + * + * cl_io is a state machine, that can be advanced concurrently by the multiple + * threads. It is up to these threads to control the concurrency and, + * specifically, to detect when io is done, and its state can be safely + * released. + * + * For read/write io overall execution plan is as following: + * + * (0) initialize io state through all layers; + * + * (1) loop: prepare chunk of work to do + * + * (2) call all layers to collect locks they need to process current chunk + * + * (3) sort all locks to avoid dead-locks, and acquire them + * + * (4) process the chunk: call per-page methods + * (cl_io_operations::cio_read_page() for read, + * cl_io_operations::cio_prepare_write(), + * cl_io_operations::cio_commit_write() for write) + * + * (5) release locks + * + * (6) repeat loop. + * + * To implement the "parallel IO mode", lov layer creates sub-io's (lazily to + * address allocation efficiency issues mentioned above), and returns with the + * special error condition from per-page method when current sub-io has to + * block. This causes io loop to be repeated, and lov switches to the next + * sub-io in its cl_io_operations::cio_iter_init() implementation. + */ + +/** IO types */ +enum cl_io_type { + /** read system call */ + CIT_READ, + /** write system call */ + CIT_WRITE, + /** truncate, utime system calls */ + CIT_SETATTR, + /** + * page fault handling + */ + CIT_FAULT, + /** + * fsync system call handling + * To write out a range of file + */ + CIT_FSYNC, + /** + * Miscellaneous io. This is used for occasional io activity that + * doesn't fit into other types. Currently this is used for: + * + * - cancellation of an extent lock. This io exists as a context + * to write dirty pages from under the lock being canceled back + * to the server; + * + * - VM induced page write-out. An io context for writing page out + * for memory cleansing; + * + * - glimpse. 
An io context to acquire glimpse lock. + * + * - grouplock. An io context to acquire group lock. + * + * CIT_MISC io is used simply as a context in which locks and pages + * are manipulated. Such io has no internal "process", that is, + * cl_io_loop() is never called for it. + */ + CIT_MISC, + CIT_OP_NR +}; + +/** + * States of cl_io state machine + */ +enum cl_io_state { + /** Not initialized. */ + CIS_ZERO, + /** Initialized. */ + CIS_INIT, + /** IO iteration started. */ + CIS_IT_STARTED, + /** Locks taken. */ + CIS_LOCKED, + /** Actual IO is in progress. */ + CIS_IO_GOING, + /** IO for the current iteration finished. */ + CIS_IO_FINISHED, + /** Locks released. */ + CIS_UNLOCKED, + /** Iteration completed. */ + CIS_IT_ENDED, + /** cl_io finalized. */ + CIS_FINI +}; + +/** + * IO state private for a layer. + * + * This is usually embedded into layer session data, rather than allocated + * dynamically. + * + * \see vvp_io, lov_io, osc_io, ccc_io + */ +struct cl_io_slice { + struct cl_io *cis_io; + /** corresponding object slice. Immutable after creation. */ + struct cl_object *cis_obj; + /** io operations. Immutable after creation. */ + const struct cl_io_operations *cis_iop; + /** + * linkage into a list of all slices for a given cl_io, hanging off + * cl_io::ci_layers. Immutable after creation. + */ + struct list_head cis_linkage; +}; + + +/** + * Per-layer io operations. + * \see vvp_io_ops, lov_io_ops, lovsub_io_ops, osc_io_ops + */ +struct cl_io_operations { + /** + * Vector of io state transition methods for every io type. + * + * \see cl_page_operations::io + */ + struct { + /** + * Prepare io iteration at a given layer. + * + * Called top-to-bottom at the beginning of each iteration of + * "io loop" (if it makes sense for this type of io). Here + * layer selects what work it will do during this iteration. + * + * \see cl_io_operations::cio_iter_fini() + */ + int (*cio_iter_init) (const struct lu_env *env, + const struct cl_io_slice *slice); + /** + * Finalize io iteration. + * + * Called bottom-to-top at the end of each iteration of "io + * loop". Here layers can decide whether IO has to be + * continued. + * + * \see cl_io_operations::cio_iter_init() + */ + void (*cio_iter_fini) (const struct lu_env *env, + const struct cl_io_slice *slice); + /** + * Collect locks for the current iteration of io. + * + * Called top-to-bottom to collect all locks necessary for + * this iteration. This methods shouldn't actually enqueue + * anything, instead it should post a lock through + * cl_io_lock_add(). Once all locks are collected, they are + * sorted and enqueued in the proper order. + */ + int (*cio_lock) (const struct lu_env *env, + const struct cl_io_slice *slice); + /** + * Finalize unlocking. + * + * Called bottom-to-top to finish layer specific unlocking + * functionality, after generic code released all locks + * acquired by cl_io_operations::cio_lock(). + */ + void (*cio_unlock)(const struct lu_env *env, + const struct cl_io_slice *slice); + /** + * Start io iteration. + * + * Once all locks are acquired, called top-to-bottom to + * commence actual IO. In the current implementation, + * top-level vvp_io_{read,write}_start() does all the work + * synchronously by calling generic_file_*(), so other layers + * are called when everything is done. + */ + int (*cio_start)(const struct lu_env *env, + const struct cl_io_slice *slice); + /** + * Called top-to-bottom at the end of io loop. Here layer + * might wait for an unfinished asynchronous io. 
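+		 *
+		 * For orientation only: a rough sketch of the order in which
+		 * the generic code drives these per-iteration methods (the
+		 * authoritative sequence lives in cl_io_loop(); the top-level
+		 * wrappers used below are declared further down in this
+		 * header):
+		 *
+		 *	cl_io_iter_init(env, io);
+		 *	cl_io_lock(env, io);
+		 *	cl_io_start(env, io);
+		 *	cl_io_end(env, io);
+		 *	cl_io_unlock(env, io);
+		 *	cl_io_iter_fini(env, io);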
+ */ + void (*cio_end) (const struct lu_env *env, + const struct cl_io_slice *slice); + /** + * Called bottom-to-top to notify layers that read/write IO + * iteration finished, with \a nob bytes transferred. + */ + void (*cio_advance)(const struct lu_env *env, + const struct cl_io_slice *slice, + size_t nob); + /** + * Called once per io, bottom-to-top to release io resources. + */ + void (*cio_fini) (const struct lu_env *env, + const struct cl_io_slice *slice); + } op[CIT_OP_NR]; + struct { + /** + * Submit pages from \a queue->c2_qin for IO, and move + * successfully submitted pages into \a queue->c2_qout. Return + * non-zero if failed to submit even the single page. If + * submission failed after some pages were moved into \a + * queue->c2_qout, completion callback with non-zero ioret is + * executed on them. + */ + int (*cio_submit)(const struct lu_env *env, + const struct cl_io_slice *slice, + enum cl_req_type crt, + struct cl_2queue *queue); + } req_op[CRT_NR]; + /** + * Read missing page. + * + * Called by a top-level cl_io_operations::op[CIT_READ]::cio_start() + * method, when it hits not-up-to-date page in the range. Optional. + * + * \pre io->ci_type == CIT_READ + */ + int (*cio_read_page)(const struct lu_env *env, + const struct cl_io_slice *slice, + const struct cl_page_slice *page); + /** + * Prepare write of a \a page. Called bottom-to-top by a top-level + * cl_io_operations::op[CIT_WRITE]::cio_start() to prepare page for + * get data from user-level buffer. + * + * \pre io->ci_type == CIT_WRITE + * + * \see vvp_io_prepare_write(), lov_io_prepare_write(), + * osc_io_prepare_write(). + */ + int (*cio_prepare_write)(const struct lu_env *env, + const struct cl_io_slice *slice, + const struct cl_page_slice *page, + unsigned from, unsigned to); + /** + * + * \pre io->ci_type == CIT_WRITE + * + * \see vvp_io_commit_write(), lov_io_commit_write(), + * osc_io_commit_write(). + */ + int (*cio_commit_write)(const struct lu_env *env, + const struct cl_io_slice *slice, + const struct cl_page_slice *page, + unsigned from, unsigned to); + /** + * Optional debugging helper. Print given io slice. + */ + int (*cio_print)(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct cl_io_slice *slice); +}; + +/** + * Flags to lock enqueue procedure. + * \ingroup cl_lock + */ +enum cl_enq_flags { + /** + * instruct server to not block, if conflicting lock is found. Instead + * -EWOULDBLOCK is returned immediately. + */ + CEF_NONBLOCK = 0x00000001, + /** + * take lock asynchronously (out of order), as it cannot + * deadlock. This is for LDLM_FL_HAS_INTENT locks used for glimpsing. + */ + CEF_ASYNC = 0x00000002, + /** + * tell the server to instruct (though a flag in the blocking ast) an + * owner of the conflicting lock, that it can drop dirty pages + * protected by this lock, without sending them to the server. + */ + CEF_DISCARD_DATA = 0x00000004, + /** + * tell the sub layers that it must be a `real' lock. This is used for + * mmapped-buffer locks and glimpse locks that must be never converted + * into lockless mode. + * + * \see vvp_mmap_locks(), cl_glimpse_lock(). + */ + CEF_MUST = 0x00000008, + /** + * tell the sub layers that never request a `real' lock. This flag is + * not used currently. 
+ * + * cl_io::ci_lockreq and CEF_{MUST,NEVER} flags specify lockless + * conversion policy: ci_lockreq describes generic information of lock + * requirement for this IO, especially for locks which belong to the + * object doing IO; however, lock itself may have precise requirements + * that are described by the enqueue flags. + */ + CEF_NEVER = 0x00000010, + /** + * for async glimpse lock. + */ + CEF_AGL = 0x00000020, + /** + * mask of enq_flags. + */ + CEF_MASK = 0x0000003f, +}; + +/** + * Link between lock and io. Intermediate structure is needed, because the + * same lock can be part of multiple io's simultaneously. + */ +struct cl_io_lock_link { + /** linkage into one of cl_lockset lists. */ + struct list_head cill_linkage; + struct cl_lock_descr cill_descr; + struct cl_lock *cill_lock; + /** optional destructor */ + void (*cill_fini)(const struct lu_env *env, + struct cl_io_lock_link *link); +}; + +/** + * Lock-set represents a collection of locks, that io needs at a + * time. Generally speaking, client tries to avoid holding multiple locks when + * possible, because + * + * - holding extent locks over multiple ost's introduces the danger of + * "cascading timeouts"; + * + * - holding multiple locks over the same ost is still dead-lock prone, + * see comment in osc_lock_enqueue(), + * + * but there are certain situations where this is unavoidable: + * + * - O_APPEND writes have to take [0, EOF] lock for correctness; + * + * - truncate has to take [new-size, EOF] lock for correctness; + * + * - SNS has to take locks across full stripe for correctness; + * + * - in the case when user level buffer, supplied to {read,write}(file0), + * is a part of a memory mapped lustre file, client has to take a dlm + * locks on file0, and all files that back up the buffer (or a part of + * the buffer, that is being processed in the current chunk, in any + * case, there are situations where at least 2 locks are necessary). + * + * In such cases we at least try to take locks in the same consistent + * order. To this end, all locks are first collected, then sorted, and then + * enqueued. + */ +struct cl_lockset { + /** locks to be acquired. */ + struct list_head cls_todo; + /** locks currently being processed. */ + struct list_head cls_curr; + /** locks acquired. */ + struct list_head cls_done; +}; + +/** + * Lock requirements(demand) for IO. It should be cl_io_lock_req, + * but 'req' is always to be thought as 'request' :-) + */ +enum cl_io_lock_dmd { + /** Always lock data (e.g., O_APPEND). */ + CILR_MANDATORY = 0, + /** Layers are free to decide between local and global locking. */ + CILR_MAYBE, + /** Never lock: there is no cache (e.g., liblustre). */ + CILR_NEVER +}; + +enum cl_fsync_mode { + /** start writeback, do not wait for them to finish */ + CL_FSYNC_NONE = 0, + /** start writeback and wait for them to finish */ + CL_FSYNC_LOCAL = 1, + /** discard all of dirty pages in a specific file range */ + CL_FSYNC_DISCARD = 2, + /** start writeback and make sure they have reached storage before + * return. OST_SYNC RPC must be issued and finished */ + CL_FSYNC_ALL = 3 +}; + +struct cl_io_rw_common { + loff_t crw_pos; + size_t crw_count; + int crw_nonblock; +}; + + +/** + * State for io. + * + * cl_io is shared by all threads participating in this IO (in current + * implementation only one thread advances IO, but parallel IO design and + * concurrent copy_*_user() require multiple threads acting on the same IO. 
It + * is up to these threads to serialize their activities, including updates to + * mutable cl_io fields. + */ +struct cl_io { + /** type of this IO. Immutable after creation. */ + enum cl_io_type ci_type; + /** current state of cl_io state machine. */ + enum cl_io_state ci_state; + /** main object this io is against. Immutable after creation. */ + struct cl_object *ci_obj; + /** + * Upper layer io, of which this io is a part of. Immutable after + * creation. + */ + struct cl_io *ci_parent; + /** List of slices. Immutable after creation. */ + struct list_head ci_layers; + /** list of locks (to be) acquired by this io. */ + struct cl_lockset ci_lockset; + /** lock requirements, this is just a help info for sublayers. */ + enum cl_io_lock_dmd ci_lockreq; + union { + struct cl_rd_io { + struct cl_io_rw_common rd; + } ci_rd; + struct cl_wr_io { + struct cl_io_rw_common wr; + int wr_append; + int wr_sync; + } ci_wr; + struct cl_io_rw_common ci_rw; + struct cl_setattr_io { + struct ost_lvb sa_attr; + unsigned int sa_valid; + struct obd_capa *sa_capa; + } ci_setattr; + struct cl_fault_io { + /** page index within file. */ + pgoff_t ft_index; + /** bytes valid byte on a faulted page. */ + int ft_nob; + /** writable page? for nopage() only */ + int ft_writable; + /** page of an executable? */ + int ft_executable; + /** page_mkwrite() */ + int ft_mkwrite; + /** resulting page */ + struct cl_page *ft_page; + } ci_fault; + struct cl_fsync_io { + loff_t fi_start; + loff_t fi_end; + struct obd_capa *fi_capa; + /** file system level fid */ + struct lu_fid *fi_fid; + enum cl_fsync_mode fi_mode; + /* how many pages were written/discarded */ + unsigned int fi_nr_written; + } ci_fsync; + } u; + struct cl_2queue ci_queue; + size_t ci_nob; + int ci_result; + unsigned int ci_continue:1, + /** + * This io has held grouplock, to inform sublayers that + * don't do lockless i/o. + */ + ci_no_srvlock:1, + /** + * The whole IO need to be restarted because layout has been changed + */ + ci_need_restart:1, + /** + * to not refresh layout - the IO issuer knows that the layout won't + * change(page operations, layout change causes all page to be + * discarded), or it doesn't matter if it changes(sync). + */ + ci_ignore_layout:1, + /** + * Check if layout changed after the IO finishes. Mainly for HSM + * requirement. If IO occurs to openning files, it doesn't need to + * verify layout because HSM won't release openning files. + * Right now, only two operations need to verify layout: glimpse + * and setattr. + */ + ci_verify_layout:1, + /** + * file is released, restore has to to be triggered by vvp layer + */ + ci_restore_needed:1, + /** + * O_NOATIME + */ + ci_noatime:1; + /** + * Number of pages owned by this IO. For invariant checking. + */ + unsigned ci_owned_nr; +}; + +/** @} cl_io */ + +/** \addtogroup cl_req cl_req + * @{ */ +/** \struct cl_req + * Transfer. + * + * There are two possible modes of transfer initiation on the client: + * + * - immediate transfer: this is started when a high level io wants a page + * or a collection of pages to be transferred right away. Examples: + * read-ahead, synchronous read in the case of non-page aligned write, + * page write-out as a part of extent lock cancellation, page write-out + * as a part of memory cleansing. 
Immediate transfer can be both + * cl_req_type::CRT_READ and cl_req_type::CRT_WRITE; + * + * - opportunistic transfer (cl_req_type::CRT_WRITE only), that happens + * when io wants to transfer a page to the server some time later, when + * it can be done efficiently. Example: pages dirtied by the write(2) + * path. + * + * In any case, transfer takes place in the form of a cl_req, which is a + * representation for a network RPC. + * + * Pages queued for an opportunistic transfer are cached until it is decided + * that efficient RPC can be composed of them. This decision is made by "a + * req-formation engine", currently implemented as a part of osc + * layer. Req-formation depends on many factors: the size of the resulting + * RPC, whether or not multi-object RPCs are supported by the server, + * max-rpc-in-flight limitations, size of the dirty cache, etc. + * + * For the immediate transfer io submits a cl_page_list, that req-formation + * engine slices into cl_req's, possibly adding cached pages to some of + * the resulting req's. + * + * Whenever a page from cl_page_list is added to a newly constructed req, its + * cl_page_operations::cpo_prep() layer methods are called. At that moment, + * page state is atomically changed from cl_page_state::CPS_OWNED to + * cl_page_state::CPS_PAGEOUT or cl_page_state::CPS_PAGEIN, cl_page::cp_owner + * is zeroed, and cl_page::cp_req is set to the + * req. cl_page_operations::cpo_prep() method at the particular layer might + * return -EALREADY to indicate that it does not need to submit this page + * at all. This is possible, for example, if page, submitted for read, + * became up-to-date in the meantime; and for write, the page don't have + * dirty bit marked. \see cl_io_submit_rw() + * + * Whenever a cached page is added to a newly constructed req, its + * cl_page_operations::cpo_make_ready() layer methods are called. At that + * moment, page state is atomically changed from cl_page_state::CPS_CACHED to + * cl_page_state::CPS_PAGEOUT, and cl_page::cp_req is set to + * req. cl_page_operations::cpo_make_ready() method at the particular layer + * might return -EAGAIN to indicate that this page is not eligible for the + * transfer right now. + * + * FUTURE + * + * Plan is to divide transfers into "priority bands" (indicated when + * submitting cl_page_list, and queuing a page for the opportunistic transfer) + * and allow glueing of cached pages to immediate transfers only within single + * band. This would make high priority transfers (like lock cancellation or + * memory pressure induced write-out) really high priority. + * + */ + +/** + * Per-transfer attributes. + */ +struct cl_req_attr { + /** Generic attributes for the server consumption. */ + struct obdo *cra_oa; + /** Capability. */ + struct obd_capa *cra_capa; + /** Jobid */ + char cra_jobid[JOBSTATS_JOBID_SIZE]; +}; + +/** + * Transfer request operations definable at every layer. + * + * Concurrency: transfer formation engine synchronizes calls to all transfer + * methods. + */ +struct cl_req_operations { + /** + * Invoked top-to-bottom by cl_req_prep() when transfer formation is + * complete (all pages are added). + * + * \see osc_req_prep() + */ + int (*cro_prep)(const struct lu_env *env, + const struct cl_req_slice *slice); + /** + * Called top-to-bottom to fill in \a oa fields. This is called twice + * with different flags, see bug 10150 and osc_build_req(). + * + * \param obj an object from cl_req which attributes are to be set in + * \a oa. 
+ * + * \param oa struct obdo where attributes are placed + * + * \param flags \a oa fields to be filled. + */ + void (*cro_attr_set)(const struct lu_env *env, + const struct cl_req_slice *slice, + const struct cl_object *obj, + struct cl_req_attr *attr, u64 flags); + /** + * Called top-to-bottom from cl_req_completion() to notify layers that + * transfer completed. Has to free all state allocated by + * cl_device_operations::cdo_req_init(). + */ + void (*cro_completion)(const struct lu_env *env, + const struct cl_req_slice *slice, int ioret); +}; + +/** + * A per-object state that (potentially multi-object) transfer request keeps. + */ +struct cl_req_obj { + /** object itself */ + struct cl_object *ro_obj; + /** reference to cl_req_obj::ro_obj. For debugging. */ + struct lu_ref_link ro_obj_ref; + /* something else? Number of pages for a given object? */ +}; + +/** + * Transfer request. + * + * Transfer requests are not reference counted, because IO sub-system owns + * them exclusively and knows when to free them. + * + * Life cycle. + * + * cl_req is created by cl_req_alloc() that calls + * cl_device_operations::cdo_req_init() device methods to allocate per-req + * state in every layer. + * + * Then pages are added (cl_req_page_add()), req keeps track of all objects it + * contains pages for. + * + * Once all pages were collected, cl_page_operations::cpo_prep() method is + * called top-to-bottom. At that point layers can modify req, let it pass, or + * deny it completely. This is to support things like SNS that have transfer + * ordering requirements invisible to the individual req-formation engine. + * + * On transfer completion (or transfer timeout, or failure to initiate the + * transfer of an allocated req), cl_req_operations::cro_completion() method + * is called, after execution of cl_page_operations::cpo_completion() of all + * req's pages. + */ +struct cl_req { + enum cl_req_type crq_type; + /** A list of pages being transferred */ + struct list_head crq_pages; + /** Number of pages in cl_req::crq_pages */ + unsigned crq_nrpages; + /** An array of objects which pages are in ->crq_pages */ + struct cl_req_obj *crq_o; + /** Number of elements in cl_req::crq_objs[] */ + unsigned crq_nrobjs; + struct list_head crq_layers; +}; + +/** + * Per-layer state for request. + */ +struct cl_req_slice { + struct cl_req *crs_req; + struct cl_device *crs_dev; + struct list_head crs_linkage; + const struct cl_req_operations *crs_ops; +}; + +/* @} cl_req */ + +enum cache_stats_item { + /** how many cache lookups were performed */ + CS_lookup = 0, + /** how many times cache lookup resulted in a hit */ + CS_hit, + /** how many entities are in the cache right now */ + CS_total, + /** how many entities in the cache are actively used (and cannot be + * evicted) right now */ + CS_busy, + /** how many entities were created at all */ + CS_create, + CS_NR +}; + +#define CS_NAMES { "lookup", "hit", "total", "busy", "create" } + +/** + * Stats for a generic cache (similar to inode, lu_object, etc. caches). + */ +struct cache_stats { + const char *cs_name; + atomic_t cs_stats[CS_NR]; +}; + +/** These are not exported so far */ +void cache_stats_init (struct cache_stats *cs, const char *name); + +/** + * Client-side site. This represents particular client stack. "Global" + * variables should (directly or indirectly) be added here to allow multiple + * clients to co-exist in the single address space. + */ +struct cl_site { + struct lu_site cs_lu; + /** + * Statistical counters. 
Atomics do not scale, something better like + * per-cpu counters is needed. + * + * These are exported as /proc/fs/lustre/llite/.../site + * + * When interpreting keep in mind that both sub-locks (and sub-pages) + * and top-locks (and top-pages) are accounted here. + */ + struct cache_stats cs_pages; + struct cache_stats cs_locks; + atomic_t cs_pages_state[CPS_NR]; + atomic_t cs_locks_state[CLS_NR]; +}; + +int cl_site_init (struct cl_site *s, struct cl_device *top); +void cl_site_fini (struct cl_site *s); +void cl_stack_fini(const struct lu_env *env, struct cl_device *cl); + +/** + * Output client site statistical counters into a buffer. Suitable for + * ll_rd_*()-style functions. + */ +int cl_site_stats_print(const struct cl_site *site, struct seq_file *m); + +/** + * \name helpers + * + * Type conversion and accessory functions. + */ +/** @{ */ + +static inline struct cl_site *lu2cl_site(const struct lu_site *site) +{ + return container_of(site, struct cl_site, cs_lu); +} + +static inline int lu_device_is_cl(const struct lu_device *d) +{ + return d->ld_type->ldt_tags & LU_DEVICE_CL; +} + +static inline struct cl_device *lu2cl_dev(const struct lu_device *d) +{ + LASSERT(d == NULL || IS_ERR(d) || lu_device_is_cl(d)); + return container_of0(d, struct cl_device, cd_lu_dev); +} + +static inline struct lu_device *cl2lu_dev(struct cl_device *d) +{ + return &d->cd_lu_dev; +} + +static inline struct cl_object *lu2cl(const struct lu_object *o) +{ + LASSERT(o == NULL || IS_ERR(o) || lu_device_is_cl(o->lo_dev)); + return container_of0(o, struct cl_object, co_lu); +} + +static inline const struct cl_object_conf * +lu2cl_conf(const struct lu_object_conf *conf) +{ + return container_of0(conf, struct cl_object_conf, coc_lu); +} + +static inline struct cl_object *cl_object_next(const struct cl_object *obj) +{ + return obj ? 
lu2cl(lu_object_next(&obj->co_lu)) : NULL; +} + +static inline struct cl_device *cl_object_device(const struct cl_object *o) +{ + LASSERT(o == NULL || IS_ERR(o) || lu_device_is_cl(o->co_lu.lo_dev)); + return container_of0(o->co_lu.lo_dev, struct cl_device, cd_lu_dev); +} + +static inline struct cl_object_header *luh2coh(const struct lu_object_header *h) +{ + return container_of0(h, struct cl_object_header, coh_lu); +} + +static inline struct cl_site *cl_object_site(const struct cl_object *obj) +{ + return lu2cl_site(obj->co_lu.lo_dev->ld_site); +} + +static inline +struct cl_object_header *cl_object_header(const struct cl_object *obj) +{ + return luh2coh(obj->co_lu.lo_header); +} + +static inline int cl_device_init(struct cl_device *d, struct lu_device_type *t) +{ + return lu_device_init(&d->cd_lu_dev, t); +} + +static inline void cl_device_fini(struct cl_device *d) +{ + lu_device_fini(&d->cd_lu_dev); +} + +void cl_page_slice_add(struct cl_page *page, struct cl_page_slice *slice, + struct cl_object *obj, + const struct cl_page_operations *ops); +void cl_lock_slice_add(struct cl_lock *lock, struct cl_lock_slice *slice, + struct cl_object *obj, + const struct cl_lock_operations *ops); +void cl_io_slice_add(struct cl_io *io, struct cl_io_slice *slice, + struct cl_object *obj, const struct cl_io_operations *ops); +void cl_req_slice_add(struct cl_req *req, struct cl_req_slice *slice, + struct cl_device *dev, + const struct cl_req_operations *ops); +/** @} helpers */ + +/** \defgroup cl_object cl_object + * @{ */ +struct cl_object *cl_object_top (struct cl_object *o); +struct cl_object *cl_object_find(const struct lu_env *env, struct cl_device *cd, + const struct lu_fid *fid, + const struct cl_object_conf *c); + +int cl_object_header_init(struct cl_object_header *h); +void cl_object_header_fini(struct cl_object_header *h); +void cl_object_put (const struct lu_env *env, struct cl_object *o); +void cl_object_get (struct cl_object *o); +void cl_object_attr_lock (struct cl_object *o); +void cl_object_attr_unlock(struct cl_object *o); +int cl_object_attr_get (const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr); +int cl_object_attr_set (const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid); +int cl_object_glimpse (const struct lu_env *env, struct cl_object *obj, + struct ost_lvb *lvb); +int cl_conf_set (const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf); +void cl_object_prune (const struct lu_env *env, struct cl_object *obj); +void cl_object_kill (const struct lu_env *env, struct cl_object *obj); +int cl_object_has_locks (struct cl_object *obj); + +/** + * Returns true, iff \a o0 and \a o1 are slices of the same object. 
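+ *
+ * A typical (purely illustrative) use is a sanity check that a page and
+ * the object an io operates on belong to the same file:
+ *
+ *	LASSERT(cl_object_same(page->cp_obj, io->ci_obj));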
+ */ +static inline int cl_object_same(struct cl_object *o0, struct cl_object *o1) +{ + return cl_object_header(o0) == cl_object_header(o1); +} + +static inline void cl_object_page_init(struct cl_object *clob, int size) +{ + clob->co_slice_off = cl_object_header(clob)->coh_page_bufsize; + cl_object_header(clob)->coh_page_bufsize += ALIGN(size, 8); +} + +static inline void *cl_object_page_slice(struct cl_object *clob, + struct cl_page *page) +{ + return (void *)((char *)page + clob->co_slice_off); +} + +/** @} cl_object */ + +/** \defgroup cl_page cl_page + * @{ */ +enum { + CLP_GANG_OKAY = 0, + CLP_GANG_RESCHED, + CLP_GANG_AGAIN, + CLP_GANG_ABORT +}; + +/* callback of cl_page_gang_lookup() */ +typedef int (*cl_page_gang_cb_t) (const struct lu_env *, struct cl_io *, + struct cl_page *, void *); +int cl_page_gang_lookup (const struct lu_env *env, + struct cl_object *obj, + struct cl_io *io, + pgoff_t start, pgoff_t end, + cl_page_gang_cb_t cb, void *cbdata); +struct cl_page *cl_page_lookup (struct cl_object_header *hdr, + pgoff_t index); +struct cl_page *cl_page_find (const struct lu_env *env, + struct cl_object *obj, + pgoff_t idx, struct page *vmpage, + enum cl_page_type type); +struct cl_page *cl_page_find_sub (const struct lu_env *env, + struct cl_object *obj, + pgoff_t idx, struct page *vmpage, + struct cl_page *parent); +void cl_page_get (struct cl_page *page); +void cl_page_put (const struct lu_env *env, + struct cl_page *page); +void cl_page_print (const struct lu_env *env, void *cookie, + lu_printer_t printer, + const struct cl_page *pg); +void cl_page_header_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, + const struct cl_page *pg); +struct page *cl_page_vmpage (const struct lu_env *env, + struct cl_page *page); +struct cl_page *cl_vmpage_page (struct page *vmpage, struct cl_object *obj); +struct cl_page *cl_page_top (struct cl_page *page); + +const struct cl_page_slice *cl_page_at(const struct cl_page *page, + const struct lu_device_type *dtype); + +/** + * \name ownership + * + * Functions dealing with the ownership of page by io. + */ +/** @{ */ + +int cl_page_own (const struct lu_env *env, + struct cl_io *io, struct cl_page *page); +int cl_page_own_try (const struct lu_env *env, + struct cl_io *io, struct cl_page *page); +void cl_page_assume (const struct lu_env *env, + struct cl_io *io, struct cl_page *page); +void cl_page_unassume (const struct lu_env *env, + struct cl_io *io, struct cl_page *pg); +void cl_page_disown (const struct lu_env *env, + struct cl_io *io, struct cl_page *page); +int cl_page_is_owned (const struct cl_page *pg, const struct cl_io *io); + +/** @} ownership */ + +/** + * \name transfer + * + * Functions dealing with the preparation of a page for a transfer, and + * tracking transfer state. 
+ */ +/** @{ */ +int cl_page_prep (const struct lu_env *env, struct cl_io *io, + struct cl_page *pg, enum cl_req_type crt); +void cl_page_completion (const struct lu_env *env, + struct cl_page *pg, enum cl_req_type crt, int ioret); +int cl_page_make_ready (const struct lu_env *env, struct cl_page *pg, + enum cl_req_type crt); +int cl_page_cache_add (const struct lu_env *env, struct cl_io *io, + struct cl_page *pg, enum cl_req_type crt); +void cl_page_clip (const struct lu_env *env, struct cl_page *pg, + int from, int to); +int cl_page_cancel (const struct lu_env *env, struct cl_page *page); +int cl_page_flush (const struct lu_env *env, struct cl_io *io, + struct cl_page *pg); + +/** @} transfer */ + + +/** + * \name helper routines + * Functions to discard, delete and export a cl_page. + */ +/** @{ */ +void cl_page_discard (const struct lu_env *env, struct cl_io *io, + struct cl_page *pg); +void cl_page_delete (const struct lu_env *env, struct cl_page *pg); +int cl_page_unmap (const struct lu_env *env, struct cl_io *io, + struct cl_page *pg); +int cl_page_is_vmlocked (const struct lu_env *env, + const struct cl_page *pg); +void cl_page_export (const struct lu_env *env, + struct cl_page *pg, int uptodate); +int cl_page_is_under_lock(const struct lu_env *env, struct cl_io *io, + struct cl_page *page); +loff_t cl_offset (const struct cl_object *obj, pgoff_t idx); +pgoff_t cl_index (const struct cl_object *obj, loff_t offset); +int cl_page_size (const struct cl_object *obj); +int cl_pages_prune (const struct lu_env *env, struct cl_object *obj); + +void cl_lock_print (const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct cl_lock *lock); +void cl_lock_descr_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, + const struct cl_lock_descr *descr); +/* @} helper */ + +/** @} cl_page */ + +/** \defgroup cl_lock cl_lock + * @{ */ + +struct cl_lock *cl_lock_hold(const struct lu_env *env, const struct cl_io *io, + const struct cl_lock_descr *need, + const char *scope, const void *source); +struct cl_lock *cl_lock_peek(const struct lu_env *env, const struct cl_io *io, + const struct cl_lock_descr *need, + const char *scope, const void *source); +struct cl_lock *cl_lock_request(const struct lu_env *env, struct cl_io *io, + const struct cl_lock_descr *need, + const char *scope, const void *source); +struct cl_lock *cl_lock_at_pgoff(const struct lu_env *env, + struct cl_object *obj, pgoff_t index, + struct cl_lock *except, int pending, + int canceld); +static inline struct cl_lock *cl_lock_at_page(const struct lu_env *env, + struct cl_object *obj, + struct cl_page *page, + struct cl_lock *except, + int pending, int canceld) +{ + LASSERT(cl_object_header(obj) == cl_object_header(page->cp_obj)); + return cl_lock_at_pgoff(env, obj, page->cp_index, except, + pending, canceld); +} + +const struct cl_lock_slice *cl_lock_at(const struct cl_lock *lock, + const struct lu_device_type *dtype); + +void cl_lock_get (struct cl_lock *lock); +void cl_lock_get_trust (struct cl_lock *lock); +void cl_lock_put (const struct lu_env *env, struct cl_lock *lock); +void cl_lock_hold_add (const struct lu_env *env, struct cl_lock *lock, + const char *scope, const void *source); +void cl_lock_hold_release(const struct lu_env *env, struct cl_lock *lock, + const char *scope, const void *source); +void cl_lock_unhold (const struct lu_env *env, struct cl_lock *lock, + const char *scope, const void *source); +void cl_lock_release (const struct lu_env *env, struct cl_lock *lock, + const char *scope, 
const void *source); +void cl_lock_user_add (const struct lu_env *env, struct cl_lock *lock); +void cl_lock_user_del (const struct lu_env *env, struct cl_lock *lock); + +enum cl_lock_state cl_lock_intransit(const struct lu_env *env, + struct cl_lock *lock); +void cl_lock_extransit(const struct lu_env *env, struct cl_lock *lock, + enum cl_lock_state state); +int cl_lock_is_intransit(struct cl_lock *lock); + +int cl_lock_enqueue_wait(const struct lu_env *env, struct cl_lock *lock, + int keep_mutex); + +/** \name statemachine statemachine + * Interface to lock state machine consists of 3 parts: + * + * - "try" functions that attempt to effect a state transition. If state + * transition is not possible right now (e.g., if it has to wait for some + * asynchronous event to occur), these functions return + * cl_lock_transition::CLO_WAIT. + * + * - "non-try" functions that implement synchronous blocking interface on + * top of non-blocking "try" functions. These functions repeatedly call + * corresponding "try" versions, and if state transition is not possible + * immediately, wait for lock state change. + * + * - methods from cl_lock_operations, called by "try" functions. Lock can + * be advanced to the target state only when all layers voted that they + * are ready for this transition. "Try" functions call methods under lock + * mutex. If a layer had to release a mutex, it re-acquires it and returns + * cl_lock_transition::CLO_REPEAT, causing "try" function to call all + * layers again. + * + * TRY NON-TRY METHOD FINAL STATE + * + * cl_enqueue_try() cl_enqueue() cl_lock_operations::clo_enqueue() CLS_ENQUEUED + * + * cl_wait_try() cl_wait() cl_lock_operations::clo_wait() CLS_HELD + * + * cl_unuse_try() cl_unuse() cl_lock_operations::clo_unuse() CLS_CACHED + * + * cl_use_try() NONE cl_lock_operations::clo_use() CLS_HELD + * + * @{ */ + +int cl_enqueue (const struct lu_env *env, struct cl_lock *lock, + struct cl_io *io, __u32 flags); +int cl_wait (const struct lu_env *env, struct cl_lock *lock); +void cl_unuse (const struct lu_env *env, struct cl_lock *lock); +int cl_enqueue_try(const struct lu_env *env, struct cl_lock *lock, + struct cl_io *io, __u32 flags); +int cl_unuse_try (const struct lu_env *env, struct cl_lock *lock); +int cl_wait_try (const struct lu_env *env, struct cl_lock *lock); +int cl_use_try (const struct lu_env *env, struct cl_lock *lock, int atomic); + +/** @} statemachine */ + +void cl_lock_signal (const struct lu_env *env, struct cl_lock *lock); +int cl_lock_state_wait (const struct lu_env *env, struct cl_lock *lock); +void cl_lock_state_set (const struct lu_env *env, struct cl_lock *lock, + enum cl_lock_state state); +int cl_queue_match (const struct list_head *queue, + const struct cl_lock_descr *need); + +void cl_lock_mutex_get (const struct lu_env *env, struct cl_lock *lock); +int cl_lock_mutex_try (const struct lu_env *env, struct cl_lock *lock); +void cl_lock_mutex_put (const struct lu_env *env, struct cl_lock *lock); +int cl_lock_is_mutexed (struct cl_lock *lock); +int cl_lock_nr_mutexed (const struct lu_env *env); +int cl_lock_discard_pages(const struct lu_env *env, struct cl_lock *lock); +int cl_lock_ext_match (const struct cl_lock_descr *has, + const struct cl_lock_descr *need); +int cl_lock_descr_match(const struct cl_lock_descr *has, + const struct cl_lock_descr *need); +int cl_lock_mode_match (enum cl_lock_mode has, enum cl_lock_mode need); +int cl_lock_modify (const struct lu_env *env, struct cl_lock *lock, + const struct cl_lock_descr *desc); + +void 
cl_lock_closure_init (const struct lu_env *env, + struct cl_lock_closure *closure, + struct cl_lock *origin, int wait); +void cl_lock_closure_fini (struct cl_lock_closure *closure); +int cl_lock_closure_build(const struct lu_env *env, struct cl_lock *lock, + struct cl_lock_closure *closure); +void cl_lock_disclosure (const struct lu_env *env, + struct cl_lock_closure *closure); +int cl_lock_enclosure (const struct lu_env *env, struct cl_lock *lock, + struct cl_lock_closure *closure); + +void cl_lock_cancel(const struct lu_env *env, struct cl_lock *lock); +void cl_lock_delete(const struct lu_env *env, struct cl_lock *lock); +void cl_lock_error (const struct lu_env *env, struct cl_lock *lock, int error); +void cl_locks_prune(const struct lu_env *env, struct cl_object *obj, int wait); + +unsigned long cl_lock_weigh(const struct lu_env *env, struct cl_lock *lock); + +/** @} cl_lock */ + +/** \defgroup cl_io cl_io + * @{ */ + +int cl_io_init (const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, struct cl_object *obj); +int cl_io_sub_init (const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, struct cl_object *obj); +int cl_io_rw_init (const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, loff_t pos, size_t count); +int cl_io_loop (const struct lu_env *env, struct cl_io *io); + +void cl_io_fini (const struct lu_env *env, struct cl_io *io); +int cl_io_iter_init (const struct lu_env *env, struct cl_io *io); +void cl_io_iter_fini (const struct lu_env *env, struct cl_io *io); +int cl_io_lock (const struct lu_env *env, struct cl_io *io); +void cl_io_unlock (const struct lu_env *env, struct cl_io *io); +int cl_io_start (const struct lu_env *env, struct cl_io *io); +void cl_io_end (const struct lu_env *env, struct cl_io *io); +int cl_io_lock_add (const struct lu_env *env, struct cl_io *io, + struct cl_io_lock_link *link); +int cl_io_lock_alloc_add(const struct lu_env *env, struct cl_io *io, + struct cl_lock_descr *descr); +int cl_io_read_page (const struct lu_env *env, struct cl_io *io, + struct cl_page *page); +int cl_io_prepare_write(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, unsigned from, unsigned to); +int cl_io_commit_write (const struct lu_env *env, struct cl_io *io, + struct cl_page *page, unsigned from, unsigned to); +int cl_io_submit_rw (const struct lu_env *env, struct cl_io *io, + enum cl_req_type iot, struct cl_2queue *queue); +int cl_io_submit_sync (const struct lu_env *env, struct cl_io *io, + enum cl_req_type iot, struct cl_2queue *queue, + long timeout); +void cl_io_rw_advance (const struct lu_env *env, struct cl_io *io, + size_t nob); +int cl_io_cancel (const struct lu_env *env, struct cl_io *io, + struct cl_page_list *queue); +int cl_io_is_going (const struct lu_env *env); + +/** + * True, iff \a io is an O_APPEND write(2). + */ +static inline int cl_io_is_append(const struct cl_io *io) +{ + return io->ci_type == CIT_WRITE && io->u.ci_wr.wr_append; +} + +static inline int cl_io_is_sync_write(const struct cl_io *io) +{ + return io->ci_type == CIT_WRITE && io->u.ci_wr.wr_sync; +} + +static inline int cl_io_is_mkwrite(const struct cl_io *io) +{ + return io->ci_type == CIT_FAULT && io->u.ci_fault.ft_mkwrite; +} + +/** + * True, iff \a io is a truncate(2). 
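+ *
+ * Layers that must treat truncation specially can branch on this
+ * predicate; the target attributes (including the new size) travel in
+ * io->u.ci_setattr.sa_attr, with the valid bits in sa_valid. An
+ * illustrative check, using a hypothetical helper name:
+ *
+ *	if (cl_io_is_trunc(io))
+ *		layer_prepare_truncate(env, io);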
+ */ +static inline int cl_io_is_trunc(const struct cl_io *io) +{ + return io->ci_type == CIT_SETATTR && + (io->u.ci_setattr.sa_valid & ATTR_SIZE); +} + +struct cl_io *cl_io_top(struct cl_io *io); + +void cl_io_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct cl_io *io); + +#define CL_IO_SLICE_CLEAN(foo_io, base) \ +do { \ + typeof(foo_io) __foo_io = (foo_io); \ + \ + CLASSERT(offsetof(typeof(*__foo_io), base) == 0); \ + memset(&__foo_io->base + 1, 0, \ + sizeof(*__foo_io) - sizeof(__foo_io->base)); \ +} while (0) + +/** @} cl_io */ + +/** \defgroup cl_page_list cl_page_list + * @{ */ + +/** + * Last page in the page list. + */ +static inline struct cl_page *cl_page_list_last(struct cl_page_list *plist) +{ + LASSERT(plist->pl_nr > 0); + return list_entry(plist->pl_pages.prev, struct cl_page, cp_batch); +} + +/** + * Iterate over pages in a page list. + */ +#define cl_page_list_for_each(page, list) \ + list_for_each_entry((page), &(list)->pl_pages, cp_batch) + +/** + * Iterate over pages in a page list, taking possible removals into account. + */ +#define cl_page_list_for_each_safe(page, temp, list) \ + list_for_each_entry_safe((page), (temp), &(list)->pl_pages, cp_batch) + +void cl_page_list_init (struct cl_page_list *plist); +void cl_page_list_add (struct cl_page_list *plist, struct cl_page *page); +void cl_page_list_move (struct cl_page_list *dst, struct cl_page_list *src, + struct cl_page *page); +void cl_page_list_splice (struct cl_page_list *list, + struct cl_page_list *head); +void cl_page_list_del (const struct lu_env *env, + struct cl_page_list *plist, struct cl_page *page); +void cl_page_list_disown (const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist); +int cl_page_list_own (const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist); +void cl_page_list_assume (const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist); +void cl_page_list_discard(const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist); +int cl_page_list_unmap (const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist); +void cl_page_list_fini (const struct lu_env *env, struct cl_page_list *plist); + +void cl_2queue_init (struct cl_2queue *queue); +void cl_2queue_add (struct cl_2queue *queue, struct cl_page *page); +void cl_2queue_disown (const struct lu_env *env, + struct cl_io *io, struct cl_2queue *queue); +void cl_2queue_assume (const struct lu_env *env, + struct cl_io *io, struct cl_2queue *queue); +void cl_2queue_discard (const struct lu_env *env, + struct cl_io *io, struct cl_2queue *queue); +void cl_2queue_fini (const struct lu_env *env, struct cl_2queue *queue); +void cl_2queue_init_page(struct cl_2queue *queue, struct cl_page *page); + +/** @} cl_page_list */ + +/** \defgroup cl_req cl_req + * @{ */ +struct cl_req *cl_req_alloc(const struct lu_env *env, struct cl_page *page, + enum cl_req_type crt, int nr_objects); + +void cl_req_page_add (const struct lu_env *env, struct cl_req *req, + struct cl_page *page); +void cl_req_page_done (const struct lu_env *env, struct cl_page *page); +int cl_req_prep (const struct lu_env *env, struct cl_req *req); +void cl_req_attr_set (const struct lu_env *env, struct cl_req *req, + struct cl_req_attr *attr, u64 flags); +void cl_req_completion(const struct lu_env *env, struct cl_req *req, int ioret); + +/** \defgroup cl_sync_io cl_sync_io + * @{ */ + +/** + * Anchor for synchronous transfer. 
+ * This is allocated on the stack by a thread doing a synchronous transfer,
+ * and a pointer to this structure is set up in every page submitted for
+ * transfer. The transfer completion routine updates the anchor and wakes up
+ * the waiting thread when the transfer is complete.
+ */
+struct cl_sync_io {
+	/** number of pages yet to be transferred. */
+	atomic_t		csi_sync_nr;
+	/** error code. */
+	int			csi_sync_rc;
+	/** barrier for destroying this structure */
+	atomic_t		csi_barrier;
+	/** completion to be signaled when transfer is complete. */
+	wait_queue_head_t	csi_waitq;
+};
+
+void cl_sync_io_init(struct cl_sync_io *anchor, int nrpages);
+int cl_sync_io_wait(const struct lu_env *env, struct cl_io *io,
+		    struct cl_page_list *queue, struct cl_sync_io *anchor,
+		    long timeout);
+void cl_sync_io_note(struct cl_sync_io *anchor, int ioret);
+
+/** @} cl_sync_io */
+
+/** @} cl_req */
+
+/** \defgroup cl_env cl_env
+ *
+ * lu_env handling for a client.
+ *
+ * lu_env is an environment within which lustre code executes. Its major part
+ * is lu_context---a fast memory allocation mechanism that is used to conserve
+ * precious kernel stack space. Originally lu_env was designed for a server,
+ * where
+ *
+ * - there is a (mostly) fixed number of threads, and
+ *
+ * - call chains have no non-lustre portions inserted between lustre code.
+ *
+ * On a client both of these assumptions fail, because every user thread can
+ * potentially execute lustre code as part of a system call, and lustre calls
+ * into VFS or MM that call back into lustre.
+ *
+ * To deal with that, cl_env wrapper functions implement the following
+ * optimizations:
+ *
+ * - allocation and destruction of environment is amortized by caching no
+ *   longer used environments instead of destroying them;
+ *
+ * - there is a notion of "current" environment, attached to the kernel
+ *   data structure representing the current thread. Top-level lustre code
+ *   allocates an environment and makes it current, then calls into
+ *   non-lustre code, that in turn calls lustre back. Low-level lustre
+ *   code thus called can fetch the environment created by the top-level code
+ *   and reuse it, avoiding additional environment allocation.
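+ *
+ * A minimal client-side usage sketch (error handling elided; this is
+ * illustrative, not a prescription):
+ *
+ *	int refcheck;
+ *	struct lu_env *env;
+ *
+ *	env = cl_env_get(&refcheck);
+ *	if (!IS_ERR(env)) {
+ *		... call cl_* interfaces that take \a env ...
+ *		cl_env_put(env, &refcheck);
+ *	}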
+ * Right now, three interfaces can attach the cl_env to running thread: + * - cl_env_get + * - cl_env_implant + * - cl_env_reexit(cl_env_reenter had to be called priorly) + * + * \see lu_env, lu_context, lu_context_key + * @{ */ + +struct cl_env_nest { + int cen_refcheck; + void *cen_cookie; +}; + +struct lu_env *cl_env_peek (int *refcheck); +struct lu_env *cl_env_get (int *refcheck); +struct lu_env *cl_env_alloc (int *refcheck, __u32 tags); +struct lu_env *cl_env_nested_get (struct cl_env_nest *nest); +void cl_env_put (struct lu_env *env, int *refcheck); +void cl_env_nested_put (struct cl_env_nest *nest, struct lu_env *env); +void *cl_env_reenter (void); +void cl_env_reexit (void *cookie); +void cl_env_implant (struct lu_env *env, int *refcheck); +void cl_env_unplant (struct lu_env *env, int *refcheck); + +/** @} cl_env */ + +/* + * Misc + */ +void cl_attr2lvb(struct ost_lvb *lvb, const struct cl_attr *attr); +void cl_lvb2attr(struct cl_attr *attr, const struct ost_lvb *lvb); + +struct cl_device *cl_type_setup(const struct lu_env *env, struct lu_site *site, + struct lu_device_type *ldt, + struct lu_device *next); +/** @} clio */ + +int cl_global_init(void); +void cl_global_fini(void); + +#endif /* _LINUX_CL_OBJECT_H */ diff --git a/kernel/drivers/staging/lustre/lustre/include/dt_object.h b/kernel/drivers/staging/lustre/lustre/include/dt_object.h new file mode 100644 index 000000000..be4c7d95e --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/dt_object.h @@ -0,0 +1,1499 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LUSTRE_DT_OBJECT_H +#define __LUSTRE_DT_OBJECT_H + +/** \defgroup dt dt + * Sub-class of lu_object with methods common for "data" objects in OST stack. + * + * Data objects behave like regular files: you can read/write them, get and + * set their attributes. Implementation of dt interface is supposed to + * implement some form of garbage collection, normally reference counting + * (nlink) based one. + * + * Examples: osd (lustre/osd) is an implementation of dt interface. + * @{ + */ + + +/* + * super-class definitions. 
+ */ +#include "lu_object.h" + +#include "../../include/linux/libcfs/libcfs.h" + +struct seq_file; +struct proc_dir_entry; +struct lustre_cfg; + +struct thandle; +struct dt_device; +struct dt_object; +struct dt_index_features; +struct niobuf_local; +struct niobuf_remote; +struct ldlm_enqueue_info; + +typedef enum { + MNTOPT_USERXATTR = 0x00000001, + MNTOPT_ACL = 0x00000002, +} mntopt_t; + +struct dt_device_param { + unsigned ddp_max_name_len; + unsigned ddp_max_nlink; + unsigned ddp_block_shift; + mntopt_t ddp_mntopts; + unsigned ddp_max_ea_size; + void *ddp_mnt; /* XXX: old code can retrieve mnt -bzzz */ + int ddp_mount_type; + unsigned long long ddp_maxbytes; + /* percentage of available space to reserve for grant error margin */ + int ddp_grant_reserved; + /* per-inode space consumption */ + short ddp_inodespace; + /* per-fragment grant overhead to be used by client for grant + * calculation */ + int ddp_grant_frag; +}; + +/** + * Per-transaction commit callback function + */ +struct dt_txn_commit_cb; +typedef void (*dt_cb_t)(struct lu_env *env, struct thandle *th, + struct dt_txn_commit_cb *cb, int err); +/** + * Special per-transaction callback for cases when just commit callback + * is needed and per-device callback are not convenient to use + */ +#define TRANS_COMMIT_CB_MAGIC 0xa0a00a0a +#define MAX_COMMIT_CB_STR_LEN 32 + +struct dt_txn_commit_cb { + struct list_head dcb_linkage; + dt_cb_t dcb_func; + __u32 dcb_magic; + char dcb_name[MAX_COMMIT_CB_STR_LEN]; +}; + +/** + * Operations on dt device. + */ +struct dt_device_operations { + /** + * Return device-wide statistics. + */ + int (*dt_statfs)(const struct lu_env *env, + struct dt_device *dev, struct obd_statfs *osfs); + /** + * Create transaction, described by \a param. + */ + struct thandle *(*dt_trans_create)(const struct lu_env *env, + struct dt_device *dev); + /** + * Start transaction, described by \a param. + */ + int (*dt_trans_start)(const struct lu_env *env, + struct dt_device *dev, struct thandle *th); + /** + * Finish previously started transaction. + */ + int (*dt_trans_stop)(const struct lu_env *env, + struct thandle *th); + /** + * Add commit callback to the transaction. + */ + int (*dt_trans_cb_add)(struct thandle *th, + struct dt_txn_commit_cb *dcb); + /** + * Return fid of root index object. + */ + int (*dt_root_get)(const struct lu_env *env, + struct dt_device *dev, struct lu_fid *f); + /** + * Return device configuration data. + */ + void (*dt_conf_get)(const struct lu_env *env, + const struct dt_device *dev, + struct dt_device_param *param); + /** + * handling device state, mostly for tests + */ + int (*dt_sync)(const struct lu_env *env, struct dt_device *dev); + int (*dt_ro)(const struct lu_env *env, struct dt_device *dev); + /** + * Start a transaction commit asynchronously + * + * \param env environment + * \param dev dt_device to start commit on + * + * \return 0 success, negative value if error + */ + int (*dt_commit_async)(const struct lu_env *env, + struct dt_device *dev); + /** + * Initialize capability context. 
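+ *
+ * (Editor's note on the transaction methods declared earlier in this
+ * structure, not on ->dt_init_capa_ctxt itself: the sketch below is
+ * illustrative only. It uses the dt_trans_create()/dt_trans_start()/
+ * dt_trans_stop() and dt_declare_record_write()/dt_record_write() helpers
+ * defined later in this header; "obj", "buf", "size" and "pos" are
+ * placeholder names, the usual ERR_PTR() convention is assumed for
+ * dt_trans_create(), and error handling is simplified. All changes are
+ * declared before the transaction is started.)
+ *
+ *	th = dt_trans_create(env, dev);
+ *	if (IS_ERR(th))
+ *		return PTR_ERR(th);
+ *	rc = dt_declare_record_write(env, obj, size, pos, th);
+ *	if (rc == 0)
+ *		rc = dt_trans_start(env, dev, th);
+ *	if (rc == 0)
+ *		rc = dt_record_write(env, obj, buf, &pos, th);
+ *	dt_trans_stop(env, dev, th);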
+ */ + int (*dt_init_capa_ctxt)(const struct lu_env *env, + struct dt_device *dev, + int mode, unsigned long timeout, + __u32 alg, struct lustre_capa_key *keys); +}; + +struct dt_index_features { + /** required feature flags from enum dt_index_flags */ + __u32 dif_flags; + /** minimal required key size */ + size_t dif_keysize_min; + /** maximal required key size, 0 if no limit */ + size_t dif_keysize_max; + /** minimal required record size */ + size_t dif_recsize_min; + /** maximal required record size, 0 if no limit */ + size_t dif_recsize_max; + /** pointer size for record */ + size_t dif_ptrsize; +}; + +enum dt_index_flags { + /** index supports variable sized keys */ + DT_IND_VARKEY = 1 << 0, + /** index supports variable sized records */ + DT_IND_VARREC = 1 << 1, + /** index can be modified */ + DT_IND_UPDATE = 1 << 2, + /** index supports records with non-unique (duplicate) keys */ + DT_IND_NONUNQ = 1 << 3, + /** + * index support fixed-size keys sorted with natural numerical way + * and is able to return left-side value if no exact value found + */ + DT_IND_RANGE = 1 << 4, +}; + +/** + * Features, required from index to support file system directories (mapping + * names to fids). + */ +extern const struct dt_index_features dt_directory_features; +extern const struct dt_index_features dt_otable_features; +extern const struct dt_index_features dt_lfsck_features; + +/* index features supported by the accounting objects */ +extern const struct dt_index_features dt_acct_features; + +/* index features supported by the quota global indexes */ +extern const struct dt_index_features dt_quota_glb_features; + +/* index features supported by the quota slave indexes */ +extern const struct dt_index_features dt_quota_slv_features; + +/** + * This is a general purpose dt allocation hint. + * It now contains the parent object. + * It can contain any allocation hint in the future. + */ +struct dt_allocation_hint { + struct dt_object *dah_parent; + __u32 dah_mode; +}; + +/** + * object type specifier. + */ + +enum dt_format_type { + DFT_REGULAR, + DFT_DIR, + /** for mknod */ + DFT_NODE, + /** for special index */ + DFT_INDEX, + /** for symbolic link */ + DFT_SYM, +}; + +/** + * object format specifier. + */ +struct dt_object_format { + /** type for dt object */ + enum dt_format_type dof_type; + union { + struct dof_regular { + int striped; + } dof_reg; + struct dof_dir { + } dof_dir; + struct dof_node { + } dof_node; + /** + * special index need feature as parameter to create + * special idx + */ + struct dof_index { + const struct dt_index_features *di_feat; + } dof_idx; + } u; +}; + +enum dt_format_type dt_mode_to_dft(__u32 mode); + +typedef __u64 dt_obj_version_t; + +/** + * Per-dt-object operations. + */ +struct dt_object_operations { + void (*do_read_lock)(const struct lu_env *env, + struct dt_object *dt, unsigned role); + void (*do_write_lock)(const struct lu_env *env, + struct dt_object *dt, unsigned role); + void (*do_read_unlock)(const struct lu_env *env, + struct dt_object *dt); + void (*do_write_unlock)(const struct lu_env *env, + struct dt_object *dt); + int (*do_write_locked)(const struct lu_env *env, + struct dt_object *dt); + /** + * Note: following ->do_{x,}attr_{set,get}() operations are very + * similar to ->moo_{x,}attr_{set,get}() operations in struct + * md_object_operations (see md_object.h). These operations are not in + * lu_object_operations, because ->do_{x,}attr_set() versions take + * transaction handle as an argument (this transaction is started by + * caller). 
We might factor ->do_{x,}attr_get() into + * lu_object_operations, but that would break existing symmetry. + */ + + /** + * Return standard attributes. + * + * precondition: lu_object_exists(&dt->do_lu); + */ + int (*do_attr_get)(const struct lu_env *env, + struct dt_object *dt, struct lu_attr *attr, + struct lustre_capa *capa); + /** + * Set standard attributes. + * + * precondition: dt_object_exists(dt); + */ + int (*do_declare_attr_set)(const struct lu_env *env, + struct dt_object *dt, + const struct lu_attr *attr, + struct thandle *handle); + int (*do_attr_set)(const struct lu_env *env, + struct dt_object *dt, + const struct lu_attr *attr, + struct thandle *handle, + struct lustre_capa *capa); + /** + * Return a value of an extended attribute. + * + * precondition: dt_object_exists(dt); + */ + int (*do_xattr_get)(const struct lu_env *env, struct dt_object *dt, + struct lu_buf *buf, const char *name, + struct lustre_capa *capa); + /** + * Set value of an extended attribute. + * + * \a fl - flags from enum lu_xattr_flags + * + * precondition: dt_object_exists(dt); + */ + int (*do_declare_xattr_set)(const struct lu_env *env, + struct dt_object *dt, + const struct lu_buf *buf, + const char *name, int fl, + struct thandle *handle); + int (*do_xattr_set)(const struct lu_env *env, + struct dt_object *dt, const struct lu_buf *buf, + const char *name, int fl, struct thandle *handle, + struct lustre_capa *capa); + /** + * Delete existing extended attribute. + * + * precondition: dt_object_exists(dt); + */ + int (*do_declare_xattr_del)(const struct lu_env *env, + struct dt_object *dt, + const char *name, struct thandle *handle); + int (*do_xattr_del)(const struct lu_env *env, + struct dt_object *dt, + const char *name, struct thandle *handle, + struct lustre_capa *capa); + /** + * Place list of existing extended attributes into \a buf (which has + * length len). + * + * precondition: dt_object_exists(dt); + */ + int (*do_xattr_list)(const struct lu_env *env, + struct dt_object *dt, struct lu_buf *buf, + struct lustre_capa *capa); + /** + * Init allocation hint using parent object and child mode. + * (1) The \a parent might be NULL if this is a partial creation for + * remote object. + * (2) The type of child is in \a child_mode. + * (3) The result hint is stored in \a ah; + */ + void (*do_ah_init)(const struct lu_env *env, + struct dt_allocation_hint *ah, + struct dt_object *parent, + struct dt_object *child, + umode_t child_mode); + /** + * Create new object on this device. + * + * precondition: !dt_object_exists(dt); + * postcondition: ergo(result == 0, dt_object_exists(dt)); + */ + int (*do_declare_create)(const struct lu_env *env, + struct dt_object *dt, + struct lu_attr *attr, + struct dt_allocation_hint *hint, + struct dt_object_format *dof, + struct thandle *th); + int (*do_create)(const struct lu_env *env, struct dt_object *dt, + struct lu_attr *attr, + struct dt_allocation_hint *hint, + struct dt_object_format *dof, + struct thandle *th); + + /** + Destroy object on this device + * precondition: !dt_object_exists(dt); + * postcondition: ergo(result == 0, dt_object_exists(dt)); + */ + int (*do_declare_destroy)(const struct lu_env *env, + struct dt_object *dt, + struct thandle *th); + int (*do_destroy)(const struct lu_env *env, struct dt_object *dt, + struct thandle *th); + + /** + * Announce that this object is going to be used as an index. This + * operation check that object supports indexing operations and + * installs appropriate dt_index_operations vector on success. 
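+ *
+ * (Editor's aside, illustrative only: a caller typically probes for index
+ * support before using the index methods, as sketched below; dt_lookup() is
+ * the wrapper defined later in this header, and "rec", "key" and "capa" are
+ * placeholder names.)
+ *
+ *	rc = dt->do_ops->do_index_try(env, dt, &dt_directory_features);
+ *	if (rc == 0)
+ *		rc = dt_lookup(env, dt, rec, key, capa);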
+ * + * Also probes for features. Operation is successful if all required + * features are supported. + */ + int (*do_index_try)(const struct lu_env *env, + struct dt_object *dt, + const struct dt_index_features *feat); + /** + * Add nlink of the object + * precondition: dt_object_exists(dt); + */ + int (*do_declare_ref_add)(const struct lu_env *env, + struct dt_object *dt, struct thandle *th); + int (*do_ref_add)(const struct lu_env *env, + struct dt_object *dt, struct thandle *th); + /** + * Del nlink of the object + * precondition: dt_object_exists(dt); + */ + int (*do_declare_ref_del)(const struct lu_env *env, + struct dt_object *dt, struct thandle *th); + int (*do_ref_del)(const struct lu_env *env, + struct dt_object *dt, struct thandle *th); + + struct obd_capa *(*do_capa_get)(const struct lu_env *env, + struct dt_object *dt, + struct lustre_capa *old, + __u64 opc); + int (*do_object_sync)(const struct lu_env *env, struct dt_object *obj, + __u64 start, __u64 end); + /** + * Get object info of next level. Currently, only get inode from osd. + * This is only used by quota b=16542 + * precondition: dt_object_exists(dt); + */ + int (*do_data_get)(const struct lu_env *env, struct dt_object *dt, + void **data); + + /** + * Lock object. + */ + int (*do_object_lock)(const struct lu_env *env, struct dt_object *dt, + struct lustre_handle *lh, + struct ldlm_enqueue_info *einfo, + void *policy); +}; + +/** + * Per-dt-object operations on "file body". + */ +struct dt_body_operations { + /** + * precondition: dt_object_exists(dt); + */ + ssize_t (*dbo_read)(const struct lu_env *env, struct dt_object *dt, + struct lu_buf *buf, loff_t *pos, + struct lustre_capa *capa); + /** + * precondition: dt_object_exists(dt); + */ + ssize_t (*dbo_declare_write)(const struct lu_env *env, + struct dt_object *dt, + const loff_t size, loff_t pos, + struct thandle *handle); + ssize_t (*dbo_write)(const struct lu_env *env, struct dt_object *dt, + const struct lu_buf *buf, loff_t *pos, + struct thandle *handle, struct lustre_capa *capa, + int ignore_quota); + /* + * methods for zero-copy IO + */ + + /* + * precondition: dt_object_exists(dt); + * returns: + * < 0 - error code + * = 0 - illegal + * > 0 - number of local buffers prepared + */ + int (*dbo_bufs_get)(const struct lu_env *env, struct dt_object *dt, + loff_t pos, ssize_t len, struct niobuf_local *lb, + int rw, struct lustre_capa *capa); + /* + * precondition: dt_object_exists(dt); + */ + int (*dbo_bufs_put)(const struct lu_env *env, struct dt_object *dt, + struct niobuf_local *lb, int nr); + /* + * precondition: dt_object_exists(dt); + */ + int (*dbo_write_prep)(const struct lu_env *env, struct dt_object *dt, + struct niobuf_local *lb, int nr); + /* + * precondition: dt_object_exists(dt); + */ + int (*dbo_declare_write_commit)(const struct lu_env *env, + struct dt_object *dt, + struct niobuf_local *, + int, struct thandle *); + /* + * precondition: dt_object_exists(dt); + */ + int (*dbo_write_commit)(const struct lu_env *env, struct dt_object *dt, + struct niobuf_local *, int, struct thandle *); + /* + * precondition: dt_object_exists(dt); + */ + int (*dbo_read_prep)(const struct lu_env *env, struct dt_object *dt, + struct niobuf_local *lnb, int nr); + int (*dbo_fiemap_get)(const struct lu_env *env, struct dt_object *dt, + struct ll_user_fiemap *fm); + /** + * Punch object's content + * precondition: regular object, not index + */ + int (*dbo_declare_punch)(const struct lu_env *, struct dt_object *, + __u64, __u64, struct thandle *th); + int 
(*dbo_punch)(const struct lu_env *env, struct dt_object *dt, + __u64 start, __u64 end, struct thandle *th, + struct lustre_capa *capa); +}; + +/** + * Incomplete type of index record. + */ +struct dt_rec; + +/** + * Incomplete type of index key. + */ +struct dt_key; + +/** + * Incomplete type of dt iterator. + */ +struct dt_it; + +/** + * Per-dt-object operations on object as index. + */ +struct dt_index_operations { + /** + * precondition: dt_object_exists(dt); + */ + int (*dio_lookup)(const struct lu_env *env, struct dt_object *dt, + struct dt_rec *rec, const struct dt_key *key, + struct lustre_capa *capa); + /** + * precondition: dt_object_exists(dt); + */ + int (*dio_declare_insert)(const struct lu_env *env, + struct dt_object *dt, + const struct dt_rec *rec, + const struct dt_key *key, + struct thandle *handle); + int (*dio_insert)(const struct lu_env *env, struct dt_object *dt, + const struct dt_rec *rec, const struct dt_key *key, + struct thandle *handle, struct lustre_capa *capa, + int ignore_quota); + /** + * precondition: dt_object_exists(dt); + */ + int (*dio_declare_delete)(const struct lu_env *env, + struct dt_object *dt, + const struct dt_key *key, + struct thandle *handle); + int (*dio_delete)(const struct lu_env *env, struct dt_object *dt, + const struct dt_key *key, struct thandle *handle, + struct lustre_capa *capa); + /** + * Iterator interface + */ + struct dt_it_ops { + /** + * Allocate and initialize new iterator. + * + * precondition: dt_object_exists(dt); + */ + struct dt_it *(*init)(const struct lu_env *env, + struct dt_object *dt, + __u32 attr, + struct lustre_capa *capa); + void (*fini)(const struct lu_env *env, + struct dt_it *di); + int (*get)(const struct lu_env *env, + struct dt_it *di, + const struct dt_key *key); + void (*put)(const struct lu_env *env, + struct dt_it *di); + int (*next)(const struct lu_env *env, + struct dt_it *di); + struct dt_key *(*key)(const struct lu_env *env, + const struct dt_it *di); + int (*key_size)(const struct lu_env *env, + const struct dt_it *di); + int (*rec)(const struct lu_env *env, + const struct dt_it *di, + struct dt_rec *rec, + __u32 attr); + __u64 (*store)(const struct lu_env *env, + const struct dt_it *di); + int (*load)(const struct lu_env *env, + const struct dt_it *di, __u64 hash); + int (*key_rec)(const struct lu_env *env, + const struct dt_it *di, void *key_rec); + } dio_it; +}; + +enum dt_otable_it_valid { + DOIV_ERROR_HANDLE = 0x0001, +}; + +enum dt_otable_it_flags { + /* Exit when fail. */ + DOIF_FAILOUT = 0x0001, + + /* Reset iteration position to the device beginning. */ + DOIF_RESET = 0x0002, + + /* There is up layer component uses the iteration. */ + DOIF_OUTUSED = 0x0004, +}; + +/* otable based iteration needs to use the common DT interation APIs. + * To initialize the iteration, it needs call dio_it::init() firstly. + * Here is how the otable based iteration should prepare arguments to + * call dt_it_ops::init(). + * + * For otable based iteration, the 32-bits 'attr' for dt_it_ops::init() + * is composed of two parts: + * low 16-bits is for valid bits, high 16-bits is for flags bits. */ +#define DT_OTABLE_IT_FLAGS_SHIFT 16 +#define DT_OTABLE_IT_FLAGS_MASK 0xffff0000 + +struct dt_device { + struct lu_device dd_lu_dev; + const struct dt_device_operations *dd_ops; + + /** + * List of dt_txn_callback (see below). This is not protected in any + * way, because callbacks are supposed to be added/deleted only during + * single-threaded start-up shut-down procedures. 
+ */ + struct list_head dd_txn_callbacks; +}; + +int dt_device_init(struct dt_device *dev, struct lu_device_type *t); +void dt_device_fini(struct dt_device *dev); + +static inline int lu_device_is_dt(const struct lu_device *d) +{ + return ergo(d != NULL, d->ld_type->ldt_tags & LU_DEVICE_DT); +} + +static inline struct dt_device *lu2dt_dev(struct lu_device *l) +{ + LASSERT(lu_device_is_dt(l)); + return container_of0(l, struct dt_device, dd_lu_dev); +} + +struct dt_object { + struct lu_object do_lu; + const struct dt_object_operations *do_ops; + const struct dt_body_operations *do_body_ops; + const struct dt_index_operations *do_index_ops; +}; + +/* + * In-core representation of per-device local object OID storage + */ +struct local_oid_storage { + /* all initialized llog systems on this node linked by this */ + struct list_head los_list; + + /* how many handle's reference this los has */ + atomic_t los_refcount; + struct dt_device *los_dev; + struct dt_object *los_obj; + + /* data used to generate new fids */ + struct mutex los_id_lock; + __u64 los_seq; + __u32 los_last_oid; +}; + +static inline struct dt_object *lu2dt(struct lu_object *l) +{ + LASSERT(l == NULL || IS_ERR(l) || lu_device_is_dt(l->lo_dev)); + return container_of0(l, struct dt_object, do_lu); +} + +int dt_object_init(struct dt_object *obj, + struct lu_object_header *h, struct lu_device *d); + +void dt_object_fini(struct dt_object *obj); + +static inline int dt_object_exists(const struct dt_object *dt) +{ + return lu_object_exists(&dt->do_lu); +} + +static inline int dt_object_remote(const struct dt_object *dt) +{ + return lu_object_remote(&dt->do_lu); +} + +static inline struct dt_object *lu2dt_obj(struct lu_object *o) +{ + LASSERT(ergo(o != NULL, lu_device_is_dt(o->lo_dev))); + return container_of0(o, struct dt_object, do_lu); +} + +/** + * This is the general purpose transaction handle. + * 1. Transaction Life Cycle + * This transaction handle is allocated upon starting a new transaction, + * and deallocated after this transaction is committed. + * 2. Transaction Nesting + * We do _NOT_ support nested transaction. So, every thread should only + * have one active transaction, and a transaction only belongs to one + * thread. Due to this, transaction handle need no reference count. + * 3. Transaction & dt_object locking + * dt_object locks should be taken inside transaction. + * 4. Transaction & RPC + * No RPC request should be issued inside transaction. + */ +struct thandle { + /** the dt device on which the transactions are executed */ + struct dt_device *th_dev; + + /** context for this transaction, tag is LCT_TX_HANDLE */ + struct lu_context th_ctx; + + /** additional tags (layers can add in declare) */ + __u32 th_tags; + + /** the last operation result in this transaction. + * this value is used in recovery */ + __s32 th_result; + + /** whether we need sync commit */ + unsigned int th_sync:1; + + /* local transation, no need to inform other layers */ + unsigned int th_local:1; + + /* In DNE, one transaction can be disassemblied into + * updates on several different MDTs, and these updates + * will be attached to th_remote_update_list per target. + * Only single thread will access the list, no need lock + */ + struct list_head th_remote_update_list; + struct update_request *th_current_request; +}; + +/** + * Transaction call-backs. + * + * These are invoked by osd (or underlying transaction engine) when + * transaction changes state. 
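+ *
+ * (Editor's sketch, illustrative only: registering a hook is a matter of
+ * filling in a struct dt_txn_callback -- defined below -- and passing it to
+ * dt_txn_callback_add(); the names used here are hypothetical.)
+ *
+ *	static int my_txn_stop(const struct lu_env *env,
+ *			       struct thandle *txn, void *cookie)
+ *	{
+ *		return 0;
+ *	}
+ *
+ *	static struct dt_txn_callback my_cb = {
+ *		.dtc_txn_stop = my_txn_stop,
+ *		.dtc_tag      = 0,
+ *	};
+ *
+ *	dt_txn_callback_add(dt_dev, &my_cb);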
+ * + * Call-backs are used by upper layers to modify transaction parameters and to + * perform some actions on for each transaction state transition. Typical + * example is mdt registering call-back to write into last-received file + * before each transaction commit. + */ +struct dt_txn_callback { + int (*dtc_txn_start)(const struct lu_env *env, + struct thandle *txn, void *cookie); + int (*dtc_txn_stop)(const struct lu_env *env, + struct thandle *txn, void *cookie); + void (*dtc_txn_commit)(struct thandle *txn, void *cookie); + void *dtc_cookie; + __u32 dtc_tag; + struct list_head dtc_linkage; +}; + +void dt_txn_callback_add(struct dt_device *dev, struct dt_txn_callback *cb); +void dt_txn_callback_del(struct dt_device *dev, struct dt_txn_callback *cb); + +int dt_txn_hook_start(const struct lu_env *env, + struct dt_device *dev, struct thandle *txn); +int dt_txn_hook_stop(const struct lu_env *env, struct thandle *txn); +void dt_txn_hook_commit(struct thandle *txn); + +int dt_try_as_dir(const struct lu_env *env, struct dt_object *obj); + +/** + * Callback function used for parsing path. + * \see llo_store_resolve + */ +typedef int (*dt_entry_func_t)(const struct lu_env *env, + const char *name, + void *pvt); + +#define DT_MAX_PATH 1024 + +int dt_path_parser(const struct lu_env *env, + char *local, dt_entry_func_t entry_func, + void *data); + +struct dt_object * +dt_store_resolve(const struct lu_env *env, struct dt_device *dt, + const char *path, struct lu_fid *fid); + +struct dt_object *dt_store_open(const struct lu_env *env, + struct dt_device *dt, + const char *dirname, + const char *filename, + struct lu_fid *fid); + +struct dt_object *dt_find_or_create(const struct lu_env *env, + struct dt_device *dt, + const struct lu_fid *fid, + struct dt_object_format *dof, + struct lu_attr *attr); + +struct dt_object *dt_locate_at(const struct lu_env *env, + struct dt_device *dev, + const struct lu_fid *fid, + struct lu_device *top_dev); +static inline struct dt_object * +dt_locate(const struct lu_env *env, struct dt_device *dev, + const struct lu_fid *fid) +{ + return dt_locate_at(env, dev, fid, dev->dd_lu_dev.ld_site->ls_top_dev); +} + + +int local_oid_storage_init(const struct lu_env *env, struct dt_device *dev, + const struct lu_fid *first_fid, + struct local_oid_storage **los); +void local_oid_storage_fini(const struct lu_env *env, + struct local_oid_storage *los); +int local_object_fid_generate(const struct lu_env *env, + struct local_oid_storage *los, + struct lu_fid *fid); +int local_object_declare_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *o, + struct lu_attr *attr, + struct dt_object_format *dof, + struct thandle *th); +int local_object_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *o, + struct lu_attr *attr, struct dt_object_format *dof, + struct thandle *th); +struct dt_object *local_file_find_or_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *parent, + const char *name, __u32 mode); +struct dt_object *local_file_find_or_create_with_fid(const struct lu_env *env, + struct dt_device *dt, + const struct lu_fid *fid, + struct dt_object *parent, + const char *name, + __u32 mode); +struct dt_object * +local_index_find_or_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *parent, + const char *name, __u32 mode, + const struct dt_index_features *ft); +struct dt_object * +local_index_find_or_create_with_fid(const struct lu_env *env, + struct 
dt_device *dt, + const struct lu_fid *fid, + struct dt_object *parent, + const char *name, __u32 mode, + const struct dt_index_features *ft); +int local_object_unlink(const struct lu_env *env, struct dt_device *dt, + struct dt_object *parent, const char *name); + +static inline int dt_object_lock(const struct lu_env *env, + struct dt_object *o, struct lustre_handle *lh, + struct ldlm_enqueue_info *einfo, + void *policy) +{ + LASSERT(o); + LASSERT(o->do_ops); + LASSERT(o->do_ops->do_object_lock); + return o->do_ops->do_object_lock(env, o, lh, einfo, policy); +} + +int dt_lookup_dir(const struct lu_env *env, struct dt_object *dir, + const char *name, struct lu_fid *fid); + +static inline int dt_object_sync(const struct lu_env *env, struct dt_object *o, + __u64 start, __u64 end) +{ + LASSERT(o); + LASSERT(o->do_ops); + LASSERT(o->do_ops->do_object_sync); + return o->do_ops->do_object_sync(env, o, start, end); +} + +int dt_declare_version_set(const struct lu_env *env, struct dt_object *o, + struct thandle *th); +void dt_version_set(const struct lu_env *env, struct dt_object *o, + dt_obj_version_t version, struct thandle *th); +dt_obj_version_t dt_version_get(const struct lu_env *env, struct dt_object *o); + + +int dt_read(const struct lu_env *env, struct dt_object *dt, + struct lu_buf *buf, loff_t *pos); +int dt_record_read(const struct lu_env *env, struct dt_object *dt, + struct lu_buf *buf, loff_t *pos); +int dt_record_write(const struct lu_env *env, struct dt_object *dt, + const struct lu_buf *buf, loff_t *pos, struct thandle *th); +typedef int (*dt_index_page_build_t)(const struct lu_env *env, + union lu_page *lp, int nob, + const struct dt_it_ops *iops, + struct dt_it *it, __u32 attr, void *arg); +int dt_index_walk(const struct lu_env *env, struct dt_object *obj, + const struct lu_rdpg *rdpg, dt_index_page_build_t filler, + void *arg); +int dt_index_read(const struct lu_env *env, struct dt_device *dev, + struct idx_info *ii, const struct lu_rdpg *rdpg); + +static inline struct thandle *dt_trans_create(const struct lu_env *env, + struct dt_device *d) +{ + LASSERT(d->dd_ops->dt_trans_create); + return d->dd_ops->dt_trans_create(env, d); +} + +static inline int dt_trans_start(const struct lu_env *env, + struct dt_device *d, struct thandle *th) +{ + LASSERT(d->dd_ops->dt_trans_start); + return d->dd_ops->dt_trans_start(env, d, th); +} + +/* for this transaction hooks shouldn't be called */ +static inline int dt_trans_start_local(const struct lu_env *env, + struct dt_device *d, struct thandle *th) +{ + LASSERT(d->dd_ops->dt_trans_start); + th->th_local = 1; + return d->dd_ops->dt_trans_start(env, d, th); +} + +static inline int dt_trans_stop(const struct lu_env *env, + struct dt_device *d, struct thandle *th) +{ + LASSERT(d->dd_ops->dt_trans_stop); + return d->dd_ops->dt_trans_stop(env, th); +} + +static inline int dt_trans_cb_add(struct thandle *th, + struct dt_txn_commit_cb *dcb) +{ + LASSERT(th->th_dev->dd_ops->dt_trans_cb_add); + dcb->dcb_magic = TRANS_COMMIT_CB_MAGIC; + return th->th_dev->dd_ops->dt_trans_cb_add(th, dcb); +} +/** @} dt */ + + +static inline int dt_declare_record_write(const struct lu_env *env, + struct dt_object *dt, + int size, loff_t pos, + struct thandle *th) +{ + int rc; + + LASSERTF(dt != NULL, "dt is NULL when we want to write record\n"); + LASSERT(th != NULL); + LASSERT(dt->do_body_ops); + LASSERT(dt->do_body_ops->dbo_declare_write); + rc = dt->do_body_ops->dbo_declare_write(env, dt, size, pos, th); + return rc; +} + +static inline int dt_declare_create(const 
struct lu_env *env, + struct dt_object *dt, + struct lu_attr *attr, + struct dt_allocation_hint *hint, + struct dt_object_format *dof, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_create); + return dt->do_ops->do_declare_create(env, dt, attr, hint, dof, th); +} + +static inline int dt_create(const struct lu_env *env, + struct dt_object *dt, + struct lu_attr *attr, + struct dt_allocation_hint *hint, + struct dt_object_format *dof, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_create); + return dt->do_ops->do_create(env, dt, attr, hint, dof, th); +} + +static inline int dt_declare_destroy(const struct lu_env *env, + struct dt_object *dt, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_destroy); + return dt->do_ops->do_declare_destroy(env, dt, th); +} + +static inline int dt_destroy(const struct lu_env *env, + struct dt_object *dt, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_destroy); + return dt->do_ops->do_destroy(env, dt, th); +} + +static inline void dt_read_lock(const struct lu_env *env, + struct dt_object *dt, + unsigned role) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_read_lock); + dt->do_ops->do_read_lock(env, dt, role); +} + +static inline void dt_write_lock(const struct lu_env *env, + struct dt_object *dt, + unsigned role) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_write_lock); + dt->do_ops->do_write_lock(env, dt, role); +} + +static inline void dt_read_unlock(const struct lu_env *env, + struct dt_object *dt) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_read_unlock); + dt->do_ops->do_read_unlock(env, dt); +} + +static inline void dt_write_unlock(const struct lu_env *env, + struct dt_object *dt) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_write_unlock); + dt->do_ops->do_write_unlock(env, dt); +} + +static inline int dt_write_locked(const struct lu_env *env, + struct dt_object *dt) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_write_locked); + return dt->do_ops->do_write_locked(env, dt); +} + +static inline int dt_attr_get(const struct lu_env *env, struct dt_object *dt, + struct lu_attr *la, void *arg) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_attr_get); + return dt->do_ops->do_attr_get(env, dt, la, arg); +} + +static inline int dt_declare_attr_set(const struct lu_env *env, + struct dt_object *dt, + const struct lu_attr *la, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_attr_set); + return dt->do_ops->do_declare_attr_set(env, dt, la, th); +} + +static inline int dt_attr_set(const struct lu_env *env, struct dt_object *dt, + const struct lu_attr *la, struct thandle *th, + struct lustre_capa *capa) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_attr_set); + return dt->do_ops->do_attr_set(env, dt, la, th, capa); +} + +static inline int dt_declare_ref_add(const struct lu_env *env, + struct dt_object *dt, struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_ref_add); + return dt->do_ops->do_declare_ref_add(env, dt, th); +} + +static inline int dt_ref_add(const struct lu_env *env, + struct dt_object *dt, struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_ref_add); + return dt->do_ops->do_ref_add(env, dt, th); +} + +static inline int 
dt_declare_ref_del(const struct lu_env *env, + struct dt_object *dt, struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_ref_del); + return dt->do_ops->do_declare_ref_del(env, dt, th); +} + +static inline int dt_ref_del(const struct lu_env *env, + struct dt_object *dt, struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_ref_del); + return dt->do_ops->do_ref_del(env, dt, th); +} + +static inline struct obd_capa *dt_capa_get(const struct lu_env *env, + struct dt_object *dt, + struct lustre_capa *old, __u64 opc) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_ref_del); + return dt->do_ops->do_capa_get(env, dt, old, opc); +} + +static inline int dt_bufs_get(const struct lu_env *env, struct dt_object *d, + struct niobuf_remote *rnb, + struct niobuf_local *lnb, int rw, + struct lustre_capa *capa) +{ + LASSERT(d); + LASSERT(d->do_body_ops); + LASSERT(d->do_body_ops->dbo_bufs_get); + return d->do_body_ops->dbo_bufs_get(env, d, rnb->offset, + rnb->len, lnb, rw, capa); +} + +static inline int dt_bufs_put(const struct lu_env *env, struct dt_object *d, + struct niobuf_local *lnb, int n) +{ + LASSERT(d); + LASSERT(d->do_body_ops); + LASSERT(d->do_body_ops->dbo_bufs_put); + return d->do_body_ops->dbo_bufs_put(env, d, lnb, n); +} + +static inline int dt_write_prep(const struct lu_env *env, struct dt_object *d, + struct niobuf_local *lnb, int n) +{ + LASSERT(d); + LASSERT(d->do_body_ops); + LASSERT(d->do_body_ops->dbo_write_prep); + return d->do_body_ops->dbo_write_prep(env, d, lnb, n); +} + +static inline int dt_declare_write_commit(const struct lu_env *env, + struct dt_object *d, + struct niobuf_local *lnb, + int n, struct thandle *th) +{ + LASSERTF(d != NULL, "dt is NULL when we want to declare write\n"); + LASSERT(th != NULL); + return d->do_body_ops->dbo_declare_write_commit(env, d, lnb, n, th); +} + + +static inline int dt_write_commit(const struct lu_env *env, + struct dt_object *d, struct niobuf_local *lnb, + int n, struct thandle *th) +{ + LASSERT(d); + LASSERT(d->do_body_ops); + LASSERT(d->do_body_ops->dbo_write_commit); + return d->do_body_ops->dbo_write_commit(env, d, lnb, n, th); +} + +static inline int dt_read_prep(const struct lu_env *env, struct dt_object *d, + struct niobuf_local *lnb, int n) +{ + LASSERT(d); + LASSERT(d->do_body_ops); + LASSERT(d->do_body_ops->dbo_read_prep); + return d->do_body_ops->dbo_read_prep(env, d, lnb, n); +} + +static inline int dt_declare_punch(const struct lu_env *env, + struct dt_object *dt, __u64 start, + __u64 end, struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_body_ops); + LASSERT(dt->do_body_ops->dbo_declare_punch); + return dt->do_body_ops->dbo_declare_punch(env, dt, start, end, th); +} + +static inline int dt_punch(const struct lu_env *env, struct dt_object *dt, + __u64 start, __u64 end, struct thandle *th, + struct lustre_capa *capa) +{ + LASSERT(dt); + LASSERT(dt->do_body_ops); + LASSERT(dt->do_body_ops->dbo_punch); + return dt->do_body_ops->dbo_punch(env, dt, start, end, th, capa); +} + +static inline int dt_fiemap_get(const struct lu_env *env, struct dt_object *d, + struct ll_user_fiemap *fm) +{ + LASSERT(d); + if (d->do_body_ops == NULL) + return -EPROTO; + if (d->do_body_ops->dbo_fiemap_get == NULL) + return -EOPNOTSUPP; + return d->do_body_ops->dbo_fiemap_get(env, d, fm); +} + +static inline int dt_statfs(const struct lu_env *env, struct dt_device *dev, + struct obd_statfs *osfs) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + 
LASSERT(dev->dd_ops->dt_statfs); + return dev->dd_ops->dt_statfs(env, dev, osfs); +} + +static inline int dt_root_get(const struct lu_env *env, struct dt_device *dev, + struct lu_fid *f) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + LASSERT(dev->dd_ops->dt_root_get); + return dev->dd_ops->dt_root_get(env, dev, f); +} + +static inline void dt_conf_get(const struct lu_env *env, + const struct dt_device *dev, + struct dt_device_param *param) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + LASSERT(dev->dd_ops->dt_conf_get); + return dev->dd_ops->dt_conf_get(env, dev, param); +} + +static inline int dt_sync(const struct lu_env *env, struct dt_device *dev) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + LASSERT(dev->dd_ops->dt_sync); + return dev->dd_ops->dt_sync(env, dev); +} + +static inline int dt_ro(const struct lu_env *env, struct dt_device *dev) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + LASSERT(dev->dd_ops->dt_ro); + return dev->dd_ops->dt_ro(env, dev); +} + +static inline int dt_declare_insert(const struct lu_env *env, + struct dt_object *dt, + const struct dt_rec *rec, + const struct dt_key *key, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_index_ops); + LASSERT(dt->do_index_ops->dio_declare_insert); + return dt->do_index_ops->dio_declare_insert(env, dt, rec, key, th); +} + +static inline int dt_insert(const struct lu_env *env, + struct dt_object *dt, + const struct dt_rec *rec, + const struct dt_key *key, + struct thandle *th, + struct lustre_capa *capa, + int noquota) +{ + LASSERT(dt); + LASSERT(dt->do_index_ops); + LASSERT(dt->do_index_ops->dio_insert); + return dt->do_index_ops->dio_insert(env, dt, rec, key, th, + capa, noquota); +} + +static inline int dt_declare_xattr_del(const struct lu_env *env, + struct dt_object *dt, + const char *name, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_xattr_del); + return dt->do_ops->do_declare_xattr_del(env, dt, name, th); +} + +static inline int dt_xattr_del(const struct lu_env *env, + struct dt_object *dt, const char *name, + struct thandle *th, + struct lustre_capa *capa) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_xattr_del); + return dt->do_ops->do_xattr_del(env, dt, name, th, capa); +} + +static inline int dt_declare_xattr_set(const struct lu_env *env, + struct dt_object *dt, + const struct lu_buf *buf, + const char *name, int fl, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_xattr_set); + return dt->do_ops->do_declare_xattr_set(env, dt, buf, name, fl, th); +} + +static inline int dt_xattr_set(const struct lu_env *env, + struct dt_object *dt, const struct lu_buf *buf, + const char *name, int fl, struct thandle *th, + struct lustre_capa *capa) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_xattr_set); + return dt->do_ops->do_xattr_set(env, dt, buf, name, fl, th, capa); +} + +static inline int dt_xattr_get(const struct lu_env *env, + struct dt_object *dt, struct lu_buf *buf, + const char *name, struct lustre_capa *capa) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_xattr_get); + return dt->do_ops->do_xattr_get(env, dt, buf, name, capa); +} + +static inline int dt_xattr_list(const struct lu_env *env, + struct dt_object *dt, struct lu_buf *buf, + struct lustre_capa *capa) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_xattr_list); + return dt->do_ops->do_xattr_list(env, dt, buf, capa); +} + +static inline int dt_declare_delete(const struct lu_env *env, + 
struct dt_object *dt, + const struct dt_key *key, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_index_ops); + LASSERT(dt->do_index_ops->dio_declare_delete); + return dt->do_index_ops->dio_declare_delete(env, dt, key, th); +} + +static inline int dt_delete(const struct lu_env *env, + struct dt_object *dt, + const struct dt_key *key, + struct thandle *th, + struct lustre_capa *capa) +{ + LASSERT(dt); + LASSERT(dt->do_index_ops); + LASSERT(dt->do_index_ops->dio_delete); + return dt->do_index_ops->dio_delete(env, dt, key, th, capa); +} + +static inline int dt_commit_async(const struct lu_env *env, + struct dt_device *dev) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + LASSERT(dev->dd_ops->dt_commit_async); + return dev->dd_ops->dt_commit_async(env, dev); +} + +static inline int dt_init_capa_ctxt(const struct lu_env *env, + struct dt_device *dev, + int mode, unsigned long timeout, + __u32 alg, struct lustre_capa_key *keys) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + LASSERT(dev->dd_ops->dt_init_capa_ctxt); + return dev->dd_ops->dt_init_capa_ctxt(env, dev, mode, + timeout, alg, keys); +} + +static inline int dt_lookup(const struct lu_env *env, + struct dt_object *dt, + struct dt_rec *rec, + const struct dt_key *key, + struct lustre_capa *capa) +{ + int ret; + + LASSERT(dt); + LASSERT(dt->do_index_ops); + LASSERT(dt->do_index_ops->dio_lookup); + + ret = dt->do_index_ops->dio_lookup(env, dt, rec, key, capa); + if (ret > 0) + ret = 0; + else if (ret == 0) + ret = -ENOENT; + return ret; +} + +#define LU221_BAD_TIME (0x80000000U + 24 * 3600) + +struct dt_find_hint { + struct lu_fid *dfh_fid; + struct dt_device *dfh_dt; + struct dt_object *dfh_o; +}; + +struct dt_thread_info { + char dti_buf[DT_MAX_PATH]; + struct dt_find_hint dti_dfh; + struct lu_attr dti_attr; + struct lu_fid dti_fid; + struct dt_object_format dti_dof; + struct lustre_mdt_attrs dti_lma; + struct lu_buf dti_lb; + loff_t dti_off; +}; + +extern struct lu_context_key dt_key; + +static inline struct dt_thread_info *dt_info(const struct lu_env *env) +{ + struct dt_thread_info *dti; + + dti = lu_context_key_get(&env->le_ctx, &dt_key); + LASSERT(dti); + return dti; +} + +int dt_global_init(void); +void dt_global_fini(void); + +#if defined (CONFIG_PROC_FS) +int lprocfs_dt_rd_blksize(char *page, char **start, off_t off, + int count, int *eof, void *data); +int lprocfs_dt_rd_kbytestotal(char *page, char **start, off_t off, + int count, int *eof, void *data); +int lprocfs_dt_rd_kbytesfree(char *page, char **start, off_t off, + int count, int *eof, void *data); +int lprocfs_dt_rd_kbytesavail(char *page, char **start, off_t off, + int count, int *eof, void *data); +int lprocfs_dt_rd_filestotal(char *page, char **start, off_t off, + int count, int *eof, void *data); +int lprocfs_dt_rd_filesfree(char *page, char **start, off_t off, + int count, int *eof, void *data); +#endif /* CONFIG_PROC_FS */ + +#endif /* __LUSTRE_DT_OBJECT_H */ diff --git a/kernel/drivers/staging/lustre/lustre/include/interval_tree.h b/kernel/drivers/staging/lustre/lustre/include/interval_tree.h new file mode 100644 index 000000000..bf9027d5f --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/interval_tree.h @@ -0,0 +1,124 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/interval_tree.h + * + * Author: Huang Wei + * Author: Jay Xiong + */ + +#ifndef _INTERVAL_H__ +#define _INTERVAL_H__ + +#include "../../include/linux/libcfs/libcfs.h" /* LASSERT. */ + +struct interval_node { + struct interval_node *in_left; + struct interval_node *in_right; + struct interval_node *in_parent; + unsigned in_color:1, + in_intree:1, /** set if the node is in tree */ + in_res1:30; + __u8 in_res2[4]; /** tags, 8-bytes aligned */ + __u64 in_max_high; + struct interval_node_extent { + __u64 start; + __u64 end; + } in_extent; +}; + +enum interval_iter { + INTERVAL_ITER_CONT = 1, + INTERVAL_ITER_STOP = 2 +}; + +static inline int interval_is_intree(struct interval_node *node) +{ + return node->in_intree == 1; +} + +static inline __u64 interval_low(struct interval_node *node) +{ + return node->in_extent.start; +} + +static inline __u64 interval_high(struct interval_node *node) +{ + return node->in_extent.end; +} + +static inline void interval_set(struct interval_node *node, + __u64 start, __u64 end) +{ + LASSERT(start <= end); + node->in_extent.start = start; + node->in_extent.end = end; + node->in_max_high = end; +} + +/* Rules to write an interval callback. + * - the callback returns INTERVAL_ITER_STOP when it thinks the iteration + * should be stopped. It will then cause the iteration function to return + * immediately with return value INTERVAL_ITER_STOP. + * - callbacks for interval_iterate and interval_iterate_reverse: Every + * nodes in the tree will be set to @node before the callback being called + * - callback for interval_search: Only overlapped node will be set to @node + * before the callback being called. + */ +typedef enum interval_iter (*interval_callback_t)(struct interval_node *node, + void *args); + +struct interval_node *interval_insert(struct interval_node *node, + struct interval_node **root); +void interval_erase(struct interval_node *node, struct interval_node **root); + +/* Search the extents in the tree and call @func for each overlapped + * extents. */ +enum interval_iter interval_search(struct interval_node *root, + struct interval_node_extent *ex, + interval_callback_t func, void *data); + +/* Iterate every node in the tree - by reverse order or regular order. 
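+ *
+ * (Editor's sketch, illustrative only; the callback and variable names below
+ * are hypothetical. Each visited node is passed to the callback, which
+ * returns INTERVAL_ITER_CONT to continue or INTERVAL_ITER_STOP to stop.)
+ *
+ *	static enum interval_iter count_cb(struct interval_node *node,
+ *					   void *args)
+ *	{
+ *		(*(int *)args)++;
+ *		return INTERVAL_ITER_CONT;
+ *	}
+ *
+ *	int count = 0;
+ *	interval_iterate(root, count_cb, &count);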
*/ +enum interval_iter interval_iterate(struct interval_node *root, + interval_callback_t func, void *data); +enum interval_iter interval_iterate_reverse(struct interval_node *root, + interval_callback_t func, void *data); + +void interval_expand(struct interval_node *root, + struct interval_node_extent *ext, + struct interval_node_extent *limiter); +int interval_is_overlapped(struct interval_node *root, + struct interval_node_extent *ex); +struct interval_node *interval_find(struct interval_node *root, + struct interval_node_extent *ex); +#endif diff --git a/kernel/drivers/staging/lustre/lustre/include/lclient.h b/kernel/drivers/staging/lustre/lustre/include/lclient.h new file mode 100644 index 000000000..c5c3a8d9e --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lclient.h @@ -0,0 +1,433 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Definitions shared between vvp and liblustre, and other clients in the + * future. + * + * Author: Oleg Drokin + * Author: Nikita Danilov + */ + +#ifndef LCLIENT_H +#define LCLIENT_H + +blkcnt_t dirty_cnt(struct inode *inode); + +int cl_glimpse_size0(struct inode *inode, int agl); +int cl_glimpse_lock(const struct lu_env *env, struct cl_io *io, + struct inode *inode, struct cl_object *clob, int agl); + +static inline int cl_glimpse_size(struct inode *inode) +{ + return cl_glimpse_size0(inode, 0); +} + +static inline int cl_agl(struct inode *inode) +{ + return cl_glimpse_size0(inode, 1); +} + +/** + * Locking policy for setattr. + */ +enum ccc_setattr_lock_type { + /** Locking is done by server */ + SETATTR_NOLOCK, + /** Extent lock is enqueued */ + SETATTR_EXTENT_LOCK, + /** Existing local extent lock is used */ + SETATTR_MATCH_LOCK +}; + + +/** + * IO state private to vvp or slp layers. + */ +struct ccc_io { + /** super class */ + struct cl_io_slice cui_cl; + struct cl_io_lock_link cui_link; + /** + * I/O vector information to or from which read/write is going. + */ + struct iov_iter *cui_iter; + /** + * Total size for the left IO. + */ + size_t cui_tot_count; + + union { + struct { + enum ccc_setattr_lock_type cui_local_lock; + } setattr; + } u; + /** + * True iff io is processing glimpse right now. 
+ */ + int cui_glimpse; + /** + * Layout version when this IO is initialized + */ + __u32 cui_layout_gen; + /** + * File descriptor against which IO is done. + */ + struct ll_file_data *cui_fd; + struct kiocb *cui_iocb; +}; + +/** + * True, if \a io is a normal io, False for splice_{read,write}. + * must be implemented in arch specific code. + */ +int cl_is_normalio(const struct lu_env *env, const struct cl_io *io); + +extern struct lu_context_key ccc_key; +extern struct lu_context_key ccc_session_key; + +struct ccc_thread_info { + struct cl_lock_descr cti_descr; + struct cl_io cti_io; + struct cl_attr cti_attr; +}; + +static inline struct ccc_thread_info *ccc_env_info(const struct lu_env *env) +{ + struct ccc_thread_info *info; + + info = lu_context_key_get(&env->le_ctx, &ccc_key); + LASSERT(info != NULL); + return info; +} + +static inline struct cl_attr *ccc_env_thread_attr(const struct lu_env *env) +{ + struct cl_attr *attr = &ccc_env_info(env)->cti_attr; + + memset(attr, 0, sizeof(*attr)); + return attr; +} + +static inline struct cl_io *ccc_env_thread_io(const struct lu_env *env) +{ + struct cl_io *io = &ccc_env_info(env)->cti_io; + + memset(io, 0, sizeof(*io)); + return io; +} + +struct ccc_session { + struct ccc_io cs_ios; +}; + +static inline struct ccc_session *ccc_env_session(const struct lu_env *env) +{ + struct ccc_session *ses; + + ses = lu_context_key_get(env->le_ses, &ccc_session_key); + LASSERT(ses != NULL); + return ses; +} + +static inline struct ccc_io *ccc_env_io(const struct lu_env *env) +{ + return &ccc_env_session(env)->cs_ios; +} + +/** + * ccc-private object state. + */ +struct ccc_object { + struct cl_object_header cob_header; + struct cl_object cob_cl; + struct inode *cob_inode; + + /** + * A list of dirty pages pending IO in the cache. Used by + * SOM. Protected by ll_inode_info::lli_lock. + * + * \see ccc_page::cpg_pending_linkage + */ + struct list_head cob_pending_list; + + /** + * Access this counter is protected by inode->i_sem. Now that + * the lifetime of transient pages must be covered by inode sem, + * we don't need to hold any lock.. + */ + int cob_transient_pages; + /** + * Number of outstanding mmaps on this file. + * + * \see ll_vm_open(), ll_vm_close(). + */ + atomic_t cob_mmap_cnt; + + /** + * various flags + * cob_discard_page_warned + * if pages belonging to this object are discarded when a client + * is evicted, some debug info will be printed, this flag will be set + * during processing the first discarded page, then avoid flooding + * debug message for lots of discarded pages. + * + * \see ll_dirty_page_discard_warn. + */ + unsigned int cob_discard_page_warned:1; +}; + +/** + * ccc-private page state. + */ +struct ccc_page { + struct cl_page_slice cpg_cl; + int cpg_defer_uptodate; + int cpg_ra_used; + int cpg_write_queued; + /** + * Non-empty iff this page is already counted in + * ccc_object::cob_pending_list. Protected by + * ccc_object::cob_pending_guard. This list is only used as a flag, + * that is, never iterated through, only checked for list_empty(), but + * having a list is useful for debugging. 
+ */ + struct list_head cpg_pending_linkage; + /** VM page */ + struct page *cpg_page; +}; + +static inline struct ccc_page *cl2ccc_page(const struct cl_page_slice *slice) +{ + return container_of(slice, struct ccc_page, cpg_cl); +} + +struct cl_page *ccc_vmpage_page_transient(struct page *vmpage); + +struct ccc_device { + struct cl_device cdv_cl; + struct super_block *cdv_sb; + struct cl_device *cdv_next; +}; + +struct ccc_lock { + struct cl_lock_slice clk_cl; +}; + +struct ccc_req { + struct cl_req_slice crq_cl; +}; + +void *ccc_key_init (const struct lu_context *ctx, + struct lu_context_key *key); +void ccc_key_fini (const struct lu_context *ctx, + struct lu_context_key *key, void *data); +void *ccc_session_key_init(const struct lu_context *ctx, + struct lu_context_key *key); +void ccc_session_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data); + +int ccc_device_init (const struct lu_env *env, + struct lu_device *d, + const char *name, struct lu_device *next); +struct lu_device *ccc_device_fini (const struct lu_env *env, + struct lu_device *d); +struct lu_device *ccc_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg, + const struct lu_device_operations *luops, + const struct cl_device_operations *clops); +struct lu_device *ccc_device_free (const struct lu_env *env, + struct lu_device *d); +struct lu_object *ccc_object_alloc(const struct lu_env *env, + const struct lu_object_header *hdr, + struct lu_device *dev, + const struct cl_object_operations *clops, + const struct lu_object_operations *luops); + +int ccc_req_init(const struct lu_env *env, struct cl_device *dev, + struct cl_req *req); +void ccc_umount(const struct lu_env *env, struct cl_device *dev); +int ccc_global_init(struct lu_device_type *device_type); +void ccc_global_fini(struct lu_device_type *device_type); +int ccc_object_init0(const struct lu_env *env, struct ccc_object *vob, + const struct cl_object_conf *conf); +int ccc_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf); +void ccc_object_free(const struct lu_env *env, struct lu_object *obj); +int ccc_lock_init(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io, + const struct cl_lock_operations *lkops); +int ccc_attr_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid); +int ccc_object_glimpse(const struct lu_env *env, + const struct cl_object *obj, struct ost_lvb *lvb); +int ccc_conf_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf); +struct page *ccc_page_vmpage(const struct lu_env *env, + const struct cl_page_slice *slice); +int ccc_page_is_under_lock(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *io); +int ccc_fail(const struct lu_env *env, const struct cl_page_slice *slice); +void ccc_transient_page_verify(const struct cl_page *page); +int ccc_transient_page_own(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io, int nonblock); +void ccc_transient_page_assume(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); +void ccc_transient_page_unassume(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); +void ccc_transient_page_disown(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); +void ccc_transient_page_discard(const struct lu_env *env, + const struct cl_page_slice 
*slice, + struct cl_io *io); +int ccc_transient_page_prep(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); +void ccc_lock_delete(const struct lu_env *env, + const struct cl_lock_slice *slice); +void ccc_lock_fini(const struct lu_env *env, struct cl_lock_slice *slice); +int ccc_lock_enqueue(const struct lu_env *env, + const struct cl_lock_slice *slice, + struct cl_io *io, __u32 enqflags); +int ccc_lock_use(const struct lu_env *env, const struct cl_lock_slice *slice); +int ccc_lock_unuse(const struct lu_env *env, const struct cl_lock_slice *slice); +int ccc_lock_wait(const struct lu_env *env, const struct cl_lock_slice *slice); +int ccc_lock_fits_into(const struct lu_env *env, + const struct cl_lock_slice *slice, + const struct cl_lock_descr *need, + const struct cl_io *io); +void ccc_lock_state(const struct lu_env *env, + const struct cl_lock_slice *slice, + enum cl_lock_state state); + +void ccc_io_fini(const struct lu_env *env, const struct cl_io_slice *ios); +int ccc_io_one_lock_index(const struct lu_env *env, struct cl_io *io, + __u32 enqflags, enum cl_lock_mode mode, + pgoff_t start, pgoff_t end); +int ccc_io_one_lock(const struct lu_env *env, struct cl_io *io, + __u32 enqflags, enum cl_lock_mode mode, + loff_t start, loff_t end); +void ccc_io_end(const struct lu_env *env, const struct cl_io_slice *ios); +void ccc_io_advance(const struct lu_env *env, const struct cl_io_slice *ios, + size_t nob); +void ccc_io_update_iov(const struct lu_env *env, struct ccc_io *cio, + struct cl_io *io); +int ccc_prep_size(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io, loff_t start, size_t count, int *exceed); +void ccc_req_completion(const struct lu_env *env, + const struct cl_req_slice *slice, int ioret); +void ccc_req_attr_set(const struct lu_env *env, + const struct cl_req_slice *slice, + const struct cl_object *obj, + struct cl_req_attr *oa, u64 flags); + +struct lu_device *ccc2lu_dev (struct ccc_device *vdv); +struct lu_object *ccc2lu (struct ccc_object *vob); +struct ccc_device *lu2ccc_dev (const struct lu_device *d); +struct ccc_device *cl2ccc_dev (const struct cl_device *d); +struct ccc_object *lu2ccc (const struct lu_object *obj); +struct ccc_object *cl2ccc (const struct cl_object *obj); +struct ccc_lock *cl2ccc_lock (const struct cl_lock_slice *slice); +struct ccc_io *cl2ccc_io (const struct lu_env *env, + const struct cl_io_slice *slice); +struct ccc_req *cl2ccc_req (const struct cl_req_slice *slice); +struct page *cl2vm_page (const struct cl_page_slice *slice); +struct inode *ccc_object_inode(const struct cl_object *obj); +struct ccc_object *cl_inode2ccc (struct inode *inode); + +int cl_setattr_ost(struct inode *inode, const struct iattr *attr, + struct obd_capa *capa); + +struct cl_page *ccc_vmpage_page_transient(struct page *vmpage); +int ccc_object_invariant(const struct cl_object *obj); +int cl_file_inode_init(struct inode *inode, struct lustre_md *md); +void cl_inode_fini(struct inode *inode); +int cl_local_size(struct inode *inode); + +__u16 ll_dirent_type_get(struct lu_dirent *ent); +__u64 cl_fid_build_ino(const struct lu_fid *fid, int api32); +__u32 cl_fid_build_gen(const struct lu_fid *fid); + +# define CLOBINVRNT(env, clob, expr) \ + ((void)sizeof(env), (void)sizeof(clob), (void)sizeof(!!(expr))) + +int cl_init_ea_size(struct obd_export *md_exp, struct obd_export *dt_exp); +int cl_ocd_update(struct obd_device *host, + struct obd_device *watched, + enum obd_notify_event ev, void *owner, void *data); + +struct ccc_grouplock { + 
struct lu_env *cg_env; + struct cl_io *cg_io; + struct cl_lock *cg_lock; + unsigned long cg_gid; +}; + +int cl_get_grouplock(struct cl_object *obj, unsigned long gid, int nonblock, + struct ccc_grouplock *cg); +void cl_put_grouplock(struct ccc_grouplock *cg); + +/** + * New interfaces to get and put lov_stripe_md from lov layer. This violates + * layering because lov_stripe_md is supposed to be a private data in lov. + * + * NB: If you find you have to use these interfaces for your new code, please + * think about it again. These interfaces may be removed in the future for + * better layering. */ +struct lov_stripe_md *lov_lsm_get(struct cl_object *clobj); +void lov_lsm_put(struct cl_object *clobj, struct lov_stripe_md *lsm); +int lov_read_and_clear_async_rc(struct cl_object *clob); + +struct lov_stripe_md *ccc_inode_lsm_get(struct inode *inode); +void ccc_inode_lsm_put(struct inode *inode, struct lov_stripe_md *lsm); + +/** + * Data structure managing a client's cached clean pages. An LRU of + * pages is maintained, along with other statistics. + */ +struct cl_client_cache { + atomic_t ccc_users; /* # of users (OSCs) of this data */ + struct list_head ccc_lru; /* LRU list of cached clean pages */ + spinlock_t ccc_lru_lock; /* lock for list */ + atomic_t ccc_lru_left; /* # of LRU entries available */ + unsigned long ccc_lru_max; /* Max # of LRU entries possible */ + unsigned int ccc_lru_shrinkers; /* # of threads reclaiming */ +}; + +#endif /*LCLIENT_H */ diff --git a/kernel/drivers/staging/lustre/lustre/include/linux/lustre_compat25.h b/kernel/drivers/staging/lustre/lustre/include/linux/lustre_compat25.h new file mode 100644 index 000000000..3925db160 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/linux/lustre_compat25.h @@ -0,0 +1,216 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
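The cl_get_grouplock()/cl_put_grouplock() pair declared above bundles everything a group lock needs (the lu_env, the cl_io, the cl_lock and the group id recorded in struct ccc_grouplock) behind two calls. A minimal usage sketch, assuming only the two prototypes above, is shown here; the helper name with_group_lock() and the choice of a blocking request are illustrative, not part of this header.

/* Sketch: take a Lustre client group lock around some I/O (illustrative). */
static int with_group_lock(struct cl_object *obj, unsigned long gid)
{
	struct ccc_grouplock cg;
	int rc;

	/* third argument is "nonblock": 0 waits, non-zero asks for a
	 * non-blocking attempt */
	rc = cl_get_grouplock(obj, gid, 0, &cg);
	if (rc != 0)
		return rc;

	/* ... I/O that must be covered by the group lock goes here ... */

	cl_put_grouplock(&cg);
	return 0;
}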
+ */ + +#ifndef _LINUX_COMPAT25_H +#define _LINUX_COMPAT25_H + +#include +#include + +#include "lustre_patchless_compat.h" + +/* + * set ATTR_BLOCKS to a high value to avoid any risk of collision with other + * ATTR_* attributes (see bug 13828) + */ +#define ATTR_BLOCKS (1 << 27) + +#define current_ngroups current_cred()->group_info->ngroups +#define current_groups current_cred()->group_info->small_block + +/* + * OBD need working random driver, thus all our + * initialization routines must be called after device + * driver initialization + */ +#ifndef MODULE +#undef module_init +#define module_init(a) late_initcall(a) +#endif + + +#define LTIME_S(time) (time.tv_sec) + +/* inode_dio_wait(i) use as-is for write lock */ +# define inode_dio_write_done(i) do {} while (0) /* for write unlock */ +# define inode_dio_read(i) atomic_inc(&(i)->i_dio_count) +/* inode_dio_done(i) use as-is for read unlock */ + + +#ifndef FS_HAS_FIEMAP +#define FS_HAS_FIEMAP (0) +#endif + +#define ll_vfs_rmdir(dir, entry, mnt) vfs_rmdir(dir, entry) +#define ll_vfs_mkdir(inode, dir, mnt, mode) vfs_mkdir(inode, dir, mode) +#define ll_vfs_link(old, mnt, dir, new, mnt1) vfs_link(old, dir, new) +#define ll_vfs_unlink(inode, entry, mnt) vfs_unlink(inode, entry) +#define ll_vfs_mknod(dir, entry, mnt, mode, dev) \ + vfs_mknod(dir, entry, mode, dev) +#define ll_security_inode_unlink(dir, entry, mnt) \ + security_inode_unlink(dir, entry) +#define ll_vfs_rename(old, old_dir, mnt, new, new_dir, mnt1) \ + vfs_rename(old, old_dir, new, new_dir, NULL, 0) + +#define cfs_bio_io_error(a, b) bio_io_error((a)) +#define cfs_bio_endio(a, b, c) bio_endio((a), (c)) + +#define cfs_path_put(nd) path_put(&(nd)->path) + + +#ifndef SLAB_DESTROY_BY_RCU +#define SLAB_DESTROY_BY_RCU 0 +#endif + + + +static inline int +ll_quota_on(struct super_block *sb, int off, int ver, char *name, int remount) +{ + int rc; + + if (sb->s_qcop->quota_on) { + struct path path; + + rc = kern_path(name, LOOKUP_FOLLOW, &path); + if (!rc) + return rc; + rc = sb->s_qcop->quota_on(sb, off, ver + , &path + ); + path_put(&path); + return rc; + } else + return -ENOSYS; +} + +static inline int ll_quota_off(struct super_block *sb, int off, int remount) +{ + if (sb->s_qcop->quota_off) { + return sb->s_qcop->quota_off(sb, off + ); + } else + return -ENOSYS; +} + + +# define ll_vfs_dq_init dquot_initialize +# define ll_vfs_dq_drop dquot_drop +# define ll_vfs_dq_transfer dquot_transfer +# define ll_vfs_dq_off(sb, remount) dquot_suspend(sb, -1) + + + + + +#define queue_max_phys_segments(rq) queue_max_segments(rq) +#define queue_max_hw_segments(rq) queue_max_segments(rq) + + +#define ll_d_hlist_node hlist_node +#define ll_d_hlist_empty(list) hlist_empty(list) +#define ll_d_hlist_entry(ptr, type, name) hlist_entry(ptr.first, type, name) +#define ll_d_hlist_for_each(tmp, i_dentry) hlist_for_each(tmp, i_dentry) +#define ll_d_hlist_for_each_entry(dentry, p, i_dentry, alias) \ + p = NULL; hlist_for_each_entry(dentry, i_dentry, alias) + + +#define bio_hw_segments(q, bio) 0 + + +#define ll_pagevec_init(pv, cold) do {} while (0) +#define ll_pagevec_add(pv, pg) (0) +#define ll_pagevec_lru_add_file(pv) do {} while (0) + + +#ifndef QUOTA_OK +# define QUOTA_OK 0 +#endif +#ifndef NO_QUOTA +# define NO_QUOTA (-EDQUOT) +#endif + +#ifndef SEEK_DATA +#define SEEK_DATA 3 /* seek to the next data */ +#endif +#ifndef SEEK_HOLE +#define SEEK_HOLE 4 /* seek to the next hole */ +#endif + +#ifndef FMODE_UNSIGNED_OFFSET +#define FMODE_UNSIGNED_OFFSET ((__force fmode_t)0x2000) +#endif + +#if 
!defined(_ASM_GENERIC_BITOPS_EXT2_NON_ATOMIC_H_) && !defined(ext2_set_bit) +# define ext2_set_bit __test_and_set_bit_le +# define ext2_clear_bit __test_and_clear_bit_le +# define ext2_test_bit test_bit_le +# define ext2_find_first_zero_bit find_first_zero_bit_le +# define ext2_find_next_zero_bit find_next_zero_bit_le +#endif + +#ifdef ATTR_TIMES_SET +# define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET) +#else +# define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET) +#endif + + + +/* + * After 3.1, kernel's nameidata.intent.open.flags is different + * with lustre's lookup_intent.it_flags, as lustre's it_flags' + * lower bits equal to FMODE_xxx while kernel doesn't transliterate + * lower bits of nameidata.intent.open.flags to FMODE_xxx. + * */ +#include +static inline int ll_namei_to_lookup_intent_flag(int flag) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 1, 0) + flag = (flag & ~O_ACCMODE) | OPEN_FMODE(flag); +#endif + return flag; +} + +#include + +# define ll_umode_t umode_t + +#include + +# define ll_dirty_inode(inode, flag) (inode)->i_sb->s_op->dirty_inode((inode), flag) + +#endif /* _COMPAT25_H */ diff --git a/kernel/drivers/staging/lustre/lustre/include/linux/lustre_lite.h b/kernel/drivers/staging/lustre/lustre/include/linux/lustre_lite.h new file mode 100644 index 000000000..a7658a99a --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/linux/lustre_lite.h @@ -0,0 +1,98 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _LINUX_LL_H +#define _LINUX_LL_H + +#ifndef _LL_H +#error Do not #include this file directly. 
#include instead +#endif + + +#include + +#include +#include +#include + +#include "../obd_class.h" +#include "../lustre_net.h" +#include "../lustre_ha.h" + +#include +#include "../../include/linux/lustre_compat25.h" +#include + +/* lprocfs.c */ +enum { + LPROC_LL_DIRTY_HITS = 0, + LPROC_LL_DIRTY_MISSES, + LPROC_LL_READ_BYTES, + LPROC_LL_WRITE_BYTES, + LPROC_LL_BRW_READ, + LPROC_LL_BRW_WRITE, + LPROC_LL_OSC_READ, + LPROC_LL_OSC_WRITE, + LPROC_LL_IOCTL, + LPROC_LL_OPEN, + LPROC_LL_RELEASE, + LPROC_LL_MAP, + LPROC_LL_LLSEEK, + LPROC_LL_FSYNC, + LPROC_LL_READDIR, + LPROC_LL_SETATTR, + LPROC_LL_TRUNC, + LPROC_LL_FLOCK, + LPROC_LL_GETATTR, + LPROC_LL_CREATE, + LPROC_LL_LINK, + LPROC_LL_UNLINK, + LPROC_LL_SYMLINK, + LPROC_LL_MKDIR, + LPROC_LL_RMDIR, + LPROC_LL_MKNOD, + LPROC_LL_RENAME, + LPROC_LL_STAFS, + LPROC_LL_ALLOC_INODE, + LPROC_LL_SETXATTR, + LPROC_LL_GETXATTR, + LPROC_LL_GETXATTR_HITS, + LPROC_LL_LISTXATTR, + LPROC_LL_REMOVEXATTR, + LPROC_LL_INODE_PERM, + LPROC_LL_FILE_OPCODES +}; + + +#endif diff --git a/kernel/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h b/kernel/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h new file mode 100644 index 000000000..d72605864 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h @@ -0,0 +1,85 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#ifndef LUSTRE_PATCHLESS_COMPAT_H +#define LUSTRE_PATCHLESS_COMPAT_H + +#include + +#include +#include +#include + + +#define ll_delete_from_page_cache(page) delete_from_page_cache(page) + +static inline void +truncate_complete_page(struct address_space *mapping, struct page *page) +{ + if (page->mapping != mapping) + return; + + if (PagePrivate(page)) + page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE); + + if (TestClearPageDirty(page)) + account_page_cleaned(page, mapping); + + ClearPageMappedToDisk(page); + ll_delete_from_page_cache(page); +} + +#ifdef ATTR_OPEN +# define ATTR_FROM_OPEN ATTR_OPEN +#else +# ifndef ATTR_FROM_OPEN +# define ATTR_FROM_OPEN 0 +# endif +#endif /* ATTR_OPEN */ + +#ifndef ATTR_RAW +#define ATTR_RAW 0 +#endif + +#ifndef ATTR_CTIME_SET +/* + * set ATTR_CTIME_SET to a high value to avoid any risk of collision with other + * ATTR_* attributes (see bug 13828) + */ +#define ATTR_CTIME_SET (1 << 28) +#endif + +#endif /* LUSTRE_PATCHLESS_COMPAT_H */ diff --git a/kernel/drivers/staging/lustre/lustre/include/linux/lustre_user.h b/kernel/drivers/staging/lustre/lustre/include/linux/lustre_user.h new file mode 100644 index 000000000..9cc2849f3 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/linux/lustre_user.h @@ -0,0 +1,70 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/linux/lustre_user.h + * + * Lustre public user-space interface definitions. + */ + +#ifndef _LINUX_LUSTRE_USER_H +#define _LINUX_LUSTRE_USER_H + +# include + +/* + * asm-x86_64/processor.h on some SLES 9 distros seems to use + * kernel-only typedefs. fortunately skipping it altogether is ok + * (for now). + */ +#define __ASM_X86_64_PROCESSOR_H + +#include + +/* + * We need to always use 64bit version because the structure + * is shared across entire cluster where 32bit and 64bit machines + * are co-existing. 
+ */ +#if __BITS_PER_LONG != 64 || defined(__ARCH_WANT_STAT64) +typedef struct stat64 lstat_t; +#define lstat_f lstat64 +#else +typedef struct stat lstat_t; +#define lstat_f lstat +#endif + +#define HAVE_LOV_USER_MDS_DATA + +#endif /* _LUSTRE_USER_H */ diff --git a/kernel/drivers/staging/lustre/lustre/include/linux/obd.h b/kernel/drivers/staging/lustre/lustre/include/linux/obd.h new file mode 100644 index 000000000..9cd868357 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/linux/obd.h @@ -0,0 +1,125 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LINUX_OBD_H +#define __LINUX_OBD_H + +#ifndef __OBD_H +#error Do not #include this file directly. 
#include instead +#endif + +#include "../obd_support.h" + +# include +# include +# include /* for struct task_struct, for current.h */ +# include +# include +#include "../lustre_intent.h" + +struct ll_iattr { + struct iattr iattr; + unsigned int ia_attr_flags; +}; + +#define CLIENT_OBD_LIST_LOCK_DEBUG 1 + +typedef struct { + spinlock_t lock; + + unsigned long time; + struct task_struct *task; + const char *func; + int line; +} client_obd_lock_t; + +static inline void __client_obd_list_lock(client_obd_lock_t *lock, + const char *func, int line) +{ + unsigned long cur = jiffies; + while (1) { + if (spin_trylock(&lock->lock)) { + LASSERT(lock->task == NULL); + lock->task = current; + lock->func = func; + lock->line = line; + lock->time = jiffies; + break; + } + + if (time_before(cur + 5 * HZ, jiffies) && + time_before(lock->time + 5 * HZ, jiffies)) { + struct task_struct *task = lock->task; + + if (task == NULL) + continue; + + LCONSOLE_WARN("%s:%d: lock %p was acquired by <%s:%d:%s:%d> for %lu seconds.\n", + current->comm, current->pid, + lock, task->comm, task->pid, + lock->func, lock->line, + (jiffies - lock->time) / HZ); + LCONSOLE_WARN("====== for current process =====\n"); + dump_stack(); + LCONSOLE_WARN("====== end =======\n"); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(1000 * HZ); + } + cpu_relax(); + } +} + +#define client_obd_list_lock(lock) \ + __client_obd_list_lock(lock, __func__, __LINE__) + +static inline void client_obd_list_unlock(client_obd_lock_t *lock) +{ + LASSERT(lock->task != NULL); + lock->task = NULL; + lock->time = jiffies; + spin_unlock(&lock->lock); +} + + +static inline void client_obd_list_lock_init(client_obd_lock_t *lock) +{ + spin_lock_init(&lock->lock); +} + +static inline void client_obd_list_lock_done(client_obd_lock_t *lock) +{} + +#endif /* __LINUX_OBD_H */ diff --git a/kernel/drivers/staging/lustre/lustre/include/lprocfs_status.h b/kernel/drivers/staging/lustre/lustre/include/lprocfs_status.h new file mode 100644 index 000000000..d030847e5 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lprocfs_status.h @@ -0,0 +1,1015 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
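The __client_obd_list_lock() helper above is a debugging spinlock: it spins on spin_trylock(), records the owner task, function, line and acquisition time on success, and once both the caller and the recorded owner appear stuck for roughly five seconds it reports the owner and dumps the current stack before sleeping. The standalone sketch below shows the same trylock-plus-watchdog pattern using only generic kernel primitives; the names watchdog_lock, watchdog_lock_acquire() and watchdog_lock_release() are hypothetical, and the real helper additionally logs through LCONSOLE_WARN() and records __func__/__LINE__.

/* Sketch: spinlock acquisition with a stall watchdog (illustrative only). */
#include <linux/spinlock.h>
#include <linux/jiffies.h>
#include <linux/sched.h>
#include <linux/printk.h>

struct watchdog_lock {
	spinlock_t		wl_lock;
	unsigned long		wl_time;	/* jiffies when acquired */
	struct task_struct	*wl_task;	/* current owner, NULL if free */
};

static inline void watchdog_lock_acquire(struct watchdog_lock *wl)
{
	unsigned long start = jiffies;

	for (;;) {
		if (spin_trylock(&wl->wl_lock)) {
			wl->wl_task = current;	/* record owner for diagnostics */
			wl->wl_time = jiffies;
			return;
		}
		/* Warn when both we and the owner look stuck for ~5 seconds. */
		if (time_after(jiffies, start + 5 * HZ) &&
		    time_after(jiffies, wl->wl_time + 5 * HZ) && wl->wl_task) {
			pr_warn("lock %p held by %s for %lus\n", wl,
				wl->wl_task->comm,
				(jiffies - wl->wl_time) / HZ);
			start = jiffies;	/* avoid repeating the warning */
		}
		cpu_relax();
	}
}

static inline void watchdog_lock_release(struct watchdog_lock *wl)
{
	wl->wl_task = NULL;
	wl->wl_time = jiffies;
	spin_unlock(&wl->wl_lock);
}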
+ * + * lustre/include/lprocfs_status.h + * + * Top level header file for LProc SNMP + * + * Author: Hariharan Thantry thantry@users.sourceforge.net + */ +#ifndef _LPROCFS_SNMP_H +#define _LPROCFS_SNMP_H + +#include +#include +#include +#include + +#include "lustre/lustre_idl.h" + +struct lprocfs_vars { + const char *name; + struct file_operations *fops; + void *data; + /** + * /proc file mode. + */ + umode_t proc_mode; +}; + +struct lprocfs_static_vars { + struct lprocfs_vars *module_vars; + struct lprocfs_vars *obd_vars; +}; + +/* if we find more consumers this could be generalized */ +#define OBD_HIST_MAX 32 +struct obd_histogram { + spinlock_t oh_lock; + unsigned long oh_buckets[OBD_HIST_MAX]; +}; + +enum { + BRW_R_PAGES = 0, + BRW_W_PAGES, + BRW_R_RPC_HIST, + BRW_W_RPC_HIST, + BRW_R_IO_TIME, + BRW_W_IO_TIME, + BRW_R_DISCONT_PAGES, + BRW_W_DISCONT_PAGES, + BRW_R_DISCONT_BLOCKS, + BRW_W_DISCONT_BLOCKS, + BRW_R_DISK_IOSIZE, + BRW_W_DISK_IOSIZE, + BRW_R_DIO_FRAGS, + BRW_W_DIO_FRAGS, + BRW_LAST, +}; + +struct brw_stats { + struct obd_histogram hist[BRW_LAST]; +}; + +enum { + RENAME_SAMEDIR_SIZE = 0, + RENAME_CROSSDIR_SRC_SIZE, + RENAME_CROSSDIR_TGT_SIZE, + RENAME_LAST, +}; + +struct rename_stats { + struct obd_histogram hist[RENAME_LAST]; +}; + +/* An lprocfs counter can be configured using the enum bit masks below. + * + * LPROCFS_CNTR_EXTERNALLOCK indicates that an external lock already + * protects this counter from concurrent updates. If not specified, + * lprocfs an internal per-counter lock variable. External locks are + * not used to protect counter increments, but are used to protect + * counter readout and resets. + * + * LPROCFS_CNTR_AVGMINMAX indicates a multi-valued counter samples, + * (i.e. counter can be incremented by more than "1"). When specified, + * the counter maintains min, max and sum in addition to a simple + * invocation count. This allows averages to be be computed. + * If not specified, the counter is an increment-by-1 counter. + * min, max, sum, etc. are not maintained. + * + * LPROCFS_CNTR_STDDEV indicates that the counter should track sum of + * squares (for multi-valued counter samples only). This allows + * external computation of standard deviation, but involves a 64-bit + * multiply per counter increment. + */ + +enum { + LPROCFS_CNTR_EXTERNALLOCK = 0x0001, + LPROCFS_CNTR_AVGMINMAX = 0x0002, + LPROCFS_CNTR_STDDEV = 0x0004, + + /* counter data type */ + LPROCFS_TYPE_REGS = 0x0100, + LPROCFS_TYPE_BYTES = 0x0200, + LPROCFS_TYPE_PAGES = 0x0400, + LPROCFS_TYPE_CYCLE = 0x0800, +}; + +#define LC_MIN_INIT ((~(__u64)0) >> 1) + +struct lprocfs_counter_header { + unsigned int lc_config; + const char *lc_name; /* must be static */ + const char *lc_units; /* must be static */ +}; + +struct lprocfs_counter { + __s64 lc_count; + __s64 lc_min; + __s64 lc_max; + __s64 lc_sumsquare; + /* + * Every counter has lc_array_sum[0], while lc_array_sum[1] is only + * for irq context counter, i.e. 
stats with + * LPROCFS_STATS_FLAG_IRQ_SAFE flag, its counter need + * lc_array_sum[1] + */ + __s64 lc_array_sum[1]; +}; +#define lc_sum lc_array_sum[0] +#define lc_sum_irq lc_array_sum[1] + +struct lprocfs_percpu { +#ifndef __GNUC__ + __s64 pad; +#endif + struct lprocfs_counter lp_cntr[0]; +}; + +#define LPROCFS_GET_NUM_CPU 0x0001 +#define LPROCFS_GET_SMP_ID 0x0002 + +enum lprocfs_stats_flags { + LPROCFS_STATS_FLAG_NONE = 0x0000, /* per cpu counter */ + LPROCFS_STATS_FLAG_NOPERCPU = 0x0001, /* stats have no percpu + * area and need locking */ + LPROCFS_STATS_FLAG_IRQ_SAFE = 0x0002, /* alloc need irq safe */ +}; + +enum lprocfs_fields_flags { + LPROCFS_FIELDS_FLAGS_CONFIG = 0x0001, + LPROCFS_FIELDS_FLAGS_SUM = 0x0002, + LPROCFS_FIELDS_FLAGS_MIN = 0x0003, + LPROCFS_FIELDS_FLAGS_MAX = 0x0004, + LPROCFS_FIELDS_FLAGS_AVG = 0x0005, + LPROCFS_FIELDS_FLAGS_SUMSQUARE = 0x0006, + LPROCFS_FIELDS_FLAGS_COUNT = 0x0007, +}; + +struct lprocfs_stats { + /* # of counters */ + unsigned short ls_num; + /* 1 + the biggest cpu # whose ls_percpu slot has been allocated */ + unsigned short ls_biggest_alloc_num; + enum lprocfs_stats_flags ls_flags; + /* Lock used when there are no percpu stats areas; For percpu stats, + * it is used to protect ls_biggest_alloc_num change */ + spinlock_t ls_lock; + + /* has ls_num of counter headers */ + struct lprocfs_counter_header *ls_cnt_header; + struct lprocfs_percpu *ls_percpu[0]; +}; + +#define OPC_RANGE(seg) (seg ## _LAST_OPC - seg ## _FIRST_OPC) + +/* Pack all opcodes down into a single monotonically increasing index */ +static inline int opcode_offset(__u32 opc) { + if (opc < OST_LAST_OPC) { + /* OST opcode */ + return (opc - OST_FIRST_OPC); + } else if (opc < MDS_LAST_OPC) { + /* MDS opcode */ + return (opc - MDS_FIRST_OPC + + OPC_RANGE(OST)); + } else if (opc < LDLM_LAST_OPC) { + /* LDLM Opcode */ + return (opc - LDLM_FIRST_OPC + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < MGS_LAST_OPC) { + /* MGS Opcode */ + return (opc - MGS_FIRST_OPC + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < OBD_LAST_OPC) { + /* OBD Ping */ + return (opc - OBD_FIRST_OPC + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < LLOG_LAST_OPC) { + /* LLOG Opcode */ + return (opc - LLOG_FIRST_OPC + + OPC_RANGE(OBD) + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < QUOTA_LAST_OPC) { + /* LQUOTA Opcode */ + return (opc - QUOTA_FIRST_OPC + + OPC_RANGE(LLOG) + + OPC_RANGE(OBD) + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < SEQ_LAST_OPC) { + /* SEQ opcode */ + return (opc - SEQ_FIRST_OPC + + OPC_RANGE(QUOTA) + + OPC_RANGE(LLOG) + + OPC_RANGE(OBD) + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < SEC_LAST_OPC) { + /* SEC opcode */ + return (opc - SEC_FIRST_OPC + + OPC_RANGE(SEQ) + + OPC_RANGE(QUOTA) + + OPC_RANGE(LLOG) + + OPC_RANGE(OBD) + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < FLD_LAST_OPC) { + /* FLD opcode */ + return (opc - FLD_FIRST_OPC + + OPC_RANGE(SEC) + + OPC_RANGE(SEQ) + + OPC_RANGE(QUOTA) + + OPC_RANGE(LLOG) + + OPC_RANGE(OBD) + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < UPDATE_LAST_OPC) { + /* update opcode */ + return (opc - UPDATE_FIRST_OPC + + OPC_RANGE(FLD) + + OPC_RANGE(SEC) + + OPC_RANGE(SEQ) + + OPC_RANGE(QUOTA) + + OPC_RANGE(LLOG) + + 
OPC_RANGE(OBD) + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else { + /* Unknown Opcode */ + return -1; + } +} + + +#define LUSTRE_MAX_OPCODES (OPC_RANGE(OST) + \ + OPC_RANGE(MDS) + \ + OPC_RANGE(LDLM) + \ + OPC_RANGE(MGS) + \ + OPC_RANGE(OBD) + \ + OPC_RANGE(LLOG) + \ + OPC_RANGE(SEC) + \ + OPC_RANGE(SEQ) + \ + OPC_RANGE(SEC) + \ + OPC_RANGE(FLD) + \ + OPC_RANGE(UPDATE)) + +#define EXTRA_MAX_OPCODES ((PTLRPC_LAST_CNTR - PTLRPC_FIRST_CNTR) + \ + OPC_RANGE(EXTRA)) + +enum { + PTLRPC_REQWAIT_CNTR = 0, + PTLRPC_REQQDEPTH_CNTR, + PTLRPC_REQACTIVE_CNTR, + PTLRPC_TIMEOUT, + PTLRPC_REQBUF_AVAIL_CNTR, + PTLRPC_LAST_CNTR +}; + +#define PTLRPC_FIRST_CNTR PTLRPC_REQWAIT_CNTR + +enum { + LDLM_GLIMPSE_ENQUEUE = 0, + LDLM_PLAIN_ENQUEUE, + LDLM_EXTENT_ENQUEUE, + LDLM_FLOCK_ENQUEUE, + LDLM_IBITS_ENQUEUE, + MDS_REINT_SETATTR, + MDS_REINT_CREATE, + MDS_REINT_LINK, + MDS_REINT_UNLINK, + MDS_REINT_RENAME, + MDS_REINT_OPEN, + MDS_REINT_SETXATTR, + BRW_READ_BYTES, + BRW_WRITE_BYTES, + EXTRA_LAST_OPC +}; + +#define EXTRA_FIRST_OPC LDLM_GLIMPSE_ENQUEUE +/* class_obd.c */ +extern struct proc_dir_entry *proc_lustre_root; + +struct obd_device; +struct obd_histogram; + +/* Days / hours / mins / seconds format */ +struct dhms { + int d, h, m, s; +}; +static inline void s2dhms(struct dhms *ts, time_t secs) +{ + ts->d = secs / 86400; + secs = secs % 86400; + ts->h = secs / 3600; + secs = secs % 3600; + ts->m = secs / 60; + ts->s = secs % 60; +} +#define DHMS_FMT "%dd%dh%02dm%02ds" +#define DHMS_VARS(x) (x)->d, (x)->h, (x)->m, (x)->s + +#define JOBSTATS_JOBID_VAR_MAX_LEN 20 +#define JOBSTATS_DISABLE "disable" +#define JOBSTATS_PROCNAME_UID "procname_uid" +#define JOBSTATS_NODELOCAL "nodelocal" + +extern int lprocfs_write_frac_helper(const char __user *buffer, + unsigned long count, int *val, int mult); +extern int lprocfs_read_frac_helper(char *buffer, unsigned long count, + long val, int mult); +#if defined (CONFIG_PROC_FS) + +extern int lprocfs_stats_alloc_one(struct lprocfs_stats *stats, + unsigned int cpuid); +/* + * \return value + * < 0 : on error (only possible for opc as LPROCFS_GET_SMP_ID) + */ +static inline int lprocfs_stats_lock(struct lprocfs_stats *stats, int opc, + unsigned long *flags) +{ + int rc = 0; + + switch (opc) { + default: + LBUG(); + + case LPROCFS_GET_SMP_ID: + if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) { + if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) + spin_lock_irqsave(&stats->ls_lock, *flags); + else + spin_lock(&stats->ls_lock); + return 0; + } else { + unsigned int cpuid = get_cpu(); + + if (unlikely(stats->ls_percpu[cpuid] == NULL)) { + rc = lprocfs_stats_alloc_one(stats, cpuid); + if (rc < 0) { + put_cpu(); + return rc; + } + } + return cpuid; + } + + case LPROCFS_GET_NUM_CPU: + if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) { + if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) + spin_lock_irqsave(&stats->ls_lock, *flags); + else + spin_lock(&stats->ls_lock); + return 1; + } else { + return stats->ls_biggest_alloc_num; + } + } +} + +static inline void lprocfs_stats_unlock(struct lprocfs_stats *stats, int opc, + unsigned long *flags) +{ + switch (opc) { + default: + LBUG(); + + case LPROCFS_GET_SMP_ID: + if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) { + if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) { + spin_unlock_irqrestore(&stats->ls_lock, + *flags); + } else { + spin_unlock(&stats->ls_lock); + } + } else { + put_cpu(); + } + return; + + case LPROCFS_GET_NUM_CPU: + if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) { + if 
(stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) { + spin_unlock_irqrestore(&stats->ls_lock, + *flags); + } else { + spin_unlock(&stats->ls_lock); + } + } + return; + } +} + +static inline unsigned int +lprocfs_stats_counter_size(struct lprocfs_stats *stats) +{ + unsigned int percpusize; + + percpusize = offsetof(struct lprocfs_percpu, lp_cntr[stats->ls_num]); + + /* irq safe stats need lc_array_sum[1] */ + if ((stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) + percpusize += stats->ls_num * sizeof(__s64); + + if ((stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) == 0) + percpusize = L1_CACHE_ALIGN(percpusize); + + return percpusize; +} + +static inline struct lprocfs_counter * +lprocfs_stats_counter_get(struct lprocfs_stats *stats, unsigned int cpuid, + int index) +{ + struct lprocfs_counter *cntr; + + cntr = &stats->ls_percpu[cpuid]->lp_cntr[index]; + + if ((stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) + cntr = (void *)cntr + index * sizeof(__s64); + + return cntr; +} + +/* Two optimized LPROCFS counter increment functions are provided: + * lprocfs_counter_incr(cntr, value) - optimized for by-one counters + * lprocfs_counter_add(cntr) - use for multi-valued counters + * Counter data layout allows config flag, counter lock and the + * count itself to reside within a single cache line. + */ + +extern void lprocfs_counter_add(struct lprocfs_stats *stats, int idx, + long amount); +extern void lprocfs_counter_sub(struct lprocfs_stats *stats, int idx, + long amount); + +#define lprocfs_counter_incr(stats, idx) \ + lprocfs_counter_add(stats, idx, 1) +#define lprocfs_counter_decr(stats, idx) \ + lprocfs_counter_sub(stats, idx, 1) + +extern __s64 lprocfs_read_helper(struct lprocfs_counter *lc, + struct lprocfs_counter_header *header, + enum lprocfs_stats_flags flags, + enum lprocfs_fields_flags field); +static inline __u64 lprocfs_stats_collector(struct lprocfs_stats *stats, + int idx, + enum lprocfs_fields_flags field) +{ + int i; + unsigned int num_cpu; + unsigned long flags = 0; + __u64 ret = 0; + + LASSERT(stats != NULL); + + num_cpu = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags); + for (i = 0; i < num_cpu; i++) { + if (stats->ls_percpu[i] == NULL) + continue; + ret += lprocfs_read_helper( + lprocfs_stats_counter_get(stats, i, idx), + &stats->ls_cnt_header[idx], stats->ls_flags, + field); + } + lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags); + return ret; +} + +extern struct lprocfs_stats * +lprocfs_alloc_stats(unsigned int num, enum lprocfs_stats_flags flags); +extern void lprocfs_clear_stats(struct lprocfs_stats *stats); +extern void lprocfs_free_stats(struct lprocfs_stats **stats); +extern void lprocfs_init_ops_stats(int num_private_stats, + struct lprocfs_stats *stats); +extern void lprocfs_init_mps_stats(int num_private_stats, + struct lprocfs_stats *stats); +extern void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats); +extern int lprocfs_alloc_obd_stats(struct obd_device *obddev, + unsigned int num_private_stats); +extern int lprocfs_alloc_md_stats(struct obd_device *obddev, + unsigned int num_private_stats); +extern void lprocfs_counter_init(struct lprocfs_stats *stats, int index, + unsigned conf, const char *name, + const char *units); +extern void lprocfs_free_obd_stats(struct obd_device *obddev); +extern void lprocfs_free_md_stats(struct obd_device *obddev); +struct obd_export; +struct nid_stat; +extern int lprocfs_add_clear_entry(struct obd_device *obd, + struct proc_dir_entry *entry); +extern int lprocfs_exp_setup(struct obd_export *exp, + 
lnet_nid_t *peer_nid, int *newnid); +extern int lprocfs_exp_cleanup(struct obd_export *exp); +extern struct proc_dir_entry *lprocfs_add_simple(struct proc_dir_entry *root, + char *name, + void *data, + struct file_operations *fops); +extern struct proc_dir_entry * +lprocfs_add_symlink(const char *name, struct proc_dir_entry *parent, + const char *format, ...); +extern void lprocfs_free_per_client_stats(struct obd_device *obd); +extern int +lprocfs_nid_stats_clear_write(struct file *file, const char *buffer, + unsigned long count, void *data); +extern int lprocfs_nid_stats_clear_read(struct seq_file *m, void *data); + +extern int lprocfs_register_stats(struct proc_dir_entry *root, const char *name, + struct lprocfs_stats *stats); + +/* lprocfs_status.c */ +extern int lprocfs_add_vars(struct proc_dir_entry *root, + struct lprocfs_vars *var, + void *data); + +extern struct proc_dir_entry *lprocfs_register(const char *name, + struct proc_dir_entry *parent, + struct lprocfs_vars *list, + void *data); + +extern void lprocfs_remove(struct proc_dir_entry **root); +extern void lprocfs_remove_proc_entry(const char *name, + struct proc_dir_entry *parent); + +extern int lprocfs_obd_setup(struct obd_device *obd, struct lprocfs_vars *list); +extern int lprocfs_obd_cleanup(struct obd_device *obd); + +extern int lprocfs_seq_create(struct proc_dir_entry *parent, const char *name, + umode_t mode, + const struct file_operations *seq_fops, + void *data); +extern int lprocfs_obd_seq_create(struct obd_device *dev, const char *name, + umode_t mode, + const struct file_operations *seq_fops, + void *data); + +/* Generic callbacks */ + +extern int lprocfs_rd_u64(struct seq_file *m, void *data); +extern int lprocfs_rd_atomic(struct seq_file *m, void *data); +extern int lprocfs_wr_atomic(struct file *file, const char __user *buffer, + unsigned long count, void *data); +extern int lprocfs_rd_uint(struct seq_file *m, void *data); +extern int lprocfs_wr_uint(struct file *file, const char __user *buffer, + unsigned long count, void *data); +extern int lprocfs_rd_uuid(struct seq_file *m, void *data); +extern int lprocfs_rd_name(struct seq_file *m, void *data); +extern int lprocfs_rd_server_uuid(struct seq_file *m, void *data); +extern int lprocfs_rd_conn_uuid(struct seq_file *m, void *data); +extern int lprocfs_rd_import(struct seq_file *m, void *data); +extern int lprocfs_rd_state(struct seq_file *m, void *data); +extern int lprocfs_rd_connect_flags(struct seq_file *m, void *data); +extern int lprocfs_rd_num_exports(struct seq_file *m, void *data); +extern int lprocfs_rd_numrefs(struct seq_file *m, void *data); + +struct adaptive_timeout; +extern int lprocfs_at_hist_helper(struct seq_file *m, + struct adaptive_timeout *at); +extern int lprocfs_rd_timeouts(struct seq_file *m, void *data); +extern int lprocfs_wr_timeouts(struct file *file, const char __user *buffer, + unsigned long count, void *data); +extern int lprocfs_wr_evict_client(struct file *file, const char __user *buffer, + size_t count, loff_t *off); +extern int lprocfs_wr_ping(struct file *file, const char __user *buffer, + size_t count, loff_t *off); +extern int lprocfs_wr_import(struct file *file, const char __user *buffer, + size_t count, loff_t *off); +extern int lprocfs_rd_pinger_recov(struct seq_file *m, void *n); +extern int lprocfs_wr_pinger_recov(struct file *file, const char __user *buffer, + size_t count, loff_t *off); + +/* Statfs helpers */ +extern int lprocfs_rd_blksize(struct seq_file *m, void *data); +extern int lprocfs_rd_kbytestotal(struct 
seq_file *m, void *data); +extern int lprocfs_rd_kbytesfree(struct seq_file *m, void *data); +extern int lprocfs_rd_kbytesavail(struct seq_file *m, void *data); +extern int lprocfs_rd_filestotal(struct seq_file *m, void *data); +extern int lprocfs_rd_filesfree(struct seq_file *m, void *data); + +extern int lprocfs_write_helper(const char __user *buffer, unsigned long count, + int *val); +extern int lprocfs_seq_read_frac_helper(struct seq_file *m, long val, int mult); +extern int lprocfs_write_u64_helper(const char __user *buffer, + unsigned long count, __u64 *val); +extern int lprocfs_write_frac_u64_helper(const char *buffer, + unsigned long count, + __u64 *val, int mult); +extern char *lprocfs_find_named_value(const char *buffer, const char *name, + size_t *count); +void lprocfs_oh_tally(struct obd_histogram *oh, unsigned int value); +void lprocfs_oh_tally_log2(struct obd_histogram *oh, unsigned int value); +void lprocfs_oh_clear(struct obd_histogram *oh); +unsigned long lprocfs_oh_sum(struct obd_histogram *oh); + +void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx, + struct lprocfs_counter *cnt); + +extern int lprocfs_single_release(struct inode *, struct file *); +extern int lprocfs_seq_release(struct inode *, struct file *); + +/* You must use these macros when you want to refer to + * the import in a client obd_device for a lprocfs entry */ +#define LPROCFS_CLIMP_CHECK(obd) do { \ + typecheck(struct obd_device *, obd); \ + down_read(&(obd)->u.cli.cl_sem); \ + if ((obd)->u.cli.cl_import == NULL) { \ + up_read(&(obd)->u.cli.cl_sem); \ + return -ENODEV; \ + } \ +} while (0) +#define LPROCFS_CLIMP_EXIT(obd) \ + up_read(&(obd)->u.cli.cl_sem) + + +/* write the name##_seq_show function, call LPROC_SEQ_FOPS_RO for read-only + proc entries; otherwise, you will define name##_seq_write function also for + a read-write proc entry, and then call LPROC_SEQ_SEQ instead. 
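The counter API declared above is normally used in three steps: allocate a stats block sized for a private enum of indices, describe each index with lprocfs_counter_init() (plain by-one counters by default, LPROCFS_CNTR_AVGMINMAX for multi-valued ones), then bump counters on the fast path with lprocfs_counter_incr()/lprocfs_counter_add() and read them back with lprocfs_stats_collector(). A condensed sketch follows, assuming this header is included; the MY_STAT_* indices and function names are hypothetical, and registration under /proc via lprocfs_register_stats() is omitted.

/* Sketch: typical lprocfs stats usage (hypothetical indices and names). */
enum {
	MY_STAT_OPEN = 0,	/* simple by-one counter */
	MY_STAT_READ_BYTES,	/* multi-valued counter (bytes per call) */
	MY_STAT_LAST
};

static struct lprocfs_stats *my_stats;

static int my_stats_setup(void)
{
	my_stats = lprocfs_alloc_stats(MY_STAT_LAST, LPROCFS_STATS_FLAG_NONE);
	if (my_stats == NULL)
		return -ENOMEM;

	lprocfs_counter_init(my_stats, MY_STAT_OPEN, 0, "open", "reqs");
	lprocfs_counter_init(my_stats, MY_STAT_READ_BYTES,
			     LPROCFS_CNTR_AVGMINMAX, "read_bytes", "bytes");
	return 0;
}

static void my_stats_account_open(void)
{
	lprocfs_counter_incr(my_stats, MY_STAT_OPEN);		/* count += 1 */
}

static void my_stats_account_read(size_t nob)
{
	lprocfs_counter_add(my_stats, MY_STAT_READ_BYTES, nob); /* min/max/sum */
}

static __u64 my_stats_total_read_bytes(void)
{
	return lprocfs_stats_collector(my_stats, MY_STAT_READ_BYTES,
				       LPROCFS_FIELDS_FLAGS_SUM);
}

static void my_stats_cleanup(void)
{
	lprocfs_free_stats(&my_stats);	/* release the stats block */
}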
Finally, + call lprocfs_obd_seq_create(obd, filename, 0444, &name#_fops, data); */ +#define __LPROC_SEQ_FOPS(name, custom_seq_write) \ +static int name##_single_open(struct inode *inode, struct file *file) \ +{ \ + return single_open(file, name##_seq_show, PDE_DATA(inode)); \ +} \ +static struct file_operations name##_fops = { \ + .owner = THIS_MODULE, \ + .open = name##_single_open, \ + .read = seq_read, \ + .write = custom_seq_write, \ + .llseek = seq_lseek, \ + .release = lprocfs_single_release, \ +} + +#define LPROC_SEQ_FOPS_RO(name) __LPROC_SEQ_FOPS(name, NULL) +#define LPROC_SEQ_FOPS(name) __LPROC_SEQ_FOPS(name, name##_seq_write) + +#define LPROC_SEQ_FOPS_RO_TYPE(name, type) \ + static int name##_##type##_seq_show(struct seq_file *m, void *v)\ + { \ + return lprocfs_rd_##type(m, m->private); \ + } \ + LPROC_SEQ_FOPS_RO(name##_##type) + +#define LPROC_SEQ_FOPS_RW_TYPE(name, type) \ + static int name##_##type##_seq_show(struct seq_file *m, void *v)\ + { \ + return lprocfs_rd_##type(m, m->private); \ + } \ + static ssize_t name##_##type##_seq_write(struct file *file, \ + const char __user *buffer, size_t count, \ + loff_t *off) \ + { \ + struct seq_file *seq = file->private_data; \ + return lprocfs_wr_##type(file, buffer, \ + count, seq->private); \ + } \ + LPROC_SEQ_FOPS(name##_##type) + +#define LPROC_SEQ_FOPS_WR_ONLY(name, type) \ + static ssize_t name##_##type##_write(struct file *file, \ + const char __user *buffer, size_t count, \ + loff_t *off) \ + { \ + return lprocfs_wr_##type(file, buffer, count, off); \ + } \ + static int name##_##type##_open(struct inode *inode, struct file *file) \ + { \ + return single_open(file, NULL, PDE_DATA(inode)); \ + } \ + static struct file_operations name##_##type##_fops = { \ + .open = name##_##type##_open, \ + .write = name##_##type##_write, \ + .release = lprocfs_single_release, \ + } + +/* lproc_ptlrpc.c */ +struct ptlrpc_request; +extern void target_print_req(void *seq_file, struct ptlrpc_request *req); + +/* lproc_status.c */ +int lprocfs_obd_rd_max_pages_per_rpc(struct seq_file *m, void *data); +int lprocfs_obd_wr_max_pages_per_rpc(struct file *file, const char *buffer, + size_t count, loff_t *off); + +/* all quota proc functions */ +extern int lprocfs_quota_rd_bunit(char *page, char **start, + loff_t off, int count, + int *eof, void *data); +extern int lprocfs_quota_wr_bunit(struct file *file, const char *buffer, + unsigned long count, void *data); +extern int lprocfs_quota_rd_btune(char *page, char **start, + loff_t off, int count, + int *eof, void *data); +extern int lprocfs_quota_wr_btune(struct file *file, const char *buffer, + unsigned long count, void *data); +extern int lprocfs_quota_rd_iunit(char *page, char **start, + loff_t off, int count, + int *eof, void *data); +extern int lprocfs_quota_wr_iunit(struct file *file, const char *buffer, + unsigned long count, void *data); +extern int lprocfs_quota_rd_itune(char *page, char **start, + loff_t off, int count, + int *eof, void *data); +extern int lprocfs_quota_wr_itune(struct file *file, const char *buffer, + unsigned long count, void *data); +extern int lprocfs_quota_rd_type(char *page, char **start, loff_t off, int count, + int *eof, void *data); +extern int lprocfs_quota_wr_type(struct file *file, const char *buffer, + unsigned long count, void *data); +extern int lprocfs_quota_rd_switch_seconds(char *page, char **start, loff_t off, + int count, int *eof, void *data); +extern int lprocfs_quota_wr_switch_seconds(struct file *file, + const char *buffer, + unsigned long count, void 
*data); +extern int lprocfs_quota_rd_sync_blk(char *page, char **start, loff_t off, + int count, int *eof, void *data); +extern int lprocfs_quota_wr_sync_blk(struct file *file, const char *buffer, + unsigned long count, void *data); +extern int lprocfs_quota_rd_switch_qs(char *page, char **start, loff_t off, + int count, int *eof, void *data); +extern int lprocfs_quota_wr_switch_qs(struct file *file, + const char *buffer, + unsigned long count, void *data); +extern int lprocfs_quota_rd_boundary_factor(char *page, char **start, loff_t off, + int count, int *eof, void *data); +extern int lprocfs_quota_wr_boundary_factor(struct file *file, + const char *buffer, + unsigned long count, void *data); +extern int lprocfs_quota_rd_least_bunit(char *page, char **start, loff_t off, + int count, int *eof, void *data); +extern int lprocfs_quota_wr_least_bunit(struct file *file, + const char *buffer, + unsigned long count, void *data); +extern int lprocfs_quota_rd_least_iunit(char *page, char **start, loff_t off, + int count, int *eof, void *data); +extern int lprocfs_quota_wr_least_iunit(struct file *file, + const char *buffer, + unsigned long count, void *data); +extern int lprocfs_quota_rd_qs_factor(char *page, char **start, loff_t off, + int count, int *eof, void *data); +extern int lprocfs_quota_wr_qs_factor(struct file *file, + const char *buffer, + unsigned long count, void *data); +#else +/* CONFIG_PROC_FS is not defined */ + +#define proc_lustre_root NULL + +static inline void lprocfs_counter_add(struct lprocfs_stats *stats, + int index, long amount) +{ return; } +static inline void lprocfs_counter_incr(struct lprocfs_stats *stats, + int index) +{ return; } +static inline void lprocfs_counter_sub(struct lprocfs_stats *stats, + int index, long amount) +{ return; } +static inline void lprocfs_counter_decr(struct lprocfs_stats *stats, + int index) +{ return; } +static inline void lprocfs_counter_init(struct lprocfs_stats *stats, + int index, unsigned conf, + const char *name, const char *units) +{ return; } + +static inline __u64 lc_read_helper(struct lprocfs_counter *lc, + enum lprocfs_fields_flags field) +{ return 0; } + +/* NB: we return !NULL to satisfy error checker */ +static inline struct lprocfs_stats * +lprocfs_alloc_stats(unsigned int num, enum lprocfs_stats_flags flags) +{ return (struct lprocfs_stats *)1; } +static inline void lprocfs_clear_stats(struct lprocfs_stats *stats) +{ return; } +static inline void lprocfs_free_stats(struct lprocfs_stats **stats) +{ return; } +static inline int lprocfs_register_stats(struct proc_dir_entry *root, + const char *name, + struct lprocfs_stats *stats) +{ return 0; } +static inline void lprocfs_init_ops_stats(int num_private_stats, + struct lprocfs_stats *stats) +{ return; } +static inline void lprocfs_init_mps_stats(int num_private_stats, + struct lprocfs_stats *stats) +{ return; } +static inline void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats) +{ return; } +static inline int lprocfs_alloc_obd_stats(struct obd_device *obddev, + unsigned int num_private_stats) +{ return 0; } +static inline int lprocfs_alloc_md_stats(struct obd_device *obddev, + unsigned int num_private_stats) +{ return 0; } +static inline void lprocfs_free_obd_stats(struct obd_device *obddev) +{ return; } +static inline void lprocfs_free_md_stats(struct obd_device *obddev) +{ return; } + +struct obd_export; +static inline int lprocfs_add_clear_entry(struct obd_export *exp) +{ return 0; } +static inline int lprocfs_exp_setup(struct obd_export *exp, + lnet_nid_t 
*peer_nid, + int *newnid) +{ return 0; } +static inline int lprocfs_exp_cleanup(struct obd_export *exp) +{ return 0; } +static inline struct proc_dir_entry * +lprocfs_add_simple(struct proc_dir_entry *root, char *name, + void *data, struct file_operations *fops) +{return 0; } +static inline struct proc_dir_entry * +lprocfs_add_symlink(const char *name, struct proc_dir_entry *parent, + const char *format, ...) +{return NULL; } +static inline void lprocfs_free_per_client_stats(struct obd_device *obd) +{ return; } +static inline +int lprocfs_nid_stats_clear_write(struct file *file, const char *buffer, + unsigned long count, void *data) +{return count;} +static inline +int lprocfs_nid_stats_clear_read(struct seq_file *m, void *data) +{ return 0; } + +static inline struct proc_dir_entry * +lprocfs_register(const char *name, struct proc_dir_entry *parent, + struct lprocfs_vars *list, void *data) +{ return NULL; } +static inline int lprocfs_add_vars(struct proc_dir_entry *root, + struct lprocfs_vars *var, + void *data) +{ return 0; } +static inline void lprocfs_remove(struct proc_dir_entry **root) +{ return; } +static inline void lprocfs_remove_proc_entry(const char *name, + struct proc_dir_entry *parent) +{ return; } +static inline int lprocfs_obd_setup(struct obd_device *dev, + struct lprocfs_vars *list) +{ return 0; } +static inline int lprocfs_obd_cleanup(struct obd_device *dev) +{ return 0; } +static inline int lprocfs_rd_u64(struct seq_file *m, void *data) +{ return 0; } +static inline int lprocfs_rd_uuid(struct seq_file *m, void *data) +{ return 0; } +static inline int lprocfs_rd_name(struct seq_file *m, void *data) +{ return 0; } +static inline int lprocfs_rd_server_uuid(struct seq_file *m, void *data) +{ return 0; } +static inline int lprocfs_rd_conn_uuid(struct seq_file *m, void *data) +{ return 0; } +static inline int lprocfs_rd_import(struct seq_file *m, void *data) +{ return 0; } +static inline int lprocfs_rd_pinger_recov(struct seq_file *m, void *n) +{ return 0; } +static inline int lprocfs_rd_state(struct seq_file *m, void *data) +{ return 0; } +static inline int lprocfs_rd_connect_flags(struct seq_file *m, void *data) +{ return 0; } +static inline int lprocfs_rd_num_exports(struct seq_file *m, void *data) +{ return 0; } +extern inline int lprocfs_rd_numrefs(struct seq_file *m, void *data) +{ return 0; } +struct adaptive_timeout; +static inline int lprocfs_at_hist_helper(struct seq_file *m, + struct adaptive_timeout *at) +{ return 0; } +static inline int lprocfs_rd_timeouts(struct seq_file *m, void *data) +{ return 0; } +static inline int lprocfs_wr_timeouts(struct file *file, + const char __user *buffer, + unsigned long count, void *data) +{ return 0; } +static inline int lprocfs_wr_evict_client(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ return 0; } +static inline int lprocfs_wr_ping(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ return 0; } +static inline int lprocfs_wr_import(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ return 0; } +static inline int lprocfs_wr_pinger_recov(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ return 0; } + +/* Statfs helpers */ +static inline +int lprocfs_rd_blksize(struct seq_file *m, void *data) +{ return 0; } +static inline +int lprocfs_rd_kbytestotal(struct seq_file *m, void *data) +{ return 0; } +static inline +int lprocfs_rd_kbytesfree(struct seq_file *m, void *data) +{ return 0; } +static inline +int 
lprocfs_rd_kbytesavail(struct seq_file *m, void *data) +{ return 0; } +static inline +int lprocfs_rd_filestotal(struct seq_file *m, void *data) +{ return 0; } +static inline +int lprocfs_rd_filesfree(struct seq_file *m, void *data) +{ return 0; } +static inline +void lprocfs_oh_tally(struct obd_histogram *oh, unsigned int value) +{ return; } +static inline +void lprocfs_oh_tally_log2(struct obd_histogram *oh, unsigned int value) +{ return; } +static inline +void lprocfs_oh_clear(struct obd_histogram *oh) +{ return; } +static inline +unsigned long lprocfs_oh_sum(struct obd_histogram *oh) +{ return 0; } +static inline +void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx, + struct lprocfs_counter *cnt) +{ return; } +static inline +__u64 lprocfs_stats_collector(struct lprocfs_stats *stats, int idx, + enum lprocfs_fields_flags field) +{ return (__u64)0; } + +#define LPROC_SEQ_FOPS_RO(name) +#define LPROC_SEQ_FOPS(name) +#define LPROC_SEQ_FOPS_RO_TYPE(name, type) +#define LPROC_SEQ_FOPS_RW_TYPE(name, type) +#define LPROC_SEQ_FOPS_WR_ONLY(name, type) + +/* lproc_ptlrpc.c */ +#define target_print_req NULL + +#endif /* CONFIG_PROC_FS */ + +#endif /* LPROCFS_SNMP_H */ diff --git a/kernel/drivers/staging/lustre/lustre/include/lu_object.h b/kernel/drivers/staging/lustre/lustre/include/lu_object.h new file mode 100644 index 000000000..c8cc48f00 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lu_object.h @@ -0,0 +1,1340 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LUSTRE_LU_OBJECT_H +#define __LUSTRE_LU_OBJECT_H + +#include +#include "../../include/linux/libcfs/libcfs.h" +#include "lustre/lustre_idl.h" +#include "lu_ref.h" + +struct seq_file; +struct proc_dir_entry; +struct lustre_cfg; +struct lprocfs_stats; + +/** \defgroup lu lu + * lu_* data-types represent server-side entities shared by data and meta-data + * stacks. + * + * Design goals: + * + * -# support for layering. + * + * Server side object is split into layers, one per device in the + * corresponding device stack. Individual layer is represented by struct + * lu_object. Compound layered object --- by struct lu_object_header. Most + * interface functions take lu_object as an argument and operate on the + * whole compound object. 
This decision was made due to the following + * reasons: + * + * - it's envisaged that lu_object will be used much more often than + * lu_object_header; + * + * - we want lower (non-top) layers to be able to initiate operations + * on the whole object. + * + * Generic code supports layering more complex than simple stacking, e.g., + * it is possible that at some layer object "spawns" multiple sub-objects + * on the lower layer. + * + * -# fid-based identification. + * + * Compound object is uniquely identified by its fid. Objects are indexed + * by their fids (hash table is used for index). + * + * -# caching and life-cycle management. + * + * Object's life-time is controlled by reference counting. When reference + * count drops to 0, object is returned to cache. Cached objects still + * retain their identity (i.e., fid), and can be recovered from cache. + * + * Objects are kept in the global LRU list, and lu_site_purge() function + * can be used to reclaim given number of unused objects from the tail of + * the LRU. + * + * -# avoiding recursion. + * + * Generic code tries to replace recursion through layers by iterations + * where possible. Additionally to the end of reducing stack consumption, + * data, when practically possible, are allocated through lu_context_key + * interface rather than on stack. + * @{ + */ + +struct lu_site; +struct lu_object; +struct lu_device; +struct lu_object_header; +struct lu_context; +struct lu_env; + +/** + * Operations common for data and meta-data devices. + */ +struct lu_device_operations { + /** + * Allocate object for the given device (without lower-layer + * parts). This is called by lu_object_operations::loo_object_init() + * from the parent layer, and should setup at least lu_object::lo_dev + * and lu_object::lo_ops fields of resulting lu_object. + * + * Object creation protocol. + * + * Due to design goal of avoiding recursion, object creation (see + * lu_object_alloc()) is somewhat involved: + * + * - first, lu_device_operations::ldo_object_alloc() method of the + * top-level device in the stack is called. It should allocate top + * level object (including lu_object_header), but without any + * lower-layer sub-object(s). + * + * - then lu_object_alloc() sets fid in the header of newly created + * object. + * + * - then lu_object_operations::loo_object_init() is called. It has + * to allocate lower-layer object(s). To do this, + * lu_object_operations::loo_object_init() calls ldo_object_alloc() + * of the lower-layer device(s). + * + * - for all new objects allocated by + * lu_object_operations::loo_object_init() (and inserted into object + * stack), lu_object_operations::loo_object_init() is called again + * repeatedly, until no new objects are created. + * + * \post ergo(!IS_ERR(result), result->lo_dev == d && + * result->lo_ops != NULL); + */ + struct lu_object *(*ldo_object_alloc)(const struct lu_env *env, + const struct lu_object_header *h, + struct lu_device *d); + /** + * process config specific for device. + */ + int (*ldo_process_config)(const struct lu_env *env, + struct lu_device *, struct lustre_cfg *); + int (*ldo_recovery_complete)(const struct lu_env *, + struct lu_device *); + + /** + * initialize local objects for device. this method called after layer has + * been initialized (after LCFG_SETUP stage) and before it starts serving + * user requests. 
+ */ + + int (*ldo_prepare)(const struct lu_env *, + struct lu_device *parent, + struct lu_device *dev); + +}; + +/** + * For lu_object_conf flags + */ +typedef enum { + /* This is a new object to be allocated, or the file + * corresponding to the object does not exists. */ + LOC_F_NEW = 0x00000001, +} loc_flags_t; + +/** + * Object configuration, describing particulars of object being created. On + * server this is not used, as server objects are full identified by fid. On + * client configuration contains struct lustre_md. + */ +struct lu_object_conf { + /** + * Some hints for obj find and alloc. + */ + loc_flags_t loc_flags; +}; + +/** + * Type of "printer" function used by lu_object_operations::loo_object_print() + * method. + * + * Printer function is needed to provide some flexibility in (semi-)debugging + * output: possible implementations: printk, CDEBUG, sysfs/seq_file + */ +typedef int (*lu_printer_t)(const struct lu_env *env, + void *cookie, const char *format, ...) + __printf(3, 4); + +/** + * Operations specific for particular lu_object. + */ +struct lu_object_operations { + + /** + * Allocate lower-layer parts of the object by calling + * lu_device_operations::ldo_object_alloc() of the corresponding + * underlying device. + * + * This method is called once for each object inserted into object + * stack. It's responsibility of this method to insert lower-layer + * object(s) it create into appropriate places of object stack. + */ + int (*loo_object_init)(const struct lu_env *env, + struct lu_object *o, + const struct lu_object_conf *conf); + /** + * Called (in top-to-bottom order) during object allocation after all + * layers were allocated and initialized. Can be used to perform + * initialization depending on lower layers. + */ + int (*loo_object_start)(const struct lu_env *env, + struct lu_object *o); + /** + * Called before lu_object_operations::loo_object_free() to signal + * that object is being destroyed. Dual to + * lu_object_operations::loo_object_init(). + */ + void (*loo_object_delete)(const struct lu_env *env, + struct lu_object *o); + /** + * Dual to lu_device_operations::ldo_object_alloc(). Called when + * object is removed from memory. + */ + void (*loo_object_free)(const struct lu_env *env, + struct lu_object *o); + /** + * Called when last active reference to the object is released (and + * object returns to the cache). This method is optional. + */ + void (*loo_object_release)(const struct lu_env *env, + struct lu_object *o); + /** + * Optional debugging helper. Print given object. + */ + int (*loo_object_print)(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o); + /** + * Optional debugging method. Returns true iff method is internally + * consistent. + */ + int (*loo_object_invariant)(const struct lu_object *o); +}; + +/** + * Type of lu_device. + */ +struct lu_device_type; + +/** + * Device: a layer in the server side abstraction stacking. + */ +struct lu_device { + /** + * reference count. This is incremented, in particular, on each object + * created at this layer. + * + * \todo XXX which means that atomic_t is probably too small. + */ + atomic_t ld_ref; + /** + * Pointer to device type. Never modified once set. + */ + struct lu_device_type *ld_type; + /** + * Operation vector for this device. + */ + const struct lu_device_operations *ld_ops; + /** + * Stack this device belongs to. + */ + struct lu_site *ld_site; + struct proc_dir_entry *ld_proc_entry; + + /** \todo XXX: temporary back pointer into obd. 
*/ + struct obd_device *ld_obd; + /** + * A list of references to this object, for debugging. + */ + struct lu_ref ld_reference; + /** + * Link the device to the site. + **/ + struct list_head ld_linkage; +}; + +struct lu_device_type_operations; + +/** + * Tag bits for device type. They are used to distinguish certain groups of + * device types. + */ +enum lu_device_tag { + /** this is meta-data device */ + LU_DEVICE_MD = (1 << 0), + /** this is data device */ + LU_DEVICE_DT = (1 << 1), + /** data device in the client stack */ + LU_DEVICE_CL = (1 << 2) +}; + +/** + * Type of device. + */ +struct lu_device_type { + /** + * Tag bits. Taken from enum lu_device_tag. Never modified once set. + */ + __u32 ldt_tags; + /** + * Name of this class. Unique system-wide. Never modified once set. + */ + char *ldt_name; + /** + * Operations for this type. + */ + const struct lu_device_type_operations *ldt_ops; + /** + * \todo XXX: temporary pointer to associated obd_type. + */ + struct obd_type *ldt_obd_type; + /** + * \todo XXX: temporary: context tags used by obd_*() calls. + */ + __u32 ldt_ctx_tags; + /** + * Number of existing device type instances. + */ + unsigned ldt_device_nr; + /** + * Linkage into a global list of all device types. + * + * \see lu_device_types. + */ + struct list_head ldt_linkage; +}; + +/** + * Operations on a device type. + */ +struct lu_device_type_operations { + /** + * Allocate new device. + */ + struct lu_device *(*ldto_device_alloc)(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *lcfg); + /** + * Free device. Dual to + * lu_device_type_operations::ldto_device_alloc(). Returns pointer to + * the next device in the stack. + */ + struct lu_device *(*ldto_device_free)(const struct lu_env *, + struct lu_device *); + + /** + * Initialize the devices after allocation + */ + int (*ldto_device_init)(const struct lu_env *env, + struct lu_device *, const char *, + struct lu_device *); + /** + * Finalize device. Dual to + * lu_device_type_operations::ldto_device_init(). Returns pointer to + * the next device in the stack. + */ + struct lu_device *(*ldto_device_fini)(const struct lu_env *env, + struct lu_device *); + /** + * Initialize device type. This is called on module load. + */ + int (*ldto_init)(struct lu_device_type *t); + /** + * Finalize device type. Dual to + * lu_device_type_operations::ldto_init(). Called on module unload. + */ + void (*ldto_fini)(struct lu_device_type *t); + /** + * Called when the first device is created. + */ + void (*ldto_start)(struct lu_device_type *t); + /** + * Called when number of devices drops to 0. + */ + void (*ldto_stop)(struct lu_device_type *t); +}; + +static inline int lu_device_is_md(const struct lu_device *d) +{ + return ergo(d != NULL, d->ld_type->ldt_tags & LU_DEVICE_MD); +} + +/** + * Common object attributes. 
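+ *
+ * Illustrative use (not part of the upstream description): a caller that
+ * describes, say, a chmod/chown-style update fills only the fields it
+ * cares about and records them in la_valid, so consumers ignore the rest:
+ *
+ *	struct lu_attr attr = { 0 };
+ *
+ *	attr.la_mode  = S_IFREG | 0644;
+ *	attr.la_uid   = 0;
+ *	attr.la_gid   = 0;
+ *	attr.la_valid = LA_MODE | LA_UID | LA_GID;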
+ */ +struct lu_attr { + /** size in bytes */ + __u64 la_size; + /** modification time in seconds since Epoch */ + s64 la_mtime; + /** access time in seconds since Epoch */ + s64 la_atime; + /** change time in seconds since Epoch */ + s64 la_ctime; + /** 512-byte blocks allocated to object */ + __u64 la_blocks; + /** permission bits and file type */ + __u32 la_mode; + /** owner id */ + __u32 la_uid; + /** group id */ + __u32 la_gid; + /** object flags */ + __u32 la_flags; + /** number of persistent references to this object */ + __u32 la_nlink; + /** blk bits of the object*/ + __u32 la_blkbits; + /** blk size of the object*/ + __u32 la_blksize; + /** real device */ + __u32 la_rdev; + /** + * valid bits + * + * \see enum la_valid + */ + __u64 la_valid; +}; + +/** Bit-mask of valid attributes */ +enum la_valid { + LA_ATIME = 1 << 0, + LA_MTIME = 1 << 1, + LA_CTIME = 1 << 2, + LA_SIZE = 1 << 3, + LA_MODE = 1 << 4, + LA_UID = 1 << 5, + LA_GID = 1 << 6, + LA_BLOCKS = 1 << 7, + LA_TYPE = 1 << 8, + LA_FLAGS = 1 << 9, + LA_NLINK = 1 << 10, + LA_RDEV = 1 << 11, + LA_BLKSIZE = 1 << 12, + LA_KILL_SUID = 1 << 13, + LA_KILL_SGID = 1 << 14, +}; + +/** + * Layer in the layered object. + */ +struct lu_object { + /** + * Header for this object. + */ + struct lu_object_header *lo_header; + /** + * Device for this layer. + */ + struct lu_device *lo_dev; + /** + * Operations for this object. + */ + const struct lu_object_operations *lo_ops; + /** + * Linkage into list of all layers. + */ + struct list_head lo_linkage; + /** + * Link to the device, for debugging. + */ + struct lu_ref_link lo_dev_ref; +}; + +enum lu_object_header_flags { + /** + * Don't keep this object in cache. Object will be destroyed as soon + * as last reference to it is released. This flag cannot be cleared + * once set. + */ + LU_OBJECT_HEARD_BANSHEE = 0, + /** + * Mark this object has already been taken out of cache. + */ + LU_OBJECT_UNHASHED = 1 +}; + +enum lu_object_header_attr { + LOHA_EXISTS = 1 << 0, + LOHA_REMOTE = 1 << 1, + /** + * UNIX file type is stored in S_IFMT bits. + */ + LOHA_FT_START = 001 << 12, /**< S_IFIFO */ + LOHA_FT_END = 017 << 12, /**< S_IFMT */ +}; + +/** + * "Compound" object, consisting of multiple layers. + * + * Compound object with given fid is unique with given lu_site. + * + * Note, that object does *not* necessary correspond to the real object in the + * persistent storage: object is an anchor for locking and method calling, so + * it is created for things like not-yet-existing child created by mkdir or + * create calls. lu_object_operations::loo_exists() can be used to check + * whether object is backed by persistent storage entity. + */ +struct lu_object_header { + /** + * Fid, uniquely identifying this object. + */ + struct lu_fid loh_fid; + /** + * Object flags from enum lu_object_header_flags. Set and checked + * atomically. + */ + unsigned long loh_flags; + /** + * Object reference count. Protected by lu_site::ls_guard. + */ + atomic_t loh_ref; + /** + * Common object attributes, cached for efficiency. From enum + * lu_object_header_attr. + */ + __u32 loh_attr; + /** + * Linkage into per-site hash table. Protected by lu_site::ls_guard. + */ + struct hlist_node loh_hash; + /** + * Linkage into per-site LRU list. Protected by lu_site::ls_guard. + */ + struct list_head loh_lru; + /** + * Linkage into list of layers. Never modified once set (except lately + * during object destruction). No locking is necessary. 
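+ *
+ * For example (illustrative only; "h" stands for a lu_object_header and
+ * do_something_with() is a placeholder), all layers of a compound object
+ * can be visited with:
+ *
+ *	struct lu_object *o;
+ *
+ *	list_for_each_entry(o, &h->loh_layers, lo_linkage)
+ *		do_something_with(o);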
+ */ + struct list_head loh_layers; + /** + * A list of references to this object, for debugging. + */ + struct lu_ref loh_reference; +}; + +struct fld; + +struct lu_site_bkt_data { + /** + * number of busy object on this bucket + */ + long lsb_busy; + /** + * LRU list, updated on each access to object. Protected by + * bucket lock of lu_site::ls_obj_hash. + * + * "Cold" end of LRU is lu_site::ls_lru.next. Accessed object are + * moved to the lu_site::ls_lru.prev (this is due to the non-existence + * of list_for_each_entry_safe_reverse()). + */ + struct list_head lsb_lru; + /** + * Wait-queue signaled when an object in this site is ultimately + * destroyed (lu_object_free()). It is used by lu_object_find() to + * wait before re-trying when object in the process of destruction is + * found in the hash table. + * + * \see htable_lookup(). + */ + wait_queue_head_t lsb_marche_funebre; +}; + +enum { + LU_SS_CREATED = 0, + LU_SS_CACHE_HIT, + LU_SS_CACHE_MISS, + LU_SS_CACHE_RACE, + LU_SS_CACHE_DEATH_RACE, + LU_SS_LRU_PURGED, + LU_SS_LAST_STAT +}; + +/** + * lu_site is a "compartment" within which objects are unique, and LRU + * discipline is maintained. + * + * lu_site exists so that multiple layered stacks can co-exist in the same + * address space. + * + * lu_site has the same relation to lu_device as lu_object_header to + * lu_object. + */ +struct lu_site { + /** + * objects hash table + */ + struct cfs_hash *ls_obj_hash; + /** + * index of bucket on hash table while purging + */ + int ls_purge_start; + /** + * Top-level device for this stack. + */ + struct lu_device *ls_top_dev; + /** + * Bottom-level device for this stack + */ + struct lu_device *ls_bottom_dev; + /** + * Linkage into global list of sites. + */ + struct list_head ls_linkage; + /** + * List for lu device for this site, protected + * by ls_ld_lock. + **/ + struct list_head ls_ld_linkage; + spinlock_t ls_ld_lock; + + /** + * lu_site stats + */ + struct lprocfs_stats *ls_stats; + /** + * XXX: a hack! fld has to find md_site via site, remove when possible + */ + struct seq_server_site *ld_seq_site; +}; + +static inline struct lu_site_bkt_data * +lu_site_bkt_from_fid(struct lu_site *site, struct lu_fid *fid) +{ + struct cfs_hash_bd bd; + + cfs_hash_bd_get(site->ls_obj_hash, fid, &bd); + return cfs_hash_bd_extra_get(site->ls_obj_hash, &bd); +} + +static inline struct seq_server_site *lu_site2seq(const struct lu_site *s) +{ + return s->ld_seq_site; +} + +/** \name ctors + * Constructors/destructors. + * @{ + */ + +int lu_site_init (struct lu_site *s, struct lu_device *d); +void lu_site_fini (struct lu_site *s); +int lu_site_init_finish (struct lu_site *s); +void lu_stack_fini (const struct lu_env *env, struct lu_device *top); +void lu_device_get (struct lu_device *d); +void lu_device_put (struct lu_device *d); +int lu_device_init (struct lu_device *d, struct lu_device_type *t); +void lu_device_fini (struct lu_device *d); +int lu_object_header_init(struct lu_object_header *h); +void lu_object_header_fini(struct lu_object_header *h); +int lu_object_init (struct lu_object *o, + struct lu_object_header *h, struct lu_device *d); +void lu_object_fini (struct lu_object *o); +void lu_object_add_top (struct lu_object_header *h, struct lu_object *o); +void lu_object_add (struct lu_object *before, struct lu_object *o); + +void lu_dev_add_linkage(struct lu_site *s, struct lu_device *d); +void lu_dev_del_linkage(struct lu_site *s, struct lu_device *d); + +/** + * Helpers to initialize and finalize device types. 
+ */ + +int lu_device_type_init(struct lu_device_type *ldt); +void lu_device_type_fini(struct lu_device_type *ldt); +void lu_types_stop(void); + +/** @} ctors */ + +/** \name caching + * Caching and reference counting. + * @{ + */ + +/** + * Acquire additional reference to the given object. This function is used to + * attain additional reference. To acquire initial reference use + * lu_object_find(). + */ +static inline void lu_object_get(struct lu_object *o) +{ + LASSERT(atomic_read(&o->lo_header->loh_ref) > 0); + atomic_inc(&o->lo_header->loh_ref); +} + +/** + * Return true of object will not be cached after last reference to it is + * released. + */ +static inline int lu_object_is_dying(const struct lu_object_header *h) +{ + return test_bit(LU_OBJECT_HEARD_BANSHEE, &h->loh_flags); +} + +void lu_object_put(const struct lu_env *env, struct lu_object *o); +void lu_object_put_nocache(const struct lu_env *env, struct lu_object *o); +void lu_object_unhash(const struct lu_env *env, struct lu_object *o); + +int lu_site_purge(const struct lu_env *env, struct lu_site *s, int nr); + +void lu_site_print(const struct lu_env *env, struct lu_site *s, void *cookie, + lu_printer_t printer); +struct lu_object *lu_object_find(const struct lu_env *env, + struct lu_device *dev, const struct lu_fid *f, + const struct lu_object_conf *conf); +struct lu_object *lu_object_find_at(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f, + const struct lu_object_conf *conf); +struct lu_object *lu_object_find_slice(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f, + const struct lu_object_conf *conf); +/** @} caching */ + +/** \name helpers + * Helpers. + * @{ + */ + +/** + * First (topmost) sub-object of given compound object + */ +static inline struct lu_object *lu_object_top(struct lu_object_header *h) +{ + LASSERT(!list_empty(&h->loh_layers)); + return container_of0(h->loh_layers.next, struct lu_object, lo_linkage); +} + +/** + * Next sub-object in the layering + */ +static inline struct lu_object *lu_object_next(const struct lu_object *o) +{ + return container_of0(o->lo_linkage.next, struct lu_object, lo_linkage); +} + +/** + * Pointer to the fid of this object. + */ +static inline const struct lu_fid *lu_object_fid(const struct lu_object *o) +{ + return &o->lo_header->loh_fid; +} + +/** + * return device operations vector for this object + */ +static const inline struct lu_device_operations * +lu_object_ops(const struct lu_object *o) +{ + return o->lo_dev->ld_ops; +} + +/** + * Given a compound object, find its slice, corresponding to the device type + * \a dtype. + */ +struct lu_object *lu_object_locate(struct lu_object_header *h, + const struct lu_device_type *dtype); + +/** + * Printer function emitting messages through libcfs_debug_msg(). + */ +int lu_cdebug_printer(const struct lu_env *env, + void *cookie, const char *format, ...); + +/** + * Print object description followed by a user-supplied message. + */ +#define LU_OBJECT_DEBUG(mask, env, object, format, ...) \ +do { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \ + \ + if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \ + lu_object_print(env, &msgdata, lu_cdebug_printer, object);\ + CDEBUG(mask, format , ## __VA_ARGS__); \ + } \ +} while (0) + +/** + * Print short object description followed by a user-supplied message. + */ +#define LU_OBJECT_HEADER(mask, env, object, format, ...) 
\ +do { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \ + \ + if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \ + lu_object_header_print(env, &msgdata, lu_cdebug_printer,\ + (object)->lo_header); \ + lu_cdebug_printer(env, &msgdata, "\n"); \ + CDEBUG(mask, format , ## __VA_ARGS__); \ + } \ +} while (0) + +void lu_object_print (const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct lu_object *o); +void lu_object_header_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, + const struct lu_object_header *hdr); + +/** + * Check object consistency. + */ +int lu_object_invariant(const struct lu_object *o); + + +/** + * Check whether object exists, no matter on local or remote storage. + * Note: LOHA_EXISTS will be set once some one created the object, + * and it does not needs to be committed to storage. + */ +#define lu_object_exists(o) ((o)->lo_header->loh_attr & LOHA_EXISTS) + +/** + * Check whether object on the remote storage. + */ +#define lu_object_remote(o) unlikely((o)->lo_header->loh_attr & LOHA_REMOTE) + +static inline int lu_object_assert_exists(const struct lu_object *o) +{ + return lu_object_exists(o); +} + +static inline int lu_object_assert_not_exists(const struct lu_object *o) +{ + return !lu_object_exists(o); +} + +/** + * Attr of this object. + */ +static inline __u32 lu_object_attr(const struct lu_object *o) +{ + LASSERT(lu_object_exists(o) != 0); + return o->lo_header->loh_attr; +} + +static inline void lu_object_ref_add(struct lu_object *o, + const char *scope, + const void *source) +{ + lu_ref_add(&o->lo_header->loh_reference, scope, source); +} + +static inline void lu_object_ref_add_at(struct lu_object *o, + struct lu_ref_link *link, + const char *scope, + const void *source) +{ + lu_ref_add_at(&o->lo_header->loh_reference, link, scope, source); +} + +static inline void lu_object_ref_del(struct lu_object *o, + const char *scope, const void *source) +{ + lu_ref_del(&o->lo_header->loh_reference, scope, source); +} + +static inline void lu_object_ref_del_at(struct lu_object *o, + struct lu_ref_link *link, + const char *scope, const void *source) +{ + lu_ref_del_at(&o->lo_header->loh_reference, link, scope, source); +} + +/** input params, should be filled out by mdt */ +struct lu_rdpg { + /** hash */ + __u64 rp_hash; + /** count in bytes */ + unsigned int rp_count; + /** number of pages */ + unsigned int rp_npages; + /** requested attr */ + __u32 rp_attrs; + /** pointers to pages */ + struct page **rp_pages; +}; + +enum lu_xattr_flags { + LU_XATTR_REPLACE = (1 << 0), + LU_XATTR_CREATE = (1 << 1) +}; + +/** @} helpers */ + +/** \name lu_context + * @{ */ + +/** For lu_context health-checks */ +enum lu_context_state { + LCS_INITIALIZED = 1, + LCS_ENTERED, + LCS_LEFT, + LCS_FINALIZED +}; + +/** + * lu_context. Execution context for lu_object methods. Currently associated + * with thread. + * + * All lu_object methods, except device and device type methods (called during + * system initialization and shutdown) are executed "within" some + * lu_context. This means, that pointer to some "current" lu_context is passed + * as an argument to all methods. + * + * All service ptlrpc threads create lu_context as part of their + * initialization. It is possible to create "stand-alone" context for other + * execution environments (like system calls). 
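+ *
+ * Such a stand-alone environment is usually set up through the lu_env
+ * helpers declared later in this header, roughly like this (illustrative
+ * sketch only; do_something() is a placeholder for code that calls
+ * lu_object methods):
+ *
+ *	struct lu_env env;
+ *	int rc;
+ *
+ *	rc = lu_env_init(&env, LCT_LOCAL);
+ *	if (rc == 0) {
+ *		rc = do_something(&env);
+ *		lu_env_fini(&env);
+ *	}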
+ * + * lu_object methods mainly use lu_context through lu_context_key interface + * that allows each layer to associate arbitrary pieces of data with each + * context (see pthread_key_create(3) for similar interface). + * + * On a client, lu_context is bound to a thread, see cl_env_get(). + * + * \see lu_context_key + */ +struct lu_context { + /** + * lu_context is used on the client side too. Yet we don't want to + * allocate values of server-side keys for the client contexts and + * vice versa. + * + * To achieve this, set of tags in introduced. Contexts and keys are + * marked with tags. Key value are created only for context whose set + * of tags has non-empty intersection with one for key. Tags are taken + * from enum lu_context_tag. + */ + __u32 lc_tags; + enum lu_context_state lc_state; + /** + * Pointer to the home service thread. NULL for other execution + * contexts. + */ + struct ptlrpc_thread *lc_thread; + /** + * Pointer to an array with key values. Internal implementation + * detail. + */ + void **lc_value; + /** + * Linkage into a list of all remembered contexts. Only + * `non-transient' contexts, i.e., ones created for service threads + * are placed here. + */ + struct list_head lc_remember; + /** + * Version counter used to skip calls to lu_context_refill() when no + * keys were registered. + */ + unsigned lc_version; + /** + * Debugging cookie. + */ + unsigned lc_cookie; +}; + +/** + * lu_context_key interface. Similar to pthread_key. + */ + +enum lu_context_tag { + /** + * Thread on md server + */ + LCT_MD_THREAD = 1 << 0, + /** + * Thread on dt server + */ + LCT_DT_THREAD = 1 << 1, + /** + * Context for transaction handle + */ + LCT_TX_HANDLE = 1 << 2, + /** + * Thread on client + */ + LCT_CL_THREAD = 1 << 3, + /** + * A per-request session on a server, and a per-system-call session on + * a client. + */ + LCT_SESSION = 1 << 4, + /** + * A per-request data on OSP device + */ + LCT_OSP_THREAD = 1 << 5, + /** + * MGS device thread + */ + LCT_MG_THREAD = 1 << 6, + /** + * Context for local operations + */ + LCT_LOCAL = 1 << 7, + /** + * Set when at least one of keys, having values in this context has + * non-NULL lu_context_key::lct_exit() method. This is used to + * optimize lu_context_exit() call. + */ + LCT_HAS_EXIT = 1 << 28, + /** + * Don't add references for modules creating key values in that context. + * This is only for contexts used internally by lu_object framework. + */ + LCT_NOREF = 1 << 29, + /** + * Key is being prepared for retiring, don't create new values for it. + */ + LCT_QUIESCENT = 1 << 30, + /** + * Context should be remembered. + */ + LCT_REMEMBER = 1 << 31, + /** + * Contexts usable in cache shrinker thread. + */ + LCT_SHRINKER = LCT_MD_THREAD|LCT_DT_THREAD|LCT_CL_THREAD|LCT_NOREF +}; + +/** + * Key. Represents per-context value slot. + * + * Keys are usually registered when module owning the key is initialized, and + * de-registered when module is unloaded. Once key is registered, all new + * contexts with matching tags, will get key value. "Old" contexts, already + * initialized at the time of key registration, can be forced to get key value + * by calling lu_context_refill(). + * + * Every key value is counted in lu_context_key::lct_used and acquires a + * reference on an owning module. This means, that all key values have to be + * destroyed before module can be unloaded. This is usually achieved by + * stopping threads started by the module, that created contexts in their + * entry functions. 
Situation is complicated by the threads shared by multiple + * modules, like ptlrpcd daemon on a client. To work around this problem, + * contexts, created in such threads, are `remembered' (see + * LCT_REMEMBER)---i.e., added into a global list. When module is preparing + * for unloading it does the following: + * + * - marks its keys as `quiescent' (lu_context_tag::LCT_QUIESCENT) + * preventing new key values from being allocated in the new contexts, + * and + * + * - scans a list of remembered contexts, destroying values of module + * keys, thus releasing references to the module. + * + * This is done by lu_context_key_quiesce(). If module is re-activated + * before key has been de-registered, lu_context_key_revive() call clears + * `quiescent' marker. + * + * lu_context code doesn't provide any internal synchronization for these + * activities---it's assumed that startup (including threads start-up) and + * shutdown are serialized by some external means. + * + * \see lu_context + */ +struct lu_context_key { + /** + * Set of tags for which values of this key are to be instantiated. + */ + __u32 lct_tags; + /** + * Value constructor. This is called when new value is created for a + * context. Returns pointer to new value of error pointer. + */ + void *(*lct_init)(const struct lu_context *ctx, + struct lu_context_key *key); + /** + * Value destructor. Called when context with previously allocated + * value of this slot is destroyed. \a data is a value that was returned + * by a matching call to lu_context_key::lct_init(). + */ + void (*lct_fini)(const struct lu_context *ctx, + struct lu_context_key *key, void *data); + /** + * Optional method called on lu_context_exit() for all allocated + * keys. Can be used by debugging code checking that locks are + * released, etc. + */ + void (*lct_exit)(const struct lu_context *ctx, + struct lu_context_key *key, void *data); + /** + * Internal implementation detail: index within lu_context::lc_value[] + * reserved for this key. + */ + int lct_index; + /** + * Internal implementation detail: number of values created for this + * key. + */ + atomic_t lct_used; + /** + * Internal implementation detail: module for this key. + */ + struct module *lct_owner; + /** + * References to this key. For debugging. 
+ */ + struct lu_ref lct_reference; +}; + +#define LU_KEY_INIT(mod, type) \ + static void *mod##_key_init(const struct lu_context *ctx, \ + struct lu_context_key *key) \ + { \ + type *value; \ + \ + CLASSERT(PAGE_CACHE_SIZE >= sizeof (*value)); \ + \ + OBD_ALLOC_PTR(value); \ + if (value == NULL) \ + value = ERR_PTR(-ENOMEM); \ + \ + return value; \ + } \ + struct __##mod##__dummy_init {;} /* semicolon catcher */ + +#define LU_KEY_FINI(mod, type) \ + static void mod##_key_fini(const struct lu_context *ctx, \ + struct lu_context_key *key, void *data) \ + { \ + type *info = data; \ + \ + OBD_FREE_PTR(info); \ + } \ + struct __##mod##__dummy_fini {;} /* semicolon catcher */ + +#define LU_KEY_INIT_FINI(mod, type) \ + LU_KEY_INIT(mod, type); \ + LU_KEY_FINI(mod, type) + +#define LU_CONTEXT_KEY_DEFINE(mod, tags) \ + struct lu_context_key mod##_thread_key = { \ + .lct_tags = tags, \ + .lct_init = mod##_key_init, \ + .lct_fini = mod##_key_fini \ + } + +#define LU_CONTEXT_KEY_INIT(key) \ +do { \ + (key)->lct_owner = THIS_MODULE; \ +} while (0) + +int lu_context_key_register(struct lu_context_key *key); +void lu_context_key_degister(struct lu_context_key *key); +void *lu_context_key_get (const struct lu_context *ctx, + const struct lu_context_key *key); +void lu_context_key_quiesce (struct lu_context_key *key); +void lu_context_key_revive (struct lu_context_key *key); + + +/* + * LU_KEY_INIT_GENERIC() has to be a macro to correctly determine an + * owning module. + */ + +#define LU_KEY_INIT_GENERIC(mod) \ + static void mod##_key_init_generic(struct lu_context_key *k, ...) \ + { \ + struct lu_context_key *key = k; \ + va_list args; \ + \ + va_start(args, k); \ + do { \ + LU_CONTEXT_KEY_INIT(key); \ + key = va_arg(args, struct lu_context_key *); \ + } while (key != NULL); \ + va_end(args); \ + } + +#define LU_TYPE_INIT(mod, ...) \ + LU_KEY_INIT_GENERIC(mod) \ + static int mod##_type_init(struct lu_device_type *t) \ + { \ + mod##_key_init_generic(__VA_ARGS__, NULL); \ + return lu_context_key_register_many(__VA_ARGS__, NULL); \ + } \ + struct __##mod##_dummy_type_init {;} + +#define LU_TYPE_FINI(mod, ...) \ + static void mod##_type_fini(struct lu_device_type *t) \ + { \ + lu_context_key_degister_many(__VA_ARGS__, NULL); \ + } \ + struct __##mod##_dummy_type_fini {;} + +#define LU_TYPE_START(mod, ...) \ + static void mod##_type_start(struct lu_device_type *t) \ + { \ + lu_context_key_revive_many(__VA_ARGS__, NULL); \ + } \ + struct __##mod##_dummy_type_start {;} + +#define LU_TYPE_STOP(mod, ...) \ + static void mod##_type_stop(struct lu_device_type *t) \ + { \ + lu_context_key_quiesce_many(__VA_ARGS__, NULL); \ + } \ + struct __##mod##_dummy_type_stop {;} + + + +#define LU_TYPE_INIT_FINI(mod, ...) \ + LU_TYPE_INIT(mod, __VA_ARGS__); \ + LU_TYPE_FINI(mod, __VA_ARGS__); \ + LU_TYPE_START(mod, __VA_ARGS__); \ + LU_TYPE_STOP(mod, __VA_ARGS__) + +int lu_context_init (struct lu_context *ctx, __u32 tags); +void lu_context_fini (struct lu_context *ctx); +void lu_context_enter (struct lu_context *ctx); +void lu_context_exit (struct lu_context *ctx); +int lu_context_refill(struct lu_context *ctx); + +/* + * Helper functions to operate on multiple keys. These are used by the default + * device type operations, defined by LU_TYPE_INIT_FINI(). 
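+ *
+ * For example (illustrative only; "foo" and struct foo_thread_info are
+ * placeholders), a layer normally combines the macros defined above as:
+ *
+ *	LU_KEY_INIT_FINI(foo, struct foo_thread_info);
+ *	LU_CONTEXT_KEY_DEFINE(foo, LCT_MD_THREAD);
+ *	LU_TYPE_INIT_FINI(foo, &foo_thread_key);
+ *
+ * which generates foo_key_init()/foo_key_fini(), the foo_thread_key
+ * context key, and foo_type_init()/foo_type_fini()/foo_type_start()/
+ * foo_type_stop() suitable for the lu_device_type_operations hooks.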
+ */ + +int lu_context_key_register_many(struct lu_context_key *k, ...); +void lu_context_key_degister_many(struct lu_context_key *k, ...); +void lu_context_key_revive_many (struct lu_context_key *k, ...); +void lu_context_key_quiesce_many (struct lu_context_key *k, ...); + +/* + * update/clear ctx/ses tags. + */ +void lu_context_tags_update(__u32 tags); +void lu_context_tags_clear(__u32 tags); +void lu_session_tags_update(__u32 tags); +void lu_session_tags_clear(__u32 tags); + +/** + * Environment. + */ +struct lu_env { + /** + * "Local" context, used to store data instead of stack. + */ + struct lu_context le_ctx; + /** + * "Session" context for per-request data. + */ + struct lu_context *le_ses; +}; + +int lu_env_init (struct lu_env *env, __u32 tags); +void lu_env_fini (struct lu_env *env); +int lu_env_refill(struct lu_env *env); +int lu_env_refill_by_tags(struct lu_env *env, __u32 ctags, __u32 stags); + +/** @} lu_context */ + +/** + * Output site statistical counters into a buffer. Suitable for + * ll_rd_*()-style functions. + */ +int lu_site_stats_print(const struct lu_site *s, struct seq_file *m); + +/** + * Common name structure to be passed around for various name related methods. + */ +struct lu_name { + const char *ln_name; + int ln_namelen; +}; + +/** + * Common buffer structure to be passed around for various xattr_{s,g}et() + * methods. + */ +struct lu_buf { + void *lb_buf; + ssize_t lb_len; +}; + +#define DLUBUF "(%p %zu)" +#define PLUBUF(buf) (buf)->lb_buf, (buf)->lb_len +/** + * One-time initializers, called at obdclass module initialization, not + * exported. + */ + +/** + * Initialization of global lu_* data. + */ +int lu_global_init(void); + +/** + * Dual to lu_global_init(). + */ +void lu_global_fini(void); + +struct lu_kmem_descr { + struct kmem_cache **ckd_cache; + const char *ckd_name; + const size_t ckd_size; +}; + +int lu_kmem_init(struct lu_kmem_descr *caches); +void lu_kmem_fini(struct lu_kmem_descr *caches); + +void lu_object_assign_fid(const struct lu_env *env, struct lu_object *o, + const struct lu_fid *fid); +struct lu_object *lu_object_anon(const struct lu_env *env, + struct lu_device *dev, + const struct lu_object_conf *conf); + +/** null buffer */ +extern struct lu_buf LU_BUF_NULL; + +void lu_buf_free(struct lu_buf *buf); +void lu_buf_alloc(struct lu_buf *buf, int size); +void lu_buf_realloc(struct lu_buf *buf, int size); + +int lu_buf_check_and_grow(struct lu_buf *buf, int len); +struct lu_buf *lu_buf_check_and_alloc(struct lu_buf *buf, int len); + +/** @} lu */ +#endif /* __LUSTRE_LU_OBJECT_H */ diff --git a/kernel/drivers/staging/lustre/lustre/include/lu_ref.h b/kernel/drivers/staging/lustre/lustre/include/lu_ref.h new file mode 100644 index 000000000..b451a888c --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lu_ref.h @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + * + * Author: Nikita Danilov + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#ifndef __LUSTRE_LU_REF_H +#define __LUSTRE_LU_REF_H + +#include + +/** \defgroup lu_ref lu_ref + * + * An interface to track references between objects. Mostly for debugging. + * + * Suppose there is a reference counted data-structure struct foo. To track + * who acquired references to instance of struct foo, add lu_ref field to it: + * + * \code + * struct foo { + * atomic_t foo_refcount; + * struct lu_ref foo_reference; + * ... + * }; + * \endcode + * + * foo::foo_reference has to be initialized by calling + * lu_ref_init(). Typically there will be functions or macros to increment and + * decrement foo::foo_refcount, let's say they are foo_get(struct foo *foo) + * and foo_put(struct foo *foo), respectively. + * + * Whenever foo_get() is called to acquire a reference on a foo, lu_ref_add() + * has to be called to insert into foo::foo_reference a record, describing + * acquired reference. Dually, lu_ref_del() removes matching record. Typical + * usages are: + * + * \code + * struct bar *bar; + * + * // bar owns a reference to foo. + * bar->bar_foo = foo_get(foo); + * lu_ref_add(&foo->foo_reference, "bar", bar); + * + * ... + * + * // reference from bar to foo is released. + * lu_ref_del(&foo->foo_reference, "bar", bar); + * foo_put(bar->bar_foo); + * + * + * // current thread acquired a temporary reference to foo. + * foo_get(foo); + * lu_ref_add(&foo->reference, __func__, current); + * + * ... + * + * // temporary reference is released. + * lu_ref_del(&foo->reference, __func__, current); + * foo_put(foo); + * \endcode + * + * \e Et \e cetera. Often it makes sense to include lu_ref_add() and + * lu_ref_del() calls into foo_get() and foo_put(). When an instance of struct + * foo is destroyed, lu_ref_fini() has to be called that checks that no + * pending references remain. lu_ref_print() can be used to dump a list of + * pending references, while hunting down a leak. + * + * For objects to which a large number of references can be acquired, + * lu_ref_del() can become cpu consuming, as it has to scan the list of + * references. To work around this, remember result of lu_ref_add() (usually + * in the same place where pointer to struct foo is stored), and use + * lu_ref_del_at(): + * + * \code + * // There is a large number of bar's for a single foo. + * bar->bar_foo = foo_get(foo); + * bar->bar_foo_ref = lu_ref_add(&foo->foo_reference, "bar", bar); + * + * ... + * + * // reference from bar to foo is released. + * lu_ref_del_at(&foo->foo_reference, bar->bar_foo_ref, "bar", bar); + * foo_put(bar->bar_foo); + * \endcode + * + * lu_ref interface degrades gracefully in case of memory shortages. + * + * @{ + */ + + +/* + * dummy data structures/functions to pass compile for now. + * We need to reimplement them with kref. 
+ */ +struct lu_ref {}; +struct lu_ref_link {}; + +static inline void lu_ref_init(struct lu_ref *ref) +{ +} + +static inline void lu_ref_fini(struct lu_ref *ref) +{ +} + +static inline struct lu_ref_link *lu_ref_add(struct lu_ref *ref, + const char *scope, + const void *source) +{ + return NULL; +} + +static inline struct lu_ref_link *lu_ref_add_atomic(struct lu_ref *ref, + const char *scope, + const void *source) +{ + return NULL; +} + +static inline void lu_ref_add_at(struct lu_ref *ref, + struct lu_ref_link *link, + const char *scope, + const void *source) +{ +} + +static inline void lu_ref_del(struct lu_ref *ref, const char *scope, + const void *source) +{ +} + +static inline void lu_ref_set_at(struct lu_ref *ref, struct lu_ref_link *link, + const char *scope, const void *source0, + const void *source1) +{ +} + +static inline void lu_ref_del_at(struct lu_ref *ref, struct lu_ref_link *link, + const char *scope, const void *source) +{ +} + +static inline int lu_ref_global_init(void) +{ + return 0; +} + +static inline void lu_ref_global_fini(void) +{ +} + +static inline void lu_ref_print(const struct lu_ref *ref) +{ +} + +static inline void lu_ref_print_all(void) +{ +} + +/** @} lu */ + +#endif /* __LUSTRE_LU_REF_H */ diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre/libiam.h b/kernel/drivers/staging/lustre/lustre/include/lustre/libiam.h new file mode 100644 index 000000000..e8e0b084a --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre/libiam.h @@ -0,0 +1,145 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre/libiam.h + * + * iam user level library + * + * Author: Wang Di + * Author: Nikita Danilov + * Author: Fan Yong + */ + +/* + * lustre/libiam.h + */ + +#ifndef __IAM_ULIB_H__ +#define __IAM_ULIB_H__ + +/** \defgroup libiam libiam + * + * @{ + */ + + +#define DX_FMT_NAME_LEN 16 + +enum iam_fmt_t { + FMT_LFIX, + FMT_LVAR +}; + +struct iam_uapi_info { + __u16 iui_keysize; + __u16 iui_recsize; + __u16 iui_ptrsize; + __u16 iui_height; + char iui_fmt_name[DX_FMT_NAME_LEN]; +}; + +/* + * Creat an iam file, but do NOT open it. + * Return 0 if success, else -1. 
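+ *
+ * A typical calling sequence for this library looks roughly as follows
+ * (illustrative only: the path, geometry values and the keybuf/recbuf
+ * buffers are made up, and error handling is elided; every function used
+ * here is declared below in this header):
+ *
+ *	struct iam_uapi_info ua;
+ *	int fd;
+ *
+ *	iam_creat("/mnt/lustre/some_index", FMT_LFIX, 4096, 8, 16, 4);
+ *	fd = iam_open("/mnt/lustre/some_index", &ua);
+ *	iam_insert(fd, &ua, 0, keybuf, 0, recbuf);
+ *	iam_close(fd);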
+ */ +int iam_creat(char *filename, enum iam_fmt_t fmt, + int blocksize, int keysize, int recsize, int ptrsize); + +/* + * Open an iam file, but do NOT creat it if the file doesn't exist. + * Please use iam_creat for creating the file before use iam_open. + * Return file id (fd) if success, else -1. + */ +int iam_open(char *filename, struct iam_uapi_info *ua); + +/* + * Close file opened by iam_open. + */ +int iam_close(int fd); + +/* + * Please use iam_open before use this function. + */ +int iam_insert(int fd, struct iam_uapi_info *ua, + int key_need_convert, char *keybuf, + int rec_need_convert, char *recbuf); + +/* + * Please use iam_open before use this function. + */ +int iam_lookup(int fd, struct iam_uapi_info *ua, + int key_need_convert, char *key_buf, + int *keysize, char *save_key, + int rec_need_convert, char *rec_buf, + int *recsize, char *save_rec); + +/* + * Please use iam_open before use this function. + */ +int iam_delete(int fd, struct iam_uapi_info *ua, + int key_need_convert, char *keybuf, + int rec_need_convert, char *recbuf); + +/* + * Please use iam_open before use this function. + */ +int iam_it_start(int fd, struct iam_uapi_info *ua, + int key_need_convert, char *key_buf, + int *keysize, char *save_key, + int rec_need_convert, char *rec_buf, + int *recsize, char *save_rec); + +/* + * Please use iam_open before use this function. + */ +int iam_it_next(int fd, struct iam_uapi_info *ua, + int key_need_convert, char *key_buf, + int *keysize, char *save_key, + int rec_need_convert, char *rec_buf, + int *recsize, char *save_rec); + +/* + * Please use iam_open before use this function. + */ +int iam_it_stop(int fd, struct iam_uapi_info *ua, + int key_need_convert, char *keybuf, + int rec_need_convert, char *recbuf); + +/* + * Change iam file mode. + */ +int iam_polymorph(char *filename, unsigned long mode); + +/** @} libiam */ + +#endif diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre/ll_fiemap.h b/kernel/drivers/staging/lustre/lustre/include/lustre/ll_fiemap.h new file mode 100644 index 000000000..ad253c6de --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre/ll_fiemap.h @@ -0,0 +1,121 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre/ll_fiemap.h + * + * FIEMAP data structures and flags. 
This header file will be used until + * fiemap.h is available in the upstream kernel. + * + * Author: Kalpak Shah + * Author: Andreas Dilger + */ + +#ifndef _LUSTRE_FIEMAP_H +#define _LUSTRE_FIEMAP_H + + + +struct ll_fiemap_extent { + __u64 fe_logical; /* logical offset in bytes for the start of + * the extent from the beginning of the file */ + __u64 fe_physical; /* physical offset in bytes for the start + * of the extent from the beginning of the disk */ + __u64 fe_length; /* length in bytes for this extent */ + __u64 fe_reserved64[2]; + __u32 fe_flags; /* FIEMAP_EXTENT_* flags for this extent */ + __u32 fe_device; /* device number for this extent */ + __u32 fe_reserved[2]; +}; + +struct ll_user_fiemap { + __u64 fm_start; /* logical offset (inclusive) at + * which to start mapping (in) */ + __u64 fm_length; /* logical length of mapping which + * userspace wants (in) */ + __u32 fm_flags; /* FIEMAP_FLAG_* flags for request (in/out) */ + __u32 fm_mapped_extents;/* number of extents that were mapped (out) */ + __u32 fm_extent_count; /* size of fm_extents array (in) */ + __u32 fm_reserved; + struct ll_fiemap_extent fm_extents[0]; /* array of mapped extents (out) */ +}; + +#define FIEMAP_MAX_OFFSET (~0ULL) + +#define FIEMAP_FLAG_SYNC 0x00000001 /* sync file data before map */ +#define FIEMAP_FLAG_XATTR 0x00000002 /* map extended attribute tree */ + +#define FIEMAP_EXTENT_LAST 0x00000001 /* Last extent in file. */ +#define FIEMAP_EXTENT_UNKNOWN 0x00000002 /* Data location unknown. */ +#define FIEMAP_EXTENT_DELALLOC 0x00000004 /* Location still pending. + * Sets EXTENT_UNKNOWN. */ +#define FIEMAP_EXTENT_ENCODED 0x00000008 /* Data can not be read + * while fs is unmounted */ +#define FIEMAP_EXTENT_DATA_ENCRYPTED 0x00000080 /* Data is encrypted by fs. + * Sets EXTENT_NO_DIRECT. */ +#define FIEMAP_EXTENT_NOT_ALIGNED 0x00000100 /* Extent offsets may not be + * block aligned. */ +#define FIEMAP_EXTENT_DATA_INLINE 0x00000200 /* Data mixed with metadata. + * Sets EXTENT_NOT_ALIGNED.*/ +#define FIEMAP_EXTENT_DATA_TAIL 0x00000400 /* Multiple files in block. + * Sets EXTENT_NOT_ALIGNED.*/ +#define FIEMAP_EXTENT_UNWRITTEN 0x00000800 /* Space allocated, but + * no data (i.e. zero). */ +#define FIEMAP_EXTENT_MERGED 0x00001000 /* File does not natively + * support extents. Result + * merged for efficiency. */ + + +static inline size_t fiemap_count_to_size(size_t extent_count) +{ + return (sizeof(struct ll_user_fiemap) + extent_count * + sizeof(struct ll_fiemap_extent)); +} + +static inline unsigned fiemap_size_to_count(size_t array_size) +{ + return ((array_size - sizeof(struct ll_user_fiemap)) / + sizeof(struct ll_fiemap_extent)); +} + +#define FIEMAP_FLAG_DEVICE_ORDER 0x40000000 /* return device ordered mapping */ + +#ifdef FIEMAP_FLAGS_COMPAT +#undef FIEMAP_FLAGS_COMPAT +#endif + +/* Lustre specific flags - use a high bit, don't conflict with upstream flag */ +#define FIEMAP_EXTENT_NO_DIRECT 0x40000000 /* Data mapping undefined */ +#define FIEMAP_EXTENT_NET 0x80000000 /* Data stored remotely. 
+ * Sets NO_DIRECT flag */ + +#endif /* _LUSTRE_FIEMAP_H */ diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre/lustre_build_version.h b/kernel/drivers/staging/lustre/lustre/include/lustre/lustre_build_version.h new file mode 100644 index 000000000..93a3d7db3 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre/lustre_build_version.h @@ -0,0 +1,2 @@ +#define BUILD_VERSION "v2_3_64_0-g6e62c21-CHANGED-3.9.0" +#define LUSTRE_RELEASE 3.9.0_g6e62c21 diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre/lustre_errno.h b/kernel/drivers/staging/lustre/lustre/include/lustre/lustre_errno.h new file mode 100644 index 000000000..35aefa2cd --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre/lustre_errno.h @@ -0,0 +1,215 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.txt + * + * GPL HEADER END + */ +/* + * Copyright (C) 2011 FUJITSU LIMITED. All rights reserved. + * + * Copyright (c) 2013, Intel Corporation. + */ + +#ifndef LUSTRE_ERRNO_H +#define LUSTRE_ERRNO_H + +/* + * Only "network" errnos, which are defined below, are allowed on wire (or on + * disk). Generic routines exist to help translate between these and a subset + * of the "host" errnos. Some host errnos (e.g., EDEADLOCK) are intentionally + * left out. See also the comment on lustre_errno_hton_mapping[]. + * + * To maintain compatibility with existing x86 clients and servers, each of + * these network errnos has the same numerical value as its corresponding host + * errno on x86. 
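+ *
+ * Illustrative use of the translation helpers declared at the end of this
+ * header (sketch only; "rc" stands for a positive host errno value being
+ * packed into a wire message):
+ *
+ *	unsigned int wire_rc = lustre_errno_hton(rc);	    (host -> wire)
+ *	unsigned int host_rc = lustre_errno_ntoh(wire_rc); (wire -> host)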
+ */ +#define LUSTRE_EPERM 1 /* Operation not permitted */ +#define LUSTRE_ENOENT 2 /* No such file or directory */ +#define LUSTRE_ESRCH 3 /* No such process */ +#define LUSTRE_EINTR 4 /* Interrupted system call */ +#define LUSTRE_EIO 5 /* I/O error */ +#define LUSTRE_ENXIO 6 /* No such device or address */ +#define LUSTRE_E2BIG 7 /* Argument list too long */ +#define LUSTRE_ENOEXEC 8 /* Exec format error */ +#define LUSTRE_EBADF 9 /* Bad file number */ +#define LUSTRE_ECHILD 10 /* No child processes */ +#define LUSTRE_EAGAIN 11 /* Try again */ +#define LUSTRE_ENOMEM 12 /* Out of memory */ +#define LUSTRE_EACCES 13 /* Permission denied */ +#define LUSTRE_EFAULT 14 /* Bad address */ +#define LUSTRE_ENOTBLK 15 /* Block device required */ +#define LUSTRE_EBUSY 16 /* Device or resource busy */ +#define LUSTRE_EEXIST 17 /* File exists */ +#define LUSTRE_EXDEV 18 /* Cross-device link */ +#define LUSTRE_ENODEV 19 /* No such device */ +#define LUSTRE_ENOTDIR 20 /* Not a directory */ +#define LUSTRE_EISDIR 21 /* Is a directory */ +#define LUSTRE_EINVAL 22 /* Invalid argument */ +#define LUSTRE_ENFILE 23 /* File table overflow */ +#define LUSTRE_EMFILE 24 /* Too many open files */ +#define LUSTRE_ENOTTY 25 /* Not a typewriter */ +#define LUSTRE_ETXTBSY 26 /* Text file busy */ +#define LUSTRE_EFBIG 27 /* File too large */ +#define LUSTRE_ENOSPC 28 /* No space left on device */ +#define LUSTRE_ESPIPE 29 /* Illegal seek */ +#define LUSTRE_EROFS 30 /* Read-only file system */ +#define LUSTRE_EMLINK 31 /* Too many links */ +#define LUSTRE_EPIPE 32 /* Broken pipe */ +#define LUSTRE_EDOM 33 /* Math argument out of domain of + func */ +#define LUSTRE_ERANGE 34 /* Math result not representable */ +#define LUSTRE_EDEADLK 35 /* Resource deadlock would occur */ +#define LUSTRE_ENAMETOOLONG 36 /* File name too long */ +#define LUSTRE_ENOLCK 37 /* No record locks available */ +#define LUSTRE_ENOSYS 38 /* Function not implemented */ +#define LUSTRE_ENOTEMPTY 39 /* Directory not empty */ +#define LUSTRE_ELOOP 40 /* Too many symbolic links + encountered */ +#define LUSTRE_ENOMSG 42 /* No message of desired type */ +#define LUSTRE_EIDRM 43 /* Identifier removed */ +#define LUSTRE_ECHRNG 44 /* Channel number out of range */ +#define LUSTRE_EL2NSYNC 45 /* Level 2 not synchronized */ +#define LUSTRE_EL3HLT 46 /* Level 3 halted */ +#define LUSTRE_EL3RST 47 /* Level 3 reset */ +#define LUSTRE_ELNRNG 48 /* Link number out of range */ +#define LUSTRE_EUNATCH 49 /* Protocol driver not attached */ +#define LUSTRE_ENOCSI 50 /* No CSI structure available */ +#define LUSTRE_EL2HLT 51 /* Level 2 halted */ +#define LUSTRE_EBADE 52 /* Invalid exchange */ +#define LUSTRE_EBADR 53 /* Invalid request descriptor */ +#define LUSTRE_EXFULL 54 /* Exchange full */ +#define LUSTRE_ENOANO 55 /* No anode */ +#define LUSTRE_EBADRQC 56 /* Invalid request code */ +#define LUSTRE_EBADSLT 57 /* Invalid slot */ +#define LUSTRE_EBFONT 59 /* Bad font file format */ +#define LUSTRE_ENOSTR 60 /* Device not a stream */ +#define LUSTRE_ENODATA 61 /* No data available */ +#define LUSTRE_ETIME 62 /* Timer expired */ +#define LUSTRE_ENOSR 63 /* Out of streams resources */ +#define LUSTRE_ENONET 64 /* Machine is not on the network */ +#define LUSTRE_ENOPKG 65 /* Package not installed */ +#define LUSTRE_EREMOTE 66 /* Object is remote */ +#define LUSTRE_ENOLINK 67 /* Link has been severed */ +#define LUSTRE_EADV 68 /* Advertise error */ +#define LUSTRE_ESRMNT 69 /* Srmount error */ +#define LUSTRE_ECOMM 70 /* Communication error on send */ +#define 
LUSTRE_EPROTO 71 /* Protocol error */ +#define LUSTRE_EMULTIHOP 72 /* Multihop attempted */ +#define LUSTRE_EDOTDOT 73 /* RFS specific error */ +#define LUSTRE_EBADMSG 74 /* Not a data message */ +#define LUSTRE_EOVERFLOW 75 /* Value too large for defined data + type */ +#define LUSTRE_ENOTUNIQ 76 /* Name not unique on network */ +#define LUSTRE_EBADFD 77 /* File descriptor in bad state */ +#define LUSTRE_EREMCHG 78 /* Remote address changed */ +#define LUSTRE_ELIBACC 79 /* Can not access a needed shared + library */ +#define LUSTRE_ELIBBAD 80 /* Accessing a corrupted shared + library */ +#define LUSTRE_ELIBSCN 81 /* .lib section in a.out corrupted */ +#define LUSTRE_ELIBMAX 82 /* Attempting to link in too many shared + libraries */ +#define LUSTRE_ELIBEXEC 83 /* Cannot exec a shared library + directly */ +#define LUSTRE_EILSEQ 84 /* Illegal byte sequence */ +#define LUSTRE_ERESTART 85 /* Interrupted system call should be + restarted */ +#define LUSTRE_ESTRPIPE 86 /* Streams pipe error */ +#define LUSTRE_EUSERS 87 /* Too many users */ +#define LUSTRE_ENOTSOCK 88 /* Socket operation on non-socket */ +#define LUSTRE_EDESTADDRREQ 89 /* Destination address required */ +#define LUSTRE_EMSGSIZE 90 /* Message too long */ +#define LUSTRE_EPROTOTYPE 91 /* Protocol wrong type for socket */ +#define LUSTRE_ENOPROTOOPT 92 /* Protocol not available */ +#define LUSTRE_EPROTONOSUPPORT 93 /* Protocol not supported */ +#define LUSTRE_ESOCKTNOSUPPORT 94 /* Socket type not supported */ +#define LUSTRE_EOPNOTSUPP 95 /* Operation not supported on transport + endpoint */ +#define LUSTRE_EPFNOSUPPORT 96 /* Protocol family not supported */ +#define LUSTRE_EAFNOSUPPORT 97 /* Address family not supported by + protocol */ +#define LUSTRE_EADDRINUSE 98 /* Address already in use */ +#define LUSTRE_EADDRNOTAVAIL 99 /* Cannot assign requested address */ +#define LUSTRE_ENETDOWN 100 /* Network is down */ +#define LUSTRE_ENETUNREACH 101 /* Network is unreachable */ +#define LUSTRE_ENETRESET 102 /* Network dropped connection because of + reset */ +#define LUSTRE_ECONNABORTED 103 /* Software caused connection abort */ +#define LUSTRE_ECONNRESET 104 /* Connection reset by peer */ +#define LUSTRE_ENOBUFS 105 /* No buffer space available */ +#define LUSTRE_EISCONN 106 /* Transport endpoint is already + connected */ +#define LUSTRE_ENOTCONN 107 /* Transport endpoint is not + connected */ +#define LUSTRE_ESHUTDOWN 108 /* Cannot send after transport endpoint + shutdown */ +#define LUSTRE_ETOOMANYREFS 109 /* Too many references: cannot splice */ +#define LUSTRE_ETIMEDOUT 110 /* Connection timed out */ +#define LUSTRE_ECONNREFUSED 111 /* Connection refused */ +#define LUSTRE_EHOSTDOWN 112 /* Host is down */ +#define LUSTRE_EHOSTUNREACH 113 /* No route to host */ +#define LUSTRE_EALREADY 114 /* Operation already in progress */ +#define LUSTRE_EINPROGRESS 115 /* Operation now in progress */ +#define LUSTRE_ESTALE 116 /* Stale file handle */ +#define LUSTRE_EUCLEAN 117 /* Structure needs cleaning */ +#define LUSTRE_ENOTNAM 118 /* Not a XENIX named type file */ +#define LUSTRE_ENAVAIL 119 /* No XENIX semaphores available */ +#define LUSTRE_EISNAM 120 /* Is a named type file */ +#define LUSTRE_EREMOTEIO 121 /* Remote I/O error */ +#define LUSTRE_EDQUOT 122 /* Quota exceeded */ +#define LUSTRE_ENOMEDIUM 123 /* No medium found */ +#define LUSTRE_EMEDIUMTYPE 124 /* Wrong medium type */ +#define LUSTRE_ECANCELED 125 /* Operation Canceled */ +#define LUSTRE_ENOKEY 126 /* Required key not available */ +#define LUSTRE_EKEYEXPIRED 127 /* Key 
has expired */ +#define LUSTRE_EKEYREVOKED 128 /* Key has been revoked */ +#define LUSTRE_EKEYREJECTED 129 /* Key was rejected by service */ +#define LUSTRE_EOWNERDEAD 130 /* Owner died */ +#define LUSTRE_ENOTRECOVERABLE 131 /* State not recoverable */ +#define LUSTRE_ERESTARTSYS 512 +#define LUSTRE_ERESTARTNOINTR 513 +#define LUSTRE_ERESTARTNOHAND 514 /* restart if no handler.. */ +#define LUSTRE_ENOIOCTLCMD 515 /* No ioctl command */ +#define LUSTRE_ERESTART_RESTARTBLOCK 516 /* restart by calling + sys_restart_syscall */ +#define LUSTRE_EBADHANDLE 521 /* Illegal NFS file handle */ +#define LUSTRE_ENOTSYNC 522 /* Update synchronization mismatch */ +#define LUSTRE_EBADCOOKIE 523 /* Cookie is stale */ +#define LUSTRE_ENOTSUPP 524 /* Operation is not supported */ +#define LUSTRE_ETOOSMALL 525 /* Buffer or request is too small */ +#define LUSTRE_ESERVERFAULT 526 /* An untranslatable error occurred */ +#define LUSTRE_EBADTYPE 527 /* Type not supported by server */ +#define LUSTRE_EJUKEBOX 528 /* Request initiated, but will not + complete before timeout */ +#define LUSTRE_EIOCBQUEUED 529 /* iocb queued, will get completion + event */ +#define LUSTRE_EIOCBRETRY 530 /* iocb queued, will trigger a retry */ + +/* + * Translations are optimized away on x86. Host errnos that shouldn't be put + * on wire could leak through as a result. Do not count on this side effect. + */ +#ifdef CONFIG_LUSTRE_TRANSLATE_ERRNOS +unsigned int lustre_errno_hton(unsigned int h); +unsigned int lustre_errno_ntoh(unsigned int n); +#else +#define lustre_errno_hton(h) (h) +#define lustre_errno_ntoh(n) (n) +#endif + +#endif /* LUSTRE_ERRNO_H */ diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre/lustre_idl.h b/kernel/drivers/staging/lustre/lustre/include/lustre/lustre_idl.h new file mode 100644 index 000000000..305ecbee9 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre/lustre_idl.h @@ -0,0 +1,3734 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre/lustre_idl.h + * + * Lustre wire protocol definitions. + */ + +/** \defgroup lustreidl lustreidl + * + * Lustre wire protocol definitions. + * + * ALL structs passing over the wire should be declared here. 
Structs + * that are used in interfaces with userspace should go in lustre_user.h. + * + * All structs being declared here should be built from simple fixed-size + * types (__u8, __u16, __u32, __u64) or be built from other types or + * structs also declared in this file. Similarly, all flags and magic + * values in those structs should also be declared here. This ensures + * that the Lustre wire protocol is not influenced by external dependencies. + * + * The only other acceptable items in this file are VERY SIMPLE accessor + * functions to avoid callers grubbing inside the structures, and the + * prototypes of the swabber functions for each struct. Nothing that + * depends on external functions or definitions should be in here. + * + * Structs must be properly aligned to put 64-bit values on an 8-byte + * boundary. Any structs being added here must also be added to + * utils/wirecheck.c and "make newwiretest" run to regenerate the + * utils/wiretest.c sources. This allows us to verify that wire structs + * have the proper alignment/size on all architectures. + * + * DO NOT CHANGE any of the structs, flags, values declared here and used + * in released Lustre versions. Some structs may have padding fields that + * can be used. Some structs might allow addition at the end (verify this + * in the code to ensure that new/old clients that see this larger struct + * do not fail, otherwise you need to implement protocol compatibility). + * + * We assume all nodes are either little-endian or big-endian, and we + * always send messages in the sender's native format. The receiver + * detects the message format by checking the 'magic' field of the message + * (see lustre_msg_swabbed() below). + * + * Each wire type has corresponding 'lustre_swab_xxxtypexxx()' routines, + * implemented either here, inline (trivial implementations) or in + * ptlrpc/pack_generic.c. These 'swabbers' convert the type from "other" + * endian, in-place in the message buffer. + * + * A swabber takes a single pointer argument. The caller must already have + * verified that the length of the message buffer >= sizeof (type). + * + * For variable length types, a second 'lustre_swab_v_xxxtypexxx()' routine + * may be defined that swabs just the variable part, after the caller has + * verified that the message buffer is large enough. + * + * @{ + */ + +#ifndef _LUSTRE_IDL_H_ +#define _LUSTRE_IDL_H_ + +#include "../../../include/linux/libcfs/libcfs.h" + +/* Defn's shared with user-space. 
*/ +#include "lustre_user.h" +#include "lustre_errno.h" + +/* + * GENERAL STUFF + */ +/* FOO_REQUEST_PORTAL is for incoming requests on the FOO + * FOO_REPLY_PORTAL is for incoming replies on the FOO + * FOO_BULK_PORTAL is for incoming bulk on the FOO + */ + +/* Lustre service names are following the format + * service name + MDT + seq name + */ +#define LUSTRE_MDT_MAXNAMELEN 80 + +#define CONNMGR_REQUEST_PORTAL 1 +#define CONNMGR_REPLY_PORTAL 2 +//#define OSC_REQUEST_PORTAL 3 +#define OSC_REPLY_PORTAL 4 +//#define OSC_BULK_PORTAL 5 +#define OST_IO_PORTAL 6 +#define OST_CREATE_PORTAL 7 +#define OST_BULK_PORTAL 8 +//#define MDC_REQUEST_PORTAL 9 +#define MDC_REPLY_PORTAL 10 +//#define MDC_BULK_PORTAL 11 +#define MDS_REQUEST_PORTAL 12 +//#define MDS_REPLY_PORTAL 13 +#define MDS_BULK_PORTAL 14 +#define LDLM_CB_REQUEST_PORTAL 15 +#define LDLM_CB_REPLY_PORTAL 16 +#define LDLM_CANCEL_REQUEST_PORTAL 17 +#define LDLM_CANCEL_REPLY_PORTAL 18 +//#define PTLBD_REQUEST_PORTAL 19 +//#define PTLBD_REPLY_PORTAL 20 +//#define PTLBD_BULK_PORTAL 21 +#define MDS_SETATTR_PORTAL 22 +#define MDS_READPAGE_PORTAL 23 +#define OUT_PORTAL 24 + +#define MGC_REPLY_PORTAL 25 +#define MGS_REQUEST_PORTAL 26 +#define MGS_REPLY_PORTAL 27 +#define OST_REQUEST_PORTAL 28 +#define FLD_REQUEST_PORTAL 29 +#define SEQ_METADATA_PORTAL 30 +#define SEQ_DATA_PORTAL 31 +#define SEQ_CONTROLLER_PORTAL 32 +#define MGS_BULK_PORTAL 33 + +/* Portal 63 is reserved for the Cray Inc DVS - nic@cray.com, roe@cray.com, n8851@cray.com */ + +/* packet types */ +#define PTL_RPC_MSG_REQUEST 4711 +#define PTL_RPC_MSG_ERR 4712 +#define PTL_RPC_MSG_REPLY 4713 + +/* DON'T use swabbed values of MAGIC as magic! */ +#define LUSTRE_MSG_MAGIC_V1 0x0BD00BD0 +#define LUSTRE_MSG_MAGIC_V2 0x0BD00BD3 + +#define LUSTRE_MSG_MAGIC_V1_SWABBED 0xD00BD00B +#define LUSTRE_MSG_MAGIC_V2_SWABBED 0xD30BD00B + +#define LUSTRE_MSG_MAGIC LUSTRE_MSG_MAGIC_V2 + +#define PTLRPC_MSG_VERSION 0x00000003 +#define LUSTRE_VERSION_MASK 0xffff0000 +#define LUSTRE_OBD_VERSION 0x00010000 +#define LUSTRE_MDS_VERSION 0x00020000 +#define LUSTRE_OST_VERSION 0x00030000 +#define LUSTRE_DLM_VERSION 0x00040000 +#define LUSTRE_LOG_VERSION 0x00050000 +#define LUSTRE_MGS_VERSION 0x00060000 + +/** + * Describes a range of sequence, lsr_start is included but lsr_end is + * not in the range. + * Same structure is used in fld module where lsr_index field holds mdt id + * of the home mdt. + */ +struct lu_seq_range { + __u64 lsr_start; + __u64 lsr_end; + __u32 lsr_index; + __u32 lsr_flags; +}; + +#define LU_SEQ_RANGE_MDT 0x0 +#define LU_SEQ_RANGE_OST 0x1 +#define LU_SEQ_RANGE_ANY 0x3 + +#define LU_SEQ_RANGE_MASK 0x3 + +static inline unsigned fld_range_type(const struct lu_seq_range *range) +{ + return range->lsr_flags & LU_SEQ_RANGE_MASK; +} + +static inline int fld_range_is_ost(const struct lu_seq_range *range) +{ + return fld_range_type(range) == LU_SEQ_RANGE_OST; +} + +static inline int fld_range_is_mdt(const struct lu_seq_range *range) +{ + return fld_range_type(range) == LU_SEQ_RANGE_MDT; +} + +/** + * This all range is only being used when fld client sends fld query request, + * but it does not know whether the seq is MDT or OST, so it will send req + * with ALL type, which means either seq type gotten from lookup can be + * expected. 
+ */ +static inline unsigned fld_range_is_any(const struct lu_seq_range *range) +{ + return fld_range_type(range) == LU_SEQ_RANGE_ANY; +} + +static inline void fld_range_set_type(struct lu_seq_range *range, + unsigned flags) +{ + range->lsr_flags |= flags; +} + +static inline void fld_range_set_mdt(struct lu_seq_range *range) +{ + fld_range_set_type(range, LU_SEQ_RANGE_MDT); +} + +static inline void fld_range_set_ost(struct lu_seq_range *range) +{ + fld_range_set_type(range, LU_SEQ_RANGE_OST); +} + +static inline void fld_range_set_any(struct lu_seq_range *range) +{ + fld_range_set_type(range, LU_SEQ_RANGE_ANY); +} + +/** + * returns width of given range \a r + */ + +static inline __u64 range_space(const struct lu_seq_range *range) +{ + return range->lsr_end - range->lsr_start; +} + +/** + * initialize range to zero + */ + +static inline void range_init(struct lu_seq_range *range) +{ + memset(range, 0, sizeof(*range)); +} + +/** + * check if given seq id \a s is within given range \a r + */ + +static inline int range_within(const struct lu_seq_range *range, + __u64 s) +{ + return s >= range->lsr_start && s < range->lsr_end; +} + +static inline int range_is_sane(const struct lu_seq_range *range) +{ + return (range->lsr_end >= range->lsr_start); +} + +static inline int range_is_zero(const struct lu_seq_range *range) +{ + return (range->lsr_start == 0 && range->lsr_end == 0); +} + +static inline int range_is_exhausted(const struct lu_seq_range *range) + +{ + return range_space(range) == 0; +} + +/* return 0 if two range have the same location */ +static inline int range_compare_loc(const struct lu_seq_range *r1, + const struct lu_seq_range *r2) +{ + return r1->lsr_index != r2->lsr_index || + r1->lsr_flags != r2->lsr_flags; +} + +#define DRANGE "[%#16.16Lx-%#16.16Lx):%x:%s" + +#define PRANGE(range) \ + (range)->lsr_start, \ + (range)->lsr_end, \ + (range)->lsr_index, \ + fld_range_is_mdt(range) ? "mdt" : "ost" + + +/** \defgroup lu_fid lu_fid + * @{ */ + +/** + * Flags for lustre_mdt_attrs::lma_compat and lustre_mdt_attrs::lma_incompat. + * Deprecated since HSM and SOM attributes are now stored in separate on-disk + * xattr. + */ +enum lma_compat { + LMAC_HSM = 0x00000001, + LMAC_SOM = 0x00000002, + LMAC_NOT_IN_OI = 0x00000004, /* the object does NOT need OI mapping */ + LMAC_FID_ON_OST = 0x00000008, /* For OST-object, its OI mapping is + * under /O//d. */ +}; + +/** + * Masks for all features that should be supported by a Lustre version to + * access a specific file. + * This information is stored in lustre_mdt_attrs::lma_incompat. 
+ */ +enum lma_incompat { + LMAI_RELEASED = 0x00000001, /* file is released */ + LMAI_AGENT = 0x00000002, /* agent inode */ + LMAI_REMOTE_PARENT = 0x00000004, /* the parent of the object + is on the remote MDT */ +}; +#define LMA_INCOMPAT_SUPP (LMAI_AGENT | LMAI_REMOTE_PARENT) + +/** + * fid constants + */ +enum { + /** LASTID file has zero OID */ + LUSTRE_FID_LASTID_OID = 0UL, + /** initial fid id value */ + LUSTRE_FID_INIT_OID = 1UL +}; + +/** returns fid object sequence */ +static inline __u64 fid_seq(const struct lu_fid *fid) +{ + return fid->f_seq; +} + +/** returns fid object id */ +static inline __u32 fid_oid(const struct lu_fid *fid) +{ + return fid->f_oid; +} + +/** returns fid object version */ +static inline __u32 fid_ver(const struct lu_fid *fid) +{ + return fid->f_ver; +} + +static inline void fid_zero(struct lu_fid *fid) +{ + memset(fid, 0, sizeof(*fid)); +} + +static inline __u64 fid_ver_oid(const struct lu_fid *fid) +{ + return ((__u64)fid_ver(fid) << 32 | fid_oid(fid)); +} + +/** + * Note that reserved SEQ numbers below 12 will conflict with ldiskfs + * inodes in the IGIF namespace, so these reserved SEQ numbers can be + * used for other purposes and not risk collisions with existing inodes. + * + * Different FID Format + * http://arch.lustre.org/index.php?title=Interoperability_fids_zfs#NEW.0 + */ +enum fid_seq { + FID_SEQ_OST_MDT0 = 0, + FID_SEQ_LLOG = 1, /* unnamed llogs */ + FID_SEQ_ECHO = 2, + FID_SEQ_OST_MDT1 = 3, + FID_SEQ_OST_MAX = 9, /* Max MDT count before OST_on_FID */ + FID_SEQ_LLOG_NAME = 10, /* named llogs */ + FID_SEQ_RSVD = 11, + FID_SEQ_IGIF = 12, + FID_SEQ_IGIF_MAX = 0x0ffffffffULL, + FID_SEQ_IDIF = 0x100000000ULL, + FID_SEQ_IDIF_MAX = 0x1ffffffffULL, + /* Normal FID sequence starts from this value, i.e. 1<<33 */ + FID_SEQ_START = 0x200000000ULL, + /* sequence for local pre-defined FIDs listed in local_oid */ + FID_SEQ_LOCAL_FILE = 0x200000001ULL, + FID_SEQ_DOT_LUSTRE = 0x200000002ULL, + /* sequence is used for local named objects FIDs generated + * by local_object_storage library */ + FID_SEQ_LOCAL_NAME = 0x200000003ULL, + /* Because current FLD will only cache the fid sequence, instead + * of oid on the client side, if the FID needs to be exposed to + * clients sides, it needs to make sure all of fids under one + * sequence will be located in one MDT. 
*/ + FID_SEQ_SPECIAL = 0x200000004ULL, + FID_SEQ_QUOTA = 0x200000005ULL, + FID_SEQ_QUOTA_GLB = 0x200000006ULL, + FID_SEQ_ROOT = 0x200000007ULL, /* Located on MDT0 */ + FID_SEQ_NORMAL = 0x200000400ULL, + FID_SEQ_LOV_DEFAULT = 0xffffffffffffffffULL +}; + +#define OBIF_OID_MAX_BITS 32 +#define OBIF_MAX_OID (1ULL << OBIF_OID_MAX_BITS) +#define OBIF_OID_MASK ((1ULL << OBIF_OID_MAX_BITS) - 1) +#define IDIF_OID_MAX_BITS 48 +#define IDIF_MAX_OID (1ULL << IDIF_OID_MAX_BITS) +#define IDIF_OID_MASK ((1ULL << IDIF_OID_MAX_BITS) - 1) + +/** OID for FID_SEQ_SPECIAL */ +enum special_oid { + /* Big Filesystem Lock to serialize rename operations */ + FID_OID_SPECIAL_BFL = 1UL, +}; + +/** OID for FID_SEQ_DOT_LUSTRE */ +enum dot_lustre_oid { + FID_OID_DOT_LUSTRE = 1UL, + FID_OID_DOT_LUSTRE_OBF = 2UL, +}; + +static inline int fid_seq_is_mdt0(__u64 seq) +{ + return (seq == FID_SEQ_OST_MDT0); +} + +static inline int fid_seq_is_mdt(const __u64 seq) +{ + return seq == FID_SEQ_OST_MDT0 || seq >= FID_SEQ_NORMAL; +}; + +static inline int fid_seq_is_echo(__u64 seq) +{ + return (seq == FID_SEQ_ECHO); +} + +static inline int fid_is_echo(const struct lu_fid *fid) +{ + return fid_seq_is_echo(fid_seq(fid)); +} + +static inline int fid_seq_is_llog(__u64 seq) +{ + return (seq == FID_SEQ_LLOG); +} + +static inline int fid_is_llog(const struct lu_fid *fid) +{ + /* file with OID == 0 is not llog but contains last oid */ + return fid_seq_is_llog(fid_seq(fid)) && fid_oid(fid) > 0; +} + +static inline int fid_seq_is_rsvd(const __u64 seq) +{ + return (seq > FID_SEQ_OST_MDT0 && seq <= FID_SEQ_RSVD); +}; + +static inline int fid_seq_is_special(const __u64 seq) +{ + return seq == FID_SEQ_SPECIAL; +}; + +static inline int fid_seq_is_local_file(const __u64 seq) +{ + return seq == FID_SEQ_LOCAL_FILE || + seq == FID_SEQ_LOCAL_NAME; +}; + +static inline int fid_seq_is_root(const __u64 seq) +{ + return seq == FID_SEQ_ROOT; +} + +static inline int fid_seq_is_dot(const __u64 seq) +{ + return seq == FID_SEQ_DOT_LUSTRE; +} + +static inline int fid_seq_is_default(const __u64 seq) +{ + return seq == FID_SEQ_LOV_DEFAULT; +} + +static inline int fid_is_mdt0(const struct lu_fid *fid) +{ + return fid_seq_is_mdt0(fid_seq(fid)); +} + +static inline void lu_root_fid(struct lu_fid *fid) +{ + fid->f_seq = FID_SEQ_ROOT; + fid->f_oid = 1; + fid->f_ver = 0; +} + +/** + * Check if a fid is igif or not. + * \param fid the fid to be tested. + * \return true if the fid is a igif; otherwise false. + */ +static inline int fid_seq_is_igif(const __u64 seq) +{ + return seq >= FID_SEQ_IGIF && seq <= FID_SEQ_IGIF_MAX; +} + +static inline int fid_is_igif(const struct lu_fid *fid) +{ + return fid_seq_is_igif(fid_seq(fid)); +} + +/** + * Check if a fid is idif or not. + * \param fid the fid to be tested. + * \return true if the fid is a idif; otherwise false. 
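/*
 * Illustrative sketch, not part of the imported header: a hypothetical helper
 * (fid_seq_name is invented here) mapping a few concrete sequence values onto
 * the namespaces enumerated above, e.g. seq 0 -> "mdt0", seq 5 -> "reserved",
 * seq 0x2000 -> "igif", seq 0x100050001 -> "idif", seq 0x200000400 -> "normal".
 */
static inline const char *fid_seq_name(__u64 seq)
{
        if (fid_seq_is_mdt0(seq))
                return "mdt0";
        if (fid_seq_is_rsvd(seq))
                return "reserved";
        if (fid_seq_is_igif(seq))
                return "igif";
        if (seq >= FID_SEQ_IDIF && seq <= FID_SEQ_IDIF_MAX)
                return "idif";
        return seq >= FID_SEQ_NORMAL ? "normal" : "special/local";
}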
+ */ +static inline int fid_seq_is_idif(const __u64 seq) +{ + return seq >= FID_SEQ_IDIF && seq <= FID_SEQ_IDIF_MAX; +} + +static inline int fid_is_idif(const struct lu_fid *fid) +{ + return fid_seq_is_idif(fid_seq(fid)); +} + +static inline int fid_is_local_file(const struct lu_fid *fid) +{ + return fid_seq_is_local_file(fid_seq(fid)); +} + +static inline int fid_seq_is_norm(const __u64 seq) +{ + return (seq >= FID_SEQ_NORMAL); +} + +static inline int fid_is_norm(const struct lu_fid *fid) +{ + return fid_seq_is_norm(fid_seq(fid)); +} + +/* convert an OST objid into an IDIF FID SEQ number */ +static inline __u64 fid_idif_seq(__u64 id, __u32 ost_idx) +{ + return FID_SEQ_IDIF | (ost_idx << 16) | ((id >> 32) & 0xffff); +} + +/* convert a packed IDIF FID into an OST objid */ +static inline __u64 fid_idif_id(__u64 seq, __u32 oid, __u32 ver) +{ + return ((__u64)ver << 48) | ((seq & 0xffff) << 32) | oid; +} + +/* extract ost index from IDIF FID */ +static inline __u32 fid_idif_ost_idx(const struct lu_fid *fid) +{ + return (fid_seq(fid) >> 16) & 0xffff; +} + +/* extract OST sequence (group) from a wire ost_id (id/seq) pair */ +static inline __u64 ostid_seq(const struct ost_id *ostid) +{ + if (fid_seq_is_mdt0(ostid->oi.oi_seq)) + return FID_SEQ_OST_MDT0; + + if (fid_seq_is_default(ostid->oi.oi_seq)) + return FID_SEQ_LOV_DEFAULT; + + if (fid_is_idif(&ostid->oi_fid)) + return FID_SEQ_OST_MDT0; + + return fid_seq(&ostid->oi_fid); +} + +/* extract OST objid from a wire ost_id (id/seq) pair */ +static inline __u64 ostid_id(const struct ost_id *ostid) +{ + if (fid_seq_is_mdt0(ostid_seq(ostid))) + return ostid->oi.oi_id & IDIF_OID_MASK; + + if (fid_is_idif(&ostid->oi_fid)) + return fid_idif_id(fid_seq(&ostid->oi_fid), + fid_oid(&ostid->oi_fid), 0); + + return fid_oid(&ostid->oi_fid); +} + +static inline void ostid_set_seq(struct ost_id *oi, __u64 seq) +{ + if (fid_seq_is_mdt0(seq) || fid_seq_is_default(seq)) { + oi->oi.oi_seq = seq; + } else { + oi->oi_fid.f_seq = seq; + /* Note: if f_oid + f_ver is zero, we need init it + * to be 1, otherwise, ostid_seq will treat this + * as old ostid (oi_seq == 0) */ + if (oi->oi_fid.f_oid == 0 && oi->oi_fid.f_ver == 0) + oi->oi_fid.f_oid = LUSTRE_FID_INIT_OID; + } +} + +static inline void ostid_set_seq_mdt0(struct ost_id *oi) +{ + ostid_set_seq(oi, FID_SEQ_OST_MDT0); +} + +static inline void ostid_set_seq_echo(struct ost_id *oi) +{ + ostid_set_seq(oi, FID_SEQ_ECHO); +} + +static inline void ostid_set_seq_llog(struct ost_id *oi) +{ + ostid_set_seq(oi, FID_SEQ_LLOG); +} + +/** + * Note: we need check oi_seq to decide where to set oi_id, + * so oi_seq should always be set ahead of oi_id. + */ +static inline void ostid_set_id(struct ost_id *oi, __u64 oid) +{ + if (fid_seq_is_mdt0(ostid_seq(oi))) { + if (oid >= IDIF_MAX_OID) { + CERROR("Bad %llu to set "DOSTID"\n", + oid, POSTID(oi)); + return; + } + oi->oi.oi_id = oid; + } else { + if (oid > OBIF_MAX_OID) { + CERROR("Bad %llu to set "DOSTID"\n", + oid, POSTID(oi)); + return; + } + oi->oi_fid.f_oid = oid; + } +} + +static inline void ostid_inc_id(struct ost_id *oi) +{ + if (fid_seq_is_mdt0(ostid_seq(oi))) { + if (unlikely(ostid_id(oi) + 1 > IDIF_MAX_OID)) { + CERROR("Bad inc "DOSTID"\n", POSTID(oi)); + return; + } + oi->oi.oi_id++; + } else { + oi->oi_fid.f_oid++; + } +} + +static inline void ostid_dec_id(struct ost_id *oi) +{ + if (fid_seq_is_mdt0(ostid_seq(oi))) + oi->oi.oi_id--; + else + oi->oi_fid.f_oid--; +} + +/** + * Unpack an OST object id/seq (group) into a FID. 
This is needed for + * converting all obdo, lmm, lsm, etc. 64-bit id/seq pairs into proper + * FIDs. Note that if an id/seq is already in FID/IDIF format it will + * be passed through unchanged. Only legacy OST objects in "group 0" + * will be mapped into the IDIF namespace so that they can fit into the + * struct lu_fid fields without loss. For reference see: + * http://arch.lustre.org/index.php?title=Interoperability_fids_zfs + */ +static inline int ostid_to_fid(struct lu_fid *fid, struct ost_id *ostid, + __u32 ost_idx) +{ + if (ost_idx > 0xffff) { + CERROR("bad ost_idx, "DOSTID" ost_idx:%u\n", POSTID(ostid), + ost_idx); + return -EBADF; + } + + if (fid_seq_is_mdt0(ostid_seq(ostid))) { + /* This is a "legacy" (old 1.x/2.early) OST object in "group 0" + * that we map into the IDIF namespace. It allows up to 2^48 + * objects per OST, as this is the object namespace that has + * been in production for years. This can handle create rates + * of 1M objects/s/OST for 9 years, or combinations thereof. */ + if (ostid_id(ostid) >= IDIF_MAX_OID) { + CERROR("bad MDT0 id, "DOSTID" ost_idx:%u\n", + POSTID(ostid), ost_idx); + return -EBADF; + } + fid->f_seq = fid_idif_seq(ostid_id(ostid), ost_idx); + /* truncate to 32 bits by assignment */ + fid->f_oid = ostid_id(ostid); + /* in theory, not currently used */ + fid->f_ver = ostid_id(ostid) >> 48; + } else /* if (fid_seq_is_idif(seq) || fid_seq_is_norm(seq)) */ { + /* This is either an IDIF object, which identifies objects across + * all OSTs, or a regular FID. The IDIF namespace maps legacy + * OST objects into the FID namespace. In both cases, we just + * pass the FID through, no conversion needed. */ + if (ostid->oi_fid.f_ver != 0) { + CERROR("bad MDT0 id, "DOSTID" ost_idx:%u\n", + POSTID(ostid), ost_idx); + return -EBADF; + } + *fid = ostid->oi_fid; + } + + return 0; +} + +/* pack any OST FID into an ostid (id/seq) for the wire/disk */ +static inline int fid_to_ostid(const struct lu_fid *fid, struct ost_id *ostid) +{ + if (unlikely(fid_seq_is_igif(fid->f_seq))) { + CERROR("bad IGIF, "DFID"\n", PFID(fid)); + return -EBADF; + } + + if (fid_is_idif(fid)) { + ostid_set_seq_mdt0(ostid); + ostid_set_id(ostid, fid_idif_id(fid_seq(fid), fid_oid(fid), + fid_ver(fid))); + } else { + ostid->oi_fid = *fid; + } + + return 0; +} + +/* Check whether the fid is for LAST_ID */ +static inline int fid_is_last_id(const struct lu_fid *fid) +{ + return (fid_oid(fid) == 0); +} + +/** + * Get inode number from a igif. + * \param fid a igif to get inode number from. + * \return inode number for the igif. + */ +static inline ino_t lu_igif_ino(const struct lu_fid *fid) +{ + return fid_seq(fid); +} + +extern void lustre_swab_ost_id(struct ost_id *oid); + +/** + * Get inode generation from a igif. + * \param fid a igif to get inode generation from. + * \return inode generation for the igif. + */ +static inline __u32 lu_igif_gen(const struct lu_fid *fid) +{ + return fid_oid(fid); +} + +/** + * Build igif from the inode number/generation. + */ +static inline void lu_igif_build(struct lu_fid *fid, __u32 ino, __u32 gen) +{ + fid->f_seq = ino; + fid->f_oid = gen; + fid->f_ver = 0; +} + +/* + * Fids are transmitted across network (in the sender byte-ordering), + * and stored on disk in big-endian order. 
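/*
 * Illustrative sketch, not part of the imported header: round-tripping a
 * legacy "group 0" OST object through the IDIF mapping above.  For objid
 * 0x123456789 on ost_idx 5, ostid_to_fid() yields f_seq == 0x100050001
 * (FID_SEQ_IDIF | idx << 16 | high 16 bits of the objid), f_oid == 0x23456789
 * (low 32 bits) and f_ver == 0, and fid_idif_id() recovers the original objid.
 * The function name is invented for this example.
 */
static inline int ostid_idif_round_trip_example(void)
{
        struct ost_id oi;
        struct lu_fid fid;

        memset(&oi, 0, sizeof(oi));
        ostid_set_seq_mdt0(&oi);                /* legacy "group 0" object */
        ostid_set_id(&oi, 0x123456789ULL);

        if (ostid_to_fid(&fid, &oi, 5) != 0)
                return 0;
        /* both sides evaluate to the original objid, 0x123456789 */
        return fid_idif_id(fid_seq(&fid), fid_oid(&fid), fid_ver(&fid)) ==
               ostid_id(&oi);
}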
+ */ +static inline void fid_cpu_to_le(struct lu_fid *dst, const struct lu_fid *src) +{ + dst->f_seq = cpu_to_le64(fid_seq(src)); + dst->f_oid = cpu_to_le32(fid_oid(src)); + dst->f_ver = cpu_to_le32(fid_ver(src)); +} + +static inline void fid_le_to_cpu(struct lu_fid *dst, const struct lu_fid *src) +{ + dst->f_seq = le64_to_cpu(fid_seq(src)); + dst->f_oid = le32_to_cpu(fid_oid(src)); + dst->f_ver = le32_to_cpu(fid_ver(src)); +} + +static inline void fid_cpu_to_be(struct lu_fid *dst, const struct lu_fid *src) +{ + dst->f_seq = cpu_to_be64(fid_seq(src)); + dst->f_oid = cpu_to_be32(fid_oid(src)); + dst->f_ver = cpu_to_be32(fid_ver(src)); +} + +static inline void fid_be_to_cpu(struct lu_fid *dst, const struct lu_fid *src) +{ + dst->f_seq = be64_to_cpu(fid_seq(src)); + dst->f_oid = be32_to_cpu(fid_oid(src)); + dst->f_ver = be32_to_cpu(fid_ver(src)); +} + +static inline int fid_is_sane(const struct lu_fid *fid) +{ + return fid != NULL && + ((fid_seq(fid) >= FID_SEQ_START && fid_ver(fid) == 0) || + fid_is_igif(fid) || fid_is_idif(fid) || + fid_seq_is_rsvd(fid_seq(fid))); +} + +static inline int fid_is_zero(const struct lu_fid *fid) +{ + return fid_seq(fid) == 0 && fid_oid(fid) == 0; +} + +extern void lustre_swab_lu_fid(struct lu_fid *fid); +extern void lustre_swab_lu_seq_range(struct lu_seq_range *range); + +static inline int lu_fid_eq(const struct lu_fid *f0, const struct lu_fid *f1) +{ + return memcmp(f0, f1, sizeof(*f0)) == 0; +} + +#define __diff_normalize(val0, val1) \ +({ \ + typeof(val0) __val0 = (val0); \ + typeof(val1) __val1 = (val1); \ + \ + (__val0 == __val1 ? 0 : __val0 > __val1 ? +1 : -1); \ +}) + +static inline int lu_fid_cmp(const struct lu_fid *f0, + const struct lu_fid *f1) +{ + return + __diff_normalize(fid_seq(f0), fid_seq(f1)) ?: + __diff_normalize(fid_oid(f0), fid_oid(f1)) ?: + __diff_normalize(fid_ver(f0), fid_ver(f1)); +} + +static inline void ostid_cpu_to_le(const struct ost_id *src_oi, + struct ost_id *dst_oi) +{ + if (fid_seq_is_mdt0(ostid_seq(src_oi))) { + dst_oi->oi.oi_id = cpu_to_le64(src_oi->oi.oi_id); + dst_oi->oi.oi_seq = cpu_to_le64(src_oi->oi.oi_seq); + } else { + fid_cpu_to_le(&dst_oi->oi_fid, &src_oi->oi_fid); + } +} + +static inline void ostid_le_to_cpu(const struct ost_id *src_oi, + struct ost_id *dst_oi) +{ + if (fid_seq_is_mdt0(ostid_seq(src_oi))) { + dst_oi->oi.oi_id = le64_to_cpu(src_oi->oi.oi_id); + dst_oi->oi.oi_seq = le64_to_cpu(src_oi->oi.oi_seq); + } else { + fid_le_to_cpu(&dst_oi->oi_fid, &src_oi->oi_fid); + } +} + +/** @} lu_fid */ + +/** \defgroup lu_dir lu_dir + * @{ */ + +/** + * Enumeration of possible directory entry attributes. + * + * Attributes follow directory entry header in the order they appear in this + * enumeration. + */ +enum lu_dirent_attrs { + LUDA_FID = 0x0001, + LUDA_TYPE = 0x0002, + LUDA_64BITHASH = 0x0004, + + /* The following attrs are used for MDT internal only, + * not visible to client */ + + /* Verify the dirent consistency */ + LUDA_VERIFY = 0x8000, + /* Only check but not repair the dirent inconsistency */ + LUDA_VERIFY_DRYRUN = 0x4000, + /* The dirent has been repaired, or to be repaired (dryrun). */ + LUDA_REPAIR = 0x2000, + /* The system is upgraded, has beed or to be repaired (dryrun). */ + LUDA_UPGRADE = 0x1000, + /* Ignore this record, go to next directly. */ + LUDA_IGNORE = 0x0800, +}; + +#define LU_DIRENT_ATTRS_MASK 0xf800 + +/** + * Layout of readdir pages, as transmitted on wire. + */ +struct lu_dirent { + /** valid if LUDA_FID is set. 
*/ + struct lu_fid lde_fid; + /** a unique entry identifier: a hash or an offset. */ + __u64 lde_hash; + /** total record length, including all attributes. */ + __u16 lde_reclen; + /** name length */ + __u16 lde_namelen; + /** optional variable size attributes following this entry. + * taken from enum lu_dirent_attrs. + */ + __u32 lde_attrs; + /** name is followed by the attributes indicated in ->ldp_attrs, in + * their natural order. After the last attribute, padding bytes are + * added to make ->lde_reclen a multiple of 8. + */ + char lde_name[0]; +}; + +/* + * Definitions of optional directory entry attributes formats. + * + * Individual attributes do not have their length encoded in a generic way. It + * is assumed that consumer of an attribute knows its format. This means that + * it is impossible to skip over an unknown attribute, except by skipping over all + * remaining attributes (by using ->lde_reclen), which is not too + * constraining, because new server versions will append new attributes at + * the end of an entry. + */ + +/** + * Fid directory attribute: a fid of an object referenced by the entry. This + * will be almost always requested by the client and supplied by the server. + * + * Aligned to 8 bytes. + */ +/* To have compatibility with 1.8, lets have fid in lu_dirent struct. */ + +/** + * File type. + * + * Aligned to 2 bytes. + */ +struct luda_type { + __u16 lt_type; +}; + +#ifndef IFSHIFT +#define IFSHIFT 12 +#endif + +#ifndef IFTODT +#define IFTODT(type) (((type) & S_IFMT) >> IFSHIFT) +#endif +#ifndef DTTOIF +#define DTTOIF(dirtype) ((dirtype) << IFSHIFT) +#endif + + +struct lu_dirpage { + __u64 ldp_hash_start; + __u64 ldp_hash_end; + __u32 ldp_flags; + __u32 ldp_pad0; + struct lu_dirent ldp_entries[0]; +}; + +enum lu_dirpage_flags { + /** + * dirpage contains no entry. + */ + LDF_EMPTY = 1 << 0, + /** + * last entry's lde_hash equals ldp_hash_end. + */ + LDF_COLLIDE = 1 << 1 +}; + +static inline struct lu_dirent *lu_dirent_start(struct lu_dirpage *dp) +{ + if (le32_to_cpu(dp->ldp_flags) & LDF_EMPTY) + return NULL; + else + return dp->ldp_entries; +} + +static inline struct lu_dirent *lu_dirent_next(struct lu_dirent *ent) +{ + struct lu_dirent *next; + + if (le16_to_cpu(ent->lde_reclen) != 0) + next = ((void *)ent) + le16_to_cpu(ent->lde_reclen); + else + next = NULL; + + return next; +} + +static inline int lu_dirent_calc_size(int namelen, __u16 attr) +{ + int size; + + if (attr & LUDA_TYPE) { + const unsigned align = sizeof(struct luda_type) - 1; + size = (sizeof(struct lu_dirent) + namelen + align) & ~align; + size += sizeof(struct luda_type); + } else + size = sizeof(struct lu_dirent) + namelen; + + return (size + 7) & ~7; +} + +static inline int lu_dirent_size(struct lu_dirent *ent) +{ + if (le16_to_cpu(ent->lde_reclen) == 0) { + return lu_dirent_calc_size(le16_to_cpu(ent->lde_namelen), + le32_to_cpu(ent->lde_attrs)); + } + return le16_to_cpu(ent->lde_reclen); +} + +#define MDS_DIR_END_OFF 0xfffffffffffffffeULL + +/** + * MDS_READPAGE page size + * + * This is the directory page size packed in MDS_READPAGE RPC. + * It's different than PAGE_CACHE_SIZE because the client needs to + * access the struct lu_dirpage header packed at the beginning of + * the "page" and without this there isn't any way to know find the + * lu_dirpage header is if client and server PAGE_CACHE_SIZE differ. 
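/*
 * Illustrative sketch, not part of the imported header: walking the entries
 * of one readdir page with the accessors defined above.  Simplified for the
 * example (the name lu_dirpage_walk_example and the callback are invented);
 * real consumers also bound the walk by ldp_hash_end and byte-swap the
 * little-endian fields as needed.
 */
static inline void lu_dirpage_walk_example(struct lu_dirpage *dp,
                                           void (*cb)(const struct lu_dirent *))
{
        struct lu_dirent *ent;

        for (ent = lu_dirent_start(dp); ent != NULL; ent = lu_dirent_next(ent))
                cb(ent);        /* lde_name holds lde_namelen bytes */
}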
+ */ +#define LU_PAGE_SHIFT 12 +#define LU_PAGE_SIZE (1UL << LU_PAGE_SHIFT) +#define LU_PAGE_MASK (~(LU_PAGE_SIZE - 1)) + +#define LU_PAGE_COUNT (1 << (PAGE_CACHE_SHIFT - LU_PAGE_SHIFT)) + +/** @} lu_dir */ + +struct lustre_handle { + __u64 cookie; +}; +#define DEAD_HANDLE_MAGIC 0xdeadbeefcafebabeULL + +static inline int lustre_handle_is_used(struct lustre_handle *lh) +{ + return lh->cookie != 0ull; +} + +static inline int lustre_handle_equal(const struct lustre_handle *lh1, + const struct lustre_handle *lh2) +{ + return lh1->cookie == lh2->cookie; +} + +static inline void lustre_handle_copy(struct lustre_handle *tgt, + struct lustre_handle *src) +{ + tgt->cookie = src->cookie; +} + +/* flags for lm_flags */ +#define MSGHDR_AT_SUPPORT 0x1 +#define MSGHDR_CKSUM_INCOMPAT18 0x2 + +#define lustre_msg lustre_msg_v2 +/* we depend on this structure to be 8-byte aligned */ +/* this type is only endian-adjusted in lustre_unpack_msg() */ +struct lustre_msg_v2 { + __u32 lm_bufcount; + __u32 lm_secflvr; + __u32 lm_magic; + __u32 lm_repsize; + __u32 lm_cksum; + __u32 lm_flags; + __u32 lm_padding_2; + __u32 lm_padding_3; + __u32 lm_buflens[0]; +}; + +/* without gss, ptlrpc_body is put at the first buffer. */ +#define PTLRPC_NUM_VERSIONS 4 +#define JOBSTATS_JOBID_SIZE 32 /* 32 bytes string */ +struct ptlrpc_body_v3 { + struct lustre_handle pb_handle; + __u32 pb_type; + __u32 pb_version; + __u32 pb_opc; + __u32 pb_status; + __u64 pb_last_xid; + __u64 pb_last_seen; + __u64 pb_last_committed; + __u64 pb_transno; + __u32 pb_flags; + __u32 pb_op_flags; + __u32 pb_conn_cnt; + __u32 pb_timeout; /* for req, the deadline, for rep, the service est */ + __u32 pb_service_time; /* for rep, actual service time */ + __u32 pb_limit; + __u64 pb_slv; + /* VBR: pre-versions */ + __u64 pb_pre_versions[PTLRPC_NUM_VERSIONS]; + /* padding for future needs */ + __u64 pb_padding[4]; + char pb_jobid[JOBSTATS_JOBID_SIZE]; +}; +#define ptlrpc_body ptlrpc_body_v3 + +struct ptlrpc_body_v2 { + struct lustre_handle pb_handle; + __u32 pb_type; + __u32 pb_version; + __u32 pb_opc; + __u32 pb_status; + __u64 pb_last_xid; + __u64 pb_last_seen; + __u64 pb_last_committed; + __u64 pb_transno; + __u32 pb_flags; + __u32 pb_op_flags; + __u32 pb_conn_cnt; + __u32 pb_timeout; /* for req, the deadline, for rep, the service est */ + __u32 pb_service_time; /* for rep, actual service time, also used for + net_latency of req */ + __u32 pb_limit; + __u64 pb_slv; + /* VBR: pre-versions */ + __u64 pb_pre_versions[PTLRPC_NUM_VERSIONS]; + /* padding for future needs */ + __u64 pb_padding[4]; +}; + +extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb); + +/* message body offset for lustre_msg_v2 */ +/* ptlrpc body offset in all request/reply messages */ +#define MSG_PTLRPC_BODY_OFF 0 + +/* normal request/reply message record offset */ +#define REQ_REC_OFF 1 +#define REPLY_REC_OFF 1 + +/* ldlm request message body offset */ +#define DLM_LOCKREQ_OFF 1 /* lockreq offset */ +#define DLM_REQ_REC_OFF 2 /* normal dlm request record offset */ + +/* ldlm intent lock message body offset */ +#define DLM_INTENT_IT_OFF 2 /* intent lock it offset */ +#define DLM_INTENT_REC_OFF 3 /* intent lock record offset */ + +/* ldlm reply message body offset */ +#define DLM_LOCKREPLY_OFF 1 /* lockrep offset */ +#define DLM_REPLY_REC_OFF 2 /* reply record offset */ + +/** only use in req->rq_{req,rep}_swab_mask */ +#define MSG_PTLRPC_HEADER_OFF 31 + +/* Flags that are operation-specific go in the top 16 bits. 
*/ +#define MSG_OP_FLAG_MASK 0xffff0000 +#define MSG_OP_FLAG_SHIFT 16 + +/* Flags that apply to all requests are in the bottom 16 bits */ +#define MSG_GEN_FLAG_MASK 0x0000ffff +#define MSG_LAST_REPLAY 0x0001 +#define MSG_RESENT 0x0002 +#define MSG_REPLAY 0x0004 +/* #define MSG_AT_SUPPORT 0x0008 + * This was used in early prototypes of adaptive timeouts, and while there + * shouldn't be any users of that code there also isn't a need for using this + * bits. Defer usage until at least 1.10 to avoid potential conflict. */ +#define MSG_DELAY_REPLAY 0x0010 +#define MSG_VERSION_REPLAY 0x0020 +#define MSG_REQ_REPLAY_DONE 0x0040 +#define MSG_LOCK_REPLAY_DONE 0x0080 + +/* + * Flags for all connect opcodes (MDS_CONNECT, OST_CONNECT) + */ + +#define MSG_CONNECT_RECOVERING 0x00000001 +#define MSG_CONNECT_RECONNECT 0x00000002 +#define MSG_CONNECT_REPLAYABLE 0x00000004 +//#define MSG_CONNECT_PEER 0x8 +#define MSG_CONNECT_LIBCLIENT 0x00000010 +#define MSG_CONNECT_INITIAL 0x00000020 +#define MSG_CONNECT_ASYNC 0x00000040 +#define MSG_CONNECT_NEXT_VER 0x00000080 /* use next version of lustre_msg */ +#define MSG_CONNECT_TRANSNO 0x00000100 /* report transno */ + +/* Connect flags */ +#define OBD_CONNECT_RDONLY 0x1ULL /*client has read-only access*/ +#define OBD_CONNECT_INDEX 0x2ULL /*connect specific LOV idx */ +#define OBD_CONNECT_MDS 0x4ULL /*connect from MDT to OST */ +#define OBD_CONNECT_GRANT 0x8ULL /*OSC gets grant at connect */ +#define OBD_CONNECT_SRVLOCK 0x10ULL /*server takes locks for cli */ +#define OBD_CONNECT_VERSION 0x20ULL /*Lustre versions in ocd */ +#define OBD_CONNECT_REQPORTAL 0x40ULL /*Separate non-IO req portal */ +#define OBD_CONNECT_ACL 0x80ULL /*access control lists */ +#define OBD_CONNECT_XATTR 0x100ULL /*client use extended attr */ +#define OBD_CONNECT_CROW 0x200ULL /*MDS+OST create obj on write*/ +#define OBD_CONNECT_TRUNCLOCK 0x400ULL /*locks on server for punch */ +#define OBD_CONNECT_TRANSNO 0x800ULL /*replay sends init transno */ +#define OBD_CONNECT_IBITS 0x1000ULL /*support for inodebits locks*/ +#define OBD_CONNECT_JOIN 0x2000ULL /*files can be concatenated. + *We do not support JOIN FILE + *anymore, reserve this flags + *just for preventing such bit + *to be reused.*/ +#define OBD_CONNECT_ATTRFID 0x4000ULL /*Server can GetAttr By Fid*/ +#define OBD_CONNECT_NODEVOH 0x8000ULL /*No open hndl on specl nodes*/ +#define OBD_CONNECT_RMT_CLIENT 0x10000ULL /*Remote client */ +#define OBD_CONNECT_RMT_CLIENT_FORCE 0x20000ULL /*Remote client by force */ +#define OBD_CONNECT_BRW_SIZE 0x40000ULL /*Max bytes per rpc */ +#define OBD_CONNECT_QUOTA64 0x80000ULL /*Not used since 2.4 */ +#define OBD_CONNECT_MDS_CAPA 0x100000ULL /*MDS capability */ +#define OBD_CONNECT_OSS_CAPA 0x200000ULL /*OSS capability */ +#define OBD_CONNECT_CANCELSET 0x400000ULL /*Early batched cancels. */ +#define OBD_CONNECT_SOM 0x800000ULL /*Size on MDS */ +#define OBD_CONNECT_AT 0x1000000ULL /*client uses AT */ +#define OBD_CONNECT_LRU_RESIZE 0x2000000ULL /*LRU resize feature. 
*/ +#define OBD_CONNECT_MDS_MDS 0x4000000ULL /*MDS-MDS connection */ +#define OBD_CONNECT_REAL 0x8000000ULL /*real connection */ +#define OBD_CONNECT_CHANGE_QS 0x10000000ULL /*Not used since 2.4 */ +#define OBD_CONNECT_CKSUM 0x20000000ULL /*support several cksum algos*/ +#define OBD_CONNECT_FID 0x40000000ULL /*FID is supported by server */ +#define OBD_CONNECT_VBR 0x80000000ULL /*version based recovery */ +#define OBD_CONNECT_LOV_V3 0x100000000ULL /*client supports LOV v3 EA */ +#define OBD_CONNECT_GRANT_SHRINK 0x200000000ULL /* support grant shrink */ +#define OBD_CONNECT_SKIP_ORPHAN 0x400000000ULL /* don't reuse orphan objids */ +#define OBD_CONNECT_MAX_EASIZE 0x800000000ULL /* preserved for large EA */ +#define OBD_CONNECT_FULL20 0x1000000000ULL /* it is 2.0 client */ +#define OBD_CONNECT_LAYOUTLOCK 0x2000000000ULL /* client uses layout lock */ +#define OBD_CONNECT_64BITHASH 0x4000000000ULL /* client supports 64-bits + * directory hash */ +#define OBD_CONNECT_MAXBYTES 0x8000000000ULL /* max stripe size */ +#define OBD_CONNECT_IMP_RECOV 0x10000000000ULL /* imp recovery support */ +#define OBD_CONNECT_JOBSTATS 0x20000000000ULL /* jobid in ptlrpc_body */ +#define OBD_CONNECT_UMASK 0x40000000000ULL /* create uses client umask */ +#define OBD_CONNECT_EINPROGRESS 0x80000000000ULL /* client handles -EINPROGRESS + * RPC error properly */ +#define OBD_CONNECT_GRANT_PARAM 0x100000000000ULL/* extra grant params used for + * finer space reservation */ +#define OBD_CONNECT_FLOCK_OWNER 0x200000000000ULL /* for the fixed 1.8 + * policy and 2.x server */ +#define OBD_CONNECT_LVB_TYPE 0x400000000000ULL /* variable type of LVB */ +#define OBD_CONNECT_NANOSEC_TIME 0x800000000000ULL /* nanosecond timestamps */ +#define OBD_CONNECT_LIGHTWEIGHT 0x1000000000000ULL/* lightweight connection */ +#define OBD_CONNECT_SHORTIO 0x2000000000000ULL/* short io */ +#define OBD_CONNECT_PINGLESS 0x4000000000000ULL/* pings not required */ +#define OBD_CONNECT_FLOCK_DEAD 0x8000000000000ULL/* flock deadlock detection */ +#define OBD_CONNECT_DISP_STRIPE 0x10000000000000ULL/*create stripe disposition*/ + +/* XXX README XXX: + * Please DO NOT add flag values here before first ensuring that this same + * flag value is not in use on some other branch. Please clear any such + * changes with senior engineers before starting to use a new flag. Then, + * submit a small patch against EVERY branch that ONLY adds the new flag, + * updates obd_connect_names[] for lprocfs_rd_connect_flags(), adds the + * flag to check_obd_connect_data(), and updates wiretests accordingly, so it + * can be approved and landed easily to reserve the flag for future use. */ + +/* The MNE_SWAB flag is overloading the MDS_MDS bit only for the MGS + * connection. It is a temporary bug fix for Imperative Recovery interop + * between 2.2 and 2.3 x86/ppc nodes, and can be removed when interop for + * 2.2 clients/servers is no longer needed. LU-1252/LU-1644. 
*/ +#define OBD_CONNECT_MNE_SWAB OBD_CONNECT_MDS_MDS + +#define OCD_HAS_FLAG(ocd, flg) \ + (!!((ocd)->ocd_connect_flags & OBD_CONNECT_##flg)) + + +#define LRU_RESIZE_CONNECT_FLAG OBD_CONNECT_LRU_RESIZE + +#define MDT_CONNECT_SUPPORTED (OBD_CONNECT_RDONLY | OBD_CONNECT_VERSION | \ + OBD_CONNECT_ACL | OBD_CONNECT_XATTR | \ + OBD_CONNECT_IBITS | \ + OBD_CONNECT_NODEVOH | OBD_CONNECT_ATTRFID | \ + OBD_CONNECT_CANCELSET | OBD_CONNECT_AT | \ + OBD_CONNECT_RMT_CLIENT | \ + OBD_CONNECT_RMT_CLIENT_FORCE | \ + OBD_CONNECT_BRW_SIZE | OBD_CONNECT_MDS_CAPA | \ + OBD_CONNECT_OSS_CAPA | OBD_CONNECT_MDS_MDS | \ + OBD_CONNECT_FID | LRU_RESIZE_CONNECT_FLAG | \ + OBD_CONNECT_VBR | OBD_CONNECT_LOV_V3 | \ + OBD_CONNECT_SOM | OBD_CONNECT_FULL20 | \ + OBD_CONNECT_64BITHASH | OBD_CONNECT_JOBSTATS | \ + OBD_CONNECT_EINPROGRESS | \ + OBD_CONNECT_LIGHTWEIGHT | OBD_CONNECT_UMASK | \ + OBD_CONNECT_LVB_TYPE | OBD_CONNECT_LAYOUTLOCK |\ + OBD_CONNECT_PINGLESS | OBD_CONNECT_MAX_EASIZE |\ + OBD_CONNECT_FLOCK_DEAD | \ + OBD_CONNECT_DISP_STRIPE) + +#define OST_CONNECT_SUPPORTED (OBD_CONNECT_SRVLOCK | OBD_CONNECT_GRANT | \ + OBD_CONNECT_REQPORTAL | OBD_CONNECT_VERSION | \ + OBD_CONNECT_TRUNCLOCK | OBD_CONNECT_INDEX | \ + OBD_CONNECT_BRW_SIZE | OBD_CONNECT_OSS_CAPA | \ + OBD_CONNECT_CANCELSET | OBD_CONNECT_AT | \ + LRU_RESIZE_CONNECT_FLAG | OBD_CONNECT_CKSUM | \ + OBD_CONNECT_RMT_CLIENT | \ + OBD_CONNECT_RMT_CLIENT_FORCE | OBD_CONNECT_VBR | \ + OBD_CONNECT_MDS | OBD_CONNECT_SKIP_ORPHAN | \ + OBD_CONNECT_GRANT_SHRINK | OBD_CONNECT_FULL20 | \ + OBD_CONNECT_64BITHASH | OBD_CONNECT_MAXBYTES | \ + OBD_CONNECT_MAX_EASIZE | \ + OBD_CONNECT_EINPROGRESS | \ + OBD_CONNECT_JOBSTATS | \ + OBD_CONNECT_LIGHTWEIGHT | OBD_CONNECT_LVB_TYPE|\ + OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_FID | \ + OBD_CONNECT_PINGLESS) +#define ECHO_CONNECT_SUPPORTED (0) +#define MGS_CONNECT_SUPPORTED (OBD_CONNECT_VERSION | OBD_CONNECT_AT | \ + OBD_CONNECT_FULL20 | OBD_CONNECT_IMP_RECOV | \ + OBD_CONNECT_MNE_SWAB | OBD_CONNECT_PINGLESS) + +/* Features required for this version of the client to work with server */ +#define CLIENT_CONNECT_MDT_REQD (OBD_CONNECT_IBITS | OBD_CONNECT_FID | \ + OBD_CONNECT_FULL20) + +#define OBD_OCD_VERSION(major, minor, patch, fix) (((major)<<24) + \ + ((minor)<<16) + \ + ((patch)<<8) + (fix)) +#define OBD_OCD_VERSION_MAJOR(version) ((int)((version)>>24)&255) +#define OBD_OCD_VERSION_MINOR(version) ((int)((version)>>16)&255) +#define OBD_OCD_VERSION_PATCH(version) ((int)((version)>>8)&255) +#define OBD_OCD_VERSION_FIX(version) ((int)(version)&255) + +/* This structure is used for both request and reply. + * + * If we eventually have separate connect data for different types, which we + * almost certainly will, then perhaps we stick a union in here. 
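/*
 * Illustrative sketch, not part of the imported header: the version packing
 * above is plain byte packing, e.g.
 *
 *   OBD_OCD_VERSION(2, 5, 3, 0)        == 0x02050300
 *   OBD_OCD_VERSION_MAJOR(0x02050300)  == 2
 *   OBD_OCD_VERSION_MINOR(0x02050300)  == 5
 *   OBD_OCD_VERSION_PATCH(0x02050300)  == 3
 *
 * and a feature test on the negotiated flags is a simple mask check
 * (equivalently OCD_HAS_FLAG(ocd, JOBSTATS) once struct obd_connect_data is
 * defined below).  The helper name is invented for this example.
 */
static inline int connect_flags_have_jobstats_example(__u64 ocd_connect_flags)
{
        return !!(ocd_connect_flags & OBD_CONNECT_JOBSTATS);
}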
*/ +struct obd_connect_data_v1 { + __u64 ocd_connect_flags; /* OBD_CONNECT_* per above */ + __u32 ocd_version; /* lustre release version number */ + __u32 ocd_grant; /* initial cache grant amount (bytes) */ + __u32 ocd_index; /* LOV index to connect to */ + __u32 ocd_brw_size; /* Maximum BRW size in bytes, must be 2^n */ + __u64 ocd_ibits_known; /* inode bits this client understands */ + __u8 ocd_blocksize; /* log2 of the backend filesystem blocksize */ + __u8 ocd_inodespace; /* log2 of the per-inode space consumption */ + __u16 ocd_grant_extent; /* per-extent grant overhead, in 1K blocks */ + __u32 ocd_unused; /* also fix lustre_swab_connect */ + __u64 ocd_transno; /* first transno from client to be replayed */ + __u32 ocd_group; /* MDS group on OST */ + __u32 ocd_cksum_types; /* supported checksum algorithms */ + __u32 ocd_max_easize; /* How big LOV EA can be on MDS */ + __u32 ocd_instance; /* also fix lustre_swab_connect */ + __u64 ocd_maxbytes; /* Maximum stripe size in bytes */ +}; + +struct obd_connect_data { + __u64 ocd_connect_flags; /* OBD_CONNECT_* per above */ + __u32 ocd_version; /* lustre release version number */ + __u32 ocd_grant; /* initial cache grant amount (bytes) */ + __u32 ocd_index; /* LOV index to connect to */ + __u32 ocd_brw_size; /* Maximum BRW size in bytes */ + __u64 ocd_ibits_known; /* inode bits this client understands */ + __u8 ocd_blocksize; /* log2 of the backend filesystem blocksize */ + __u8 ocd_inodespace; /* log2 of the per-inode space consumption */ + __u16 ocd_grant_extent; /* per-extent grant overhead, in 1K blocks */ + __u32 ocd_unused; /* also fix lustre_swab_connect */ + __u64 ocd_transno; /* first transno from client to be replayed */ + __u32 ocd_group; /* MDS group on OST */ + __u32 ocd_cksum_types; /* supported checksum algorithms */ + __u32 ocd_max_easize; /* How big LOV EA can be on MDS */ + __u32 ocd_instance; /* instance # of this target */ + __u64 ocd_maxbytes; /* Maximum stripe size in bytes */ + /* Fields after ocd_maxbytes are only accessible by the receiver + * if the corresponding flag in ocd_connect_flags is set. Accessing + * any field after ocd_maxbytes on the receiver without a valid flag + * may result in out-of-bound memory access and kernel oops. */ + __u64 padding1; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding2; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding3; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding4; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding5; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding6; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding7; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding8; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding9; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 paddingA; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 paddingB; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 paddingC; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 paddingD; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 paddingE; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 paddingF; /* added 2.1.0. also fix lustre_swab_connect */ +}; +/* XXX README XXX: + * Please DO NOT use any fields here before first ensuring that this same + * field is not in use on some other branch. Please clear any such changes + * with senior engineers before starting to use a new field. 
Then, submit + * a small patch against EVERY branch that ONLY adds the new field along with + * the matching OBD_CONNECT flag, so that can be approved and landed easily to + * reserve the flag for future use. */ + + +extern void lustre_swab_connect(struct obd_connect_data *ocd); + +/* + * Supported checksum algorithms. Up to 32 checksum types are supported. + * (32-bit mask stored in obd_connect_data::ocd_cksum_types) + * Please update DECLARE_CKSUM_NAME/OBD_CKSUM_ALL in obd.h when adding a new + * algorithm and also the OBD_FL_CKSUM* flags. + */ +typedef enum { + OBD_CKSUM_CRC32 = 0x00000001, + OBD_CKSUM_ADLER = 0x00000002, + OBD_CKSUM_CRC32C= 0x00000004, +} cksum_type_t; + +/* + * OST requests: OBDO & OBD request records + */ + +/* opcodes */ +typedef enum { + OST_REPLY = 0, /* reply ? */ + OST_GETATTR = 1, + OST_SETATTR = 2, + OST_READ = 3, + OST_WRITE = 4, + OST_CREATE = 5, + OST_DESTROY = 6, + OST_GET_INFO = 7, + OST_CONNECT = 8, + OST_DISCONNECT = 9, + OST_PUNCH = 10, + OST_OPEN = 11, + OST_CLOSE = 12, + OST_STATFS = 13, + OST_SYNC = 16, + OST_SET_INFO = 17, + OST_QUOTACHECK = 18, + OST_QUOTACTL = 19, + OST_QUOTA_ADJUST_QUNIT = 20, /* not used since 2.4 */ + OST_LAST_OPC +} ost_cmd_t; +#define OST_FIRST_OPC OST_REPLY + +enum obdo_flags { + OBD_FL_INLINEDATA = 0x00000001, + OBD_FL_OBDMDEXISTS = 0x00000002, + OBD_FL_DELORPHAN = 0x00000004, /* if set in o_flags delete orphans */ + OBD_FL_NORPC = 0x00000008, /* set in o_flags do in OSC not OST */ + OBD_FL_IDONLY = 0x00000010, /* set in o_flags only adjust obj id*/ + OBD_FL_RECREATE_OBJS= 0x00000020, /* recreate missing obj */ + OBD_FL_DEBUG_CHECK = 0x00000040, /* echo client/server debug check */ + OBD_FL_NO_USRQUOTA = 0x00000100, /* the object's owner is over quota */ + OBD_FL_NO_GRPQUOTA = 0x00000200, /* the object's group is over quota */ + OBD_FL_CREATE_CROW = 0x00000400, /* object should be create on write */ + OBD_FL_SRVLOCK = 0x00000800, /* delegate DLM locking to server */ + OBD_FL_CKSUM_CRC32 = 0x00001000, /* CRC32 checksum type */ + OBD_FL_CKSUM_ADLER = 0x00002000, /* ADLER checksum type */ + OBD_FL_CKSUM_CRC32C = 0x00004000, /* CRC32C checksum type */ + OBD_FL_CKSUM_RSVD2 = 0x00008000, /* for future cksum types */ + OBD_FL_CKSUM_RSVD3 = 0x00010000, /* for future cksum types */ + OBD_FL_SHRINK_GRANT = 0x00020000, /* object shrink the grant */ + OBD_FL_MMAP = 0x00040000, /* object is mmapped on the client. + * XXX: obsoleted - reserved for old + * clients prior than 2.2 */ + OBD_FL_RECOV_RESEND = 0x00080000, /* recoverable resent */ + OBD_FL_NOSPC_BLK = 0x00100000, /* no more block space on OST */ + + /* Note that while these checksum values are currently separate bits, + * in 2.x we can actually allow all values from 1-31 if we wanted. */ + OBD_FL_CKSUM_ALL = OBD_FL_CKSUM_CRC32 | OBD_FL_CKSUM_ADLER | + OBD_FL_CKSUM_CRC32C, + + /* mask for local-only flag, which won't be sent over network */ + OBD_FL_LOCAL_MASK = 0xF0000000, +}; + +#define LOV_MAGIC_V1 0x0BD10BD0 +#define LOV_MAGIC LOV_MAGIC_V1 +#define LOV_MAGIC_JOIN_V1 0x0BD20BD0 +#define LOV_MAGIC_V3 0x0BD30BD0 + +/* + * magic for fully defined striping + * the idea is that we should have different magics for striping "hints" + * (struct lov_user_md_v[13]) and defined ready-to-use striping (struct + * lov_mds_md_v[13]). at the moment the magics are used in wire protocol, + * we can't just change it w/o long way preparation, but we still need a + * mechanism to allow LOD to differentiate hint versus ready striping. 
+ * so, at the moment we do a trick: MDT knows what to expect from request + * depending on the case (replay uses ready striping, non-replay req uses + * hints), so MDT replaces magic with appropriate one and now LOD can + * easily understand what's inside -bzzz + */ +#define LOV_MAGIC_V1_DEF 0x0CD10BD0 +#define LOV_MAGIC_V3_DEF 0x0CD30BD0 + +#define LOV_PATTERN_RAID0 0x001 /* stripes are used round-robin */ +#define LOV_PATTERN_RAID1 0x002 /* stripes are mirrors of each other */ +#define LOV_PATTERN_FIRST 0x100 /* first stripe is not in round-robin */ +#define LOV_PATTERN_CMOBD 0x200 + +#define LOV_PATTERN_F_MASK 0xffff0000 +#define LOV_PATTERN_F_RELEASED 0x80000000 /* HSM released file */ + +#define lov_pattern(pattern) (pattern & ~LOV_PATTERN_F_MASK) +#define lov_pattern_flags(pattern) (pattern & LOV_PATTERN_F_MASK) + +#define lov_ost_data lov_ost_data_v1 +struct lov_ost_data_v1 { /* per-stripe data structure (little-endian)*/ + struct ost_id l_ost_oi; /* OST object ID */ + __u32 l_ost_gen; /* generation of this l_ost_idx */ + __u32 l_ost_idx; /* OST index in LOV (lov_tgt_desc->tgts) */ +}; + +#define lov_mds_md lov_mds_md_v1 +struct lov_mds_md_v1 { /* LOV EA mds/wire data (little-endian) */ + __u32 lmm_magic; /* magic number = LOV_MAGIC_V1 */ + __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */ + struct ost_id lmm_oi; /* LOV object ID */ + __u32 lmm_stripe_size; /* size of stripe in bytes */ + /* lmm_stripe_count used to be __u32 */ + __u16 lmm_stripe_count; /* num stripes in use for this object */ + __u16 lmm_layout_gen; /* layout generation number */ + struct lov_ost_data_v1 lmm_objects[0]; /* per-stripe data */ +}; + +/** + * Sigh, because pre-2.4 uses + * struct lov_mds_md_v1 { + * ........ + * __u64 lmm_object_id; + * __u64 lmm_object_seq; + * ...... + * } + * to identify the LOV(MDT) object, and lmm_object_seq will + * be normal_fid, which make it hard to combine these conversion + * to ostid_to FID. so we will do lmm_oi/fid conversion separately + * + * We can tell the lmm_oi by this way, + * 1.8: lmm_object_id = {inode}, lmm_object_gr = 0 + * 2.1: lmm_object_id = {oid < 128k}, lmm_object_seq = FID_SEQ_NORMAL + * 2.4: lmm_oi.f_seq = FID_SEQ_NORMAL, lmm_oi.f_oid = {oid < 128k}, + * lmm_oi.f_ver = 0 + * + * But currently lmm_oi/lsm_oi does not have any "real" usages, + * except for printing some information, and the user can always + * get the real FID from LMA, besides this multiple case check might + * make swab more complicate. So we will keep using id/seq for lmm_oi. 
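/*
 * Illustrative sketch, not part of the imported header: splitting an
 * lmm_pattern word with the macros above.  For a RAID0-striped file that has
 * been HSM-released, lmm_pattern == 0x80000001 and
 *
 *   lov_pattern(0x80000001)       == LOV_PATTERN_RAID0      (0x001)
 *   lov_pattern_flags(0x80000001) == LOV_PATTERN_F_RELEASED (0x80000000)
 *
 * The helper name is invented for this example.
 */
static inline int lov_pattern_released_example(__u32 lmm_pattern)
{
        return !!(lov_pattern_flags(lmm_pattern) & LOV_PATTERN_F_RELEASED);
}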
+ */ + +static inline void fid_to_lmm_oi(const struct lu_fid *fid, + struct ost_id *oi) +{ + oi->oi.oi_id = fid_oid(fid); + oi->oi.oi_seq = fid_seq(fid); +} + +static inline void lmm_oi_set_seq(struct ost_id *oi, __u64 seq) +{ + oi->oi.oi_seq = seq; +} + +static inline __u64 lmm_oi_id(struct ost_id *oi) +{ + return oi->oi.oi_id; +} + +static inline __u64 lmm_oi_seq(struct ost_id *oi) +{ + return oi->oi.oi_seq; +} + +static inline void lmm_oi_le_to_cpu(struct ost_id *dst_oi, + struct ost_id *src_oi) +{ + dst_oi->oi.oi_id = le64_to_cpu(src_oi->oi.oi_id); + dst_oi->oi.oi_seq = le64_to_cpu(src_oi->oi.oi_seq); +} + +static inline void lmm_oi_cpu_to_le(struct ost_id *dst_oi, + struct ost_id *src_oi) +{ + dst_oi->oi.oi_id = cpu_to_le64(src_oi->oi.oi_id); + dst_oi->oi.oi_seq = cpu_to_le64(src_oi->oi.oi_seq); +} + +/* extern void lustre_swab_lov_mds_md(struct lov_mds_md *llm); */ + +#define MAX_MD_SIZE \ + (sizeof(struct lov_mds_md) + 4 * sizeof(struct lov_ost_data)) +#define MIN_MD_SIZE \ + (sizeof(struct lov_mds_md) + 1 * sizeof(struct lov_ost_data)) + +#define XATTR_NAME_ACL_ACCESS "system.posix_acl_access" +#define XATTR_NAME_ACL_DEFAULT "system.posix_acl_default" +#define XATTR_USER_PREFIX "user." +#define XATTR_TRUSTED_PREFIX "trusted." +#define XATTR_SECURITY_PREFIX "security." +#define XATTR_LUSTRE_PREFIX "lustre." + +#define XATTR_NAME_LOV "trusted.lov" +#define XATTR_NAME_LMA "trusted.lma" +#define XATTR_NAME_LMV "trusted.lmv" +#define XATTR_NAME_LINK "trusted.link" +#define XATTR_NAME_FID "trusted.fid" +#define XATTR_NAME_VERSION "trusted.version" +#define XATTR_NAME_SOM "trusted.som" +#define XATTR_NAME_HSM "trusted.hsm" +#define XATTR_NAME_LFSCK_NAMESPACE "trusted.lfsck_namespace" + +struct lov_mds_md_v3 { /* LOV EA mds/wire data (little-endian) */ + __u32 lmm_magic; /* magic number = LOV_MAGIC_V3 */ + __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */ + struct ost_id lmm_oi; /* LOV object ID */ + __u32 lmm_stripe_size; /* size of stripe in bytes */ + /* lmm_stripe_count used to be __u32 */ + __u16 lmm_stripe_count; /* num stripes in use for this object */ + __u16 lmm_layout_gen; /* layout generation number */ + char lmm_pool_name[LOV_MAXPOOLNAME]; /* must be 32bit aligned */ + struct lov_ost_data_v1 lmm_objects[0]; /* per-stripe data */ +}; + +static inline __u32 lov_mds_md_size(__u16 stripes, __u32 lmm_magic) +{ + if (lmm_magic == LOV_MAGIC_V3) + return sizeof(struct lov_mds_md_v3) + + stripes * sizeof(struct lov_ost_data_v1); + else + return sizeof(struct lov_mds_md_v1) + + stripes * sizeof(struct lov_ost_data_v1); +} + +static inline __u32 +lov_mds_md_max_stripe_count(size_t buf_size, __u32 lmm_magic) +{ + switch (lmm_magic) { + case LOV_MAGIC_V1: { + struct lov_mds_md_v1 lmm; + + if (buf_size < sizeof(lmm)) + return 0; + + return (buf_size - sizeof(lmm)) / sizeof(lmm.lmm_objects[0]); + } + case LOV_MAGIC_V3: { + struct lov_mds_md_v3 lmm; + + if (buf_size < sizeof(lmm)) + return 0; + + return (buf_size - sizeof(lmm)) / sizeof(lmm.lmm_objects[0]); + } + default: + return 0; + } +} + +#define OBD_MD_FLID (0x00000001ULL) /* object ID */ +#define OBD_MD_FLATIME (0x00000002ULL) /* access time */ +#define OBD_MD_FLMTIME (0x00000004ULL) /* data modification time */ +#define OBD_MD_FLCTIME (0x00000008ULL) /* change time */ +#define OBD_MD_FLSIZE (0x00000010ULL) /* size */ +#define OBD_MD_FLBLOCKS (0x00000020ULL) /* allocated blocks count */ +#define OBD_MD_FLBLKSZ (0x00000040ULL) /* block size */ +#define OBD_MD_FLMODE (0x00000080ULL) /* access bits (mode & ~S_IFMT) */ +#define 
OBD_MD_FLTYPE (0x00000100ULL) /* object type (mode & S_IFMT) */ +#define OBD_MD_FLUID (0x00000200ULL) /* user ID */ +#define OBD_MD_FLGID (0x00000400ULL) /* group ID */ +#define OBD_MD_FLFLAGS (0x00000800ULL) /* flags word */ +#define OBD_MD_FLNLINK (0x00002000ULL) /* link count */ +#define OBD_MD_FLGENER (0x00004000ULL) /* generation number */ +/*#define OBD_MD_FLINLINE (0x00008000ULL) inline data. used until 1.6.5 */ +#define OBD_MD_FLRDEV (0x00010000ULL) /* device number */ +#define OBD_MD_FLEASIZE (0x00020000ULL) /* extended attribute data */ +#define OBD_MD_LINKNAME (0x00040000ULL) /* symbolic link target */ +#define OBD_MD_FLHANDLE (0x00080000ULL) /* file/lock handle */ +#define OBD_MD_FLCKSUM (0x00100000ULL) /* bulk data checksum */ +#define OBD_MD_FLQOS (0x00200000ULL) /* quality of service stats */ +/*#define OBD_MD_FLOSCOPQ (0x00400000ULL) osc opaque data, never used */ +#define OBD_MD_FLCOOKIE (0x00800000ULL) /* log cancellation cookie */ +#define OBD_MD_FLGROUP (0x01000000ULL) /* group */ +#define OBD_MD_FLFID (0x02000000ULL) /* ->ost write inline fid */ +#define OBD_MD_FLEPOCH (0x04000000ULL) /* ->ost write with ioepoch */ + /* ->mds if epoch opens or closes */ +#define OBD_MD_FLGRANT (0x08000000ULL) /* ost preallocation space grant */ +#define OBD_MD_FLDIREA (0x10000000ULL) /* dir's extended attribute data */ +#define OBD_MD_FLUSRQUOTA (0x20000000ULL) /* over quota flags sent from ost */ +#define OBD_MD_FLGRPQUOTA (0x40000000ULL) /* over quota flags sent from ost */ +#define OBD_MD_FLMODEASIZE (0x80000000ULL) /* EA size will be changed */ + +#define OBD_MD_MDS (0x0000000100000000ULL) /* where an inode lives on */ +#define OBD_MD_REINT (0x0000000200000000ULL) /* reintegrate oa */ +#define OBD_MD_MEA (0x0000000400000000ULL) /* CMD split EA */ +#define OBD_MD_TSTATE (0x0000000800000000ULL) /* transient state field */ + +#define OBD_MD_FLXATTR (0x0000001000000000ULL) /* xattr */ +#define OBD_MD_FLXATTRLS (0x0000002000000000ULL) /* xattr list */ +#define OBD_MD_FLXATTRRM (0x0000004000000000ULL) /* xattr remove */ +#define OBD_MD_FLACL (0x0000008000000000ULL) /* ACL */ +#define OBD_MD_FLRMTPERM (0x0000010000000000ULL) /* remote permission */ +#define OBD_MD_FLMDSCAPA (0x0000020000000000ULL) /* MDS capability */ +#define OBD_MD_FLOSSCAPA (0x0000040000000000ULL) /* OSS capability */ +#define OBD_MD_FLCKSPLIT (0x0000080000000000ULL) /* Check split on server */ +#define OBD_MD_FLCROSSREF (0x0000100000000000ULL) /* Cross-ref case */ +#define OBD_MD_FLGETATTRLOCK (0x0000200000000000ULL) /* Get IOEpoch attributes + * under lock; for xattr + * requests means the + * client holds the lock */ +#define OBD_MD_FLOBJCOUNT (0x0000400000000000ULL) /* for multiple destroy */ + +#define OBD_MD_FLRMTLSETFACL (0x0001000000000000ULL) /* lfs lsetfacl case */ +#define OBD_MD_FLRMTLGETFACL (0x0002000000000000ULL) /* lfs lgetfacl case */ +#define OBD_MD_FLRMTRSETFACL (0x0004000000000000ULL) /* lfs rsetfacl case */ +#define OBD_MD_FLRMTRGETFACL (0x0008000000000000ULL) /* lfs rgetfacl case */ + +#define OBD_MD_FLDATAVERSION (0x0010000000000000ULL) /* iversion sum */ +#define OBD_MD_FLRELEASED (0x0020000000000000ULL) /* file released */ + +#define OBD_MD_FLGETATTR (OBD_MD_FLID | OBD_MD_FLATIME | OBD_MD_FLMTIME | \ + OBD_MD_FLCTIME | OBD_MD_FLSIZE | OBD_MD_FLBLKSZ | \ + OBD_MD_FLMODE | OBD_MD_FLTYPE | OBD_MD_FLUID | \ + OBD_MD_FLGID | OBD_MD_FLFLAGS | OBD_MD_FLNLINK | \ + OBD_MD_FLGENER | OBD_MD_FLRDEV | OBD_MD_FLGROUP) + +#define OBD_MD_FLXATTRALL (OBD_MD_FLXATTR | OBD_MD_FLXATTRLS) + +/* don't forget 
obdo_fid which is way down at the bottom so it can + * come after the definition of llog_cookie */ + +enum hss_valid { + HSS_SETMASK = 0x01, + HSS_CLEARMASK = 0x02, + HSS_ARCHIVE_ID = 0x04, +}; + +struct hsm_state_set { + __u32 hss_valid; + __u32 hss_archive_id; + __u64 hss_setmask; + __u64 hss_clearmask; +}; + +extern void lustre_swab_hsm_user_state(struct hsm_user_state *hus); +extern void lustre_swab_hsm_state_set(struct hsm_state_set *hss); + +extern void lustre_swab_obd_statfs (struct obd_statfs *os); + +/* ost_body.data values for OST_BRW */ + +#define OBD_BRW_READ 0x01 +#define OBD_BRW_WRITE 0x02 +#define OBD_BRW_RWMASK (OBD_BRW_READ | OBD_BRW_WRITE) +#define OBD_BRW_SYNC 0x08 /* this page is a part of synchronous + * transfer and is not accounted in + * the grant. */ +#define OBD_BRW_CHECK 0x10 +#define OBD_BRW_FROM_GRANT 0x20 /* the osc manages this under llite */ +#define OBD_BRW_GRANTED 0x40 /* the ost manages this */ +#define OBD_BRW_NOCACHE 0x80 /* this page is a part of non-cached IO */ +#define OBD_BRW_NOQUOTA 0x100 +#define OBD_BRW_SRVLOCK 0x200 /* Client holds no lock over this page */ +#define OBD_BRW_ASYNC 0x400 /* Server may delay commit to disk */ +#define OBD_BRW_MEMALLOC 0x800 /* Client runs in the "kswapd" context */ +#define OBD_BRW_OVER_USRQUOTA 0x1000 /* Running out of user quota */ +#define OBD_BRW_OVER_GRPQUOTA 0x2000 /* Running out of group quota */ + +#define OBD_OBJECT_EOF 0xffffffffffffffffULL + +#define OST_MIN_PRECREATE 32 +#define OST_MAX_PRECREATE 20000 + +struct obd_ioobj { + struct ost_id ioo_oid; /* object ID, if multi-obj BRW */ + __u32 ioo_max_brw; /* low 16 bits were o_mode before 2.4, + * now (PTLRPC_BULK_OPS_COUNT - 1) in + * high 16 bits in 2.4 and later */ + __u32 ioo_bufcnt; /* number of niobufs for this object */ +}; + +#define IOOBJ_MAX_BRW_BITS 16 +#define IOOBJ_TYPE_MASK ((1U << IOOBJ_MAX_BRW_BITS) - 1) +#define ioobj_max_brw_get(ioo) (((ioo)->ioo_max_brw >> IOOBJ_MAX_BRW_BITS) + 1) +#define ioobj_max_brw_set(ioo, num) \ +do { (ioo)->ioo_max_brw = ((num) - 1) << IOOBJ_MAX_BRW_BITS; } while (0) + +extern void lustre_swab_obd_ioobj (struct obd_ioobj *ioo); + +/* multiple of 8 bytes => can array */ +struct niobuf_remote { + __u64 offset; + __u32 len; + __u32 flags; +}; + +extern void lustre_swab_niobuf_remote (struct niobuf_remote *nbr); + +/* lock value block communicated between the filter and llite */ + +/* OST_LVB_ERR_INIT is needed because the return code in rc is + * negative, i.e. because ((MASK + rc) & MASK) != MASK. 
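/*
 * Illustrative sketch, not part of the imported header: ioo_max_brw stores
 * (count - 1) in its high 16 bits, so ioobj_max_brw_set(&ioo, 8) leaves
 * ioo_max_brw == 0x00070000 and ioobj_max_brw_get(&ioo) returns 8.  The
 * function name is invented for this example.
 */
static inline int ioobj_max_brw_round_trip_example(__u32 num)
{
        struct obd_ioobj ioo;

        memset(&ioo, 0, sizeof(ioo));
        ioobj_max_brw_set(&ioo, num);
        return ioobj_max_brw_get(&ioo) == num;  /* holds for 1 <= num <= 65536 */
}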
*/ +#define OST_LVB_ERR_INIT 0xffbadbad80000000ULL +#define OST_LVB_ERR_MASK 0xffbadbad00000000ULL +#define OST_LVB_IS_ERR(blocks) \ + ((blocks & OST_LVB_ERR_MASK) == OST_LVB_ERR_MASK) +#define OST_LVB_SET_ERR(blocks, rc) \ + do { blocks = OST_LVB_ERR_INIT + rc; } while (0) +#define OST_LVB_GET_ERR(blocks) (int)(blocks - OST_LVB_ERR_INIT) + +struct ost_lvb_v1 { + __u64 lvb_size; + __s64 lvb_mtime; + __s64 lvb_atime; + __s64 lvb_ctime; + __u64 lvb_blocks; +}; + +extern void lustre_swab_ost_lvb_v1(struct ost_lvb_v1 *lvb); + +struct ost_lvb { + __u64 lvb_size; + __s64 lvb_mtime; + __s64 lvb_atime; + __s64 lvb_ctime; + __u64 lvb_blocks; + __u32 lvb_mtime_ns; + __u32 lvb_atime_ns; + __u32 lvb_ctime_ns; + __u32 lvb_padding; +}; + +extern void lustre_swab_ost_lvb(struct ost_lvb *lvb); + +/* + * lquota data structures + */ + +#ifndef QUOTABLOCK_BITS +#define QUOTABLOCK_BITS 10 +#endif + +#ifndef QUOTABLOCK_SIZE +#define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS) +#endif + +#ifndef toqb +#define toqb(x) (((x) + QUOTABLOCK_SIZE - 1) >> QUOTABLOCK_BITS) +#endif + +/* The lquota_id structure is an union of all the possible identifier types that + * can be used with quota, this includes: + * - 64-bit user ID + * - 64-bit group ID + * - a FID which can be used for per-directory quota in the future */ +union lquota_id { + struct lu_fid qid_fid; /* FID for per-directory quota */ + __u64 qid_uid; /* user identifier */ + __u64 qid_gid; /* group identifier */ +}; + +/* quotactl management */ +struct obd_quotactl { + __u32 qc_cmd; + __u32 qc_type; /* see Q_* flag below */ + __u32 qc_id; + __u32 qc_stat; + struct obd_dqinfo qc_dqinfo; + struct obd_dqblk qc_dqblk; +}; + +extern void lustre_swab_obd_quotactl(struct obd_quotactl *q); + +#define Q_QUOTACHECK 0x800100 /* deprecated as of 2.4 */ +#define Q_INITQUOTA 0x800101 /* deprecated as of 2.4 */ +#define Q_GETOINFO 0x800102 /* get obd quota info */ +#define Q_GETOQUOTA 0x800103 /* get obd quotas */ +#define Q_FINVALIDATE 0x800104 /* deprecated as of 2.4 */ + +#define Q_COPY(out, in, member) (out)->member = (in)->member + +#define QCTL_COPY(out, in) \ +do { \ + Q_COPY(out, in, qc_cmd); \ + Q_COPY(out, in, qc_type); \ + Q_COPY(out, in, qc_id); \ + Q_COPY(out, in, qc_stat); \ + Q_COPY(out, in, qc_dqinfo); \ + Q_COPY(out, in, qc_dqblk); \ +} while (0) + +/* Body of quota request used for quota acquire/release RPCs between quota + * master (aka QMT) and slaves (ak QSD). */ +struct quota_body { + struct lu_fid qb_fid; /* FID of global index packing the pool ID + * and type (data or metadata) as well as + * the quota type (user or group). */ + union lquota_id qb_id; /* uid or gid or directory FID */ + __u32 qb_flags; /* see below */ + __u32 qb_padding; + __u64 qb_count; /* acquire/release count (kbytes/inodes) */ + __u64 qb_usage; /* current slave usage (kbytes/inodes) */ + __u64 qb_slv_ver; /* slave index file version */ + struct lustre_handle qb_lockh; /* per-ID lock handle */ + struct lustre_handle qb_glb_lockh; /* global lock handle */ + __u64 qb_padding1[4]; +}; + +/* When the quota_body is used in the reply of quota global intent + * lock (IT_QUOTA_CONN) reply, qb_fid contains slave index file FID. 
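 * It is then read back through the qb_slv_fid alias defined just below.
 *
 * Editor's sketch (assumed DQACQ usage, not taken from the imported
 * code): a slave acquiring more block quota for a user would fill the
 * body roughly as
 *
 *	body->qb_id.qid_uid = uid;
 *	body->qb_flags      = QUOTA_DQACQ_FL_ACQ | QUOTA_DQACQ_FL_PREACQ;
 *	body->qb_count      = nr_kbytes;
 *	body->qb_usage      = cur_usage_kbytes;
 *
 * before sending a QUOTA_DQACQ request to the QMT.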
*/ +#define qb_slv_fid qb_fid +/* qb_usage is the current qunit (in kbytes/inodes) when quota_body is used in + * quota reply */ +#define qb_qunit qb_usage + +#define QUOTA_DQACQ_FL_ACQ 0x1 /* acquire quota */ +#define QUOTA_DQACQ_FL_PREACQ 0x2 /* pre-acquire */ +#define QUOTA_DQACQ_FL_REL 0x4 /* release quota */ +#define QUOTA_DQACQ_FL_REPORT 0x8 /* report usage */ + +extern void lustre_swab_quota_body(struct quota_body *b); + +/* Quota types currently supported */ +enum { + LQUOTA_TYPE_USR = 0x00, /* maps to USRQUOTA */ + LQUOTA_TYPE_GRP = 0x01, /* maps to GRPQUOTA */ + LQUOTA_TYPE_MAX +}; + +/* There are 2 different resource types on which a quota limit can be enforced: + * - inodes on the MDTs + * - blocks on the OSTs */ +enum { + LQUOTA_RES_MD = 0x01, /* skip 0 to avoid null oid in FID */ + LQUOTA_RES_DT = 0x02, + LQUOTA_LAST_RES, + LQUOTA_FIRST_RES = LQUOTA_RES_MD +}; +#define LQUOTA_NR_RES (LQUOTA_LAST_RES - LQUOTA_FIRST_RES + 1) + +/* + * Space accounting support + * Format of an accounting record, providing disk usage information for a given + * user or group + */ +struct lquota_acct_rec { /* 16 bytes */ + __u64 bspace; /* current space in use */ + __u64 ispace; /* current # inodes in use */ +}; + +/* + * Global quota index support + * Format of a global record, providing global quota settings for a given quota + * identifier + */ +struct lquota_glb_rec { /* 32 bytes */ + __u64 qbr_hardlimit; /* quota hard limit, in #inodes or kbytes */ + __u64 qbr_softlimit; /* quota soft limit, in #inodes or kbytes */ + __u64 qbr_time; /* grace time, in seconds */ + __u64 qbr_granted; /* how much is granted to slaves, in #inodes or + * kbytes */ +}; + +/* + * Slave index support + * Format of a slave record, recording how much space is granted to a given + * slave + */ +struct lquota_slv_rec { /* 8 bytes */ + __u64 qsr_granted; /* space granted to the slave for the key=ID, + * in #inodes or kbytes */ +}; + +/* Data structures associated with the quota locks */ + +/* Glimpse descriptor used for the index & per-ID quota locks */ +struct ldlm_gl_lquota_desc { + union lquota_id gl_id; /* quota ID subject to the glimpse */ + __u64 gl_flags; /* see LQUOTA_FL* below */ + __u64 gl_ver; /* new index version */ + __u64 gl_hardlimit; /* new hardlimit or qunit value */ + __u64 gl_softlimit; /* new softlimit */ + __u64 gl_time; + __u64 gl_pad2; +}; +#define gl_qunit gl_hardlimit /* current qunit value used when + * glimpsing per-ID quota locks */ + +/* quota glimpse flags */ +#define LQUOTA_FL_EDQUOT 0x1 /* user/group out of quota space on QMT */ + +/* LVB used with quota (global and per-ID) locks */ +struct lquota_lvb { + __u64 lvb_flags; /* see LQUOTA_FL* above */ + __u64 lvb_id_may_rel; /* space that might be released later */ + __u64 lvb_id_rel; /* space released by the slave for this ID */ + __u64 lvb_id_qunit; /* current qunit value */ + __u64 lvb_pad1; +}; + +extern void lustre_swab_lquota_lvb(struct lquota_lvb *lvb); + +/* LVB used with global quota lock */ +#define lvb_glb_ver lvb_id_may_rel /* current version of the global index */ + +/* op codes */ +typedef enum { + QUOTA_DQACQ = 601, + QUOTA_DQREL = 602, + QUOTA_LAST_OPC +} quota_cmd_t; +#define QUOTA_FIRST_OPC QUOTA_DQACQ + +/* + * MDS REQ RECORDS + */ + +/* opcodes */ +typedef enum { + MDS_GETATTR = 33, + MDS_GETATTR_NAME = 34, + MDS_CLOSE = 35, + MDS_REINT = 36, + MDS_READPAGE = 37, + MDS_CONNECT = 38, + MDS_DISCONNECT = 39, + MDS_GETSTATUS = 40, + MDS_STATFS = 41, + MDS_PIN = 42, + MDS_UNPIN = 43, + MDS_SYNC = 44, + MDS_DONE_WRITING = 45, + 
MDS_SET_INFO = 46, + MDS_QUOTACHECK = 47, + MDS_QUOTACTL = 48, + MDS_GETXATTR = 49, + MDS_SETXATTR = 50, /* obsolete, now it's MDS_REINT op */ + MDS_WRITEPAGE = 51, + MDS_IS_SUBDIR = 52, + MDS_GET_INFO = 53, + MDS_HSM_STATE_GET = 54, + MDS_HSM_STATE_SET = 55, + MDS_HSM_ACTION = 56, + MDS_HSM_PROGRESS = 57, + MDS_HSM_REQUEST = 58, + MDS_HSM_CT_REGISTER = 59, + MDS_HSM_CT_UNREGISTER = 60, + MDS_SWAP_LAYOUTS = 61, + MDS_LAST_OPC +} mds_cmd_t; + +#define MDS_FIRST_OPC MDS_GETATTR + + +/* opcodes for object update */ +typedef enum { + UPDATE_OBJ = 1000, + UPDATE_LAST_OPC +} update_cmd_t; + +#define UPDATE_FIRST_OPC UPDATE_OBJ + +/* + * Do not exceed 63 + */ + +typedef enum { + REINT_SETATTR = 1, + REINT_CREATE = 2, + REINT_LINK = 3, + REINT_UNLINK = 4, + REINT_RENAME = 5, + REINT_OPEN = 6, + REINT_SETXATTR = 7, + REINT_RMENTRY = 8, +// REINT_WRITE = 9, + REINT_MAX +} mds_reint_t, mdt_reint_t; + +extern void lustre_swab_generic_32s (__u32 *val); + +/* the disposition of the intent outlines what was executed */ +#define DISP_IT_EXECD 0x00000001 +#define DISP_LOOKUP_EXECD 0x00000002 +#define DISP_LOOKUP_NEG 0x00000004 +#define DISP_LOOKUP_POS 0x00000008 +#define DISP_OPEN_CREATE 0x00000010 +#define DISP_OPEN_OPEN 0x00000020 +#define DISP_ENQ_COMPLETE 0x00400000 /* obsolete and unused */ +#define DISP_ENQ_OPEN_REF 0x00800000 +#define DISP_ENQ_CREATE_REF 0x01000000 +#define DISP_OPEN_LOCK 0x02000000 +#define DISP_OPEN_LEASE 0x04000000 +#define DISP_OPEN_STRIPE 0x08000000 + +/* INODE LOCK PARTS */ +#define MDS_INODELOCK_LOOKUP 0x000001 /* For namespace, dentry etc, and also + * was used to protect permission (mode, + * owner, group etc) before 2.4. */ +#define MDS_INODELOCK_UPDATE 0x000002 /* size, links, timestamps */ +#define MDS_INODELOCK_OPEN 0x000004 /* For opened files */ +#define MDS_INODELOCK_LAYOUT 0x000008 /* for layout */ + +/* The PERM bit is added int 2.4, and it is used to protect permission(mode, + * owner, group, acl etc), so to separate the permission from LOOKUP lock. + * Because for remote directories(in DNE), these locks will be granted by + * different MDTs(different ldlm namespace). + * + * For local directory, MDT will always grant UPDATE_LOCK|PERM_LOCK together. + * For Remote directory, the master MDT, where the remote directory is, will + * grant UPDATE_LOCK|PERM_LOCK, and the remote MDT, where the name entry is, + * will grant LOOKUP_LOCK. */ +#define MDS_INODELOCK_PERM 0x000010 +#define MDS_INODELOCK_XATTR 0x000020 /* extended attributes */ + +#define MDS_INODELOCK_MAXSHIFT 5 +/* This FULL lock is useful to take on unlink sort of operations */ +#define MDS_INODELOCK_FULL ((1<<(MDS_INODELOCK_MAXSHIFT+1))-1) + +extern void lustre_swab_ll_fid (struct ll_fid *fid); + +/* NOTE: until Lustre 1.8.7/2.1.1 the fid_ver() was packed into name[2], + * but was moved into name[1] along with the OID to avoid consuming the + * name[2,3] fields that need to be used for the quota id (also a FID). */ +enum { + LUSTRE_RES_ID_SEQ_OFF = 0, + LUSTRE_RES_ID_VER_OID_OFF = 1, + LUSTRE_RES_ID_WAS_VER_OFF = 2, /* see note above */ + LUSTRE_RES_ID_QUOTA_SEQ_OFF = 2, + LUSTRE_RES_ID_QUOTA_VER_OID_OFF = 3, + LUSTRE_RES_ID_HSH_OFF = 3 +}; + +#define MDS_STATUS_CONN 1 +#define MDS_STATUS_LOV 2 + +/* mdt_thread_info.mti_flags. */ +enum md_op_flags { + /* The flag indicates Size-on-MDS attributes are changed. */ + MF_SOM_CHANGE = (1 << 0), + /* Flags indicates an epoch opens or closes. 
*/ + MF_EPOCH_OPEN = (1 << 1), + MF_EPOCH_CLOSE = (1 << 2), + MF_MDC_CANCEL_FID1 = (1 << 3), + MF_MDC_CANCEL_FID2 = (1 << 4), + MF_MDC_CANCEL_FID3 = (1 << 5), + MF_MDC_CANCEL_FID4 = (1 << 6), + /* There is a pending attribute update. */ + MF_SOM_AU = (1 << 7), + /* Cancel OST locks while getattr OST attributes. */ + MF_GETATTR_LOCK = (1 << 8), + MF_GET_MDT_IDX = (1 << 9), +}; + +#define MF_SOM_LOCAL_FLAGS (MF_SOM_CHANGE | MF_EPOCH_OPEN | MF_EPOCH_CLOSE) + +#define LUSTRE_BFLAG_UNCOMMITTED_WRITES 0x1 + +/* these should be identical to their EXT4_*_FL counterparts, they are + * redefined here only to avoid dragging in fs/ext4/ext4.h */ +#define LUSTRE_SYNC_FL 0x00000008 /* Synchronous updates */ +#define LUSTRE_IMMUTABLE_FL 0x00000010 /* Immutable file */ +#define LUSTRE_APPEND_FL 0x00000020 /* writes to file may only append */ +#define LUSTRE_NOATIME_FL 0x00000080 /* do not update atime */ +#define LUSTRE_DIRSYNC_FL 0x00010000 /* dirsync behaviour (dir only) */ + +/* Convert wire LUSTRE_*_FL to corresponding client local VFS S_* values + * for the client inode i_flags. The LUSTRE_*_FL are the Lustre wire + * protocol equivalents of LDISKFS_*_FL values stored on disk, while + * the S_* flags are kernel-internal values that change between kernel + * versions. These flags are set/cleared via FSFILT_IOC_{GET,SET}_FLAGS. + * See b=16526 for a full history. */ +static inline int ll_ext_to_inode_flags(int flags) +{ + return (((flags & LUSTRE_SYNC_FL) ? S_SYNC : 0) | + ((flags & LUSTRE_NOATIME_FL) ? S_NOATIME : 0) | + ((flags & LUSTRE_APPEND_FL) ? S_APPEND : 0) | +#if defined(S_DIRSYNC) + ((flags & LUSTRE_DIRSYNC_FL) ? S_DIRSYNC : 0) | +#endif + ((flags & LUSTRE_IMMUTABLE_FL) ? S_IMMUTABLE : 0)); +} + +static inline int ll_inode_to_ext_flags(int iflags) +{ + return (((iflags & S_SYNC) ? LUSTRE_SYNC_FL : 0) | + ((iflags & S_NOATIME) ? LUSTRE_NOATIME_FL : 0) | + ((iflags & S_APPEND) ? LUSTRE_APPEND_FL : 0) | +#if defined(S_DIRSYNC) + ((iflags & S_DIRSYNC) ? LUSTRE_DIRSYNC_FL : 0) | +#endif + ((iflags & S_IMMUTABLE) ? 
LUSTRE_IMMUTABLE_FL : 0)); +} + +/* 64 possible states */ +enum md_transient_state { + MS_RESTORE = (1 << 0), /* restore is running */ +}; + +struct mdt_body { + struct lu_fid fid1; + struct lu_fid fid2; + struct lustre_handle handle; + __u64 valid; + __u64 size; /* Offset, in the case of MDS_READPAGE */ + __s64 mtime; + __s64 atime; + __s64 ctime; + __u64 blocks; /* XID, in the case of MDS_READPAGE */ + __u64 ioepoch; + __u64 t_state; /* transient file state defined in + * enum md_transient_state + * was "ino" until 2.4.0 */ + __u32 fsuid; + __u32 fsgid; + __u32 capability; + __u32 mode; + __u32 uid; + __u32 gid; + __u32 flags; /* from vfs for pin/unpin, LUSTRE_BFLAG close */ + __u32 rdev; + __u32 nlink; /* #bytes to read in the case of MDS_READPAGE */ + __u32 unused2; /* was "generation" until 2.4.0 */ + __u32 suppgid; + __u32 eadatasize; + __u32 aclsize; + __u32 max_mdsize; + __u32 max_cookiesize; + __u32 uid_h; /* high 32-bits of uid, for FUID */ + __u32 gid_h; /* high 32-bits of gid, for FUID */ + __u32 padding_5; /* also fix lustre_swab_mdt_body */ + __u64 padding_6; + __u64 padding_7; + __u64 padding_8; + __u64 padding_9; + __u64 padding_10; +}; /* 216 */ + +extern void lustre_swab_mdt_body (struct mdt_body *b); + +struct mdt_ioepoch { + struct lustre_handle handle; + __u64 ioepoch; + __u32 flags; + __u32 padding; +}; + +extern void lustre_swab_mdt_ioepoch (struct mdt_ioepoch *b); + +/* permissions for md_perm.mp_perm */ +enum { + CFS_SETUID_PERM = 0x01, + CFS_SETGID_PERM = 0x02, + CFS_SETGRP_PERM = 0x04, + CFS_RMTACL_PERM = 0x08, + CFS_RMTOWN_PERM = 0x10 +}; + +/* inode access permission for remote user, the inode info are omitted, + * for client knows them. */ +struct mdt_remote_perm { + __u32 rp_uid; + __u32 rp_gid; + __u32 rp_fsuid; + __u32 rp_fsuid_h; + __u32 rp_fsgid; + __u32 rp_fsgid_h; + __u32 rp_access_perm; /* MAY_READ/WRITE/EXEC */ + __u32 rp_padding; +}; + +extern void lustre_swab_mdt_remote_perm(struct mdt_remote_perm *p); + +struct mdt_rec_setattr { + __u32 sa_opcode; + __u32 sa_cap; + __u32 sa_fsuid; + __u32 sa_fsuid_h; + __u32 sa_fsgid; + __u32 sa_fsgid_h; + __u32 sa_suppgid; + __u32 sa_suppgid_h; + __u32 sa_padding_1; + __u32 sa_padding_1_h; + struct lu_fid sa_fid; + __u64 sa_valid; + __u32 sa_uid; + __u32 sa_gid; + __u64 sa_size; + __u64 sa_blocks; + __s64 sa_mtime; + __s64 sa_atime; + __s64 sa_ctime; + __u32 sa_attr_flags; + __u32 sa_mode; + __u32 sa_bias; /* some operation flags */ + __u32 sa_padding_3; + __u32 sa_padding_4; + __u32 sa_padding_5; +}; + +extern void lustre_swab_mdt_rec_setattr (struct mdt_rec_setattr *sa); + +/* + * Attribute flags used in mdt_rec_setattr::sa_valid. + * The kernel's #defines for ATTR_* should not be used over the network + * since the client and MDS may run different kernels (see bug 13828) + * Therefore, we should only use MDS_ATTR_* attributes for sa_valid. 
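 *
 * Illustration (editor's sketch, not the actual MDC code): a client
 * translates the in-kernel iattr mask before filling sa_valid, e.g.
 *
 *	if (ia_valid & ATTR_MODE)
 *		sa_valid |= MDS_ATTR_MODE;
 *	if (ia_valid & ATTR_UID)
 *		sa_valid |= MDS_ATTR_UID;
 *
 * so the wire encoding stays stable across client/server kernel versions.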
+ */ +#define MDS_ATTR_MODE 0x1ULL /* = 1 */ +#define MDS_ATTR_UID 0x2ULL /* = 2 */ +#define MDS_ATTR_GID 0x4ULL /* = 4 */ +#define MDS_ATTR_SIZE 0x8ULL /* = 8 */ +#define MDS_ATTR_ATIME 0x10ULL /* = 16 */ +#define MDS_ATTR_MTIME 0x20ULL /* = 32 */ +#define MDS_ATTR_CTIME 0x40ULL /* = 64 */ +#define MDS_ATTR_ATIME_SET 0x80ULL /* = 128 */ +#define MDS_ATTR_MTIME_SET 0x100ULL /* = 256 */ +#define MDS_ATTR_FORCE 0x200ULL /* = 512, Not a change, but a change it */ +#define MDS_ATTR_ATTR_FLAG 0x400ULL /* = 1024 */ +#define MDS_ATTR_KILL_SUID 0x800ULL /* = 2048 */ +#define MDS_ATTR_KILL_SGID 0x1000ULL /* = 4096 */ +#define MDS_ATTR_CTIME_SET 0x2000ULL /* = 8192 */ +#define MDS_ATTR_FROM_OPEN 0x4000ULL /* = 16384, called from open path, ie O_TRUNC */ +#define MDS_ATTR_BLOCKS 0x8000ULL /* = 32768 */ + +#ifndef FMODE_READ +#define FMODE_READ 00000001 +#define FMODE_WRITE 00000002 +#endif + +#define MDS_FMODE_CLOSED 00000000 +#define MDS_FMODE_EXEC 00000004 +/* IO Epoch is opened on a closed file. */ +#define MDS_FMODE_EPOCH 01000000 +/* IO Epoch is opened on a file truncate. */ +#define MDS_FMODE_TRUNC 02000000 +/* Size-on-MDS Attribute Update is pending. */ +#define MDS_FMODE_SOM 04000000 + +#define MDS_OPEN_CREATED 00000010 +#define MDS_OPEN_CROSS 00000020 + +#define MDS_OPEN_CREAT 00000100 +#define MDS_OPEN_EXCL 00000200 +#define MDS_OPEN_TRUNC 00001000 +#define MDS_OPEN_APPEND 00002000 +#define MDS_OPEN_SYNC 00010000 +#define MDS_OPEN_DIRECTORY 00200000 + +#define MDS_OPEN_BY_FID 040000000 /* open_by_fid for known object */ +#define MDS_OPEN_DELAY_CREATE 0100000000 /* delay initial object create */ +#define MDS_OPEN_OWNEROVERRIDE 0200000000 /* NFSD rw-reopen ro file for owner */ +#define MDS_OPEN_JOIN_FILE 0400000000 /* open for join file. + * We do not support JOIN FILE + * anymore, reserve this flags + * just for preventing such bit + * to be reused. */ + +#define MDS_OPEN_LOCK 04000000000 /* This open requires open lock */ +#define MDS_OPEN_HAS_EA 010000000000 /* specify object create pattern */ +#define MDS_OPEN_HAS_OBJS 020000000000 /* Just set the EA the obj exist */ +#define MDS_OPEN_NORESTORE 0100000000000ULL /* Do not restore file at open */ +#define MDS_OPEN_NEWSTRIPE 0200000000000ULL /* New stripe needed (restripe or + * hsm restore) */ +#define MDS_OPEN_VOLATILE 0400000000000ULL /* File is volatile = created + unlinked */ +#define MDS_OPEN_LEASE 01000000000000ULL /* Open the file and grant lease + * delegation, succeed if it's not + * being opened with conflict mode. 
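 * (Editor's note: like the other MDS_OPEN_* values these are octal
 * constants; the full 64-bit open flag word travels in mdt_rec_create
 * through the cr_flags_l/cr_flags_h pair, set and read with the
 * set_mrc_cr_flags()/get_mrc_cr_flags() helpers defined further down.)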
+ */ +#define MDS_OPEN_RELEASE 02000000000000ULL /* Open the file for HSM release */ + +/* permission for create non-directory file */ +#define MAY_CREATE (1 << 7) +/* permission for create directory file */ +#define MAY_LINK (1 << 8) +/* permission for delete from the directory */ +#define MAY_UNLINK (1 << 9) +/* source's permission for rename */ +#define MAY_RENAME_SRC (1 << 10) +/* target's permission for rename */ +#define MAY_RENAME_TAR (1 << 11) +/* part (parent's) VTX permission check */ +#define MAY_VTX_PART (1 << 12) +/* full VTX permission check */ +#define MAY_VTX_FULL (1 << 13) +/* lfs rgetfacl permission check */ +#define MAY_RGETFACL (1 << 14) + +enum mds_op_bias { + MDS_CHECK_SPLIT = 1 << 0, + MDS_CROSS_REF = 1 << 1, + MDS_VTX_BYPASS = 1 << 2, + MDS_PERM_BYPASS = 1 << 3, + MDS_SOM = 1 << 4, + MDS_QUOTA_IGNORE = 1 << 5, + MDS_CLOSE_CLEANUP = 1 << 6, + MDS_KEEP_ORPHAN = 1 << 7, + MDS_RECOV_OPEN = 1 << 8, + MDS_DATA_MODIFIED = 1 << 9, + MDS_CREATE_VOLATILE = 1 << 10, + MDS_OWNEROVERRIDE = 1 << 11, + MDS_HSM_RELEASE = 1 << 12, +}; + +/* instance of mdt_reint_rec */ +struct mdt_rec_create { + __u32 cr_opcode; + __u32 cr_cap; + __u32 cr_fsuid; + __u32 cr_fsuid_h; + __u32 cr_fsgid; + __u32 cr_fsgid_h; + __u32 cr_suppgid1; + __u32 cr_suppgid1_h; + __u32 cr_suppgid2; + __u32 cr_suppgid2_h; + struct lu_fid cr_fid1; + struct lu_fid cr_fid2; + struct lustre_handle cr_old_handle; /* handle in case of open replay */ + __s64 cr_time; + __u64 cr_rdev; + __u64 cr_ioepoch; + __u64 cr_padding_1; /* rr_blocks */ + __u32 cr_mode; + __u32 cr_bias; + /* use of helpers set/get_mrc_cr_flags() is needed to access + * 64 bits cr_flags [cr_flags_l, cr_flags_h], this is done to + * extend cr_flags size without breaking 1.8 compat */ + __u32 cr_flags_l; /* for use with open, low 32 bits */ + __u32 cr_flags_h; /* for use with open, high 32 bits */ + __u32 cr_umask; /* umask for create */ + __u32 cr_padding_4; /* rr_padding_4 */ +}; + +static inline void set_mrc_cr_flags(struct mdt_rec_create *mrc, __u64 flags) +{ + mrc->cr_flags_l = (__u32)(flags & 0xFFFFFFFFUll); + mrc->cr_flags_h = (__u32)(flags >> 32); +} + +static inline __u64 get_mrc_cr_flags(struct mdt_rec_create *mrc) +{ + return ((__u64)(mrc->cr_flags_l) | ((__u64)mrc->cr_flags_h << 32)); +} + +/* instance of mdt_reint_rec */ +struct mdt_rec_link { + __u32 lk_opcode; + __u32 lk_cap; + __u32 lk_fsuid; + __u32 lk_fsuid_h; + __u32 lk_fsgid; + __u32 lk_fsgid_h; + __u32 lk_suppgid1; + __u32 lk_suppgid1_h; + __u32 lk_suppgid2; + __u32 lk_suppgid2_h; + struct lu_fid lk_fid1; + struct lu_fid lk_fid2; + __s64 lk_time; + __u64 lk_padding_1; /* rr_atime */ + __u64 lk_padding_2; /* rr_ctime */ + __u64 lk_padding_3; /* rr_size */ + __u64 lk_padding_4; /* rr_blocks */ + __u32 lk_bias; + __u32 lk_padding_5; /* rr_mode */ + __u32 lk_padding_6; /* rr_flags */ + __u32 lk_padding_7; /* rr_padding_2 */ + __u32 lk_padding_8; /* rr_padding_3 */ + __u32 lk_padding_9; /* rr_padding_4 */ +}; + +/* instance of mdt_reint_rec */ +struct mdt_rec_unlink { + __u32 ul_opcode; + __u32 ul_cap; + __u32 ul_fsuid; + __u32 ul_fsuid_h; + __u32 ul_fsgid; + __u32 ul_fsgid_h; + __u32 ul_suppgid1; + __u32 ul_suppgid1_h; + __u32 ul_suppgid2; + __u32 ul_suppgid2_h; + struct lu_fid ul_fid1; + struct lu_fid ul_fid2; + __s64 ul_time; + __u64 ul_padding_2; /* rr_atime */ + __u64 ul_padding_3; /* rr_ctime */ + __u64 ul_padding_4; /* rr_size */ + __u64 ul_padding_5; /* rr_blocks */ + __u32 ul_bias; + __u32 ul_mode; + __u32 ul_padding_6; /* rr_flags */ + __u32 ul_padding_7; /* rr_padding_2 */ + 
__u32 ul_padding_8; /* rr_padding_3 */ + __u32 ul_padding_9; /* rr_padding_4 */ +}; + +/* instance of mdt_reint_rec */ +struct mdt_rec_rename { + __u32 rn_opcode; + __u32 rn_cap; + __u32 rn_fsuid; + __u32 rn_fsuid_h; + __u32 rn_fsgid; + __u32 rn_fsgid_h; + __u32 rn_suppgid1; + __u32 rn_suppgid1_h; + __u32 rn_suppgid2; + __u32 rn_suppgid2_h; + struct lu_fid rn_fid1; + struct lu_fid rn_fid2; + __s64 rn_time; + __u64 rn_padding_1; /* rr_atime */ + __u64 rn_padding_2; /* rr_ctime */ + __u64 rn_padding_3; /* rr_size */ + __u64 rn_padding_4; /* rr_blocks */ + __u32 rn_bias; /* some operation flags */ + __u32 rn_mode; /* cross-ref rename has mode */ + __u32 rn_padding_5; /* rr_flags */ + __u32 rn_padding_6; /* rr_padding_2 */ + __u32 rn_padding_7; /* rr_padding_3 */ + __u32 rn_padding_8; /* rr_padding_4 */ +}; + +/* instance of mdt_reint_rec */ +struct mdt_rec_setxattr { + __u32 sx_opcode; + __u32 sx_cap; + __u32 sx_fsuid; + __u32 sx_fsuid_h; + __u32 sx_fsgid; + __u32 sx_fsgid_h; + __u32 sx_suppgid1; + __u32 sx_suppgid1_h; + __u32 sx_suppgid2; + __u32 sx_suppgid2_h; + struct lu_fid sx_fid; + __u64 sx_padding_1; /* These three are rr_fid2 */ + __u32 sx_padding_2; + __u32 sx_padding_3; + __u64 sx_valid; + __s64 sx_time; + __u64 sx_padding_5; /* rr_ctime */ + __u64 sx_padding_6; /* rr_size */ + __u64 sx_padding_7; /* rr_blocks */ + __u32 sx_size; + __u32 sx_flags; + __u32 sx_padding_8; /* rr_flags */ + __u32 sx_padding_9; /* rr_padding_2 */ + __u32 sx_padding_10; /* rr_padding_3 */ + __u32 sx_padding_11; /* rr_padding_4 */ +}; + +/* + * mdt_rec_reint is the template for all mdt_reint_xxx structures. + * Do NOT change the size of various members, otherwise the value + * will be broken in lustre_swab_mdt_rec_reint(). + * + * If you add new members in other mdt_reint_xxx structures and need to use the + * rr_padding_x fields, then update lustre_swab_mdt_rec_reint() also. 
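 *
 * Editor's sketch (not from the imported code): the size invariant can be
 * enforced at build time with checks along the lines of
 *
 *	BUILD_BUG_ON(sizeof(struct mdt_rec_create) !=
 *		     sizeof(struct mdt_rec_reint));
 *
 * repeated for each mdt_rec_* variant, since they are all swabbed through
 * the common mdt_rec_reint layout.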
+ */ +struct mdt_rec_reint { + __u32 rr_opcode; + __u32 rr_cap; + __u32 rr_fsuid; + __u32 rr_fsuid_h; + __u32 rr_fsgid; + __u32 rr_fsgid_h; + __u32 rr_suppgid1; + __u32 rr_suppgid1_h; + __u32 rr_suppgid2; + __u32 rr_suppgid2_h; + struct lu_fid rr_fid1; + struct lu_fid rr_fid2; + __s64 rr_mtime; + __s64 rr_atime; + __s64 rr_ctime; + __u64 rr_size; + __u64 rr_blocks; + __u32 rr_bias; + __u32 rr_mode; + __u32 rr_flags; + __u32 rr_flags_h; + __u32 rr_umask; + __u32 rr_padding_4; /* also fix lustre_swab_mdt_rec_reint */ +}; + +extern void lustre_swab_mdt_rec_reint(struct mdt_rec_reint *rr); + +struct lmv_desc { + __u32 ld_tgt_count; /* how many MDS's */ + __u32 ld_active_tgt_count; /* how many active */ + __u32 ld_default_stripe_count; /* how many objects are used */ + __u32 ld_pattern; /* default MEA_MAGIC_* */ + __u64 ld_default_hash_size; + __u64 ld_padding_1; /* also fix lustre_swab_lmv_desc */ + __u32 ld_padding_2; /* also fix lustre_swab_lmv_desc */ + __u32 ld_qos_maxage; /* in second */ + __u32 ld_padding_3; /* also fix lustre_swab_lmv_desc */ + __u32 ld_padding_4; /* also fix lustre_swab_lmv_desc */ + struct obd_uuid ld_uuid; +}; + +extern void lustre_swab_lmv_desc (struct lmv_desc *ld); + +/* TODO: lmv_stripe_md should contain mds capabilities for all slave fids */ +struct lmv_stripe_md { + __u32 mea_magic; + __u32 mea_count; + __u32 mea_master; + __u32 mea_padding; + char mea_pool_name[LOV_MAXPOOLNAME]; + struct lu_fid mea_ids[0]; +}; + +extern void lustre_swab_lmv_stripe_md(struct lmv_stripe_md *mea); + +/* lmv structures */ +#define MEA_MAGIC_LAST_CHAR 0xb2221ca1 +#define MEA_MAGIC_ALL_CHARS 0xb222a11c +#define MEA_MAGIC_HASH_SEGMENT 0xb222a11b + +#define MAX_HASH_SIZE_32 0x7fffffffUL +#define MAX_HASH_SIZE 0x7fffffffffffffffULL +#define MAX_HASH_HIGHEST_BIT 0x1000000000000000ULL + +enum fld_rpc_opc { + FLD_QUERY = 900, + FLD_LAST_OPC, + FLD_FIRST_OPC = FLD_QUERY +}; + +enum seq_rpc_opc { + SEQ_QUERY = 700, + SEQ_LAST_OPC, + SEQ_FIRST_OPC = SEQ_QUERY +}; + +enum seq_op { + SEQ_ALLOC_SUPER = 0, + SEQ_ALLOC_META = 1 +}; + +/* + * LOV data structures + */ + +#define LOV_MAX_UUID_BUFFER_SIZE 8192 +/* The size of the buffer the lov/mdc reserves for the + * array of UUIDs returned by the MDS. 
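 * (Editor's note, assuming the usual 40-byte struct obd_uuid: 8192 bytes
 * holds roughly 8192 / 40 ~ 200 packed UUIDs.)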
With the current + * protocol, this will limit the max number of OSTs per LOV */ + +#define LOV_DESC_MAGIC 0xB0CCDE5C +#define LOV_DESC_QOS_MAXAGE_DEFAULT 5 /* Seconds */ +#define LOV_DESC_STRIPE_SIZE_DEFAULT (1 << LNET_MTU_BITS) + +/* LOV settings descriptor (should only contain static info) */ +struct lov_desc { + __u32 ld_tgt_count; /* how many OBD's */ + __u32 ld_active_tgt_count; /* how many active */ + __u32 ld_default_stripe_count; /* how many objects are used */ + __u32 ld_pattern; /* default PATTERN_RAID0 */ + __u64 ld_default_stripe_size; /* in bytes */ + __u64 ld_default_stripe_offset; /* in bytes */ + __u32 ld_padding_0; /* unused */ + __u32 ld_qos_maxage; /* in second */ + __u32 ld_padding_1; /* also fix lustre_swab_lov_desc */ + __u32 ld_padding_2; /* also fix lustre_swab_lov_desc */ + struct obd_uuid ld_uuid; +}; + +#define ld_magic ld_active_tgt_count /* for swabbing from llogs */ + +extern void lustre_swab_lov_desc (struct lov_desc *ld); + +/* + * LDLM requests: + */ +/* opcodes -- MUST be distinct from OST/MDS opcodes */ +typedef enum { + LDLM_ENQUEUE = 101, + LDLM_CONVERT = 102, + LDLM_CANCEL = 103, + LDLM_BL_CALLBACK = 104, + LDLM_CP_CALLBACK = 105, + LDLM_GL_CALLBACK = 106, + LDLM_SET_INFO = 107, + LDLM_LAST_OPC +} ldlm_cmd_t; +#define LDLM_FIRST_OPC LDLM_ENQUEUE + +#define RES_NAME_SIZE 4 +struct ldlm_res_id { + __u64 name[RES_NAME_SIZE]; +}; + +#define DLDLMRES "[%#llx:%#llx:%#llx].%llx" +#define PLDLMRES(res) (res)->lr_name.name[0], (res)->lr_name.name[1], \ + (res)->lr_name.name[2], (res)->lr_name.name[3] + +extern void lustre_swab_ldlm_res_id (struct ldlm_res_id *id); + +static inline int ldlm_res_eq(const struct ldlm_res_id *res0, + const struct ldlm_res_id *res1) +{ + return !memcmp(res0, res1, sizeof(*res0)); +} + +/* lock types */ +typedef enum { + LCK_MINMODE = 0, + LCK_EX = 1, + LCK_PW = 2, + LCK_PR = 4, + LCK_CW = 8, + LCK_CR = 16, + LCK_NL = 32, + LCK_GROUP = 64, + LCK_COS = 128, + LCK_MAXMODE +} ldlm_mode_t; + +#define LCK_MODE_NUM 8 + +typedef enum { + LDLM_PLAIN = 10, + LDLM_EXTENT = 11, + LDLM_FLOCK = 12, + LDLM_IBITS = 13, + LDLM_MAX_TYPE +} ldlm_type_t; + +#define LDLM_MIN_TYPE LDLM_PLAIN + +struct ldlm_extent { + __u64 start; + __u64 end; + __u64 gid; +}; + +static inline int ldlm_extent_overlap(struct ldlm_extent *ex1, + struct ldlm_extent *ex2) +{ + return (ex1->start <= ex2->end) && (ex2->start <= ex1->end); +} + +/* check if @ex1 contains @ex2 */ +static inline int ldlm_extent_contain(struct ldlm_extent *ex1, + struct ldlm_extent *ex2) +{ + return (ex1->start <= ex2->start) && (ex1->end >= ex2->end); +} + +struct ldlm_inodebits { + __u64 bits; +}; + +struct ldlm_flock_wire { + __u64 lfw_start; + __u64 lfw_end; + __u64 lfw_owner; + __u32 lfw_padding; + __u32 lfw_pid; +}; + +/* it's important that the fields of the ldlm_extent structure match + * the first fields of the ldlm_flock structure because there is only + * one ldlm_swab routine to process the ldlm_policy_data_t union. if + * this ever changes we will need to swab the union differently based + * on the resource type. 
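 *
 * Editor's sketch (not from the imported code): the layout assumption can
 * be written down as
 *
 *	BUILD_BUG_ON(offsetof(struct ldlm_flock_wire, lfw_owner) !=
 *		     offsetof(struct ldlm_extent, gid));
 *
 * i.e. the three leading __u64 members of both structures line up.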
*/ + +typedef union { + struct ldlm_extent l_extent; + struct ldlm_flock_wire l_flock; + struct ldlm_inodebits l_inodebits; +} ldlm_wire_policy_data_t; + +extern void lustre_swab_ldlm_policy_data (ldlm_wire_policy_data_t *d); + +union ldlm_gl_desc { + struct ldlm_gl_lquota_desc lquota_desc; +}; + +extern void lustre_swab_gl_desc(union ldlm_gl_desc *); + +struct ldlm_intent { + __u64 opc; +}; + +extern void lustre_swab_ldlm_intent (struct ldlm_intent *i); + +struct ldlm_resource_desc { + ldlm_type_t lr_type; + __u32 lr_padding; /* also fix lustre_swab_ldlm_resource_desc */ + struct ldlm_res_id lr_name; +}; + +extern void lustre_swab_ldlm_resource_desc (struct ldlm_resource_desc *r); + +struct ldlm_lock_desc { + struct ldlm_resource_desc l_resource; + ldlm_mode_t l_req_mode; + ldlm_mode_t l_granted_mode; + ldlm_wire_policy_data_t l_policy_data; +}; + +extern void lustre_swab_ldlm_lock_desc (struct ldlm_lock_desc *l); + +#define LDLM_LOCKREQ_HANDLES 2 +#define LDLM_ENQUEUE_CANCEL_OFF 1 + +struct ldlm_request { + __u32 lock_flags; + __u32 lock_count; + struct ldlm_lock_desc lock_desc; + struct lustre_handle lock_handle[LDLM_LOCKREQ_HANDLES]; +}; + +extern void lustre_swab_ldlm_request (struct ldlm_request *rq); + +/* If LDLM_ENQUEUE, 1 slot is already occupied, 1 is available. + * Otherwise, 2 are available. */ +#define ldlm_request_bufsize(count, type) \ +({ \ + int _avail = LDLM_LOCKREQ_HANDLES; \ + _avail -= (type == LDLM_ENQUEUE ? LDLM_ENQUEUE_CANCEL_OFF : 0); \ + sizeof(struct ldlm_request) + \ + (count > _avail ? count - _avail : 0) * \ + sizeof(struct lustre_handle); \ +}) + +struct ldlm_reply { + __u32 lock_flags; + __u32 lock_padding; /* also fix lustre_swab_ldlm_reply */ + struct ldlm_lock_desc lock_desc; + struct lustre_handle lock_handle; + __u64 lock_policy_res1; + __u64 lock_policy_res2; +}; + +extern void lustre_swab_ldlm_reply (struct ldlm_reply *r); + +#define ldlm_flags_to_wire(flags) ((__u32)(flags)) +#define ldlm_flags_from_wire(flags) ((__u64)(flags)) + +/* + * Opcodes for mountconf (mgs and mgc) + */ +typedef enum { + MGS_CONNECT = 250, + MGS_DISCONNECT, + MGS_EXCEPTION, /* node died, etc. */ + MGS_TARGET_REG, /* whenever target starts up */ + MGS_TARGET_DEL, + MGS_SET_INFO, + MGS_CONFIG_READ, + MGS_LAST_OPC +} mgs_cmd_t; +#define MGS_FIRST_OPC MGS_CONNECT + +#define MGS_PARAM_MAXLEN 1024 +#define KEY_SET_INFO "set_info" + +struct mgs_send_param { + char mgs_param[MGS_PARAM_MAXLEN]; +}; + +/* We pass this info to the MGS so it can write config logs */ +#define MTI_NAME_MAXLEN 64 +#define MTI_PARAM_MAXLEN 4096 +#define MTI_NIDS_MAX 32 +struct mgs_target_info { + __u32 mti_lustre_ver; + __u32 mti_stripe_index; + __u32 mti_config_ver; + __u32 mti_flags; + __u32 mti_nid_count; + __u32 mti_instance; /* Running instance of target */ + char mti_fsname[MTI_NAME_MAXLEN]; + char mti_svname[MTI_NAME_MAXLEN]; + char mti_uuid[sizeof(struct obd_uuid)]; + __u64 mti_nids[MTI_NIDS_MAX]; /* host nids (lnet_nid_t)*/ + char mti_params[MTI_PARAM_MAXLEN]; +}; +extern void lustre_swab_mgs_target_info(struct mgs_target_info *oinfo); + +struct mgs_nidtbl_entry { + __u64 mne_version; /* table version of this entry */ + __u32 mne_instance; /* target instance # */ + __u32 mne_index; /* target index */ + __u32 mne_length; /* length of this entry - by bytes */ + __u8 mne_type; /* target type LDD_F_SV_TYPE_OST/MDT */ + __u8 mne_nid_type; /* type of nid(mbz). for ipv6. 
*/ + __u8 mne_nid_size; /* size of each NID, by bytes */ + __u8 mne_nid_count; /* # of NIDs in buffer */ + union { + lnet_nid_t nids[0]; /* variable size buffer for NIDs. */ + } u; +}; +extern void lustre_swab_mgs_nidtbl_entry(struct mgs_nidtbl_entry *oinfo); + +struct mgs_config_body { + char mcb_name[MTI_NAME_MAXLEN]; /* logname */ + __u64 mcb_offset; /* next index of config log to request */ + __u16 mcb_type; /* type of log: CONFIG_T_[CONFIG|RECOVER] */ + __u8 mcb_reserved; + __u8 mcb_bits; /* bits unit size of config log */ + __u32 mcb_units; /* # of units for bulk transfer */ +}; +extern void lustre_swab_mgs_config_body(struct mgs_config_body *body); + +struct mgs_config_res { + __u64 mcr_offset; /* index of last config log */ + __u64 mcr_size; /* size of the log */ +}; +extern void lustre_swab_mgs_config_res(struct mgs_config_res *body); + +/* Config marker flags (in config log) */ +#define CM_START 0x01 +#define CM_END 0x02 +#define CM_SKIP 0x04 +#define CM_UPGRADE146 0x08 +#define CM_EXCLUDE 0x10 +#define CM_START_SKIP (CM_START | CM_SKIP) + +struct cfg_marker { + __u32 cm_step; /* aka config version */ + __u32 cm_flags; + __u32 cm_vers; /* lustre release version number */ + __u32 cm_padding; /* 64 bit align */ + __s64 cm_createtime; /*when this record was first created */ + __s64 cm_canceltime; /*when this record is no longer valid*/ + char cm_tgtname[MTI_NAME_MAXLEN]; + char cm_comment[MTI_NAME_MAXLEN]; +}; + +extern void lustre_swab_cfg_marker(struct cfg_marker *marker, + int swab, int size); + +/* + * Opcodes for multiple servers. + */ + +typedef enum { + OBD_PING = 400, + OBD_LOG_CANCEL, + OBD_QC_CALLBACK, + OBD_IDX_READ, + OBD_LAST_OPC +} obd_cmd_t; +#define OBD_FIRST_OPC OBD_PING + +/* catalog of log objects */ + +/** Identifier for a single log object */ +struct llog_logid { + struct ost_id lgl_oi; + __u32 lgl_ogen; +} __attribute__((packed)); + +/** Records written to the CATALOGS list */ +#define CATLIST "CATALOGS" +struct llog_catid { + struct llog_logid lci_logid; + __u32 lci_padding1; + __u32 lci_padding2; + __u32 lci_padding3; +} __attribute__((packed)); + +/* Log data record types - there is no specific reason that these need to + * be related to the RPC opcodes, but no reason not to (may be handy later?) + */ +#define LLOG_OP_MAGIC 0x10600000 +#define LLOG_OP_MASK 0xfff00000 + +typedef enum { + LLOG_PAD_MAGIC = LLOG_OP_MAGIC | 0x00000, + OST_SZ_REC = LLOG_OP_MAGIC | 0x00f00, + /* OST_RAID1_REC = LLOG_OP_MAGIC | 0x01000, never used */ + MDS_UNLINK_REC = LLOG_OP_MAGIC | 0x10000 | (MDS_REINT << 8) | + REINT_UNLINK, /* obsolete after 2.5.0 */ + MDS_UNLINK64_REC = LLOG_OP_MAGIC | 0x90000 | (MDS_REINT << 8) | + REINT_UNLINK, + /* MDS_SETATTR_REC = LLOG_OP_MAGIC | 0x12401, obsolete 1.8.0 */ + MDS_SETATTR64_REC = LLOG_OP_MAGIC | 0x90000 | (MDS_REINT << 8) | + REINT_SETATTR, + OBD_CFG_REC = LLOG_OP_MAGIC | 0x20000, + /* PTL_CFG_REC = LLOG_OP_MAGIC | 0x30000, obsolete 1.4.0 */ + LLOG_GEN_REC = LLOG_OP_MAGIC | 0x40000, + /* LLOG_JOIN_REC = LLOG_OP_MAGIC | 0x50000, obsolete 1.8.0 */ + CHANGELOG_REC = LLOG_OP_MAGIC | 0x60000, + CHANGELOG_USER_REC = LLOG_OP_MAGIC | 0x70000, + HSM_AGENT_REC = LLOG_OP_MAGIC | 0x80000, + LLOG_HDR_MAGIC = LLOG_OP_MAGIC | 0x45539, + LLOG_LOGID_MAGIC = LLOG_OP_MAGIC | 0x4553b, +} llog_op_type; + +#define LLOG_REC_HDR_NEEDS_SWABBING(r) \ + (((r)->lrh_type & __swab32(LLOG_OP_MASK)) == __swab32(LLOG_OP_MAGIC)) + +/** Log record header - stored in little endian order. 
+ * Each record must start with this struct, end with a llog_rec_tail, + * and be a multiple of 256 bits in size. + */ +struct llog_rec_hdr { + __u32 lrh_len; + __u32 lrh_index; + __u32 lrh_type; + __u32 lrh_id; +}; + +struct llog_rec_tail { + __u32 lrt_len; + __u32 lrt_index; +}; + +/* Where data follow just after header */ +#define REC_DATA(ptr) \ + ((void *)((char *)ptr + sizeof(struct llog_rec_hdr))) + +#define REC_DATA_LEN(rec) \ + (rec->lrh_len - sizeof(struct llog_rec_hdr) - \ + sizeof(struct llog_rec_tail)) + +struct llog_logid_rec { + struct llog_rec_hdr lid_hdr; + struct llog_logid lid_id; + __u32 lid_padding1; + __u64 lid_padding2; + __u64 lid_padding3; + struct llog_rec_tail lid_tail; +} __attribute__((packed)); + +struct llog_unlink_rec { + struct llog_rec_hdr lur_hdr; + __u64 lur_oid; + __u32 lur_oseq; + __u32 lur_count; + struct llog_rec_tail lur_tail; +} __attribute__((packed)); + +struct llog_unlink64_rec { + struct llog_rec_hdr lur_hdr; + struct lu_fid lur_fid; + __u32 lur_count; /* to destroy the lost precreated */ + __u32 lur_padding1; + __u64 lur_padding2; + __u64 lur_padding3; + struct llog_rec_tail lur_tail; +} __attribute__((packed)); + +struct llog_setattr64_rec { + struct llog_rec_hdr lsr_hdr; + struct ost_id lsr_oi; + __u32 lsr_uid; + __u32 lsr_uid_h; + __u32 lsr_gid; + __u32 lsr_gid_h; + __u64 lsr_padding; + struct llog_rec_tail lsr_tail; +} __attribute__((packed)); + +struct llog_size_change_rec { + struct llog_rec_hdr lsc_hdr; + struct ll_fid lsc_fid; + __u32 lsc_ioepoch; + __u32 lsc_padding1; + __u64 lsc_padding2; + __u64 lsc_padding3; + struct llog_rec_tail lsc_tail; +} __attribute__((packed)); + +#define CHANGELOG_MAGIC 0xca103000 + +/** \a changelog_rec_type's that can't be masked */ +#define CHANGELOG_MINMASK (1 << CL_MARK) +/** bits covering all \a changelog_rec_type's */ +#define CHANGELOG_ALLMASK 0XFFFFFFFF +/** default \a changelog_rec_type mask */ +#define CHANGELOG_DEFMASK CHANGELOG_ALLMASK & ~(1 << CL_ATIME | 1 << CL_CLOSE) + +/* changelog llog name, needed by client replicators */ +#define CHANGELOG_CATALOG "changelog_catalog" + +struct changelog_setinfo { + __u64 cs_recno; + __u32 cs_id; +} __attribute__((packed)); + +/** changelog record */ +struct llog_changelog_rec { + struct llog_rec_hdr cr_hdr; + struct changelog_rec cr; + struct llog_rec_tail cr_tail; /**< for_sizezof_only */ +} __attribute__((packed)); + +struct llog_changelog_ext_rec { + struct llog_rec_hdr cr_hdr; + struct changelog_ext_rec cr; + struct llog_rec_tail cr_tail; /**< for_sizezof_only */ +} __attribute__((packed)); + +#define CHANGELOG_USER_PREFIX "cl" + +struct llog_changelog_user_rec { + struct llog_rec_hdr cur_hdr; + __u32 cur_id; + __u32 cur_padding; + __u64 cur_endrec; + struct llog_rec_tail cur_tail; +} __attribute__((packed)); + +enum agent_req_status { + ARS_WAITING, + ARS_STARTED, + ARS_FAILED, + ARS_CANCELED, + ARS_SUCCEED, +}; + +static inline char *agent_req_status2name(enum agent_req_status ars) +{ + switch (ars) { + case ARS_WAITING: + return "WAITING"; + case ARS_STARTED: + return "STARTED"; + case ARS_FAILED: + return "FAILED"; + case ARS_CANCELED: + return "CANCELED"; + case ARS_SUCCEED: + return "SUCCEED"; + default: + return "UNKNOWN"; + } +} + +static inline bool agent_req_in_final_state(enum agent_req_status ars) +{ + return ((ars == ARS_SUCCEED) || (ars == ARS_FAILED) || + (ars == ARS_CANCELED)); +} + +struct llog_agent_req_rec { + struct llog_rec_hdr arr_hdr; /**< record header */ + __u32 arr_status; /**< status of the request */ + /* must match enum + 
* agent_req_status */ + __u32 arr_archive_id; /**< backend archive number */ + __u64 arr_flags; /**< req flags */ + __u64 arr_compound_id; /**< compound cookie */ + __u64 arr_req_create; /**< req. creation time */ + __u64 arr_req_change; /**< req. status change time */ + struct hsm_action_item arr_hai; /**< req. to the agent */ + struct llog_rec_tail arr_tail; /**< record tail for_sizezof_only */ +} __attribute__((packed)); + +/* Old llog gen for compatibility */ +struct llog_gen { + __u64 mnt_cnt; + __u64 conn_cnt; +} __attribute__((packed)); + +struct llog_gen_rec { + struct llog_rec_hdr lgr_hdr; + struct llog_gen lgr_gen; + __u64 padding1; + __u64 padding2; + __u64 padding3; + struct llog_rec_tail lgr_tail; +}; + +/* On-disk header structure of each log object, stored in little endian order */ +#define LLOG_CHUNK_SIZE 8192 +#define LLOG_HEADER_SIZE (96) +#define LLOG_BITMAP_BYTES (LLOG_CHUNK_SIZE - LLOG_HEADER_SIZE) + +#define LLOG_MIN_REC_SIZE (24) /* round(llog_rec_hdr + llog_rec_tail) */ + +/* flags for the logs */ +enum llog_flag { + LLOG_F_ZAP_WHEN_EMPTY = 0x1, + LLOG_F_IS_CAT = 0x2, + LLOG_F_IS_PLAIN = 0x4, +}; + +struct llog_log_hdr { + struct llog_rec_hdr llh_hdr; + __s64 llh_timestamp; + __u32 llh_count; + __u32 llh_bitmap_offset; + __u32 llh_size; + __u32 llh_flags; + __u32 llh_cat_idx; + /* for a catalog the first plain slot is next to it */ + struct obd_uuid llh_tgtuuid; + __u32 llh_reserved[LLOG_HEADER_SIZE/sizeof(__u32) - 23]; + __u32 llh_bitmap[LLOG_BITMAP_BYTES/sizeof(__u32)]; + struct llog_rec_tail llh_tail; +} __attribute__((packed)); + +#define LLOG_BITMAP_SIZE(llh) (__u32)((llh->llh_hdr.lrh_len - \ + llh->llh_bitmap_offset - \ + sizeof(llh->llh_tail)) * 8) + +/** log cookies are used to reference a specific log file and a record therein */ +struct llog_cookie { + struct llog_logid lgc_lgl; + __u32 lgc_subsys; + __u32 lgc_index; + __u32 lgc_padding; +} __attribute__((packed)); + +/** llog protocol */ +enum llogd_rpc_ops { + LLOG_ORIGIN_HANDLE_CREATE = 501, + LLOG_ORIGIN_HANDLE_NEXT_BLOCK = 502, + LLOG_ORIGIN_HANDLE_READ_HEADER = 503, + LLOG_ORIGIN_HANDLE_WRITE_REC = 504, + LLOG_ORIGIN_HANDLE_CLOSE = 505, + LLOG_ORIGIN_CONNECT = 506, + LLOG_CATINFO = 507, /* deprecated */ + LLOG_ORIGIN_HANDLE_PREV_BLOCK = 508, + LLOG_ORIGIN_HANDLE_DESTROY = 509, /* for destroy llog object*/ + LLOG_LAST_OPC, + LLOG_FIRST_OPC = LLOG_ORIGIN_HANDLE_CREATE +}; + +struct llogd_body { + struct llog_logid lgd_logid; + __u32 lgd_ctxt_idx; + __u32 lgd_llh_flags; + __u32 lgd_index; + __u32 lgd_saved_index; + __u32 lgd_len; + __u64 lgd_cur_offset; +} __attribute__((packed)); + +struct llogd_conn_body { + struct llog_gen lgdc_gen; + struct llog_logid lgdc_logid; + __u32 lgdc_ctxt_idx; +} __attribute__((packed)); + +/* Note: 64-bit types are 64-bit aligned in structure */ +struct obdo { + __u64 o_valid; /* hot fields in this obdo */ + struct ost_id o_oi; + __u64 o_parent_seq; + __u64 o_size; /* o_size-o_blocks == ost_lvb */ + __s64 o_mtime; + __s64 o_atime; + __s64 o_ctime; + __u64 o_blocks; /* brw: cli sent cached bytes */ + __u64 o_grant; + + /* 32-bit fields start here: keep an even number of them via padding */ + __u32 o_blksize; /* optimal IO blocksize */ + __u32 o_mode; /* brw: cli sent cache remain */ + __u32 o_uid; + __u32 o_gid; + __u32 o_flags; + __u32 o_nlink; /* brw: checksum */ + __u32 o_parent_oid; + __u32 o_misc; /* brw: o_dropped */ + + __u64 o_ioepoch; /* epoch in ost writes */ + __u32 o_stripe_idx; /* holds stripe idx */ + __u32 o_parent_ver; + struct lustre_handle o_handle; /* 
brw: lock handle to prolong + * locks */ + struct llog_cookie o_lcookie; /* destroy: unlink cookie from + * MDS */ + __u32 o_uid_h; + __u32 o_gid_h; + + __u64 o_data_version; /* getattr: sum of iversion for + * each stripe. + * brw: grant space consumed on + * the client for the write */ + __u64 o_padding_4; + __u64 o_padding_5; + __u64 o_padding_6; +}; + +#define o_dirty o_blocks +#define o_undirty o_mode +#define o_dropped o_misc +#define o_cksum o_nlink +#define o_grant_used o_data_version + +static inline void lustre_set_wire_obdo(struct obd_connect_data *ocd, + struct obdo *wobdo, + const struct obdo *lobdo) +{ + *wobdo = *lobdo; + wobdo->o_flags &= ~OBD_FL_LOCAL_MASK; + if (ocd == NULL) + return; + + if (unlikely(!(ocd->ocd_connect_flags & OBD_CONNECT_FID)) && + fid_seq_is_echo(ostid_seq(&lobdo->o_oi))) { + /* Currently OBD_FL_OSTID will only be used when 2.4 echo + * client communicate with pre-2.4 server */ + wobdo->o_oi.oi.oi_id = fid_oid(&lobdo->o_oi.oi_fid); + wobdo->o_oi.oi.oi_seq = fid_seq(&lobdo->o_oi.oi_fid); + } +} + +static inline void lustre_get_wire_obdo(struct obd_connect_data *ocd, + struct obdo *lobdo, + const struct obdo *wobdo) +{ + __u32 local_flags = 0; + + if (lobdo->o_valid & OBD_MD_FLFLAGS) + local_flags = lobdo->o_flags & OBD_FL_LOCAL_MASK; + + *lobdo = *wobdo; + if (local_flags != 0) { + lobdo->o_valid |= OBD_MD_FLFLAGS; + lobdo->o_flags &= ~OBD_FL_LOCAL_MASK; + lobdo->o_flags |= local_flags; + } + if (ocd == NULL) + return; + + if (unlikely(!(ocd->ocd_connect_flags & OBD_CONNECT_FID)) && + fid_seq_is_echo(wobdo->o_oi.oi.oi_seq)) { + /* see above */ + lobdo->o_oi.oi_fid.f_seq = wobdo->o_oi.oi.oi_seq; + lobdo->o_oi.oi_fid.f_oid = wobdo->o_oi.oi.oi_id; + lobdo->o_oi.oi_fid.f_ver = 0; + } +} + +extern void lustre_swab_obdo (struct obdo *o); + +/* request structure for OST's */ +struct ost_body { + struct obdo oa; +}; + +/* Key for FIEMAP to be used in get_info calls */ +struct ll_fiemap_info_key { + char name[8]; + struct obdo oa; + struct ll_user_fiemap fiemap; +}; + +extern void lustre_swab_ost_body (struct ost_body *b); +extern void lustre_swab_ost_last_id(__u64 *id); +extern void lustre_swab_fiemap(struct ll_user_fiemap *fiemap); + +extern void lustre_swab_lov_user_md_v1(struct lov_user_md_v1 *lum); +extern void lustre_swab_lov_user_md_v3(struct lov_user_md_v3 *lum); +extern void lustre_swab_lov_user_md_objects(struct lov_user_ost_data *lod, + int stripe_count); +extern void lustre_swab_lov_mds_md(struct lov_mds_md *lmm); + +/* llog_swab.c */ +extern void lustre_swab_llogd_body (struct llogd_body *d); +extern void lustre_swab_llog_hdr (struct llog_log_hdr *h); +extern void lustre_swab_llogd_conn_body (struct llogd_conn_body *d); +extern void lustre_swab_llog_rec(struct llog_rec_hdr *rec); +extern void lustre_swab_llog_id(struct llog_logid *lid); + +struct lustre_cfg; +extern void lustre_swab_lustre_cfg(struct lustre_cfg *lcfg); + +/* Functions for dumping PTLRPC fields */ +void dump_rniobuf(struct niobuf_remote *rnb); +void dump_ioo(struct obd_ioobj *nb); +void dump_obdo(struct obdo *oa); +void dump_ost_body(struct ost_body *ob); +void dump_rcs(__u32 *rc); + +#define IDX_INFO_MAGIC 0x3D37CC37 + +/* Index file transfer through the network. 
The server serializes the index into + * a byte stream which is sent to the client via a bulk transfer */ +struct idx_info { + __u32 ii_magic; + + /* reply: see idx_info_flags below */ + __u32 ii_flags; + + /* request & reply: number of lu_idxpage (to be) transferred */ + __u16 ii_count; + __u16 ii_pad0; + + /* request: requested attributes passed down to the iterator API */ + __u32 ii_attrs; + + /* request & reply: index file identifier (FID) */ + struct lu_fid ii_fid; + + /* reply: version of the index file before starting to walk the index. + * Please note that the version can be modified at any time during the + * transfer */ + __u64 ii_version; + + /* request: hash to start with: + * reply: hash of the first entry of the first lu_idxpage and hash + * of the entry to read next if any */ + __u64 ii_hash_start; + __u64 ii_hash_end; + + /* reply: size of keys in lu_idxpages, minimal one if II_FL_VARKEY is + * set */ + __u16 ii_keysize; + + /* reply: size of records in lu_idxpages, minimal one if II_FL_VARREC + * is set */ + __u16 ii_recsize; + + __u32 ii_pad1; + __u64 ii_pad2; + __u64 ii_pad3; +}; +extern void lustre_swab_idx_info(struct idx_info *ii); + +#define II_END_OFF MDS_DIR_END_OFF /* all entries have been read */ + +/* List of flags used in idx_info::ii_flags */ +enum idx_info_flags { + II_FL_NOHASH = 1 << 0, /* client doesn't care about hash value */ + II_FL_VARKEY = 1 << 1, /* keys can be of variable size */ + II_FL_VARREC = 1 << 2, /* records can be of variable size */ + II_FL_NONUNQ = 1 << 3, /* index supports non-unique keys */ +}; + +#define LIP_MAGIC 0x8A6D6B6C + +/* 4KB (= LU_PAGE_SIZE) container gathering key/record pairs */ +struct lu_idxpage { + /* 16-byte header */ + __u32 lip_magic; + __u16 lip_flags; + __u16 lip_nr; /* number of entries in the container */ + __u64 lip_pad0; /* additional padding for future use */ + + /* key/record pairs are stored in the remaining 4080 bytes. + * depending upon the flags in idx_info::ii_flags, each key/record + * pair might be preceded by: + * - a hash value + * - the key size (II_FL_VARKEY is set) + * - the record size (II_FL_VARREC is set) + * + * For the time being, we only support fixed-size key & record. 
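 *
 * Editor's illustration (an assumption about the fixed-size layout, not
 * taken from the imported code): if each pair is preceded by a __u64 hash,
 * entry i would start at
 *
 *	lip_entries + i * (sizeof(__u64) + ii_keysize + ii_recsize)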
*/ + char lip_entries[0]; +}; +extern void lustre_swab_lip_header(struct lu_idxpage *lip); + +#define LIP_HDR_SIZE (offsetof(struct lu_idxpage, lip_entries)) + +/* Gather all possible type associated with a 4KB container */ +union lu_page { + struct lu_dirpage lp_dir; /* for MDS_READPAGE */ + struct lu_idxpage lp_idx; /* for OBD_IDX_READ */ + char lp_array[LU_PAGE_SIZE]; +}; + +/* security opcodes */ +typedef enum { + SEC_CTX_INIT = 801, + SEC_CTX_INIT_CONT = 802, + SEC_CTX_FINI = 803, + SEC_LAST_OPC, + SEC_FIRST_OPC = SEC_CTX_INIT +} sec_cmd_t; + +/* + * capa related definitions + */ +#define CAPA_HMAC_MAX_LEN 64 +#define CAPA_HMAC_KEY_MAX_LEN 56 + +/* NB take care when changing the sequence of elements this struct, + * because the offset info is used in find_capa() */ +struct lustre_capa { + struct lu_fid lc_fid; /** fid */ + __u64 lc_opc; /** operations allowed */ + __u64 lc_uid; /** file owner */ + __u64 lc_gid; /** file group */ + __u32 lc_flags; /** HMAC algorithm & flags */ + __u32 lc_keyid; /** key# used for the capability */ + __u32 lc_timeout; /** capa timeout value (sec) */ + __u32 lc_expiry; /** expiry time (sec) */ + __u8 lc_hmac[CAPA_HMAC_MAX_LEN]; /** HMAC */ +} __attribute__((packed)); + +extern void lustre_swab_lustre_capa(struct lustre_capa *c); + +/** lustre_capa::lc_opc */ +enum { + CAPA_OPC_BODY_WRITE = 1<<0, /**< write object data */ + CAPA_OPC_BODY_READ = 1<<1, /**< read object data */ + CAPA_OPC_INDEX_LOOKUP = 1<<2, /**< lookup object fid */ + CAPA_OPC_INDEX_INSERT = 1<<3, /**< insert object fid */ + CAPA_OPC_INDEX_DELETE = 1<<4, /**< delete object fid */ + CAPA_OPC_OSS_WRITE = 1<<5, /**< write oss object data */ + CAPA_OPC_OSS_READ = 1<<6, /**< read oss object data */ + CAPA_OPC_OSS_TRUNC = 1<<7, /**< truncate oss object */ + CAPA_OPC_OSS_DESTROY = 1<<8, /**< destroy oss object */ + CAPA_OPC_META_WRITE = 1<<9, /**< write object meta data */ + CAPA_OPC_META_READ = 1<<10, /**< read object meta data */ +}; + +#define CAPA_OPC_OSS_RW (CAPA_OPC_OSS_READ | CAPA_OPC_OSS_WRITE) +#define CAPA_OPC_MDS_ONLY \ + (CAPA_OPC_BODY_WRITE | CAPA_OPC_BODY_READ | CAPA_OPC_INDEX_LOOKUP | \ + CAPA_OPC_INDEX_INSERT | CAPA_OPC_INDEX_DELETE) +#define CAPA_OPC_OSS_ONLY \ + (CAPA_OPC_OSS_WRITE | CAPA_OPC_OSS_READ | CAPA_OPC_OSS_TRUNC | \ + CAPA_OPC_OSS_DESTROY) +#define CAPA_OPC_MDS_DEFAULT ~CAPA_OPC_OSS_ONLY +#define CAPA_OPC_OSS_DEFAULT ~(CAPA_OPC_MDS_ONLY | CAPA_OPC_OSS_ONLY) + +/* MDS capability covers object capability for operations of body r/w + * (dir readpage/sendpage), index lookup/insert/delete and meta data r/w, + * while OSS capability only covers object capability for operations of + * oss data(file content) r/w/truncate. 
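 *
 * Example (editor's note): a capability whose lc_opc includes
 * CAPA_OPC_INDEX_LOOKUP is classified as an MDS capability by
 * capa_for_mds() below, while one carrying only, say,
 * CAPA_OPC_OSS_RW | CAPA_OPC_OSS_TRUNC is reported by capa_for_oss().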
+ */ +static inline int capa_for_mds(struct lustre_capa *c) +{ + return (c->lc_opc & CAPA_OPC_INDEX_LOOKUP) != 0; +} + +static inline int capa_for_oss(struct lustre_capa *c) +{ + return (c->lc_opc & CAPA_OPC_INDEX_LOOKUP) == 0; +} + +/* lustre_capa::lc_hmac_alg */ +enum { + CAPA_HMAC_ALG_SHA1 = 1, /**< sha1 algorithm */ + CAPA_HMAC_ALG_MAX, +}; + +#define CAPA_FL_MASK 0x00ffffff +#define CAPA_HMAC_ALG_MASK 0xff000000 + +struct lustre_capa_key { + __u64 lk_seq; /**< mds# */ + __u32 lk_keyid; /**< key# */ + __u32 lk_padding; + __u8 lk_key[CAPA_HMAC_KEY_MAX_LEN]; /**< key */ +} __attribute__((packed)); + +extern void lustre_swab_lustre_capa_key(struct lustre_capa_key *k); + +/** The link ea holds 1 \a link_ea_entry for each hardlink */ +#define LINK_EA_MAGIC 0x11EAF1DFUL +struct link_ea_header { + __u32 leh_magic; + __u32 leh_reccount; + __u64 leh_len; /* total size */ + /* future use */ + __u32 padding1; + __u32 padding2; +}; + +/** Hardlink data is name and parent fid. + * Stored in this crazy struct for maximum packing and endian-neutrality + */ +struct link_ea_entry { + /** __u16 stored big-endian, unaligned */ + unsigned char lee_reclen[2]; + unsigned char lee_parent_fid[sizeof(struct lu_fid)]; + char lee_name[0]; +}__attribute__((packed)); + +/** fid2path request/reply structure */ +struct getinfo_fid2path { + struct lu_fid gf_fid; + __u64 gf_recno; + __u32 gf_linkno; + __u32 gf_pathlen; + char gf_path[0]; +} __attribute__((packed)); + +void lustre_swab_fid2path (struct getinfo_fid2path *gf); + +enum { + LAYOUT_INTENT_ACCESS = 0, + LAYOUT_INTENT_READ = 1, + LAYOUT_INTENT_WRITE = 2, + LAYOUT_INTENT_GLIMPSE = 3, + LAYOUT_INTENT_TRUNC = 4, + LAYOUT_INTENT_RELEASE = 5, + LAYOUT_INTENT_RESTORE = 6 +}; + +/* enqueue layout lock with intent */ +struct layout_intent { + __u32 li_opc; /* intent operation for enqueue, read, write etc */ + __u32 li_flags; + __u64 li_start; + __u64 li_end; +}; + +void lustre_swab_layout_intent(struct layout_intent *li); + +/** + * On the wire version of hsm_progress structure. + * + * Contains the userspace hsm_progress and some internal fields. + */ +struct hsm_progress_kernel { + /* Field taken from struct hsm_progress */ + lustre_fid hpk_fid; + __u64 hpk_cookie; + struct hsm_extent hpk_extent; + __u16 hpk_flags; + __u16 hpk_errval; /* positive val */ + __u32 hpk_padding1; + /* Additional fields */ + __u64 hpk_data_version; + __u64 hpk_padding2; +} __attribute__((packed)); + +extern void lustre_swab_hsm_user_state(struct hsm_user_state *hus); +extern void lustre_swab_hsm_current_action(struct hsm_current_action *action); +extern void lustre_swab_hsm_progress_kernel(struct hsm_progress_kernel *hpk); +extern void lustre_swab_hsm_user_state(struct hsm_user_state *hus); +extern void lustre_swab_hsm_user_item(struct hsm_user_item *hui); +extern void lustre_swab_hsm_request(struct hsm_request *hr); + +/** + * These are object update opcode under UPDATE_OBJ, which is currently + * being used by cross-ref operations between MDT. + * + * During the cross-ref operation, the Master MDT, which the client send the + * request to, will disassembly the operation into object updates, then OSP + * will send these updates to the remote MDT to be executed. + * + * Update request format + * magic: UPDATE_BUFFER_MAGIC_V1 + * Count: How many updates in the req. + * bufs[0] : following are packets of object. + * update[0]: + * type: object_update_op, the op code of update + * fid: The object fid of the update. + * lens/bufs: other parameters of the update. 
+ * update[1]: + * type: object_update_op, the op code of update + * fid: The object fid of the update. + * lens/bufs: other parameters of the update. + * .......... + * update[7]: type: object_update_op, the op code of update + * fid: The object fid of the update. + * lens/bufs: other parameters of the update. + * Current 8 maxim updates per object update request. + * + ******************************************************************* + * update reply format: + * + * ur_version: UPDATE_REPLY_V1 + * ur_count: The count of the reply, which is usually equal + * to the number of updates in the request. + * ur_lens: The reply lengths of each object update. + * + * replies: 1st update reply [4bytes_ret: other body] + * 2nd update reply [4bytes_ret: other body] + * ..... + * nth update reply [4bytes_ret: other body] + * + * For each reply of the update, the format would be + * result(4 bytes):Other stuff + */ + +#define UPDATE_MAX_OPS 10 +#define UPDATE_BUFFER_MAGIC_V1 0xBDDE0001 +#define UPDATE_BUFFER_MAGIC UPDATE_BUFFER_MAGIC_V1 +#define UPDATE_BUF_COUNT 8 +enum object_update_op { + OBJ_CREATE = 1, + OBJ_DESTROY = 2, + OBJ_REF_ADD = 3, + OBJ_REF_DEL = 4, + OBJ_ATTR_SET = 5, + OBJ_ATTR_GET = 6, + OBJ_XATTR_SET = 7, + OBJ_XATTR_GET = 8, + OBJ_INDEX_LOOKUP = 9, + OBJ_INDEX_INSERT = 10, + OBJ_INDEX_DELETE = 11, + OBJ_LAST +}; + +struct update { + __u32 u_type; + __u32 u_batchid; + struct lu_fid u_fid; + __u32 u_lens[UPDATE_BUF_COUNT]; + __u32 u_bufs[0]; +}; + +struct update_buf { + __u32 ub_magic; + __u32 ub_count; + __u32 ub_bufs[0]; +}; + +#define UPDATE_REPLY_V1 0x00BD0001 +struct update_reply { + __u32 ur_version; + __u32 ur_count; + __u32 ur_lens[0]; +}; + +void lustre_swab_update_buf(struct update_buf *ub); +void lustre_swab_update_reply_buf(struct update_reply *ur); + +/** layout swap request structure + * fid1 and fid2 are in mdt_body + */ +struct mdc_swap_layouts { + __u64 msl_flags; +} __packed; + +void lustre_swab_swap_layouts(struct mdc_swap_layouts *msl); + +struct close_data { + struct lustre_handle cd_handle; + struct lu_fid cd_fid; + __u64 cd_data_version; + __u64 cd_reserved[8]; +}; + +void lustre_swab_close_data(struct close_data *data); + +#endif +/** @} lustreidl */ diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre/lustre_lfsck_user.h b/kernel/drivers/staging/lustre/lustre/include/lustre/lustre_lfsck_user.h new file mode 100644 index 000000000..1c87a61a7 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre/lustre_lfsck_user.h @@ -0,0 +1,95 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, Intel Corporation. + */ +/* + * lustre/include/lustre/lustre_lfsck_user.h + * + * Lustre LFSCK userspace interfaces. 
+ * + * Author: Fan Yong + */ + +#ifndef _LUSTRE_LFSCK_USER_H +# define _LUSTRE_LFSCK_USER_H + +enum lfsck_param_flags { + /* Reset LFSCK iterator position to the device beginning. */ + LPF_RESET = 0x0001, + + /* Exit when fail. */ + LPF_FAILOUT = 0x0002, + + /* Dryrun mode, only check without modification */ + LPF_DRYRUN = 0x0004, +}; + +enum lfsck_type { + /* For MDT-OST consistency check/repair. */ + LT_LAYOUT = 0x0001, + + /* For MDT-MDT consistency check/repair. */ + LT_DNE = 0x0002, + + /* For FID-in-dirent and linkEA consistency check/repair. */ + LT_NAMESPACE = 0x0004, +}; + +#define LFSCK_VERSION_V1 1 +#define LFSCK_VERSION_V2 2 + +#define LFSCK_TYPES_ALL ((__u16)(~0)) +#define LFSCK_TYPES_DEF ((__u16)0) +#define LFSCK_TYPES_SUPPORTED LT_NAMESPACE + +#define LFSCK_SPEED_NO_LIMIT 0 +#define LFSCK_SPEED_LIMIT_DEF LFSCK_SPEED_NO_LIMIT + +enum lfsck_start_valid { + LSV_SPEED_LIMIT = 0x00000001, + LSV_ERROR_HANDLE = 0x00000002, + LSV_DRYRUN = 0x00000004, +}; + +/* Arguments for starting lfsck. */ +struct lfsck_start { + /* Which arguments are valid, see 'enum lfsck_start_valid'. */ + __u32 ls_valid; + + /* How many items can be scanned at most per second. */ + __u32 ls_speed_limit; + + /* For compatibility between user space tools and kernel service. */ + __u16 ls_version; + + /* Which LFSCK components to be (have been) started. */ + __u16 ls_active; + + /* Flags for the LFSCK, see 'enum lfsck_param_flags'. */ + __u16 ls_flags; + + /* For 64-bits aligned. */ + __u16 ls_padding; +}; + +#endif /* _LUSTRE_LFSCK_USER_H */ diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre/lustre_user.h b/kernel/drivers/staging/lustre/lustre/include/lustre/lustre_user.h new file mode 100644 index 000000000..89794fdfe --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre/lustre_user.h @@ -0,0 +1,1179 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre/lustre_user.h + * + * Lustre public user-space interface definitions. 
+ */ + +#ifndef _LUSTRE_USER_H +#define _LUSTRE_USER_H + +/** \defgroup lustreuser lustreuser + * + * @{ + */ + +#include "ll_fiemap.h" +#include "../linux/lustre_user.h" + +/* for statfs() */ +#define LL_SUPER_MAGIC 0x0BD00BD0 + +#ifndef FSFILT_IOC_GETFLAGS +#define FSFILT_IOC_GETFLAGS _IOR('f', 1, long) +#define FSFILT_IOC_SETFLAGS _IOW('f', 2, long) +#define FSFILT_IOC_GETVERSION _IOR('f', 3, long) +#define FSFILT_IOC_SETVERSION _IOW('f', 4, long) +#define FSFILT_IOC_GETVERSION_OLD _IOR('v', 1, long) +#define FSFILT_IOC_SETVERSION_OLD _IOW('v', 2, long) +#define FSFILT_IOC_FIEMAP _IOWR('f', 11, struct ll_user_fiemap) +#endif + +/* FIEMAP flags supported by Lustre */ +#define LUSTRE_FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_DEVICE_ORDER) + +enum obd_statfs_state { + OS_STATE_DEGRADED = 0x00000001, /**< RAID degraded/rebuilding */ + OS_STATE_READONLY = 0x00000002, /**< filesystem is read-only */ + OS_STATE_RDONLY_1 = 0x00000004, /**< obsolete 1.6, was EROFS=30 */ + OS_STATE_RDONLY_2 = 0x00000008, /**< obsolete 1.6, was EROFS=30 */ + OS_STATE_RDONLY_3 = 0x00000010, /**< obsolete 1.6, was EROFS=30 */ +}; + +struct obd_statfs { + __u64 os_type; + __u64 os_blocks; + __u64 os_bfree; + __u64 os_bavail; + __u64 os_files; + __u64 os_ffree; + __u8 os_fsid[40]; + __u32 os_bsize; + __u32 os_namelen; + __u64 os_maxbytes; + __u32 os_state; /**< obd_statfs_state OS_STATE_* flag */ + __u32 os_fprecreated; /* objs available now to the caller */ + /* used in QoS code to find preferred + * OSTs */ + __u32 os_spare2; + __u32 os_spare3; + __u32 os_spare4; + __u32 os_spare5; + __u32 os_spare6; + __u32 os_spare7; + __u32 os_spare8; + __u32 os_spare9; +}; + +/** + * File IDentifier. + * + * FID is a cluster-wide unique identifier of a file or an object (stripe). + * FIDs are never reused. + **/ +struct lu_fid { + /** + * FID sequence. Sequence is a unit of migration: all files (objects) + * with FIDs from a given sequence are stored on the same server. + * Lustre should support 2^64 objects, so even if each sequence + * has only a single object we can still enumerate 2^64 objects. + **/ + __u64 f_seq; + /* FID number within sequence. */ + __u32 f_oid; + /** + * FID version, used to distinguish different versions (in the sense + * of snapshots, etc.) of the same file system object. Not currently + * used. + **/ + __u32 f_ver; +}; + +struct filter_fid { + struct lu_fid ff_parent; /* ff_parent.f_ver == file stripe number */ +}; + +/* keep this one for compatibility */ +struct filter_fid_old { + struct lu_fid ff_parent; + __u64 ff_objid; + __u64 ff_seq; +}; + +/* Userspace should treat lu_fid as opaque, and only use the following methods + * to print or parse them. Other functions (e.g. compare, swab) could be moved + * here from lustre_idl.h if needed. */ +typedef struct lu_fid lustre_fid; + +/** + * Following struct for object attributes, that will be kept inode's EA. + * Introduced in 2.0 release (please see b15993, for details) + * Added to all objects since Lustre 2.4 as contains self FID + */ +struct lustre_mdt_attrs { + /** + * Bitfield for supported data in this structure. From enum lma_compat. + * lma_self_fid and lma_flags are always available. + */ + __u32 lma_compat; + /** + * Per-file incompat feature list. Lustre version should support all + * flags set in this field. The supported feature mask is available in + * LMA_INCOMPAT_SUPP. 
+ */ + __u32 lma_incompat; + /** FID of this inode */ + struct lu_fid lma_self_fid; +}; + +/** + * Prior to 2.4, the LMA structure also included SOM attributes which has since + * been moved to a dedicated xattr + * lma_flags was also removed because of lma_compat/incompat fields. + */ +#define LMA_OLD_SIZE (sizeof(struct lustre_mdt_attrs) + 5 * sizeof(__u64)) + +/** + * OST object IDentifier. + */ +struct ost_id { + union { + struct ostid { + __u64 oi_id; + __u64 oi_seq; + } oi; + struct lu_fid oi_fid; + }; +}; + +#define DOSTID "%#llx:%llu" +#define POSTID(oi) ostid_seq(oi), ostid_id(oi) + +/* + * The ioctl naming rules: + * LL_* - works on the currently opened filehandle instead of parent dir + * *_OBD_* - gets data for both OSC or MDC (LOV, LMV indirectly) + * *_MDC_* - gets/sets data related to MDC + * *_LOV_* - gets/sets data related to OSC/LOV + * *FILE* - called on parent dir and passes in a filename + * *STRIPE* - set/get lov_user_md + * *INFO - set/get lov_user_mds_data + */ +/* see for ioctl numberss 101-150 */ +#define LL_IOC_GETFLAGS _IOR ('f', 151, long) +#define LL_IOC_SETFLAGS _IOW ('f', 152, long) +#define LL_IOC_CLRFLAGS _IOW ('f', 153, long) +/* LL_IOC_LOV_SETSTRIPE: See also OBD_IOC_LOV_SETSTRIPE */ +#define LL_IOC_LOV_SETSTRIPE _IOW ('f', 154, long) +/* LL_IOC_LOV_GETSTRIPE: See also OBD_IOC_LOV_GETSTRIPE */ +#define LL_IOC_LOV_GETSTRIPE _IOW ('f', 155, long) +/* LL_IOC_LOV_SETEA: See also OBD_IOC_LOV_SETEA */ +#define LL_IOC_LOV_SETEA _IOW ('f', 156, long) +#define LL_IOC_RECREATE_OBJ _IOW ('f', 157, long) +#define LL_IOC_RECREATE_FID _IOW ('f', 157, struct lu_fid) +#define LL_IOC_GROUP_LOCK _IOW ('f', 158, long) +#define LL_IOC_GROUP_UNLOCK _IOW ('f', 159, long) +/* LL_IOC_QUOTACHECK: See also OBD_IOC_QUOTACHECK */ +#define LL_IOC_QUOTACHECK _IOW ('f', 160, int) +/* LL_IOC_POLL_QUOTACHECK: See also OBD_IOC_POLL_QUOTACHECK */ +#define LL_IOC_POLL_QUOTACHECK _IOR ('f', 161, struct if_quotacheck *) +/* LL_IOC_QUOTACTL: See also OBD_IOC_QUOTACTL */ +#define LL_IOC_QUOTACTL _IOWR('f', 162, struct if_quotactl) +#define IOC_OBD_STATFS _IOWR('f', 164, struct obd_statfs *) +#define IOC_LOV_GETINFO _IOWR('f', 165, struct lov_user_mds_data *) +#define LL_IOC_FLUSHCTX _IOW ('f', 166, long) +#define LL_IOC_RMTACL _IOW ('f', 167, long) +#define LL_IOC_GETOBDCOUNT _IOR ('f', 168, long) +#define LL_IOC_LLOOP_ATTACH _IOWR('f', 169, long) +#define LL_IOC_LLOOP_DETACH _IOWR('f', 170, long) +#define LL_IOC_LLOOP_INFO _IOWR('f', 171, struct lu_fid) +#define LL_IOC_LLOOP_DETACH_BYDEV _IOWR('f', 172, long) +#define LL_IOC_PATH2FID _IOR ('f', 173, long) +#define LL_IOC_GET_CONNECT_FLAGS _IOWR('f', 174, __u64 *) +#define LL_IOC_GET_MDTIDX _IOR ('f', 175, int) + +/* see for ioctl numbers 177-210 */ + +#define LL_IOC_HSM_STATE_GET _IOR('f', 211, struct hsm_user_state) +#define LL_IOC_HSM_STATE_SET _IOW('f', 212, struct hsm_state_set) +#define LL_IOC_HSM_CT_START _IOW('f', 213, struct lustre_kernelcomm) +#define LL_IOC_HSM_COPY_START _IOW('f', 214, struct hsm_copy *) +#define LL_IOC_HSM_COPY_END _IOW('f', 215, struct hsm_copy *) +#define LL_IOC_HSM_PROGRESS _IOW('f', 216, struct hsm_user_request) +#define LL_IOC_HSM_REQUEST _IOW('f', 217, struct hsm_user_request) +#define LL_IOC_DATA_VERSION _IOR('f', 218, struct ioc_data_version) +#define LL_IOC_LOV_SWAP_LAYOUTS _IOW('f', 219, \ + struct lustre_swap_layouts) +#define LL_IOC_HSM_ACTION _IOR('f', 220, \ + struct hsm_current_action) +/* see for ioctl numbers 221-232 */ + +#define LL_IOC_LMV_SETSTRIPE _IOWR('f', 240, struct lmv_user_md) +#define 
LL_IOC_LMV_GETSTRIPE _IOWR('f', 241, struct lmv_user_md) +#define LL_IOC_REMOVE_ENTRY _IOWR('f', 242, __u64) +#define LL_IOC_SET_LEASE _IOWR('f', 243, long) +#define LL_IOC_GET_LEASE _IO('f', 244) +#define LL_IOC_HSM_IMPORT _IOWR('f', 245, struct hsm_user_import) + +#define LL_STATFS_LMV 1 +#define LL_STATFS_LOV 2 +#define LL_STATFS_NODELAY 4 + +#define IOC_MDC_TYPE 'i' +#define IOC_MDC_LOOKUP _IOWR(IOC_MDC_TYPE, 20, struct obd_device *) +#define IOC_MDC_GETFILESTRIPE _IOWR(IOC_MDC_TYPE, 21, struct lov_user_md *) +#define IOC_MDC_GETFILEINFO _IOWR(IOC_MDC_TYPE, 22, struct lov_user_mds_data *) +#define LL_IOC_MDC_GETINFO _IOWR(IOC_MDC_TYPE, 23, struct lov_user_mds_data *) + +/* Keep these for backward compartability. */ +#define LL_IOC_OBD_STATFS IOC_OBD_STATFS +#define IOC_MDC_GETSTRIPE IOC_MDC_GETFILESTRIPE + + +#define MAX_OBD_NAME 128 /* If this changes, a NEW ioctl must be added */ + +/* Define O_LOV_DELAY_CREATE to be a mask that is not useful for regular + * files, but are unlikely to be used in practice and are not harmful if + * used incorrectly. O_NOCTTY and FASYNC are only meaningful for character + * devices and are safe for use on new files (See LU-812, LU-4209). */ +#define O_LOV_DELAY_CREATE (O_NOCTTY | FASYNC) + +#define LL_FILE_IGNORE_LOCK 0x00000001 +#define LL_FILE_GROUP_LOCKED 0x00000002 +#define LL_FILE_READAHEA 0x00000004 +#define LL_FILE_LOCKED_DIRECTIO 0x00000008 /* client-side locks with dio */ +#define LL_FILE_LOCKLESS_IO 0x00000010 /* server-side locks with cio */ +#define LL_FILE_RMTACL 0x00000020 + +#define LOV_USER_MAGIC_V1 0x0BD10BD0 +#define LOV_USER_MAGIC LOV_USER_MAGIC_V1 +#define LOV_USER_MAGIC_JOIN_V1 0x0BD20BD0 +#define LOV_USER_MAGIC_V3 0x0BD30BD0 + +#define LMV_MAGIC_V1 0x0CD10CD0 /*normal stripe lmv magic */ +#define LMV_USER_MAGIC 0x0CD20CD0 /*default lmv magic*/ + +#define LOV_PATTERN_RAID0 0x001 +#define LOV_PATTERN_RAID1 0x002 +#define LOV_PATTERN_FIRST 0x100 + +#define LOV_MAXPOOLNAME 16 +#define LOV_POOLNAMEF "%.16s" + +#define LOV_MIN_STRIPE_BITS 16 /* maximum PAGE_SIZE (ia64), power of 2 */ +#define LOV_MIN_STRIPE_SIZE (1 << LOV_MIN_STRIPE_BITS) +#define LOV_MAX_STRIPE_COUNT_OLD 160 +/* This calculation is crafted so that input of 4096 will result in 160 + * which in turn is equal to old maximal stripe count. + * XXX: In fact this is too simplified for now, what it also need is to get + * ea_type argument to clearly know how much space each stripe consumes. + * + * The limit of 12 pages is somewhat arbitrary, but is a reasonably large + * allocation that is sufficient for the current generation of systems. 
+ * + * (max buffer size - lov+rpc header) / sizeof(struct lov_ost_data_v1) */ +#define LOV_MAX_STRIPE_COUNT 2000 /* ((12 * 4096 - 256) / 24) */ +#define LOV_ALL_STRIPES 0xffff /* only valid for directories */ +#define LOV_V1_INSANE_STRIPE_COUNT 65532 /* maximum stripe count bz13933 */ + +#define lov_user_ost_data lov_user_ost_data_v1 +struct lov_user_ost_data_v1 { /* per-stripe data structure */ + struct ost_id l_ost_oi; /* OST object ID */ + __u32 l_ost_gen; /* generation of this OST index */ + __u32 l_ost_idx; /* OST index in LOV */ +} __attribute__((packed)); + +#define lov_user_md lov_user_md_v1 +struct lov_user_md_v1 { /* LOV EA user data (host-endian) */ + __u32 lmm_magic; /* magic number = LOV_USER_MAGIC_V1 */ + __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */ + struct ost_id lmm_oi; /* LOV object ID */ + __u32 lmm_stripe_size; /* size of stripe in bytes */ + __u16 lmm_stripe_count; /* num stripes in use for this object */ + union { + __u16 lmm_stripe_offset; /* starting stripe offset in + * lmm_objects, use when writing */ + __u16 lmm_layout_gen; /* layout generation number + * used when reading */ + }; + struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */ +} __attribute__((packed, __may_alias__)); + +struct lov_user_md_v3 { /* LOV EA user data (host-endian) */ + __u32 lmm_magic; /* magic number = LOV_USER_MAGIC_V3 */ + __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */ + struct ost_id lmm_oi; /* LOV object ID */ + __u32 lmm_stripe_size; /* size of stripe in bytes */ + __u16 lmm_stripe_count; /* num stripes in use for this object */ + union { + __u16 lmm_stripe_offset; /* starting stripe offset in + * lmm_objects, use when writing */ + __u16 lmm_layout_gen; /* layout generation number + * used when reading */ + }; + char lmm_pool_name[LOV_MAXPOOLNAME]; /* pool name */ + struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */ +} __attribute__((packed)); + +static inline __u32 lov_user_md_size(__u16 stripes, __u32 lmm_magic) +{ + if (lmm_magic == LOV_USER_MAGIC_V3) + return sizeof(struct lov_user_md_v3) + + stripes * sizeof(struct lov_user_ost_data_v1); + else + return sizeof(struct lov_user_md_v1) + + stripes * sizeof(struct lov_user_ost_data_v1); +} + +/* Compile with -D_LARGEFILE64_SOURCE or -D_GNU_SOURCE (or #define) to + * use this. It is unsafe to #define those values in this header as it + * is possible the application has already #included . 
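+ *
+ * Illustrative only: an application that wants the lov_user_mds_data
+ * structures below would typically add something like
+ * -D_GNU_SOURCE -DHAVE_LOV_USER_MDS_DATA to its compiler flags before
+ * including this header; the exact flags depend on the application's
+ * build system and are an assumption of this note.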
*/ +#ifdef HAVE_LOV_USER_MDS_DATA +#define lov_user_mds_data lov_user_mds_data_v1 +struct lov_user_mds_data_v1 { + lstat_t lmd_st; /* MDS stat struct */ + struct lov_user_md_v1 lmd_lmm; /* LOV EA V1 user data */ +} __attribute__((packed)); + +struct lov_user_mds_data_v3 { + lstat_t lmd_st; /* MDS stat struct */ + struct lov_user_md_v3 lmd_lmm; /* LOV EA V3 user data */ +} __attribute__((packed)); +#endif + +/* keep this to be the same size as lov_user_ost_data_v1 */ +struct lmv_user_mds_data { + struct lu_fid lum_fid; + __u32 lum_padding; + __u32 lum_mds; +}; + +/* lum_type */ +enum { + LMV_STRIPE_TYPE = 0, + LMV_DEFAULT_TYPE = 1, +}; + +#define lmv_user_md lmv_user_md_v1 +struct lmv_user_md_v1 { + __u32 lum_magic; /* must be the first field */ + __u32 lum_stripe_count; /* dirstripe count */ + __u32 lum_stripe_offset; /* MDT idx for default dirstripe */ + __u32 lum_hash_type; /* Dir stripe policy */ + __u32 lum_type; /* LMV type: default or normal */ + __u32 lum_padding1; + __u32 lum_padding2; + __u32 lum_padding3; + char lum_pool_name[LOV_MAXPOOLNAME]; + struct lmv_user_mds_data lum_objects[0]; +}; + +static inline int lmv_user_md_size(int stripes, int lmm_magic) +{ + return sizeof(struct lmv_user_md) + + stripes * sizeof(struct lmv_user_mds_data); +} + +extern void lustre_swab_lmv_user_md(struct lmv_user_md *lum); + +struct ll_recreate_obj { + __u64 lrc_id; + __u32 lrc_ost_idx; +}; + +struct ll_fid { + __u64 id; /* holds object id */ + __u32 generation; /* holds object generation */ + __u32 f_type; /* holds object type or stripe idx when passing it to + * OST for saving into EA. */ +}; + +#define UUID_MAX 40 +struct obd_uuid { + char uuid[UUID_MAX]; +}; + +static inline bool obd_uuid_equals(const struct obd_uuid *u1, + const struct obd_uuid *u2) +{ + return strcmp((char *)u1->uuid, (char *)u2->uuid) == 0; +} + +static inline int obd_uuid_empty(struct obd_uuid *uuid) +{ + return uuid->uuid[0] == '\0'; +} + +static inline void obd_str2uuid(struct obd_uuid *uuid, const char *tmp) +{ + strncpy((char *)uuid->uuid, tmp, sizeof(*uuid)); + uuid->uuid[sizeof(*uuid) - 1] = '\0'; +} + +/* For printf's only, make sure uuid is terminated */ +static inline char *obd_uuid2str(const struct obd_uuid *uuid) +{ + if (uuid->uuid[sizeof(*uuid) - 1] != '\0') { + /* Obviously not safe, but for printfs, no real harm done... + we're always null-terminated, even in a race. */ + static char temp[sizeof(*uuid)]; + memcpy(temp, uuid->uuid, sizeof(*uuid) - 1); + temp[sizeof(*uuid) - 1] = '\0'; + return temp; + } + return (char *)(uuid->uuid); +} + +/* Extract fsname from uuid (or target name) of a target + e.g. (myfs-OST0007_UUID -> myfs) + see also deuuidify. */ +static inline void obd_uuid2fsname(char *buf, char *uuid, int buflen) +{ + char *p; + + strncpy(buf, uuid, buflen - 1); + buf[buflen - 1] = '\0'; + p = strrchr(buf, '-'); + if (p) + *p = '\0'; +} + +/* printf display format + e.g. printf("file FID is "DFID"\n", PFID(fid)); */ +#define FID_NOBRACE_LEN 40 +#define FID_LEN (FID_NOBRACE_LEN + 2) +#define DFID_NOBRACE "%#llx:0x%x:0x%x" +#define DFID "["DFID_NOBRACE"]" +#define PFID(fid) \ + (fid)->f_seq, \ + (fid)->f_oid, \ + (fid)->f_ver + +/* scanf input parse format -- strip '[' first. + e.g. 
sscanf(fidstr, SFID, RFID(&fid)); */ +#define SFID "0x%llx:0x%x:0x%x" +#define RFID(fid) \ + &((fid)->f_seq), \ + &((fid)->f_oid), \ + &((fid)->f_ver) + + +/********* Quotas **********/ + +/* these must be explicitly translated into linux Q_* in ll_dir_ioctl */ +#define LUSTRE_Q_QUOTAON 0x800002 /* turn quotas on */ +#define LUSTRE_Q_QUOTAOFF 0x800003 /* turn quotas off */ +#define LUSTRE_Q_GETINFO 0x800005 /* get information about quota files */ +#define LUSTRE_Q_SETINFO 0x800006 /* set information about quota files */ +#define LUSTRE_Q_GETQUOTA 0x800007 /* get user quota structure */ +#define LUSTRE_Q_SETQUOTA 0x800008 /* set user quota structure */ +/* lustre-specific control commands */ +#define LUSTRE_Q_INVALIDATE 0x80000b /* invalidate quota data */ +#define LUSTRE_Q_FINVALIDATE 0x80000c /* invalidate filter quota data */ + +#define UGQUOTA 2 /* set both USRQUOTA and GRPQUOTA */ + +struct if_quotacheck { + char obd_type[16]; + struct obd_uuid obd_uuid; +}; + +#define IDENTITY_DOWNCALL_MAGIC 0x6d6dd629 + +/* permission */ +#define N_PERMS_MAX 64 + +struct perm_downcall_data { + __u64 pdd_nid; + __u32 pdd_perm; + __u32 pdd_padding; +}; + +struct identity_downcall_data { + __u32 idd_magic; + __u32 idd_err; + __u32 idd_uid; + __u32 idd_gid; + __u32 idd_nperms; + __u32 idd_ngroups; + struct perm_downcall_data idd_perms[N_PERMS_MAX]; + __u32 idd_groups[0]; +}; + +/* for non-mapped uid/gid */ +#define NOBODY_UID 99 +#define NOBODY_GID 99 + +#define INVALID_ID (-1) + +enum { + RMT_LSETFACL = 1, + RMT_LGETFACL = 2, + RMT_RSETFACL = 3, + RMT_RGETFACL = 4 +}; + +#ifdef NEED_QUOTA_DEFS +#ifndef QIF_BLIMITS +#define QIF_BLIMITS 1 +#define QIF_SPACE 2 +#define QIF_ILIMITS 4 +#define QIF_INODES 8 +#define QIF_BTIME 16 +#define QIF_ITIME 32 +#define QIF_LIMITS (QIF_BLIMITS | QIF_ILIMITS) +#define QIF_USAGE (QIF_SPACE | QIF_INODES) +#define QIF_TIMES (QIF_BTIME | QIF_ITIME) +#define QIF_ALL (QIF_LIMITS | QIF_USAGE | QIF_TIMES) +#endif + +#endif /* !__KERNEL__ */ + +/* lustre volatile file support + * file name header: .^L^S^T^R:volatile" + */ +#define LUSTRE_VOLATILE_HDR ".\x0c\x13\x14\x12:VOLATILE" +#define LUSTRE_VOLATILE_HDR_LEN 14 +/* hdr + MDT index */ +#define LUSTRE_VOLATILE_IDX LUSTRE_VOLATILE_HDR":%.4X:" + +typedef enum lustre_quota_version { + LUSTRE_QUOTA_V2 = 1 +} lustre_quota_version_t; + +/* XXX: same as if_dqinfo struct in kernel */ +struct obd_dqinfo { + __u64 dqi_bgrace; + __u64 dqi_igrace; + __u32 dqi_flags; + __u32 dqi_valid; +}; + +/* XXX: same as if_dqblk struct in kernel, plus one padding */ +struct obd_dqblk { + __u64 dqb_bhardlimit; + __u64 dqb_bsoftlimit; + __u64 dqb_curspace; + __u64 dqb_ihardlimit; + __u64 dqb_isoftlimit; + __u64 dqb_curinodes; + __u64 dqb_btime; + __u64 dqb_itime; + __u32 dqb_valid; + __u32 dqb_padding; +}; + +enum { + QC_GENERAL = 0, + QC_MDTIDX = 1, + QC_OSTIDX = 2, + QC_UUID = 3 +}; + +struct if_quotactl { + __u32 qc_cmd; + __u32 qc_type; + __u32 qc_id; + __u32 qc_stat; + __u32 qc_valid; + __u32 qc_idx; + struct obd_dqinfo qc_dqinfo; + struct obd_dqblk qc_dqblk; + char obd_type[16]; + struct obd_uuid obd_uuid; +}; + +/* swap layout flags */ +#define SWAP_LAYOUTS_CHECK_DV1 (1 << 0) +#define SWAP_LAYOUTS_CHECK_DV2 (1 << 1) +#define SWAP_LAYOUTS_KEEP_MTIME (1 << 2) +#define SWAP_LAYOUTS_KEEP_ATIME (1 << 3) + +/* Swap XATTR_NAME_HSM as well, only on the MDT so far */ +#define SWAP_LAYOUTS_MDS_HSM (1 << 31) +struct lustre_swap_layouts { + __u64 sl_flags; + __u32 sl_fd; + __u32 sl_gid; + __u64 sl_dv1; + __u64 sl_dv2; +}; + + +/********* Changelogs **********/ 
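+/*
+ * Illustrative sketch only (not part of this header): given a single
+ * changelog record obtained from the changelog reader, the helpers
+ * defined further down can be used to decode it.  The variable name
+ * "rec" and the output format are assumptions made for this example.
+ *
+ *	struct changelog_rec *rec = ...;
+ *
+ *	printf("%llu %s "DFID" %.*s\n",
+ *	       (unsigned long long)rec->cr_index,
+ *	       changelog_type2str(rec->cr_type),
+ *	       PFID(&rec->cr_tfid),
+ *	       rec->cr_namelen, changelog_rec_name(rec));
+ *
+ *	if (CHANGELOG_REC_EXTENDED(rec) && rec->cr_type == CL_RENAME)
+ *		printf("  source: %s\n", changelog_rec_sname(
+ *			(struct changelog_ext_rec *)rec));
+ */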
+/** Changelog record types */ +enum changelog_rec_type { + CL_MARK = 0, + CL_CREATE = 1, /* namespace */ + CL_MKDIR = 2, /* namespace */ + CL_HARDLINK = 3, /* namespace */ + CL_SOFTLINK = 4, /* namespace */ + CL_MKNOD = 5, /* namespace */ + CL_UNLINK = 6, /* namespace */ + CL_RMDIR = 7, /* namespace */ + CL_RENAME = 8, /* namespace */ + CL_EXT = 9, /* namespace extended record (2nd half of rename) */ + CL_OPEN = 10, /* not currently used */ + CL_CLOSE = 11, /* may be written to log only with mtime change */ + CL_LAYOUT = 12, /* file layout/striping modified */ + CL_TRUNC = 13, + CL_SETATTR = 14, + CL_XATTR = 15, + CL_HSM = 16, /* HSM specific events, see flags */ + CL_MTIME = 17, /* Precedence: setattr > mtime > ctime > atime */ + CL_CTIME = 18, + CL_ATIME = 19, + CL_LAST +}; + +static inline const char *changelog_type2str(int type) { + static const char *changelog_str[] = { + "MARK", "CREAT", "MKDIR", "HLINK", "SLINK", "MKNOD", "UNLNK", + "RMDIR", "RENME", "RNMTO", "OPEN", "CLOSE", "LYOUT", "TRUNC", + "SATTR", "XATTR", "HSM", "MTIME", "CTIME", "ATIME", + }; + + if (type >= 0 && type < CL_LAST) + return changelog_str[type]; + return NULL; +} + +/* per-record flags */ +#define CLF_VERSION 0x1000 +#define CLF_EXT_VERSION 0x2000 +#define CLF_FLAGSHIFT 12 +#define CLF_FLAGMASK ((1U << CLF_FLAGSHIFT) - 1) +#define CLF_VERMASK (~CLF_FLAGMASK) +/* Anything under the flagmask may be per-type (if desired) */ +/* Flags for unlink */ +#define CLF_UNLINK_LAST 0x0001 /* Unlink of last hardlink */ +#define CLF_UNLINK_HSM_EXISTS 0x0002 /* File has something in HSM */ + /* HSM cleaning needed */ +/* Flags for rename */ +#define CLF_RENAME_LAST 0x0001 /* rename unlink last hardlink of target */ + +/* Flags for HSM */ +/* 12b used (from high weight to low weight): + * 2b for flags + * 3b for event + * 7b for error code + */ +#define CLF_HSM_ERR_L 0 /* HSM return code, 7 bits */ +#define CLF_HSM_ERR_H 6 +#define CLF_HSM_EVENT_L 7 /* HSM event, 3 bits, see enum hsm_event */ +#define CLF_HSM_EVENT_H 9 +#define CLF_HSM_FLAG_L 10 /* HSM flags, 2 bits, 1 used, 1 spare */ +#define CLF_HSM_FLAG_H 11 +#define CLF_HSM_SPARE_L 12 /* 4 spare bits */ +#define CLF_HSM_SPARE_H 15 +#define CLF_HSM_LAST 15 + +/* Remove bits higher than _h, then extract the value + * between _h and _l by shifting lower weigth to bit 0. 
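+ *
+ * Worked example (this is what hsm_get_cl_event() below does): with
+ * _h = CLF_HSM_EVENT_H = 9 and _l = CLF_HSM_EVENT_L = 7, the 16-bit flags
+ * word is shifted left by 15 - 9 = 6 so that bit 9 becomes bit 15, masked
+ * with 0xFFFF to drop anything that was above bit 9, then shifted right by
+ * 15 - 9 + 7 = 13, leaving the 3-bit hsm_event value in bits 0..2.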
*/ +#define CLF_GET_BITS(_b, _h, _l) (((_b << (CLF_HSM_LAST - _h)) & 0xFFFF) \ + >> (CLF_HSM_LAST - _h + _l)) + +#define CLF_HSM_SUCCESS 0x00 +#define CLF_HSM_MAXERROR 0x7E +#define CLF_HSM_ERROVERFLOW 0x7F + +#define CLF_HSM_DIRTY 1 /* file is dirty after HSM request end */ + +/* 3 bits field => 8 values allowed */ +enum hsm_event { + HE_ARCHIVE = 0, + HE_RESTORE = 1, + HE_CANCEL = 2, + HE_RELEASE = 3, + HE_REMOVE = 4, + HE_STATE = 5, + HE_SPARE1 = 6, + HE_SPARE2 = 7, +}; + +static inline enum hsm_event hsm_get_cl_event(__u16 flags) +{ + return CLF_GET_BITS(flags, CLF_HSM_EVENT_H, CLF_HSM_EVENT_L); +} + +static inline void hsm_set_cl_event(int *flags, enum hsm_event he) +{ + *flags |= (he << CLF_HSM_EVENT_L); +} + +static inline __u16 hsm_get_cl_flags(int flags) +{ + return CLF_GET_BITS(flags, CLF_HSM_FLAG_H, CLF_HSM_FLAG_L); +} + +static inline void hsm_set_cl_flags(int *flags, int bits) +{ + *flags |= (bits << CLF_HSM_FLAG_L); +} + +static inline int hsm_get_cl_error(int flags) +{ + return CLF_GET_BITS(flags, CLF_HSM_ERR_H, CLF_HSM_ERR_L); +} + +static inline void hsm_set_cl_error(int *flags, int error) +{ + *flags |= (error << CLF_HSM_ERR_L); +} + +#define CR_MAXSIZE cfs_size_round(2*NAME_MAX + 1 + \ + sizeof(struct changelog_ext_rec)) + +struct changelog_rec { + __u16 cr_namelen; + __u16 cr_flags; /**< (flags&CLF_FLAGMASK)|CLF_VERSION */ + __u32 cr_type; /**< \a changelog_rec_type */ + __u64 cr_index; /**< changelog record number */ + __u64 cr_prev; /**< last index for this target fid */ + __u64 cr_time; + union { + lustre_fid cr_tfid; /**< target fid */ + __u32 cr_markerflags; /**< CL_MARK flags */ + }; + lustre_fid cr_pfid; /**< parent fid */ + char cr_name[0]; /**< last element */ +} __attribute__((packed)); + +/* changelog_ext_rec is 2*sizeof(lu_fid) bigger than changelog_rec, to save + * space, only rename uses changelog_ext_rec, while others use changelog_rec to + * store records. + */ +struct changelog_ext_rec { + __u16 cr_namelen; + __u16 cr_flags; /**< (flags & CLF_FLAGMASK) | + CLF_EXT_VERSION */ + __u32 cr_type; /**< \a changelog_rec_type */ + __u64 cr_index; /**< changelog record number */ + __u64 cr_prev; /**< last index for this target fid */ + __u64 cr_time; + union { + lustre_fid cr_tfid; /**< target fid */ + __u32 cr_markerflags; /**< CL_MARK flags */ + }; + lustre_fid cr_pfid; /**< target parent fid */ + lustre_fid cr_sfid; /**< source fid, or zero */ + lustre_fid cr_spfid; /**< source parent fid, or zero */ + char cr_name[0]; /**< last element */ +} __attribute__((packed)); + +#define CHANGELOG_REC_EXTENDED(rec) \ + (((rec)->cr_flags & CLF_VERMASK) == CLF_EXT_VERSION) + +static inline int changelog_rec_size(struct changelog_rec *rec) +{ + return CHANGELOG_REC_EXTENDED(rec) ? sizeof(struct changelog_ext_rec): + sizeof(*rec); +} + +static inline char *changelog_rec_name(struct changelog_rec *rec) +{ + return CHANGELOG_REC_EXTENDED(rec) ? 
+ ((struct changelog_ext_rec *)rec)->cr_name: rec->cr_name; +} + +static inline int changelog_rec_snamelen(struct changelog_ext_rec *rec) +{ + return rec->cr_namelen - strlen(rec->cr_name) - 1; +} + +static inline char *changelog_rec_sname(struct changelog_ext_rec *rec) +{ + return rec->cr_name + strlen(rec->cr_name) + 1; +} + +struct ioc_changelog { + __u64 icc_recno; + __u32 icc_mdtindex; + __u32 icc_id; + __u32 icc_flags; +}; + +enum changelog_message_type { + CL_RECORD = 10, /* message is a changelog_rec */ + CL_EOF = 11, /* at end of current changelog */ +}; + +/********* Misc **********/ + +struct ioc_data_version { + __u64 idv_version; + __u64 idv_flags; /* See LL_DV_xxx */ +}; +#define LL_DV_NOFLUSH 0x01 /* Do not take READ EXTENT LOCK before sampling + version. Dirty caches are left unchanged. */ + +#ifndef offsetof +# define offsetof(typ, memb) ((unsigned long)((char *)&(((typ *)0)->memb))) +#endif + +#define dot_lustre_name ".lustre" + + +/********* HSM **********/ + +/** HSM per-file state + * See HSM_FLAGS below. + */ +enum hsm_states { + HS_EXISTS = 0x00000001, + HS_DIRTY = 0x00000002, + HS_RELEASED = 0x00000004, + HS_ARCHIVED = 0x00000008, + HS_NORELEASE = 0x00000010, + HS_NOARCHIVE = 0x00000020, + HS_LOST = 0x00000040, +}; + +/* HSM user-setable flags. */ +#define HSM_USER_MASK (HS_NORELEASE | HS_NOARCHIVE | HS_DIRTY) + +/* Other HSM flags. */ +#define HSM_STATUS_MASK (HS_EXISTS | HS_LOST | HS_RELEASED | HS_ARCHIVED) + +/* + * All HSM-related possible flags that could be applied to a file. + * This should be kept in sync with hsm_states. + */ +#define HSM_FLAGS_MASK (HSM_USER_MASK | HSM_STATUS_MASK) + +/** + * HSM request progress state + */ +enum hsm_progress_states { + HPS_WAITING = 1, + HPS_RUNNING = 2, + HPS_DONE = 3, +}; +#define HPS_NONE 0 + +static inline char *hsm_progress_state2name(enum hsm_progress_states s) +{ + switch (s) { + case HPS_WAITING: return "waiting"; + case HPS_RUNNING: return "running"; + case HPS_DONE: return "done"; + default: return "unknown"; + } +} + +struct hsm_extent { + __u64 offset; + __u64 length; +} __attribute__((packed)); + +/** + * Current HSM states of a Lustre file. + * + * This structure purpose is to be sent to user-space mainly. It describes the + * current HSM flags and in-progress action. + */ +struct hsm_user_state { + /** Current HSM states, from enum hsm_states. */ + __u32 hus_states; + __u32 hus_archive_id; + /** The current undergoing action, if there is one */ + __u32 hus_in_progress_state; + __u32 hus_in_progress_action; + struct hsm_extent hus_in_progress_location; + char hus_extended_info[]; +}; + +struct hsm_state_set_ioc { + struct lu_fid hssi_fid; + __u64 hssi_setmask; + __u64 hssi_clearmask; +}; + +/* + * This structure describes the current in-progress action for a file. 
+ * it is returned to user space and send over the wire + */ +struct hsm_current_action { + /** The current undergoing action, if there is one */ + /* state is one of hsm_progress_states */ + __u32 hca_state; + /* action is one of hsm_user_action */ + __u32 hca_action; + struct hsm_extent hca_location; +}; + +/***** HSM user requests ******/ +/* User-generated (lfs/ioctl) request types */ +enum hsm_user_action { + HUA_NONE = 1, /* no action (noop) */ + HUA_ARCHIVE = 10, /* copy to hsm */ + HUA_RESTORE = 11, /* prestage */ + HUA_RELEASE = 12, /* drop ost objects */ + HUA_REMOVE = 13, /* remove from archive */ + HUA_CANCEL = 14 /* cancel a request */ +}; + +static inline char *hsm_user_action2name(enum hsm_user_action a) +{ + switch (a) { + case HUA_NONE: return "NOOP"; + case HUA_ARCHIVE: return "ARCHIVE"; + case HUA_RESTORE: return "RESTORE"; + case HUA_RELEASE: return "RELEASE"; + case HUA_REMOVE: return "REMOVE"; + case HUA_CANCEL: return "CANCEL"; + default: return "UNKNOWN"; + } +} + +/* + * List of hr_flags (bit field) + */ +#define HSM_FORCE_ACTION 0x0001 +/* used by CT, connot be set by user */ +#define HSM_GHOST_COPY 0x0002 + +/** + * Contains all the fixed part of struct hsm_user_request. + * + */ +struct hsm_request { + __u32 hr_action; /* enum hsm_user_action */ + __u32 hr_archive_id; /* archive id, used only with HUA_ARCHIVE */ + __u64 hr_flags; /* request flags */ + __u32 hr_itemcount; /* item count in hur_user_item vector */ + __u32 hr_data_len; +}; + +struct hsm_user_item { + lustre_fid hui_fid; + struct hsm_extent hui_extent; +} __attribute__((packed)); + +struct hsm_user_request { + struct hsm_request hur_request; + struct hsm_user_item hur_user_item[0]; + /* extra data blob at end of struct (after all + * hur_user_items), only use helpers to access it + */ +} __attribute__((packed)); + +/** Return pointer to data field in a hsm user request */ +static inline void *hur_data(struct hsm_user_request *hur) +{ + return &(hur->hur_user_item[hur->hur_request.hr_itemcount]); +} + +/** + * Compute the current length of the provided hsm_user_request. This returns -1 + * instead of an errno because ssize_t is defined to be only [ -1, SSIZE_MAX ] + * + * return -1 on bounds check error. 
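+ *
+ * Minimal validation sketch (the names "hur" and "rq_len" are assumptions
+ * for this example, not part of the API): a handler that received a
+ * request blob of rq_len bytes could check the declared size before
+ * walking hur_user_item[]:
+ *
+ *	ssize_t want = hur_len(hur);
+ *
+ *	if (want < 0 || want > rq_len)
+ *		return -EPROTO;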
+ */ +static inline ssize_t hur_len(struct hsm_user_request *hur) +{ + __u64 size; + + /* can't overflow a __u64 since hr_itemcount is only __u32 */ + size = offsetof(struct hsm_user_request, hur_user_item[0]) + + (__u64)hur->hur_request.hr_itemcount * + sizeof(hur->hur_user_item[0]) + hur->hur_request.hr_data_len; + + if (size != (ssize_t)size) + return -1; + + return size; +} + +/****** HSM RPCs to copytool *****/ +/* Message types the copytool may receive */ +enum hsm_message_type { + HMT_ACTION_LIST = 100, /* message is a hsm_action_list */ +}; + +/* Actions the copytool may be instructed to take for a given action_item */ +enum hsm_copytool_action { + HSMA_NONE = 10, /* no action */ + HSMA_ARCHIVE = 20, /* arbitrary offset */ + HSMA_RESTORE = 21, + HSMA_REMOVE = 22, + HSMA_CANCEL = 23 +}; + +static inline char *hsm_copytool_action2name(enum hsm_copytool_action a) +{ + switch (a) { + case HSMA_NONE: return "NOOP"; + case HSMA_ARCHIVE: return "ARCHIVE"; + case HSMA_RESTORE: return "RESTORE"; + case HSMA_REMOVE: return "REMOVE"; + case HSMA_CANCEL: return "CANCEL"; + default: return "UNKNOWN"; + } +} + +/* Copytool item action description */ +struct hsm_action_item { + __u32 hai_len; /* valid size of this struct */ + __u32 hai_action; /* hsm_copytool_action, but use known size */ + lustre_fid hai_fid; /* Lustre FID to operated on */ + lustre_fid hai_dfid; /* fid used for data access */ + struct hsm_extent hai_extent; /* byte range to operate on */ + __u64 hai_cookie; /* action cookie from coordinator */ + __u64 hai_gid; /* grouplock id */ + char hai_data[0]; /* variable length */ +} __attribute__((packed)); + +/* + * helper function which print in hexa the first bytes of + * hai opaque field + * \param hai [IN] record to print + * \param buffer [OUT] output buffer + * \param len [IN] max buffer len + * \retval buffer + */ +static inline char *hai_dump_data_field(struct hsm_action_item *hai, + char *buffer, int len) +{ + int i, sz, data_len; + char *ptr; + + ptr = buffer; + sz = len; + data_len = hai->hai_len - sizeof(*hai); + for (i = 0 ; (i < data_len) && (sz > 0) ; i++) { + int cnt; + + cnt = snprintf(ptr, sz, "%.2X", + (unsigned char)hai->hai_data[i]); + ptr += cnt; + sz -= cnt; + } + *ptr = '\0'; + return buffer; +} + +/* Copytool action list */ +#define HAL_VERSION 1 +#define HAL_MAXSIZE LNET_MTU /* bytes, used in userspace only */ +struct hsm_action_list { + __u32 hal_version; + __u32 hal_count; /* number of hai's to follow */ + __u64 hal_compound_id; /* returned by coordinator */ + __u64 hal_flags; + __u32 hal_archive_id; /* which archive backend */ + __u32 padding1; + char hal_fsname[0]; /* null-terminated */ + /* struct hsm_action_item[hal_count] follows, aligned on 8-byte + boundaries. 
See hai_zero */ +} __attribute__((packed)); + +#ifndef HAVE_CFS_SIZE_ROUND +static inline int cfs_size_round (int val) +{ + return (val + 7) & (~0x7); +} +#define HAVE_CFS_SIZE_ROUND +#endif + +/* Return pointer to first hai in action list */ +static inline struct hsm_action_item *hai_zero(struct hsm_action_list *hal) +{ + return (struct hsm_action_item *)(hal->hal_fsname + + cfs_size_round(strlen(hal-> \ + hal_fsname) + + 1)); +} +/* Return pointer to next hai */ +static inline struct hsm_action_item *hai_next(struct hsm_action_item *hai) +{ + return (struct hsm_action_item *)((char *)hai + + cfs_size_round(hai->hai_len)); +} + +/* Return size of an hsm_action_list */ +static inline int hal_size(struct hsm_action_list *hal) +{ + int i, sz; + struct hsm_action_item *hai; + + sz = sizeof(*hal) + cfs_size_round(strlen(hal->hal_fsname) + 1); + hai = hai_zero(hal); + for (i = 0; i < hal->hal_count; i++, hai = hai_next(hai)) + sz += cfs_size_round(hai->hai_len); + + return sz; +} + +/* HSM file import + * describe the attributes to be set on imported file + */ +struct hsm_user_import { + __u64 hui_size; + __u64 hui_atime; + __u64 hui_mtime; + __u32 hui_atime_ns; + __u32 hui_mtime_ns; + __u32 hui_uid; + __u32 hui_gid; + __u32 hui_mode; + __u32 hui_archive_id; +}; + +/* Copytool progress reporting */ +#define HP_FLAG_COMPLETED 0x01 +#define HP_FLAG_RETRY 0x02 + +struct hsm_progress { + lustre_fid hp_fid; + __u64 hp_cookie; + struct hsm_extent hp_extent; + __u16 hp_flags; + __u16 hp_errval; /* positive val */ + __u32 padding; +}; + +struct hsm_copy { + __u64 hc_data_version; + __u16 hc_flags; + __u16 hc_errval; /* positive val */ + __u32 padding; + struct hsm_action_item hc_hai; +}; + +/** @} lustreuser */ + +#endif /* _LUSTRE_USER_H */ diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre_acl.h b/kernel/drivers/staging/lustre/lustre/include/lustre_acl.h new file mode 100644 index 000000000..aa4cfa7b7 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre_acl.h @@ -0,0 +1,49 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/include/lustre_acl.h + */ + +#ifndef _LUSTRE_ACL_H +#define _LUSTRE_ACL_H + +#include +#include +#include + +#define LUSTRE_POSIX_ACL_MAX_ENTRIES 32 +#define LUSTRE_POSIX_ACL_MAX_SIZE \ + (sizeof(posix_acl_xattr_header) + \ + LUSTRE_POSIX_ACL_MAX_ENTRIES * sizeof(posix_acl_xattr_entry)) + +#endif diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre_capa.h b/kernel/drivers/staging/lustre/lustre/include/lustre_capa.h new file mode 100644 index 000000000..fe19534eb --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre_capa.h @@ -0,0 +1,305 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/include/lustre_capa.h + * + * Author: Lai Siyao + */ + +#ifndef __LINUX_CAPA_H_ +#define __LINUX_CAPA_H_ + +/** \defgroup capa capa + * + * @{ + */ + +/* + * capability + */ +#include +#include "lustre/lustre_idl.h" + +#define CAPA_TIMEOUT 1800 /* sec, == 30 min */ +#define CAPA_KEY_TIMEOUT (24 * 60 * 60) /* sec, == 1 days */ + +struct capa_hmac_alg { + const char *ha_name; + int ha_len; + int ha_keylen; +}; + +#define DEF_CAPA_HMAC_ALG(name, type, len, keylen) \ +[CAPA_HMAC_ALG_ ## type] = { \ + .ha_name = name, \ + .ha_len = len, \ + .ha_keylen = keylen, \ +} + +struct client_capa { + struct inode *inode; + struct list_head lli_list; /* link to lli_oss_capas */ +}; + +struct target_capa { + struct hlist_node c_hash; /* link to capa hash */ +}; + +struct obd_capa { + struct list_head c_list; /* link to capa_list */ + + struct lustre_capa c_capa; /* capa */ + atomic_t c_refc; /* ref count */ + unsigned long c_expiry; /* jiffies */ + spinlock_t c_lock; /* protect capa content */ + int c_site; + + union { + struct client_capa cli; + struct target_capa tgt; + } u; +}; + +enum { + CAPA_SITE_CLIENT = 0, + CAPA_SITE_SERVER, + CAPA_SITE_MAX +}; + +static inline struct lu_fid *capa_fid(struct lustre_capa *capa) +{ + return &capa->lc_fid; +} + +static inline __u64 capa_opc(struct lustre_capa *capa) +{ + return capa->lc_opc; +} + +static inline __u64 capa_uid(struct lustre_capa *capa) +{ + return capa->lc_uid; +} + +static inline __u64 capa_gid(struct lustre_capa *capa) +{ + return capa->lc_gid; +} + +static inline __u32 capa_flags(struct lustre_capa *capa) +{ + return capa->lc_flags & 0xffffff; +} + +static inline __u32 capa_alg(struct lustre_capa *capa) +{ + return (capa->lc_flags >> 24); +} + +static inline __u32 capa_keyid(struct lustre_capa *capa) +{ + return capa->lc_keyid; +} + +static inline __u64 capa_key_seq(struct lustre_capa_key *key) +{ + return key->lk_seq; +} + +static inline __u32 capa_key_keyid(struct lustre_capa_key *key) +{ + return key->lk_keyid; +} + +static inline __u32 capa_timeout(struct lustre_capa *capa) +{ + return capa->lc_timeout; +} + +static inline __u32 capa_expiry(struct lustre_capa *capa) +{ + return capa->lc_expiry; +} + +void _debug_capa(struct lustre_capa *, struct libcfs_debug_msg_data *, + const char *fmt, ...); +#define DEBUG_CAPA(level, capa, fmt, args...) \ +do { \ + if (((level) & D_CANTMASK) != 0 || \ + ((libcfs_debug & (level)) != 0 && \ + (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0)) { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, NULL); \ + _debug_capa((capa), &msgdata, fmt, ##args); \ + } \ +} while (0) + +#define DEBUG_CAPA_KEY(level, k, fmt, args...) 
\ +do { \ +CDEBUG(level, fmt " capability key@%p seq %llu keyid %u\n", \ + ##args, k, capa_key_seq(k), capa_key_keyid(k)); \ +} while (0) + +typedef int (* renew_capa_cb_t)(struct obd_capa *, struct lustre_capa *); + +/* obdclass/capa.c */ +extern struct list_head capa_list[]; +extern spinlock_t capa_lock; +extern int capa_count[]; +extern struct kmem_cache *capa_cachep; + +struct hlist_head *init_capa_hash(void); +void cleanup_capa_hash(struct hlist_head *hash); + +struct obd_capa *capa_add(struct hlist_head *hash, + struct lustre_capa *capa); +struct obd_capa *capa_lookup(struct hlist_head *hash, + struct lustre_capa *capa, int alive); + +int capa_hmac(__u8 *hmac, struct lustre_capa *capa, __u8 *key); +int capa_encrypt_id(__u32 *d, __u32 *s, __u8 *key, int keylen); +int capa_decrypt_id(__u32 *d, __u32 *s, __u8 *key, int keylen); +void capa_cpy(void *dst, struct obd_capa *ocapa); +static inline struct obd_capa *alloc_capa(int site) +{ + struct obd_capa *ocapa; + + if (unlikely(site != CAPA_SITE_CLIENT && site != CAPA_SITE_SERVER)) + return ERR_PTR(-EINVAL); + + OBD_SLAB_ALLOC_PTR(ocapa, capa_cachep); + if (unlikely(!ocapa)) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&ocapa->c_list); + atomic_set(&ocapa->c_refc, 1); + spin_lock_init(&ocapa->c_lock); + ocapa->c_site = site; + if (ocapa->c_site == CAPA_SITE_CLIENT) + INIT_LIST_HEAD(&ocapa->u.cli.lli_list); + else + INIT_HLIST_NODE(&ocapa->u.tgt.c_hash); + + return ocapa; +} + +static inline struct obd_capa *capa_get(struct obd_capa *ocapa) +{ + if (!ocapa) + return NULL; + + atomic_inc(&ocapa->c_refc); + return ocapa; +} + +static inline void capa_put(struct obd_capa *ocapa) +{ + if (!ocapa) + return; + + if (atomic_read(&ocapa->c_refc) == 0) { + DEBUG_CAPA(D_ERROR, &ocapa->c_capa, "refc is 0 for"); + LBUG(); + } + + if (atomic_dec_and_test(&ocapa->c_refc)) { + LASSERT(list_empty(&ocapa->c_list)); + if (ocapa->c_site == CAPA_SITE_CLIENT) { + LASSERT(list_empty(&ocapa->u.cli.lli_list)); + } else { + struct hlist_node *hnode; + + hnode = &ocapa->u.tgt.c_hash; + LASSERT(!hnode->next && !hnode->pprev); + } + OBD_SLAB_FREE(ocapa, capa_cachep, sizeof(*ocapa)); + } +} + +static inline int open_flags_to_accmode(int flags) +{ + int mode = flags; + + if ((mode + 1) & O_ACCMODE) + mode++; + if (mode & O_TRUNC) + mode |= 2; + + return mode; +} + +static inline __u64 capa_open_opc(int mode) +{ + return mode & FMODE_WRITE ? 
CAPA_OPC_OSS_WRITE : CAPA_OPC_OSS_READ; +} + +static inline void set_capa_expiry(struct obd_capa *ocapa) +{ + unsigned long expiry = cfs_time_sub((unsigned long)ocapa->c_capa.lc_expiry, + get_seconds()); + ocapa->c_expiry = cfs_time_add(cfs_time_current(), + cfs_time_seconds(expiry)); +} + +static inline int capa_is_expired_sec(struct lustre_capa *capa) +{ + return (capa->lc_expiry - get_seconds() <= 0); +} + +static inline int capa_is_expired(struct obd_capa *ocapa) +{ + return time_before_eq(ocapa->c_expiry, cfs_time_current()); +} + +static inline int capa_opc_supported(struct lustre_capa *capa, __u64 opc) +{ + return (capa_opc(capa) & opc) == opc; +} + +struct filter_capa_key { + struct list_head k_list; + struct lustre_capa_key k_key; +}; + +enum { + LC_ID_NONE = 0, + LC_ID_PLAIN = 1, + LC_ID_CONVERT = 2 +}; + +#define BYPASS_CAPA (struct lustre_capa *)ERR_PTR(-ENOENT) + +/** @} capa */ + +#endif /* __LINUX_CAPA_H_ */ diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre_cfg.h b/kernel/drivers/staging/lustre/lustre/include/lustre_cfg.h new file mode 100644 index 000000000..7b385b872 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre_cfg.h @@ -0,0 +1,293 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _LUSTRE_CFG_H +#define _LUSTRE_CFG_H + +/** \defgroup cfg cfg + * + * @{ + */ + +/* + * 1cf6 + * lcfG + */ +#define LUSTRE_CFG_VERSION 0x1cf60001 +#define LUSTRE_CFG_MAX_BUFCOUNT 8 + +#define LCFG_HDR_SIZE(count) \ + cfs_size_round(offsetof (struct lustre_cfg, lcfg_buflens[(count)])) + +/** If the LCFG_REQUIRED bit is set in a configuration command, + * then the client is required to understand this parameter + * in order to mount the filesystem. If it does not understand + * a REQUIRED command the client mount will fail. 
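+ *
+ * For example, a hypothetical record whose command field is
+ * (LCFG_PARAM | LCFG_REQUIRED) must be understood by the client: if it
+ * cannot parse that parameter it fails the mount rather than silently
+ * skipping the record.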
*/ +#define LCFG_REQUIRED 0x0001000 + +enum lcfg_command_type { + LCFG_ATTACH = 0x00cf001, /**< create a new obd instance */ + LCFG_DETACH = 0x00cf002, /**< destroy obd instance */ + LCFG_SETUP = 0x00cf003, /**< call type-specific setup */ + LCFG_CLEANUP = 0x00cf004, /**< call type-specific cleanup */ + LCFG_ADD_UUID = 0x00cf005, /**< add a nid to a niduuid */ + LCFG_DEL_UUID = 0x00cf006, /**< remove a nid from a niduuid */ + LCFG_MOUNTOPT = 0x00cf007, /**< create a profile (mdc, osc) */ + LCFG_DEL_MOUNTOPT = 0x00cf008, /**< destroy a profile */ + LCFG_SET_TIMEOUT = 0x00cf009, /**< set obd_timeout */ + LCFG_SET_UPCALL = 0x00cf00a, /**< deprecated */ + LCFG_ADD_CONN = 0x00cf00b, /**< add a failover niduuid to an obd */ + LCFG_DEL_CONN = 0x00cf00c, /**< remove a failover niduuid */ + LCFG_LOV_ADD_OBD = 0x00cf00d, /**< add an osc to a lov */ + LCFG_LOV_DEL_OBD = 0x00cf00e, /**< remove an osc from a lov */ + LCFG_PARAM = 0x00cf00f, /**< set a proc parameter */ + LCFG_MARKER = 0x00cf010, /**< metadata about next cfg rec */ + LCFG_LOG_START = 0x00ce011, /**< mgc only, process a cfg log */ + LCFG_LOG_END = 0x00ce012, /**< stop processing updates */ + LCFG_LOV_ADD_INA = 0x00ce013, /**< like LOV_ADD_OBD, inactive */ + LCFG_ADD_MDC = 0x00cf014, /**< add an mdc to a lmv */ + LCFG_DEL_MDC = 0x00cf015, /**< remove an mdc from a lmv */ + LCFG_SPTLRPC_CONF = 0x00ce016, /**< security */ + LCFG_POOL_NEW = 0x00ce020, /**< create an ost pool name */ + LCFG_POOL_ADD = 0x00ce021, /**< add an ost to a pool */ + LCFG_POOL_REM = 0x00ce022, /**< remove an ost from a pool */ + LCFG_POOL_DEL = 0x00ce023, /**< destroy an ost pool name */ + LCFG_SET_LDLM_TIMEOUT = 0x00ce030, /**< set ldlm_timeout */ + LCFG_PRE_CLEANUP = 0x00cf031, /**< call type-specific pre + * cleanup cleanup */ + LCFG_SET_PARAM = 0x00ce032, /**< use set_param syntax to set + *a proc parameters */ +}; + +struct lustre_cfg_bufs { + void *lcfg_buf[LUSTRE_CFG_MAX_BUFCOUNT]; + __u32 lcfg_buflen[LUSTRE_CFG_MAX_BUFCOUNT]; + __u32 lcfg_bufcount; +}; + +struct lustre_cfg { + __u32 lcfg_version; + __u32 lcfg_command; + + __u32 lcfg_num; + __u32 lcfg_flags; + __u64 lcfg_nid; + __u32 lcfg_nal; /* not used any more */ + + __u32 lcfg_bufcount; + __u32 lcfg_buflens[0]; +}; + +enum cfg_record_type { + PORTALS_CFG_TYPE = 1, + LUSTRE_CFG_TYPE = 123, +}; + +#define LUSTRE_CFG_BUFLEN(lcfg, idx) \ + ((lcfg)->lcfg_bufcount <= (idx) \ + ? 0 \ + : (lcfg)->lcfg_buflens[(idx)]) + +static inline void lustre_cfg_bufs_set(struct lustre_cfg_bufs *bufs, + __u32 index, + void *buf, + __u32 buflen) +{ + if (index >= LUSTRE_CFG_MAX_BUFCOUNT) + return; + if (bufs == NULL) + return; + + if (bufs->lcfg_bufcount <= index) + bufs->lcfg_bufcount = index + 1; + + bufs->lcfg_buf[index] = buf; + bufs->lcfg_buflen[index] = buflen; +} + +static inline void lustre_cfg_bufs_set_string(struct lustre_cfg_bufs *bufs, + __u32 index, + char *str) +{ + lustre_cfg_bufs_set(bufs, index, str, str ? 
strlen(str) + 1 : 0); +} + +static inline void lustre_cfg_bufs_reset(struct lustre_cfg_bufs *bufs, char *name) +{ + memset((bufs), 0, sizeof(*bufs)); + if (name) + lustre_cfg_bufs_set_string(bufs, 0, name); +} + +static inline void *lustre_cfg_buf(struct lustre_cfg *lcfg, int index) +{ + int i; + int offset; + int bufcount; + LASSERT (lcfg != NULL); + LASSERT (index >= 0); + + bufcount = lcfg->lcfg_bufcount; + if (index >= bufcount) + return NULL; + + offset = LCFG_HDR_SIZE(lcfg->lcfg_bufcount); + for (i = 0; i < index; i++) + offset += cfs_size_round(lcfg->lcfg_buflens[i]); + return (char *)lcfg + offset; +} + +static inline void lustre_cfg_bufs_init(struct lustre_cfg_bufs *bufs, + struct lustre_cfg *lcfg) +{ + int i; + bufs->lcfg_bufcount = lcfg->lcfg_bufcount; + for (i = 0; i < bufs->lcfg_bufcount; i++) { + bufs->lcfg_buflen[i] = lcfg->lcfg_buflens[i]; + bufs->lcfg_buf[i] = lustre_cfg_buf(lcfg, i); + } +} + +static inline char *lustre_cfg_string(struct lustre_cfg *lcfg, int index) +{ + char *s; + + if (lcfg->lcfg_buflens[index] == 0) + return NULL; + + s = lustre_cfg_buf(lcfg, index); + if (s == NULL) + return NULL; + + /* + * make sure it's NULL terminated, even if this kills a char + * of data. Try to use the padding first though. + */ + if (s[lcfg->lcfg_buflens[index] - 1] != '\0') { + int last = min((int)lcfg->lcfg_buflens[index], + cfs_size_round(lcfg->lcfg_buflens[index]) - 1); + char lost = s[last]; + s[last] = '\0'; + if (lost != '\0') { + CWARN("Truncated buf %d to '%s' (lost '%c'...)\n", + index, s, lost); + } + } + return s; +} + +static inline int lustre_cfg_len(__u32 bufcount, __u32 *buflens) +{ + int i; + int len; + + len = LCFG_HDR_SIZE(bufcount); + for (i = 0; i < bufcount; i++) + len += cfs_size_round(buflens[i]); + + return cfs_size_round(len); +} + + +#include "obd_support.h" + +static inline struct lustre_cfg *lustre_cfg_new(int cmd, + struct lustre_cfg_bufs *bufs) +{ + struct lustre_cfg *lcfg; + char *ptr; + int i; + + OBD_ALLOC(lcfg, lustre_cfg_len(bufs->lcfg_bufcount, + bufs->lcfg_buflen)); + if (!lcfg) + return ERR_PTR(-ENOMEM); + + lcfg->lcfg_version = LUSTRE_CFG_VERSION; + lcfg->lcfg_command = cmd; + lcfg->lcfg_bufcount = bufs->lcfg_bufcount; + + ptr = (char *)lcfg + LCFG_HDR_SIZE(lcfg->lcfg_bufcount); + for (i = 0; i < lcfg->lcfg_bufcount; i++) { + lcfg->lcfg_buflens[i] = bufs->lcfg_buflen[i]; + LOGL((char *)bufs->lcfg_buf[i], bufs->lcfg_buflen[i], ptr); + } + return lcfg; +} + +static inline void lustre_cfg_free(struct lustre_cfg *lcfg) +{ + int len; + + len = lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens); + + OBD_FREE(lcfg, len); + return; +} + +static inline int lustre_cfg_sanity_check(void *buf, int len) +{ + struct lustre_cfg *lcfg = (struct lustre_cfg *)buf; + + if (!lcfg) + return -EINVAL; + + /* check that the first bits of the struct are valid */ + if (len < LCFG_HDR_SIZE(0)) + return -EINVAL; + + if (lcfg->lcfg_version != LUSTRE_CFG_VERSION) + return -EINVAL; + + if (lcfg->lcfg_bufcount >= LUSTRE_CFG_MAX_BUFCOUNT) + return -EINVAL; + + /* check that the buflens are valid */ + if (len < LCFG_HDR_SIZE(lcfg->lcfg_bufcount)) + return -EINVAL; + + /* make sure all the pointers point inside the data */ + if (len < lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens)) + return -EINVAL; + + return 0; +} + +#include "lustre/lustre_user.h" + +/** @} cfg */ + +#endif /* _LUSTRE_CFG_H */ diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre_debug.h b/kernel/drivers/staging/lustre/lustre/include/lustre_debug.h new file mode 100644 index 
000000000..6c92d0bc9 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre_debug.h @@ -0,0 +1,56 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _LUSTRE_DEBUG_H +#define _LUSTRE_DEBUG_H + +/** \defgroup debug debug + * + * @{ + */ + +#include "lustre_net.h" +#include "obd.h" + +/* lib/debug.c */ +void dump_lniobuf(struct niobuf_local *lnb); +int dump_req(struct ptlrpc_request *req); +int block_debug_setup(void *addr, int len, __u64 off, __u64 id); +int block_debug_check(char *who, void *addr, int len, __u64 off, __u64 id); + +/** @} debug */ + +#endif diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre_disk.h b/kernel/drivers/staging/lustre/lustre/include/lustre_disk.h new file mode 100644 index 000000000..9b2833131 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre_disk.h @@ -0,0 +1,547 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre_disk.h + * + * Lustre disk format definitions. 
+ * + * Author: Nathan Rutman + */ + +#ifndef _LUSTRE_DISK_H +#define _LUSTRE_DISK_H + +/** \defgroup disk disk + * + * @{ + */ + +#include "../../include/linux/libcfs/libcfs.h" +#include "../../include/linux/lnet/types.h" +#include + +/****************** on-disk files *********************/ + +#define MDT_LOGS_DIR "LOGS" /* COMPAT_146 */ +#define MOUNT_CONFIGS_DIR "CONFIGS" +#define CONFIGS_FILE "mountdata" +/** Persistent mount data are stored on the disk in this file. */ +#define MOUNT_DATA_FILE MOUNT_CONFIGS_DIR"/"CONFIGS_FILE +#define LAST_RCVD "last_rcvd" +#define LOV_OBJID "lov_objid" +#define LOV_OBJSEQ "lov_objseq" +#define HEALTH_CHECK "health_check" +#define CAPA_KEYS "capa_keys" +#define CHANGELOG_USERS "changelog_users" +#define MGS_NIDTBL_DIR "NIDTBL_VERSIONS" +#define QMT_DIR "quota_master" +#define QSD_DIR "quota_slave" +#define HSM_ACTIONS "hsm_actions" + +/****************** persistent mount data *********************/ + +#define LDD_F_SV_TYPE_MDT 0x0001 +#define LDD_F_SV_TYPE_OST 0x0002 +#define LDD_F_SV_TYPE_MGS 0x0004 +#define LDD_F_SV_TYPE_MASK (LDD_F_SV_TYPE_MDT | \ + LDD_F_SV_TYPE_OST | \ + LDD_F_SV_TYPE_MGS) +#define LDD_F_SV_ALL 0x0008 +/** need an index assignment */ +#define LDD_F_NEED_INDEX 0x0010 +/** never registered */ +#define LDD_F_VIRGIN 0x0020 +/** update the config logs for this server */ +#define LDD_F_UPDATE 0x0040 +/** rewrite the LDD */ +#define LDD_F_REWRITE_LDD 0x0080 +/** regenerate config logs for this fs or server */ +#define LDD_F_WRITECONF 0x0100 +/** COMPAT_14 */ +#define LDD_F_UPGRADE14 0x0200 +/** process as lctl conf_param */ +#define LDD_F_PARAM 0x0400 +/** all nodes are specified as service nodes */ +#define LDD_F_NO_PRIMNODE 0x1000 +/** IR enable flag */ +#define LDD_F_IR_CAPABLE 0x2000 +/** the MGS refused to register the target. */ +#define LDD_F_ERROR 0x4000 +/** process at lctl conf_param */ +#define LDD_F_PARAM2 0x8000 + +/* opc for target register */ +#define LDD_F_OPC_REG 0x10000000 +#define LDD_F_OPC_UNREG 0x20000000 +#define LDD_F_OPC_READY 0x40000000 +#define LDD_F_OPC_MASK 0xf0000000 + +#define LDD_F_ONDISK_MASK (LDD_F_SV_TYPE_MASK) + +#define LDD_F_MASK 0xFFFF + +enum ldd_mount_type { + LDD_MT_EXT3 = 0, + LDD_MT_LDISKFS, + LDD_MT_SMFS, + LDD_MT_REISERFS, + LDD_MT_LDISKFS2, + LDD_MT_ZFS, + LDD_MT_LAST +}; + +static inline char *mt_str(enum ldd_mount_type mt) +{ + static char *mount_type_string[] = { + "ext3", + "ldiskfs", + "smfs", + "reiserfs", + "ldiskfs2", + "zfs", + }; + return mount_type_string[mt]; +} + +static inline char *mt_type(enum ldd_mount_type mt) +{ + static char *mount_type_string[] = { + "osd-ldiskfs", + "osd-ldiskfs", + "osd-smfs", + "osd-reiserfs", + "osd-ldiskfs", + "osd-zfs", + }; + return mount_type_string[mt]; +} + +#define LDD_INCOMPAT_SUPP 0 +#define LDD_ROCOMPAT_SUPP 0 + +#define LDD_MAGIC 0x1dd00001 + +/* On-disk configuration file. In host-endian order. 
*/ +struct lustre_disk_data { + __u32 ldd_magic; + __u32 ldd_feature_compat; /* compatible feature flags */ + __u32 ldd_feature_rocompat;/* read-only compatible feature flags */ + __u32 ldd_feature_incompat;/* incompatible feature flags */ + + __u32 ldd_config_ver; /* config rewrite count - not used */ + __u32 ldd_flags; /* LDD_SV_TYPE */ + __u32 ldd_svindex; /* server index (0001), must match + svname */ + __u32 ldd_mount_type; /* target fs type LDD_MT_* */ + char ldd_fsname[64]; /* filesystem this server is part of, + MTI_NAME_MAXLEN */ + char ldd_svname[64]; /* this server's name (lustre-mdt0001)*/ + __u8 ldd_uuid[40]; /* server UUID (COMPAT_146) */ + +/*200*/ char ldd_userdata[1024 - 200]; /* arbitrary user string */ +/*1024*/__u8 ldd_padding[4096 - 1024]; +/*4096*/char ldd_mount_opts[4096]; /* target fs mount opts */ +/*8192*/char ldd_params[4096]; /* key=value pairs */ +}; + + +#define IS_MDT(data) ((data)->lsi_flags & LDD_F_SV_TYPE_MDT) +#define IS_OST(data) ((data)->lsi_flags & LDD_F_SV_TYPE_OST) +#define IS_MGS(data) ((data)->lsi_flags & LDD_F_SV_TYPE_MGS) +#define IS_SERVER(data) ((data)->lsi_flags & (LDD_F_SV_TYPE_MGS | \ + LDD_F_SV_TYPE_MDT | LDD_F_SV_TYPE_OST)) +#define MT_STR(data) mt_str((data)->ldd_mount_type) + +/* Make the mdt/ost server obd name based on the filesystem name */ +static inline int server_make_name(__u32 flags, __u16 index, char *fs, + char *name) +{ + if (flags & (LDD_F_SV_TYPE_MDT | LDD_F_SV_TYPE_OST)) { + if (!(flags & LDD_F_SV_ALL)) + sprintf(name, "%.8s%c%s%04x", fs, + (flags & LDD_F_VIRGIN) ? ':' : + ((flags & LDD_F_WRITECONF) ? '=' : '-'), + (flags & LDD_F_SV_TYPE_MDT) ? "MDT" : "OST", + index); + } else if (flags & LDD_F_SV_TYPE_MGS) { + sprintf(name, "MGS"); + } else { + CERROR("unknown server type %#x\n", flags); + return 1; + } + return 0; +} + +/****************** mount command *********************/ + +/* The lmd is only used internally by Lustre; mount simply passes + everything as string options */ + +#define LMD_MAGIC 0xbdacbd03 + +/* gleaned from the mount command - no persistent info here */ +struct lustre_mount_data { + __u32 lmd_magic; + __u32 lmd_flags; /* lustre mount flags */ + int lmd_mgs_failnodes; /* mgs failover node count */ + int lmd_exclude_count; + int lmd_recovery_time_soft; + int lmd_recovery_time_hard; + char *lmd_dev; /* device name */ + char *lmd_profile; /* client only */ + char *lmd_mgssec; /* sptlrpc flavor to mgs */ + char *lmd_opts; /* lustre mount options (as opposed to + _device_ mount options) */ + char *lmd_params; /* lustre params */ + __u32 *lmd_exclude; /* array of OSTs to ignore */ + char *lmd_mgs; /* MGS nid */ + char *lmd_osd_type; /* OSD type */ +}; + +#define LMD_FLG_SERVER 0x0001 /* Mounting a server */ +#define LMD_FLG_CLIENT 0x0002 /* Mounting a client */ +#define LMD_FLG_ABORT_RECOV 0x0008 /* Abort recovery */ +#define LMD_FLG_NOSVC 0x0010 /* Only start MGS/MGC for servers, + no other services */ +#define LMD_FLG_NOMGS 0x0020 /* Only start target for servers, reusing + existing MGS services */ +#define LMD_FLG_WRITECONF 0x0040 /* Rewrite config log */ +#define LMD_FLG_NOIR 0x0080 /* NO imperative recovery */ +#define LMD_FLG_NOSCRUB 0x0100 /* Do not trigger scrub automatically */ +#define LMD_FLG_MGS 0x0200 /* Also start MGS along with server */ +#define LMD_FLG_IAM 0x0400 /* IAM dir */ +#define LMD_FLG_NO_PRIMNODE 0x0800 /* all nodes are service nodes */ +#define LMD_FLG_VIRGIN 0x1000 /* the service registers first time */ +#define LMD_FLG_UPDATE 0x2000 /* update parameters */ +#define LMD_FLG_HSM 
0x4000 /* Start coordinator */ + +#define lmd_is_client(x) ((x)->lmd_flags & LMD_FLG_CLIENT) + + +/****************** last_rcvd file *********************/ + +/** version recovery epoch */ +#define LR_EPOCH_BITS 32 +#define lr_epoch(a) ((a) >> LR_EPOCH_BITS) +#define LR_EXPIRE_INTERVALS 16 /**< number of intervals to track transno */ +#define ENOENT_VERSION 1 /** 'virtual' version of non-existent object */ + +#define LR_SERVER_SIZE 512 +#define LR_CLIENT_START 8192 +#define LR_CLIENT_SIZE 128 +#if LR_CLIENT_START < LR_SERVER_SIZE +#error "Can't have LR_CLIENT_START < LR_SERVER_SIZE" +#endif + +/* + * This limit is arbitrary (131072 clients on x86), but it is convenient to use + * 2^n * PAGE_CACHE_SIZE * 8 for the number of bits that fit an order-n allocation. + * If we need more than 131072 clients (order-2 allocation on x86) then this + * should become an array of single-page pointers that are allocated on demand. + */ +#if (128 * 1024UL) > (PAGE_CACHE_SIZE * 8) +#define LR_MAX_CLIENTS (128 * 1024UL) +#else +#define LR_MAX_CLIENTS (PAGE_CACHE_SIZE * 8) +#endif + +/** COMPAT_146: this is an OST (temporary) */ +#define OBD_COMPAT_OST 0x00000002 +/** COMPAT_146: this is an MDT (temporary) */ +#define OBD_COMPAT_MDT 0x00000004 +/** 2.0 server, interop flag to show server version is changed */ +#define OBD_COMPAT_20 0x00000008 + +/** MDS handles LOV_OBJID file */ +#define OBD_ROCOMPAT_LOVOBJID 0x00000001 + +/** OST handles group subdirs */ +#define OBD_INCOMPAT_GROUPS 0x00000001 +/** this is an OST */ +#define OBD_INCOMPAT_OST 0x00000002 +/** this is an MDT */ +#define OBD_INCOMPAT_MDT 0x00000004 +/** common last_rvcd format */ +#define OBD_INCOMPAT_COMMON_LR 0x00000008 +/** FID is enabled */ +#define OBD_INCOMPAT_FID 0x00000010 +/** Size-on-MDS is enabled */ +#define OBD_INCOMPAT_SOM 0x00000020 +/** filesystem using iam format to store directory entries */ +#define OBD_INCOMPAT_IAM_DIR 0x00000040 +/** LMA attribute contains per-inode incompatible flags */ +#define OBD_INCOMPAT_LMA 0x00000080 +/** lmm_stripe_count has been shrunk from __u32 to __u16 and the remaining 16 + * bits are now used to store a generation. Once we start changing the layout + * and bumping the generation, old versions expecting a 32-bit lmm_stripe_count + * will be confused by interpreting stripe_count | gen << 16 as the actual + * stripe count */ +#define OBD_INCOMPAT_LMM_VER 0x00000100 +/** multiple OI files for MDT */ +#define OBD_INCOMPAT_MULTI_OI 0x00000200 + +/* Data stored per server at the head of the last_rcvd file. In le32 order. 
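A small arithmetic sketch (illustrative only, the helper names are placeholders) of what the constants above imply: per-server data sits at the head of last_rcvd, client slots are fixed-size records starting at LR_CLIENT_START, and the version-recovery epoch occupies the top LR_EPOCH_BITS of a transaction number, which is what lr_epoch() extracts.

static inline loff_t example_lcd_offset(unsigned int client_idx)
{
	/* byte offset of client slot "client_idx" within last_rcvd */
	return (loff_t)LR_CLIENT_START + (loff_t)client_idx * LR_CLIENT_SIZE;
}

static inline __u64 example_versioned_transno(__u32 epoch, __u64 seq)
{
	/* assuming seq fits below LR_EPOCH_BITS, lr_epoch() recovers "epoch" */
	return ((__u64)epoch << LR_EPOCH_BITS) | seq;
}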
+ This should be common to filter_internal.h, lustre_mds.h */ +struct lr_server_data { + __u8 lsd_uuid[40]; /* server UUID */ + __u64 lsd_last_transno; /* last completed transaction ID */ + __u64 lsd_compat14; /* reserved - compat with old last_rcvd */ + __u64 lsd_mount_count; /* incarnation number */ + __u32 lsd_feature_compat; /* compatible feature flags */ + __u32 lsd_feature_rocompat;/* read-only compatible feature flags */ + __u32 lsd_feature_incompat;/* incompatible feature flags */ + __u32 lsd_server_size; /* size of server data area */ + __u32 lsd_client_start; /* start of per-client data area */ + __u16 lsd_client_size; /* size of per-client data area */ + __u16 lsd_subdir_count; /* number of subdirectories for objects */ + __u64 lsd_catalog_oid; /* recovery catalog object id */ + __u32 lsd_catalog_ogen; /* recovery catalog inode generation */ + __u8 lsd_peeruuid[40]; /* UUID of MDS associated with this OST */ + __u32 lsd_osd_index; /* index number of OST in LOV */ + __u32 lsd_padding1; /* was lsd_mdt_index, unused in 2.4.0 */ + __u32 lsd_start_epoch; /* VBR: start epoch from last boot */ + /** transaction values since lsd_trans_table_time */ + __u64 lsd_trans_table[LR_EXPIRE_INTERVALS]; + /** start point of transno table below */ + __u32 lsd_trans_table_time; /* time of first slot in table above */ + __u32 lsd_expire_intervals; /* LR_EXPIRE_INTERVALS */ + __u8 lsd_padding[LR_SERVER_SIZE - 288]; +}; + +/* Data stored per client in the last_rcvd file. In le32 order. */ +struct lsd_client_data { + __u8 lcd_uuid[40]; /* client UUID */ + __u64 lcd_last_transno; /* last completed transaction ID */ + __u64 lcd_last_xid; /* xid for the last transaction */ + __u32 lcd_last_result; /* result from last RPC */ + __u32 lcd_last_data; /* per-op data (disposition for open &c.) 
*/ + /* for MDS_CLOSE requests */ + __u64 lcd_last_close_transno; /* last completed transaction ID */ + __u64 lcd_last_close_xid; /* xid for the last transaction */ + __u32 lcd_last_close_result; /* result from last RPC */ + __u32 lcd_last_close_data; /* per-op data */ + /* VBR: last versions */ + __u64 lcd_pre_versions[4]; + __u32 lcd_last_epoch; + /** orphans handling for delayed export rely on that */ + __u32 lcd_first_epoch; + __u8 lcd_padding[LR_CLIENT_SIZE - 128]; +}; + +/* bug20354: the lcd_uuid for export of clients may be wrong */ +static inline void check_lcd(char *obd_name, int index, + struct lsd_client_data *lcd) +{ + int length = sizeof(lcd->lcd_uuid); + if (strnlen((char*)lcd->lcd_uuid, length) == length) { + lcd->lcd_uuid[length - 1] = '\0'; + + LCONSOLE_ERROR("the client UUID (%s) on %s for exports stored in last_rcvd(index = %d) is bad!\n", + lcd->lcd_uuid, obd_name, index); + } +} + +/* last_rcvd handling */ +static inline void lsd_le_to_cpu(struct lr_server_data *buf, + struct lr_server_data *lsd) +{ + int i; + memcpy(lsd->lsd_uuid, buf->lsd_uuid, sizeof(lsd->lsd_uuid)); + lsd->lsd_last_transno = le64_to_cpu(buf->lsd_last_transno); + lsd->lsd_compat14 = le64_to_cpu(buf->lsd_compat14); + lsd->lsd_mount_count = le64_to_cpu(buf->lsd_mount_count); + lsd->lsd_feature_compat = le32_to_cpu(buf->lsd_feature_compat); + lsd->lsd_feature_rocompat = le32_to_cpu(buf->lsd_feature_rocompat); + lsd->lsd_feature_incompat = le32_to_cpu(buf->lsd_feature_incompat); + lsd->lsd_server_size = le32_to_cpu(buf->lsd_server_size); + lsd->lsd_client_start = le32_to_cpu(buf->lsd_client_start); + lsd->lsd_client_size = le16_to_cpu(buf->lsd_client_size); + lsd->lsd_subdir_count = le16_to_cpu(buf->lsd_subdir_count); + lsd->lsd_catalog_oid = le64_to_cpu(buf->lsd_catalog_oid); + lsd->lsd_catalog_ogen = le32_to_cpu(buf->lsd_catalog_ogen); + memcpy(lsd->lsd_peeruuid, buf->lsd_peeruuid, sizeof(lsd->lsd_peeruuid)); + lsd->lsd_osd_index = le32_to_cpu(buf->lsd_osd_index); + lsd->lsd_padding1 = le32_to_cpu(buf->lsd_padding1); + lsd->lsd_start_epoch = le32_to_cpu(buf->lsd_start_epoch); + for (i = 0; i < LR_EXPIRE_INTERVALS; i++) + lsd->lsd_trans_table[i] = le64_to_cpu(buf->lsd_trans_table[i]); + lsd->lsd_trans_table_time = le32_to_cpu(buf->lsd_trans_table_time); + lsd->lsd_expire_intervals = le32_to_cpu(buf->lsd_expire_intervals); +} + +static inline void lsd_cpu_to_le(struct lr_server_data *lsd, + struct lr_server_data *buf) +{ + int i; + memcpy(buf->lsd_uuid, lsd->lsd_uuid, sizeof(buf->lsd_uuid)); + buf->lsd_last_transno = cpu_to_le64(lsd->lsd_last_transno); + buf->lsd_compat14 = cpu_to_le64(lsd->lsd_compat14); + buf->lsd_mount_count = cpu_to_le64(lsd->lsd_mount_count); + buf->lsd_feature_compat = cpu_to_le32(lsd->lsd_feature_compat); + buf->lsd_feature_rocompat = cpu_to_le32(lsd->lsd_feature_rocompat); + buf->lsd_feature_incompat = cpu_to_le32(lsd->lsd_feature_incompat); + buf->lsd_server_size = cpu_to_le32(lsd->lsd_server_size); + buf->lsd_client_start = cpu_to_le32(lsd->lsd_client_start); + buf->lsd_client_size = cpu_to_le16(lsd->lsd_client_size); + buf->lsd_subdir_count = cpu_to_le16(lsd->lsd_subdir_count); + buf->lsd_catalog_oid = cpu_to_le64(lsd->lsd_catalog_oid); + buf->lsd_catalog_ogen = cpu_to_le32(lsd->lsd_catalog_ogen); + memcpy(buf->lsd_peeruuid, lsd->lsd_peeruuid, sizeof(buf->lsd_peeruuid)); + buf->lsd_osd_index = cpu_to_le32(lsd->lsd_osd_index); + buf->lsd_padding1 = cpu_to_le32(lsd->lsd_padding1); + buf->lsd_start_epoch = cpu_to_le32(lsd->lsd_start_epoch); + for (i = 0; i < LR_EXPIRE_INTERVALS; 
i++) + buf->lsd_trans_table[i] = cpu_to_le64(lsd->lsd_trans_table[i]); + buf->lsd_trans_table_time = cpu_to_le32(lsd->lsd_trans_table_time); + buf->lsd_expire_intervals = cpu_to_le32(lsd->lsd_expire_intervals); +} + +static inline void lcd_le_to_cpu(struct lsd_client_data *buf, + struct lsd_client_data *lcd) +{ + memcpy(lcd->lcd_uuid, buf->lcd_uuid, sizeof (lcd->lcd_uuid)); + lcd->lcd_last_transno = le64_to_cpu(buf->lcd_last_transno); + lcd->lcd_last_xid = le64_to_cpu(buf->lcd_last_xid); + lcd->lcd_last_result = le32_to_cpu(buf->lcd_last_result); + lcd->lcd_last_data = le32_to_cpu(buf->lcd_last_data); + lcd->lcd_last_close_transno = le64_to_cpu(buf->lcd_last_close_transno); + lcd->lcd_last_close_xid = le64_to_cpu(buf->lcd_last_close_xid); + lcd->lcd_last_close_result = le32_to_cpu(buf->lcd_last_close_result); + lcd->lcd_last_close_data = le32_to_cpu(buf->lcd_last_close_data); + lcd->lcd_pre_versions[0] = le64_to_cpu(buf->lcd_pre_versions[0]); + lcd->lcd_pre_versions[1] = le64_to_cpu(buf->lcd_pre_versions[1]); + lcd->lcd_pre_versions[2] = le64_to_cpu(buf->lcd_pre_versions[2]); + lcd->lcd_pre_versions[3] = le64_to_cpu(buf->lcd_pre_versions[3]); + lcd->lcd_last_epoch = le32_to_cpu(buf->lcd_last_epoch); + lcd->lcd_first_epoch = le32_to_cpu(buf->lcd_first_epoch); +} + +static inline void lcd_cpu_to_le(struct lsd_client_data *lcd, + struct lsd_client_data *buf) +{ + memcpy(buf->lcd_uuid, lcd->lcd_uuid, sizeof (lcd->lcd_uuid)); + buf->lcd_last_transno = cpu_to_le64(lcd->lcd_last_transno); + buf->lcd_last_xid = cpu_to_le64(lcd->lcd_last_xid); + buf->lcd_last_result = cpu_to_le32(lcd->lcd_last_result); + buf->lcd_last_data = cpu_to_le32(lcd->lcd_last_data); + buf->lcd_last_close_transno = cpu_to_le64(lcd->lcd_last_close_transno); + buf->lcd_last_close_xid = cpu_to_le64(lcd->lcd_last_close_xid); + buf->lcd_last_close_result = cpu_to_le32(lcd->lcd_last_close_result); + buf->lcd_last_close_data = cpu_to_le32(lcd->lcd_last_close_data); + buf->lcd_pre_versions[0] = cpu_to_le64(lcd->lcd_pre_versions[0]); + buf->lcd_pre_versions[1] = cpu_to_le64(lcd->lcd_pre_versions[1]); + buf->lcd_pre_versions[2] = cpu_to_le64(lcd->lcd_pre_versions[2]); + buf->lcd_pre_versions[3] = cpu_to_le64(lcd->lcd_pre_versions[3]); + buf->lcd_last_epoch = cpu_to_le32(lcd->lcd_last_epoch); + buf->lcd_first_epoch = cpu_to_le32(lcd->lcd_first_epoch); +} + +static inline __u64 lcd_last_transno(struct lsd_client_data *lcd) +{ + return (lcd->lcd_last_transno > lcd->lcd_last_close_transno ? + lcd->lcd_last_transno : lcd->lcd_last_close_transno); +} + +static inline __u64 lcd_last_xid(struct lsd_client_data *lcd) +{ + return (lcd->lcd_last_xid > lcd->lcd_last_close_xid ? 
+ lcd->lcd_last_xid : lcd->lcd_last_close_xid); +} + +/****************** superblock additional info *********************/ + +struct ll_sb_info; + +struct lustre_sb_info { + int lsi_flags; + struct obd_device *lsi_mgc; /* mgc obd */ + struct lustre_mount_data *lsi_lmd; /* mount command info */ + struct ll_sb_info *lsi_llsbi; /* add'l client sbi info */ + struct dt_device *lsi_dt_dev; /* dt device to access disk fs*/ + struct vfsmount *lsi_srv_mnt; /* the one server mount */ + atomic_t lsi_mounts; /* references to the srv_mnt */ + char lsi_svname[MTI_NAME_MAXLEN]; + char lsi_osd_obdname[64]; + char lsi_osd_uuid[64]; + struct obd_export *lsi_osd_exp; + char lsi_osd_type[16]; + char lsi_fstype[16]; + struct backing_dev_info lsi_bdi; /* each client mountpoint needs + own backing_dev_info */ +}; + +#define LSI_UMOUNT_FAILOVER 0x00200000 +#define LSI_BDI_INITIALIZED 0x00400000 + +#define s2lsi(sb) ((struct lustre_sb_info *)((sb)->s_fs_info)) +#define s2lsi_nocast(sb) ((sb)->s_fs_info) + +#define get_profile_name(sb) (s2lsi(sb)->lsi_lmd->lmd_profile) +#define get_mount_flags(sb) (s2lsi(sb)->lsi_lmd->lmd_flags) +#define get_mntdev_name(sb) (s2lsi(sb)->lsi_lmd->lmd_dev) + + +/****************** mount lookup info *********************/ + +struct lustre_mount_info { + char *lmi_name; + struct super_block *lmi_sb; + struct vfsmount *lmi_mnt; + struct list_head lmi_list_chain; +}; + +/****************** prototypes *********************/ + +/* obd_mount.c */ +int server_name2fsname(const char *svname, char *fsname, const char **endptr); +int server_name2index(const char *svname, __u32 *idx, const char **endptr); +int server_name2svname(const char *label, char *svname, const char **endptr, + size_t svsize); + +int lustre_put_lsi(struct super_block *sb); +int lustre_start_simple(char *obdname, char *type, char *uuid, + char *s1, char *s2, char *s3, char *s4); +int lustre_start_mgc(struct super_block *sb); +void lustre_register_client_fill_super(int (*cfs)(struct super_block *sb, + struct vfsmount *mnt)); +void lustre_register_kill_super_cb(void (*cfs)(struct super_block *sb)); +int lustre_common_put_super(struct super_block *sb); + + +int mgc_fsname2resid(char *fsname, struct ldlm_res_id *res_id, int type); + +/** @} disk */ + +#endif /* _LUSTRE_DISK_H */ diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre_dlm.h b/kernel/drivers/staging/lustre/lustre/include/lustre_dlm.h new file mode 100644 index 000000000..bac9902b5 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre_dlm.h @@ -0,0 +1,1480 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +/** \defgroup LDLM Lustre Distributed Lock Manager + * + * Lustre DLM is based on VAX DLM. + * Its two main roles are: + * - To provide locking assuring consistency of data on all Lustre nodes. + * - To allow clients to cache state protected by a lock by holding the + * lock until a conflicting lock is requested or it is expired by the LRU. + * + * @{ + */ + +#ifndef _LUSTRE_DLM_H__ +#define _LUSTRE_DLM_H__ + +#include "lustre_lib.h" +#include "lustre_net.h" +#include "lustre_import.h" +#include "lustre_handles.h" +#include "interval_tree.h" /* for interval_node{}, ldlm_extent */ +#include "lu_ref.h" + +#include "lustre_dlm_flags.h" + +struct obd_ops; +struct obd_device; + +#define OBD_LDLM_DEVICENAME "ldlm" + +#define LDLM_DEFAULT_LRU_SIZE (100 * num_online_cpus()) +#define LDLM_DEFAULT_MAX_ALIVE (cfs_time_seconds(36000)) +#define LDLM_CTIME_AGE_LIMIT (10) +#define LDLM_DEFAULT_PARALLEL_AST_LIMIT 1024 + +/** + * LDLM non-error return states + */ +typedef enum { + ELDLM_OK = 0, + + ELDLM_LOCK_CHANGED = 300, + ELDLM_LOCK_ABORTED = 301, + ELDLM_LOCK_REPLACED = 302, + ELDLM_NO_LOCK_DATA = 303, + ELDLM_LOCK_WOULDBLOCK = 304, + + ELDLM_NAMESPACE_EXISTS = 400, + ELDLM_BAD_NAMESPACE = 401 +} ldlm_error_t; + +/** + * LDLM namespace type. + * The "client" type is actually an indication that this is a narrow local view + * into complete namespace on the server. Such namespaces cannot make any + * decisions about lack of conflicts or do any autonomous lock granting without + * first speaking to a server. + */ +typedef enum { + LDLM_NAMESPACE_SERVER = 1 << 0, + LDLM_NAMESPACE_CLIENT = 1 << 1 +} ldlm_side_t; + +/** + * The blocking callback is overloaded to perform two functions. These flags + * indicate which operation should be performed. + */ +#define LDLM_CB_BLOCKING 1 +#define LDLM_CB_CANCELING 2 + +/** + * \name Lock Compatibility Matrix. + * + * A lock has both a type (extent, flock, inode bits, or plain) and a mode. + * Lock types are described in their respective implementation files: + * ldlm_{extent,flock,inodebits,plain}.c. + * + * There are six lock modes along with a compatibility matrix to indicate if + * two locks are compatible. + * + * - EX: Exclusive mode. Before a new file is created, MDS requests EX lock + * on the parent. + * - PW: Protective Write (normal write) mode. When a client requests a write + * lock from an OST, a lock with PW mode will be issued. + * - PR: Protective Read (normal read) mode. When a client requests a read from + * an OST, a lock with PR mode will be issued. Also, if the client opens a + * file for execution, it is granted a lock with PR mode. + * - CW: Concurrent Write mode. The type of lock that the MDS grants if a client + * requests a write lock during a file open operation. + * - CR Concurrent Read mode. 
When a client performs a path lookup, the MDS grants + * an inodebits lock with the CR mode on the intermediate path component. + * - NL Null mode. + * + *
+ *       NL  CR  CW  PR  PW  EX
+ *  NL    1   1   1   1   1   1
+ *  CR    1   1   1   1   1   0
+ *  CW    1   1   1   0   0   0
+ *  PR    1   1   0   1   0   0
+ *  PW    1   1   0   0   0   0
+ *  EX    1   0   0   0   0   0
+ * 
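The table above is exactly what lck_compat_array[] and lockmode_compat(), defined just below, encode. Reading the PR row as an illustrative check (the helper name is a placeholder):

static inline void example_check_pr_row(void)
{
	/* a granted PR lock tolerates new NL, CR and PR requests ... */
	LASSERT(lockmode_compat(LCK_PR, LCK_CR));
	LASSERT(lockmode_compat(LCK_PR, LCK_PR));
	/* ... but conflicts with CW, PW and EX */
	LASSERT(!lockmode_compat(LCK_PR, LCK_CW));
	LASSERT(!lockmode_compat(LCK_PR, LCK_PW));
	LASSERT(!lockmode_compat(LCK_PR, LCK_EX));
}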
+ */ +/** @{ */ +#define LCK_COMPAT_EX LCK_NL +#define LCK_COMPAT_PW (LCK_COMPAT_EX | LCK_CR) +#define LCK_COMPAT_PR (LCK_COMPAT_PW | LCK_PR) +#define LCK_COMPAT_CW (LCK_COMPAT_PW | LCK_CW) +#define LCK_COMPAT_CR (LCK_COMPAT_CW | LCK_PR | LCK_PW) +#define LCK_COMPAT_NL (LCK_COMPAT_CR | LCK_EX | LCK_GROUP) +#define LCK_COMPAT_GROUP (LCK_GROUP | LCK_NL) +#define LCK_COMPAT_COS (LCK_COS) +/** @} Lock Compatibility Matrix */ + +extern ldlm_mode_t lck_compat_array[]; + +static inline void lockmode_verify(ldlm_mode_t mode) +{ + LASSERT(mode > LCK_MINMODE && mode < LCK_MAXMODE); +} + +static inline int lockmode_compat(ldlm_mode_t exist_mode, ldlm_mode_t new_mode) +{ + return (lck_compat_array[exist_mode] & new_mode); +} + +/* + * + * cluster name spaces + * + */ + +#define DLM_OST_NAMESPACE 1 +#define DLM_MDS_NAMESPACE 2 + +/* XXX + - do we just separate this by security domains and use a prefix for + multiple namespaces in the same domain? + - +*/ + +/** + * Locking rules for LDLM: + * + * lr_lock + * + * lr_lock + * waiting_locks_spinlock + * + * lr_lock + * led_lock + * + * lr_lock + * ns_lock + * + * lr_lvb_mutex + * lr_lock + * + */ + +struct ldlm_pool; +struct ldlm_lock; +struct ldlm_resource; +struct ldlm_namespace; + +/** + * Operations on LDLM pools. + * LDLM pool is a pool of locks in the namespace without any implicitly + * specified limits. + * Locks in the pool are organized in LRU. + * Local memory pressure or server instructions (e.g. mempressure on server) + * can trigger freeing of locks from the pool + */ +struct ldlm_pool_ops { + /** Recalculate pool \a pl usage */ + int (*po_recalc)(struct ldlm_pool *pl); + /** Cancel at least \a nr locks from pool \a pl */ + int (*po_shrink)(struct ldlm_pool *pl, int nr, + gfp_t gfp_mask); + int (*po_setup)(struct ldlm_pool *pl, int limit); +}; + +/** One second for pools thread check interval. Each pool has own period. */ +#define LDLM_POOLS_THREAD_PERIOD (1) + +/** ~6% margin for modest pools. See ldlm_pool.c for details. */ +#define LDLM_POOLS_MODEST_MARGIN_SHIFT (4) + +/** Default recalc period for server side pools in sec. */ +#define LDLM_POOL_SRV_DEF_RECALC_PERIOD (1) + +/** Default recalc period for client side pools in sec. */ +#define LDLM_POOL_CLI_DEF_RECALC_PERIOD (10) + +/** + * LDLM pool structure to track granted locks. + * For purposes of determining when to release locks on e.g. memory pressure. + * This feature is commonly referred to as lru_resize. + */ +struct ldlm_pool { + /** Pool proc directory. */ + struct proc_dir_entry *pl_proc_dir; + /** Pool name, must be long enough to hold compound proc entry name. */ + char pl_name[100]; + /** Lock for protecting SLV/CLV updates. */ + spinlock_t pl_lock; + /** Number of allowed locks in in pool, both, client and server side. */ + atomic_t pl_limit; + /** Number of granted locks in */ + atomic_t pl_granted; + /** Grant rate per T. */ + atomic_t pl_grant_rate; + /** Cancel rate per T. */ + atomic_t pl_cancel_rate; + /** Server lock volume (SLV). Protected by pl_lock. */ + __u64 pl_server_lock_volume; + /** Current biggest client lock volume. Protected by pl_lock. */ + __u64 pl_client_lock_volume; + /** Lock volume factor. SLV on client is calculated as following: + * server_slv * lock_volume_factor. */ + atomic_t pl_lock_volume_factor; + /** Time when last SLV from server was obtained. */ + time_t pl_recalc_time; + /** Recalculation period for pool. */ + time_t pl_recalc_period; + /** Recalculation and shrink operations. 
*/ + const struct ldlm_pool_ops *pl_ops; + /** Number of planned locks for next period. */ + int pl_grant_plan; + /** Pool statistics. */ + struct lprocfs_stats *pl_stats; +}; + +typedef int (*ldlm_res_policy)(struct ldlm_namespace *, struct ldlm_lock **, + void *req_cookie, ldlm_mode_t mode, __u64 flags, + void *data); + +typedef int (*ldlm_cancel_for_recovery)(struct ldlm_lock *lock); + +/** + * LVB operations. + * LVB is Lock Value Block. This is a special opaque (to LDLM) value that could + * be associated with an LDLM lock and transferred from client to server and + * back. + * + * Currently LVBs are used by: + * - OSC-OST code to maintain current object size/times + * - layout lock code to return the layout when the layout lock is granted + */ +struct ldlm_valblock_ops { + int (*lvbo_init)(struct ldlm_resource *res); + int (*lvbo_update)(struct ldlm_resource *res, + struct ptlrpc_request *r, + int increase); + int (*lvbo_free)(struct ldlm_resource *res); + /* Return size of lvb data appropriate RPC size can be reserved */ + int (*lvbo_size)(struct ldlm_lock *lock); + /* Called to fill in lvb data to RPC buffer @buf */ + int (*lvbo_fill)(struct ldlm_lock *lock, void *buf, int buflen); +}; + +/** + * LDLM pools related, type of lock pool in the namespace. + * Greedy means release cached locks aggressively + */ +typedef enum { + LDLM_NAMESPACE_GREEDY = 1 << 0, + LDLM_NAMESPACE_MODEST = 1 << 1 +} ldlm_appetite_t; + +/** + * Default values for the "max_nolock_size", "contention_time" and + * "contended_locks" namespace tunables. + */ +#define NS_DEFAULT_MAX_NOLOCK_BYTES 0 +#define NS_DEFAULT_CONTENTION_SECONDS 2 +#define NS_DEFAULT_CONTENDED_LOCKS 32 + +struct ldlm_ns_bucket { + /** back pointer to namespace */ + struct ldlm_namespace *nsb_namespace; + /** + * Estimated lock callback time. Used by adaptive timeout code to + * avoid spurious client evictions due to unresponsiveness when in + * fact the network or overall system load is at fault + */ + struct adaptive_timeout nsb_at_estimate; +}; + +enum { + /** LDLM namespace lock stats */ + LDLM_NSS_LOCKS = 0, + LDLM_NSS_LAST +}; + +typedef enum { + /** invalid type */ + LDLM_NS_TYPE_UNKNOWN = 0, + /** mdc namespace */ + LDLM_NS_TYPE_MDC, + /** mds namespace */ + LDLM_NS_TYPE_MDT, + /** osc namespace */ + LDLM_NS_TYPE_OSC, + /** ost namespace */ + LDLM_NS_TYPE_OST, + /** mgc namespace */ + LDLM_NS_TYPE_MGC, + /** mgs namespace */ + LDLM_NS_TYPE_MGT, +} ldlm_ns_type_t; + +/** + * LDLM Namespace. + * + * Namespace serves to contain locks related to a particular service. + * There are two kinds of namespaces: + * - Server namespace has knowledge of all locks and is therefore authoritative + * to make decisions like what locks could be granted and what conflicts + * exist during new lock enqueue. + * - Client namespace only has limited knowledge about locks in the namespace, + * only seeing locks held by the client. + * + * Every Lustre service has one server namespace present on the server serving + * that service. Every client connected to the service has a client namespace + * for it. + * Every lock obtained by client in that namespace is actually represented by + * two in-memory locks. One on the server and one on the client. The locks are + * linked by a special cookie by which one node can tell to the other which lock + * it actually means during communications. Such locks are called remote locks. + * The locks held by server only without any reference to a client are called + * local locks. 
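As an illustrative sketch (the function name is a placeholder), this is roughly how a client-side namespace is obtained with ldlm_namespace_new(), declared near the end of this header, using the side, appetite and type enums above; the greedy appetite means cached locks are released aggressively.

static struct ldlm_namespace *example_client_namespace(struct obd_device *obd,
							char *name)
{
	return ldlm_namespace_new(obd, name,
				  LDLM_NAMESPACE_CLIENT,
				  LDLM_NAMESPACE_GREEDY,
				  LDLM_NS_TYPE_OSC);
}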
+ */ +struct ldlm_namespace { + /** Backward link to OBD, required for LDLM pool to store new SLV. */ + struct obd_device *ns_obd; + + /** Flag indicating if namespace is on client instead of server */ + ldlm_side_t ns_client; + + /** Resource hash table for namespace. */ + struct cfs_hash *ns_rs_hash; + + /** serialize */ + spinlock_t ns_lock; + + /** big refcount (by bucket) */ + atomic_t ns_bref; + + /** + * Namespace connect flags supported by server (may be changed via + * /proc, LRU resize may be disabled/enabled). + */ + __u64 ns_connect_flags; + + /** Client side original connect flags supported by server. */ + __u64 ns_orig_connect_flags; + + /* namespace proc dir entry */ + struct proc_dir_entry *ns_proc_dir_entry; + + /** + * Position in global namespace list linking all namespaces on + * the node. + */ + struct list_head ns_list_chain; + + /** + * List of unused locks for this namespace. This list is also called + * LRU lock list. + * Unused locks are locks with zero reader/writer reference counts. + * This list is only used on clients for lock caching purposes. + * When we want to release some locks voluntarily or if server wants + * us to release some locks due to e.g. memory pressure, we take locks + * to release from the head of this list. + * Locks are linked via l_lru field in \see struct ldlm_lock. + */ + struct list_head ns_unused_list; + /** Number of locks in the LRU list above */ + int ns_nr_unused; + + /** + * Maximum number of locks permitted in the LRU. If 0, means locks + * are managed by pools and there is no preset limit, rather it is all + * controlled by available memory on this client and on server. + */ + unsigned int ns_max_unused; + /** Maximum allowed age (last used time) for locks in the LRU */ + unsigned int ns_max_age; + /** + * Server only: number of times we evicted clients due to lack of reply + * to ASTs. + */ + unsigned int ns_timeouts; + /** + * Number of seconds since the file change time after which the + * MDT will return an UPDATE lock along with a LOOKUP lock. + * This allows the client to start caching negative dentries + * for a directory and may save an RPC for a later stat. + */ + unsigned int ns_ctime_age_limit; + + /** + * Used to rate-limit ldlm_namespace_dump calls. + * \see ldlm_namespace_dump. Increased by 10 seconds every time + * it is called. + */ + unsigned long ns_next_dump; + + /** "policy" function that does actual lock conflict determination */ + ldlm_res_policy ns_policy; + + /** + * LVB operations for this namespace. + * \see struct ldlm_valblock_ops + */ + struct ldlm_valblock_ops *ns_lvbo; + + /** + * Used by filter code to store pointer to OBD of the service. + * Should be dropped in favor of \a ns_obd + */ + void *ns_lvbp; + + /** + * Wait queue used by __ldlm_namespace_free. Gets woken up every time + * a resource is removed. + */ + wait_queue_head_t ns_waitq; + /** LDLM pool structure for this namespace */ + struct ldlm_pool ns_pool; + /** Definition of how eagerly unused locks will be released from LRU */ + ldlm_appetite_t ns_appetite; + + /** + * If more than \a ns_contended_locks are found, the resource is + * considered to be contended. Lock enqueues might specify that no + * contended locks should be granted + */ + unsigned ns_contended_locks; + + /** + * The resources in this namespace remember contended state during + * \a ns_contention_time, in seconds. + */ + unsigned ns_contention_time; + + /** + * Limit size of contended extent locks, in bytes. 
+ * If extended lock is requested for more then this many bytes and + * caller instructs us not to grant contended locks, we would disregard + * such a request. + */ + unsigned ns_max_nolock_size; + + /** Limit of parallel AST RPC count. */ + unsigned ns_max_parallel_ast; + + /** Callback to cancel locks before replaying it during recovery. */ + ldlm_cancel_for_recovery ns_cancel_for_recovery; + + /** LDLM lock stats */ + struct lprocfs_stats *ns_stats; + + /** + * Flag to indicate namespace is being freed. Used to determine if + * recalculation of LDLM pool statistics should be skipped. + */ + unsigned ns_stopping:1; +}; + +/** + * Returns 1 if namespace \a ns is a client namespace. + */ +static inline int ns_is_client(struct ldlm_namespace *ns) +{ + LASSERT(ns != NULL); + LASSERT(!(ns->ns_client & ~(LDLM_NAMESPACE_CLIENT | + LDLM_NAMESPACE_SERVER))); + LASSERT(ns->ns_client == LDLM_NAMESPACE_CLIENT || + ns->ns_client == LDLM_NAMESPACE_SERVER); + return ns->ns_client == LDLM_NAMESPACE_CLIENT; +} + +/** + * Returns 1 if namespace \a ns is a server namespace. + */ +static inline int ns_is_server(struct ldlm_namespace *ns) +{ + LASSERT(ns != NULL); + LASSERT(!(ns->ns_client & ~(LDLM_NAMESPACE_CLIENT | + LDLM_NAMESPACE_SERVER))); + LASSERT(ns->ns_client == LDLM_NAMESPACE_CLIENT || + ns->ns_client == LDLM_NAMESPACE_SERVER); + return ns->ns_client == LDLM_NAMESPACE_SERVER; +} + +/** + * Returns 1 if namespace \a ns supports early lock cancel (ELC). + */ +static inline int ns_connect_cancelset(struct ldlm_namespace *ns) +{ + LASSERT(ns != NULL); + return !!(ns->ns_connect_flags & OBD_CONNECT_CANCELSET); +} + +/** + * Returns 1 if this namespace supports lru_resize. + */ +static inline int ns_connect_lru_resize(struct ldlm_namespace *ns) +{ + LASSERT(ns != NULL); + return !!(ns->ns_connect_flags & OBD_CONNECT_LRU_RESIZE); +} + +static inline void ns_register_cancel(struct ldlm_namespace *ns, + ldlm_cancel_for_recovery arg) +{ + LASSERT(ns != NULL); + ns->ns_cancel_for_recovery = arg; +} + +struct ldlm_lock; + +/** Type for blocking callback function of a lock. */ +typedef int (*ldlm_blocking_callback)(struct ldlm_lock *lock, + struct ldlm_lock_desc *new, void *data, + int flag); +/** Type for completion callback function of a lock. */ +typedef int (*ldlm_completion_callback)(struct ldlm_lock *lock, __u64 flags, + void *data); +/** Type for glimpse callback function of a lock. */ +typedef int (*ldlm_glimpse_callback)(struct ldlm_lock *lock, void *data); + +/** Work list for sending GL ASTs to multiple locks. */ +struct ldlm_glimpse_work { + struct ldlm_lock *gl_lock; /* lock to glimpse */ + struct list_head gl_list; /* linkage to other gl work structs */ + __u32 gl_flags;/* see LDLM_GL_WORK_* below */ + union ldlm_gl_desc *gl_desc; /* glimpse descriptor to be packed in + * glimpse callback request */ +}; + +/** The ldlm_glimpse_work is allocated on the stack and should not be freed. */ +#define LDLM_GL_WORK_NOFREE 0x1 + +/** Interval node data for each LDLM_EXTENT lock. */ +struct ldlm_interval { + struct interval_node li_node; /* node for tree management */ + struct list_head li_group; /* the locks which have the same + * policy - group of the policy */ +}; +#define to_ldlm_interval(n) container_of(n, struct ldlm_interval, li_node) + +/** + * Interval tree for extent locks. + * The interval tree must be accessed under the resource lock. + * Interval trees are used for granted extent locks to speed up conflicts + * lookup. See ldlm/interval_tree.c for more details. 
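A minimal client blocking callback matching the ldlm_blocking_callback typedef above, as an illustrative sketch (the function name is a placeholder; ldlm_lock2handle(), ldlm_cli_cancel() and LCF_ASYNC are declared later in this header): on LDLM_CB_BLOCKING the cached lock is handed back, on LDLM_CB_CANCELING any state cached under the lock would be dropped.

static int example_blocking_ast(struct ldlm_lock *lock,
				struct ldlm_lock_desc *desc,
				void *data, int flag)
{
	struct lustre_handle lockh;

	switch (flag) {
	case LDLM_CB_BLOCKING:
		/* a conflicting request arrived: give our cached lock up */
		ldlm_lock2handle(lock, &lockh);
		return ldlm_cli_cancel(&lockh, LCF_ASYNC);
	case LDLM_CB_CANCELING:
		/* the lock is going away: invalidate cached data here */
		break;
	}
	return 0;
}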
+ */ +struct ldlm_interval_tree { + /** Tree size. */ + int lit_size; + ldlm_mode_t lit_mode; /* lock mode */ + struct interval_node *lit_root; /* actual ldlm_interval */ +}; + +/** Whether to track references to exports by LDLM locks. */ +#define LUSTRE_TRACKS_LOCK_EXP_REFS (0) + +/** Cancel flags. */ +typedef enum { + LCF_ASYNC = 0x1, /* Cancel locks asynchronously. */ + LCF_LOCAL = 0x2, /* Cancel locks locally, not notifing server */ + LCF_BL_AST = 0x4, /* Cancel locks marked as LDLM_FL_BL_AST + * in the same RPC */ +} ldlm_cancel_flags_t; + +struct ldlm_flock { + __u64 start; + __u64 end; + __u64 owner; + __u64 blocking_owner; + struct obd_export *blocking_export; + /* Protected by the hash lock */ + __u32 blocking_refs; + __u32 pid; +}; + +typedef union { + struct ldlm_extent l_extent; + struct ldlm_flock l_flock; + struct ldlm_inodebits l_inodebits; +} ldlm_policy_data_t; + +void ldlm_convert_policy_to_wire(ldlm_type_t type, + const ldlm_policy_data_t *lpolicy, + ldlm_wire_policy_data_t *wpolicy); +void ldlm_convert_policy_to_local(struct obd_export *exp, ldlm_type_t type, + const ldlm_wire_policy_data_t *wpolicy, + ldlm_policy_data_t *lpolicy); + +enum lvb_type { + LVB_T_NONE = 0, + LVB_T_OST = 1, + LVB_T_LQUOTA = 2, + LVB_T_LAYOUT = 3, +}; + +/** + * LDLM lock structure + * + * Represents a single LDLM lock and its state in memory. Each lock is + * associated with a single ldlm_resource, the object which is being + * locked. There may be multiple ldlm_locks on a single resource, + * depending on the lock type and whether the locks are conflicting or + * not. + */ +struct ldlm_lock { + /** + * Local lock handle. + * When remote side wants to tell us about a lock, they address + * it by this opaque handle. The handle does not hold a + * reference on the ldlm_lock, so it can be safely passed to + * other threads or nodes. When the lock needs to be accessed + * from the handle, it is looked up again in the lock table, and + * may no longer exist. + * + * Must be first in the structure. + */ + struct portals_handle l_handle; + /** + * Lock reference count. + * This is how many users have pointers to actual structure, so that + * we do not accidentally free lock structure that is in use. + */ + atomic_t l_refc; + /** + * Internal spinlock protects l_resource. We should hold this lock + * first before taking res_lock. + */ + spinlock_t l_lock; + /** + * Pointer to actual resource this lock is in. + * ldlm_lock_change_resource() can change this. + */ + struct ldlm_resource *l_resource; + /** + * List item for client side LRU list. + * Protected by ns_lock in struct ldlm_namespace. + */ + struct list_head l_lru; + /** + * Linkage to resource's lock queues according to current lock state. + * (could be granted, waiting or converting) + * Protected by lr_lock in struct ldlm_resource. + */ + struct list_head l_res_link; + /** + * Tree node for ldlm_extent. + */ + struct ldlm_interval *l_tree_node; + /** + * Per export hash of locks. + * Protected by per-bucket exp->exp_lock_hash locks. + */ + struct hlist_node l_exp_hash; + /** + * Per export hash of flock locks. + * Protected by per-bucket exp->exp_flock_hash locks. + */ + struct hlist_node l_exp_flock_hash; + /** + * Requested mode. + * Protected by lr_lock. + */ + ldlm_mode_t l_req_mode; + /** + * Granted mode, also protected by lr_lock. + */ + ldlm_mode_t l_granted_mode; + /** Lock completion handler pointer. Called when lock is granted. */ + ldlm_completion_callback l_completion_ast; + /** + * Lock blocking AST handler pointer. 
+ * It plays two roles: + * - as a notification of an attempt to queue a conflicting lock (once) + * - as a notification when the lock is being cancelled. + * + * As such it's typically called twice: once for the initial conflict + * and then once more when the last user went away and the lock is + * cancelled (could happen recursively). + */ + ldlm_blocking_callback l_blocking_ast; + /** + * Lock glimpse handler. + * Glimpse handler is used to obtain LVB updates from a client by + * server + */ + ldlm_glimpse_callback l_glimpse_ast; + + /** + * Lock export. + * This is a pointer to actual client export for locks that were granted + * to clients. Used server-side. + */ + struct obd_export *l_export; + /** + * Lock connection export. + * Pointer to server export on a client. + */ + struct obd_export *l_conn_export; + + /** + * Remote lock handle. + * If the lock is remote, this is the handle of the other side lock + * (l_handle) + */ + struct lustre_handle l_remote_handle; + + /** + * Representation of private data specific for a lock type. + * Examples are: extent range for extent lock or bitmask for ibits locks + */ + ldlm_policy_data_t l_policy_data; + + /** + * Lock state flags. Protected by lr_lock. + * \see lustre_dlm_flags.h where the bits are defined. + */ + __u64 l_flags; + + /** + * Lock r/w usage counters. + * Protected by lr_lock. + */ + __u32 l_readers; + __u32 l_writers; + /** + * If the lock is granted, a process sleeps on this waitq to learn when + * it's no longer in use. If the lock is not granted, a process sleeps + * on this waitq to learn when it becomes granted. + */ + wait_queue_head_t l_waitq; + + /** + * Seconds. It will be updated if there is any activity related to + * the lock, e.g. enqueue the lock or send blocking AST. + */ + unsigned long l_last_activity; + + /** + * Time last used by e.g. being matched by lock match. + * Jiffies. Should be converted to time if needed. + */ + unsigned long l_last_used; + + /** Originally requested extent for the extent lock. */ + struct ldlm_extent l_req_extent; + + /* + * Client-side-only members. + */ + + enum lvb_type l_lvb_type; + + /** + * Temporary storage for a LVB received during an enqueue operation. + */ + __u32 l_lvb_len; + void *l_lvb_data; + + /** Private storage for lock user. Opaque to LDLM. */ + void *l_ast_data; + + /* + * Server-side-only members. + */ + + /** + * Connection cookie for the client originating the operation. + * Used by Commit on Share (COS) code. Currently only used for + * inodebits locks on MDS. + */ + __u64 l_client_cookie; + + /** + * List item for locks waiting for cancellation from clients. + * The lists this could be linked into are: + * waiting_locks_list (protected by waiting_locks_spinlock), + * then if the lock timed out, it is moved to + * expired_lock_thread.elt_expired_locks for further processing. + * Protected by elt_lock. + */ + struct list_head l_pending_chain; + + /** + * Set when lock is sent a blocking AST. Time in seconds when timeout + * is reached and client holding this lock could be evicted. + * This timeout could be further extended by e.g. certain IO activity + * under this lock. + * \see ost_rw_prolong_locks + */ + unsigned long l_callback_timeout; + + /** Local PID of process which created this lock. */ + __u32 l_pid; + + /** + * Number of times blocking AST was sent for this lock. + * This is for debugging. Valid values are 0 and 1, if there is an + * attempt to send blocking AST more than once, an assertion would be + * hit. 
\see ldlm_work_bl_ast_lock + */ + int l_bl_ast_run; + /** List item ldlm_add_ast_work_item() for case of blocking ASTs. */ + struct list_head l_bl_ast; + /** List item ldlm_add_ast_work_item() for case of completion ASTs. */ + struct list_head l_cp_ast; + /** For ldlm_add_ast_work_item() for "revoke" AST used in COS. */ + struct list_head l_rk_ast; + + /** + * Pointer to a conflicting lock that caused blocking AST to be sent + * for this lock + */ + struct ldlm_lock *l_blocking_lock; + + /** + * Protected by lr_lock, linkages to "skip lists". + * For more explanations of skip lists see ldlm/ldlm_inodebits.c + */ + struct list_head l_sl_mode; + struct list_head l_sl_policy; + + /** Reference tracking structure to debug leaked locks. */ + struct lu_ref l_reference; +#if LUSTRE_TRACKS_LOCK_EXP_REFS + /* Debugging stuff for bug 20498, for tracking export references. */ + /** number of export references taken */ + int l_exp_refs_nr; + /** link all locks referencing one export */ + struct list_head l_exp_refs_link; + /** referenced export object */ + struct obd_export *l_exp_refs_target; +#endif + /** + * export blocking dlm lock list, protected by + * l_export->exp_bl_list_lock. + * Lock order of waiting_lists_spinlock, exp_bl_list_lock and res lock + * is: res lock -> exp_bl_list_lock -> wanting_lists_spinlock. + */ + struct list_head l_exp_list; +}; + +/** + * LDLM resource description. + * Basically, resource is a representation for a single object. + * Object has a name which is currently 4 64-bit integers. LDLM user is + * responsible for creation of a mapping between objects it wants to be + * protected and resource names. + * + * A resource can only hold locks of a single lock type, though there may be + * multiple ldlm_locks on a single resource, depending on the lock type and + * whether the locks are conflicting or not. + */ +struct ldlm_resource { + struct ldlm_ns_bucket *lr_ns_bucket; + + /** + * List item for list in namespace hash. + * protected by ns_lock + */ + struct hlist_node lr_hash; + + /** Spinlock to protect locks under this resource. */ + spinlock_t lr_lock; + + /** + * protected by lr_lock + * @{ */ + /** List of locks in granted state */ + struct list_head lr_granted; + /** List of locks waiting to change their granted mode (converted) */ + struct list_head lr_converting; + /** + * List of locks that could not be granted due to conflicts and + * that are waiting for conflicts to go away */ + struct list_head lr_waiting; + /** @} */ + + /* XXX No longer needed? Remove ASAP */ + ldlm_mode_t lr_most_restr; + + /** Type of locks this resource can hold. Only one type per resource. */ + ldlm_type_t lr_type; /* LDLM_{PLAIN,EXTENT,FLOCK,IBITS} */ + + /** Resource name */ + struct ldlm_res_id lr_name; + /** Reference count for this resource */ + atomic_t lr_refcount; + + /** + * Interval trees (only for extent locks) for all modes of this resource + */ + struct ldlm_interval_tree lr_itree[LCK_MODE_NUM]; + + /** + * Server-side-only lock value block elements. + * To serialize lvbo_init. + */ + struct mutex lr_lvb_mutex; + int lr_lvb_len; + /** protected by lr_lock */ + void *lr_lvb_data; + + /** When the resource was considered as contended. */ + unsigned long lr_contention_time; + /** List of references to this resource. For debugging. 
*/ + struct lu_ref lr_reference; + + struct inode *lr_lvb_inode; +}; + +static inline bool ldlm_has_layout(struct ldlm_lock *lock) +{ + return lock->l_resource->lr_type == LDLM_IBITS && + lock->l_policy_data.l_inodebits.bits & MDS_INODELOCK_LAYOUT; +} + +static inline char * +ldlm_ns_name(struct ldlm_namespace *ns) +{ + return ns->ns_rs_hash->hs_name; +} + +static inline struct ldlm_namespace * +ldlm_res_to_ns(struct ldlm_resource *res) +{ + return res->lr_ns_bucket->nsb_namespace; +} + +static inline struct ldlm_namespace * +ldlm_lock_to_ns(struct ldlm_lock *lock) +{ + return ldlm_res_to_ns(lock->l_resource); +} + +static inline char * +ldlm_lock_to_ns_name(struct ldlm_lock *lock) +{ + return ldlm_ns_name(ldlm_lock_to_ns(lock)); +} + +static inline struct adaptive_timeout * +ldlm_lock_to_ns_at(struct ldlm_lock *lock) +{ + return &lock->l_resource->lr_ns_bucket->nsb_at_estimate; +} + +static inline int ldlm_lvbo_init(struct ldlm_resource *res) +{ + struct ldlm_namespace *ns = ldlm_res_to_ns(res); + + if (ns->ns_lvbo != NULL && ns->ns_lvbo->lvbo_init != NULL) + return ns->ns_lvbo->lvbo_init(res); + + return 0; +} + +static inline int ldlm_lvbo_size(struct ldlm_lock *lock) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + + if (ns->ns_lvbo != NULL && ns->ns_lvbo->lvbo_size != NULL) + return ns->ns_lvbo->lvbo_size(lock); + + return 0; +} + +static inline int ldlm_lvbo_fill(struct ldlm_lock *lock, void *buf, int len) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + + if (ns->ns_lvbo != NULL) { + LASSERT(ns->ns_lvbo->lvbo_fill != NULL); + return ns->ns_lvbo->lvbo_fill(lock, buf, len); + } + return 0; +} + +struct ldlm_ast_work { + struct ldlm_lock *w_lock; + int w_blocking; + struct ldlm_lock_desc w_desc; + struct list_head w_list; + int w_flags; + void *w_data; + int w_datalen; +}; + +/** + * Common ldlm_enqueue parameters + */ +struct ldlm_enqueue_info { + __u32 ei_type; /** Type of the lock being enqueued. */ + __u32 ei_mode; /** Mode of the lock being enqueued. */ + void *ei_cb_bl; /** blocking lock callback */ + void *ei_cb_cp; /** lock completion callback */ + void *ei_cb_gl; /** lock glimpse callback */ + void *ei_cbdata; /** Data to be passed into callbacks. */ +}; + +extern struct obd_ops ldlm_obd_ops; + +extern char *ldlm_lockname[]; +extern char *ldlm_typename[]; +extern char *ldlm_it2str(int it); + +/** + * Just a fancy CDEBUG call with log level preset to LDLM_DEBUG. + * For the cases where we do not have actual lock to print along + * with a debugging message that is ldlm-related + */ +#define LDLM_DEBUG_NOLOCK(format, a...) \ + CDEBUG(D_DLMTRACE, "### " format "\n" , ##a) + +/** + * Support function for lock information printing into debug logs. + * \see LDLM_DEBUG + */ +#define ldlm_lock_debug(msgdata, mask, cdls, lock, fmt, a...) do { \ + CFS_CHECK_STACK(msgdata, mask, cdls); \ + \ + if (((mask) & D_CANTMASK) != 0 || \ + ((libcfs_debug & (mask)) != 0 && \ + (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0)) \ + _ldlm_lock_debug(lock, msgdata, fmt, ##a); \ +} while (0) + +void _ldlm_lock_debug(struct ldlm_lock *lock, + struct libcfs_debug_msg_data *data, + const char *fmt, ...) + __printf(3, 4); + +/** + * Rate-limited version of lock printing function. + */ +#define LDLM_DEBUG_LIMIT(mask, lock, fmt, a...) do { \ + static struct cfs_debug_limit_state _ldlm_cdls; \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, &_ldlm_cdls); \ + ldlm_lock_debug(&msgdata, mask, &_ldlm_cdls, lock, "### " fmt , ##a);\ +} while (0) + +#define LDLM_ERROR(lock, fmt, a...) 
LDLM_DEBUG_LIMIT(D_ERROR, lock, fmt, ## a) +#define LDLM_WARN(lock, fmt, a...) LDLM_DEBUG_LIMIT(D_WARNING, lock, fmt, ## a) + +/** Non-rate-limited lock printing function for debugging purposes. */ +#define LDLM_DEBUG(lock, fmt, a...) do { \ + if (likely(lock != NULL)) { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_DLMTRACE, NULL); \ + ldlm_lock_debug(&msgdata, D_DLMTRACE, NULL, lock, \ + "### " fmt , ##a); \ + } else { \ + LDLM_DEBUG_NOLOCK("no dlm lock: " fmt, ##a); \ + } \ +} while (0) + +typedef int (*ldlm_processing_policy)(struct ldlm_lock *lock, __u64 *flags, + int first_enq, ldlm_error_t *err, + struct list_head *work_list); + +/** + * Return values for lock iterators. + * Also used during deciding of lock grants and cancellations. + */ +#define LDLM_ITER_CONTINUE 1 /* keep iterating */ +#define LDLM_ITER_STOP 2 /* stop iterating */ + +typedef int (*ldlm_iterator_t)(struct ldlm_lock *, void *); +typedef int (*ldlm_res_iterator_t)(struct ldlm_resource *, void *); + +/** \defgroup ldlm_iterator Lock iterators + * + * LDLM provides for a way to iterate through every lock on a resource or + * namespace or every resource in a namespace. + * @{ */ +int ldlm_resource_foreach(struct ldlm_resource *res, ldlm_iterator_t iter, + void *closure); +void ldlm_namespace_foreach(struct ldlm_namespace *ns, ldlm_iterator_t iter, + void *closure); +int ldlm_resource_iterate(struct ldlm_namespace *, const struct ldlm_res_id *, + ldlm_iterator_t iter, void *data); +/** @} ldlm_iterator */ + +int ldlm_replay_locks(struct obd_import *imp); + +/* ldlm_flock.c */ +int ldlm_flock_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data); + +/* ldlm_extent.c */ +__u64 ldlm_extent_shift_kms(struct ldlm_lock *lock, __u64 old_kms); + +struct ldlm_callback_suite { + ldlm_completion_callback lcs_completion; + ldlm_blocking_callback lcs_blocking; + ldlm_glimpse_callback lcs_glimpse; +}; + +/* ldlm_lockd.c */ +int ldlm_del_waiting_lock(struct ldlm_lock *lock); +int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, int timeout); +int ldlm_get_ref(void); +void ldlm_put_ref(void); +int ldlm_init_export(struct obd_export *exp); +void ldlm_destroy_export(struct obd_export *exp); +struct ldlm_lock *ldlm_request_lock(struct ptlrpc_request *req); + +/* ldlm_lock.c */ +void ldlm_register_intent(struct ldlm_namespace *ns, ldlm_res_policy arg); +void ldlm_lock2handle(const struct ldlm_lock *lock, + struct lustre_handle *lockh); +struct ldlm_lock *__ldlm_handle2lock(const struct lustre_handle *, __u64 flags); +void ldlm_cancel_callback(struct ldlm_lock *); +int ldlm_lock_remove_from_lru(struct ldlm_lock *); +int ldlm_lock_set_data(struct lustre_handle *, void *); + +/** + * Obtain a lock reference by its handle. 
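The usual pattern around the handle helpers defined just below, as an illustrative sketch (the function name is a placeholder): the lookup takes its own temporary reference, the handle may no longer resolve to a lock, and the reference must be dropped with LDLM_LOCK_PUT() when done.

static int example_inspect_lock(const struct lustre_handle *lockh)
{
	struct ldlm_lock *lock;

	lock = ldlm_handle2lock(lockh);
	if (lock == NULL)
		return -ENOENT;		/* the lock has already gone away */

	LDLM_DEBUG(lock, "example: looking at this lock");
	/* ... read whatever is needed from the lock here ... */

	LDLM_LOCK_PUT(lock);		/* drop the reference taken above */
	return 0;
}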
+ */ +static inline struct ldlm_lock *ldlm_handle2lock(const struct lustre_handle *h) +{ + return __ldlm_handle2lock(h, 0); +} + +#define LDLM_LOCK_REF_DEL(lock) \ + lu_ref_del(&lock->l_reference, "handle", current) + +static inline struct ldlm_lock * +ldlm_handle2lock_long(const struct lustre_handle *h, __u64 flags) +{ + struct ldlm_lock *lock; + + lock = __ldlm_handle2lock(h, flags); + if (lock != NULL) + LDLM_LOCK_REF_DEL(lock); + return lock; +} + +/** + * Update Lock Value Block Operations (LVBO) on a resource taking into account + * data from request \a r + */ +static inline int ldlm_res_lvbo_update(struct ldlm_resource *res, + struct ptlrpc_request *r, int increase) +{ + if (ldlm_res_to_ns(res)->ns_lvbo && + ldlm_res_to_ns(res)->ns_lvbo->lvbo_update) { + return ldlm_res_to_ns(res)->ns_lvbo->lvbo_update(res, r, + increase); + } + return 0; +} + +int ldlm_error2errno(ldlm_error_t error); +ldlm_error_t ldlm_errno2error(int err_no); /* don't call it `errno': this + * confuses user-space. */ +#if LUSTRE_TRACKS_LOCK_EXP_REFS +void ldlm_dump_export_locks(struct obd_export *exp); +#endif + +/** + * Release a temporary lock reference obtained by ldlm_handle2lock() or + * __ldlm_handle2lock(). + */ +#define LDLM_LOCK_PUT(lock) \ +do { \ + LDLM_LOCK_REF_DEL(lock); \ + /*LDLM_DEBUG((lock), "put");*/ \ + ldlm_lock_put(lock); \ +} while (0) + +/** + * Release a lock reference obtained by some other means (see + * LDLM_LOCK_PUT()). + */ +#define LDLM_LOCK_RELEASE(lock) \ +do { \ + /*LDLM_DEBUG((lock), "put");*/ \ + ldlm_lock_put(lock); \ +} while (0) + +#define LDLM_LOCK_GET(lock) \ +({ \ + ldlm_lock_get(lock); \ + /*LDLM_DEBUG((lock), "get");*/ \ + lock; \ +}) + +#define ldlm_lock_list_put(head, member, count) \ +({ \ + struct ldlm_lock *_lock, *_next; \ + int c = count; \ + list_for_each_entry_safe(_lock, _next, head, member) { \ + if (c-- == 0) \ + break; \ + list_del_init(&_lock->member); \ + LDLM_LOCK_RELEASE(_lock); \ + } \ + LASSERT(c <= 0); \ +}) + +struct ldlm_lock *ldlm_lock_get(struct ldlm_lock *lock); +void ldlm_lock_put(struct ldlm_lock *lock); +void ldlm_lock_destroy(struct ldlm_lock *lock); +void ldlm_lock2desc(struct ldlm_lock *lock, struct ldlm_lock_desc *desc); +void ldlm_lock_addref(struct lustre_handle *lockh, __u32 mode); +int ldlm_lock_addref_try(struct lustre_handle *lockh, __u32 mode); +void ldlm_lock_decref(struct lustre_handle *lockh, __u32 mode); +void ldlm_lock_decref_and_cancel(struct lustre_handle *lockh, __u32 mode); +void ldlm_lock_fail_match_locked(struct ldlm_lock *lock); +void ldlm_lock_fail_match(struct ldlm_lock *lock); +void ldlm_lock_allow_match(struct ldlm_lock *lock); +void ldlm_lock_allow_match_locked(struct ldlm_lock *lock); +ldlm_mode_t ldlm_lock_match(struct ldlm_namespace *ns, __u64 flags, + const struct ldlm_res_id *, ldlm_type_t type, + ldlm_policy_data_t *, ldlm_mode_t mode, + struct lustre_handle *, int unref); +ldlm_mode_t ldlm_revalidate_lock_handle(struct lustre_handle *lockh, + __u64 *bits); +struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, int new_mode, + __u32 *flags); +void ldlm_lock_downgrade(struct ldlm_lock *lock, int new_mode); +void ldlm_lock_cancel(struct ldlm_lock *lock); +void ldlm_reprocess_all(struct ldlm_resource *res); +void ldlm_reprocess_all_ns(struct ldlm_namespace *ns); +void ldlm_lock_dump_handle(int level, struct lustre_handle *); +void ldlm_unlink_lock_skiplist(struct ldlm_lock *req); + +/* resource.c */ +struct ldlm_namespace * +ldlm_namespace_new(struct obd_device *obd, char *name, + ldlm_side_t client, 
ldlm_appetite_t apt, + ldlm_ns_type_t ns_type); +int ldlm_namespace_cleanup(struct ldlm_namespace *ns, __u64 flags); +void ldlm_namespace_free(struct ldlm_namespace *ns, + struct obd_import *imp, int force); +void ldlm_namespace_register(struct ldlm_namespace *ns, ldlm_side_t client); +void ldlm_namespace_unregister(struct ldlm_namespace *ns, ldlm_side_t client); +void ldlm_namespace_get(struct ldlm_namespace *ns); +void ldlm_namespace_put(struct ldlm_namespace *ns); +#if defined (CONFIG_PROC_FS) +int ldlm_proc_setup(void); +void ldlm_proc_cleanup(void); +#else +static inline int ldlm_proc_setup(void) { return 0; } +static inline void ldlm_proc_cleanup(void) {} +#endif + +/* resource.c - internal */ +struct ldlm_resource *ldlm_resource_get(struct ldlm_namespace *ns, + struct ldlm_resource *parent, + const struct ldlm_res_id *, + ldlm_type_t type, int create); +struct ldlm_resource *ldlm_resource_getref(struct ldlm_resource *res); +int ldlm_resource_putref(struct ldlm_resource *res); +void ldlm_resource_add_lock(struct ldlm_resource *res, + struct list_head *head, + struct ldlm_lock *lock); +void ldlm_resource_unlink_lock(struct ldlm_lock *lock); +void ldlm_res2desc(struct ldlm_resource *res, struct ldlm_resource_desc *desc); +void ldlm_dump_all_namespaces(ldlm_side_t client, int level); +void ldlm_namespace_dump(int level, struct ldlm_namespace *); +void ldlm_resource_dump(int level, struct ldlm_resource *); +int ldlm_lock_change_resource(struct ldlm_namespace *, struct ldlm_lock *, + const struct ldlm_res_id *); + +#define LDLM_RESOURCE_ADDREF(res) do { \ + lu_ref_add_atomic(&(res)->lr_reference, __func__, current); \ +} while (0) + +#define LDLM_RESOURCE_DELREF(res) do { \ + lu_ref_del(&(res)->lr_reference, __func__, current); \ +} while (0) + +/* ldlm_request.c */ +int ldlm_expired_completion_wait(void *data); +/** \defgroup ldlm_local_ast Default AST handlers for local locks + * These AST handlers are typically used for server-side local locks and are + * also used by client-side lock handlers to perform minimum level base + * processing. + * @{ */ +int ldlm_blocking_ast_nocheck(struct ldlm_lock *lock); +int ldlm_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, + void *data, int flag); +int ldlm_glimpse_ast(struct ldlm_lock *lock, void *reqp); +int ldlm_completion_ast_async(struct ldlm_lock *lock, __u64 flags, void *data); +int ldlm_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data); +/** @} ldlm_local_ast */ + +/** \defgroup ldlm_cli_api API to operate on locks from actual LDLM users. + * These are typically used by client and server (*_local versions) + * to obtain and release locks. 
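As a rough illustration of the client API grouped below, a lock obtained earlier through ldlm_cli_enqueue() is normally released by its handle; this sketch is hypothetical and passes 0 for the cancel flags on the assumption that 0 means "no special flags" for ldlm_cancel_flags_t:

/* Illustrative: cancel a previously enqueued lock by its handle. */
static void example_drop_lock(struct lustre_handle *lockh)
{
        int rc;

        rc = ldlm_cli_cancel(lockh, 0); /* 0: no special cancel flags */
        if (rc != 0)
                CERROR("ldlm_cli_cancel failed: %d\n", rc);
}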
+ * @{ */ +int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp, + struct ldlm_enqueue_info *einfo, + const struct ldlm_res_id *res_id, + ldlm_policy_data_t const *policy, __u64 *flags, + void *lvb, __u32 lvb_len, enum lvb_type lvb_type, + struct lustre_handle *lockh, int async); +int ldlm_prep_enqueue_req(struct obd_export *exp, + struct ptlrpc_request *req, + struct list_head *cancels, + int count); +int ldlm_prep_elc_req(struct obd_export *exp, + struct ptlrpc_request *req, + int version, int opc, int canceloff, + struct list_head *cancels, int count); + +struct ptlrpc_request *ldlm_enqueue_pack(struct obd_export *exp, int lvb_len); +int ldlm_handle_enqueue0(struct ldlm_namespace *ns, struct ptlrpc_request *req, + const struct ldlm_request *dlm_req, + const struct ldlm_callback_suite *cbs); +int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req, + ldlm_type_t type, __u8 with_policy, ldlm_mode_t mode, + __u64 *flags, void *lvb, __u32 lvb_len, + struct lustre_handle *lockh, int rc); +int ldlm_cli_enqueue_local(struct ldlm_namespace *ns, + const struct ldlm_res_id *res_id, + ldlm_type_t type, ldlm_policy_data_t *policy, + ldlm_mode_t mode, __u64 *flags, + ldlm_blocking_callback blocking, + ldlm_completion_callback completion, + ldlm_glimpse_callback glimpse, + void *data, __u32 lvb_len, enum lvb_type lvb_type, + const __u64 *client_cookie, + struct lustre_handle *lockh); +int ldlm_server_ast(struct lustre_handle *lockh, struct ldlm_lock_desc *new, + void *data, __u32 data_len); +int ldlm_cli_convert(struct lustre_handle *, int new_mode, __u32 *flags); +int ldlm_cli_update_pool(struct ptlrpc_request *req); +int ldlm_cli_cancel(struct lustre_handle *lockh, + ldlm_cancel_flags_t cancel_flags); +int ldlm_cli_cancel_unused(struct ldlm_namespace *, const struct ldlm_res_id *, + ldlm_cancel_flags_t flags, void *opaque); +int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns, + const struct ldlm_res_id *res_id, + ldlm_policy_data_t *policy, + ldlm_mode_t mode, + ldlm_cancel_flags_t flags, + void *opaque); +int ldlm_cli_cancel_req(struct obd_export *exp, struct list_head *head, + int count, ldlm_cancel_flags_t flags); +int ldlm_cancel_resource_local(struct ldlm_resource *res, + struct list_head *cancels, + ldlm_policy_data_t *policy, + ldlm_mode_t mode, __u64 lock_flags, + ldlm_cancel_flags_t cancel_flags, void *opaque); +int ldlm_cli_cancel_list_local(struct list_head *cancels, int count, + ldlm_cancel_flags_t flags); +int ldlm_cli_cancel_list(struct list_head *head, int count, + struct ptlrpc_request *req, ldlm_cancel_flags_t flags); +/** @} ldlm_cli_api */ + +/* mds/handler.c */ +/* This has to be here because recursive inclusion sucks. */ +int intent_disposition(struct ldlm_reply *rep, int flag); +void intent_set_disposition(struct ldlm_reply *rep, int flag); + + +/* ioctls for trying requests */ +#define IOC_LDLM_TYPE 'f' +#define IOC_LDLM_MIN_NR 40 + +#define IOC_LDLM_TEST _IOWR('f', 40, long) +#define IOC_LDLM_DUMP _IOWR('f', 41, long) +#define IOC_LDLM_REGRESS_START _IOWR('f', 42, long) +#define IOC_LDLM_REGRESS_STOP _IOWR('f', 43, long) +#define IOC_LDLM_MAX_NR 43 + +/** + * "Modes" of acquiring lock_res, necessary to tell lockdep that taking more + * than one lock_res is dead-lock safe. + */ +enum lock_res_type { + LRT_NORMAL, + LRT_NEW +}; + +/** Lock resource. */ +static inline void lock_res(struct ldlm_resource *res) +{ + spin_lock(&res->lr_lock); +} + +/** Lock resource with a way to instruct lockdep code about nestedness-safe. 
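To show why the LRT_* nesting modes exist, here is a hedged sketch of taking two resource spinlocks at once in a fixed order; the helper name and the ordering rule are illustrative only:

/* Illustrative: lock two resources without triggering lockdep warnings.
 * The first lr_lock is taken normally, the second with the LRT_NEW
 * subclass so lockdep knows the double acquisition is intentional. */
static void example_lock_two_res(struct ldlm_resource *a,
                                 struct ldlm_resource *b)
{
        lock_res(a);
        lock_res_nested(b, LRT_NEW);

        /* ... work on both resources ... */

        unlock_res(b);
        unlock_res(a);
}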
*/ +static inline void lock_res_nested(struct ldlm_resource *res, + enum lock_res_type mode) +{ + spin_lock_nested(&res->lr_lock, mode); +} + +/** Unlock resource. */ +static inline void unlock_res(struct ldlm_resource *res) +{ + spin_unlock(&res->lr_lock); +} + +/** Check if resource is already locked, assert if not. */ +static inline void check_res_locked(struct ldlm_resource *res) +{ + assert_spin_locked(&res->lr_lock); +} + +struct ldlm_resource *lock_res_and_lock(struct ldlm_lock *lock); +void unlock_res_and_lock(struct ldlm_lock *lock); + +/* ldlm_pool.c */ +/** \defgroup ldlm_pools Various LDLM pool related functions + * There are not used outside of ldlm. + * @{ + */ +int ldlm_pools_recalc(ldlm_side_t client); +int ldlm_pools_init(void); +void ldlm_pools_fini(void); + +int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns, + int idx, ldlm_side_t client); +int ldlm_pool_shrink(struct ldlm_pool *pl, int nr, + gfp_t gfp_mask); +void ldlm_pool_fini(struct ldlm_pool *pl); +int ldlm_pool_setup(struct ldlm_pool *pl, int limit); +int ldlm_pool_recalc(struct ldlm_pool *pl); +__u32 ldlm_pool_get_lvf(struct ldlm_pool *pl); +__u64 ldlm_pool_get_slv(struct ldlm_pool *pl); +__u64 ldlm_pool_get_clv(struct ldlm_pool *pl); +__u32 ldlm_pool_get_limit(struct ldlm_pool *pl); +void ldlm_pool_set_slv(struct ldlm_pool *pl, __u64 slv); +void ldlm_pool_set_clv(struct ldlm_pool *pl, __u64 clv); +void ldlm_pool_set_limit(struct ldlm_pool *pl, __u32 limit); +void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock); +void ldlm_pool_del(struct ldlm_pool *pl, struct ldlm_lock *lock); +/** @} */ + +#endif +/** @} LDLM */ diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre_dlm_flags.h b/kernel/drivers/staging/lustre/lustre/include/lustre_dlm_flags.h new file mode 100644 index 000000000..16dcdbfae --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre_dlm_flags.h @@ -0,0 +1,476 @@ +/* -*- buffer-read-only: t -*- vi: set ro: + * + * DO NOT EDIT THIS FILE (lustre_dlm_flags.h) + * + * It has been AutoGen-ed + * From the definitions lustre_dlm_flags.def + * and the template file lustre_dlm_flags.tpl + * + * lustre is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * lustre is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see . + */ +/** + * \file lustre_dlm_flags.h + * The flags and collections of flags (masks) for \see struct ldlm_lock. + * This file is derived from flag definitions in lustre_dlm_flags.def. + * The format is defined in the lustre_dlm_flags.tpl template file. + * + * \addtogroup LDLM Lustre Distributed Lock Manager + * @{ + * + * \name flags + * The flags and collections of flags (masks) for \see struct ldlm_lock. 
+ * @{ + */ +#ifndef LDLM_ALL_FLAGS_MASK + +/** l_flags bits marked as "all_flags" bits */ +#define LDLM_FL_ALL_FLAGS_MASK 0x00FFFFFFC08F932FULL + +/** l_flags bits marked as "ast" bits */ +#define LDLM_FL_AST_MASK 0x0000000080008000ULL + +/** l_flags bits marked as "blocked" bits */ +#define LDLM_FL_BLOCKED_MASK 0x000000000000000EULL + +/** l_flags bits marked as "gone" bits */ +#define LDLM_FL_GONE_MASK 0x0006004000000000ULL + +/** l_flags bits marked as "hide_lock" bits */ +#define LDLM_FL_HIDE_LOCK_MASK 0x0000206400000000ULL + +/** l_flags bits marked as "inherit" bits */ +#define LDLM_FL_INHERIT_MASK 0x0000000000800000ULL + +/** l_flags bits marked as "local_only" bits */ +#define LDLM_FL_LOCAL_ONLY_MASK 0x00FFFFFF00000000ULL + +/** l_flags bits marked as "on_wire" bits */ +#define LDLM_FL_ON_WIRE_MASK 0x00000000C08F932FULL + +/** extent, mode, or resource changed */ +#define LDLM_FL_LOCK_CHANGED 0x0000000000000001ULL // bit 0 +#define ldlm_is_lock_changed(_l) LDLM_TEST_FLAG(( _l), 1ULL << 0) +#define ldlm_set_lock_changed(_l) LDLM_SET_FLAG(( _l), 1ULL << 0) +#define ldlm_clear_lock_changed(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 0) + +/** + * Server placed lock on granted list, or a recovering client wants the + * lock added to the granted list, no questions asked. */ +#define LDLM_FL_BLOCK_GRANTED 0x0000000000000002ULL // bit 1 +#define ldlm_is_block_granted(_l) LDLM_TEST_FLAG(( _l), 1ULL << 1) +#define ldlm_set_block_granted(_l) LDLM_SET_FLAG(( _l), 1ULL << 1) +#define ldlm_clear_block_granted(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 1) + +/** + * Server placed lock on conv list, or a recovering client wants the lock + * added to the conv list, no questions asked. */ +#define LDLM_FL_BLOCK_CONV 0x0000000000000004ULL // bit 2 +#define ldlm_is_block_conv(_l) LDLM_TEST_FLAG(( _l), 1ULL << 2) +#define ldlm_set_block_conv(_l) LDLM_SET_FLAG(( _l), 1ULL << 2) +#define ldlm_clear_block_conv(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 2) + +/** + * Server placed lock on wait list, or a recovering client wants the lock + * added to the wait list, no questions asked. */ +#define LDLM_FL_BLOCK_WAIT 0x0000000000000008ULL // bit 3 +#define ldlm_is_block_wait(_l) LDLM_TEST_FLAG(( _l), 1ULL << 3) +#define ldlm_set_block_wait(_l) LDLM_SET_FLAG(( _l), 1ULL << 3) +#define ldlm_clear_block_wait(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 3) + +/** blocking or cancel packet was queued for sending. */ +#define LDLM_FL_AST_SENT 0x0000000000000020ULL // bit 5 +#define ldlm_is_ast_sent(_l) LDLM_TEST_FLAG(( _l), 1ULL << 5) +#define ldlm_set_ast_sent(_l) LDLM_SET_FLAG(( _l), 1ULL << 5) +#define ldlm_clear_ast_sent(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 5) + +/** + * Lock is being replayed. This could probably be implied by the fact that + * one of BLOCK_{GRANTED,CONV,WAIT} is set, but that is pretty dangerous. */ +#define LDLM_FL_REPLAY 0x0000000000000100ULL // bit 8 +#define ldlm_is_replay(_l) LDLM_TEST_FLAG(( _l), 1ULL << 8) +#define ldlm_set_replay(_l) LDLM_SET_FLAG(( _l), 1ULL << 8) +#define ldlm_clear_replay(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 8) + +/** Don't grant lock, just do intent. 
*/ +#define LDLM_FL_INTENT_ONLY 0x0000000000000200ULL // bit 9 +#define ldlm_is_intent_only(_l) LDLM_TEST_FLAG(( _l), 1ULL << 9) +#define ldlm_set_intent_only(_l) LDLM_SET_FLAG(( _l), 1ULL << 9) +#define ldlm_clear_intent_only(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 9) + +/** lock request has intent */ +#define LDLM_FL_HAS_INTENT 0x0000000000001000ULL // bit 12 +#define ldlm_is_has_intent(_l) LDLM_TEST_FLAG(( _l), 1ULL << 12) +#define ldlm_set_has_intent(_l) LDLM_SET_FLAG(( _l), 1ULL << 12) +#define ldlm_clear_has_intent(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 12) + +/** flock deadlock detected */ +#define LDLM_FL_FLOCK_DEADLOCK 0x0000000000008000ULL /* bit 15 */ +#define ldlm_is_flock_deadlock(_l) LDLM_TEST_FLAG((_l), 1ULL << 15) +#define ldlm_set_flock_deadlock(_l) LDLM_SET_FLAG((_l), 1ULL << 15) +#define ldlm_clear_flock_deadlock(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 15) + +/** discard (no writeback) on cancel */ +#define LDLM_FL_DISCARD_DATA 0x0000000000010000ULL // bit 16 +#define ldlm_is_discard_data(_l) LDLM_TEST_FLAG(( _l), 1ULL << 16) +#define ldlm_set_discard_data(_l) LDLM_SET_FLAG(( _l), 1ULL << 16) +#define ldlm_clear_discard_data(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 16) + +/** Blocked by group lock - wait indefinitely */ +#define LDLM_FL_NO_TIMEOUT 0x0000000000020000ULL // bit 17 +#define ldlm_is_no_timeout(_l) LDLM_TEST_FLAG(( _l), 1ULL << 17) +#define ldlm_set_no_timeout(_l) LDLM_SET_FLAG(( _l), 1ULL << 17) +#define ldlm_clear_no_timeout(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 17) + +/** + * Server told not to wait if blocked. For AGL, OST will not send glimpse + * callback. */ +#define LDLM_FL_BLOCK_NOWAIT 0x0000000000040000ULL // bit 18 +#define ldlm_is_block_nowait(_l) LDLM_TEST_FLAG(( _l), 1ULL << 18) +#define ldlm_set_block_nowait(_l) LDLM_SET_FLAG(( _l), 1ULL << 18) +#define ldlm_clear_block_nowait(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 18) + +/** return blocking lock */ +#define LDLM_FL_TEST_LOCK 0x0000000000080000ULL // bit 19 +#define ldlm_is_test_lock(_l) LDLM_TEST_FLAG(( _l), 1ULL << 19) +#define ldlm_set_test_lock(_l) LDLM_SET_FLAG(( _l), 1ULL << 19) +#define ldlm_clear_test_lock(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 19) + +/** + * Immediately cancel such locks when they block some other locks. Send + * cancel notification to original lock holder, but expect no reply. This + * is for clients (like liblustre) that cannot be expected to reliably + * response to blocking AST. 
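Every flag in this file follows the same generated pattern: a bit constant plus ldlm_is_/ldlm_set_/ldlm_clear_ wrappers around LDLM_TEST_FLAG()/LDLM_SET_FLAG()/LDLM_CLEAR_FLAG(), which are defined near the end of the file. A minimal hypothetical sketch of how such accessors are typically used, assuming the caller already holds the resource lock:

/* Illustrative: mark that a blocking AST has been queued for this lock,
 * unless the lock was requested with cancel-on-block semantics. */
static void example_mark_ast_sent(struct ldlm_lock *lock)
{
        check_res_locked(lock->l_resource);     /* caller holds lr_lock */

        if (!ldlm_is_cancel_on_block(lock))
                ldlm_set_ast_sent(lock);
}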
*/ +#define LDLM_FL_CANCEL_ON_BLOCK 0x0000000000800000ULL // bit 23 +#define ldlm_is_cancel_on_block(_l) LDLM_TEST_FLAG(( _l), 1ULL << 23) +#define ldlm_set_cancel_on_block(_l) LDLM_SET_FLAG(( _l), 1ULL << 23) +#define ldlm_clear_cancel_on_block(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 23) + +/** + * measure lock contention and return -EUSERS if locking contention is high */ +#define LDLM_FL_DENY_ON_CONTENTION 0x0000000040000000ULL // bit 30 +#define ldlm_is_deny_on_contention(_l) LDLM_TEST_FLAG(( _l), 1ULL << 30) +#define ldlm_set_deny_on_contention(_l) LDLM_SET_FLAG(( _l), 1ULL << 30) +#define ldlm_clear_deny_on_contention(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 30) + +/** + * These are flags that are mapped into the flags and ASTs of blocking + * locks Add FL_DISCARD to blocking ASTs */ +#define LDLM_FL_AST_DISCARD_DATA 0x0000000080000000ULL // bit 31 +#define ldlm_is_ast_discard_data(_l) LDLM_TEST_FLAG(( _l), 1ULL << 31) +#define ldlm_set_ast_discard_data(_l) LDLM_SET_FLAG(( _l), 1ULL << 31) +#define ldlm_clear_ast_discard_data(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 31) + +/** + * Used for marking lock as a target for -EINTR while cp_ast sleep emulation + * + race with upcoming bl_ast. */ +#define LDLM_FL_FAIL_LOC 0x0000000100000000ULL // bit 32 +#define ldlm_is_fail_loc(_l) LDLM_TEST_FLAG(( _l), 1ULL << 32) +#define ldlm_set_fail_loc(_l) LDLM_SET_FLAG(( _l), 1ULL << 32) +#define ldlm_clear_fail_loc(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 32) + +/** + * Used while processing the unused list to know that we have already + * handled this lock and decided to skip it. */ +#define LDLM_FL_SKIPPED 0x0000000200000000ULL // bit 33 +#define ldlm_is_skipped(_l) LDLM_TEST_FLAG(( _l), 1ULL << 33) +#define ldlm_set_skipped(_l) LDLM_SET_FLAG(( _l), 1ULL << 33) +#define ldlm_clear_skipped(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 33) + +/** this lock is being destroyed */ +#define LDLM_FL_CBPENDING 0x0000000400000000ULL // bit 34 +#define ldlm_is_cbpending(_l) LDLM_TEST_FLAG(( _l), 1ULL << 34) +#define ldlm_set_cbpending(_l) LDLM_SET_FLAG(( _l), 1ULL << 34) +#define ldlm_clear_cbpending(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 34) + +/** not a real flag, not saved in lock */ +#define LDLM_FL_WAIT_NOREPROC 0x0000000800000000ULL // bit 35 +#define ldlm_is_wait_noreproc(_l) LDLM_TEST_FLAG(( _l), 1ULL << 35) +#define ldlm_set_wait_noreproc(_l) LDLM_SET_FLAG(( _l), 1ULL << 35) +#define ldlm_clear_wait_noreproc(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 35) + +/** cancellation callback already run */ +#define LDLM_FL_CANCEL 0x0000001000000000ULL // bit 36 +#define ldlm_is_cancel(_l) LDLM_TEST_FLAG(( _l), 1ULL << 36) +#define ldlm_set_cancel(_l) LDLM_SET_FLAG(( _l), 1ULL << 36) +#define ldlm_clear_cancel(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 36) + +/** whatever it might mean */ +#define LDLM_FL_LOCAL_ONLY 0x0000002000000000ULL // bit 37 +#define ldlm_is_local_only(_l) LDLM_TEST_FLAG(( _l), 1ULL << 37) +#define ldlm_set_local_only(_l) LDLM_SET_FLAG(( _l), 1ULL << 37) +#define ldlm_clear_local_only(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 37) + +/** don't run the cancel callback under ldlm_cli_cancel_unused */ +#define LDLM_FL_FAILED 0x0000004000000000ULL // bit 38 +#define ldlm_is_failed(_l) LDLM_TEST_FLAG(( _l), 1ULL << 38) +#define ldlm_set_failed(_l) LDLM_SET_FLAG(( _l), 1ULL << 38) +#define ldlm_clear_failed(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 38) + +/** lock cancel has already been sent */ +#define LDLM_FL_CANCELING 0x0000008000000000ULL // bit 39 +#define ldlm_is_canceling(_l) LDLM_TEST_FLAG(( _l), 1ULL << 39) +#define ldlm_set_canceling(_l) LDLM_SET_FLAG(( _l), 
1ULL << 39) +#define ldlm_clear_canceling(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 39) + +/** local lock (ie, no srv/cli split) */ +#define LDLM_FL_LOCAL 0x0000010000000000ULL // bit 40 +#define ldlm_is_local(_l) LDLM_TEST_FLAG(( _l), 1ULL << 40) +#define ldlm_set_local(_l) LDLM_SET_FLAG(( _l), 1ULL << 40) +#define ldlm_clear_local(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 40) + +/** + * XXX FIXME: This is being added to b_size as a low-risk fix to the + * fact that the LVB filling happens _after_ the lock has been granted, + * so another thread can match it before the LVB has been updated. As a + * dirty hack, we set LDLM_FL_LVB_READY only after we've done the LVB poop. + * this is only needed on LOV/OSC now, where LVB is actually used and + * callers must set it in input flags. + * + * The proper fix is to do the granting inside of the completion AST, + * which can be replaced with a LVB-aware wrapping function for OSC locks. + * That change is pretty high-risk, though, and would need a lot more + * testing. */ +#define LDLM_FL_LVB_READY 0x0000020000000000ULL // bit 41 +#define ldlm_is_lvb_ready(_l) LDLM_TEST_FLAG(( _l), 1ULL << 41) +#define ldlm_set_lvb_ready(_l) LDLM_SET_FLAG(( _l), 1ULL << 41) +#define ldlm_clear_lvb_ready(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 41) + +/** + * A lock contributes to the known minimum size (KMS) calculation until it + * has finished the part of its cancellation that performs write back on its + * dirty pages. It can remain on the granted list during this whole time. + * Threads racing to update the KMS after performing their writeback need + * to know to exclude each other's locks from the calculation as they walk + * the granted list. */ +#define LDLM_FL_KMS_IGNORE 0x0000040000000000ULL // bit 42 +#define ldlm_is_kms_ignore(_l) LDLM_TEST_FLAG(( _l), 1ULL << 42) +#define ldlm_set_kms_ignore(_l) LDLM_SET_FLAG(( _l), 1ULL << 42) +#define ldlm_clear_kms_ignore(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 42) + +/** completion AST to be executed */ +#define LDLM_FL_CP_REQD 0x0000080000000000ULL // bit 43 +#define ldlm_is_cp_reqd(_l) LDLM_TEST_FLAG(( _l), 1ULL << 43) +#define ldlm_set_cp_reqd(_l) LDLM_SET_FLAG(( _l), 1ULL << 43) +#define ldlm_clear_cp_reqd(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 43) + +/** cleanup_resource has already handled the lock */ +#define LDLM_FL_CLEANED 0x0000100000000000ULL // bit 44 +#define ldlm_is_cleaned(_l) LDLM_TEST_FLAG(( _l), 1ULL << 44) +#define ldlm_set_cleaned(_l) LDLM_SET_FLAG(( _l), 1ULL << 44) +#define ldlm_clear_cleaned(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 44) + +/** + * optimization hint: LDLM can run blocking callback from current context + * w/o involving separate thread. in order to decrease cs rate */ +#define LDLM_FL_ATOMIC_CB 0x0000200000000000ULL // bit 45 +#define ldlm_is_atomic_cb(_l) LDLM_TEST_FLAG(( _l), 1ULL << 45) +#define ldlm_set_atomic_cb(_l) LDLM_SET_FLAG(( _l), 1ULL << 45) +#define ldlm_clear_atomic_cb(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 45) + +/** + * It may happen that a client initiates two operations, e.g. unlink and + * mkdir, such that the server sends a blocking AST for conflicting locks + * to this client for the first operation, whereas the second operation + * has canceled this lock and is waiting for rpc_lock which is taken by + * the first operation. LDLM_FL_BL_AST is set by ldlm_callback_handler() in + * the lock to prevent the Early Lock Cancel (ELC) code from cancelling it. + * + * LDLM_FL_BL_DONE is to be set by ldlm_cancel_callback() when lock cache is + * dropped to let ldlm_callback_handler() return EINVAL to the server. 
It + * is used when ELC RPC is already prepared and is waiting for rpc_lock, + * too late to send a separate CANCEL RPC. */ +#define LDLM_FL_BL_AST 0x0000400000000000ULL // bit 46 +#define ldlm_is_bl_ast(_l) LDLM_TEST_FLAG(( _l), 1ULL << 46) +#define ldlm_set_bl_ast(_l) LDLM_SET_FLAG(( _l), 1ULL << 46) +#define ldlm_clear_bl_ast(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 46) + +/** whatever it might mean */ +#define LDLM_FL_BL_DONE 0x0000800000000000ULL // bit 47 +#define ldlm_is_bl_done(_l) LDLM_TEST_FLAG(( _l), 1ULL << 47) +#define ldlm_set_bl_done(_l) LDLM_SET_FLAG(( _l), 1ULL << 47) +#define ldlm_clear_bl_done(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 47) + +/** + * Don't put lock into the LRU list, so that it is not canceled due + * to aging. Used by MGC locks, they are cancelled only at unmount or + * by callback. */ +#define LDLM_FL_NO_LRU 0x0001000000000000ULL // bit 48 +#define ldlm_is_no_lru(_l) LDLM_TEST_FLAG(( _l), 1ULL << 48) +#define ldlm_set_no_lru(_l) LDLM_SET_FLAG(( _l), 1ULL << 48) +#define ldlm_clear_no_lru(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 48) + +/** + * Set for locks that failed and where the server has been notified. + * + * Protected by lock and resource locks. */ +#define LDLM_FL_FAIL_NOTIFIED 0x0002000000000000ULL // bit 49 +#define ldlm_is_fail_notified(_l) LDLM_TEST_FLAG(( _l), 1ULL << 49) +#define ldlm_set_fail_notified(_l) LDLM_SET_FLAG(( _l), 1ULL << 49) +#define ldlm_clear_fail_notified(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 49) + +/** + * Set for locks that were removed from class hash table and will + * be destroyed when last reference to them is released. Set by + * ldlm_lock_destroy_internal(). + * + * Protected by lock and resource locks. */ +#define LDLM_FL_DESTROYED 0x0004000000000000ULL // bit 50 +#define ldlm_is_destroyed(_l) LDLM_TEST_FLAG(( _l), 1ULL << 50) +#define ldlm_set_destroyed(_l) LDLM_SET_FLAG(( _l), 1ULL << 50) +#define ldlm_clear_destroyed(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 50) + +/** flag whether this is a server namespace lock */ +#define LDLM_FL_SERVER_LOCK 0x0008000000000000ULL // bit 51 +#define ldlm_is_server_lock(_l) LDLM_TEST_FLAG(( _l), 1ULL << 51) +#define ldlm_set_server_lock(_l) LDLM_SET_FLAG(( _l), 1ULL << 51) +#define ldlm_clear_server_lock(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 51) + +/** + * It's set in lock_res_and_lock() and unset in unlock_res_and_lock(). + * + * NB: compared with check_res_locked(), checking this bit is cheaper. + * Also, spin_is_locked() is deprecated for kernel code; one reason is + * because it works only for SMP so user needs to add extra macros like + * LASSERT_SPIN_LOCKED for uniprocessor kernels. */ +#define LDLM_FL_RES_LOCKED 0x0010000000000000ULL // bit 52 +#define ldlm_is_res_locked(_l) LDLM_TEST_FLAG(( _l), 1ULL << 52) +#define ldlm_set_res_locked(_l) LDLM_SET_FLAG(( _l), 1ULL << 52) +#define ldlm_clear_res_locked(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 52) + +/** + * It's set once we call ldlm_add_waiting_lock_res_locked() to start the + * lock-timeout timer and it will never be reset. + * + * Protected by lock and resource locks. */ +#define LDLM_FL_WAITED 0x0020000000000000ULL // bit 53 +#define ldlm_is_waited(_l) LDLM_TEST_FLAG(( _l), 1ULL << 53) +#define ldlm_set_waited(_l) LDLM_SET_FLAG(( _l), 1ULL << 53) +#define ldlm_clear_waited(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 53) + +/** Flag whether this is a server namespace lock. 
*/ +#define LDLM_FL_NS_SRV 0x0040000000000000ULL // bit 54 +#define ldlm_is_ns_srv(_l) LDLM_TEST_FLAG(( _l), 1ULL << 54) +#define ldlm_set_ns_srv(_l) LDLM_SET_FLAG(( _l), 1ULL << 54) +#define ldlm_clear_ns_srv(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 54) + +/** Flag whether this lock can be reused. Used by exclusive open. */ +#define LDLM_FL_EXCL 0x0080000000000000ULL /* bit 55 */ +#define ldlm_is_excl(_l) LDLM_TEST_FLAG((_l), 1ULL << 55) +#define ldlm_set_excl(_l) LDLM_SET_FLAG((_l), 1ULL << 55) +#define ldlm_clear_excl(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 55) + +/** test for ldlm_lock flag bit set */ +#define LDLM_TEST_FLAG(_l, _b) (((_l)->l_flags & (_b)) != 0) + +/** set a ldlm_lock flag bit */ +#define LDLM_SET_FLAG(_l, _b) (((_l)->l_flags |= (_b)) + +/** clear a ldlm_lock flag bit */ +#define LDLM_CLEAR_FLAG(_l, _b) (((_l)->l_flags &= ~(_b)) + +/** Mask of flags inherited from parent lock when doing intents. */ +#define LDLM_INHERIT_FLAGS LDLM_FL_INHERIT_MASK + +/** Mask of Flags sent in AST lock_flags to map into the receiving lock. */ +#define LDLM_AST_FLAGS LDLM_FL_AST_MASK + +/** @} subgroup */ +/** @} group */ +#ifdef WIRESHARK_COMPILE +static int hf_lustre_ldlm_fl_lock_changed = -1; +static int hf_lustre_ldlm_fl_block_granted = -1; +static int hf_lustre_ldlm_fl_block_conv = -1; +static int hf_lustre_ldlm_fl_block_wait = -1; +static int hf_lustre_ldlm_fl_ast_sent = -1; +static int hf_lustre_ldlm_fl_replay = -1; +static int hf_lustre_ldlm_fl_intent_only = -1; +static int hf_lustre_ldlm_fl_has_intent = -1; +static int hf_lustre_ldlm_fl_flock_deadlock = -1; +static int hf_lustre_ldlm_fl_discard_data = -1; +static int hf_lustre_ldlm_fl_no_timeout = -1; +static int hf_lustre_ldlm_fl_block_nowait = -1; +static int hf_lustre_ldlm_fl_test_lock = -1; +static int hf_lustre_ldlm_fl_cancel_on_block = -1; +static int hf_lustre_ldlm_fl_deny_on_contention = -1; +static int hf_lustre_ldlm_fl_ast_discard_data = -1; +static int hf_lustre_ldlm_fl_fail_loc = -1; +static int hf_lustre_ldlm_fl_skipped = -1; +static int hf_lustre_ldlm_fl_cbpending = -1; +static int hf_lustre_ldlm_fl_wait_noreproc = -1; +static int hf_lustre_ldlm_fl_cancel = -1; +static int hf_lustre_ldlm_fl_local_only = -1; +static int hf_lustre_ldlm_fl_failed = -1; +static int hf_lustre_ldlm_fl_canceling = -1; +static int hf_lustre_ldlm_fl_local = -1; +static int hf_lustre_ldlm_fl_lvb_ready = -1; +static int hf_lustre_ldlm_fl_kms_ignore = -1; +static int hf_lustre_ldlm_fl_cp_reqd = -1; +static int hf_lustre_ldlm_fl_cleaned = -1; +static int hf_lustre_ldlm_fl_atomic_cb = -1; +static int hf_lustre_ldlm_fl_bl_ast = -1; +static int hf_lustre_ldlm_fl_bl_done = -1; +static int hf_lustre_ldlm_fl_no_lru = -1; +static int hf_lustre_ldlm_fl_fail_notified = -1; +static int hf_lustre_ldlm_fl_destroyed = -1; +static int hf_lustre_ldlm_fl_server_lock = -1; +static int hf_lustre_ldlm_fl_res_locked = -1; +static int hf_lustre_ldlm_fl_waited = -1; +static int hf_lustre_ldlm_fl_ns_srv = -1; +static int hf_lustre_ldlm_fl_excl = -1; + +const value_string lustre_ldlm_flags_vals[] = { + {LDLM_FL_LOCK_CHANGED, "LDLM_FL_LOCK_CHANGED"}, + {LDLM_FL_BLOCK_GRANTED, "LDLM_FL_BLOCK_GRANTED"}, + {LDLM_FL_BLOCK_CONV, "LDLM_FL_BLOCK_CONV"}, + {LDLM_FL_BLOCK_WAIT, "LDLM_FL_BLOCK_WAIT"}, + {LDLM_FL_AST_SENT, "LDLM_FL_AST_SENT"}, + {LDLM_FL_REPLAY, "LDLM_FL_REPLAY"}, + {LDLM_FL_INTENT_ONLY, "LDLM_FL_INTENT_ONLY"}, + {LDLM_FL_HAS_INTENT, "LDLM_FL_HAS_INTENT"}, + {LDLM_FL_FLOCK_DEADLOCK, "LDLM_FL_FLOCK_DEADLOCK"}, + {LDLM_FL_DISCARD_DATA, "LDLM_FL_DISCARD_DATA"}, + 
{LDLM_FL_NO_TIMEOUT, "LDLM_FL_NO_TIMEOUT"}, + {LDLM_FL_BLOCK_NOWAIT, "LDLM_FL_BLOCK_NOWAIT"}, + {LDLM_FL_TEST_LOCK, "LDLM_FL_TEST_LOCK"}, + {LDLM_FL_CANCEL_ON_BLOCK, "LDLM_FL_CANCEL_ON_BLOCK"}, + {LDLM_FL_DENY_ON_CONTENTION, "LDLM_FL_DENY_ON_CONTENTION"}, + {LDLM_FL_AST_DISCARD_DATA, "LDLM_FL_AST_DISCARD_DATA"}, + {LDLM_FL_FAIL_LOC, "LDLM_FL_FAIL_LOC"}, + {LDLM_FL_SKIPPED, "LDLM_FL_SKIPPED"}, + {LDLM_FL_CBPENDING, "LDLM_FL_CBPENDING"}, + {LDLM_FL_WAIT_NOREPROC, "LDLM_FL_WAIT_NOREPROC"}, + {LDLM_FL_CANCEL, "LDLM_FL_CANCEL"}, + {LDLM_FL_LOCAL_ONLY, "LDLM_FL_LOCAL_ONLY"}, + {LDLM_FL_FAILED, "LDLM_FL_FAILED"}, + {LDLM_FL_CANCELING, "LDLM_FL_CANCELING"}, + {LDLM_FL_LOCAL, "LDLM_FL_LOCAL"}, + {LDLM_FL_LVB_READY, "LDLM_FL_LVB_READY"}, + {LDLM_FL_KMS_IGNORE, "LDLM_FL_KMS_IGNORE"}, + {LDLM_FL_CP_REQD, "LDLM_FL_CP_REQD"}, + {LDLM_FL_CLEANED, "LDLM_FL_CLEANED"}, + {LDLM_FL_ATOMIC_CB, "LDLM_FL_ATOMIC_CB"}, + {LDLM_FL_BL_AST, "LDLM_FL_BL_AST"}, + {LDLM_FL_BL_DONE, "LDLM_FL_BL_DONE"}, + {LDLM_FL_NO_LRU, "LDLM_FL_NO_LRU"}, + {LDLM_FL_FAIL_NOTIFIED, "LDLM_FL_FAIL_NOTIFIED"}, + {LDLM_FL_DESTROYED, "LDLM_FL_DESTROYED"}, + {LDLM_FL_SERVER_LOCK, "LDLM_FL_SERVER_LOCK"}, + {LDLM_FL_RES_LOCKED, "LDLM_FL_RES_LOCKED"}, + {LDLM_FL_WAITED, "LDLM_FL_WAITED"}, + {LDLM_FL_NS_SRV, "LDLM_FL_NS_SRV"}, + {LDLM_FL_EXCL, "LDLM_FL_EXCL"}, + { 0, NULL } +}; +#endif /* WIRESHARK_COMPILE */ +#endif /* LDLM_ALL_FLAGS_MASK */ diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre_eacl.h b/kernel/drivers/staging/lustre/lustre/include/lustre_eacl.h new file mode 100644 index 000000000..0f8f76c43 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre_eacl.h @@ -0,0 +1,95 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/lustre/include/lustre_idmap.h + * + * MDS data structures. + * See also lustre_idl.h for wire formats of requests. 
+ */ + +#ifndef _LUSTRE_EACL_H +#define _LUSTRE_EACL_H + +/** \defgroup eacl eacl + * + * @{ + */ + +#ifdef CONFIG_FS_POSIX_ACL + +#include + +typedef struct { + __u16 e_tag; + __u16 e_perm; + __u32 e_id; + __u32 e_stat; +} ext_acl_xattr_entry; + +typedef struct { + __u32 a_count; + ext_acl_xattr_entry a_entries[0]; +} ext_acl_xattr_header; + +#define CFS_ACL_XATTR_SIZE(count, prefix) \ + (sizeof(prefix ## _header) + (count) * sizeof(prefix ## _entry)) + +#define CFS_ACL_XATTR_COUNT(size, prefix) \ + (((size) - sizeof(prefix ## _header)) / sizeof(prefix ## _entry)) + + +extern ext_acl_xattr_header * +lustre_posix_acl_xattr_2ext(posix_acl_xattr_header *header, int size); +extern int +lustre_posix_acl_xattr_filter(posix_acl_xattr_header *header, size_t size, + posix_acl_xattr_header **out); +extern void +lustre_posix_acl_xattr_free(posix_acl_xattr_header *header, int size); +extern void +lustre_ext_acl_xattr_free(ext_acl_xattr_header *header); +extern int +lustre_acl_xattr_merge2posix(posix_acl_xattr_header *posix_header, int size, + ext_acl_xattr_header *ext_header, + posix_acl_xattr_header **out); +extern ext_acl_xattr_header * +lustre_acl_xattr_merge2ext(posix_acl_xattr_header *posix_header, int size, + ext_acl_xattr_header *ext_header); + +#endif /* CONFIG_FS_POSIX_ACL */ + +/** @} eacl */ + +#endif diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre_export.h b/kernel/drivers/staging/lustre/lustre/include/lustre_export.h new file mode 100644 index 000000000..9c06a49f1 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre_export.h @@ -0,0 +1,406 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
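The CFS_ACL_XATTR_SIZE()/CFS_ACL_XATTR_COUNT() macros above are plain token-pasting helpers; as a rough worked example (the wrapper function is hypothetical), sizing an extended-ACL xattr buffer for n entries looks like:

/* Illustrative sizing only: for the ext_acl_xattr layout defined above,
 * size = sizeof(ext_acl_xattr_header) + n * sizeof(ext_acl_xattr_entry). */
static size_t example_ext_acl_size(__u32 n)
{
        return CFS_ACL_XATTR_SIZE(n, ext_acl_xattr);
}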
+ */ +/** \defgroup obd_export PortalRPC export definitions + * + * @{ + */ + +#ifndef __EXPORT_H +#define __EXPORT_H + +/** \defgroup export export + * + * @{ + */ + +#include "lprocfs_status.h" +#include "lustre/lustre_idl.h" +#include "lustre_dlm.h" + +struct mds_client_data; +struct mdt_client_data; +struct mds_idmap_table; +struct mdt_idmap_table; + +/** + * Target-specific export data + */ +struct tg_export_data { + /** Protects led_lcd below */ + struct mutex ted_lcd_lock; + /** Per-client data for each export */ + struct lsd_client_data *ted_lcd; + /** Offset of record in last_rcvd file */ + loff_t ted_lr_off; + /** Client index in last_rcvd file */ + int ted_lr_idx; +}; + +/** + * MDT-specific export data + */ +struct mdt_export_data { + struct tg_export_data med_ted; + /** List of all files opened by client on this MDT */ + struct list_head med_open_head; + spinlock_t med_open_lock; /* med_open_head, mfd_list */ + /** Bitmask of all ibit locks this MDT understands */ + __u64 med_ibits_known; + struct mutex med_idmap_mutex; + struct lustre_idmap_table *med_idmap; +}; + +struct ec_export_data { /* echo client */ + struct list_head eced_locks; +}; + +/* In-memory access to client data from OST struct */ +/** Filter (oss-side) specific import data */ +struct filter_export_data { + struct tg_export_data fed_ted; + spinlock_t fed_lock; /**< protects fed_mod_list */ + long fed_dirty; /* in bytes */ + long fed_grant; /* in bytes */ + struct list_head fed_mod_list; /* files being modified */ + int fed_mod_count;/* items in fed_writing list */ + long fed_pending; /* bytes just being written */ + __u32 fed_group; + __u8 fed_pagesize; /* log2 of client page size */ +}; + +struct mgs_export_data { + struct list_head med_clients; /* mgc fs client via this exp */ + spinlock_t med_lock; /* protect med_clients */ +}; + +/** + * per-NID statistics structure. + * It tracks access patterns to this export on a per-client-NID basis + */ +struct nid_stat { + lnet_nid_t nid; + struct hlist_node nid_hash; + struct list_head nid_list; + struct obd_device *nid_obd; + struct proc_dir_entry *nid_proc; + struct lprocfs_stats *nid_stats; + struct lprocfs_stats *nid_ldlm_stats; + atomic_t nid_exp_ref_count; /* for obd_nid_stats_hash + exp_nid_stats */ +}; + +#define nidstat_getref(nidstat) \ +do { \ + atomic_inc(&(nidstat)->nid_exp_ref_count); \ +} while (0) + +#define nidstat_putref(nidstat) \ +do { \ + atomic_dec(&(nidstat)->nid_exp_ref_count); \ + LASSERTF(atomic_read(&(nidstat)->nid_exp_ref_count) >= 0, \ + "stat %p nid_exp_ref_count < 0\n", nidstat); \ +} while (0) + +enum obd_option { + OBD_OPT_FORCE = 0x0001, + OBD_OPT_FAILOVER = 0x0002, + OBD_OPT_ABORT_RECOV = 0x0004, +}; + +/** + * Export structure. Represents target-side of connection in portals. + * Also used in Lustre to connect between layers on the same node when + * there is no network-connection in-between. + * For every connected client there is an export structure on the server + * attached to the same obd device. + */ +struct obd_export { + /** + * Export handle, it's id is provided to client on connect + * Subsequent client RPCs contain this handle id to identify + * what export they are talking to. + */ + struct portals_handle exp_handle; + atomic_t exp_refcount; + /** + * Set of counters below is to track where export references are + * kept. The exp_rpc_count is used for reconnect handling also, + * the cb_count and locks_count are for debug purposes only for now. 
+ * The sum of them should be less than exp_refcount by 3 + */ + atomic_t exp_rpc_count; /* RPC references */ + atomic_t exp_cb_count; /* Commit callback references */ + /** Number of queued replay requests to be processes */ + atomic_t exp_replay_count; + atomic_t exp_locks_count; /** Lock references */ +#if LUSTRE_TRACKS_LOCK_EXP_REFS + struct list_head exp_locks_list; + spinlock_t exp_locks_list_guard; +#endif + /** UUID of client connected to this export */ + struct obd_uuid exp_client_uuid; + /** To link all exports on an obd device */ + struct list_head exp_obd_chain; + struct hlist_node exp_uuid_hash; /** uuid-export hash*/ + struct hlist_node exp_nid_hash; /** nid-export hash */ + /** + * All exports eligible for ping evictor are linked into a list + * through this field in "most time since last request on this export" + * order + * protected by obd_dev_lock + */ + struct list_head exp_obd_chain_timed; + /** Obd device of this export */ + struct obd_device *exp_obd; + /** + * "reverse" import to send requests (e.g. from ldlm) back to client + * exp_lock protect its change + */ + struct obd_import *exp_imp_reverse; + struct nid_stat *exp_nid_stats; + struct lprocfs_stats *exp_md_stats; + /** Active connection */ + struct ptlrpc_connection *exp_connection; + /** Connection count value from last successful reconnect rpc */ + __u32 exp_conn_cnt; + /** Hash list of all ldlm locks granted on this export */ + struct cfs_hash *exp_lock_hash; + /** + * Hash list for Posix lock deadlock detection, added with + * ldlm_lock::l_exp_flock_hash. + */ + struct cfs_hash *exp_flock_hash; + struct list_head exp_outstanding_replies; + struct list_head exp_uncommitted_replies; + spinlock_t exp_uncommitted_replies_lock; + /** Last committed transno for this export */ + __u64 exp_last_committed; + /** When was last request received */ + unsigned long exp_last_request_time; + /** On replay all requests waiting for replay are linked here */ + struct list_head exp_req_replay_queue; + /** + * protects exp_flags, exp_outstanding_replies and the change + * of exp_imp_reverse + */ + spinlock_t exp_lock; + /** Compatibility flags for this export are embedded into + * exp_connect_data */ + struct obd_connect_data exp_connect_data; + enum obd_option exp_flags; + unsigned long exp_failed:1, + exp_in_recovery:1, + exp_disconnected:1, + exp_connecting:1, + /** VBR: export missed recovery */ + exp_delayed:1, + /** VBR: failed version checking */ + exp_vbr_failed:1, + exp_req_replay_needed:1, + exp_lock_replay_needed:1, + exp_need_sync:1, + exp_flvr_changed:1, + exp_flvr_adapt:1, + exp_libclient:1, /* liblustre client? */ + /* client timed out and tried to reconnect, + * but couldn't because of active rpcs */ + exp_abort_active_req:1, + /* if to swap nidtbl entries for 2.2 clients. + * Only used by the MGS to fix LU-1644. 
*/ + exp_need_mne_swab:1; + /* also protected by exp_lock */ + enum lustre_sec_part exp_sp_peer; + struct sptlrpc_flavor exp_flvr; /* current */ + struct sptlrpc_flavor exp_flvr_old[2]; /* about-to-expire */ + unsigned long exp_flvr_expire[2]; /* seconds */ + + /** protects exp_hp_rpcs */ + spinlock_t exp_rpc_lock; + struct list_head exp_hp_rpcs; /* (potential) HP RPCs */ + + /** blocking dlm lock list, protected by exp_bl_list_lock */ + struct list_head exp_bl_list; + spinlock_t exp_bl_list_lock; + + /** Target specific data */ + union { + struct tg_export_data eu_target_data; + struct mdt_export_data eu_mdt_data; + struct filter_export_data eu_filter_data; + struct ec_export_data eu_ec_data; + struct mgs_export_data eu_mgs_data; + } u; +}; + +#define exp_target_data u.eu_target_data +#define exp_mdt_data u.eu_mdt_data +#define exp_filter_data u.eu_filter_data +#define exp_ec_data u.eu_ec_data + +static inline __u64 *exp_connect_flags_ptr(struct obd_export *exp) +{ + return &exp->exp_connect_data.ocd_connect_flags; +} + +static inline __u64 exp_connect_flags(struct obd_export *exp) +{ + return *exp_connect_flags_ptr(exp); +} + +static inline int exp_max_brw_size(struct obd_export *exp) +{ + LASSERT(exp != NULL); + if (exp_connect_flags(exp) & OBD_CONNECT_BRW_SIZE) + return exp->exp_connect_data.ocd_brw_size; + + return ONE_MB_BRW_SIZE; +} + +static inline int exp_connect_multibulk(struct obd_export *exp) +{ + return exp_max_brw_size(exp) > ONE_MB_BRW_SIZE; +} + +static inline int exp_expired(struct obd_export *exp, long age) +{ + LASSERT(exp->exp_delayed); + return time_before(cfs_time_add(exp->exp_last_request_time, age), + get_seconds()); +} + +static inline int exp_connect_cancelset(struct obd_export *exp) +{ + LASSERT(exp != NULL); + return !!(exp_connect_flags(exp) & OBD_CONNECT_CANCELSET); +} + +static inline int exp_connect_lru_resize(struct obd_export *exp) +{ + LASSERT(exp != NULL); + return !!(exp_connect_flags(exp) & OBD_CONNECT_LRU_RESIZE); +} + +static inline int exp_connect_rmtclient(struct obd_export *exp) +{ + LASSERT(exp != NULL); + return !!(exp_connect_flags(exp) & OBD_CONNECT_RMT_CLIENT); +} + +static inline int client_is_remote(struct obd_export *exp) +{ + struct obd_import *imp = class_exp2cliimp(exp); + + return !!(imp->imp_connect_data.ocd_connect_flags & + OBD_CONNECT_RMT_CLIENT); +} + +static inline int exp_connect_vbr(struct obd_export *exp) +{ + LASSERT(exp != NULL); + LASSERT(exp->exp_connection); + return !!(exp_connect_flags(exp) & OBD_CONNECT_VBR); +} + +static inline int exp_connect_som(struct obd_export *exp) +{ + LASSERT(exp != NULL); + return !!(exp_connect_flags(exp) & OBD_CONNECT_SOM); +} + +static inline int exp_connect_umask(struct obd_export *exp) +{ + return !!(exp_connect_flags(exp) & OBD_CONNECT_UMASK); +} + +static inline int imp_connect_lru_resize(struct obd_import *imp) +{ + struct obd_connect_data *ocd; + + LASSERT(imp != NULL); + ocd = &imp->imp_connect_data; + return !!(ocd->ocd_connect_flags & OBD_CONNECT_LRU_RESIZE); +} + +static inline int exp_connect_layout(struct obd_export *exp) +{ + return !!(exp_connect_flags(exp) & OBD_CONNECT_LAYOUTLOCK); +} + +static inline bool exp_connect_lvb_type(struct obd_export *exp) +{ + LASSERT(exp != NULL); + if (exp_connect_flags(exp) & OBD_CONNECT_LVB_TYPE) + return true; + else + return false; +} + +static inline bool imp_connect_lvb_type(struct obd_import *imp) +{ + struct obd_connect_data *ocd; + + LASSERT(imp != NULL); + ocd = &imp->imp_connect_data; + if (ocd->ocd_connect_flags & 
OBD_CONNECT_LVB_TYPE) + return true; + else + return false; +} + +static inline __u64 exp_connect_ibits(struct obd_export *exp) +{ + struct obd_connect_data *ocd; + + ocd = &exp->exp_connect_data; + return ocd->ocd_ibits_known; +} + +static inline bool imp_connect_disp_stripe(struct obd_import *imp) +{ + struct obd_connect_data *ocd; + + LASSERT(imp != NULL); + ocd = &imp->imp_connect_data; + return ocd->ocd_connect_flags & OBD_CONNECT_DISP_STRIPE; +} + +extern struct obd_export *class_conn2export(struct lustre_handle *conn); +extern struct obd_device *class_conn2obd(struct lustre_handle *conn); + +/** @} export */ + +#endif /* __EXPORT_H */ +/** @} obd_export */ diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre_fid.h b/kernel/drivers/staging/lustre/lustre/include/lustre_fid.h new file mode 100644 index 000000000..0a0929fd9 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre_fid.h @@ -0,0 +1,767 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre_fid.h + * + * Author: Yury Umanets + */ + +#ifndef __LUSTRE_FID_H +#define __LUSTRE_FID_H + +/** \defgroup fid fid + * + * @{ + * + * http://wiki.lustre.org/index.php/Architecture_-_Interoperability_fids_zfs + * describes the FID namespace and interoperability requirements for FIDs. + * The important parts of that document are included here for reference. + * + * FID + * File IDentifier generated by client from range allocated by the SEQuence + * service and stored in struct lu_fid. The FID is composed of three parts: + * SEQuence, ObjectID, and VERsion. The SEQ component is a filesystem + * unique 64-bit integer, and only one client is ever assigned any SEQ value. + * The first 0x400 FID_SEQ_NORMAL [2^33, 2^33 + 0x400] values are reserved + * for system use. The OID component is a 32-bit value generated by the + * client on a per-SEQ basis to allow creating many unique FIDs without + * communication with the server. The VER component is a 32-bit value that + * distinguishes between different FID instantiations, such as snapshots or + * separate subtrees within the filesystem. FIDs with the same VER field + * are considered part of the same namespace. 
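As a concrete, made-up illustration of the SEQ/OID/VER split described above, a client that owns sequence 0x200000401 (just past the reserved range) and has handed out five objects would compose its next FID roughly like this:

/* Illustrative only: compose a FID from its three components. */
static void example_compose_fid(struct lu_fid *fid)
{
        fid->f_seq = 0x200000401ULL;    /* SEQ: client-unique 64-bit value */
        fid->f_oid = 6;                 /* OID: next per-SEQ object number */
        fid->f_ver = 0;                 /* VER: 0 for the primary namespace */
}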
+ * + * OLD filesystems are those upgraded from Lustre 1.x that predate FIDs, and + * MDTs use 32-bit ldiskfs internal inode/generation numbers (IGIFs), while + * OSTs use 64-bit Lustre object IDs and generation numbers. + * + * NEW filesystems are those formatted since the introduction of FIDs. + * + * IGIF + * Inode and Generation In FID, a surrogate FID used to globally identify + * an existing object on OLD formatted MDT file system. This would only be + * used on MDT0 in a DNE filesystem, because there cannot be more than one + * MDT in an OLD formatted filesystem. Belongs to sequence in [12, 2^32 - 1] + * range, where inode number is stored in SEQ, and inode generation is in OID. + * NOTE: This assumes no more than 2^32-1 inodes exist in the MDT filesystem, + * which is the maximum possible for an ldiskfs backend. It also assumes + * that the reserved ext3/ext4/ldiskfs inode numbers [0-11] are never visible + * to clients, which has always been true. + * + * IDIF + * object ID In FID, a surrogate FID used to globally identify an existing + * OST object on OLD formatted OST file system. Belongs to a sequence in + * [2^32, 2^33 - 1]. Sequence number is calculated as: + * + * 1 << 32 | (ost_index << 16) | ((objid >> 32) & 0xffff) + * + * that is, SEQ consists of 16-bit OST index, and higher 16 bits of object + * ID. The generation of unique SEQ values per OST allows the IDIF FIDs to + * be identified in the FLD correctly. The OID field is calculated as: + * + * objid & 0xffffffff + * + * that is, it consists of lower 32 bits of object ID. For objects within + * the IDIF range, object ID extraction will be: + * + * o_id = (fid->f_seq & 0x7fff) << 16 | fid->f_oid; + * o_seq = 0; // formerly group number + * + * NOTE: This assumes that no more than 2^48-1 objects have ever been created + * on any OST, and that no more than 65535 OSTs are in use. Both are very + * reasonable assumptions, i.e. an IDIF can uniquely map all objects assuming + * a maximum creation rate of 1M objects per second for a maximum of 9 years, + * or combinations thereof. + * + * OST_MDT0 + * Surrogate FID used to identify an existing object on OLD formatted OST + * filesystem. Belongs to the reserved SEQuence 0, and is used prior to + * the introduction of FID-on-OST, at which point IDIF will be used to + * identify objects as residing on a specific OST. + * + * LLOG + * For Lustre Log objects the object sequence 1 is used. This is compatible + * with both OLD and NEW namespaces, as this SEQ number is in the + * ext3/ldiskfs reserved inode range and does not conflict with IGIF + * sequence numbers. + * + * ECHO + * For testing OST IO performance the object sequence 2 is used. This is + * compatible with both OLD and NEW namespaces, as this SEQ number is in + * the ext3/ldiskfs reserved inode range and does not conflict with IGIF + * sequence numbers. + * + * OST_MDT1 .. OST_MAX + * For testing with multiple MDTs the object sequence 3 through 9 is used, + * allowing direct mapping of MDTs 1 through 7 respectively, for a total + * of 8 MDTs including OST_MDT0. This matches the legacy CMD project "group" + * mappings. However, this SEQ range is only for testing prior to any + * production DNE release, as the objects in this range conflict across all + * OSTs, as the OST index is not part of the FID. For production DNE usage, + * OST objects created by MDT1+ will use FID_SEQ_NORMAL FIDs. 
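A short worked example of the IDIF mapping spelled out above, with made-up values: for ost_index = 2 and objid = 0x500001234, the surrogate FID comes out as f_seq = 0x100020005 and f_oid = 0x00001234.

/* Illustrative IDIF construction, following the formulas in the comment
 * above: SEQ = 2^32 | (ost_index << 16) | (high 16 bits of objid),
 * OID = low 32 bits of objid. */
static void example_idif_fid(struct lu_fid *fid, __u32 ost_index, __u64 objid)
{
        fid->f_seq = (1ULL << 32) | ((__u64)ost_index << 16) |
                     ((objid >> 32) & 0xffff);
        fid->f_oid = (__u32)(objid & 0xffffffff);
        fid->f_ver = 0;
}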
+ * + * DLM OST objid to IDIF mapping + * For compatibility with existing OLD OST network protocol structures, the + * FID must map onto the o_id and o_seq in a manner that ensures existing + * objects are identified consistently for IO, as well as onto the LDLM + * namespace to ensure IDIFs there is only a single resource name for any + * object in the DLM. The OLD OST object DLM resource mapping is: + * + * resource[] = {o_id, o_seq, 0, 0}; // o_seq == 0 for production releases + * + * The NEW OST object DLM resource mapping is the same for both MDT and OST: + * + * resource[] = {SEQ, OID, VER, HASH}; + * + * NOTE: for mapping IDIF values to DLM resource names the o_id may be + * larger than the 2^33 reserved sequence numbers for IDIF, so it is possible + * for the o_id numbers to overlap FID SEQ numbers in the resource. However, + * in all production releases the OLD o_seq field is always zero, and all + * valid FID OID values are non-zero, so the lock resources will not collide. + * Even so, the MDT and OST resources are also in different LDLM namespaces. + */ + +#include "../../include/linux/libcfs/libcfs.h" +#include "lustre/lustre_idl.h" + +struct lu_env; +struct lu_site; +struct lu_context; +struct obd_device; +struct obd_export; + +/* Whole sequences space range and zero range definitions */ +extern const struct lu_seq_range LUSTRE_SEQ_SPACE_RANGE; +extern const struct lu_seq_range LUSTRE_SEQ_ZERO_RANGE; +extern const struct lu_fid LUSTRE_BFL_FID; +extern const struct lu_fid LU_OBF_FID; +extern const struct lu_fid LU_DOT_LUSTRE_FID; + +enum { + /* + * This is how may metadata FIDs may be allocated in one sequence(128k) + */ + LUSTRE_METADATA_SEQ_MAX_WIDTH = 0x0000000000020000ULL, + + /* + * This is how many data FIDs could be allocated in one sequence(4B - 1) + */ + LUSTRE_DATA_SEQ_MAX_WIDTH = 0x00000000FFFFFFFFULL, + + /* + * How many sequences to allocate to a client at once. + */ + LUSTRE_SEQ_META_WIDTH = 0x0000000000000001ULL, + + /* + * seq allocation pool size. + */ + LUSTRE_SEQ_BATCH_WIDTH = LUSTRE_SEQ_META_WIDTH * 1000, + + /* + * This is how many sequences may be in one super-sequence allocated to + * MDTs. 
+ */ + LUSTRE_SEQ_SUPER_WIDTH = ((1ULL << 30ULL) * LUSTRE_SEQ_META_WIDTH) +}; + +enum { + /** 2^6 FIDs for OI containers */ + OSD_OI_FID_OID_BITS = 6, + /** reserve enough FIDs in case we want more in the future */ + OSD_OI_FID_OID_BITS_MAX = 10, +}; + +/** special OID for local objects */ +enum local_oid { + /** \see fld_mod_init */ + FLD_INDEX_OID = 3UL, + /** \see fid_mod_init */ + FID_SEQ_CTL_OID = 4UL, + FID_SEQ_SRV_OID = 5UL, + /** \see mdd_mod_init */ + MDD_ROOT_INDEX_OID = 6UL, /* deprecated in 2.4 */ + MDD_ORPHAN_OID = 7UL, /* deprecated in 2.4 */ + MDD_LOV_OBJ_OID = 8UL, + MDD_CAPA_KEYS_OID = 9UL, + /** \see mdt_mod_init */ + LAST_RECV_OID = 11UL, + OSD_FS_ROOT_OID = 13UL, + ACCT_USER_OID = 15UL, + ACCT_GROUP_OID = 16UL, + LFSCK_BOOKMARK_OID = 17UL, + OTABLE_IT_OID = 18UL, + /* These two definitions are obsolete + * OFD_GROUP0_LAST_OID = 20UL, + * OFD_GROUP4K_LAST_OID = 20UL+4096, + */ + OFD_LAST_GROUP_OID = 4117UL, + LLOG_CATALOGS_OID = 4118UL, + MGS_CONFIGS_OID = 4119UL, + OFD_HEALTH_CHECK_OID = 4120UL, + MDD_LOV_OBJ_OSEQ = 4121UL, + LFSCK_NAMESPACE_OID = 4122UL, + REMOTE_PARENT_DIR_OID = 4123UL, +}; + +static inline void lu_local_obj_fid(struct lu_fid *fid, __u32 oid) +{ + fid->f_seq = FID_SEQ_LOCAL_FILE; + fid->f_oid = oid; + fid->f_ver = 0; +} + +static inline void lu_local_name_obj_fid(struct lu_fid *fid, __u32 oid) +{ + fid->f_seq = FID_SEQ_LOCAL_NAME; + fid->f_oid = oid; + fid->f_ver = 0; +} + +/* For new FS (>= 2.4), the root FID will be changed to + * [FID_SEQ_ROOT:1:0], for existing FS, (upgraded to 2.4), + * the root FID will still be IGIF */ +static inline int fid_is_root(const struct lu_fid *fid) +{ + return unlikely((fid_seq(fid) == FID_SEQ_ROOT && + fid_oid(fid) == 1)); +} + +static inline int fid_is_dot_lustre(const struct lu_fid *fid) +{ + return unlikely(fid_seq(fid) == FID_SEQ_DOT_LUSTRE && + fid_oid(fid) == FID_OID_DOT_LUSTRE); +} + +static inline int fid_is_obf(const struct lu_fid *fid) +{ + return unlikely(fid_seq(fid) == FID_SEQ_DOT_LUSTRE && + fid_oid(fid) == FID_OID_DOT_LUSTRE_OBF); +} + +static inline int fid_is_otable_it(const struct lu_fid *fid) +{ + return unlikely(fid_seq(fid) == FID_SEQ_LOCAL_FILE && + fid_oid(fid) == OTABLE_IT_OID); +} + +static inline int fid_is_acct(const struct lu_fid *fid) +{ + return fid_seq(fid) == FID_SEQ_LOCAL_FILE && + (fid_oid(fid) == ACCT_USER_OID || + fid_oid(fid) == ACCT_GROUP_OID); +} + +static inline int fid_is_quota(const struct lu_fid *fid) +{ + return fid_seq(fid) == FID_SEQ_QUOTA || + fid_seq(fid) == FID_SEQ_QUOTA_GLB; +} + +static inline int fid_is_namespace_visible(const struct lu_fid *fid) +{ + const __u64 seq = fid_seq(fid); + + /* Here, we cannot distinguish whether the normal FID is for OST + * object or not. It is caller's duty to check more if needed. 
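For the reserved local objects listed in enum local_oid above, FIDs are built with lu_local_obj_fid(); a purely illustrative example for the FLD index object:

/* Illustrative: the FID of the FLD index object is
 * [FID_SEQ_LOCAL_FILE : FLD_INDEX_OID : 0]. */
static void example_fld_index_fid(struct lu_fid *fid)
{
        lu_local_obj_fid(fid, FLD_INDEX_OID);
}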
*/ + return (!fid_is_last_id(fid) && + (fid_seq_is_norm(seq) || fid_seq_is_igif(seq))) || + fid_is_root(fid) || fid_is_dot_lustre(fid); +} + +static inline int fid_seq_in_fldb(__u64 seq) +{ + return fid_seq_is_igif(seq) || fid_seq_is_norm(seq) || + fid_seq_is_root(seq) || fid_seq_is_dot(seq); +} + +static inline void lu_last_id_fid(struct lu_fid *fid, __u64 seq) +{ + if (fid_seq_is_mdt0(seq)) { + fid->f_seq = fid_idif_seq(0, 0); + } else { + LASSERTF(fid_seq_is_norm(seq) || fid_seq_is_echo(seq) || + fid_seq_is_idif(seq), "%#llx\n", seq); + fid->f_seq = seq; + } + fid->f_oid = 0; + fid->f_ver = 0; +} + +/* seq client type */ +enum lu_cli_type { + LUSTRE_SEQ_METADATA = 1, + LUSTRE_SEQ_DATA +}; + +enum lu_mgr_type { + LUSTRE_SEQ_SERVER, + LUSTRE_SEQ_CONTROLLER +}; + +struct lu_server_seq; + +/* Client sequence manager interface. */ +struct lu_client_seq { + /* Sequence-controller export. */ + struct obd_export *lcs_exp; + struct mutex lcs_mutex; + + /* + * Range of allowed for allocation sequences. When using lu_client_seq on + * clients, this contains meta-sequence range. And for servers this + * contains super-sequence range. + */ + struct lu_seq_range lcs_space; + + /* Seq related proc */ + struct proc_dir_entry *lcs_proc_dir; + + /* This holds last allocated fid in last obtained seq */ + struct lu_fid lcs_fid; + + /* LUSTRE_SEQ_METADATA or LUSTRE_SEQ_DATA */ + enum lu_cli_type lcs_type; + + /* + * Service uuid, passed from MDT + seq name to form unique seq name to + * use it with procfs. + */ + char lcs_name[LUSTRE_MDT_MAXNAMELEN]; + + /* + * Sequence width, that is how many objects may be allocated in one + * sequence. Default value for it is LUSTRE_SEQ_MAX_WIDTH. + */ + __u64 lcs_width; + + /* Seq-server for direct talking */ + struct lu_server_seq *lcs_srv; + + /* wait queue for fid allocation and update indicator */ + wait_queue_head_t lcs_waitq; + int lcs_update; +}; + +/* server sequence manager interface */ +struct lu_server_seq { + /* Available sequences space */ + struct lu_seq_range lss_space; + + /* keeps highwater in lsr_end for seq allocation algorithm */ + struct lu_seq_range lss_lowater_set; + struct lu_seq_range lss_hiwater_set; + + /* + * Device for server side seq manager needs (saving sequences to backing + * store). + */ + struct dt_device *lss_dev; + + /* /seq file object device */ + struct dt_object *lss_obj; + + /* Seq related proc */ + struct proc_dir_entry *lss_proc_dir; + + /* LUSTRE_SEQ_SERVER or LUSTRE_SEQ_CONTROLLER */ + enum lu_mgr_type lss_type; + + /* Client interface to request controller */ + struct lu_client_seq *lss_cli; + + /* Mutex for protecting allocation */ + struct mutex lss_mutex; + + /* + * Service uuid, passed from MDT + seq name to form unique seq name to + * use it with procfs. + */ + char lss_name[LUSTRE_MDT_MAXNAMELEN]; + + /* + * Allocation chunks for super and meta sequences. Default values are + * LUSTRE_SEQ_SUPER_WIDTH and LUSTRE_SEQ_META_WIDTH. + */ + __u64 lss_width; + + /* + * minimum lss_alloc_set size that should be allocated from + * lss_space + */ + __u64 lss_set_width; + + /* sync is needed for update operation */ + __u32 lss_need_sync; + + /** + * Pointer to site object, required to access site fld. 
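The client-side sequence manager above is what hands out new FIDs; a hedged sketch of a caller using seq_client_alloc_fid() (declared just below), with error handling reduced to the bare minimum and "env"/"seq" assumed to be set up elsewhere:

/* Illustrative: ask the client sequence manager for the next FID. */
static int example_new_fid(const struct lu_env *env,
                           struct lu_client_seq *seq, struct lu_fid *fid)
{
        int rc;

        rc = seq_client_alloc_fid(env, seq, fid);
        if (rc < 0)
                return rc;      /* allocation failed */
        return 0;
}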
+ */ + struct seq_server_site *lss_site; +}; + +/* Server methods */ + +int seq_server_init(struct lu_server_seq *seq, + struct dt_device *dev, + const char *prefix, + enum lu_mgr_type type, + struct seq_server_site *ss, + const struct lu_env *env); + +void seq_server_fini(struct lu_server_seq *seq, + const struct lu_env *env); + +int seq_server_alloc_super(struct lu_server_seq *seq, + struct lu_seq_range *out, + const struct lu_env *env); + +int seq_server_alloc_meta(struct lu_server_seq *seq, + struct lu_seq_range *out, + const struct lu_env *env); + +int seq_server_set_cli(struct lu_server_seq *seq, + struct lu_client_seq *cli, + const struct lu_env *env); + +/* Client methods */ +int seq_client_init(struct lu_client_seq *seq, + struct obd_export *exp, + enum lu_cli_type type, + const char *prefix, + struct lu_server_seq *srv); + +void seq_client_fini(struct lu_client_seq *seq); + +void seq_client_flush(struct lu_client_seq *seq); + +int seq_client_alloc_fid(const struct lu_env *env, struct lu_client_seq *seq, + struct lu_fid *fid); +int seq_client_get_seq(const struct lu_env *env, struct lu_client_seq *seq, + u64 *seqnr); +int seq_site_fini(const struct lu_env *env, struct seq_server_site *ss); +/* Fids common stuff */ +int fid_is_local(const struct lu_env *env, + struct lu_site *site, const struct lu_fid *fid); + +enum lu_cli_type; +int client_fid_init(struct obd_device *obd, struct obd_export *exp, + enum lu_cli_type type); +int client_fid_fini(struct obd_device *obd); + +/* fid locking */ + +struct ldlm_namespace; + +/* + * Build (DLM) resource name from FID. + * + * NOTE: until Lustre 1.8.7/2.1.1 the fid_ver() was packed into name[2], + * but was moved into name[1] along with the OID to avoid consuming the + * renaming name[2,3] fields that need to be used for the quota identifier. + */ +static inline struct ldlm_res_id * +fid_build_reg_res_name(const struct lu_fid *fid, struct ldlm_res_id *res) +{ + memset(res, 0, sizeof(*res)); + res->name[LUSTRE_RES_ID_SEQ_OFF] = fid_seq(fid); + res->name[LUSTRE_RES_ID_VER_OID_OFF] = fid_ver_oid(fid); + + return res; +} + +/* + * Return true if resource is for object identified by FID. + */ +static inline int fid_res_name_eq(const struct lu_fid *fid, + const struct ldlm_res_id *res) +{ + return res->name[LUSTRE_RES_ID_SEQ_OFF] == fid_seq(fid) && + res->name[LUSTRE_RES_ID_VER_OID_OFF] == fid_ver_oid(fid); +} + +/* + * Extract FID from LDLM resource. Reverse of fid_build_reg_res_name(). + */ +static inline struct lu_fid * +fid_extract_from_res_name(struct lu_fid *fid, const struct ldlm_res_id *res) +{ + fid->f_seq = res->name[LUSTRE_RES_ID_SEQ_OFF]; + fid->f_oid = (__u32)(res->name[LUSTRE_RES_ID_VER_OID_OFF]); + fid->f_ver = (__u32)(res->name[LUSTRE_RES_ID_VER_OID_OFF] >> 32); + LASSERT(fid_res_name_eq(fid, res)); + + return fid; +} + +/* + * Build (DLM) resource identifier from global quota FID and quota ID. 
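+ *
+ * Packing note (descriptive only, restating what the code below does): the
+ * global quota FID occupies the same seq and ver/oid slots that
+ * fid_build_reg_res_name() fills, and the per-ID quota FID from \a qid is
+ * packed into the LUSTRE_RES_ID_QUOTA_SEQ_OFF and
+ * LUSTRE_RES_ID_QUOTA_VER_OID_OFF slots.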
+ */ +static inline struct ldlm_res_id * +fid_build_quota_res_name(const struct lu_fid *glb_fid, union lquota_id *qid, + struct ldlm_res_id *res) +{ + fid_build_reg_res_name(glb_fid, res); + res->name[LUSTRE_RES_ID_QUOTA_SEQ_OFF] = fid_seq(&qid->qid_fid); + res->name[LUSTRE_RES_ID_QUOTA_VER_OID_OFF] = fid_ver_oid(&qid->qid_fid); + + return res; +} + +/* + * Extract global FID and quota ID from resource name + */ +static inline void fid_extract_from_quota_res(struct lu_fid *glb_fid, + union lquota_id *qid, + const struct ldlm_res_id *res) +{ + fid_extract_from_res_name(glb_fid, res); + qid->qid_fid.f_seq = res->name[LUSTRE_RES_ID_QUOTA_SEQ_OFF]; + qid->qid_fid.f_oid = (__u32)res->name[LUSTRE_RES_ID_QUOTA_VER_OID_OFF]; + qid->qid_fid.f_ver = + (__u32)(res->name[LUSTRE_RES_ID_QUOTA_VER_OID_OFF] >> 32); +} + +static inline struct ldlm_res_id * +fid_build_pdo_res_name(const struct lu_fid *fid, unsigned int hash, + struct ldlm_res_id *res) +{ + fid_build_reg_res_name(fid, res); + res->name[LUSTRE_RES_ID_HSH_OFF] = hash; + + return res; +} + +/** + * Build DLM resource name from object id & seq, which will eventually be + * removed when we replace ost_id with FID in the data stack. + * + * Currently, a resid from the old client, whose res[0] = object_id, + * res[1] = object_seq, is just the opposite of the metadata + * resid, where res[0] = fid->f_seq, res[1] = fid->f_oid. + * To unify the resid identification, we will reverse the data + * resid to keep it the same as the metadata resid, i.e. + * + * For a resid from the old client, + * res[0] = objid, res[1] = 0, we still keep the original order + * for compatibility. + * + * For a new resid, + * res will be built from the normal FID directly, i.e. res[0] = f_seq, + * res[1] = f_oid + f_ver. + */ +static inline void ostid_build_res_name(struct ost_id *oi, + struct ldlm_res_id *name) +{ + memset(name, 0, sizeof(*name)); + if (fid_seq_is_mdt0(ostid_seq(oi))) { + name->name[LUSTRE_RES_ID_SEQ_OFF] = ostid_id(oi); + name->name[LUSTRE_RES_ID_VER_OID_OFF] = ostid_seq(oi); + } else { + fid_build_reg_res_name(&oi->oi_fid, name); + } +} + +static inline void ostid_res_name_to_id(struct ost_id *oi, + struct ldlm_res_id *name) +{ + if (fid_seq_is_mdt0(name->name[LUSTRE_RES_ID_SEQ_OFF])) { + /* old resid */ + ostid_set_seq(oi, name->name[LUSTRE_RES_ID_VER_OID_OFF]); + ostid_set_id(oi, name->name[LUSTRE_RES_ID_SEQ_OFF]); + } else { + /* new resid */ + fid_extract_from_res_name(&oi->oi_fid, name); + } +} + +/** + * Return true if the resource is for the object identified by this id & group.
+ */ +static inline int ostid_res_name_eq(struct ost_id *oi, + struct ldlm_res_id *name) +{ + /* Note: it is just a trick here to save some effort; probably the + * correct way would be to turn them into the FID and compare */ + if (fid_seq_is_mdt0(ostid_seq(oi))) { + return name->name[LUSTRE_RES_ID_SEQ_OFF] == ostid_id(oi) && + name->name[LUSTRE_RES_ID_VER_OID_OFF] == ostid_seq(oi); + } else { + return name->name[LUSTRE_RES_ID_SEQ_OFF] == ostid_seq(oi) && + name->name[LUSTRE_RES_ID_VER_OID_OFF] == ostid_id(oi); + } +} + +/* The same as osc_build_res_name() */ +static inline void ost_fid_build_resid(const struct lu_fid *fid, + struct ldlm_res_id *resname) +{ + if (fid_is_mdt0(fid) || fid_is_idif(fid)) { + struct ost_id oi; + oi.oi.oi_id = 0; /* gcc 4.7.2 complains otherwise */ + if (fid_to_ostid(fid, &oi) != 0) + return; + ostid_build_res_name(&oi, resname); + } else { + fid_build_reg_res_name(fid, resname); + } +} + +static inline void ost_fid_from_resid(struct lu_fid *fid, + const struct ldlm_res_id *name) +{ + if (fid_seq_is_mdt0(name->name[LUSTRE_RES_ID_VER_OID_OFF])) { + /* old resid */ + struct ost_id oi; + ostid_set_seq(&oi, name->name[LUSTRE_RES_ID_VER_OID_OFF]); + ostid_set_id(&oi, name->name[LUSTRE_RES_ID_SEQ_OFF]); + ostid_to_fid(fid, &oi, 0); + } else { + /* new resid */ + fid_extract_from_res_name(fid, name); + } +} + +/** + * Flatten 128-bit FID values into a 64-bit value for use as an inode number. + * For non-IGIF FIDs this starts just over 2^32, and continues without + * conflict until 2^64, at which point we wrap the high 24 bits of the SEQ + * into the range where there may not be many OID values in use, to minimize + * the risk of conflict. + * + * Assuming LUSTRE_SEQ_MAX_WIDTH is less than (1 << 24), which is currently true, + * the time between re-used inode numbers is very long - 2^40 SEQ numbers, + * or about 2^40 client mounts, if clients create less than 2^24 files/mount. + */ +static inline __u64 fid_flatten(const struct lu_fid *fid) +{ + __u64 ino; + __u64 seq; + + if (fid_is_igif(fid)) { + ino = lu_igif_ino(fid); + return ino; + } + + seq = fid_seq(fid); + + ino = (seq << 24) + ((seq >> 24) & 0xffffff0000ULL) + fid_oid(fid); + + return ino ? ino : fid_oid(fid); +} + +static inline __u32 fid_hash(const struct lu_fid *f, int bits) +{ + /* all objects with the same id and different versions will belong to the + * same collision list. */ + return hash_long(fid_flatten(f), bits); +} + +/** + * Map the fid to a 32-bit value for ino on 32-bit systems. */ +static inline __u32 fid_flatten32(const struct lu_fid *fid) +{ + __u32 ino; + __u64 seq; + + if (fid_is_igif(fid)) { + ino = lu_igif_ino(fid); + return ino; + } + + seq = fid_seq(fid) - FID_SEQ_START; + + /* Map the high bits of the OID into higher bits of the inode number so + * that inodes generated at about the same time have a reduced chance + * of collisions. This will give a period of 2^12 = 1024 unique clients + * (from SEQ) and up to min(LUSTRE_SEQ_MAX_WIDTH, 2^20) = 128k objects + * (from OID), or up to 128M inodes without collisions for new files. */ + ino = ((seq & 0x000fffffULL) << 12) + ((seq >> 8) & 0xfffff000) + + (seq >> (64 - (40-8)) & 0xffffff00) + + (fid_oid(fid) & 0xff000fff) + ((fid_oid(fid) & 0x00fff000) << 8); + + return ino ?
ino : fid_oid(fid); +} + +static inline int lu_fid_diff(struct lu_fid *fid1, struct lu_fid *fid2) +{ + LASSERTF(fid_seq(fid1) == fid_seq(fid2), "fid1:"DFID", fid2:"DFID"\n", + PFID(fid1), PFID(fid2)); + + if (fid_is_idif(fid1) && fid_is_idif(fid2)) + return fid_idif_id(fid1->f_seq, fid1->f_oid, fid1->f_ver) - + fid_idif_id(fid2->f_seq, fid2->f_oid, fid2->f_ver); + + return fid_oid(fid1) - fid_oid(fid2); +} + +#define LUSTRE_SEQ_SRV_NAME "seq_srv" +#define LUSTRE_SEQ_CTL_NAME "seq_ctl" + +/* Range common stuff */ +static inline void range_cpu_to_le(struct lu_seq_range *dst, const struct lu_seq_range *src) +{ + dst->lsr_start = cpu_to_le64(src->lsr_start); + dst->lsr_end = cpu_to_le64(src->lsr_end); + dst->lsr_index = cpu_to_le32(src->lsr_index); + dst->lsr_flags = cpu_to_le32(src->lsr_flags); +} + +static inline void range_le_to_cpu(struct lu_seq_range *dst, const struct lu_seq_range *src) +{ + dst->lsr_start = le64_to_cpu(src->lsr_start); + dst->lsr_end = le64_to_cpu(src->lsr_end); + dst->lsr_index = le32_to_cpu(src->lsr_index); + dst->lsr_flags = le32_to_cpu(src->lsr_flags); +} + +static inline void range_cpu_to_be(struct lu_seq_range *dst, const struct lu_seq_range *src) +{ + dst->lsr_start = cpu_to_be64(src->lsr_start); + dst->lsr_end = cpu_to_be64(src->lsr_end); + dst->lsr_index = cpu_to_be32(src->lsr_index); + dst->lsr_flags = cpu_to_be32(src->lsr_flags); +} + +static inline void range_be_to_cpu(struct lu_seq_range *dst, const struct lu_seq_range *src) +{ + dst->lsr_start = be64_to_cpu(src->lsr_start); + dst->lsr_end = be64_to_cpu(src->lsr_end); + dst->lsr_index = be32_to_cpu(src->lsr_index); + dst->lsr_flags = be32_to_cpu(src->lsr_flags); +} + +/** @} fid */ + +#endif /* __LUSTRE_FID_H */ diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre_fld.h b/kernel/drivers/staging/lustre/lustre/include/lustre_fld.h new file mode 100644 index 000000000..5ee4b1ed0 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre_fld.h @@ -0,0 +1,160 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#ifndef __LINUX_FLD_H +#define __LINUX_FLD_H + +/** \defgroup fld fld + * + * @{ + */ + +#include "lustre/lustre_idl.h" +#include "../../include/linux/libcfs/libcfs.h" + +struct lu_client_fld; +struct lu_server_fld; +struct lu_fld_hash; +struct fld_cache; + +extern const struct dt_index_features fld_index_features; +extern const char fld_index_name[]; + +/* + * FLD (Fid Location Database) interface. + */ +enum { + LUSTRE_CLI_FLD_HASH_DHT = 0, + LUSTRE_CLI_FLD_HASH_RRB +}; + + +struct lu_fld_target { + struct list_head ft_chain; + struct obd_export *ft_exp; + struct lu_server_fld *ft_srv; + __u64 ft_idx; +}; + +struct lu_server_fld { + /** + * Fld dir proc entry. */ + struct proc_dir_entry *lsf_proc_dir; + + /** + * /fld file object device */ + struct dt_object *lsf_obj; + + /** + * super sequence controller export, needed to forward fld + * lookup request. */ + struct obd_export *lsf_control_exp; + + /** + * Client FLD cache. */ + struct fld_cache *lsf_cache; + + /** + * Protect index modifications */ + struct mutex lsf_lock; + + /** + * Fld service name in form "fld-srv-lustre-MDTXXX" */ + char lsf_name[LUSTRE_MDT_MAXNAMELEN]; + +}; + +struct lu_client_fld { + /** + * Client side proc entry. */ + struct proc_dir_entry *lcf_proc_dir; + + /** + * List of exports client FLD knows about. */ + struct list_head lcf_targets; + + /** + * Current hash to be used to chose an export. */ + struct lu_fld_hash *lcf_hash; + + /** + * Exports count. */ + int lcf_count; + + /** + * Lock protecting exports list and fld_hash. */ + spinlock_t lcf_lock; + + /** + * Client FLD cache. */ + struct fld_cache *lcf_cache; + + /** + * Client fld proc entry name. */ + char lcf_name[LUSTRE_MDT_MAXNAMELEN]; + + int lcf_flags; +}; + +/* Client methods */ +int fld_client_init(struct lu_client_fld *fld, + const char *prefix, int hash); + +void fld_client_fini(struct lu_client_fld *fld); + +void fld_client_flush(struct lu_client_fld *fld); + +int fld_client_lookup(struct lu_client_fld *fld, u64 seq, u32 *mds, + __u32 flags, const struct lu_env *env); + +int fld_client_create(struct lu_client_fld *fld, + struct lu_seq_range *range, + const struct lu_env *env); + +int fld_client_delete(struct lu_client_fld *fld, u64 seq, + const struct lu_env *env); + +int fld_client_add_target(struct lu_client_fld *fld, + struct lu_fld_target *tar); + +int fld_client_del_target(struct lu_client_fld *fld, + __u64 idx); + +void fld_client_proc_fini(struct lu_client_fld *fld); + +/** @} fld */ + +#endif diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre_ha.h b/kernel/drivers/staging/lustre/lustre/include/lustre_ha.h new file mode 100644 index 000000000..f3ae02b3e --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre_ha.h @@ -0,0 +1,64 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _LUSTRE_HA_H +#define _LUSTRE_HA_H + +/** \defgroup ha ha + * + * @{ + */ + +struct obd_import; +struct obd_export; +struct obd_device; +struct ptlrpc_request; + + +int ptlrpc_replay(struct obd_import *imp); +int ptlrpc_resend(struct obd_import *imp); +void ptlrpc_free_committed(struct obd_import *imp); +void ptlrpc_wake_delayed(struct obd_import *imp); +int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid, int async); +int ptlrpc_set_import_active(struct obd_import *imp, int active); +void ptlrpc_activate_import(struct obd_import *imp); +void ptlrpc_deactivate_import(struct obd_import *imp); +void ptlrpc_invalidate_import(struct obd_import *imp); +void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt); + +/** @} ha */ + +#endif diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre_handles.h b/kernel/drivers/staging/lustre/lustre/include/lustre_handles.h new file mode 100644 index 000000000..726bbd3ea --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre_handles.h @@ -0,0 +1,97 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LUSTRE_HANDLES_H_ +#define __LUSTRE_HANDLES_H_ + +/** \defgroup handles handles + * + * @{ + */ + +#include +#include +#include +#include +#include + +#include "../../include/linux/libcfs/libcfs.h" + + +struct portals_handle_ops { + void (*hop_addref)(void *object); + void (*hop_free)(void *object, int size); +}; + +/* These handles are most easily used by having them appear at the very top of + * whatever object that you want to make handles for. 
ie: + * + * struct ldlm_lock { + * struct portals_handle handle; + * ... + * }; + * + * Now you're able to assign the results of cookie2handle directly to an + * ldlm_lock. If it's not at the top, you'll want to use container_of() + * to compute the start of the structure based on the handle field. */ +struct portals_handle { + struct list_head h_link; + __u64 h_cookie; + struct portals_handle_ops *h_ops; + + /* newly added fields to handle the RCU issue. -jxiong */ + struct rcu_head h_rcu; + spinlock_t h_lock; + unsigned int h_size:31; + unsigned int h_in:1; +}; +#define RCU2HANDLE(rcu) container_of(rcu, struct portals_handle, h_rcu) + +/* handles.c */ + +/* Add a handle to the hash table */ +void class_handle_hash(struct portals_handle *, + struct portals_handle_ops *ops); +void class_handle_unhash(struct portals_handle *); +void class_handle_hash_back(struct portals_handle *); +void *class_handle2object(__u64 cookie); +void class_handle_free_cb(struct rcu_head *rcu); +int class_handle_init(void); +void class_handle_cleanup(void); + +/** @} handles */ + +#endif diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre_import.h b/kernel/drivers/staging/lustre/lustre/include/lustre_import.h new file mode 100644 index 000000000..dcc807676 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre_import.h @@ -0,0 +1,385 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +/** \defgroup obd_import PtlRPC import definitions + * Imports are client-side representation of remote obd target. 
+ * + * @{ + */ + +#ifndef __IMPORT_H +#define __IMPORT_H + +/** \defgroup export export + * + * @{ + */ + +#include "lustre_handles.h" +#include "lustre/lustre_idl.h" + + +/** + * Adaptive Timeout stuff + * + * @{ + */ +#define D_ADAPTTO D_OTHER +#define AT_BINS 4 /* "bin" means "N seconds of history" */ +#define AT_FLG_NOHIST 0x1 /* use last reported value only */ + +struct adaptive_timeout { + time_t at_binstart; /* bin start time */ + unsigned int at_hist[AT_BINS]; /* timeout history bins */ + unsigned int at_flags; + unsigned int at_current; /* current timeout value */ + unsigned int at_worst_ever; /* worst-ever timeout value */ + time_t at_worst_time; /* worst-ever timeout timestamp */ + spinlock_t at_lock; +}; + +struct ptlrpc_at_array { + struct list_head *paa_reqs_array; /** array to hold requests */ + __u32 paa_size; /** the size of array */ + __u32 paa_count; /** the total count of reqs */ + time_t paa_deadline; /** the earliest deadline of reqs */ + __u32 *paa_reqs_count; /** the count of reqs in each entry */ +}; + +#define IMP_AT_MAX_PORTALS 8 +struct imp_at { + int iat_portal[IMP_AT_MAX_PORTALS]; + struct adaptive_timeout iat_net_latency; + struct adaptive_timeout iat_service_estimate[IMP_AT_MAX_PORTALS]; +}; + + +/** @} */ + +/** Possible import states */ +enum lustre_imp_state { + LUSTRE_IMP_CLOSED = 1, + LUSTRE_IMP_NEW = 2, + LUSTRE_IMP_DISCON = 3, + LUSTRE_IMP_CONNECTING = 4, + LUSTRE_IMP_REPLAY = 5, + LUSTRE_IMP_REPLAY_LOCKS = 6, + LUSTRE_IMP_REPLAY_WAIT = 7, + LUSTRE_IMP_RECOVER = 8, + LUSTRE_IMP_FULL = 9, + LUSTRE_IMP_EVICTED = 10, +}; + +/** Returns test string representation of numeric import state \a state */ +static inline char *ptlrpc_import_state_name(enum lustre_imp_state state) +{ + static char *import_state_names[] = { + "", "CLOSED", "NEW", "DISCONN", + "CONNECTING", "REPLAY", "REPLAY_LOCKS", "REPLAY_WAIT", + "RECOVER", "FULL", "EVICTED", + }; + + LASSERT (state <= LUSTRE_IMP_EVICTED); + return import_state_names[state]; +} + +/** + * List of import event types + */ +enum obd_import_event { + IMP_EVENT_DISCON = 0x808001, + IMP_EVENT_INACTIVE = 0x808002, + IMP_EVENT_INVALIDATE = 0x808003, + IMP_EVENT_ACTIVE = 0x808004, + IMP_EVENT_OCD = 0x808005, + IMP_EVENT_DEACTIVATE = 0x808006, + IMP_EVENT_ACTIVATE = 0x808007, +}; + +/** + * Definition of import connection structure + */ +struct obd_import_conn { + /** Item for linking connections together */ + struct list_head oic_item; + /** Pointer to actual PortalRPC connection */ + struct ptlrpc_connection *oic_conn; + /** uuid of remote side */ + struct obd_uuid oic_uuid; + /** + * Time (64 bit jiffies) of last connection attempt on this connection + */ + __u64 oic_last_attempt; +}; + +/* state history */ +#define IMP_STATE_HIST_LEN 16 +struct import_state_hist { + enum lustre_imp_state ish_state; + time_t ish_time; +}; + +/** + * Definition of PortalRPC import structure. + * Imports are representing client-side view to remote target. + */ +struct obd_import { + /** Local handle (== id) for this import. 
*/ + struct portals_handle imp_handle; + /** Reference counter */ + atomic_t imp_refcount; + struct lustre_handle imp_dlm_handle; /* client's ldlm export */ + /** Currently active connection */ + struct ptlrpc_connection *imp_connection; + /** PortalRPC client structure for this import */ + struct ptlrpc_client *imp_client; + /** List element for linking into pinger chain */ + struct list_head imp_pinger_chain; + /** List element for linking into chain for destruction */ + struct list_head imp_zombie_chain; + + /** + * Lists of requests that are retained for replay, waiting for a reply, + * or waiting for recovery to complete, respectively. + * @{ + */ + struct list_head imp_replay_list; + struct list_head imp_sending_list; + struct list_head imp_delayed_list; + /** @} */ + + /** + * List of requests that are retained for committed open replay. Once + * open is committed, open replay request will be moved from the + * imp_replay_list into the imp_committed_list. + * The imp_replay_cursor is for accelerating searching during replay. + * @{ + */ + struct list_head imp_committed_list; + struct list_head *imp_replay_cursor; + /** @} */ + + /** obd device for this import */ + struct obd_device *imp_obd; + + /** + * some seciruty-related fields + * @{ + */ + struct ptlrpc_sec *imp_sec; + struct mutex imp_sec_mutex; + unsigned long imp_sec_expire; + /** @} */ + + /** Wait queue for those who need to wait for recovery completion */ + wait_queue_head_t imp_recovery_waitq; + + /** Number of requests currently in-flight */ + atomic_t imp_inflight; + /** Number of requests currently unregistering */ + atomic_t imp_unregistering; + /** Number of replay requests inflight */ + atomic_t imp_replay_inflight; + /** Number of currently happening import invalidations */ + atomic_t imp_inval_count; + /** Numbner of request timeouts */ + atomic_t imp_timeouts; + /** Current import state */ + enum lustre_imp_state imp_state; + /** Last replay state */ + enum lustre_imp_state imp_replay_state; + /** History of import states */ + struct import_state_hist imp_state_hist[IMP_STATE_HIST_LEN]; + int imp_state_hist_idx; + /** Current import generation. Incremented on every reconnect */ + int imp_generation; + /** Incremented every time we send reconnection request */ + __u32 imp_conn_cnt; + /** + * \see ptlrpc_free_committed remembers imp_generation value here + * after a check to save on unnecessary replay list iterations + */ + int imp_last_generation_checked; + /** Last transno we replayed */ + __u64 imp_last_replay_transno; + /** Last transno committed on remote side */ + __u64 imp_peer_committed_transno; + /** + * \see ptlrpc_free_committed remembers last_transno since its last + * check here and if last_transno did not change since last run of + * ptlrpc_free_committed and import generation is the same, we can + * skip looking for requests to remove from replay list as optimisation + */ + __u64 imp_last_transno_checked; + /** + * Remote export handle. This is how remote side knows what export + * we are talking to. Filled from response to connect request + */ + struct lustre_handle imp_remote_handle; + /** When to perform next ping. time in jiffies. */ + unsigned long imp_next_ping; + /** When we last successfully connected. time in 64bit jiffies */ + __u64 imp_last_success_conn; + + /** List of all possible connection for import. */ + struct list_head imp_conn_list; + /** + * Current connection. 
\a imp_connection is imp_conn_current->oic_conn + */ + struct obd_import_conn *imp_conn_current; + + /** Protects flags, level, generation, conn_cnt, *_list */ + spinlock_t imp_lock; + + /* flags */ + unsigned long imp_no_timeout:1, /* timeouts are disabled */ + imp_invalid:1, /* evicted */ + /* administratively disabled */ + imp_deactive:1, + /* try to recover the import */ + imp_replayable:1, + /* don't run recovery (timeout instead) */ + imp_dlm_fake:1, + /* use 1/2 timeout on MDS' OSCs */ + imp_server_timeout:1, + /* VBR: imp in delayed recovery */ + imp_delayed_recovery:1, + /* VBR: if gap was found then no lock replays + */ + imp_no_lock_replay:1, + /* recovery by versions was failed */ + imp_vbr_failed:1, + /* force an immediate ping */ + imp_force_verify:1, + /* force a scheduled ping */ + imp_force_next_verify:1, + /* pingable */ + imp_pingable:1, + /* resend for replay */ + imp_resend_replay:1, + /* disable normal recovery, for test only. */ + imp_no_pinger_recover:1, + /* need IR MNE swab */ + imp_need_mne_swab:1, + /* import must be reconnected instead of + * chose new connection */ + imp_force_reconnect:1, + /* import has tried to connect with server */ + imp_connect_tried:1; + __u32 imp_connect_op; + struct obd_connect_data imp_connect_data; + __u64 imp_connect_flags_orig; + int imp_connect_error; + + __u32 imp_msg_magic; + __u32 imp_msghdr_flags; /* adjusted based on server capability */ + + struct ptlrpc_request_pool *imp_rq_pool; /* emergency request pool */ + + struct imp_at imp_at; /* adaptive timeout data */ + time_t imp_last_reply_time; /* for health check */ +}; + +typedef void (*obd_import_callback)(struct obd_import *imp, void *closure, + int event, void *event_arg, void *cb_data); + +/** + * Structure for import observer. + * It is possible to register "observer" on an import and every time + * something happens to an import (like connect/evict/disconnect) + * obderver will get its callback called with event type + */ +struct obd_import_observer { + struct list_head oio_chain; + obd_import_callback oio_cb; + void *oio_cb_data; +}; + +void class_observe_import(struct obd_import *imp, obd_import_callback cb, + void *cb_data); +void class_unobserve_import(struct obd_import *imp, obd_import_callback cb, + void *cb_data); +void class_notify_import_observers(struct obd_import *imp, int event, + void *event_arg); + +/* import.c */ +static inline unsigned int at_est2timeout(unsigned int val) +{ + /* add an arbitrary minimum: 125% +5 sec */ + return (val + (val >> 2) + 5); +} + +static inline unsigned int at_timeout2est(unsigned int val) +{ + /* restore estimate value from timeout: e=4/5(t-5) */ + LASSERT(val); + return (max((val << 2) / 5, 5U) - 4); +} + +static inline void at_reset(struct adaptive_timeout *at, int val) +{ + spin_lock(&at->at_lock); + at->at_current = val; + at->at_worst_ever = val; + at->at_worst_time = get_seconds(); + spin_unlock(&at->at_lock); +} +static inline void at_init(struct adaptive_timeout *at, int val, int flags) +{ + memset(at, 0, sizeof(*at)); + spin_lock_init(&at->at_lock); + at->at_flags = flags; + at_reset(at, val); +} +extern unsigned int at_min; +static inline int at_get(struct adaptive_timeout *at) +{ + return (at->at_current > at_min) ? 
at->at_current : at_min; +} +int at_measured(struct adaptive_timeout *at, unsigned int val); +int import_at_get_index(struct obd_import *imp, int portal); +extern unsigned int at_max; +#define AT_OFF (at_max == 0) + +/* genops.c */ +struct obd_export; +extern struct obd_import *class_exp2cliimp(struct obd_export *); +extern struct obd_import *class_conn2cliimp(struct lustre_handle *); + +/** @} import */ + +#endif /* __IMPORT_H */ + +/** @} obd_import */ diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre_intent.h b/kernel/drivers/staging/lustre/lustre/include/lustre_intent.h new file mode 100644 index 000000000..c491d52d8 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre_intent.h @@ -0,0 +1,62 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef LUSTRE_INTENT_H +#define LUSTRE_INTENT_H + +/* intent IT_XXX are defined in lustre/include/obd.h */ +struct lustre_intent_data { + int it_disposition; + int it_status; + __u64 it_lock_handle; + __u64 it_lock_bits; + int it_lock_mode; + int it_remote_lock_mode; + __u64 it_remote_lock_handle; + void *it_data; + unsigned int it_lock_set:1; +}; + +struct lookup_intent { + int it_op; + int it_create_mode; + __u64 it_flags; + union { + struct lustre_intent_data lustre; + } d; +}; + +#endif diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre_lib.h b/kernel/drivers/staging/lustre/lustre/include/lustre_lib.h new file mode 100644 index 000000000..bf135630c --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre_lib.h @@ -0,0 +1,666 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre_lib.h + * + * Basic Lustre library routines. + */ + +#ifndef _LUSTRE_LIB_H +#define _LUSTRE_LIB_H + +/** \defgroup lib lib + * + * @{ + */ + +#include +#include +#include +#include "../../include/linux/libcfs/libcfs.h" +#include "lustre/lustre_idl.h" +#include "lustre_ver.h" +#include "lustre_cfg.h" + +/* target.c */ +struct kstatfs; +struct ptlrpc_request; +struct obd_export; +struct lu_target; +struct l_wait_info; +#include "lustre_ha.h" +#include "lustre_net.h" + +#define LI_POISON 0x5a5a5a5a +#if BITS_PER_LONG > 32 +# define LL_POISON 0x5a5a5a5a5a5a5a5aL +#else +# define LL_POISON 0x5a5a5a5aL +#endif +#define LP_POISON ((void *)LL_POISON) + +int target_pack_pool_reply(struct ptlrpc_request *req); +int do_set_info_async(struct obd_import *imp, + int opcode, int version, + u32 keylen, void *key, + u32 vallen, void *val, + struct ptlrpc_request_set *set); + +#define OBD_RECOVERY_MAX_TIME (obd_timeout * 18) /* b13079 */ +#define OBD_MAX_IOCTL_BUFFER CONFIG_LUSTRE_OBD_MAX_IOCTL_BUFFER + +void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id); + +/* client.c */ + +int client_sanobd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg); +struct client_obd *client_conn2cli(struct lustre_handle *conn); + +struct md_open_data; +struct obd_client_handle { + struct lustre_handle och_fh; + struct lu_fid och_fid; + struct md_open_data *och_mod; + struct lustre_handle och_lease_handle; /* open lock for lease */ + __u32 och_magic; + fmode_t och_flags; +}; +#define OBD_CLIENT_HANDLE_MAGIC 0xd15ea5ed + +/* statfs_pack.c */ +void statfs_pack(struct obd_statfs *osfs, struct kstatfs *sfs); +void statfs_unpack(struct kstatfs *sfs, struct obd_statfs *osfs); + +/* + * For md echo client + */ +enum md_echo_cmd { + ECHO_MD_CREATE = 1, /* Open/Create file on MDT */ + ECHO_MD_MKDIR = 2, /* Mkdir on MDT */ + ECHO_MD_DESTROY = 3, /* Unlink file on MDT */ + ECHO_MD_RMDIR = 4, /* Rmdir on MDT */ + ECHO_MD_LOOKUP = 5, /* Lookup on MDT */ + ECHO_MD_GETATTR = 6, /* Getattr on MDT */ + ECHO_MD_SETATTR = 7, /* Setattr on MDT */ + ECHO_MD_ALLOC_FID = 8, /* Get FIDs from MDT */ +}; + +/* + * OBD IOCTLS + */ +#define OBD_IOCTL_VERSION 0x00010004 + +struct obd_ioctl_data { + __u32 ioc_len; + __u32 ioc_version; + + union { + __u64 ioc_cookie; + __u64 ioc_u64_1; + }; + union { + __u32 ioc_conn1; + __u32 ioc_u32_1; + }; + union { + __u32 ioc_conn2; + __u32 ioc_u32_2; + }; + + struct obdo ioc_obdo1; + struct obdo ioc_obdo2; + + u64 ioc_count; + u64 ioc_offset; + __u32 ioc_dev; + __u32 ioc_command; + + __u64 ioc_nid; + __u32 ioc_nal; + __u32 ioc_type; + + /* buffers the kernel will treat as user pointers */ + __u32 ioc_plen1; + char *ioc_pbuf1; + __u32 ioc_plen2; + char *ioc_pbuf2; + + /* inline buffers for various arguments */ + __u32 ioc_inllen1; + char *ioc_inlbuf1; + __u32 ioc_inllen2; + char 
*ioc_inlbuf2; + __u32 ioc_inllen3; + char *ioc_inlbuf3; + __u32 ioc_inllen4; + char *ioc_inlbuf4; + + char ioc_bulk[0]; +}; + +struct obd_ioctl_hdr { + __u32 ioc_len; + __u32 ioc_version; +}; + +static inline int obd_ioctl_packlen(struct obd_ioctl_data *data) +{ + int len = cfs_size_round(sizeof(struct obd_ioctl_data)); + len += cfs_size_round(data->ioc_inllen1); + len += cfs_size_round(data->ioc_inllen2); + len += cfs_size_round(data->ioc_inllen3); + len += cfs_size_round(data->ioc_inllen4); + return len; +} + + +static inline int obd_ioctl_is_invalid(struct obd_ioctl_data *data) +{ + if (data->ioc_len > OBD_MAX_IOCTL_BUFFER) { + CERROR("OBD ioctl: ioc_len larger than %d\n", + OBD_MAX_IOCTL_BUFFER); + return 1; + } + if (data->ioc_inllen1 > OBD_MAX_IOCTL_BUFFER) { + CERROR("OBD ioctl: ioc_inllen1 larger than ioc_len\n"); + return 1; + } + if (data->ioc_inllen2 > OBD_MAX_IOCTL_BUFFER) { + CERROR("OBD ioctl: ioc_inllen2 larger than ioc_len\n"); + return 1; + } + if (data->ioc_inllen3 > OBD_MAX_IOCTL_BUFFER) { + CERROR("OBD ioctl: ioc_inllen3 larger than ioc_len\n"); + return 1; + } + if (data->ioc_inllen4 > OBD_MAX_IOCTL_BUFFER) { + CERROR("OBD ioctl: ioc_inllen4 larger than ioc_len\n"); + return 1; + } + if (data->ioc_inlbuf1 && !data->ioc_inllen1) { + CERROR("OBD ioctl: inlbuf1 pointer but 0 length\n"); + return 1; + } + if (data->ioc_inlbuf2 && !data->ioc_inllen2) { + CERROR("OBD ioctl: inlbuf2 pointer but 0 length\n"); + return 1; + } + if (data->ioc_inlbuf3 && !data->ioc_inllen3) { + CERROR("OBD ioctl: inlbuf3 pointer but 0 length\n"); + return 1; + } + if (data->ioc_inlbuf4 && !data->ioc_inllen4) { + CERROR("OBD ioctl: inlbuf4 pointer but 0 length\n"); + return 1; + } + if (data->ioc_pbuf1 && !data->ioc_plen1) { + CERROR("OBD ioctl: pbuf1 pointer but 0 length\n"); + return 1; + } + if (data->ioc_pbuf2 && !data->ioc_plen2) { + CERROR("OBD ioctl: pbuf2 pointer but 0 length\n"); + return 1; + } + if (data->ioc_plen1 && !data->ioc_pbuf1) { + CERROR("OBD ioctl: plen1 set but NULL pointer\n"); + return 1; + } + if (data->ioc_plen2 && !data->ioc_pbuf2) { + CERROR("OBD ioctl: plen2 set but NULL pointer\n"); + return 1; + } + if (obd_ioctl_packlen(data) > data->ioc_len) { + CERROR("OBD ioctl: packlen exceeds ioc_len (%d > %d)\n", + obd_ioctl_packlen(data), data->ioc_len); + return 1; + } + return 0; +} + + +#include "obd_support.h" + +/* function defined in lustre/obdclass//-module.c */ +int obd_ioctl_getdata(char **buf, int *len, void *arg); +int obd_ioctl_popdata(void *arg, void *data, int len); + +static inline void obd_ioctl_freedata(char *buf, int len) +{ + OBD_FREE_LARGE(buf, len); + return; +} + +/* + * BSD ioctl description: + * #define IOC_V1 _IOR(g, n1, long) + * #define IOC_V2 _IOW(g, n2, long) + * + * ioctl(f, IOC_V1, arg); + * arg will be treated as a long value, + * + * ioctl(f, IOC_V2, arg) + * arg will be treated as a pointer, bsd will call + * copyin(buf, arg, sizeof(long)) + * + * To make BSD ioctl handles argument correctly and simplely, + * we change _IOR to _IOWR so BSD will copyin obd_ioctl_data + * for us. Does this change affect Linux? 
(XXX Liang) + */ +#define OBD_IOC_DATA_TYPE long + +#define OBD_IOC_CREATE _IOWR('f', 101, OBD_IOC_DATA_TYPE) +#define OBD_IOC_DESTROY _IOW ('f', 104, OBD_IOC_DATA_TYPE) +#define OBD_IOC_PREALLOCATE _IOWR('f', 105, OBD_IOC_DATA_TYPE) + +#define OBD_IOC_SETATTR _IOW ('f', 107, OBD_IOC_DATA_TYPE) +#define OBD_IOC_GETATTR _IOWR ('f', 108, OBD_IOC_DATA_TYPE) +#define OBD_IOC_READ _IOWR('f', 109, OBD_IOC_DATA_TYPE) +#define OBD_IOC_WRITE _IOWR('f', 110, OBD_IOC_DATA_TYPE) + + +#define OBD_IOC_STATFS _IOWR('f', 113, OBD_IOC_DATA_TYPE) +#define OBD_IOC_SYNC _IOW ('f', 114, OBD_IOC_DATA_TYPE) +#define OBD_IOC_READ2 _IOWR('f', 115, OBD_IOC_DATA_TYPE) +#define OBD_IOC_FORMAT _IOWR('f', 116, OBD_IOC_DATA_TYPE) +#define OBD_IOC_PARTITION _IOWR('f', 117, OBD_IOC_DATA_TYPE) +#define OBD_IOC_COPY _IOWR('f', 120, OBD_IOC_DATA_TYPE) +#define OBD_IOC_MIGR _IOWR('f', 121, OBD_IOC_DATA_TYPE) +#define OBD_IOC_PUNCH _IOWR('f', 122, OBD_IOC_DATA_TYPE) + +#define OBD_IOC_MODULE_DEBUG _IOWR('f', 124, OBD_IOC_DATA_TYPE) +#define OBD_IOC_BRW_READ _IOWR('f', 125, OBD_IOC_DATA_TYPE) +#define OBD_IOC_BRW_WRITE _IOWR('f', 126, OBD_IOC_DATA_TYPE) +#define OBD_IOC_NAME2DEV _IOWR('f', 127, OBD_IOC_DATA_TYPE) +#define OBD_IOC_UUID2DEV _IOWR('f', 130, OBD_IOC_DATA_TYPE) + +#define OBD_IOC_GETNAME _IOWR('f', 131, OBD_IOC_DATA_TYPE) +#define OBD_IOC_GETMDNAME _IOR('f', 131, char[MAX_OBD_NAME]) +#define OBD_IOC_GETDTNAME OBD_IOC_GETNAME + +#define OBD_IOC_LOV_GET_CONFIG _IOWR('f', 132, OBD_IOC_DATA_TYPE) +#define OBD_IOC_CLIENT_RECOVER _IOW ('f', 133, OBD_IOC_DATA_TYPE) +#define OBD_IOC_PING_TARGET _IOW ('f', 136, OBD_IOC_DATA_TYPE) + +#define OBD_IOC_DEC_FS_USE_COUNT _IO ('f', 139 ) +#define OBD_IOC_NO_TRANSNO _IOW ('f', 140, OBD_IOC_DATA_TYPE) +#define OBD_IOC_SET_READONLY _IOW ('f', 141, OBD_IOC_DATA_TYPE) +#define OBD_IOC_ABORT_RECOVERY _IOR ('f', 142, OBD_IOC_DATA_TYPE) + +#define OBD_IOC_ROOT_SQUASH _IOWR('f', 143, OBD_IOC_DATA_TYPE) + +#define OBD_GET_VERSION _IOWR ('f', 144, OBD_IOC_DATA_TYPE) + +#define OBD_IOC_GSS_SUPPORT _IOWR('f', 145, OBD_IOC_DATA_TYPE) + +#define OBD_IOC_CLOSE_UUID _IOWR ('f', 147, OBD_IOC_DATA_TYPE) + +#define OBD_IOC_CHANGELOG_SEND _IOW ('f', 148, OBD_IOC_DATA_TYPE) +#define OBD_IOC_GETDEVICE _IOWR ('f', 149, OBD_IOC_DATA_TYPE) +#define OBD_IOC_FID2PATH _IOWR ('f', 150, OBD_IOC_DATA_TYPE) +/* see also for ioctls 151-153 */ +/* OBD_IOC_LOV_SETSTRIPE: See also LL_IOC_LOV_SETSTRIPE */ +#define OBD_IOC_LOV_SETSTRIPE _IOW ('f', 154, OBD_IOC_DATA_TYPE) +/* OBD_IOC_LOV_GETSTRIPE: See also LL_IOC_LOV_GETSTRIPE */ +#define OBD_IOC_LOV_GETSTRIPE _IOW ('f', 155, OBD_IOC_DATA_TYPE) +/* OBD_IOC_LOV_SETEA: See also LL_IOC_LOV_SETEA */ +#define OBD_IOC_LOV_SETEA _IOW ('f', 156, OBD_IOC_DATA_TYPE) +/* see for ioctls 157-159 */ +/* OBD_IOC_QUOTACHECK: See also LL_IOC_QUOTACHECK */ +#define OBD_IOC_QUOTACHECK _IOW ('f', 160, int) +/* OBD_IOC_POLL_QUOTACHECK: See also LL_IOC_POLL_QUOTACHECK */ +#define OBD_IOC_POLL_QUOTACHECK _IOR ('f', 161, struct if_quotacheck *) +/* OBD_IOC_QUOTACTL: See also LL_IOC_QUOTACTL */ +#define OBD_IOC_QUOTACTL _IOWR('f', 162, struct if_quotactl) +/* see also for ioctls 163-176 */ +#define OBD_IOC_CHANGELOG_REG _IOW ('f', 177, struct obd_ioctl_data) +#define OBD_IOC_CHANGELOG_DEREG _IOW ('f', 178, struct obd_ioctl_data) +#define OBD_IOC_CHANGELOG_CLEAR _IOW ('f', 179, struct obd_ioctl_data) +#define OBD_IOC_RECORD _IOWR('f', 180, OBD_IOC_DATA_TYPE) +#define OBD_IOC_ENDRECORD _IOWR('f', 181, OBD_IOC_DATA_TYPE) +#define OBD_IOC_PARSE _IOWR('f', 182, OBD_IOC_DATA_TYPE) +#define 
OBD_IOC_DORECORD _IOWR('f', 183, OBD_IOC_DATA_TYPE) +#define OBD_IOC_PROCESS_CFG _IOWR('f', 184, OBD_IOC_DATA_TYPE) +#define OBD_IOC_DUMP_LOG _IOWR('f', 185, OBD_IOC_DATA_TYPE) +#define OBD_IOC_CLEAR_LOG _IOWR('f', 186, OBD_IOC_DATA_TYPE) +#define OBD_IOC_PARAM _IOW ('f', 187, OBD_IOC_DATA_TYPE) +#define OBD_IOC_POOL _IOWR('f', 188, OBD_IOC_DATA_TYPE) +#define OBD_IOC_REPLACE_NIDS _IOWR('f', 189, OBD_IOC_DATA_TYPE) + +#define OBD_IOC_CATLOGLIST _IOWR('f', 190, OBD_IOC_DATA_TYPE) +#define OBD_IOC_LLOG_INFO _IOWR('f', 191, OBD_IOC_DATA_TYPE) +#define OBD_IOC_LLOG_PRINT _IOWR('f', 192, OBD_IOC_DATA_TYPE) +#define OBD_IOC_LLOG_CANCEL _IOWR('f', 193, OBD_IOC_DATA_TYPE) +#define OBD_IOC_LLOG_REMOVE _IOWR('f', 194, OBD_IOC_DATA_TYPE) +#define OBD_IOC_LLOG_CHECK _IOWR('f', 195, OBD_IOC_DATA_TYPE) +/* OBD_IOC_LLOG_CATINFO is deprecated */ +#define OBD_IOC_LLOG_CATINFO _IOWR('f', 196, OBD_IOC_DATA_TYPE) + +#define ECHO_IOC_GET_STRIPE _IOWR('f', 200, OBD_IOC_DATA_TYPE) +#define ECHO_IOC_SET_STRIPE _IOWR('f', 201, OBD_IOC_DATA_TYPE) +#define ECHO_IOC_ENQUEUE _IOWR('f', 202, OBD_IOC_DATA_TYPE) +#define ECHO_IOC_CANCEL _IOWR('f', 203, OBD_IOC_DATA_TYPE) + +#define OBD_IOC_GET_OBJ_VERSION _IOR('f', 210, OBD_IOC_DATA_TYPE) + +/* defines ioctl number 218-219 */ +#define OBD_IOC_GET_MNTOPT _IOW('f', 220, mntopt_t) + +#define OBD_IOC_ECHO_MD _IOR('f', 221, struct obd_ioctl_data) +#define OBD_IOC_ECHO_ALLOC_SEQ _IOWR('f', 222, struct obd_ioctl_data) + +#define OBD_IOC_START_LFSCK _IOWR('f', 230, OBD_IOC_DATA_TYPE) +#define OBD_IOC_STOP_LFSCK _IOW('f', 231, OBD_IOC_DATA_TYPE) +#define OBD_IOC_PAUSE_LFSCK _IOW('f', 232, OBD_IOC_DATA_TYPE) + +/* XXX _IOWR('f', 250, long) has been defined in + * libcfs/include/libcfs/libcfs_private.h for debug, don't use it + */ + +/* Until such time as we get_info the per-stripe maximum from the OST, + * we define this to be 2T - 4k, which is the ext3 maxbytes. */ +#define LUSTRE_STRIPE_MAXBYTES 0x1fffffff000ULL + +/* Special values for remove LOV EA from disk */ +#define LOVEA_DELETE_VALUES(size, count, offset) (size == 0 && count == 0 && \ + offset == (typeof(offset))(-1)) + +/* #define POISON_BULK 0 */ + +/* + * l_wait_event is a flexible sleeping function, permitting simple caller + * configuration of interrupt and timeout sensitivity along with actions to + * be performed in the event of either exception. + * + * The first form of usage looks like this: + * + * struct l_wait_info lwi = LWI_TIMEOUT_INTR(timeout, timeout_handler, + * intr_handler, callback_data); + * rc = l_wait_event(waitq, condition, &lwi); + * + * l_wait_event() makes the current process wait on 'waitq' until 'condition' + * is TRUE or a "killable" signal (SIGTERM, SIKGILL, SIGINT) is pending. It + * returns 0 to signify 'condition' is TRUE, but if a signal wakes it before + * 'condition' becomes true, it optionally calls the specified 'intr_handler' + * if not NULL, and returns -EINTR. + * + * If a non-zero timeout is specified, signals are ignored until the timeout + * has expired. At this time, if 'timeout_handler' is not NULL it is called. + * If it returns FALSE l_wait_event() continues to wait as described above with + * signals enabled. Otherwise it returns -ETIMEDOUT. 
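+ *
+ * As an illustration only (the waitq and flag names below are hypothetical,
+ * not defined by this header), a caller of the first form could look like:
+ *
+ *   struct l_wait_info lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(30), NULL,
+ *                                             LWI_ON_SIGNAL_NOOP, NULL);
+ *   rc = l_wait_event(obj->obj_waitq, obj->obj_ready != 0, &lwi);
+ *
+ * With a NULL 'timeout_handler' this returns 0 once obj_ready becomes
+ * non-zero, or -ETIMEDOUT after the 30 second timeout expires.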
+ * + * LWI_INTR(intr_handler, callback_data) is shorthand for + * LWI_TIMEOUT_INTR(0, NULL, intr_handler, callback_data) + * + * The second form of usage looks like this: + * + * struct l_wait_info lwi = LWI_TIMEOUT(timeout, timeout_handler); + * rc = l_wait_event(waitq, condition, &lwi); + * + * This form is the same as the first except that it COMPLETELY IGNORES + * SIGNALS. The caller must therefore beware that if 'timeout' is zero, or if + * 'timeout_handler' is not NULL and returns FALSE, then the ONLY thing that + * can unblock the current process is 'condition' becoming TRUE. + * + * Another form of usage is: + * struct l_wait_info lwi = LWI_TIMEOUT_INTERVAL(timeout, interval, + * timeout_handler); + * rc = l_wait_event(waitq, condition, &lwi); + * This is the same as previous case, but condition is checked once every + * 'interval' jiffies (if non-zero). + * + * Subtle synchronization point: this macro does *not* necessary takes + * wait-queue spin-lock before returning, and, hence, following idiom is safe + * ONLY when caller provides some external locking: + * + * Thread1 Thread2 + * + * l_wait_event(&obj->wq, ....); (1) + * + * wake_up(&obj->wq): (2) + * spin_lock(&q->lock); (2.1) + * __wake_up_common(q, ...); (2.2) + * spin_unlock(&q->lock, flags); (2.3) + * + * OBD_FREE_PTR(obj); (3) + * + * As l_wait_event() may "short-cut" execution and return without taking + * wait-queue spin-lock, some additional synchronization is necessary to + * guarantee that step (3) can begin only after (2.3) finishes. + * + * XXX nikita: some ptlrpc daemon threads have races of that sort. + * + */ +static inline int back_to_sleep(void *arg) +{ + return 0; +} + +#define LWI_ON_SIGNAL_NOOP ((void (*)(void *))(-1)) + +struct l_wait_info { + long lwi_timeout; + long lwi_interval; + int lwi_allow_intr; + int (*lwi_on_timeout)(void *); + void (*lwi_on_signal)(void *); + void *lwi_cb_data; +}; + +/* NB: LWI_TIMEOUT ignores signals completely */ +#define LWI_TIMEOUT(time, cb, data) \ +((struct l_wait_info) { \ + .lwi_timeout = time, \ + .lwi_on_timeout = cb, \ + .lwi_cb_data = data, \ + .lwi_interval = 0, \ + .lwi_allow_intr = 0 \ +}) + +#define LWI_TIMEOUT_INTERVAL(time, interval, cb, data) \ +((struct l_wait_info) { \ + .lwi_timeout = time, \ + .lwi_on_timeout = cb, \ + .lwi_cb_data = data, \ + .lwi_interval = interval, \ + .lwi_allow_intr = 0 \ +}) + +#define LWI_TIMEOUT_INTR(time, time_cb, sig_cb, data) \ +((struct l_wait_info) { \ + .lwi_timeout = time, \ + .lwi_on_timeout = time_cb, \ + .lwi_on_signal = sig_cb, \ + .lwi_cb_data = data, \ + .lwi_interval = 0, \ + .lwi_allow_intr = 0 \ +}) + +#define LWI_TIMEOUT_INTR_ALL(time, time_cb, sig_cb, data) \ +((struct l_wait_info) { \ + .lwi_timeout = time, \ + .lwi_on_timeout = time_cb, \ + .lwi_on_signal = sig_cb, \ + .lwi_cb_data = data, \ + .lwi_interval = 0, \ + .lwi_allow_intr = 1 \ +}) + +#define LWI_INTR(cb, data) LWI_TIMEOUT_INTR(0, NULL, cb, data) + +#define LUSTRE_FATAL_SIGS (sigmask(SIGKILL) | sigmask(SIGINT) | \ + sigmask(SIGTERM) | sigmask(SIGQUIT) | \ + sigmask(SIGALRM)) + + +/* + * wait for @condition to become true, but no longer than timeout, specified + * by @info. 
+ */ +#define __l_wait_event(wq, condition, info, ret, l_add_wait) \ +do { \ + wait_queue_t __wait; \ + long __timeout = info->lwi_timeout; \ + sigset_t __blocked; \ + int __allow_intr = info->lwi_allow_intr; \ + \ + ret = 0; \ + if (condition) \ + break; \ + \ + init_waitqueue_entry(&__wait, current); \ + l_add_wait(&wq, &__wait); \ + \ + /* Block all signals (just the non-fatal ones if no timeout). */ \ + if (info->lwi_on_signal != NULL && (__timeout == 0 || __allow_intr)) \ + __blocked = cfs_block_sigsinv(LUSTRE_FATAL_SIGS); \ + else \ + __blocked = cfs_block_sigsinv(0); \ + \ + for (;;) { \ + unsigned __wstate; \ + \ + __wstate = info->lwi_on_signal != NULL && \ + (__timeout == 0 || __allow_intr) ? \ + TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE; \ + \ + set_current_state(TASK_INTERRUPTIBLE); \ + \ + if (condition) \ + break; \ + \ + if (__timeout == 0) { \ + schedule(); \ + } else { \ + long interval = info->lwi_interval? \ + min_t(long, \ + info->lwi_interval,__timeout):\ + __timeout; \ + long remaining = schedule_timeout(interval);\ + __timeout = cfs_time_sub(__timeout, \ + cfs_time_sub(interval, remaining));\ + if (__timeout == 0) { \ + if (info->lwi_on_timeout == NULL || \ + info->lwi_on_timeout(info->lwi_cb_data)) { \ + ret = -ETIMEDOUT; \ + break; \ + } \ + /* Take signals after the timeout expires. */ \ + if (info->lwi_on_signal != NULL) \ + (void)cfs_block_sigsinv(LUSTRE_FATAL_SIGS);\ + } \ + } \ + \ + if (condition) \ + break; \ + if (cfs_signal_pending()) { \ + if (info->lwi_on_signal != NULL && \ + (__timeout == 0 || __allow_intr)) { \ + if (info->lwi_on_signal != LWI_ON_SIGNAL_NOOP) \ + info->lwi_on_signal(info->lwi_cb_data);\ + ret = -EINTR; \ + break; \ + } \ + /* We have to do this here because some signals */ \ + /* are not blockable - ie from strace(1). */ \ + /* In these cases we want to schedule_timeout() */ \ + /* again, because we don't want that to return */ \ + /* -EINTR when the RPC actually succeeded. */ \ + /* the recalc_sigpending() below will deliver the */ \ + /* signal properly. 
*/ \ + cfs_clear_sigpending(); \ + } \ + } \ + \ + cfs_restore_sigs(__blocked); \ + \ + set_current_state(TASK_RUNNING); \ + remove_wait_queue(&wq, &__wait); \ +} while (0) + + + +#define l_wait_event(wq, condition, info) \ +({ \ + int __ret; \ + struct l_wait_info *__info = (info); \ + \ + __l_wait_event(wq, condition, __info, \ + __ret, add_wait_queue); \ + __ret; \ +}) + +#define l_wait_event_exclusive(wq, condition, info) \ +({ \ + int __ret; \ + struct l_wait_info *__info = (info); \ + \ + __l_wait_event(wq, condition, __info, \ + __ret, add_wait_queue_exclusive); \ + __ret; \ +}) + +#define l_wait_event_exclusive_head(wq, condition, info) \ +({ \ + int __ret; \ + struct l_wait_info *__info = (info); \ + \ + __l_wait_event(wq, condition, __info, \ + __ret, add_wait_queue_exclusive_head); \ + __ret; \ +}) + +#define l_wait_condition(wq, condition) \ +({ \ + struct l_wait_info lwi = { 0 }; \ + l_wait_event(wq, condition, &lwi); \ +}) + +#define l_wait_condition_exclusive(wq, condition) \ +({ \ + struct l_wait_info lwi = { 0 }; \ + l_wait_event_exclusive(wq, condition, &lwi); \ +}) + +#define l_wait_condition_exclusive_head(wq, condition) \ +({ \ + struct l_wait_info lwi = { 0 }; \ + l_wait_event_exclusive_head(wq, condition, &lwi); \ +}) + +#define LIBLUSTRE_CLIENT (0) + +/** @} lib */ + +#endif /* _LUSTRE_LIB_H */ diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre_lite.h b/kernel/drivers/staging/lustre/lustre/include/lustre_lite.h new file mode 100644 index 000000000..df557c22a --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre_lite.h @@ -0,0 +1,150 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _LL_H +#define _LL_H + +/** \defgroup lite lite + * + * @{ + */ + +#include "linux/lustre_lite.h" + +#include "obd_class.h" +#include "lustre_net.h" +#include "lustre_mds.h" +#include "lustre_ha.h" + +/* 4UL * 1024 * 1024 */ +#define LL_MAX_BLKSIZE_BITS (22) +#define LL_MAX_BLKSIZE (1UL<lrp_lock_mode = (cmd == OBD_BRW_READ) ? LCK_PR : LCK_PW; + params->lrp_brw_flags = 0; + + params->lrp_policy.l_extent.start = pos; + params->lrp_policy.l_extent.end = pos + len - 1; + /* + * for now O_APPEND always takes local locks. 
+ */ + if (cmd == OBD_BRW_WRITE && (open_flags & O_APPEND)) { + params->lrp_policy.l_extent.start = 0; + params->lrp_policy.l_extent.end = OBD_OBJECT_EOF; + } else if (LIBLUSTRE_CLIENT && (connect_flags & OBD_CONNECT_SRVLOCK)) { + /* + * liblustre: OST-side locking for all non-O_APPEND + * reads/writes. + */ + params->lrp_lock_mode = LCK_NL; + params->lrp_brw_flags = OBD_BRW_SRVLOCK; + } else { + /* + * nothing special for the kernel. In the future llite may use + * OST-side locks for small writes into highly contended + * files. + */ + } + params->lrp_ast_flags = (open_flags & O_NONBLOCK) ? + LDLM_FL_BLOCK_NOWAIT : 0; +} + +/* + * This is embedded into liblustre and llite super-blocks to keep track of + * connect flags (capabilities) supported by all imports given mount is + * connected to. + */ +struct lustre_client_ocd { + /* + * This is conjunction of connect_flags across all imports (LOVs) this + * mount is connected to. This field is updated by cl_ocd_update() + * under ->lco_lock. + */ + __u64 lco_flags; + struct mutex lco_lock; + struct obd_export *lco_md_exp; + struct obd_export *lco_dt_exp; +}; + +/* + * Chain of hash overflow pages. + */ +struct ll_dir_chain { + /* XXX something. Later */ +}; + +static inline void ll_dir_chain_init(struct ll_dir_chain *chain) +{ +} + +static inline void ll_dir_chain_fini(struct ll_dir_chain *chain) +{ +} + +static inline unsigned long hash_x_index(__u64 hash, int hash64) +{ + if (BITS_PER_LONG == 32 && hash64) + hash >>= 32; + /* save hash 0 as index 0 because otherwise we'll save it at + * page index end (~0UL) and it causes truncate_inode_pages_range() + * to loop forever. + */ + return ~0UL - (hash + !hash); +} + +/** @} lite */ + +#endif diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre_log.h b/kernel/drivers/staging/lustre/lustre/include/lustre_log.h new file mode 100644 index 000000000..2187fb615 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre_log.h @@ -0,0 +1,545 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre_log.h + * + * Generic infrastructure for managing a collection of logs. 
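A tiny userspace model of the hash_x_index() mapping in lustre_lite.h above may help; it assumes a 64-bit unsigned long and is purely illustrative.

#include <stdio.h>

/* Userspace model of hash_x_index(): the "+ !hash" term keeps page index
 * ~0UL (page index end) out of the mapping, which is what prevents
 * truncate_inode_pages_range() from looping forever on hash 0. */
static unsigned long demo_hash_x_index(unsigned long long hash)
{
        return ~0UL - (hash + !hash);
}

int main(void)
{
        printf("%lx %lx %lx\n",
               demo_hash_x_index(0),   /* ~0UL - 1, not ~0UL */
               demo_hash_x_index(1),   /* ~0UL - 1 */
               demo_hash_x_index(2));  /* ~0UL - 2 */
        return 0;
}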
+ * These logs are used for: + * + * - orphan recovery: OST adds record on create + * - mtime/size consistency: the OST adds a record on first write + * - open/unlinked objects: OST adds a record on destroy + * + * - mds unlink log: the MDS adds an entry upon delete + * + * - raid1 replication log between OST's + * - MDS replication logs + */ + +#ifndef _LUSTRE_LOG_H +#define _LUSTRE_LOG_H + +/** \defgroup log log + * + * @{ + */ + +#include "obd_class.h" +#include "lustre/lustre_idl.h" +#include "dt_object.h" + +#define LOG_NAME_LIMIT(logname, name) \ + snprintf(logname, sizeof(logname), "LOGS/%s", name) +#define LLOG_EEMPTY 4711 + +enum llog_open_param { + LLOG_OPEN_EXISTS = 0x0000, + LLOG_OPEN_NEW = 0x0001, +}; + +struct plain_handle_data { + struct list_head phd_entry; + struct llog_handle *phd_cat_handle; + struct llog_cookie phd_cookie; /* cookie of this log in its cat */ +}; + +struct cat_handle_data { + struct list_head chd_head; + struct llog_handle *chd_current_log; /* currently open log */ + struct llog_handle *chd_next_log; /* llog to be used next */ +}; + +static inline void logid_to_fid(struct llog_logid *id, struct lu_fid *fid) +{ + /* For compatibility purposes we identify pre-OSD (~< 2.3.51 MDS) + * logid's by non-zero ogen (inode generation) and convert them + * into IGIF */ + if (id->lgl_ogen == 0) { + fid->f_seq = id->lgl_oi.oi.oi_seq; + fid->f_oid = id->lgl_oi.oi.oi_id; + fid->f_ver = 0; + } else { + lu_igif_build(fid, id->lgl_oi.oi.oi_id, id->lgl_ogen); + } +} + +static inline void fid_to_logid(struct lu_fid *fid, struct llog_logid *id) +{ + id->lgl_oi.oi.oi_seq = fid->f_seq; + id->lgl_oi.oi.oi_id = fid->f_oid; + id->lgl_ogen = 0; +} + +static inline void logid_set_id(struct llog_logid *log_id, __u64 id) +{ + log_id->lgl_oi.oi.oi_id = id; +} + +static inline __u64 logid_id(struct llog_logid *log_id) +{ + return log_id->lgl_oi.oi.oi_id; +} + +struct llog_handle; + +/* llog.c - general API */ +int llog_init_handle(const struct lu_env *env, struct llog_handle *handle, + int flags, struct obd_uuid *uuid); +int llog_copy_handler(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data); +int llog_process(const struct lu_env *env, struct llog_handle *loghandle, + llog_cb_t cb, void *data, void *catdata); +int llog_process_or_fork(const struct lu_env *env, + struct llog_handle *loghandle, + llog_cb_t cb, void *data, void *catdata, bool fork); +int llog_reverse_process(const struct lu_env *env, + struct llog_handle *loghandle, llog_cb_t cb, + void *data, void *catdata); +int llog_cancel_rec(const struct lu_env *env, struct llog_handle *loghandle, + int index); +int llog_open(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_handle **lgh, struct llog_logid *logid, + char *name, enum llog_open_param open_param); +int llog_close(const struct lu_env *env, struct llog_handle *cathandle); +int llog_is_empty(const struct lu_env *env, struct llog_ctxt *ctxt, + char *name); +int llog_backup(const struct lu_env *env, struct obd_device *obd, + struct llog_ctxt *ctxt, struct llog_ctxt *bak_ctxt, + char *name, char *backup); + +/* llog_process flags */ +#define LLOG_FLAG_NODEAMON 0x0001 + +/* llog_cat.c - catalog api */ +struct llog_process_data { + /** + * Any useful data needed while processing catalog. This is + * passed later to process callback. + */ + void *lpd_data; + /** + * Catalog process callback function, called for each record + * in catalog. 
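As a sketch of the general llog API declared above, the usual open/init/process/close sequence for reading a plain log by name might look like the following. The callback and the "demo" names are hypothetical; LLOG_F_IS_PLAIN is the plain-log flag from lustre_idl.h, and error handling is abbreviated.

static int demo_print_rec(const struct lu_env *env, struct llog_handle *llh,
                          struct llog_rec_hdr *rec, void *data)
{
        CDEBUG(D_INFO, "record index %u, type %x\n",
               rec->lrh_index, rec->lrh_type);
        return 0;       /* 0 continues with the next record */
}

static int demo_dump_llog(const struct lu_env *env, struct llog_ctxt *ctxt,
                          char *name)
{
        struct llog_handle *llh;
        int rc;

        rc = llog_open(env, ctxt, &llh, NULL, name, LLOG_OPEN_EXISTS);
        if (rc)
                return rc;

        rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL);
        if (rc == 0)
                rc = llog_process(env, llh, demo_print_rec, NULL, NULL);

        llog_close(env, llh);
        return rc;
}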
+ */ + llog_cb_t lpd_cb; + /** + * Start processing the catalog from startcat/startidx + */ + int lpd_startcat; + int lpd_startidx; +}; + +struct llog_process_cat_data { + /** + * Temporary stored first_idx while scanning log. + */ + int lpcd_first_idx; + /** + * Temporary stored last_idx while scanning log. + */ + int lpcd_last_idx; +}; + +int llog_cat_close(const struct lu_env *env, struct llog_handle *cathandle); +int llog_cat_add_rec(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_rec_hdr *rec, struct llog_cookie *reccookie, + void *buf, struct thandle *th); +int llog_cat_declare_add_rec(const struct lu_env *env, + struct llog_handle *cathandle, + struct llog_rec_hdr *rec, struct thandle *th); +int llog_cat_add(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_rec_hdr *rec, struct llog_cookie *reccookie, + void *buf); +int llog_cat_cancel_records(const struct lu_env *env, + struct llog_handle *cathandle, int count, + struct llog_cookie *cookies); +int llog_cat_process_or_fork(const struct lu_env *env, + struct llog_handle *cat_llh, llog_cb_t cb, + void *data, int startcat, int startidx, bool fork); +int llog_cat_process(const struct lu_env *env, struct llog_handle *cat_llh, + llog_cb_t cb, void *data, int startcat, int startidx); +int llog_cat_reverse_process(const struct lu_env *env, + struct llog_handle *cat_llh, llog_cb_t cb, + void *data); +int llog_cat_init_and_process(const struct lu_env *env, + struct llog_handle *llh); + +/* llog_obd.c */ +int llog_setup(const struct lu_env *env, struct obd_device *obd, + struct obd_llog_group *olg, int index, + struct obd_device *disk_obd, struct llog_operations *op); +int __llog_ctxt_put(const struct lu_env *env, struct llog_ctxt *ctxt); +int llog_cleanup(const struct lu_env *env, struct llog_ctxt *); +int llog_sync(struct llog_ctxt *ctxt, struct obd_export *exp, int flags); +int llog_cancel(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_cookie *cookies, int flags); + +/* llog_net.c */ +int llog_initiator_connect(struct llog_ctxt *ctxt); + +struct llog_operations { + int (*lop_destroy)(const struct lu_env *env, + struct llog_handle *handle); + int (*lop_next_block)(const struct lu_env *env, struct llog_handle *h, + int *curr_idx, int next_idx, __u64 *offset, + void *buf, int len); + int (*lop_prev_block)(const struct lu_env *env, struct llog_handle *h, + int prev_idx, void *buf, int len); + int (*lop_read_header)(const struct lu_env *env, + struct llog_handle *handle); + int (*lop_setup)(const struct lu_env *env, struct obd_device *obd, + struct obd_llog_group *olg, int ctxt_idx, + struct obd_device *disk_obd); + int (*lop_sync)(struct llog_ctxt *ctxt, struct obd_export *exp, + int flags); + int (*lop_cleanup)(const struct lu_env *env, struct llog_ctxt *ctxt); + int (*lop_cancel)(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_cookie *cookies, int flags); + int (*lop_connect)(struct llog_ctxt *ctxt, struct llog_logid *logid, + struct llog_gen *gen, struct obd_uuid *uuid); + /** + * Any llog file must be opened first using llog_open(). Llog can be + * opened by name, logid or without both, in last case the new logid + * will be generated. + */ + int (*lop_open)(const struct lu_env *env, struct llog_handle *lgh, + struct llog_logid *logid, char *name, + enum llog_open_param); + /** + * Opened llog may not exist and this must be checked where needed using + * the llog_exist() call. 
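For the catalog API, a hedged sketch of walking every record of every plain log referenced by a catalog with llog_cat_process() declared above; cathandle is assumed to be an already opened and initialized catalog handle, and the callback and counter are hypothetical.

static int demo_cat_cb(const struct lu_env *env, struct llog_handle *llh,
                       struct llog_rec_hdr *rec, void *data)
{
        int *count = data;

        (*count)++;     /* llh is the plain log currently being scanned */
        return 0;
}

static int demo_count_cat_records(const struct lu_env *env,
                                  struct llog_handle *cathandle)
{
        int count = 0;
        int rc;

        /* startcat = 0, startidx = 0: process from the beginning */
        rc = llog_cat_process(env, cathandle, demo_cat_cb, &count, 0, 0);
        return rc < 0 ? rc : count;
}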
+ */ + int (*lop_exist)(struct llog_handle *lgh); + /** + * Close llog file and calls llog_free_handle() implicitly. + * Any opened llog must be closed by llog_close() call. + */ + int (*lop_close)(const struct lu_env *env, struct llog_handle *handle); + /** + * Create new llog file. The llog must be opened. + * Must be used only for local llog operations. + */ + int (*lop_declare_create)(const struct lu_env *env, + struct llog_handle *handle, + struct thandle *th); + int (*lop_create)(const struct lu_env *env, struct llog_handle *handle, + struct thandle *th); + /** + * write new record in llog. It appends records usually but can edit + * existing records too. + */ + int (*lop_declare_write_rec)(const struct lu_env *env, + struct llog_handle *lgh, + struct llog_rec_hdr *rec, + int idx, struct thandle *th); + int (*lop_write_rec)(const struct lu_env *env, + struct llog_handle *loghandle, + struct llog_rec_hdr *rec, + struct llog_cookie *cookie, int cookiecount, + void *buf, int idx, struct thandle *th); + /** + * Add new record in llog catalog. Does the same as llog_write_rec() + * but using llog catalog. + */ + int (*lop_declare_add)(const struct lu_env *env, + struct llog_handle *lgh, + struct llog_rec_hdr *rec, struct thandle *th); + int (*lop_add)(const struct lu_env *env, struct llog_handle *lgh, + struct llog_rec_hdr *rec, struct llog_cookie *cookie, + void *buf, struct thandle *th); +}; + +/* In-memory descriptor for a log object or log catalog */ +struct llog_handle { + struct rw_semaphore lgh_lock; + spinlock_t lgh_hdr_lock; /* protect lgh_hdr data */ + struct llog_logid lgh_id; /* id of this log */ + struct llog_log_hdr *lgh_hdr; + struct file *lgh_file; + struct dt_object *lgh_obj; + int lgh_last_idx; + int lgh_cur_idx; /* used during llog_process */ + __u64 lgh_cur_offset; /* used during llog_process */ + struct llog_ctxt *lgh_ctxt; + union { + struct plain_handle_data phd; + struct cat_handle_data chd; + } u; + char *lgh_name; + void *private_data; + struct llog_operations *lgh_logops; + atomic_t lgh_refcount; +}; + +#define LLOG_CTXT_FLAG_UNINITIALIZED 0x00000001 +#define LLOG_CTXT_FLAG_STOP 0x00000002 + +struct llog_ctxt { + int loc_idx; /* my index the obd array of ctxt's */ + struct obd_device *loc_obd; /* points back to the containing obd*/ + struct obd_llog_group *loc_olg; /* group containing that ctxt */ + struct obd_export *loc_exp; /* parent "disk" export (e.g. 
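The reference-counting discipline implied by llog_get_context()/llog_ctxt_put() above can be summarized in a short sketch; the context index and the check performed while the reference is held are hypothetical.

static int demo_use_ctxt(struct obd_device *obd, int ctxt_idx)
{
        struct llog_ctxt *ctxt;
        int rc;

        ctxt = llog_get_context(obd, ctxt_idx);
        if (ctxt == NULL)
                return -ENOENT;         /* no llog set up at this index */

        /* safe to dereference ctxt while we hold the reference */
        rc = ctxt->loc_idx == ctxt_idx ? 0 : -EINVAL;

        llog_ctxt_put(ctxt);            /* balances llog_get_context() */
        return rc;
}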
MDS) */ + struct obd_import *loc_imp; /* to use in RPC's: can be backward + pointing import */ + struct llog_operations *loc_logops; + struct llog_handle *loc_handle; + struct mutex loc_mutex; /* protect loc_imp */ + atomic_t loc_refcount; + long loc_flags; /* flags, see above defines */ + struct dt_object *loc_dir; +}; + +#define LLOG_PROC_BREAK 0x0001 +#define LLOG_DEL_RECORD 0x0002 + +static inline int llog_obd2ops(struct llog_ctxt *ctxt, + struct llog_operations **lop) +{ + if (ctxt == NULL) + return -ENOTCONN; + + *lop = ctxt->loc_logops; + if (*lop == NULL) + return -EOPNOTSUPP; + + return 0; +} + +static inline int llog_handle2ops(struct llog_handle *loghandle, + struct llog_operations **lop) +{ + if (loghandle == NULL || loghandle->lgh_logops == NULL) + return -EINVAL; + + *lop = loghandle->lgh_logops; + return 0; +} + +static inline int llog_data_len(int len) +{ + return cfs_size_round(len); +} + +static inline int llog_get_size(struct llog_handle *loghandle) +{ + if (loghandle && loghandle->lgh_hdr) + return loghandle->lgh_hdr->llh_count; + return 0; +} + +static inline struct llog_ctxt *llog_ctxt_get(struct llog_ctxt *ctxt) +{ + atomic_inc(&ctxt->loc_refcount); + CDEBUG(D_INFO, "GETting ctxt %p : new refcount %d\n", ctxt, + atomic_read(&ctxt->loc_refcount)); + return ctxt; +} + +static inline void llog_ctxt_put(struct llog_ctxt *ctxt) +{ + if (ctxt == NULL) + return; + LASSERT_ATOMIC_GT_LT(&ctxt->loc_refcount, 0, LI_POISON); + CDEBUG(D_INFO, "PUTting ctxt %p : new refcount %d\n", ctxt, + atomic_read(&ctxt->loc_refcount) - 1); + __llog_ctxt_put(NULL, ctxt); +} + +static inline void llog_group_init(struct obd_llog_group *olg, int group) +{ + init_waitqueue_head(&olg->olg_waitq); + spin_lock_init(&olg->olg_lock); + mutex_init(&olg->olg_cat_processing); + olg->olg_seq = group; +} + +static inline int llog_group_set_ctxt(struct obd_llog_group *olg, + struct llog_ctxt *ctxt, int index) +{ + LASSERT(index >= 0 && index < LLOG_MAX_CTXTS); + + spin_lock(&olg->olg_lock); + if (olg->olg_ctxts[index] != NULL) { + spin_unlock(&olg->olg_lock); + return -EEXIST; + } + olg->olg_ctxts[index] = ctxt; + spin_unlock(&olg->olg_lock); + return 0; +} + +static inline struct llog_ctxt *llog_group_get_ctxt(struct obd_llog_group *olg, + int index) +{ + struct llog_ctxt *ctxt; + + LASSERT(index >= 0 && index < LLOG_MAX_CTXTS); + + spin_lock(&olg->olg_lock); + if (olg->olg_ctxts[index] == NULL) + ctxt = NULL; + else + ctxt = llog_ctxt_get(olg->olg_ctxts[index]); + spin_unlock(&olg->olg_lock); + return ctxt; +} + +static inline void llog_group_clear_ctxt(struct obd_llog_group *olg, int index) +{ + LASSERT(index >= 0 && index < LLOG_MAX_CTXTS); + spin_lock(&olg->olg_lock); + olg->olg_ctxts[index] = NULL; + spin_unlock(&olg->olg_lock); +} + +static inline struct llog_ctxt *llog_get_context(struct obd_device *obd, + int index) +{ + return llog_group_get_ctxt(&obd->obd_olg, index); +} + +static inline int llog_group_ctxt_null(struct obd_llog_group *olg, int index) +{ + return (olg->olg_ctxts[index] == NULL); +} + +static inline int llog_ctxt_null(struct obd_device *obd, int index) +{ + return llog_group_ctxt_null(&obd->obd_olg, index); +} + +static inline int llog_destroy(const struct lu_env *env, + struct llog_handle *handle) +{ + struct llog_operations *lop; + int rc; + + rc = llog_handle2ops(handle, &lop); + if (rc) + return rc; + if (lop->lop_destroy == NULL) + return -EOPNOTSUPP; + + rc = lop->lop_destroy(env, handle); + return rc; +} + +static inline int llog_next_block(const struct lu_env *env, + struct 
llog_handle *loghandle, int *cur_idx, + int next_idx, __u64 *cur_offset, void *buf, + int len) +{ + struct llog_operations *lop; + int rc; + + rc = llog_handle2ops(loghandle, &lop); + if (rc) + return rc; + if (lop->lop_next_block == NULL) + return -EOPNOTSUPP; + + rc = lop->lop_next_block(env, loghandle, cur_idx, next_idx, + cur_offset, buf, len); + return rc; +} + +static inline int llog_prev_block(const struct lu_env *env, + struct llog_handle *loghandle, + int prev_idx, void *buf, int len) +{ + struct llog_operations *lop; + int rc; + + rc = llog_handle2ops(loghandle, &lop); + if (rc) + return rc; + if (lop->lop_prev_block == NULL) + return -EOPNOTSUPP; + + rc = lop->lop_prev_block(env, loghandle, prev_idx, buf, len); + return rc; +} + +static inline int llog_connect(struct llog_ctxt *ctxt, + struct llog_logid *logid, struct llog_gen *gen, + struct obd_uuid *uuid) +{ + struct llog_operations *lop; + int rc; + + rc = llog_obd2ops(ctxt, &lop); + if (rc) + return rc; + if (lop->lop_connect == NULL) + return -EOPNOTSUPP; + + rc = lop->lop_connect(ctxt, logid, gen, uuid); + return rc; +} + +/* llog.c */ +int llog_exist(struct llog_handle *loghandle); +int llog_declare_create(const struct lu_env *env, + struct llog_handle *loghandle, struct thandle *th); +int llog_create(const struct lu_env *env, struct llog_handle *handle, + struct thandle *th); +int llog_declare_write_rec(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr *rec, int idx, + struct thandle *th); +int llog_write_rec(const struct lu_env *env, struct llog_handle *handle, + struct llog_rec_hdr *rec, struct llog_cookie *logcookies, + int numcookies, void *buf, int idx, struct thandle *th); +int llog_add(const struct lu_env *env, struct llog_handle *lgh, + struct llog_rec_hdr *rec, struct llog_cookie *logcookies, + void *buf, struct thandle *th); +int llog_declare_add(const struct lu_env *env, struct llog_handle *lgh, + struct llog_rec_hdr *rec, struct thandle *th); +int lustre_process_log(struct super_block *sb, char *logname, + struct config_llog_instance *cfg); +int lustre_end_log(struct super_block *sb, char *logname, + struct config_llog_instance *cfg); +int llog_open_create(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_handle **res, struct llog_logid *logid, + char *name); +int llog_erase(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_logid *logid, char *name); +int llog_write(const struct lu_env *env, struct llog_handle *loghandle, + struct llog_rec_hdr *rec, struct llog_cookie *reccookie, + int cookiecount, void *buf, int idx); + +/** @} log */ + +#endif diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre_mdc.h b/kernel/drivers/staging/lustre/lustre/include/lustre_mdc.h new file mode 100644 index 000000000..b1b05c8a3 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre_mdc.h @@ -0,0 +1,191 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre_mdc.h + * + * MDS data structures. + * See also lustre_idl.h for wire formats of requests. + */ + +#ifndef _LUSTRE_MDC_H +#define _LUSTRE_MDC_H + +/** \defgroup mdc mdc + * + * @{ + */ + +#include +#include +#include "lustre_intent.h" +#include "lustre_handles.h" +#include "../../include/linux/libcfs/libcfs.h" +#include "obd_class.h" +#include "lustre/lustre_idl.h" +#include "lustre_lib.h" +#include "lustre_dlm.h" +#include "lustre_export.h" + +struct ptlrpc_client; +struct obd_export; +struct ptlrpc_request; +struct obd_device; + +struct mdc_rpc_lock { + struct mutex rpcl_mutex; + struct lookup_intent *rpcl_it; + int rpcl_fakes; +}; + +#define MDC_FAKE_RPCL_IT ((void *)0x2c0012bfUL) + +static inline void mdc_init_rpc_lock(struct mdc_rpc_lock *lck) +{ + mutex_init(&lck->rpcl_mutex); + lck->rpcl_it = NULL; +} + +static inline void mdc_get_rpc_lock(struct mdc_rpc_lock *lck, + struct lookup_intent *it) +{ + if (it != NULL && (it->it_op == IT_GETATTR || it->it_op == IT_LOOKUP || + it->it_op == IT_LAYOUT)) + return; + + /* This would normally block until the existing request finishes. + * If fail_loc is set it will block until the regular request is + * done, then set rpcl_it to MDC_FAKE_RPCL_IT. Once that is set + * it will only be cleared when all fake requests are finished. + * Only when all fake requests are finished can normal requests + * be sent, to ensure they are recoverable again. */ + again: + mutex_lock(&lck->rpcl_mutex); + + if (CFS_FAIL_CHECK_QUIET(OBD_FAIL_MDC_RPCS_SEM)) { + lck->rpcl_it = MDC_FAKE_RPCL_IT; + lck->rpcl_fakes++; + mutex_unlock(&lck->rpcl_mutex); + return; + } + + /* This will only happen when the CFS_FAIL_CHECK() was + * just turned off but there are still requests in progress. + * Wait until they finish. It doesn't need to be efficient + * in this extremely rare case, just have low overhead in + * the common case when it isn't true. */ + while (unlikely(lck->rpcl_it == MDC_FAKE_RPCL_IT)) { + mutex_unlock(&lck->rpcl_mutex); + schedule_timeout(cfs_time_seconds(1) / 4); + goto again; + } + + LASSERT(lck->rpcl_it == NULL); + lck->rpcl_it = it; +} + +static inline void mdc_put_rpc_lock(struct mdc_rpc_lock *lck, + struct lookup_intent *it) +{ + if (it != NULL && (it->it_op == IT_GETATTR || it->it_op == IT_LOOKUP || + it->it_op == IT_LAYOUT)) + return; + + if (lck->rpcl_it == MDC_FAKE_RPCL_IT) { /* OBD_FAIL_MDC_RPCS_SEM */ + mutex_lock(&lck->rpcl_mutex); + + LASSERTF(lck->rpcl_fakes > 0, "%d\n", lck->rpcl_fakes); + lck->rpcl_fakes--; + + if (lck->rpcl_fakes == 0) + lck->rpcl_it = NULL; + + } else { + LASSERTF(it == lck->rpcl_it, "%p != %p\n", it, lck->rpcl_it); + lck->rpcl_it = NULL; + } + + mutex_unlock(&lck->rpcl_mutex); +} + +/* Update the maximum observed easize and cookiesize. 
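A brief sketch of the pairing expected by mdc_get_rpc_lock()/mdc_put_rpc_lock() defined above: modifying metadata RPCs are serialized one at a time per client, while GETATTR/LOOKUP/LAYOUT intents pass straight through. ptlrpc_queue_wait() is assumed to be the usual synchronous PtlRPC send helper and is not declared in this header.

static int demo_send_serialized(struct mdc_rpc_lock *rpc_lock,
                                struct lookup_intent *it,
                                struct ptlrpc_request *req)
{
        int rc;

        mdc_get_rpc_lock(rpc_lock, it); /* may sleep on rpcl_mutex */
        rc = ptlrpc_queue_wait(req);    /* assumed PtlRPC helper */
        mdc_put_rpc_lock(rpc_lock, it);

        return rc;
}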
The default easize + * and cookiesize is initialized to the minimum value but allowed to grow + * up to a single page in size if required to handle the common case. + */ +static inline void mdc_update_max_ea_from_body(struct obd_export *exp, + struct mdt_body *body) +{ + if (body->valid & OBD_MD_FLMODEASIZE) { + struct client_obd *cli = &exp->exp_obd->u.cli; + + if (cli->cl_max_mds_easize < body->max_mdsize) { + cli->cl_max_mds_easize = body->max_mdsize; + cli->cl_default_mds_easize = + min_t(__u32, body->max_mdsize, PAGE_CACHE_SIZE); + } + if (cli->cl_max_mds_cookiesize < body->max_cookiesize) { + cli->cl_max_mds_cookiesize = body->max_cookiesize; + cli->cl_default_mds_cookiesize = + min_t(__u32, body->max_cookiesize, PAGE_CACHE_SIZE); + } + } +} + + +struct mdc_cache_waiter { + struct list_head mcw_entry; + wait_queue_head_t mcw_waitq; +}; + +/* mdc/mdc_locks.c */ +int it_disposition(struct lookup_intent *it, int flag); +void it_clear_disposition(struct lookup_intent *it, int flag); +void it_set_disposition(struct lookup_intent *it, int flag); +int it_open_error(int phase, struct lookup_intent *it); + +static inline bool cl_is_lov_delay_create(unsigned int flags) +{ + return (flags & O_LOV_DELAY_CREATE) == O_LOV_DELAY_CREATE; +} + +static inline void cl_lov_delay_create_clear(unsigned int *flags) +{ + if ((*flags & O_LOV_DELAY_CREATE) == O_LOV_DELAY_CREATE) + *flags &= ~O_LOV_DELAY_CREATE; +} + +/** @} mdc */ + +#endif diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre_mds.h b/kernel/drivers/staging/lustre/lustre/include/lustre_mds.h new file mode 100644 index 000000000..f0cce41c5 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre_mds.h @@ -0,0 +1,81 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre_mds.h + * + * MDS data structures. + * See also lustre_idl.h for wire formats of requests. 
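A small userspace model of the easize update described above: the maximum grows to whatever the server reports, while the default is capped at one page (PAGE_CACHE_SIZE, assumed 4096 here). Names and numbers are illustrative only.

#include <stdio.h>

#define DEMO_PAGE_SIZE 4096U

struct demo_client {
        unsigned int max_easize;
        unsigned int default_easize;
};

static void demo_update_easize(struct demo_client *cli, unsigned int reported)
{
        if (cli->max_easize < reported) {
                cli->max_easize = reported;
                cli->default_easize = reported < DEMO_PAGE_SIZE ?
                                      reported : DEMO_PAGE_SIZE;
        }
}

int main(void)
{
        struct demo_client cli = { .max_easize = 128, .default_easize = 128 };

        demo_update_easize(&cli, 1024);   /* default follows: 1024 */
        demo_update_easize(&cli, 65536);  /* default capped at 4096 */
        printf("max %u default %u\n", cli.max_easize, cli.default_easize);
        return 0;
}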
+ */ + +#ifndef _LUSTRE_MDS_H +#define _LUSTRE_MDS_H + +/** \defgroup mds mds + * + * @{ + */ + +#include "lustre_handles.h" +#include "../../include/linux/libcfs/libcfs.h" +#include "lustre/lustre_idl.h" +#include "lustre_lib.h" +#include "lustre_dlm.h" +#include "lustre_export.h" + +struct mds_group_info { + struct obd_uuid *uuid; + int group; +}; + +struct mds_capa_info { + struct obd_uuid *uuid; + struct lustre_capa_key *capa; +}; + +#define MDD_OBD_NAME "mdd_obd" +#define MDD_OBD_UUID "mdd_obd_uuid" + +static inline int md_should_create(__u64 flags) +{ + return !(flags & MDS_OPEN_DELAY_CREATE || + !(flags & FMODE_WRITE)); +} + +/* these are local flags, used only on the client, private */ +#define M_CHECK_STALE 0200000000 + +/** @} mds */ + +#endif diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre_net.h b/kernel/drivers/staging/lustre/lustre/include/lustre_net.h new file mode 100644 index 000000000..e2805bd1a --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre_net.h @@ -0,0 +1,2967 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +/** \defgroup PtlRPC Portal RPC and networking module. + * + * PortalRPC is the layer used by rest of lustre code to achieve network + * communications: establish connections with corresponding export and import + * states, listen for a service, send and receive RPCs. + * PortalRPC also includes base recovery framework: packet resending and + * replaying, reconnections, pinger. + * + * PortalRPC utilizes LNet as its transport layer. + * + * @{ + */ + + +#ifndef _LUSTRE_NET_H +#define _LUSTRE_NET_H + +/** \defgroup net net + * + * @{ + */ + +#include "../../include/linux/libcfs/libcfs.h" +// #include +#include "../../include/linux/lnet/lnet.h" +#include "lustre/lustre_idl.h" +#include "lustre_ha.h" +#include "lustre_sec.h" +#include "lustre_import.h" +#include "lprocfs_status.h" +#include "lu_object.h" +#include "lustre_req_layout.h" + +#include "obd_support.h" +#include "lustre_ver.h" + +/* MD flags we _always_ use */ +#define PTLRPC_MD_OPTIONS 0 + +/** + * Max # of bulk operations in one request. + * In order for the client and server to properly negotiate the maximum + * possible transfer size, PTLRPC_BULK_OPS_COUNT must be a power-of-two + * value. 
The client is free to limit the actual RPC size for any bulk + * transfer via cl_max_pages_per_rpc to some non-power-of-two value. */ +#define PTLRPC_BULK_OPS_BITS 2 +#define PTLRPC_BULK_OPS_COUNT (1U << PTLRPC_BULK_OPS_BITS) +/** + * PTLRPC_BULK_OPS_MASK is for the convenience of the client only, and + * should not be used on the server at all. Otherwise, it imposes a + * protocol limitation on the maximum RPC size that can be used by any + * RPC sent to that server in the future. Instead, the server should + * use the negotiated per-client ocd_brw_size to determine the bulk + * RPC count. */ +#define PTLRPC_BULK_OPS_MASK (~((__u64)PTLRPC_BULK_OPS_COUNT - 1)) + +/** + * Define maxima for bulk I/O. + * + * A single PTLRPC BRW request is sent via up to PTLRPC_BULK_OPS_COUNT + * of LNET_MTU sized RDMA transfers. Clients and servers negotiate the + * currently supported maximum between peers at connect via ocd_brw_size. + */ +#define PTLRPC_MAX_BRW_BITS (LNET_MTU_BITS + PTLRPC_BULK_OPS_BITS) +#define PTLRPC_MAX_BRW_SIZE (1 << PTLRPC_MAX_BRW_BITS) +#define PTLRPC_MAX_BRW_PAGES (PTLRPC_MAX_BRW_SIZE >> PAGE_CACHE_SHIFT) + +#define ONE_MB_BRW_SIZE (1 << LNET_MTU_BITS) +#define MD_MAX_BRW_SIZE (1 << LNET_MTU_BITS) +#define MD_MAX_BRW_PAGES (MD_MAX_BRW_SIZE >> PAGE_CACHE_SHIFT) +#define DT_MAX_BRW_SIZE PTLRPC_MAX_BRW_SIZE +#define DT_MAX_BRW_PAGES (DT_MAX_BRW_SIZE >> PAGE_CACHE_SHIFT) +#define OFD_MAX_BRW_SIZE (1 << LNET_MTU_BITS) + +/* When PAGE_SIZE is a constant, we can check our arithmetic here with cpp! */ +# if ((PTLRPC_MAX_BRW_PAGES & (PTLRPC_MAX_BRW_PAGES - 1)) != 0) +# error "PTLRPC_MAX_BRW_PAGES isn't a power of two" +# endif +# if (PTLRPC_MAX_BRW_SIZE != (PTLRPC_MAX_BRW_PAGES * PAGE_CACHE_SIZE)) +# error "PTLRPC_MAX_BRW_SIZE isn't PTLRPC_MAX_BRW_PAGES * PAGE_CACHE_SIZE" +# endif +# if (PTLRPC_MAX_BRW_SIZE > LNET_MTU * PTLRPC_BULK_OPS_COUNT) +# error "PTLRPC_MAX_BRW_SIZE too big" +# endif +# if (PTLRPC_MAX_BRW_PAGES > LNET_MAX_IOV * PTLRPC_BULK_OPS_COUNT) +# error "PTLRPC_MAX_BRW_PAGES too big" +# endif + +#define PTLRPC_NTHRS_INIT 2 + +/** + * Buffer Constants + * + * Constants determine how memory is used to buffer incoming service requests. + * + * ?_NBUFS # buffers to allocate when growing the pool + * ?_BUFSIZE # bytes in a single request buffer + * ?_MAXREQSIZE # maximum request service will receive + * + * When fewer than ?_NBUFS/2 buffers are posted for receive, another chunk + * of ?_NBUFS is added to the pool. + * + * Messages larger than ?_MAXREQSIZE are dropped. Request buffers are + * considered full when less than ?_MAXREQSIZE is left in them. + */ +/** + * Thread Constants + * + * Constants determine how threads are created for ptlrpc service. + * + * ?_NTHRS_INIT # threads to create for each service partition on + * initializing. If it's non-affinity service and + * there is only one partition, it's the overall # + * threads for the service while initializing. + * ?_NTHRS_BASE # threads should be created at least for each + * ptlrpc partition to keep the service healthy. + * It's the low-water mark of threads upper-limit + * for each partition. + * ?_THR_FACTOR # threads can be added on threads upper-limit for + * each CPU core. This factor is only for reference, + * we might decrease value of factor if number of cores + * per CPT is above a limit. 
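A quick userspace check of the bulk-size arithmetic above, assuming the usual LNET_MTU_BITS = 20 (1 MB LNet MTU) and 4 KB pages; the values are for illustration only.

#include <stdio.h>

#define DEMO_LNET_MTU_BITS      20
#define DEMO_PAGE_SHIFT         12
#define DEMO_BULK_OPS_BITS      2

int main(void)
{
        unsigned int ops   = 1U << DEMO_BULK_OPS_BITS;                    /* 4 */
        unsigned int size  = 1U << (DEMO_LNET_MTU_BITS + DEMO_BULK_OPS_BITS);
        unsigned int pages = size >> DEMO_PAGE_SHIFT;

        /* 4 bulk ops of 1 MB each: 4 MB per BRW RPC, 1024 pages of 4 KB */
        printf("ops=%u max_brw_size=%u max_brw_pages=%u\n", ops, size, pages);
        return 0;
}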
+ * ?_NTHRS_MAX # overall threads can be created for a service, + * it's a soft limit because if service is running + * on machine with hundreds of cores and tens of + * CPU partitions, we need to guarantee each partition + * has ?_NTHRS_BASE threads, which means total threads + * will be ?_NTHRS_BASE * number_of_cpts which can + * exceed ?_NTHRS_MAX. + * + * Examples + * + * #define MDS_NTHRS_INIT 2 + * #define MDS_NTHRS_BASE 64 + * #define MDS_NTHRS_FACTOR 8 + * #define MDS_NTHRS_MAX 1024 + * + * Example 1): + * --------------------------------------------------------------------- + * Server(A) has 16 cores, user configured it to 4 partitions so each + * partition has 4 cores, then actual number of service threads on each + * partition is: + * MDS_NTHRS_BASE(64) + cores(4) * MDS_NTHRS_FACTOR(8) = 96 + * + * Total number of threads for the service is: + * 96 * partitions(4) = 384 + * + * Example 2): + * --------------------------------------------------------------------- + * Server(B) has 32 cores, user configured it to 4 partitions so each + * partition has 8 cores, then actual number of service threads on each + * partition is: + * MDS_NTHRS_BASE(64) + cores(8) * MDS_NTHRS_FACTOR(8) = 128 + * + * Total number of threads for the service is: + * 128 * partitions(4) = 512 + * + * Example 3): + * --------------------------------------------------------------------- + * Server(B) has 96 cores, user configured it to 8 partitions so each + * partition has 12 cores, then actual number of service threads on each + * partition is: + * MDS_NTHRS_BASE(64) + cores(12) * MDS_NTHRS_FACTOR(8) = 160 + * + * Total number of threads for the service is: + * 160 * partitions(8) = 1280 + * + * However, it's above the soft limit MDS_NTHRS_MAX, so we choose this number + * as upper limit of threads number for each partition: + * MDS_NTHRS_MAX(1024) / partitions(8) = 128 + * + * Example 4): + * --------------------------------------------------------------------- + * Server(C) have a thousand of cores and user configured it to 32 partitions + * MDS_NTHRS_BASE(64) * 32 = 2048 + * + * which is already above soft limit MDS_NTHRS_MAX(1024), but we still need + * to guarantee that each partition has at least MDS_NTHRS_BASE(64) threads + * to keep service healthy, so total number of threads will just be 2048. + * + * NB: we don't suggest to choose server with that many cores because backend + * filesystem itself, buffer cache, or underlying network stack might + * have some SMP scalability issues at that large scale. + * + * If user already has a fat machine with hundreds or thousands of cores, + * there are two choices for configuration: + * a) create CPU table from subset of all CPUs and run Lustre on + * top of this subset + * b) bind service threads on a few partitions, see modparameters of + * MDS and OSS for details +* + * NB: these calculations (and examples below) are simplified to help + * understanding, the real implementation is a little more complex, + * please see ptlrpc_server_nthreads_check() for details. + * + */ + + /* + * LDLM threads constants: + * + * Given 8 as factor and 24 as base threads number + * + * example 1) + * On 4-core machine we will have 24 + 8 * 4 = 56 threads. + * + * example 2) + * On 8-core machine with 2 partitions we will have 24 + 4 * 8 = 56 + * threads for each partition and total threads number will be 112. 
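The four MDS examples above follow a simple per-partition rule; a self-contained userspace model reproducing their numbers is sketched below (simplified, the real logic lives in ptlrpc_server_nthreads_check()).

#include <stdio.h>

#define DEMO_NTHRS_BASE   64
#define DEMO_NTHRS_FACTOR 8
#define DEMO_NTHRS_MAX    1024

static unsigned int demo_nthrs_per_partition(unsigned int cores,
                                             unsigned int partitions)
{
        unsigned int per_cpt = cores / partitions;
        unsigned int nthrs   = DEMO_NTHRS_BASE + per_cpt * DEMO_NTHRS_FACTOR;
        unsigned int cap     = DEMO_NTHRS_MAX / partitions;

        if (nthrs > cap)
                nthrs = cap;                    /* soft upper limit */
        if (nthrs < DEMO_NTHRS_BASE)
                nthrs = DEMO_NTHRS_BASE;        /* keep the service healthy */
        return nthrs;
}

int main(void)
{
        /* Examples 1-4 above: 96, 128, 128 and 64 threads per partition */
        printf("%u %u %u %u\n",
               demo_nthrs_per_partition(16, 4),
               demo_nthrs_per_partition(32, 4),
               demo_nthrs_per_partition(96, 8),
               demo_nthrs_per_partition(1000, 32));
        return 0;
}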
+ * + * example 3) + * On 64-core machine with 8 partitions we will need LDLM_NTHRS_BASE(24) + * threads for each partition to keep service healthy, so total threads + * number should be 24 * 8 = 192. + * + * So with these constants, threads number will be at the similar level + * of old versions, unless target machine has over a hundred cores + */ +#define LDLM_THR_FACTOR 8 +#define LDLM_NTHRS_INIT PTLRPC_NTHRS_INIT +#define LDLM_NTHRS_BASE 24 +#define LDLM_NTHRS_MAX (num_online_cpus() == 1 ? 64 : 128) + +#define LDLM_BL_THREADS LDLM_NTHRS_AUTO_INIT +#define LDLM_CLIENT_NBUFS 1 +#define LDLM_SERVER_NBUFS 64 +#define LDLM_BUFSIZE (8 * 1024) +#define LDLM_MAXREQSIZE (5 * 1024) +#define LDLM_MAXREPSIZE (1024) + +#define MDS_MAXREQSIZE (5 * 1024) /* >= 4736 */ + +#define OST_MAXREQSIZE (5 * 1024) + +/* Macro to hide a typecast. */ +#define ptlrpc_req_async_args(req) ((void *)&req->rq_async_args) + +/** + * Structure to single define portal connection. + */ +struct ptlrpc_connection { + /** linkage for connections hash table */ + struct hlist_node c_hash; + /** Our own lnet nid for this connection */ + lnet_nid_t c_self; + /** Remote side nid for this connection */ + lnet_process_id_t c_peer; + /** UUID of the other side */ + struct obd_uuid c_remote_uuid; + /** reference counter for this connection */ + atomic_t c_refcount; +}; + +/** Client definition for PortalRPC */ +struct ptlrpc_client { + /** What lnet portal does this client send messages to by default */ + __u32 cli_request_portal; + /** What portal do we expect replies on */ + __u32 cli_reply_portal; + /** Name of the client */ + char *cli_name; +}; + +/** state flags of requests */ +/* XXX only ones left are those used by the bulk descs as well! */ +#define PTL_RPC_FL_INTR (1 << 0) /* reply wait was interrupted by user */ +#define PTL_RPC_FL_TIMEOUT (1 << 7) /* request timed out waiting for reply */ + +#define REQ_MAX_ACK_LOCKS 8 + +union ptlrpc_async_args { + /** + * Scratchpad for passing args to completion interpreter. Users + * cast to the struct of their choosing, and CLASSERT that this is + * big enough. For _tons_ of context, OBD_ALLOC a struct and store + * a pointer to it here. The pointer_arg ensures this struct is at + * least big enough for that. + */ + void *pointer_arg[11]; + __u64 space[7]; +}; + +struct ptlrpc_request_set; +typedef int (*set_interpreter_func)(struct ptlrpc_request_set *, void *, int); +typedef int (*set_producer_func)(struct ptlrpc_request_set *, void *); + +/** + * Definition of request set structure. + * Request set is a list of requests (not necessary to the same target) that + * once populated with RPCs could be sent in parallel. + * There are two kinds of request sets. General purpose and with dedicated + * serving thread. Example of the latter is ptlrpcd set. + * For general purpose sets once request set started sending it is impossible + * to add new requests to such set. + * Provides a way to call "completion callbacks" when all requests in the set + * returned. + */ +struct ptlrpc_request_set { + atomic_t set_refcount; + /** number of in queue requests */ + atomic_t set_new_count; + /** number of uncompleted requests */ + atomic_t set_remaining; + /** wait queue to wait on for request events */ + wait_queue_head_t set_waitq; + wait_queue_head_t *set_wakeup_ptr; + /** List of requests in the set */ + struct list_head set_requests; + /** + * List of completion callbacks to be called when the set is completed + * This is only used if \a set_interpret is NULL. + * Links struct ptlrpc_set_cbdata. 
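A short sketch of how a caller stashes per-request interpreter state in ptlrpc_req_async_args(), as the union ptlrpc_async_args comment above suggests. The struct and its fields are hypothetical; CLASSERT() is the libcfs compile-time assertion.

struct demo_async_args {
        struct obd_export       *aa_exp;
        int                      aa_retries;
};

static void demo_prepare_args(struct ptlrpc_request *req,
                              struct obd_export *exp)
{
        struct demo_async_args *aa;

        /* make sure the scratchpad is big enough for our struct */
        CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
        aa = ptlrpc_req_async_args(req);
        aa->aa_exp = exp;
        aa->aa_retries = 0;
        /* read back later in the request's interpret callback */
}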
+ */ + struct list_head set_cblist; + /** Completion callback, if only one. */ + set_interpreter_func set_interpret; + /** opaq argument passed to completion \a set_interpret callback. */ + void *set_arg; + /** + * Lock for \a set_new_requests manipulations + * locked so that any old caller can communicate requests to + * the set holder who can then fold them into the lock-free set + */ + spinlock_t set_new_req_lock; + /** List of new yet unsent requests. Only used with ptlrpcd now. */ + struct list_head set_new_requests; + + /** rq_status of requests that have been freed already */ + int set_rc; + /** Additional fields used by the flow control extension */ + /** Maximum number of RPCs in flight */ + int set_max_inflight; + /** Callback function used to generate RPCs */ + set_producer_func set_producer; + /** opaq argument passed to the producer callback */ + void *set_producer_arg; +}; + +/** + * Description of a single ptrlrpc_set callback + */ +struct ptlrpc_set_cbdata { + /** List linkage item */ + struct list_head psc_item; + /** Pointer to interpreting function */ + set_interpreter_func psc_interpret; + /** Opaq argument to pass to the callback */ + void *psc_data; +}; + +struct ptlrpc_bulk_desc; +struct ptlrpc_service_part; +struct ptlrpc_service; + +/** + * ptlrpc callback & work item stuff + */ +struct ptlrpc_cb_id { + void (*cbid_fn)(lnet_event_t *ev); /* specific callback fn */ + void *cbid_arg; /* additional arg */ +}; + +/** Maximum number of locks to fit into reply state */ +#define RS_MAX_LOCKS 8 +#define RS_DEBUG 0 + +/** + * Structure to define reply state on the server + * Reply state holds various reply message information. Also for "difficult" + * replies (rep-ack case) we store the state after sending reply and wait + * for the client to acknowledge the reception. In these cases locks could be + * added to the state for replay/failover consistency guarantees. + */ +struct ptlrpc_reply_state { + /** Callback description */ + struct ptlrpc_cb_id rs_cb_id; + /** Linkage for list of all reply states in a system */ + struct list_head rs_list; + /** Linkage for list of all reply states on same export */ + struct list_head rs_exp_list; + /** Linkage for list of all reply states for same obd */ + struct list_head rs_obd_list; +#if RS_DEBUG + struct list_head rs_debug_list; +#endif + /** A spinlock to protect the reply state flags */ + spinlock_t rs_lock; + /** Reply state flags */ + unsigned long rs_difficult:1; /* ACK/commit stuff */ + unsigned long rs_no_ack:1; /* no ACK, even for + difficult requests */ + unsigned long rs_scheduled:1; /* being handled? */ + unsigned long rs_scheduled_ever:1;/* any schedule attempts? */ + unsigned long rs_handled:1; /* been handled yet? */ + unsigned long rs_on_net:1; /* reply_out_callback pending? 
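The request-set life cycle described above is usually driven as follows: populate a set with prepared requests, send them in parallel, then wait for all completions. ptlrpc_prep_set(), ptlrpc_set_add_req(), ptlrpc_set_wait() and ptlrpc_set_destroy() are assumed to be the PtlRPC helpers declared elsewhere; this is a sketch, not part of the patch.

static int demo_send_in_parallel(struct ptlrpc_request **reqs, int nr)
{
        struct ptlrpc_request_set *set;
        int i, rc;

        set = ptlrpc_prep_set();
        if (set == NULL)
                return -ENOMEM;

        for (i = 0; i < nr; i++)
                ptlrpc_set_add_req(set, reqs[i]);

        rc = ptlrpc_set_wait(set);      /* sends and waits for every RPC */
        ptlrpc_set_destroy(set);        /* drops the set reference */
        return rc;
}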
*/ + unsigned long rs_prealloc:1; /* rs from prealloc list */ + unsigned long rs_committed:1;/* the transaction was committed + and the rs was dispatched + by ptlrpc_commit_replies */ + /** Size of the state */ + int rs_size; + /** opcode */ + __u32 rs_opc; + /** Transaction number */ + __u64 rs_transno; + /** xid */ + __u64 rs_xid; + struct obd_export *rs_export; + struct ptlrpc_service_part *rs_svcpt; + /** Lnet metadata handle for the reply */ + lnet_handle_md_t rs_md_h; + atomic_t rs_refcount; + + /** Context for the service thread */ + struct ptlrpc_svc_ctx *rs_svc_ctx; + /** Reply buffer (actually sent to the client), encoded if needed */ + struct lustre_msg *rs_repbuf; /* wrapper */ + /** Size of the reply buffer */ + int rs_repbuf_len; /* wrapper buf length */ + /** Size of the reply message */ + int rs_repdata_len; /* wrapper msg length */ + /** + * Actual reply message. Its content is encrypted (if needed) to + * produce reply buffer for actual sending. In simple case + * of no network encryption we just set \a rs_repbuf to \a rs_msg + */ + struct lustre_msg *rs_msg; /* reply message */ + + /** Number of locks awaiting client ACK */ + int rs_nlocks; + /** Handles of locks awaiting client reply ACK */ + struct lustre_handle rs_locks[RS_MAX_LOCKS]; + /** Lock modes of locks in \a rs_locks */ + ldlm_mode_t rs_modes[RS_MAX_LOCKS]; +}; + +struct ptlrpc_thread; + +/** RPC stages */ +enum rq_phase { + RQ_PHASE_NEW = 0xebc0de00, + RQ_PHASE_RPC = 0xebc0de01, + RQ_PHASE_BULK = 0xebc0de02, + RQ_PHASE_INTERPRET = 0xebc0de03, + RQ_PHASE_COMPLETE = 0xebc0de04, + RQ_PHASE_UNREGISTERING = 0xebc0de05, + RQ_PHASE_UNDEFINED = 0xebc0de06 +}; + +/** Type of request interpreter call-back */ +typedef int (*ptlrpc_interpterer_t)(const struct lu_env *env, + struct ptlrpc_request *req, + void *arg, int rc); + +/** + * Definition of request pool structure. + * The pool is used to store empty preallocated requests for the case + * when we would actually need to send something without performing + * any allocations (to avoid e.g. OOM). + */ +struct ptlrpc_request_pool { + /** Locks the list */ + spinlock_t prp_lock; + /** list of ptlrpc_request structs */ + struct list_head prp_req_list; + /** Maximum message size that would fit into a request from this pool */ + int prp_rq_size; + /** Function to allocate more requests for this pool */ + void (*prp_populate)(struct ptlrpc_request_pool *, int); +}; + +struct lu_context; +struct lu_env; + +struct ldlm_lock; + +/** + * \defgroup nrs Network Request Scheduler + * @{ + */ +struct ptlrpc_nrs_policy; +struct ptlrpc_nrs_resource; +struct ptlrpc_nrs_request; + +/** + * NRS control operations. + * + * These are common for all policies. + */ +enum ptlrpc_nrs_ctl { + /** + * Not a valid opcode. + */ + PTLRPC_NRS_CTL_INVALID, + /** + * Activate the policy. + */ + PTLRPC_NRS_CTL_START, + /** + * Reserved for multiple primary policies, which may be a possibility + * in the future. + */ + PTLRPC_NRS_CTL_STOP, + /** + * Policies can start using opcodes from this value and onwards for + * their own purposes; the assigned value itself is arbitrary. + */ + PTLRPC_NRS_CTL_1ST_POL_SPEC = 0x20, +}; + +/** + * ORR policy operations + */ +enum nrs_ctl_orr { + NRS_CTL_ORR_RD_QUANTUM = PTLRPC_NRS_CTL_1ST_POL_SPEC, + NRS_CTL_ORR_WR_QUANTUM, + NRS_CTL_ORR_RD_OFF_TYPE, + NRS_CTL_ORR_WR_OFF_TYPE, + NRS_CTL_ORR_RD_SUPP_REQ, + NRS_CTL_ORR_WR_SUPP_REQ, +}; + +/** + * NRS policy operations. + * + * These determine the behaviour of a policy, and are called in response to + * NRS core events. 
+ */ +struct ptlrpc_nrs_pol_ops { + /** + * Called during policy registration; this operation is optional. + * + * \param[in,out] policy The policy being initialized + */ + int (*op_policy_init) (struct ptlrpc_nrs_policy *policy); + /** + * Called during policy unregistration; this operation is optional. + * + * \param[in,out] policy The policy being unregistered/finalized + */ + void (*op_policy_fini) (struct ptlrpc_nrs_policy *policy); + /** + * Called when activating a policy via lprocfs; policies allocate and + * initialize their resources here; this operation is optional. + * + * \param[in,out] policy The policy being started + * + * \see nrs_policy_start_locked() + */ + int (*op_policy_start) (struct ptlrpc_nrs_policy *policy); + /** + * Called when deactivating a policy via lprocfs; policies deallocate + * their resources here; this operation is optional + * + * \param[in,out] policy The policy being stopped + * + * \see nrs_policy_stop0() + */ + void (*op_policy_stop) (struct ptlrpc_nrs_policy *policy); + /** + * Used for policy-specific operations; i.e. not generic ones like + * \e PTLRPC_NRS_CTL_START and \e PTLRPC_NRS_CTL_GET_INFO; analogous + * to an ioctl; this operation is optional. + * + * \param[in,out] policy The policy carrying out operation \a opc + * \param[in] opc The command operation being carried out + * \param[in,out] arg An generic buffer for communication between the + * user and the control operation + * + * \retval -ve error + * \retval 0 success + * + * \see ptlrpc_nrs_policy_control() + */ + int (*op_policy_ctl) (struct ptlrpc_nrs_policy *policy, + enum ptlrpc_nrs_ctl opc, void *arg); + + /** + * Called when obtaining references to the resources of the resource + * hierarchy for a request that has arrived for handling at the PTLRPC + * service. Policies should return -ve for requests they do not wish + * to handle. This operation is mandatory. + * + * \param[in,out] policy The policy we're getting resources for. + * \param[in,out] nrq The request we are getting resources for. + * \param[in] parent The parent resource of the resource being + * requested; set to NULL if none. + * \param[out] resp The resource is to be returned here; the + * fallback policy in an NRS head should + * \e always return a non-NULL pointer value. + * \param[in] moving_req When set, signifies that this is an attempt + * to obtain resources for a request being moved + * to the high-priority NRS head by + * ldlm_lock_reorder_req(). + * This implies two things: + * 1. We are under obd_export::exp_rpc_lock and + * so should not sleep. + * 2. We should not perform non-idempotent or can + * skip performing idempotent operations that + * were carried out when resources were first + * taken for the request when it was initialized + * in ptlrpc_nrs_req_initialize(). + * + * \retval 0, +ve The level of the returned resource in the resource + * hierarchy; currently only 0 (for a non-leaf resource) + * and 1 (for a leaf resource) are supported by the + * framework. + * \retval -ve error + * + * \see ptlrpc_nrs_req_initialize() + * \see ptlrpc_nrs_hpreq_add_nolock() + * \see ptlrpc_nrs_req_hp_move() + */ + int (*op_res_get) (struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq, + const struct ptlrpc_nrs_resource *parent, + struct ptlrpc_nrs_resource **resp, + bool moving_req); + /** + * Called when releasing references taken for resources in the resource + * hierarchy for the request; this operation is optional. 
+ * + * \param[in,out] policy The policy the resource belongs to + * \param[in] res The resource to be freed + * + * \see ptlrpc_nrs_req_finalize() + * \see ptlrpc_nrs_hpreq_add_nolock() + * \see ptlrpc_nrs_req_hp_move() + */ + void (*op_res_put) (struct ptlrpc_nrs_policy *policy, + const struct ptlrpc_nrs_resource *res); + + /** + * Obtains a request for handling from the policy, and optionally + * removes the request from the policy; this operation is mandatory. + * + * \param[in,out] policy The policy to poll + * \param[in] peek When set, signifies that we just want to + * examine the request, and not handle it, so the + * request is not removed from the policy. + * \param[in] force When set, it will force a policy to return a + * request if it has one queued. + * + * \retval NULL No request available for handling + * \retval valid-pointer The request polled for handling + * + * \see ptlrpc_nrs_req_get_nolock() + */ + struct ptlrpc_nrs_request * + (*op_req_get) (struct ptlrpc_nrs_policy *policy, bool peek, + bool force); + /** + * Called when attempting to add a request to a policy for later + * handling; this operation is mandatory. + * + * \param[in,out] policy The policy on which to enqueue \a nrq + * \param[in,out] nrq The request to enqueue + * + * \retval 0 success + * \retval != 0 error + * + * \see ptlrpc_nrs_req_add_nolock() + */ + int (*op_req_enqueue) (struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq); + /** + * Removes a request from the policy's set of pending requests. Normally + * called after a request has been polled successfully from the policy + * for handling; this operation is mandatory. + * + * \param[in,out] policy The policy the request \a nrq belongs to + * \param[in,out] nrq The request to dequeue + * + * \see ptlrpc_nrs_req_del_nolock() + */ + void (*op_req_dequeue) (struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq); + /** + * Called after the request being carried out. Could be used for + * job/resource control; this operation is optional. + * + * \param[in,out] policy The policy which is stopping to handle request + * \a nrq + * \param[in,out] nrq The request + * + * \pre assert_spin_locked(&svcpt->scp_req_lock) + * + * \see ptlrpc_nrs_req_stop_nolock() + */ + void (*op_req_stop) (struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq); + /** + * Registers the policy's lprocfs interface with a PTLRPC service. + * + * \param[in] svc The service + * + * \retval 0 success + * \retval != 0 error + */ + int (*op_lprocfs_init) (struct ptlrpc_service *svc); + /** + * Unegisters the policy's lprocfs interface with a PTLRPC service. + * + * In cases of failed policy registration in + * \e ptlrpc_nrs_policy_register(), this function may be called for a + * service which has not registered the policy successfully, so + * implementations of this method should make sure their operations are + * safe in such cases. + * + * \param[in] svc The service + */ + void (*op_lprocfs_fini) (struct ptlrpc_service *svc); +}; + +/** + * Policy flags + */ +enum nrs_policy_flags { + /** + * Fallback policy, use this flag only on a single supported policy per + * service. The flag cannot be used on policies that use + * \e PTLRPC_NRS_FL_REG_EXTERN + */ + PTLRPC_NRS_FL_FALLBACK = (1 << 0), + /** + * Start policy immediately after registering. + */ + PTLRPC_NRS_FL_REG_START = (1 << 1), + /** + * This is a policy registering from a module different to the one NRS + * core ships in (currently ptlrpc). 
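To make the operations table above concrete, here is the shape of a minimal policy wiring up only the mandatory handlers. Everything named "demo" is hypothetical, the nr_u.fifo.fr_list linkage is assumed to exist in struct ptlrpc_nrs_request, and a real policy would keep its queue in pol_private under the NRS head lock rather than in file-scope statics.

static struct ptlrpc_nrs_resource demo_res;
static LIST_HEAD(demo_queue);

static int demo_res_get(struct ptlrpc_nrs_policy *policy,
                        struct ptlrpc_nrs_request *nrq,
                        const struct ptlrpc_nrs_resource *parent,
                        struct ptlrpc_nrs_resource **resp, bool moving_req)
{
        *resp = &demo_res;      /* one flat resource for every request */
        return 1;               /* leaf level */
}

static int demo_req_enqueue(struct ptlrpc_nrs_policy *policy,
                            struct ptlrpc_nrs_request *nrq)
{
        /* nr_u.fifo.fr_list: assumed per-request FIFO linkage */
        list_add_tail(&nrq->nr_u.fifo.fr_list, &demo_queue);
        return 0;
}

static void demo_req_dequeue(struct ptlrpc_nrs_policy *policy,
                             struct ptlrpc_nrs_request *nrq)
{
        list_del_init(&nrq->nr_u.fifo.fr_list);
}

static struct ptlrpc_nrs_request *
demo_req_get(struct ptlrpc_nrs_policy *policy, bool peek, bool force)
{
        /* NRS core dequeues via op_req_dequeue after a successful poll */
        return list_empty(&demo_queue) ? NULL :
               list_entry(demo_queue.next, struct ptlrpc_nrs_request,
                          nr_u.fifo.fr_list);
}

static const struct ptlrpc_nrs_pol_ops demo_nrs_ops = {
        .op_res_get     = demo_res_get,
        .op_req_get     = demo_req_get,
        .op_req_enqueue = demo_req_enqueue,
        .op_req_dequeue = demo_req_dequeue,
};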
+ */ + PTLRPC_NRS_FL_REG_EXTERN = (1 << 2), +}; + +/** + * NRS queue type. + * + * Denotes whether an NRS instance is for handling normal or high-priority + * RPCs, or whether an operation pertains to one or both of the NRS instances + * in a service. + */ +enum ptlrpc_nrs_queue_type { + PTLRPC_NRS_QUEUE_REG = (1 << 0), + PTLRPC_NRS_QUEUE_HP = (1 << 1), + PTLRPC_NRS_QUEUE_BOTH = (PTLRPC_NRS_QUEUE_REG | PTLRPC_NRS_QUEUE_HP) +}; + +/** + * NRS head + * + * A PTLRPC service has at least one NRS head instance for handling normal + * priority RPCs, and may optionally have a second NRS head instance for + * handling high-priority RPCs. Each NRS head maintains a list of available + * policies, of which one and only one policy is acting as the fallback policy, + * and optionally a different policy may be acting as the primary policy. For + * all RPCs handled by this NRS head instance, NRS core will first attempt to + * enqueue the RPC using the primary policy (if any). The fallback policy is + * used in the following cases: + * - when there was no primary policy in the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state at the time the request + * was initialized. + * - when the primary policy that was at the + * ptlrpc_nrs_pol_state::PTLRPC_NRS_POL_STATE_STARTED state at the time the + * RPC was initialized, denoted it did not wish, or for some other reason was + * not able to handle the request, by returning a non-valid NRS resource + * reference. + * - when the primary policy that was at the + * ptlrpc_nrs_pol_state::PTLRPC_NRS_POL_STATE_STARTED state at the time the + * RPC was initialized, fails later during the request enqueueing stage. + * + * \see nrs_resource_get_safe() + * \see nrs_request_enqueue() + */ +struct ptlrpc_nrs { + spinlock_t nrs_lock; + /** XXX Possibly replace svcpt->scp_req_lock with another lock here. */ + /** + * List of registered policies + */ + struct list_head nrs_policy_list; + /** + * List of policies with queued requests. Policies that have any + * outstanding requests are queued here, and this list is queried + * in a round-robin manner from NRS core when obtaining a request + * for handling. This ensures that requests from policies that at some + * point transition away from the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state are drained. + */ + struct list_head nrs_policy_queued; + /** + * Service partition for this NRS head + */ + struct ptlrpc_service_part *nrs_svcpt; + /** + * Primary policy, which is the preferred policy for handling RPCs + */ + struct ptlrpc_nrs_policy *nrs_policy_primary; + /** + * Fallback policy, which is the backup policy for handling RPCs + */ + struct ptlrpc_nrs_policy *nrs_policy_fallback; + /** + * This NRS head handles either HP or regular requests + */ + enum ptlrpc_nrs_queue_type nrs_queue_type; + /** + * # queued requests from all policies in this NRS head + */ + unsigned long nrs_req_queued; + /** + * # scheduled requests from all policies in this NRS head + */ + unsigned long nrs_req_started; + /** + * # policies on this NRS + */ + unsigned nrs_num_pols; + /** + * This NRS head is in progress of starting a policy + */ + unsigned nrs_policy_starting:1; + /** + * In progress of shutting down the whole NRS head; used during + * unregistration + */ + unsigned nrs_stopping:1; +}; + +#define NRS_POL_NAME_MAX 16 + +struct ptlrpc_nrs_pol_desc; + +/** + * Service compatibility predicate; this determines whether a policy is adequate + * for handling RPCs of a particular PTLRPC service. 
+ * + * XXX:This should give the same result during policy registration and + * unregistration, and for all partitions of a service; so the result should not + * depend on temporal service or other properties, that may influence the + * result. + */ +typedef bool (*nrs_pol_desc_compat_t) (const struct ptlrpc_service *svc, + const struct ptlrpc_nrs_pol_desc *desc); + +struct ptlrpc_nrs_pol_conf { + /** + * Human-readable policy name + */ + char nc_name[NRS_POL_NAME_MAX]; + /** + * NRS operations for this policy + */ + const struct ptlrpc_nrs_pol_ops *nc_ops; + /** + * Service compatibility predicate + */ + nrs_pol_desc_compat_t nc_compat; + /** + * Set for policies that support a single ptlrpc service, i.e. ones that + * have \a pd_compat set to nrs_policy_compat_one(). The variable value + * depicts the name of the single service that such policies are + * compatible with. + */ + const char *nc_compat_svc_name; + /** + * Owner module for this policy descriptor; policies registering from a + * different module to the one the NRS framework is held within + * (currently ptlrpc), should set this field to THIS_MODULE. + */ + struct module *nc_owner; + /** + * Policy registration flags; a bitmask of \e nrs_policy_flags + */ + unsigned nc_flags; +}; + +/** + * NRS policy registering descriptor + * + * Is used to hold a description of a policy that can be passed to NRS core in + * order to register the policy with NRS heads in different PTLRPC services. + */ +struct ptlrpc_nrs_pol_desc { + /** + * Human-readable policy name + */ + char pd_name[NRS_POL_NAME_MAX]; + /** + * Link into nrs_core::nrs_policies + */ + struct list_head pd_list; + /** + * NRS operations for this policy + */ + const struct ptlrpc_nrs_pol_ops *pd_ops; + /** + * Service compatibility predicate + */ + nrs_pol_desc_compat_t pd_compat; + /** + * Set for policies that are compatible with only one PTLRPC service. + * + * \see ptlrpc_nrs_pol_conf::nc_compat_svc_name + */ + const char *pd_compat_svc_name; + /** + * Owner module for this policy descriptor. + * + * We need to hold a reference to the module whenever we might make use + * of any of the module's contents, i.e. + * - If one or more instances of the policy are at a state where they + * might be handling a request, i.e. + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED or + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING as we will have to + * call into the policy's ptlrpc_nrs_pol_ops() handlers. A reference + * is taken on the module when + * \e ptlrpc_nrs_pol_desc::pd_refs becomes 1, and released when it + * becomes 0, so that we hold only one reference to the module maximum + * at any time. + * + * We do not need to hold a reference to the module, even though we + * might use code and data from the module, in the following cases: + * - During external policy registration, because this should happen in + * the module's init() function, in which case the module is safe from + * removal because a reference is being held on the module by the + * kernel, and iirc kmod (and I guess module-init-tools also) will + * serialize any racing processes properly anyway. + * - During external policy unregistration, because this should happen + * in a module's exit() function, and any attempts to start a policy + * instance would need to take a reference on the module, and this is + * not possible once we have reached the point where the exit() + * handler is called. 
+ * - During service registration and unregistration, as service setup + * and cleanup, and policy registration, unregistration and policy + * instance starting, are serialized by \e nrs_core::nrs_mutex, so + * as long as users adhere to the convention of registering policies + * in init() and unregistering them in module exit() functions, there + * should not be a race between these operations. + * - During any policy-specific lprocfs operations, because a reference + * is held by the kernel on a proc entry that has been entered by a + * syscall, so as long as proc entries are removed during unregistration time, + * then unregistration and lprocfs operations will be properly + * serialized. + */ + struct module *pd_owner; + /** + * Bitmask of \e nrs_policy_flags + */ + unsigned pd_flags; + /** + * # of references on this descriptor + */ + atomic_t pd_refs; +}; + +/** + * NRS policy state + * + * Policies transition from one state to the other during their lifetime + */ +enum ptlrpc_nrs_pol_state { + /** + * Not a valid policy state. + */ + NRS_POL_STATE_INVALID, + /** + * Policies are at this state either at the start of their life, or + * transition here when the user selects a different policy to act + * as the primary one. + */ + NRS_POL_STATE_STOPPED, + /** + * Policy is progress of stopping + */ + NRS_POL_STATE_STOPPING, + /** + * Policy is in progress of starting + */ + NRS_POL_STATE_STARTING, + /** + * A policy is in this state in two cases: + * - it is the fallback policy, which is always in this state. + * - it has been activated by the user; i.e. it is the primary policy, + */ + NRS_POL_STATE_STARTED, +}; + +/** + * NRS policy information + * + * Used for obtaining information for the status of a policy via lprocfs + */ +struct ptlrpc_nrs_pol_info { + /** + * Policy name + */ + char pi_name[NRS_POL_NAME_MAX]; + /** + * Current policy state + */ + enum ptlrpc_nrs_pol_state pi_state; + /** + * # RPCs enqueued for later dispatching by the policy + */ + long pi_req_queued; + /** + * # RPCs started for dispatch by the policy + */ + long pi_req_started; + /** + * Is this a fallback policy? + */ + unsigned pi_fallback:1; +}; + +/** + * NRS policy + * + * There is one instance of this for each policy in each NRS head of each + * PTLRPC service partition. + */ +struct ptlrpc_nrs_policy { + /** + * Linkage into the NRS head's list of policies, + * ptlrpc_nrs:nrs_policy_list + */ + struct list_head pol_list; + /** + * Linkage into the NRS head's list of policies with enqueued + * requests ptlrpc_nrs:nrs_policy_queued + */ + struct list_head pol_list_queued; + /** + * Current state of this policy + */ + enum ptlrpc_nrs_pol_state pol_state; + /** + * Bitmask of nrs_policy_flags + */ + unsigned pol_flags; + /** + * # RPCs enqueued for later dispatching by the policy + */ + long pol_req_queued; + /** + * # RPCs started for dispatch by the policy + */ + long pol_req_started; + /** + * Usage Reference count taken on the policy instance + */ + long pol_ref; + /** + * The NRS head this policy has been created at + */ + struct ptlrpc_nrs *pol_nrs; + /** + * Private policy data; varies by policy type + */ + void *pol_private; + /** + * Policy descriptor for this policy instance. + */ + struct ptlrpc_nrs_pol_desc *pol_desc; +}; + +/** + * NRS resource + * + * Resources are embedded into two types of NRS entities: + * - Inside NRS policies, in the policy's private data in + * ptlrpc_nrs_policy::pol_private + * - In objects that act as prime-level scheduling entities in different NRS + * policies; e.g. 
on a policy that performs round robin or similar order + * scheduling across client NIDs, there would be one NRS resource per unique + * client NID. On a policy which performs round robin scheduling across + * backend filesystem objects, there would be one resource associated with + * each of the backend filesystem objects partaking in the scheduling + * performed by the policy. + * + * NRS resources share a parent-child relationship, in which resources embedded + * in policy instances are the parent entities, with all scheduling entities + * a policy schedules across being the children, thus forming a simple resource + * hierarchy. This hierarchy may be extended with one or more levels in the + * future if the ability to have more than one primary policy is added. + * + * Upon request initialization, references to the then active NRS policies are + * taken and used to later handle the dispatching of the request with one of + * these policies. + * + * \see nrs_resource_get_safe() + * \see ptlrpc_nrs_req_add() + */ +struct ptlrpc_nrs_resource { + /** + * This NRS resource's parent; is NULL for resources embedded in NRS + * policy instances; i.e. those are top-level ones. + */ + struct ptlrpc_nrs_resource *res_parent; + /** + * The policy associated with this resource. + */ + struct ptlrpc_nrs_policy *res_policy; +}; + +enum { + NRS_RES_FALLBACK, + NRS_RES_PRIMARY, + NRS_RES_MAX +}; + +/* \name fifo + * + * FIFO policy + * + * This policy is a logical wrapper around previous, non-NRS functionality. + * It dispatches RPCs in the same order as they arrive from the network. This + * policy is currently used as the fallback policy, and the only enabled policy + * on all NRS heads of all PTLRPC service partitions. + * @{ + */ + +/** + * Private data structure for the FIFO policy + */ +struct nrs_fifo_head { + /** + * Resource object for policy instance. + */ + struct ptlrpc_nrs_resource fh_res; + /** + * List of queued requests. + */ + struct list_head fh_list; + /** + * For debugging purposes. + */ + __u64 fh_sequence; +}; + +struct nrs_fifo_req { + struct list_head fr_list; + __u64 fr_sequence; +}; + +/** @} fifo */ + +/** + * NRS request + * + * Instances of this object exist embedded within ptlrpc_request; the main + * purpose of this object is to hold references to the request's resources + * for the lifetime of the request, and to hold properties that policies use + * use for determining the request's scheduling priority. + * */ +struct ptlrpc_nrs_request { + /** + * The request's resource hierarchy. + */ + struct ptlrpc_nrs_resource *nr_res_ptrs[NRS_RES_MAX]; + /** + * Index into ptlrpc_nrs_request::nr_res_ptrs of the resource of the + * policy that was used to enqueue the request. + * + * \see nrs_request_enqueue() + */ + unsigned nr_res_idx; + unsigned nr_initialized:1; + unsigned nr_enqueued:1; + unsigned nr_started:1; + unsigned nr_finalized:1; + + /** + * Policy-specific fields, used for determining a request's scheduling + * priority, and other supporting functionality. + */ + union { + /** + * Fields for the FIFO policy + */ + struct nrs_fifo_req fifo; + } nr_u; + /** + * Externally-registering policies may want to use this to allocate + * their own request properties. + */ + void *ext; +}; + +/** @} nrs */ + +/** + * Basic request prioritization operations structure. + * The whole idea is centered around locks and RPCs that might affect locks. + * When a lock is contended we try to give priority to RPCs that might lead + * to fastest release of that lock. 
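+ *
+ * As a rough, illustrative sketch only (not part of the original header;
+ * the "my_*" names are hypothetical), a server-side handler wires this up
+ * by attaching an operations table to the request:
+ *
+ * \code
+ *	static struct ptlrpc_hpreq_ops my_hpreq_ops = {
+ *		.hpreq_lock_match	= my_hpreq_lock_match,
+ *		.hpreq_check		= my_hpreq_check,
+ *		.hpreq_fini		= my_hpreq_fini,
+ *	};
+ *
+ *	req->rq_ops = &my_hpreq_ops;
+ * \endcode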
+ * Currently only implemented for OSTs only in a way that makes all + * IO and truncate RPCs that are coming from a locked region where a lock is + * contended a priority over other requests. + */ +struct ptlrpc_hpreq_ops { + /** + * Check if the lock handle of the given lock is the same as + * taken from the request. + */ + int (*hpreq_lock_match)(struct ptlrpc_request *, struct ldlm_lock *); + /** + * Check if the request is a high priority one. + */ + int (*hpreq_check)(struct ptlrpc_request *); + /** + * Called after the request has been handled. + */ + void (*hpreq_fini)(struct ptlrpc_request *); +}; + +/** + * Represents remote procedure call. + * + * This is a staple structure used by everybody wanting to send a request + * in Lustre. + */ +struct ptlrpc_request { + /* Request type: one of PTL_RPC_MSG_* */ + int rq_type; + /** Result of request processing */ + int rq_status; + /** + * Linkage item through which this request is included into + * sending/delayed lists on client and into rqbd list on server + */ + struct list_head rq_list; + /** + * Server side list of incoming unserved requests sorted by arrival + * time. Traversed from time to time to notice about to expire + * requests and sent back "early replies" to clients to let them + * know server is alive and well, just very busy to service their + * requests in time + */ + struct list_head rq_timed_list; + /** server-side history, used for debugging purposes. */ + struct list_head rq_history_list; + /** server-side per-export list */ + struct list_head rq_exp_list; + /** server-side hp handlers */ + struct ptlrpc_hpreq_ops *rq_ops; + + /** initial thread servicing this request */ + struct ptlrpc_thread *rq_svc_thread; + + /** history sequence # */ + __u64 rq_history_seq; + /** \addtogroup nrs + * @{ + */ + /** stub for NRS request */ + struct ptlrpc_nrs_request rq_nrq; + /** @} nrs */ + /** the index of service's srv_at_array into which request is linked */ + time_t rq_at_index; + /** Lock to protect request flags and some other important bits, like + * rq_list + */ + spinlock_t rq_lock; + /** client-side flags are serialized by rq_lock */ + unsigned int rq_intr:1, rq_replied:1, rq_err:1, + rq_timedout:1, rq_resend:1, rq_restart:1, + /** + * when ->rq_replay is set, request is kept by the client even + * after server commits corresponding transaction. This is + * used for operations that require sequence of multiple + * requests to be replayed. The only example currently is file + * open/close. When last request in such a sequence is + * committed, ->rq_replay is cleared on all requests in the + * sequence. 
+ */ + rq_replay:1, + rq_no_resend:1, rq_waiting:1, rq_receiving_reply:1, + rq_no_delay:1, rq_net_err:1, rq_wait_ctx:1, + rq_early:1, + rq_req_unlink:1, rq_reply_unlink:1, + rq_memalloc:1, /* req originated from "kswapd" */ + /* server-side flags */ + rq_packed_final:1, /* packed final reply */ + rq_hp:1, /* high priority RPC */ + rq_at_linked:1, /* link into service's srv_at_array */ + rq_reply_truncate:1, + rq_committed:1, + /* whether the "rq_set" is a valid one */ + rq_invalid_rqset:1, + rq_generation_set:1, + /* do not resend request on -EINPROGRESS */ + rq_no_retry_einprogress:1, + /* allow the req to be sent if the import is in recovery + * status */ + rq_allow_replay:1; + + unsigned int rq_nr_resend; + + enum rq_phase rq_phase; /* one of RQ_PHASE_* */ + enum rq_phase rq_next_phase; /* one of RQ_PHASE_* to be used next */ + atomic_t rq_refcount;/* client-side refcount for SENT race, + server-side refcount for multiple replies */ + + /** Portal to which this request would be sent */ + short rq_request_portal; /* XXX FIXME bug 249 */ + /** Portal where to wait for reply and where reply would be sent */ + short rq_reply_portal; /* XXX FIXME bug 249 */ + + /** + * client-side: + * !rq_truncate : # reply bytes actually received, + * rq_truncate : required repbuf_len for resend + */ + int rq_nob_received; + /** Request length */ + int rq_reqlen; + /** Reply length */ + int rq_replen; + /** Request message - what client sent */ + struct lustre_msg *rq_reqmsg; + /** Reply message - server response */ + struct lustre_msg *rq_repmsg; + /** Transaction number */ + __u64 rq_transno; + /** xid */ + __u64 rq_xid; + /** + * List item to for replay list. Not yet committed requests get linked + * there. + * Also see \a rq_replay comment above. + */ + struct list_head rq_replay_list; + + /** + * security and encryption data + * @{ */ + struct ptlrpc_cli_ctx *rq_cli_ctx; /**< client's half ctx */ + struct ptlrpc_svc_ctx *rq_svc_ctx; /**< server's half ctx */ + struct list_head rq_ctx_chain; /**< link to waited ctx */ + + struct sptlrpc_flavor rq_flvr; /**< for client & server */ + enum lustre_sec_part rq_sp_from; + + /* client/server security flags */ + unsigned int + rq_ctx_init:1, /* context initiation */ + rq_ctx_fini:1, /* context destroy */ + rq_bulk_read:1, /* request bulk read */ + rq_bulk_write:1, /* request bulk write */ + /* server authentication flags */ + rq_auth_gss:1, /* authenticated by gss */ + rq_auth_remote:1, /* authed as remote user */ + rq_auth_usr_root:1, /* authed as root */ + rq_auth_usr_mdt:1, /* authed as mdt */ + rq_auth_usr_ost:1, /* authed as ost */ + /* security tfm flags */ + rq_pack_udesc:1, + rq_pack_bulk:1, + /* doesn't expect reply FIXME */ + rq_no_reply:1, + rq_pill_init:1; /* pill initialized */ + + uid_t rq_auth_uid; /* authed uid */ + uid_t rq_auth_mapped_uid; /* authed uid mapped to */ + + /* (server side), pointed directly into req buffer */ + struct ptlrpc_user_desc *rq_user_desc; + + /* various buffer pointers */ + struct lustre_msg *rq_reqbuf; /* req wrapper */ + char *rq_repbuf; /* rep buffer */ + struct lustre_msg *rq_repdata; /* rep wrapper msg */ + struct lustre_msg *rq_clrbuf; /* only in priv mode */ + int rq_reqbuf_len; /* req wrapper buf len */ + int rq_reqdata_len; /* req wrapper msg len */ + int rq_repbuf_len; /* rep buffer len */ + int rq_repdata_len; /* rep wrapper msg len */ + int rq_clrbuf_len; /* only in priv mode */ + int rq_clrdata_len; /* only in priv mode */ + + /** early replies go to offset 0, regular replies go after that */ + unsigned 
int rq_reply_off; + + /** @} */ + + /** Fields that help to see if request and reply were swabbed or not */ + __u32 rq_req_swab_mask; + __u32 rq_rep_swab_mask; + + /** What was import generation when this request was sent */ + int rq_import_generation; + enum lustre_imp_state rq_send_state; + + /** how many early replies (for stats) */ + int rq_early_count; + + /** client+server request */ + lnet_handle_md_t rq_req_md_h; + struct ptlrpc_cb_id rq_req_cbid; + /** optional time limit for send attempts */ + long rq_delay_limit; + /** time request was first queued */ + unsigned long rq_queued_time; + + /* server-side... */ + /** request arrival time */ + struct timeval rq_arrival_time; + /** separated reply state */ + struct ptlrpc_reply_state *rq_reply_state; + /** incoming request buffer */ + struct ptlrpc_request_buffer_desc *rq_rqbd; + + /** client-only incoming reply */ + lnet_handle_md_t rq_reply_md_h; + wait_queue_head_t rq_reply_waitq; + struct ptlrpc_cb_id rq_reply_cbid; + + /** our LNet NID */ + lnet_nid_t rq_self; + /** Peer description (the other side) */ + lnet_process_id_t rq_peer; + /** Server-side, export on which request was received */ + struct obd_export *rq_export; + /** Client side, import where request is being sent */ + struct obd_import *rq_import; + + /** Replay callback, called after request is replayed at recovery */ + void (*rq_replay_cb)(struct ptlrpc_request *); + /** + * Commit callback, called when request is committed and about to be + * freed. + */ + void (*rq_commit_cb)(struct ptlrpc_request *); + /** Opaq data for replay and commit callbacks. */ + void *rq_cb_data; + + /** For bulk requests on client only: bulk descriptor */ + struct ptlrpc_bulk_desc *rq_bulk; + + /** client outgoing req */ + /** + * when request/reply sent (secs), or time when request should be sent + */ + time_t rq_sent; + /** time for request really sent out */ + time_t rq_real_sent; + + /** when request must finish. volatile + * so that servers' early reply updates to the deadline aren't + * kept in per-cpu cache */ + volatile time_t rq_deadline; + /** when req reply unlink must finish. */ + time_t rq_reply_deadline; + /** when req bulk unlink must finish. */ + time_t rq_bulk_deadline; + /** + * service time estimate (secs) + * If the requestsis not served by this time, it is marked as timed out. + */ + int rq_timeout; + + /** Multi-rpc bits */ + /** Per-request waitq introduced by bug 21938 for recovery waiting */ + wait_queue_head_t rq_set_waitq; + /** Link item for request set lists */ + struct list_head rq_set_chain; + /** Link back to the request set */ + struct ptlrpc_request_set *rq_set; + /** Async completion handler, called when reply is received */ + ptlrpc_interpterer_t rq_interpret_reply; + /** Async completion context */ + union ptlrpc_async_args rq_async_args; + + /** Pool if request is from preallocated list */ + struct ptlrpc_request_pool *rq_pool; + + struct lu_context rq_session; + struct lu_context rq_recov_session; + + /** request format description */ + struct req_capsule rq_pill; +}; + +/** + * Call completion handler for rpc if any, return it's status or original + * rc if there was no handler defined for this request. 
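+ *
+ * As a rough, illustrative sketch only (not part of the original header;
+ * "my_interpret" is a hypothetical name and its signature is assumed to
+ * match ptlrpc_interpterer_t), a caller wanting asynchronous completion
+ * sets the handler and queues the request on a set:
+ *
+ * \code
+ *	static int my_interpret(const struct lu_env *env,
+ *				struct ptlrpc_request *req,
+ *				void *args, int rc)
+ *	{
+ *		return rc;
+ *	}
+ *
+ *	req->rq_interpret_reply = my_interpret;
+ *	ptlrpc_set_add_req(set, req);
+ * \endcode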
+ */ +static inline int ptlrpc_req_interpret(const struct lu_env *env, + struct ptlrpc_request *req, int rc) +{ + if (req->rq_interpret_reply != NULL) { + req->rq_status = req->rq_interpret_reply(env, req, + &req->rq_async_args, + rc); + return req->rq_status; + } + return rc; +} + +/** \addtogroup nrs + * @{ + */ +int ptlrpc_nrs_policy_register(struct ptlrpc_nrs_pol_conf *conf); +int ptlrpc_nrs_policy_unregister(struct ptlrpc_nrs_pol_conf *conf); +void ptlrpc_nrs_req_hp_move(struct ptlrpc_request *req); +void nrs_policy_get_info_locked(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_pol_info *info); + +/* + * Can the request be moved from the regular NRS head to the high-priority NRS + * head (of the same PTLRPC service partition), if any? + * + * For a reliable result, this should be checked under svcpt->scp_req lock. + */ +static inline bool ptlrpc_nrs_req_can_move(struct ptlrpc_request *req) +{ + struct ptlrpc_nrs_request *nrq = &req->rq_nrq; + + /** + * LU-898: Check ptlrpc_nrs_request::nr_enqueued to make sure the + * request has been enqueued first, and ptlrpc_nrs_request::nr_started + * to make sure it has not been scheduled yet (analogous to previous + * (non-NRS) checking of !list_empty(&ptlrpc_request::rq_list). + */ + return nrq->nr_enqueued && !nrq->nr_started && !req->rq_hp; +} +/** @} nrs */ + +/** + * Returns 1 if request buffer at offset \a index was already swabbed + */ +static inline int lustre_req_swabbed(struct ptlrpc_request *req, int index) +{ + LASSERT(index < sizeof(req->rq_req_swab_mask) * 8); + return req->rq_req_swab_mask & (1 << index); +} + +/** + * Returns 1 if request reply buffer at offset \a index was already swabbed + */ +static inline int lustre_rep_swabbed(struct ptlrpc_request *req, int index) +{ + LASSERT(index < sizeof(req->rq_rep_swab_mask) * 8); + return req->rq_rep_swab_mask & (1 << index); +} + +/** + * Returns 1 if request needs to be swabbed into local cpu byteorder + */ +static inline int ptlrpc_req_need_swab(struct ptlrpc_request *req) +{ + return lustre_req_swabbed(req, MSG_PTLRPC_HEADER_OFF); +} + +/** + * Returns 1 if request reply needs to be swabbed into local cpu byteorder + */ +static inline int ptlrpc_rep_need_swab(struct ptlrpc_request *req) +{ + return lustre_rep_swabbed(req, MSG_PTLRPC_HEADER_OFF); +} + +/** + * Mark request buffer at offset \a index that it was already swabbed + */ +static inline void lustre_set_req_swabbed(struct ptlrpc_request *req, int index) +{ + LASSERT(index < sizeof(req->rq_req_swab_mask) * 8); + LASSERT((req->rq_req_swab_mask & (1 << index)) == 0); + req->rq_req_swab_mask |= 1 << index; +} + +/** + * Mark request reply buffer at offset \a index that it was already swabbed + */ +static inline void lustre_set_rep_swabbed(struct ptlrpc_request *req, int index) +{ + LASSERT(index < sizeof(req->rq_rep_swab_mask) * 8); + LASSERT((req->rq_rep_swab_mask & (1 << index)) == 0); + req->rq_rep_swab_mask |= 1 << index; +} + +/** + * Convert numerical request phase value \a phase into text string description + */ +static inline const char * +ptlrpc_phase2str(enum rq_phase phase) +{ + switch (phase) { + case RQ_PHASE_NEW: + return "New"; + case RQ_PHASE_RPC: + return "Rpc"; + case RQ_PHASE_BULK: + return "Bulk"; + case RQ_PHASE_INTERPRET: + return "Interpret"; + case RQ_PHASE_COMPLETE: + return "Complete"; + case RQ_PHASE_UNREGISTERING: + return "Unregistering"; + default: + return "?Phase?"; + } +} + +/** + * Convert numerical request phase of the request \a req into text stringi + * description + */ +static 
inline const char * +ptlrpc_rqphase2str(struct ptlrpc_request *req) +{ + return ptlrpc_phase2str(req->rq_phase); +} + +/** + * Debugging functions and helpers to print request structure into debug log + * @{ + */ +/* Spare the preprocessor, spoil the bugs. */ +#define FLAG(field, str) (field ? str : "") + +/** Convert bit flags into a string */ +#define DEBUG_REQ_FLAGS(req) \ + ptlrpc_rqphase2str(req), \ + FLAG(req->rq_intr, "I"), FLAG(req->rq_replied, "R"), \ + FLAG(req->rq_err, "E"), \ + FLAG(req->rq_timedout, "X") /* eXpired */, FLAG(req->rq_resend, "S"), \ + FLAG(req->rq_restart, "T"), FLAG(req->rq_replay, "P"), \ + FLAG(req->rq_no_resend, "N"), \ + FLAG(req->rq_waiting, "W"), \ + FLAG(req->rq_wait_ctx, "C"), FLAG(req->rq_hp, "H"), \ + FLAG(req->rq_committed, "M") + +#define REQ_FLAGS_FMT "%s:%s%s%s%s%s%s%s%s%s%s%s%s" + +void _debug_req(struct ptlrpc_request *req, + struct libcfs_debug_msg_data *data, const char *fmt, ...) + __printf(3, 4); + +/** + * Helper that decides if we need to print request according to current debug + * level settings + */ +#define debug_req(msgdata, mask, cdls, req, fmt, a...) \ +do { \ + CFS_CHECK_STACK(msgdata, mask, cdls); \ + \ + if (((mask) & D_CANTMASK) != 0 || \ + ((libcfs_debug & (mask)) != 0 && \ + (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0)) \ + _debug_req((req), msgdata, fmt, ##a); \ +} while (0) + +/** + * This is the debug print function you need to use to print request structure + * content into lustre debug log. + * For most callers (level is a constant) this is resolved at compile time */ +#define DEBUG_REQ(level, req, fmt, args...) \ +do { \ + if ((level) & (D_ERROR | D_WARNING)) { \ + static struct cfs_debug_limit_state cdls; \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, &cdls); \ + debug_req(&msgdata, level, &cdls, req, "@@@ "fmt" ", ## args);\ + } else { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, NULL); \ + debug_req(&msgdata, level, NULL, req, "@@@ "fmt" ", ## args); \ + } \ +} while (0) +/** @} */ + +/** + * Structure that defines a single page of a bulk transfer + */ +struct ptlrpc_bulk_page { + /** Linkage to list of pages in a bulk */ + struct list_head bp_link; + /** + * Number of bytes in a page to transfer starting from \a bp_pageoffset + */ + int bp_buflen; + /** offset within a page */ + int bp_pageoffset; + /** The page itself */ + struct page *bp_page; +}; + +#define BULK_GET_SOURCE 0 +#define BULK_PUT_SINK 1 +#define BULK_GET_SINK 2 +#define BULK_PUT_SOURCE 3 + +/** + * Definition of bulk descriptor. + * Bulks are special "Two phase" RPCs where initial request message + * is sent first and it is followed by a transfer (or receiving) of a large + * amount of data to be settled into pages referenced from the bulk descriptors. + * Bulk transfers (the actual data following the small requests) are done + * on separate LNet portals. + * In lustre we use bulk transfers for READ and WRITE transfers from/to OSTs. + * Another user is readpage for MDT. 
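+ *
+ * As a rough, illustrative client-side sketch only (not part of the
+ * original header; "npages" and "pages" are placeholders, the portal and
+ * page-size constants are only indicative, and error handling is omitted),
+ * a bulk write could be prepared as:
+ *
+ * \code
+ *	desc = ptlrpc_prep_bulk_imp(req, npages, 1, BULK_GET_SOURCE,
+ *				    OST_BULK_PORTAL);
+ *	for (i = 0; i < npages; i++)
+ *		ptlrpc_prep_bulk_page_pin(desc, pages[i], 0, PAGE_CACHE_SIZE);
+ * \endcode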
+ */ +struct ptlrpc_bulk_desc { + /** completed with failure */ + unsigned long bd_failure:1; + /** {put,get}{source,sink} */ + unsigned long bd_type:2; + /** client side */ + unsigned long bd_registered:1; + /** For serialization with callback */ + spinlock_t bd_lock; + /** Import generation when request for this bulk was sent */ + int bd_import_generation; + /** LNet portal for this bulk */ + __u32 bd_portal; + /** Server side - export this bulk created for */ + struct obd_export *bd_export; + /** Client side - import this bulk was sent on */ + struct obd_import *bd_import; + /** Back pointer to the request */ + struct ptlrpc_request *bd_req; + wait_queue_head_t bd_waitq; /* server side only WQ */ + int bd_iov_count; /* # entries in bd_iov */ + int bd_max_iov; /* allocated size of bd_iov */ + int bd_nob; /* # bytes covered */ + int bd_nob_transferred; /* # bytes GOT/PUT */ + + __u64 bd_last_xid; + + struct ptlrpc_cb_id bd_cbid; /* network callback info */ + lnet_nid_t bd_sender; /* stash event::sender */ + int bd_md_count; /* # valid entries in bd_mds */ + int bd_md_max_brw; /* max entries in bd_mds */ + /** array of associated MDs */ + lnet_handle_md_t bd_mds[PTLRPC_BULK_OPS_COUNT]; + + /* + * encrypt iov, size is either 0 or bd_iov_count. + */ + lnet_kiov_t *bd_enc_iov; + + lnet_kiov_t bd_iov[0]; +}; + +enum { + SVC_STOPPED = 1 << 0, + SVC_STOPPING = 1 << 1, + SVC_STARTING = 1 << 2, + SVC_RUNNING = 1 << 3, + SVC_EVENT = 1 << 4, + SVC_SIGNAL = 1 << 5, +}; + +#define PTLRPC_THR_NAME_LEN 32 +/** + * Definition of server service thread structure + */ +struct ptlrpc_thread { + /** + * List of active threads in svc->srv_threads + */ + struct list_head t_link; + /** + * thread-private data (preallocated memory) + */ + void *t_data; + __u32 t_flags; + /** + * service thread index, from ptlrpc_start_threads + */ + unsigned int t_id; + /** + * service thread pid + */ + pid_t t_pid; + /** + * put watchdog in the structure per thread b=14840 + * + * Lustre watchdog is removed for client in the hope + * of a generic watchdog can be merged in kernel. + * When that happens, we should add below back. 
+ * + * struct lc_watchdog *t_watchdog; + */ + /** + * the svc this thread belonged to b=18582 + */ + struct ptlrpc_service_part *t_svcpt; + wait_queue_head_t t_ctl_waitq; + struct lu_env *t_env; + char t_name[PTLRPC_THR_NAME_LEN]; +}; + +static inline int thread_is_init(struct ptlrpc_thread *thread) +{ + return thread->t_flags == 0; +} + +static inline int thread_is_stopped(struct ptlrpc_thread *thread) +{ + return !!(thread->t_flags & SVC_STOPPED); +} + +static inline int thread_is_stopping(struct ptlrpc_thread *thread) +{ + return !!(thread->t_flags & SVC_STOPPING); +} + +static inline int thread_is_starting(struct ptlrpc_thread *thread) +{ + return !!(thread->t_flags & SVC_STARTING); +} + +static inline int thread_is_running(struct ptlrpc_thread *thread) +{ + return !!(thread->t_flags & SVC_RUNNING); +} + +static inline int thread_is_event(struct ptlrpc_thread *thread) +{ + return !!(thread->t_flags & SVC_EVENT); +} + +static inline int thread_is_signal(struct ptlrpc_thread *thread) +{ + return !!(thread->t_flags & SVC_SIGNAL); +} + +static inline void thread_clear_flags(struct ptlrpc_thread *thread, __u32 flags) +{ + thread->t_flags &= ~flags; +} + +static inline void thread_set_flags(struct ptlrpc_thread *thread, __u32 flags) +{ + thread->t_flags = flags; +} + +static inline void thread_add_flags(struct ptlrpc_thread *thread, __u32 flags) +{ + thread->t_flags |= flags; +} + +static inline int thread_test_and_clear_flags(struct ptlrpc_thread *thread, + __u32 flags) +{ + if (thread->t_flags & flags) { + thread->t_flags &= ~flags; + return 1; + } + return 0; +} + +/** + * Request buffer descriptor structure. + * This is a structure that contains one posted request buffer for service. + * Once data land into a buffer, event callback creates actual request and + * notifies wakes one of the service threads to process new incoming request. + * More than one request can fit into the buffer. + */ +struct ptlrpc_request_buffer_desc { + /** Link item for rqbds on a service */ + struct list_head rqbd_list; + /** History of requests for this buffer */ + struct list_head rqbd_reqs; + /** Back pointer to service for which this buffer is registered */ + struct ptlrpc_service_part *rqbd_svcpt; + /** LNet descriptor */ + lnet_handle_md_t rqbd_md_h; + int rqbd_refcount; + /** The buffer itself */ + char *rqbd_buffer; + struct ptlrpc_cb_id rqbd_cbid; + /** + * This "embedded" request structure is only used for the + * last request to fit into the buffer + */ + struct ptlrpc_request rqbd_req; +}; + +typedef int (*svc_handler_t)(struct ptlrpc_request *req); + +struct ptlrpc_service_ops { + /** + * if non-NULL called during thread creation (ptlrpc_start_thread()) + * to initialize service specific per-thread state. + */ + int (*so_thr_init)(struct ptlrpc_thread *thr); + /** + * if non-NULL called during thread shutdown (ptlrpc_main()) to + * destruct state created by ->srv_init(). 
+ */ + void (*so_thr_done)(struct ptlrpc_thread *thr); + /** + * Handler function for incoming requests for this service + */ + int (*so_req_handler)(struct ptlrpc_request *req); + /** + * function to determine priority of the request, it's called + * on every new request + */ + int (*so_hpreq_handler)(struct ptlrpc_request *); + /** + * service-specific print fn + */ + void (*so_req_printer)(void *, struct ptlrpc_request *); +}; + +#ifndef __cfs_cacheline_aligned +/* NB: put it here for reducing patch dependence */ +# define __cfs_cacheline_aligned +#endif + +/** + * How many high priority requests to serve before serving one normal + * priority request + */ +#define PTLRPC_SVC_HP_RATIO 10 + +/** + * Definition of PortalRPC service. + * The service is listening on a particular portal (like tcp port) + * and performs actions for a specific server like IO service for OST + * or general metadata service for MDS. + */ +struct ptlrpc_service { + /** serialize /proc operations */ + spinlock_t srv_lock; + /** most often accessed fields */ + /** chain thru all services */ + struct list_head srv_list; + /** service operations table */ + struct ptlrpc_service_ops srv_ops; + /** only statically allocated strings here; we don't clean them */ + char *srv_name; + /** only statically allocated strings here; we don't clean them */ + char *srv_thread_name; + /** service thread list */ + struct list_head srv_threads; + /** threads # should be created for each partition on initializing */ + int srv_nthrs_cpt_init; + /** limit of threads number for each partition */ + int srv_nthrs_cpt_limit; + /** Root of /proc dir tree for this service */ + struct proc_dir_entry *srv_procroot; + /** Pointer to statistic data for this service */ + struct lprocfs_stats *srv_stats; + /** # hp per lp reqs to handle */ + int srv_hpreq_ratio; + /** biggest request to receive */ + int srv_max_req_size; + /** biggest reply to send */ + int srv_max_reply_size; + /** size of individual buffers */ + int srv_buf_size; + /** # buffers to allocate in 1 group */ + int srv_nbuf_per_group; + /** Local portal on which to receive requests */ + __u32 srv_req_portal; + /** Portal on the client to send replies to */ + __u32 srv_rep_portal; + /** + * Tags for lu_context associated with this thread, see struct + * lu_context. + */ + __u32 srv_ctx_tags; + /** soft watchdog timeout multiplier */ + int srv_watchdog_factor; + /** under unregister_service */ + unsigned srv_is_stopping:1; + + /** max # request buffers in history per partition */ + int srv_hist_nrqbds_cpt_max; + /** number of CPTs this service bound on */ + int srv_ncpts; + /** CPTs array this service bound on */ + __u32 *srv_cpts; + /** 2^srv_cpt_bits >= cfs_cpt_number(srv_cptable) */ + int srv_cpt_bits; + /** CPT table this service is running over */ + struct cfs_cpt_table *srv_cptable; + /** + * partition data for ptlrpc service + */ + struct ptlrpc_service_part *srv_parts[0]; +}; + +/** + * Definition of PortalRPC service partition data. + * Although a service only has one instance of it right now, we + * will have multiple instances very soon (instance per CPT). 
+ * + * it has four locks: + * \a scp_lock + * serialize operations on rqbd and requests waiting for preprocess + * \a scp_req_lock + * serialize operations active requests sent to this portal + * \a scp_at_lock + * serialize adaptive timeout stuff + * \a scp_rep_lock + * serialize operations on RS list (reply states) + * + * We don't have any use-case to take two or more locks at the same time + * for now, so there is no lock order issue. + */ +struct ptlrpc_service_part { + /** back reference to owner */ + struct ptlrpc_service *scp_service __cfs_cacheline_aligned; + /* CPT id, reserved */ + int scp_cpt; + /** always increasing number */ + int scp_thr_nextid; + /** # of starting threads */ + int scp_nthrs_starting; + /** # of stopping threads, reserved for shrinking threads */ + int scp_nthrs_stopping; + /** # running threads */ + int scp_nthrs_running; + /** service threads list */ + struct list_head scp_threads; + + /** + * serialize the following fields, used for protecting + * rqbd list and incoming requests waiting for preprocess, + * threads starting & stopping are also protected by this lock. + */ + spinlock_t scp_lock __cfs_cacheline_aligned; + /** total # req buffer descs allocated */ + int scp_nrqbds_total; + /** # posted request buffers for receiving */ + int scp_nrqbds_posted; + /** in progress of allocating rqbd */ + int scp_rqbd_allocating; + /** # incoming reqs */ + int scp_nreqs_incoming; + /** request buffers to be reposted */ + struct list_head scp_rqbd_idle; + /** req buffers receiving */ + struct list_head scp_rqbd_posted; + /** incoming reqs */ + struct list_head scp_req_incoming; + /** timeout before re-posting reqs, in tick */ + long scp_rqbd_timeout; + /** + * all threads sleep on this. This wait-queue is signalled when new + * incoming request arrives and when difficult reply has to be handled. 
+ */ + wait_queue_head_t scp_waitq; + + /** request history */ + struct list_head scp_hist_reqs; + /** request buffer history */ + struct list_head scp_hist_rqbds; + /** # request buffers in history */ + int scp_hist_nrqbds; + /** sequence number for request */ + __u64 scp_hist_seq; + /** highest seq culled from history */ + __u64 scp_hist_seq_culled; + + /** + * serialize the following fields, used for processing requests + * sent to this portal + */ + spinlock_t scp_req_lock __cfs_cacheline_aligned; + /** # reqs in either of the NRS heads below */ + /** # reqs being served */ + int scp_nreqs_active; + /** # HPreqs being served */ + int scp_nhreqs_active; + /** # hp requests handled */ + int scp_hreq_count; + + /** NRS head for regular requests */ + struct ptlrpc_nrs scp_nrs_reg; + /** NRS head for HP requests; this is only valid for services that can + * handle HP requests */ + struct ptlrpc_nrs *scp_nrs_hp; + + /** AT stuff */ + /** @{ */ + /** + * serialize the following fields, used for changes on + * adaptive timeout + */ + spinlock_t scp_at_lock __cfs_cacheline_aligned; + /** estimated rpc service time */ + struct adaptive_timeout scp_at_estimate; + /** reqs waiting for replies */ + struct ptlrpc_at_array scp_at_array; + /** early reply timer */ + struct timer_list scp_at_timer; + /** debug */ + unsigned long scp_at_checktime; + /** check early replies */ + unsigned scp_at_check; + /** @} */ + + /** + * serialize the following fields, used for processing + * replies for this portal + */ + spinlock_t scp_rep_lock __cfs_cacheline_aligned; + /** all the active replies */ + struct list_head scp_rep_active; + /** List of free reply_states */ + struct list_head scp_rep_idle; + /** waitq to run, when adding stuff to srv_free_rs_list */ + wait_queue_head_t scp_rep_waitq; + /** # 'difficult' replies */ + atomic_t scp_nreps_difficult; +}; + +#define ptlrpc_service_for_each_part(part, i, svc) \ + for (i = 0; \ + i < (svc)->srv_ncpts && \ + (svc)->srv_parts != NULL && \ + ((part) = (svc)->srv_parts[i]) != NULL; i++) + +/** + * Declaration of ptlrpcd control structure + */ +struct ptlrpcd_ctl { + /** + * Ptlrpc thread control flags (LIOD_START, LIOD_STOP, LIOD_FORCE) + */ + unsigned long pc_flags; + /** + * Thread lock protecting structure fields. + */ + spinlock_t pc_lock; + /** + * Start completion. + */ + struct completion pc_starting; + /** + * Stop completion. + */ + struct completion pc_finishing; + /** + * Thread requests set. + */ + struct ptlrpc_request_set *pc_set; + /** + * Thread name used in cfs_daemonize() + */ + char pc_name[16]; + /** + * Environment for request interpreters to run in. + */ + struct lu_env pc_env; + /** + * Index of ptlrpcd thread in the array. + */ + int pc_index; + /** + * Number of the ptlrpcd's partners. + */ + int pc_npartners; + /** + * Pointer to the array of partners' ptlrpcd_ctl structure. + */ + struct ptlrpcd_ctl **pc_partners; + /** + * Record the partner index to be processed next. + */ + int pc_cursor; +}; + +/* Bits for pc_flags */ +enum ptlrpcd_ctl_flags { + /** + * Ptlrpc thread start flag. + */ + LIOD_START = 1 << 0, + /** + * Ptlrpc thread stop flag. + */ + LIOD_STOP = 1 << 1, + /** + * Ptlrpc thread force flag (only stop force so far). + * This will cause aborting any inflight rpcs handled + * by thread if LIOD_STOP is specified. + */ + LIOD_FORCE = 1 << 2, + /** + * This is a recovery ptlrpc thread. + */ + LIOD_RECOVERY = 1 << 3, + /** + * The ptlrpcd is bound to some CPU core. 
+ */ + LIOD_BIND = 1 << 4, +}; + +/** + * \addtogroup nrs + * @{ + * + * Service compatibility function; the policy is compatible with all services. + * + * \param[in] svc The service the policy is attempting to register with. + * \param[in] desc The policy descriptor + * + * \retval true The policy is compatible with the service + * + * \see ptlrpc_nrs_pol_desc::pd_compat() + */ +static inline bool nrs_policy_compat_all(const struct ptlrpc_service *svc, + const struct ptlrpc_nrs_pol_desc *desc) +{ + return true; +} + +/** + * Service compatibility function; the policy is compatible with only a specific + * service which is identified by its human-readable name at + * ptlrpc_service::srv_name. + * + * \param[in] svc The service the policy is attempting to register with. + * \param[in] desc The policy descriptor + * + * \retval false The policy is not compatible with the service + * \retval true The policy is compatible with the service + * + * \see ptlrpc_nrs_pol_desc::pd_compat() + */ +static inline bool nrs_policy_compat_one(const struct ptlrpc_service *svc, + const struct ptlrpc_nrs_pol_desc *desc) +{ + LASSERT(desc->pd_compat_svc_name != NULL); + return strcmp(svc->srv_name, desc->pd_compat_svc_name) == 0; +} + +/** @} nrs */ + +/* ptlrpc/events.c */ +extern lnet_handle_eq_t ptlrpc_eq_h; +extern int ptlrpc_uuid_to_peer(struct obd_uuid *uuid, + lnet_process_id_t *peer, lnet_nid_t *self); +/** + * These callbacks are invoked by LNet when something happened to + * underlying buffer + * @{ + */ +extern void request_out_callback(lnet_event_t *ev); +extern void reply_in_callback(lnet_event_t *ev); +extern void client_bulk_callback(lnet_event_t *ev); +extern void request_in_callback(lnet_event_t *ev); +extern void reply_out_callback(lnet_event_t *ev); +/** @} */ + +/* ptlrpc/connection.c */ +struct ptlrpc_connection *ptlrpc_connection_get(lnet_process_id_t peer, + lnet_nid_t self, + struct obd_uuid *uuid); +int ptlrpc_connection_put(struct ptlrpc_connection *c); +struct ptlrpc_connection *ptlrpc_connection_addref(struct ptlrpc_connection *); +int ptlrpc_connection_init(void); +void ptlrpc_connection_fini(void); +extern lnet_pid_t ptl_get_pid(void); + +/* ptlrpc/niobuf.c */ +/** + * Actual interfacing with LNet to put/get/register/unregister stuff + * @{ + */ + +int ptlrpc_register_bulk(struct ptlrpc_request *req); +int ptlrpc_unregister_bulk(struct ptlrpc_request *req, int async); + +static inline int ptlrpc_client_bulk_active(struct ptlrpc_request *req) +{ + struct ptlrpc_bulk_desc *desc; + int rc; + + LASSERT(req != NULL); + desc = req->rq_bulk; + + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK) && + req->rq_bulk_deadline > get_seconds()) + return 1; + + if (!desc) + return 0; + + spin_lock(&desc->bd_lock); + rc = desc->bd_md_count; + spin_unlock(&desc->bd_lock); + return rc; +} + +#define PTLRPC_REPLY_MAYBE_DIFFICULT 0x01 +#define PTLRPC_REPLY_EARLY 0x02 +int ptlrpc_send_reply(struct ptlrpc_request *req, int flags); +int ptlrpc_reply(struct ptlrpc_request *req); +int ptlrpc_send_error(struct ptlrpc_request *req, int difficult); +int ptlrpc_error(struct ptlrpc_request *req); +void ptlrpc_resend_req(struct ptlrpc_request *request); +int ptlrpc_at_get_net_latency(struct ptlrpc_request *req); +int ptl_send_rpc(struct ptlrpc_request *request, int noreply); +int ptlrpc_register_rqbd(struct ptlrpc_request_buffer_desc *rqbd); +/** @} */ + +/* ptlrpc/client.c */ +/** + * Client-side portals API. Everything to send requests, receive replies, + * request queues, request management, etc. 
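+ *
+ * As a rough, illustrative sketch only (not part of the original header;
+ * "imp", "format", "version" and "opcode" are placeholders and error
+ * handling is omitted), the common synchronous pattern is:
+ *
+ * \code
+ *	req = ptlrpc_request_alloc_pack(imp, format, version, opcode);
+ *	if (req == NULL)
+ *		return -ENOMEM;
+ *	ptlrpc_request_set_replen(req);
+ *	rc = ptlrpc_queue_wait(req);
+ *	ptlrpc_req_finished(req);
+ * \endcode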
+ * @{ + */ +void ptlrpc_request_committed(struct ptlrpc_request *req, int force); + +void ptlrpc_init_client(int req_portal, int rep_portal, char *name, + struct ptlrpc_client *); +void ptlrpc_cleanup_client(struct obd_import *imp); +struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid); + +int ptlrpc_queue_wait(struct ptlrpc_request *req); +int ptlrpc_replay_req(struct ptlrpc_request *req); +int ptlrpc_unregister_reply(struct ptlrpc_request *req, int async); +void ptlrpc_restart_req(struct ptlrpc_request *req); +void ptlrpc_abort_inflight(struct obd_import *imp); +void ptlrpc_cleanup_imp(struct obd_import *imp); +void ptlrpc_abort_set(struct ptlrpc_request_set *set); + +struct ptlrpc_request_set *ptlrpc_prep_set(void); +struct ptlrpc_request_set *ptlrpc_prep_fcset(int max, set_producer_func func, + void *arg); +int ptlrpc_set_add_cb(struct ptlrpc_request_set *set, + set_interpreter_func fn, void *data); +int ptlrpc_set_next_timeout(struct ptlrpc_request_set *); +int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set); +int ptlrpc_set_wait(struct ptlrpc_request_set *); +int ptlrpc_expired_set(void *data); +void ptlrpc_interrupted_set(void *data); +void ptlrpc_mark_interrupted(struct ptlrpc_request *req); +void ptlrpc_set_destroy(struct ptlrpc_request_set *); +void ptlrpc_set_add_req(struct ptlrpc_request_set *, struct ptlrpc_request *); +void ptlrpc_set_add_new_req(struct ptlrpcd_ctl *pc, + struct ptlrpc_request *req); + +void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool); +void ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq); + +struct ptlrpc_request_pool * +ptlrpc_init_rq_pool(int, int, + void (*populate_pool)(struct ptlrpc_request_pool *, int)); + +void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req); +struct ptlrpc_request *ptlrpc_request_alloc(struct obd_import *imp, + const struct req_format *format); +struct ptlrpc_request *ptlrpc_request_alloc_pool(struct obd_import *imp, + struct ptlrpc_request_pool *, + const struct req_format *format); +void ptlrpc_request_free(struct ptlrpc_request *request); +int ptlrpc_request_pack(struct ptlrpc_request *request, + __u32 version, int opcode); +struct ptlrpc_request *ptlrpc_request_alloc_pack(struct obd_import *imp, + const struct req_format *format, + __u32 version, int opcode); +int ptlrpc_request_bufs_pack(struct ptlrpc_request *request, + __u32 version, int opcode, char **bufs, + struct ptlrpc_cli_ctx *ctx); +struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, __u32 version, + int opcode, int count, __u32 *lengths, + char **bufs); +struct ptlrpc_request *ptlrpc_prep_req_pool(struct obd_import *imp, + __u32 version, int opcode, + int count, __u32 *lengths, char **bufs, + struct ptlrpc_request_pool *pool); +void ptlrpc_req_finished(struct ptlrpc_request *request); +void ptlrpc_req_finished_with_imp_lock(struct ptlrpc_request *request); +struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req); +struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp(struct ptlrpc_request *req, + unsigned npages, unsigned max_brw, + unsigned type, unsigned portal); +void __ptlrpc_free_bulk(struct ptlrpc_bulk_desc *bulk, int pin); +static inline void ptlrpc_free_bulk_pin(struct ptlrpc_bulk_desc *bulk) +{ + __ptlrpc_free_bulk(bulk, 1); +} +static inline void ptlrpc_free_bulk_nopin(struct ptlrpc_bulk_desc *bulk) +{ + __ptlrpc_free_bulk(bulk, 0); +} +void __ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc, + struct page *page, int pageoffset, int len, int); +static 
inline void ptlrpc_prep_bulk_page_pin(struct ptlrpc_bulk_desc *desc, + struct page *page, int pageoffset, + int len) +{ + __ptlrpc_prep_bulk_page(desc, page, pageoffset, len, 1); +} + +static inline void ptlrpc_prep_bulk_page_nopin(struct ptlrpc_bulk_desc *desc, + struct page *page, int pageoffset, + int len) +{ + __ptlrpc_prep_bulk_page(desc, page, pageoffset, len, 0); +} + +void ptlrpc_retain_replayable_request(struct ptlrpc_request *req, + struct obd_import *imp); +__u64 ptlrpc_next_xid(void); +__u64 ptlrpc_sample_next_xid(void); +__u64 ptlrpc_req_xid(struct ptlrpc_request *request); + +/* Set of routines to run a function in ptlrpcd context */ +void *ptlrpcd_alloc_work(struct obd_import *imp, + int (*cb)(const struct lu_env *, void *), void *data); +void ptlrpcd_destroy_work(void *handler); +int ptlrpcd_queue_work(void *handler); + +/** @} */ +struct ptlrpc_service_buf_conf { + /* nbufs is buffers # to allocate when growing the pool */ + unsigned int bc_nbufs; + /* buffer size to post */ + unsigned int bc_buf_size; + /* portal to listen for requests on */ + unsigned int bc_req_portal; + /* portal to send replies to */ + unsigned int bc_rep_portal; + /* maximum request size to be accepted for this service */ + unsigned int bc_req_max_size; + /* maximum reply size this service can ever send */ + unsigned int bc_rep_max_size; +}; + +struct ptlrpc_service_thr_conf { + /* threadname should be 8 characters or less - 6 will be added on */ + char *tc_thr_name; + /* threads increasing factor for each CPU */ + unsigned int tc_thr_factor; + /* service threads # to start on each partition while initializing */ + unsigned int tc_nthrs_init; + /* + * low water of threads # upper-limit on each partition while running, + * service availability may be impacted if threads number is lower + * than this value. It can be ZERO if the service doesn't require + * CPU affinity or there is only one partition. + */ + unsigned int tc_nthrs_base; + /* "soft" limit for total threads number */ + unsigned int tc_nthrs_max; + /* user-specified threads number; it will be validated against the + * other members of this structure. */ + unsigned int tc_nthrs_user; + /* set NUMA node affinity for service threads */ + unsigned int tc_cpu_affinity; + /* Tags for lu_context associated with service thread */ + __u32 tc_ctx_tags; +}; + +struct ptlrpc_service_cpt_conf { + struct cfs_cpt_table *cc_cptable; + /* string pattern to describe CPTs for a service */ + char *cc_pattern; +}; + +struct ptlrpc_service_conf { + /* service name */ + char *psc_name; + /* soft watchdog timeout multiplier to print stuck service traces */ + unsigned int psc_watchdog_factor; + /* buffer information */ + struct ptlrpc_service_buf_conf psc_buf; + /* thread information */ + struct ptlrpc_service_thr_conf psc_thr; + /* CPU partition information */ + struct ptlrpc_service_cpt_conf psc_cpt; + /* function table */ + struct ptlrpc_service_ops psc_ops; +}; + +/* ptlrpc/service.c */ +/** + * Server-side services API. 
Register/unregister service, request state + * management, service thread management + * + * @{ + */ +void ptlrpc_save_lock(struct ptlrpc_request *req, + struct lustre_handle *lock, int mode, int no_ack); +void ptlrpc_commit_replies(struct obd_export *exp); +void ptlrpc_dispatch_difficult_reply(struct ptlrpc_reply_state *rs); +void ptlrpc_schedule_difficult_reply(struct ptlrpc_reply_state *rs); +int ptlrpc_hpreq_handler(struct ptlrpc_request *req); +struct ptlrpc_service *ptlrpc_register_service( + struct ptlrpc_service_conf *conf, + struct proc_dir_entry *proc_entry); +void ptlrpc_stop_all_threads(struct ptlrpc_service *svc); + +int ptlrpc_start_threads(struct ptlrpc_service *svc); +int ptlrpc_unregister_service(struct ptlrpc_service *service); +int liblustre_check_services(void *arg); +void ptlrpc_daemonize(char *name); +int ptlrpc_service_health_check(struct ptlrpc_service *); +void ptlrpc_server_drop_request(struct ptlrpc_request *req); +void ptlrpc_request_change_export(struct ptlrpc_request *req, + struct obd_export *export); + +int ptlrpc_hr_init(void); +void ptlrpc_hr_fini(void); + +/** @} */ + +/* ptlrpc/import.c */ +/** + * Import API + * @{ + */ +int ptlrpc_connect_import(struct obd_import *imp); +int ptlrpc_init_import(struct obd_import *imp); +int ptlrpc_disconnect_import(struct obd_import *imp, int noclose); +int ptlrpc_import_recovery_state_machine(struct obd_import *imp); +void deuuidify(char *uuid, const char *prefix, char **uuid_start, + int *uuid_len); + +/* ptlrpc/pack_generic.c */ +int ptlrpc_reconnect_import(struct obd_import *imp); +/** @} */ + +/** + * ptlrpc msg buffer and swab interface + * + * @{ + */ +int ptlrpc_buf_need_swab(struct ptlrpc_request *req, const int inout, + int index); +void ptlrpc_buf_set_swabbed(struct ptlrpc_request *req, const int inout, + int index); +int ptlrpc_unpack_rep_msg(struct ptlrpc_request *req, int len); +int ptlrpc_unpack_req_msg(struct ptlrpc_request *req, int len); + +int lustre_msg_check_version(struct lustre_msg *msg, __u32 version); +void lustre_init_msg_v2(struct lustre_msg_v2 *msg, int count, __u32 *lens, + char **bufs); +int lustre_pack_request(struct ptlrpc_request *, __u32 magic, int count, + __u32 *lens, char **bufs); +int lustre_pack_reply(struct ptlrpc_request *, int count, __u32 *lens, + char **bufs); +int lustre_pack_reply_v2(struct ptlrpc_request *req, int count, + __u32 *lens, char **bufs, int flags); +#define LPRFL_EARLY_REPLY 1 +int lustre_pack_reply_flags(struct ptlrpc_request *, int count, __u32 *lens, + char **bufs, int flags); +int lustre_shrink_msg(struct lustre_msg *msg, int segment, + unsigned int newlen, int move_data); +void lustre_free_reply_state(struct ptlrpc_reply_state *rs); +int __lustre_unpack_msg(struct lustre_msg *m, int len); +int lustre_msg_hdr_size(__u32 magic, int count); +int lustre_msg_size(__u32 magic, int count, __u32 *lengths); +int lustre_msg_size_v2(int count, __u32 *lengths); +int lustre_packed_msg_size(struct lustre_msg *msg); +int lustre_msg_early_size(void); +void *lustre_msg_buf_v2(struct lustre_msg_v2 *m, int n, int min_size); +void *lustre_msg_buf(struct lustre_msg *m, int n, int minlen); +int lustre_msg_buflen(struct lustre_msg *m, int n); +void lustre_msg_set_buflen(struct lustre_msg *m, int n, int len); +int lustre_msg_bufcount(struct lustre_msg *m); +char *lustre_msg_string(struct lustre_msg *m, int n, int max_len); +__u32 lustre_msghdr_get_flags(struct lustre_msg *msg); +void lustre_msghdr_set_flags(struct lustre_msg *msg, __u32 flags); +__u32 lustre_msg_get_flags(struct 
lustre_msg *msg); +void lustre_msg_add_flags(struct lustre_msg *msg, int flags); +void lustre_msg_set_flags(struct lustre_msg *msg, int flags); +void lustre_msg_clear_flags(struct lustre_msg *msg, int flags); +__u32 lustre_msg_get_op_flags(struct lustre_msg *msg); +void lustre_msg_add_op_flags(struct lustre_msg *msg, int flags); +void lustre_msg_set_op_flags(struct lustre_msg *msg, int flags); +struct lustre_handle *lustre_msg_get_handle(struct lustre_msg *msg); +__u32 lustre_msg_get_type(struct lustre_msg *msg); +__u32 lustre_msg_get_version(struct lustre_msg *msg); +void lustre_msg_add_version(struct lustre_msg *msg, int version); +__u32 lustre_msg_get_opc(struct lustre_msg *msg); +__u64 lustre_msg_get_last_xid(struct lustre_msg *msg); +__u64 lustre_msg_get_last_committed(struct lustre_msg *msg); +__u64 *lustre_msg_get_versions(struct lustre_msg *msg); +__u64 lustre_msg_get_transno(struct lustre_msg *msg); +__u64 lustre_msg_get_slv(struct lustre_msg *msg); +__u32 lustre_msg_get_limit(struct lustre_msg *msg); +void lustre_msg_set_slv(struct lustre_msg *msg, __u64 slv); +void lustre_msg_set_limit(struct lustre_msg *msg, __u64 limit); +int lustre_msg_get_status(struct lustre_msg *msg); +__u32 lustre_msg_get_conn_cnt(struct lustre_msg *msg); +int lustre_msg_is_v1(struct lustre_msg *msg); +__u32 lustre_msg_get_magic(struct lustre_msg *msg); +__u32 lustre_msg_get_timeout(struct lustre_msg *msg); +__u32 lustre_msg_get_service_time(struct lustre_msg *msg); +char *lustre_msg_get_jobid(struct lustre_msg *msg); +__u32 lustre_msg_get_cksum(struct lustre_msg *msg); +__u32 lustre_msg_calc_cksum(struct lustre_msg *msg); +void lustre_msg_set_handle(struct lustre_msg *msg, + struct lustre_handle *handle); +void lustre_msg_set_type(struct lustre_msg *msg, __u32 type); +void lustre_msg_set_opc(struct lustre_msg *msg, __u32 opc); +void lustre_msg_set_last_xid(struct lustre_msg *msg, __u64 last_xid); +void lustre_msg_set_last_committed(struct lustre_msg *msg, + __u64 last_committed); +void lustre_msg_set_versions(struct lustre_msg *msg, __u64 *versions); +void lustre_msg_set_transno(struct lustre_msg *msg, __u64 transno); +void lustre_msg_set_status(struct lustre_msg *msg, __u32 status); +void lustre_msg_set_conn_cnt(struct lustre_msg *msg, __u32 conn_cnt); +void ptlrpc_req_set_repsize(struct ptlrpc_request *req, int count, __u32 *sizes); +void ptlrpc_request_set_replen(struct ptlrpc_request *req); +void lustre_msg_set_timeout(struct lustre_msg *msg, __u32 timeout); +void lustre_msg_set_service_time(struct lustre_msg *msg, __u32 service_time); +void lustre_msg_set_jobid(struct lustre_msg *msg, char *jobid); +void lustre_msg_set_cksum(struct lustre_msg *msg, __u32 cksum); + +static inline void +lustre_shrink_reply(struct ptlrpc_request *req, int segment, + unsigned int newlen, int move_data) +{ + LASSERT(req->rq_reply_state); + LASSERT(req->rq_repmsg); + req->rq_replen = lustre_shrink_msg(req->rq_repmsg, segment, + newlen, move_data); +} + +#ifdef CONFIG_LUSTRE_TRANSLATE_ERRNOS + +static inline int ptlrpc_status_hton(int h) +{ + /* + * Positive errnos must be network errnos, such as LUSTRE_EDEADLK, + * ELDLM_LOCK_ABORTED, etc. + */ + if (h < 0) + return -lustre_errno_hton(-h); + else + return h; +} + +static inline int ptlrpc_status_ntoh(int n) +{ + /* + * See the comment in ptlrpc_status_hton(). 
+ */ + if (n < 0) + return -lustre_errno_ntoh(-n); + else + return n; +} + +#else + +#define ptlrpc_status_hton(h) (h) +#define ptlrpc_status_ntoh(n) (n) + +#endif +/** @} */ + +/** Change request phase of \a req to \a new_phase */ +static inline void +ptlrpc_rqphase_move(struct ptlrpc_request *req, enum rq_phase new_phase) +{ + if (req->rq_phase == new_phase) + return; + + if (new_phase == RQ_PHASE_UNREGISTERING) { + req->rq_next_phase = req->rq_phase; + if (req->rq_import) + atomic_inc(&req->rq_import->imp_unregistering); + } + + if (req->rq_phase == RQ_PHASE_UNREGISTERING) { + if (req->rq_import) + atomic_dec(&req->rq_import->imp_unregistering); + } + + DEBUG_REQ(D_INFO, req, "move req \"%s\" -> \"%s\"", + ptlrpc_rqphase2str(req), ptlrpc_phase2str(new_phase)); + + req->rq_phase = new_phase; +} + +/** + * Returns true if request \a req got early reply and hard deadline is not met + */ +static inline int +ptlrpc_client_early(struct ptlrpc_request *req) +{ + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) && + req->rq_reply_deadline > get_seconds()) + return 0; + return req->rq_early; +} + +/** + * Returns true if we got real reply from server for this request + */ +static inline int +ptlrpc_client_replied(struct ptlrpc_request *req) +{ + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) && + req->rq_reply_deadline > get_seconds()) + return 0; + return req->rq_replied; +} + +/** Returns true if request \a req is in process of receiving server reply */ +static inline int +ptlrpc_client_recv(struct ptlrpc_request *req) +{ + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) && + req->rq_reply_deadline > get_seconds()) + return 1; + return req->rq_receiving_reply; +} + +static inline int +ptlrpc_client_recv_or_unlink(struct ptlrpc_request *req) +{ + int rc; + + spin_lock(&req->rq_lock); + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) && + req->rq_reply_deadline > get_seconds()) { + spin_unlock(&req->rq_lock); + return 1; + } + rc = req->rq_receiving_reply; + rc = rc || req->rq_req_unlink || req->rq_reply_unlink; + spin_unlock(&req->rq_lock); + return rc; +} + +static inline void +ptlrpc_client_wake_req(struct ptlrpc_request *req) +{ + if (req->rq_set == NULL) + wake_up(&req->rq_reply_waitq); + else + wake_up(&req->rq_set->set_waitq); +} + +static inline void +ptlrpc_rs_addref(struct ptlrpc_reply_state *rs) +{ + LASSERT(atomic_read(&rs->rs_refcount) > 0); + atomic_inc(&rs->rs_refcount); +} + +static inline void +ptlrpc_rs_decref(struct ptlrpc_reply_state *rs) +{ + LASSERT(atomic_read(&rs->rs_refcount) > 0); + if (atomic_dec_and_test(&rs->rs_refcount)) + lustre_free_reply_state(rs); +} + +/* Should only be called once per req */ +static inline void ptlrpc_req_drop_rs(struct ptlrpc_request *req) +{ + if (req->rq_reply_state == NULL) + return; /* shouldn't occur */ + ptlrpc_rs_decref(req->rq_reply_state); + req->rq_reply_state = NULL; + req->rq_repmsg = NULL; +} + +static inline __u32 lustre_request_magic(struct ptlrpc_request *req) +{ + return lustre_msg_get_magic(req->rq_reqmsg); +} + +static inline int ptlrpc_req_get_repsize(struct ptlrpc_request *req) +{ + switch (req->rq_reqmsg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return req->rq_reqmsg->lm_repsize; + default: + LASSERTF(0, "incorrect message magic: %08x\n", + req->rq_reqmsg->lm_magic); + return -EFAULT; + } +} + +static inline int ptlrpc_send_limit_expired(struct ptlrpc_request *req) +{ + if (req->rq_delay_limit != 0 && + time_before(cfs_time_add(req->rq_queued_time, + cfs_time_seconds(req->rq_delay_limit)), + 
cfs_time_current())) { + return 1; + } + return 0; +} + +static inline int ptlrpc_no_resend(struct ptlrpc_request *req) +{ + if (!req->rq_no_resend && ptlrpc_send_limit_expired(req)) { + spin_lock(&req->rq_lock); + req->rq_no_resend = 1; + spin_unlock(&req->rq_lock); + } + return req->rq_no_resend; +} + +static inline int +ptlrpc_server_get_timeout(struct ptlrpc_service_part *svcpt) +{ + int at = AT_OFF ? 0 : at_get(&svcpt->scp_at_estimate); + + return svcpt->scp_service->srv_watchdog_factor * + max_t(int, at, obd_timeout); +} + +static inline struct ptlrpc_service * +ptlrpc_req2svc(struct ptlrpc_request *req) +{ + LASSERT(req->rq_rqbd != NULL); + return req->rq_rqbd->rqbd_svcpt->scp_service; +} + +/* ldlm/ldlm_lib.c */ +/** + * Target client logic + * @{ + */ +int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg); +int client_obd_cleanup(struct obd_device *obddev); +int client_connect_import(const struct lu_env *env, + struct obd_export **exp, struct obd_device *obd, + struct obd_uuid *cluuid, struct obd_connect_data *, + void *localdata); +int client_disconnect_export(struct obd_export *exp); +int client_import_add_conn(struct obd_import *imp, struct obd_uuid *uuid, + int priority); +int client_import_del_conn(struct obd_import *imp, struct obd_uuid *uuid); +int client_import_find_conn(struct obd_import *imp, lnet_nid_t peer, + struct obd_uuid *uuid); +int import_set_conn_priority(struct obd_import *imp, struct obd_uuid *uuid); +void client_destroy_import(struct obd_import *imp); +/** @} */ + + +/* ptlrpc/pinger.c */ +/** + * Pinger API (client side only) + * @{ + */ +enum timeout_event { + TIMEOUT_GRANT = 1 +}; +struct timeout_item; +typedef int (*timeout_cb_t)(struct timeout_item *, void *); +int ptlrpc_pinger_add_import(struct obd_import *imp); +int ptlrpc_pinger_del_import(struct obd_import *imp); +int ptlrpc_add_timeout_client(int time, enum timeout_event event, + timeout_cb_t cb, void *data, + struct list_head *obd_list); +int ptlrpc_del_timeout_client(struct list_head *obd_list, + enum timeout_event event); +struct ptlrpc_request *ptlrpc_prep_ping(struct obd_import *imp); +int ptlrpc_obd_ping(struct obd_device *obd); +void ping_evictor_start(void); +void ping_evictor_stop(void); +void ptlrpc_pinger_ir_up(void); +void ptlrpc_pinger_ir_down(void); +/** @} */ +int ptlrpc_pinger_suppress_pings(void); + +/* ptlrpc daemon bind policy */ +typedef enum { + /* all ptlrpcd threads are free mode */ + PDB_POLICY_NONE = 1, + /* all ptlrpcd threads are bound mode */ + PDB_POLICY_FULL = 2, + /* ... */ + PDB_POLICY_PAIR = 3, + /* ... , + * means each ptlrpcd[X] has two partners: thread[X-1] and thread[X+1]. + * If kernel supports NUMA, pthrpcd threads are binded and + * grouped by NUMA node */ + PDB_POLICY_NEIGHBOR = 4, +} pdb_policy_t; + +/* ptlrpc daemon load policy + * It is caller's duty to specify how to push the async RPC into some ptlrpcd + * queue, but it is not enforced, affected by "ptlrpcd_bind_policy". If it is + * "PDB_POLICY_FULL", then the RPC will be processed by the selected ptlrpcd, + * Otherwise, the RPC may be processed by the selected ptlrpcd or its partner, + * depends on which is scheduled firstly, to accelerate the RPC processing. 
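The bind-policy comment above (PDB_POLICY_NEIGHBOR) states that each ptlrpcd[X] has thread[X-1] and thread[X+1] as partners. A minimal user-space sketch of that pairing follows; the modulo wrap-around at the ends of the thread array is an assumption of this sketch, not something stated by the header, and neighbor_partners() is a hypothetical helper, not part of the ptlrpcd API.

#include <stdio.h>

/* Illustrative only: compute the two NEIGHBOR-style partners of ptlrpcd[X].
 * The wrap-around at the array ends is an assumption of this sketch. */
static void neighbor_partners(int x, int nthreads, int *prev, int *next)
{
        *prev = (x + nthreads - 1) % nthreads;
        *next = (x + 1) % nthreads;
}

int main(void)
{
        const int nthreads = 4;
        int x, prev, next;

        for (x = 0; x < nthreads; x++) {
                neighbor_partners(x, nthreads, &prev, &next);
                printf("ptlrpcd[%d] partners: ptlrpcd[%d] and ptlrpcd[%d]\n",
                       x, prev, next);
        }
        return 0;
}

As described above, the load policy (PDL_POLICY_*, defined just below) only chooses which queue an async RPC is pushed to; whether the partner may pick it up is governed by the bind policy.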
*/ +typedef enum { + /* on the same CPU core as the caller */ + PDL_POLICY_SAME = 1, + /* within the same CPU partition, but not the same core as the caller */ + PDL_POLICY_LOCAL = 2, + /* round-robin on all CPU cores, but not the same core as the caller */ + PDL_POLICY_ROUND = 3, + /* the specified CPU core is preferred, but not enforced */ + PDL_POLICY_PREFERRED = 4, +} pdl_policy_t; + +/* ptlrpc/ptlrpcd.c */ +void ptlrpcd_stop(struct ptlrpcd_ctl *pc, int force); +void ptlrpcd_free(struct ptlrpcd_ctl *pc); +void ptlrpcd_wake(struct ptlrpc_request *req); +void ptlrpcd_add_req(struct ptlrpc_request *req, pdl_policy_t policy, int idx); +void ptlrpcd_add_rqset(struct ptlrpc_request_set *set); +int ptlrpcd_addref(void); +void ptlrpcd_decref(void); + +/* ptlrpc/lproc_ptlrpc.c */ +/** + * procfs output related functions + * @{ + */ +const char *ll_opcode2str(__u32 opcode); +#if defined (CONFIG_PROC_FS) +void ptlrpc_lprocfs_register_obd(struct obd_device *obd); +void ptlrpc_lprocfs_unregister_obd(struct obd_device *obd); +void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int bytes); +#else +static inline void ptlrpc_lprocfs_register_obd(struct obd_device *obd) {} +static inline void ptlrpc_lprocfs_unregister_obd(struct obd_device *obd) {} +static inline void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int bytes) {} +#endif +/** @} */ + +/* ptlrpc/llog_client.c */ +extern struct llog_operations llog_client_ops; + +/** @} net */ + +#endif +/** @} PtlRPC */ diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre_param.h b/kernel/drivers/staging/lustre/lustre/include/lustre_param.h new file mode 100644 index 000000000..ed654684c --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre_param.h @@ -0,0 +1,121 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/include/lustre_param.h + * + * User-settable parameter keys + * + * Author: Nathan Rutman + */ + +#ifndef _LUSTRE_PARAM_H +#define _LUSTRE_PARAM_H + +/** \defgroup param param + * + * @{ + */ + +/* For interoperability */ +struct cfg_interop_param { + char *old_param; + char *new_param; +}; + +/* obd_config.c */ +int class_find_param(char *buf, char *key, char **valp); +struct cfg_interop_param *class_find_old_param(const char *param, + struct cfg_interop_param *ptr); +int class_get_next_param(char **params, char *copy); +int class_match_param(char *buf, char *key, char **valp); +int class_parse_nid(char *buf, lnet_nid_t *nid, char **endh); +int class_parse_nid_quiet(char *buf, lnet_nid_t *nid, char **endh); +int class_parse_net(char *buf, __u32 *net, char **endh); +int class_match_nid(char *buf, char *key, lnet_nid_t nid); +int class_match_net(char *buf, char *key, __u32 net); +/* obd_mount.c */ +int do_lcfg(char *cfgname, lnet_nid_t nid, int cmd, + char *s1, char *s2, char *s3, char *s4); + + + +/****************** User-settable parameter keys *********************/ +/* e.g. + tunefs.lustre --param="failover.node=192.168.0.13@tcp0" /dev/sda + lctl conf_param testfs-OST0000 failover.node=3@elan,192.168.0.3@tcp0 + ... testfs-MDT0000.lov.stripesize=4M + ... testfs-OST0000.ost.client_cache_seconds=15 + ... testfs.sys.timeout= + ... testfs.llite.max_read_ahead_mb=16 +*/ + +/* System global or special params not handled in obd's proc + * See mgs_write_log_sys() + */ +#define PARAM_TIMEOUT "timeout=" /* global */ +#define PARAM_LDLM_TIMEOUT "ldlm_timeout=" /* global */ +#define PARAM_AT_MIN "at_min=" /* global */ +#define PARAM_AT_MAX "at_max=" /* global */ +#define PARAM_AT_EXTRA "at_extra=" /* global */ +#define PARAM_AT_EARLY_MARGIN "at_early_margin=" /* global */ +#define PARAM_AT_HISTORY "at_history=" /* global */ +#define PARAM_JOBID_VAR "jobid_var=" /* global */ +#define PARAM_MGSNODE "mgsnode=" /* only at mounttime */ +#define PARAM_FAILNODE "failover.node=" /* add failover nid */ +#define PARAM_FAILMODE "failover.mode=" /* initial mount only */ +#define PARAM_ACTIVE "active=" /* activate/deactivate */ +#define PARAM_NETWORK "network=" /* bind on nid */ +#define PARAM_ID_UPCALL "identity_upcall=" /* identity upcall */ + +/* Prefixes for parameters handled by obd's proc methods (XXX_process_config) */ +#define PARAM_OST "ost." +#define PARAM_OSC "osc." +#define PARAM_MDT "mdt." +#define PARAM_MDD "mdd." +#define PARAM_MDC "mdc." +#define PARAM_LLITE "llite." +#define PARAM_LOV "lov." +#define PARAM_LOD "lod." +#define PARAM_OSP "osp." +#define PARAM_SYS "sys." /* global */ +#define PARAM_SRPC "srpc." +#define PARAM_SRPC_FLVR "srpc.flavor." +#define PARAM_SRPC_UDESC "srpc.udesc.cli2mdt" +#define PARAM_SEC "security." +#define PARAM_QUOTA "quota." /* global */ + +/** @} param */ + +#endif /* _LUSTRE_PARAM_H */ diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre_quota.h b/kernel/drivers/staging/lustre/lustre/include/lustre_quota.h new file mode 100644 index 000000000..2643f2807 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre_quota.h @@ -0,0 +1,241 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
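(Illustration for the user-settable parameter keys listed above.) The PARAM_* strings all follow a "key=" convention, with class_match_param() and related helpers in obd_config.c locating the value that follows the key. Below is a simplified stand-alone analogue; match_param() is hypothetical and does not claim to reproduce the in-kernel helpers' signatures or return conventions.

#include <stdio.h>
#include <string.h>

#define PARAM_FAILNODE "failover.node="  /* copied from the key list above */

/* Hypothetical, simplified stand-in for the key matching done in obd_config.c:
 * if buf starts with key, return a pointer to the value, otherwise NULL. */
static const char *match_param(const char *buf, const char *key)
{
        size_t klen = strlen(key);

        return strncmp(buf, key, klen) == 0 ? buf + klen : NULL;
}

int main(void)
{
        const char *param = "failover.node=192.168.0.13@tcp0";
        const char *val = match_param(param, PARAM_FAILNODE);

        if (val)
                printf("failover nid: %s\n", val);  /* 192.168.0.13@tcp0 */
        return 0;
}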
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2011, 2012, Intel Corporation. + * Use is subject to license terms. + */ + +#ifndef _LUSTRE_QUOTA_H +#define _LUSTRE_QUOTA_H + +/** \defgroup quota quota + * + */ + +#include +#include +#include + +#include "dt_object.h" +#include "lustre_fid.h" +#include "lustre_dlm.h" + +#ifndef MAX_IQ_TIME +#define MAX_IQ_TIME 604800 /* (7*24*60*60) 1 week */ +#endif + +#ifndef MAX_DQ_TIME +#define MAX_DQ_TIME 604800 /* (7*24*60*60) 1 week */ +#endif + +struct lquota_id_info; +struct lquota_trans; + +/* Gather all quota record type in an union that can be used to read any records + * from disk. All fields of these records must be 64-bit aligned, otherwise the + * OSD layer may swab them incorrectly. */ +union lquota_rec { + struct lquota_glb_rec lqr_glb_rec; + struct lquota_slv_rec lqr_slv_rec; + struct lquota_acct_rec lqr_acct_rec; +}; + +/* Index features supported by the global index objects + * Only used for migration purpose and should be removed once on-disk migration + * is no longer needed */ +extern struct dt_index_features dt_quota_iusr_features; +extern struct dt_index_features dt_quota_busr_features; +extern struct dt_index_features dt_quota_igrp_features; +extern struct dt_index_features dt_quota_bgrp_features; + +/* Name used in the configuration logs to identify the default metadata pool + * (composed of all the MDTs, with pool ID 0) and the default data pool (all + * the OSTs, with pool ID 0 too). */ +#define QUOTA_METAPOOL_NAME "mdt=" +#define QUOTA_DATAPOOL_NAME "ost=" + +/* + * Quota Master Target support + */ + +/* Request handlers for quota master operations. + * This is used by the MDT to pass quota/lock requests to the quota master + * target. This won't be needed any more once the QMT is a real target and + * does not rely any more on the MDT service threads and namespace. */ +struct qmt_handlers { + /* Handle quotactl request from client. */ + int (*qmth_quotactl)(const struct lu_env *, struct lu_device *, + struct obd_quotactl *); + + /* Handle dqacq/dqrel request from slave. 
*/ + int (*qmth_dqacq)(const struct lu_env *, struct lu_device *, + struct ptlrpc_request *); + + /* LDLM intent policy associated with quota locks */ + int (*qmth_intent_policy)(const struct lu_env *, struct lu_device *, + struct ptlrpc_request *, struct ldlm_lock **, + int); + + /* Initialize LVB of ldlm resource associated with quota objects */ + int (*qmth_lvbo_init)(struct lu_device *, struct ldlm_resource *); + + /* Update LVB of ldlm resource associated with quota objects */ + int (*qmth_lvbo_update)(struct lu_device *, struct ldlm_resource *, + struct ptlrpc_request *, int); + + /* Return size of LVB to be packed in ldlm message */ + int (*qmth_lvbo_size)(struct lu_device *, struct ldlm_lock *); + + /* Fill request buffer with lvb */ + int (*qmth_lvbo_fill)(struct lu_device *, struct ldlm_lock *, void *, + int); + + /* Free lvb associated with ldlm resource */ + int (*qmth_lvbo_free)(struct lu_device *, struct ldlm_resource *); +}; + +/* actual handlers are defined in lustre/quota/qmt_handler.c */ +extern struct qmt_handlers qmt_hdls; + +/* + * Quota enforcement support on slaves + */ + +struct qsd_instance; + +/* The quota slave feature is implemented under the form of a library. + * The API is the following: + * + * - qsd_init(): the user (mostly the OSD layer) should first allocate a qsd + * instance via qsd_init(). This creates all required structures + * to manage quota enforcement for this target and performs all + * low-level initialization which does not involve any lustre + * object. qsd_init() should typically be called when the OSD + * is being set up. + * + * - qsd_prepare(): This sets up on-disk objects associated with the quota slave + * feature and initiates the quota reintegration procedure if + * needed. qsd_prepare() should typically be called when + * ->ldo_prepare is invoked. + * + * - qsd_start(): a qsd instance should be started once recovery is completed + * (i.e. when ->ldo_recovery_complete is called). This is used + * to notify the qsd layer that quota should now be enforced + * again via the qsd_op_begin/end functions. The last step of the + * reintegration procedure (namely usage reconciliation) will be + * completed during start. + * + * - qsd_fini(): is used to release a qsd_instance structure allocated with + * qsd_init(). This releases all quota slave objects and frees the + * structures associated with the qsd_instance. + * + * - qsd_op_begin(): is used to enforce quota, it must be called in the + * declaration of each operation. qsd_op_end() should then be + * invoked later once all operations have been completed in + * order to release/adjust the quota space. + * Running qsd_op_begin() before qsd_start() isn't fatal and + * will return success. + * Once qsd_start() has been run, qsd_op_begin() will block + * until the reintegration procedure is completed. + * + * - qsd_op_end(): performs the post operation quota processing. This must be + * called after the operation transaction stopped. + * While qsd_op_begin() must be invoked each time a new + * operation is declared, qsd_op_end() should be called only + * once for the whole transaction. + * + * - qsd_op_adjust(): triggers pre-acquire/release if necessary. + * + * Below are the function prototypes to be used by OSD layer to manage quota + * enforcement. Arguments are documented where each function is defined. 
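A toy model of the lifecycle just described (init, prepare, start, then op_begin/op_end pairs, and finally fini); the real prototypes follow below. The enum and helper here are illustrative only and are not part of the qsd API; the one behavioural detail encoded, that qsd_op_begin() enforces nothing before qsd_start() has run, is taken from the text above.

#include <stdio.h>

enum qsd_stage { QSD_ALLOCATED, QSD_PREPARED, QSD_STARTED, QSD_STOPPED };

static const char *enforcing(enum qsd_stage s)
{
        /* per the description above, only a started instance enforces quota */
        return s == QSD_STARTED ? "enforced" : "pass-through";
}

int main(void)
{
        enum qsd_stage s = QSD_ALLOCATED;       /* qsd_init()    */
        printf("after init:    op_begin is %s\n", enforcing(s));

        s = QSD_PREPARED;                       /* qsd_prepare() */
        printf("after prepare: op_begin is %s\n", enforcing(s));

        s = QSD_STARTED;                        /* qsd_start()   */
        printf("after start:   op_begin is %s\n", enforcing(s));

        s = QSD_STOPPED;                        /* qsd_fini()    */
        return 0;
}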
*/ + +struct qsd_instance *qsd_init(const struct lu_env *, char *, struct dt_device *, + struct proc_dir_entry *); +int qsd_prepare(const struct lu_env *, struct qsd_instance *); +int qsd_start(const struct lu_env *, struct qsd_instance *); +void qsd_fini(const struct lu_env *, struct qsd_instance *); +int qsd_op_begin(const struct lu_env *, struct qsd_instance *, + struct lquota_trans *, struct lquota_id_info *, int *); +void qsd_op_end(const struct lu_env *, struct qsd_instance *, + struct lquota_trans *); +void qsd_op_adjust(const struct lu_env *, struct qsd_instance *, + union lquota_id *, int); +/* This is exported for the ldiskfs quota migration only, + * see convert_quota_file() */ +int lquota_disk_write_glb(const struct lu_env *, struct dt_object *, + __u64, struct lquota_glb_rec *); + +/* + * Quota information attached to a transaction + */ + +struct lquota_entry; + +struct lquota_id_info { + /* quota identifier */ + union lquota_id lqi_id; + + /* USRQUOTA or GRPQUOTA for now, could be expanded for + * directory quota or other types later. */ + int lqi_type; + + /* inodes or kbytes to be consumed or released, it could + * be negative when releasing space. */ + long long lqi_space; + + /* quota slave entry structure associated with this ID */ + struct lquota_entry *lqi_qentry; + + /* whether we are reporting blocks or inodes */ + bool lqi_is_blk; +}; + +/* Since we enforce only inode quota in meta pool (MDTs), and block quota in + * data pool (OSTs), there are at most 4 quota ids being enforced in a single + * transaction, which is chown transaction: + * original uid and gid, new uid and gid. + * + * This value might need to be revised when directory quota is added. */ +#define QUOTA_MAX_TRANSIDS 4 + +/* all qids involved in a single transaction */ +struct lquota_trans { + unsigned short lqt_id_cnt; + struct lquota_id_info lqt_ids[QUOTA_MAX_TRANSIDS]; +}; + +/* flags for quota local enforcement */ +#define QUOTA_FL_OVER_USRQUOTA 0x01 +#define QUOTA_FL_OVER_GRPQUOTA 0x02 +#define QUOTA_FL_SYNC 0x04 + +#define IS_LQUOTA_RES(res) \ + (res->lr_name.name[LUSTRE_RES_ID_SEQ_OFF] == FID_SEQ_QUOTA || \ + res->lr_name.name[LUSTRE_RES_ID_SEQ_OFF] == FID_SEQ_QUOTA_GLB) + +/* helper function used by MDT & OFD to retrieve quota accounting information + * on slave */ +int lquotactl_slv(const struct lu_env *, struct dt_device *, + struct obd_quotactl *); +/** @} quota */ +#endif /* _LUSTRE_QUOTA_H */ diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre_req_layout.h b/kernel/drivers/staging/lustre/lustre/include/lustre_req_layout.h new file mode 100644 index 000000000..c6457b27c --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre_req_layout.h @@ -0,0 +1,341 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre_req_layout.h + * + * Lustre Metadata Target (mdt) request handler + * + * Author: Nikita Danilov + */ + +#ifndef _LUSTRE_REQ_LAYOUT_H__ +#define _LUSTRE_REQ_LAYOUT_H__ + +/** \defgroup req_layout req_layout + * + * @{ + */ + +struct req_msg_field; +struct req_format; +struct req_capsule; + +struct ptlrpc_request; + +enum req_location { + RCL_CLIENT, + RCL_SERVER, + RCL_NR +}; + +/* Maximal number of fields (buffers) in a request message. */ +#define REQ_MAX_FIELD_NR 9 + +struct req_capsule { + struct ptlrpc_request *rc_req; + const struct req_format *rc_fmt; + enum req_location rc_loc; + __u32 rc_area[RCL_NR][REQ_MAX_FIELD_NR]; +}; + +#if !defined(__REQ_LAYOUT_USER__) + +/* struct ptlrpc_request, lustre_msg* */ +#include "lustre_net.h" + +void req_capsule_init(struct req_capsule *pill, struct ptlrpc_request *req, + enum req_location location); +void req_capsule_fini(struct req_capsule *pill); + +void req_capsule_set(struct req_capsule *pill, const struct req_format *fmt); +void req_capsule_client_dump(struct req_capsule *pill); +void req_capsule_server_dump(struct req_capsule *pill); +void req_capsule_init_area(struct req_capsule *pill); +int req_capsule_filled_sizes(struct req_capsule *pill, enum req_location loc); +int req_capsule_server_pack(struct req_capsule *pill); + +void *req_capsule_client_get(struct req_capsule *pill, + const struct req_msg_field *field); +void *req_capsule_client_swab_get(struct req_capsule *pill, + const struct req_msg_field *field, + void *swabber); +void *req_capsule_client_sized_get(struct req_capsule *pill, + const struct req_msg_field *field, + int len); +void *req_capsule_server_get(struct req_capsule *pill, + const struct req_msg_field *field); +void *req_capsule_server_sized_get(struct req_capsule *pill, + const struct req_msg_field *field, + int len); +void *req_capsule_server_swab_get(struct req_capsule *pill, + const struct req_msg_field *field, + void *swabber); +void *req_capsule_server_sized_swab_get(struct req_capsule *pill, + const struct req_msg_field *field, + int len, void *swabber); +const void *req_capsule_other_get(struct req_capsule *pill, + const struct req_msg_field *field); + +void req_capsule_set_size(struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc, int size); +int req_capsule_get_size(const struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc); +int req_capsule_msg_size(struct req_capsule *pill, enum req_location loc); +int req_capsule_fmt_size(__u32 magic, const struct req_format *fmt, + enum req_location loc); +void req_capsule_extend(struct req_capsule *pill, const struct req_format *fmt); + +int req_capsule_has_field(const struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc); +int req_capsule_field_present(const struct 
req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc); +void req_capsule_shrink(struct req_capsule *pill, + const struct req_msg_field *field, + unsigned int newlen, + enum req_location loc); +int req_capsule_server_grow(struct req_capsule *pill, + const struct req_msg_field *field, + unsigned int newlen); +int req_layout_init(void); +void req_layout_fini(void); + +/* __REQ_LAYOUT_USER__ */ +#endif + +extern struct req_format RQF_OBD_PING; +extern struct req_format RQF_OBD_SET_INFO; +extern struct req_format RQF_SEC_CTX; +extern struct req_format RQF_OBD_IDX_READ; +/* MGS req_format */ +extern struct req_format RQF_MGS_TARGET_REG; +extern struct req_format RQF_MGS_SET_INFO; +extern struct req_format RQF_MGS_CONFIG_READ; +/* fid/fld req_format */ +extern struct req_format RQF_SEQ_QUERY; +extern struct req_format RQF_FLD_QUERY; +/* MDS req_format */ +extern struct req_format RQF_MDS_CONNECT; +extern struct req_format RQF_MDS_DISCONNECT; +extern struct req_format RQF_MDS_STATFS; +extern struct req_format RQF_MDS_GETSTATUS; +extern struct req_format RQF_MDS_SYNC; +extern struct req_format RQF_MDS_GETXATTR; +extern struct req_format RQF_MDS_GETATTR; +extern struct req_format RQF_UPDATE_OBJ; + +/* + * This is format of direct (non-intent) MDS_GETATTR_NAME request. + */ +extern struct req_format RQF_MDS_GETATTR_NAME; +extern struct req_format RQF_MDS_CLOSE; +extern struct req_format RQF_MDS_RELEASE_CLOSE; +extern struct req_format RQF_MDS_PIN; +extern struct req_format RQF_MDS_UNPIN; +extern struct req_format RQF_MDS_CONNECT; +extern struct req_format RQF_MDS_DISCONNECT; +extern struct req_format RQF_MDS_GET_INFO; +extern struct req_format RQF_MDS_READPAGE; +extern struct req_format RQF_MDS_WRITEPAGE; +extern struct req_format RQF_MDS_IS_SUBDIR; +extern struct req_format RQF_MDS_DONE_WRITING; +extern struct req_format RQF_MDS_REINT; +extern struct req_format RQF_MDS_REINT_CREATE; +extern struct req_format RQF_MDS_REINT_CREATE_RMT_ACL; +extern struct req_format RQF_MDS_REINT_CREATE_SLAVE; +extern struct req_format RQF_MDS_REINT_CREATE_SYM; +extern struct req_format RQF_MDS_REINT_OPEN; +extern struct req_format RQF_MDS_REINT_UNLINK; +extern struct req_format RQF_MDS_REINT_LINK; +extern struct req_format RQF_MDS_REINT_RENAME; +extern struct req_format RQF_MDS_REINT_SETATTR; +extern struct req_format RQF_MDS_REINT_SETXATTR; +extern struct req_format RQF_MDS_QUOTACHECK; +extern struct req_format RQF_MDS_QUOTACTL; +extern struct req_format RQF_QC_CALLBACK; +extern struct req_format RQF_QUOTA_DQACQ; +extern struct req_format RQF_MDS_SWAP_LAYOUTS; +/* MDS hsm formats */ +extern struct req_format RQF_MDS_HSM_STATE_GET; +extern struct req_format RQF_MDS_HSM_STATE_SET; +extern struct req_format RQF_MDS_HSM_ACTION; +extern struct req_format RQF_MDS_HSM_PROGRESS; +extern struct req_format RQF_MDS_HSM_CT_REGISTER; +extern struct req_format RQF_MDS_HSM_CT_UNREGISTER; +extern struct req_format RQF_MDS_HSM_REQUEST; +/* OST req_format */ +extern struct req_format RQF_OST_CONNECT; +extern struct req_format RQF_OST_DISCONNECT; +extern struct req_format RQF_OST_QUOTACHECK; +extern struct req_format RQF_OST_QUOTACTL; +extern struct req_format RQF_OST_GETATTR; +extern struct req_format RQF_OST_SETATTR; +extern struct req_format RQF_OST_CREATE; +extern struct req_format RQF_OST_PUNCH; +extern struct req_format RQF_OST_SYNC; +extern struct req_format RQF_OST_DESTROY; +extern struct req_format RQF_OST_BRW_READ; +extern struct req_format RQF_OST_BRW_WRITE; +extern struct req_format 
RQF_OST_STATFS; +extern struct req_format RQF_OST_SET_GRANT_INFO; +extern struct req_format RQF_OST_GET_INFO_GENERIC; +extern struct req_format RQF_OST_GET_INFO_LAST_ID; +extern struct req_format RQF_OST_GET_INFO_LAST_FID; +extern struct req_format RQF_OST_SET_INFO_LAST_FID; +extern struct req_format RQF_OST_GET_INFO_FIEMAP; + +/* LDLM req_format */ +extern struct req_format RQF_LDLM_ENQUEUE; +extern struct req_format RQF_LDLM_ENQUEUE_LVB; +extern struct req_format RQF_LDLM_CONVERT; +extern struct req_format RQF_LDLM_INTENT; +extern struct req_format RQF_LDLM_INTENT_BASIC; +extern struct req_format RQF_LDLM_INTENT_LAYOUT; +extern struct req_format RQF_LDLM_INTENT_GETATTR; +extern struct req_format RQF_LDLM_INTENT_OPEN; +extern struct req_format RQF_LDLM_INTENT_CREATE; +extern struct req_format RQF_LDLM_INTENT_UNLINK; +extern struct req_format RQF_LDLM_INTENT_GETXATTR; +extern struct req_format RQF_LDLM_INTENT_QUOTA; +extern struct req_format RQF_LDLM_CANCEL; +extern struct req_format RQF_LDLM_CALLBACK; +extern struct req_format RQF_LDLM_CP_CALLBACK; +extern struct req_format RQF_LDLM_BL_CALLBACK; +extern struct req_format RQF_LDLM_GL_CALLBACK; +extern struct req_format RQF_LDLM_GL_DESC_CALLBACK; +/* LOG req_format */ +extern struct req_format RQF_LOG_CANCEL; +extern struct req_format RQF_LLOG_ORIGIN_HANDLE_CREATE; +extern struct req_format RQF_LLOG_ORIGIN_HANDLE_DESTROY; +extern struct req_format RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK; +extern struct req_format RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK; +extern struct req_format RQF_LLOG_ORIGIN_HANDLE_READ_HEADER; +extern struct req_format RQF_LLOG_ORIGIN_CONNECT; + +extern struct req_format RQF_CONNECT; + +extern struct req_msg_field RMF_GENERIC_DATA; +extern struct req_msg_field RMF_PTLRPC_BODY; +extern struct req_msg_field RMF_MDT_BODY; +extern struct req_msg_field RMF_MDT_EPOCH; +extern struct req_msg_field RMF_OBD_STATFS; +extern struct req_msg_field RMF_NAME; +extern struct req_msg_field RMF_SYMTGT; +extern struct req_msg_field RMF_TGTUUID; +extern struct req_msg_field RMF_CLUUID; +extern struct req_msg_field RMF_SETINFO_VAL; +extern struct req_msg_field RMF_SETINFO_KEY; +extern struct req_msg_field RMF_GETINFO_VAL; +extern struct req_msg_field RMF_GETINFO_VALLEN; +extern struct req_msg_field RMF_GETINFO_KEY; +extern struct req_msg_field RMF_IDX_INFO; +extern struct req_msg_field RMF_CLOSE_DATA; + +/* + * connection handle received in MDS_CONNECT request. 
+ */ +extern struct req_msg_field RMF_CONN; +extern struct req_msg_field RMF_CONNECT_DATA; +extern struct req_msg_field RMF_DLM_REQ; +extern struct req_msg_field RMF_DLM_REP; +extern struct req_msg_field RMF_DLM_LVB; +extern struct req_msg_field RMF_DLM_GL_DESC; +extern struct req_msg_field RMF_LDLM_INTENT; +extern struct req_msg_field RMF_LAYOUT_INTENT; +extern struct req_msg_field RMF_MDT_MD; +extern struct req_msg_field RMF_REC_REINT; +extern struct req_msg_field RMF_EADATA; +extern struct req_msg_field RMF_EAVALS; +extern struct req_msg_field RMF_EAVALS_LENS; +extern struct req_msg_field RMF_ACL; +extern struct req_msg_field RMF_LOGCOOKIES; +extern struct req_msg_field RMF_CAPA1; +extern struct req_msg_field RMF_CAPA2; +extern struct req_msg_field RMF_OBD_QUOTACHECK; +extern struct req_msg_field RMF_OBD_QUOTACTL; +extern struct req_msg_field RMF_QUOTA_BODY; +extern struct req_msg_field RMF_STRING; +extern struct req_msg_field RMF_SWAP_LAYOUTS; +extern struct req_msg_field RMF_MDS_HSM_PROGRESS; +extern struct req_msg_field RMF_MDS_HSM_REQUEST; +extern struct req_msg_field RMF_MDS_HSM_USER_ITEM; +extern struct req_msg_field RMF_MDS_HSM_ARCHIVE; +extern struct req_msg_field RMF_HSM_USER_STATE; +extern struct req_msg_field RMF_HSM_STATE_SET; +extern struct req_msg_field RMF_MDS_HSM_CURRENT_ACTION; +extern struct req_msg_field RMF_MDS_HSM_REQUEST; + +/* seq-mgr fields */ +extern struct req_msg_field RMF_SEQ_OPC; +extern struct req_msg_field RMF_SEQ_RANGE; +extern struct req_msg_field RMF_FID_SPACE; + +/* FLD fields */ +extern struct req_msg_field RMF_FLD_OPC; +extern struct req_msg_field RMF_FLD_MDFLD; + +extern struct req_msg_field RMF_LLOGD_BODY; +extern struct req_msg_field RMF_LLOG_LOG_HDR; +extern struct req_msg_field RMF_LLOGD_CONN_BODY; + +extern struct req_msg_field RMF_MGS_TARGET_INFO; +extern struct req_msg_field RMF_MGS_SEND_PARAM; + +extern struct req_msg_field RMF_OST_BODY; +extern struct req_msg_field RMF_OBD_IOOBJ; +extern struct req_msg_field RMF_OBD_ID; +extern struct req_msg_field RMF_FID; +extern struct req_msg_field RMF_NIOBUF_REMOTE; +extern struct req_msg_field RMF_RCS; +extern struct req_msg_field RMF_FIEMAP_KEY; +extern struct req_msg_field RMF_FIEMAP_VAL; +extern struct req_msg_field RMF_OST_ID; + +/* MGS config read message format */ +extern struct req_msg_field RMF_MGS_CONFIG_BODY; +extern struct req_msg_field RMF_MGS_CONFIG_RES; + +/* generic uint32 */ +extern struct req_msg_field RMF_U32; + +/* OBJ update format */ +extern struct req_msg_field RMF_UPDATE; +extern struct req_msg_field RMF_UPDATE_REPLY; +/** @} req_layout */ + +#endif /* _LUSTRE_REQ_LAYOUT_H__ */ diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre_sec.h b/kernel/drivers/staging/lustre/lustre/include/lustre_sec.h new file mode 100644 index 000000000..dff70a5b9 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre_sec.h @@ -0,0 +1,1147 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _LUSTRE_SEC_H_ +#define _LUSTRE_SEC_H_ + +/** \defgroup sptlrpc sptlrpc + * + * @{ + */ + +/* + * to avoid include + */ +struct obd_import; +struct obd_export; +struct ptlrpc_request; +struct ptlrpc_reply_state; +struct ptlrpc_bulk_desc; +struct brw_page; +/* Linux specific */ +struct key; +struct seq_file; + +/* + * forward declaration + */ +struct ptlrpc_sec_policy; +struct ptlrpc_sec_cops; +struct ptlrpc_sec_sops; +struct ptlrpc_sec; +struct ptlrpc_svc_ctx; +struct ptlrpc_cli_ctx; +struct ptlrpc_ctx_ops; + +/** + * \addtogroup flavor flavor + * + * RPC flavor is represented by a 32 bits integer. Currently the high 12 bits + * are unused, must be set to 0 for future expansion. + *
+ * ------------------------------------------------------------------------
+ * | 4b (bulk svc) | 4b (bulk type) | 4b (svc) | 4b (mech) | 4b (policy) |
+ * ------------------------------------------------------------------------
+ * 
+ * + * @{ + */ + +/* + * flavor constants + */ +enum sptlrpc_policy { + SPTLRPC_POLICY_NULL = 0, + SPTLRPC_POLICY_PLAIN = 1, + SPTLRPC_POLICY_GSS = 2, + SPTLRPC_POLICY_MAX, +}; + +enum sptlrpc_mech_null { + SPTLRPC_MECH_NULL = 0, + SPTLRPC_MECH_NULL_MAX, +}; + +enum sptlrpc_mech_plain { + SPTLRPC_MECH_PLAIN = 0, + SPTLRPC_MECH_PLAIN_MAX, +}; + +enum sptlrpc_mech_gss { + SPTLRPC_MECH_GSS_NULL = 0, + SPTLRPC_MECH_GSS_KRB5 = 1, + SPTLRPC_MECH_GSS_MAX, +}; + +enum sptlrpc_service_type { + SPTLRPC_SVC_NULL = 0, /**< no security */ + SPTLRPC_SVC_AUTH = 1, /**< authentication only */ + SPTLRPC_SVC_INTG = 2, /**< integrity */ + SPTLRPC_SVC_PRIV = 3, /**< privacy */ + SPTLRPC_SVC_MAX, +}; + +enum sptlrpc_bulk_type { + SPTLRPC_BULK_DEFAULT = 0, /**< follow rpc flavor */ + SPTLRPC_BULK_HASH = 1, /**< hash integrity */ + SPTLRPC_BULK_MAX, +}; + +enum sptlrpc_bulk_service { + SPTLRPC_BULK_SVC_NULL = 0, /**< no security */ + SPTLRPC_BULK_SVC_AUTH = 1, /**< authentication only */ + SPTLRPC_BULK_SVC_INTG = 2, /**< integrity */ + SPTLRPC_BULK_SVC_PRIV = 3, /**< privacy */ + SPTLRPC_BULK_SVC_MAX, +}; + +/* + * compose/extract macros + */ +#define FLVR_POLICY_OFFSET (0) +#define FLVR_MECH_OFFSET (4) +#define FLVR_SVC_OFFSET (8) +#define FLVR_BULK_TYPE_OFFSET (12) +#define FLVR_BULK_SVC_OFFSET (16) + +#define MAKE_FLVR(policy, mech, svc, btype, bsvc) \ + (((__u32)(policy) << FLVR_POLICY_OFFSET) | \ + ((__u32)(mech) << FLVR_MECH_OFFSET) | \ + ((__u32)(svc) << FLVR_SVC_OFFSET) | \ + ((__u32)(btype) << FLVR_BULK_TYPE_OFFSET) | \ + ((__u32)(bsvc) << FLVR_BULK_SVC_OFFSET)) + +/* + * extraction + */ +#define SPTLRPC_FLVR_POLICY(flavor) \ + ((((__u32)(flavor)) >> FLVR_POLICY_OFFSET) & 0xF) +#define SPTLRPC_FLVR_MECH(flavor) \ + ((((__u32)(flavor)) >> FLVR_MECH_OFFSET) & 0xF) +#define SPTLRPC_FLVR_SVC(flavor) \ + ((((__u32)(flavor)) >> FLVR_SVC_OFFSET) & 0xF) +#define SPTLRPC_FLVR_BULK_TYPE(flavor) \ + ((((__u32)(flavor)) >> FLVR_BULK_TYPE_OFFSET) & 0xF) +#define SPTLRPC_FLVR_BULK_SVC(flavor) \ + ((((__u32)(flavor)) >> FLVR_BULK_SVC_OFFSET) & 0xF) + +#define SPTLRPC_FLVR_BASE(flavor) \ + ((((__u32)(flavor)) >> FLVR_POLICY_OFFSET) & 0xFFF) +#define SPTLRPC_FLVR_BASE_SUB(flavor) \ + ((((__u32)(flavor)) >> FLVR_MECH_OFFSET) & 0xFF) + +/* + * gss subflavors + */ +#define MAKE_BASE_SUBFLVR(mech, svc) \ + ((__u32)(mech) | \ + ((__u32)(svc) << (FLVR_SVC_OFFSET - FLVR_MECH_OFFSET))) + +#define SPTLRPC_SUBFLVR_KRB5N \ + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_NULL) +#define SPTLRPC_SUBFLVR_KRB5A \ + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_AUTH) +#define SPTLRPC_SUBFLVR_KRB5I \ + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_INTG) +#define SPTLRPC_SUBFLVR_KRB5P \ + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_PRIV) + +/* + * "end user" flavors + */ +#define SPTLRPC_FLVR_NULL \ + MAKE_FLVR(SPTLRPC_POLICY_NULL, \ + SPTLRPC_MECH_NULL, \ + SPTLRPC_SVC_NULL, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_NULL) +#define SPTLRPC_FLVR_PLAIN \ + MAKE_FLVR(SPTLRPC_POLICY_PLAIN, \ + SPTLRPC_MECH_PLAIN, \ + SPTLRPC_SVC_NULL, \ + SPTLRPC_BULK_HASH, \ + SPTLRPC_BULK_SVC_INTG) +#define SPTLRPC_FLVR_KRB5N \ + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_KRB5, \ + SPTLRPC_SVC_NULL, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_NULL) +#define SPTLRPC_FLVR_KRB5A \ + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_KRB5, \ + SPTLRPC_SVC_AUTH, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_NULL) +#define SPTLRPC_FLVR_KRB5I \ + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_KRB5, \ + 
SPTLRPC_SVC_INTG, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_INTG) +#define SPTLRPC_FLVR_KRB5P \ + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_KRB5, \ + SPTLRPC_SVC_PRIV, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_PRIV) + +#define SPTLRPC_FLVR_DEFAULT SPTLRPC_FLVR_NULL + +#define SPTLRPC_FLVR_INVALID ((__u32) 0xFFFFFFFF) +#define SPTLRPC_FLVR_ANY ((__u32) 0xFFF00000) + +/** + * extract the useful part from wire flavor + */ +#define WIRE_FLVR(wflvr) (((__u32) (wflvr)) & 0x000FFFFF) + +/** @} flavor */ + +static inline void flvr_set_svc(__u32 *flvr, __u32 svc) +{ + LASSERT(svc < SPTLRPC_SVC_MAX); + *flvr = MAKE_FLVR(SPTLRPC_FLVR_POLICY(*flvr), + SPTLRPC_FLVR_MECH(*flvr), + svc, + SPTLRPC_FLVR_BULK_TYPE(*flvr), + SPTLRPC_FLVR_BULK_SVC(*flvr)); +} + +static inline void flvr_set_bulk_svc(__u32 *flvr, __u32 svc) +{ + LASSERT(svc < SPTLRPC_BULK_SVC_MAX); + *flvr = MAKE_FLVR(SPTLRPC_FLVR_POLICY(*flvr), + SPTLRPC_FLVR_MECH(*flvr), + SPTLRPC_FLVR_SVC(*flvr), + SPTLRPC_FLVR_BULK_TYPE(*flvr), + svc); +} + +struct bulk_spec_hash { + __u8 hash_alg; +}; + +/** + * Full description of flavors being used on a ptlrpc connection, include + * both regular RPC and bulk transfer parts. + */ +struct sptlrpc_flavor { + /** + * wire flavor, should be renamed to sf_wire. + */ + __u32 sf_rpc; + /** + * general flags of PTLRPC_SEC_FL_* + */ + __u32 sf_flags; + /** + * rpc flavor specification + */ + union { + /* nothing for now */ + } u_rpc; + /** + * bulk flavor specification + */ + union { + struct bulk_spec_hash hash; + } u_bulk; +}; + +/** + * identify the RPC is generated from what part of Lustre. It's encoded into + * RPC requests and to be checked by ptlrpc service. + */ +enum lustre_sec_part { + LUSTRE_SP_CLI = 0, + LUSTRE_SP_MDT, + LUSTRE_SP_OST, + LUSTRE_SP_MGC, + LUSTRE_SP_MGS, + LUSTRE_SP_ANY = 0xFF +}; + +const char *sptlrpc_part2name(enum lustre_sec_part sp); +enum lustre_sec_part sptlrpc_target_sec_part(struct obd_device *obd); + +/** + * A rule specifies a flavor to be used by a ptlrpc connection between + * two Lustre parts. + */ +struct sptlrpc_rule { + __u32 sr_netid; /* LNET network ID */ + __u8 sr_from; /* sec_part */ + __u8 sr_to; /* sec_part */ + __u16 sr_padding; + struct sptlrpc_flavor sr_flvr; +}; + +/** + * A set of rules in memory. + * + * Rules are generated and stored on MGS, and propagated to MDT, OST, + * and client when needed. 
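A stand-alone illustration of the flavor encoding defined above: the same bit offsets and a local copy of the MAKE_FLVR()/extraction logic, composing the krb5i combination (GSS policy 2, krb5 mech 1, integrity svc 2, default bulk type 0, integrity bulk svc 2, values taken from the enums above) and pulling the fields back out. uint32_t stands in for __u32 so the sketch compiles in user space; it is a demonstration of the layout, not the kernel code itself.

#include <stdint.h>
#include <stdio.h>

/* offsets copied from the header above */
#define FLVR_POLICY_OFFSET      0
#define FLVR_MECH_OFFSET        4
#define FLVR_SVC_OFFSET         8
#define FLVR_BULK_TYPE_OFFSET   12
#define FLVR_BULK_SVC_OFFSET    16

#define MAKE_FLVR(policy, mech, svc, btype, bsvc)               \
        (((uint32_t)(policy) << FLVR_POLICY_OFFSET) |           \
         ((uint32_t)(mech)   << FLVR_MECH_OFFSET)   |           \
         ((uint32_t)(svc)    << FLVR_SVC_OFFSET)    |           \
         ((uint32_t)(btype)  << FLVR_BULK_TYPE_OFFSET) |        \
         ((uint32_t)(bsvc)   << FLVR_BULK_SVC_OFFSET))

/* each field is 4 bits wide, as in the SPTLRPC_FLVR_*() extractors */
#define FLVR_FIELD(flavor, off) \
        ((unsigned int)(((uint32_t)(flavor) >> (off)) & 0xFU))

int main(void)
{
        uint32_t flvr = MAKE_FLVR(2, 1, 2, 0, 2);       /* krb5i */

        printf("wire flavor: 0x%05x\n", (unsigned int)flvr); /* 0x20212 */
        printf("policy=%u mech=%u svc=%u btype=%u bsvc=%u\n",
               FLVR_FIELD(flvr, FLVR_POLICY_OFFSET),
               FLVR_FIELD(flvr, FLVR_MECH_OFFSET),
               FLVR_FIELD(flvr, FLVR_SVC_OFFSET),
               FLVR_FIELD(flvr, FLVR_BULK_TYPE_OFFSET),
               FLVR_FIELD(flvr, FLVR_BULK_SVC_OFFSET));
        return 0;
}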
+ */ +struct sptlrpc_rule_set { + int srs_nslot; + int srs_nrule; + struct sptlrpc_rule *srs_rules; +}; + +int sptlrpc_parse_flavor(const char *str, struct sptlrpc_flavor *flvr); +int sptlrpc_flavor_has_bulk(struct sptlrpc_flavor *flvr); + +static inline void sptlrpc_rule_set_init(struct sptlrpc_rule_set *set) +{ + memset(set, 0, sizeof(*set)); +} + +void sptlrpc_rule_set_free(struct sptlrpc_rule_set *set); +int sptlrpc_rule_set_expand(struct sptlrpc_rule_set *set); +int sptlrpc_rule_set_merge(struct sptlrpc_rule_set *set, + struct sptlrpc_rule *rule); +int sptlrpc_rule_set_choose(struct sptlrpc_rule_set *rset, + enum lustre_sec_part from, + enum lustre_sec_part to, + lnet_nid_t nid, + struct sptlrpc_flavor *sf); +void sptlrpc_rule_set_dump(struct sptlrpc_rule_set *set); + +int sptlrpc_process_config(struct lustre_cfg *lcfg); +void sptlrpc_conf_log_start(const char *logname); +void sptlrpc_conf_log_stop(const char *logname); +void sptlrpc_conf_log_update_begin(const char *logname); +void sptlrpc_conf_log_update_end(const char *logname); +void sptlrpc_conf_client_adapt(struct obd_device *obd); +void sptlrpc_target_choose_flavor(struct sptlrpc_rule_set *rset, + enum lustre_sec_part from, + lnet_nid_t nid, + struct sptlrpc_flavor *flavor); + +/* The maximum length of security payload. 1024 is enough for Kerberos 5, + * and should be enough for other future mechanisms but not sure. + * Only used by pre-allocated request/reply pool. + */ +#define SPTLRPC_MAX_PAYLOAD (1024) + + +struct vfs_cred { + uint32_t vc_uid; + uint32_t vc_gid; +}; + +struct ptlrpc_ctx_ops { + /** + * To determine whether it's suitable to use the \a ctx for \a vcred. + */ + int (*match) (struct ptlrpc_cli_ctx *ctx, + struct vfs_cred *vcred); + + /** + * To bring the \a ctx uptodate. + */ + int (*refresh) (struct ptlrpc_cli_ctx *ctx); + + /** + * Validate the \a ctx. + */ + int (*validate) (struct ptlrpc_cli_ctx *ctx); + + /** + * Force the \a ctx to die. + */ + void (*force_die) (struct ptlrpc_cli_ctx *ctx, + int grace); + int (*display) (struct ptlrpc_cli_ctx *ctx, + char *buf, int bufsize); + + /** + * Sign the request message using \a ctx. + * + * \pre req->rq_reqmsg point to request message. + * \pre req->rq_reqlen is the request message length. + * \post req->rq_reqbuf point to request message with signature. + * \post req->rq_reqdata_len is set to the final request message size. + * + * \see null_ctx_sign(), plain_ctx_sign(), gss_cli_ctx_sign(). + */ + int (*sign) (struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req); + + /** + * Verify the reply message using \a ctx. + * + * \pre req->rq_repdata point to reply message with signature. + * \pre req->rq_repdata_len is the total reply message length. + * \post req->rq_repmsg point to reply message without signature. + * \post req->rq_replen is the reply message length. + * + * \see null_ctx_verify(), plain_ctx_verify(), gss_cli_ctx_verify(). + */ + int (*verify) (struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req); + + /** + * Encrypt the request message using \a ctx. + * + * \pre req->rq_reqmsg point to request message in clear text. + * \pre req->rq_reqlen is the request message length. + * \post req->rq_reqbuf point to request message. + * \post req->rq_reqdata_len is set to the final request message size. + * + * \see gss_cli_ctx_seal(). + */ + int (*seal) (struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req); + + /** + * Decrypt the reply message using \a ctx. + * + * \pre req->rq_repdata point to encrypted reply message. 
+ * \pre req->rq_repdata_len is the total cipher text length. + * \post req->rq_repmsg point to reply message in clear text. + * \post req->rq_replen is the reply message length in clear text. + * + * \see gss_cli_ctx_unseal(). + */ + int (*unseal) (struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req); + + /** + * Wrap bulk request data. This is called before wrapping RPC + * request message. + * + * \pre bulk buffer is descripted by desc->bd_iov and + * desc->bd_iov_count. note for read it's just buffer, no data + * need to be sent; for write it contains data in clear text. + * \post when necessary, ptlrpc_bulk_sec_desc was properly prepared + * (usually inside of RPC request message). + * - encryption: cipher text bulk buffer is descripted by + * desc->bd_enc_iov and desc->bd_iov_count (currently assume iov + * count remains the same). + * - otherwise: bulk buffer is still desc->bd_iov and + * desc->bd_iov_count. + * + * \return 0: success. + * \return -ev: error code. + * + * \see plain_cli_wrap_bulk(), gss_cli_ctx_wrap_bulk(). + */ + int (*wrap_bulk) (struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); + + /** + * Unwrap bulk reply data. This is called after wrapping RPC + * reply message. + * + * \pre bulk buffer is descripted by desc->bd_iov/desc->bd_enc_iov and + * desc->bd_iov_count, according to wrap_bulk(). + * \post final bulk data in clear text is placed in buffer described + * by desc->bd_iov and desc->bd_iov_count. + * \return +ve nob of actual bulk data in clear text. + * \return -ve error code. + * + * \see plain_cli_unwrap_bulk(), gss_cli_ctx_unwrap_bulk(). + */ + int (*unwrap_bulk) (struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +}; + +#define PTLRPC_CTX_NEW_BIT (0) /* newly created */ +#define PTLRPC_CTX_UPTODATE_BIT (1) /* uptodate */ +#define PTLRPC_CTX_DEAD_BIT (2) /* mark expired gracefully */ +#define PTLRPC_CTX_ERROR_BIT (3) /* fatal error (refresh, etc.) */ +#define PTLRPC_CTX_CACHED_BIT (8) /* in ctx cache (hash etc.) */ +#define PTLRPC_CTX_ETERNAL_BIT (9) /* always valid */ + +#define PTLRPC_CTX_NEW (1 << PTLRPC_CTX_NEW_BIT) +#define PTLRPC_CTX_UPTODATE (1 << PTLRPC_CTX_UPTODATE_BIT) +#define PTLRPC_CTX_DEAD (1 << PTLRPC_CTX_DEAD_BIT) +#define PTLRPC_CTX_ERROR (1 << PTLRPC_CTX_ERROR_BIT) +#define PTLRPC_CTX_CACHED (1 << PTLRPC_CTX_CACHED_BIT) +#define PTLRPC_CTX_ETERNAL (1 << PTLRPC_CTX_ETERNAL_BIT) + +#define PTLRPC_CTX_STATUS_MASK (PTLRPC_CTX_NEW_BIT | \ + PTLRPC_CTX_UPTODATE | \ + PTLRPC_CTX_DEAD | \ + PTLRPC_CTX_ERROR) + +struct ptlrpc_cli_ctx { + struct hlist_node cc_cache; /* linked into ctx cache */ + atomic_t cc_refcount; + struct ptlrpc_sec *cc_sec; + struct ptlrpc_ctx_ops *cc_ops; + unsigned long cc_expire; /* in seconds */ + unsigned int cc_early_expire:1; + unsigned long cc_flags; + struct vfs_cred cc_vcred; + spinlock_t cc_lock; + struct list_head cc_req_list; /* waiting reqs linked here */ + struct list_head cc_gc_chain; /* linked to gc chain */ +}; + +/** + * client side policy operation vector. + */ +struct ptlrpc_sec_cops { + /** + * Given an \a imp, create and initialize a ptlrpc_sec structure. + * \param ctx service context: + * - regular import: \a ctx should be NULL; + * - reverse import: \a ctx is obtained from incoming request. + * \param flavor specify what flavor to use. + * + * When necessary, policy module is responsible for taking reference + * on the import. + * + * \see null_create_sec(), plain_create_sec(), gss_sec_create_kr(). 
+ */ + struct ptlrpc_sec * (*create_sec) (struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx, + struct sptlrpc_flavor *flavor); + + /** + * Destructor of ptlrpc_sec. When called, refcount has been dropped + * to 0 and all contexts has been destroyed. + * + * \see null_destroy_sec(), plain_destroy_sec(), gss_sec_destroy_kr(). + */ + void (*destroy_sec) (struct ptlrpc_sec *sec); + + /** + * Notify that this ptlrpc_sec is going to die. Optionally, policy + * module is supposed to set sec->ps_dying and whatever necessary + * actions. + * + * \see plain_kill_sec(), gss_sec_kill(). + */ + void (*kill_sec) (struct ptlrpc_sec *sec); + + /** + * Given \a vcred, lookup and/or create its context. The policy module + * is supposed to maintain its own context cache. + * XXX currently \a create and \a remove_dead is always 1, perhaps + * should be removed completely. + * + * \see null_lookup_ctx(), plain_lookup_ctx(), gss_sec_lookup_ctx_kr(). + */ + struct ptlrpc_cli_ctx * (*lookup_ctx) (struct ptlrpc_sec *sec, + struct vfs_cred *vcred, + int create, + int remove_dead); + + /** + * Called then the reference of \a ctx dropped to 0. The policy module + * is supposed to destroy this context or whatever else according to + * its cache maintenance mechanism. + * + * \param sync if zero, we shouldn't wait for the context being + * destroyed completely. + * + * \see plain_release_ctx(), gss_sec_release_ctx_kr(). + */ + void (*release_ctx) (struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx, + int sync); + + /** + * Flush the context cache. + * + * \param uid context of which user, -1 means all contexts. + * \param grace if zero, the PTLRPC_CTX_UPTODATE_BIT of affected + * contexts should be cleared immediately. + * \param force if zero, only idle contexts will be flushed. + * + * \see plain_flush_ctx_cache(), gss_sec_flush_ctx_cache_kr(). + */ + int (*flush_ctx_cache) + (struct ptlrpc_sec *sec, + uid_t uid, + int grace, + int force); + + /** + * Called periodically by garbage collector to remove dead contexts + * from cache. + * + * \see gss_sec_gc_ctx_kr(). + */ + void (*gc_ctx) (struct ptlrpc_sec *sec); + + /** + * Given an context \a ctx, install a corresponding reverse service + * context on client side. + * XXX currently it's only used by GSS module, maybe we should remove + * this from general API. + */ + int (*install_rctx)(struct obd_import *imp, + struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx); + + /** + * To allocate request buffer for \a req. + * + * \pre req->rq_reqmsg == NULL. + * \pre req->rq_reqbuf == NULL, otherwise it must be pre-allocated, + * we are not supposed to free it. + * \post if success, req->rq_reqmsg point to a buffer with size + * at least \a lustre_msg_size. + * + * \see null_alloc_reqbuf(), plain_alloc_reqbuf(), gss_alloc_reqbuf(). + */ + int (*alloc_reqbuf)(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int lustre_msg_size); + + /** + * To free request buffer for \a req. + * + * \pre req->rq_reqbuf != NULL. + * + * \see null_free_reqbuf(), plain_free_reqbuf(), gss_free_reqbuf(). + */ + void (*free_reqbuf) (struct ptlrpc_sec *sec, + struct ptlrpc_request *req); + + /** + * To allocate reply buffer for \a req. + * + * \pre req->rq_repbuf == NULL. + * \post if success, req->rq_repbuf point to a buffer with size + * req->rq_repbuf_len, the size should be large enough to receive + * reply which be transformed from \a lustre_msg_size of clear text. + * + * \see null_alloc_repbuf(), plain_alloc_repbuf(), gss_alloc_repbuf(). 
+ */ + int (*alloc_repbuf)(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int lustre_msg_size); + + /** + * To free reply buffer for \a req. + * + * \pre req->rq_repbuf != NULL. + * \post req->rq_repbuf == NULL. + * \post req->rq_repbuf_len == 0. + * + * \see null_free_repbuf(), plain_free_repbuf(), gss_free_repbuf(). + */ + void (*free_repbuf) (struct ptlrpc_sec *sec, + struct ptlrpc_request *req); + + /** + * To expand the request buffer of \a req, thus the \a segment in + * the request message pointed by req->rq_reqmsg can accommodate + * at least \a newsize of data. + * + * \pre req->rq_reqmsg->lm_buflens[segment] < newsize. + * + * \see null_enlarge_reqbuf(), plain_enlarge_reqbuf(), + * gss_enlarge_reqbuf(). + */ + int (*enlarge_reqbuf) + (struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int segment, int newsize); + /* + * misc + */ + int (*display) (struct ptlrpc_sec *sec, + struct seq_file *seq); +}; + +/** + * server side policy operation vector. + */ +struct ptlrpc_sec_sops { + /** + * verify an incoming request. + * + * \pre request message is pointed by req->rq_reqbuf, size is + * req->rq_reqdata_len; and the message has been unpacked to + * host byte order. + * + * \retval SECSVC_OK success, req->rq_reqmsg point to request message + * in clear text, size is req->rq_reqlen; req->rq_svc_ctx is set; + * req->rq_sp_from is decoded from request. + * \retval SECSVC_COMPLETE success, the request has been fully + * processed, and reply message has been prepared; req->rq_sp_from is + * decoded from request. + * \retval SECSVC_DROP failed, this request should be dropped. + * + * \see null_accept(), plain_accept(), gss_svc_accept_kr(). + */ + int (*accept) (struct ptlrpc_request *req); + + /** + * Perform security transformation upon reply message. + * + * \pre reply message is pointed by req->rq_reply_state->rs_msg, size + * is req->rq_replen. + * \post req->rs_repdata_len is the final message size. + * \post req->rq_reply_off is set. + * + * \see null_authorize(), plain_authorize(), gss_svc_authorize(). + */ + int (*authorize) (struct ptlrpc_request *req); + + /** + * Invalidate server context \a ctx. + * + * \see gss_svc_invalidate_ctx(). + */ + void (*invalidate_ctx) + (struct ptlrpc_svc_ctx *ctx); + + /** + * Allocate a ptlrpc_reply_state. + * + * \param msgsize size of the reply message in clear text. + * \pre if req->rq_reply_state != NULL, then it's pre-allocated, we + * should simply use it; otherwise we'll responsible for allocating + * a new one. + * \post req->rq_reply_state != NULL; + * \post req->rq_reply_state->rs_msg != NULL; + * + * \see null_alloc_rs(), plain_alloc_rs(), gss_svc_alloc_rs(). + */ + int (*alloc_rs) (struct ptlrpc_request *req, + int msgsize); + + /** + * Free a ptlrpc_reply_state. + */ + void (*free_rs) (struct ptlrpc_reply_state *rs); + + /** + * Release the server context \a ctx. + * + * \see gss_svc_free_ctx(). + */ + void (*free_ctx) (struct ptlrpc_svc_ctx *ctx); + + /** + * Install a reverse context based on the server context \a ctx. + * + * \see gss_svc_install_rctx_kr(). + */ + int (*install_rctx)(struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx); + + /** + * Prepare buffer for incoming bulk write. + * + * \pre desc->bd_iov and desc->bd_iov_count describes the buffer + * intended to receive the write. + * + * \see gss_svc_prep_bulk(). + */ + int (*prep_bulk) (struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); + + /** + * Unwrap the bulk write data. + * + * \see plain_svc_unwrap_bulk(), gss_svc_unwrap_bulk(). 
+ */ + int (*unwrap_bulk) (struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); + + /** + * Wrap the bulk read data. + * + * \see plain_svc_wrap_bulk(), gss_svc_wrap_bulk(). + */ + int (*wrap_bulk) (struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +}; + +struct ptlrpc_sec_policy { + struct module *sp_owner; + char *sp_name; + __u16 sp_policy; /* policy number */ + struct ptlrpc_sec_cops *sp_cops; /* client ops */ + struct ptlrpc_sec_sops *sp_sops; /* server ops */ +}; + +#define PTLRPC_SEC_FL_REVERSE 0x0001 /* reverse sec */ +#define PTLRPC_SEC_FL_ROOTONLY 0x0002 /* treat everyone as root */ +#define PTLRPC_SEC_FL_UDESC 0x0004 /* ship udesc */ +#define PTLRPC_SEC_FL_BULK 0x0008 /* intensive bulk i/o expected */ +#define PTLRPC_SEC_FL_PAG 0x0010 /* PAG mode */ + +/** + * The ptlrpc_sec represents the client side ptlrpc security facilities, + * each obd_import (both regular and reverse import) must associate with + * a ptlrpc_sec. + * + * \see sptlrpc_import_sec_adapt(). + */ +struct ptlrpc_sec { + struct ptlrpc_sec_policy *ps_policy; + atomic_t ps_refcount; + /** statistic only */ + atomic_t ps_nctx; + /** unique identifier */ + int ps_id; + struct sptlrpc_flavor ps_flvr; + enum lustre_sec_part ps_part; + /** after set, no more new context will be created */ + unsigned int ps_dying:1; + /** owning import */ + struct obd_import *ps_import; + spinlock_t ps_lock; + + /* + * garbage collection + */ + struct list_head ps_gc_list; + unsigned long ps_gc_interval; /* in seconds */ + unsigned long ps_gc_next; /* in seconds */ +}; + +static inline int sec_is_reverse(struct ptlrpc_sec *sec) +{ + return (sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_REVERSE); +} + +static inline int sec_is_rootonly(struct ptlrpc_sec *sec) +{ + return (sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_ROOTONLY); +} + + +struct ptlrpc_svc_ctx { + atomic_t sc_refcount; + struct ptlrpc_sec_policy *sc_policy; +}; + +/* + * user identity descriptor + */ +#define LUSTRE_MAX_GROUPS (128) + +struct ptlrpc_user_desc { + __u32 pud_uid; + __u32 pud_gid; + __u32 pud_fsuid; + __u32 pud_fsgid; + __u32 pud_cap; + __u32 pud_ngroups; + __u32 pud_groups[0]; +}; + +/* + * bulk flavors + */ +enum sptlrpc_bulk_hash_alg { + BULK_HASH_ALG_NULL = 0, + BULK_HASH_ALG_ADLER32, + BULK_HASH_ALG_CRC32, + BULK_HASH_ALG_MD5, + BULK_HASH_ALG_SHA1, + BULK_HASH_ALG_SHA256, + BULK_HASH_ALG_SHA384, + BULK_HASH_ALG_SHA512, + BULK_HASH_ALG_MAX +}; + +const char *sptlrpc_get_hash_name(__u8 hash_alg); +__u8 sptlrpc_get_hash_alg(const char *algname); + +enum { + BSD_FL_ERR = 1, +}; + +struct ptlrpc_bulk_sec_desc { + __u8 bsd_version; /* 0 */ + __u8 bsd_type; /* SPTLRPC_BULK_XXX */ + __u8 bsd_svc; /* SPTLRPC_BULK_SVC_XXXX */ + __u8 bsd_flags; /* flags */ + __u32 bsd_nob; /* nob of bulk data */ + __u8 bsd_data[0]; /* policy-specific token */ +}; + + +/* + * round size up to next power of 2, for slab allocation. 
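+ * e.g. size_roundup_power2(3000) returns 4096, and a value that is
+ * already a power of two (such as 4096) is returned unchanged.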
+ * @size must be sane (can't overflow after round up) + */ +static inline int size_roundup_power2(int size) +{ + size--; + size |= size >> 1; + size |= size >> 2; + size |= size >> 4; + size |= size >> 8; + size |= size >> 16; + size++; + return size; +} + +/* + * internal support libraries + */ +void _sptlrpc_enlarge_msg_inplace(struct lustre_msg *msg, + int segment, int newsize); + +/* + * security policies + */ +int sptlrpc_register_policy(struct ptlrpc_sec_policy *policy); +int sptlrpc_unregister_policy(struct ptlrpc_sec_policy *policy); + +__u32 sptlrpc_name2flavor_base(const char *name); +const char *sptlrpc_flavor2name_base(__u32 flvr); +char *sptlrpc_flavor2name_bulk(struct sptlrpc_flavor *sf, + char *buf, int bufsize); +char *sptlrpc_flavor2name(struct sptlrpc_flavor *sf, char *buf, int bufsize); +char *sptlrpc_secflags2str(__u32 flags, char *buf, int bufsize); + +static inline +struct ptlrpc_sec_policy *sptlrpc_policy_get(struct ptlrpc_sec_policy *policy) +{ + __module_get(policy->sp_owner); + return policy; +} + +static inline +void sptlrpc_policy_put(struct ptlrpc_sec_policy *policy) +{ + module_put(policy->sp_owner); +} + +/* + * client credential + */ +static inline +unsigned long cli_ctx_status(struct ptlrpc_cli_ctx *ctx) +{ + return (ctx->cc_flags & PTLRPC_CTX_STATUS_MASK); +} + +static inline +int cli_ctx_is_ready(struct ptlrpc_cli_ctx *ctx) +{ + return (cli_ctx_status(ctx) == PTLRPC_CTX_UPTODATE); +} + +static inline +int cli_ctx_is_refreshed(struct ptlrpc_cli_ctx *ctx) +{ + return (cli_ctx_status(ctx) != 0); +} + +static inline +int cli_ctx_is_uptodate(struct ptlrpc_cli_ctx *ctx) +{ + return ((ctx->cc_flags & PTLRPC_CTX_UPTODATE) != 0); +} + +static inline +int cli_ctx_is_error(struct ptlrpc_cli_ctx *ctx) +{ + return ((ctx->cc_flags & PTLRPC_CTX_ERROR) != 0); +} + +static inline +int cli_ctx_is_dead(struct ptlrpc_cli_ctx *ctx) +{ + return ((ctx->cc_flags & (PTLRPC_CTX_DEAD | PTLRPC_CTX_ERROR)) != 0); +} + +static inline +int cli_ctx_is_eternal(struct ptlrpc_cli_ctx *ctx) +{ + return ((ctx->cc_flags & PTLRPC_CTX_ETERNAL) != 0); +} + +/* + * sec get/put + */ +struct ptlrpc_sec *sptlrpc_sec_get(struct ptlrpc_sec *sec); +void sptlrpc_sec_put(struct ptlrpc_sec *sec); + +/* + * internal apis which only used by policy implementation + */ +int sptlrpc_get_next_secid(void); +void sptlrpc_sec_destroy(struct ptlrpc_sec *sec); + +/* + * exported client context api + */ +struct ptlrpc_cli_ctx *sptlrpc_cli_ctx_get(struct ptlrpc_cli_ctx *ctx); +void sptlrpc_cli_ctx_put(struct ptlrpc_cli_ctx *ctx, int sync); +void sptlrpc_cli_ctx_expire(struct ptlrpc_cli_ctx *ctx); +void sptlrpc_cli_ctx_wakeup(struct ptlrpc_cli_ctx *ctx); +int sptlrpc_cli_ctx_display(struct ptlrpc_cli_ctx *ctx, char *buf, int bufsize); + +/* + * exported client context wrap/buffers + */ +int sptlrpc_cli_wrap_request(struct ptlrpc_request *req); +int sptlrpc_cli_unwrap_reply(struct ptlrpc_request *req); +int sptlrpc_cli_alloc_reqbuf(struct ptlrpc_request *req, int msgsize); +void sptlrpc_cli_free_reqbuf(struct ptlrpc_request *req); +int sptlrpc_cli_alloc_repbuf(struct ptlrpc_request *req, int msgsize); +void sptlrpc_cli_free_repbuf(struct ptlrpc_request *req); +int sptlrpc_cli_enlarge_reqbuf(struct ptlrpc_request *req, + int segment, int newsize); +int sptlrpc_cli_unwrap_early_reply(struct ptlrpc_request *req, + struct ptlrpc_request **req_ret); +void sptlrpc_cli_finish_early_reply(struct ptlrpc_request *early_req); + +void sptlrpc_request_out_callback(struct ptlrpc_request *req); + +/* + * exported higher interface of 
import & request + */ +int sptlrpc_import_sec_adapt(struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx, + struct sptlrpc_flavor *flvr); +struct ptlrpc_sec *sptlrpc_import_sec_ref(struct obd_import *imp); +void sptlrpc_import_sec_put(struct obd_import *imp); + +int sptlrpc_import_check_ctx(struct obd_import *imp); +void sptlrpc_import_flush_root_ctx(struct obd_import *imp); +void sptlrpc_import_flush_my_ctx(struct obd_import *imp); +void sptlrpc_import_flush_all_ctx(struct obd_import *imp); +int sptlrpc_req_get_ctx(struct ptlrpc_request *req); +void sptlrpc_req_put_ctx(struct ptlrpc_request *req, int sync); +int sptlrpc_req_refresh_ctx(struct ptlrpc_request *req, long timeout); +int sptlrpc_req_replace_dead_ctx(struct ptlrpc_request *req); +void sptlrpc_req_set_flavor(struct ptlrpc_request *req, int opcode); + +int sptlrpc_parse_rule(char *param, struct sptlrpc_rule *rule); + +/* gc */ +void sptlrpc_gc_add_sec(struct ptlrpc_sec *sec); +void sptlrpc_gc_del_sec(struct ptlrpc_sec *sec); +void sptlrpc_gc_add_ctx(struct ptlrpc_cli_ctx *ctx); + +/* misc */ +const char *sec2target_str(struct ptlrpc_sec *sec); +/* + * lprocfs + */ +#if defined (CONFIG_PROC_FS) +struct proc_dir_entry; +extern struct proc_dir_entry *sptlrpc_proc_root; +int sptlrpc_lprocfs_cliobd_attach(struct obd_device *dev); +#else +#define sptlrpc_proc_root NULL +static inline int sptlrpc_lprocfs_cliobd_attach(struct obd_device *dev) +{ return 0; } +#endif + +/* + * server side + */ +enum secsvc_accept_res { + SECSVC_OK = 0, + SECSVC_COMPLETE, + SECSVC_DROP, +}; + +int sptlrpc_svc_unwrap_request(struct ptlrpc_request *req); +int sptlrpc_svc_alloc_rs(struct ptlrpc_request *req, int msglen); +int sptlrpc_svc_wrap_reply(struct ptlrpc_request *req); +void sptlrpc_svc_free_rs(struct ptlrpc_reply_state *rs); +void sptlrpc_svc_ctx_addref(struct ptlrpc_request *req); +void sptlrpc_svc_ctx_decref(struct ptlrpc_request *req); +void sptlrpc_svc_ctx_invalidate(struct ptlrpc_request *req); + +int sptlrpc_target_export_check(struct obd_export *exp, + struct ptlrpc_request *req); +void sptlrpc_target_update_exp_flavor(struct obd_device *obd, + struct sptlrpc_rule_set *rset); + +/* + * reverse context + */ +int sptlrpc_svc_install_rvs_ctx(struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx); +int sptlrpc_cli_install_rvs_ctx(struct obd_import *imp, + struct ptlrpc_cli_ctx *ctx); + +/* bulk security api */ +int sptlrpc_enc_pool_add_user(void); +int sptlrpc_enc_pool_del_user(void); +int sptlrpc_enc_pool_get_pages(struct ptlrpc_bulk_desc *desc); +void sptlrpc_enc_pool_put_pages(struct ptlrpc_bulk_desc *desc); + +int sptlrpc_cli_wrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +int sptlrpc_cli_unwrap_bulk_read(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc, + int nob); +int sptlrpc_cli_unwrap_bulk_write(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); + +/* bulk helpers (internal use only by policies) */ +int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg, + void *buf, int buflen); + +int bulk_sec_desc_unpack(struct lustre_msg *msg, int offset, int swabbed); + +/* user descriptor helpers */ +static inline int sptlrpc_user_desc_size(int ngroups) +{ + return sizeof(struct ptlrpc_user_desc) + ngroups * sizeof(__u32); +} + +int sptlrpc_current_user_desc_size(void); +int sptlrpc_pack_user_desc(struct lustre_msg *msg, int offset); +int sptlrpc_unpack_user_desc(struct lustre_msg *req, int offset, int swabbed); + + +#define CFS_CAP_CHOWN_MASK (1 << CFS_CAP_CHOWN) +#define 
CFS_CAP_SYS_RESOURCE_MASK (1 << CFS_CAP_SYS_RESOURCE) + +enum { + LUSTRE_SEC_NONE = 0, + LUSTRE_SEC_REMOTE = 1, + LUSTRE_SEC_SPECIFY = 2, + LUSTRE_SEC_ALL = 3 +}; + +/** @} sptlrpc */ + +#endif /* _LUSTRE_SEC_H_ */ diff --git a/kernel/drivers/staging/lustre/lustre/include/lustre_ver.h b/kernel/drivers/staging/lustre/lustre/include/lustre_ver.h new file mode 100644 index 000000000..caa4da12f --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/lustre_ver.h @@ -0,0 +1,26 @@ +#ifndef _LUSTRE_VER_H_ +#define _LUSTRE_VER_H_ +/* This file automatically generated from lustre/include/lustre_ver.h.in, + * based on parameters in lustre/autoconf/lustre-version.ac. + * Changes made directly to this file will be lost. */ + +#define LUSTRE_MAJOR 2 +#define LUSTRE_MINOR 3 +#define LUSTRE_PATCH 64 +#define LUSTRE_FIX 0 +#define LUSTRE_VERSION_STRING "2.3.64" + +#define LUSTRE_VERSION_CODE OBD_OCD_VERSION(LUSTRE_MAJOR, \ + LUSTRE_MINOR, LUSTRE_PATCH, \ + LUSTRE_FIX) + +/* liblustre clients are only allowed to connect if their LUSTRE_FIX mismatches + * by this amount (set in lustre/autoconf/lustre-version.ac). */ +#define LUSTRE_VERSION_ALLOWED_OFFSET OBD_OCD_VERSION(0, 0, 1, 32) + +/* If lustre version of client and servers it connects to differs by more + * than this amount, client would issue a warning. + * (set in lustre/autoconf/lustre-version.ac) */ +#define LUSTRE_VERSION_OFFSET_WARN OBD_OCD_VERSION(0, 4, 0, 0) + +#endif diff --git a/kernel/drivers/staging/lustre/lustre/include/obd.h b/kernel/drivers/staging/lustre/lustre/include/obd.h new file mode 100644 index 000000000..2a88b806f --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/obd.h @@ -0,0 +1,1496 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#ifndef __OBD_H +#define __OBD_H + +#include "linux/obd.h" + +#define IOC_OSC_TYPE 'h' +#define IOC_OSC_MIN_NR 20 +#define IOC_OSC_SET_ACTIVE _IOWR(IOC_OSC_TYPE, 21, struct obd_device *) +#define IOC_OSC_MAX_NR 50 + +#define IOC_MDC_TYPE 'i' +#define IOC_MDC_MIN_NR 20 +#define IOC_MDC_MAX_NR 50 + +#include "lustre/lustre_idl.h" +#include "lustre_lib.h" +#include "lu_ref.h" +#include "lustre_export.h" +#include "lustre_fid.h" +#include "lustre_fld.h" +#include "lustre_capa.h" + +#define MAX_OBD_DEVICES 8192 + +struct osc_async_rc { + int ar_rc; + int ar_force_sync; + __u64 ar_min_xid; +}; + +struct lov_oinfo { /* per-stripe data structure */ + struct ost_id loi_oi; /* object ID/Sequence on the target OST */ + int loi_ost_idx; /* OST stripe index in lov_tgt_desc->tgts */ + int loi_ost_gen; /* generation of this loi_ost_idx */ + + unsigned long loi_kms_valid:1; + __u64 loi_kms; /* known minimum size */ + struct ost_lvb loi_lvb; + struct osc_async_rc loi_ar; +}; + +static inline void loi_kms_set(struct lov_oinfo *oinfo, __u64 kms) +{ + oinfo->loi_kms = kms; + oinfo->loi_kms_valid = 1; +} + +static inline void loi_init(struct lov_oinfo *loi) +{ +} + +struct lov_stripe_md { + atomic_t lsm_refc; + spinlock_t lsm_lock; + pid_t lsm_lock_owner; /* debugging */ + + /* maximum possible file size, might change as OSTs status changes, + * e.g. disconnected, deactivated */ + __u64 lsm_maxbytes; + struct { + /* Public members. */ + struct ost_id lw_object_oi; /* lov object id/seq */ + + /* LOV-private members start here -- only for use in lov/. */ + __u32 lw_magic; + __u32 lw_stripe_size; /* size of the stripe */ + __u32 lw_pattern; /* striping pattern (RAID0, RAID1) */ + __u16 lw_stripe_count; /* number of objects being striped over */ + __u16 lw_layout_gen; /* generation of the layout */ + char lw_pool_name[LOV_MAXPOOLNAME]; /* pool name */ + } lsm_wire; + + struct lov_oinfo *lsm_oinfo[0]; +}; + +#define lsm_oi lsm_wire.lw_object_oi +#define lsm_magic lsm_wire.lw_magic +#define lsm_layout_gen lsm_wire.lw_layout_gen +#define lsm_stripe_size lsm_wire.lw_stripe_size +#define lsm_pattern lsm_wire.lw_pattern +#define lsm_stripe_count lsm_wire.lw_stripe_count +#define lsm_pool_name lsm_wire.lw_pool_name + +static inline bool lsm_is_released(struct lov_stripe_md *lsm) +{ + return !!(lsm->lsm_pattern & LOV_PATTERN_F_RELEASED); +} + +static inline bool lsm_has_objects(struct lov_stripe_md *lsm) +{ + if (lsm == NULL) + return false; + if (lsm_is_released(lsm)) + return false; + return true; +} + +static inline int lov_stripe_md_size(unsigned int stripe_count) +{ + struct lov_stripe_md lsm; + + return sizeof(lsm) + stripe_count * sizeof(lsm.lsm_oinfo[0]); +} + +struct obd_info; + +typedef int (*obd_enqueue_update_f)(void *cookie, int rc); + +/* obd info for a particular level (lov, osc). */ +struct obd_info { + /* Lock policy. It keeps an extent which is specific for a particular + * OSC. (e.g. lov_prep_enqueue_set initialises extent of the policy, + * and osc_enqueue passes it into ldlm_lock_match & ldlm_cli_enqueue. */ + ldlm_policy_data_t oi_policy; + /* Flags used for set request specific flags: + - while lock handling, the flags obtained on the enqueue + request are set here. + - while stats, the flags used for control delay/resend. + - while setattr, the flags used for distinguish punch operation + */ + __u64 oi_flags; + /* Lock handle specific for every OSC lock. */ + struct lustre_handle *oi_lockh; + /* lsm data specific for every OSC. 
*/ + struct lov_stripe_md *oi_md; + /* obdo data specific for every OSC, if needed at all. */ + struct obdo *oi_oa; + /* statfs data specific for every OSC, if needed at all. */ + struct obd_statfs *oi_osfs; + /* An update callback which is called to update some data on upper + * level. E.g. it is used for update lsm->lsm_oinfo at every received + * request in osc level for enqueue requests. It is also possible to + * update some caller data from LOV layer if needed. */ + obd_enqueue_update_f oi_cb_up; + /* oss capability, its type is obd_capa in client to avoid copy. + * in contrary its type is lustre_capa in OSS. */ + void *oi_capa; + /* transfer jobid from ost_sync() to filter_sync()... */ + char *oi_jobid; +}; + +/* compare all relevant fields. */ +static inline int lov_stripe_md_cmp(struct lov_stripe_md *m1, + struct lov_stripe_md *m2) +{ + /* + * ->lsm_wire contains padding, but it should be zeroed out during + * allocation. + */ + return memcmp(&m1->lsm_wire, &m2->lsm_wire, sizeof(m1->lsm_wire)); +} + +static inline int lov_lum_lsm_cmp(struct lov_user_md *lum, + struct lov_stripe_md *lsm) +{ + if (lsm->lsm_magic != lum->lmm_magic) + return 1; + if ((lsm->lsm_stripe_count != 0) && (lum->lmm_stripe_count != 0) && + (lsm->lsm_stripe_count != lum->lmm_stripe_count)) + return 2; + if ((lsm->lsm_stripe_size != 0) && (lum->lmm_stripe_size != 0) && + (lsm->lsm_stripe_size != lum->lmm_stripe_size)) + return 3; + if ((lsm->lsm_pattern != 0) && (lum->lmm_pattern != 0) && + (lsm->lsm_pattern != lum->lmm_pattern)) + return 4; + if ((lsm->lsm_magic == LOV_MAGIC_V3) && + (strncmp(lsm->lsm_pool_name, + ((struct lov_user_md_v3 *)lum)->lmm_pool_name, + LOV_MAXPOOLNAME) != 0)) + return 5; + return 0; +} + +static inline int lov_lum_swab_if_needed(struct lov_user_md_v3 *lumv3, + int *lmm_magic, + struct lov_user_md *lum) +{ + if (lum && copy_from_user(lumv3, lum, sizeof(struct lov_user_md_v1))) + return -EFAULT; + + *lmm_magic = lumv3->lmm_magic; + + if (*lmm_magic == __swab32(LOV_USER_MAGIC_V1)) { + lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lumv3); + *lmm_magic = LOV_USER_MAGIC_V1; + } else if (*lmm_magic == LOV_USER_MAGIC_V3) { + if (lum && copy_from_user(lumv3, lum, sizeof(*lumv3))) + return -EFAULT; + } else if (*lmm_magic == __swab32(LOV_USER_MAGIC_V3)) { + if (lum && copy_from_user(lumv3, lum, sizeof(*lumv3))) + return -EFAULT; + lustre_swab_lov_user_md_v3(lumv3); + *lmm_magic = LOV_USER_MAGIC_V3; + } else if (*lmm_magic != LOV_USER_MAGIC_V1) { + CDEBUG(D_IOCTL, + "bad userland LOV MAGIC: %#08x != %#08x nor %#08x\n", + *lmm_magic, LOV_USER_MAGIC_V1, LOV_USER_MAGIC_V3); + return -EINVAL; + } + return 0; +} + +void lov_stripe_lock(struct lov_stripe_md *md); +void lov_stripe_unlock(struct lov_stripe_md *md); + +struct obd_type { + struct list_head typ_chain; + struct obd_ops *typ_dt_ops; + struct md_ops *typ_md_ops; + struct proc_dir_entry *typ_procroot; + char *typ_name; + int typ_refcnt; + struct lu_device_type *typ_lu; + spinlock_t obd_type_lock; +}; + +struct brw_page { + u64 off; + struct page *pg; + int count; + u32 flag; +}; + +/* llog contexts */ +enum llog_ctxt_id { + LLOG_CONFIG_ORIG_CTXT = 0, + LLOG_CONFIG_REPL_CTXT, + LLOG_MDS_OST_ORIG_CTXT, + LLOG_MDS_OST_REPL_CTXT, + LLOG_SIZE_ORIG_CTXT, + LLOG_SIZE_REPL_CTXT, + LLOG_RD1_ORIG_CTXT, + LLOG_RD1_REPL_CTXT, + LLOG_TEST_ORIG_CTXT, + LLOG_TEST_REPL_CTXT, + LLOG_LOVEA_ORIG_CTXT, + LLOG_LOVEA_REPL_CTXT, + LLOG_CHANGELOG_ORIG_CTXT, /**< changelog generation on mdd */ + LLOG_CHANGELOG_REPL_CTXT, /**< changelog access on clients */ + 
LLOG_CHANGELOG_USER_ORIG_CTXT, /**< for multiple changelog consumers */ + LLOG_AGENT_ORIG_CTXT, /**< agent requests generation on cdt */ + LLOG_MAX_CTXTS +}; + +struct timeout_item { + enum timeout_event ti_event; + unsigned long ti_timeout; + timeout_cb_t ti_cb; + void *ti_cb_data; + struct list_head ti_obd_list; + struct list_head ti_chain; +}; + +#define OSC_MAX_RIF_DEFAULT 8 +#define MDS_OSC_MAX_RIF_DEFAULT 50 +#define OSC_MAX_RIF_MAX 256 +#define OSC_MAX_DIRTY_DEFAULT (OSC_MAX_RIF_DEFAULT * 4) +#define OSC_MAX_DIRTY_MB_MAX 2048 /* arbitrary, but < MAX_LONG bytes */ +#define OSC_DEFAULT_RESENDS 10 + +/* possible values for fo_sync_lock_cancel */ +enum { + NEVER_SYNC_ON_CANCEL = 0, + BLOCKING_SYNC_ON_CANCEL = 1, + ALWAYS_SYNC_ON_CANCEL = 2, + NUM_SYNC_ON_CANCEL_STATES +}; + +#define MDC_MAX_RIF_DEFAULT 8 +#define MDC_MAX_RIF_MAX 512 + +struct mdc_rpc_lock; +struct obd_import; +struct client_obd { + struct rw_semaphore cl_sem; + struct obd_uuid cl_target_uuid; + struct obd_import *cl_import; /* ptlrpc connection state */ + int cl_conn_count; + /* max_mds_easize is purely a performance thing so we don't have to + * call obd_size_diskmd() all the time. */ + int cl_default_mds_easize; + int cl_max_mds_easize; + int cl_default_mds_cookiesize; + int cl_max_mds_cookiesize; + + enum lustre_sec_part cl_sp_me; + enum lustre_sec_part cl_sp_to; + struct sptlrpc_flavor cl_flvr_mgc; /* fixed flavor of mgc->mgs */ + + /* the grant values are protected by loi_list_lock below */ + long cl_dirty; /* all _dirty_ in bytes */ + long cl_dirty_max; /* allowed w/o rpc */ + long cl_dirty_transit; /* dirty synchronous */ + long cl_avail_grant; /* bytes of credit for ost */ + long cl_lost_grant; /* lost credits (trunc) */ + + /* since we allocate grant by blocks, we don't know how many grant will + * be used to add a page into cache. As a solution, we reserve maximum + * grant before trying to dirty a page and unreserve the rest. + * See osc_{reserve|unreserve}_grant for details. */ + long cl_reserved_grant; + struct list_head cl_cache_waiters; /* waiting for cache/grant */ + unsigned long cl_next_shrink_grant; /* jiffies */ + struct list_head cl_grant_shrink_list; /* Timeout event list */ + int cl_grant_shrink_interval; /* seconds */ + + /* A chunk is an optimal size used by osc_extent to determine + * the extent size. A chunk is max(PAGE_CACHE_SIZE, OST block size) */ + int cl_chunkbits; + int cl_chunk; + int cl_extent_tax; /* extent overhead, by bytes */ + + /* keep track of objects that have lois that contain pages which + * have been queued for async brw. this lock also protects the + * lists of osc_client_pages that hang off of the loi */ + /* + * ->cl_loi_list_lock protects consistency of + * ->cl_loi_{ready,read,write}_list. ->ap_make_ready() and + * ->ap_completion() call-backs are executed under this lock. As we + * cannot guarantee that these call-backs never block on all platforms + * (as a matter of fact they do block on Mac OS X), type of + * ->cl_loi_list_lock is platform dependent: it's a spin-lock on Linux + * and blocking mutex on Mac OS X. (Alternative is to make this lock + * blocking everywhere, but we don't want to slow down fast-path of + * our main platform.) + * + * Exact type of ->cl_loi_list_lock is defined in arch/obd.h together + * with client_obd_list_{un,}lock() and + * client_obd_list_lock_{init,done}() functions. + * + * NB by Jinshan: though field names are still _loi_, but actually + * osc_object{}s are in the list. 
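+ *
+ * A typical (illustrative) access pattern under this lock, using the
+ * locking helpers named above; \a cli and \a empty are hypothetical
+ * caller locals:
+ *
+ * \code
+ *	client_obd_list_lock(&cli->cl_loi_list_lock);
+ *	empty = list_empty(&cli->cl_loi_ready_list);
+ *	client_obd_list_unlock(&cli->cl_loi_list_lock);
+ * \endcode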
+ */ + client_obd_lock_t cl_loi_list_lock; + struct list_head cl_loi_ready_list; + struct list_head cl_loi_hp_ready_list; + struct list_head cl_loi_write_list; + struct list_head cl_loi_read_list; + int cl_r_in_flight; + int cl_w_in_flight; + /* just a sum of the loi/lop pending numbers to be exported by /proc */ + atomic_t cl_pending_w_pages; + atomic_t cl_pending_r_pages; + __u32 cl_max_pages_per_rpc; + int cl_max_rpcs_in_flight; + struct obd_histogram cl_read_rpc_hist; + struct obd_histogram cl_write_rpc_hist; + struct obd_histogram cl_read_page_hist; + struct obd_histogram cl_write_page_hist; + struct obd_histogram cl_read_offset_hist; + struct obd_histogram cl_write_offset_hist; + + /* lru for osc caching pages */ + struct cl_client_cache *cl_cache; + struct list_head cl_lru_osc; /* member of cl_cache->ccc_lru */ + atomic_t *cl_lru_left; + atomic_t cl_lru_busy; + atomic_t cl_lru_shrinkers; + atomic_t cl_lru_in_list; + struct list_head cl_lru_list; /* lru page list */ + client_obd_lock_t cl_lru_list_lock; /* page list protector */ + + /* number of in flight destroy rpcs is limited to max_rpcs_in_flight */ + atomic_t cl_destroy_in_flight; + wait_queue_head_t cl_destroy_waitq; + + struct mdc_rpc_lock *cl_rpc_lock; + struct mdc_rpc_lock *cl_close_lock; + + /* mgc datastruct */ + struct mutex cl_mgc_mutex; + struct local_oid_storage *cl_mgc_los; + struct dt_object *cl_mgc_configs_dir; + atomic_t cl_mgc_refcount; + struct obd_export *cl_mgc_mgsexp; + + /* checksumming for data sent over the network */ + unsigned int cl_checksum:1; /* 0 = disabled, 1 = enabled */ + /* supported checksum types that are worked out at connect time */ + __u32 cl_supp_cksum_types; + /* checksum algorithm to be used */ + cksum_type_t cl_cksum_type; + + /* also protected by the poorly named _loi_list_lock lock above */ + struct osc_async_rc cl_ar; + + /* used by quotacheck when the servers are older than 2.4 */ + int cl_qchk_stat; /* quotacheck stat of the peer */ +#define CL_NOT_QUOTACHECKED 1 /* client->cl_qchk_stat init value */ +#if LUSTRE_VERSION_CODE >= OBD_OCD_VERSION(2, 7, 50, 0) +#warning "please consider removing quotacheck compatibility code" +#endif + + /* sequence manager */ + struct lu_client_seq *cl_seq; + + atomic_t cl_resends; /* resend count */ + + /* ptlrpc work for writeback in ptlrpcd context */ + void *cl_writeback_work; + /* hash tables for osc_quota_info */ + struct cfs_hash *cl_quota_hash[MAXQUOTAS]; +}; +#define obd2cli_tgt(obd) ((char *)(obd)->u.cli.cl_target_uuid.uuid) + +struct obd_id_info { + __u32 idx; + u64 *data; +}; + +struct echo_client_obd { + struct obd_export *ec_exp; /* the local connection to osc/lov */ + spinlock_t ec_lock; + struct list_head ec_objects; + struct list_head ec_locks; + int ec_nstripes; + __u64 ec_unique; +}; + +struct lov_qos_oss { + struct obd_uuid lqo_uuid; /* ptlrpc's c_remote_uuid */ + struct list_head lqo_oss_list; /* link to lov_qos */ + __u64 lqo_bavail; /* total bytes avail on OSS */ + __u64 lqo_penalty; /* current penalty */ + __u64 lqo_penalty_per_obj;/* penalty decrease every obj*/ + time_t lqo_used; /* last used time, seconds */ + __u32 lqo_ost_count; /* number of osts on this oss */ +}; + +struct ltd_qos { + struct lov_qos_oss *ltq_oss; /* oss info */ + __u64 ltq_penalty; /* current penalty */ + __u64 ltq_penalty_per_obj; /* penalty decrease every obj*/ + __u64 ltq_weight; /* net weighting */ + time_t ltq_used; /* last used time, seconds */ + unsigned int ltq_usable:1; /* usable for striping */ +}; + +/* Generic subset of OSTs */ +struct ost_pool 
{ + __u32 *op_array; /* array of index of + lov_obd->lov_tgts */ + unsigned int op_count; /* number of OSTs in the array */ + unsigned int op_size; /* allocated size of lp_array */ + struct rw_semaphore op_rw_sem; /* to protect ost_pool use */ +}; + +/* Round-robin allocator data */ +struct lov_qos_rr { + __u32 lqr_start_idx; /* start index of new inode */ + __u32 lqr_offset_idx; /* aliasing for start_idx */ + int lqr_start_count; /* reseed counter */ + struct ost_pool lqr_pool; /* round-robin optimized list */ + unsigned long lqr_dirty:1; /* recalc round-robin list */ +}; + +/* allow statfs data caching for 1 second */ +#define OBD_STATFS_CACHE_SECONDS 1 + +struct lov_statfs_data { + struct obd_info lsd_oi; + struct obd_statfs lsd_statfs; +}; +/* Stripe placement optimization */ +struct lov_qos { + struct list_head lq_oss_list; /* list of OSSs that targets use */ + struct rw_semaphore lq_rw_sem; + __u32 lq_active_oss_count; + unsigned int lq_prio_free; /* priority for free space */ + unsigned int lq_threshold_rr;/* priority for rr */ + struct lov_qos_rr lq_rr; /* round robin qos data */ + unsigned long lq_dirty:1, /* recalc qos data */ + lq_same_space:1,/* the ost's all have approx. + the same space avail */ + lq_reset:1, /* zero current penalties */ + lq_statfs_in_progress:1; /* statfs op in + progress */ + /* qos statfs data */ + struct lov_statfs_data *lq_statfs_data; + wait_queue_head_t lq_statfs_waitq; /* waitqueue to notify statfs + * requests completion */ +}; + +struct lov_tgt_desc { + struct list_head ltd_kill; + struct obd_uuid ltd_uuid; + struct obd_device *ltd_obd; + struct obd_export *ltd_exp; + struct ltd_qos ltd_qos; /* qos info per target */ + __u32 ltd_gen; + __u32 ltd_index; /* index in lov_obd->tgts */ + unsigned long ltd_active:1,/* is this target up for requests */ + ltd_activate:1,/* should target be activated */ + ltd_reap:1; /* should this target be deleted */ +}; + +/* Pool metadata */ +#define pool_tgt_size(_p) _p->pool_obds.op_size +#define pool_tgt_count(_p) _p->pool_obds.op_count +#define pool_tgt_array(_p) _p->pool_obds.op_array +#define pool_tgt_rw_sem(_p) _p->pool_obds.op_rw_sem + +struct pool_desc { + char pool_name[LOV_MAXPOOLNAME + 1]; /* name of pool */ + struct ost_pool pool_obds; /* pool members */ + atomic_t pool_refcount; /* pool ref. 
counter */ + struct lov_qos_rr pool_rr; /* round robin qos */ + struct hlist_node pool_hash; /* access by poolname */ + struct list_head pool_list; /* serial access */ + struct proc_dir_entry *pool_proc_entry; /* file in /proc */ + struct obd_device *pool_lobd; /* obd of the lov/lod to which + * this pool belongs */ +}; + +struct lov_obd { + struct lov_desc desc; + struct lov_tgt_desc **lov_tgts; /* sparse array */ + struct ost_pool lov_packed; /* all OSTs in a packed + array */ + struct mutex lov_lock; + struct obd_connect_data lov_ocd; + atomic_t lov_refcount; + __u32 lov_tgt_count; /* how many OBD's */ + __u32 lov_active_tgt_count; /* how many active */ + __u32 lov_death_row;/* tgts scheduled to be deleted */ + __u32 lov_tgt_size; /* size of tgts array */ + int lov_connects; + int lov_pool_count; + struct cfs_hash *lov_pools_hash_body; /* used for key access */ + struct list_head lov_pool_list; /* used for sequential access */ + struct proc_dir_entry *lov_pool_proc_entry; + enum lustre_sec_part lov_sp_me; + + /* Cached LRU pages from upper layer */ + void *lov_cache; + + struct rw_semaphore lov_notify_lock; +}; + +struct lmv_tgt_desc { + struct obd_uuid ltd_uuid; + struct obd_export *ltd_exp; + int ltd_idx; + struct mutex ltd_fid_mutex; + unsigned long ltd_active:1; /* target up for requests */ +}; + +enum placement_policy { + PLACEMENT_CHAR_POLICY = 0, + PLACEMENT_NID_POLICY = 1, + PLACEMENT_INVAL_POLICY = 2, + PLACEMENT_MAX_POLICY +}; + +struct lmv_obd { + int refcount; + struct lu_client_fld lmv_fld; + spinlock_t lmv_lock; + enum placement_policy lmv_placement; + struct lmv_desc desc; + struct obd_uuid cluuid; + struct obd_export *exp; + + struct mutex init_mutex; + int connected; + int max_easize; + int max_def_easize; + int max_cookiesize; + int max_def_cookiesize; + int server_timeout; + + int tgts_size; /* size of tgts array */ + struct lmv_tgt_desc **tgts; + + struct obd_connect_data conn_data; +}; + +struct niobuf_local { + __u64 lnb_file_offset; + __u32 lnb_page_offset; + __u32 len; + __u32 flags; + struct page *page; + struct dentry *dentry; + int lnb_grant_used; + int rc; +}; + +#define LUSTRE_FLD_NAME "fld" +#define LUSTRE_SEQ_NAME "seq" + +#define LUSTRE_MDD_NAME "mdd" +#define LUSTRE_OSD_LDISKFS_NAME "osd-ldiskfs" +#define LUSTRE_OSD_ZFS_NAME "osd-zfs" +#define LUSTRE_VVP_NAME "vvp" +#define LUSTRE_LMV_NAME "lmv" +#define LUSTRE_SLP_NAME "slp" +#define LUSTRE_LOD_NAME "lod" +#define LUSTRE_OSP_NAME "osp" +#define LUSTRE_LWP_NAME "lwp" + +/* obd device type names */ + /* FIXME all the references to LUSTRE_MDS_NAME should be swapped with LUSTRE_MDT_NAME */ +#define LUSTRE_MDS_NAME "mds" +#define LUSTRE_MDT_NAME "mdt" +#define LUSTRE_MDC_NAME "mdc" +#define LUSTRE_OSS_NAME "ost" /* FIXME change name to oss */ +#define LUSTRE_OST_NAME "obdfilter" /* FIXME change name to ost */ +#define LUSTRE_OSC_NAME "osc" +#define LUSTRE_LOV_NAME "lov" +#define LUSTRE_MGS_NAME "mgs" +#define LUSTRE_MGC_NAME "mgc" + +#define LUSTRE_ECHO_NAME "obdecho" +#define LUSTRE_ECHO_CLIENT_NAME "echo_client" +#define LUSTRE_QMT_NAME "qmt" + +/* Constant obd names (post-rename) */ +#define LUSTRE_MDS_OBDNAME "MDS" +#define LUSTRE_OSS_OBDNAME "OSS" +#define LUSTRE_MGS_OBDNAME "MGS" +#define LUSTRE_MGC_OBDNAME "MGC" + +/* Don't conflict with on-wire flags OBD_BRW_WRITE, etc */ +#define N_LOCAL_TEMP_PAGE 0x10000000 + +struct obd_trans_info { + __u64 oti_transno; + __u64 oti_xid; + /* Only used on the server side for tracking acks. 
*/ + struct oti_req_ack_lock { + struct lustre_handle lock; + __u32 mode; + } oti_ack_locks[4]; + void *oti_handle; + struct llog_cookie oti_onecookie; + struct llog_cookie *oti_logcookies; + int oti_numcookies; + /** synchronous write is needed */ + unsigned long oti_sync_write:1; + + /* initial thread handling transaction */ + struct ptlrpc_thread *oti_thread; + __u32 oti_conn_cnt; + /** VBR: versions */ + __u64 oti_pre_version; + /** JobID */ + char *oti_jobid; + + struct obd_uuid *oti_ost_uuid; +}; + +static inline void oti_init(struct obd_trans_info *oti, + struct ptlrpc_request *req) +{ + if (oti == NULL) + return; + memset(oti, 0, sizeof(*oti)); + + if (req == NULL) + return; + + oti->oti_xid = req->rq_xid; + /** VBR: take versions from request */ + if (req->rq_reqmsg != NULL && + lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) { + __u64 *pre_version = lustre_msg_get_versions(req->rq_reqmsg); + + oti->oti_pre_version = pre_version ? pre_version[0] : 0; + oti->oti_transno = lustre_msg_get_transno(req->rq_reqmsg); + } + + /** called from mds_create_objects */ + if (req->rq_repmsg != NULL) + oti->oti_transno = lustre_msg_get_transno(req->rq_repmsg); + oti->oti_thread = req->rq_svc_thread; + if (req->rq_reqmsg != NULL) + oti->oti_conn_cnt = lustre_msg_get_conn_cnt(req->rq_reqmsg); +} + +static inline void oti_alloc_cookies(struct obd_trans_info *oti, + int num_cookies) +{ + if (!oti) + return; + + if (num_cookies == 1) + oti->oti_logcookies = &oti->oti_onecookie; + else + OBD_ALLOC_LARGE(oti->oti_logcookies, + num_cookies * sizeof(oti->oti_onecookie)); + + oti->oti_numcookies = num_cookies; +} + +static inline void oti_free_cookies(struct obd_trans_info *oti) +{ + if (!oti || !oti->oti_logcookies) + return; + + if (oti->oti_logcookies == &oti->oti_onecookie) + LASSERT(oti->oti_numcookies == 1); + else + OBD_FREE_LARGE(oti->oti_logcookies, + oti->oti_numcookies*sizeof(oti->oti_onecookie)); + oti->oti_logcookies = NULL; + oti->oti_numcookies = 0; +} + +/* + * Events signalled through obd_notify() upcall-chain. + */ +enum obd_notify_event { + /* target added */ + OBD_NOTIFY_CREATE, + /* Device connect start */ + OBD_NOTIFY_CONNECT, + /* Device activated */ + OBD_NOTIFY_ACTIVE, + /* Device deactivated */ + OBD_NOTIFY_INACTIVE, + /* Device disconnected */ + OBD_NOTIFY_DISCON, + /* Connect data for import were changed */ + OBD_NOTIFY_OCD, + /* Sync request */ + OBD_NOTIFY_SYNC_NONBLOCK, + OBD_NOTIFY_SYNC, + /* Configuration event */ + OBD_NOTIFY_CONFIG, + /* Administratively deactivate/activate event */ + OBD_NOTIFY_DEACTIVATE, + OBD_NOTIFY_ACTIVATE +}; + +/* + * Data structure used to pass obd_notify()-event to non-obd listeners (llite + * and liblustre being main examples). 
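+ *
+ * The observing obd invokes the registered hook roughly as follows
+ * (illustrative sketch; \a observer, \a watched, \a ev and \a rc are
+ * hypothetical caller locals, error handling omitted):
+ *
+ * \code
+ *	struct obd_notify_upcall *onu = &observer->obd_upcall;
+ *	int rc = 0;
+ *
+ *	if (onu->onu_upcall != NULL)
+ *		rc = onu->onu_upcall(observer, watched, ev,
+ *				     onu->onu_owner, NULL);
+ * \endcode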
+ */ +struct obd_notify_upcall { + int (*onu_upcall)(struct obd_device *host, struct obd_device *watched, + enum obd_notify_event ev, void *owner, void *data); + /* Opaque datum supplied by upper layer listener */ + void *onu_owner; +}; + +struct target_recovery_data { + svc_handler_t trd_recovery_handler; + pid_t trd_processing_task; + struct completion trd_starting; + struct completion trd_finishing; +}; + +struct obd_llog_group { + int olg_seq; + struct llog_ctxt *olg_ctxts[LLOG_MAX_CTXTS]; + wait_queue_head_t olg_waitq; + spinlock_t olg_lock; + struct mutex olg_cat_processing; +}; + +/* corresponds to one of the obd's */ +#define OBD_DEVICE_MAGIC 0XAB5CD6EF +#define OBD_DEV_BY_DEVNAME 0xffffd0de + +struct lvfs_run_ctxt { + struct dt_device *dt; +}; + +struct obd_device { + struct obd_type *obd_type; + __u32 obd_magic; + + /* common and UUID name of this device */ + char obd_name[MAX_OBD_NAME]; + struct obd_uuid obd_uuid; + + struct lu_device *obd_lu_dev; + + int obd_minor; + /* bitfield modification is protected by obd_dev_lock */ + unsigned long obd_attached:1, /* finished attach */ + obd_set_up:1, /* finished setup */ + obd_recovering:1, /* there are recoverable clients */ + obd_abort_recovery:1,/* recovery expired */ + obd_version_recov:1, /* obd uses version checking */ + obd_replayable:1, /* recovery is enabled; inform clients */ + obd_no_transno:1, /* no committed-transno notification */ + obd_no_recov:1, /* fail instead of retry messages */ + obd_stopping:1, /* started cleanup */ + obd_starting:1, /* started setup */ + obd_force:1, /* cleanup with > 0 obd refcount */ + obd_fail:1, /* cleanup with failover */ + obd_async_recov:1, /* allow asynchronous orphan cleanup */ + obd_no_conn:1, /* deny new connections */ + obd_inactive:1, /* device active/inactive + * (for /proc/status only!!) */ + obd_no_ir:1, /* no imperative recovery. 
*/ + obd_process_conf:1; /* device is processing mgs config */ + /* use separate field as it is set in interrupt to don't mess with + * protection of other bits using _bh lock */ + unsigned long obd_recovery_expired:1; + /* uuid-export hash body */ + struct cfs_hash *obd_uuid_hash; + /* nid-export hash body */ + struct cfs_hash *obd_nid_hash; + /* nid stats body */ + struct cfs_hash *obd_nid_stats_hash; + struct list_head obd_nid_stats; + atomic_t obd_refcount; + wait_queue_head_t obd_refcount_waitq; + struct list_head obd_exports; + struct list_head obd_unlinked_exports; + struct list_head obd_delayed_exports; + int obd_num_exports; + spinlock_t obd_nid_lock; + struct ldlm_namespace *obd_namespace; + struct ptlrpc_client obd_ldlm_client; /* XXX OST/MDS only */ + /* a spinlock is OK for what we do now, may need a semaphore later */ + spinlock_t obd_dev_lock; /* protect OBD bitfield above */ + struct mutex obd_dev_mutex; + __u64 obd_last_committed; + spinlock_t obd_osfs_lock; + struct obd_statfs obd_osfs; /* locked by obd_osfs_lock */ + __u64 obd_osfs_age; + struct lvfs_run_ctxt obd_lvfs_ctxt; + struct obd_llog_group obd_olg; /* default llog group */ + struct obd_device *obd_observer; + struct rw_semaphore obd_observer_link_sem; + struct obd_notify_upcall obd_upcall; + struct obd_export *obd_self_export; + /* list of exports in LRU order, for ping evictor, with obd_dev_lock */ + struct list_head obd_exports_timed; + time_t obd_eviction_timer; /* for ping evictor */ + + int obd_max_recoverable_clients; + atomic_t obd_connected_clients; + int obd_stale_clients; + int obd_delayed_clients; + /* this lock protects all recovery list_heads, timer and + * obd_next_recovery_transno value */ + spinlock_t obd_recovery_task_lock; + __u64 obd_next_recovery_transno; + int obd_replayed_requests; + int obd_requests_queued_for_recovery; + wait_queue_head_t obd_next_transno_waitq; + /* protected by obd_recovery_task_lock */ + struct timer_list obd_recovery_timer; + time_t obd_recovery_start; /* seconds */ + time_t obd_recovery_end; /* seconds, for lprocfs_status */ + int obd_recovery_time_hard; + int obd_recovery_timeout; + int obd_recovery_ir_factor; + + /* new recovery stuff from CMD2 */ + struct target_recovery_data obd_recovery_data; + int obd_replayed_locks; + atomic_t obd_req_replay_clients; + atomic_t obd_lock_replay_clients; + /* all lists are protected by obd_recovery_task_lock */ + struct list_head obd_req_replay_queue; + struct list_head obd_lock_replay_queue; + struct list_head obd_final_req_queue; + int obd_recovery_stage; + + union { + struct client_obd cli; + struct echo_client_obd echo_client; + struct lov_obd lov; + struct lmv_obd lmv; + } u; + /* Fields used by LProcFS */ + unsigned int obd_cntr_base; + struct lprocfs_stats *obd_stats; + + unsigned int md_cntr_base; + struct lprocfs_stats *md_stats; + + struct proc_dir_entry *obd_proc_entry; + void *obd_proc_private; /* type private PDEs */ + struct proc_dir_entry *obd_proc_exports_entry; + struct proc_dir_entry *obd_svc_procroot; + struct lprocfs_stats *obd_svc_stats; + atomic_t obd_evict_inprogress; + wait_queue_head_t obd_evict_inprogress_waitq; + struct list_head obd_evict_list; /* protected with pet_lock */ + + /** + * Ldlm pool part. Save last calculated SLV and Limit. + */ + rwlock_t obd_pool_lock; + int obd_pool_limit; + __u64 obd_pool_slv; + + /** + * A list of outstanding class_incref()'s against this obd. For + * debugging. 
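+ *
+ * Conceptually (an illustrative sketch; \a scope and \a source stand
+ * for the caller-supplied debugging tags), each class_incref() records
+ * a link here and the matching class_decref() releases it:
+ *
+ * \code
+ *	lu_ref_add(&obd->obd_reference, scope, source);
+ *	lu_ref_del(&obd->obd_reference, scope, source);
+ * \endcode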
+ */ + struct lu_ref obd_reference; + + int obd_conn_inprogress; +}; + +#define OBD_LLOG_FL_SENDNOW 0x0001 +#define OBD_LLOG_FL_EXIT 0x0002 + +enum obd_cleanup_stage { +/* Special case hack for MDS LOVs */ + OBD_CLEANUP_EARLY, +/* can be directly mapped to .ldto_device_fini() */ + OBD_CLEANUP_EXPORTS, +}; + +/* get/set_info keys */ +#define KEY_ASYNC "async" +#define KEY_BLOCKSIZE_BITS "blocksize_bits" +#define KEY_BLOCKSIZE "blocksize" +#define KEY_CAPA_KEY "capa_key" +#define KEY_CHANGELOG_CLEAR "changelog_clear" +#define KEY_FID2PATH "fid2path" +#define KEY_CHECKSUM "checksum" +#define KEY_CLEAR_FS "clear_fs" +#define KEY_CONN_DATA "conn_data" +#define KEY_EVICT_BY_NID "evict_by_nid" +#define KEY_FIEMAP "fiemap" +#define KEY_FLUSH_CTX "flush_ctx" +#define KEY_GRANT_SHRINK "grant_shrink" +#define KEY_HSM_COPYTOOL_SEND "hsm_send" +#define KEY_INIT_RECOV_BACKUP "init_recov_bk" +#define KEY_INIT_RECOV "initial_recov" +#define KEY_INTERMDS "inter_mds" +#define KEY_LAST_ID "last_id" +#define KEY_LAST_FID "last_fid" +#define KEY_LOCK_TO_STRIPE "lock_to_stripe" +#define KEY_LOVDESC "lovdesc" +#define KEY_LOV_IDX "lov_idx" +#define KEY_MAX_EASIZE "max_easize" +#define KEY_DEFAULT_EASIZE "default_easize" +#define KEY_MAX_COOKIESIZE "max_cookiesize" +#define KEY_DEFAULT_COOKIESIZE "default_cookiesize" +#define KEY_MDS_CONN "mds_conn" +#define KEY_MGSSEC "mgssec" +#define KEY_NEXT_ID "next_id" +#define KEY_READ_ONLY "read-only" +#define KEY_REGISTER_TARGET "register_target" +#define KEY_SET_FS "set_fs" +#define KEY_TGT_COUNT "tgt_count" +/* KEY_SET_INFO in lustre_idl.h */ +#define KEY_SPTLRPC_CONF "sptlrpc_conf" +#define KEY_CONNECT_FLAG "connect_flags" +#define KEY_SYNC_LOCK_CANCEL "sync_lock_cancel" + +#define KEY_CACHE_SET "cache_set" +#define KEY_CACHE_LRU_SHRINK "cache_lru_shrink" +#define KEY_CHANGELOG_INDEX "changelog_index" + +struct lu_context; + +/* /!\ must be coherent with include/linux/namei.h on patched kernel */ +#define IT_OPEN (1 << 0) +#define IT_CREAT (1 << 1) +#define IT_READDIR (1 << 2) +#define IT_GETATTR (1 << 3) +#define IT_LOOKUP (1 << 4) +#define IT_UNLINK (1 << 5) +#define IT_TRUNC (1 << 6) +#define IT_GETXATTR (1 << 7) +#define IT_EXEC (1 << 8) +#define IT_PIN (1 << 9) +#define IT_LAYOUT (1 << 10) +#define IT_QUOTA_DQACQ (1 << 11) +#define IT_QUOTA_CONN (1 << 12) +#define IT_SETXATTR (1 << 13) + +static inline int it_to_lock_mode(struct lookup_intent *it) +{ + /* CREAT needs to be tested before open (both could be set) */ + if (it->it_op & IT_CREAT) + return LCK_CW; + else if (it->it_op & (IT_READDIR | IT_GETATTR | IT_OPEN | IT_LOOKUP | + IT_LAYOUT)) + return LCK_CR; + else if (it->it_op & IT_GETXATTR) + return LCK_PR; + else if (it->it_op & IT_SETXATTR) + return LCK_PW; + + LASSERTF(0, "Invalid it_op: %d\n", it->it_op); + return -EINVAL; +} + +struct md_op_data { + struct lu_fid op_fid1; /* operation fid1 (usually parent) */ + struct lu_fid op_fid2; /* operation fid2 (usually child) */ + struct lu_fid op_fid3; /* 2 extra fids to find conflicting */ + struct lu_fid op_fid4; /* to the operation locks. */ + u32 op_mds; /* what mds server open will go to */ + struct lustre_handle op_handle; + s64 op_mod_time; + const char *op_name; + int op_namelen; + __u32 op_mode; + struct lmv_stripe_md *op_mea1; + struct lmv_stripe_md *op_mea2; + __u32 op_suppgids[2]; + __u32 op_fsuid; + __u32 op_fsgid; + cfs_cap_t op_cap; + void *op_data; + + /* iattr fields and blocks. 
*/ + struct iattr op_attr; + unsigned int op_attr_flags; + __u64 op_valid; + loff_t op_attr_blocks; + + /* Size-on-MDS epoch and flags. */ + __u64 op_ioepoch; + __u32 op_flags; + + /* Capa fields */ + struct obd_capa *op_capa1; + struct obd_capa *op_capa2; + + /* Various operation flags. */ + enum mds_op_bias op_bias; + + /* Operation type */ + __u32 op_opc; + + /* Used by readdir */ + __u64 op_offset; + + /* Used by readdir */ + __u32 op_npages; + + /* used to transfer info between the stacks of MD client + * see enum op_cli_flags */ + __u32 op_cli_flags; + + /* File object data version for HSM release, on client */ + __u64 op_data_version; + struct lustre_handle op_lease_handle; +}; + +enum op_cli_flags { + CLI_SET_MEA = 1 << 0, + CLI_RM_ENTRY = 1 << 1, +}; + +struct md_enqueue_info; +/* metadata stat-ahead */ + +struct md_enqueue_info { + struct md_op_data mi_data; + struct lookup_intent mi_it; + struct lustre_handle mi_lockh; + struct inode *mi_dir; + int (*mi_cb)(struct ptlrpc_request *req, + struct md_enqueue_info *minfo, int rc); + __u64 mi_cbdata; + unsigned int mi_generation; +}; + +struct obd_ops { + struct module *o_owner; + int (*o_iocontrol)(unsigned int cmd, struct obd_export *exp, int len, + void *karg, void *uarg); + int (*o_get_info)(const struct lu_env *env, struct obd_export *, + __u32 keylen, void *key, __u32 *vallen, void *val, + struct lov_stripe_md *lsm); + int (*o_set_info_async)(const struct lu_env *, struct obd_export *, + __u32 keylen, void *key, + __u32 vallen, void *val, + struct ptlrpc_request_set *set); + int (*o_attach)(struct obd_device *dev, u32 len, void *data); + int (*o_detach)(struct obd_device *dev); + int (*o_setup)(struct obd_device *dev, struct lustre_cfg *cfg); + int (*o_precleanup)(struct obd_device *dev, + enum obd_cleanup_stage cleanup_stage); + int (*o_cleanup)(struct obd_device *dev); + int (*o_process_config)(struct obd_device *dev, u32 len, void *data); + int (*o_postrecov)(struct obd_device *dev); + int (*o_add_conn)(struct obd_import *imp, struct obd_uuid *uuid, + int priority); + int (*o_del_conn)(struct obd_import *imp, struct obd_uuid *uuid); + /* connect to the target device with given connection + * data. @ocd->ocd_connect_flags is modified to reflect flags actually + * granted by the target, which are guaranteed to be a subset of flags + * asked for. If @ocd == NULL, use default parameters. */ + int (*o_connect)(const struct lu_env *env, + struct obd_export **exp, struct obd_device *src, + struct obd_uuid *cluuid, struct obd_connect_data *ocd, + void *localdata); + int (*o_reconnect)(const struct lu_env *env, + struct obd_export *exp, struct obd_device *src, + struct obd_uuid *cluuid, + struct obd_connect_data *ocd, + void *localdata); + int (*o_disconnect)(struct obd_export *exp); + + /* Initialize/finalize fids infrastructure. */ + int (*o_fid_init)(struct obd_device *obd, + struct obd_export *exp, enum lu_cli_type type); + int (*o_fid_fini)(struct obd_device *obd); + + /* Allocate new fid according to passed @hint. */ + int (*o_fid_alloc)(struct obd_export *exp, struct lu_fid *fid, + struct md_op_data *op_data); + + /* + * Object with @fid is getting deleted, we may want to do something + * about this. 
+ */ + int (*o_statfs)(const struct lu_env *, struct obd_export *exp, + struct obd_statfs *osfs, __u64 max_age, __u32 flags); + int (*o_statfs_async)(struct obd_export *exp, struct obd_info *oinfo, + __u64 max_age, struct ptlrpc_request_set *set); + int (*o_packmd)(struct obd_export *exp, struct lov_mds_md **disk_tgt, + struct lov_stripe_md *mem_src); + int (*o_unpackmd)(struct obd_export *exp, + struct lov_stripe_md **mem_tgt, + struct lov_mds_md *disk_src, int disk_len); + int (*o_preallocate)(struct lustre_handle *, u32 *req, u64 *ids); + /* FIXME: add fid capability support for create & destroy! */ + int (*o_create)(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa, struct lov_stripe_md **ea, + struct obd_trans_info *oti); + int (*o_destroy)(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa, struct lov_stripe_md *ea, + struct obd_trans_info *oti, struct obd_export *md_exp, + void *capa); + int (*o_setattr)(const struct lu_env *, struct obd_export *exp, + struct obd_info *oinfo, struct obd_trans_info *oti); + int (*o_setattr_async)(struct obd_export *exp, struct obd_info *oinfo, + struct obd_trans_info *oti, + struct ptlrpc_request_set *rqset); + int (*o_getattr)(const struct lu_env *env, struct obd_export *exp, + struct obd_info *oinfo); + int (*o_getattr_async)(struct obd_export *exp, struct obd_info *oinfo, + struct ptlrpc_request_set *set); + int (*o_adjust_kms)(struct obd_export *exp, struct lov_stripe_md *lsm, + u64 size, int shrink); + int (*o_preprw)(const struct lu_env *env, int cmd, + struct obd_export *exp, struct obdo *oa, int objcount, + struct obd_ioobj *obj, struct niobuf_remote *remote, + int *nr_pages, struct niobuf_local *local, + struct obd_trans_info *oti, struct lustre_capa *capa); + int (*o_commitrw)(const struct lu_env *env, int cmd, + struct obd_export *exp, struct obdo *oa, + int objcount, struct obd_ioobj *obj, + struct niobuf_remote *remote, int pages, + struct niobuf_local *local, + struct obd_trans_info *oti, int rc); + int (*o_find_cbdata)(struct obd_export *, struct lov_stripe_md *, + ldlm_iterator_t it, void *data); + int (*o_init_export)(struct obd_export *exp); + int (*o_destroy_export)(struct obd_export *exp); + + /* metadata-only methods */ + int (*o_import_event)(struct obd_device *, struct obd_import *, + enum obd_import_event); + + int (*o_notify)(struct obd_device *obd, struct obd_device *watched, + enum obd_notify_event ev, void *data); + + int (*o_health_check)(const struct lu_env *env, struct obd_device *); + struct obd_uuid *(*o_get_uuid)(struct obd_export *exp); + + /* quota methods */ + int (*o_quotacheck)(struct obd_device *, struct obd_export *, + struct obd_quotactl *); + int (*o_quotactl)(struct obd_device *, struct obd_export *, + struct obd_quotactl *); + + /* pools methods */ + int (*o_pool_new)(struct obd_device *obd, char *poolname); + int (*o_pool_del)(struct obd_device *obd, char *poolname); + int (*o_pool_add)(struct obd_device *obd, char *poolname, + char *ostname); + int (*o_pool_rem)(struct obd_device *obd, char *poolname, + char *ostname); + void (*o_getref)(struct obd_device *obd); + void (*o_putref)(struct obd_device *obd); + /* + * NOTE: If adding ops, add another LPROCFS_OBD_OP_INIT() line + * to lprocfs_alloc_obd_stats() in obdclass/lprocfs_status.c. + * Also, add a wrapper function in include/linux/obd_class.h. 
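+ *
+ * Those wrappers dispatch through this table; the sketch below shows
+ * the usual pattern (illustrative only: the OBP() macro and the statfs
+ * call follow the common obd_class.h style, with NULL checks omitted):
+ *
+ * \code
+ *	#define OBP(dev, op)	(dev)->obd_type->typ_dt_ops->o_##op
+ *
+ *	rc = OBP(exp->exp_obd, statfs)(env, exp, osfs, max_age, flags);
+ * \endcode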
*/ +}; + +enum { + LUSTRE_OPC_MKDIR = (1 << 0), + LUSTRE_OPC_SYMLINK = (1 << 1), + LUSTRE_OPC_MKNOD = (1 << 2), + LUSTRE_OPC_CREATE = (1 << 3), + LUSTRE_OPC_ANY = (1 << 4) +}; + +/* lmv structures */ +#define MEA_MAGIC_LAST_CHAR 0xb2221ca1 +#define MEA_MAGIC_ALL_CHARS 0xb222a11c +#define MEA_MAGIC_HASH_SEGMENT 0xb222a11b + +#define MAX_HASH_SIZE_32 0x7fffffffUL +#define MAX_HASH_SIZE 0x7fffffffffffffffULL +#define MAX_HASH_HIGHEST_BIT 0x1000000000000000ULL + +struct lustre_md { + struct mdt_body *body; + struct lov_stripe_md *lsm; + struct lmv_stripe_md *mea; +#ifdef CONFIG_FS_POSIX_ACL + struct posix_acl *posix_acl; +#endif + struct mdt_remote_perm *remote_perm; + struct obd_capa *mds_capa; + struct obd_capa *oss_capa; +}; + +struct md_open_data { + struct obd_client_handle *mod_och; + struct ptlrpc_request *mod_open_req; + struct ptlrpc_request *mod_close_req; + atomic_t mod_refcount; + bool mod_is_create; +}; + +struct lookup_intent; + +struct md_ops { + int (*m_getstatus)(struct obd_export *, struct lu_fid *, + struct obd_capa **); + int (*m_null_inode)(struct obd_export *, const struct lu_fid *); + int (*m_find_cbdata)(struct obd_export *, const struct lu_fid *, + ldlm_iterator_t, void *); + int (*m_close)(struct obd_export *, struct md_op_data *, + struct md_open_data *, struct ptlrpc_request **); + int (*m_create)(struct obd_export *, struct md_op_data *, + const void *, int, int, __u32, __u32, cfs_cap_t, + __u64, struct ptlrpc_request **); + int (*m_done_writing)(struct obd_export *, struct md_op_data *, + struct md_open_data *); + int (*m_enqueue)(struct obd_export *, struct ldlm_enqueue_info *, + struct lookup_intent *, struct md_op_data *, + struct lustre_handle *, void *, int, + struct ptlrpc_request **, __u64); + int (*m_getattr)(struct obd_export *, struct md_op_data *, + struct ptlrpc_request **); + int (*m_getattr_name)(struct obd_export *, struct md_op_data *, + struct ptlrpc_request **); + int (*m_intent_lock)(struct obd_export *, struct md_op_data *, + void *, int, struct lookup_intent *, int, + struct ptlrpc_request **, + ldlm_blocking_callback, __u64); + int (*m_link)(struct obd_export *, struct md_op_data *, + struct ptlrpc_request **); + int (*m_rename)(struct obd_export *, struct md_op_data *, + const char *, int, const char *, int, + struct ptlrpc_request **); + int (*m_is_subdir)(struct obd_export *, const struct lu_fid *, + const struct lu_fid *, + struct ptlrpc_request **); + int (*m_setattr)(struct obd_export *, struct md_op_data *, void *, + int , void *, int, struct ptlrpc_request **, + struct md_open_data **mod); + int (*m_sync)(struct obd_export *, const struct lu_fid *, + struct obd_capa *, struct ptlrpc_request **); + int (*m_readpage)(struct obd_export *, struct md_op_data *, + struct page **, struct ptlrpc_request **); + + int (*m_unlink)(struct obd_export *, struct md_op_data *, + struct ptlrpc_request **); + + int (*m_setxattr)(struct obd_export *, const struct lu_fid *, + struct obd_capa *, u64, const char *, + const char *, int, int, int, __u32, + struct ptlrpc_request **); + + int (*m_getxattr)(struct obd_export *, const struct lu_fid *, + struct obd_capa *, u64, const char *, + const char *, int, int, int, + struct ptlrpc_request **); + + int (*m_init_ea_size)(struct obd_export *, int, int, int, int); + + int (*m_get_lustre_md)(struct obd_export *, struct ptlrpc_request *, + struct obd_export *, struct obd_export *, + struct lustre_md *); + + int (*m_free_lustre_md)(struct obd_export *, struct lustre_md *); + + int (*m_set_open_replay_data)(struct 
obd_export *, + struct obd_client_handle *, + struct lookup_intent *); + int (*m_clear_open_replay_data)(struct obd_export *, + struct obd_client_handle *); + int (*m_set_lock_data)(struct obd_export *, __u64 *, void *, __u64 *); + + ldlm_mode_t (*m_lock_match)(struct obd_export *, __u64, + const struct lu_fid *, ldlm_type_t, + ldlm_policy_data_t *, ldlm_mode_t, + struct lustre_handle *); + + int (*m_cancel_unused)(struct obd_export *, const struct lu_fid *, + ldlm_policy_data_t *, ldlm_mode_t, + ldlm_cancel_flags_t flags, void *opaque); + int (*m_renew_capa)(struct obd_export *, struct obd_capa *oc, + renew_capa_cb_t cb); + int (*m_unpack_capa)(struct obd_export *, struct ptlrpc_request *, + const struct req_msg_field *, struct obd_capa **); + + int (*m_get_remote_perm)(struct obd_export *, const struct lu_fid *, + struct obd_capa *, __u32, + struct ptlrpc_request **); + + int (*m_intent_getattr_async)(struct obd_export *, + struct md_enqueue_info *, + struct ldlm_enqueue_info *); + + int (*m_revalidate_lock)(struct obd_export *, struct lookup_intent *, + struct lu_fid *, __u64 *bits); + + /* + * NOTE: If adding ops, add another LPROCFS_MD_OP_INIT() line to + * lprocfs_alloc_md_stats() in obdclass/lprocfs_status.c. Also, add a + * wrapper function in include/linux/obd_class.h. + */ +}; + +struct lsm_operations { + void (*lsm_free)(struct lov_stripe_md *); + int (*lsm_destroy)(struct lov_stripe_md *, struct obdo *oa, + struct obd_export *md_exp); + void (*lsm_stripe_by_index)(struct lov_stripe_md *, int *, u64 *, + u64 *); + void (*lsm_stripe_by_offset)(struct lov_stripe_md *, int *, u64 *, + u64 *); + int (*lsm_lmm_verify)(struct lov_mds_md *lmm, int lmm_bytes, + __u16 *stripe_count); + int (*lsm_unpackmd)(struct lov_obd *lov, struct lov_stripe_md *lsm, + struct lov_mds_md *lmm); +}; + +extern const struct lsm_operations lsm_v1_ops; +extern const struct lsm_operations lsm_v3_ops; +static inline const struct lsm_operations *lsm_op_find(int magic) +{ + switch (magic) { + case LOV_MAGIC_V1: + return &lsm_v1_ops; + case LOV_MAGIC_V3: + return &lsm_v3_ops; + default: + CERROR("Cannot recognize lsm_magic %08x\n", magic); + return NULL; + } +} + +/* Requests for obd_extent_calc() */ +#define OBD_CALC_STRIPE_START 1 +#define OBD_CALC_STRIPE_END 2 + +static inline struct lustre_capa *oinfo_capa(struct obd_info *oinfo) +{ + return oinfo->oi_capa; +} + +static inline struct md_open_data *obd_mod_alloc(void) +{ + struct md_open_data *mod; + + OBD_ALLOC_PTR(mod); + if (mod == NULL) + return NULL; + atomic_set(&mod->mod_refcount, 1); + return mod; +} + +#define obd_mod_get(mod) atomic_inc(&(mod)->mod_refcount) +#define obd_mod_put(mod) \ +({ \ + if (atomic_dec_and_test(&(mod)->mod_refcount)) { \ + if ((mod)->mod_open_req) \ + ptlrpc_req_finished((mod)->mod_open_req); \ + OBD_FREE_PTR(mod); \ + } \ +}) + +void obdo_from_inode(struct obdo *dst, struct inode *src, u32 valid); +void obdo_set_parent_fid(struct obdo *dst, const struct lu_fid *parent); + +/* return 1 if client should be resend request */ +static inline int client_should_resend(int resend, struct client_obd *cli) +{ + return atomic_read(&cli->cl_resends) ? 
+ atomic_read(&cli->cl_resends) > resend : 1; +} + +/** + * Return device name for this device + * + * XXX: lu_device is declared before obd_device, while a pointer pointing + * back to obd_device in lu_device, so this helper function defines here + * instead of in lu_object.h + */ +static inline const char *lu_dev_name(const struct lu_device *lu_dev) +{ + return lu_dev->ld_obd->obd_name; +} + +static inline bool filename_is_volatile(const char *name, int namelen, int *idx) +{ + const char *start; + char *end; + + if (strncmp(name, LUSTRE_VOLATILE_HDR, LUSTRE_VOLATILE_HDR_LEN) != 0) + return false; + + /* caller does not care of idx */ + if (idx == NULL) + return true; + + /* volatile file, the MDT can be set from name */ + /* name format is LUSTRE_VOLATILE_HDR:[idx]: */ + /* if no MDT is specified, use std way */ + if (namelen < LUSTRE_VOLATILE_HDR_LEN + 2) + goto bad_format; + /* test for no MDT idx case */ + if ((*(name + LUSTRE_VOLATILE_HDR_LEN) == ':') && + (*(name + LUSTRE_VOLATILE_HDR_LEN + 1) == ':')) { + *idx = -1; + return true; + } + /* we have an idx, read it */ + start = name + LUSTRE_VOLATILE_HDR_LEN + 1; + *idx = strtoul(start, &end, 0); + /* error cases: + * no digit, no trailing :, negative value + */ + if (((*idx == 0) && (end == start)) || + (*end != ':') || (*idx < 0)) + goto bad_format; + + return true; +bad_format: + /* bad format of mdt idx, we cannot return an error + * to caller so we use hash algo */ + CERROR("Bad volatile file name format: %s\n", + name + LUSTRE_VOLATILE_HDR_LEN); + return false; +} + +static inline int cli_brw_size(struct obd_device *obd) +{ + LASSERT(obd != NULL); + return obd->u.cli.cl_max_pages_per_rpc << PAGE_CACHE_SHIFT; +} + +#endif /* __OBD_H */ diff --git a/kernel/drivers/staging/lustre/lustre/include/obd_cache.h b/kernel/drivers/staging/lustre/lustre/include/obd_cache.h new file mode 100644 index 000000000..c8249fbb0 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/obd_cache.h @@ -0,0 +1,39 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#ifndef _OBD_CACHE_H__ +#define _OBD_CACHE_H__ + + +#endif diff --git a/kernel/drivers/staging/lustre/lustre/include/obd_cksum.h b/kernel/drivers/staging/lustre/lustre/include/obd_cksum.h new file mode 100644 index 000000000..3a63462aa --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/obd_cksum.h @@ -0,0 +1,176 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __OBD_CKSUM +#define __OBD_CKSUM +#include "../../include/linux/libcfs/libcfs.h" +#include "lustre/lustre_idl.h" + +static inline unsigned char cksum_obd2cfs(cksum_type_t cksum_type) +{ + switch (cksum_type) { + case OBD_CKSUM_CRC32: + return CFS_HASH_ALG_CRC32; + case OBD_CKSUM_ADLER: + return CFS_HASH_ALG_ADLER32; + case OBD_CKSUM_CRC32C: + return CFS_HASH_ALG_CRC32C; + default: + CERROR("Unknown checksum type (%x)!!!\n", cksum_type); + LBUG(); + } + return 0; +} + +/* The OBD_FL_CKSUM_* flags is packed into 5 bits of o_flags, since there can + * only be a single checksum type per RPC. + * + * The OBD_CHECKSUM_* type bits passed in ocd_cksum_types are a 32-bit bitmask + * since they need to represent the full range of checksum algorithms that + * both the client and server can understand. + * + * In case of an unsupported types/flags we fall back to ADLER + * because that is supported by all clients since 1.8 + * + * In case multiple algorithms are supported the best one is used. 
*/ +static inline u32 cksum_type_pack(cksum_type_t cksum_type) +{ + unsigned int performance = 0, tmp; + u32 flag = OBD_FL_CKSUM_ADLER; + + if (cksum_type & OBD_CKSUM_CRC32) { + tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_CRC32; + } + } + if (cksum_type & OBD_CKSUM_CRC32C) { + tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_CRC32C; + } + } + if (cksum_type & OBD_CKSUM_ADLER) { + tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_ADLER; + } + } + if (unlikely(cksum_type && !(cksum_type & (OBD_CKSUM_CRC32C | + OBD_CKSUM_CRC32 | + OBD_CKSUM_ADLER)))) + CWARN("unknown cksum type %x\n", cksum_type); + + return flag; +} + +static inline cksum_type_t cksum_type_unpack(u32 o_flags) +{ + switch (o_flags & OBD_FL_CKSUM_ALL) { + case OBD_FL_CKSUM_CRC32C: + return OBD_CKSUM_CRC32C; + case OBD_FL_CKSUM_CRC32: + return OBD_CKSUM_CRC32; + default: + break; + } + + return OBD_CKSUM_ADLER; +} + +/* Return a bitmask of the checksum types supported on this system. + * 1.8 supported ADLER it is base and not depend on hw + * Client uses all available local algos + */ +static inline cksum_type_t cksum_types_supported_client(void) +{ + cksum_type_t ret = OBD_CKSUM_ADLER; + + CDEBUG(D_INFO, "Crypto hash speed: crc %d, crc32c %d, adler %d\n", + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)), + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)), + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER))); + + if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)) > 0) + ret |= OBD_CKSUM_CRC32C; + if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)) > 0) + ret |= OBD_CKSUM_CRC32; + + return ret; +} + +/* Server uses algos that perform at 50% or better of the Adler */ +static inline cksum_type_t cksum_types_supported_server(void) +{ + int base_speed; + cksum_type_t ret = OBD_CKSUM_ADLER; + + CDEBUG(D_INFO, "Crypto hash speed: crc %d, crc32c %d, adler %d\n", + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)), + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)), + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER))); + + base_speed = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)) / 2; + + if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)) >= + base_speed) + ret |= OBD_CKSUM_CRC32C; + if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)) >= + base_speed) + ret |= OBD_CKSUM_CRC32; + + return ret; +} + + +/* Select the best checksum algorithm among those supplied in the cksum_types + * input. + * + * Currently, calling cksum_type_pack() with a mask will return the fastest + * checksum type due to its benchmarking at libcfs module load. + * Caution is advised, however, since what is fastest on a single client may + * not be the fastest or most efficient algorithm on the server. */ +static inline cksum_type_t cksum_type_select(cksum_type_t cksum_types) +{ + return cksum_type_unpack(cksum_type_pack(cksum_types)); +} + +/* Checksum algorithm names. Must be defined in the same order as the + * OBD_CKSUM_* flags. 
*/ +#define DECLARE_CKSUM_NAME char *cksum_name[] = {"crc32", "adler", "crc32c"} + +#endif /* __OBD_H */ diff --git a/kernel/drivers/staging/lustre/lustre/include/obd_class.h b/kernel/drivers/staging/lustre/lustre/include/obd_class.h new file mode 100644 index 000000000..34b5fa3f0 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/obd_class.h @@ -0,0 +1,1929 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +#ifndef __CLASS_OBD_H +#define __CLASS_OBD_H + + +#include "obd_support.h" +#include "lustre_import.h" +#include "lustre_net.h" +#include "obd.h" +#include "lustre_lib.h" +#include "lustre/lustre_idl.h" +#include "lprocfs_status.h" + +#define OBD_STATFS_NODELAY 0x0001 /* requests should be send without delay + * and resends for avoid deadlocks */ +#define OBD_STATFS_FROM_CACHE 0x0002 /* the statfs callback should not update + * obd_osfs_age */ +#define OBD_STATFS_PTLRPCD 0x0004 /* requests will be sent via ptlrpcd + * instead of a specific set. This + * means that we cannot rely on the set + * interpret routine to be called. + * lov_statfs_fini() must thus be called + * by the request interpret routine */ +#define OBD_STATFS_FOR_MDT0 0x0008 /* The statfs is only for retrieving + * information from MDT0. 
*/ +#define OBD_FL_PUNCH 0x00000001 /* To indicate it is punch operation */ + +/* OBD Device Declarations */ +extern struct obd_device *obd_devs[MAX_OBD_DEVICES]; +extern rwlock_t obd_dev_lock; + +/* OBD Operations Declarations */ +extern struct obd_device *class_conn2obd(struct lustre_handle *); +extern struct obd_device *class_exp2obd(struct obd_export *); +extern int class_handle_ioctl(unsigned int cmd, unsigned long arg); +extern int lustre_get_jobid(char *jobid); + +struct lu_device_type; + +/* genops.c */ +extern struct list_head obd_types; +struct obd_export *class_conn2export(struct lustre_handle *); +int class_register_type(struct obd_ops *, struct md_ops *, + struct lprocfs_vars *, const char *nm, + struct lu_device_type *ldt); +int class_unregister_type(const char *nm); + +struct obd_device *class_newdev(const char *type_name, const char *name); +void class_release_dev(struct obd_device *obd); + +int class_name2dev(const char *name); +struct obd_device *class_name2obd(const char *name); +int class_uuid2dev(struct obd_uuid *uuid); +struct obd_device *class_uuid2obd(struct obd_uuid *uuid); +void class_obd_list(void); +struct obd_device *class_find_client_obd(struct obd_uuid *tgt_uuid, + const char *typ_name, + struct obd_uuid *grp_uuid); +struct obd_device *class_devices_in_group(struct obd_uuid *grp_uuid, + int *next); +struct obd_device *class_num2obd(int num); +int get_devices_count(void); + +int class_notify_sptlrpc_conf(const char *fsname, int namelen); + +char *obd_export_nid2str(struct obd_export *exp); + +int obd_export_evict_by_nid(struct obd_device *obd, const char *nid); +int obd_export_evict_by_uuid(struct obd_device *obd, const char *uuid); +int obd_connect_flags2str(char *page, int count, __u64 flags, char *sep); + +int obd_zombie_impexp_init(void); +void obd_zombie_impexp_stop(void); +void obd_zombie_impexp_cull(void); +void obd_zombie_barrier(void); +void obd_exports_barrier(struct obd_device *obd); +int kuc_len(int payload_len); +struct kuc_hdr *kuc_ptr(void *p); +int kuc_ispayload(void *p); +void *kuc_alloc(int payload_len, int transport, int type); +void kuc_free(void *p, int payload_len); + +struct llog_handle; +struct llog_rec_hdr; +typedef int (*llog_cb_t)(const struct lu_env *, struct llog_handle *, + struct llog_rec_hdr *, void *); +/* obd_config.c */ +struct lustre_cfg *lustre_cfg_rename(struct lustre_cfg *cfg, + const char *new_name); +int class_process_config(struct lustre_cfg *lcfg); +int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars, + struct lustre_cfg *lcfg, void *data); +int class_attach(struct lustre_cfg *lcfg); +int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg); +int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg); +int class_detach(struct obd_device *obd, struct lustre_cfg *lcfg); +struct obd_device *class_incref(struct obd_device *obd, + const char *scope, const void *source); +void class_decref(struct obd_device *obd, + const char *scope, const void *source); +void dump_exports(struct obd_device *obd, int locks); +int class_config_llog_handler(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr *rec, void *data); +int class_add_conn(struct obd_device *obd, struct lustre_cfg *lcfg); +int class_add_uuid(const char *uuid, __u64 nid); + +/*obdecho*/ +#if defined (CONFIG_PROC_FS) +extern void lprocfs_echo_init_vars(struct lprocfs_static_vars *lvars); +#else +static inline void lprocfs_echo_init_vars(struct lprocfs_static_vars *lvars) +{ + memset(lvars, 0, sizeof(*lvars)); +} 
+#endif + +#define CFG_F_START 0x01 /* Set when we start updating from a log */ +#define CFG_F_MARKER 0x02 /* We are within a maker */ +#define CFG_F_SKIP 0x04 /* We should ignore this cfg command */ +#define CFG_F_COMPAT146 0x08 /* Allow old-style logs */ +#define CFG_F_EXCLUDE 0x10 /* OST exclusion list */ + +/* Passed as data param to class_config_parse_llog */ +struct config_llog_instance { + char *cfg_obdname; + void *cfg_instance; + struct super_block *cfg_sb; + struct obd_uuid cfg_uuid; + llog_cb_t cfg_callback; + int cfg_last_idx; /* for partial llog processing */ + int cfg_flags; +}; +int class_config_parse_llog(const struct lu_env *env, struct llog_ctxt *ctxt, + char *name, struct config_llog_instance *cfg); +int class_config_dump_llog(const struct lu_env *env, struct llog_ctxt *ctxt, + char *name, struct config_llog_instance *cfg); + +enum { + CONFIG_T_CONFIG = 0, + CONFIG_T_SPTLRPC = 1, + CONFIG_T_RECOVER = 2, + CONFIG_T_PARAMS = 3, + CONFIG_T_MAX = 4 +}; + +#define PARAMS_FILENAME "params" +#define LCTL_UPCALL "lctl" + +/* list of active configuration logs */ +struct config_llog_data { + struct ldlm_res_id cld_resid; + struct config_llog_instance cld_cfg; + struct list_head cld_list_chain; + atomic_t cld_refcount; + struct config_llog_data *cld_sptlrpc;/* depended sptlrpc log */ + struct config_llog_data *cld_params; /* common parameters log */ + struct config_llog_data *cld_recover;/* imperative recover log */ + struct obd_export *cld_mgcexp; + struct mutex cld_lock; + int cld_type; + unsigned int cld_stopping:1, /* we were told to stop + * watching */ + cld_lostlock:1; /* lock not requeued */ + char cld_logname[0]; +}; + +struct lustre_profile { + struct list_head lp_list; + char *lp_profile; + char *lp_dt; + char *lp_md; +}; + +struct lustre_profile *class_get_profile(const char *prof); +void class_del_profile(const char *prof); +void class_del_profiles(void); + +#if LUSTRE_TRACKS_LOCK_EXP_REFS + +void __class_export_add_lock_ref(struct obd_export *, struct ldlm_lock *); +void __class_export_del_lock_ref(struct obd_export *, struct ldlm_lock *); +extern void (*class_export_dump_hook)(struct obd_export *); + +#else + +#define __class_export_add_lock_ref(exp, lock) do {} while (0) +#define __class_export_del_lock_ref(exp, lock) do {} while (0) + +#endif + +static inline void class_export_rpc_inc(struct obd_export *exp) +{ + atomic_inc(&(exp)->exp_rpc_count); + CDEBUG(D_INFO, "RPC GETting export %p : new rpc_count %d\n", + (exp), atomic_read(&(exp)->exp_rpc_count)); +} + +static inline void class_export_rpc_dec(struct obd_export *exp) +{ + LASSERT_ATOMIC_POS(&exp->exp_rpc_count); + atomic_dec(&(exp)->exp_rpc_count); + CDEBUG(D_INFO, "RPC PUTting export %p : new rpc_count %d\n", + (exp), atomic_read(&(exp)->exp_rpc_count)); +} + +#define class_export_lock_get(exp, lock) \ +({ \ + atomic_inc(&(exp)->exp_locks_count); \ + __class_export_add_lock_ref(exp, lock); \ + CDEBUG(D_INFO, "lock GETting export %p : new locks_count %d\n", \ + (exp), atomic_read(&(exp)->exp_locks_count)); \ + class_export_get(exp); \ +}) + +#define class_export_lock_put(exp, lock) \ +({ \ + LASSERT_ATOMIC_POS(&exp->exp_locks_count); \ + atomic_dec(&(exp)->exp_locks_count); \ + __class_export_del_lock_ref(exp, lock); \ + CDEBUG(D_INFO, "lock PUTting export %p : new locks_count %d\n", \ + (exp), atomic_read(&(exp)->exp_locks_count)); \ + class_export_put(exp); \ +}) + +#define class_export_cb_get(exp) \ +({ \ + atomic_inc(&(exp)->exp_cb_count); \ + CDEBUG(D_INFO, "callback GETting export %p : new cb_count 
%d\n",\ + (exp), atomic_read(&(exp)->exp_cb_count)); \ + class_export_get(exp); \ +}) + +#define class_export_cb_put(exp) \ +({ \ + LASSERT_ATOMIC_POS(&exp->exp_cb_count); \ + atomic_dec(&(exp)->exp_cb_count); \ + CDEBUG(D_INFO, "callback PUTting export %p : new cb_count %d\n",\ + (exp), atomic_read(&(exp)->exp_cb_count)); \ + class_export_put(exp); \ +}) + +/* genops.c */ +struct obd_export *class_export_get(struct obd_export *exp); +void class_export_put(struct obd_export *exp); +struct obd_export *class_new_export(struct obd_device *obddev, + struct obd_uuid *cluuid); +void class_unlink_export(struct obd_export *exp); + +struct obd_import *class_import_get(struct obd_import *); +void class_import_put(struct obd_import *); +struct obd_import *class_new_import(struct obd_device *obd); +void class_destroy_import(struct obd_import *exp); + +struct obd_type *class_search_type(const char *name); +struct obd_type *class_get_type(const char *name); +void class_put_type(struct obd_type *type); +int class_connect(struct lustre_handle *conn, struct obd_device *obd, + struct obd_uuid *cluuid); +int class_disconnect(struct obd_export *exp); +void class_fail_export(struct obd_export *exp); +int class_connected_export(struct obd_export *exp); +void class_disconnect_exports(struct obd_device *obddev); +int class_manual_cleanup(struct obd_device *obd); +void class_disconnect_stale_exports(struct obd_device *, + int (*test_export)(struct obd_export *)); +static inline enum obd_option exp_flags_from_obd(struct obd_device *obd) +{ + return ((obd->obd_fail ? OBD_OPT_FAILOVER : 0) | + (obd->obd_force ? OBD_OPT_FORCE : 0) | + (obd->obd_abort_recovery ? OBD_OPT_ABORT_RECOV : 0) | + 0); +} + +struct inode; +struct lu_attr; +struct obdo; +void obdo_from_la(struct obdo *dst, struct lu_attr *la, __u64 valid); +void la_from_obdo(struct lu_attr *la, struct obdo *dst, u32 valid); +void obdo_refresh_inode(struct inode *dst, struct obdo *src, u32 valid); +void obdo_to_inode(struct inode *dst, struct obdo *src, u32 valid); + +void obdo_cpy_md(struct obdo *dst, struct obdo *src, u32 valid); +void obdo_to_ioobj(struct obdo *oa, struct obd_ioobj *ioobj); +void obdo_from_iattr(struct obdo *oa, struct iattr *attr, + unsigned int ia_valid); +void iattr_from_obdo(struct iattr *attr, struct obdo *oa, u32 valid); +void md_from_obdo(struct md_op_data *op_data, struct obdo *oa, u32 valid); +void obdo_from_md(struct obdo *oa, struct md_op_data *op_data, + unsigned int valid); + +void obdo_cpu_to_le(struct obdo *dobdo, struct obdo *sobdo); +void obdo_le_to_cpu(struct obdo *dobdo, struct obdo *sobdo); + +#define OBT(dev) (dev)->obd_type +#define OBP(dev, op) (dev)->obd_type->typ_dt_ops->o_ ## op +#define MDP(dev, op) (dev)->obd_type->typ_md_ops->m_ ## op +#define CTXTP(ctxt, op) (ctxt)->loc_logops->lop_##op + +/* Ensure obd_setup: used for cleanup which must be called + while obd is stopping */ +static inline int obd_check_dev(struct obd_device *obd) +{ + if (!obd) { + CERROR("NULL device\n"); + return -ENODEV; + } + return 0; +} + +/* ensure obd_setup and !obd_stopping */ +static inline int obd_check_dev_active(struct obd_device *obd) +{ + int rc; + + rc = obd_check_dev(obd); + if (rc) + return rc; + if (!obd->obd_set_up || obd->obd_stopping) { + CERROR("Device %d not setup\n", obd->obd_minor); + return -ENODEV; + } + return rc; +} + +#if defined (CONFIG_PROC_FS) +#define OBD_COUNTER_OFFSET(op) \ + ((offsetof(struct obd_ops, o_ ## op) - \ + offsetof(struct obd_ops, o_iocontrol)) \ + / sizeof(((struct obd_ops *)(0))->o_iocontrol)) + 
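The OBD_COUNTER_OFFSET() macro above turns an operation's position inside struct obd_ops into an index into the per-device lprocfs stats array: it takes the byte offset of o_<op> relative to o_iocontrol (which the macro treats as the first counted operation) and divides by the size of one function-pointer member. The standalone sketch below is illustrative only and not part of the imported Lustre source; struct toy_ops and TOY_COUNTER_OFFSET() are made-up stand-ins used to show the same offsetof() arithmetic in isolation.

/* Illustration only -- not part of the patch. A reduced ops table and the
 * same offsetof()-based slot calculation used by OBD_COUNTER_OFFSET(). */
#include <stddef.h>
#include <stdio.h>

struct toy_ops {                    /* hypothetical stand-in for struct obd_ops */
	int (*o_iocontrol)(void);   /* first op that gets a stats slot */
	int (*o_get_info)(void);
	int (*o_connect)(void);
	int (*o_statfs)(void);
};

/* Same idea as OBD_COUNTER_OFFSET(op): byte distance from the first counted
 * op, scaled down to an array index by the size of one member. */
#define TOY_COUNTER_OFFSET(op) \
	((offsetof(struct toy_ops, o_ ## op) - \
	  offsetof(struct toy_ops, o_iocontrol)) / \
	 sizeof(((struct toy_ops *)0)->o_iocontrol))

int main(void)
{
	/* Prints 0, 2, 3: each op maps to a distinct counter slot. */
	printf("iocontrol -> %zu\n", TOY_COUNTER_OFFSET(iocontrol));
	printf("connect   -> %zu\n", TOY_COUNTER_OFFSET(connect));
	printf("statfs    -> %zu\n", TOY_COUNTER_OFFSET(statfs));
	return 0;
}

The OBD_COUNTER_INCREMENT()/EXP_COUNTER_INCREMENT() wrappers that follow add this offset to the device's counter base before bumping the lprocfs counter, and the MD_COUNTER_OFFSET()/MD_*_INCREMENT() variants below apply the same scheme to struct md_ops with m_getstatus as the base member.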
+#define OBD_COUNTER_INCREMENT(obdx, op) \ + if ((obdx)->obd_stats != NULL) { \ + unsigned int coffset; \ + coffset = (unsigned int)((obdx)->obd_cntr_base) + \ + OBD_COUNTER_OFFSET(op); \ + LASSERT(coffset < (obdx)->obd_stats->ls_num); \ + lprocfs_counter_incr((obdx)->obd_stats, coffset); \ + } + +#define EXP_COUNTER_INCREMENT(export, op) \ + if ((export)->exp_obd->obd_stats != NULL) { \ + unsigned int coffset; \ + coffset = (unsigned int)((export)->exp_obd->obd_cntr_base) + \ + OBD_COUNTER_OFFSET(op); \ + LASSERT(coffset < (export)->exp_obd->obd_stats->ls_num); \ + lprocfs_counter_incr((export)->exp_obd->obd_stats, coffset); \ + if ((export)->exp_nid_stats != NULL && \ + (export)->exp_nid_stats->nid_stats != NULL) \ + lprocfs_counter_incr( \ + (export)->exp_nid_stats->nid_stats, coffset);\ + } + +#define MD_COUNTER_OFFSET(op) \ + ((offsetof(struct md_ops, m_ ## op) - \ + offsetof(struct md_ops, m_getstatus)) \ + / sizeof(((struct md_ops *)(0))->m_getstatus)) + +#define MD_COUNTER_INCREMENT(obdx, op) \ + if ((obd)->md_stats != NULL) { \ + unsigned int coffset; \ + coffset = (unsigned int)((obdx)->md_cntr_base) + \ + MD_COUNTER_OFFSET(op); \ + LASSERT(coffset < (obdx)->md_stats->ls_num); \ + lprocfs_counter_incr((obdx)->md_stats, coffset); \ + } + +#define EXP_MD_COUNTER_INCREMENT(export, op) \ + if ((export)->exp_obd->obd_stats != NULL) { \ + unsigned int coffset; \ + coffset = (unsigned int)((export)->exp_obd->md_cntr_base) + \ + MD_COUNTER_OFFSET(op); \ + LASSERT(coffset < (export)->exp_obd->md_stats->ls_num); \ + lprocfs_counter_incr((export)->exp_obd->md_stats, coffset); \ + if ((export)->exp_md_stats != NULL) \ + lprocfs_counter_incr( \ + (export)->exp_md_stats, coffset); \ + } + +#else +#define OBD_COUNTER_OFFSET(op) +#define OBD_COUNTER_INCREMENT(obd, op) +#define EXP_COUNTER_INCREMENT(exp, op) +#define MD_COUNTER_INCREMENT(obd, op) +#define EXP_MD_COUNTER_INCREMENT(exp, op) +#endif + +static inline int lprocfs_nid_ldlm_stats_init(struct nid_stat *tmp) +{ + /* Always add in ldlm_stats */ + tmp->nid_ldlm_stats = lprocfs_alloc_stats(LDLM_LAST_OPC - LDLM_FIRST_OPC + ,LPROCFS_STATS_FLAG_NOPERCPU); + if (tmp->nid_ldlm_stats == NULL) + return -ENOMEM; + + lprocfs_init_ldlm_stats(tmp->nid_ldlm_stats); + + return lprocfs_register_stats(tmp->nid_proc, "ldlm_stats", + tmp->nid_ldlm_stats); +} + +#define OBD_CHECK_MD_OP(obd, op, err) \ +do { \ + if (!OBT(obd) || !MDP((obd), op)) { \ + if (err) \ + CERROR("md_" #op ": dev %s/%d no operation\n", \ + obd->obd_name, obd->obd_minor); \ + return err; \ + } \ +} while (0) + +#define EXP_CHECK_MD_OP(exp, op) \ +do { \ + if ((exp) == NULL) { \ + CERROR("obd_" #op ": NULL export\n"); \ + return -ENODEV; \ + } \ + if ((exp)->exp_obd == NULL || !OBT((exp)->exp_obd)) { \ + CERROR("obd_" #op ": cleaned up obd\n"); \ + return -EOPNOTSUPP; \ + } \ + if (!OBT((exp)->exp_obd) || !MDP((exp)->exp_obd, op)) { \ + CERROR("obd_" #op ": dev %s/%d no operation\n", \ + (exp)->exp_obd->obd_name, \ + (exp)->exp_obd->obd_minor); \ + return -EOPNOTSUPP; \ + } \ +} while (0) + + +#define OBD_CHECK_DT_OP(obd, op, err) \ +do { \ + if (!OBT(obd) || !OBP((obd), op)) { \ + if (err) \ + CERROR("obd_" #op ": dev %d no operation\n", \ + obd->obd_minor); \ + return err; \ + } \ +} while (0) + +#define EXP_CHECK_DT_OP(exp, op) \ +do { \ + if ((exp) == NULL) { \ + CERROR("obd_" #op ": NULL export\n"); \ + return -ENODEV; \ + } \ + if ((exp)->exp_obd == NULL || !OBT((exp)->exp_obd)) { \ + CERROR("obd_" #op ": cleaned up obd\n"); \ + return -EOPNOTSUPP; \ + } \ + if 
(!OBT((exp)->exp_obd) || !OBP((exp)->exp_obd, op)) { \ + CERROR("obd_" #op ": dev %d no operation\n", \ + (exp)->exp_obd->obd_minor); \ + return -EOPNOTSUPP; \ + } \ +} while (0) + +#define CTXT_CHECK_OP(ctxt, op, err) \ +do { \ + if (!OBT(ctxt->loc_obd) || !CTXTP((ctxt), op)) { \ + if (err) \ + CERROR("lop_" #op ": dev %d no operation\n", \ + ctxt->loc_obd->obd_minor); \ + return err; \ + } \ +} while (0) + +static inline int class_devno_max(void) +{ + return MAX_OBD_DEVICES; +} + +static inline int obd_get_info(const struct lu_env *env, + struct obd_export *exp, __u32 keylen, + void *key, __u32 *vallen, void *val, + struct lov_stripe_md *lsm) +{ + int rc; + + EXP_CHECK_DT_OP(exp, get_info); + EXP_COUNTER_INCREMENT(exp, get_info); + + rc = OBP(exp->exp_obd, get_info)(env, exp, keylen, key, vallen, val, + lsm); + return rc; +} + +static inline int obd_set_info_async(const struct lu_env *env, + struct obd_export *exp, u32 keylen, + void *key, u32 vallen, void *val, + struct ptlrpc_request_set *set) +{ + int rc; + + EXP_CHECK_DT_OP(exp, set_info_async); + EXP_COUNTER_INCREMENT(exp, set_info_async); + + rc = OBP(exp->exp_obd, set_info_async)(env, exp, keylen, key, vallen, + val, set); + return rc; +} + +/* + * obd-lu integration. + * + * Functionality is being moved into new lu_device-based layering, but some + * pieces of configuration process are still based on obd devices. + * + * Specifically, lu_device_type_operations::ldto_device_alloc() methods fully + * subsume ->o_setup() methods of obd devices they replace. The same for + * lu_device_operations::ldo_process_config() and ->o_process_config(). As a + * result, obd_setup() and obd_process_config() branch and call one XOR + * another. + * + * Yet neither lu_device_type_operations::ldto_device_fini() nor + * lu_device_type_operations::ldto_device_free() fully implement the + * functionality of ->o_precleanup() and ->o_cleanup() they override. Hence, + * obd_precleanup() and obd_cleanup() call both lu_device and obd operations. 
+ */ + +#define DECLARE_LU_VARS(ldt, d) \ + struct lu_device_type *ldt; \ + struct lu_device *d + +static inline int obd_setup(struct obd_device *obd, struct lustre_cfg *cfg) +{ + int rc; + DECLARE_LU_VARS(ldt, d); + + ldt = obd->obd_type->typ_lu; + if (ldt != NULL) { + struct lu_context session_ctx; + struct lu_env env; + lu_context_init(&session_ctx, LCT_SESSION); + session_ctx.lc_thread = NULL; + lu_context_enter(&session_ctx); + + rc = lu_env_init(&env, ldt->ldt_ctx_tags); + if (rc == 0) { + env.le_ses = &session_ctx; + d = ldt->ldt_ops->ldto_device_alloc(&env, ldt, cfg); + lu_env_fini(&env); + if (!IS_ERR(d)) { + obd->obd_lu_dev = d; + d->ld_obd = obd; + rc = 0; + } else + rc = PTR_ERR(d); + } + lu_context_exit(&session_ctx); + lu_context_fini(&session_ctx); + + } else { + OBD_CHECK_DT_OP(obd, setup, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, setup); + rc = OBP(obd, setup)(obd, cfg); + } + return rc; +} + +static inline int obd_precleanup(struct obd_device *obd, + enum obd_cleanup_stage cleanup_stage) +{ + int rc; + DECLARE_LU_VARS(ldt, d); + + rc = obd_check_dev(obd); + if (rc) + return rc; + ldt = obd->obd_type->typ_lu; + d = obd->obd_lu_dev; + if (ldt != NULL && d != NULL) { + if (cleanup_stage == OBD_CLEANUP_EXPORTS) { + struct lu_env env; + + rc = lu_env_init(&env, ldt->ldt_ctx_tags); + if (rc == 0) { + ldt->ldt_ops->ldto_device_fini(&env, d); + lu_env_fini(&env); + } + } + } + OBD_CHECK_DT_OP(obd, precleanup, 0); + OBD_COUNTER_INCREMENT(obd, precleanup); + + rc = OBP(obd, precleanup)(obd, cleanup_stage); + return rc; +} + +static inline int obd_cleanup(struct obd_device *obd) +{ + int rc; + DECLARE_LU_VARS(ldt, d); + + rc = obd_check_dev(obd); + if (rc) + return rc; + + ldt = obd->obd_type->typ_lu; + d = obd->obd_lu_dev; + if (ldt != NULL && d != NULL) { + struct lu_env env; + + rc = lu_env_init(&env, ldt->ldt_ctx_tags); + if (rc == 0) { + ldt->ldt_ops->ldto_device_free(&env, d); + lu_env_fini(&env); + obd->obd_lu_dev = NULL; + } + } + OBD_CHECK_DT_OP(obd, cleanup, 0); + OBD_COUNTER_INCREMENT(obd, cleanup); + + rc = OBP(obd, cleanup)(obd); + return rc; +} + +static inline void obd_cleanup_client_import(struct obd_device *obd) +{ + /* If we set up but never connected, the + client import will not have been cleaned. */ + down_write(&obd->u.cli.cl_sem); + if (obd->u.cli.cl_import) { + struct obd_import *imp; + imp = obd->u.cli.cl_import; + CDEBUG(D_CONFIG, "%s: client import never connected\n", + obd->obd_name); + ptlrpc_invalidate_import(imp); + if (imp->imp_rq_pool) { + ptlrpc_free_rq_pool(imp->imp_rq_pool); + imp->imp_rq_pool = NULL; + } + client_destroy_import(imp); + obd->u.cli.cl_import = NULL; + } + up_write(&obd->u.cli.cl_sem); +} + +static inline int +obd_process_config(struct obd_device *obd, int datalen, void *data) +{ + int rc; + DECLARE_LU_VARS(ldt, d); + + rc = obd_check_dev(obd); + if (rc) + return rc; + + obd->obd_process_conf = 1; + ldt = obd->obd_type->typ_lu; + d = obd->obd_lu_dev; + if (ldt != NULL && d != NULL) { + struct lu_env env; + + rc = lu_env_init(&env, ldt->ldt_ctx_tags); + if (rc == 0) { + rc = d->ld_ops->ldo_process_config(&env, d, data); + lu_env_fini(&env); + } + } else { + OBD_CHECK_DT_OP(obd, process_config, -EOPNOTSUPP); + rc = OBP(obd, process_config)(obd, datalen, data); + } + OBD_COUNTER_INCREMENT(obd, process_config); + obd->obd_process_conf = 0; + + return rc; +} + +/* Pack an in-memory MD struct for storage on disk. + * Returns +ve size of packed MD (0 for free), or -ve error. 
+ * + * If @disk_tgt == NULL, MD size is returned (max size if @mem_src == NULL). + * If @*disk_tgt != NULL and @mem_src == NULL, @*disk_tgt will be freed. + * If @*disk_tgt == NULL, it will be allocated + */ +static inline int obd_packmd(struct obd_export *exp, + struct lov_mds_md **disk_tgt, + struct lov_stripe_md *mem_src) +{ + int rc; + + EXP_CHECK_DT_OP(exp, packmd); + EXP_COUNTER_INCREMENT(exp, packmd); + + rc = OBP(exp->exp_obd, packmd)(exp, disk_tgt, mem_src); + return rc; +} + +static inline int obd_size_diskmd(struct obd_export *exp, + struct lov_stripe_md *mem_src) +{ + return obd_packmd(exp, NULL, mem_src); +} + +static inline int obd_free_diskmd(struct obd_export *exp, + struct lov_mds_md **disk_tgt) +{ + LASSERT(disk_tgt); + LASSERT(*disk_tgt); + /* + * LU-2590, for caller's convenience, *disk_tgt could be host + * endianness, it needs swab to LE if necessary, while just + * lov_mds_md header needs it for figuring out how much memory + * needs to be freed. + */ + if ((cpu_to_le32(LOV_MAGIC) != LOV_MAGIC) && + (((*disk_tgt)->lmm_magic == LOV_MAGIC_V1) || + ((*disk_tgt)->lmm_magic == LOV_MAGIC_V3))) + lustre_swab_lov_mds_md(*disk_tgt); + return obd_packmd(exp, disk_tgt, NULL); +} + +/* Unpack an MD struct from disk to in-memory format. + * Returns +ve size of unpacked MD (0 for free), or -ve error. + * + * If @mem_tgt == NULL, MD size is returned (max size if @disk_src == NULL). + * If @*mem_tgt != NULL and @disk_src == NULL, @*mem_tgt will be freed. + * If @*mem_tgt == NULL, it will be allocated + */ +static inline int obd_unpackmd(struct obd_export *exp, + struct lov_stripe_md **mem_tgt, + struct lov_mds_md *disk_src, + int disk_len) +{ + int rc; + + EXP_CHECK_DT_OP(exp, unpackmd); + EXP_COUNTER_INCREMENT(exp, unpackmd); + + rc = OBP(exp->exp_obd, unpackmd)(exp, mem_tgt, disk_src, disk_len); + return rc; +} + +/* helper functions */ +static inline int obd_alloc_memmd(struct obd_export *exp, + struct lov_stripe_md **mem_tgt) +{ + LASSERT(mem_tgt); + LASSERT(*mem_tgt == NULL); + return obd_unpackmd(exp, mem_tgt, NULL, 0); +} + +static inline int obd_free_memmd(struct obd_export *exp, + struct lov_stripe_md **mem_tgt) +{ + int rc; + + LASSERT(mem_tgt); + LASSERT(*mem_tgt); + rc = obd_unpackmd(exp, mem_tgt, NULL, 0); + *mem_tgt = NULL; + return rc; +} + +static inline int obd_create(const struct lu_env *env, struct obd_export *exp, + struct obdo *obdo, struct lov_stripe_md **ea, + struct obd_trans_info *oti) +{ + int rc; + + EXP_CHECK_DT_OP(exp, create); + EXP_COUNTER_INCREMENT(exp, create); + + rc = OBP(exp->exp_obd, create)(env, exp, obdo, ea, oti); + return rc; +} + +static inline int obd_destroy(const struct lu_env *env, struct obd_export *exp, + struct obdo *obdo, struct lov_stripe_md *ea, + struct obd_trans_info *oti, + struct obd_export *md_exp, void *capa) +{ + int rc; + + EXP_CHECK_DT_OP(exp, destroy); + EXP_COUNTER_INCREMENT(exp, destroy); + + rc = OBP(exp->exp_obd, destroy)(env, exp, obdo, ea, oti, md_exp, capa); + return rc; +} + +static inline int obd_getattr(const struct lu_env *env, struct obd_export *exp, + struct obd_info *oinfo) +{ + int rc; + + EXP_CHECK_DT_OP(exp, getattr); + EXP_COUNTER_INCREMENT(exp, getattr); + + rc = OBP(exp->exp_obd, getattr)(env, exp, oinfo); + return rc; +} + +static inline int obd_getattr_async(struct obd_export *exp, + struct obd_info *oinfo, + struct ptlrpc_request_set *set) +{ + int rc; + + EXP_CHECK_DT_OP(exp, getattr_async); + EXP_COUNTER_INCREMENT(exp, getattr_async); + + rc = OBP(exp->exp_obd, getattr_async)(exp, oinfo, set); + 
return rc; +} + +static inline int obd_setattr(const struct lu_env *env, struct obd_export *exp, + struct obd_info *oinfo, + struct obd_trans_info *oti) +{ + int rc; + + EXP_CHECK_DT_OP(exp, setattr); + EXP_COUNTER_INCREMENT(exp, setattr); + + rc = OBP(exp->exp_obd, setattr)(env, exp, oinfo, oti); + return rc; +} + +/* This performs all the requests set init/wait/destroy actions. */ +static inline int obd_setattr_rqset(struct obd_export *exp, + struct obd_info *oinfo, + struct obd_trans_info *oti) +{ + struct ptlrpc_request_set *set = NULL; + int rc; + + EXP_CHECK_DT_OP(exp, setattr_async); + EXP_COUNTER_INCREMENT(exp, setattr_async); + + set = ptlrpc_prep_set(); + if (set == NULL) + return -ENOMEM; + + rc = OBP(exp->exp_obd, setattr_async)(exp, oinfo, oti, set); + if (rc == 0) + rc = ptlrpc_set_wait(set); + ptlrpc_set_destroy(set); + return rc; +} + +/* This adds all the requests into @set if @set != NULL, otherwise + all requests are sent asynchronously without waiting for response. */ +static inline int obd_setattr_async(struct obd_export *exp, + struct obd_info *oinfo, + struct obd_trans_info *oti, + struct ptlrpc_request_set *set) +{ + int rc; + + EXP_CHECK_DT_OP(exp, setattr_async); + EXP_COUNTER_INCREMENT(exp, setattr_async); + + rc = OBP(exp->exp_obd, setattr_async)(exp, oinfo, oti, set); + return rc; +} + +static inline int obd_add_conn(struct obd_import *imp, struct obd_uuid *uuid, + int priority) +{ + struct obd_device *obd = imp->imp_obd; + int rc; + + rc = obd_check_dev_active(obd); + if (rc) + return rc; + OBD_CHECK_DT_OP(obd, add_conn, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, add_conn); + + rc = OBP(obd, add_conn)(imp, uuid, priority); + return rc; +} + +static inline int obd_del_conn(struct obd_import *imp, struct obd_uuid *uuid) +{ + struct obd_device *obd = imp->imp_obd; + int rc; + + rc = obd_check_dev_active(obd); + if (rc) + return rc; + OBD_CHECK_DT_OP(obd, del_conn, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, del_conn); + + rc = OBP(obd, del_conn)(imp, uuid); + return rc; +} + +static inline struct obd_uuid *obd_get_uuid(struct obd_export *exp) +{ + struct obd_uuid *uuid; + + OBD_CHECK_DT_OP(exp->exp_obd, get_uuid, NULL); + EXP_COUNTER_INCREMENT(exp, get_uuid); + + uuid = OBP(exp->exp_obd, get_uuid)(exp); + return uuid; +} + +/** Create a new /a exp on device /a obd for the uuid /a cluuid + * @param exp New export handle + * @param d Connect data, supported flags are set, flags also understood + * by obd are returned. + */ +static inline int obd_connect(const struct lu_env *env, + struct obd_export **exp, struct obd_device *obd, + struct obd_uuid *cluuid, + struct obd_connect_data *data, + void *localdata) +{ + int rc; + __u64 ocf = data ? data->ocd_connect_flags : 0; /* for post-condition + * check */ + + rc = obd_check_dev_active(obd); + if (rc) + return rc; + OBD_CHECK_DT_OP(obd, connect, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, connect); + + rc = OBP(obd, connect)(env, exp, obd, cluuid, data, localdata); + /* check that only subset is granted */ + LASSERT(ergo(data != NULL, (data->ocd_connect_flags & ocf) == + data->ocd_connect_flags)); + return rc; +} + +static inline int obd_reconnect(const struct lu_env *env, + struct obd_export *exp, + struct obd_device *obd, + struct obd_uuid *cluuid, + struct obd_connect_data *d, + void *localdata) +{ + int rc; + __u64 ocf = d ? 
d->ocd_connect_flags : 0; /* for post-condition + * check */ + + rc = obd_check_dev_active(obd); + if (rc) + return rc; + OBD_CHECK_DT_OP(obd, reconnect, 0); + OBD_COUNTER_INCREMENT(obd, reconnect); + + rc = OBP(obd, reconnect)(env, exp, obd, cluuid, d, localdata); + /* check that only subset is granted */ + LASSERT(ergo(d != NULL, + (d->ocd_connect_flags & ocf) == d->ocd_connect_flags)); + return rc; +} + +static inline int obd_disconnect(struct obd_export *exp) +{ + int rc; + + EXP_CHECK_DT_OP(exp, disconnect); + EXP_COUNTER_INCREMENT(exp, disconnect); + + rc = OBP(exp->exp_obd, disconnect)(exp); + return rc; +} + +static inline int obd_fid_init(struct obd_device *obd, struct obd_export *exp, + enum lu_cli_type type) +{ + int rc; + + OBD_CHECK_DT_OP(obd, fid_init, 0); + OBD_COUNTER_INCREMENT(obd, fid_init); + + rc = OBP(obd, fid_init)(obd, exp, type); + return rc; +} + +static inline int obd_fid_fini(struct obd_device *obd) +{ + int rc; + + OBD_CHECK_DT_OP(obd, fid_fini, 0); + OBD_COUNTER_INCREMENT(obd, fid_fini); + + rc = OBP(obd, fid_fini)(obd); + return rc; +} + +static inline int obd_fid_alloc(struct obd_export *exp, + struct lu_fid *fid, + struct md_op_data *op_data) +{ + int rc; + + EXP_CHECK_DT_OP(exp, fid_alloc); + EXP_COUNTER_INCREMENT(exp, fid_alloc); + + rc = OBP(exp->exp_obd, fid_alloc)(exp, fid, op_data); + return rc; +} + +static inline int obd_pool_new(struct obd_device *obd, char *poolname) +{ + int rc; + + OBD_CHECK_DT_OP(obd, pool_new, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, pool_new); + + rc = OBP(obd, pool_new)(obd, poolname); + return rc; +} + +static inline int obd_pool_del(struct obd_device *obd, char *poolname) +{ + int rc; + + OBD_CHECK_DT_OP(obd, pool_del, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, pool_del); + + rc = OBP(obd, pool_del)(obd, poolname); + return rc; +} + +static inline int obd_pool_add(struct obd_device *obd, char *poolname, char *ostname) +{ + int rc; + + OBD_CHECK_DT_OP(obd, pool_add, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, pool_add); + + rc = OBP(obd, pool_add)(obd, poolname, ostname); + return rc; +} + +static inline int obd_pool_rem(struct obd_device *obd, char *poolname, char *ostname) +{ + int rc; + + OBD_CHECK_DT_OP(obd, pool_rem, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, pool_rem); + + rc = OBP(obd, pool_rem)(obd, poolname, ostname); + return rc; +} + +static inline void obd_getref(struct obd_device *obd) +{ + if (OBT(obd) && OBP(obd, getref)) { + OBD_COUNTER_INCREMENT(obd, getref); + OBP(obd, getref)(obd); + } +} + +static inline void obd_putref(struct obd_device *obd) +{ + if (OBT(obd) && OBP(obd, putref)) { + OBD_COUNTER_INCREMENT(obd, putref); + OBP(obd, putref)(obd); + } +} + +static inline int obd_init_export(struct obd_export *exp) +{ + int rc = 0; + + if ((exp)->exp_obd != NULL && OBT((exp)->exp_obd) && + OBP((exp)->exp_obd, init_export)) + rc = OBP(exp->exp_obd, init_export)(exp); + return rc; +} + +static inline int obd_destroy_export(struct obd_export *exp) +{ + if ((exp)->exp_obd != NULL && OBT((exp)->exp_obd) && + OBP((exp)->exp_obd, destroy_export)) + OBP(exp->exp_obd, destroy_export)(exp); + return 0; +} + +/* @max_age is the oldest time in jiffies that we accept using a cached data. + * If the cache is older than @max_age we will get a new value from the + * target. Use a value of "cfs_time_current() + HZ" to guarantee freshness. 
*/ +static inline int obd_statfs_async(struct obd_export *exp, + struct obd_info *oinfo, + __u64 max_age, + struct ptlrpc_request_set *rqset) +{ + int rc = 0; + struct obd_device *obd; + + if (exp == NULL || exp->exp_obd == NULL) + return -EINVAL; + + obd = exp->exp_obd; + OBD_CHECK_DT_OP(obd, statfs, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, statfs); + + CDEBUG(D_SUPER, "%s: osfs %p age %llu, max_age %llu\n", + obd->obd_name, &obd->obd_osfs, obd->obd_osfs_age, max_age); + if (cfs_time_before_64(obd->obd_osfs_age, max_age)) { + rc = OBP(obd, statfs_async)(exp, oinfo, max_age, rqset); + } else { + CDEBUG(D_SUPER, + "%s: use %p cache blocks %llu/%llu objects %llu/%llu\n", + obd->obd_name, &obd->obd_osfs, + obd->obd_osfs.os_bavail, obd->obd_osfs.os_blocks, + obd->obd_osfs.os_ffree, obd->obd_osfs.os_files); + spin_lock(&obd->obd_osfs_lock); + memcpy(oinfo->oi_osfs, &obd->obd_osfs, sizeof(*oinfo->oi_osfs)); + spin_unlock(&obd->obd_osfs_lock); + oinfo->oi_flags |= OBD_STATFS_FROM_CACHE; + if (oinfo->oi_cb_up) + oinfo->oi_cb_up(oinfo, 0); + } + return rc; +} + +static inline int obd_statfs_rqset(struct obd_export *exp, + struct obd_statfs *osfs, __u64 max_age, + __u32 flags) +{ + struct ptlrpc_request_set *set = NULL; + struct obd_info oinfo = { { { 0 } } }; + int rc = 0; + + set = ptlrpc_prep_set(); + if (set == NULL) + return -ENOMEM; + + oinfo.oi_osfs = osfs; + oinfo.oi_flags = flags; + rc = obd_statfs_async(exp, &oinfo, max_age, set); + if (rc == 0) + rc = ptlrpc_set_wait(set); + ptlrpc_set_destroy(set); + return rc; +} + +/* @max_age is the oldest time in jiffies that we accept using a cached data. + * If the cache is older than @max_age we will get a new value from the + * target. Use a value of "cfs_time_current() + HZ" to guarantee freshness. */ +static inline int obd_statfs(const struct lu_env *env, struct obd_export *exp, + struct obd_statfs *osfs, __u64 max_age, + __u32 flags) +{ + int rc = 0; + struct obd_device *obd = exp->exp_obd; + + if (obd == NULL) + return -EINVAL; + + OBD_CHECK_DT_OP(obd, statfs, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, statfs); + + CDEBUG(D_SUPER, "osfs %llu, max_age %llu\n", + obd->obd_osfs_age, max_age); + if (cfs_time_before_64(obd->obd_osfs_age, max_age)) { + rc = OBP(obd, statfs)(env, exp, osfs, max_age, flags); + if (rc == 0) { + spin_lock(&obd->obd_osfs_lock); + memcpy(&obd->obd_osfs, osfs, sizeof(obd->obd_osfs)); + obd->obd_osfs_age = cfs_time_current_64(); + spin_unlock(&obd->obd_osfs_lock); + } + } else { + CDEBUG(D_SUPER, "%s: use %p cache blocks %llu/%llu objects %llu/%llu\n", + obd->obd_name, &obd->obd_osfs, + obd->obd_osfs.os_bavail, obd->obd_osfs.os_blocks, + obd->obd_osfs.os_ffree, obd->obd_osfs.os_files); + spin_lock(&obd->obd_osfs_lock); + memcpy(osfs, &obd->obd_osfs, sizeof(*osfs)); + spin_unlock(&obd->obd_osfs_lock); + } + return rc; +} + +static inline int obd_preprw(const struct lu_env *env, int cmd, + struct obd_export *exp, struct obdo *oa, + int objcount, struct obd_ioobj *obj, + struct niobuf_remote *remote, int *pages, + struct niobuf_local *local, + struct obd_trans_info *oti, + struct lustre_capa *capa) +{ + int rc; + + EXP_CHECK_DT_OP(exp, preprw); + EXP_COUNTER_INCREMENT(exp, preprw); + + rc = OBP(exp->exp_obd, preprw)(env, cmd, exp, oa, objcount, obj, remote, + pages, local, oti, capa); + return rc; +} + +static inline int obd_commitrw(const struct lu_env *env, int cmd, + struct obd_export *exp, struct obdo *oa, + int objcount, struct obd_ioobj *obj, + struct niobuf_remote *rnb, int pages, + struct niobuf_local *local, + 
struct obd_trans_info *oti, int rc) +{ + EXP_CHECK_DT_OP(exp, commitrw); + EXP_COUNTER_INCREMENT(exp, commitrw); + + rc = OBP(exp->exp_obd, commitrw)(env, cmd, exp, oa, objcount, obj, + rnb, pages, local, oti, rc); + return rc; +} + +static inline int obd_adjust_kms(struct obd_export *exp, + struct lov_stripe_md *lsm, u64 size, + int shrink) +{ + int rc; + + EXP_CHECK_DT_OP(exp, adjust_kms); + EXP_COUNTER_INCREMENT(exp, adjust_kms); + + rc = OBP(exp->exp_obd, adjust_kms)(exp, lsm, size, shrink); + return rc; +} + +static inline int obd_iocontrol(unsigned int cmd, struct obd_export *exp, + int len, void *karg, void *uarg) +{ + int rc; + + EXP_CHECK_DT_OP(exp, iocontrol); + EXP_COUNTER_INCREMENT(exp, iocontrol); + + rc = OBP(exp->exp_obd, iocontrol)(cmd, exp, len, karg, uarg); + return rc; +} + +static inline int obd_find_cbdata(struct obd_export *exp, + struct lov_stripe_md *lsm, + ldlm_iterator_t it, void *data) +{ + int rc; + + EXP_CHECK_DT_OP(exp, find_cbdata); + EXP_COUNTER_INCREMENT(exp, find_cbdata); + + rc = OBP(exp->exp_obd, find_cbdata)(exp, lsm, it, data); + return rc; +} + +static inline void obd_import_event(struct obd_device *obd, + struct obd_import *imp, + enum obd_import_event event) +{ + if (!obd) { + CERROR("NULL device\n"); + return; + } + if (obd->obd_set_up && OBP(obd, import_event)) { + OBD_COUNTER_INCREMENT(obd, import_event); + OBP(obd, import_event)(obd, imp, event); + } +} + +static inline int obd_notify(struct obd_device *obd, + struct obd_device *watched, + enum obd_notify_event ev, + void *data) +{ + int rc; + + rc = obd_check_dev(obd); + if (rc) + return rc; + + /* the check for async_recov is a complete hack - I'm hereby + overloading the meaning to also mean "this was called from + mds_postsetup". I know that my mds is able to handle notifies + by this point, and it needs to get them to execute mds_postrecov. */ + if (!obd->obd_set_up && !obd->obd_async_recov) { + CDEBUG(D_HA, "obd %s not set up\n", obd->obd_name); + return -EINVAL; + } + + if (!OBP(obd, notify)) { + CDEBUG(D_HA, "obd %s has no notify handler\n", obd->obd_name); + return -ENOSYS; + } + + OBD_COUNTER_INCREMENT(obd, notify); + rc = OBP(obd, notify)(obd, watched, ev, data); + return rc; +} + +static inline int obd_notify_observer(struct obd_device *observer, + struct obd_device *observed, + enum obd_notify_event ev, + void *data) +{ + int rc1; + int rc2; + + struct obd_notify_upcall *onu; + + if (observer->obd_observer) + rc1 = obd_notify(observer->obd_observer, observed, ev, data); + else + rc1 = 0; + /* + * Also, call non-obd listener, if any + */ + onu = &observer->obd_upcall; + if (onu->onu_upcall != NULL) + rc2 = onu->onu_upcall(observer, observed, ev, + onu->onu_owner, NULL); + else + rc2 = 0; + + return rc1 ? 
rc1 : rc2; +} + +static inline int obd_quotacheck(struct obd_export *exp, + struct obd_quotactl *oqctl) +{ + int rc; + + EXP_CHECK_DT_OP(exp, quotacheck); + EXP_COUNTER_INCREMENT(exp, quotacheck); + + rc = OBP(exp->exp_obd, quotacheck)(exp->exp_obd, exp, oqctl); + return rc; +} + +static inline int obd_quotactl(struct obd_export *exp, + struct obd_quotactl *oqctl) +{ + int rc; + + EXP_CHECK_DT_OP(exp, quotactl); + EXP_COUNTER_INCREMENT(exp, quotactl); + + rc = OBP(exp->exp_obd, quotactl)(exp->exp_obd, exp, oqctl); + return rc; +} + +static inline int obd_health_check(const struct lu_env *env, + struct obd_device *obd) +{ + /* returns: 0 on healthy + * >0 on unhealthy + reason code/flag + * however the only supported reason == 1 right now + * We'll need to define some better reasons + * or flags in the future. + * <0 on error + */ + int rc; + + /* don't use EXP_CHECK_DT_OP, because NULL method is normal here */ + if (obd == NULL || !OBT(obd)) { + CERROR("cleaned up obd\n"); + return -EOPNOTSUPP; + } + if (!obd->obd_set_up || obd->obd_stopping) + return 0; + if (!OBP(obd, health_check)) + return 0; + + rc = OBP(obd, health_check)(env, obd); + return rc; +} + +static inline int obd_register_observer(struct obd_device *obd, + struct obd_device *observer) +{ + int rc; + + rc = obd_check_dev(obd); + if (rc) + return rc; + down_write(&obd->obd_observer_link_sem); + if (obd->obd_observer && observer) { + up_write(&obd->obd_observer_link_sem); + return -EALREADY; + } + obd->obd_observer = observer; + up_write(&obd->obd_observer_link_sem); + return 0; +} + +#if 0 +static inline int obd_register_page_removal_cb(struct obd_export *exp, + obd_page_removal_cb_t cb, + obd_pin_extent_cb pin_cb) +{ + int rc; + + OBD_CHECK_DT_OP(exp->exp_obd, register_page_removal_cb, 0); + OBD_COUNTER_INCREMENT(exp->exp_obd, register_page_removal_cb); + + rc = OBP(exp->exp_obd, register_page_removal_cb)(exp, cb, pin_cb); + return rc; +} + +static inline int obd_unregister_page_removal_cb(struct obd_export *exp, + obd_page_removal_cb_t cb) +{ + int rc; + + OBD_CHECK_DT_OP(exp->exp_obd, unregister_page_removal_cb, 0); + OBD_COUNTER_INCREMENT(exp->exp_obd, unregister_page_removal_cb); + + rc = OBP(exp->exp_obd, unregister_page_removal_cb)(exp, cb); + return rc; +} + +static inline int obd_register_lock_cancel_cb(struct obd_export *exp, + obd_lock_cancel_cb cb) +{ + int rc; + + OBD_CHECK_DT_OP(exp->exp_obd, register_lock_cancel_cb, 0); + OBD_COUNTER_INCREMENT(exp->exp_obd, register_lock_cancel_cb); + + rc = OBP(exp->exp_obd, register_lock_cancel_cb)(exp, cb); + return rc; +} + +static inline int obd_unregister_lock_cancel_cb(struct obd_export *exp, + obd_lock_cancel_cb cb) +{ + int rc; + + OBD_CHECK_DT_OP(exp->exp_obd, unregister_lock_cancel_cb, 0); + OBD_COUNTER_INCREMENT(exp->exp_obd, unregister_lock_cancel_cb); + + rc = OBP(exp->exp_obd, unregister_lock_cancel_cb)(exp, cb); + return rc; +} +#endif + +/* metadata helpers */ +static inline int md_getstatus(struct obd_export *exp, + struct lu_fid *fid, struct obd_capa **pc) +{ + int rc; + + EXP_CHECK_MD_OP(exp, getstatus); + EXP_MD_COUNTER_INCREMENT(exp, getstatus); + rc = MDP(exp->exp_obd, getstatus)(exp, fid, pc); + return rc; +} + +static inline int md_getattr(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + int rc; + + EXP_CHECK_MD_OP(exp, getattr); + EXP_MD_COUNTER_INCREMENT(exp, getattr); + rc = MDP(exp->exp_obd, getattr)(exp, op_data, request); + return rc; +} + +static inline int md_null_inode(struct obd_export *exp, + const 
struct lu_fid *fid) +{ + int rc; + + EXP_CHECK_MD_OP(exp, null_inode); + EXP_MD_COUNTER_INCREMENT(exp, null_inode); + rc = MDP(exp->exp_obd, null_inode)(exp, fid); + return rc; +} + +static inline int md_find_cbdata(struct obd_export *exp, + const struct lu_fid *fid, + ldlm_iterator_t it, void *data) +{ + int rc; + + EXP_CHECK_MD_OP(exp, find_cbdata); + EXP_MD_COUNTER_INCREMENT(exp, find_cbdata); + rc = MDP(exp->exp_obd, find_cbdata)(exp, fid, it, data); + return rc; +} + +static inline int md_close(struct obd_export *exp, struct md_op_data *op_data, + struct md_open_data *mod, + struct ptlrpc_request **request) +{ + int rc; + + EXP_CHECK_MD_OP(exp, close); + EXP_MD_COUNTER_INCREMENT(exp, close); + rc = MDP(exp->exp_obd, close)(exp, op_data, mod, request); + return rc; +} + +static inline int md_create(struct obd_export *exp, struct md_op_data *op_data, + const void *data, int datalen, int mode, __u32 uid, + __u32 gid, cfs_cap_t cap_effective, __u64 rdev, + struct ptlrpc_request **request) +{ + int rc; + + EXP_CHECK_MD_OP(exp, create); + EXP_MD_COUNTER_INCREMENT(exp, create); + rc = MDP(exp->exp_obd, create)(exp, op_data, data, datalen, mode, + uid, gid, cap_effective, rdev, request); + return rc; +} + +static inline int md_done_writing(struct obd_export *exp, + struct md_op_data *op_data, + struct md_open_data *mod) +{ + int rc; + + EXP_CHECK_MD_OP(exp, done_writing); + EXP_MD_COUNTER_INCREMENT(exp, done_writing); + rc = MDP(exp->exp_obd, done_writing)(exp, op_data, mod); + return rc; +} + +static inline int md_enqueue(struct obd_export *exp, + struct ldlm_enqueue_info *einfo, + struct lookup_intent *it, + struct md_op_data *op_data, + struct lustre_handle *lockh, + void *lmm, int lmmsize, + struct ptlrpc_request **req, + __u64 extra_lock_flags) +{ + int rc; + + EXP_CHECK_MD_OP(exp, enqueue); + EXP_MD_COUNTER_INCREMENT(exp, enqueue); + rc = MDP(exp->exp_obd, enqueue)(exp, einfo, it, op_data, lockh, + lmm, lmmsize, req, extra_lock_flags); + return rc; +} + +static inline int md_getattr_name(struct obd_export *exp, + struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + int rc; + + EXP_CHECK_MD_OP(exp, getattr_name); + EXP_MD_COUNTER_INCREMENT(exp, getattr_name); + rc = MDP(exp->exp_obd, getattr_name)(exp, op_data, request); + return rc; +} + +static inline int md_intent_lock(struct obd_export *exp, + struct md_op_data *op_data, void *lmm, + int lmmsize, struct lookup_intent *it, + int lookup_flags, struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags) +{ + int rc; + + EXP_CHECK_MD_OP(exp, intent_lock); + EXP_MD_COUNTER_INCREMENT(exp, intent_lock); + rc = MDP(exp->exp_obd, intent_lock)(exp, op_data, lmm, lmmsize, + it, lookup_flags, reqp, cb_blocking, + extra_lock_flags); + return rc; +} + +static inline int md_link(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + int rc; + + EXP_CHECK_MD_OP(exp, link); + EXP_MD_COUNTER_INCREMENT(exp, link); + rc = MDP(exp->exp_obd, link)(exp, op_data, request); + return rc; +} + +static inline int md_rename(struct obd_export *exp, struct md_op_data *op_data, + const char *old, int oldlen, const char *new, + int newlen, struct ptlrpc_request **request) +{ + int rc; + + EXP_CHECK_MD_OP(exp, rename); + EXP_MD_COUNTER_INCREMENT(exp, rename); + rc = MDP(exp->exp_obd, rename)(exp, op_data, old, oldlen, new, + newlen, request); + return rc; +} + +static inline int md_is_subdir(struct obd_export *exp, + const struct lu_fid *pfid, + const struct lu_fid *cfid, + struct 
ptlrpc_request **request) +{ + int rc; + + EXP_CHECK_MD_OP(exp, is_subdir); + EXP_MD_COUNTER_INCREMENT(exp, is_subdir); + rc = MDP(exp->exp_obd, is_subdir)(exp, pfid, cfid, request); + return rc; +} + +static inline int md_setattr(struct obd_export *exp, struct md_op_data *op_data, + void *ea, int ealen, void *ea2, int ea2len, + struct ptlrpc_request **request, + struct md_open_data **mod) +{ + int rc; + + EXP_CHECK_MD_OP(exp, setattr); + EXP_MD_COUNTER_INCREMENT(exp, setattr); + rc = MDP(exp->exp_obd, setattr)(exp, op_data, ea, ealen, + ea2, ea2len, request, mod); + return rc; +} + +static inline int md_sync(struct obd_export *exp, const struct lu_fid *fid, + struct obd_capa *oc, struct ptlrpc_request **request) +{ + int rc; + + EXP_CHECK_MD_OP(exp, sync); + EXP_MD_COUNTER_INCREMENT(exp, sync); + rc = MDP(exp->exp_obd, sync)(exp, fid, oc, request); + return rc; +} + +static inline int md_readpage(struct obd_export *exp, struct md_op_data *opdata, + struct page **pages, + struct ptlrpc_request **request) +{ + int rc; + + EXP_CHECK_MD_OP(exp, readpage); + EXP_MD_COUNTER_INCREMENT(exp, readpage); + rc = MDP(exp->exp_obd, readpage)(exp, opdata, pages, request); + return rc; +} + +static inline int md_unlink(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + int rc; + + EXP_CHECK_MD_OP(exp, unlink); + EXP_MD_COUNTER_INCREMENT(exp, unlink); + rc = MDP(exp->exp_obd, unlink)(exp, op_data, request); + return rc; +} + +static inline int md_get_lustre_md(struct obd_export *exp, + struct ptlrpc_request *req, + struct obd_export *dt_exp, + struct obd_export *md_exp, + struct lustre_md *md) +{ + EXP_CHECK_MD_OP(exp, get_lustre_md); + EXP_MD_COUNTER_INCREMENT(exp, get_lustre_md); + return MDP(exp->exp_obd, get_lustre_md)(exp, req, dt_exp, md_exp, md); +} + +static inline int md_free_lustre_md(struct obd_export *exp, + struct lustre_md *md) +{ + EXP_CHECK_MD_OP(exp, free_lustre_md); + EXP_MD_COUNTER_INCREMENT(exp, free_lustre_md); + return MDP(exp->exp_obd, free_lustre_md)(exp, md); +} + +static inline int md_setxattr(struct obd_export *exp, + const struct lu_fid *fid, struct obd_capa *oc, + u64 valid, const char *name, + const char *input, int input_size, + int output_size, int flags, __u32 suppgid, + struct ptlrpc_request **request) +{ + EXP_CHECK_MD_OP(exp, setxattr); + EXP_MD_COUNTER_INCREMENT(exp, setxattr); + return MDP(exp->exp_obd, setxattr)(exp, fid, oc, valid, name, input, + input_size, output_size, flags, + suppgid, request); +} + +static inline int md_getxattr(struct obd_export *exp, + const struct lu_fid *fid, struct obd_capa *oc, + u64 valid, const char *name, + const char *input, int input_size, + int output_size, int flags, + struct ptlrpc_request **request) +{ + EXP_CHECK_MD_OP(exp, getxattr); + EXP_MD_COUNTER_INCREMENT(exp, getxattr); + return MDP(exp->exp_obd, getxattr)(exp, fid, oc, valid, name, input, + input_size, output_size, flags, + request); +} + +static inline int md_set_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och, + struct lookup_intent *it) +{ + EXP_CHECK_MD_OP(exp, set_open_replay_data); + EXP_MD_COUNTER_INCREMENT(exp, set_open_replay_data); + return MDP(exp->exp_obd, set_open_replay_data)(exp, och, it); +} + +static inline int md_clear_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och) +{ + EXP_CHECK_MD_OP(exp, clear_open_replay_data); + EXP_MD_COUNTER_INCREMENT(exp, clear_open_replay_data); + return MDP(exp->exp_obd, clear_open_replay_data)(exp, och); +} + +static inline int 
md_set_lock_data(struct obd_export *exp, + __u64 *lockh, void *data, __u64 *bits) +{ + EXP_CHECK_MD_OP(exp, set_lock_data); + EXP_MD_COUNTER_INCREMENT(exp, set_lock_data); + return MDP(exp->exp_obd, set_lock_data)(exp, lockh, data, bits); +} + +static inline int md_cancel_unused(struct obd_export *exp, + const struct lu_fid *fid, + ldlm_policy_data_t *policy, + ldlm_mode_t mode, + ldlm_cancel_flags_t flags, + void *opaque) +{ + int rc; + + EXP_CHECK_MD_OP(exp, cancel_unused); + EXP_MD_COUNTER_INCREMENT(exp, cancel_unused); + + rc = MDP(exp->exp_obd, cancel_unused)(exp, fid, policy, mode, + flags, opaque); + return rc; +} + +static inline ldlm_mode_t md_lock_match(struct obd_export *exp, __u64 flags, + const struct lu_fid *fid, + ldlm_type_t type, + ldlm_policy_data_t *policy, + ldlm_mode_t mode, + struct lustre_handle *lockh) +{ + EXP_CHECK_MD_OP(exp, lock_match); + EXP_MD_COUNTER_INCREMENT(exp, lock_match); + return MDP(exp->exp_obd, lock_match)(exp, flags, fid, type, + policy, mode, lockh); +} + +static inline int md_init_ea_size(struct obd_export *exp, int easize, + int def_asize, int cookiesize, + int def_cookiesize) +{ + EXP_CHECK_MD_OP(exp, init_ea_size); + EXP_MD_COUNTER_INCREMENT(exp, init_ea_size); + return MDP(exp->exp_obd, init_ea_size)(exp, easize, def_asize, + cookiesize, def_cookiesize); +} + +static inline int md_get_remote_perm(struct obd_export *exp, + const struct lu_fid *fid, + struct obd_capa *oc, __u32 suppgid, + struct ptlrpc_request **request) +{ + EXP_CHECK_MD_OP(exp, get_remote_perm); + EXP_MD_COUNTER_INCREMENT(exp, get_remote_perm); + return MDP(exp->exp_obd, get_remote_perm)(exp, fid, oc, suppgid, + request); +} + +static inline int md_renew_capa(struct obd_export *exp, struct obd_capa *ocapa, + renew_capa_cb_t cb) +{ + int rc; + + EXP_CHECK_MD_OP(exp, renew_capa); + EXP_MD_COUNTER_INCREMENT(exp, renew_capa); + rc = MDP(exp->exp_obd, renew_capa)(exp, ocapa, cb); + return rc; +} + +static inline int md_unpack_capa(struct obd_export *exp, + struct ptlrpc_request *req, + const struct req_msg_field *field, + struct obd_capa **oc) +{ + int rc; + + EXP_CHECK_MD_OP(exp, unpack_capa); + EXP_MD_COUNTER_INCREMENT(exp, unpack_capa); + rc = MDP(exp->exp_obd, unpack_capa)(exp, req, field, oc); + return rc; +} + +static inline int md_intent_getattr_async(struct obd_export *exp, + struct md_enqueue_info *minfo, + struct ldlm_enqueue_info *einfo) +{ + int rc; + + EXP_CHECK_MD_OP(exp, intent_getattr_async); + EXP_MD_COUNTER_INCREMENT(exp, intent_getattr_async); + rc = MDP(exp->exp_obd, intent_getattr_async)(exp, minfo, einfo); + return rc; +} + +static inline int md_revalidate_lock(struct obd_export *exp, + struct lookup_intent *it, + struct lu_fid *fid, __u64 *bits) +{ + int rc; + + EXP_CHECK_MD_OP(exp, revalidate_lock); + EXP_MD_COUNTER_INCREMENT(exp, revalidate_lock); + rc = MDP(exp->exp_obd, revalidate_lock)(exp, it, fid, bits); + return rc; +} + + +/* OBD Metadata Support */ + +extern int obd_init_caches(void); +extern void obd_cleanup_caches(void); + +/* support routines */ +extern struct kmem_cache *obdo_cachep; + +#define OBDO_ALLOC(ptr) \ +do { \ + OBD_SLAB_ALLOC_PTR_GFP((ptr), obdo_cachep, GFP_NOFS); \ +} while (0) + +#define OBDO_FREE(ptr) \ +do { \ + OBD_SLAB_FREE_PTR((ptr), obdo_cachep); \ +} while (0) + + +static inline void obdo2fid(struct obdo *oa, struct lu_fid *fid) +{ + /* something here */ +} + +static inline void fid2obdo(struct lu_fid *fid, struct obdo *oa) +{ + /* something here */ +} + +typedef int (*register_lwp_cb)(void *data); + +struct 
lwp_register_item { + struct obd_export **lri_exp; + register_lwp_cb lri_cb_func; + void *lri_cb_data; + struct list_head lri_list; + char lri_name[MTI_NAME_MAXLEN]; +}; + +/* I'm as embarrassed about this as you are. + * + * // XXX do not look into _superhack with remaining eye + * // XXX if this were any uglier, I'd get my own show on MTV */ +extern int (*ptlrpc_put_connection_superhack)(struct ptlrpc_connection *c); + +/* obd_mount.c */ + +/* sysctl.c */ +extern void obd_sysctl_init (void); +extern void obd_sysctl_clean (void); + +/* uuid.c */ +typedef __u8 class_uuid_t[16]; +void class_uuid_unparse(class_uuid_t in, struct obd_uuid *out); + +/* lustre_peer.c */ +int lustre_uuid_to_peer(const char *uuid, lnet_nid_t *peer_nid, int index); +int class_add_uuid(const char *uuid, __u64 nid); +int class_del_uuid (const char *uuid); +int class_check_uuid(struct obd_uuid *uuid, __u64 nid); +void class_init_uuidlist(void); +void class_exit_uuidlist(void); + +/* class_obd.c */ +extern char obd_jobid_node[]; +extern struct miscdevice obd_psdev; +extern spinlock_t obd_types_lock; + +/* prng.c */ +#define ll_generate_random_uuid(uuid_out) cfs_get_random_bytes(uuid_out, sizeof(class_uuid_t)) + +#endif /* __LINUX_OBD_CLASS_H */ diff --git a/kernel/drivers/staging/lustre/lustre/include/obd_support.h b/kernel/drivers/staging/lustre/lustre/include/obd_support.h new file mode 100644 index 000000000..2991d2ee7 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/include/obd_support.h @@ -0,0 +1,862 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#ifndef _OBD_SUPPORT +#define _OBD_SUPPORT + +#include +#include "../../include/linux/libcfs/libcfs.h" +#include "linux/lustre_compat25.h" +#include "lprocfs_status.h" + +/* global variables */ +extern struct lprocfs_stats *obd_memory; +enum { + OBD_MEMORY_STAT = 0, + OBD_MEMORY_PAGES_STAT = 1, + OBD_STATS_NUM, +}; + +extern unsigned int obd_debug_peer_on_timeout; +extern unsigned int obd_dump_on_timeout; +extern unsigned int obd_dump_on_eviction; +/* obd_timeout should only be used for recovery, not for + networking / disk / timings affected by load (use Adaptive Timeouts) */ +extern unsigned int obd_timeout; /* seconds */ +extern unsigned int ldlm_timeout; /* seconds */ +extern unsigned int obd_timeout_set; +extern unsigned int ldlm_timeout_set; +extern unsigned int at_min; +extern unsigned int at_max; +extern unsigned int at_history; +extern int at_early_margin; +extern int at_extra; +extern unsigned int obd_sync_filter; +extern unsigned int obd_max_dirty_pages; +extern atomic_t obd_dirty_pages; +extern atomic_t obd_dirty_transit_pages; +extern unsigned int obd_alloc_fail_rate; +extern char obd_jobid_var[]; + +/* lvfs.c */ +int obd_alloc_fail(const void *ptr, const char *name, const char *type, + size_t size, const char *file, int line); + +/* Some hash init argument constants */ +#define HASH_POOLS_BKT_BITS 3 +#define HASH_POOLS_CUR_BITS 3 +#define HASH_POOLS_MAX_BITS 7 +#define HASH_UUID_BKT_BITS 5 +#define HASH_UUID_CUR_BITS 7 +#define HASH_UUID_MAX_BITS 12 +#define HASH_NID_BKT_BITS 5 +#define HASH_NID_CUR_BITS 7 +#define HASH_NID_MAX_BITS 12 +#define HASH_NID_STATS_BKT_BITS 5 +#define HASH_NID_STATS_CUR_BITS 7 +#define HASH_NID_STATS_MAX_BITS 12 +#define HASH_LQE_BKT_BITS 5 +#define HASH_LQE_CUR_BITS 7 +#define HASH_LQE_MAX_BITS 12 +#define HASH_CONN_BKT_BITS 5 +#define HASH_CONN_CUR_BITS 5 +#define HASH_CONN_MAX_BITS 15 +#define HASH_EXP_LOCK_BKT_BITS 5 +#define HASH_EXP_LOCK_CUR_BITS 7 +#define HASH_EXP_LOCK_MAX_BITS 16 +#define HASH_CL_ENV_BKT_BITS 5 +#define HASH_CL_ENV_BITS 10 +#define HASH_JOB_STATS_BKT_BITS 5 +#define HASH_JOB_STATS_CUR_BITS 7 +#define HASH_JOB_STATS_MAX_BITS 12 + +/* Timeout definitions */ +#define OBD_TIMEOUT_DEFAULT 100 +#define LDLM_TIMEOUT_DEFAULT 20 +#define MDS_LDLM_TIMEOUT_DEFAULT 6 +/* Time to wait for all clients to reconnect during recovery (hard limit) */ +#define OBD_RECOVERY_TIME_HARD (obd_timeout * 9) +/* Time to wait for all clients to reconnect during recovery (soft limit) */ +/* Should be very conservative; must catch the first reconnect after reboot */ +#define OBD_RECOVERY_TIME_SOFT (obd_timeout * 3) +/* Change recovery-small 26b time if you change this */ +#define PING_INTERVAL max(obd_timeout / 4, 1U) +/* a bit more than maximal journal commit time in seconds */ +#define PING_INTERVAL_SHORT min(PING_INTERVAL, 7U) +/* Client may skip 1 ping; we must wait at least 2.5. But for multiple + * failover targets the client only pings one server at a time, and pings + * can be lost on a loaded network. Since eviction has serious consequences, + * and there's no urgent need to evict a client just because it's idle, we + * should be very conservative here. 
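For a concrete sense of scale, the interval macros here can be worked through with the default obd_timeout of OBD_TIMEOUT_DEFAULT (100 seconds); the figures below are illustrative only and shift whenever obd_timeout is tuned:

/*
 *   obd_timeout            = 100 s  (OBD_TIMEOUT_DEFAULT)
 *   OBD_RECOVERY_TIME_SOFT = 100 * 3        = 300 s
 *   OBD_RECOVERY_TIME_HARD = 100 * 9        = 900 s
 *   PING_INTERVAL          = max(100/4, 1)  = 25 s
 *   PING_INTERVAL_SHORT    = min(25, 7)     = 7 s
 *   PING_EVICT_TIMEOUT     = 25 * 6         = 150 s  (defined just below)
 *
 * i.e. an idle client has to miss roughly six ping intervals before the
 * server will even consider evicting it, which is the conservative policy
 * the comment above argues for.
 */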
*/ +#define PING_EVICT_TIMEOUT (PING_INTERVAL * 6) +#define DISK_TIMEOUT 50 /* Beyond this we warn about disk speed */ +#define CONNECTION_SWITCH_MIN 5U /* Connection switching rate limiter */ + /* Max connect interval for nonresponsive servers; ~50s to avoid building up + connect requests in the LND queues, but within obd_timeout so we don't + miss the recovery window */ +#define CONNECTION_SWITCH_MAX min(50U, max(CONNECTION_SWITCH_MIN, obd_timeout)) +#define CONNECTION_SWITCH_INC 5 /* Connection timeout backoff */ +/* In general this should be low to have quick detection of a system + running on a backup server. (If it's too low, import_select_connection + will increase the timeout anyhow.) */ +#define INITIAL_CONNECT_TIMEOUT max(CONNECTION_SWITCH_MIN, obd_timeout/20) +/* The max delay between connects is SWITCH_MAX + SWITCH_INC + INITIAL */ +#define RECONNECT_DELAY_MAX (CONNECTION_SWITCH_MAX + CONNECTION_SWITCH_INC + \ + INITIAL_CONNECT_TIMEOUT) +/* The min time a target should wait for clients to reconnect in recovery */ +#define OBD_RECOVERY_TIME_MIN (2*RECONNECT_DELAY_MAX) +#define OBD_IR_FACTOR_MIN 1 +#define OBD_IR_FACTOR_MAX 10 +#define OBD_IR_FACTOR_DEFAULT (OBD_IR_FACTOR_MAX/2) +/* default timeout for the MGS to become IR_FULL */ +#define OBD_IR_MGS_TIMEOUT (4*obd_timeout) +#define LONG_UNLINK 300 /* Unlink should happen before now */ + +/** + * Time interval of shrink, if the client is "idle" more than this interval, + * then the ll_grant thread will return the requested grant space to filter + */ +#define GRANT_SHRINK_INTERVAL 1200/*20 minutes*/ + +#define OBD_FAIL_MDS 0x100 +#define OBD_FAIL_MDS_HANDLE_UNPACK 0x101 +#define OBD_FAIL_MDS_GETATTR_NET 0x102 +#define OBD_FAIL_MDS_GETATTR_PACK 0x103 +#define OBD_FAIL_MDS_READPAGE_NET 0x104 +#define OBD_FAIL_MDS_READPAGE_PACK 0x105 +#define OBD_FAIL_MDS_SENDPAGE 0x106 +#define OBD_FAIL_MDS_REINT_NET 0x107 +#define OBD_FAIL_MDS_REINT_UNPACK 0x108 +#define OBD_FAIL_MDS_REINT_SETATTR 0x109 +#define OBD_FAIL_MDS_REINT_SETATTR_WRITE 0x10a +#define OBD_FAIL_MDS_REINT_CREATE 0x10b +#define OBD_FAIL_MDS_REINT_CREATE_WRITE 0x10c +#define OBD_FAIL_MDS_REINT_UNLINK 0x10d +#define OBD_FAIL_MDS_REINT_UNLINK_WRITE 0x10e +#define OBD_FAIL_MDS_REINT_LINK 0x10f +#define OBD_FAIL_MDS_REINT_LINK_WRITE 0x110 +#define OBD_FAIL_MDS_REINT_RENAME 0x111 +#define OBD_FAIL_MDS_REINT_RENAME_WRITE 0x112 +#define OBD_FAIL_MDS_OPEN_NET 0x113 +#define OBD_FAIL_MDS_OPEN_PACK 0x114 +#define OBD_FAIL_MDS_CLOSE_NET 0x115 +#define OBD_FAIL_MDS_CLOSE_PACK 0x116 +#define OBD_FAIL_MDS_CONNECT_NET 0x117 +#define OBD_FAIL_MDS_CONNECT_PACK 0x118 +#define OBD_FAIL_MDS_REINT_NET_REP 0x119 +#define OBD_FAIL_MDS_DISCONNECT_NET 0x11a +#define OBD_FAIL_MDS_GETSTATUS_NET 0x11b +#define OBD_FAIL_MDS_GETSTATUS_PACK 0x11c +#define OBD_FAIL_MDS_STATFS_PACK 0x11d +#define OBD_FAIL_MDS_STATFS_NET 0x11e +#define OBD_FAIL_MDS_GETATTR_NAME_NET 0x11f +#define OBD_FAIL_MDS_PIN_NET 0x120 +#define OBD_FAIL_MDS_UNPIN_NET 0x121 +#define OBD_FAIL_MDS_ALL_REPLY_NET 0x122 +#define OBD_FAIL_MDS_ALL_REQUEST_NET 0x123 +#define OBD_FAIL_MDS_SYNC_NET 0x124 +#define OBD_FAIL_MDS_SYNC_PACK 0x125 +#define OBD_FAIL_MDS_DONE_WRITING_NET 0x126 +#define OBD_FAIL_MDS_DONE_WRITING_PACK 0x127 +#define OBD_FAIL_MDS_ALLOC_OBDO 0x128 +#define OBD_FAIL_MDS_PAUSE_OPEN 0x129 +#define OBD_FAIL_MDS_STATFS_LCW_SLEEP 0x12a +#define OBD_FAIL_MDS_OPEN_CREATE 0x12b +#define OBD_FAIL_MDS_OST_SETATTR 0x12c +#define OBD_FAIL_MDS_QUOTACHECK_NET 0x12d +#define OBD_FAIL_MDS_QUOTACTL_NET 0x12e +#define OBD_FAIL_MDS_CLIENT_ADD 0x12f 
+#define OBD_FAIL_MDS_GETXATTR_NET 0x130 +#define OBD_FAIL_MDS_GETXATTR_PACK 0x131 +#define OBD_FAIL_MDS_SETXATTR_NET 0x132 +#define OBD_FAIL_MDS_SETXATTR 0x133 +#define OBD_FAIL_MDS_SETXATTR_WRITE 0x134 +#define OBD_FAIL_MDS_FS_SETUP 0x135 +#define OBD_FAIL_MDS_RESEND 0x136 +#define OBD_FAIL_MDS_LLOG_CREATE_FAILED 0x137 +#define OBD_FAIL_MDS_LOV_SYNC_RACE 0x138 +#define OBD_FAIL_MDS_OSC_PRECREATE 0x139 +#define OBD_FAIL_MDS_LLOG_SYNC_TIMEOUT 0x13a +#define OBD_FAIL_MDS_CLOSE_NET_REP 0x13b +#define OBD_FAIL_MDS_BLOCK_QUOTA_REQ 0x13c +#define OBD_FAIL_MDS_DROP_QUOTA_REQ 0x13d +#define OBD_FAIL_MDS_REMOVE_COMMON_EA 0x13e +#define OBD_FAIL_MDS_ALLOW_COMMON_EA_SETTING 0x13f +#define OBD_FAIL_MDS_FAIL_LOV_LOG_ADD 0x140 +#define OBD_FAIL_MDS_LOV_PREP_CREATE 0x141 +#define OBD_FAIL_MDS_REINT_DELAY 0x142 +#define OBD_FAIL_MDS_READLINK_EPROTO 0x143 +#define OBD_FAIL_MDS_OPEN_WAIT_CREATE 0x144 +#define OBD_FAIL_MDS_PDO_LOCK 0x145 +#define OBD_FAIL_MDS_PDO_LOCK2 0x146 +#define OBD_FAIL_MDS_OSC_CREATE_FAIL 0x147 +#define OBD_FAIL_MDS_NEGATIVE_POSITIVE 0x148 +#define OBD_FAIL_MDS_HSM_STATE_GET_NET 0x149 +#define OBD_FAIL_MDS_HSM_STATE_SET_NET 0x14a +#define OBD_FAIL_MDS_HSM_PROGRESS_NET 0x14b +#define OBD_FAIL_MDS_HSM_REQUEST_NET 0x14c +#define OBD_FAIL_MDS_HSM_CT_REGISTER_NET 0x14d +#define OBD_FAIL_MDS_HSM_CT_UNREGISTER_NET 0x14e +#define OBD_FAIL_MDS_SWAP_LAYOUTS_NET 0x14f +#define OBD_FAIL_MDS_HSM_ACTION_NET 0x150 +#define OBD_FAIL_MDS_CHANGELOG_INIT 0x151 + +/* layout lock */ +#define OBD_FAIL_MDS_NO_LL_GETATTR 0x170 +#define OBD_FAIL_MDS_NO_LL_OPEN 0x171 +#define OBD_FAIL_MDS_LL_BLOCK 0x172 + +/* CMD */ +#define OBD_FAIL_MDS_IS_SUBDIR_NET 0x180 +#define OBD_FAIL_MDS_IS_SUBDIR_PACK 0x181 +#define OBD_FAIL_MDS_SET_INFO_NET 0x182 +#define OBD_FAIL_MDS_WRITEPAGE_NET 0x183 +#define OBD_FAIL_MDS_WRITEPAGE_PACK 0x184 +#define OBD_FAIL_MDS_RECOVERY_ACCEPTS_GAPS 0x185 +#define OBD_FAIL_MDS_GET_INFO_NET 0x186 +#define OBD_FAIL_MDS_DQACQ_NET 0x187 + +/* OI scrub */ +#define OBD_FAIL_OSD_SCRUB_DELAY 0x190 +#define OBD_FAIL_OSD_SCRUB_CRASH 0x191 +#define OBD_FAIL_OSD_SCRUB_FATAL 0x192 +#define OBD_FAIL_OSD_FID_MAPPING 0x193 +#define OBD_FAIL_OSD_LMA_INCOMPAT 0x194 +#define OBD_FAIL_OSD_COMPAT_INVALID_ENTRY 0x195 + +#define OBD_FAIL_OST 0x200 +#define OBD_FAIL_OST_CONNECT_NET 0x201 +#define OBD_FAIL_OST_DISCONNECT_NET 0x202 +#define OBD_FAIL_OST_GET_INFO_NET 0x203 +#define OBD_FAIL_OST_CREATE_NET 0x204 +#define OBD_FAIL_OST_DESTROY_NET 0x205 +#define OBD_FAIL_OST_GETATTR_NET 0x206 +#define OBD_FAIL_OST_SETATTR_NET 0x207 +#define OBD_FAIL_OST_OPEN_NET 0x208 +#define OBD_FAIL_OST_CLOSE_NET 0x209 +#define OBD_FAIL_OST_BRW_NET 0x20a +#define OBD_FAIL_OST_PUNCH_NET 0x20b +#define OBD_FAIL_OST_STATFS_NET 0x20c +#define OBD_FAIL_OST_HANDLE_UNPACK 0x20d +#define OBD_FAIL_OST_BRW_WRITE_BULK 0x20e +#define OBD_FAIL_OST_BRW_READ_BULK 0x20f +#define OBD_FAIL_OST_SYNC_NET 0x210 +#define OBD_FAIL_OST_ALL_REPLY_NET 0x211 +#define OBD_FAIL_OST_ALL_REQUEST_NET 0x212 +#define OBD_FAIL_OST_LDLM_REPLY_NET 0x213 +#define OBD_FAIL_OST_BRW_PAUSE_BULK 0x214 +#define OBD_FAIL_OST_ENOSPC 0x215 +#define OBD_FAIL_OST_EROFS 0x216 +#define OBD_FAIL_OST_ENOENT 0x217 +#define OBD_FAIL_OST_QUOTACHECK_NET 0x218 +#define OBD_FAIL_OST_QUOTACTL_NET 0x219 +#define OBD_FAIL_OST_CHECKSUM_RECEIVE 0x21a +#define OBD_FAIL_OST_CHECKSUM_SEND 0x21b +#define OBD_FAIL_OST_BRW_SIZE 0x21c +#define OBD_FAIL_OST_DROP_REQ 0x21d +#define OBD_FAIL_OST_SETATTR_CREDITS 0x21e +#define OBD_FAIL_OST_HOLD_WRITE_RPC 0x21f +#define OBD_FAIL_OST_BRW_WRITE_BULK2 0x220 
+#define OBD_FAIL_OST_LLOG_RECOVERY_TIMEOUT 0x221 +#define OBD_FAIL_OST_CANCEL_COOKIE_TIMEOUT 0x222 +#define OBD_FAIL_OST_PAUSE_CREATE 0x223 +#define OBD_FAIL_OST_BRW_PAUSE_PACK 0x224 +#define OBD_FAIL_OST_CONNECT_NET2 0x225 +#define OBD_FAIL_OST_NOMEM 0x226 +#define OBD_FAIL_OST_BRW_PAUSE_BULK2 0x227 +#define OBD_FAIL_OST_MAPBLK_ENOSPC 0x228 +#define OBD_FAIL_OST_ENOINO 0x229 +#define OBD_FAIL_OST_DQACQ_NET 0x230 +#define OBD_FAIL_OST_STATFS_EINPROGRESS 0x231 + +#define OBD_FAIL_LDLM 0x300 +#define OBD_FAIL_LDLM_NAMESPACE_NEW 0x301 +#define OBD_FAIL_LDLM_ENQUEUE_NET 0x302 +#define OBD_FAIL_LDLM_CONVERT_NET 0x303 +#define OBD_FAIL_LDLM_CANCEL_NET 0x304 +#define OBD_FAIL_LDLM_BL_CALLBACK_NET 0x305 +#define OBD_FAIL_LDLM_CP_CALLBACK_NET 0x306 +#define OBD_FAIL_LDLM_GL_CALLBACK_NET 0x307 +#define OBD_FAIL_LDLM_ENQUEUE_EXTENT_ERR 0x308 +#define OBD_FAIL_LDLM_ENQUEUE_INTENT_ERR 0x309 +#define OBD_FAIL_LDLM_CREATE_RESOURCE 0x30a +#define OBD_FAIL_LDLM_ENQUEUE_BLOCKED 0x30b +#define OBD_FAIL_LDLM_REPLY 0x30c +#define OBD_FAIL_LDLM_RECOV_CLIENTS 0x30d +#define OBD_FAIL_LDLM_ENQUEUE_OLD_EXPORT 0x30e +#define OBD_FAIL_LDLM_GLIMPSE 0x30f +#define OBD_FAIL_LDLM_CANCEL_RACE 0x310 +#define OBD_FAIL_LDLM_CANCEL_EVICT_RACE 0x311 +#define OBD_FAIL_LDLM_PAUSE_CANCEL 0x312 +#define OBD_FAIL_LDLM_CLOSE_THREAD 0x313 +#define OBD_FAIL_LDLM_CANCEL_BL_CB_RACE 0x314 +#define OBD_FAIL_LDLM_CP_CB_WAIT 0x315 +#define OBD_FAIL_LDLM_OST_FAIL_RACE 0x316 +#define OBD_FAIL_LDLM_INTR_CP_AST 0x317 +#define OBD_FAIL_LDLM_CP_BL_RACE 0x318 +#define OBD_FAIL_LDLM_NEW_LOCK 0x319 +#define OBD_FAIL_LDLM_AGL_DELAY 0x31a +#define OBD_FAIL_LDLM_AGL_NOLOCK 0x31b +#define OBD_FAIL_LDLM_OST_LVB 0x31c + +/* LOCKLESS IO */ +#define OBD_FAIL_LDLM_SET_CONTENTION 0x385 + +#define OBD_FAIL_OSC 0x400 +#define OBD_FAIL_OSC_BRW_READ_BULK 0x401 +#define OBD_FAIL_OSC_BRW_WRITE_BULK 0x402 +#define OBD_FAIL_OSC_LOCK_BL_AST 0x403 +#define OBD_FAIL_OSC_LOCK_CP_AST 0x404 +#define OBD_FAIL_OSC_MATCH 0x405 +#define OBD_FAIL_OSC_BRW_PREP_REQ 0x406 +#define OBD_FAIL_OSC_SHUTDOWN 0x407 +#define OBD_FAIL_OSC_CHECKSUM_RECEIVE 0x408 +#define OBD_FAIL_OSC_CHECKSUM_SEND 0x409 +#define OBD_FAIL_OSC_BRW_PREP_REQ2 0x40a +#define OBD_FAIL_OSC_CONNECT_CKSUM 0x40b +#define OBD_FAIL_OSC_CKSUM_ADLER_ONLY 0x40c +#define OBD_FAIL_OSC_DIO_PAUSE 0x40d +#define OBD_FAIL_OSC_OBJECT_CONTENTION 0x40e +#define OBD_FAIL_OSC_CP_CANCEL_RACE 0x40f +#define OBD_FAIL_OSC_CP_ENQ_RACE 0x410 +#define OBD_FAIL_OSC_NO_GRANT 0x411 +#define OBD_FAIL_OSC_DELAY_SETTIME 0x412 + +#define OBD_FAIL_PTLRPC 0x500 +#define OBD_FAIL_PTLRPC_ACK 0x501 +#define OBD_FAIL_PTLRPC_RQBD 0x502 +#define OBD_FAIL_PTLRPC_BULK_GET_NET 0x503 +#define OBD_FAIL_PTLRPC_BULK_PUT_NET 0x504 +#define OBD_FAIL_PTLRPC_DROP_RPC 0x505 +#define OBD_FAIL_PTLRPC_DELAY_SEND 0x506 +#define OBD_FAIL_PTLRPC_DELAY_RECOV 0x507 +#define OBD_FAIL_PTLRPC_CLIENT_BULK_CB 0x508 +#define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a +#define OBD_FAIL_PTLRPC_PAUSE_REP 0x50c +#define OBD_FAIL_PTLRPC_IMP_DEACTIVE 0x50d +#define OBD_FAIL_PTLRPC_DUMP_LOG 0x50e +#define OBD_FAIL_PTLRPC_LONG_REPL_UNLINK 0x50f +#define OBD_FAIL_PTLRPC_LONG_BULK_UNLINK 0x510 +#define OBD_FAIL_PTLRPC_HPREQ_TIMEOUT 0x511 +#define OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT 0x512 +#define OBD_FAIL_PTLRPC_DROP_REQ_OPC 0x513 +#define OBD_FAIL_PTLRPC_FINISH_REPLAY 0x514 +#define OBD_FAIL_PTLRPC_CLIENT_BULK_CB2 0x515 +#define OBD_FAIL_PTLRPC_DELAY_IMP_FULL 0x516 +#define OBD_FAIL_PTLRPC_CANCEL_RESEND 0x517 + +#define OBD_FAIL_OBD_PING_NET 0x600 +#define OBD_FAIL_OBD_LOG_CANCEL_NET 0x601 +#define 
OBD_FAIL_OBD_LOGD_NET 0x602 +#define OBD_FAIL_OBD_QC_CALLBACK_NET 0x603 +#define OBD_FAIL_OBD_DQACQ 0x604 +#define OBD_FAIL_OBD_LLOG_SETUP 0x605 +#define OBD_FAIL_OBD_LOG_CANCEL_REP 0x606 +#define OBD_FAIL_OBD_IDX_READ_NET 0x607 +#define OBD_FAIL_OBD_IDX_READ_BREAK 0x608 +#define OBD_FAIL_OBD_NO_LRU 0x609 + +#define OBD_FAIL_TGT_REPLY_NET 0x700 +#define OBD_FAIL_TGT_CONN_RACE 0x701 +#define OBD_FAIL_TGT_FORCE_RECONNECT 0x702 +#define OBD_FAIL_TGT_DELAY_CONNECT 0x703 +#define OBD_FAIL_TGT_DELAY_RECONNECT 0x704 +#define OBD_FAIL_TGT_DELAY_PRECREATE 0x705 +#define OBD_FAIL_TGT_TOOMANY_THREADS 0x706 +#define OBD_FAIL_TGT_REPLAY_DROP 0x707 +#define OBD_FAIL_TGT_FAKE_EXP 0x708 +#define OBD_FAIL_TGT_REPLAY_DELAY 0x709 +#define OBD_FAIL_TGT_LAST_REPLAY 0x710 +#define OBD_FAIL_TGT_CLIENT_ADD 0x711 +#define OBD_FAIL_TGT_RCVG_FLAG 0x712 +#define OBD_FAIL_TGT_DELAY_CONDITIONAL 0x713 + +#define OBD_FAIL_MDC_REVALIDATE_PAUSE 0x800 +#define OBD_FAIL_MDC_ENQUEUE_PAUSE 0x801 +#define OBD_FAIL_MDC_OLD_EXT_FLAGS 0x802 +#define OBD_FAIL_MDC_GETATTR_ENQUEUE 0x803 +#define OBD_FAIL_MDC_RPCS_SEM 0x804 +#define OBD_FAIL_MDC_LIGHTWEIGHT 0x805 + +#define OBD_FAIL_MGS 0x900 +#define OBD_FAIL_MGS_ALL_REQUEST_NET 0x901 +#define OBD_FAIL_MGS_ALL_REPLY_NET 0x902 +#define OBD_FAIL_MGC_PAUSE_PROCESS_LOG 0x903 +#define OBD_FAIL_MGS_PAUSE_REQ 0x904 +#define OBD_FAIL_MGS_PAUSE_TARGET_REG 0x905 +#define OBD_FAIL_MGS_CONNECT_NET 0x906 +#define OBD_FAIL_MGS_DISCONNECT_NET 0x907 +#define OBD_FAIL_MGS_SET_INFO_NET 0x908 +#define OBD_FAIL_MGS_EXCEPTION_NET 0x909 +#define OBD_FAIL_MGS_TARGET_REG_NET 0x90a +#define OBD_FAIL_MGS_TARGET_DEL_NET 0x90b +#define OBD_FAIL_MGS_CONFIG_READ_NET 0x90c + +#define OBD_FAIL_QUOTA_DQACQ_NET 0xA01 +#define OBD_FAIL_QUOTA_EDQUOT 0xA02 +#define OBD_FAIL_QUOTA_DELAY_REINT 0xA03 +#define OBD_FAIL_QUOTA_RECOVERABLE_ERR 0xA04 + +#define OBD_FAIL_LPROC_REMOVE 0xB00 + +#define OBD_FAIL_GENERAL_ALLOC 0xC00 + +#define OBD_FAIL_SEQ 0x1000 +#define OBD_FAIL_SEQ_QUERY_NET 0x1001 +#define OBD_FAIL_SEQ_EXHAUST 0x1002 + +#define OBD_FAIL_FLD 0x1100 +#define OBD_FAIL_FLD_QUERY_NET 0x1101 + +#define OBD_FAIL_SEC_CTX 0x1200 +#define OBD_FAIL_SEC_CTX_INIT_NET 0x1201 +#define OBD_FAIL_SEC_CTX_INIT_CONT_NET 0x1202 +#define OBD_FAIL_SEC_CTX_FINI_NET 0x1203 +#define OBD_FAIL_SEC_CTX_HDL_PAUSE 0x1204 + +#define OBD_FAIL_LLOG 0x1300 +#define OBD_FAIL_LLOG_ORIGIN_CONNECT_NET 0x1301 +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_CREATE_NET 0x1302 +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_DESTROY_NET 0x1303 +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_READ_HEADER_NET 0x1304 +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_NEXT_BLOCK_NET 0x1305 +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_PREV_BLOCK_NET 0x1306 +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_WRITE_REC_NET 0x1307 +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_CLOSE_NET 0x1308 +#define OBD_FAIL_LLOG_CATINFO_NET 0x1309 +#define OBD_FAIL_MDS_SYNC_CAPA_SL 0x1310 +#define OBD_FAIL_SEQ_ALLOC 0x1311 + +#define OBD_FAIL_LLITE 0x1400 +#define OBD_FAIL_LLITE_FAULT_TRUNC_RACE 0x1401 +#define OBD_FAIL_LOCK_STATE_WAIT_INTR 0x1402 +#define OBD_FAIL_LOV_INIT 0x1403 +#define OBD_FAIL_GLIMPSE_DELAY 0x1404 +#define OBD_FAIL_LLITE_XATTR_ENOMEM 0x1405 + +#define OBD_FAIL_FID_INDIR 0x1501 +#define OBD_FAIL_FID_INLMA 0x1502 +#define OBD_FAIL_FID_IGIF 0x1504 +#define OBD_FAIL_FID_LOOKUP 0x1505 +#define OBD_FAIL_FID_NOLMA 0x1506 + +/* LFSCK */ +#define OBD_FAIL_LFSCK_DELAY1 0x1600 +#define OBD_FAIL_LFSCK_DELAY2 0x1601 +#define OBD_FAIL_LFSCK_DELAY3 0x1602 +#define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603 +#define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604 
+#define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605 +#define OBD_FAIL_LFSCK_FATAL1 0x1608 +#define OBD_FAIL_LFSCK_FATAL2 0x1609 +#define OBD_FAIL_LFSCK_CRASH 0x160a +#define OBD_FAIL_LFSCK_NO_AUTO 0x160b +#define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c + +/* UPDATE */ +#define OBD_FAIL_UPDATE_OBJ_NET 0x1700 +#define OBD_FAIL_UPDATE_OBJ_NET_REP 0x1701 + + +/* Assign references to moved code to reduce code changes */ +#define OBD_FAIL_PRECHECK(id) CFS_FAIL_PRECHECK(id) +#define OBD_FAIL_CHECK(id) CFS_FAIL_CHECK(id) +#define OBD_FAIL_CHECK_VALUE(id, value) CFS_FAIL_CHECK_VALUE(id, value) +#define OBD_FAIL_CHECK_ORSET(id, value) CFS_FAIL_CHECK_ORSET(id, value) +#define OBD_FAIL_CHECK_RESET(id, value) CFS_FAIL_CHECK_RESET(id, value) +#define OBD_FAIL_RETURN(id, ret) CFS_FAIL_RETURN(id, ret) +#define OBD_FAIL_TIMEOUT(id, secs) CFS_FAIL_TIMEOUT(id, secs) +#define OBD_FAIL_TIMEOUT_MS(id, ms) CFS_FAIL_TIMEOUT_MS(id, ms) +#define OBD_FAIL_TIMEOUT_ORSET(id, value, secs) CFS_FAIL_TIMEOUT_ORSET(id, value, secs) +#define OBD_RACE(id) CFS_RACE(id) +#define OBD_FAIL_ONCE CFS_FAIL_ONCE +#define OBD_FAILED CFS_FAILED + +extern atomic_t libcfs_kmemory; + +extern void obd_update_maxusage(void); + +#if defined (CONFIG_PROC_FS) +#define obd_memory_add(size) \ + lprocfs_counter_add(obd_memory, OBD_MEMORY_STAT, (long)(size)) +#define obd_memory_sub(size) \ + lprocfs_counter_sub(obd_memory, OBD_MEMORY_STAT, (long)(size)) +#define obd_memory_sum() \ + lprocfs_stats_collector(obd_memory, OBD_MEMORY_STAT, \ + LPROCFS_FIELDS_FLAGS_SUM) +#define obd_pages_add(order) \ + lprocfs_counter_add(obd_memory, OBD_MEMORY_PAGES_STAT, \ + (long)(1 << (order))) +#define obd_pages_sub(order) \ + lprocfs_counter_sub(obd_memory, OBD_MEMORY_PAGES_STAT, \ + (long)(1 << (order))) +#define obd_pages_sum() \ + lprocfs_stats_collector(obd_memory, OBD_MEMORY_PAGES_STAT, \ + LPROCFS_FIELDS_FLAGS_SUM) + +extern __u64 obd_memory_max(void); +extern __u64 obd_pages_max(void); + +#else + +extern __u64 obd_alloc; +extern __u64 obd_pages; + +extern __u64 obd_max_alloc; +extern __u64 obd_max_pages; + +static inline void obd_memory_add(long size) +{ + obd_alloc += size; + if (obd_alloc > obd_max_alloc) + obd_max_alloc = obd_alloc; +} + +static inline void obd_memory_sub(long size) +{ + obd_alloc -= size; +} + +static inline void obd_pages_add(int order) +{ + obd_pages += 1<< order; + if (obd_pages > obd_max_pages) + obd_max_pages = obd_pages; +} + +static inline void obd_pages_sub(int order) +{ + obd_pages -= 1<< order; +} + +#define obd_memory_sum() (obd_alloc) +#define obd_pages_sum() (obd_pages) + +#define obd_memory_max() (obd_max_alloc) +#define obd_pages_max() (obd_max_pages) + +#endif + +#define OBD_DEBUG_MEMUSAGE (1) + +#if OBD_DEBUG_MEMUSAGE +#define OBD_ALLOC_POST(ptr, size, name) \ + obd_memory_add(size); \ + CDEBUG(D_MALLOC, name " '" #ptr "': %d at %p.\n", \ + (int)(size), ptr) + +#define OBD_FREE_PRE(ptr, size, name) \ + LASSERT(ptr); \ + obd_memory_sub(size); \ + CDEBUG(D_MALLOC, name " '" #ptr "': %d at %p.\n", \ + (int)(size), ptr); \ + POISON(ptr, 0x5a, size) + +#else /* !OBD_DEBUG_MEMUSAGE */ + +#define OBD_ALLOC_POST(ptr, size, name) ((void)0) +#define OBD_FREE_PRE(ptr, size, name) ((void)0) + +#endif /* !OBD_DEBUG_MEMUSAGE */ + +#define HAS_FAIL_ALLOC_FLAG OBD_FAIL_CHECK(OBD_FAIL_GENERAL_ALLOC) + +#define OBD_ALLOC_FAIL_BITS 24 +#define OBD_ALLOC_FAIL_MASK ((1 << OBD_ALLOC_FAIL_BITS) - 1) +#define OBD_ALLOC_FAIL_MULT (OBD_ALLOC_FAIL_MASK / 100) + +#if defined(LUSTRE_UTILS) /* this version is for utils only */ +#define 
__OBD_MALLOC_VERBOSE(ptr, cptab, cpt, size, flags) \ +do { \ + (ptr) = (cptab) == NULL ? \ + kmalloc(size, flags) : \ + kmalloc_node(size, flags, cfs_cpt_spread_node(cptab, cpt)); \ + if (unlikely((ptr) == NULL)) { \ + CERROR("kmalloc of '" #ptr "' (%d bytes) failed at %s:%d\n", \ + (int)(size), __FILE__, __LINE__); \ + } else { \ + memset(ptr, 0, size); \ + CDEBUG(D_MALLOC, "kmalloced '" #ptr "': %d at %p\n", \ + (int)(size), ptr); \ + } \ +} while (0) + +#else /* this version is for the kernel and liblustre */ +#define OBD_FREE_RTN0(ptr) \ +({ \ + kfree(ptr); \ + (ptr) = NULL; \ + 0; \ +}) + +#define __OBD_MALLOC_VERBOSE(ptr, cptab, cpt, size, flags) \ +do { \ + (ptr) = (cptab) == NULL ? \ + kmalloc(size, flags | __GFP_ZERO) : \ + kmalloc_node(size, flags | __GFP_ZERO, \ + cfs_cpt_spread_node(cptab, cpt)); \ + if (likely((ptr) != NULL && \ + (!HAS_FAIL_ALLOC_FLAG || obd_alloc_fail_rate == 0 || \ + !obd_alloc_fail(ptr, #ptr, "km", size, \ + __FILE__, __LINE__) || \ + OBD_FREE_RTN0(ptr)))){ \ + OBD_ALLOC_POST(ptr, size, "kmalloced"); \ + } \ +} while (0) +#endif + +#define OBD_ALLOC_GFP(ptr, size, gfp_mask) \ + __OBD_MALLOC_VERBOSE(ptr, NULL, 0, size, gfp_mask) + +#define OBD_ALLOC(ptr, size) OBD_ALLOC_GFP(ptr, size, GFP_NOFS) +#define OBD_ALLOC_WAIT(ptr, size) OBD_ALLOC_GFP(ptr, size, GFP_KERNEL) +#define OBD_ALLOC_PTR(ptr) OBD_ALLOC(ptr, sizeof(*(ptr))) +#define OBD_ALLOC_PTR_WAIT(ptr) OBD_ALLOC_WAIT(ptr, sizeof(*(ptr))) + +#define OBD_CPT_ALLOC_GFP(ptr, cptab, cpt, size, gfp_mask) \ + __OBD_MALLOC_VERBOSE(ptr, cptab, cpt, size, gfp_mask) + +#define OBD_CPT_ALLOC(ptr, cptab, cpt, size) \ + OBD_CPT_ALLOC_GFP(ptr, cptab, cpt, size, GFP_NOFS) + +#define OBD_CPT_ALLOC_PTR(ptr, cptab, cpt) \ + OBD_CPT_ALLOC(ptr, cptab, cpt, sizeof(*(ptr))) + +# define __OBD_VMALLOC_VEROBSE(ptr, cptab, cpt, size) \ +do { \ + (ptr) = cptab == NULL ? \ + vzalloc(size) : \ + vzalloc_node(size, cfs_cpt_spread_node(cptab, cpt)); \ + if (unlikely((ptr) == NULL)) { \ + CERROR("vmalloc of '" #ptr "' (%d bytes) failed\n", \ + (int)(size)); \ + CERROR("%llu total bytes allocated by Lustre, %d by LNET\n", \ + obd_memory_sum(), atomic_read(&libcfs_kmemory)); \ + } else { \ + OBD_ALLOC_POST(ptr, size, "vmalloced"); \ + } \ +} while (0) + +# define OBD_VMALLOC(ptr, size) \ + __OBD_VMALLOC_VEROBSE(ptr, NULL, 0, size) +# define OBD_CPT_VMALLOC(ptr, cptab, cpt, size) \ + __OBD_VMALLOC_VEROBSE(ptr, cptab, cpt, size) + + +/* Allocations above this size are considered too big and could not be done + * atomically. + * + * Be very careful when changing this value, especially when decreasing it, + * since vmalloc in Linux doesn't perform well on multi-cores system, calling + * vmalloc in critical path would hurt performance badly. See LU-66. 
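To put a number on that threshold: with 4 KB pages, OBD_ALLOC_BIG (defined just below) works out to 16 KB, and OBD_ALLOC_LARGE()/OBD_FREE_LARGE() pick kmalloc- or vmalloc-backed allocation on either side of it. A minimal sketch of a hypothetical caller (the function and variable names are made up for illustration):

static int example_alloc_fid_array(size_t count)
{
	struct lu_fid *fids;

	/* Routed to vzalloc() once the total size exceeds OBD_ALLOC_BIG. */
	OBD_ALLOC_LARGE(fids, count * sizeof(*fids));
	if (fids == NULL)
		return -ENOMEM;

	/* ... fill and use fids ... */

	OBD_FREE_LARGE(fids, count * sizeof(*fids));
	return 0;
}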
+ */ +#define OBD_ALLOC_BIG (4 * PAGE_CACHE_SIZE) + +#define OBD_ALLOC_LARGE(ptr, size) \ +do { \ + if (size > OBD_ALLOC_BIG) \ + OBD_VMALLOC(ptr, size); \ + else \ + OBD_ALLOC(ptr, size); \ +} while (0) + +#define OBD_CPT_ALLOC_LARGE(ptr, cptab, cpt, size) \ +do { \ + if (size > OBD_ALLOC_BIG) \ + OBD_CPT_VMALLOC(ptr, cptab, cpt, size); \ + else \ + OBD_CPT_ALLOC(ptr, cptab, cpt, size); \ +} while (0) + +#define OBD_FREE_LARGE(ptr, size) \ +do { \ + if (size > OBD_ALLOC_BIG) \ + OBD_VFREE(ptr, size); \ + else \ + OBD_FREE(ptr, size); \ +} while (0) + + +#ifdef CONFIG_DEBUG_SLAB +#define POISON(ptr, c, s) do {} while (0) +#define POISON_PTR(ptr) ((void)0) +#else +#define POISON(ptr, c, s) memset(ptr, c, s) +#define POISON_PTR(ptr) (ptr) = (void *)0xdeadbeef +#endif + +#ifdef POISON_BULK +#define POISON_PAGE(page, val) do { memset(kmap(page), val, PAGE_CACHE_SIZE); \ + kunmap(page); } while (0) +#else +#define POISON_PAGE(page, val) do { } while (0) +#endif + +#define OBD_FREE(ptr, size) \ +do { \ + OBD_FREE_PRE(ptr, size, "kfreed"); \ + kfree(ptr); \ + POISON_PTR(ptr); \ +} while (0) + + +#define OBD_FREE_RCU(ptr, size, handle) \ +do { \ + struct portals_handle *__h = (handle); \ + \ + LASSERT(handle != NULL); \ + __h->h_cookie = (unsigned long)(ptr); \ + __h->h_size = (size); \ + call_rcu(&__h->h_rcu, class_handle_free_cb); \ + POISON_PTR(ptr); \ +} while (0) + + +#define OBD_VFREE(ptr, size) \ + do { \ + OBD_FREE_PRE(ptr, size, "vfreed"); \ + vfree(ptr); \ + POISON_PTR(ptr); \ + } while (0) + +/* we memset() the slab object to 0 when allocation succeeds, so DO NOT + * HAVE A CTOR THAT DOES ANYTHING. its work will be cleared here. we'd + * love to assert on that, but slab.c keeps kmem_cache_s all to itself. */ +#define OBD_SLAB_FREE_RTN0(ptr, slab) \ +({ \ + kmem_cache_free((slab), (ptr)); \ + (ptr) = NULL; \ + 0; \ +}) + +#define __OBD_SLAB_ALLOC_VERBOSE(ptr, slab, cptab, cpt, size, type) \ +do { \ + LASSERT(ergo((type) != GFP_ATOMIC, !in_interrupt())); \ + (ptr) = (cptab) == NULL ? 
\ + kmem_cache_alloc(slab, type | __GFP_ZERO) : \ + kmem_cache_alloc_node(slab, type | __GFP_ZERO, \ + cfs_cpt_spread_node(cptab, cpt)); \ + if (likely((ptr) != NULL && \ + (!HAS_FAIL_ALLOC_FLAG || obd_alloc_fail_rate == 0 || \ + !obd_alloc_fail(ptr, #ptr, "slab-", size, \ + __FILE__, __LINE__) || \ + OBD_SLAB_FREE_RTN0(ptr, slab)))) { \ + OBD_ALLOC_POST(ptr, size, "slab-alloced"); \ + } \ +} while (0) + +#define OBD_SLAB_ALLOC_GFP(ptr, slab, size, flags) \ + __OBD_SLAB_ALLOC_VERBOSE(ptr, slab, NULL, 0, size, flags) +#define OBD_SLAB_CPT_ALLOC_GFP(ptr, slab, cptab, cpt, size, flags) \ + __OBD_SLAB_ALLOC_VERBOSE(ptr, slab, cptab, cpt, size, flags) + +#define OBD_FREE_PTR(ptr) OBD_FREE(ptr, sizeof(*(ptr))) + +#define OBD_SLAB_FREE(ptr, slab, size) \ +do { \ + OBD_FREE_PRE(ptr, size, "slab-freed"); \ + kmem_cache_free(slab, ptr); \ + POISON_PTR(ptr); \ +} while (0) + +#define OBD_SLAB_ALLOC(ptr, slab, size) \ + OBD_SLAB_ALLOC_GFP(ptr, slab, size, GFP_NOFS) + +#define OBD_SLAB_CPT_ALLOC(ptr, slab, cptab, cpt, size) \ + OBD_SLAB_CPT_ALLOC_GFP(ptr, slab, cptab, cpt, size, GFP_NOFS) + +#define OBD_SLAB_ALLOC_PTR(ptr, slab) \ + OBD_SLAB_ALLOC(ptr, slab, sizeof(*(ptr))) + +#define OBD_SLAB_CPT_ALLOC_PTR(ptr, slab, cptab, cpt) \ + OBD_SLAB_CPT_ALLOC(ptr, slab, cptab, cpt, sizeof(*(ptr))) + +#define OBD_SLAB_ALLOC_PTR_GFP(ptr, slab, flags) \ + OBD_SLAB_ALLOC_GFP(ptr, slab, sizeof(*(ptr)), flags) + +#define OBD_SLAB_CPT_ALLOC_PTR_GFP(ptr, slab, cptab, cpt, flags) \ + OBD_SLAB_CPT_ALLOC_GFP(ptr, slab, cptab, cpt, sizeof(*(ptr)), flags) + +#define OBD_SLAB_FREE_PTR(ptr, slab) \ + OBD_SLAB_FREE((ptr), (slab), sizeof(*(ptr))) + +#define KEY_IS(str) \ + (keylen >= (sizeof(str)-1) && memcmp(key, str, (sizeof(str)-1)) == 0) + +/* Wrapper for contiguous page frame allocation */ +#define __OBD_PAGE_ALLOC_VERBOSE(ptr, cptab, cpt, gfp_mask) \ +do { \ + (ptr) = (cptab) == NULL ? \ + alloc_page(gfp_mask) : \ + alloc_pages_node(cfs_cpt_spread_node(cptab, cpt), gfp_mask, 0);\ + if (unlikely((ptr) == NULL)) { \ + CERROR("alloc_pages of '" #ptr "' %d page(s) / %llu bytes "\ + "failed\n", (int)1, \ + (__u64)(1 << PAGE_CACHE_SHIFT)); \ + CERROR("%llu total bytes and %llu total pages " \ + "(%llu bytes) allocated by Lustre, " \ + "%d total bytes by LNET\n", \ + obd_memory_sum(), \ + obd_pages_sum() << PAGE_CACHE_SHIFT, \ + obd_pages_sum(), \ + atomic_read(&libcfs_kmemory)); \ + } else { \ + obd_pages_add(0); \ + CDEBUG(D_MALLOC, "alloc_pages '" #ptr "': %d page(s) / " \ + "%llu bytes at %p.\n", \ + (int)1, \ + (__u64)(1 << PAGE_CACHE_SHIFT), ptr); \ + } \ +} while (0) + +#define OBD_PAGE_ALLOC(ptr, gfp_mask) \ + __OBD_PAGE_ALLOC_VERBOSE(ptr, NULL, 0, gfp_mask) +#define OBD_PAGE_CPT_ALLOC(ptr, cptab, cpt, gfp_mask) \ + __OBD_PAGE_ALLOC_VERBOSE(ptr, cptab, cpt, gfp_mask) + +#define OBD_PAGE_FREE(ptr) \ +do { \ + LASSERT(ptr); \ + obd_pages_sub(0); \ + CDEBUG(D_MALLOC, "free_pages '" #ptr "': %d page(s) / %llu bytes " \ + "at %p.\n", \ + (int)1, (__u64)(1 << PAGE_CACHE_SHIFT), \ + ptr); \ + __free_page(ptr); \ + (ptr) = (void *)0xdeadbeef; \ +} while (0) + +#endif diff --git a/kernel/drivers/staging/lustre/lustre/lclient/glimpse.c b/kernel/drivers/staging/lustre/lustre/lclient/glimpse.c new file mode 100644 index 000000000..b9f2bb66d --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lclient/glimpse.c @@ -0,0 +1,269 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * glimpse code shared between vvp and liblustre (and other Lustre clients in + * the future). + * + * Author: Nikita Danilov + * Author: Oleg Drokin + */ + +#include "../../include/linux/libcfs/libcfs.h" +#include "../include/obd_class.h" +#include "../include/obd_support.h" +#include "../include/obd.h" + +#include "../include/lustre_dlm.h" +#include "../include/lustre_lite.h" +#include "../include/lustre_mdc.h" +#include +#include + +#include "../include/cl_object.h" +#include "../include/lclient.h" +#include "../llite/llite_internal.h" + +static const struct cl_lock_descr whole_file = { + .cld_start = 0, + .cld_end = CL_PAGE_EOF, + .cld_mode = CLM_READ +}; + +/* + * Check whether file has possible unwriten pages. + * + * \retval 1 file is mmap-ed or has dirty pages + * 0 otherwise + */ +blkcnt_t dirty_cnt(struct inode *inode) +{ + blkcnt_t cnt = 0; + struct ccc_object *vob = cl_inode2ccc(inode); + void *results[1]; + + if (inode->i_mapping != NULL) + cnt += radix_tree_gang_lookup_tag(&inode->i_mapping->page_tree, + results, 0, 1, + PAGECACHE_TAG_DIRTY); + if (cnt == 0 && atomic_read(&vob->cob_mmap_cnt) > 0) + cnt = 1; + + return (cnt > 0) ? 1 : 0; +} + +int cl_glimpse_lock(const struct lu_env *env, struct cl_io *io, + struct inode *inode, struct cl_object *clob, int agl) +{ + struct cl_lock_descr *descr = &ccc_env_info(env)->cti_descr; + struct cl_inode_info *lli = cl_i2info(inode); + const struct lu_fid *fid = lu_object_fid(&clob->co_lu); + struct ccc_io *cio = ccc_env_io(env); + struct cl_lock *lock; + int result; + + result = 0; + if (!(lli->lli_flags & LLIF_MDS_SIZE_LOCK)) { + CDEBUG(D_DLMTRACE, "Glimpsing inode "DFID"\n", PFID(fid)); + if (lli->lli_has_smd) { + /* NOTE: this looks like DLM lock request, but it may + * not be one. Due to CEF_ASYNC flag (translated + * to LDLM_FL_HAS_INTENT by osc), this is + * glimpse request, that won't revoke any + * conflicting DLM locks held. Instead, + * ll_glimpse_callback() will be called on each + * client holding a DLM lock against this file, + * and resulting size will be returned for each + * stripe. DLM lock on [0, EOF] is acquired only + * if there were no conflicting locks. If there + * were conflicting locks, enqueuing or waiting + * fails with -ENAVAIL, but valid inode + * attributes are returned anyway. 
*/ + *descr = whole_file; + descr->cld_obj = clob; + descr->cld_mode = CLM_PHANTOM; + descr->cld_enq_flags = CEF_ASYNC | CEF_MUST; + if (agl) + descr->cld_enq_flags |= CEF_AGL; + cio->cui_glimpse = 1; + /* + * CEF_ASYNC is used because glimpse sub-locks cannot + * deadlock (because they never conflict with other + * locks) and, hence, can be enqueued out-of-order. + * + * CEF_MUST protects glimpse lock from conversion into + * a lockless mode. + */ + lock = cl_lock_request(env, io, descr, "glimpse", + current); + cio->cui_glimpse = 0; + + if (lock == NULL) + return 0; + + if (IS_ERR(lock)) + return PTR_ERR(lock); + + LASSERT(agl == 0); + result = cl_wait(env, lock); + if (result == 0) { + cl_merge_lvb(env, inode); + if (cl_isize_read(inode) > 0 && + inode->i_blocks == 0) { + /* + * LU-417: Add dirty pages block count + * lest i_blocks reports 0, some "cp" or + * "tar" may think it's a completely + * sparse file and skip it. + */ + inode->i_blocks = dirty_cnt(inode); + } + cl_unuse(env, lock); + } + cl_lock_release(env, lock, "glimpse", current); + } else { + CDEBUG(D_DLMTRACE, "No objects for inode\n"); + cl_merge_lvb(env, inode); + } + } + + return result; +} + +static int cl_io_get(struct inode *inode, struct lu_env **envout, + struct cl_io **ioout, int *refcheck) +{ + struct lu_env *env; + struct cl_io *io; + struct cl_inode_info *lli = cl_i2info(inode); + struct cl_object *clob = lli->lli_clob; + int result; + + if (S_ISREG(cl_inode_mode(inode))) { + env = cl_env_get(refcheck); + if (!IS_ERR(env)) { + io = ccc_env_thread_io(env); + io->ci_obj = clob; + *envout = env; + *ioout = io; + result = 1; + } else + result = PTR_ERR(env); + } else + result = 0; + return result; +} + +int cl_glimpse_size0(struct inode *inode, int agl) +{ + /* + * We don't need ast_flags argument to cl_glimpse_size(), because + * osc_lock_enqueue() takes care of the possible deadlock that said + * argument was introduced to avoid. + */ + /* + * XXX but note that ll_file_seek() passes LDLM_FL_BLOCK_NOWAIT to + * cl_glimpse_size(), which doesn't make sense: glimpse locks are not + * blocking anyway. + */ + struct lu_env *env = NULL; + struct cl_io *io = NULL; + int result; + int refcheck; + + result = cl_io_get(inode, &env, &io, &refcheck); + if (result > 0) { +again: + io->ci_verify_layout = 1; + result = cl_io_init(env, io, CIT_MISC, io->ci_obj); + if (result > 0) + /* + * nothing to do for this io. This currently happens + * when stripe sub-object's are not yet created. 
+ */ + result = io->ci_result; + else if (result == 0) + result = cl_glimpse_lock(env, io, inode, io->ci_obj, + agl); + + OBD_FAIL_TIMEOUT(OBD_FAIL_GLIMPSE_DELAY, 2); + cl_io_fini(env, io); + if (unlikely(io->ci_need_restart)) + goto again; + cl_env_put(env, &refcheck); + } + return result; +} + +int cl_local_size(struct inode *inode) +{ + struct lu_env *env = NULL; + struct cl_io *io = NULL; + struct ccc_thread_info *cti; + struct cl_object *clob; + struct cl_lock_descr *descr; + struct cl_lock *lock; + int result; + int refcheck; + + if (!cl_i2info(inode)->lli_has_smd) + return 0; + + result = cl_io_get(inode, &env, &io, &refcheck); + if (result <= 0) + return result; + + clob = io->ci_obj; + result = cl_io_init(env, io, CIT_MISC, clob); + if (result > 0) + result = io->ci_result; + else if (result == 0) { + cti = ccc_env_info(env); + descr = &cti->cti_descr; + + *descr = whole_file; + descr->cld_obj = clob; + lock = cl_lock_peek(env, io, descr, "localsize", current); + if (lock != NULL) { + cl_merge_lvb(env, inode); + cl_unuse(env, lock); + cl_lock_release(env, lock, "localsize", current); + result = 0; + } else + result = -ENODATA; + } + cl_io_fini(env, io); + cl_env_put(env, &refcheck); + return result; +} diff --git a/kernel/drivers/staging/lustre/lustre/lclient/lcommon_cl.c b/kernel/drivers/staging/lustre/lustre/lclient/lcommon_cl.c new file mode 100644 index 000000000..ab6cb4193 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lclient/lcommon_cl.c @@ -0,0 +1,1287 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * cl code shared between vvp and liblustre (and other Lustre clients in the + * future). 
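The glimpse entry points above (cl_glimpse_size0(), cl_glimpse_lock() and cl_local_size()) are normally reached through thin wrappers keyed on the agl flag; cl_glimpse_size() and cl_agl() in the client headers have this shape. The names below are illustrative stand-ins:

static inline int example_glimpse_size(struct inode *inode)
{
	/* Synchronous glimpse: block until the merged size is in the inode. */
	return cl_glimpse_size0(inode, 0);
}

static inline int example_agl(struct inode *inode)
{
	/* Asynchronous glimpse lock (AGL), used e.g. by statahead to
	 * prefetch file sizes without blocking the caller. */
	return cl_glimpse_size0(inode, 1);
}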
+ * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include "../../include/linux/libcfs/libcfs.h" +# include +# include +# include +# include +# include +# include +# include + +#include "../include/obd.h" +#include "../include/obd_support.h" +#include "../include/lustre_fid.h" +#include "../include/lustre_lite.h" +#include "../include/lustre_dlm.h" +#include "../include/lustre_ver.h" +#include "../include/lustre_mdc.h" +#include "../include/cl_object.h" + +#include "../include/lclient.h" + +#include "../llite/llite_internal.h" + +static const struct cl_req_operations ccc_req_ops; + +/* + * ccc_ prefix stands for "Common Client Code". + */ + +static struct kmem_cache *ccc_lock_kmem; +static struct kmem_cache *ccc_object_kmem; +static struct kmem_cache *ccc_thread_kmem; +static struct kmem_cache *ccc_session_kmem; +static struct kmem_cache *ccc_req_kmem; + +static struct lu_kmem_descr ccc_caches[] = { + { + .ckd_cache = &ccc_lock_kmem, + .ckd_name = "ccc_lock_kmem", + .ckd_size = sizeof(struct ccc_lock) + }, + { + .ckd_cache = &ccc_object_kmem, + .ckd_name = "ccc_object_kmem", + .ckd_size = sizeof(struct ccc_object) + }, + { + .ckd_cache = &ccc_thread_kmem, + .ckd_name = "ccc_thread_kmem", + .ckd_size = sizeof(struct ccc_thread_info), + }, + { + .ckd_cache = &ccc_session_kmem, + .ckd_name = "ccc_session_kmem", + .ckd_size = sizeof(struct ccc_session) + }, + { + .ckd_cache = &ccc_req_kmem, + .ckd_name = "ccc_req_kmem", + .ckd_size = sizeof(struct ccc_req) + }, + { + .ckd_cache = NULL + } +}; + +/***************************************************************************** + * + * Vvp device and device type functions. + * + */ + +void *ccc_key_init(const struct lu_context *ctx, struct lu_context_key *key) +{ + struct ccc_thread_info *info; + + OBD_SLAB_ALLOC_PTR_GFP(info, ccc_thread_kmem, GFP_NOFS); + if (info == NULL) + info = ERR_PTR(-ENOMEM); + return info; +} + +void ccc_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct ccc_thread_info *info = data; + + OBD_SLAB_FREE_PTR(info, ccc_thread_kmem); +} + +void *ccc_session_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct ccc_session *session; + + OBD_SLAB_ALLOC_PTR_GFP(session, ccc_session_kmem, GFP_NOFS); + if (session == NULL) + session = ERR_PTR(-ENOMEM); + return session; +} + +void ccc_session_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct ccc_session *session = data; + + OBD_SLAB_FREE_PTR(session, ccc_session_kmem); +} + +struct lu_context_key ccc_key = { + .lct_tags = LCT_CL_THREAD, + .lct_init = ccc_key_init, + .lct_fini = ccc_key_fini +}; + +struct lu_context_key ccc_session_key = { + .lct_tags = LCT_SESSION, + .lct_init = ccc_session_key_init, + .lct_fini = ccc_session_key_fini +}; + + +/* type constructor/destructor: ccc_type_{init,fini,start,stop}(). 
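The two lu_context_key objects registered above are how later code pulls per-thread (struct ccc_thread_info) and per-session (struct ccc_session) scratch state out of a struct lu_env; the ccc_env_info()/ccc_env_io() accessors used elsewhere in this file follow roughly this pattern (the example name is hypothetical):

static inline struct ccc_thread_info *example_env_info(const struct lu_env *env)
{
	struct ccc_thread_info *info;

	/* Returns the value ccc_key_init() allocated for this context. */
	info = lu_context_key_get(&env->le_ctx, &ccc_key);
	LASSERT(info != NULL);
	return info;
}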
*/ +/* LU_TYPE_INIT_FINI(ccc, &ccc_key, &ccc_session_key); */ + +int ccc_device_init(const struct lu_env *env, struct lu_device *d, + const char *name, struct lu_device *next) +{ + struct ccc_device *vdv; + int rc; + + vdv = lu2ccc_dev(d); + vdv->cdv_next = lu2cl_dev(next); + + LASSERT(d->ld_site != NULL && next->ld_type != NULL); + next->ld_site = d->ld_site; + rc = next->ld_type->ldt_ops->ldto_device_init( + env, next, next->ld_type->ldt_name, NULL); + if (rc == 0) { + lu_device_get(next); + lu_ref_add(&next->ld_reference, "lu-stack", &lu_site_init); + } + return rc; +} + +struct lu_device *ccc_device_fini(const struct lu_env *env, + struct lu_device *d) +{ + return cl2lu_dev(lu2ccc_dev(d)->cdv_next); +} + +struct lu_device *ccc_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg, + const struct lu_device_operations *luops, + const struct cl_device_operations *clops) +{ + struct ccc_device *vdv; + struct lu_device *lud; + struct cl_site *site; + int rc; + + OBD_ALLOC_PTR(vdv); + if (vdv == NULL) + return ERR_PTR(-ENOMEM); + + lud = &vdv->cdv_cl.cd_lu_dev; + cl_device_init(&vdv->cdv_cl, t); + ccc2lu_dev(vdv)->ld_ops = luops; + vdv->cdv_cl.cd_ops = clops; + + OBD_ALLOC_PTR(site); + if (site != NULL) { + rc = cl_site_init(site, &vdv->cdv_cl); + if (rc == 0) + rc = lu_site_init_finish(&site->cs_lu); + else { + LASSERT(lud->ld_site == NULL); + CERROR("Cannot init lu_site, rc %d.\n", rc); + OBD_FREE_PTR(site); + } + } else + rc = -ENOMEM; + if (rc != 0) { + ccc_device_free(env, lud); + lud = ERR_PTR(rc); + } + return lud; +} + +struct lu_device *ccc_device_free(const struct lu_env *env, + struct lu_device *d) +{ + struct ccc_device *vdv = lu2ccc_dev(d); + struct cl_site *site = lu2cl_site(d->ld_site); + struct lu_device *next = cl2lu_dev(vdv->cdv_next); + + if (d->ld_site != NULL) { + cl_site_fini(site); + OBD_FREE_PTR(site); + } + cl_device_fini(lu2cl_dev(d)); + OBD_FREE_PTR(vdv); + return next; +} + +int ccc_req_init(const struct lu_env *env, struct cl_device *dev, + struct cl_req *req) +{ + struct ccc_req *vrq; + int result; + + OBD_SLAB_ALLOC_PTR_GFP(vrq, ccc_req_kmem, GFP_NOFS); + if (vrq != NULL) { + cl_req_slice_add(req, &vrq->crq_cl, dev, &ccc_req_ops); + result = 0; + } else + result = -ENOMEM; + return result; +} + +/** + * An `emergency' environment used by ccc_inode_fini() when cl_env_get() + * fails. Access to this environment is serialized by ccc_inode_fini_guard + * mutex. + */ +static struct lu_env *ccc_inode_fini_env; + +/** + * A mutex serializing calls to slp_inode_fini() under extreme memory + * pressure, when environments cannot be allocated. 
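A sketch of the fallback this emergency environment and its guard mutex (defined just below) make possible; the function name is hypothetical, and the real consumer is the inode teardown path mentioned above, which only borrows the preallocated environment when cl_env_get() fails under memory pressure:

static void example_fini_under_pressure(struct inode *inode)
{
	struct lu_env *env;
	int refcheck;
	bool emergency = false;

	env = cl_env_get(&refcheck);
	if (IS_ERR(env)) {
		/* No memory for a fresh environment: borrow the
		 * preallocated one, serialized by the guard mutex. */
		mutex_lock(&ccc_inode_fini_guard);
		LASSERT(ccc_inode_fini_env != NULL);
		env = ccc_inode_fini_env;
		emergency = true;
	}

	/* ... tear down the inode's cl_object state using env ... */

	if (emergency)
		mutex_unlock(&ccc_inode_fini_guard);
	else
		cl_env_put(env, &refcheck);
}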
+ */ +static DEFINE_MUTEX(ccc_inode_fini_guard); +static int dummy_refcheck; + +int ccc_global_init(struct lu_device_type *device_type) +{ + int result; + + result = lu_kmem_init(ccc_caches); + if (result) + return result; + + result = lu_device_type_init(device_type); + if (result) + goto out_kmem; + + ccc_inode_fini_env = cl_env_alloc(&dummy_refcheck, + LCT_REMEMBER|LCT_NOREF); + if (IS_ERR(ccc_inode_fini_env)) { + result = PTR_ERR(ccc_inode_fini_env); + goto out_device; + } + + ccc_inode_fini_env->le_ctx.lc_cookie = 0x4; + return 0; +out_device: + lu_device_type_fini(device_type); +out_kmem: + lu_kmem_fini(ccc_caches); + return result; +} + +void ccc_global_fini(struct lu_device_type *device_type) +{ + if (ccc_inode_fini_env != NULL) { + cl_env_put(ccc_inode_fini_env, &dummy_refcheck); + ccc_inode_fini_env = NULL; + } + lu_device_type_fini(device_type); + lu_kmem_fini(ccc_caches); +} + +/***************************************************************************** + * + * Object operations. + * + */ + +struct lu_object *ccc_object_alloc(const struct lu_env *env, + const struct lu_object_header *unused, + struct lu_device *dev, + const struct cl_object_operations *clops, + const struct lu_object_operations *luops) +{ + struct ccc_object *vob; + struct lu_object *obj; + + OBD_SLAB_ALLOC_PTR_GFP(vob, ccc_object_kmem, GFP_NOFS); + if (vob != NULL) { + struct cl_object_header *hdr; + + obj = ccc2lu(vob); + hdr = &vob->cob_header; + cl_object_header_init(hdr); + lu_object_init(obj, &hdr->coh_lu, dev); + lu_object_add_top(&hdr->coh_lu, obj); + + vob->cob_cl.co_ops = clops; + obj->lo_ops = luops; + } else + obj = NULL; + return obj; +} + +int ccc_object_init0(const struct lu_env *env, + struct ccc_object *vob, + const struct cl_object_conf *conf) +{ + vob->cob_inode = conf->coc_inode; + vob->cob_transient_pages = 0; + cl_object_page_init(&vob->cob_cl, sizeof(struct ccc_page)); + return 0; +} + +int ccc_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf) +{ + struct ccc_device *dev = lu2ccc_dev(obj->lo_dev); + struct ccc_object *vob = lu2ccc(obj); + struct lu_object *below; + struct lu_device *under; + int result; + + under = &dev->cdv_next->cd_lu_dev; + below = under->ld_ops->ldo_object_alloc(env, obj->lo_header, under); + if (below != NULL) { + const struct cl_object_conf *cconf; + + cconf = lu2cl_conf(conf); + INIT_LIST_HEAD(&vob->cob_pending_list); + lu_object_add(obj, below); + result = ccc_object_init0(env, vob, cconf); + } else + result = -ENOMEM; + return result; +} + +void ccc_object_free(const struct lu_env *env, struct lu_object *obj) +{ + struct ccc_object *vob = lu2ccc(obj); + + lu_object_fini(obj); + lu_object_header_fini(obj->lo_header); + OBD_SLAB_FREE_PTR(vob, ccc_object_kmem); +} + +int ccc_lock_init(const struct lu_env *env, + struct cl_object *obj, struct cl_lock *lock, + const struct cl_io *unused, + const struct cl_lock_operations *lkops) +{ + struct ccc_lock *clk; + int result; + + CLOBINVRNT(env, obj, ccc_object_invariant(obj)); + + OBD_SLAB_ALLOC_PTR_GFP(clk, ccc_lock_kmem, GFP_NOFS); + if (clk != NULL) { + cl_lock_slice_add(lock, &clk->clk_cl, obj, lkops); + result = 0; + } else + result = -ENOMEM; + return result; +} + +int ccc_attr_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid) +{ + return 0; +} + +int ccc_object_glimpse(const struct lu_env *env, + const struct cl_object *obj, struct ost_lvb *lvb) +{ + struct inode *inode = ccc_object_inode(obj); + + lvb->lvb_mtime = 
cl_inode_mtime(inode); + lvb->lvb_atime = cl_inode_atime(inode); + lvb->lvb_ctime = cl_inode_ctime(inode); + /* + * LU-417: Add dirty pages block count lest i_blocks reports 0, some + * "cp" or "tar" on remote node may think it's a completely sparse file + * and skip it. + */ + if (lvb->lvb_size > 0 && lvb->lvb_blocks == 0) + lvb->lvb_blocks = dirty_cnt(inode); + return 0; +} + + + +int ccc_conf_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf) +{ + /* TODO: destroy all pages attached to this object. */ + return 0; +} + +static void ccc_object_size_lock(struct cl_object *obj) +{ + struct inode *inode = ccc_object_inode(obj); + + cl_isize_lock(inode); + cl_object_attr_lock(obj); +} + +static void ccc_object_size_unlock(struct cl_object *obj) +{ + struct inode *inode = ccc_object_inode(obj); + + cl_object_attr_unlock(obj); + cl_isize_unlock(inode); +} + +/***************************************************************************** + * + * Page operations. + * + */ + +struct page *ccc_page_vmpage(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + return cl2vm_page(slice); +} + +int ccc_page_is_under_lock(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io) +{ + struct ccc_io *cio = ccc_env_io(env); + struct cl_lock_descr *desc = &ccc_env_info(env)->cti_descr; + struct cl_page *page = slice->cpl_page; + + int result; + + if (io->ci_type == CIT_READ || io->ci_type == CIT_WRITE || + io->ci_type == CIT_FAULT) { + if (cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED) + result = -EBUSY; + else { + desc->cld_start = page->cp_index; + desc->cld_end = page->cp_index; + desc->cld_obj = page->cp_obj; + desc->cld_mode = CLM_READ; + result = cl_queue_match(&io->ci_lockset.cls_done, + desc) ? -EBUSY : 0; + } + } else + result = 0; + return result; +} + +int ccc_fail(const struct lu_env *env, const struct cl_page_slice *slice) +{ + /* + * Cached read? + */ + LBUG(); + return 0; +} + +void ccc_transient_page_verify(const struct cl_page *page) +{ +} + +int ccc_transient_page_own(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused, + int nonblock) +{ + ccc_transient_page_verify(slice->cpl_page); + return 0; +} + +void ccc_transient_page_assume(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + ccc_transient_page_verify(slice->cpl_page); +} + +void ccc_transient_page_unassume(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + ccc_transient_page_verify(slice->cpl_page); +} + +void ccc_transient_page_disown(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + ccc_transient_page_verify(slice->cpl_page); +} + +void ccc_transient_page_discard(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + struct cl_page *page = slice->cpl_page; + + ccc_transient_page_verify(slice->cpl_page); + + /* + * For transient pages, remove it from the radix tree. + */ + cl_page_delete(env, page); +} + +int ccc_transient_page_prep(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + /* transient page should always be sent. */ + return 0; +} + +/***************************************************************************** + * + * Lock operations. 
+ * + */ + +void ccc_lock_delete(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + CLOBINVRNT(env, slice->cls_obj, ccc_object_invariant(slice->cls_obj)); +} + +void ccc_lock_fini(const struct lu_env *env, struct cl_lock_slice *slice) +{ + struct ccc_lock *clk = cl2ccc_lock(slice); + + OBD_SLAB_FREE_PTR(clk, ccc_lock_kmem); +} + +int ccc_lock_enqueue(const struct lu_env *env, + const struct cl_lock_slice *slice, + struct cl_io *unused, __u32 enqflags) +{ + CLOBINVRNT(env, slice->cls_obj, ccc_object_invariant(slice->cls_obj)); + return 0; +} + +int ccc_lock_use(const struct lu_env *env, const struct cl_lock_slice *slice) +{ + CLOBINVRNT(env, slice->cls_obj, ccc_object_invariant(slice->cls_obj)); + return 0; +} + +int ccc_lock_unuse(const struct lu_env *env, const struct cl_lock_slice *slice) +{ + CLOBINVRNT(env, slice->cls_obj, ccc_object_invariant(slice->cls_obj)); + return 0; +} + +int ccc_lock_wait(const struct lu_env *env, const struct cl_lock_slice *slice) +{ + CLOBINVRNT(env, slice->cls_obj, ccc_object_invariant(slice->cls_obj)); + return 0; +} + +/** + * Implementation of cl_lock_operations::clo_fits_into() methods for ccc + * layer. This function is executed every time io finds an existing lock in + * the lock cache while creating new lock. This function has to decide whether + * cached lock "fits" into io. + * + * \param slice lock to be checked + * \param io IO that wants a lock. + * + * \see lov_lock_fits_into(). + */ +int ccc_lock_fits_into(const struct lu_env *env, + const struct cl_lock_slice *slice, + const struct cl_lock_descr *need, + const struct cl_io *io) +{ + const struct cl_lock *lock = slice->cls_lock; + const struct cl_lock_descr *descr = &lock->cll_descr; + const struct ccc_io *cio = ccc_env_io(env); + int result; + + /* + * Work around DLM peculiarity: it assumes that glimpse + * (LDLM_FL_HAS_INTENT) lock is always LCK_PR, and returns reads lock + * when asked for LCK_PW lock with LDLM_FL_HAS_INTENT flag set. Make + * sure that glimpse doesn't get CLM_WRITE top-lock, so that it + * doesn't enqueue CLM_WRITE sub-locks. + */ + if (cio->cui_glimpse) + result = descr->cld_mode != CLM_WRITE; + + /* + * Also, don't match incomplete write locks for read, otherwise read + * would enqueue missing sub-locks in the write mode. + */ + else if (need->cld_mode != descr->cld_mode) + result = lock->cll_state >= CLS_ENQUEUED; + else + result = 1; + return result; +} + +/** + * Implements cl_lock_operations::clo_state() method for ccc layer, invoked + * whenever lock state changes. Transfers object attributes, that might be + * updated as a result of lock acquiring into inode. + */ +void ccc_lock_state(const struct lu_env *env, + const struct cl_lock_slice *slice, + enum cl_lock_state state) +{ + struct cl_lock *lock = slice->cls_lock; + + /* + * Refresh inode attributes when the lock is moving into CLS_HELD + * state, and only when this is a result of real enqueue, rather than + * of finding lock in the cache. + */ + if (state == CLS_HELD && lock->cll_state < CLS_HELD) { + struct cl_object *obj; + struct inode *inode; + + obj = slice->cls_obj; + inode = ccc_object_inode(obj); + + /* vmtruncate() sets the i_size + * under both a DLM lock and the + * ll_inode_size_lock(). If we don't get the + * ll_inode_size_lock() here we can match the DLM lock and + * reset i_size. generic_file_write can then trust the + * stale i_size when doing appending writes and effectively + * cancel the result of the truncate. 
Getting the + * ll_inode_size_lock() after the enqueue maintains the DLM + * -> ll_inode_size_lock() acquiring order. */ + if (lock->cll_descr.cld_start == 0 && + lock->cll_descr.cld_end == CL_PAGE_EOF) + cl_merge_lvb(env, inode); + } +} + +/***************************************************************************** + * + * io operations. + * + */ + +void ccc_io_fini(const struct lu_env *env, const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + + CLOBINVRNT(env, io->ci_obj, ccc_object_invariant(io->ci_obj)); +} + +int ccc_io_one_lock_index(const struct lu_env *env, struct cl_io *io, + __u32 enqflags, enum cl_lock_mode mode, + pgoff_t start, pgoff_t end) +{ + struct ccc_io *cio = ccc_env_io(env); + struct cl_lock_descr *descr = &cio->cui_link.cill_descr; + struct cl_object *obj = io->ci_obj; + + CLOBINVRNT(env, obj, ccc_object_invariant(obj)); + + CDEBUG(D_VFSTRACE, "lock: %d [%lu, %lu]\n", mode, start, end); + + memset(&cio->cui_link, 0, sizeof(cio->cui_link)); + + if (cio->cui_fd && (cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) { + descr->cld_mode = CLM_GROUP; + descr->cld_gid = cio->cui_fd->fd_grouplock.cg_gid; + } else { + descr->cld_mode = mode; + } + descr->cld_obj = obj; + descr->cld_start = start; + descr->cld_end = end; + descr->cld_enq_flags = enqflags; + + cl_io_lock_add(env, io, &cio->cui_link); + return 0; +} + +void ccc_io_update_iov(const struct lu_env *env, + struct ccc_io *cio, struct cl_io *io) +{ + size_t size = io->u.ci_rw.crw_count; + + if (!cl_is_normalio(env, io) || cio->cui_iter == NULL) + return; + + iov_iter_truncate(cio->cui_iter, size); +} + +int ccc_io_one_lock(const struct lu_env *env, struct cl_io *io, + __u32 enqflags, enum cl_lock_mode mode, + loff_t start, loff_t end) +{ + struct cl_object *obj = io->ci_obj; + + return ccc_io_one_lock_index(env, io, enqflags, mode, + cl_index(obj, start), cl_index(obj, end)); +} + +void ccc_io_end(const struct lu_env *env, const struct cl_io_slice *ios) +{ + CLOBINVRNT(env, ios->cis_io->ci_obj, + ccc_object_invariant(ios->cis_io->ci_obj)); +} + +void ccc_io_advance(const struct lu_env *env, + const struct cl_io_slice *ios, + size_t nob) +{ + struct ccc_io *cio = cl2ccc_io(env, ios); + struct cl_io *io = ios->cis_io; + struct cl_object *obj = ios->cis_io->ci_obj; + + CLOBINVRNT(env, obj, ccc_object_invariant(obj)); + + if (!cl_is_normalio(env, io)) + return; + + iov_iter_reexpand(cio->cui_iter, cio->cui_tot_count -= nob); +} + +/** + * Helper function that if necessary adjusts file size (inode->i_size), when + * position at the offset \a pos is accessed. File size can be arbitrary stale + * on a Lustre client, but client at least knows KMS. If accessed area is + * inside [0, KMS], set file size to KMS, otherwise glimpse file size. + * + * Locking: cl_isize_lock is used to serialize changes to inode size and to + * protect consistency between inode size and cl_object + * attributes. cl_object_size_lock() protects consistency between cl_attr's of + * top-object and sub-objects. 
+ */ +int ccc_prep_size(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io, loff_t start, size_t count, int *exceed) +{ + struct cl_attr *attr = ccc_env_thread_attr(env); + struct inode *inode = ccc_object_inode(obj); + loff_t pos = start + count - 1; + loff_t kms; + int result; + + /* + * Consistency guarantees: following possibilities exist for the + * relation between region being accessed and real file size at this + * moment: + * + * (A): the region is completely inside of the file; + * + * (B-x): x bytes of region are inside of the file, the rest is + * outside; + * + * (C): the region is completely outside of the file. + * + * This classification is stable under DLM lock already acquired by + * the caller, because to change the class, other client has to take + * DLM lock conflicting with our lock. Also, any updates to ->i_size + * by other threads on this client are serialized by + * ll_inode_size_lock(). This guarantees that short reads are handled + * correctly in the face of concurrent writes and truncates. + */ + ccc_object_size_lock(obj); + result = cl_object_attr_get(env, obj, attr); + if (result == 0) { + kms = attr->cat_kms; + if (pos > kms) { + /* + * A glimpse is necessary to determine whether we + * return a short read (B) or some zeroes at the end + * of the buffer (C) + */ + ccc_object_size_unlock(obj); + result = cl_glimpse_lock(env, io, inode, obj, 0); + if (result == 0 && exceed != NULL) { + /* If objective page index exceed end-of-file + * page index, return directly. Do not expect + * kernel will check such case correctly. + * linux-2.6.18-128.1.1 miss to do that. + * --bug 17336 */ + loff_t size = cl_isize_read(inode); + loff_t cur_index = start >> PAGE_CACHE_SHIFT; + loff_t size_index = (size - 1) >> + PAGE_CACHE_SHIFT; + + if ((size == 0 && cur_index != 0) || + size_index < cur_index) + *exceed = 1; + } + return result; + } else { + /* + * region is within kms and, hence, within real file + * size (A). We need to increase i_size to cover the + * read region so that generic_file_read() will do its + * job, but that doesn't mean the kms size is + * _correct_, it is only the _minimum_ size. If + * someone does a stat they will get the correct size + * which will always be >= the kms value here. + * b=11081 + */ + if (cl_isize_read(inode) < kms) { + cl_isize_write_nolock(inode, kms); + CDEBUG(D_VFSTRACE, + DFID" updating i_size %llu\n", + PFID(lu_object_fid(&obj->co_lu)), + (__u64)cl_isize_read(inode)); + + } + } + } + ccc_object_size_unlock(obj); + return result; +} + +/***************************************************************************** + * + * Transfer operations. + * + */ + +void ccc_req_completion(const struct lu_env *env, + const struct cl_req_slice *slice, int ioret) +{ + struct ccc_req *vrq; + + if (ioret > 0) + cl_stats_tally(slice->crs_dev, slice->crs_req->crq_type, ioret); + + vrq = cl2ccc_req(slice); + OBD_SLAB_FREE_PTR(vrq, ccc_req_kmem); +} + +/** + * Implementation of struct cl_req_operations::cro_attr_set() for ccc + * layer. ccc is responsible for + * + * - o_[mac]time + * + * - o_mode + * + * - o_parent_seq + * + * - o_[ug]id + * + * - o_parent_oid + * + * - o_parent_ver + * + * - o_ioepoch, + * + * and capability. 
+ */ +void ccc_req_attr_set(const struct lu_env *env, + const struct cl_req_slice *slice, + const struct cl_object *obj, + struct cl_req_attr *attr, u64 flags) +{ + struct inode *inode; + struct obdo *oa; + u32 valid_flags; + + oa = attr->cra_oa; + inode = ccc_object_inode(obj); + valid_flags = OBD_MD_FLTYPE; + + if ((flags & OBD_MD_FLOSSCAPA) != 0) { + LASSERT(attr->cra_capa == NULL); + attr->cra_capa = cl_capa_lookup(inode, + slice->crs_req->crq_type); + } + + if (slice->crs_req->crq_type == CRT_WRITE) { + if (flags & OBD_MD_FLEPOCH) { + oa->o_valid |= OBD_MD_FLEPOCH; + oa->o_ioepoch = cl_i2info(inode)->lli_ioepoch; + valid_flags |= OBD_MD_FLMTIME | OBD_MD_FLCTIME | + OBD_MD_FLUID | OBD_MD_FLGID; + } + } + obdo_from_inode(oa, inode, valid_flags & flags); + obdo_set_parent_fid(oa, &cl_i2info(inode)->lli_fid); + memcpy(attr->cra_jobid, cl_i2info(inode)->lli_jobid, + JOBSTATS_JOBID_SIZE); +} + +static const struct cl_req_operations ccc_req_ops = { + .cro_attr_set = ccc_req_attr_set, + .cro_completion = ccc_req_completion +}; + +int cl_setattr_ost(struct inode *inode, const struct iattr *attr, + struct obd_capa *capa) +{ + struct lu_env *env; + struct cl_io *io; + int result; + int refcheck; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + return PTR_ERR(env); + + io = ccc_env_thread_io(env); + io->ci_obj = cl_i2info(inode)->lli_clob; + + io->u.ci_setattr.sa_attr.lvb_atime = LTIME_S(attr->ia_atime); + io->u.ci_setattr.sa_attr.lvb_mtime = LTIME_S(attr->ia_mtime); + io->u.ci_setattr.sa_attr.lvb_ctime = LTIME_S(attr->ia_ctime); + io->u.ci_setattr.sa_attr.lvb_size = attr->ia_size; + io->u.ci_setattr.sa_valid = attr->ia_valid; + io->u.ci_setattr.sa_capa = capa; + +again: + if (cl_io_init(env, io, CIT_SETATTR, io->ci_obj) == 0) { + struct ccc_io *cio = ccc_env_io(env); + + if (attr->ia_valid & ATTR_FILE) + /* populate the file descriptor for ftruncate to honor + * group lock - see LU-787 */ + cio->cui_fd = cl_iattr2fd(inode, attr); + + result = cl_io_loop(env, io); + } else { + result = io->ci_result; + } + cl_io_fini(env, io); + if (unlikely(io->ci_need_restart)) + goto again; + /* HSM import case: file is released, cannot be restored + * no need to fail except if restore registration failed + * with -ENODATA */ + if (result == -ENODATA && io->ci_restore_needed && + io->ci_result != -ENODATA) + result = 0; + cl_env_put(env, &refcheck); + return result; +} + +/***************************************************************************** + * + * Type conversions. 
+ * + */ + +struct lu_device *ccc2lu_dev(struct ccc_device *vdv) +{ + return &vdv->cdv_cl.cd_lu_dev; +} + +struct ccc_device *lu2ccc_dev(const struct lu_device *d) +{ + return container_of0(d, struct ccc_device, cdv_cl.cd_lu_dev); +} + +struct ccc_device *cl2ccc_dev(const struct cl_device *d) +{ + return container_of0(d, struct ccc_device, cdv_cl); +} + +struct lu_object *ccc2lu(struct ccc_object *vob) +{ + return &vob->cob_cl.co_lu; +} + +struct ccc_object *lu2ccc(const struct lu_object *obj) +{ + return container_of0(obj, struct ccc_object, cob_cl.co_lu); +} + +struct ccc_object *cl2ccc(const struct cl_object *obj) +{ + return container_of0(obj, struct ccc_object, cob_cl); +} + +struct ccc_lock *cl2ccc_lock(const struct cl_lock_slice *slice) +{ + return container_of(slice, struct ccc_lock, clk_cl); +} + +struct ccc_io *cl2ccc_io(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct ccc_io *cio; + + cio = container_of(slice, struct ccc_io, cui_cl); + LASSERT(cio == ccc_env_io(env)); + return cio; +} + +struct ccc_req *cl2ccc_req(const struct cl_req_slice *slice) +{ + return container_of0(slice, struct ccc_req, crq_cl); +} + +struct page *cl2vm_page(const struct cl_page_slice *slice) +{ + return cl2ccc_page(slice)->cpg_page; +} + +/***************************************************************************** + * + * Accessors. + * + */ +int ccc_object_invariant(const struct cl_object *obj) +{ + struct inode *inode = ccc_object_inode(obj); + struct cl_inode_info *lli = cl_i2info(inode); + + return (S_ISREG(cl_inode_mode(inode)) || + /* i_mode of unlinked inode is zeroed. */ + cl_inode_mode(inode) == 0) && lli->lli_clob == obj; +} + +struct inode *ccc_object_inode(const struct cl_object *obj) +{ + return cl2ccc(obj)->cob_inode; +} + +/** + * Returns a pointer to cl_page associated with \a vmpage, without acquiring + * additional reference to the resulting page. This is an unsafe version of + * cl_vmpage_page() that can only be used under vmpage lock. + */ +struct cl_page *ccc_vmpage_page_transient(struct page *vmpage) +{ + KLASSERT(PageLocked(vmpage)); + return (struct cl_page *)vmpage->private; +} + +/** + * Initialize or update CLIO structures for regular files when new + * meta-data arrives from the server. + * + * \param inode regular file inode + * \param md new file metadata from MDS + * - allocates cl_object if necessary, + * - updated layout, if object was already here. + */ +int cl_file_inode_init(struct inode *inode, struct lustre_md *md) +{ + struct lu_env *env; + struct cl_inode_info *lli; + struct cl_object *clob; + struct lu_site *site; + struct lu_fid *fid; + struct cl_object_conf conf = { + .coc_inode = inode, + .u = { + .coc_md = md + } + }; + int result = 0; + int refcheck; + + LASSERT(md->body->valid & OBD_MD_FLID); + LASSERT(S_ISREG(cl_inode_mode(inode))); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + return PTR_ERR(env); + + site = cl_i2sbi(inode)->ll_site; + lli = cl_i2info(inode); + fid = &lli->lli_fid; + LASSERT(fid_is_sane(fid)); + + if (lli->lli_clob == NULL) { + /* clob is slave of inode, empty lli_clob means for new inode, + * there is no clob in cache with the given fid, so it is + * unnecessary to perform lookup-alloc-lookup-insert, just + * alloc and insert directly. */ + LASSERT(inode->i_state & I_NEW); + conf.coc_lu.loc_flags = LOC_F_NEW; + clob = cl_object_find(env, lu2cl_dev(site->ls_top_dev), + fid, &conf); + if (!IS_ERR(clob)) { + /* + * No locking is necessary, as new inode is + * locked by I_NEW bit. 
+ */ + lli->lli_clob = clob; + lli->lli_has_smd = lsm_has_objects(md->lsm); + lu_object_ref_add(&clob->co_lu, "inode", inode); + } else + result = PTR_ERR(clob); + } else { + result = cl_conf_set(env, lli->lli_clob, &conf); + } + + cl_env_put(env, &refcheck); + + if (result != 0) + CERROR("Failure to initialize cl object "DFID": %d\n", + PFID(fid), result); + return result; +} + +/** + * Wait for others drop their references of the object at first, then we drop + * the last one, which will lead to the object be destroyed immediately. + * Must be called after cl_object_kill() against this object. + * + * The reason we want to do this is: destroying top object will wait for sub + * objects being destroyed first, so we can't let bottom layer (e.g. from ASTs) + * to initiate top object destroying which may deadlock. See bz22520. + */ +static void cl_object_put_last(struct lu_env *env, struct cl_object *obj) +{ + struct lu_object_header *header = obj->co_lu.lo_header; + wait_queue_t waiter; + + if (unlikely(atomic_read(&header->loh_ref) != 1)) { + struct lu_site *site = obj->co_lu.lo_dev->ld_site; + struct lu_site_bkt_data *bkt; + + bkt = lu_site_bkt_from_fid(site, &header->loh_fid); + + init_waitqueue_entry(&waiter, current); + add_wait_queue(&bkt->lsb_marche_funebre, &waiter); + + while (1) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (atomic_read(&header->loh_ref) == 1) + break; + schedule(); + } + + set_current_state(TASK_RUNNING); + remove_wait_queue(&bkt->lsb_marche_funebre, &waiter); + } + + cl_object_put(env, obj); +} + +void cl_inode_fini(struct inode *inode) +{ + struct lu_env *env; + struct cl_inode_info *lli = cl_i2info(inode); + struct cl_object *clob = lli->lli_clob; + int refcheck; + int emergency; + + if (clob != NULL) { + void *cookie; + + cookie = cl_env_reenter(); + env = cl_env_get(&refcheck); + emergency = IS_ERR(env); + if (emergency) { + mutex_lock(&ccc_inode_fini_guard); + LASSERT(ccc_inode_fini_env != NULL); + cl_env_implant(ccc_inode_fini_env, &refcheck); + env = ccc_inode_fini_env; + } + /* + * cl_object cache is a slave to inode cache (which, in turn + * is a slave to dentry cache), don't keep cl_object in memory + * when its master is evicted. + */ + cl_object_kill(env, clob); + lu_object_ref_del(&clob->co_lu, "inode", inode); + cl_object_put_last(env, clob); + lli->lli_clob = NULL; + if (emergency) { + cl_env_unplant(ccc_inode_fini_env, &refcheck); + mutex_unlock(&ccc_inode_fini_guard); + } else + cl_env_put(env, &refcheck); + cl_env_reexit(cookie); + } +} + +/** + * return IF_* type for given lu_dirent entry. + * IF_* flag shld be converted to particular OS file type in + * platform llite module. + */ +__u16 ll_dirent_type_get(struct lu_dirent *ent) +{ + __u16 type = 0; + struct luda_type *lt; + int len = 0; + + if (le32_to_cpu(ent->lde_attrs) & LUDA_TYPE) { + const unsigned align = sizeof(struct luda_type) - 1; + + len = le16_to_cpu(ent->lde_namelen); + len = (len + align) & ~align; + lt = (void *)ent->lde_name + len; + type = IFTODT(le16_to_cpu(lt->lt_type)); + } + return type; +} + +/** + * build inode number from passed @fid */ +__u64 cl_fid_build_ino(const struct lu_fid *fid, int api32) +{ + if (BITS_PER_LONG == 32 || api32) + return fid_flatten32(fid); + else + return fid_flatten(fid); +} + +/** + * build inode generation from passed @fid. If our FID overflows the 32-bit + * inode number then return a non-zero generation to distinguish them. 
*/ +__u32 cl_fid_build_gen(const struct lu_fid *fid) +{ + __u32 gen; + + if (fid_is_igif(fid)) { + gen = lu_igif_gen(fid); + return gen; + } + + gen = fid_flatten(fid) >> 32; + return gen; +} + +/* lsm is unreliable after hsm implementation as layout can be changed at + * any time. This is only to support old, non-clio-ized interfaces. It will + * cause deadlock if clio operations are called with this extra layout refcount + * because in case the layout changed during the IO, ll_layout_refresh() will + * have to wait for the refcount to become zero to destroy the older layout. + * + * Notice that the lsm returned by this function may not be valid unless called + * inside layout lock - MDS_INODELOCK_LAYOUT. */ +struct lov_stripe_md *ccc_inode_lsm_get(struct inode *inode) +{ + return lov_lsm_get(cl_i2info(inode)->lli_clob); +} + +inline void ccc_inode_lsm_put(struct inode *inode, struct lov_stripe_md *lsm) +{ + lov_lsm_put(cl_i2info(inode)->lli_clob, lsm); +} diff --git a/kernel/drivers/staging/lustre/lustre/lclient/lcommon_misc.c b/kernel/drivers/staging/lustre/lustre/lclient/lcommon_misc.c new file mode 100644 index 000000000..01bf894d4 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lclient/lcommon_misc.c @@ -0,0 +1,199 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * cl code shared between vvp and liblustre (and other Lustre clients in the + * future). + * + */ +#include "../include/obd_class.h" +#include "../include/obd_support.h" +#include "../include/obd.h" +#include "../include/cl_object.h" +#include "../include/lclient.h" + +#include "../include/lustre_lite.h" + + +/* Initialize the default and maximum LOV EA and cookie sizes. This allows + * us to make MDS RPCs with large enough reply buffers to hold the + * maximum-sized (= maximum striped) EA and cookie without having to + * calculate this (via a call into the LOV + OSCs) each time we make an RPC. 
*/ +int cl_init_ea_size(struct obd_export *md_exp, struct obd_export *dt_exp) +{ + struct lov_stripe_md lsm = { .lsm_magic = LOV_MAGIC_V3 }; + __u32 valsize = sizeof(struct lov_desc); + int rc, easize, def_easize, cookiesize; + struct lov_desc desc; + __u16 stripes, def_stripes; + + rc = obd_get_info(NULL, dt_exp, sizeof(KEY_LOVDESC), KEY_LOVDESC, + &valsize, &desc, NULL); + if (rc) + return rc; + + stripes = min_t(__u32, desc.ld_tgt_count, LOV_MAX_STRIPE_COUNT); + lsm.lsm_stripe_count = stripes; + easize = obd_size_diskmd(dt_exp, &lsm); + + def_stripes = min_t(__u32, desc.ld_default_stripe_count, + LOV_MAX_STRIPE_COUNT); + lsm.lsm_stripe_count = def_stripes; + def_easize = obd_size_diskmd(dt_exp, &lsm); + + cookiesize = stripes * sizeof(struct llog_cookie); + + /* default cookiesize is 0 because from 2.4 server doesn't send + * llog cookies to client. */ + CDEBUG(D_HA, + "updating def/max_easize: %d/%d def/max_cookiesize: 0/%d\n", + def_easize, easize, cookiesize); + + rc = md_init_ea_size(md_exp, easize, def_easize, cookiesize, 0); + return rc; +} + +/** + * This function is used as an upcall-callback hooked by liblustre and llite + * clients into obd_notify() listeners chain to handle notifications about + * change of import connect_flags. See llu_fsswop_mount() and + * lustre_common_fill_super(). + */ +int cl_ocd_update(struct obd_device *host, + struct obd_device *watched, + enum obd_notify_event ev, void *owner, void *data) +{ + struct lustre_client_ocd *lco; + struct client_obd *cli; + __u64 flags; + int result; + + if (!strcmp(watched->obd_type->typ_name, LUSTRE_OSC_NAME)) { + cli = &watched->u.cli; + lco = owner; + flags = cli->cl_import->imp_connect_data.ocd_connect_flags; + CDEBUG(D_SUPER, "Changing connect_flags: %#llx -> %#llx\n", + lco->lco_flags, flags); + mutex_lock(&lco->lco_lock); + lco->lco_flags &= flags; + /* for each osc event update ea size */ + if (lco->lco_dt_exp) + cl_init_ea_size(lco->lco_md_exp, lco->lco_dt_exp); + + mutex_unlock(&lco->lco_lock); + result = 0; + } else { + CERROR("unexpected notification from %s %s!\n", + watched->obd_type->typ_name, + watched->obd_name); + result = -EINVAL; + } + return result; +} + +#define GROUPLOCK_SCOPE "grouplock" + +int cl_get_grouplock(struct cl_object *obj, unsigned long gid, int nonblock, + struct ccc_grouplock *cg) +{ + struct lu_env *env; + struct cl_io *io; + struct cl_lock *lock; + struct cl_lock_descr *descr; + __u32 enqflags; + int refcheck; + int rc; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + return PTR_ERR(env); + + io = ccc_env_thread_io(env); + io->ci_obj = obj; + io->ci_ignore_layout = 1; + + rc = cl_io_init(env, io, CIT_MISC, io->ci_obj); + if (rc) { + /* Does not make sense to take GL for released layout */ + if (rc > 0) + rc = -ENOTSUPP; + cl_env_put(env, &refcheck); + return rc; + } + + descr = &ccc_env_info(env)->cti_descr; + descr->cld_obj = obj; + descr->cld_start = 0; + descr->cld_end = CL_PAGE_EOF; + descr->cld_gid = gid; + descr->cld_mode = CLM_GROUP; + + enqflags = CEF_MUST | (nonblock ? 
CEF_NONBLOCK : 0); + descr->cld_enq_flags = enqflags; + + lock = cl_lock_request(env, io, descr, GROUPLOCK_SCOPE, current); + if (IS_ERR(lock)) { + cl_io_fini(env, io); + cl_env_put(env, &refcheck); + return PTR_ERR(lock); + } + + cg->cg_env = cl_env_get(&refcheck); + cg->cg_io = io; + cg->cg_lock = lock; + cg->cg_gid = gid; + LASSERT(cg->cg_env == env); + + cl_env_unplant(env, &refcheck); + return 0; +} + +void cl_put_grouplock(struct ccc_grouplock *cg) +{ + struct lu_env *env = cg->cg_env; + struct cl_io *io = cg->cg_io; + struct cl_lock *lock = cg->cg_lock; + int refcheck; + + LASSERT(cg->cg_env); + LASSERT(cg->cg_gid); + + cl_env_implant(env, &refcheck); + cl_env_put(env, &refcheck); + + cl_unuse(env, lock); + cl_lock_release(env, lock, GROUPLOCK_SCOPE, current); + cl_io_fini(env, io); + cl_env_put(env, NULL); +} diff --git a/kernel/drivers/staging/lustre/lustre/ldlm/interval_tree.c b/kernel/drivers/staging/lustre/lustre/ldlm/interval_tree.c new file mode 100644 index 000000000..eab2bd602 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ldlm/interval_tree.c @@ -0,0 +1,751 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/ldlm/interval_tree.c + * + * Interval tree library used by ldlm extent lock code + * + * Author: Huang Wei + * Author: Jay Xiong + */ +#include "../include/lustre_dlm.h" +#include "../include/obd_support.h" +#include "../include/interval_tree.h" + +enum { + INTERVAL_RED = 0, + INTERVAL_BLACK = 1 +}; + +static inline int node_is_left_child(struct interval_node *node) +{ + LASSERT(node->in_parent != NULL); + return node == node->in_parent->in_left; +} + +static inline int node_is_right_child(struct interval_node *node) +{ + LASSERT(node->in_parent != NULL); + return node == node->in_parent->in_right; +} + +static inline int node_is_red(struct interval_node *node) +{ + return node->in_color == INTERVAL_RED; +} + +static inline int node_is_black(struct interval_node *node) +{ + return node->in_color == INTERVAL_BLACK; +} + +static inline int extent_compare(struct interval_node_extent *e1, + struct interval_node_extent *e2) +{ + int rc; + + if (e1->start == e2->start) { + if (e1->end < e2->end) + rc = -1; + else if (e1->end > e2->end) + rc = 1; + else + rc = 0; + } else { + if (e1->start < e2->start) + rc = -1; + else + rc = 1; + } + return rc; +} + +static inline int extent_equal(struct interval_node_extent *e1, + struct interval_node_extent *e2) +{ + return (e1->start == e2->start) && (e1->end == e2->end); +} + +static inline int extent_overlapped(struct interval_node_extent *e1, + struct interval_node_extent *e2) +{ + return (e1->start <= e2->end) && (e2->start <= e1->end); +} + +static inline int node_compare(struct interval_node *n1, + struct interval_node *n2) +{ + return extent_compare(&n1->in_extent, &n2->in_extent); +} + +static inline int node_equal(struct interval_node *n1, + struct interval_node *n2) +{ + return extent_equal(&n1->in_extent, &n2->in_extent); +} + +static inline __u64 max_u64(__u64 x, __u64 y) +{ + return x > y ? x : y; +} + +static inline __u64 min_u64(__u64 x, __u64 y) +{ + return x < y ? 
x : y; +} + +#define interval_for_each(node, root) \ +for (node = interval_first(root); node != NULL; \ + node = interval_next(node)) + +#define interval_for_each_reverse(node, root) \ +for (node = interval_last(root); node != NULL; \ + node = interval_prev(node)) + +static struct interval_node *interval_first(struct interval_node *node) +{ + if (!node) + return NULL; + while (node->in_left) + node = node->in_left; + return node; +} + +static struct interval_node *interval_last(struct interval_node *node) +{ + if (!node) + return NULL; + while (node->in_right) + node = node->in_right; + return node; +} + +static struct interval_node *interval_next(struct interval_node *node) +{ + if (!node) + return NULL; + if (node->in_right) + return interval_first(node->in_right); + while (node->in_parent && node_is_right_child(node)) + node = node->in_parent; + return node->in_parent; +} + +static struct interval_node *interval_prev(struct interval_node *node) +{ + if (!node) + return NULL; + + if (node->in_left) + return interval_last(node->in_left); + + while (node->in_parent && node_is_left_child(node)) + node = node->in_parent; + + return node->in_parent; +} + +enum interval_iter interval_iterate(struct interval_node *root, + interval_callback_t func, + void *data) +{ + struct interval_node *node; + enum interval_iter rc = INTERVAL_ITER_CONT; + + interval_for_each(node, root) { + rc = func(node, data); + if (rc == INTERVAL_ITER_STOP) + break; + } + + return rc; +} +EXPORT_SYMBOL(interval_iterate); + +enum interval_iter interval_iterate_reverse(struct interval_node *root, + interval_callback_t func, + void *data) +{ + struct interval_node *node; + enum interval_iter rc = INTERVAL_ITER_CONT; + + interval_for_each_reverse(node, root) { + rc = func(node, data); + if (rc == INTERVAL_ITER_STOP) + break; + } + + return rc; +} +EXPORT_SYMBOL(interval_iterate_reverse); + +/* try to find a node with same interval in the tree, + * if found, return the pointer to the node, otherwise return NULL*/ +struct interval_node *interval_find(struct interval_node *root, + struct interval_node_extent *ex) +{ + struct interval_node *walk = root; + int rc; + + while (walk) { + rc = extent_compare(ex, &walk->in_extent); + if (rc == 0) + break; + else if (rc < 0) + walk = walk->in_left; + else + walk = walk->in_right; + } + + return walk; +} +EXPORT_SYMBOL(interval_find); + +static void __rotate_change_maxhigh(struct interval_node *node, + struct interval_node *rotate) +{ + __u64 left_max, right_max; + + rotate->in_max_high = node->in_max_high; + left_max = node->in_left ? node->in_left->in_max_high : 0; + right_max = node->in_right ? node->in_right->in_max_high : 0; + node->in_max_high = max_u64(interval_high(node), + max_u64(left_max, right_max)); +} + +/* The left rotation "pivots" around the link from node to node->right, and + * - node will be linked to node->right's left child, and + * - node->right's left child will be linked to node's right child. 
*/ +static void __rotate_left(struct interval_node *node, + struct interval_node **root) +{ + struct interval_node *right = node->in_right; + struct interval_node *parent = node->in_parent; + + node->in_right = right->in_left; + if (node->in_right) + right->in_left->in_parent = node; + + right->in_left = node; + right->in_parent = parent; + if (parent) { + if (node_is_left_child(node)) + parent->in_left = right; + else + parent->in_right = right; + } else { + *root = right; + } + node->in_parent = right; + + /* update max_high for node and right */ + __rotate_change_maxhigh(node, right); +} + +/* The right rotation "pivots" around the link from node to node->left, and + * - node will be linked to node->left's right child, and + * - node->left's right child will be linked to node's left child. */ +static void __rotate_right(struct interval_node *node, + struct interval_node **root) +{ + struct interval_node *left = node->in_left; + struct interval_node *parent = node->in_parent; + + node->in_left = left->in_right; + if (node->in_left) + left->in_right->in_parent = node; + left->in_right = node; + + left->in_parent = parent; + if (parent) { + if (node_is_right_child(node)) + parent->in_right = left; + else + parent->in_left = left; + } else { + *root = left; + } + node->in_parent = left; + + /* update max_high for node and left */ + __rotate_change_maxhigh(node, left); +} + +#define interval_swap(a, b) do { \ + struct interval_node *c = a; a = b; b = c; \ +} while (0) + +/* + * Operations INSERT and DELETE, when run on a tree with n keys, + * take O(logN) time.Because they modify the tree, the result + * may violate the red-black properties.To restore these properties, + * we must change the colors of some of the nodes in the tree + * and also change the pointer structure. 
+ */ +static void interval_insert_color(struct interval_node *node, + struct interval_node **root) +{ + struct interval_node *parent, *gparent; + + while ((parent = node->in_parent) && node_is_red(parent)) { + gparent = parent->in_parent; + /* Parent is RED, so gparent must not be NULL */ + if (node_is_left_child(parent)) { + struct interval_node *uncle; + + uncle = gparent->in_right; + if (uncle && node_is_red(uncle)) { + uncle->in_color = INTERVAL_BLACK; + parent->in_color = INTERVAL_BLACK; + gparent->in_color = INTERVAL_RED; + node = gparent; + continue; + } + + if (parent->in_right == node) { + __rotate_left(parent, root); + interval_swap(node, parent); + } + + parent->in_color = INTERVAL_BLACK; + gparent->in_color = INTERVAL_RED; + __rotate_right(gparent, root); + } else { + struct interval_node *uncle; + + uncle = gparent->in_left; + if (uncle && node_is_red(uncle)) { + uncle->in_color = INTERVAL_BLACK; + parent->in_color = INTERVAL_BLACK; + gparent->in_color = INTERVAL_RED; + node = gparent; + continue; + } + + if (node_is_left_child(node)) { + __rotate_right(parent, root); + interval_swap(node, parent); + } + + parent->in_color = INTERVAL_BLACK; + gparent->in_color = INTERVAL_RED; + __rotate_left(gparent, root); + } + } + + (*root)->in_color = INTERVAL_BLACK; +} + +struct interval_node *interval_insert(struct interval_node *node, + struct interval_node **root) + +{ + struct interval_node **p, *parent = NULL; + + LASSERT(!interval_is_intree(node)); + p = root; + while (*p) { + parent = *p; + if (node_equal(parent, node)) + return parent; + + /* max_high field must be updated after each iteration */ + if (parent->in_max_high < interval_high(node)) + parent->in_max_high = interval_high(node); + + if (node_compare(node, parent) < 0) + p = &parent->in_left; + else + p = &parent->in_right; + } + + /* link node into the tree */ + node->in_parent = parent; + node->in_color = INTERVAL_RED; + node->in_left = node->in_right = NULL; + *p = node; + + interval_insert_color(node, root); + node->in_intree = 1; + + return NULL; +} +EXPORT_SYMBOL(interval_insert); + +static inline int node_is_black_or_0(struct interval_node *node) +{ + return !node || node_is_black(node); +} + +static void interval_erase_color(struct interval_node *node, + struct interval_node *parent, + struct interval_node **root) +{ + struct interval_node *tmp; + + while (node_is_black_or_0(node) && node != *root) { + if (parent->in_left == node) { + tmp = parent->in_right; + if (node_is_red(tmp)) { + tmp->in_color = INTERVAL_BLACK; + parent->in_color = INTERVAL_RED; + __rotate_left(parent, root); + tmp = parent->in_right; + } + if (node_is_black_or_0(tmp->in_left) && + node_is_black_or_0(tmp->in_right)) { + tmp->in_color = INTERVAL_RED; + node = parent; + parent = node->in_parent; + } else { + if (node_is_black_or_0(tmp->in_right)) { + struct interval_node *o_left; + + o_left = tmp->in_left; + if (o_left) + o_left->in_color = INTERVAL_BLACK; + tmp->in_color = INTERVAL_RED; + __rotate_right(tmp, root); + tmp = parent->in_right; + } + tmp->in_color = parent->in_color; + parent->in_color = INTERVAL_BLACK; + if (tmp->in_right) + tmp->in_right->in_color = INTERVAL_BLACK; + __rotate_left(parent, root); + node = *root; + break; + } + } else { + tmp = parent->in_left; + if (node_is_red(tmp)) { + tmp->in_color = INTERVAL_BLACK; + parent->in_color = INTERVAL_RED; + __rotate_right(parent, root); + tmp = parent->in_left; + } + if (node_is_black_or_0(tmp->in_left) && + node_is_black_or_0(tmp->in_right)) { + tmp->in_color = INTERVAL_RED; + 
node = parent; + parent = node->in_parent; + } else { + if (node_is_black_or_0(tmp->in_left)) { + struct interval_node *o_right; + + o_right = tmp->in_right; + if (o_right) + o_right->in_color = INTERVAL_BLACK; + tmp->in_color = INTERVAL_RED; + __rotate_left(tmp, root); + tmp = parent->in_left; + } + tmp->in_color = parent->in_color; + parent->in_color = INTERVAL_BLACK; + if (tmp->in_left) + tmp->in_left->in_color = INTERVAL_BLACK; + __rotate_right(parent, root); + node = *root; + break; + } + } + } + if (node) + node->in_color = INTERVAL_BLACK; +} + +/* + * if the @max_high value of @node is changed, this function traverse a path + * from node up to the root to update max_high for the whole tree. + */ +static void update_maxhigh(struct interval_node *node, + __u64 old_maxhigh) +{ + __u64 left_max, right_max; + + while (node) { + left_max = node->in_left ? node->in_left->in_max_high : 0; + right_max = node->in_right ? node->in_right->in_max_high : 0; + node->in_max_high = max_u64(interval_high(node), + max_u64(left_max, right_max)); + + if (node->in_max_high >= old_maxhigh) + break; + node = node->in_parent; + } +} + +void interval_erase(struct interval_node *node, + struct interval_node **root) +{ + struct interval_node *child, *parent; + int color; + + LASSERT(interval_is_intree(node)); + node->in_intree = 0; + if (!node->in_left) { + child = node->in_right; + } else if (!node->in_right) { + child = node->in_left; + } else { /* Both left and right child are not NULL */ + struct interval_node *old = node; + + node = interval_next(node); + child = node->in_right; + parent = node->in_parent; + color = node->in_color; + + if (child) + child->in_parent = parent; + if (parent == old) + parent->in_right = child; + else + parent->in_left = child; + + node->in_color = old->in_color; + node->in_right = old->in_right; + node->in_left = old->in_left; + node->in_parent = old->in_parent; + + if (old->in_parent) { + if (node_is_left_child(old)) + old->in_parent->in_left = node; + else + old->in_parent->in_right = node; + } else { + *root = node; + } + + old->in_left->in_parent = node; + if (old->in_right) + old->in_right->in_parent = node; + update_maxhigh(child ? : parent, node->in_max_high); + update_maxhigh(node, old->in_max_high); + if (parent == old) + parent = node; + goto color; + } + parent = node->in_parent; + color = node->in_color; + + if (child) + child->in_parent = parent; + if (parent) { + if (node_is_left_child(node)) + parent->in_left = child; + else + parent->in_right = child; + } else { + *root = child; + } + + update_maxhigh(child ? : parent, node->in_max_high); + +color: + if (color == INTERVAL_BLACK) + interval_erase_color(child, parent, root); +} +EXPORT_SYMBOL(interval_erase); + +static inline int interval_may_overlap(struct interval_node *node, + struct interval_node_extent *ext) +{ + return (ext->start <= node->in_max_high && + ext->end >= interval_low(node)); +} + +/* + * This function finds all intervals that overlap interval ext, + * and calls func to handle resulted intervals one by one. + * in lustre, this function will find all conflicting locks in + * the granted queue and add these locks to the ast work list. 
+ * + * { + * if (node == NULL) + * return 0; + * if (ext->end < interval_low(node)) { + * interval_search(node->in_left, ext, func, data); + * } else if (interval_may_overlap(node, ext)) { + * if (extent_overlapped(ext, &node->in_extent)) + * func(node, data); + * interval_search(node->in_left, ext, func, data); + * interval_search(node->in_right, ext, func, data); + * } + * return 0; + * } + * + */ +enum interval_iter interval_search(struct interval_node *node, + struct interval_node_extent *ext, + interval_callback_t func, + void *data) +{ + struct interval_node *parent; + enum interval_iter rc = INTERVAL_ITER_CONT; + + LASSERT(ext != NULL); + LASSERT(func != NULL); + + while (node) { + if (ext->end < interval_low(node)) { + if (node->in_left) { + node = node->in_left; + continue; + } + } else if (interval_may_overlap(node, ext)) { + if (extent_overlapped(ext, &node->in_extent)) { + rc = func(node, data); + if (rc == INTERVAL_ITER_STOP) + break; + } + + if (node->in_left) { + node = node->in_left; + continue; + } + if (node->in_right) { + node = node->in_right; + continue; + } + } + + parent = node->in_parent; + while (parent) { + if (node_is_left_child(node) && + parent->in_right) { + /* If we ever got the left, it means that the + * parent met ext->end < interval_low(parent), or + * may_overlap(parent). In the former case nothing in + * parent's right subtree can overlap ext, and the + * may_overlap(parent) check after the loop stops the + * search. */ + node = parent->in_right; + break; + } + node = parent; + parent = parent->in_parent; + } + if (parent == NULL || !interval_may_overlap(parent, ext)) + break; + } + + return rc; +} +EXPORT_SYMBOL(interval_search); + +static enum interval_iter interval_overlap_cb(struct interval_node *n, + void *args) +{ + *(int *)args = 1; + return INTERVAL_ITER_STOP; +} + +int interval_is_overlapped(struct interval_node *root, + struct interval_node_extent *ext) +{ + int has = 0; + (void)interval_search(root, ext, interval_overlap_cb, &has); + return has; +} +EXPORT_SYMBOL(interval_is_overlapped); + +/* Don't expand too low. Expanding downwards is expensive, and meaningless to + * some extents, because programs seldom do IO backward. + * + * The recursive algorithm of expanding low: + * expand_low { + * struct interval_node *tmp; + * static __u64 res = 0; + * + * if (root == NULL) + * return res; + * if (root->in_max_high < low) { + * res = max_u64(root->in_max_high + 1, res); + * return res; + * } else if (low < interval_low(root)) { + * interval_expand_low(root->in_left, low); + * return res; + * } + * + * if (interval_high(root) < low) + * res = max_u64(interval_high(root) + 1, res); + * interval_expand_low(root->in_left, low); + * interval_expand_low(root->in_right, low); + * + * return res; + * } + * + * It's much easier to eliminate the recursion, see interval_search for + * an example. -jay + */ +static inline __u64 interval_expand_low(struct interval_node *root, __u64 low) +{ + /* we only care about the empty tree right now. */ + if (root == NULL) + return 0; + return low; +} + +static inline __u64 interval_expand_high(struct interval_node *node, __u64 high) +{ + __u64 result = ~0; + + while (node != NULL) { + if (node->in_max_high < high) + break; + + if (interval_low(node) > high) { + result = interval_low(node) - 1; + node = node->in_left; + } else { + node = node->in_right; + } + } + + return result; +} + +/* expanding the extent based on @ext. */ +void interval_expand(struct interval_node *root, + struct interval_node_extent *ext, + struct interval_node_extent *limiter) +{ + /* The assertion of interval_is_overlapped is expensive because we may + * travel many nodes to find the overlapped node.
*/ + LASSERT(interval_is_overlapped(root, ext) == 0); + if (!limiter || limiter->start < ext->start) + ext->start = interval_expand_low(root, ext->start); + if (!limiter || limiter->end > ext->end) + ext->end = interval_expand_high(root, ext->end); + LASSERT(interval_is_overlapped(root, ext) == 0); +} +EXPORT_SYMBOL(interval_expand); diff --git a/kernel/drivers/staging/lustre/lustre/ldlm/l_lock.c b/kernel/drivers/staging/lustre/lustre/ldlm/l_lock.c new file mode 100644 index 000000000..cd8ab40e3 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ldlm/l_lock.c @@ -0,0 +1,76 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LDLM +#include "../../include/linux/libcfs/libcfs.h" + +#include "../include/lustre_dlm.h" +#include "../include/lustre_lib.h" + +/** + * Lock a lock and its resource. + * + * LDLM locking uses resource to serialize access to locks + * but there is a case when we change resource of lock upon + * enqueue reply. We rely on lock->l_resource = new_res + * being an atomic operation. + */ +struct ldlm_resource *lock_res_and_lock(struct ldlm_lock *lock) +{ + /* on server-side resource of lock doesn't change */ + if ((lock->l_flags & LDLM_FL_NS_SRV) == 0) + spin_lock(&lock->l_lock); + + lock_res(lock->l_resource); + + lock->l_flags |= LDLM_FL_RES_LOCKED; + return lock->l_resource; +} +EXPORT_SYMBOL(lock_res_and_lock); + +/** + * Unlock a lock and its resource previously locked with lock_res_and_lock + */ +void unlock_res_and_lock(struct ldlm_lock *lock) +{ + /* on server-side resource of lock doesn't change */ + lock->l_flags &= ~LDLM_FL_RES_LOCKED; + + unlock_res(lock->l_resource); + if ((lock->l_flags & LDLM_FL_NS_SRV) == 0) + spin_unlock(&lock->l_lock); +} +EXPORT_SYMBOL(unlock_res_and_lock); diff --git a/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_extent.c b/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_extent.c new file mode 100644 index 000000000..fd9b05936 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_extent.c @@ -0,0 +1,241 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ldlm/ldlm_extent.c + * + * Author: Peter Braam + * Author: Phil Schwan + */ + +/** + * This file contains implementation of EXTENT lock type + * + * EXTENT lock type is for locking a contiguous range of values, represented + * by 64-bit starting and ending offsets (inclusive). There are several extent + * lock modes, some of which may be mutually incompatible. Extent locks are + * considered incompatible if their modes are incompatible and their extents + * intersect. See the lock mode compatibility matrix in lustre_dlm.h. + */ + +#define DEBUG_SUBSYSTEM S_LDLM +#include "../../include/linux/libcfs/libcfs.h" +#include "../include/lustre_dlm.h" +#include "../include/obd_support.h" +#include "../include/obd.h" +#include "../include/obd_class.h" +#include "../include/lustre_lib.h" +#include "ldlm_internal.h" + + +/* When a lock is cancelled by a client, the KMS may undergo change if this + * is the "highest lock". This function returns the new KMS value. + * Caller must hold lr_lock already. + * + * NB: A lock on [x,y] protects a KMS of up to y + 1 bytes! */ +__u64 ldlm_extent_shift_kms(struct ldlm_lock *lock, __u64 old_kms) +{ + struct ldlm_resource *res = lock->l_resource; + struct list_head *tmp; + struct ldlm_lock *lck; + __u64 kms = 0; + + /* don't let another thread in ldlm_extent_shift_kms race in + * just after we finish and take our lock into account in its + * calculation of the kms */ + lock->l_flags |= LDLM_FL_KMS_IGNORE; + + list_for_each(tmp, &res->lr_granted) { + lck = list_entry(tmp, struct ldlm_lock, l_res_link); + + if (lck->l_flags & LDLM_FL_KMS_IGNORE) + continue; + + if (lck->l_policy_data.l_extent.end >= old_kms) + return old_kms; + + /* This extent _has_ to be smaller than old_kms (checked above) + * so kms can only ever be smaller or the same as old_kms. 
*/ + if (lck->l_policy_data.l_extent.end + 1 > kms) + kms = lck->l_policy_data.l_extent.end + 1; + } + LASSERTF(kms <= old_kms, "kms %llu old_kms %llu\n", kms, old_kms); + + return kms; +} +EXPORT_SYMBOL(ldlm_extent_shift_kms); + +struct kmem_cache *ldlm_interval_slab; +struct ldlm_interval *ldlm_interval_alloc(struct ldlm_lock *lock) +{ + struct ldlm_interval *node; + + LASSERT(lock->l_resource->lr_type == LDLM_EXTENT); + OBD_SLAB_ALLOC_PTR_GFP(node, ldlm_interval_slab, GFP_NOFS); + if (node == NULL) + return NULL; + + INIT_LIST_HEAD(&node->li_group); + ldlm_interval_attach(node, lock); + return node; +} + +void ldlm_interval_free(struct ldlm_interval *node) +{ + if (node) { + LASSERT(list_empty(&node->li_group)); + LASSERT(!interval_is_intree(&node->li_node)); + OBD_SLAB_FREE(node, ldlm_interval_slab, sizeof(*node)); + } +} + +/* interval tree, for LDLM_EXTENT. */ +void ldlm_interval_attach(struct ldlm_interval *n, + struct ldlm_lock *l) +{ + LASSERT(l->l_tree_node == NULL); + LASSERT(l->l_resource->lr_type == LDLM_EXTENT); + + list_add_tail(&l->l_sl_policy, &n->li_group); + l->l_tree_node = n; +} + +struct ldlm_interval *ldlm_interval_detach(struct ldlm_lock *l) +{ + struct ldlm_interval *n = l->l_tree_node; + + if (n == NULL) + return NULL; + + LASSERT(!list_empty(&n->li_group)); + l->l_tree_node = NULL; + list_del_init(&l->l_sl_policy); + + return list_empty(&n->li_group) ? n : NULL; +} + +static inline int lock_mode_to_index(ldlm_mode_t mode) +{ + int index; + + LASSERT(mode != 0); + LASSERT(IS_PO2(mode)); + for (index = -1; mode; index++) + mode >>= 1; + LASSERT(index < LCK_MODE_NUM); + return index; +} + +/** Add newly granted lock into interval tree for the resource. */ +void ldlm_extent_add_lock(struct ldlm_resource *res, + struct ldlm_lock *lock) +{ + struct interval_node *found, **root; + struct ldlm_interval *node; + struct ldlm_extent *extent; + int idx; + + LASSERT(lock->l_granted_mode == lock->l_req_mode); + + node = lock->l_tree_node; + LASSERT(node != NULL); + LASSERT(!interval_is_intree(&node->li_node)); + + idx = lock_mode_to_index(lock->l_granted_mode); + LASSERT(lock->l_granted_mode == 1 << idx); + LASSERT(lock->l_granted_mode == res->lr_itree[idx].lit_mode); + + /* node extent initialize */ + extent = &lock->l_policy_data.l_extent; + interval_set(&node->li_node, extent->start, extent->end); + + root = &res->lr_itree[idx].lit_root; + found = interval_insert(&node->li_node, root); + if (found) { /* The policy group found. */ + struct ldlm_interval *tmp; + + tmp = ldlm_interval_detach(lock); + LASSERT(tmp != NULL); + ldlm_interval_free(tmp); + ldlm_interval_attach(to_ldlm_interval(found), lock); + } + res->lr_itree[idx].lit_size++; + + /* even though we use interval tree to manage the extent lock, we also + * add the locks into grant list, for debug purpose, .. */ + ldlm_resource_add_lock(res, &res->lr_granted, lock); +} + +/** Remove cancelled lock from resource interval tree. 
*/ +void ldlm_extent_unlink_lock(struct ldlm_lock *lock) +{ + struct ldlm_resource *res = lock->l_resource; + struct ldlm_interval *node = lock->l_tree_node; + struct ldlm_interval_tree *tree; + int idx; + + if (!node || !interval_is_intree(&node->li_node)) /* duplicate unlink */ + return; + + idx = lock_mode_to_index(lock->l_granted_mode); + LASSERT(lock->l_granted_mode == 1 << idx); + tree = &res->lr_itree[idx]; + + LASSERT(tree->lit_root != NULL); /* assure the tree is not null */ + + tree->lit_size--; + node = ldlm_interval_detach(lock); + if (node) { + interval_erase(&node->li_node, &tree->lit_root); + ldlm_interval_free(node); + } +} + +void ldlm_extent_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy, + ldlm_policy_data_t *lpolicy) +{ + memset(lpolicy, 0, sizeof(*lpolicy)); + lpolicy->l_extent.start = wpolicy->l_extent.start; + lpolicy->l_extent.end = wpolicy->l_extent.end; + lpolicy->l_extent.gid = wpolicy->l_extent.gid; +} + +void ldlm_extent_policy_local_to_wire(const ldlm_policy_data_t *lpolicy, + ldlm_wire_policy_data_t *wpolicy) +{ + memset(wpolicy, 0, sizeof(*wpolicy)); + wpolicy->l_extent.start = lpolicy->l_extent.start; + wpolicy->l_extent.end = lpolicy->l_extent.end; + wpolicy->l_extent.gid = lpolicy->l_extent.gid; +} diff --git a/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_flock.c b/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_flock.c new file mode 100644 index 000000000..a4c252feb --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_flock.c @@ -0,0 +1,859 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003 Hewlett-Packard Development Company LP. + * Developed under the sponsorship of the US Government under + * Subcontract No. B514193 + * + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +/** + * This file implements POSIX lock type for Lustre. + * Its policy properties are start and end of extent and PID. + * + * These locks are only done through MDS due to POSIX semantics requiring + * e.g. that locks could be only partially released and as such split into + * two parts, and also that two adjacent locks from the same process may be + * merged into a single wider lock. 
+ * + * Lock modes are mapped like this: + * PR and PW for READ and WRITE locks + * NL to request a releasing of a portion of the lock + * + * These flock locks never timeout. + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include "../include/lustre_dlm.h" +#include "../include/obd_support.h" +#include "../include/obd_class.h" +#include "../include/lustre_lib.h" +#include +#include "ldlm_internal.h" + +int ldlm_flock_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, + void *data, int flag); + +/** + * list_for_remaining_safe - iterate over the remaining entries in a list + * and safeguard against removal of a list entry. + * \param pos the &struct list_head to use as a loop counter. pos MUST + * have been initialized prior to using it in this macro. + * \param n another &struct list_head to use as temporary storage + * \param head the head for your list. + */ +#define list_for_remaining_safe(pos, n, head) \ + for (n = pos->next; pos != (head); pos = n, n = pos->next) + +static inline int +ldlm_same_flock_owner(struct ldlm_lock *lock, struct ldlm_lock *new) +{ + return((new->l_policy_data.l_flock.owner == + lock->l_policy_data.l_flock.owner) && + (new->l_export == lock->l_export)); +} + +static inline int +ldlm_flocks_overlap(struct ldlm_lock *lock, struct ldlm_lock *new) +{ + return((new->l_policy_data.l_flock.start <= + lock->l_policy_data.l_flock.end) && + (new->l_policy_data.l_flock.end >= + lock->l_policy_data.l_flock.start)); +} + +static inline void ldlm_flock_blocking_link(struct ldlm_lock *req, + struct ldlm_lock *lock) +{ + /* For server only */ + if (req->l_export == NULL) + return; + + LASSERT(hlist_unhashed(&req->l_exp_flock_hash)); + + req->l_policy_data.l_flock.blocking_owner = + lock->l_policy_data.l_flock.owner; + req->l_policy_data.l_flock.blocking_export = + lock->l_export; + req->l_policy_data.l_flock.blocking_refs = 0; + + cfs_hash_add(req->l_export->exp_flock_hash, + &req->l_policy_data.l_flock.owner, + &req->l_exp_flock_hash); +} + +static inline void ldlm_flock_blocking_unlink(struct ldlm_lock *req) +{ + /* For server only */ + if (req->l_export == NULL) + return; + + check_res_locked(req->l_resource); + if (req->l_export->exp_flock_hash != NULL && + !hlist_unhashed(&req->l_exp_flock_hash)) + cfs_hash_del(req->l_export->exp_flock_hash, + &req->l_policy_data.l_flock.owner, + &req->l_exp_flock_hash); +} + +static inline void +ldlm_flock_destroy(struct ldlm_lock *lock, ldlm_mode_t mode, __u64 flags) +{ + LDLM_DEBUG(lock, "ldlm_flock_destroy(mode: %d, flags: 0x%llx)", + mode, flags); + + /* Safe to not lock here, since it should be empty anyway */ + LASSERT(hlist_unhashed(&lock->l_exp_flock_hash)); + + list_del_init(&lock->l_res_link); + if (flags == LDLM_FL_WAIT_NOREPROC && + !(lock->l_flags & LDLM_FL_FAILED)) { + /* client side - set a flag to prevent sending a CANCEL */ + lock->l_flags |= LDLM_FL_LOCAL_ONLY | LDLM_FL_CBPENDING; + + /* when reaching here, it is under lock_res_and_lock(). Thus, + need call the nolock version of ldlm_lock_decref_internal*/ + ldlm_lock_decref_internal_nolock(lock, mode); + } + + ldlm_lock_destroy_nolock(lock); +} + +/** + * POSIX locks deadlock detection code. + * + * Given a new lock \a req and an existing lock \a bl_lock it conflicts + * with, we need to iterate through all blocked POSIX locks for this + * export and see if there is a deadlock condition arising. (i.e. when + * one client holds a lock on something and want a lock on something + * else and at the same time another client has the opposite situation). 
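+ *
+ * The walk below follows the blocking_owner/blocking_export chain that
+ * ldlm_flock_blocking_link() records for each blocked lock:
+ *
+ *   req(owner A) waits on owner B, B's blocked lock waits on C, ...
+ *
+ * and reports a deadlock (returns 1) if the chain leads back to the
+ * owner and export of \a req itself.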
+ */ +static int +ldlm_flock_deadlock(struct ldlm_lock *req, struct ldlm_lock *bl_lock) +{ + struct obd_export *req_exp = req->l_export; + struct obd_export *bl_exp = bl_lock->l_export; + __u64 req_owner = req->l_policy_data.l_flock.owner; + __u64 bl_owner = bl_lock->l_policy_data.l_flock.owner; + + /* For server only */ + if (req_exp == NULL) + return 0; + + class_export_get(bl_exp); + while (1) { + struct obd_export *bl_exp_new; + struct ldlm_lock *lock = NULL; + struct ldlm_flock *flock; + + if (bl_exp->exp_flock_hash != NULL) + lock = cfs_hash_lookup(bl_exp->exp_flock_hash, + &bl_owner); + if (lock == NULL) + break; + + LASSERT(req != lock); + flock = &lock->l_policy_data.l_flock; + LASSERT(flock->owner == bl_owner); + bl_owner = flock->blocking_owner; + bl_exp_new = class_export_get(flock->blocking_export); + class_export_put(bl_exp); + + cfs_hash_put(bl_exp->exp_flock_hash, &lock->l_exp_flock_hash); + bl_exp = bl_exp_new; + + if (bl_owner == req_owner && bl_exp == req_exp) { + class_export_put(bl_exp); + return 1; + } + } + class_export_put(bl_exp); + + return 0; +} + +static void ldlm_flock_cancel_on_deadlock(struct ldlm_lock *lock, + struct list_head *work_list) +{ + CDEBUG(D_INFO, "reprocess deadlock req=%p\n", lock); + + if ((exp_connect_flags(lock->l_export) & + OBD_CONNECT_FLOCK_DEAD) == 0) { + CERROR( + "deadlock found, but client doesn't support flock canceliation\n"); + } else { + LASSERT(lock->l_completion_ast); + LASSERT((lock->l_flags & LDLM_FL_AST_SENT) == 0); + lock->l_flags |= LDLM_FL_AST_SENT | LDLM_FL_CANCEL_ON_BLOCK | + LDLM_FL_FLOCK_DEADLOCK; + ldlm_flock_blocking_unlink(lock); + ldlm_resource_unlink_lock(lock); + ldlm_add_ast_work_item(lock, NULL, work_list); + } +} + +/** + * Process a granting attempt for flock lock. + * Must be called under ns lock held. + * + * This function looks for any conflicts for \a lock in the granted or + * waiting queues. The lock is granted if no conflicts are found in + * either queue. + * + * It is also responsible for splitting a lock if a portion of the lock + * is released. + * + * If \a first_enq is 0 (ie, called from ldlm_reprocess_queue): + * - blocking ASTs have already been sent + * + * If \a first_enq is 1 (ie, called from ldlm_lock_enqueue): + * - blocking ASTs have not been sent yet, so list of conflicting locks + * would be collected and ASTs sent. + */ +int +ldlm_process_flock_lock(struct ldlm_lock *req, __u64 *flags, int first_enq, + ldlm_error_t *err, struct list_head *work_list) +{ + struct ldlm_resource *res = req->l_resource; + struct ldlm_namespace *ns = ldlm_res_to_ns(res); + struct list_head *tmp; + struct list_head *ownlocks = NULL; + struct ldlm_lock *lock = NULL; + struct ldlm_lock *new = req; + struct ldlm_lock *new2 = NULL; + ldlm_mode_t mode = req->l_req_mode; + int local = ns_is_client(ns); + int added = (mode == LCK_NL); + int overlaps = 0; + int splitted = 0; + const struct ldlm_callback_suite null_cbs = { NULL }; + + CDEBUG(D_DLMTRACE, + "flags %#llx owner %llu pid %u mode %u start %llu end %llu\n", + *flags, new->l_policy_data.l_flock.owner, + new->l_policy_data.l_flock.pid, mode, + req->l_policy_data.l_flock.start, + req->l_policy_data.l_flock.end); + + *err = ELDLM_OK; + + if (local) { + /* No blocking ASTs are sent to the clients for + * Posix file & record locks */ + req->l_blocking_ast = NULL; + } else { + /* Called on the server for lock cancels. 
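+ * ldlm_flock_blocking_ast() below only takes the lock off the export's
+ * flock hash; nothing is sent back to the client.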
*/ + req->l_blocking_ast = ldlm_flock_blocking_ast; + } + +reprocess: + if ((*flags == LDLM_FL_WAIT_NOREPROC) || (mode == LCK_NL)) { + /* This loop determines where this processes locks start + * in the resource lr_granted list. */ + list_for_each(tmp, &res->lr_granted) { + lock = list_entry(tmp, struct ldlm_lock, + l_res_link); + if (ldlm_same_flock_owner(lock, req)) { + ownlocks = tmp; + break; + } + } + } else { + int reprocess_failed = 0; + + lockmode_verify(mode); + + /* This loop determines if there are existing locks + * that conflict with the new lock request. */ + list_for_each(tmp, &res->lr_granted) { + lock = list_entry(tmp, struct ldlm_lock, + l_res_link); + + if (ldlm_same_flock_owner(lock, req)) { + if (!ownlocks) + ownlocks = tmp; + continue; + } + + /* locks are compatible, overlap doesn't matter */ + if (lockmode_compat(lock->l_granted_mode, mode)) + continue; + + if (!ldlm_flocks_overlap(lock, req)) + continue; + + if (!first_enq) { + reprocess_failed = 1; + if (ldlm_flock_deadlock(req, lock)) { + ldlm_flock_cancel_on_deadlock(req, + work_list); + return LDLM_ITER_CONTINUE; + } + continue; + } + + if (*flags & LDLM_FL_BLOCK_NOWAIT) { + ldlm_flock_destroy(req, mode, *flags); + *err = -EAGAIN; + return LDLM_ITER_STOP; + } + + if (*flags & LDLM_FL_TEST_LOCK) { + ldlm_flock_destroy(req, mode, *flags); + req->l_req_mode = lock->l_granted_mode; + req->l_policy_data.l_flock.pid = + lock->l_policy_data.l_flock.pid; + req->l_policy_data.l_flock.start = + lock->l_policy_data.l_flock.start; + req->l_policy_data.l_flock.end = + lock->l_policy_data.l_flock.end; + *flags |= LDLM_FL_LOCK_CHANGED; + return LDLM_ITER_STOP; + } + + /* add lock to blocking list before deadlock + * check to prevent race */ + ldlm_flock_blocking_link(req, lock); + + if (ldlm_flock_deadlock(req, lock)) { + ldlm_flock_blocking_unlink(req); + ldlm_flock_destroy(req, mode, *flags); + *err = -EDEADLK; + return LDLM_ITER_STOP; + } + + ldlm_resource_add_lock(res, &res->lr_waiting, req); + *flags |= LDLM_FL_BLOCK_GRANTED; + return LDLM_ITER_STOP; + } + if (reprocess_failed) + return LDLM_ITER_CONTINUE; + } + + if (*flags & LDLM_FL_TEST_LOCK) { + ldlm_flock_destroy(req, mode, *flags); + req->l_req_mode = LCK_NL; + *flags |= LDLM_FL_LOCK_CHANGED; + return LDLM_ITER_STOP; + } + + /* In case we had slept on this lock request take it off of the + * deadlock detection hash list. */ + ldlm_flock_blocking_unlink(req); + + /* Scan the locks owned by this process that overlap this request. + * We may have to merge or split existing locks. */ + + if (!ownlocks) + ownlocks = &res->lr_granted; + + list_for_remaining_safe(ownlocks, tmp, &res->lr_granted) { + lock = list_entry(ownlocks, struct ldlm_lock, l_res_link); + + if (!ldlm_same_flock_owner(lock, new)) + break; + + if (lock->l_granted_mode == mode) { + /* If the modes are the same then we need to process + * locks that overlap OR adjoin the new lock. The extra + * logic condition is necessary to deal with arithmetic + * overflow and underflow. 
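+ * (e.g. "end + 1" would wrap to 0 for an existing lock ending at
+ * OBD_OBJECT_EOF, and "start - 1" would wrap for one starting at offset
+ * 0, hence the explicit checks against those two values below.)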
*/ + if ((new->l_policy_data.l_flock.start > + (lock->l_policy_data.l_flock.end + 1)) + && (lock->l_policy_data.l_flock.end != + OBD_OBJECT_EOF)) + continue; + + if ((new->l_policy_data.l_flock.end < + (lock->l_policy_data.l_flock.start - 1)) + && (lock->l_policy_data.l_flock.start != 0)) + break; + + if (new->l_policy_data.l_flock.start < + lock->l_policy_data.l_flock.start) { + lock->l_policy_data.l_flock.start = + new->l_policy_data.l_flock.start; + } else { + new->l_policy_data.l_flock.start = + lock->l_policy_data.l_flock.start; + } + + if (new->l_policy_data.l_flock.end > + lock->l_policy_data.l_flock.end) { + lock->l_policy_data.l_flock.end = + new->l_policy_data.l_flock.end; + } else { + new->l_policy_data.l_flock.end = + lock->l_policy_data.l_flock.end; + } + + if (added) { + ldlm_flock_destroy(lock, mode, *flags); + } else { + new = lock; + added = 1; + } + continue; + } + + if (new->l_policy_data.l_flock.start > + lock->l_policy_data.l_flock.end) + continue; + + if (new->l_policy_data.l_flock.end < + lock->l_policy_data.l_flock.start) + break; + + ++overlaps; + + if (new->l_policy_data.l_flock.start <= + lock->l_policy_data.l_flock.start) { + if (new->l_policy_data.l_flock.end < + lock->l_policy_data.l_flock.end) { + lock->l_policy_data.l_flock.start = + new->l_policy_data.l_flock.end + 1; + break; + } + ldlm_flock_destroy(lock, lock->l_req_mode, *flags); + continue; + } + if (new->l_policy_data.l_flock.end >= + lock->l_policy_data.l_flock.end) { + lock->l_policy_data.l_flock.end = + new->l_policy_data.l_flock.start - 1; + continue; + } + + /* split the existing lock into two locks */ + + /* if this is an F_UNLCK operation then we could avoid + * allocating a new lock and use the req lock passed in + * with the request but this would complicate the reply + * processing since updates to req get reflected in the + * reply. The client side replays the lock request so + * it must see the original lock data in the reply. */ + + /* XXX - if ldlm_lock_new() can sleep we should + * release the lr_lock, allocate the new lock, + * and restart processing this lock. 
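+ * That is in fact what the code below does: the resource lock is dropped
+ * around ldlm_lock_create() and, once the new lock is allocated, control
+ * jumps back to the "reprocess" label.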
*/ + if (!new2) { + unlock_res_and_lock(req); + new2 = ldlm_lock_create(ns, &res->lr_name, LDLM_FLOCK, + lock->l_granted_mode, &null_cbs, + NULL, 0, LVB_T_NONE); + lock_res_and_lock(req); + if (!new2) { + ldlm_flock_destroy(req, lock->l_granted_mode, + *flags); + *err = -ENOLCK; + return LDLM_ITER_STOP; + } + goto reprocess; + } + + splitted = 1; + + new2->l_granted_mode = lock->l_granted_mode; + new2->l_policy_data.l_flock.pid = + new->l_policy_data.l_flock.pid; + new2->l_policy_data.l_flock.owner = + new->l_policy_data.l_flock.owner; + new2->l_policy_data.l_flock.start = + lock->l_policy_data.l_flock.start; + new2->l_policy_data.l_flock.end = + new->l_policy_data.l_flock.start - 1; + lock->l_policy_data.l_flock.start = + new->l_policy_data.l_flock.end + 1; + new2->l_conn_export = lock->l_conn_export; + if (lock->l_export != NULL) { + new2->l_export = class_export_lock_get(lock->l_export, + new2); + if (new2->l_export->exp_lock_hash && + hlist_unhashed(&new2->l_exp_hash)) + cfs_hash_add(new2->l_export->exp_lock_hash, + &new2->l_remote_handle, + &new2->l_exp_hash); + } + if (*flags == LDLM_FL_WAIT_NOREPROC) + ldlm_lock_addref_internal_nolock(new2, + lock->l_granted_mode); + + /* insert new2 at lock */ + ldlm_resource_add_lock(res, ownlocks, new2); + LDLM_LOCK_RELEASE(new2); + break; + } + + /* if new2 is created but never used, destroy it*/ + if (splitted == 0 && new2 != NULL) + ldlm_lock_destroy_nolock(new2); + + /* At this point we're granting the lock request. */ + req->l_granted_mode = req->l_req_mode; + + /* Add req to the granted queue before calling ldlm_reprocess_all(). */ + if (!added) { + list_del_init(&req->l_res_link); + /* insert new lock before ownlocks in list. */ + ldlm_resource_add_lock(res, ownlocks, req); + } + + if (*flags != LDLM_FL_WAIT_NOREPROC) { + /* The only one possible case for client-side calls flock + * policy function is ldlm_flock_completion_ast inside which + * carries LDLM_FL_WAIT_NOREPROC flag. */ + CERROR("Illegal parameter for client-side-only module.\n"); + LBUG(); + } + + /* In case we're reprocessing the requested lock we can't destroy + * it until after calling ldlm_add_ast_work_item() above so that laawi() + * can bump the reference count on \a req. Otherwise \a req + * could be freed before the completion AST can be sent. */ + if (added) + ldlm_flock_destroy(req, mode, *flags); + + ldlm_resource_dump(D_INFO, res); + return LDLM_ITER_CONTINUE; +} + +struct ldlm_flock_wait_data { + struct ldlm_lock *fwd_lock; + int fwd_generation; +}; + +static void +ldlm_flock_interrupted_wait(void *data) +{ + struct ldlm_lock *lock; + + lock = ((struct ldlm_flock_wait_data *)data)->fwd_lock; + + /* take lock off the deadlock detection hash list. */ + lock_res_and_lock(lock); + ldlm_flock_blocking_unlink(lock); + + /* client side - set flag to prevent lock from being put on LRU list */ + lock->l_flags |= LDLM_FL_CBPENDING; + unlock_res_and_lock(lock); +} + +/** + * Flock completion callback function. 
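+ * Runs on the client when a flock enqueue completes: either the lock was
+ * granted directly in the reply, or the caller sleeps here until a
+ * completion AST arrives, the wait is interrupted, or the import is
+ * invalidated.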
+ * + * \param lock [in,out]: A lock to be handled + * \param flags [in]: flags + * \param *data [in]: ldlm_work_cp_ast_lock() will use ldlm_cb_set_arg + * + * \retval 0 : success + * \retval <0 : failure + */ +int +ldlm_flock_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data) +{ + struct file_lock *getlk = lock->l_ast_data; + struct obd_device *obd; + struct obd_import *imp = NULL; + struct ldlm_flock_wait_data fwd; + struct l_wait_info lwi; + ldlm_error_t err; + int rc = 0; + + CDEBUG(D_DLMTRACE, "flags: 0x%llx data: %p getlk: %p\n", + flags, data, getlk); + + /* Import invalidation. We need to actually release the lock + * references being held, so that it can go away. No point in + * holding the lock even if app still believes it has it, since + * server already dropped it anyway. Only for granted locks too. */ + if ((lock->l_flags & (LDLM_FL_FAILED|LDLM_FL_LOCAL_ONLY)) == + (LDLM_FL_FAILED|LDLM_FL_LOCAL_ONLY)) { + if (lock->l_req_mode == lock->l_granted_mode && + lock->l_granted_mode != LCK_NL && + NULL == data) + ldlm_lock_decref_internal(lock, lock->l_req_mode); + + /* Need to wake up the waiter if we were evicted */ + wake_up(&lock->l_waitq); + return 0; + } + + LASSERT(flags != LDLM_FL_WAIT_NOREPROC); + + if (!(flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED | + LDLM_FL_BLOCK_CONV))) { + if (NULL == data) + /* mds granted the lock in the reply */ + goto granted; + /* CP AST RPC: lock get granted, wake it up */ + wake_up(&lock->l_waitq); + return 0; + } + + LDLM_DEBUG(lock, "client-side enqueue returned a blocked lock, sleeping"); + fwd.fwd_lock = lock; + obd = class_exp2obd(lock->l_conn_export); + + /* if this is a local lock, there is no import */ + if (NULL != obd) + imp = obd->u.cli.cl_import; + + if (NULL != imp) { + spin_lock(&imp->imp_lock); + fwd.fwd_generation = imp->imp_generation; + spin_unlock(&imp->imp_lock); + } + + lwi = LWI_TIMEOUT_INTR(0, NULL, ldlm_flock_interrupted_wait, &fwd); + + /* Go to sleep until the lock is granted. */ + rc = l_wait_event(lock->l_waitq, is_granted_or_cancelled(lock), &lwi); + + if (rc) { + LDLM_DEBUG(lock, "client-side enqueue waking up: failed (%d)", + rc); + return rc; + } + +granted: + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CP_CB_WAIT, 10); + + if (lock->l_flags & LDLM_FL_DESTROYED) { + LDLM_DEBUG(lock, "client-side enqueue waking up: destroyed"); + return 0; + } + + if (lock->l_flags & LDLM_FL_FAILED) { + LDLM_DEBUG(lock, "client-side enqueue waking up: failed"); + return -EIO; + } + + if (rc) { + LDLM_DEBUG(lock, "client-side enqueue waking up: failed (%d)", + rc); + return rc; + } + + LDLM_DEBUG(lock, "client-side enqueue granted"); + + lock_res_and_lock(lock); + + /* take lock off the deadlock detection hash list. */ + ldlm_flock_blocking_unlink(lock); + + /* ldlm_lock_enqueue() has already placed lock on the granted list. 
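+ * Take it back off: for an F_GETLK request the lock is destroyed just
+ * below, and in the normal case ldlm_process_flock_lock() merges or
+ * splits it against this process's other locks and puts the result back
+ * on the granted list.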
*/ + list_del_init(&lock->l_res_link); + + if (lock->l_flags & LDLM_FL_FLOCK_DEADLOCK) { + LDLM_DEBUG(lock, "client-side enqueue deadlock received"); + rc = -EDEADLK; + } else if (flags & LDLM_FL_TEST_LOCK) { + /* fcntl(F_GETLK) request */ + /* The old mode was saved in getlk->fl_type so that if the mode + * in the lock changes we can decref the appropriate refcount.*/ + ldlm_flock_destroy(lock, getlk->fl_type, LDLM_FL_WAIT_NOREPROC); + switch (lock->l_granted_mode) { + case LCK_PR: + getlk->fl_type = F_RDLCK; + break; + case LCK_PW: + getlk->fl_type = F_WRLCK; + break; + default: + getlk->fl_type = F_UNLCK; + } + getlk->fl_pid = (pid_t)lock->l_policy_data.l_flock.pid; + getlk->fl_start = (loff_t)lock->l_policy_data.l_flock.start; + getlk->fl_end = (loff_t)lock->l_policy_data.l_flock.end; + } else { + __u64 noreproc = LDLM_FL_WAIT_NOREPROC; + + /* We need to reprocess the lock to do merges or splits + * with existing locks owned by this process. */ + ldlm_process_flock_lock(lock, &noreproc, 1, &err, NULL); + } + unlock_res_and_lock(lock); + return rc; +} +EXPORT_SYMBOL(ldlm_flock_completion_ast); + +int ldlm_flock_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, + void *data, int flag) +{ + LASSERT(lock); + LASSERT(flag == LDLM_CB_CANCELING); + + /* take lock off the deadlock detection hash list. */ + lock_res_and_lock(lock); + ldlm_flock_blocking_unlink(lock); + unlock_res_and_lock(lock); + return 0; +} + +void ldlm_flock_policy_wire18_to_local(const ldlm_wire_policy_data_t *wpolicy, + ldlm_policy_data_t *lpolicy) +{ + memset(lpolicy, 0, sizeof(*lpolicy)); + lpolicy->l_flock.start = wpolicy->l_flock.lfw_start; + lpolicy->l_flock.end = wpolicy->l_flock.lfw_end; + lpolicy->l_flock.pid = wpolicy->l_flock.lfw_pid; + /* Compat code, old clients had no idea about owner field and + * relied solely on pid for ownership. Introduced in LU-104, 2.1, + * April 2011 */ + lpolicy->l_flock.owner = wpolicy->l_flock.lfw_pid; +} + + +void ldlm_flock_policy_wire21_to_local(const ldlm_wire_policy_data_t *wpolicy, + ldlm_policy_data_t *lpolicy) +{ + memset(lpolicy, 0, sizeof(*lpolicy)); + lpolicy->l_flock.start = wpolicy->l_flock.lfw_start; + lpolicy->l_flock.end = wpolicy->l_flock.lfw_end; + lpolicy->l_flock.pid = wpolicy->l_flock.lfw_pid; + lpolicy->l_flock.owner = wpolicy->l_flock.lfw_owner; +} + +void ldlm_flock_policy_local_to_wire(const ldlm_policy_data_t *lpolicy, + ldlm_wire_policy_data_t *wpolicy) +{ + memset(wpolicy, 0, sizeof(*wpolicy)); + wpolicy->l_flock.lfw_start = lpolicy->l_flock.start; + wpolicy->l_flock.lfw_end = lpolicy->l_flock.end; + wpolicy->l_flock.lfw_pid = lpolicy->l_flock.pid; + wpolicy->l_flock.lfw_owner = lpolicy->l_flock.owner; +} + +/* + * Export handle<->flock hash operations. 
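+ *
+ * The hash is keyed by flock owner (a __u64).  The get/put hooks pin both
+ * the lock and its blocking export, so that the deadlock detector can
+ * safely follow the blocking_owner/blocking_export chain while walking
+ * the hash.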
+ */ +static unsigned +ldlm_export_flock_hash(struct cfs_hash *hs, const void *key, unsigned mask) +{ + return cfs_hash_u64_hash(*(__u64 *)key, mask); +} + +static void * +ldlm_export_flock_key(struct hlist_node *hnode) +{ + struct ldlm_lock *lock; + + lock = hlist_entry(hnode, struct ldlm_lock, l_exp_flock_hash); + return &lock->l_policy_data.l_flock.owner; +} + +static int +ldlm_export_flock_keycmp(const void *key, struct hlist_node *hnode) +{ + return !memcmp(ldlm_export_flock_key(hnode), key, sizeof(__u64)); +} + +static void * +ldlm_export_flock_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct ldlm_lock, l_exp_flock_hash); +} + +static void +ldlm_export_flock_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct ldlm_lock *lock; + struct ldlm_flock *flock; + + lock = hlist_entry(hnode, struct ldlm_lock, l_exp_flock_hash); + LDLM_LOCK_GET(lock); + + flock = &lock->l_policy_data.l_flock; + LASSERT(flock->blocking_export != NULL); + class_export_get(flock->blocking_export); + flock->blocking_refs++; +} + +static void +ldlm_export_flock_put(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct ldlm_lock *lock; + struct ldlm_flock *flock; + + lock = hlist_entry(hnode, struct ldlm_lock, l_exp_flock_hash); + LDLM_LOCK_RELEASE(lock); + + flock = &lock->l_policy_data.l_flock; + LASSERT(flock->blocking_export != NULL); + class_export_put(flock->blocking_export); + if (--flock->blocking_refs == 0) { + flock->blocking_owner = 0; + flock->blocking_export = NULL; + } +} + +static cfs_hash_ops_t ldlm_export_flock_ops = { + .hs_hash = ldlm_export_flock_hash, + .hs_key = ldlm_export_flock_key, + .hs_keycmp = ldlm_export_flock_keycmp, + .hs_object = ldlm_export_flock_object, + .hs_get = ldlm_export_flock_get, + .hs_put = ldlm_export_flock_put, + .hs_put_locked = ldlm_export_flock_put, +}; + +int ldlm_init_flock_export(struct obd_export *exp) +{ + if (strcmp(exp->exp_obd->obd_type->typ_name, LUSTRE_MDT_NAME) != 0) + return 0; + + exp->exp_flock_hash = + cfs_hash_create(obd_uuid2str(&exp->exp_client_uuid), + HASH_EXP_LOCK_CUR_BITS, + HASH_EXP_LOCK_MAX_BITS, + HASH_EXP_LOCK_BKT_BITS, 0, + CFS_HASH_MIN_THETA, CFS_HASH_MAX_THETA, + &ldlm_export_flock_ops, + CFS_HASH_DEFAULT | CFS_HASH_NBLK_CHANGE); + if (!exp->exp_flock_hash) + return -ENOMEM; + + return 0; +} +EXPORT_SYMBOL(ldlm_init_flock_export); + +void ldlm_destroy_flock_export(struct obd_export *exp) +{ + if (exp->exp_flock_hash) { + cfs_hash_putref(exp->exp_flock_hash); + exp->exp_flock_hash = NULL; + } +} +EXPORT_SYMBOL(ldlm_destroy_flock_export); diff --git a/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_inodebits.c b/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_inodebits.c new file mode 100644 index 000000000..40d333850 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_inodebits.c @@ -0,0 +1,74 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ldlm/ldlm_inodebits.c + * + * Author: Peter Braam + * Author: Phil Schwan + */ + +/** + * This file contains implementation of IBITS lock type + * + * IBITS lock type contains a bit mask determining various properties of an + * object. The meanings of specific bits are specific to the caller and are + * opaque to LDLM code. + * + * Locks with intersecting bitmasks and conflicting lock modes (e.g. LCK_PW) + * are considered conflicting. See the lock mode compatibility matrix + * in lustre_dlm.h. + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include "../include/lustre_dlm.h" +#include "../include/obd_support.h" +#include "../include/lustre_lib.h" +#include "ldlm_internal.h" + + +void ldlm_ibits_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy, + ldlm_policy_data_t *lpolicy) +{ + memset(lpolicy, 0, sizeof(*lpolicy)); + lpolicy->l_inodebits.bits = wpolicy->l_inodebits.bits; +} + +void ldlm_ibits_policy_local_to_wire(const ldlm_policy_data_t *lpolicy, + ldlm_wire_policy_data_t *wpolicy) +{ + memset(wpolicy, 0, sizeof(*wpolicy)); + wpolicy->l_inodebits.bits = lpolicy->l_inodebits.bits; +} diff --git a/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_internal.h b/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_internal.h new file mode 100644 index 000000000..70b909f55 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_internal.h @@ -0,0 +1,316 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#define MAX_STRING_SIZE 128 + +extern int ldlm_srv_namespace_nr; +extern int ldlm_cli_namespace_nr; +extern struct mutex ldlm_srv_namespace_lock; +extern struct list_head ldlm_srv_namespace_list; +extern struct mutex ldlm_cli_namespace_lock; +extern struct list_head ldlm_cli_active_namespace_list; +extern struct list_head ldlm_cli_inactive_namespace_list; + +static inline int ldlm_namespace_nr_read(ldlm_side_t client) +{ + return client == LDLM_NAMESPACE_SERVER ? + ldlm_srv_namespace_nr : ldlm_cli_namespace_nr; +} + +static inline void ldlm_namespace_nr_inc(ldlm_side_t client) +{ + if (client == LDLM_NAMESPACE_SERVER) + ldlm_srv_namespace_nr++; + else + ldlm_cli_namespace_nr++; +} + +static inline void ldlm_namespace_nr_dec(ldlm_side_t client) +{ + if (client == LDLM_NAMESPACE_SERVER) + ldlm_srv_namespace_nr--; + else + ldlm_cli_namespace_nr--; +} + +static inline struct list_head *ldlm_namespace_list(ldlm_side_t client) +{ + return client == LDLM_NAMESPACE_SERVER ? + &ldlm_srv_namespace_list : &ldlm_cli_active_namespace_list; +} + +static inline struct list_head *ldlm_namespace_inactive_list(ldlm_side_t client) +{ + return client == LDLM_NAMESPACE_SERVER ? + &ldlm_srv_namespace_list : &ldlm_cli_inactive_namespace_list; +} + +static inline struct mutex *ldlm_namespace_lock(ldlm_side_t client) +{ + return client == LDLM_NAMESPACE_SERVER ? + &ldlm_srv_namespace_lock : &ldlm_cli_namespace_lock; +} + +/* ns_bref is the number of resources in this namespace */ +static inline int ldlm_ns_empty(struct ldlm_namespace *ns) +{ + return atomic_read(&ns->ns_bref) == 0; +} + +void ldlm_namespace_move_to_active_locked(struct ldlm_namespace *, ldlm_side_t); +void ldlm_namespace_move_to_inactive_locked(struct ldlm_namespace *, + ldlm_side_t); +struct ldlm_namespace *ldlm_namespace_first_locked(ldlm_side_t); + +/* ldlm_request.c */ +/* Cancel lru flag, it indicates we cancel aged locks. */ +enum { + LDLM_CANCEL_AGED = 1 << 0, /* Cancel aged locks (non lru resize). */ + LDLM_CANCEL_PASSED = 1 << 1, /* Cancel passed number of locks. */ + LDLM_CANCEL_SHRINK = 1 << 2, /* Cancel locks from shrinker. */ + LDLM_CANCEL_LRUR = 1 << 3, /* Cancel locks from lru resize. 
*/ + LDLM_CANCEL_NO_WAIT = 1 << 4 /* Cancel locks w/o blocking (neither + * sending nor waiting for any rpcs) */ +}; + +int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr, + ldlm_cancel_flags_t sync, int flags); +int ldlm_cancel_lru_local(struct ldlm_namespace *ns, + struct list_head *cancels, int count, int max, + ldlm_cancel_flags_t cancel_flags, int flags); +extern int ldlm_enqueue_min; +int ldlm_get_enq_timeout(struct ldlm_lock *lock); + +/* ldlm_resource.c */ +int ldlm_resource_putref_locked(struct ldlm_resource *res); +void ldlm_resource_insert_lock_after(struct ldlm_lock *original, + struct ldlm_lock *new); +void ldlm_namespace_free_prior(struct ldlm_namespace *ns, + struct obd_import *imp, int force); +void ldlm_namespace_free_post(struct ldlm_namespace *ns); +/* ldlm_lock.c */ + +struct ldlm_cb_set_arg { + struct ptlrpc_request_set *set; + int type; /* LDLM_{CP,BL,GL}_CALLBACK */ + atomic_t restart; + struct list_head *list; + union ldlm_gl_desc *gl_desc; /* glimpse AST descriptor */ +}; + +enum ldlm_desc_ast_t { + LDLM_WORK_BL_AST, + LDLM_WORK_CP_AST, + LDLM_WORK_REVOKE_AST, + LDLM_WORK_GL_AST +}; + +void ldlm_grant_lock(struct ldlm_lock *lock, struct list_head *work_list); +int ldlm_fill_lvb(struct ldlm_lock *lock, struct req_capsule *pill, + enum req_location loc, void *data, int size); +struct ldlm_lock * +ldlm_lock_create(struct ldlm_namespace *ns, const struct ldlm_res_id *, + ldlm_type_t type, ldlm_mode_t, + const struct ldlm_callback_suite *cbs, + void *data, __u32 lvb_len, enum lvb_type lvb_type); +ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *, struct ldlm_lock **, + void *cookie, __u64 *flags); +void ldlm_lock_addref_internal(struct ldlm_lock *, __u32 mode); +void ldlm_lock_addref_internal_nolock(struct ldlm_lock *, __u32 mode); +void ldlm_lock_decref_internal(struct ldlm_lock *, __u32 mode); +void ldlm_lock_decref_internal_nolock(struct ldlm_lock *, __u32 mode); +void ldlm_add_ast_work_item(struct ldlm_lock *lock, struct ldlm_lock *new, + struct list_head *work_list); +int ldlm_run_ast_work(struct ldlm_namespace *ns, struct list_head *rpc_list, + enum ldlm_desc_ast_t ast_type); +int ldlm_work_gl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq); +int ldlm_lock_remove_from_lru(struct ldlm_lock *lock); +int ldlm_lock_remove_from_lru_nolock(struct ldlm_lock *lock); +void ldlm_lock_add_to_lru_nolock(struct ldlm_lock *lock); +void ldlm_lock_add_to_lru(struct ldlm_lock *lock); +void ldlm_lock_touch_in_lru(struct ldlm_lock *lock); +void ldlm_lock_destroy_nolock(struct ldlm_lock *lock); + +void ldlm_cancel_locks_for_export(struct obd_export *export); + +/* ldlm_lockd.c */ +int ldlm_bl_to_thread_lock(struct ldlm_namespace *ns, struct ldlm_lock_desc *ld, + struct ldlm_lock *lock); +int ldlm_bl_to_thread_list(struct ldlm_namespace *ns, + struct ldlm_lock_desc *ld, + struct list_head *cancels, int count, + ldlm_cancel_flags_t cancel_flags); + +void ldlm_handle_bl_callback(struct ldlm_namespace *ns, + struct ldlm_lock_desc *ld, struct ldlm_lock *lock); + +extern struct kmem_cache *ldlm_resource_slab; + +/* ldlm_lockd.c & ldlm_lock.c */ +extern struct kmem_cache *ldlm_lock_slab; + +/* ldlm_extent.c */ +void ldlm_extent_add_lock(struct ldlm_resource *res, struct ldlm_lock *lock); +void ldlm_extent_unlink_lock(struct ldlm_lock *lock); + +/* ldlm_flock.c */ +int ldlm_process_flock_lock(struct ldlm_lock *req, __u64 *flags, + int first_enq, ldlm_error_t *err, + struct list_head *work_list); +int ldlm_init_flock_export(struct obd_export *exp); +void 
ldlm_destroy_flock_export(struct obd_export *exp); + +/* l_lock.c */ +void l_check_ns_lock(struct ldlm_namespace *ns); +void l_check_no_ns_lock(struct ldlm_namespace *ns); + +extern struct proc_dir_entry *ldlm_svc_proc_dir; +extern struct proc_dir_entry *ldlm_type_proc_dir; + +struct ldlm_state { + struct ptlrpc_service *ldlm_cb_service; + struct ptlrpc_service *ldlm_cancel_service; + struct ptlrpc_client *ldlm_client; + struct ptlrpc_connection *ldlm_server_conn; + struct ldlm_bl_pool *ldlm_bl_pool; +}; + +/* interval tree, for LDLM_EXTENT. */ +extern struct kmem_cache *ldlm_interval_slab; /* slab cache for ldlm_interval */ +extern void ldlm_interval_attach(struct ldlm_interval *n, struct ldlm_lock *l); +struct ldlm_interval *ldlm_interval_detach(struct ldlm_lock *l); +struct ldlm_interval *ldlm_interval_alloc(struct ldlm_lock *lock); +void ldlm_interval_free(struct ldlm_interval *node); +/* this function must be called with res lock held */ +static inline struct ldlm_extent * +ldlm_interval_extent(struct ldlm_interval *node) +{ + struct ldlm_lock *lock; + + LASSERT(!list_empty(&node->li_group)); + + lock = list_entry(node->li_group.next, struct ldlm_lock, + l_sl_policy); + return &lock->l_policy_data.l_extent; +} + +int ldlm_init(void); +void ldlm_exit(void); + +enum ldlm_policy_res { + LDLM_POLICY_CANCEL_LOCK, + LDLM_POLICY_KEEP_LOCK, + LDLM_POLICY_SKIP_LOCK +}; + +typedef enum ldlm_policy_res ldlm_policy_res_t; + +#define LDLM_POOL_PROC_READER_SEQ_SHOW(var, type) \ + static int lprocfs_##var##_seq_show(struct seq_file *m, void *v) \ + { \ + struct ldlm_pool *pl = m->private; \ + type tmp; \ + \ + spin_lock(&pl->pl_lock); \ + tmp = pl->pl_##var; \ + spin_unlock(&pl->pl_lock); \ + \ + return lprocfs_rd_uint(m, &tmp); \ + } \ + struct __##var##__dummy_read {; } /* semicolon catcher */ + +#define LDLM_POOL_PROC_WRITER(var, type) \ + static int lprocfs_wr_##var(struct file *file, \ + const char __user *buffer, \ + unsigned long count, void *data) \ + { \ + struct ldlm_pool *pl = data; \ + type tmp; \ + int rc; \ + \ + rc = lprocfs_wr_uint(file, buffer, count, &tmp); \ + if (rc < 0) { \ + CERROR("Can't parse user input, rc = %d\n", rc); \ + return rc; \ + } \ + \ + spin_lock(&pl->pl_lock); \ + pl->pl_##var = tmp; \ + spin_unlock(&pl->pl_lock); \ + \ + return rc; \ + } \ + struct __##var##__dummy_write {; } /* semicolon catcher */ + +static inline int is_granted_or_cancelled(struct ldlm_lock *lock) +{ + int ret = 0; + + lock_res_and_lock(lock); + if (((lock->l_req_mode == lock->l_granted_mode) && + !(lock->l_flags & LDLM_FL_CP_REQD)) || + (lock->l_flags & (LDLM_FL_FAILED | LDLM_FL_CANCEL))) + ret = 1; + unlock_res_and_lock(lock); + + return ret; +} + +typedef void (*ldlm_policy_wire_to_local_t)(const ldlm_wire_policy_data_t *, + ldlm_policy_data_t *); + +typedef void (*ldlm_policy_local_to_wire_t)(const ldlm_policy_data_t *, + ldlm_wire_policy_data_t *); + +void ldlm_plain_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy, + ldlm_policy_data_t *lpolicy); +void ldlm_plain_policy_local_to_wire(const ldlm_policy_data_t *lpolicy, + ldlm_wire_policy_data_t *wpolicy); +void ldlm_ibits_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy, + ldlm_policy_data_t *lpolicy); +void ldlm_ibits_policy_local_to_wire(const ldlm_policy_data_t *lpolicy, + ldlm_wire_policy_data_t *wpolicy); +void ldlm_extent_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy, + ldlm_policy_data_t *lpolicy); +void ldlm_extent_policy_local_to_wire(const ldlm_policy_data_t *lpolicy, + 
ldlm_wire_policy_data_t *wpolicy); +void ldlm_flock_policy_wire18_to_local(const ldlm_wire_policy_data_t *wpolicy, + ldlm_policy_data_t *lpolicy); +void ldlm_flock_policy_wire21_to_local(const ldlm_wire_policy_data_t *wpolicy, + ldlm_policy_data_t *lpolicy); + +void ldlm_flock_policy_local_to_wire(const ldlm_policy_data_t *lpolicy, + ldlm_wire_policy_data_t *wpolicy); diff --git a/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c b/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c new file mode 100644 index 000000000..c5c86e73c --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c @@ -0,0 +1,870 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +/** + * This file deals with various client/target related logic including recovery. + * + * TODO: This code more logically belongs in the ptlrpc module than in ldlm and + * should be moved. + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include "../../include/linux/libcfs/libcfs.h" +#include "../include/obd.h" +#include "../include/obd_class.h" +#include "../include/lustre_dlm.h" +#include "../include/lustre_net.h" +#include "../include/lustre_sec.h" +#include "ldlm_internal.h" + +/* @priority: If non-zero, move the selected connection to the list head. + * @create: If zero, only search in existing connections. + */ +static int import_set_conn(struct obd_import *imp, struct obd_uuid *uuid, + int priority, int create) +{ + struct ptlrpc_connection *ptlrpc_conn; + struct obd_import_conn *imp_conn = NULL, *item; + int rc = 0; + + if (!create && !priority) { + CDEBUG(D_HA, "Nothing to do\n"); + return -EINVAL; + } + + ptlrpc_conn = ptlrpc_uuid_to_connection(uuid); + if (!ptlrpc_conn) { + CDEBUG(D_HA, "can't find connection %s\n", uuid->uuid); + return -ENOENT; + } + + if (create) { + OBD_ALLOC(imp_conn, sizeof(*imp_conn)); + if (!imp_conn) { + rc = -ENOMEM; + goto out_put; + } + } + + spin_lock(&imp->imp_lock); + list_for_each_entry(item, &imp->imp_conn_list, oic_item) { + if (obd_uuid_equals(uuid, &item->oic_uuid)) { + if (priority) { + list_del(&item->oic_item); + list_add(&item->oic_item, + &imp->imp_conn_list); + item->oic_last_attempt = 0; + } + CDEBUG(D_HA, "imp %p@%s: found existing conn %s%s\n", + imp, imp->imp_obd->obd_name, uuid->uuid, + (priority ? 
", moved to head" : "")); + spin_unlock(&imp->imp_lock); + rc = 0; + goto out_free; + } + } + /* No existing import connection found for \a uuid. */ + if (create) { + imp_conn->oic_conn = ptlrpc_conn; + imp_conn->oic_uuid = *uuid; + imp_conn->oic_last_attempt = 0; + if (priority) + list_add(&imp_conn->oic_item, &imp->imp_conn_list); + else + list_add_tail(&imp_conn->oic_item, + &imp->imp_conn_list); + CDEBUG(D_HA, "imp %p@%s: add connection %s at %s\n", + imp, imp->imp_obd->obd_name, uuid->uuid, + (priority ? "head" : "tail")); + } else { + spin_unlock(&imp->imp_lock); + rc = -ENOENT; + goto out_free; + } + + spin_unlock(&imp->imp_lock); + return 0; +out_free: + if (imp_conn) + OBD_FREE(imp_conn, sizeof(*imp_conn)); +out_put: + ptlrpc_connection_put(ptlrpc_conn); + return rc; +} + +int import_set_conn_priority(struct obd_import *imp, struct obd_uuid *uuid) +{ + return import_set_conn(imp, uuid, 1, 0); +} + +int client_import_add_conn(struct obd_import *imp, struct obd_uuid *uuid, + int priority) +{ + return import_set_conn(imp, uuid, priority, 1); +} +EXPORT_SYMBOL(client_import_add_conn); + +int client_import_del_conn(struct obd_import *imp, struct obd_uuid *uuid) +{ + struct obd_import_conn *imp_conn; + struct obd_export *dlmexp; + int rc = -ENOENT; + + spin_lock(&imp->imp_lock); + if (list_empty(&imp->imp_conn_list)) { + LASSERT(!imp->imp_connection); + goto out; + } + + list_for_each_entry(imp_conn, &imp->imp_conn_list, oic_item) { + if (!obd_uuid_equals(uuid, &imp_conn->oic_uuid)) + continue; + LASSERT(imp_conn->oic_conn); + + if (imp_conn == imp->imp_conn_current) { + LASSERT(imp_conn->oic_conn == imp->imp_connection); + + if (imp->imp_state != LUSTRE_IMP_CLOSED && + imp->imp_state != LUSTRE_IMP_DISCON) { + CERROR("can't remove current connection\n"); + rc = -EBUSY; + goto out; + } + + ptlrpc_connection_put(imp->imp_connection); + imp->imp_connection = NULL; + + dlmexp = class_conn2export(&imp->imp_dlm_handle); + if (dlmexp && dlmexp->exp_connection) { + LASSERT(dlmexp->exp_connection == + imp_conn->oic_conn); + ptlrpc_connection_put(dlmexp->exp_connection); + dlmexp->exp_connection = NULL; + } + } + + list_del(&imp_conn->oic_item); + ptlrpc_connection_put(imp_conn->oic_conn); + OBD_FREE(imp_conn, sizeof(*imp_conn)); + CDEBUG(D_HA, "imp %p@%s: remove connection %s\n", + imp, imp->imp_obd->obd_name, uuid->uuid); + rc = 0; + break; + } +out: + spin_unlock(&imp->imp_lock); + if (rc == -ENOENT) + CERROR("connection %s not found\n", uuid->uuid); + return rc; +} +EXPORT_SYMBOL(client_import_del_conn); + +/** + * Find conn UUID by peer NID. \a peer is a server NID. This function is used + * to find a conn uuid of \a imp which can reach \a peer. + */ +int client_import_find_conn(struct obd_import *imp, lnet_nid_t peer, + struct obd_uuid *uuid) +{ + struct obd_import_conn *conn; + int rc = -ENOENT; + + spin_lock(&imp->imp_lock); + list_for_each_entry(conn, &imp->imp_conn_list, oic_item) { + /* Check if conn UUID does have this peer NID. */ + if (class_check_uuid(&conn->oic_uuid, peer)) { + *uuid = conn->oic_uuid; + rc = 0; + break; + } + } + spin_unlock(&imp->imp_lock); + return rc; +} +EXPORT_SYMBOL(client_import_find_conn); + +void client_destroy_import(struct obd_import *imp) +{ + /* Drop security policy instance after all RPCs have finished/aborted + * to let all busy contexts be released. 
*/ + class_import_get(imp); + class_destroy_import(imp); + sptlrpc_import_sec_put(imp); + class_import_put(imp); +} +EXPORT_SYMBOL(client_destroy_import); + +/** + * Check whether or not the OSC is on MDT. + * In the config log, + * osc on MDT + * setup 0:{fsname}-OSTxxxx-osc[-MDTxxxx] 1:lustre-OST0000_UUID 2:NID + * osc on client + * setup 0:{fsname}-OSTxxxx-osc 1:lustre-OST0000_UUID 2:NID + * + **/ +static int osc_on_mdt(char *obdname) +{ + char *ptr; + + ptr = strrchr(obdname, '-'); + if (ptr == NULL) + return 0; + + if (strncmp(ptr + 1, "MDT", 3) == 0) + return 1; + + return 0; +} + +/* Configure an RPC client OBD device. + * + * lcfg parameters: + * 1 - client UUID + * 2 - server UUID + * 3 - inactive-on-startup + */ +int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg) +{ + struct client_obd *cli = &obddev->u.cli; + struct obd_import *imp; + struct obd_uuid server_uuid; + int rq_portal, rp_portal, connect_op; + char *name = obddev->obd_type->typ_name; + ldlm_ns_type_t ns_type = LDLM_NS_TYPE_UNKNOWN; + int rc; + + /* In a more perfect world, we would hang a ptlrpc_client off of + * obd_type and just use the values from there. */ + if (!strcmp(name, LUSTRE_OSC_NAME)) { + rq_portal = OST_REQUEST_PORTAL; + rp_portal = OSC_REPLY_PORTAL; + connect_op = OST_CONNECT; + cli->cl_sp_me = LUSTRE_SP_CLI; + cli->cl_sp_to = LUSTRE_SP_OST; + ns_type = LDLM_NS_TYPE_OSC; + } else if (!strcmp(name, LUSTRE_MDC_NAME) || + !strcmp(name, LUSTRE_LWP_NAME)) { + rq_portal = MDS_REQUEST_PORTAL; + rp_portal = MDC_REPLY_PORTAL; + connect_op = MDS_CONNECT; + cli->cl_sp_me = LUSTRE_SP_CLI; + cli->cl_sp_to = LUSTRE_SP_MDT; + ns_type = LDLM_NS_TYPE_MDC; + } else if (!strcmp(name, LUSTRE_OSP_NAME)) { + if (strstr(lustre_cfg_buf(lcfg, 1), "OST") == NULL) { + /* OSP_on_MDT for other MDTs */ + connect_op = MDS_CONNECT; + cli->cl_sp_to = LUSTRE_SP_MDT; + ns_type = LDLM_NS_TYPE_MDC; + rq_portal = OUT_PORTAL; + } else { + /* OSP on MDT for OST */ + connect_op = OST_CONNECT; + cli->cl_sp_to = LUSTRE_SP_OST; + ns_type = LDLM_NS_TYPE_OSC; + rq_portal = OST_REQUEST_PORTAL; + } + rp_portal = OSC_REPLY_PORTAL; + cli->cl_sp_me = LUSTRE_SP_CLI; + } else if (!strcmp(name, LUSTRE_MGC_NAME)) { + rq_portal = MGS_REQUEST_PORTAL; + rp_portal = MGC_REPLY_PORTAL; + connect_op = MGS_CONNECT; + cli->cl_sp_me = LUSTRE_SP_MGC; + cli->cl_sp_to = LUSTRE_SP_MGS; + cli->cl_flvr_mgc.sf_rpc = SPTLRPC_FLVR_INVALID; + ns_type = LDLM_NS_TYPE_MGC; + } else { + CERROR("unknown client OBD type \"%s\", can't setup\n", + name); + return -EINVAL; + } + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) { + CERROR("requires a TARGET UUID\n"); + return -EINVAL; + } + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) > 37) { + CERROR("client UUID must be less than 38 characters\n"); + return -EINVAL; + } + + if (LUSTRE_CFG_BUFLEN(lcfg, 2) < 1) { + CERROR("setup requires a SERVER UUID\n"); + return -EINVAL; + } + + if (LUSTRE_CFG_BUFLEN(lcfg, 2) > 37) { + CERROR("target UUID must be less than 38 characters\n"); + return -EINVAL; + } + + init_rwsem(&cli->cl_sem); + mutex_init(&cli->cl_mgc_mutex); + cli->cl_conn_count = 0; + memcpy(server_uuid.uuid, lustre_cfg_buf(lcfg, 2), + min_t(unsigned int, LUSTRE_CFG_BUFLEN(lcfg, 2), + sizeof(server_uuid))); + + cli->cl_dirty = 0; + cli->cl_avail_grant = 0; + /* FIXME: Should limit this for the sum of all cl_dirty_max. 
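+ * For now each client OBD gets OSC_MAX_DIRTY_DEFAULT MB of dirty cache,
+ * further capped at 1/8 of total RAM below.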
*/ + cli->cl_dirty_max = OSC_MAX_DIRTY_DEFAULT * 1024 * 1024; + if (cli->cl_dirty_max >> PAGE_CACHE_SHIFT > totalram_pages / 8) + cli->cl_dirty_max = totalram_pages << (PAGE_CACHE_SHIFT - 3); + INIT_LIST_HEAD(&cli->cl_cache_waiters); + INIT_LIST_HEAD(&cli->cl_loi_ready_list); + INIT_LIST_HEAD(&cli->cl_loi_hp_ready_list); + INIT_LIST_HEAD(&cli->cl_loi_write_list); + INIT_LIST_HEAD(&cli->cl_loi_read_list); + client_obd_list_lock_init(&cli->cl_loi_list_lock); + atomic_set(&cli->cl_pending_w_pages, 0); + atomic_set(&cli->cl_pending_r_pages, 0); + cli->cl_r_in_flight = 0; + cli->cl_w_in_flight = 0; + + spin_lock_init(&cli->cl_read_rpc_hist.oh_lock); + spin_lock_init(&cli->cl_write_rpc_hist.oh_lock); + spin_lock_init(&cli->cl_read_page_hist.oh_lock); + spin_lock_init(&cli->cl_write_page_hist.oh_lock); + spin_lock_init(&cli->cl_read_offset_hist.oh_lock); + spin_lock_init(&cli->cl_write_offset_hist.oh_lock); + + /* lru for osc. */ + INIT_LIST_HEAD(&cli->cl_lru_osc); + atomic_set(&cli->cl_lru_shrinkers, 0); + atomic_set(&cli->cl_lru_busy, 0); + atomic_set(&cli->cl_lru_in_list, 0); + INIT_LIST_HEAD(&cli->cl_lru_list); + client_obd_list_lock_init(&cli->cl_lru_list_lock); + + init_waitqueue_head(&cli->cl_destroy_waitq); + atomic_set(&cli->cl_destroy_in_flight, 0); + /* Turn on checksumming by default. */ + cli->cl_checksum = 1; + /* + * The supported checksum types will be worked out at connect time + * Set cl_chksum* to CRC32 for now to avoid returning screwed info + * through procfs. + */ + cli->cl_cksum_type = cli->cl_supp_cksum_types = OBD_CKSUM_CRC32; + atomic_set(&cli->cl_resends, OSC_DEFAULT_RESENDS); + + /* This value may be reduced at connect time in + * ptlrpc_connect_interpret() . We initialize it to only + * 1MB until we know what the performance looks like. + * In the future this should likely be increased. 
LU-1431 */ + cli->cl_max_pages_per_rpc = min_t(int, PTLRPC_MAX_BRW_PAGES, + LNET_MTU >> PAGE_CACHE_SHIFT); + + if (!strcmp(name, LUSTRE_MDC_NAME)) { + cli->cl_max_rpcs_in_flight = MDC_MAX_RIF_DEFAULT; + } else if (totalram_pages >> (20 - PAGE_CACHE_SHIFT) <= 128 /* MB */) { + cli->cl_max_rpcs_in_flight = 2; + } else if (totalram_pages >> (20 - PAGE_CACHE_SHIFT) <= 256 /* MB */) { + cli->cl_max_rpcs_in_flight = 3; + } else if (totalram_pages >> (20 - PAGE_CACHE_SHIFT) <= 512 /* MB */) { + cli->cl_max_rpcs_in_flight = 4; + } else { + if (osc_on_mdt(obddev->obd_name)) + cli->cl_max_rpcs_in_flight = MDS_OSC_MAX_RIF_DEFAULT; + else + cli->cl_max_rpcs_in_flight = OSC_MAX_RIF_DEFAULT; + } + rc = ldlm_get_ref(); + if (rc) { + CERROR("ldlm_get_ref failed: %d\n", rc); + goto err; + } + + ptlrpc_init_client(rq_portal, rp_portal, name, + &obddev->obd_ldlm_client); + + imp = class_new_import(obddev); + if (imp == NULL) { + rc = -ENOENT; + goto err_ldlm; + } + imp->imp_client = &obddev->obd_ldlm_client; + imp->imp_connect_op = connect_op; + memcpy(cli->cl_target_uuid.uuid, lustre_cfg_buf(lcfg, 1), + LUSTRE_CFG_BUFLEN(lcfg, 1)); + class_import_put(imp); + + rc = client_import_add_conn(imp, &server_uuid, 1); + if (rc) { + CERROR("can't add initial connection\n"); + goto err_import; + } + + cli->cl_import = imp; + /* cli->cl_max_mds_{easize,cookiesize} updated by mdc_init_ea_size() */ + cli->cl_max_mds_easize = sizeof(struct lov_mds_md_v3); + cli->cl_max_mds_cookiesize = sizeof(struct llog_cookie); + + if (LUSTRE_CFG_BUFLEN(lcfg, 3) > 0) { + if (!strcmp(lustre_cfg_string(lcfg, 3), "inactive")) { + CDEBUG(D_HA, "marking %s %s->%s as inactive\n", + name, obddev->obd_name, + cli->cl_target_uuid.uuid); + spin_lock(&imp->imp_lock); + imp->imp_deactive = 1; + spin_unlock(&imp->imp_lock); + } + } + + obddev->obd_namespace = ldlm_namespace_new(obddev, obddev->obd_name, + LDLM_NAMESPACE_CLIENT, + LDLM_NAMESPACE_GREEDY, + ns_type); + if (obddev->obd_namespace == NULL) { + CERROR("Unable to create client namespace - %s\n", + obddev->obd_name); + rc = -ENOMEM; + goto err_import; + } + + cli->cl_qchk_stat = CL_NOT_QUOTACHECKED; + + return rc; + +err_import: + class_destroy_import(imp); +err_ldlm: + ldlm_put_ref(); +err: + return rc; + +} +EXPORT_SYMBOL(client_obd_setup); + +int client_obd_cleanup(struct obd_device *obddev) +{ + ldlm_namespace_free_post(obddev->obd_namespace); + obddev->obd_namespace = NULL; + + LASSERT(obddev->u.cli.cl_import == NULL); + + ldlm_put_ref(); + return 0; +} +EXPORT_SYMBOL(client_obd_cleanup); + +/* ->o_connect() method for client side (OSC and MDC and MGC) */ +int client_connect_import(const struct lu_env *env, + struct obd_export **exp, + struct obd_device *obd, struct obd_uuid *cluuid, + struct obd_connect_data *data, void *localdata) +{ + struct client_obd *cli = &obd->u.cli; + struct obd_import *imp = cli->cl_import; + struct obd_connect_data *ocd; + struct lustre_handle conn = { 0 }; + int rc; + + *exp = NULL; + down_write(&cli->cl_sem); + if (cli->cl_conn_count > 0) { + rc = -EALREADY; + goto out_sem; + } + + rc = class_connect(&conn, obd, cluuid); + if (rc) + goto out_sem; + + cli->cl_conn_count++; + *exp = class_conn2export(&conn); + + LASSERT(obd->obd_namespace); + + imp->imp_dlm_handle = conn; + rc = ptlrpc_init_import(imp); + if (rc != 0) + goto out_ldlm; + + ocd = &imp->imp_connect_data; + if (data) { + *ocd = *data; + imp->imp_connect_flags_orig = data->ocd_connect_flags; + } + + rc = ptlrpc_connect_import(imp); + if (rc != 0) { + LASSERT(imp->imp_state == LUSTRE_IMP_DISCON); 
+ goto out_ldlm; + } + LASSERT(*exp != NULL && (*exp)->exp_connection); + + if (data) { + LASSERTF((ocd->ocd_connect_flags & data->ocd_connect_flags) == + ocd->ocd_connect_flags, "old %#llx, new %#llx\n", + data->ocd_connect_flags, ocd->ocd_connect_flags); + data->ocd_connect_flags = ocd->ocd_connect_flags; + } + + ptlrpc_pinger_add_import(imp); + + if (rc) { +out_ldlm: + cli->cl_conn_count--; + class_disconnect(*exp); + *exp = NULL; + } +out_sem: + up_write(&cli->cl_sem); + + return rc; +} +EXPORT_SYMBOL(client_connect_import); + +int client_disconnect_export(struct obd_export *exp) +{ + struct obd_device *obd = class_exp2obd(exp); + struct client_obd *cli; + struct obd_import *imp; + int rc = 0, err; + + if (!obd) { + CERROR("invalid export for disconnect: exp %p cookie %#llx\n", + exp, exp ? exp->exp_handle.h_cookie : -1); + return -EINVAL; + } + + cli = &obd->u.cli; + imp = cli->cl_import; + + down_write(&cli->cl_sem); + CDEBUG(D_INFO, "disconnect %s - %d\n", obd->obd_name, + cli->cl_conn_count); + + if (!cli->cl_conn_count) { + CERROR("disconnecting disconnected device (%s)\n", + obd->obd_name); + rc = -EINVAL; + goto out_disconnect; + } + + cli->cl_conn_count--; + if (cli->cl_conn_count) { + rc = 0; + goto out_disconnect; + } + + /* Mark import deactivated now, so we don't try to reconnect if any + * of the cleanup RPCs fails (e.g. LDLM cancel, etc). We don't + * fully deactivate the import, or that would drop all requests. */ + spin_lock(&imp->imp_lock); + imp->imp_deactive = 1; + spin_unlock(&imp->imp_lock); + + /* Some non-replayable imports (MDS's OSCs) are pinged, so just + * delete it regardless. (It's safe to delete an import that was + * never added.) */ + (void)ptlrpc_pinger_del_import(imp); + + if (obd->obd_namespace != NULL) { + /* obd_force == local only */ + ldlm_cli_cancel_unused(obd->obd_namespace, NULL, + obd->obd_force ? LCF_LOCAL : 0, NULL); + ldlm_namespace_free_prior(obd->obd_namespace, imp, + obd->obd_force); + } + + /* There's no need to hold sem while disconnecting an import, + * and it may actually cause deadlock in GSS. */ + up_write(&cli->cl_sem); + rc = ptlrpc_disconnect_import(imp, 0); + down_write(&cli->cl_sem); + + ptlrpc_invalidate_import(imp); + +out_disconnect: + /* Use server style - class_disconnect should be always called for + * o_disconnect. */ + err = class_disconnect(exp); + if (!rc && err) + rc = err; + + up_write(&cli->cl_sem); + + return rc; +} +EXPORT_SYMBOL(client_disconnect_export); + + +/** + * Packs current SLV and Limit into \a req. + */ +int target_pack_pool_reply(struct ptlrpc_request *req) +{ + struct obd_device *obd; + + /* Check that we still have all structures alive as this may + * be some late RPC at shutdown time. */ + if (unlikely(!req->rq_export || !req->rq_export->exp_obd || + !exp_connect_lru_resize(req->rq_export))) { + lustre_msg_set_slv(req->rq_repmsg, 0); + lustre_msg_set_limit(req->rq_repmsg, 0); + return 0; + } + + /* OBD is alive here as export is alive, which we checked above. 
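+ * Copy the current SLV and limit into the reply under obd_pool_lock so
+ * that the client sees a consistent pair of values.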
*/ + obd = req->rq_export->exp_obd; + + read_lock(&obd->obd_pool_lock); + lustre_msg_set_slv(req->rq_repmsg, obd->obd_pool_slv); + lustre_msg_set_limit(req->rq_repmsg, obd->obd_pool_limit); + read_unlock(&obd->obd_pool_lock); + + return 0; +} +EXPORT_SYMBOL(target_pack_pool_reply); + +int target_send_reply_msg(struct ptlrpc_request *req, int rc, int fail_id) +{ + if (OBD_FAIL_CHECK_ORSET(fail_id & ~OBD_FAIL_ONCE, OBD_FAIL_ONCE)) { + DEBUG_REQ(D_ERROR, req, "dropping reply"); + return -ECOMM; + } + + if (unlikely(rc)) { + DEBUG_REQ(D_NET, req, "processing error (%d)", rc); + req->rq_status = rc; + return ptlrpc_send_error(req, 1); + } else { + DEBUG_REQ(D_NET, req, "sending reply"); + } + + return ptlrpc_send_reply(req, PTLRPC_REPLY_MAYBE_DIFFICULT); +} + +void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id) +{ + struct ptlrpc_service_part *svcpt; + int netrc; + struct ptlrpc_reply_state *rs; + struct obd_export *exp; + + if (req->rq_no_reply) + return; + + svcpt = req->rq_rqbd->rqbd_svcpt; + rs = req->rq_reply_state; + if (rs == NULL || !rs->rs_difficult) { + /* no notifiers */ + target_send_reply_msg(req, rc, fail_id); + return; + } + + /* must be an export if locks saved */ + LASSERT(req->rq_export != NULL); + /* req/reply consistent */ + LASSERT(rs->rs_svcpt == svcpt); + + /* "fresh" reply */ + LASSERT(!rs->rs_scheduled); + LASSERT(!rs->rs_scheduled_ever); + LASSERT(!rs->rs_handled); + LASSERT(!rs->rs_on_net); + LASSERT(rs->rs_export == NULL); + LASSERT(list_empty(&rs->rs_obd_list)); + LASSERT(list_empty(&rs->rs_exp_list)); + + exp = class_export_get(req->rq_export); + + /* disable reply scheduling while I'm setting up */ + rs->rs_scheduled = 1; + rs->rs_on_net = 1; + rs->rs_xid = req->rq_xid; + rs->rs_transno = req->rq_transno; + rs->rs_export = exp; + rs->rs_opc = lustre_msg_get_opc(req->rq_reqmsg); + + spin_lock(&exp->exp_uncommitted_replies_lock); + CDEBUG(D_NET, "rs transno = %llu, last committed = %llu\n", + rs->rs_transno, exp->exp_last_committed); + if (rs->rs_transno > exp->exp_last_committed) { + /* not committed already */ + list_add_tail(&rs->rs_obd_list, + &exp->exp_uncommitted_replies); + } + spin_unlock(&exp->exp_uncommitted_replies_lock); + + spin_lock(&exp->exp_lock); + list_add_tail(&rs->rs_exp_list, &exp->exp_outstanding_replies); + spin_unlock(&exp->exp_lock); + + netrc = target_send_reply_msg(req, rc, fail_id); + + spin_lock(&svcpt->scp_rep_lock); + + atomic_inc(&svcpt->scp_nreps_difficult); + + if (netrc != 0) { + /* error sending: reply is off the net. 
Also we need +1 + * reply ref until ptlrpc_handle_rs() is done + * with the reply state (if the send was successful, there + * would have been +1 ref for the net, which + * reply_out_callback leaves alone) */ + rs->rs_on_net = 0; + ptlrpc_rs_addref(rs); + } + + spin_lock(&rs->rs_lock); + if (rs->rs_transno <= exp->exp_last_committed || + (!rs->rs_on_net && !rs->rs_no_ack) || + list_empty(&rs->rs_exp_list) || /* completed already */ + list_empty(&rs->rs_obd_list)) { + CDEBUG(D_HA, "Schedule reply immediately\n"); + ptlrpc_dispatch_difficult_reply(rs); + } else { + list_add(&rs->rs_list, &svcpt->scp_rep_active); + rs->rs_scheduled = 0; /* allow notifier to schedule */ + } + spin_unlock(&rs->rs_lock); + spin_unlock(&svcpt->scp_rep_lock); +} +EXPORT_SYMBOL(target_send_reply); + +ldlm_mode_t lck_compat_array[] = { + [LCK_EX] = LCK_COMPAT_EX, + [LCK_PW] = LCK_COMPAT_PW, + [LCK_PR] = LCK_COMPAT_PR, + [LCK_CW] = LCK_COMPAT_CW, + [LCK_CR] = LCK_COMPAT_CR, + [LCK_NL] = LCK_COMPAT_NL, + [LCK_GROUP] = LCK_COMPAT_GROUP, + [LCK_COS] = LCK_COMPAT_COS, +}; + +/** + * Rather arbitrary mapping from LDLM error codes to errno values. This should + * not escape to the user level. + */ +int ldlm_error2errno(ldlm_error_t error) +{ + int result; + + switch (error) { + case ELDLM_OK: + result = 0; + break; + case ELDLM_LOCK_CHANGED: + result = -ESTALE; + break; + case ELDLM_LOCK_ABORTED: + result = -ENAVAIL; + break; + case ELDLM_LOCK_REPLACED: + result = -ESRCH; + break; + case ELDLM_NO_LOCK_DATA: + result = -ENOENT; + break; + case ELDLM_NAMESPACE_EXISTS: + result = -EEXIST; + break; + case ELDLM_BAD_NAMESPACE: + result = -EBADF; + break; + default: + if (((int)error) < 0) /* cast to signed type */ + result = error; /* as ldlm_error_t can be unsigned */ + else { + CERROR("Invalid DLM result code: %d\n", error); + result = -EPROTO; + } + } + return result; +} +EXPORT_SYMBOL(ldlm_error2errno); + +/** + * Dual to ldlm_error2errno(): maps errno values back to ldlm_error_t. + */ +ldlm_error_t ldlm_errno2error(int err_no) +{ + int error; + + switch (err_no) { + case 0: + error = ELDLM_OK; + break; + case -ESTALE: + error = ELDLM_LOCK_CHANGED; + break; + case -ENAVAIL: + error = ELDLM_LOCK_ABORTED; + break; + case -ESRCH: + error = ELDLM_LOCK_REPLACED; + break; + case -ENOENT: + error = ELDLM_NO_LOCK_DATA; + break; + case -EEXIST: + error = ELDLM_NAMESPACE_EXISTS; + break; + case -EBADF: + error = ELDLM_BAD_NAMESPACE; + break; + default: + error = err_no; + } + return error; +} +EXPORT_SYMBOL(ldlm_errno2error); + +#if LUSTRE_TRACKS_LOCK_EXP_REFS +void ldlm_dump_export_locks(struct obd_export *exp) +{ + spin_lock(&exp->exp_locks_list_guard); + if (!list_empty(&exp->exp_locks_list)) { + struct ldlm_lock *lock; + + CERROR("dumping locks for export %p,ignore if the unmount doesn't hang\n", + exp); + list_for_each_entry(lock, &exp->exp_locks_list, + l_exp_refs_link) + LDLM_ERROR(lock, "lock:"); + } + spin_unlock(&exp->exp_locks_list_guard); +} +#endif diff --git a/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_lock.c b/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_lock.c new file mode 100644 index 000000000..84b111eb4 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_lock.c @@ -0,0 +1,2322 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ldlm/ldlm_lock.c + * + * Author: Peter Braam + * Author: Phil Schwan + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include "../../include/linux/libcfs/libcfs.h" +#include "../include/lustre_intent.h" +#include "../include/obd_class.h" +#include "ldlm_internal.h" + +/* lock types */ +char *ldlm_lockname[] = { + [0] = "--", + [LCK_EX] = "EX", + [LCK_PW] = "PW", + [LCK_PR] = "PR", + [LCK_CW] = "CW", + [LCK_CR] = "CR", + [LCK_NL] = "NL", + [LCK_GROUP] = "GROUP", + [LCK_COS] = "COS", +}; +EXPORT_SYMBOL(ldlm_lockname); + +char *ldlm_typename[] = { + [LDLM_PLAIN] = "PLN", + [LDLM_EXTENT] = "EXT", + [LDLM_FLOCK] = "FLK", + [LDLM_IBITS] = "IBT", +}; +EXPORT_SYMBOL(ldlm_typename); + +static ldlm_policy_wire_to_local_t ldlm_policy_wire18_to_local[] = { + [LDLM_PLAIN - LDLM_MIN_TYPE] = ldlm_plain_policy_wire_to_local, + [LDLM_EXTENT - LDLM_MIN_TYPE] = ldlm_extent_policy_wire_to_local, + [LDLM_FLOCK - LDLM_MIN_TYPE] = ldlm_flock_policy_wire18_to_local, + [LDLM_IBITS - LDLM_MIN_TYPE] = ldlm_ibits_policy_wire_to_local, +}; + +static ldlm_policy_wire_to_local_t ldlm_policy_wire21_to_local[] = { + [LDLM_PLAIN - LDLM_MIN_TYPE] = ldlm_plain_policy_wire_to_local, + [LDLM_EXTENT - LDLM_MIN_TYPE] = ldlm_extent_policy_wire_to_local, + [LDLM_FLOCK - LDLM_MIN_TYPE] = ldlm_flock_policy_wire21_to_local, + [LDLM_IBITS - LDLM_MIN_TYPE] = ldlm_ibits_policy_wire_to_local, +}; + +static ldlm_policy_local_to_wire_t ldlm_policy_local_to_wire[] = { + [LDLM_PLAIN - LDLM_MIN_TYPE] = ldlm_plain_policy_local_to_wire, + [LDLM_EXTENT - LDLM_MIN_TYPE] = ldlm_extent_policy_local_to_wire, + [LDLM_FLOCK - LDLM_MIN_TYPE] = ldlm_flock_policy_local_to_wire, + [LDLM_IBITS - LDLM_MIN_TYPE] = ldlm_ibits_policy_local_to_wire, +}; + +/** + * Converts lock policy from local format to on the wire lock_desc format + */ +void ldlm_convert_policy_to_wire(ldlm_type_t type, + const ldlm_policy_data_t *lpolicy, + ldlm_wire_policy_data_t *wpolicy) +{ + ldlm_policy_local_to_wire_t convert; + + convert = ldlm_policy_local_to_wire[type - LDLM_MIN_TYPE]; + + convert(lpolicy, wpolicy); +} + +/** + * Converts lock policy from on the wire lock_desc format to local format + */ +void ldlm_convert_policy_to_local(struct obd_export *exp, ldlm_type_t type, + const ldlm_wire_policy_data_t *wpolicy, + ldlm_policy_data_t *lpolicy) +{ + ldlm_policy_wire_to_local_t convert; + int new_client; + + /** some badness for 2.0.0 clients, but 2.0.0 isn't supported */ + new_client = (exp_connect_flags(exp) & OBD_CONNECT_FULL20) != 0; + if (new_client) + 
convert = ldlm_policy_wire21_to_local[type - LDLM_MIN_TYPE]; + else + convert = ldlm_policy_wire18_to_local[type - LDLM_MIN_TYPE]; + + convert(wpolicy, lpolicy); +} + +char *ldlm_it2str(int it) +{ + switch (it) { + case IT_OPEN: + return "open"; + case IT_CREAT: + return "creat"; + case (IT_OPEN | IT_CREAT): + return "open|creat"; + case IT_READDIR: + return "readdir"; + case IT_GETATTR: + return "getattr"; + case IT_LOOKUP: + return "lookup"; + case IT_UNLINK: + return "unlink"; + case IT_GETXATTR: + return "getxattr"; + case IT_LAYOUT: + return "layout"; + default: + CERROR("Unknown intent %d\n", it); + return "UNKNOWN"; + } +} +EXPORT_SYMBOL(ldlm_it2str); + + +void ldlm_register_intent(struct ldlm_namespace *ns, ldlm_res_policy arg) +{ + ns->ns_policy = arg; +} +EXPORT_SYMBOL(ldlm_register_intent); + +/* + * REFCOUNTED LOCK OBJECTS + */ + + +/** + * Get a reference on a lock. + * + * Lock refcounts, during creation: + * - one special one for allocation, dec'd only once in destroy + * - one for being a lock that's in-use + * - one for the addref associated with a new lock + */ +struct ldlm_lock *ldlm_lock_get(struct ldlm_lock *lock) +{ + atomic_inc(&lock->l_refc); + return lock; +} +EXPORT_SYMBOL(ldlm_lock_get); + +/** + * Release lock reference. + * + * Also frees the lock if it was last reference. + */ +void ldlm_lock_put(struct ldlm_lock *lock) +{ + LASSERT(lock->l_resource != LP_POISON); + LASSERT(atomic_read(&lock->l_refc) > 0); + if (atomic_dec_and_test(&lock->l_refc)) { + struct ldlm_resource *res; + + LDLM_DEBUG(lock, + "final lock_put on destroyed lock, freeing it."); + + res = lock->l_resource; + LASSERT(lock->l_flags & LDLM_FL_DESTROYED); + LASSERT(list_empty(&lock->l_res_link)); + LASSERT(list_empty(&lock->l_pending_chain)); + + lprocfs_counter_decr(ldlm_res_to_ns(res)->ns_stats, + LDLM_NSS_LOCKS); + lu_ref_del(&res->lr_reference, "lock", lock); + ldlm_resource_putref(res); + lock->l_resource = NULL; + if (lock->l_export) { + class_export_lock_put(lock->l_export, lock); + lock->l_export = NULL; + } + + if (lock->l_lvb_data != NULL) + OBD_FREE(lock->l_lvb_data, lock->l_lvb_len); + + ldlm_interval_free(ldlm_interval_detach(lock)); + lu_ref_fini(&lock->l_reference); + OBD_FREE_RCU(lock, sizeof(*lock), &lock->l_handle); + } +} +EXPORT_SYMBOL(ldlm_lock_put); + +/** + * Removes LDLM lock \a lock from LRU. Assumes LRU is already locked. + */ +int ldlm_lock_remove_from_lru_nolock(struct ldlm_lock *lock) +{ + int rc = 0; + + if (!list_empty(&lock->l_lru)) { + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + + LASSERT(lock->l_resource->lr_type != LDLM_FLOCK); + list_del_init(&lock->l_lru); + LASSERT(ns->ns_nr_unused > 0); + ns->ns_nr_unused--; + rc = 1; + } + return rc; +} + +/** + * Removes LDLM lock \a lock from LRU. Obtains the LRU lock first. + */ +int ldlm_lock_remove_from_lru(struct ldlm_lock *lock) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + int rc; + + if (lock->l_flags & LDLM_FL_NS_SRV) { + LASSERT(list_empty(&lock->l_lru)); + return 0; + } + + spin_lock(&ns->ns_lock); + rc = ldlm_lock_remove_from_lru_nolock(lock); + spin_unlock(&ns->ns_lock); + return rc; +} + +/** + * Adds LDLM lock \a lock to namespace LRU. Assumes LRU is already locked. 
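+ *
+ * [For illustration only, not part of the Lustre sources: callers either
+ * take ns->ns_lock themselves and use this _nolock variant, or call the
+ * locking wrapper defined just below, whose body is simply
+ *
+ *	spin_lock(&ns->ns_lock);
+ *	ldlm_lock_add_to_lru_nolock(lock);
+ *	spin_unlock(&ns->ns_lock);
+ * ]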
+ */ +void ldlm_lock_add_to_lru_nolock(struct ldlm_lock *lock) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + + lock->l_last_used = cfs_time_current(); + LASSERT(list_empty(&lock->l_lru)); + LASSERT(lock->l_resource->lr_type != LDLM_FLOCK); + list_add_tail(&lock->l_lru, &ns->ns_unused_list); + if (lock->l_flags & LDLM_FL_SKIPPED) + lock->l_flags &= ~LDLM_FL_SKIPPED; + LASSERT(ns->ns_nr_unused >= 0); + ns->ns_nr_unused++; +} + +/** + * Adds LDLM lock \a lock to namespace LRU. Obtains necessary LRU locks + * first. + */ +void ldlm_lock_add_to_lru(struct ldlm_lock *lock) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + + spin_lock(&ns->ns_lock); + ldlm_lock_add_to_lru_nolock(lock); + spin_unlock(&ns->ns_lock); +} + +/** + * Moves LDLM lock \a lock that is already in namespace LRU to the tail of + * the LRU. Performs necessary LRU locking + */ +void ldlm_lock_touch_in_lru(struct ldlm_lock *lock) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + + if (lock->l_flags & LDLM_FL_NS_SRV) { + LASSERT(list_empty(&lock->l_lru)); + return; + } + + spin_lock(&ns->ns_lock); + if (!list_empty(&lock->l_lru)) { + ldlm_lock_remove_from_lru_nolock(lock); + ldlm_lock_add_to_lru_nolock(lock); + } + spin_unlock(&ns->ns_lock); +} + +/** + * Helper to destroy a locked lock. + * + * Used by ldlm_lock_destroy and ldlm_lock_destroy_nolock + * Must be called with l_lock and lr_lock held. + * + * Does not actually free the lock data, but rather marks the lock as + * destroyed by setting l_destroyed field in the lock to 1. Destroys a + * handle->lock association too, so that the lock can no longer be found + * and removes the lock from LRU list. Actual lock freeing occurs when + * last lock reference goes away. + * + * Original comment (of some historical value): + * This used to have a 'strict' flag, which recovery would use to mark an + * in-use lock as needing-to-die. Lest I am ever tempted to put it back, I + * shall explain why it's gone: with the new hash table scheme, once you call + * ldlm_lock_destroy, you can never drop your final references on this lock. + * Because it's not in the hash table anymore. -phil + */ +int ldlm_lock_destroy_internal(struct ldlm_lock *lock) +{ + if (lock->l_readers || lock->l_writers) { + LDLM_ERROR(lock, "lock still has references"); + LBUG(); + } + + if (!list_empty(&lock->l_res_link)) { + LDLM_ERROR(lock, "lock still on resource"); + LBUG(); + } + + if (lock->l_flags & LDLM_FL_DESTROYED) { + LASSERT(list_empty(&lock->l_lru)); + return 0; + } + lock->l_flags |= LDLM_FL_DESTROYED; + + if (lock->l_export && lock->l_export->exp_lock_hash) { + /* NB: it's safe to call cfs_hash_del() even lock isn't + * in exp_lock_hash. */ + /* In the function below, .hs_keycmp resolves to + * ldlm_export_lock_keycmp() */ + /* coverity[overrun-buffer-val] */ + cfs_hash_del(lock->l_export->exp_lock_hash, + &lock->l_remote_handle, &lock->l_exp_hash); + } + + ldlm_lock_remove_from_lru(lock); + class_handle_unhash(&lock->l_handle); + +#if 0 + /* Wake anyone waiting for this lock */ + /* FIXME: I should probably add yet another flag, instead of using + * l_export to only call this on clients */ + if (lock->l_export) + class_export_put(lock->l_export); + lock->l_export = NULL; + if (lock->l_export && lock->l_completion_ast) + lock->l_completion_ast(lock, 0); +#endif + return 1; +} + +/** + * Destroys a LDLM lock \a lock. Performs necessary locking first. 
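+ *
+ * [For illustration only, not part of the Lustre sources: this wrapper
+ * behaves roughly like taking the resource lock around the _nolock
+ * variant defined below,
+ *
+ *	lock_res_and_lock(lock);
+ *	ldlm_lock_destroy_nolock(lock);
+ *	unlock_res_and_lock(lock);
+ *
+ * the main difference being that here the hash-table reference is
+ * released only after the resource lock has been dropped.]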
+ */ +void ldlm_lock_destroy(struct ldlm_lock *lock) +{ + int first; + + lock_res_and_lock(lock); + first = ldlm_lock_destroy_internal(lock); + unlock_res_and_lock(lock); + + /* drop reference from hashtable only for first destroy */ + if (first) { + lu_ref_del(&lock->l_reference, "hash", lock); + LDLM_LOCK_RELEASE(lock); + } +} + +/** + * Destroys a LDLM lock \a lock that is already locked. + */ +void ldlm_lock_destroy_nolock(struct ldlm_lock *lock) +{ + int first; + + first = ldlm_lock_destroy_internal(lock); + /* drop reference from hashtable only for first destroy */ + if (first) { + lu_ref_del(&lock->l_reference, "hash", lock); + LDLM_LOCK_RELEASE(lock); + } +} + +/* this is called by portals_handle2object with the handle lock taken */ +static void lock_handle_addref(void *lock) +{ + LDLM_LOCK_GET((struct ldlm_lock *)lock); +} + +static void lock_handle_free(void *lock, int size) +{ + LASSERT(size == sizeof(struct ldlm_lock)); + OBD_SLAB_FREE(lock, ldlm_lock_slab, size); +} + +struct portals_handle_ops lock_handle_ops = { + .hop_addref = lock_handle_addref, + .hop_free = lock_handle_free, +}; + +/** + * + * Allocate and initialize new lock structure. + * + * usage: pass in a resource on which you have done ldlm_resource_get + * new lock will take over the refcount. + * returns: lock with refcount 2 - one for current caller and one for remote + */ +static struct ldlm_lock *ldlm_lock_new(struct ldlm_resource *resource) +{ + struct ldlm_lock *lock; + + if (resource == NULL) + LBUG(); + + OBD_SLAB_ALLOC_PTR_GFP(lock, ldlm_lock_slab, GFP_NOFS); + if (lock == NULL) + return NULL; + + spin_lock_init(&lock->l_lock); + lock->l_resource = resource; + lu_ref_add(&resource->lr_reference, "lock", lock); + + atomic_set(&lock->l_refc, 2); + INIT_LIST_HEAD(&lock->l_res_link); + INIT_LIST_HEAD(&lock->l_lru); + INIT_LIST_HEAD(&lock->l_pending_chain); + INIT_LIST_HEAD(&lock->l_bl_ast); + INIT_LIST_HEAD(&lock->l_cp_ast); + INIT_LIST_HEAD(&lock->l_rk_ast); + init_waitqueue_head(&lock->l_waitq); + lock->l_blocking_lock = NULL; + INIT_LIST_HEAD(&lock->l_sl_mode); + INIT_LIST_HEAD(&lock->l_sl_policy); + INIT_HLIST_NODE(&lock->l_exp_hash); + INIT_HLIST_NODE(&lock->l_exp_flock_hash); + + lprocfs_counter_incr(ldlm_res_to_ns(resource)->ns_stats, + LDLM_NSS_LOCKS); + INIT_LIST_HEAD(&lock->l_handle.h_link); + class_handle_hash(&lock->l_handle, &lock_handle_ops); + + lu_ref_init(&lock->l_reference); + lu_ref_add(&lock->l_reference, "hash", lock); + lock->l_callback_timeout = 0; + +#if LUSTRE_TRACKS_LOCK_EXP_REFS + INIT_LIST_HEAD(&lock->l_exp_refs_link); + lock->l_exp_refs_nr = 0; + lock->l_exp_refs_target = NULL; +#endif + INIT_LIST_HEAD(&lock->l_exp_list); + + return lock; +} + +/** + * Moves LDLM lock \a lock to another resource. 
+ * This is used on client when server returns some other lock than requested + * (typically as a result of intent operation) + */ +int ldlm_lock_change_resource(struct ldlm_namespace *ns, struct ldlm_lock *lock, + const struct ldlm_res_id *new_resid) +{ + struct ldlm_resource *oldres = lock->l_resource; + struct ldlm_resource *newres; + int type; + + LASSERT(ns_is_client(ns)); + + lock_res_and_lock(lock); + if (memcmp(new_resid, &lock->l_resource->lr_name, + sizeof(lock->l_resource->lr_name)) == 0) { + /* Nothing to do */ + unlock_res_and_lock(lock); + return 0; + } + + LASSERT(new_resid->name[0] != 0); + + /* This function assumes that the lock isn't on any lists */ + LASSERT(list_empty(&lock->l_res_link)); + + type = oldres->lr_type; + unlock_res_and_lock(lock); + + newres = ldlm_resource_get(ns, NULL, new_resid, type, 1); + if (newres == NULL) + return -ENOMEM; + + lu_ref_add(&newres->lr_reference, "lock", lock); + /* + * To flip the lock from the old to the new resource, lock, oldres and + * newres have to be locked. Resource spin-locks are nested within + * lock->l_lock, and are taken in the memory address order to avoid + * dead-locks. + */ + spin_lock(&lock->l_lock); + oldres = lock->l_resource; + if (oldres < newres) { + lock_res(oldres); + lock_res_nested(newres, LRT_NEW); + } else { + lock_res(newres); + lock_res_nested(oldres, LRT_NEW); + } + LASSERT(memcmp(new_resid, &oldres->lr_name, + sizeof(oldres->lr_name)) != 0); + lock->l_resource = newres; + unlock_res(oldres); + unlock_res_and_lock(lock); + + /* ...and the flowers are still standing! */ + lu_ref_del(&oldres->lr_reference, "lock", lock); + ldlm_resource_putref(oldres); + + return 0; +} +EXPORT_SYMBOL(ldlm_lock_change_resource); + +/** \defgroup ldlm_handles LDLM HANDLES + * Ways to get hold of locks without any addresses. + * @{ + */ + +/** + * Fills in handle for LDLM lock \a lock into supplied \a lockh + * Does not take any references. + */ +void ldlm_lock2handle(const struct ldlm_lock *lock, struct lustre_handle *lockh) +{ + lockh->cookie = lock->l_handle.h_cookie; +} +EXPORT_SYMBOL(ldlm_lock2handle); + +/** + * Obtain a lock reference by handle. + * + * if \a flags: atomically get the lock and set the flags. + * Return NULL if flag already set + */ +struct ldlm_lock *__ldlm_handle2lock(const struct lustre_handle *handle, + __u64 flags) +{ + struct ldlm_lock *lock; + + LASSERT(handle); + + lock = class_handle2object(handle->cookie); + if (lock == NULL) + return NULL; + + /* It's unlikely but possible that someone marked the lock as + * destroyed after we did handle2object on it */ + if (flags == 0 && ((lock->l_flags & LDLM_FL_DESTROYED) == 0)) { + lu_ref_add(&lock->l_reference, "handle", current); + return lock; + } + + lock_res_and_lock(lock); + + LASSERT(lock->l_resource != NULL); + + lu_ref_add_atomic(&lock->l_reference, "handle", current); + if (unlikely(lock->l_flags & LDLM_FL_DESTROYED)) { + unlock_res_and_lock(lock); + CDEBUG(D_INFO, "lock already destroyed: lock %p\n", lock); + LDLM_LOCK_PUT(lock); + return NULL; + } + + if (flags && (lock->l_flags & flags)) { + unlock_res_and_lock(lock); + LDLM_LOCK_PUT(lock); + return NULL; + } + + if (flags) + lock->l_flags |= flags; + + unlock_res_and_lock(lock); + return lock; +} +EXPORT_SYMBOL(__ldlm_handle2lock); +/** @} ldlm_handles */ + +/** + * Fill in "on the wire" representation for given LDLM lock into supplied + * lock descriptor \a desc structure. 
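+ *
+ * [For illustration only, not part of the Lustre sources: a typical use
+ * is describing a conflicting lock when a blocking AST is sent, roughly
+ * as ldlm_work_bl_ast_lock() later in this file does:
+ *
+ *	struct ldlm_lock_desc d;
+ *
+ *	ldlm_lock2desc(lock->l_blocking_lock, &d);
+ *	rc = lock->l_blocking_ast(lock, &d, arg, LDLM_CB_BLOCKING);
+ * ]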
+ */ +void ldlm_lock2desc(struct ldlm_lock *lock, struct ldlm_lock_desc *desc) +{ + ldlm_res2desc(lock->l_resource, &desc->l_resource); + desc->l_req_mode = lock->l_req_mode; + desc->l_granted_mode = lock->l_granted_mode; + ldlm_convert_policy_to_wire(lock->l_resource->lr_type, + &lock->l_policy_data, + &desc->l_policy_data); +} +EXPORT_SYMBOL(ldlm_lock2desc); + +/** + * Add a lock to list of conflicting locks to send AST to. + * + * Only add if we have not sent a blocking AST to the lock yet. + */ +void ldlm_add_bl_work_item(struct ldlm_lock *lock, struct ldlm_lock *new, + struct list_head *work_list) +{ + if ((lock->l_flags & LDLM_FL_AST_SENT) == 0) { + LDLM_DEBUG(lock, "lock incompatible; sending blocking AST."); + lock->l_flags |= LDLM_FL_AST_SENT; + /* If the enqueuing client said so, tell the AST recipient to + * discard dirty data, rather than writing back. */ + if (new->l_flags & LDLM_FL_AST_DISCARD_DATA) + lock->l_flags |= LDLM_FL_DISCARD_DATA; + LASSERT(list_empty(&lock->l_bl_ast)); + list_add(&lock->l_bl_ast, work_list); + LDLM_LOCK_GET(lock); + LASSERT(lock->l_blocking_lock == NULL); + lock->l_blocking_lock = LDLM_LOCK_GET(new); + } +} + +/** + * Add a lock to list of just granted locks to send completion AST to. + */ +void ldlm_add_cp_work_item(struct ldlm_lock *lock, struct list_head *work_list) +{ + if ((lock->l_flags & LDLM_FL_CP_REQD) == 0) { + lock->l_flags |= LDLM_FL_CP_REQD; + LDLM_DEBUG(lock, "lock granted; sending completion AST."); + LASSERT(list_empty(&lock->l_cp_ast)); + list_add(&lock->l_cp_ast, work_list); + LDLM_LOCK_GET(lock); + } +} + +/** + * Aggregator function to add AST work items into a list. Determines + * what sort of an AST work needs to be done and calls the proper + * adding function. + * Must be called with lr_lock held. + */ +void ldlm_add_ast_work_item(struct ldlm_lock *lock, struct ldlm_lock *new, + struct list_head *work_list) +{ + check_res_locked(lock->l_resource); + if (new) + ldlm_add_bl_work_item(lock, new, work_list); + else + ldlm_add_cp_work_item(lock, work_list); +} + +/** + * Add specified reader/writer reference to LDLM lock with handle \a lockh. + * r/w reference type is determined by \a mode + * Calls ldlm_lock_addref_internal. + */ +void ldlm_lock_addref(struct lustre_handle *lockh, __u32 mode) +{ + struct ldlm_lock *lock; + + lock = ldlm_handle2lock(lockh); + LASSERT(lock != NULL); + ldlm_lock_addref_internal(lock, mode); + LDLM_LOCK_PUT(lock); +} +EXPORT_SYMBOL(ldlm_lock_addref); + +/** + * Helper function. + * Add specified reader/writer reference to LDLM lock \a lock. + * r/w reference type is determined by \a mode + * Removes lock from LRU if it is there. + * Assumes the LDLM lock is already locked. + */ +void ldlm_lock_addref_internal_nolock(struct ldlm_lock *lock, __u32 mode) +{ + ldlm_lock_remove_from_lru(lock); + if (mode & (LCK_NL | LCK_CR | LCK_PR)) { + lock->l_readers++; + lu_ref_add_atomic(&lock->l_reference, "reader", lock); + } + if (mode & (LCK_EX | LCK_CW | LCK_PW | LCK_GROUP | LCK_COS)) { + lock->l_writers++; + lu_ref_add_atomic(&lock->l_reference, "writer", lock); + } + LDLM_LOCK_GET(lock); + lu_ref_add_atomic(&lock->l_reference, "user", lock); + LDLM_DEBUG(lock, "ldlm_lock_addref(%s)", ldlm_lockname[mode]); +} + +/** + * Attempts to add reader/writer reference to a lock with handle \a lockh, and + * fails if lock is already LDLM_FL_CBPENDING or destroyed. + * + * \retval 0 success, lock was addref-ed + * + * \retval -EAGAIN lock is being canceled. 
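+ *
+ * [For illustration only, not part of the Lustre sources: a hypothetical
+ * caller that only wants the reference while the lock is still usable
+ * would pair it with ldlm_lock_decref(), e.g.
+ *
+ *	if (ldlm_lock_addref_try(lockh, LCK_PR) == 0) {
+ *		... use the protected data ...
+ *		ldlm_lock_decref(lockh, LCK_PR);
+ *	}
+ * ]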
+ */ +int ldlm_lock_addref_try(struct lustre_handle *lockh, __u32 mode) +{ + struct ldlm_lock *lock; + int result; + + result = -EAGAIN; + lock = ldlm_handle2lock(lockh); + if (lock != NULL) { + lock_res_and_lock(lock); + if (lock->l_readers != 0 || lock->l_writers != 0 || + !(lock->l_flags & LDLM_FL_CBPENDING)) { + ldlm_lock_addref_internal_nolock(lock, mode); + result = 0; + } + unlock_res_and_lock(lock); + LDLM_LOCK_PUT(lock); + } + return result; +} +EXPORT_SYMBOL(ldlm_lock_addref_try); + +/** + * Add specified reader/writer reference to LDLM lock \a lock. + * Locks LDLM lock and calls ldlm_lock_addref_internal_nolock to do the work. + * Only called for local locks. + */ +void ldlm_lock_addref_internal(struct ldlm_lock *lock, __u32 mode) +{ + lock_res_and_lock(lock); + ldlm_lock_addref_internal_nolock(lock, mode); + unlock_res_and_lock(lock); +} + +/** + * Removes reader/writer reference for LDLM lock \a lock. + * Assumes LDLM lock is already locked. + * only called in ldlm_flock_destroy and for local locks. + * Does NOT add lock to LRU if no r/w references left to accommodate flock locks + * that cannot be placed in LRU. + */ +void ldlm_lock_decref_internal_nolock(struct ldlm_lock *lock, __u32 mode) +{ + LDLM_DEBUG(lock, "ldlm_lock_decref(%s)", ldlm_lockname[mode]); + if (mode & (LCK_NL | LCK_CR | LCK_PR)) { + LASSERT(lock->l_readers > 0); + lu_ref_del(&lock->l_reference, "reader", lock); + lock->l_readers--; + } + if (mode & (LCK_EX | LCK_CW | LCK_PW | LCK_GROUP | LCK_COS)) { + LASSERT(lock->l_writers > 0); + lu_ref_del(&lock->l_reference, "writer", lock); + lock->l_writers--; + } + + lu_ref_del(&lock->l_reference, "user", lock); + LDLM_LOCK_RELEASE(lock); /* matches the LDLM_LOCK_GET() in addref */ +} + +/** + * Removes reader/writer reference for LDLM lock \a lock. + * Locks LDLM lock first. + * If the lock is determined to be client lock on a client and r/w refcount + * drops to zero and the lock is not blocked, the lock is added to LRU lock + * on the namespace. + * For blocked LDLM locks if r/w count drops to zero, blocking_ast is called. + */ +void ldlm_lock_decref_internal(struct ldlm_lock *lock, __u32 mode) +{ + struct ldlm_namespace *ns; + + lock_res_and_lock(lock); + + ns = ldlm_lock_to_ns(lock); + + ldlm_lock_decref_internal_nolock(lock, mode); + + if (lock->l_flags & LDLM_FL_LOCAL && + !lock->l_readers && !lock->l_writers) { + /* If this is a local lock on a server namespace and this was + * the last reference, cancel the lock. */ + CDEBUG(D_INFO, "forcing cancel of local lock\n"); + lock->l_flags |= LDLM_FL_CBPENDING; + } + + if (!lock->l_readers && !lock->l_writers && + (lock->l_flags & LDLM_FL_CBPENDING)) { + /* If we received a blocked AST and this was the last reference, + * run the callback. 
*/ + if ((lock->l_flags & LDLM_FL_NS_SRV) && lock->l_export) + CERROR("FL_CBPENDING set on non-local lock--just a warning\n"); + + LDLM_DEBUG(lock, "final decref done on cbpending lock"); + + LDLM_LOCK_GET(lock); /* dropped by bl thread */ + ldlm_lock_remove_from_lru(lock); + unlock_res_and_lock(lock); + + if (lock->l_flags & LDLM_FL_FAIL_LOC) + OBD_RACE(OBD_FAIL_LDLM_CP_BL_RACE); + + if ((lock->l_flags & LDLM_FL_ATOMIC_CB) || + ldlm_bl_to_thread_lock(ns, NULL, lock) != 0) + ldlm_handle_bl_callback(ns, NULL, lock); + } else if (ns_is_client(ns) && + !lock->l_readers && !lock->l_writers && + !(lock->l_flags & LDLM_FL_NO_LRU) && + !(lock->l_flags & LDLM_FL_BL_AST)) { + + LDLM_DEBUG(lock, "add lock into lru list"); + + /* If this is a client-side namespace and this was the last + * reference, put it on the LRU. */ + ldlm_lock_add_to_lru(lock); + unlock_res_and_lock(lock); + + if (lock->l_flags & LDLM_FL_FAIL_LOC) + OBD_RACE(OBD_FAIL_LDLM_CP_BL_RACE); + + /* Call ldlm_cancel_lru() only if EARLY_CANCEL and LRU RESIZE + * are not supported by the server, otherwise, it is done on + * enqueue. */ + if (!exp_connect_cancelset(lock->l_conn_export) && + !ns_connect_lru_resize(ns)) + ldlm_cancel_lru(ns, 0, LCF_ASYNC, 0); + } else { + LDLM_DEBUG(lock, "do not add lock into lru list"); + unlock_res_and_lock(lock); + } +} + +/** + * Decrease reader/writer refcount for LDLM lock with handle \a lockh + */ +void ldlm_lock_decref(struct lustre_handle *lockh, __u32 mode) +{ + struct ldlm_lock *lock = __ldlm_handle2lock(lockh, 0); + + LASSERTF(lock != NULL, "Non-existing lock: %#llx\n", lockh->cookie); + ldlm_lock_decref_internal(lock, mode); + LDLM_LOCK_PUT(lock); +} +EXPORT_SYMBOL(ldlm_lock_decref); + +/** + * Decrease reader/writer refcount for LDLM lock with handle + * \a lockh and mark it for subsequent cancellation once r/w refcount + * drops to zero instead of putting into LRU. + * + * Typical usage is for GROUP locks which we cannot allow to be cached. + */ +void ldlm_lock_decref_and_cancel(struct lustre_handle *lockh, __u32 mode) +{ + struct ldlm_lock *lock = __ldlm_handle2lock(lockh, 0); + + LASSERT(lock != NULL); + + LDLM_DEBUG(lock, "ldlm_lock_decref(%s)", ldlm_lockname[mode]); + lock_res_and_lock(lock); + lock->l_flags |= LDLM_FL_CBPENDING; + unlock_res_and_lock(lock); + ldlm_lock_decref_internal(lock, mode); + LDLM_LOCK_PUT(lock); +} +EXPORT_SYMBOL(ldlm_lock_decref_and_cancel); + +struct sl_insert_point { + struct list_head *res_link; + struct list_head *mode_link; + struct list_head *policy_link; +}; + +/** + * Finds a position to insert the new lock into granted lock list. + * + * Used for locks eligible for skiplist optimization. 
+ * + * Parameters: + * queue [input]: the granted list where search acts on; + * req [input]: the lock whose position to be located; + * prev [output]: positions within 3 lists to insert @req to + * Return Value: + * filled @prev + * NOTE: called by + * - ldlm_grant_lock_with_skiplist + */ +static void search_granted_lock(struct list_head *queue, + struct ldlm_lock *req, + struct sl_insert_point *prev) +{ + struct list_head *tmp; + struct ldlm_lock *lock, *mode_end, *policy_end; + + list_for_each(tmp, queue) { + lock = list_entry(tmp, struct ldlm_lock, l_res_link); + + mode_end = list_entry(lock->l_sl_mode.prev, + struct ldlm_lock, l_sl_mode); + + if (lock->l_req_mode != req->l_req_mode) { + /* jump to last lock of mode group */ + tmp = &mode_end->l_res_link; + continue; + } + + /* suitable mode group is found */ + if (lock->l_resource->lr_type == LDLM_PLAIN) { + /* insert point is last lock of the mode group */ + prev->res_link = &mode_end->l_res_link; + prev->mode_link = &mode_end->l_sl_mode; + prev->policy_link = &req->l_sl_policy; + return; + } else if (lock->l_resource->lr_type == LDLM_IBITS) { + for (;;) { + policy_end = + list_entry(lock->l_sl_policy.prev, + struct ldlm_lock, + l_sl_policy); + + if (lock->l_policy_data.l_inodebits.bits == + req->l_policy_data.l_inodebits.bits) { + /* insert point is last lock of + * the policy group */ + prev->res_link = + &policy_end->l_res_link; + prev->mode_link = + &policy_end->l_sl_mode; + prev->policy_link = + &policy_end->l_sl_policy; + return; + } + + if (policy_end == mode_end) + /* done with mode group */ + break; + + /* go to next policy group within mode group */ + tmp = policy_end->l_res_link.next; + lock = list_entry(tmp, struct ldlm_lock, + l_res_link); + } /* loop over policy groups within the mode group */ + + /* insert point is last lock of the mode group, + * new policy group is started */ + prev->res_link = &mode_end->l_res_link; + prev->mode_link = &mode_end->l_sl_mode; + prev->policy_link = &req->l_sl_policy; + return; + } else { + LDLM_ERROR(lock, + "is not LDLM_PLAIN or LDLM_IBITS lock"); + LBUG(); + } + } + + /* insert point is last lock on the queue, + * new mode group and new policy group are started */ + prev->res_link = queue->prev; + prev->mode_link = &req->l_sl_mode; + prev->policy_link = &req->l_sl_policy; +} + +/** + * Add a lock into resource granted list after a position described by + * \a prev. + */ +static void ldlm_granted_list_add_lock(struct ldlm_lock *lock, + struct sl_insert_point *prev) +{ + struct ldlm_resource *res = lock->l_resource; + + check_res_locked(res); + + ldlm_resource_dump(D_INFO, res); + LDLM_DEBUG(lock, "About to add lock:"); + + if (lock->l_flags & LDLM_FL_DESTROYED) { + CDEBUG(D_OTHER, "Lock destroyed, not adding to resource\n"); + return; + } + + LASSERT(list_empty(&lock->l_res_link)); + LASSERT(list_empty(&lock->l_sl_mode)); + LASSERT(list_empty(&lock->l_sl_policy)); + + /* + * lock->link == prev->link means lock is first starting the group. + * Don't re-add to itself to suppress kernel warnings. + */ + if (&lock->l_res_link != prev->res_link) + list_add(&lock->l_res_link, prev->res_link); + if (&lock->l_sl_mode != prev->mode_link) + list_add(&lock->l_sl_mode, prev->mode_link); + if (&lock->l_sl_policy != prev->policy_link) + list_add(&lock->l_sl_policy, prev->policy_link); +} + +/** + * Add a lock to granted list on a resource maintaining skiplist + * correctness. 
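+ *
+ * [For illustration only, not part of the Lustre sources: this path is
+ * taken only for PLAIN and IBITS locks; ldlm_grant_lock() below
+ * dispatches on the resource type as follows:
+ *
+ *	if (res->lr_type == LDLM_PLAIN || res->lr_type == LDLM_IBITS)
+ *		ldlm_grant_lock_with_skiplist(lock);
+ *	else if (res->lr_type == LDLM_EXTENT)
+ *		ldlm_extent_add_lock(res, lock);
+ *	else
+ *		ldlm_resource_add_lock(res, &res->lr_granted, lock);
+ * ]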
+ */ +static void ldlm_grant_lock_with_skiplist(struct ldlm_lock *lock) +{ + struct sl_insert_point prev; + + LASSERT(lock->l_req_mode == lock->l_granted_mode); + + search_granted_lock(&lock->l_resource->lr_granted, lock, &prev); + ldlm_granted_list_add_lock(lock, &prev); +} + +/** + * Perform lock granting bookkeeping. + * + * Includes putting the lock into granted list and updating lock mode. + * NOTE: called by + * - ldlm_lock_enqueue + * - ldlm_reprocess_queue + * - ldlm_lock_convert + * + * must be called with lr_lock held + */ +void ldlm_grant_lock(struct ldlm_lock *lock, struct list_head *work_list) +{ + struct ldlm_resource *res = lock->l_resource; + + check_res_locked(res); + + lock->l_granted_mode = lock->l_req_mode; + if (res->lr_type == LDLM_PLAIN || res->lr_type == LDLM_IBITS) + ldlm_grant_lock_with_skiplist(lock); + else if (res->lr_type == LDLM_EXTENT) + ldlm_extent_add_lock(res, lock); + else + ldlm_resource_add_lock(res, &res->lr_granted, lock); + + if (lock->l_granted_mode < res->lr_most_restr) + res->lr_most_restr = lock->l_granted_mode; + + if (work_list && lock->l_completion_ast != NULL) + ldlm_add_ast_work_item(lock, NULL, work_list); + + ldlm_pool_add(&ldlm_res_to_ns(res)->ns_pool, lock); +} + +/** + * Search for a lock with given properties in a queue. + * + * \retval a referenced lock or NULL. See the flag descriptions below, in the + * comment above ldlm_lock_match + */ +static struct ldlm_lock *search_queue(struct list_head *queue, + ldlm_mode_t *mode, + ldlm_policy_data_t *policy, + struct ldlm_lock *old_lock, + __u64 flags, int unref) +{ + struct ldlm_lock *lock; + struct list_head *tmp; + + list_for_each(tmp, queue) { + ldlm_mode_t match; + + lock = list_entry(tmp, struct ldlm_lock, l_res_link); + + if (lock == old_lock) + break; + + /* Check if this lock can be matched. + * Used by LU-2919(exclusive open) for open lease lock */ + if (ldlm_is_excl(lock)) + continue; + + /* llite sometimes wants to match locks that will be + * canceled when their users drop, but we allow it to match + * if it passes in CBPENDING and the lock still has users. + * this is generally only going to be used by children + * whose parents already hold a lock so forward progress + * can still happen. */ + if (lock->l_flags & LDLM_FL_CBPENDING && + !(flags & LDLM_FL_CBPENDING)) + continue; + if (!unref && lock->l_flags & LDLM_FL_CBPENDING && + lock->l_readers == 0 && lock->l_writers == 0) + continue; + + if (!(lock->l_req_mode & *mode)) + continue; + match = lock->l_req_mode; + + if (lock->l_resource->lr_type == LDLM_EXTENT && + (lock->l_policy_data.l_extent.start > + policy->l_extent.start || + lock->l_policy_data.l_extent.end < policy->l_extent.end)) + continue; + + if (unlikely(match == LCK_GROUP) && + lock->l_resource->lr_type == LDLM_EXTENT && + lock->l_policy_data.l_extent.gid != policy->l_extent.gid) + continue; + + /* We match if we have existing lock with same or wider set + of bits. 
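+ *
+ * [For illustration only, not part of the Lustre sources: the existing
+ * inodebits must be a superset of the requested ones, e.g.
+ *
+ *	existing 0x3, wanted 0x1:  (0x3 & 0x1) == 0x1  -> match
+ *	existing 0x1, wanted 0x3:  (0x1 & 0x3) != 0x3  -> no match
+ * ]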
*/ + if (lock->l_resource->lr_type == LDLM_IBITS && + ((lock->l_policy_data.l_inodebits.bits & + policy->l_inodebits.bits) != + policy->l_inodebits.bits)) + continue; + + if (!unref && (lock->l_flags & LDLM_FL_GONE_MASK)) + continue; + + if ((flags & LDLM_FL_LOCAL_ONLY) && + !(lock->l_flags & LDLM_FL_LOCAL)) + continue; + + if (flags & LDLM_FL_TEST_LOCK) { + LDLM_LOCK_GET(lock); + ldlm_lock_touch_in_lru(lock); + } else { + ldlm_lock_addref_internal_nolock(lock, match); + } + *mode = match; + return lock; + } + + return NULL; +} + +void ldlm_lock_fail_match_locked(struct ldlm_lock *lock) +{ + if ((lock->l_flags & LDLM_FL_FAIL_NOTIFIED) == 0) { + lock->l_flags |= LDLM_FL_FAIL_NOTIFIED; + wake_up_all(&lock->l_waitq); + } +} +EXPORT_SYMBOL(ldlm_lock_fail_match_locked); + +void ldlm_lock_fail_match(struct ldlm_lock *lock) +{ + lock_res_and_lock(lock); + ldlm_lock_fail_match_locked(lock); + unlock_res_and_lock(lock); +} +EXPORT_SYMBOL(ldlm_lock_fail_match); + +/** + * Mark lock as "matchable" by OST. + * + * Used to prevent certain races in LOV/OSC where the lock is granted, but LVB + * is not yet valid. + * Assumes LDLM lock is already locked. + */ +void ldlm_lock_allow_match_locked(struct ldlm_lock *lock) +{ + lock->l_flags |= LDLM_FL_LVB_READY; + wake_up_all(&lock->l_waitq); +} +EXPORT_SYMBOL(ldlm_lock_allow_match_locked); + +/** + * Mark lock as "matchable" by OST. + * Locks the lock and then \see ldlm_lock_allow_match_locked + */ +void ldlm_lock_allow_match(struct ldlm_lock *lock) +{ + lock_res_and_lock(lock); + ldlm_lock_allow_match_locked(lock); + unlock_res_and_lock(lock); +} +EXPORT_SYMBOL(ldlm_lock_allow_match); + +/** + * Attempt to find a lock with specified properties. + * + * Typically returns a reference to matched lock unless LDLM_FL_TEST_LOCK is + * set in \a flags + * + * Can be called in two ways: + * + * If 'ns' is NULL, then lockh describes an existing lock that we want to look + * for a duplicate of. + * + * Otherwise, all of the fields must be filled in, to match against. + * + * If 'flags' contains LDLM_FL_LOCAL_ONLY, then only match local locks on the + * server (ie, connh is NULL) + * If 'flags' contains LDLM_FL_BLOCK_GRANTED, then only locks on the granted + * list will be considered + * If 'flags' contains LDLM_FL_CBPENDING, then locks that have been marked + * to be canceled can still be matched as long as they still have reader + * or writer referneces + * If 'flags' contains LDLM_FL_TEST_LOCK, then don't actually reference a lock, + * just tell us if we would have matched. + * + * \retval 1 if it finds an already-existing lock that is compatible; in this + * case, lockh is filled in with a addref()ed lock + * + * We also check security context, and if that fails we simply return 0 (to + * keep caller code unchanged), the context failure will be discovered by + * caller sometime later. 
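+ *
+ * [For illustration only, not part of the Lustre sources: a hypothetical
+ * client-side caller looking for a cached extent lock might use it like
+ * this (ns, res_id and policy are assumed to be set up by the caller):
+ *
+ *	struct lustre_handle lockh;
+ *	ldlm_mode_t mode;
+ *
+ *	mode = ldlm_lock_match(ns, LDLM_FL_LVB_READY, &res_id, LDLM_EXTENT,
+ *			       &policy, LCK_PR | LCK_PW, &lockh, 0);
+ *	if (mode != 0) {
+ *		... lockh now holds an addref-ed lock ...
+ *		ldlm_lock_decref(&lockh, mode);
+ *	}
+ * ]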
+ */ +ldlm_mode_t ldlm_lock_match(struct ldlm_namespace *ns, __u64 flags, + const struct ldlm_res_id *res_id, ldlm_type_t type, + ldlm_policy_data_t *policy, ldlm_mode_t mode, + struct lustre_handle *lockh, int unref) +{ + struct ldlm_resource *res; + struct ldlm_lock *lock, *old_lock = NULL; + int rc = 0; + + if (ns == NULL) { + old_lock = ldlm_handle2lock(lockh); + LASSERT(old_lock); + + ns = ldlm_lock_to_ns(old_lock); + res_id = &old_lock->l_resource->lr_name; + type = old_lock->l_resource->lr_type; + mode = old_lock->l_req_mode; + } + + res = ldlm_resource_get(ns, NULL, res_id, type, 0); + if (res == NULL) { + LASSERT(old_lock == NULL); + return 0; + } + + LDLM_RESOURCE_ADDREF(res); + lock_res(res); + + lock = search_queue(&res->lr_granted, &mode, policy, old_lock, + flags, unref); + if (lock != NULL) { + rc = 1; + goto out; + } + if (flags & LDLM_FL_BLOCK_GRANTED) { + rc = 0; + goto out; + } + lock = search_queue(&res->lr_converting, &mode, policy, old_lock, + flags, unref); + if (lock != NULL) { + rc = 1; + goto out; + } + lock = search_queue(&res->lr_waiting, &mode, policy, old_lock, + flags, unref); + if (lock != NULL) { + rc = 1; + goto out; + } + + out: + unlock_res(res); + LDLM_RESOURCE_DELREF(res); + ldlm_resource_putref(res); + + if (lock) { + ldlm_lock2handle(lock, lockh); + if ((flags & LDLM_FL_LVB_READY) && + (!(lock->l_flags & LDLM_FL_LVB_READY))) { + __u64 wait_flags = LDLM_FL_LVB_READY | + LDLM_FL_DESTROYED | LDLM_FL_FAIL_NOTIFIED; + struct l_wait_info lwi; + + if (lock->l_completion_ast) { + int err = lock->l_completion_ast(lock, + LDLM_FL_WAIT_NOREPROC, + NULL); + if (err) { + if (flags & LDLM_FL_TEST_LOCK) + LDLM_LOCK_RELEASE(lock); + else + ldlm_lock_decref_internal(lock, + mode); + rc = 0; + goto out2; + } + } + + lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(obd_timeout), + NULL, LWI_ON_SIGNAL_NOOP, NULL); + + /* XXX FIXME see comment on CAN_MATCH in lustre_dlm.h */ + l_wait_event(lock->l_waitq, + lock->l_flags & wait_flags, + &lwi); + if (!(lock->l_flags & LDLM_FL_LVB_READY)) { + if (flags & LDLM_FL_TEST_LOCK) + LDLM_LOCK_RELEASE(lock); + else + ldlm_lock_decref_internal(lock, mode); + rc = 0; + } + } + } + out2: + if (rc) { + LDLM_DEBUG(lock, "matched (%llu %llu)", + (type == LDLM_PLAIN || type == LDLM_IBITS) ? + res_id->name[2] : policy->l_extent.start, + (type == LDLM_PLAIN || type == LDLM_IBITS) ? + res_id->name[3] : policy->l_extent.end); + + /* check user's security context */ + if (lock->l_conn_export && + sptlrpc_import_check_ctx( + class_exp2cliimp(lock->l_conn_export))) { + if (!(flags & LDLM_FL_TEST_LOCK)) + ldlm_lock_decref_internal(lock, mode); + rc = 0; + } + + if (flags & LDLM_FL_TEST_LOCK) + LDLM_LOCK_RELEASE(lock); + + } else if (!(flags & LDLM_FL_TEST_LOCK)) {/*less verbose for test-only*/ + LDLM_DEBUG_NOLOCK("not matched ns %p type %u mode %u res %llu/%llu (%llu %llu)", + ns, type, mode, res_id->name[0], + res_id->name[1], + (type == LDLM_PLAIN || type == LDLM_IBITS) ? + res_id->name[2] : policy->l_extent.start, + (type == LDLM_PLAIN || type == LDLM_IBITS) ? + res_id->name[3] : policy->l_extent.end); + } + if (old_lock) + LDLM_LOCK_PUT(old_lock); + + return rc ? 
mode : 0; +} +EXPORT_SYMBOL(ldlm_lock_match); + +ldlm_mode_t ldlm_revalidate_lock_handle(struct lustre_handle *lockh, + __u64 *bits) +{ + struct ldlm_lock *lock; + ldlm_mode_t mode = 0; + + lock = ldlm_handle2lock(lockh); + if (lock != NULL) { + lock_res_and_lock(lock); + if (lock->l_flags & LDLM_FL_GONE_MASK) + goto out; + + if (lock->l_flags & LDLM_FL_CBPENDING && + lock->l_readers == 0 && lock->l_writers == 0) + goto out; + + if (bits) + *bits = lock->l_policy_data.l_inodebits.bits; + mode = lock->l_granted_mode; + ldlm_lock_addref_internal_nolock(lock, mode); + } + +out: + if (lock != NULL) { + unlock_res_and_lock(lock); + LDLM_LOCK_PUT(lock); + } + return mode; +} +EXPORT_SYMBOL(ldlm_revalidate_lock_handle); + +/** The caller must guarantee that the buffer is large enough. */ +int ldlm_fill_lvb(struct ldlm_lock *lock, struct req_capsule *pill, + enum req_location loc, void *data, int size) +{ + void *lvb; + + LASSERT(data != NULL); + LASSERT(size >= 0); + + switch (lock->l_lvb_type) { + case LVB_T_OST: + if (size == sizeof(struct ost_lvb)) { + if (loc == RCL_CLIENT) + lvb = req_capsule_client_swab_get(pill, + &RMF_DLM_LVB, + lustre_swab_ost_lvb); + else + lvb = req_capsule_server_swab_get(pill, + &RMF_DLM_LVB, + lustre_swab_ost_lvb); + if (unlikely(lvb == NULL)) { + LDLM_ERROR(lock, "no LVB"); + return -EPROTO; + } + + memcpy(data, lvb, size); + } else if (size == sizeof(struct ost_lvb_v1)) { + struct ost_lvb *olvb = data; + + if (loc == RCL_CLIENT) + lvb = req_capsule_client_swab_get(pill, + &RMF_DLM_LVB, + lustre_swab_ost_lvb_v1); + else + lvb = req_capsule_server_sized_swab_get(pill, + &RMF_DLM_LVB, size, + lustre_swab_ost_lvb_v1); + if (unlikely(lvb == NULL)) { + LDLM_ERROR(lock, "no LVB"); + return -EPROTO; + } + + memcpy(data, lvb, size); + olvb->lvb_mtime_ns = 0; + olvb->lvb_atime_ns = 0; + olvb->lvb_ctime_ns = 0; + } else { + LDLM_ERROR(lock, "Replied unexpected ost LVB size %d", + size); + return -EINVAL; + } + break; + case LVB_T_LQUOTA: + if (size == sizeof(struct lquota_lvb)) { + if (loc == RCL_CLIENT) + lvb = req_capsule_client_swab_get(pill, + &RMF_DLM_LVB, + lustre_swab_lquota_lvb); + else + lvb = req_capsule_server_swab_get(pill, + &RMF_DLM_LVB, + lustre_swab_lquota_lvb); + if (unlikely(lvb == NULL)) { + LDLM_ERROR(lock, "no LVB"); + return -EPROTO; + } + + memcpy(data, lvb, size); + } else { + LDLM_ERROR(lock, + "Replied unexpected lquota LVB size %d", + size); + return -EINVAL; + } + break; + case LVB_T_LAYOUT: + if (size == 0) + break; + + if (loc == RCL_CLIENT) + lvb = req_capsule_client_get(pill, &RMF_DLM_LVB); + else + lvb = req_capsule_server_get(pill, &RMF_DLM_LVB); + if (unlikely(lvb == NULL)) { + LDLM_ERROR(lock, "no LVB"); + return -EPROTO; + } + + memcpy(data, lvb, size); + break; + default: + LDLM_ERROR(lock, "Unknown LVB type: %d\n", lock->l_lvb_type); + dump_stack(); + return -EINVAL; + } + + return 0; +} + +/** + * Create and fill in new LDLM lock with specified properties. 
+ * Returns a referenced lock + */ +struct ldlm_lock *ldlm_lock_create(struct ldlm_namespace *ns, + const struct ldlm_res_id *res_id, + ldlm_type_t type, + ldlm_mode_t mode, + const struct ldlm_callback_suite *cbs, + void *data, __u32 lvb_len, + enum lvb_type lvb_type) +{ + struct ldlm_lock *lock; + struct ldlm_resource *res; + + res = ldlm_resource_get(ns, NULL, res_id, type, 1); + if (res == NULL) + return NULL; + + lock = ldlm_lock_new(res); + + if (lock == NULL) + return NULL; + + lock->l_req_mode = mode; + lock->l_ast_data = data; + lock->l_pid = current_pid(); + if (ns_is_server(ns)) + lock->l_flags |= LDLM_FL_NS_SRV; + if (cbs) { + lock->l_blocking_ast = cbs->lcs_blocking; + lock->l_completion_ast = cbs->lcs_completion; + lock->l_glimpse_ast = cbs->lcs_glimpse; + } + + lock->l_tree_node = NULL; + /* if this is the extent lock, allocate the interval tree node */ + if (type == LDLM_EXTENT) { + if (ldlm_interval_alloc(lock) == NULL) + goto out; + } + + if (lvb_len) { + lock->l_lvb_len = lvb_len; + OBD_ALLOC(lock->l_lvb_data, lvb_len); + if (lock->l_lvb_data == NULL) + goto out; + } + + lock->l_lvb_type = lvb_type; + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_NEW_LOCK)) + goto out; + + return lock; + +out: + ldlm_lock_destroy(lock); + LDLM_LOCK_RELEASE(lock); + return NULL; +} + +/** + * Enqueue (request) a lock. + * + * Does not block. As a result of enqueue the lock would be put + * into granted or waiting list. + * + * If namespace has intent policy sent and the lock has LDLM_FL_HAS_INTENT flag + * set, skip all the enqueueing and delegate lock processing to intent policy + * function. + */ +ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *ns, + struct ldlm_lock **lockp, + void *cookie, __u64 *flags) +{ + struct ldlm_lock *lock = *lockp; + struct ldlm_resource *res = lock->l_resource; + int local = ns_is_client(ldlm_res_to_ns(res)); + ldlm_error_t rc = ELDLM_OK; + struct ldlm_interval *node = NULL; + + lock->l_last_activity = get_seconds(); + /* policies are not executed on the client or during replay */ + if ((*flags & (LDLM_FL_HAS_INTENT|LDLM_FL_REPLAY)) == LDLM_FL_HAS_INTENT + && !local && ns->ns_policy) { + rc = ns->ns_policy(ns, lockp, cookie, lock->l_req_mode, *flags, + NULL); + if (rc == ELDLM_LOCK_REPLACED) { + /* The lock that was returned has already been granted, + * and placed into lockp. If it's not the same as the + * one we passed in, then destroy the old one and our + * work here is done. */ + if (lock != *lockp) { + ldlm_lock_destroy(lock); + LDLM_LOCK_RELEASE(lock); + } + *flags |= LDLM_FL_LOCK_CHANGED; + return 0; + } else if (rc != ELDLM_OK || + (rc == ELDLM_OK && (*flags & LDLM_FL_INTENT_ONLY))) { + ldlm_lock_destroy(lock); + return rc; + } + } + + /* For a replaying lock, it might be already in granted list. So + * unlinking the lock will cause the interval node to be freed, we + * have to allocate the interval node early otherwise we can't regrant + * this lock in the future. - jay */ + if (!local && (*flags & LDLM_FL_REPLAY) && res->lr_type == LDLM_EXTENT) + OBD_SLAB_ALLOC_PTR_GFP(node, ldlm_interval_slab, GFP_NOFS); + + lock_res_and_lock(lock); + if (local && lock->l_req_mode == lock->l_granted_mode) { + /* The server returned a blocked lock, but it was granted + * before we got a chance to actually enqueue it. We don't + * need to do anything else. 
*/ + *flags &= ~(LDLM_FL_BLOCK_GRANTED | + LDLM_FL_BLOCK_CONV | LDLM_FL_BLOCK_WAIT); + goto out; + } + + ldlm_resource_unlink_lock(lock); + if (res->lr_type == LDLM_EXTENT && lock->l_tree_node == NULL) { + if (node == NULL) { + ldlm_lock_destroy_nolock(lock); + rc = -ENOMEM; + goto out; + } + + INIT_LIST_HEAD(&node->li_group); + ldlm_interval_attach(node, lock); + node = NULL; + } + + /* Some flags from the enqueue want to make it into the AST, via the + * lock's l_flags. */ + lock->l_flags |= *flags & LDLM_FL_AST_DISCARD_DATA; + + /* This distinction between local lock trees is very important; a client + * namespace only has information about locks taken by that client, and + * thus doesn't have enough information to decide for itself if it can + * be granted (below). In this case, we do exactly what the server + * tells us to do, as dictated by the 'flags'. + * + * We do exactly the same thing during recovery, when the server is + * more or less trusting the clients not to lie. + * + * FIXME (bug 268): Detect obvious lies by checking compatibility in + * granted/converting queues. */ + if (local) { + if (*flags & LDLM_FL_BLOCK_CONV) + ldlm_resource_add_lock(res, &res->lr_converting, lock); + else if (*flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED)) + ldlm_resource_add_lock(res, &res->lr_waiting, lock); + else + ldlm_grant_lock(lock, NULL); + goto out; + } else { + CERROR("This is client-side-only module, cannot handle LDLM_NAMESPACE_SERVER resource type lock.\n"); + LBUG(); + } + +out: + unlock_res_and_lock(lock); + if (node) + OBD_SLAB_FREE(node, ldlm_interval_slab, sizeof(*node)); + return rc; +} + + +/** + * Process a call to blocking AST callback for a lock in ast_work list + */ +static int +ldlm_work_bl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq) +{ + struct ldlm_cb_set_arg *arg = opaq; + struct ldlm_lock_desc d; + int rc; + struct ldlm_lock *lock; + + if (list_empty(arg->list)) + return -ENOENT; + + lock = list_entry(arg->list->next, struct ldlm_lock, l_bl_ast); + + /* nobody should touch l_bl_ast */ + lock_res_and_lock(lock); + list_del_init(&lock->l_bl_ast); + + LASSERT(lock->l_flags & LDLM_FL_AST_SENT); + LASSERT(lock->l_bl_ast_run == 0); + LASSERT(lock->l_blocking_lock); + lock->l_bl_ast_run++; + unlock_res_and_lock(lock); + + ldlm_lock2desc(lock->l_blocking_lock, &d); + + rc = lock->l_blocking_ast(lock, &d, (void *)arg, LDLM_CB_BLOCKING); + LDLM_LOCK_RELEASE(lock->l_blocking_lock); + lock->l_blocking_lock = NULL; + LDLM_LOCK_RELEASE(lock); + + return rc; +} + +/** + * Process a call to completion AST callback for a lock in ast_work list + */ +static int +ldlm_work_cp_ast_lock(struct ptlrpc_request_set *rqset, void *opaq) +{ + struct ldlm_cb_set_arg *arg = opaq; + int rc = 0; + struct ldlm_lock *lock; + ldlm_completion_callback completion_callback; + + if (list_empty(arg->list)) + return -ENOENT; + + lock = list_entry(arg->list->next, struct ldlm_lock, l_cp_ast); + + /* It's possible to receive a completion AST before we've set + * the l_completion_ast pointer: either because the AST arrived + * before the reply, or simply because there's a small race + * window between receiving the reply and finishing the local + * enqueue. (bug 842) + * + * This can't happen with the blocking_ast, however, because we + * will never call the local blocking_ast until we drop our + * reader/writer reference, which we won't do until we get the + * reply and finish enqueueing. 
*/ + + /* nobody should touch l_cp_ast */ + lock_res_and_lock(lock); + list_del_init(&lock->l_cp_ast); + LASSERT(lock->l_flags & LDLM_FL_CP_REQD); + /* save l_completion_ast since it can be changed by + * mds_intent_policy(), see bug 14225 */ + completion_callback = lock->l_completion_ast; + lock->l_flags &= ~LDLM_FL_CP_REQD; + unlock_res_and_lock(lock); + + if (completion_callback != NULL) + rc = completion_callback(lock, 0, (void *)arg); + LDLM_LOCK_RELEASE(lock); + + return rc; +} + +/** + * Process a call to revocation AST callback for a lock in ast_work list + */ +static int +ldlm_work_revoke_ast_lock(struct ptlrpc_request_set *rqset, void *opaq) +{ + struct ldlm_cb_set_arg *arg = opaq; + struct ldlm_lock_desc desc; + int rc; + struct ldlm_lock *lock; + + if (list_empty(arg->list)) + return -ENOENT; + + lock = list_entry(arg->list->next, struct ldlm_lock, l_rk_ast); + list_del_init(&lock->l_rk_ast); + + /* the desc just pretend to exclusive */ + ldlm_lock2desc(lock, &desc); + desc.l_req_mode = LCK_EX; + desc.l_granted_mode = 0; + + rc = lock->l_blocking_ast(lock, &desc, (void *)arg, LDLM_CB_BLOCKING); + LDLM_LOCK_RELEASE(lock); + + return rc; +} + +/** + * Process a call to glimpse AST callback for a lock in ast_work list + */ +int ldlm_work_gl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq) +{ + struct ldlm_cb_set_arg *arg = opaq; + struct ldlm_glimpse_work *gl_work; + struct ldlm_lock *lock; + int rc = 0; + + if (list_empty(arg->list)) + return -ENOENT; + + gl_work = list_entry(arg->list->next, struct ldlm_glimpse_work, + gl_list); + list_del_init(&gl_work->gl_list); + + lock = gl_work->gl_lock; + + /* transfer the glimpse descriptor to ldlm_cb_set_arg */ + arg->gl_desc = gl_work->gl_desc; + + /* invoke the actual glimpse callback */ + if (lock->l_glimpse_ast(lock, (void *)arg) == 0) + rc = 1; + + LDLM_LOCK_RELEASE(lock); + + if ((gl_work->gl_flags & LDLM_GL_WORK_NOFREE) == 0) + OBD_FREE_PTR(gl_work); + + return rc; +} + +/** + * Process list of locks in need of ASTs being sent. + * + * Used on server to send multiple ASTs together instead of sending one by + * one. + */ +int ldlm_run_ast_work(struct ldlm_namespace *ns, struct list_head *rpc_list, + enum ldlm_desc_ast_t ast_type) +{ + struct ldlm_cb_set_arg *arg; + set_producer_func work_ast_lock; + int rc; + + if (list_empty(rpc_list)) + return 0; + + OBD_ALLOC_PTR(arg); + if (arg == NULL) + return -ENOMEM; + + atomic_set(&arg->restart, 0); + arg->list = rpc_list; + + switch (ast_type) { + case LDLM_WORK_BL_AST: + arg->type = LDLM_BL_CALLBACK; + work_ast_lock = ldlm_work_bl_ast_lock; + break; + case LDLM_WORK_CP_AST: + arg->type = LDLM_CP_CALLBACK; + work_ast_lock = ldlm_work_cp_ast_lock; + break; + case LDLM_WORK_REVOKE_AST: + arg->type = LDLM_BL_CALLBACK; + work_ast_lock = ldlm_work_revoke_ast_lock; + break; + case LDLM_WORK_GL_AST: + arg->type = LDLM_GL_CALLBACK; + work_ast_lock = ldlm_work_gl_ast_lock; + break; + default: + LBUG(); + } + + /* We create a ptlrpc request set with flow control extension. + * This request set will use the work_ast_lock function to produce new + * requests and will send a new request each time one completes in order + * to keep the number of requests in flight to ns_max_parallel_ast */ + arg->set = ptlrpc_prep_fcset(ns->ns_max_parallel_ast ? : UINT_MAX, + work_ast_lock, arg); + if (arg->set == NULL) { + rc = -ENOMEM; + goto out; + } + + ptlrpc_set_wait(arg->set); + ptlrpc_set_destroy(arg->set); + + rc = atomic_read(&arg->restart) ? 
-ERESTART : 0; + goto out; +out: + OBD_FREE_PTR(arg); + return rc; +} + +static int reprocess_one_queue(struct ldlm_resource *res, void *closure) +{ + ldlm_reprocess_all(res); + return LDLM_ITER_CONTINUE; +} + +static int ldlm_reprocess_res(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *arg) +{ + struct ldlm_resource *res = cfs_hash_object(hs, hnode); + int rc; + + rc = reprocess_one_queue(res, arg); + + return rc == LDLM_ITER_STOP; +} + +/** + * Iterate through all resources on a namespace attempting to grant waiting + * locks. + */ +void ldlm_reprocess_all_ns(struct ldlm_namespace *ns) +{ + if (ns != NULL) { + cfs_hash_for_each_nolock(ns->ns_rs_hash, + ldlm_reprocess_res, NULL); + } +} +EXPORT_SYMBOL(ldlm_reprocess_all_ns); + +/** + * Try to grant all waiting locks on a resource. + * + * Calls ldlm_reprocess_queue on converting and waiting queues. + * + * Typically called after some resource locks are cancelled to see + * if anything could be granted as a result of the cancellation. + */ +void ldlm_reprocess_all(struct ldlm_resource *res) +{ + LIST_HEAD(rpc_list); + + if (!ns_is_client(ldlm_res_to_ns(res))) { + CERROR("This is client-side-only module, cannot handle LDLM_NAMESPACE_SERVER resource type lock.\n"); + LBUG(); + } +} + +/** + * Helper function to call blocking AST for LDLM lock \a lock in a + * "cancelling" mode. + */ +void ldlm_cancel_callback(struct ldlm_lock *lock) +{ + check_res_locked(lock->l_resource); + if (!(lock->l_flags & LDLM_FL_CANCEL)) { + lock->l_flags |= LDLM_FL_CANCEL; + if (lock->l_blocking_ast) { + unlock_res_and_lock(lock); + lock->l_blocking_ast(lock, NULL, lock->l_ast_data, + LDLM_CB_CANCELING); + lock_res_and_lock(lock); + } else { + LDLM_DEBUG(lock, "no blocking ast"); + } + } + lock->l_flags |= LDLM_FL_BL_DONE; +} + +/** + * Remove skiplist-enabled LDLM lock \a req from granted list + */ +void ldlm_unlink_lock_skiplist(struct ldlm_lock *req) +{ + if (req->l_resource->lr_type != LDLM_PLAIN && + req->l_resource->lr_type != LDLM_IBITS) + return; + + list_del_init(&req->l_sl_policy); + list_del_init(&req->l_sl_mode); +} + +/** + * Attempts to cancel LDLM lock \a lock that has no reader/writer references. + */ +void ldlm_lock_cancel(struct ldlm_lock *lock) +{ + struct ldlm_resource *res; + struct ldlm_namespace *ns; + + lock_res_and_lock(lock); + + res = lock->l_resource; + ns = ldlm_res_to_ns(res); + + /* Please do not, no matter how tempting, remove this LBUG without + * talking to me first. -phik */ + if (lock->l_readers || lock->l_writers) { + LDLM_ERROR(lock, "lock still has references"); + LBUG(); + } + + if (lock->l_flags & LDLM_FL_WAITED) + ldlm_del_waiting_lock(lock); + + /* Releases cancel callback. */ + ldlm_cancel_callback(lock); + + /* Yes, second time, just in case it was added again while we were + * running with no res lock in ldlm_cancel_callback */ + if (lock->l_flags & LDLM_FL_WAITED) + ldlm_del_waiting_lock(lock); + + ldlm_resource_unlink_lock(lock); + ldlm_lock_destroy_nolock(lock); + + if (lock->l_granted_mode == lock->l_req_mode) + ldlm_pool_del(&ns->ns_pool, lock); + + /* Make sure we will not be called again for same lock what is possible + * if not to zero out lock->l_granted_mode */ + lock->l_granted_mode = LCK_MINMODE; + unlock_res_and_lock(lock); +} +EXPORT_SYMBOL(ldlm_lock_cancel); + +/** + * Set opaque data into the lock that only makes sense to upper layer. 
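+ *
+ * [For illustration only, not part of the Lustre sources: the call only
+ * succeeds if no different data has been attached yet, so a hypothetical
+ * caller would check the return value, e.g.
+ *
+ *	if (ldlm_lock_set_data(lockh, inode) != 0)
+ *		... some other user already attached different ast data ...
+ * ]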
+ */ +int ldlm_lock_set_data(struct lustre_handle *lockh, void *data) +{ + struct ldlm_lock *lock = ldlm_handle2lock(lockh); + int rc = -EINVAL; + + if (lock) { + if (lock->l_ast_data == NULL) + lock->l_ast_data = data; + if (lock->l_ast_data == data) + rc = 0; + LDLM_LOCK_PUT(lock); + } + return rc; +} +EXPORT_SYMBOL(ldlm_lock_set_data); + +struct export_cl_data { + struct obd_export *ecl_exp; + int ecl_loop; +}; + +/** + * Iterator function for ldlm_cancel_locks_for_export. + * Cancels passed locks. + */ +int ldlm_cancel_locks_for_export_cb(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *data) + +{ + struct export_cl_data *ecl = (struct export_cl_data *)data; + struct obd_export *exp = ecl->ecl_exp; + struct ldlm_lock *lock = cfs_hash_object(hs, hnode); + struct ldlm_resource *res; + + res = ldlm_resource_getref(lock->l_resource); + LDLM_LOCK_GET(lock); + + LDLM_DEBUG(lock, "export %p", exp); + ldlm_res_lvbo_update(res, NULL, 1); + ldlm_lock_cancel(lock); + ldlm_reprocess_all(res); + ldlm_resource_putref(res); + LDLM_LOCK_RELEASE(lock); + + ecl->ecl_loop++; + if ((ecl->ecl_loop & -ecl->ecl_loop) == ecl->ecl_loop) { + CDEBUG(D_INFO, + "Cancel lock %p for export %p (loop %d), still have %d locks left on hash table.\n", + lock, exp, ecl->ecl_loop, + atomic_read(&hs->hs_count)); + } + + return 0; +} + +/** + * Cancel all locks for given export. + * + * Typically called on client disconnection/eviction + */ +void ldlm_cancel_locks_for_export(struct obd_export *exp) +{ + struct export_cl_data ecl = { + .ecl_exp = exp, + .ecl_loop = 0, + }; + + cfs_hash_for_each_empty(exp->exp_lock_hash, + ldlm_cancel_locks_for_export_cb, &ecl); +} + +/** + * Downgrade an exclusive lock. + * + * A fast variant of ldlm_lock_convert for conversion of exclusive + * locks. The conversion is always successful. + * Used by Commit on Sharing (COS) code. + * + * \param lock A lock to convert + * \param new_mode new lock mode + */ +void ldlm_lock_downgrade(struct ldlm_lock *lock, int new_mode) +{ + LASSERT(lock->l_granted_mode & (LCK_PW | LCK_EX)); + LASSERT(new_mode == LCK_COS); + + lock_res_and_lock(lock); + ldlm_resource_unlink_lock(lock); + /* + * Remove the lock from pool as it will be added again in + * ldlm_grant_lock() called below. + */ + ldlm_pool_del(&ldlm_lock_to_ns(lock)->ns_pool, lock); + + lock->l_req_mode = new_mode; + ldlm_grant_lock(lock, NULL); + unlock_res_and_lock(lock); + ldlm_reprocess_all(lock->l_resource); +} +EXPORT_SYMBOL(ldlm_lock_downgrade); + +/** + * Attempt to convert already granted lock to a different mode. + * + * While lock conversion is not currently used, future client-side + * optimizations could take advantage of it to avoid discarding cached + * pages on a file. + */ +struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, int new_mode, + __u32 *flags) +{ + LIST_HEAD(rpc_list); + struct ldlm_resource *res; + struct ldlm_namespace *ns; + int granted = 0; + struct ldlm_interval *node; + + /* Just return if mode is unchanged. */ + if (new_mode == lock->l_granted_mode) { + *flags |= LDLM_FL_BLOCK_GRANTED; + return lock->l_resource; + } + + /* I can't check the type of lock here because the bitlock of lock + * is not held here, so do the allocation blindly. 
-jay */ + OBD_SLAB_ALLOC_PTR_GFP(node, ldlm_interval_slab, GFP_NOFS); + if (node == NULL) + /* Actually, this causes EDEADLOCK to be returned */ + return NULL; + + LASSERTF((new_mode == LCK_PW && lock->l_granted_mode == LCK_PR), + "new_mode %u, granted %u\n", new_mode, lock->l_granted_mode); + + lock_res_and_lock(lock); + + res = lock->l_resource; + ns = ldlm_res_to_ns(res); + + lock->l_req_mode = new_mode; + if (res->lr_type == LDLM_PLAIN || res->lr_type == LDLM_IBITS) { + ldlm_resource_unlink_lock(lock); + } else { + ldlm_resource_unlink_lock(lock); + if (res->lr_type == LDLM_EXTENT) { + /* FIXME: ugly code, I have to attach the lock to a + * interval node again since perhaps it will be granted + * soon */ + INIT_LIST_HEAD(&node->li_group); + ldlm_interval_attach(node, lock); + node = NULL; + } + } + + /* + * Remove old lock from the pool before adding the lock with new + * mode below in ->policy() + */ + ldlm_pool_del(&ns->ns_pool, lock); + + /* If this is a local resource, put it on the appropriate list. */ + if (ns_is_client(ldlm_res_to_ns(res))) { + if (*flags & (LDLM_FL_BLOCK_CONV | LDLM_FL_BLOCK_GRANTED)) { + ldlm_resource_add_lock(res, &res->lr_converting, lock); + } else { + /* This should never happen, because of the way the + * server handles conversions. */ + LDLM_ERROR(lock, "Erroneous flags %x on local lock\n", + *flags); + LBUG(); + + ldlm_grant_lock(lock, &rpc_list); + granted = 1; + /* FIXME: completion handling not with lr_lock held ! */ + if (lock->l_completion_ast) + lock->l_completion_ast(lock, 0, NULL); + } + } else { + CERROR("This is client-side-only module, cannot handle LDLM_NAMESPACE_SERVER resource type lock.\n"); + LBUG(); + } + unlock_res_and_lock(lock); + + if (granted) + ldlm_run_ast_work(ns, &rpc_list, LDLM_WORK_CP_AST); + if (node) + OBD_SLAB_FREE(node, ldlm_interval_slab, sizeof(*node)); + return res; +} +EXPORT_SYMBOL(ldlm_lock_convert); + +/** + * Print lock with lock handle \a lockh description into debug log. + * + * Used when printing all locks on a resource for debug purposes. + */ +void ldlm_lock_dump_handle(int level, struct lustre_handle *lockh) +{ + struct ldlm_lock *lock; + + if (!((libcfs_debug | D_ERROR) & level)) + return; + + lock = ldlm_handle2lock(lockh); + if (lock == NULL) + return; + + LDLM_DEBUG_LIMIT(level, lock, "###"); + + LDLM_LOCK_PUT(lock); +} +EXPORT_SYMBOL(ldlm_lock_dump_handle); + +/** + * Print lock information with custom message into debug log. + * Helper function. + */ +void _ldlm_lock_debug(struct ldlm_lock *lock, + struct libcfs_debug_msg_data *msgdata, + const char *fmt, ...) +{ + va_list args; + struct obd_export *exp = lock->l_export; + struct ldlm_resource *resource = lock->l_resource; + char *nid = "local"; + + va_start(args, fmt); + + if (exp && exp->exp_connection) { + nid = libcfs_nid2str(exp->exp_connection->c_peer.nid); + } else if (exp && exp->exp_obd != NULL) { + struct obd_import *imp = exp->exp_obd->u.cli.cl_import; + + nid = libcfs_nid2str(imp->imp_connection->c_peer.nid); + } + + if (resource == NULL) { + libcfs_debug_vmsg2(msgdata, fmt, args, + " ns: \?\? lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s res: \?\? rrc=\?\? type: \?\?\? flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lu lvb_type: %d\n", + lock, + lock->l_handle.h_cookie, atomic_read(&lock->l_refc), + lock->l_readers, lock->l_writers, + ldlm_lockname[lock->l_granted_mode], + ldlm_lockname[lock->l_req_mode], + lock->l_flags, nid, lock->l_remote_handle.cookie, + exp ? 
atomic_read(&exp->exp_refcount) : -99, + lock->l_pid, lock->l_callback_timeout, lock->l_lvb_type); + va_end(args); + return; + } + + switch (resource->lr_type) { + case LDLM_EXTENT: + libcfs_debug_vmsg2(msgdata, fmt, args, + " ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " rrc: %d type: %s [%llu->%llu] (req %llu->%llu) flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lu lvb_type: %d\n", + ldlm_lock_to_ns_name(lock), lock, + lock->l_handle.h_cookie, atomic_read(&lock->l_refc), + lock->l_readers, lock->l_writers, + ldlm_lockname[lock->l_granted_mode], + ldlm_lockname[lock->l_req_mode], + PLDLMRES(resource), + atomic_read(&resource->lr_refcount), + ldlm_typename[resource->lr_type], + lock->l_policy_data.l_extent.start, + lock->l_policy_data.l_extent.end, + lock->l_req_extent.start, lock->l_req_extent.end, + lock->l_flags, nid, lock->l_remote_handle.cookie, + exp ? atomic_read(&exp->exp_refcount) : -99, + lock->l_pid, lock->l_callback_timeout, + lock->l_lvb_type); + break; + + case LDLM_FLOCK: + libcfs_debug_vmsg2(msgdata, fmt, args, + " ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " rrc: %d type: %s pid: %d [%llu->%llu] flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lu\n", + ldlm_lock_to_ns_name(lock), lock, + lock->l_handle.h_cookie, atomic_read(&lock->l_refc), + lock->l_readers, lock->l_writers, + ldlm_lockname[lock->l_granted_mode], + ldlm_lockname[lock->l_req_mode], + PLDLMRES(resource), + atomic_read(&resource->lr_refcount), + ldlm_typename[resource->lr_type], + lock->l_policy_data.l_flock.pid, + lock->l_policy_data.l_flock.start, + lock->l_policy_data.l_flock.end, + lock->l_flags, nid, lock->l_remote_handle.cookie, + exp ? atomic_read(&exp->exp_refcount) : -99, + lock->l_pid, lock->l_callback_timeout); + break; + + case LDLM_IBITS: + libcfs_debug_vmsg2(msgdata, fmt, args, + " ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " bits %#llx rrc: %d type: %s flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lu lvb_type: %d\n", + ldlm_lock_to_ns_name(lock), + lock, lock->l_handle.h_cookie, + atomic_read(&lock->l_refc), + lock->l_readers, lock->l_writers, + ldlm_lockname[lock->l_granted_mode], + ldlm_lockname[lock->l_req_mode], + PLDLMRES(resource), + lock->l_policy_data.l_inodebits.bits, + atomic_read(&resource->lr_refcount), + ldlm_typename[resource->lr_type], + lock->l_flags, nid, lock->l_remote_handle.cookie, + exp ? atomic_read(&exp->exp_refcount) : -99, + lock->l_pid, lock->l_callback_timeout, + lock->l_lvb_type); + break; + + default: + libcfs_debug_vmsg2(msgdata, fmt, args, + " ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " rrc: %d type: %s flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lu lvb_type: %d\n", + ldlm_lock_to_ns_name(lock), + lock, lock->l_handle.h_cookie, + atomic_read(&lock->l_refc), + lock->l_readers, lock->l_writers, + ldlm_lockname[lock->l_granted_mode], + ldlm_lockname[lock->l_req_mode], + PLDLMRES(resource), + atomic_read(&resource->lr_refcount), + ldlm_typename[resource->lr_type], + lock->l_flags, nid, lock->l_remote_handle.cookie, + exp ? 
atomic_read(&exp->exp_refcount) : -99, + lock->l_pid, lock->l_callback_timeout, + lock->l_lvb_type); + break; + } + va_end(args); +} +EXPORT_SYMBOL(_ldlm_lock_debug); diff --git a/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c b/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c new file mode 100644 index 000000000..08a91f5d9 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c @@ -0,0 +1,1191 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ldlm/ldlm_lockd.c + * + * Author: Peter Braam + * Author: Phil Schwan + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include "../../include/linux/libcfs/libcfs.h" +#include "../include/lustre_dlm.h" +#include "../include/obd_class.h" +#include +#include "ldlm_internal.h" + +static int ldlm_num_threads; +module_param(ldlm_num_threads, int, 0444); +MODULE_PARM_DESC(ldlm_num_threads, "number of DLM service threads to start"); + +static char *ldlm_cpts; +module_param(ldlm_cpts, charp, 0444); +MODULE_PARM_DESC(ldlm_cpts, "CPU partitions ldlm threads should run on"); + +static struct mutex ldlm_ref_mutex; +static int ldlm_refcount; + +struct ldlm_cb_async_args { + struct ldlm_cb_set_arg *ca_set_arg; + struct ldlm_lock *ca_lock; +}; + +/* LDLM state */ + +static struct ldlm_state *ldlm_state; + +inline unsigned long round_timeout(unsigned long timeout) +{ + return cfs_time_seconds((int)cfs_duration_sec(cfs_time_sub(timeout, 0)) + 1); +} + +/* timeout for initial callback (AST) reply (bz10399) */ +static inline unsigned int ldlm_get_rq_timeout(void) +{ + /* Non-AT value */ + unsigned int timeout = min(ldlm_timeout, obd_timeout / 3); + + return timeout < 1 ? 1 : timeout; +} + +#define ELT_STOPPED 0 +#define ELT_READY 1 +#define ELT_TERMINATE 2 + +struct ldlm_bl_pool { + spinlock_t blp_lock; + + /* + * blp_prio_list is used for callbacks that should be handled + * as a priority. It is used for LDLM_FL_DISCARD_DATA requests. + * see bug 13843 + */ + struct list_head blp_prio_list; + + /* + * blp_list is used for all other callbacks which are likely + * to take longer to process. 
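+ *
+ * ldlm_bl_get_work() below drains blp_prio_list first but still takes
+ * an entry from blp_list at least once every blp_num_threads dequeues,
+ * so ordinary callbacks are not starved by a steady stream of
+ * LDLM_FL_DISCARD_DATA work.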
+ */ + struct list_head blp_list; + + wait_queue_head_t blp_waitq; + struct completion blp_comp; + atomic_t blp_num_threads; + atomic_t blp_busy_threads; + int blp_min_threads; + int blp_max_threads; +}; + +struct ldlm_bl_work_item { + struct list_head blwi_entry; + struct ldlm_namespace *blwi_ns; + struct ldlm_lock_desc blwi_ld; + struct ldlm_lock *blwi_lock; + struct list_head blwi_head; + int blwi_count; + struct completion blwi_comp; + ldlm_cancel_flags_t blwi_flags; + int blwi_mem_pressure; +}; + + +int ldlm_del_waiting_lock(struct ldlm_lock *lock) +{ + return 0; +} + +int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, int timeout) +{ + return 0; +} + + + +/** + * Callback handler for receiving incoming blocking ASTs. + * + * This can only happen on client side. + */ +void ldlm_handle_bl_callback(struct ldlm_namespace *ns, + struct ldlm_lock_desc *ld, struct ldlm_lock *lock) +{ + int do_ast; + + LDLM_DEBUG(lock, "client blocking AST callback handler"); + + lock_res_and_lock(lock); + lock->l_flags |= LDLM_FL_CBPENDING; + + if (lock->l_flags & LDLM_FL_CANCEL_ON_BLOCK) + lock->l_flags |= LDLM_FL_CANCEL; + + do_ast = !lock->l_readers && !lock->l_writers; + unlock_res_and_lock(lock); + + if (do_ast) { + CDEBUG(D_DLMTRACE, + "Lock %p already unused, calling callback (%p)\n", lock, + lock->l_blocking_ast); + if (lock->l_blocking_ast != NULL) + lock->l_blocking_ast(lock, ld, lock->l_ast_data, + LDLM_CB_BLOCKING); + } else { + CDEBUG(D_DLMTRACE, + "Lock %p is referenced, will be cancelled later\n", + lock); + } + + LDLM_DEBUG(lock, "client blocking callback handler END"); + LDLM_LOCK_RELEASE(lock); +} + +/** + * Callback handler for receiving incoming completion ASTs. + * + * This only can happen on client side. + */ +static void ldlm_handle_cp_callback(struct ptlrpc_request *req, + struct ldlm_namespace *ns, + struct ldlm_request *dlm_req, + struct ldlm_lock *lock) +{ + int lvb_len; + LIST_HEAD(ast_list); + int rc = 0; + + LDLM_DEBUG(lock, "client completion callback handler START"); + + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE)) { + int to = cfs_time_seconds(1); + + while (to > 0) { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(to); + if (lock->l_granted_mode == lock->l_req_mode || + lock->l_flags & LDLM_FL_DESTROYED) + break; + } + } + + lvb_len = req_capsule_get_size(&req->rq_pill, &RMF_DLM_LVB, RCL_CLIENT); + if (lvb_len < 0) { + LDLM_ERROR(lock, "Fail to get lvb_len, rc = %d", lvb_len); + rc = lvb_len; + goto out; + } else if (lvb_len > 0) { + if (lock->l_lvb_len > 0) { + /* for extent lock, lvb contains ost_lvb{}. 
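+ * (ost_lvb carries the OST object's size/blocks/timestamps, so the
+ * client can refresh those attributes from the reply instead of
+ * issuing a separate glimpse)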
*/ + LASSERT(lock->l_lvb_data != NULL); + + if (unlikely(lock->l_lvb_len < lvb_len)) { + LDLM_ERROR(lock, "Replied LVB is larger than expectation, expected = %d, replied = %d", + lock->l_lvb_len, lvb_len); + rc = -EINVAL; + goto out; + } + } else if (ldlm_has_layout(lock)) { /* for layout lock, lvb has + * variable length */ + void *lvb_data; + + OBD_ALLOC(lvb_data, lvb_len); + if (lvb_data == NULL) { + LDLM_ERROR(lock, "No memory: %d.\n", lvb_len); + rc = -ENOMEM; + goto out; + } + + lock_res_and_lock(lock); + LASSERT(lock->l_lvb_data == NULL); + lock->l_lvb_type = LVB_T_LAYOUT; + lock->l_lvb_data = lvb_data; + lock->l_lvb_len = lvb_len; + unlock_res_and_lock(lock); + } + } + + lock_res_and_lock(lock); + if ((lock->l_flags & LDLM_FL_DESTROYED) || + lock->l_granted_mode == lock->l_req_mode) { + /* bug 11300: the lock has already been granted */ + unlock_res_and_lock(lock); + LDLM_DEBUG(lock, "Double grant race happened"); + rc = 0; + goto out; + } + + /* If we receive the completion AST before the actual enqueue returned, + * then we might need to switch lock modes, resources, or extents. */ + if (dlm_req->lock_desc.l_granted_mode != lock->l_req_mode) { + lock->l_req_mode = dlm_req->lock_desc.l_granted_mode; + LDLM_DEBUG(lock, "completion AST, new lock mode"); + } + + if (lock->l_resource->lr_type != LDLM_PLAIN) { + ldlm_convert_policy_to_local(req->rq_export, + dlm_req->lock_desc.l_resource.lr_type, + &dlm_req->lock_desc.l_policy_data, + &lock->l_policy_data); + LDLM_DEBUG(lock, "completion AST, new policy data"); + } + + ldlm_resource_unlink_lock(lock); + if (memcmp(&dlm_req->lock_desc.l_resource.lr_name, + &lock->l_resource->lr_name, + sizeof(lock->l_resource->lr_name)) != 0) { + unlock_res_and_lock(lock); + rc = ldlm_lock_change_resource(ns, lock, + &dlm_req->lock_desc.l_resource.lr_name); + if (rc < 0) { + LDLM_ERROR(lock, "Failed to allocate resource"); + goto out; + } + LDLM_DEBUG(lock, "completion AST, new resource"); + CERROR("change resource!\n"); + lock_res_and_lock(lock); + } + + if (dlm_req->lock_flags & LDLM_FL_AST_SENT) { + /* BL_AST locks are not needed in LRU. + * Let ldlm_cancel_lru() be fast. */ + ldlm_lock_remove_from_lru(lock); + lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_BL_AST; + LDLM_DEBUG(lock, "completion AST includes blocking AST"); + } + + if (lock->l_lvb_len > 0) { + rc = ldlm_fill_lvb(lock, &req->rq_pill, RCL_CLIENT, + lock->l_lvb_data, lvb_len); + if (rc < 0) { + unlock_res_and_lock(lock); + goto out; + } + } + + ldlm_grant_lock(lock, &ast_list); + unlock_res_and_lock(lock); + + LDLM_DEBUG(lock, "callback handler finished, about to run_ast_work"); + + /* Let Enqueue to call osc_lock_upcall() and initialize + * l_ast_data */ + OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_ENQ_RACE, 2); + + ldlm_run_ast_work(ns, &ast_list, LDLM_WORK_CP_AST); + + LDLM_DEBUG_NOLOCK("client completion callback handler END (lock %p)", + lock); + goto out; + +out: + if (rc < 0) { + lock_res_and_lock(lock); + lock->l_flags |= LDLM_FL_FAILED; + unlock_res_and_lock(lock); + wake_up(&lock->l_waitq); + } + LDLM_LOCK_RELEASE(lock); +} + +/** + * Callback handler for receiving incoming glimpse ASTs. + * + * This only can happen on client side. After handling the glimpse AST + * we also consider dropping the lock here if it is unused locally for a + * long time. 
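+ *
+ * "Unused for a long time" concretely means, per the body below: the
+ * lock is granted in PW mode, has no readers or writers left, and has
+ * not been used for roughly 10 seconds. Such a lock is handed to the
+ * blocking threads via ldlm_bl_to_thread_lock(), with a direct
+ * ldlm_handle_bl_callback() call as the fallback if queuing fails.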
+ */ +static void ldlm_handle_gl_callback(struct ptlrpc_request *req, + struct ldlm_namespace *ns, + struct ldlm_request *dlm_req, + struct ldlm_lock *lock) +{ + int rc = -ENOSYS; + + LDLM_DEBUG(lock, "client glimpse AST callback handler"); + + if (lock->l_glimpse_ast != NULL) + rc = lock->l_glimpse_ast(lock, req); + + if (req->rq_repmsg != NULL) { + ptlrpc_reply(req); + } else { + req->rq_status = rc; + ptlrpc_error(req); + } + + lock_res_and_lock(lock); + if (lock->l_granted_mode == LCK_PW && + !lock->l_readers && !lock->l_writers && + cfs_time_after(cfs_time_current(), + cfs_time_add(lock->l_last_used, + cfs_time_seconds(10)))) { + unlock_res_and_lock(lock); + if (ldlm_bl_to_thread_lock(ns, NULL, lock)) + ldlm_handle_bl_callback(ns, NULL, lock); + + return; + } + unlock_res_and_lock(lock); + LDLM_LOCK_RELEASE(lock); +} + +static int ldlm_callback_reply(struct ptlrpc_request *req, int rc) +{ + if (req->rq_no_reply) + return 0; + + req->rq_status = rc; + if (!req->rq_packed_final) { + rc = lustre_pack_reply(req, 1, NULL, NULL); + if (rc) + return rc; + } + return ptlrpc_reply(req); +} + +static int __ldlm_bl_to_thread(struct ldlm_bl_work_item *blwi, + ldlm_cancel_flags_t cancel_flags) +{ + struct ldlm_bl_pool *blp = ldlm_state->ldlm_bl_pool; + + spin_lock(&blp->blp_lock); + if (blwi->blwi_lock && + blwi->blwi_lock->l_flags & LDLM_FL_DISCARD_DATA) { + /* add LDLM_FL_DISCARD_DATA requests to the priority list */ + list_add_tail(&blwi->blwi_entry, &blp->blp_prio_list); + } else { + /* other blocking callbacks are added to the regular list */ + list_add_tail(&blwi->blwi_entry, &blp->blp_list); + } + spin_unlock(&blp->blp_lock); + + wake_up(&blp->blp_waitq); + + /* can not check blwi->blwi_flags as blwi could be already freed in + LCF_ASYNC mode */ + if (!(cancel_flags & LCF_ASYNC)) + wait_for_completion(&blwi->blwi_comp); + + return 0; +} + +static inline void init_blwi(struct ldlm_bl_work_item *blwi, + struct ldlm_namespace *ns, + struct ldlm_lock_desc *ld, + struct list_head *cancels, int count, + struct ldlm_lock *lock, + ldlm_cancel_flags_t cancel_flags) +{ + init_completion(&blwi->blwi_comp); + INIT_LIST_HEAD(&blwi->blwi_head); + + if (memory_pressure_get()) + blwi->blwi_mem_pressure = 1; + + blwi->blwi_ns = ns; + blwi->blwi_flags = cancel_flags; + if (ld != NULL) + blwi->blwi_ld = *ld; + if (count) { + list_add(&blwi->blwi_head, cancels); + list_del_init(cancels); + blwi->blwi_count = count; + } else { + blwi->blwi_lock = lock; + } +} + +/** + * Queues a list of locks \a cancels containing \a count locks + * for later processing by a blocking thread. If \a count is zero, + * then the lock referenced as \a lock is queued instead. + * + * The blocking thread would then call ->l_blocking_ast callback in the lock. + * If list addition fails an error is returned and caller is supposed to + * call ->l_blocking_ast itself. 
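+ *
+ * Callers normally use the two thin wrappers below:
+ * ldlm_bl_to_thread_lock() queues a single lock asynchronously and
+ * ldlm_bl_to_thread_list() hands over a prepared cancel list, e.g.
+ * (illustrative call, "cancels"/"count" being whatever the caller
+ * collected):
+ *
+ *	rc = ldlm_bl_to_thread_list(ns, NULL, &cancels, count, LCF_ASYNC);
+ *
+ * In the synchronous (no LCF_ASYNC) case the work item is kept on the
+ * caller's stack so that no allocation is needed when this runs from
+ * the memory shrinker path.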
+ */ +static int ldlm_bl_to_thread(struct ldlm_namespace *ns, + struct ldlm_lock_desc *ld, + struct ldlm_lock *lock, + struct list_head *cancels, int count, + ldlm_cancel_flags_t cancel_flags) +{ + if (cancels && count == 0) + return 0; + + if (cancel_flags & LCF_ASYNC) { + struct ldlm_bl_work_item *blwi; + + OBD_ALLOC(blwi, sizeof(*blwi)); + if (blwi == NULL) + return -ENOMEM; + init_blwi(blwi, ns, ld, cancels, count, lock, cancel_flags); + + return __ldlm_bl_to_thread(blwi, cancel_flags); + } else { + /* if it is synchronous call do minimum mem alloc, as it could + * be triggered from kernel shrinker + */ + struct ldlm_bl_work_item blwi; + + memset(&blwi, 0, sizeof(blwi)); + init_blwi(&blwi, ns, ld, cancels, count, lock, cancel_flags); + return __ldlm_bl_to_thread(&blwi, cancel_flags); + } +} + + +int ldlm_bl_to_thread_lock(struct ldlm_namespace *ns, struct ldlm_lock_desc *ld, + struct ldlm_lock *lock) +{ + return ldlm_bl_to_thread(ns, ld, lock, NULL, 0, LCF_ASYNC); +} + +int ldlm_bl_to_thread_list(struct ldlm_namespace *ns, struct ldlm_lock_desc *ld, + struct list_head *cancels, int count, + ldlm_cancel_flags_t cancel_flags) +{ + return ldlm_bl_to_thread(ns, ld, NULL, cancels, count, cancel_flags); +} + +/* Setinfo coming from Server (eg MDT) to Client (eg MDC)! */ +static int ldlm_handle_setinfo(struct ptlrpc_request *req) +{ + struct obd_device *obd = req->rq_export->exp_obd; + char *key; + void *val; + int keylen, vallen; + int rc = -ENOSYS; + + DEBUG_REQ(D_HSM, req, "%s: handle setinfo\n", obd->obd_name); + + req_capsule_set(&req->rq_pill, &RQF_OBD_SET_INFO); + + key = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY); + if (key == NULL) { + DEBUG_REQ(D_IOCTL, req, "no set_info key"); + return -EFAULT; + } + keylen = req_capsule_get_size(&req->rq_pill, &RMF_SETINFO_KEY, + RCL_CLIENT); + val = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_VAL); + if (val == NULL) { + DEBUG_REQ(D_IOCTL, req, "no set_info val"); + return -EFAULT; + } + vallen = req_capsule_get_size(&req->rq_pill, &RMF_SETINFO_VAL, + RCL_CLIENT); + + /* We are responsible for swabbing contents of val */ + + if (KEY_IS(KEY_HSM_COPYTOOL_SEND)) + /* Pass it on to mdc (the "export" in this case) */ + rc = obd_set_info_async(req->rq_svc_thread->t_env, + req->rq_export, + sizeof(KEY_HSM_COPYTOOL_SEND), + KEY_HSM_COPYTOOL_SEND, + vallen, val, NULL); + else + DEBUG_REQ(D_WARNING, req, "ignoring unknown key %s", key); + + return rc; +} + +static inline void ldlm_callback_errmsg(struct ptlrpc_request *req, + const char *msg, int rc, + struct lustre_handle *handle) +{ + DEBUG_REQ((req->rq_no_reply || rc) ? D_WARNING : D_DLMTRACE, req, + "%s: [nid %s] [rc %d] [lock %#llx]", + msg, libcfs_id2str(req->rq_peer), rc, + handle ? 
handle->cookie : 0); + if (req->rq_no_reply) + CWARN("No reply was sent, maybe cause bug 21636.\n"); + else if (rc) + CWARN("Send reply failed, maybe cause bug 21636.\n"); +} + +static int ldlm_handle_qc_callback(struct ptlrpc_request *req) +{ + struct obd_quotactl *oqctl; + struct client_obd *cli = &req->rq_export->exp_obd->u.cli; + + oqctl = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL); + if (oqctl == NULL) { + CERROR("Can't unpack obd_quotactl\n"); + return -EPROTO; + } + + oqctl->qc_stat = ptlrpc_status_ntoh(oqctl->qc_stat); + + cli->cl_qchk_stat = oqctl->qc_stat; + return 0; +} + +/* TODO: handle requests in a similar way as MDT: see mdt_handle_common() */ +static int ldlm_callback_handler(struct ptlrpc_request *req) +{ + struct ldlm_namespace *ns; + struct ldlm_request *dlm_req; + struct ldlm_lock *lock; + int rc; + + /* Requests arrive in sender's byte order. The ptlrpc service + * handler has already checked and, if necessary, byte-swapped the + * incoming request message body, but I am responsible for the + * message buffers. */ + + /* do nothing for sec context finalize */ + if (lustre_msg_get_opc(req->rq_reqmsg) == SEC_CTX_FINI) + return 0; + + req_capsule_init(&req->rq_pill, req, RCL_SERVER); + + if (req->rq_export == NULL) { + rc = ldlm_callback_reply(req, -ENOTCONN); + ldlm_callback_errmsg(req, "Operate on unconnected server", + rc, NULL); + return 0; + } + + LASSERT(req->rq_export != NULL); + LASSERT(req->rq_export->exp_obd != NULL); + + switch (lustre_msg_get_opc(req->rq_reqmsg)) { + case LDLM_BL_CALLBACK: + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_BL_CALLBACK_NET)) + return 0; + break; + case LDLM_CP_CALLBACK: + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CP_CALLBACK_NET)) + return 0; + break; + case LDLM_GL_CALLBACK: + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_GL_CALLBACK_NET)) + return 0; + break; + case LDLM_SET_INFO: + rc = ldlm_handle_setinfo(req); + ldlm_callback_reply(req, rc); + return 0; + case OBD_QC_CALLBACK: + req_capsule_set(&req->rq_pill, &RQF_QC_CALLBACK); + if (OBD_FAIL_CHECK(OBD_FAIL_OBD_QC_CALLBACK_NET)) + return 0; + rc = ldlm_handle_qc_callback(req); + ldlm_callback_reply(req, rc); + return 0; + default: + CERROR("unknown opcode %u\n", + lustre_msg_get_opc(req->rq_reqmsg)); + ldlm_callback_reply(req, -EPROTO); + return 0; + } + + ns = req->rq_export->exp_obd->obd_namespace; + LASSERT(ns != NULL); + + req_capsule_set(&req->rq_pill, &RQF_LDLM_CALLBACK); + + dlm_req = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + if (dlm_req == NULL) { + rc = ldlm_callback_reply(req, -EPROTO); + ldlm_callback_errmsg(req, "Operate without parameter", rc, + NULL); + return 0; + } + + /* Force a known safe race, send a cancel to the server for a lock + * which the server has already started a blocking callback on. */ + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE) && + lustre_msg_get_opc(req->rq_reqmsg) == LDLM_BL_CALLBACK) { + rc = ldlm_cli_cancel(&dlm_req->lock_handle[0], 0); + if (rc < 0) + CERROR("ldlm_cli_cancel: %d\n", rc); + } + + lock = ldlm_handle2lock_long(&dlm_req->lock_handle[0], 0); + if (!lock) { + CDEBUG(D_DLMTRACE, "callback on lock %#llx - lock disappeared\n", + dlm_req->lock_handle[0].cookie); + rc = ldlm_callback_reply(req, -EINVAL); + ldlm_callback_errmsg(req, "Operate with invalid parameter", rc, + &dlm_req->lock_handle[0]); + return 0; + } + + if ((lock->l_flags & LDLM_FL_FAIL_LOC) && + lustre_msg_get_opc(req->rq_reqmsg) == LDLM_BL_CALLBACK) + OBD_RACE(OBD_FAIL_LDLM_CP_BL_RACE); + + /* Copy hints/flags (e.g. LDLM_FL_DISCARD_DATA) from AST. 
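+ * Only the bits in LDLM_AST_FLAGS are taken from the wire request and
+ * translated by ldlm_flags_from_wire(); unrelated flag bits sent by
+ * the peer are not copied onto the lock.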
*/ + lock_res_and_lock(lock); + lock->l_flags |= ldlm_flags_from_wire(dlm_req->lock_flags & + LDLM_AST_FLAGS); + if (lustre_msg_get_opc(req->rq_reqmsg) == LDLM_BL_CALLBACK) { + /* If somebody cancels lock and cache is already dropped, + * or lock is failed before cp_ast received on client, + * we can tell the server we have no lock. Otherwise, we + * should send cancel after dropping the cache. */ + if (((lock->l_flags & LDLM_FL_CANCELING) && + (lock->l_flags & LDLM_FL_BL_DONE)) || + (lock->l_flags & LDLM_FL_FAILED)) { + LDLM_DEBUG(lock, "callback on lock %#llx - lock disappeared\n", + dlm_req->lock_handle[0].cookie); + unlock_res_and_lock(lock); + LDLM_LOCK_RELEASE(lock); + rc = ldlm_callback_reply(req, -EINVAL); + ldlm_callback_errmsg(req, "Operate on stale lock", rc, + &dlm_req->lock_handle[0]); + return 0; + } + /* BL_AST locks are not needed in LRU. + * Let ldlm_cancel_lru() be fast. */ + ldlm_lock_remove_from_lru(lock); + lock->l_flags |= LDLM_FL_BL_AST; + } + unlock_res_and_lock(lock); + + /* We want the ost thread to get this reply so that it can respond + * to ost requests (write cache writeback) that might be triggered + * in the callback. + * + * But we'd also like to be able to indicate in the reply that we're + * cancelling right now, because it's unused, or have an intent result + * in the reply, so we might have to push the responsibility for sending + * the reply down into the AST handlers, alas. */ + + switch (lustre_msg_get_opc(req->rq_reqmsg)) { + case LDLM_BL_CALLBACK: + CDEBUG(D_INODE, "blocking ast\n"); + req_capsule_extend(&req->rq_pill, &RQF_LDLM_BL_CALLBACK); + if (!(lock->l_flags & LDLM_FL_CANCEL_ON_BLOCK)) { + rc = ldlm_callback_reply(req, 0); + if (req->rq_no_reply || rc) + ldlm_callback_errmsg(req, "Normal process", rc, + &dlm_req->lock_handle[0]); + } + if (ldlm_bl_to_thread_lock(ns, &dlm_req->lock_desc, lock)) + ldlm_handle_bl_callback(ns, &dlm_req->lock_desc, lock); + break; + case LDLM_CP_CALLBACK: + CDEBUG(D_INODE, "completion ast\n"); + req_capsule_extend(&req->rq_pill, &RQF_LDLM_CP_CALLBACK); + ldlm_callback_reply(req, 0); + ldlm_handle_cp_callback(req, ns, dlm_req, lock); + break; + case LDLM_GL_CALLBACK: + CDEBUG(D_INODE, "glimpse ast\n"); + req_capsule_extend(&req->rq_pill, &RQF_LDLM_GL_CALLBACK); + ldlm_handle_gl_callback(req, ns, dlm_req, lock); + break; + default: + LBUG(); /* checked above */ + } + + return 0; +} + + +static struct ldlm_bl_work_item *ldlm_bl_get_work(struct ldlm_bl_pool *blp) +{ + struct ldlm_bl_work_item *blwi = NULL; + static unsigned int num_bl; + + spin_lock(&blp->blp_lock); + /* process a request from the blp_list at least every blp_num_threads */ + if (!list_empty(&blp->blp_list) && + (list_empty(&blp->blp_prio_list) || num_bl == 0)) + blwi = list_entry(blp->blp_list.next, + struct ldlm_bl_work_item, blwi_entry); + else + if (!list_empty(&blp->blp_prio_list)) + blwi = list_entry(blp->blp_prio_list.next, + struct ldlm_bl_work_item, + blwi_entry); + + if (blwi) { + if (++num_bl >= atomic_read(&blp->blp_num_threads)) + num_bl = 0; + list_del(&blwi->blwi_entry); + } + spin_unlock(&blp->blp_lock); + + return blwi; +} + +/* This only contains temporary data until the thread starts */ +struct ldlm_bl_thread_data { + char bltd_name[CFS_CURPROC_COMM_MAX]; + struct ldlm_bl_pool *bltd_blp; + struct completion bltd_comp; + int bltd_num; +}; + +static int ldlm_bl_thread_main(void *arg); + +static int ldlm_bl_thread_start(struct ldlm_bl_pool *blp) +{ + struct ldlm_bl_thread_data bltd = { .bltd_blp = blp }; + struct task_struct *task; + + 
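+ /*
+ * bltd lives on this function's stack: ldlm_bl_thread_main() must copy
+ * everything it needs and signal bltd_comp before we return, which is
+ * why we block in wait_for_completion() below.
+ */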
init_completion(&bltd.bltd_comp); + bltd.bltd_num = atomic_read(&blp->blp_num_threads); + snprintf(bltd.bltd_name, sizeof(bltd.bltd_name), + "ldlm_bl_%02d", bltd.bltd_num); + task = kthread_run(ldlm_bl_thread_main, &bltd, "%s", bltd.bltd_name); + if (IS_ERR(task)) { + CERROR("cannot start LDLM thread ldlm_bl_%02d: rc %ld\n", + atomic_read(&blp->blp_num_threads), PTR_ERR(task)); + return PTR_ERR(task); + } + wait_for_completion(&bltd.bltd_comp); + + return 0; +} + +/** + * Main blocking requests processing thread. + * + * Callers put locks into its queue by calling ldlm_bl_to_thread. + * This thread in the end ends up doing actual call to ->l_blocking_ast + * for queued locks. + */ +static int ldlm_bl_thread_main(void *arg) +{ + struct ldlm_bl_pool *blp; + + { + struct ldlm_bl_thread_data *bltd = arg; + + blp = bltd->bltd_blp; + + atomic_inc(&blp->blp_num_threads); + atomic_inc(&blp->blp_busy_threads); + + complete(&bltd->bltd_comp); + /* cannot use bltd after this, it is only on caller's stack */ + } + + while (1) { + struct l_wait_info lwi = { 0 }; + struct ldlm_bl_work_item *blwi = NULL; + int busy; + + blwi = ldlm_bl_get_work(blp); + + if (blwi == NULL) { + atomic_dec(&blp->blp_busy_threads); + l_wait_event_exclusive(blp->blp_waitq, + (blwi = ldlm_bl_get_work(blp)) != NULL, + &lwi); + busy = atomic_inc_return(&blp->blp_busy_threads); + } else { + busy = atomic_read(&blp->blp_busy_threads); + } + + if (blwi->blwi_ns == NULL) + /* added by ldlm_cleanup() */ + break; + + /* Not fatal if racy and have a few too many threads */ + if (unlikely(busy < blp->blp_max_threads && + busy >= atomic_read(&blp->blp_num_threads) && + !blwi->blwi_mem_pressure)) + /* discard the return value, we tried */ + ldlm_bl_thread_start(blp); + + if (blwi->blwi_mem_pressure) + memory_pressure_set(); + + if (blwi->blwi_count) { + int count; + /* The special case when we cancel locks in LRU + * asynchronously, we pass the list of locks here. + * Thus locks are marked LDLM_FL_CANCELING, but NOT + * canceled locally yet. */ + count = ldlm_cli_cancel_list_local(&blwi->blwi_head, + blwi->blwi_count, + LCF_BL_AST); + ldlm_cli_cancel_list(&blwi->blwi_head, count, NULL, + blwi->blwi_flags); + } else { + ldlm_handle_bl_callback(blwi->blwi_ns, &blwi->blwi_ld, + blwi->blwi_lock); + } + if (blwi->blwi_mem_pressure) + memory_pressure_clr(); + + if (blwi->blwi_flags & LCF_ASYNC) + OBD_FREE(blwi, sizeof(*blwi)); + else + complete(&blwi->blwi_comp); + } + + atomic_dec(&blp->blp_busy_threads); + atomic_dec(&blp->blp_num_threads); + complete(&blp->blp_comp); + return 0; +} + + +static int ldlm_setup(void); +static int ldlm_cleanup(void); + +int ldlm_get_ref(void) +{ + int rc = 0; + + mutex_lock(&ldlm_ref_mutex); + if (++ldlm_refcount == 1) { + rc = ldlm_setup(); + if (rc) + ldlm_refcount--; + } + mutex_unlock(&ldlm_ref_mutex); + + return rc; +} +EXPORT_SYMBOL(ldlm_get_ref); + +void ldlm_put_ref(void) +{ + mutex_lock(&ldlm_ref_mutex); + if (ldlm_refcount == 1) { + int rc = ldlm_cleanup(); + + if (rc) + CERROR("ldlm_cleanup failed: %d\n", rc); + else + ldlm_refcount--; + } else { + ldlm_refcount--; + } + mutex_unlock(&ldlm_ref_mutex); +} +EXPORT_SYMBOL(ldlm_put_ref); + +/* + * Export handle<->lock hash operations. 
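+ *
+ * Each obd_export carries an exp_lock_hash keyed by the peer's lock
+ * handle cookie (l_remote_handle). The ops below are plugged into
+ * cfs_hash_create() in ldlm_init_export(), and the same table is
+ * walked when an export is torn down and all of its locks must be
+ * cancelled. A lookup by remote handle is then simply
+ * cfs_hash_lookup(exp->exp_lock_hash, &handle) (the generic libcfs
+ * helper, not shown in this file).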
+ */ +static unsigned +ldlm_export_lock_hash(struct cfs_hash *hs, const void *key, unsigned mask) +{ + return cfs_hash_u64_hash(((struct lustre_handle *)key)->cookie, mask); +} + +static void * +ldlm_export_lock_key(struct hlist_node *hnode) +{ + struct ldlm_lock *lock; + + lock = hlist_entry(hnode, struct ldlm_lock, l_exp_hash); + return &lock->l_remote_handle; +} + +static void +ldlm_export_lock_keycpy(struct hlist_node *hnode, void *key) +{ + struct ldlm_lock *lock; + + lock = hlist_entry(hnode, struct ldlm_lock, l_exp_hash); + lock->l_remote_handle = *(struct lustre_handle *)key; +} + +static int +ldlm_export_lock_keycmp(const void *key, struct hlist_node *hnode) +{ + return lustre_handle_equal(ldlm_export_lock_key(hnode), key); +} + +static void * +ldlm_export_lock_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct ldlm_lock, l_exp_hash); +} + +static void +ldlm_export_lock_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct ldlm_lock *lock; + + lock = hlist_entry(hnode, struct ldlm_lock, l_exp_hash); + LDLM_LOCK_GET(lock); +} + +static void +ldlm_export_lock_put(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct ldlm_lock *lock; + + lock = hlist_entry(hnode, struct ldlm_lock, l_exp_hash); + LDLM_LOCK_RELEASE(lock); +} + +static cfs_hash_ops_t ldlm_export_lock_ops = { + .hs_hash = ldlm_export_lock_hash, + .hs_key = ldlm_export_lock_key, + .hs_keycmp = ldlm_export_lock_keycmp, + .hs_keycpy = ldlm_export_lock_keycpy, + .hs_object = ldlm_export_lock_object, + .hs_get = ldlm_export_lock_get, + .hs_put = ldlm_export_lock_put, + .hs_put_locked = ldlm_export_lock_put, +}; + +int ldlm_init_export(struct obd_export *exp) +{ + int rc; + + exp->exp_lock_hash = + cfs_hash_create(obd_uuid2str(&exp->exp_client_uuid), + HASH_EXP_LOCK_CUR_BITS, + HASH_EXP_LOCK_MAX_BITS, + HASH_EXP_LOCK_BKT_BITS, 0, + CFS_HASH_MIN_THETA, CFS_HASH_MAX_THETA, + &ldlm_export_lock_ops, + CFS_HASH_DEFAULT | CFS_HASH_REHASH_KEY | + CFS_HASH_NBLK_CHANGE); + + if (!exp->exp_lock_hash) + return -ENOMEM; + + rc = ldlm_init_flock_export(exp); + if (rc) + goto err; + + return 0; +err: + ldlm_destroy_export(exp); + return rc; +} +EXPORT_SYMBOL(ldlm_init_export); + +void ldlm_destroy_export(struct obd_export *exp) +{ + cfs_hash_putref(exp->exp_lock_hash); + exp->exp_lock_hash = NULL; + + ldlm_destroy_flock_export(exp); +} +EXPORT_SYMBOL(ldlm_destroy_export); + +static int ldlm_setup(void) +{ + static struct ptlrpc_service_conf conf; + struct ldlm_bl_pool *blp = NULL; + int rc = 0; + int i; + + if (ldlm_state != NULL) + return -EALREADY; + + OBD_ALLOC(ldlm_state, sizeof(*ldlm_state)); + if (ldlm_state == NULL) + return -ENOMEM; + + rc = ldlm_proc_setup(); + if (rc != 0) + goto out; + + memset(&conf, 0, sizeof(conf)); + conf = (typeof(conf)) { + .psc_name = "ldlm_cbd", + .psc_watchdog_factor = 2, + .psc_buf = { + .bc_nbufs = LDLM_CLIENT_NBUFS, + .bc_buf_size = LDLM_BUFSIZE, + .bc_req_max_size = LDLM_MAXREQSIZE, + .bc_rep_max_size = LDLM_MAXREPSIZE, + .bc_req_portal = LDLM_CB_REQUEST_PORTAL, + .bc_rep_portal = LDLM_CB_REPLY_PORTAL, + }, + .psc_thr = { + .tc_thr_name = "ldlm_cb", + .tc_thr_factor = LDLM_THR_FACTOR, + .tc_nthrs_init = LDLM_NTHRS_INIT, + .tc_nthrs_base = LDLM_NTHRS_BASE, + .tc_nthrs_max = LDLM_NTHRS_MAX, + .tc_nthrs_user = ldlm_num_threads, + .tc_cpu_affinity = 1, + .tc_ctx_tags = LCT_MD_THREAD | LCT_DT_THREAD, + }, + .psc_cpt = { + .cc_pattern = ldlm_cpts, + }, + .psc_ops = { + .so_req_handler = ldlm_callback_handler, + }, + }; + ldlm_state->ldlm_cb_service = + 
ptlrpc_register_service(&conf, ldlm_svc_proc_dir); + if (IS_ERR(ldlm_state->ldlm_cb_service)) { + CERROR("failed to start service\n"); + rc = PTR_ERR(ldlm_state->ldlm_cb_service); + ldlm_state->ldlm_cb_service = NULL; + goto out; + } + + + OBD_ALLOC(blp, sizeof(*blp)); + if (blp == NULL) { + rc = -ENOMEM; + goto out; + } + ldlm_state->ldlm_bl_pool = blp; + + spin_lock_init(&blp->blp_lock); + INIT_LIST_HEAD(&blp->blp_list); + INIT_LIST_HEAD(&blp->blp_prio_list); + init_waitqueue_head(&blp->blp_waitq); + atomic_set(&blp->blp_num_threads, 0); + atomic_set(&blp->blp_busy_threads, 0); + + if (ldlm_num_threads == 0) { + blp->blp_min_threads = LDLM_NTHRS_INIT; + blp->blp_max_threads = LDLM_NTHRS_MAX; + } else { + blp->blp_min_threads = blp->blp_max_threads = + min_t(int, LDLM_NTHRS_MAX, max_t(int, LDLM_NTHRS_INIT, + ldlm_num_threads)); + } + + for (i = 0; i < blp->blp_min_threads; i++) { + rc = ldlm_bl_thread_start(blp); + if (rc < 0) + goto out; + } + + + rc = ldlm_pools_init(); + if (rc) { + CERROR("Failed to initialize LDLM pools: %d\n", rc); + goto out; + } + return 0; + + out: + ldlm_cleanup(); + return rc; +} + +static int ldlm_cleanup(void) +{ + if (!list_empty(ldlm_namespace_list(LDLM_NAMESPACE_SERVER)) || + !list_empty(ldlm_namespace_list(LDLM_NAMESPACE_CLIENT))) { + CERROR("ldlm still has namespaces; clean these up first.\n"); + ldlm_dump_all_namespaces(LDLM_NAMESPACE_SERVER, D_DLMTRACE); + ldlm_dump_all_namespaces(LDLM_NAMESPACE_CLIENT, D_DLMTRACE); + return -EBUSY; + } + + ldlm_pools_fini(); + + if (ldlm_state->ldlm_bl_pool != NULL) { + struct ldlm_bl_pool *blp = ldlm_state->ldlm_bl_pool; + + while (atomic_read(&blp->blp_num_threads) > 0) { + struct ldlm_bl_work_item blwi = { .blwi_ns = NULL }; + + init_completion(&blp->blp_comp); + + spin_lock(&blp->blp_lock); + list_add_tail(&blwi.blwi_entry, &blp->blp_list); + wake_up(&blp->blp_waitq); + spin_unlock(&blp->blp_lock); + + wait_for_completion(&blp->blp_comp); + } + + OBD_FREE(blp, sizeof(*blp)); + } + + if (ldlm_state->ldlm_cb_service != NULL) + ptlrpc_unregister_service(ldlm_state->ldlm_cb_service); + + ldlm_proc_cleanup(); + + + OBD_FREE(ldlm_state, sizeof(*ldlm_state)); + ldlm_state = NULL; + + return 0; +} + +int ldlm_init(void) +{ + mutex_init(&ldlm_ref_mutex); + mutex_init(ldlm_namespace_lock(LDLM_NAMESPACE_SERVER)); + mutex_init(ldlm_namespace_lock(LDLM_NAMESPACE_CLIENT)); + ldlm_resource_slab = kmem_cache_create("ldlm_resources", + sizeof(struct ldlm_resource), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (ldlm_resource_slab == NULL) + return -ENOMEM; + + ldlm_lock_slab = kmem_cache_create("ldlm_locks", + sizeof(struct ldlm_lock), 0, + SLAB_HWCACHE_ALIGN | SLAB_DESTROY_BY_RCU, NULL); + if (ldlm_lock_slab == NULL) { + kmem_cache_destroy(ldlm_resource_slab); + return -ENOMEM; + } + + ldlm_interval_slab = kmem_cache_create("interval_node", + sizeof(struct ldlm_interval), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (ldlm_interval_slab == NULL) { + kmem_cache_destroy(ldlm_resource_slab); + kmem_cache_destroy(ldlm_lock_slab); + return -ENOMEM; + } +#if LUSTRE_TRACKS_LOCK_EXP_REFS + class_export_dump_hook = ldlm_dump_export_locks; +#endif + return 0; +} + +void ldlm_exit(void) +{ + if (ldlm_refcount) + CERROR("ldlm_refcount is %d in ldlm_exit!\n", ldlm_refcount); + kmem_cache_destroy(ldlm_resource_slab); + /* ldlm_lock_put() use RCU to call ldlm_lock_free, so need call + * synchronize_rcu() to wait a grace period elapsed, so that + * ldlm_lock_free() get a chance to be called. 
*/ + synchronize_rcu(); + kmem_cache_destroy(ldlm_lock_slab); + kmem_cache_destroy(ldlm_interval_slab); +} diff --git a/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_plain.c b/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_plain.c new file mode 100644 index 000000000..a1fe2c161 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_plain.c @@ -0,0 +1,72 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ldlm/ldlm_plain.c + * + * Author: Peter Braam + * Author: Phil Schwan + */ + +/** + * This file contains implementation of PLAIN lock type. + * + * PLAIN locks are the simplest form of LDLM locking, and are used when + * there only needs to be a single lock on a resource. This avoids some + * of the complexity of EXTENT and IBITS lock types, but doesn't allow + * different "parts" of a resource to be locked concurrently. Example + * use cases for PLAIN locks include locking of MGS configuration logs + * and (as of Lustre 2.4) quota records. + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include "../include/lustre_dlm.h" +#include "../include/obd_support.h" +#include "../include/lustre_lib.h" + +#include "ldlm_internal.h" + + +void ldlm_plain_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy, + ldlm_policy_data_t *lpolicy) +{ + /* No policy for plain locks */ +} + +void ldlm_plain_policy_local_to_wire(const ldlm_policy_data_t *lpolicy, + ldlm_wire_policy_data_t *wpolicy) +{ + /* No policy for plain locks */ +} diff --git a/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_pool.c b/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_pool.c new file mode 100644 index 000000000..a9f4833e0 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_pool.c @@ -0,0 +1,1455 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ldlm/ldlm_pool.c + * + * Author: Yury Umanets + */ + +/* + * Idea of this code is rather simple. Each second, for each server namespace + * we have SLV - server lock volume which is calculated on current number of + * granted locks, grant speed for past period, etc - that is, locking load. + * This SLV number may be thought as a flow definition for simplicity. It is + * sent to clients with each occasion to let them know what is current load + * situation on the server. By default, at the beginning, SLV on server is + * set max value which is calculated as the following: allow to one client + * have all locks of limit ->pl_limit for 10h. + * + * Next, on clients, number of cached locks is not limited artificially in any + * way as it was before. Instead, client calculates CLV, that is, client lock + * volume for each lock and compares it with last SLV from the server. CLV is + * calculated as the number of locks in LRU * lock live time in seconds. If + * CLV > SLV - lock is canceled. + * + * Client has LVF, that is, lock volume factor which regulates how much + * sensitive client should be about last SLV from server. The higher LVF is the + * more locks will be canceled on client. Default value for it is 1. Setting LVF + * to 2 means that client will cancel locks 2 times faster. + * + * Locks on a client will be canceled more intensively in these cases: + * (1) if SLV is smaller, that is, load is higher on the server; + * (2) client has a lot of locks (the more locks are held by client, the bigger + * chances that some of them should be canceled); + * (3) client has old locks (taken some time ago); + * + * Thus, according to flow paradigm that we use for better understanding SLV, + * CLV is the volume of particle in flow described by SLV. According to this, + * if flow is getting thinner, more and more particles become outside of it and + * as particles are locks, they should be canceled. + * + * General idea of this belongs to Vitaly Fertman (vitaly@clusterfs.com). + * Andreas Dilger (adilger@clusterfs.com) proposed few nice ideas like using + * LVF and many cleanups. Flow definition to allow more easy understanding of + * the logic belongs to Nikita Danilov (nikita@clusterfs.com) as well as many + * cleanups and fixes. And design and implementation are done by Yury Umanets + * (umka@clusterfs.com). + * + * Glossary for terms used: + * + * pl_limit - Number of allowed locks in pool. 
Applies to server and client + * side (tunable); + * + * pl_granted - Number of granted locks (calculated); + * pl_grant_rate - Number of granted locks for last T (calculated); + * pl_cancel_rate - Number of canceled locks for last T (calculated); + * pl_grant_speed - Grant speed (GR - CR) for last T (calculated); + * pl_grant_plan - Planned number of granted locks for next T (calculated); + * pl_server_lock_volume - Current server lock volume (calculated); + * + * As it may be seen from list above, we have few possible tunables which may + * affect behavior much. They all may be modified via proc. However, they also + * give a possibility for constructing few pre-defined behavior policies. If + * none of predefines is suitable for a working pattern being used, new one may + * be "constructed" via proc tunables. + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include "../include/lustre_dlm.h" +#include "../include/cl_object.h" +#include "../include/obd_class.h" +#include "../include/obd_support.h" +#include "ldlm_internal.h" + + +/* + * 50 ldlm locks for 1MB of RAM. + */ +#define LDLM_POOL_HOST_L ((NUM_CACHEPAGES >> (20 - PAGE_CACHE_SHIFT)) * 50) + +/* + * Maximal possible grant step plan in %. + */ +#define LDLM_POOL_MAX_GSP (30) + +/* + * Minimal possible grant step plan in %. + */ +#define LDLM_POOL_MIN_GSP (1) + +/* + * This controls the speed of reaching LDLM_POOL_MAX_GSP + * with increasing thread period. + */ +#define LDLM_POOL_GSP_STEP_SHIFT (2) + +/* + * LDLM_POOL_GSP% of all locks is default GP. + */ +#define LDLM_POOL_GP(L) (((L) * LDLM_POOL_MAX_GSP) / 100) + +/* + * Max age for locks on clients. + */ +#define LDLM_POOL_MAX_AGE (36000) + +/* + * The granularity of SLV calculation. + */ +#define LDLM_POOL_SLV_SHIFT (10) + +static inline __u64 dru(__u64 val, __u32 shift, int round_up) +{ + return (val + (round_up ? (1 << shift) - 1 : 0)) >> shift; +} + +static inline __u64 ldlm_pool_slv_max(__u32 L) +{ + /* + * Allow to have all locks for 1 client for 10 hrs. + * Formula is the following: limit * 10h / 1 client. + */ + __u64 lim = (__u64)L * LDLM_POOL_MAX_AGE / 1; + return lim; +} + +static inline __u64 ldlm_pool_slv_min(__u32 L) +{ + return 1; +} + +enum { + LDLM_POOL_FIRST_STAT = 0, + LDLM_POOL_GRANTED_STAT = LDLM_POOL_FIRST_STAT, + LDLM_POOL_GRANT_STAT, + LDLM_POOL_CANCEL_STAT, + LDLM_POOL_GRANT_RATE_STAT, + LDLM_POOL_CANCEL_RATE_STAT, + LDLM_POOL_GRANT_PLAN_STAT, + LDLM_POOL_SLV_STAT, + LDLM_POOL_SHRINK_REQTD_STAT, + LDLM_POOL_SHRINK_FREED_STAT, + LDLM_POOL_RECALC_STAT, + LDLM_POOL_TIMING_STAT, + LDLM_POOL_LAST_STAT +}; + +static inline struct ldlm_namespace *ldlm_pl2ns(struct ldlm_pool *pl) +{ + return container_of(pl, struct ldlm_namespace, ns_pool); +} + +/** + * Calculates suggested grant_step in % of available locks for passed + * \a period. This is later used in grant_plan calculations. + */ +static inline int ldlm_pool_t2gsp(unsigned int t) +{ + /* + * This yields 1% grant step for anything below LDLM_POOL_GSP_STEP + * and up to 30% for anything higher than LDLM_POOL_GSP_STEP. + * + * How this will affect execution is the following: + * + * - for thread period 1s we will have grant_step 1% which good from + * pov of taking some load off from server and push it out to clients. + * This is like that because 1% for grant_step means that server will + * not allow clients to get lots of locks in short period of time and + * keep all old locks in their caches. 
Clients will always have to + * get some locks back if they want to take some new; + * + * - for thread period 10s (which is default) we will have 23% which + * means that clients will have enough of room to take some new locks + * without getting some back. All locks from this 23% which were not + * taken by clients in current period will contribute in SLV growing. + * SLV growing means more locks cached on clients until limit or grant + * plan is reached. + */ + return LDLM_POOL_MAX_GSP - + ((LDLM_POOL_MAX_GSP - LDLM_POOL_MIN_GSP) >> + (t >> LDLM_POOL_GSP_STEP_SHIFT)); +} + +/** + * Recalculates next grant limit on passed \a pl. + * + * \pre ->pl_lock is locked. + */ +static void ldlm_pool_recalc_grant_plan(struct ldlm_pool *pl) +{ + int granted, grant_step, limit; + + limit = ldlm_pool_get_limit(pl); + granted = atomic_read(&pl->pl_granted); + + grant_step = ldlm_pool_t2gsp(pl->pl_recalc_period); + grant_step = ((limit - granted) * grant_step) / 100; + pl->pl_grant_plan = granted + grant_step; + limit = (limit * 5) >> 2; + if (pl->pl_grant_plan > limit) + pl->pl_grant_plan = limit; +} + +/** + * Recalculates next SLV on passed \a pl. + * + * \pre ->pl_lock is locked. + */ +static void ldlm_pool_recalc_slv(struct ldlm_pool *pl) +{ + int granted; + int grant_plan; + int round_up; + __u64 slv; + __u64 slv_factor; + __u64 grant_usage; + __u32 limit; + + slv = pl->pl_server_lock_volume; + grant_plan = pl->pl_grant_plan; + limit = ldlm_pool_get_limit(pl); + granted = atomic_read(&pl->pl_granted); + round_up = granted < limit; + + grant_usage = max_t(int, limit - (granted - grant_plan), 1); + + /* + * Find out SLV change factor which is the ratio of grant usage + * from limit. SLV changes as fast as the ratio of grant plan + * consumption. The more locks from grant plan are not consumed + * by clients in last interval (idle time), the faster grows + * SLV. And the opposite, the more grant plan is over-consumed + * (load time) the faster drops SLV. + */ + slv_factor = grant_usage << LDLM_POOL_SLV_SHIFT; + do_div(slv_factor, limit); + slv = slv * slv_factor; + slv = dru(slv, LDLM_POOL_SLV_SHIFT, round_up); + + if (slv > ldlm_pool_slv_max(limit)) + slv = ldlm_pool_slv_max(limit); + else if (slv < ldlm_pool_slv_min(limit)) + slv = ldlm_pool_slv_min(limit); + + pl->pl_server_lock_volume = slv; +} + +/** + * Recalculates next stats on passed \a pl. + * + * \pre ->pl_lock is locked. + */ +static void ldlm_pool_recalc_stats(struct ldlm_pool *pl) +{ + int grant_plan = pl->pl_grant_plan; + __u64 slv = pl->pl_server_lock_volume; + int granted = atomic_read(&pl->pl_granted); + int grant_rate = atomic_read(&pl->pl_grant_rate); + int cancel_rate = atomic_read(&pl->pl_cancel_rate); + + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_SLV_STAT, + slv); + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANTED_STAT, + granted); + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANT_RATE_STAT, + grant_rate); + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANT_PLAN_STAT, + grant_plan); + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_CANCEL_RATE_STAT, + cancel_rate); +} + +/** + * Sets current SLV into obd accessible via ldlm_pl2ns(pl)->ns_obd. + */ +static void ldlm_srv_pool_push_slv(struct ldlm_pool *pl) +{ + struct obd_device *obd; + + /* + * Set new SLV in obd field for using it later without accessing the + * pool. This is required to avoid race between sending reply to client + * with new SLV and cleanup server stack in which we can't guarantee + * that namespace is still alive. 
We know only that obd is alive as + * long as valid export is alive. + */ + obd = ldlm_pl2ns(pl)->ns_obd; + LASSERT(obd != NULL); + write_lock(&obd->obd_pool_lock); + obd->obd_pool_slv = pl->pl_server_lock_volume; + write_unlock(&obd->obd_pool_lock); +} + +/** + * Recalculates all pool fields on passed \a pl. + * + * \pre ->pl_lock is not locked. + */ +static int ldlm_srv_pool_recalc(struct ldlm_pool *pl) +{ + time_t recalc_interval_sec; + + recalc_interval_sec = get_seconds() - pl->pl_recalc_time; + if (recalc_interval_sec < pl->pl_recalc_period) + return 0; + + spin_lock(&pl->pl_lock); + recalc_interval_sec = get_seconds() - pl->pl_recalc_time; + if (recalc_interval_sec < pl->pl_recalc_period) { + spin_unlock(&pl->pl_lock); + return 0; + } + /* + * Recalc SLV after last period. This should be done + * _before_ recalculating new grant plan. + */ + ldlm_pool_recalc_slv(pl); + + /* + * Make sure that pool informed obd of last SLV changes. + */ + ldlm_srv_pool_push_slv(pl); + + /* + * Update grant_plan for new period. + */ + ldlm_pool_recalc_grant_plan(pl); + + pl->pl_recalc_time = get_seconds(); + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_TIMING_STAT, + recalc_interval_sec); + spin_unlock(&pl->pl_lock); + return 0; +} + +/** + * This function is used on server side as main entry point for memory + * pressure handling. It decreases SLV on \a pl according to passed + * \a nr and \a gfp_mask. + * + * Our goal here is to decrease SLV such a way that clients hold \a nr + * locks smaller in next 10h. + */ +static int ldlm_srv_pool_shrink(struct ldlm_pool *pl, + int nr, gfp_t gfp_mask) +{ + __u32 limit; + + /* + * VM is asking how many entries may be potentially freed. + */ + if (nr == 0) + return atomic_read(&pl->pl_granted); + + /* + * Client already canceled locks but server is already in shrinker + * and can't cancel anything. Let's catch this race. + */ + if (atomic_read(&pl->pl_granted) == 0) + return 0; + + spin_lock(&pl->pl_lock); + + /* + * We want shrinker to possibly cause cancellation of @nr locks from + * clients or grant approximately @nr locks smaller next intervals. + * + * This is why we decreased SLV by @nr. This effect will only be as + * long as one re-calc interval (1s these days) and this should be + * enough to pass this decreased SLV to all clients. On next recalc + * interval pool will either increase SLV if locks load is not high + * or will keep on same level or even decrease again, thus, shrinker + * decreased SLV will affect next recalc intervals and this way will + * make locking load lower. + */ + if (nr < pl->pl_server_lock_volume) { + pl->pl_server_lock_volume = pl->pl_server_lock_volume - nr; + } else { + limit = ldlm_pool_get_limit(pl); + pl->pl_server_lock_volume = ldlm_pool_slv_min(limit); + } + + /* + * Make sure that pool informed obd of last SLV changes. + */ + ldlm_srv_pool_push_slv(pl); + spin_unlock(&pl->pl_lock); + + /* + * We did not really free any memory here so far, it only will be + * freed later may be, so that we return 0 to not confuse VM. + */ + return 0; +} + +/** + * Setup server side pool \a pl with passed \a limit. 
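+ *
+ * The limit set here is what ldlm_pool_recalc_grant_plan() and
+ * ldlm_pool_recalc_slv() above read back via ldlm_pool_get_limit().
+ * A rough worked example with made-up numbers, limit = 40000 and
+ * granted = 30000 at the default 10s recalc period:
+ *
+ *	grant_step = ldlm_pool_t2gsp(10) = 23%
+ *	grant_plan = 30000 + (40000 - 30000) * 23 / 100 = 32300
+ *
+ * which stays well below the 5/4 * limit cap. If the previous period's
+ * plan was also 32300 while only 30000 locks are granted, then
+ *
+ *	grant_usage = 40000 - (30000 - 32300) = 42300
+ *	slv_factor  = (42300 << 10) / 40000 = 1082
+ *
+ * so SLV is multiplied by 1082/1024, i.e. grows by about 5.7% for this
+ * period, telling clients they may cache somewhat more locks.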
+ */ +static int ldlm_srv_pool_setup(struct ldlm_pool *pl, int limit) +{ + struct obd_device *obd; + + obd = ldlm_pl2ns(pl)->ns_obd; + LASSERT(obd != NULL && obd != LP_POISON); + LASSERT(obd->obd_type != LP_POISON); + write_lock(&obd->obd_pool_lock); + obd->obd_pool_limit = limit; + write_unlock(&obd->obd_pool_lock); + + ldlm_pool_set_limit(pl, limit); + return 0; +} + +/** + * Sets SLV and Limit from ldlm_pl2ns(pl)->ns_obd tp passed \a pl. + */ +static void ldlm_cli_pool_pop_slv(struct ldlm_pool *pl) +{ + struct obd_device *obd; + + /* + * Get new SLV and Limit from obd which is updated with coming + * RPCs. + */ + obd = ldlm_pl2ns(pl)->ns_obd; + LASSERT(obd != NULL); + read_lock(&obd->obd_pool_lock); + pl->pl_server_lock_volume = obd->obd_pool_slv; + ldlm_pool_set_limit(pl, obd->obd_pool_limit); + read_unlock(&obd->obd_pool_lock); +} + +/** + * Recalculates client size pool \a pl according to current SLV and Limit. + */ +static int ldlm_cli_pool_recalc(struct ldlm_pool *pl) +{ + time_t recalc_interval_sec; + int ret; + + recalc_interval_sec = get_seconds() - pl->pl_recalc_time; + if (recalc_interval_sec < pl->pl_recalc_period) + return 0; + + spin_lock(&pl->pl_lock); + /* + * Check if we need to recalc lists now. + */ + recalc_interval_sec = get_seconds() - pl->pl_recalc_time; + if (recalc_interval_sec < pl->pl_recalc_period) { + spin_unlock(&pl->pl_lock); + return 0; + } + + /* + * Make sure that pool knows last SLV and Limit from obd. + */ + ldlm_cli_pool_pop_slv(pl); + + spin_unlock(&pl->pl_lock); + + /* + * Do not cancel locks in case lru resize is disabled for this ns. + */ + if (!ns_connect_lru_resize(ldlm_pl2ns(pl))) { + ret = 0; + goto out; + } + + /* + * In the time of canceling locks on client we do not need to maintain + * sharp timing, we only want to cancel locks asap according to new SLV. + * It may be called when SLV has changed much, this is why we do not + * take into account pl->pl_recalc_time here. + */ + ret = ldlm_cancel_lru(ldlm_pl2ns(pl), 0, LCF_ASYNC, LDLM_CANCEL_LRUR); + +out: + spin_lock(&pl->pl_lock); + /* + * Time of LRU resizing might be longer than period, + * so update after LRU resizing rather than before it. + */ + pl->pl_recalc_time = get_seconds(); + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_TIMING_STAT, + recalc_interval_sec); + spin_unlock(&pl->pl_lock); + return ret; +} + +/** + * This function is main entry point for memory pressure handling on client + * side. Main goal of this function is to cancel some number of locks on + * passed \a pl according to \a nr and \a gfp_mask. + */ +static int ldlm_cli_pool_shrink(struct ldlm_pool *pl, + int nr, gfp_t gfp_mask) +{ + struct ldlm_namespace *ns; + int unused; + + ns = ldlm_pl2ns(pl); + + /* + * Do not cancel locks in case lru resize is disabled for this ns. + */ + if (!ns_connect_lru_resize(ns)) + return 0; + + /* + * Make sure that pool knows last SLV and Limit from obd. + */ + ldlm_cli_pool_pop_slv(pl); + + spin_lock(&ns->ns_lock); + unused = ns->ns_nr_unused; + spin_unlock(&ns->ns_lock); + + if (nr == 0) + return (unused / 100) * sysctl_vfs_cache_pressure; + else + return ldlm_cancel_lru(ns, nr, LCF_ASYNC, LDLM_CANCEL_SHRINK); +} + +static const struct ldlm_pool_ops ldlm_srv_pool_ops = { + .po_recalc = ldlm_srv_pool_recalc, + .po_shrink = ldlm_srv_pool_shrink, + .po_setup = ldlm_srv_pool_setup +}; + +static const struct ldlm_pool_ops ldlm_cli_pool_ops = { + .po_recalc = ldlm_cli_pool_recalc, + .po_shrink = ldlm_cli_pool_shrink +}; + +/** + * Pool recalc wrapper. 
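The client-side "count" answer above is simply the number of unused LRU locks scaled by the VM cache-pressure knob. A tiny sketch, assuming the usual sysctl_vfs_cache_pressure default of 100:

#include <stdio.h>

static int vfs_cache_pressure = 100;   /* assumed sysctl default */

static unsigned long cli_shrink_count(int nr_unused)
{
        return (nr_unused / 100) * vfs_cache_pressure;
}

int main(void)
{
        printf("%lu\n", cli_shrink_count(2500));  /* 2500 at pressure 100 */
        return 0;
}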
Will call either client or server pool recalc callback + * depending what pool \a pl is used. + */ +int ldlm_pool_recalc(struct ldlm_pool *pl) +{ + time_t recalc_interval_sec; + int count; + + recalc_interval_sec = get_seconds() - pl->pl_recalc_time; + if (recalc_interval_sec <= 0) + goto recalc; + + spin_lock(&pl->pl_lock); + if (recalc_interval_sec > 0) { + /* + * Update pool statistics every 1s. + */ + ldlm_pool_recalc_stats(pl); + + /* + * Zero out all rates and speed for the last period. + */ + atomic_set(&pl->pl_grant_rate, 0); + atomic_set(&pl->pl_cancel_rate, 0); + } + spin_unlock(&pl->pl_lock); + + recalc: + if (pl->pl_ops->po_recalc != NULL) { + count = pl->pl_ops->po_recalc(pl); + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_RECALC_STAT, + count); + } + recalc_interval_sec = pl->pl_recalc_time - get_seconds() + + pl->pl_recalc_period; + if (recalc_interval_sec <= 0) { + /* Prevent too frequent recalculation. */ + CDEBUG(D_DLMTRACE, "Negative interval(%ld), " + "too short period(%ld)", + recalc_interval_sec, + pl->pl_recalc_period); + recalc_interval_sec = 1; + } + + return recalc_interval_sec; +} + +/* + * Pool shrink wrapper. Will call either client or server pool recalc callback + * depending what pool pl is used. When nr == 0, just return the number of + * freeable locks. Otherwise, return the number of canceled locks. + */ +int ldlm_pool_shrink(struct ldlm_pool *pl, int nr, + gfp_t gfp_mask) +{ + int cancel = 0; + + if (pl->pl_ops->po_shrink != NULL) { + cancel = pl->pl_ops->po_shrink(pl, nr, gfp_mask); + if (nr > 0) { + lprocfs_counter_add(pl->pl_stats, + LDLM_POOL_SHRINK_REQTD_STAT, + nr); + lprocfs_counter_add(pl->pl_stats, + LDLM_POOL_SHRINK_FREED_STAT, + cancel); + CDEBUG(D_DLMTRACE, "%s: request to shrink %d locks, shrunk %d\n", + pl->pl_name, nr, cancel); + } + } + return cancel; +} +EXPORT_SYMBOL(ldlm_pool_shrink); + +/** + * Pool setup wrapper. Will call either client or server pool recalc callback + * depending what pool \a pl is used. + * + * Sets passed \a limit into pool \a pl. 
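The value returned by the recalc wrapper above is just "seconds until the next recalc is due", clamped to at least one second so the pools thread is never asked to run again immediately. A standalone restatement:

#include <stdio.h>
#include <time.h>

static long next_recalc_in(time_t last_recalc, long period, time_t now)
{
        long left = (long)difftime(last_recalc, now) + period;

        return left > 0 ? left : 1;     /* never request an immediate re-run */
}

int main(void)
{
        time_t now = time(NULL);

        printf("%ld\n", next_recalc_in(now - 3, 10, now));   /* 7 */
        printf("%ld\n", next_recalc_in(now - 60, 10, now));  /* clamped to 1 */
        return 0;
}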
+ */ +int ldlm_pool_setup(struct ldlm_pool *pl, int limit) +{ + if (pl->pl_ops->po_setup != NULL) + return pl->pl_ops->po_setup(pl, limit); + return 0; +} +EXPORT_SYMBOL(ldlm_pool_setup); + +#if defined(CONFIG_PROC_FS) +static int lprocfs_pool_state_seq_show(struct seq_file *m, void *unused) +{ + int granted, grant_rate, cancel_rate, grant_step; + int grant_speed, grant_plan, lvf; + struct ldlm_pool *pl = m->private; + __u64 slv, clv; + __u32 limit; + + spin_lock(&pl->pl_lock); + slv = pl->pl_server_lock_volume; + clv = pl->pl_client_lock_volume; + limit = ldlm_pool_get_limit(pl); + grant_plan = pl->pl_grant_plan; + granted = atomic_read(&pl->pl_granted); + grant_rate = atomic_read(&pl->pl_grant_rate); + cancel_rate = atomic_read(&pl->pl_cancel_rate); + grant_speed = grant_rate - cancel_rate; + lvf = atomic_read(&pl->pl_lock_volume_factor); + grant_step = ldlm_pool_t2gsp(pl->pl_recalc_period); + spin_unlock(&pl->pl_lock); + + seq_printf(m, "LDLM pool state (%s):\n" + " SLV: %llu\n" + " CLV: %llu\n" + " LVF: %d\n", + pl->pl_name, slv, clv, lvf); + + if (ns_is_server(ldlm_pl2ns(pl))) { + seq_printf(m, " GSP: %d%%\n" + " GP: %d\n", + grant_step, grant_plan); + } + seq_printf(m, " GR: %d\n CR: %d\n GS: %d\n" + " G: %d\n L: %d\n", + grant_rate, cancel_rate, grant_speed, + granted, limit); + + return 0; +} +LPROC_SEQ_FOPS_RO(lprocfs_pool_state); + +static int lprocfs_grant_speed_seq_show(struct seq_file *m, void *unused) +{ + struct ldlm_pool *pl = m->private; + int grant_speed; + + spin_lock(&pl->pl_lock); + /* serialize with ldlm_pool_recalc */ + grant_speed = atomic_read(&pl->pl_grant_rate) - + atomic_read(&pl->pl_cancel_rate); + spin_unlock(&pl->pl_lock); + return lprocfs_rd_uint(m, &grant_speed); +} + +LDLM_POOL_PROC_READER_SEQ_SHOW(grant_plan, int); +LPROC_SEQ_FOPS_RO(lprocfs_grant_plan); + +LDLM_POOL_PROC_READER_SEQ_SHOW(recalc_period, int); +LDLM_POOL_PROC_WRITER(recalc_period, int); +static ssize_t lprocfs_recalc_period_seq_write(struct file *file, + const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + + return lprocfs_wr_recalc_period(file, buf, len, seq->private); +} +LPROC_SEQ_FOPS(lprocfs_recalc_period); + +LPROC_SEQ_FOPS_RO_TYPE(ldlm_pool, u64); +LPROC_SEQ_FOPS_RO_TYPE(ldlm_pool, atomic); +LPROC_SEQ_FOPS_RW_TYPE(ldlm_pool_rw, atomic); + +LPROC_SEQ_FOPS_RO(lprocfs_grant_speed); + +#define LDLM_POOL_ADD_VAR(name, var, ops) \ + do { \ + snprintf(var_name, MAX_STRING_SIZE, #name); \ + pool_vars[0].data = var; \ + pool_vars[0].fops = ops; \ + lprocfs_add_vars(pl->pl_proc_dir, pool_vars, NULL);\ + } while (0) + +static int ldlm_pool_proc_init(struct ldlm_pool *pl) +{ + struct ldlm_namespace *ns = ldlm_pl2ns(pl); + struct proc_dir_entry *parent_ns_proc; + struct lprocfs_vars pool_vars[2]; + char *var_name = NULL; + int rc = 0; + + OBD_ALLOC(var_name, MAX_STRING_SIZE + 1); + if (!var_name) + return -ENOMEM; + + parent_ns_proc = ns->ns_proc_dir_entry; + if (parent_ns_proc == NULL) { + CERROR("%s: proc entry is not initialized\n", + ldlm_ns_name(ns)); + rc = -EINVAL; + goto out_free_name; + } + pl->pl_proc_dir = lprocfs_register("pool", parent_ns_proc, + NULL, NULL); + if (IS_ERR(pl->pl_proc_dir)) { + CERROR("LProcFS failed in ldlm-pool-init\n"); + rc = PTR_ERR(pl->pl_proc_dir); + pl->pl_proc_dir = NULL; + goto out_free_name; + } + + var_name[MAX_STRING_SIZE] = '\0'; + memset(pool_vars, 0, sizeof(pool_vars)); + pool_vars[0].name = var_name; + + LDLM_POOL_ADD_VAR("server_lock_volume", &pl->pl_server_lock_volume, + &ldlm_pool_u64_fops); + 
LDLM_POOL_ADD_VAR("limit", &pl->pl_limit, &ldlm_pool_rw_atomic_fops); + LDLM_POOL_ADD_VAR("granted", &pl->pl_granted, &ldlm_pool_atomic_fops); + LDLM_POOL_ADD_VAR("grant_speed", pl, &lprocfs_grant_speed_fops); + LDLM_POOL_ADD_VAR("cancel_rate", &pl->pl_cancel_rate, + &ldlm_pool_atomic_fops); + LDLM_POOL_ADD_VAR("grant_rate", &pl->pl_grant_rate, + &ldlm_pool_atomic_fops); + LDLM_POOL_ADD_VAR("grant_plan", pl, &lprocfs_grant_plan_fops); + LDLM_POOL_ADD_VAR("recalc_period", pl, &lprocfs_recalc_period_fops); + LDLM_POOL_ADD_VAR("lock_volume_factor", &pl->pl_lock_volume_factor, + &ldlm_pool_rw_atomic_fops); + LDLM_POOL_ADD_VAR("state", pl, &lprocfs_pool_state_fops); + + pl->pl_stats = lprocfs_alloc_stats(LDLM_POOL_LAST_STAT - + LDLM_POOL_FIRST_STAT, 0); + if (!pl->pl_stats) { + rc = -ENOMEM; + goto out_free_name; + } + + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANTED_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "granted", "locks"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "grant", "locks"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_CANCEL_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "cancel", "locks"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_RATE_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "grant_rate", "locks/s"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_CANCEL_RATE_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "cancel_rate", "locks/s"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_PLAN_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "grant_plan", "locks/s"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_SLV_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "slv", "slv"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_SHRINK_REQTD_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "shrink_request", "locks"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_SHRINK_FREED_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "shrink_freed", "locks"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_RECALC_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "recalc_freed", "locks"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_TIMING_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "recalc_timing", "sec"); + rc = lprocfs_register_stats(pl->pl_proc_dir, "stats", pl->pl_stats); + +out_free_name: + OBD_FREE(var_name, MAX_STRING_SIZE + 1); + return rc; +} + +static void ldlm_pool_proc_fini(struct ldlm_pool *pl) +{ + if (pl->pl_stats != NULL) { + lprocfs_free_stats(&pl->pl_stats); + pl->pl_stats = NULL; + } + if (pl->pl_proc_dir != NULL) { + lprocfs_remove(&pl->pl_proc_dir); + pl->pl_proc_dir = NULL; + } +} +#else /* !CONFIG_PROC_FS */ +static int ldlm_pool_proc_init(struct ldlm_pool *pl) +{ + return 0; +} + +static void ldlm_pool_proc_fini(struct ldlm_pool *pl) {} +#endif /* CONFIG_PROC_FS */ + +int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns, + int idx, ldlm_side_t client) +{ + int rc; + + spin_lock_init(&pl->pl_lock); + atomic_set(&pl->pl_granted, 0); + pl->pl_recalc_time = get_seconds(); + atomic_set(&pl->pl_lock_volume_factor, 1); + + atomic_set(&pl->pl_grant_rate, 0); + atomic_set(&pl->pl_cancel_rate, 0); + pl->pl_grant_plan = LDLM_POOL_GP(LDLM_POOL_HOST_L); + + snprintf(pl->pl_name, sizeof(pl->pl_name), "ldlm-pool-%s-%d", + ldlm_ns_name(ns), idx); + + if (client == LDLM_NAMESPACE_SERVER) { + pl->pl_ops = &ldlm_srv_pool_ops; + ldlm_pool_set_limit(pl, LDLM_POOL_HOST_L); + pl->pl_recalc_period = 
LDLM_POOL_SRV_DEF_RECALC_PERIOD; + pl->pl_server_lock_volume = ldlm_pool_slv_max(LDLM_POOL_HOST_L); + } else { + ldlm_pool_set_limit(pl, 1); + pl->pl_server_lock_volume = 0; + pl->pl_ops = &ldlm_cli_pool_ops; + pl->pl_recalc_period = LDLM_POOL_CLI_DEF_RECALC_PERIOD; + } + pl->pl_client_lock_volume = 0; + rc = ldlm_pool_proc_init(pl); + if (rc) + return rc; + + CDEBUG(D_DLMTRACE, "Lock pool %s is initialized\n", pl->pl_name); + + return rc; +} +EXPORT_SYMBOL(ldlm_pool_init); + +void ldlm_pool_fini(struct ldlm_pool *pl) +{ + ldlm_pool_proc_fini(pl); + + /* + * Pool should not be used after this point. We can't free it here as + * it lives in struct ldlm_namespace, but still interested in catching + * any abnormal using cases. + */ + POISON(pl, 0x5a, sizeof(*pl)); +} +EXPORT_SYMBOL(ldlm_pool_fini); + +/** + * Add new taken ldlm lock \a lock into pool \a pl accounting. + */ +void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock) +{ + /* + * FLOCK locks are special in a sense that they are almost never + * cancelled, instead special kind of lock is used to drop them. + * also there is no LRU for flock locks, so no point in tracking + * them anyway. + */ + if (lock->l_resource->lr_type == LDLM_FLOCK) + return; + + atomic_inc(&pl->pl_granted); + atomic_inc(&pl->pl_grant_rate); + lprocfs_counter_incr(pl->pl_stats, LDLM_POOL_GRANT_STAT); + /* + * Do not do pool recalc for client side as all locks which + * potentially may be canceled has already been packed into + * enqueue/cancel rpc. Also we do not want to run out of stack + * with too long call paths. + */ + if (ns_is_server(ldlm_pl2ns(pl))) + ldlm_pool_recalc(pl); +} +EXPORT_SYMBOL(ldlm_pool_add); + +/** + * Remove ldlm lock \a lock from pool \a pl accounting. + */ +void ldlm_pool_del(struct ldlm_pool *pl, struct ldlm_lock *lock) +{ + /* + * Filter out FLOCK locks. Read above comment in ldlm_pool_add(). + */ + if (lock->l_resource->lr_type == LDLM_FLOCK) + return; + + LASSERT(atomic_read(&pl->pl_granted) > 0); + atomic_dec(&pl->pl_granted); + atomic_inc(&pl->pl_cancel_rate); + + lprocfs_counter_incr(pl->pl_stats, LDLM_POOL_CANCEL_STAT); + + if (ns_is_server(ldlm_pl2ns(pl))) + ldlm_pool_recalc(pl); +} +EXPORT_SYMBOL(ldlm_pool_del); + +/** + * Returns current \a pl SLV. + * + * \pre ->pl_lock is not locked. + */ +__u64 ldlm_pool_get_slv(struct ldlm_pool *pl) +{ + __u64 slv; + + spin_lock(&pl->pl_lock); + slv = pl->pl_server_lock_volume; + spin_unlock(&pl->pl_lock); + return slv; +} +EXPORT_SYMBOL(ldlm_pool_get_slv); + +/** + * Sets passed \a slv to \a pl. + * + * \pre ->pl_lock is not locked. + */ +void ldlm_pool_set_slv(struct ldlm_pool *pl, __u64 slv) +{ + spin_lock(&pl->pl_lock); + pl->pl_server_lock_volume = slv; + spin_unlock(&pl->pl_lock); +} +EXPORT_SYMBOL(ldlm_pool_set_slv); + +/** + * Returns current \a pl CLV. + * + * \pre ->pl_lock is not locked. + */ +__u64 ldlm_pool_get_clv(struct ldlm_pool *pl) +{ + __u64 slv; + + spin_lock(&pl->pl_lock); + slv = pl->pl_client_lock_volume; + spin_unlock(&pl->pl_lock); + return slv; +} +EXPORT_SYMBOL(ldlm_pool_get_clv); + +/** + * Sets passed \a clv to \a pl. + * + * \pre ->pl_lock is not locked. + */ +void ldlm_pool_set_clv(struct ldlm_pool *pl, __u64 clv) +{ + spin_lock(&pl->pl_lock); + pl->pl_client_lock_volume = clv; + spin_unlock(&pl->pl_lock); +} +EXPORT_SYMBOL(ldlm_pool_set_clv); + +/** + * Returns current \a pl limit. 
+ */ +__u32 ldlm_pool_get_limit(struct ldlm_pool *pl) +{ + return atomic_read(&pl->pl_limit); +} +EXPORT_SYMBOL(ldlm_pool_get_limit); + +/** + * Sets passed \a limit to \a pl. + */ +void ldlm_pool_set_limit(struct ldlm_pool *pl, __u32 limit) +{ + atomic_set(&pl->pl_limit, limit); +} +EXPORT_SYMBOL(ldlm_pool_set_limit); + +/** + * Returns current LVF from \a pl. + */ +__u32 ldlm_pool_get_lvf(struct ldlm_pool *pl) +{ + return atomic_read(&pl->pl_lock_volume_factor); +} +EXPORT_SYMBOL(ldlm_pool_get_lvf); + +static int ldlm_pool_granted(struct ldlm_pool *pl) +{ + return atomic_read(&pl->pl_granted); +} + +static struct ptlrpc_thread *ldlm_pools_thread; +static struct completion ldlm_pools_comp; + +/* + * count locks from all namespaces (if possible). Returns number of + * cached locks. + */ +static unsigned long ldlm_pools_count(ldlm_side_t client, gfp_t gfp_mask) +{ + int total = 0, nr_ns; + struct ldlm_namespace *ns; + struct ldlm_namespace *ns_old = NULL; /* loop detection */ + void *cookie; + + if (client == LDLM_NAMESPACE_CLIENT && !(gfp_mask & __GFP_FS)) + return 0; + + CDEBUG(D_DLMTRACE, "Request to count %s locks from all pools\n", + client == LDLM_NAMESPACE_CLIENT ? "client" : "server"); + + cookie = cl_env_reenter(); + + /* + * Find out how many resources we may release. + */ + for (nr_ns = ldlm_namespace_nr_read(client); + nr_ns > 0; nr_ns--) { + mutex_lock(ldlm_namespace_lock(client)); + if (list_empty(ldlm_namespace_list(client))) { + mutex_unlock(ldlm_namespace_lock(client)); + cl_env_reexit(cookie); + return 0; + } + ns = ldlm_namespace_first_locked(client); + + if (ns == ns_old) { + mutex_unlock(ldlm_namespace_lock(client)); + break; + } + + if (ldlm_ns_empty(ns)) { + ldlm_namespace_move_to_inactive_locked(ns, client); + mutex_unlock(ldlm_namespace_lock(client)); + continue; + } + + if (ns_old == NULL) + ns_old = ns; + + ldlm_namespace_get(ns); + ldlm_namespace_move_to_active_locked(ns, client); + mutex_unlock(ldlm_namespace_lock(client)); + total += ldlm_pool_shrink(&ns->ns_pool, 0, gfp_mask); + ldlm_namespace_put(ns); + } + + cl_env_reexit(cookie); + return total; +} + +static unsigned long ldlm_pools_scan(ldlm_side_t client, int nr, gfp_t gfp_mask) +{ + unsigned long freed = 0; + int tmp, nr_ns; + struct ldlm_namespace *ns; + void *cookie; + + if (client == LDLM_NAMESPACE_CLIENT && !(gfp_mask & __GFP_FS)) + return -1; + + cookie = cl_env_reenter(); + + /* + * Shrink at least ldlm_namespace_nr_read(client) namespaces. + */ + for (tmp = nr_ns = ldlm_namespace_nr_read(client); + tmp > 0; tmp--) { + int cancel, nr_locks; + + /* + * Do not call shrink under ldlm_namespace_lock(client) + */ + mutex_lock(ldlm_namespace_lock(client)); + if (list_empty(ldlm_namespace_list(client))) { + mutex_unlock(ldlm_namespace_lock(client)); + break; + } + ns = ldlm_namespace_first_locked(client); + ldlm_namespace_get(ns); + ldlm_namespace_move_to_active_locked(ns, client); + mutex_unlock(ldlm_namespace_lock(client)); + + nr_locks = ldlm_pool_granted(&ns->ns_pool); + /* + * We use to shrink propotionally but with new shrinker API, + * we lost the total number of freeable locks. + */ + cancel = 1 + min_t(int, nr_locks, nr / nr_ns); + freed += ldlm_pool_shrink(&ns->ns_pool, cancel, gfp_mask); + ldlm_namespace_put(ns); + } + cl_env_reexit(cookie); + /* + * we only decrease the SLV in server pools shrinker, return + * SHRINK_STOP to kernel to avoid needless loop. LU-1128 + */ + return (client == LDLM_NAMESPACE_SERVER) ? 
SHRINK_STOP : freed; +} + +static unsigned long ldlm_pools_srv_count(struct shrinker *s, + struct shrink_control *sc) +{ + return ldlm_pools_count(LDLM_NAMESPACE_SERVER, sc->gfp_mask); +} + +static unsigned long ldlm_pools_srv_scan(struct shrinker *s, + struct shrink_control *sc) +{ + return ldlm_pools_scan(LDLM_NAMESPACE_SERVER, sc->nr_to_scan, + sc->gfp_mask); +} + +static unsigned long ldlm_pools_cli_count(struct shrinker *s, + struct shrink_control *sc) +{ + return ldlm_pools_count(LDLM_NAMESPACE_CLIENT, sc->gfp_mask); +} + +static unsigned long ldlm_pools_cli_scan(struct shrinker *s, + struct shrink_control *sc) +{ + return ldlm_pools_scan(LDLM_NAMESPACE_CLIENT, sc->nr_to_scan, + sc->gfp_mask); +} + +int ldlm_pools_recalc(ldlm_side_t client) +{ + __u32 nr_l = 0, nr_p = 0, l; + struct ldlm_namespace *ns; + struct ldlm_namespace *ns_old = NULL; + int nr, equal = 0; + int time = 50; /* seconds of sleep if no active namespaces */ + + /* + * No need to setup pool limit for client pools. + */ + if (client == LDLM_NAMESPACE_SERVER) { + /* + * Check all modest namespaces first. + */ + mutex_lock(ldlm_namespace_lock(client)); + list_for_each_entry(ns, ldlm_namespace_list(client), + ns_list_chain) { + if (ns->ns_appetite != LDLM_NAMESPACE_MODEST) + continue; + + l = ldlm_pool_granted(&ns->ns_pool); + if (l == 0) + l = 1; + + /* + * Set the modest pools limit equal to their avg granted + * locks + ~6%. + */ + l += dru(l, LDLM_POOLS_MODEST_MARGIN_SHIFT, 0); + ldlm_pool_setup(&ns->ns_pool, l); + nr_l += l; + nr_p++; + } + + /* + * Make sure that modest namespaces did not eat more that 2/3 + * of limit. + */ + if (nr_l >= 2 * (LDLM_POOL_HOST_L / 3)) { + CWARN("\"Modest\" pools eat out 2/3 of server locks limit (%d of %lu). This means that you have too many clients for this amount of server RAM. Upgrade server!\n", + nr_l, LDLM_POOL_HOST_L); + equal = 1; + } + + /* + * The rest is given to greedy namespaces. + */ + list_for_each_entry(ns, ldlm_namespace_list(client), + ns_list_chain) { + if (!equal && ns->ns_appetite != LDLM_NAMESPACE_GREEDY) + continue; + + if (equal) { + /* + * In the case 2/3 locks are eaten out by + * modest pools, we re-setup equal limit + * for _all_ pools. + */ + l = LDLM_POOL_HOST_L / + ldlm_namespace_nr_read(client); + } else { + /* + * All the rest of greedy pools will have + * all locks in equal parts. + */ + l = (LDLM_POOL_HOST_L - nr_l) / + (ldlm_namespace_nr_read(client) - + nr_p); + } + ldlm_pool_setup(&ns->ns_pool, l); + } + mutex_unlock(ldlm_namespace_lock(client)); + } + + /* + * Recalc at least ldlm_namespace_nr_read(client) namespaces. + */ + for (nr = ldlm_namespace_nr_read(client); nr > 0; nr--) { + int skip; + /* + * Lock the list, get first @ns in the list, getref, move it + * to the tail, unlock and call pool recalc. This way we avoid + * calling recalc under @ns lock what is really good as we get + * rid of potential deadlock on client nodes when canceling + * locks synchronously. + */ + mutex_lock(ldlm_namespace_lock(client)); + if (list_empty(ldlm_namespace_list(client))) { + mutex_unlock(ldlm_namespace_lock(client)); + break; + } + ns = ldlm_namespace_first_locked(client); + + if (ns_old == ns) { /* Full pass complete */ + mutex_unlock(ldlm_namespace_lock(client)); + break; + } + + /* We got an empty namespace, need to move it back to inactive + * list. 
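The server-side limit split above can be condensed to a few lines. LDLM_POOL_HOST_L and LDLM_POOLS_MODEST_MARGIN_SHIFT are defined elsewhere in the tree, so the host limit of 100000 and the margin shift of 4 (roughly +6%, matching the comment) used below are assumptions for illustration only.

#include <stdio.h>

#define HOST_L        100000   /* assumed stand-in for LDLM_POOL_HOST_L */
#define MARGIN_SHIFT  4        /* assumed: +1/16, i.e. ~6% */

int main(void)
{
        int modest_granted[] = { 2000, 3000 };
        int n_modest = 2, n_greedy = 3, n_total = n_modest + n_greedy;
        unsigned int nr_l = 0;
        int i;

        /* modest namespaces: average granted count plus ~6% */
        for (i = 0; i < n_modest; i++) {
                unsigned int l = modest_granted[i] ? modest_granted[i] : 1;

                l += l >> MARGIN_SHIFT;
                printf("modest[%d] limit = %u\n", i, l);
                nr_l += l;
        }

        /* if modest pools claimed 2/3 of the host limit, fall back to equal shares */
        if (nr_l >= 2 * (HOST_L / 3))
                printf("equal share for all: %d\n", HOST_L / n_total);
        else
                printf("each greedy pool gets %u\n", (HOST_L - nr_l) / n_greedy);
        return 0;
}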
+ * The race with parallel resource creation is fine: + * - If they do namespace_get before our check, we fail the + * check and they move this item to the end of the list anyway + * - If we do the check and then they do namespace_get, then + * we move the namespace to inactive and they will move + * it back to active (synchronised by the lock, so no clash + * there). + */ + if (ldlm_ns_empty(ns)) { + ldlm_namespace_move_to_inactive_locked(ns, client); + mutex_unlock(ldlm_namespace_lock(client)); + continue; + } + + if (ns_old == NULL) + ns_old = ns; + + spin_lock(&ns->ns_lock); + /* + * skip ns which is being freed, and we don't want to increase + * its refcount again, not even temporarily. bz21519 & LU-499. + */ + if (ns->ns_stopping) { + skip = 1; + } else { + skip = 0; + ldlm_namespace_get(ns); + } + spin_unlock(&ns->ns_lock); + + ldlm_namespace_move_to_active_locked(ns, client); + mutex_unlock(ldlm_namespace_lock(client)); + + /* + * After setup is done - recalc the pool. + */ + if (!skip) { + int ttime = ldlm_pool_recalc(&ns->ns_pool); + + if (ttime < time) + time = ttime; + + ldlm_namespace_put(ns); + } + } + return time; +} +EXPORT_SYMBOL(ldlm_pools_recalc); + +static int ldlm_pools_thread_main(void *arg) +{ + struct ptlrpc_thread *thread = (struct ptlrpc_thread *)arg; + int s_time, c_time; + + thread_set_flags(thread, SVC_RUNNING); + wake_up(&thread->t_ctl_waitq); + + CDEBUG(D_DLMTRACE, "%s: pool thread starting, process %d\n", + "ldlm_poold", current_pid()); + + while (1) { + struct l_wait_info lwi; + + /* + * Recal all pools on this tick. + */ + s_time = ldlm_pools_recalc(LDLM_NAMESPACE_SERVER); + c_time = ldlm_pools_recalc(LDLM_NAMESPACE_CLIENT); + + /* + * Wait until the next check time, or until we're + * stopped. + */ + lwi = LWI_TIMEOUT(cfs_time_seconds(min(s_time, c_time)), + NULL, NULL); + l_wait_event(thread->t_ctl_waitq, + thread_is_stopping(thread) || + thread_is_event(thread), + &lwi); + + if (thread_test_and_clear_flags(thread, SVC_STOPPING)) + break; + else + thread_test_and_clear_flags(thread, SVC_EVENT); + } + + thread_set_flags(thread, SVC_STOPPED); + wake_up(&thread->t_ctl_waitq); + + CDEBUG(D_DLMTRACE, "%s: pool thread exiting, process %d\n", + "ldlm_poold", current_pid()); + + complete_and_exit(&ldlm_pools_comp, 0); +} + +static int ldlm_pools_thread_start(void) +{ + struct l_wait_info lwi = { 0 }; + struct task_struct *task; + + if (ldlm_pools_thread != NULL) + return -EALREADY; + + OBD_ALLOC_PTR(ldlm_pools_thread); + if (ldlm_pools_thread == NULL) + return -ENOMEM; + + init_completion(&ldlm_pools_comp); + init_waitqueue_head(&ldlm_pools_thread->t_ctl_waitq); + + task = kthread_run(ldlm_pools_thread_main, ldlm_pools_thread, + "ldlm_poold"); + if (IS_ERR(task)) { + CERROR("Can't start pool thread, error %ld\n", PTR_ERR(task)); + OBD_FREE(ldlm_pools_thread, sizeof(*ldlm_pools_thread)); + ldlm_pools_thread = NULL; + return PTR_ERR(task); + } + l_wait_event(ldlm_pools_thread->t_ctl_waitq, + thread_is_running(ldlm_pools_thread), &lwi); + return 0; +} + +static void ldlm_pools_thread_stop(void) +{ + if (ldlm_pools_thread == NULL) + return; + + thread_set_flags(ldlm_pools_thread, SVC_STOPPING); + wake_up(&ldlm_pools_thread->t_ctl_waitq); + + /* + * Make sure that pools thread is finished before freeing @thread. + * This fixes possible race and oops due to accessing freed memory + * in pools thread. 
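The lifetime of ldlm_poold described above reduces to "recalc both sides, then sleep until the earlier deadline or until stopped". A stripped-down sketch, with plain sleep() standing in for the interruptible kernel wait:

#include <stdio.h>
#include <unistd.h>

static int recalc_server(void) { return 10; }   /* seconds to next server recalc */
static int recalc_client(void) { return 3; }    /* seconds to next client recalc */

int main(void)
{
        int ticks = 3;                           /* demo: stop after three rounds */

        while (ticks--) {
                int s = recalc_server();
                int c = recalc_client();
                int next = s < c ? s : c;        /* wake for the earlier deadline */

                printf("sleeping %d s until next recalc\n", next);
                sleep(next);
        }
        return 0;
}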
+ */ + wait_for_completion(&ldlm_pools_comp); + OBD_FREE_PTR(ldlm_pools_thread); + ldlm_pools_thread = NULL; +} + +static struct shrinker ldlm_pools_srv_shrinker = { + .count_objects = ldlm_pools_srv_count, + .scan_objects = ldlm_pools_srv_scan, + .seeks = DEFAULT_SEEKS, +}; + +static struct shrinker ldlm_pools_cli_shrinker = { + .count_objects = ldlm_pools_cli_count, + .scan_objects = ldlm_pools_cli_scan, + .seeks = DEFAULT_SEEKS, +}; + +int ldlm_pools_init(void) +{ + int rc; + + rc = ldlm_pools_thread_start(); + if (rc == 0) { + register_shrinker(&ldlm_pools_srv_shrinker); + register_shrinker(&ldlm_pools_cli_shrinker); + } + return rc; +} +EXPORT_SYMBOL(ldlm_pools_init); + +void ldlm_pools_fini(void) +{ + unregister_shrinker(&ldlm_pools_srv_shrinker); + unregister_shrinker(&ldlm_pools_cli_shrinker); + ldlm_pools_thread_stop(); +} +EXPORT_SYMBOL(ldlm_pools_fini); diff --git a/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_request.c b/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_request.c new file mode 100644 index 000000000..4f7131831 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_request.c @@ -0,0 +1,2294 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +/** + * This file contains Asynchronous System Trap (AST) handlers and related + * LDLM request-processing routines. + * + * An AST is a callback issued on a lock when its state is changed. There are + * several different types of ASTs (callbacks) registered for each lock: + * + * - completion AST: when a lock is enqueued by some process, but cannot be + * granted immediately due to other conflicting locks on the same resource, + * the completion AST is sent to notify the caller when the lock is + * eventually granted + * + * - blocking AST: when a lock is granted to some process, if another process + * enqueues a conflicting (blocking) lock on a resource, a blocking AST is + * sent to notify the holder(s) of the lock(s) of the conflicting lock + * request. The lock holder(s) must release their lock(s) on that resource in + * a timely manner or be evicted by the server. + * + * - glimpse AST: this is used when a process wants information about a lock + * (i.e. the lock value block (LVB)) but does not necessarily require holding + * the lock. 
If the resource is locked, the lock holder(s) are sent glimpse + * ASTs and the LVB is returned to the caller, and lock holder(s) may CANCEL + * their lock(s) if they are idle. If the resource is not locked, the server + * may grant the lock. + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include "../include/lustre_dlm.h" +#include "../include/obd_class.h" +#include "../include/obd.h" + +#include "ldlm_internal.h" + +int ldlm_enqueue_min = OBD_TIMEOUT_DEFAULT; +module_param(ldlm_enqueue_min, int, 0644); +MODULE_PARM_DESC(ldlm_enqueue_min, "lock enqueue timeout minimum"); + +/* in client side, whether the cached locks will be canceled before replay */ +unsigned int ldlm_cancel_unused_locks_before_replay = 1; + +static void interrupted_completion_wait(void *data) +{ +} + +struct lock_wait_data { + struct ldlm_lock *lwd_lock; + __u32 lwd_conn_cnt; +}; + +struct ldlm_async_args { + struct lustre_handle lock_handle; +}; + +int ldlm_expired_completion_wait(void *data) +{ + struct lock_wait_data *lwd = data; + struct ldlm_lock *lock = lwd->lwd_lock; + struct obd_import *imp; + struct obd_device *obd; + + if (lock->l_conn_export == NULL) { + static unsigned long next_dump, last_dump; + + LCONSOLE_WARN("lock timed out (enqueued at "CFS_TIME_T", " + CFS_DURATION_T"s ago)\n", + lock->l_last_activity, + cfs_time_sub(get_seconds(), + lock->l_last_activity)); + LDLM_DEBUG(lock, "lock timed out (enqueued at " CFS_TIME_T ", " CFS_DURATION_T "s ago); not entering recovery in server code, just going back to sleep", + lock->l_last_activity, + cfs_time_sub(get_seconds(), + lock->l_last_activity)); + if (cfs_time_after(cfs_time_current(), next_dump)) { + last_dump = next_dump; + next_dump = cfs_time_shift(300); + ldlm_namespace_dump(D_DLMTRACE, + ldlm_lock_to_ns(lock)); + if (last_dump == 0) + libcfs_debug_dumplog(); + } + return 0; + } + + obd = lock->l_conn_export->exp_obd; + imp = obd->u.cli.cl_import; + ptlrpc_fail_import(imp, lwd->lwd_conn_cnt); + LDLM_ERROR(lock, "lock timed out (enqueued at "CFS_TIME_T", " + CFS_DURATION_T"s ago), entering recovery for %s@%s", + lock->l_last_activity, + cfs_time_sub(get_seconds(), lock->l_last_activity), + obd2cli_tgt(obd), imp->imp_connection->c_remote_uuid.uuid); + + return 0; +} +EXPORT_SYMBOL(ldlm_expired_completion_wait); + +/* We use the same basis for both server side and client side functions + from a single node. */ +int ldlm_get_enq_timeout(struct ldlm_lock *lock) +{ + int timeout = at_get(ldlm_lock_to_ns_at(lock)); + + if (AT_OFF) + return obd_timeout / 2; + /* Since these are non-updating timeouts, we should be conservative. + It would be nice to have some kind of "early reply" mechanism for + lock callbacks too... */ + timeout = min_t(int, at_max, timeout + (timeout >> 1)); /* 150% */ + return max(timeout, ldlm_enqueue_min); +} +EXPORT_SYMBOL(ldlm_get_enq_timeout); + +/** + * Helper function for ldlm_completion_ast(), updating timings when lock is + * actually granted. 
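The enqueue timeout chosen above is "adaptive estimate plus 50%, capped at at_max, but never below the ldlm_enqueue_min module parameter". A standalone restatement; the obd_timeout and at_max values are picked purely for the example:

#include <stdio.h>

#define OBD_TIMEOUT_EX   100   /* assumed obd_timeout (seconds) */
#define AT_MAX_EX        600   /* assumed adaptive-timeout ceiling */

static int enq_timeout(int at_estimate, int at_off, int enqueue_min)
{
        int timeout;

        if (at_off)
                return OBD_TIMEOUT_EX / 2;            /* adaptive timeouts disabled */

        timeout = at_estimate + (at_estimate >> 1);   /* 150% of the estimate */
        if (timeout > AT_MAX_EX)
                timeout = AT_MAX_EX;
        return timeout > enqueue_min ? timeout : enqueue_min;
}

int main(void)
{
        printf("%d\n", enq_timeout(20, 0, 5));   /* 30 */
        printf("%d\n", enq_timeout(2, 0, 5));    /* clamped up to enqueue_min = 5 */
        printf("%d\n", enq_timeout(20, 1, 5));   /* 50: adaptive timeouts off */
        return 0;
}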
+ */ +static int ldlm_completion_tail(struct ldlm_lock *lock) +{ + long delay; + int result; + + if (lock->l_flags & (LDLM_FL_DESTROYED | LDLM_FL_FAILED)) { + LDLM_DEBUG(lock, "client-side enqueue: destroyed"); + result = -EIO; + } else { + delay = cfs_time_sub(get_seconds(), + lock->l_last_activity); + LDLM_DEBUG(lock, "client-side enqueue: granted after " + CFS_DURATION_T"s", delay); + + /* Update our time estimate */ + at_measured(ldlm_lock_to_ns_at(lock), + delay); + result = 0; + } + return result; +} + +/** + * Implementation of ->l_completion_ast() for a client, that doesn't wait + * until lock is granted. Suitable for locks enqueued through ptlrpcd, of + * other threads that cannot block for long. + */ +int ldlm_completion_ast_async(struct ldlm_lock *lock, __u64 flags, void *data) +{ + if (flags == LDLM_FL_WAIT_NOREPROC) { + LDLM_DEBUG(lock, "client-side enqueue waiting on pending lock"); + return 0; + } + + if (!(flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED | + LDLM_FL_BLOCK_CONV))) { + wake_up(&lock->l_waitq); + return ldlm_completion_tail(lock); + } + + LDLM_DEBUG(lock, "client-side enqueue returned a blocked lock, going forward"); + ldlm_reprocess_all(lock->l_resource); + return 0; +} +EXPORT_SYMBOL(ldlm_completion_ast_async); + +/** + * Generic LDLM "completion" AST. This is called in several cases: + * + * - when a reply to an ENQUEUE RPC is received from the server + * (ldlm_cli_enqueue_fini()). Lock might be granted or not granted at + * this point (determined by flags); + * + * - when LDLM_CP_CALLBACK RPC comes to client to notify it that lock has + * been granted; + * + * - when ldlm_lock_match(LDLM_FL_LVB_READY) is about to wait until lock + * gets correct lvb; + * + * - to force all locks when resource is destroyed (cleanup_resource()); + * + * - during lock conversion (not used currently). + * + * If lock is not granted in the first case, this function waits until second + * or penultimate cases happen in some other thread. + * + */ +int ldlm_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data) +{ + /* XXX ALLOCATE - 160 bytes */ + struct lock_wait_data lwd; + struct obd_device *obd; + struct obd_import *imp = NULL; + struct l_wait_info lwi; + __u32 timeout; + int rc = 0; + + if (flags == LDLM_FL_WAIT_NOREPROC) { + LDLM_DEBUG(lock, "client-side enqueue waiting on pending lock"); + goto noreproc; + } + + if (!(flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED | + LDLM_FL_BLOCK_CONV))) { + wake_up(&lock->l_waitq); + return 0; + } + + LDLM_DEBUG(lock, "client-side enqueue returned a blocked lock, sleeping"); + +noreproc: + + obd = class_exp2obd(lock->l_conn_export); + + /* if this is a local lock, then there is no import */ + if (obd != NULL) + imp = obd->u.cli.cl_import; + + /* Wait a long time for enqueue - server may have to callback a + lock from another client. Server will evict the other client if it + doesn't respond reasonably, and then give us the lock. 
*/ + timeout = ldlm_get_enq_timeout(lock) * 2; + + lwd.lwd_lock = lock; + + if (lock->l_flags & LDLM_FL_NO_TIMEOUT) { + LDLM_DEBUG(lock, "waiting indefinitely because of NO_TIMEOUT"); + lwi = LWI_INTR(interrupted_completion_wait, &lwd); + } else { + lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(timeout), + ldlm_expired_completion_wait, + interrupted_completion_wait, &lwd); + } + + if (imp != NULL) { + spin_lock(&imp->imp_lock); + lwd.lwd_conn_cnt = imp->imp_conn_cnt; + spin_unlock(&imp->imp_lock); + } + + if (ns_is_client(ldlm_lock_to_ns(lock)) && + OBD_FAIL_CHECK_RESET(OBD_FAIL_LDLM_INTR_CP_AST, + OBD_FAIL_LDLM_CP_BL_RACE | OBD_FAIL_ONCE)) { + lock->l_flags |= LDLM_FL_FAIL_LOC; + rc = -EINTR; + } else { + /* Go to sleep until the lock is granted or cancelled. */ + rc = l_wait_event(lock->l_waitq, + is_granted_or_cancelled(lock), &lwi); + } + + if (rc) { + LDLM_DEBUG(lock, "client-side enqueue waking up: failed (%d)", + rc); + return rc; + } + + return ldlm_completion_tail(lock); +} +EXPORT_SYMBOL(ldlm_completion_ast); + +/** + * A helper to build a blocking AST function + * + * Perform a common operation for blocking ASTs: + * deferred lock cancellation. + * + * \param lock the lock blocking or canceling AST was called on + * \retval 0 + * \see mdt_blocking_ast + * \see ldlm_blocking_ast + */ +int ldlm_blocking_ast_nocheck(struct ldlm_lock *lock) +{ + int do_ast; + + lock->l_flags |= LDLM_FL_CBPENDING; + do_ast = !lock->l_readers && !lock->l_writers; + unlock_res_and_lock(lock); + + if (do_ast) { + struct lustre_handle lockh; + int rc; + + LDLM_DEBUG(lock, "already unused, calling ldlm_cli_cancel"); + ldlm_lock2handle(lock, &lockh); + rc = ldlm_cli_cancel(&lockh, LCF_ASYNC); + if (rc < 0) + CERROR("ldlm_cli_cancel: %d\n", rc); + } else { + LDLM_DEBUG(lock, "Lock still has references, will be cancelled later"); + } + return 0; +} +EXPORT_SYMBOL(ldlm_blocking_ast_nocheck); + +/** + * Server blocking AST + * + * ->l_blocking_ast() callback for LDLM locks acquired by server-side + * OBDs. + * + * \param lock the lock which blocks a request or cancelling lock + * \param desc unused + * \param data unused + * \param flag indicates whether this cancelling or blocking callback + * \retval 0 + * \see ldlm_blocking_ast_nocheck + */ +int ldlm_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, + void *data, int flag) +{ + if (flag == LDLM_CB_CANCELING) { + /* Don't need to do anything here. */ + return 0; + } + + lock_res_and_lock(lock); + /* Get this: if ldlm_blocking_ast is racing with intent_policy, such + * that ldlm_blocking_ast is called just before intent_policy method + * takes the lr_lock, then by the time we get the lock, we might not + * be the correct blocking function anymore. So check, and return + * early, if so. */ + if (lock->l_blocking_ast != ldlm_blocking_ast) { + unlock_res_and_lock(lock); + return 0; + } + return ldlm_blocking_ast_nocheck(lock); +} +EXPORT_SYMBOL(ldlm_blocking_ast); + +/** + * ->l_glimpse_ast() for DLM extent locks acquired on the server-side. See + * comment in filter_intent_policy() on why you may need this. + */ +int ldlm_glimpse_ast(struct ldlm_lock *lock, void *reqp) +{ + /* + * Returning -ELDLM_NO_LOCK_DATA actually works, but the reason for + * that is rather subtle: with OST-side locking, it may so happen that + * _all_ extent locks are held by the OST. If client wants to obtain + * current file size it calls ll{,u}_glimpse_size(), and (as locks are + * on the server), dummy glimpse callback fires and does + * nothing. 
Client still receives correct file size due to the + * following fragment in filter_intent_policy(): + * + * rc = l->l_glimpse_ast(l, NULL); // this will update the LVB + * if (rc != 0 && res->lr_namespace->ns_lvbo && + * res->lr_namespace->ns_lvbo->lvbo_update) { + * res->lr_namespace->ns_lvbo->lvbo_update(res, NULL, 0, 1); + * } + * + * that is, after glimpse_ast() fails, filter_lvbo_update() runs, and + * returns correct file size to the client. + */ + return -ELDLM_NO_LOCK_DATA; +} +EXPORT_SYMBOL(ldlm_glimpse_ast); + +/** + * Enqueue a local lock (typically on a server). + */ +int ldlm_cli_enqueue_local(struct ldlm_namespace *ns, + const struct ldlm_res_id *res_id, + ldlm_type_t type, ldlm_policy_data_t *policy, + ldlm_mode_t mode, __u64 *flags, + ldlm_blocking_callback blocking, + ldlm_completion_callback completion, + ldlm_glimpse_callback glimpse, + void *data, __u32 lvb_len, enum lvb_type lvb_type, + const __u64 *client_cookie, + struct lustre_handle *lockh) +{ + struct ldlm_lock *lock; + int err; + const struct ldlm_callback_suite cbs = { .lcs_completion = completion, + .lcs_blocking = blocking, + .lcs_glimpse = glimpse, + }; + + LASSERT(!(*flags & LDLM_FL_REPLAY)); + if (unlikely(ns_is_client(ns))) { + CERROR("Trying to enqueue local lock in a shadow namespace\n"); + LBUG(); + } + + lock = ldlm_lock_create(ns, res_id, type, mode, &cbs, data, lvb_len, + lvb_type); + if (unlikely(!lock)) { + err = -ENOMEM; + goto out_nolock; + } + + ldlm_lock2handle(lock, lockh); + + /* NB: we don't have any lock now (lock_res_and_lock) + * because it's a new lock */ + ldlm_lock_addref_internal_nolock(lock, mode); + lock->l_flags |= LDLM_FL_LOCAL; + if (*flags & LDLM_FL_ATOMIC_CB) + lock->l_flags |= LDLM_FL_ATOMIC_CB; + + if (policy != NULL) + lock->l_policy_data = *policy; + if (client_cookie != NULL) + lock->l_client_cookie = *client_cookie; + if (type == LDLM_EXTENT) + lock->l_req_extent = policy->l_extent; + + err = ldlm_lock_enqueue(ns, &lock, policy, flags); + if (unlikely(err != ELDLM_OK)) + goto out; + + if (policy != NULL) + *policy = lock->l_policy_data; + + if (lock->l_completion_ast) + lock->l_completion_ast(lock, *flags, NULL); + + LDLM_DEBUG(lock, "client-side local enqueue handler, new lock created"); + out: + LDLM_LOCK_RELEASE(lock); + out_nolock: + return err; +} +EXPORT_SYMBOL(ldlm_cli_enqueue_local); + +static void failed_lock_cleanup(struct ldlm_namespace *ns, + struct ldlm_lock *lock, int mode) +{ + int need_cancel = 0; + + /* Set a flag to prevent us from sending a CANCEL (bug 407) */ + lock_res_and_lock(lock); + /* Check that lock is not granted or failed, we might race. */ + if ((lock->l_req_mode != lock->l_granted_mode) && + !(lock->l_flags & LDLM_FL_FAILED)) { + /* Make sure that this lock will not be found by raced + * bl_ast and -EINVAL reply is sent to server anyways. + * bug 17645 */ + lock->l_flags |= LDLM_FL_LOCAL_ONLY | LDLM_FL_FAILED | + LDLM_FL_ATOMIC_CB | LDLM_FL_CBPENDING; + need_cancel = 1; + } + unlock_res_and_lock(lock); + + if (need_cancel) + LDLM_DEBUG(lock, + "setting FL_LOCAL_ONLY | LDLM_FL_FAILED | LDLM_FL_ATOMIC_CB | LDLM_FL_CBPENDING"); + else + LDLM_DEBUG(lock, "lock was granted or failed in race"); + + ldlm_lock_decref_internal(lock, mode); + + /* XXX - HACK because we shouldn't call ldlm_lock_destroy() + * from llite/file.c/ll_file_flock(). */ + /* This code makes for the fact that we do not have blocking handler on + * a client for flock locks. As such this is the place where we must + * completely kill failed locks. 
(interrupted and those that + * were waiting to be granted when server evicted us. */ + if (lock->l_resource->lr_type == LDLM_FLOCK) { + lock_res_and_lock(lock); + ldlm_resource_unlink_lock(lock); + ldlm_lock_destroy_nolock(lock); + unlock_res_and_lock(lock); + } +} + +/** + * Finishing portion of client lock enqueue code. + * + * Called after receiving reply from server. + */ +int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req, + ldlm_type_t type, __u8 with_policy, ldlm_mode_t mode, + __u64 *flags, void *lvb, __u32 lvb_len, + struct lustre_handle *lockh, int rc) +{ + struct ldlm_namespace *ns = exp->exp_obd->obd_namespace; + int is_replay = *flags & LDLM_FL_REPLAY; + struct ldlm_lock *lock; + struct ldlm_reply *reply; + int cleanup_phase = 1; + int size = 0; + + lock = ldlm_handle2lock(lockh); + /* ldlm_cli_enqueue is holding a reference on this lock. */ + if (!lock) { + LASSERT(type == LDLM_FLOCK); + return -ENOLCK; + } + + LASSERTF(ergo(lvb_len != 0, lvb_len == lock->l_lvb_len), + "lvb_len = %d, l_lvb_len = %d\n", lvb_len, lock->l_lvb_len); + + if (rc != ELDLM_OK) { + LASSERT(!is_replay); + LDLM_DEBUG(lock, "client-side enqueue END (%s)", + rc == ELDLM_LOCK_ABORTED ? "ABORTED" : "FAILED"); + + if (rc != ELDLM_LOCK_ABORTED) + goto cleanup; + } + + /* Before we return, swab the reply */ + reply = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + if (reply == NULL) { + rc = -EPROTO; + goto cleanup; + } + + if (lvb_len != 0) { + LASSERT(lvb != NULL); + + size = req_capsule_get_size(&req->rq_pill, &RMF_DLM_LVB, + RCL_SERVER); + if (size < 0) { + LDLM_ERROR(lock, "Fail to get lvb_len, rc = %d", size); + rc = size; + goto cleanup; + } else if (unlikely(size > lvb_len)) { + LDLM_ERROR(lock, "Replied LVB is larger than expectation, expected = %d, replied = %d", + lvb_len, size); + rc = -EINVAL; + goto cleanup; + } + } + + if (rc == ELDLM_LOCK_ABORTED) { + if (lvb_len != 0) + rc = ldlm_fill_lvb(lock, &req->rq_pill, RCL_SERVER, + lvb, size); + if (rc == 0) + rc = ELDLM_LOCK_ABORTED; + goto cleanup; + } + + /* lock enqueued on the server */ + cleanup_phase = 0; + + lock_res_and_lock(lock); + /* Key change rehash lock in per-export hash with new key */ + if (exp->exp_lock_hash) { + /* In the function below, .hs_keycmp resolves to + * ldlm_export_lock_keycmp() */ + /* coverity[overrun-buffer-val] */ + cfs_hash_rehash_key(exp->exp_lock_hash, + &lock->l_remote_handle, + &reply->lock_handle, + &lock->l_exp_hash); + } else { + lock->l_remote_handle = reply->lock_handle; + } + + *flags = ldlm_flags_from_wire(reply->lock_flags); + lock->l_flags |= ldlm_flags_from_wire(reply->lock_flags & + LDLM_INHERIT_FLAGS); + /* move NO_TIMEOUT flag to the lock to force ldlm_lock_match() + * to wait with no timeout as well */ + lock->l_flags |= ldlm_flags_from_wire(reply->lock_flags & + LDLM_FL_NO_TIMEOUT); + unlock_res_and_lock(lock); + + CDEBUG(D_INFO, "local: %p, remote cookie: %#llx, flags: 0x%llx\n", + lock, reply->lock_handle.cookie, *flags); + + /* If enqueue returned a blocked lock but the completion handler has + * already run, then it fixed up the resource and we don't need to do it + * again. 
*/ + if ((*flags) & LDLM_FL_LOCK_CHANGED) { + int newmode = reply->lock_desc.l_req_mode; + + LASSERT(!is_replay); + if (newmode && newmode != lock->l_req_mode) { + LDLM_DEBUG(lock, "server returned different mode %s", + ldlm_lockname[newmode]); + lock->l_req_mode = newmode; + } + + if (!ldlm_res_eq(&reply->lock_desc.l_resource.lr_name, + &lock->l_resource->lr_name)) { + CDEBUG(D_INFO, "remote intent success, locking "DLDLMRES + " instead of "DLDLMRES"\n", + PLDLMRES(&reply->lock_desc.l_resource), + PLDLMRES(lock->l_resource)); + + rc = ldlm_lock_change_resource(ns, lock, + &reply->lock_desc.l_resource.lr_name); + if (rc || lock->l_resource == NULL) { + rc = -ENOMEM; + goto cleanup; + } + LDLM_DEBUG(lock, "client-side enqueue, new resource"); + } + if (with_policy) + if (!(type == LDLM_IBITS && + !(exp_connect_flags(exp) & OBD_CONNECT_IBITS))) + /* We assume lock type cannot change on server*/ + ldlm_convert_policy_to_local(exp, + lock->l_resource->lr_type, + &reply->lock_desc.l_policy_data, + &lock->l_policy_data); + if (type != LDLM_PLAIN) + LDLM_DEBUG(lock, + "client-side enqueue, new policy data"); + } + + if ((*flags) & LDLM_FL_AST_SENT || + /* Cancel extent locks as soon as possible on a liblustre client, + * because it cannot handle asynchronous ASTs robustly (see + * bug 7311). */ + (LIBLUSTRE_CLIENT && type == LDLM_EXTENT)) { + lock_res_and_lock(lock); + lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_BL_AST; + unlock_res_and_lock(lock); + LDLM_DEBUG(lock, "enqueue reply includes blocking AST"); + } + + /* If the lock has already been granted by a completion AST, don't + * clobber the LVB with an older one. */ + if (lvb_len != 0) { + /* We must lock or a racing completion might update lvb without + * letting us know and we'll clobber the correct value. + * Cannot unlock after the check either, a that still leaves + * a tiny window for completion to get in */ + lock_res_and_lock(lock); + if (lock->l_req_mode != lock->l_granted_mode) + rc = ldlm_fill_lvb(lock, &req->rq_pill, RCL_SERVER, + lock->l_lvb_data, size); + unlock_res_and_lock(lock); + if (rc < 0) { + cleanup_phase = 1; + goto cleanup; + } + } + + if (!is_replay) { + rc = ldlm_lock_enqueue(ns, &lock, NULL, flags); + if (lock->l_completion_ast != NULL) { + int err = lock->l_completion_ast(lock, *flags, NULL); + + if (!rc) + rc = err; + if (rc) + cleanup_phase = 1; + } + } + + if (lvb_len && lvb != NULL) { + /* Copy the LVB here, and not earlier, because the completion + * AST (if any) can override what we got in the reply */ + memcpy(lvb, lock->l_lvb_data, lvb_len); + } + + LDLM_DEBUG(lock, "client-side enqueue END"); +cleanup: + if (cleanup_phase == 1 && rc) + failed_lock_cleanup(ns, lock, mode); + /* Put lock 2 times, the second reference is held by ldlm_cli_enqueue */ + LDLM_LOCK_PUT(lock); + LDLM_LOCK_RELEASE(lock); + return rc; +} +EXPORT_SYMBOL(ldlm_cli_enqueue_fini); + +/** + * Estimate number of lock handles that would fit into request of given + * size. PAGE_SIZE-512 is to allow TCP/IP and LNET headers to fit into + * a single page on the send/receive side. XXX: 512 should be changed to + * more adequate value. 
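The handle-count estimate described above is straightforward arithmetic; the sketch below assumes a 4KB page, the 512-byte header allowance from the comment, and made-up sizes for the fixed request part and struct lustre_handle, since the real LDLM_MAXREQSIZE and LDLM_LOCKREQ_HANDLES values are defined elsewhere.

#include <stdio.h>

#define PAGE_SZ          4096
#define HDR_ALLOWANCE     512   /* room left for TCP/IP + LNET headers */
#define MAX_REQ_SZ    PAGE_SZ   /* stand-in for LDLM_MAXREQSIZE */
#define HANDLE_SZ           8   /* stand-in for sizeof(struct lustre_handle) */
#define LOCKREQ_HANDLES     2   /* stand-in for LDLM_LOCKREQ_HANDLES */

static int handles_avail(int req_size, int off)
{
        int budget = (MAX_REQ_SZ < PAGE_SZ - HDR_ALLOWANCE ?
                      MAX_REQ_SZ : PAGE_SZ - HDR_ALLOWANCE) - req_size;
        int avail = budget >= 0 ? budget / HANDLE_SZ : 0;

        return avail + LOCKREQ_HANDLES - off;   /* handles already in the request */
}

int main(void)
{
        printf("%d\n", handles_avail(1024, 1));  /* (3584-1024)/8 + 2 - 1 = 321 */
        return 0;
}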
+ */ +static inline int ldlm_req_handles_avail(int req_size, int off) +{ + int avail; + + avail = min_t(int, LDLM_MAXREQSIZE, PAGE_CACHE_SIZE - 512) - req_size; + if (likely(avail >= 0)) + avail /= (int)sizeof(struct lustre_handle); + else + avail = 0; + avail += LDLM_LOCKREQ_HANDLES - off; + + return avail; +} + +static inline int ldlm_capsule_handles_avail(struct req_capsule *pill, + enum req_location loc, + int off) +{ + int size = req_capsule_msg_size(pill, loc); + + return ldlm_req_handles_avail(size, off); +} + +static inline int ldlm_format_handles_avail(struct obd_import *imp, + const struct req_format *fmt, + enum req_location loc, int off) +{ + int size = req_capsule_fmt_size(imp->imp_msg_magic, fmt, loc); + + return ldlm_req_handles_avail(size, off); +} + +/** + * Cancel LRU locks and pack them into the enqueue request. Pack there the given + * \a count locks in \a cancels. + * + * This is to be called by functions preparing their own requests that + * might contain lists of locks to cancel in addition to actual operation + * that needs to be performed. + */ +int ldlm_prep_elc_req(struct obd_export *exp, struct ptlrpc_request *req, + int version, int opc, int canceloff, + struct list_head *cancels, int count) +{ + struct ldlm_namespace *ns = exp->exp_obd->obd_namespace; + struct req_capsule *pill = &req->rq_pill; + struct ldlm_request *dlm = NULL; + int flags, avail, to_free, pack = 0; + LIST_HEAD(head); + int rc; + + if (cancels == NULL) + cancels = &head; + if (ns_connect_cancelset(ns)) { + /* Estimate the amount of available space in the request. */ + req_capsule_filled_sizes(pill, RCL_CLIENT); + avail = ldlm_capsule_handles_avail(pill, RCL_CLIENT, canceloff); + + flags = ns_connect_lru_resize(ns) ? + LDLM_CANCEL_LRUR : LDLM_CANCEL_AGED; + to_free = !ns_connect_lru_resize(ns) && + opc == LDLM_ENQUEUE ? 1 : 0; + + /* Cancel LRU locks here _only_ if the server supports + * EARLY_CANCEL. Otherwise we have to send extra CANCEL + * RPC, which will make us slower. */ + if (avail > count) + count += ldlm_cancel_lru_local(ns, cancels, to_free, + avail - count, 0, flags); + if (avail > count) + pack = count; + else + pack = avail; + req_capsule_set_size(pill, &RMF_DLM_REQ, RCL_CLIENT, + ldlm_request_bufsize(pack, opc)); + } + + rc = ptlrpc_request_pack(req, version, opc); + if (rc) { + ldlm_lock_list_put(cancels, l_bl_ast, count); + return rc; + } + + if (ns_connect_cancelset(ns)) { + if (canceloff) { + dlm = req_capsule_client_get(pill, &RMF_DLM_REQ); + LASSERT(dlm); + /* Skip first lock handler in ldlm_request_pack(), + * this method will increment @lock_count according + * to the lock handle amount actually written to + * the buffer. */ + dlm->lock_count = canceloff; + } + /* Pack into the request @pack lock handles. */ + ldlm_cli_cancel_list(cancels, pack, req, 0); + /* Prepare and send separate cancel RPC for others. 
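Deciding how many cancels to piggy-back on the enqueue above comes down to: top the caller's list up from the LRU while there is room, pack at most "avail" handles into the enqueue, and ship any overflow in a separate CANCEL RPC. A reduced model of that counting logic (lru_cancelable is a made-up stand-in for what ldlm_cancel_lru_local() would find):

#include <stdio.h>

static void elc_plan(int avail, int count, int lru_cancelable,
                     int *packed, int *separate_rpc)
{
        if (avail > count)                       /* room left: cancel extra LRU locks too */
                count += lru_cancelable < avail - count ?
                         lru_cancelable : avail - count;

        *packed = count < avail ? count : avail; /* rides inside the enqueue request */
        *separate_rpc = count - *packed;         /* needs its own CANCEL RPC */
}

int main(void)
{
        int packed, separate;

        elc_plan(/*avail*/ 50, /*count*/ 10, /*lru*/ 100, &packed, &separate);
        printf("packed=%d separate=%d\n", packed, separate);  /* 50 and 0 */

        elc_plan(50, 80, 100, &packed, &separate);
        printf("packed=%d separate=%d\n", packed, separate);  /* 50 and 30 */
        return 0;
}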
*/ + ldlm_cli_cancel_list(cancels, count - pack, NULL, 0); + } else { + ldlm_lock_list_put(cancels, l_bl_ast, count); + } + return 0; +} +EXPORT_SYMBOL(ldlm_prep_elc_req); + +int ldlm_prep_enqueue_req(struct obd_export *exp, struct ptlrpc_request *req, + struct list_head *cancels, int count) +{ + return ldlm_prep_elc_req(exp, req, LUSTRE_DLM_VERSION, LDLM_ENQUEUE, + LDLM_ENQUEUE_CANCEL_OFF, cancels, count); +} +EXPORT_SYMBOL(ldlm_prep_enqueue_req); + +struct ptlrpc_request *ldlm_enqueue_pack(struct obd_export *exp, int lvb_len) +{ + struct ptlrpc_request *req; + int rc; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LDLM_ENQUEUE); + if (req == NULL) + return ERR_PTR(-ENOMEM); + + rc = ldlm_prep_enqueue_req(exp, req, NULL, 0); + if (rc) { + ptlrpc_request_free(req); + return ERR_PTR(rc); + } + + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, lvb_len); + ptlrpc_request_set_replen(req); + return req; +} +EXPORT_SYMBOL(ldlm_enqueue_pack); + +/** + * Client-side lock enqueue. + * + * If a request has some specific initialisation it is passed in \a reqp, + * otherwise it is created in ldlm_cli_enqueue. + * + * Supports sync and async requests, pass \a async flag accordingly. If a + * request was created in ldlm_cli_enqueue and it is the async request, + * pass it to the caller in \a reqp. + */ +int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp, + struct ldlm_enqueue_info *einfo, + const struct ldlm_res_id *res_id, + ldlm_policy_data_t const *policy, __u64 *flags, + void *lvb, __u32 lvb_len, enum lvb_type lvb_type, + struct lustre_handle *lockh, int async) +{ + struct ldlm_namespace *ns; + struct ldlm_lock *lock; + struct ldlm_request *body; + int is_replay = *flags & LDLM_FL_REPLAY; + int req_passed_in = 1; + int rc, err; + struct ptlrpc_request *req; + + LASSERT(exp != NULL); + + ns = exp->exp_obd->obd_namespace; + + /* If we're replaying this lock, just check some invariants. + * If we're creating a new lock, get everything all setup nice. 
*/ + if (is_replay) { + lock = ldlm_handle2lock_long(lockh, 0); + LASSERT(lock != NULL); + LDLM_DEBUG(lock, "client-side enqueue START"); + LASSERT(exp == lock->l_conn_export); + } else { + const struct ldlm_callback_suite cbs = { + .lcs_completion = einfo->ei_cb_cp, + .lcs_blocking = einfo->ei_cb_bl, + .lcs_glimpse = einfo->ei_cb_gl + }; + lock = ldlm_lock_create(ns, res_id, einfo->ei_type, + einfo->ei_mode, &cbs, einfo->ei_cbdata, + lvb_len, lvb_type); + if (lock == NULL) + return -ENOMEM; + /* for the local lock, add the reference */ + ldlm_lock_addref_internal(lock, einfo->ei_mode); + ldlm_lock2handle(lock, lockh); + if (policy != NULL) + lock->l_policy_data = *policy; + + if (einfo->ei_type == LDLM_EXTENT) + lock->l_req_extent = policy->l_extent; + LDLM_DEBUG(lock, "client-side enqueue START, flags %llx\n", + *flags); + } + + lock->l_conn_export = exp; + lock->l_export = NULL; + lock->l_blocking_ast = einfo->ei_cb_bl; + lock->l_flags |= (*flags & (LDLM_FL_NO_LRU | LDLM_FL_EXCL)); + + /* lock not sent to server yet */ + + if (reqp == NULL || *reqp == NULL) { + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), + &RQF_LDLM_ENQUEUE, + LUSTRE_DLM_VERSION, + LDLM_ENQUEUE); + if (req == NULL) { + failed_lock_cleanup(ns, lock, einfo->ei_mode); + LDLM_LOCK_RELEASE(lock); + return -ENOMEM; + } + req_passed_in = 0; + if (reqp) + *reqp = req; + } else { + int len; + + req = *reqp; + len = req_capsule_get_size(&req->rq_pill, &RMF_DLM_REQ, + RCL_CLIENT); + LASSERTF(len >= sizeof(*body), "buflen[%d] = %d, not %d\n", + DLM_LOCKREQ_OFF, len, (int)sizeof(*body)); + } + + /* Dump lock data into the request buffer */ + body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + ldlm_lock2desc(lock, &body->lock_desc); + body->lock_flags = ldlm_flags_to_wire(*flags); + body->lock_handle[0] = *lockh; + + /* Continue as normal. */ + if (!req_passed_in) { + if (lvb_len > 0) + req_capsule_extend(&req->rq_pill, + &RQF_LDLM_ENQUEUE_LVB); + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, + lvb_len); + ptlrpc_request_set_replen(req); + } + + /* + * Liblustre client doesn't get extent locks, except for O_APPEND case + * where [0, OBD_OBJECT_EOF] lock is taken, or truncate, where + * [i_size, OBD_OBJECT_EOF] lock is taken. + */ + LASSERT(ergo(LIBLUSTRE_CLIENT, einfo->ei_type != LDLM_EXTENT || + policy->l_extent.end == OBD_OBJECT_EOF)); + + if (async) { + LASSERT(reqp != NULL); + return 0; + } + + LDLM_DEBUG(lock, "sending request"); + + rc = ptlrpc_queue_wait(req); + + err = ldlm_cli_enqueue_fini(exp, req, einfo->ei_type, policy ? 
1 : 0, + einfo->ei_mode, flags, lvb, lvb_len, + lockh, rc); + + /* If ldlm_cli_enqueue_fini did not find the lock, we need to free + * one reference that we took */ + if (err == -ENOLCK) + LDLM_LOCK_RELEASE(lock); + else + rc = err; + + if (!req_passed_in && req != NULL) { + ptlrpc_req_finished(req); + if (reqp) + *reqp = NULL; + } + + return rc; +} +EXPORT_SYMBOL(ldlm_cli_enqueue); + +static int ldlm_cli_convert_local(struct ldlm_lock *lock, int new_mode, + __u32 *flags) +{ + struct ldlm_resource *res; + int rc; + + if (ns_is_client(ldlm_lock_to_ns(lock))) { + CERROR("Trying to cancel local lock\n"); + LBUG(); + } + LDLM_DEBUG(lock, "client-side local convert"); + + res = ldlm_lock_convert(lock, new_mode, flags); + if (res) { + ldlm_reprocess_all(res); + rc = 0; + } else { + rc = LUSTRE_EDEADLK; + } + LDLM_DEBUG(lock, "client-side local convert handler END"); + LDLM_LOCK_PUT(lock); + return rc; +} + +/* FIXME: one of ldlm_cli_convert or the server side should reject attempted + * conversion of locks which are on the waiting or converting queue */ +/* Caller of this code is supposed to take care of lock readers/writers + accounting */ +int ldlm_cli_convert(struct lustre_handle *lockh, int new_mode, __u32 *flags) +{ + struct ldlm_request *body; + struct ldlm_reply *reply; + struct ldlm_lock *lock; + struct ldlm_resource *res; + struct ptlrpc_request *req; + int rc; + + lock = ldlm_handle2lock(lockh); + if (!lock) { + LBUG(); + return -EINVAL; + } + *flags = 0; + + if (lock->l_conn_export == NULL) + return ldlm_cli_convert_local(lock, new_mode, flags); + + LDLM_DEBUG(lock, "client-side convert"); + + req = ptlrpc_request_alloc_pack(class_exp2cliimp(lock->l_conn_export), + &RQF_LDLM_CONVERT, LUSTRE_DLM_VERSION, + LDLM_CONVERT); + if (req == NULL) { + LDLM_LOCK_PUT(lock); + return -ENOMEM; + } + + body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + body->lock_handle[0] = lock->l_remote_handle; + + body->lock_desc.l_req_mode = new_mode; + body->lock_flags = ldlm_flags_to_wire(*flags); + + + ptlrpc_request_set_replen(req); + rc = ptlrpc_queue_wait(req); + if (rc != ELDLM_OK) + goto out; + + reply = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + if (reply == NULL) { + rc = -EPROTO; + goto out; + } + + if (req->rq_status) { + rc = req->rq_status; + goto out; + } + + res = ldlm_lock_convert(lock, new_mode, &reply->lock_flags); + if (res != NULL) { + ldlm_reprocess_all(res); + /* Go to sleep until the lock is granted. */ + /* FIXME: or cancelled. */ + if (lock->l_completion_ast) { + rc = lock->l_completion_ast(lock, LDLM_FL_WAIT_NOREPROC, + NULL); + if (rc) + goto out; + } + } else { + rc = LUSTRE_EDEADLK; + } + out: + LDLM_LOCK_PUT(lock); + ptlrpc_req_finished(req); + return rc; +} +EXPORT_SYMBOL(ldlm_cli_convert); + +/** + * Cancel locks locally. + * Returns: + * \retval LDLM_FL_LOCAL_ONLY if there is no need for a CANCEL RPC to the server + * \retval LDLM_FL_CANCELING otherwise; + * \retval LDLM_FL_BL_AST if there is a need for a separate CANCEL RPC. + */ +static __u64 ldlm_cli_cancel_local(struct ldlm_lock *lock) +{ + __u64 rc = LDLM_FL_LOCAL_ONLY; + + if (lock->l_conn_export) { + bool local_only; + + LDLM_DEBUG(lock, "client-side cancel"); + /* Set this flag to prevent others from getting new references*/ + lock_res_and_lock(lock); + lock->l_flags |= LDLM_FL_CBPENDING; + local_only = !!(lock->l_flags & + (LDLM_FL_LOCAL_ONLY|LDLM_FL_CANCEL_ON_BLOCK)); + ldlm_cancel_callback(lock); + rc = (lock->l_flags & LDLM_FL_BL_AST) ? 
+ LDLM_FL_BL_AST : LDLM_FL_CANCELING; + unlock_res_and_lock(lock); + + if (local_only) { + CDEBUG(D_DLMTRACE, "not sending request (at caller's instruction)\n"); + rc = LDLM_FL_LOCAL_ONLY; + } + ldlm_lock_cancel(lock); + } else { + if (ns_is_client(ldlm_lock_to_ns(lock))) { + LDLM_ERROR(lock, "Trying to cancel local lock"); + LBUG(); + } + LDLM_DEBUG(lock, "server-side local cancel"); + ldlm_lock_cancel(lock); + ldlm_reprocess_all(lock->l_resource); + } + + return rc; +} + +/** + * Pack \a count locks in \a head into ldlm_request buffer of request \a req. + */ +static void ldlm_cancel_pack(struct ptlrpc_request *req, + struct list_head *head, int count) +{ + struct ldlm_request *dlm; + struct ldlm_lock *lock; + int max, packed = 0; + + dlm = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + LASSERT(dlm != NULL); + + /* Check the room in the request buffer. */ + max = req_capsule_get_size(&req->rq_pill, &RMF_DLM_REQ, RCL_CLIENT) - + sizeof(struct ldlm_request); + max /= sizeof(struct lustre_handle); + max += LDLM_LOCKREQ_HANDLES; + LASSERT(max >= dlm->lock_count + count); + + /* XXX: it would be better to pack lock handles grouped by resource. + * so that the server cancel would call filter_lvbo_update() less + * frequently. */ + list_for_each_entry(lock, head, l_bl_ast) { + if (!count--) + break; + LASSERT(lock->l_conn_export); + /* Pack the lock handle to the given request buffer. */ + LDLM_DEBUG(lock, "packing"); + dlm->lock_handle[dlm->lock_count++] = lock->l_remote_handle; + packed++; + } + CDEBUG(D_DLMTRACE, "%d locks packed\n", packed); +} + +/** + * Prepare and send a batched cancel RPC. It will include \a count lock + * handles of locks given in \a cancels list. */ +int ldlm_cli_cancel_req(struct obd_export *exp, struct list_head *cancels, + int count, ldlm_cancel_flags_t flags) +{ + struct ptlrpc_request *req = NULL; + struct obd_import *imp; + int free, sent = 0; + int rc = 0; + + LASSERT(exp != NULL); + LASSERT(count > 0); + + CFS_FAIL_TIMEOUT(OBD_FAIL_LDLM_PAUSE_CANCEL, cfs_fail_val); + + if (CFS_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_RACE)) + return count; + + free = ldlm_format_handles_avail(class_exp2cliimp(exp), + &RQF_LDLM_CANCEL, RCL_CLIENT, 0); + if (count > free) + count = free; + + while (1) { + imp = class_exp2cliimp(exp); + if (imp == NULL || imp->imp_invalid) { + CDEBUG(D_DLMTRACE, + "skipping cancel on invalid import %p\n", imp); + return count; + } + + req = ptlrpc_request_alloc(imp, &RQF_LDLM_CANCEL); + if (req == NULL) { + rc = -ENOMEM; + goto out; + } + + req_capsule_filled_sizes(&req->rq_pill, RCL_CLIENT); + req_capsule_set_size(&req->rq_pill, &RMF_DLM_REQ, RCL_CLIENT, + ldlm_request_bufsize(count, LDLM_CANCEL)); + + rc = ptlrpc_request_pack(req, LUSTRE_DLM_VERSION, LDLM_CANCEL); + if (rc) { + ptlrpc_request_free(req); + goto out; + } + + req->rq_request_portal = LDLM_CANCEL_REQUEST_PORTAL; + req->rq_reply_portal = LDLM_CANCEL_REPLY_PORTAL; + ptlrpc_at_set_req_timeout(req); + + ldlm_cancel_pack(req, cancels, count); + + ptlrpc_request_set_replen(req); + if (flags & LCF_ASYNC) { + ptlrpcd_add_req(req, PDL_POLICY_LOCAL, -1); + sent = count; + goto out; + } else { + rc = ptlrpc_queue_wait(req); + } + if (rc == LUSTRE_ESTALE) { + CDEBUG(D_DLMTRACE, "client/server (nid %s) out of sync -- not fatal\n", + libcfs_nid2str(req->rq_import-> + imp_connection->c_peer.nid)); + rc = 0; + } else if (rc == -ETIMEDOUT && /* check there was no reconnect*/ + req->rq_import_generation == imp->imp_generation) { + ptlrpc_req_finished(req); + continue; + } else if (rc != ELDLM_OK) { + 
/* -ESHUTDOWN is common on umount */ + CDEBUG_LIMIT(rc == -ESHUTDOWN ? D_DLMTRACE : D_ERROR, + "Got rc %d from cancel RPC: canceling anyway\n", + rc); + break; + } + sent = count; + break; + } + + ptlrpc_req_finished(req); +out: + return sent ? sent : rc; +} +EXPORT_SYMBOL(ldlm_cli_cancel_req); + +static inline struct ldlm_pool *ldlm_imp2pl(struct obd_import *imp) +{ + LASSERT(imp != NULL); + return &imp->imp_obd->obd_namespace->ns_pool; +} + +/** + * Update client's OBD pool related fields with new SLV and Limit from \a req. + */ +int ldlm_cli_update_pool(struct ptlrpc_request *req) +{ + struct obd_device *obd; + __u64 new_slv; + __u32 new_limit; + + if (unlikely(!req->rq_import || !req->rq_import->imp_obd || + !imp_connect_lru_resize(req->rq_import))) { + /* + * Do nothing for corner cases. + */ + return 0; + } + + /* In some cases RPC may contain SLV and limit zeroed out. This + * is the case when server does not support LRU resize feature. + * This is also possible in some recovery cases when server-side + * reqs have no reference to the OBD export and thus access to + * server-side namespace is not possible. */ + if (lustre_msg_get_slv(req->rq_repmsg) == 0 || + lustre_msg_get_limit(req->rq_repmsg) == 0) { + DEBUG_REQ(D_HA, req, + "Zero SLV or Limit found (SLV: %llu, Limit: %u)", + lustre_msg_get_slv(req->rq_repmsg), + lustre_msg_get_limit(req->rq_repmsg)); + return 0; + } + + new_limit = lustre_msg_get_limit(req->rq_repmsg); + new_slv = lustre_msg_get_slv(req->rq_repmsg); + obd = req->rq_import->imp_obd; + + /* Set new SLV and limit in OBD fields to make them accessible + * to the pool thread. We do not access obd_namespace and pool + * directly here as there is no reliable way to make sure that + * they are still alive at cleanup time. Evil races are possible + * which may cause Oops at that time. */ + write_lock(&obd->obd_pool_lock); + obd->obd_pool_slv = new_slv; + obd->obd_pool_limit = new_limit; + write_unlock(&obd->obd_pool_lock); + + return 0; +} +EXPORT_SYMBOL(ldlm_cli_update_pool); + +/** + * Client side lock cancel. + * + * Lock must not have any readers or writers by this time. + */ +int ldlm_cli_cancel(struct lustre_handle *lockh, + ldlm_cancel_flags_t cancel_flags) +{ + struct obd_export *exp; + int avail, flags, count = 1; + __u64 rc = 0; + struct ldlm_namespace *ns; + struct ldlm_lock *lock; + LIST_HEAD(cancels); + + /* concurrent cancels on the same handle can happen */ + lock = ldlm_handle2lock_long(lockh, LDLM_FL_CANCELING); + if (lock == NULL) { + LDLM_DEBUG_NOLOCK("lock is already being destroyed\n"); + return 0; + } + + rc = ldlm_cli_cancel_local(lock); + if (rc == LDLM_FL_LOCAL_ONLY || cancel_flags & LCF_LOCAL) { + LDLM_LOCK_RELEASE(lock); + return 0; + } + /* Even if the lock is marked as LDLM_FL_BL_AST, this is a LDLM_CANCEL + * RPC which goes to canceld portal, so we can cancel other LRU locks + * here and send them all as one LDLM_CANCEL RPC. */ + LASSERT(list_empty(&lock->l_bl_ast)); + list_add(&lock->l_bl_ast, &cancels); + + exp = lock->l_conn_export; + if (exp_connect_cancelset(exp)) { + avail = ldlm_format_handles_avail(class_exp2cliimp(exp), + &RQF_LDLM_CANCEL, + RCL_CLIENT, 0); + LASSERT(avail > 0); + + ns = ldlm_lock_to_ns(lock); + flags = ns_connect_lru_resize(ns) ? 
+ LDLM_CANCEL_LRUR : LDLM_CANCEL_AGED; + count += ldlm_cancel_lru_local(ns, &cancels, 0, avail - 1, + LCF_BL_AST, flags); + } + ldlm_cli_cancel_list(&cancels, count, NULL, cancel_flags); + return 0; +} +EXPORT_SYMBOL(ldlm_cli_cancel); + +/** + * Locally cancel up to \a count locks in list \a cancels. + * Return the number of cancelled locks. + */ +int ldlm_cli_cancel_list_local(struct list_head *cancels, int count, + ldlm_cancel_flags_t flags) +{ + LIST_HEAD(head); + struct ldlm_lock *lock, *next; + int left = 0, bl_ast = 0; + __u64 rc; + + left = count; + list_for_each_entry_safe(lock, next, cancels, l_bl_ast) { + if (left-- == 0) + break; + + if (flags & LCF_LOCAL) { + rc = LDLM_FL_LOCAL_ONLY; + ldlm_lock_cancel(lock); + } else { + rc = ldlm_cli_cancel_local(lock); + } + /* Until we have compound requests and can send LDLM_CANCEL + * requests batched with generic RPCs, we need to send cancels + * with the LDLM_FL_BL_AST flag in a separate RPC from + * the one being generated now. */ + if (!(flags & LCF_BL_AST) && (rc == LDLM_FL_BL_AST)) { + LDLM_DEBUG(lock, "Cancel lock separately"); + list_del_init(&lock->l_bl_ast); + list_add(&lock->l_bl_ast, &head); + bl_ast++; + continue; + } + if (rc == LDLM_FL_LOCAL_ONLY) { + /* CANCEL RPC should not be sent to server. */ + list_del_init(&lock->l_bl_ast); + LDLM_LOCK_RELEASE(lock); + count--; + } + } + if (bl_ast > 0) { + count -= bl_ast; + ldlm_cli_cancel_list(&head, bl_ast, NULL, 0); + } + + return count; +} +EXPORT_SYMBOL(ldlm_cli_cancel_list_local); + +/** + * Cancel as many locks as possible w/o sending any RPCs (e.g. to write back + * dirty data, to close a file, ...) or waiting for any RPCs in-flight (e.g. + * readahead requests, ...) + */ +static ldlm_policy_res_t ldlm_cancel_no_wait_policy(struct ldlm_namespace *ns, + struct ldlm_lock *lock, + int unused, int added, + int count) +{ + ldlm_policy_res_t result = LDLM_POLICY_CANCEL_LOCK; + ldlm_cancel_for_recovery cb = ns->ns_cancel_for_recovery; + + lock_res_and_lock(lock); + + /* don't check added & count since we want to process all locks + * from unused list */ + switch (lock->l_resource->lr_type) { + case LDLM_EXTENT: + case LDLM_IBITS: + if (cb && cb(lock)) + break; + default: + result = LDLM_POLICY_SKIP_LOCK; + lock->l_flags |= LDLM_FL_SKIPPED; + break; + } + + unlock_res_and_lock(lock); + return result; +} + +/** + * Callback function for LRU-resize policy. Decides whether to keep + * \a lock in LRU for current \a LRU size \a unused, added in current + * scan \a added and number of locks to be preferably canceled \a count. + * + * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU in stop scanning + * + * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU + */ +static ldlm_policy_res_t ldlm_cancel_lrur_policy(struct ldlm_namespace *ns, + struct ldlm_lock *lock, + int unused, int added, + int count) +{ + unsigned long cur = cfs_time_current(); + struct ldlm_pool *pl = &ns->ns_pool; + __u64 slv, lvf, lv; + unsigned long la; + + /* Stop LRU processing when we reach past @count or have checked all + * locks in LRU. */ + if (count && added >= count) + return LDLM_POLICY_KEEP_LOCK; + + slv = ldlm_pool_get_slv(pl); + lvf = ldlm_pool_get_lvf(pl); + la = cfs_duration_sec(cfs_time_sub(cur, + lock->l_last_used)); + lv = lvf * la * unused; + + /* Inform pool about current CLV to see it via proc. */ + ldlm_pool_set_clv(pl, lv); + + /* Stop when SLV is not yet come from server or lv is smaller than + * it is. */ + return (slv == 0 || lv < slv) ? 
+ LDLM_POLICY_KEEP_LOCK : LDLM_POLICY_CANCEL_LOCK; +} + +/** + * Callback function for proc used policy. Makes decision whether to keep + * \a lock in LRU for current \a LRU size \a unused, added in current scan \a + * added and number of locks to be preferably canceled \a count. + * + * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU in stop scanning + * + * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU + */ +static ldlm_policy_res_t ldlm_cancel_passed_policy(struct ldlm_namespace *ns, + struct ldlm_lock *lock, + int unused, int added, + int count) +{ + /* Stop LRU processing when we reach past @count or have checked all + * locks in LRU. */ + return (added >= count) ? + LDLM_POLICY_KEEP_LOCK : LDLM_POLICY_CANCEL_LOCK; +} + +/** + * Callback function for aged policy. Makes decision whether to keep \a lock in + * LRU for current LRU size \a unused, added in current scan \a added and + * number of locks to be preferably canceled \a count. + * + * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU in stop scanning + * + * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU + */ +static ldlm_policy_res_t ldlm_cancel_aged_policy(struct ldlm_namespace *ns, + struct ldlm_lock *lock, + int unused, int added, + int count) +{ + /* Stop LRU processing if young lock is found and we reach past count */ + return ((added >= count) && + time_before(cfs_time_current(), + cfs_time_add(lock->l_last_used, ns->ns_max_age))) ? + LDLM_POLICY_KEEP_LOCK : LDLM_POLICY_CANCEL_LOCK; +} + +/** + * Callback function for default policy. Makes decision whether to keep \a lock + * in LRU for current LRU size \a unused, added in current scan \a added and + * number of locks to be preferably canceled \a count. + * + * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU in stop scanning + * + * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU + */ +static ldlm_policy_res_t ldlm_cancel_default_policy(struct ldlm_namespace *ns, + struct ldlm_lock *lock, + int unused, int added, + int count) +{ + /* Stop LRU processing when we reach past count or have checked all + * locks in LRU. */ + return (added >= count) ? + LDLM_POLICY_KEEP_LOCK : LDLM_POLICY_CANCEL_LOCK; +} + +typedef ldlm_policy_res_t (*ldlm_cancel_lru_policy_t)(struct ldlm_namespace *, + struct ldlm_lock *, int, + int, int); + +static ldlm_cancel_lru_policy_t +ldlm_cancel_lru_policy(struct ldlm_namespace *ns, int flags) +{ + if (flags & LDLM_CANCEL_NO_WAIT) + return ldlm_cancel_no_wait_policy; + + if (ns_connect_lru_resize(ns)) { + if (flags & LDLM_CANCEL_SHRINK) + /* We kill passed number of old locks. */ + return ldlm_cancel_passed_policy; + else if (flags & LDLM_CANCEL_LRUR) + return ldlm_cancel_lrur_policy; + else if (flags & LDLM_CANCEL_PASSED) + return ldlm_cancel_passed_policy; + } else { + if (flags & LDLM_CANCEL_AGED) + return ldlm_cancel_aged_policy; + } + + return ldlm_cancel_default_policy; +} + +/** + * - Free space in LRU for \a count new locks, + * redundant unused locks are canceled locally; + * - also cancel locally unused aged locks; + * - do not cancel more than \a max locks; + * - GET the found locks and add them into the \a cancels list. + * + * A client lock can be added to the l_bl_ast list only when it is + * marked LDLM_FL_CANCELING. Otherwise, somebody is already doing + * CANCEL. There are the following use cases: + * ldlm_cancel_resource_local(), ldlm_cancel_lru_local() and + * ldlm_cli_cancel(), which check and set this flag properly. 
As any + * attempt to cancel a lock rely on this flag, l_bl_ast list is accessed + * later without any special locking. + * + * Calling policies for enabled LRU resize: + * ---------------------------------------- + * flags & LDLM_CANCEL_LRUR - use LRU resize policy (SLV from server) to + * cancel not more than \a count locks; + * + * flags & LDLM_CANCEL_PASSED - cancel \a count number of old locks (located at + * the beginning of LRU list); + * + * flags & LDLM_CANCEL_SHRINK - cancel not more than \a count locks according to + * memory pressure policy function; + * + * flags & LDLM_CANCEL_AGED - cancel \a count locks according to "aged policy". + * + * flags & LDLM_CANCEL_NO_WAIT - cancel as many unused locks as possible + * (typically before replaying locks) w/o + * sending any RPCs or waiting for any + * outstanding RPC to complete. + */ +static int ldlm_prepare_lru_list(struct ldlm_namespace *ns, + struct list_head *cancels, int count, int max, + int flags) +{ + ldlm_cancel_lru_policy_t pf; + struct ldlm_lock *lock, *next; + int added = 0, unused, remained; + + spin_lock(&ns->ns_lock); + unused = ns->ns_nr_unused; + remained = unused; + + if (!ns_connect_lru_resize(ns)) + count += unused - ns->ns_max_unused; + + pf = ldlm_cancel_lru_policy(ns, flags); + LASSERT(pf != NULL); + + while (!list_empty(&ns->ns_unused_list)) { + ldlm_policy_res_t result; + + /* all unused locks */ + if (remained-- <= 0) + break; + + /* For any flags, stop scanning if @max is reached. */ + if (max && added >= max) + break; + + list_for_each_entry_safe(lock, next, &ns->ns_unused_list, + l_lru) { + /* No locks which got blocking requests. */ + LASSERT(!(lock->l_flags & LDLM_FL_BL_AST)); + + if (flags & LDLM_CANCEL_NO_WAIT && + lock->l_flags & LDLM_FL_SKIPPED) + /* already processed */ + continue; + + /* Somebody is already doing CANCEL. No need for this + * lock in LRU, do not traverse it again. */ + if (!(lock->l_flags & LDLM_FL_CANCELING)) + break; + + ldlm_lock_remove_from_lru_nolock(lock); + } + if (&lock->l_lru == &ns->ns_unused_list) + break; + + LDLM_LOCK_GET(lock); + spin_unlock(&ns->ns_lock); + lu_ref_add(&lock->l_reference, __func__, current); + + /* Pass the lock through the policy filter and see if it + * should stay in LRU. + * + * Even for shrinker policy we stop scanning if + * we find a lock that should stay in the cache. + * We should take into account lock age anyway + * as a new lock is a valuable resource even if + * it has a low weight. + * + * That is, for shrinker policy we drop only + * old locks, but additionally choose them by + * their weight. Big extent locks will stay in + * the cache. */ + result = pf(ns, lock, unused, added, count); + if (result == LDLM_POLICY_KEEP_LOCK) { + lu_ref_del(&lock->l_reference, + __func__, current); + LDLM_LOCK_RELEASE(lock); + spin_lock(&ns->ns_lock); + break; + } + if (result == LDLM_POLICY_SKIP_LOCK) { + lu_ref_del(&lock->l_reference, + __func__, current); + LDLM_LOCK_RELEASE(lock); + spin_lock(&ns->ns_lock); + continue; + } + + lock_res_and_lock(lock); + /* Check flags again under the lock. */ + if ((lock->l_flags & LDLM_FL_CANCELING) || + (ldlm_lock_remove_from_lru(lock) == 0)) { + /* Another thread is removing lock from LRU, or + * somebody is already doing CANCEL, or there + * is a blocking request which will send cancel + * by itself, or the lock is no longer unused. 
*/ + unlock_res_and_lock(lock); + lu_ref_del(&lock->l_reference, + __func__, current); + LDLM_LOCK_RELEASE(lock); + spin_lock(&ns->ns_lock); + continue; + } + LASSERT(!lock->l_readers && !lock->l_writers); + + /* If we have chosen to cancel this lock voluntarily, we + * better send cancel notification to server, so that it + * frees appropriate state. This might lead to a race + * where while we are doing cancel here, server is also + * silently cancelling this lock. */ + lock->l_flags &= ~LDLM_FL_CANCEL_ON_BLOCK; + + /* Setting the CBPENDING flag is a little misleading, + * but prevents an important race; namely, once + * CBPENDING is set, the lock can accumulate no more + * readers/writers. Since readers and writers are + * already zero here, ldlm_lock_decref() won't see + * this flag and call l_blocking_ast */ + lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_CANCELING; + + /* We can't re-add to l_lru as it confuses the + * refcounting in ldlm_lock_remove_from_lru() if an AST + * arrives after we drop lr_lock below. We use l_bl_ast + * and can't use l_pending_chain as it is used both on + * server and client nevertheless bug 5666 says it is + * used only on server */ + LASSERT(list_empty(&lock->l_bl_ast)); + list_add(&lock->l_bl_ast, cancels); + unlock_res_and_lock(lock); + lu_ref_del(&lock->l_reference, __func__, current); + spin_lock(&ns->ns_lock); + added++; + unused--; + } + spin_unlock(&ns->ns_lock); + return added; +} + +int ldlm_cancel_lru_local(struct ldlm_namespace *ns, struct list_head *cancels, + int count, int max, ldlm_cancel_flags_t cancel_flags, + int flags) +{ + int added; + + added = ldlm_prepare_lru_list(ns, cancels, count, max, flags); + if (added <= 0) + return added; + return ldlm_cli_cancel_list_local(cancels, added, cancel_flags); +} + +/** + * Cancel at least \a nr locks from given namespace LRU. + * + * When called with LCF_ASYNC the blocking callback will be handled + * in a thread and this function will return after the thread has been + * asked to call the callback. When called with LCF_ASYNC the blocking + * callback will be performed in this function. + */ +int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr, + ldlm_cancel_flags_t cancel_flags, + int flags) +{ + LIST_HEAD(cancels); + int count, rc; + + /* Just prepare the list of locks, do not actually cancel them yet. + * Locks are cancelled later in a separate thread. */ + count = ldlm_prepare_lru_list(ns, &cancels, nr, 0, flags); + rc = ldlm_bl_to_thread_list(ns, NULL, &cancels, count, cancel_flags); + if (rc == 0) + return count; + + return 0; +} + +/** + * Find and cancel locally unused locks found on resource, matched to the + * given policy, mode. GET the found locks and add them into the \a cancels + * list. + */ +int ldlm_cancel_resource_local(struct ldlm_resource *res, + struct list_head *cancels, + ldlm_policy_data_t *policy, + ldlm_mode_t mode, __u64 lock_flags, + ldlm_cancel_flags_t cancel_flags, void *opaque) +{ + struct ldlm_lock *lock; + int count = 0; + + lock_res(res); + list_for_each_entry(lock, &res->lr_granted, l_res_link) { + if (opaque != NULL && lock->l_ast_data != opaque) { + LDLM_ERROR(lock, "data %p doesn't match opaque %p", + lock->l_ast_data, opaque); + continue; + } + + if (lock->l_readers || lock->l_writers) + continue; + + /* If somebody is already doing CANCEL, or blocking AST came, + * skip this lock. 
*/ + if (lock->l_flags & LDLM_FL_BL_AST || + lock->l_flags & LDLM_FL_CANCELING) + continue; + + if (lockmode_compat(lock->l_granted_mode, mode)) + continue; + + /* If policy is given and this is IBITS lock, add to list only + * those locks that match by policy. */ + if (policy && (lock->l_resource->lr_type == LDLM_IBITS) && + !(lock->l_policy_data.l_inodebits.bits & + policy->l_inodebits.bits)) + continue; + + /* See CBPENDING comment in ldlm_cancel_lru */ + lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_CANCELING | + lock_flags; + + LASSERT(list_empty(&lock->l_bl_ast)); + list_add(&lock->l_bl_ast, cancels); + LDLM_LOCK_GET(lock); + count++; + } + unlock_res(res); + + return ldlm_cli_cancel_list_local(cancels, count, cancel_flags); +} +EXPORT_SYMBOL(ldlm_cancel_resource_local); + +/** + * Cancel client-side locks from a list and send/prepare cancel RPCs to the + * server. + * If \a req is NULL, send CANCEL request to server with handles of locks + * in the \a cancels. If EARLY_CANCEL is not supported, send CANCEL requests + * separately per lock. + * If \a req is not NULL, put handles of locks in \a cancels into the request + * buffer at the offset \a off. + * Destroy \a cancels at the end. + */ +int ldlm_cli_cancel_list(struct list_head *cancels, int count, + struct ptlrpc_request *req, ldlm_cancel_flags_t flags) +{ + struct ldlm_lock *lock; + int res = 0; + + if (list_empty(cancels) || count == 0) + return 0; + + /* XXX: requests (both batched and not) could be sent in parallel. + * Usually it is enough to have just 1 RPC, but it is possible that + * there are too many locks to be cancelled in LRU or on a resource. + * It would also speed up the case when the server does not support + * the feature. */ + while (count > 0) { + LASSERT(!list_empty(cancels)); + lock = list_entry(cancels->next, struct ldlm_lock, + l_bl_ast); + LASSERT(lock->l_conn_export); + + if (exp_connect_cancelset(lock->l_conn_export)) { + res = count; + if (req) + ldlm_cancel_pack(req, cancels, count); + else + res = ldlm_cli_cancel_req(lock->l_conn_export, + cancels, count, + flags); + } else { + res = ldlm_cli_cancel_req(lock->l_conn_export, + cancels, 1, flags); + } + + if (res < 0) { + CDEBUG_LIMIT(res == -ESHUTDOWN ? D_DLMTRACE : D_ERROR, + "ldlm_cli_cancel_list: %d\n", res); + res = count; + } + + count -= res; + ldlm_lock_list_put(cancels, l_bl_ast, res); + } + LASSERT(count == 0); + return 0; +} +EXPORT_SYMBOL(ldlm_cli_cancel_list); + +/** + * Cancel all locks on a resource that have 0 readers/writers. + * + * If flags & LDLM_FL_LOCAL_ONLY, throw the locks away without trying + * to notify the server. */ +int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns, + const struct ldlm_res_id *res_id, + ldlm_policy_data_t *policy, + ldlm_mode_t mode, + ldlm_cancel_flags_t flags, + void *opaque) +{ + struct ldlm_resource *res; + LIST_HEAD(cancels); + int count; + int rc; + + res = ldlm_resource_get(ns, NULL, res_id, 0, 0); + if (res == NULL) { + /* This is not a problem. 
*/ + CDEBUG(D_INFO, "No resource %llu\n", res_id->name[0]); + return 0; + } + + LDLM_RESOURCE_ADDREF(res); + count = ldlm_cancel_resource_local(res, &cancels, policy, mode, + 0, flags | LCF_BL_AST, opaque); + rc = ldlm_cli_cancel_list(&cancels, count, NULL, flags); + if (rc != ELDLM_OK) + CERROR("canceling unused lock "DLDLMRES": rc = %d\n", + PLDLMRES(res), rc); + + LDLM_RESOURCE_DELREF(res); + ldlm_resource_putref(res); + return 0; +} +EXPORT_SYMBOL(ldlm_cli_cancel_unused_resource); + +struct ldlm_cli_cancel_arg { + int lc_flags; + void *lc_opaque; +}; + +static int ldlm_cli_hash_cancel_unused(struct cfs_hash *hs, + struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *arg) +{ + struct ldlm_resource *res = cfs_hash_object(hs, hnode); + struct ldlm_cli_cancel_arg *lc = arg; + + ldlm_cli_cancel_unused_resource(ldlm_res_to_ns(res), &res->lr_name, + NULL, LCK_MINMODE, + lc->lc_flags, lc->lc_opaque); + /* must return 0 for hash iteration */ + return 0; +} + +/** + * Cancel all locks on a namespace (or a specific resource, if given) + * that have 0 readers/writers. + * + * If flags & LCF_LOCAL, throw the locks away without trying + * to notify the server. */ +int ldlm_cli_cancel_unused(struct ldlm_namespace *ns, + const struct ldlm_res_id *res_id, + ldlm_cancel_flags_t flags, void *opaque) +{ + struct ldlm_cli_cancel_arg arg = { + .lc_flags = flags, + .lc_opaque = opaque, + }; + + if (ns == NULL) + return ELDLM_OK; + + if (res_id != NULL) { + return ldlm_cli_cancel_unused_resource(ns, res_id, NULL, + LCK_MINMODE, flags, + opaque); + } else { + cfs_hash_for_each_nolock(ns->ns_rs_hash, + ldlm_cli_hash_cancel_unused, &arg); + return ELDLM_OK; + } +} +EXPORT_SYMBOL(ldlm_cli_cancel_unused); + +/* Lock iterators. */ + +int ldlm_resource_foreach(struct ldlm_resource *res, ldlm_iterator_t iter, + void *closure) +{ + struct list_head *tmp, *next; + struct ldlm_lock *lock; + int rc = LDLM_ITER_CONTINUE; + + if (!res) + return LDLM_ITER_CONTINUE; + + lock_res(res); + list_for_each_safe(tmp, next, &res->lr_granted) { + lock = list_entry(tmp, struct ldlm_lock, l_res_link); + + if (iter(lock, closure) == LDLM_ITER_STOP) { + rc = LDLM_ITER_STOP; + goto out; + } + } + + list_for_each_safe(tmp, next, &res->lr_converting) { + lock = list_entry(tmp, struct ldlm_lock, l_res_link); + + if (iter(lock, closure) == LDLM_ITER_STOP) { + rc = LDLM_ITER_STOP; + goto out; + } + } + + list_for_each_safe(tmp, next, &res->lr_waiting) { + lock = list_entry(tmp, struct ldlm_lock, l_res_link); + + if (iter(lock, closure) == LDLM_ITER_STOP) { + rc = LDLM_ITER_STOP; + goto out; + } + } + out: + unlock_res(res); + return rc; +} +EXPORT_SYMBOL(ldlm_resource_foreach); + +struct iter_helper_data { + ldlm_iterator_t iter; + void *closure; +}; + +static int ldlm_iter_helper(struct ldlm_lock *lock, void *closure) +{ + struct iter_helper_data *helper = closure; + + return helper->iter(lock, helper->closure); +} + +static int ldlm_res_iter_helper(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *arg) + +{ + struct ldlm_resource *res = cfs_hash_object(hs, hnode); + + return ldlm_resource_foreach(res, ldlm_iter_helper, arg) == + LDLM_ITER_STOP; +} + +void ldlm_namespace_foreach(struct ldlm_namespace *ns, + ldlm_iterator_t iter, void *closure) + +{ + struct iter_helper_data helper = { + .iter = iter, + .closure = closure, + }; + + cfs_hash_for_each_nolock(ns->ns_rs_hash, + ldlm_res_iter_helper, &helper); + +} +EXPORT_SYMBOL(ldlm_namespace_foreach); + +/* non-blocking function to manipulate a lock whose 
cb_data is being put away. + * return 0: find no resource + * > 0: must be LDLM_ITER_STOP/LDLM_ITER_CONTINUE. + * < 0: errors + */ +int ldlm_resource_iterate(struct ldlm_namespace *ns, + const struct ldlm_res_id *res_id, + ldlm_iterator_t iter, void *data) +{ + struct ldlm_resource *res; + int rc; + + if (ns == NULL) { + CERROR("must pass in namespace\n"); + LBUG(); + } + + res = ldlm_resource_get(ns, NULL, res_id, 0, 0); + if (res == NULL) + return 0; + + LDLM_RESOURCE_ADDREF(res); + rc = ldlm_resource_foreach(res, iter, data); + LDLM_RESOURCE_DELREF(res); + ldlm_resource_putref(res); + return rc; +} +EXPORT_SYMBOL(ldlm_resource_iterate); + +/* Lock replay */ + +static int ldlm_chain_lock_for_replay(struct ldlm_lock *lock, void *closure) +{ + struct list_head *list = closure; + + /* we use l_pending_chain here, because it's unused on clients. */ + LASSERTF(list_empty(&lock->l_pending_chain), + "lock %p next %p prev %p\n", + lock, &lock->l_pending_chain.next, + &lock->l_pending_chain.prev); + /* bug 9573: don't replay locks left after eviction, or + * bug 17614: locks being actively cancelled. Get a reference + * on a lock so that it does not disappear under us (e.g. due to cancel) + */ + if (!(lock->l_flags & (LDLM_FL_FAILED|LDLM_FL_CANCELING))) { + list_add(&lock->l_pending_chain, list); + LDLM_LOCK_GET(lock); + } + + return LDLM_ITER_CONTINUE; +} + +static int replay_lock_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + struct ldlm_async_args *aa, int rc) +{ + struct ldlm_lock *lock; + struct ldlm_reply *reply; + struct obd_export *exp; + + atomic_dec(&req->rq_import->imp_replay_inflight); + if (rc != ELDLM_OK) + goto out; + + + reply = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + if (reply == NULL) { + rc = -EPROTO; + goto out; + } + + lock = ldlm_handle2lock(&aa->lock_handle); + if (!lock) { + CERROR("received replay ack for unknown local cookie %#llx remote cookie %#llx from server %s id %s\n", + aa->lock_handle.cookie, reply->lock_handle.cookie, + req->rq_export->exp_client_uuid.uuid, + libcfs_id2str(req->rq_peer)); + rc = -ESTALE; + goto out; + } + + /* Key change rehash lock in per-export hash with new key */ + exp = req->rq_export; + if (exp && exp->exp_lock_hash) { + /* In the function below, .hs_keycmp resolves to + * ldlm_export_lock_keycmp() */ + /* coverity[overrun-buffer-val] */ + cfs_hash_rehash_key(exp->exp_lock_hash, + &lock->l_remote_handle, + &reply->lock_handle, + &lock->l_exp_hash); + } else { + lock->l_remote_handle = reply->lock_handle; + } + + LDLM_DEBUG(lock, "replayed lock:"); + ptlrpc_import_recovery_state_machine(req->rq_import); + LDLM_LOCK_PUT(lock); +out: + if (rc != ELDLM_OK) + ptlrpc_connect_import(req->rq_import); + + return rc; +} + +static int replay_one_lock(struct obd_import *imp, struct ldlm_lock *lock) +{ + struct ptlrpc_request *req; + struct ldlm_async_args *aa; + struct ldlm_request *body; + int flags; + + /* Bug 11974: Do not replay a lock which is actively being canceled */ + if (lock->l_flags & LDLM_FL_CANCELING) { + LDLM_DEBUG(lock, "Not replaying canceled lock:"); + return 0; + } + + /* If this is reply-less callback lock, we cannot replay it, since + * server might have long dropped it, but notification of that event was + * lost by network. (and server granted conflicting lock already) */ + if (lock->l_flags & LDLM_FL_CANCEL_ON_BLOCK) { + LDLM_DEBUG(lock, "Not replaying reply-less lock:"); + ldlm_lock_cancel(lock); + return 0; + } + + /* + * If granted mode matches the requested mode, this lock is granted. 
+ * + * If they differ, but we have a granted mode, then we were granted + * one mode and now want another: ergo, converting. + * + * If we haven't been granted anything and are on a resource list, + * then we're blocked/waiting. + * + * If we haven't been granted anything and we're NOT on a resource list, + * then we haven't got a reply yet and don't have a known disposition. + * This happens whenever a lock enqueue is the request that triggers + * recovery. + */ + if (lock->l_granted_mode == lock->l_req_mode) + flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_GRANTED; + else if (lock->l_granted_mode) + flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_CONV; + else if (!list_empty(&lock->l_res_link)) + flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_WAIT; + else + flags = LDLM_FL_REPLAY; + + req = ptlrpc_request_alloc_pack(imp, &RQF_LDLM_ENQUEUE, + LUSTRE_DLM_VERSION, LDLM_ENQUEUE); + if (req == NULL) + return -ENOMEM; + + /* We're part of recovery, so don't wait for it. */ + req->rq_send_state = LUSTRE_IMP_REPLAY_LOCKS; + + body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + ldlm_lock2desc(lock, &body->lock_desc); + body->lock_flags = ldlm_flags_to_wire(flags); + + ldlm_lock2handle(lock, &body->lock_handle[0]); + if (lock->l_lvb_len > 0) + req_capsule_extend(&req->rq_pill, &RQF_LDLM_ENQUEUE_LVB); + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, + lock->l_lvb_len); + ptlrpc_request_set_replen(req); + /* notify the server we've replayed all requests. + * also, we mark the request to be put on a dedicated + * queue to be processed after all request replayes. + * bug 6063 */ + lustre_msg_set_flags(req->rq_reqmsg, MSG_REQ_REPLAY_DONE); + + LDLM_DEBUG(lock, "replaying lock:"); + + atomic_inc(&req->rq_import->imp_replay_inflight); + CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + aa->lock_handle = body->lock_handle[0]; + req->rq_interpret_reply = (ptlrpc_interpterer_t)replay_lock_interpret; + ptlrpcd_add_req(req, PDL_POLICY_LOCAL, -1); + + return 0; +} + +/** + * Cancel as many unused locks as possible before replay. since we are + * in recovery, we can't wait for any outstanding RPCs to send any RPC + * to the server. + * + * Called only in recovery before replaying locks. there is no need to + * replay locks that are unused. since the clients may hold thousands of + * cached unused locks, dropping the unused locks can greatly reduce the + * load on the servers at recovery time. 
+ */ +static void ldlm_cancel_unused_locks_for_replay(struct ldlm_namespace *ns) +{ + int canceled; + LIST_HEAD(cancels); + + CDEBUG(D_DLMTRACE, "Dropping as many unused locks as possible before replay for namespace %s (%d)\n", + ldlm_ns_name(ns), ns->ns_nr_unused); + + /* We don't need to care whether or not LRU resize is enabled + * because the LDLM_CANCEL_NO_WAIT policy doesn't use the + * count parameter */ + canceled = ldlm_cancel_lru_local(ns, &cancels, ns->ns_nr_unused, 0, + LCF_LOCAL, LDLM_CANCEL_NO_WAIT); + + CDEBUG(D_DLMTRACE, "Canceled %d unused locks from namespace %s\n", + canceled, ldlm_ns_name(ns)); +} + +int ldlm_replay_locks(struct obd_import *imp) +{ + struct ldlm_namespace *ns = imp->imp_obd->obd_namespace; + LIST_HEAD(list); + struct ldlm_lock *lock, *next; + int rc = 0; + + LASSERT(atomic_read(&imp->imp_replay_inflight) == 0); + + /* don't replay locks if import failed recovery */ + if (imp->imp_vbr_failed) + return 0; + + /* ensure this doesn't fall to 0 before all have been queued */ + atomic_inc(&imp->imp_replay_inflight); + + if (ldlm_cancel_unused_locks_before_replay) + ldlm_cancel_unused_locks_for_replay(ns); + + ldlm_namespace_foreach(ns, ldlm_chain_lock_for_replay, &list); + + list_for_each_entry_safe(lock, next, &list, l_pending_chain) { + list_del_init(&lock->l_pending_chain); + if (rc) { + LDLM_LOCK_RELEASE(lock); + continue; /* or try to do the rest? */ + } + rc = replay_one_lock(imp, lock); + LDLM_LOCK_RELEASE(lock); + } + + atomic_dec(&imp->imp_replay_inflight); + + return rc; +} +EXPORT_SYMBOL(ldlm_replay_locks); diff --git a/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_resource.c b/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_resource.c new file mode 100644 index 000000000..f750d42a7 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ldlm/ldlm_resource.c @@ -0,0 +1,1425 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/ldlm/ldlm_resource.c + * + * Author: Phil Schwan + * Author: Peter Braam + */ + +#define DEBUG_SUBSYSTEM S_LDLM +#include "../include/lustre_dlm.h" +#include "../include/lustre_fid.h" +#include "../include/obd_class.h" +#include "ldlm_internal.h" + +struct kmem_cache *ldlm_resource_slab, *ldlm_lock_slab; + +int ldlm_srv_namespace_nr = 0; +int ldlm_cli_namespace_nr = 0; + +struct mutex ldlm_srv_namespace_lock; +LIST_HEAD(ldlm_srv_namespace_list); + +struct mutex ldlm_cli_namespace_lock; +/* Client Namespaces that have active resources in them. + * Once all resources go away, ldlm_poold moves such namespaces to the + * inactive list */ +LIST_HEAD(ldlm_cli_active_namespace_list); +/* Client namespaces that don't have any locks in them */ +LIST_HEAD(ldlm_cli_inactive_namespace_list); + +struct proc_dir_entry *ldlm_type_proc_dir = NULL; +static struct proc_dir_entry *ldlm_ns_proc_dir = NULL; +struct proc_dir_entry *ldlm_svc_proc_dir = NULL; + +extern unsigned int ldlm_cancel_unused_locks_before_replay; + +/* during debug dump certain amount of granted locks for one resource to avoid + * DDOS. */ +unsigned int ldlm_dump_granted_max = 256; + +#if defined(CONFIG_PROC_FS) +static ssize_t lprocfs_wr_dump_ns(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + ldlm_dump_all_namespaces(LDLM_NAMESPACE_SERVER, D_DLMTRACE); + ldlm_dump_all_namespaces(LDLM_NAMESPACE_CLIENT, D_DLMTRACE); + return count; +} +LPROC_SEQ_FOPS_WR_ONLY(ldlm, dump_ns); + +LPROC_SEQ_FOPS_RW_TYPE(ldlm_rw, uint); +LPROC_SEQ_FOPS_RO_TYPE(ldlm, uint); + +int ldlm_proc_setup(void) +{ + int rc; + struct lprocfs_vars list[] = { + { "dump_namespaces", &ldlm_dump_ns_fops, NULL, 0222 }, + { "dump_granted_max", &ldlm_rw_uint_fops, + &ldlm_dump_granted_max }, + { "cancel_unused_locks_before_replay", &ldlm_rw_uint_fops, + &ldlm_cancel_unused_locks_before_replay }, + { NULL } }; + LASSERT(ldlm_ns_proc_dir == NULL); + + ldlm_type_proc_dir = lprocfs_register(OBD_LDLM_DEVICENAME, + proc_lustre_root, + NULL, NULL); + if (IS_ERR(ldlm_type_proc_dir)) { + CERROR("LProcFS failed in ldlm-init\n"); + rc = PTR_ERR(ldlm_type_proc_dir); + goto err; + } + + ldlm_ns_proc_dir = lprocfs_register("namespaces", + ldlm_type_proc_dir, + NULL, NULL); + if (IS_ERR(ldlm_ns_proc_dir)) { + CERROR("LProcFS failed in ldlm-init\n"); + rc = PTR_ERR(ldlm_ns_proc_dir); + goto err_type; + } + + ldlm_svc_proc_dir = lprocfs_register("services", + ldlm_type_proc_dir, + NULL, NULL); + if (IS_ERR(ldlm_svc_proc_dir)) { + CERROR("LProcFS failed in ldlm-init\n"); + rc = PTR_ERR(ldlm_svc_proc_dir); + goto err_ns; + } + + rc = lprocfs_add_vars(ldlm_type_proc_dir, list, NULL); + + return 0; + +err_ns: + lprocfs_remove(&ldlm_ns_proc_dir); +err_type: + lprocfs_remove(&ldlm_type_proc_dir); +err: + ldlm_svc_proc_dir = NULL; + ldlm_type_proc_dir = NULL; + ldlm_ns_proc_dir = NULL; + return rc; +} + +void ldlm_proc_cleanup(void) +{ + if (ldlm_svc_proc_dir) + lprocfs_remove(&ldlm_svc_proc_dir); + + if (ldlm_ns_proc_dir) + lprocfs_remove(&ldlm_ns_proc_dir); + + if (ldlm_type_proc_dir) + lprocfs_remove(&ldlm_type_proc_dir); + + ldlm_svc_proc_dir = NULL; + ldlm_type_proc_dir = NULL; + ldlm_ns_proc_dir = NULL; +} + +static int lprocfs_ns_resources_seq_show(struct seq_file *m, void *v) +{ + struct ldlm_namespace *ns = m->private; + __u64 res = 0; + struct cfs_hash_bd bd; + int i; + + /* result is not strictly consistent */ + cfs_hash_for_each_bucket(ns->ns_rs_hash, &bd, i) + res += cfs_hash_bd_count_get(&bd); + return lprocfs_rd_u64(m, &res); +} 
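+/*
+ * Editorial note (illustrative, not part of the original patch): the
+ * seq_show handler above backs the per-namespace "resource_count" file
+ * registered later via LDLM_NS_ADD_VAR() in ldlm_namespace_proc_register().
+ * It walks every bucket of ns_rs_hash and sums the per-bucket counts, so a
+ * read returns a point-in-time estimate rather than a consistent snapshot.
+ * Assuming the usual /proc/fs/lustre root and that OBD_LDLM_DEVICENAME
+ * expands to "ldlm", the file would appear roughly at
+ *   /proc/fs/lustre/ldlm/namespaces/<namespace>/resource_count
+ * (exact procfs layout is an assumption and may differ between kernels).
+ */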
+LPROC_SEQ_FOPS_RO(lprocfs_ns_resources); + +static int lprocfs_ns_locks_seq_show(struct seq_file *m, void *v) +{ + struct ldlm_namespace *ns = m->private; + __u64 locks; + + locks = lprocfs_stats_collector(ns->ns_stats, LDLM_NSS_LOCKS, + LPROCFS_FIELDS_FLAGS_SUM); + return lprocfs_rd_u64(m, &locks); +} +LPROC_SEQ_FOPS_RO(lprocfs_ns_locks); + +static int lprocfs_lru_size_seq_show(struct seq_file *m, void *v) +{ + struct ldlm_namespace *ns = m->private; + __u32 *nr = &ns->ns_max_unused; + + if (ns_connect_lru_resize(ns)) + nr = &ns->ns_nr_unused; + return lprocfs_rd_uint(m, nr); +} + +static ssize_t lprocfs_lru_size_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct ldlm_namespace *ns = ((struct seq_file *)file->private_data)->private; + char dummy[MAX_STRING_SIZE + 1]; + unsigned long tmp; + int lru_resize; + int err; + + dummy[MAX_STRING_SIZE] = '\0'; + if (copy_from_user(dummy, buffer, MAX_STRING_SIZE)) + return -EFAULT; + + if (strncmp(dummy, "clear", 5) == 0) { + CDEBUG(D_DLMTRACE, + "dropping all unused locks from namespace %s\n", + ldlm_ns_name(ns)); + if (ns_connect_lru_resize(ns)) { + int canceled, unused = ns->ns_nr_unused; + + /* Try to cancel all @ns_nr_unused locks. */ + canceled = ldlm_cancel_lru(ns, unused, 0, + LDLM_CANCEL_PASSED); + if (canceled < unused) { + CDEBUG(D_DLMTRACE, + "not all requested locks are canceled, requested: %d, canceled: %d\n", + unused, + canceled); + return -EINVAL; + } + } else { + tmp = ns->ns_max_unused; + ns->ns_max_unused = 0; + ldlm_cancel_lru(ns, 0, 0, LDLM_CANCEL_PASSED); + ns->ns_max_unused = tmp; + } + return count; + } + + err = kstrtoul(dummy, 10, &tmp); + if (err != 0) { + CERROR("invalid value written\n"); + return -EINVAL; + } + lru_resize = (tmp == 0); + + if (ns_connect_lru_resize(ns)) { + if (!lru_resize) + ns->ns_max_unused = (unsigned int)tmp; + + if (tmp > ns->ns_nr_unused) + tmp = ns->ns_nr_unused; + tmp = ns->ns_nr_unused - tmp; + + CDEBUG(D_DLMTRACE, + "changing namespace %s unused locks from %u to %u\n", + ldlm_ns_name(ns), ns->ns_nr_unused, + (unsigned int)tmp); + ldlm_cancel_lru(ns, tmp, LCF_ASYNC, LDLM_CANCEL_PASSED); + + if (!lru_resize) { + CDEBUG(D_DLMTRACE, + "disable lru_resize for namespace %s\n", + ldlm_ns_name(ns)); + ns->ns_connect_flags &= ~OBD_CONNECT_LRU_RESIZE; + } + } else { + CDEBUG(D_DLMTRACE, + "changing namespace %s max_unused from %u to %u\n", + ldlm_ns_name(ns), ns->ns_max_unused, + (unsigned int)tmp); + ns->ns_max_unused = (unsigned int)tmp; + ldlm_cancel_lru(ns, 0, LCF_ASYNC, LDLM_CANCEL_PASSED); + + /* Make sure that LRU resize was originally supported before + * turning it on here. 
*/ + if (lru_resize && + (ns->ns_orig_connect_flags & OBD_CONNECT_LRU_RESIZE)) { + CDEBUG(D_DLMTRACE, + "enable lru_resize for namespace %s\n", + ldlm_ns_name(ns)); + ns->ns_connect_flags |= OBD_CONNECT_LRU_RESIZE; + } + } + + return count; +} +LPROC_SEQ_FOPS(lprocfs_lru_size); + +static int lprocfs_elc_seq_show(struct seq_file *m, void *v) +{ + struct ldlm_namespace *ns = m->private; + unsigned int supp = ns_connect_cancelset(ns); + + return lprocfs_rd_uint(m, &supp); +} + +static ssize_t lprocfs_elc_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct ldlm_namespace *ns = ((struct seq_file *)file->private_data)->private; + unsigned int supp = -1; + int rc; + + rc = lprocfs_wr_uint(file, buffer, count, &supp); + if (rc < 0) + return rc; + + if (supp == 0) + ns->ns_connect_flags &= ~OBD_CONNECT_CANCELSET; + else if (ns->ns_orig_connect_flags & OBD_CONNECT_CANCELSET) + ns->ns_connect_flags |= OBD_CONNECT_CANCELSET; + return count; +} +LPROC_SEQ_FOPS(lprocfs_elc); + +void ldlm_namespace_proc_unregister(struct ldlm_namespace *ns) +{ + if (ns->ns_proc_dir_entry == NULL) + CERROR("dlm namespace %s has no procfs dir?\n", + ldlm_ns_name(ns)); + else + lprocfs_remove(&ns->ns_proc_dir_entry); + + if (ns->ns_stats != NULL) + lprocfs_free_stats(&ns->ns_stats); +} + +#define LDLM_NS_ADD_VAR(name, var, ops) \ + do { \ + snprintf(lock_name, MAX_STRING_SIZE, name); \ + lock_vars[0].data = var; \ + lock_vars[0].fops = ops; \ + lprocfs_add_vars(ns_pde, lock_vars, NULL); \ + } while (0) + +int ldlm_namespace_proc_register(struct ldlm_namespace *ns) +{ + struct lprocfs_vars lock_vars[2]; + char lock_name[MAX_STRING_SIZE + 1]; + struct proc_dir_entry *ns_pde; + + LASSERT(ns != NULL); + LASSERT(ns->ns_rs_hash != NULL); + + if (ns->ns_proc_dir_entry != NULL) { + ns_pde = ns->ns_proc_dir_entry; + } else { + ns_pde = proc_mkdir(ldlm_ns_name(ns), ldlm_ns_proc_dir); + if (ns_pde == NULL) + return -ENOMEM; + ns->ns_proc_dir_entry = ns_pde; + } + + ns->ns_stats = lprocfs_alloc_stats(LDLM_NSS_LAST, 0); + if (ns->ns_stats == NULL) + return -ENOMEM; + + lprocfs_counter_init(ns->ns_stats, LDLM_NSS_LOCKS, + LPROCFS_CNTR_AVGMINMAX, "locks", "locks"); + + lock_name[MAX_STRING_SIZE] = '\0'; + + memset(lock_vars, 0, sizeof(lock_vars)); + lock_vars[0].name = lock_name; + + LDLM_NS_ADD_VAR("resource_count", ns, &lprocfs_ns_resources_fops); + LDLM_NS_ADD_VAR("lock_count", ns, &lprocfs_ns_locks_fops); + + if (ns_is_client(ns)) { + LDLM_NS_ADD_VAR("lock_unused_count", &ns->ns_nr_unused, + &ldlm_uint_fops); + LDLM_NS_ADD_VAR("lru_size", ns, &lprocfs_lru_size_fops); + LDLM_NS_ADD_VAR("lru_max_age", &ns->ns_max_age, + &ldlm_rw_uint_fops); + LDLM_NS_ADD_VAR("early_lock_cancel", ns, &lprocfs_elc_fops); + } else { + LDLM_NS_ADD_VAR("ctime_age_limit", &ns->ns_ctime_age_limit, + &ldlm_rw_uint_fops); + LDLM_NS_ADD_VAR("lock_timeouts", &ns->ns_timeouts, + &ldlm_uint_fops); + LDLM_NS_ADD_VAR("max_nolock_bytes", &ns->ns_max_nolock_size, + &ldlm_rw_uint_fops); + LDLM_NS_ADD_VAR("contention_seconds", &ns->ns_contention_time, + &ldlm_rw_uint_fops); + LDLM_NS_ADD_VAR("contended_locks", &ns->ns_contended_locks, + &ldlm_rw_uint_fops); + LDLM_NS_ADD_VAR("max_parallel_ast", &ns->ns_max_parallel_ast, + &ldlm_rw_uint_fops); + } + return 0; +} +#undef MAX_STRING_SIZE +#else /* CONFIG_PROC_FS */ + +#define ldlm_namespace_proc_unregister(ns) ({; }) +#define ldlm_namespace_proc_register(ns) ({0; }) + +#endif /* CONFIG_PROC_FS */ + +static unsigned ldlm_res_hop_hash(struct cfs_hash *hs, + const void *key, 
unsigned mask) +{ + const struct ldlm_res_id *id = key; + unsigned val = 0; + unsigned i; + + for (i = 0; i < RES_NAME_SIZE; i++) + val += id->name[i]; + return val & mask; +} + +static unsigned ldlm_res_hop_fid_hash(struct cfs_hash *hs, + const void *key, unsigned mask) +{ + const struct ldlm_res_id *id = key; + struct lu_fid fid; + __u32 hash; + __u32 val; + + fid.f_seq = id->name[LUSTRE_RES_ID_SEQ_OFF]; + fid.f_oid = (__u32)id->name[LUSTRE_RES_ID_VER_OID_OFF]; + fid.f_ver = (__u32)(id->name[LUSTRE_RES_ID_VER_OID_OFF] >> 32); + + hash = fid_flatten32(&fid); + hash += (hash >> 4) + (hash << 12); /* mixing oid and seq */ + if (id->name[LUSTRE_RES_ID_HSH_OFF] != 0) { + val = id->name[LUSTRE_RES_ID_HSH_OFF]; + hash += (val >> 5) + (val << 11); + } else { + val = fid_oid(&fid); + } + hash = hash_long(hash, hs->hs_bkt_bits); + /* give me another random factor */ + hash -= hash_long((unsigned long)hs, val % 11 + 3); + + hash <<= hs->hs_cur_bits - hs->hs_bkt_bits; + hash |= ldlm_res_hop_hash(hs, key, CFS_HASH_NBKT(hs) - 1); + + return hash & mask; +} + +static void *ldlm_res_hop_key(struct hlist_node *hnode) +{ + struct ldlm_resource *res; + + res = hlist_entry(hnode, struct ldlm_resource, lr_hash); + return &res->lr_name; +} + +static int ldlm_res_hop_keycmp(const void *key, struct hlist_node *hnode) +{ + struct ldlm_resource *res; + + res = hlist_entry(hnode, struct ldlm_resource, lr_hash); + return ldlm_res_eq((const struct ldlm_res_id *)key, + (const struct ldlm_res_id *)&res->lr_name); +} + +static void *ldlm_res_hop_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct ldlm_resource, lr_hash); +} + +static void ldlm_res_hop_get_locked(struct cfs_hash *hs, + struct hlist_node *hnode) +{ + struct ldlm_resource *res; + + res = hlist_entry(hnode, struct ldlm_resource, lr_hash); + ldlm_resource_getref(res); +} + +static void ldlm_res_hop_put_locked(struct cfs_hash *hs, + struct hlist_node *hnode) +{ + struct ldlm_resource *res; + + res = hlist_entry(hnode, struct ldlm_resource, lr_hash); + /* cfs_hash_for_each_nolock is the only chance we call it */ + ldlm_resource_putref_locked(res); +} + +static void ldlm_res_hop_put(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct ldlm_resource *res; + + res = hlist_entry(hnode, struct ldlm_resource, lr_hash); + ldlm_resource_putref(res); +} + +cfs_hash_ops_t ldlm_ns_hash_ops = { + .hs_hash = ldlm_res_hop_hash, + .hs_key = ldlm_res_hop_key, + .hs_keycmp = ldlm_res_hop_keycmp, + .hs_keycpy = NULL, + .hs_object = ldlm_res_hop_object, + .hs_get = ldlm_res_hop_get_locked, + .hs_put_locked = ldlm_res_hop_put_locked, + .hs_put = ldlm_res_hop_put +}; + +cfs_hash_ops_t ldlm_ns_fid_hash_ops = { + .hs_hash = ldlm_res_hop_fid_hash, + .hs_key = ldlm_res_hop_key, + .hs_keycmp = ldlm_res_hop_keycmp, + .hs_keycpy = NULL, + .hs_object = ldlm_res_hop_object, + .hs_get = ldlm_res_hop_get_locked, + .hs_put_locked = ldlm_res_hop_put_locked, + .hs_put = ldlm_res_hop_put +}; + +struct ldlm_ns_hash_def { + ldlm_ns_type_t nsd_type; + /** hash bucket bits */ + unsigned nsd_bkt_bits; + /** hash bits */ + unsigned nsd_all_bits; + /** hash operations */ + cfs_hash_ops_t *nsd_hops; +}; + +struct ldlm_ns_hash_def ldlm_ns_hash_defs[] = { + { + .nsd_type = LDLM_NS_TYPE_MDC, + .nsd_bkt_bits = 11, + .nsd_all_bits = 16, + .nsd_hops = &ldlm_ns_fid_hash_ops, + }, + { + .nsd_type = LDLM_NS_TYPE_MDT, + .nsd_bkt_bits = 14, + .nsd_all_bits = 21, + .nsd_hops = &ldlm_ns_fid_hash_ops, + }, + { + .nsd_type = LDLM_NS_TYPE_OSC, + .nsd_bkt_bits = 8, + .nsd_all_bits = 12, + 
.nsd_hops = &ldlm_ns_hash_ops, + }, + { + .nsd_type = LDLM_NS_TYPE_OST, + .nsd_bkt_bits = 11, + .nsd_all_bits = 17, + .nsd_hops = &ldlm_ns_hash_ops, + }, + { + .nsd_type = LDLM_NS_TYPE_MGC, + .nsd_bkt_bits = 4, + .nsd_all_bits = 4, + .nsd_hops = &ldlm_ns_hash_ops, + }, + { + .nsd_type = LDLM_NS_TYPE_MGT, + .nsd_bkt_bits = 4, + .nsd_all_bits = 4, + .nsd_hops = &ldlm_ns_hash_ops, + }, + { + .nsd_type = LDLM_NS_TYPE_UNKNOWN, + }, +}; + +/** + * Create and initialize new empty namespace. + */ +struct ldlm_namespace *ldlm_namespace_new(struct obd_device *obd, char *name, + ldlm_side_t client, + ldlm_appetite_t apt, + ldlm_ns_type_t ns_type) +{ + struct ldlm_namespace *ns = NULL; + struct ldlm_ns_bucket *nsb; + struct ldlm_ns_hash_def *nsd; + struct cfs_hash_bd bd; + int idx; + int rc; + + LASSERT(obd != NULL); + + rc = ldlm_get_ref(); + if (rc) { + CERROR("ldlm_get_ref failed: %d\n", rc); + return NULL; + } + + for (idx = 0;; idx++) { + nsd = &ldlm_ns_hash_defs[idx]; + if (nsd->nsd_type == LDLM_NS_TYPE_UNKNOWN) { + CERROR("Unknown type %d for ns %s\n", ns_type, name); + goto out_ref; + } + + if (nsd->nsd_type == ns_type) + break; + } + + OBD_ALLOC_PTR(ns); + if (!ns) + goto out_ref; + + ns->ns_rs_hash = cfs_hash_create(name, + nsd->nsd_all_bits, nsd->nsd_all_bits, + nsd->nsd_bkt_bits, sizeof(*nsb), + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + nsd->nsd_hops, + CFS_HASH_DEPTH | + CFS_HASH_BIGNAME | + CFS_HASH_SPIN_BKTLOCK | + CFS_HASH_NO_ITEMREF); + if (ns->ns_rs_hash == NULL) + goto out_ns; + + cfs_hash_for_each_bucket(ns->ns_rs_hash, &bd, idx) { + nsb = cfs_hash_bd_extra_get(ns->ns_rs_hash, &bd); + at_init(&nsb->nsb_at_estimate, ldlm_enqueue_min, 0); + nsb->nsb_namespace = ns; + } + + ns->ns_obd = obd; + ns->ns_appetite = apt; + ns->ns_client = client; + + INIT_LIST_HEAD(&ns->ns_list_chain); + INIT_LIST_HEAD(&ns->ns_unused_list); + spin_lock_init(&ns->ns_lock); + atomic_set(&ns->ns_bref, 0); + init_waitqueue_head(&ns->ns_waitq); + + ns->ns_max_nolock_size = NS_DEFAULT_MAX_NOLOCK_BYTES; + ns->ns_contention_time = NS_DEFAULT_CONTENTION_SECONDS; + ns->ns_contended_locks = NS_DEFAULT_CONTENDED_LOCKS; + + ns->ns_max_parallel_ast = LDLM_DEFAULT_PARALLEL_AST_LIMIT; + ns->ns_nr_unused = 0; + ns->ns_max_unused = LDLM_DEFAULT_LRU_SIZE; + ns->ns_max_age = LDLM_DEFAULT_MAX_ALIVE; + ns->ns_ctime_age_limit = LDLM_CTIME_AGE_LIMIT; + ns->ns_timeouts = 0; + ns->ns_orig_connect_flags = 0; + ns->ns_connect_flags = 0; + ns->ns_stopping = 0; + rc = ldlm_namespace_proc_register(ns); + if (rc != 0) { + CERROR("Can't initialize ns proc, rc %d\n", rc); + goto out_hash; + } + + idx = ldlm_namespace_nr_read(client); + rc = ldlm_pool_init(&ns->ns_pool, ns, idx, client); + if (rc) { + CERROR("Can't initialize lock pool, rc %d\n", rc); + goto out_proc; + } + + ldlm_namespace_register(ns, client); + return ns; +out_proc: + ldlm_namespace_proc_unregister(ns); + ldlm_namespace_cleanup(ns, 0); +out_hash: + cfs_hash_putref(ns->ns_rs_hash); +out_ns: + OBD_FREE_PTR(ns); +out_ref: + ldlm_put_ref(); + return NULL; +} +EXPORT_SYMBOL(ldlm_namespace_new); + +extern struct ldlm_lock *ldlm_lock_get(struct ldlm_lock *lock); + +/** + * Cancel and destroy all locks on a resource. + * + * If flags contains FL_LOCAL_ONLY, don't try to tell the server, just + * clean up. This is currently only used for recovery, and we make + * certain assumptions as a result--notably, that we shouldn't cancel + * locks with refs. 
+ */ +static void cleanup_resource(struct ldlm_resource *res, struct list_head *q, + __u64 flags) +{ + struct list_head *tmp; + int rc = 0, client = ns_is_client(ldlm_res_to_ns(res)); + bool local_only = !!(flags & LDLM_FL_LOCAL_ONLY); + + do { + struct ldlm_lock *lock = NULL; + + /* First, we look for non-cleaned-yet lock + * all cleaned locks are marked by CLEANED flag. */ + lock_res(res); + list_for_each(tmp, q) { + lock = list_entry(tmp, struct ldlm_lock, + l_res_link); + if (lock->l_flags & LDLM_FL_CLEANED) { + lock = NULL; + continue; + } + LDLM_LOCK_GET(lock); + lock->l_flags |= LDLM_FL_CLEANED; + break; + } + + if (lock == NULL) { + unlock_res(res); + break; + } + + /* Set CBPENDING so nothing in the cancellation path + * can match this lock. */ + lock->l_flags |= LDLM_FL_CBPENDING; + lock->l_flags |= LDLM_FL_FAILED; + lock->l_flags |= flags; + + /* ... without sending a CANCEL message for local_only. */ + if (local_only) + lock->l_flags |= LDLM_FL_LOCAL_ONLY; + + if (local_only && (lock->l_readers || lock->l_writers)) { + /* This is a little bit gross, but much better than the + * alternative: pretend that we got a blocking AST from + * the server, so that when the lock is decref'd, it + * will go away ... */ + unlock_res(res); + LDLM_DEBUG(lock, "setting FL_LOCAL_ONLY"); + if (lock->l_completion_ast) + lock->l_completion_ast(lock, 0, NULL); + LDLM_LOCK_RELEASE(lock); + continue; + } + + if (client) { + struct lustre_handle lockh; + + unlock_res(res); + ldlm_lock2handle(lock, &lockh); + rc = ldlm_cli_cancel(&lockh, LCF_ASYNC); + if (rc) + CERROR("ldlm_cli_cancel: %d\n", rc); + } else { + ldlm_resource_unlink_lock(lock); + unlock_res(res); + LDLM_DEBUG(lock, "Freeing a lock still held by a client node"); + ldlm_lock_destroy(lock); + } + LDLM_LOCK_RELEASE(lock); + } while (1); +} + +static int ldlm_resource_clean(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *arg) +{ + struct ldlm_resource *res = cfs_hash_object(hs, hnode); + __u64 flags = *(__u64 *)arg; + + cleanup_resource(res, &res->lr_granted, flags); + cleanup_resource(res, &res->lr_converting, flags); + cleanup_resource(res, &res->lr_waiting, flags); + + return 0; +} + +static int ldlm_resource_complain(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *arg) +{ + struct ldlm_resource *res = cfs_hash_object(hs, hnode); + + lock_res(res); + CERROR("%s: namespace resource "DLDLMRES + " (%p) refcount nonzero (%d) after lock cleanup; forcing cleanup.\n", + ldlm_ns_name(ldlm_res_to_ns(res)), PLDLMRES(res), res, + atomic_read(&res->lr_refcount) - 1); + + ldlm_resource_dump(D_ERROR, res); + unlock_res(res); + return 0; +} + +/** + * Cancel and destroy all locks in the namespace. + * + * Typically used during evictions when server notified client that it was + * evicted and all of its state needs to be destroyed. + * Also used during shutdown. + */ +int ldlm_namespace_cleanup(struct ldlm_namespace *ns, __u64 flags) +{ + if (ns == NULL) { + CDEBUG(D_INFO, "NULL ns, skipping cleanup\n"); + return ELDLM_OK; + } + + cfs_hash_for_each_nolock(ns->ns_rs_hash, ldlm_resource_clean, &flags); + cfs_hash_for_each_nolock(ns->ns_rs_hash, ldlm_resource_complain, NULL); + return ELDLM_OK; +} +EXPORT_SYMBOL(ldlm_namespace_cleanup); + +/** + * Attempts to free namespace. + * + * Only used when namespace goes away, like during an unmount. 
+ */ +static int __ldlm_namespace_free(struct ldlm_namespace *ns, int force) +{ + /* At shutdown time, don't call the cancellation callback */ + ldlm_namespace_cleanup(ns, force ? LDLM_FL_LOCAL_ONLY : 0); + + if (atomic_read(&ns->ns_bref) > 0) { + struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); + int rc; + + CDEBUG(D_DLMTRACE, + "dlm namespace %s free waiting on refcount %d\n", + ldlm_ns_name(ns), atomic_read(&ns->ns_bref)); +force_wait: + if (force) + lwi = LWI_TIMEOUT(obd_timeout * HZ / 4, NULL, NULL); + + rc = l_wait_event(ns->ns_waitq, + atomic_read(&ns->ns_bref) == 0, &lwi); + + /* Forced cleanups should be able to reclaim all references, + * so it's safe to wait forever... we can't leak locks... */ + if (force && rc == -ETIMEDOUT) { + LCONSOLE_ERROR("Forced cleanup waiting for %s namespace with %d resources in use, (rc=%d)\n", + ldlm_ns_name(ns), + atomic_read(&ns->ns_bref), rc); + goto force_wait; + } + + if (atomic_read(&ns->ns_bref)) { + LCONSOLE_ERROR("Cleanup waiting for %s namespace with %d resources in use, (rc=%d)\n", + ldlm_ns_name(ns), + atomic_read(&ns->ns_bref), rc); + return ELDLM_NAMESPACE_EXISTS; + } + CDEBUG(D_DLMTRACE, "dlm namespace %s free done waiting\n", + ldlm_ns_name(ns)); + } + + return ELDLM_OK; +} + +/** + * Performs various cleanups for passed \a ns to make it drop refc and be + * ready for freeing. Waits for refc == 0. + * + * The following is done: + * (0) Unregister \a ns from its list to make inaccessible for potential + * users like pools thread and others; + * (1) Clear all locks in \a ns. + */ +void ldlm_namespace_free_prior(struct ldlm_namespace *ns, + struct obd_import *imp, + int force) +{ + int rc; + + if (!ns) + return; + + spin_lock(&ns->ns_lock); + ns->ns_stopping = 1; + spin_unlock(&ns->ns_lock); + + /* + * Can fail with -EINTR when force == 0 in which case try harder. + */ + rc = __ldlm_namespace_free(ns, force); + if (rc != ELDLM_OK) { + if (imp) { + ptlrpc_disconnect_import(imp, 0); + ptlrpc_invalidate_import(imp); + } + + /* + * With all requests dropped and the import inactive + * we are guaranteed all reference will be dropped. + */ + rc = __ldlm_namespace_free(ns, 1); + LASSERT(rc == 0); + } +} + +/** + * Performs freeing memory structures related to \a ns. This is only done + * when ldlm_namespce_free_prior() successfully removed all resources + * referencing \a ns and its refc == 0. + */ +void ldlm_namespace_free_post(struct ldlm_namespace *ns) +{ + if (!ns) + return; + + /* Make sure that nobody can find this ns in its list. */ + ldlm_namespace_unregister(ns, ns->ns_client); + /* Fini pool _before_ parent proc dir is removed. This is important as + * ldlm_pool_fini() removes own proc dir which is child to @dir. + * Removing it after @dir may cause oops. */ + ldlm_pool_fini(&ns->ns_pool); + + ldlm_namespace_proc_unregister(ns); + cfs_hash_putref(ns->ns_rs_hash); + /* Namespace \a ns should be not on list at this time, otherwise + * this will cause issues related to using freed \a ns in poold + * thread. */ + LASSERT(list_empty(&ns->ns_list_chain)); + OBD_FREE_PTR(ns); + ldlm_put_ref(); +} + +/** + * Cleanup the resource, and free namespace. + * bug 12864: + * Deadlock issue: + * proc1: destroy import + * class_disconnect_export(grab cl_sem) -> + * -> ldlm_namespace_free -> + * -> lprocfs_remove(grab _lprocfs_lock). + * proc2: read proc info + * lprocfs_fops_read(grab _lprocfs_lock) -> + * -> osc_rd_active, etc(grab cl_sem). 
+ * + * So that I have to split the ldlm_namespace_free into two parts - the first + * part ldlm_namespace_free_prior is used to cleanup the resource which is + * being used; the 2nd part ldlm_namespace_free_post is used to unregister the + * lprocfs entries, and then free memory. It will be called w/o cli->cl_sem + * held. + */ +void ldlm_namespace_free(struct ldlm_namespace *ns, + struct obd_import *imp, + int force) +{ + ldlm_namespace_free_prior(ns, imp, force); + ldlm_namespace_free_post(ns); +} +EXPORT_SYMBOL(ldlm_namespace_free); + +void ldlm_namespace_get(struct ldlm_namespace *ns) +{ + atomic_inc(&ns->ns_bref); +} +EXPORT_SYMBOL(ldlm_namespace_get); + +/* This is only for callers that care about refcount */ +int ldlm_namespace_get_return(struct ldlm_namespace *ns) +{ + return atomic_inc_return(&ns->ns_bref); +} + +void ldlm_namespace_put(struct ldlm_namespace *ns) +{ + if (atomic_dec_and_lock(&ns->ns_bref, &ns->ns_lock)) { + wake_up(&ns->ns_waitq); + spin_unlock(&ns->ns_lock); + } +} +EXPORT_SYMBOL(ldlm_namespace_put); + +/** Register \a ns in the list of namespaces */ +void ldlm_namespace_register(struct ldlm_namespace *ns, ldlm_side_t client) +{ + mutex_lock(ldlm_namespace_lock(client)); + LASSERT(list_empty(&ns->ns_list_chain)); + list_add(&ns->ns_list_chain, ldlm_namespace_inactive_list(client)); + ldlm_namespace_nr_inc(client); + mutex_unlock(ldlm_namespace_lock(client)); +} + +/** Unregister \a ns from the list of namespaces. */ +void ldlm_namespace_unregister(struct ldlm_namespace *ns, ldlm_side_t client) +{ + mutex_lock(ldlm_namespace_lock(client)); + LASSERT(!list_empty(&ns->ns_list_chain)); + /* Some asserts and possibly other parts of the code are still + * using list_empty(&ns->ns_list_chain). This is why it is + * important to use list_del_init() here. */ + list_del_init(&ns->ns_list_chain); + ldlm_namespace_nr_dec(client); + mutex_unlock(ldlm_namespace_lock(client)); +} + +/** Should be called with ldlm_namespace_lock(client) taken. */ +void ldlm_namespace_move_to_active_locked(struct ldlm_namespace *ns, + ldlm_side_t client) +{ + LASSERT(!list_empty(&ns->ns_list_chain)); + LASSERT(mutex_is_locked(ldlm_namespace_lock(client))); + list_move_tail(&ns->ns_list_chain, ldlm_namespace_list(client)); +} + +/** Should be called with ldlm_namespace_lock(client) taken. */ +void ldlm_namespace_move_to_inactive_locked(struct ldlm_namespace *ns, + ldlm_side_t client) +{ + LASSERT(!list_empty(&ns->ns_list_chain)); + LASSERT(mutex_is_locked(ldlm_namespace_lock(client))); + list_move_tail(&ns->ns_list_chain, + ldlm_namespace_inactive_list(client)); +} + +/** Should be called with ldlm_namespace_lock(client) taken. */ +struct ldlm_namespace *ldlm_namespace_first_locked(ldlm_side_t client) +{ + LASSERT(mutex_is_locked(ldlm_namespace_lock(client))); + LASSERT(!list_empty(ldlm_namespace_list(client))); + return container_of(ldlm_namespace_list(client)->next, + struct ldlm_namespace, ns_list_chain); +} + +/** Create and initialize new resource. */ +static struct ldlm_resource *ldlm_resource_new(void) +{ + struct ldlm_resource *res; + int idx; + + OBD_SLAB_ALLOC_PTR_GFP(res, ldlm_resource_slab, GFP_NOFS); + if (res == NULL) + return NULL; + + INIT_LIST_HEAD(&res->lr_granted); + INIT_LIST_HEAD(&res->lr_converting); + INIT_LIST_HEAD(&res->lr_waiting); + + /* Initialize interval trees for each lock mode. 
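	 * lr_itree[idx] holds the granted extent locks of mode (1 << idx),
	 * i.e. one interval tree per LCK_* mode.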
*/ + for (idx = 0; idx < LCK_MODE_NUM; idx++) { + res->lr_itree[idx].lit_size = 0; + res->lr_itree[idx].lit_mode = 1 << idx; + res->lr_itree[idx].lit_root = NULL; + } + + atomic_set(&res->lr_refcount, 1); + spin_lock_init(&res->lr_lock); + lu_ref_init(&res->lr_reference); + + /* The creator of the resource must unlock the mutex after LVB + * initialization. */ + mutex_init(&res->lr_lvb_mutex); + mutex_lock(&res->lr_lvb_mutex); + + return res; +} + +/** + * Return a reference to resource with given name, creating it if necessary. + * Args: namespace with ns_lock unlocked + * Locks: takes and releases NS hash-lock and res->lr_lock + * Returns: referenced, unlocked ldlm_resource or NULL + */ +struct ldlm_resource * +ldlm_resource_get(struct ldlm_namespace *ns, struct ldlm_resource *parent, + const struct ldlm_res_id *name, ldlm_type_t type, int create) +{ + struct hlist_node *hnode; + struct ldlm_resource *res; + struct cfs_hash_bd bd; + __u64 version; + int ns_refcount = 0; + + LASSERT(ns != NULL); + LASSERT(parent == NULL); + LASSERT(ns->ns_rs_hash != NULL); + LASSERT(name->name[0] != 0); + + cfs_hash_bd_get_and_lock(ns->ns_rs_hash, (void *)name, &bd, 0); + hnode = cfs_hash_bd_lookup_locked(ns->ns_rs_hash, &bd, (void *)name); + if (hnode != NULL) { + cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 0); + res = hlist_entry(hnode, struct ldlm_resource, lr_hash); + /* Synchronize with regard to resource creation. */ + if (ns->ns_lvbo && ns->ns_lvbo->lvbo_init) { + mutex_lock(&res->lr_lvb_mutex); + mutex_unlock(&res->lr_lvb_mutex); + } + + if (unlikely(res->lr_lvb_len < 0)) { + ldlm_resource_putref(res); + res = NULL; + } + return res; + } + + version = cfs_hash_bd_version_get(&bd); + cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 0); + + if (create == 0) + return NULL; + + LASSERTF(type >= LDLM_MIN_TYPE && type < LDLM_MAX_TYPE, + "type: %d\n", type); + res = ldlm_resource_new(); + if (!res) + return NULL; + + res->lr_ns_bucket = cfs_hash_bd_extra_get(ns->ns_rs_hash, &bd); + res->lr_name = *name; + res->lr_type = type; + res->lr_most_restr = LCK_NL; + + cfs_hash_bd_lock(ns->ns_rs_hash, &bd, 1); + hnode = (version == cfs_hash_bd_version_get(&bd)) ? NULL : + cfs_hash_bd_lookup_locked(ns->ns_rs_hash, &bd, (void *)name); + + if (hnode != NULL) { + /* Someone won the race and already added the resource. */ + cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 1); + /* Clean lu_ref for failed resource. */ + lu_ref_fini(&res->lr_reference); + /* We have taken lr_lvb_mutex. Drop it. */ + mutex_unlock(&res->lr_lvb_mutex); + OBD_SLAB_FREE(res, ldlm_resource_slab, sizeof(*res)); + + res = hlist_entry(hnode, struct ldlm_resource, lr_hash); + /* Synchronize with regard to resource creation. */ + if (ns->ns_lvbo && ns->ns_lvbo->lvbo_init) { + mutex_lock(&res->lr_lvb_mutex); + mutex_unlock(&res->lr_lvb_mutex); + } + + if (unlikely(res->lr_lvb_len < 0)) { + ldlm_resource_putref(res); + res = NULL; + } + return res; + } + /* We won! Let's add the resource. 
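	 * The bucket version recorded before dropping the lock lets us skip
	 * the second lookup when nothing was added to this bucket in the
	 * meantime; if the version moved, the lookup above re-checks and the
	 * loser of the race frees its copy and reuses the existing resource.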
*/ + cfs_hash_bd_add_locked(ns->ns_rs_hash, &bd, &res->lr_hash); + if (cfs_hash_bd_count_get(&bd) == 1) + ns_refcount = ldlm_namespace_get_return(ns); + + cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 1); + if (ns->ns_lvbo && ns->ns_lvbo->lvbo_init) { + int rc; + + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CREATE_RESOURCE, 2); + rc = ns->ns_lvbo->lvbo_init(res); + if (rc < 0) { + CERROR("%s: lvbo_init failed for resource %#llx:%#llx: rc = %d\n", + ns->ns_obd->obd_name, name->name[0], + name->name[1], rc); + if (res->lr_lvb_data) { + OBD_FREE(res->lr_lvb_data, res->lr_lvb_len); + res->lr_lvb_data = NULL; + } + res->lr_lvb_len = rc; + mutex_unlock(&res->lr_lvb_mutex); + ldlm_resource_putref(res); + return NULL; + } + } + + /* We create resource with locked lr_lvb_mutex. */ + mutex_unlock(&res->lr_lvb_mutex); + + /* Let's see if we happened to be the very first resource in this + * namespace. If so, and this is a client namespace, we need to move + * the namespace into the active namespaces list to be patrolled by + * the ldlm_poold. */ + if (ns_is_client(ns) && ns_refcount == 1) { + mutex_lock(ldlm_namespace_lock(LDLM_NAMESPACE_CLIENT)); + ldlm_namespace_move_to_active_locked(ns, LDLM_NAMESPACE_CLIENT); + mutex_unlock(ldlm_namespace_lock(LDLM_NAMESPACE_CLIENT)); + } + + return res; +} +EXPORT_SYMBOL(ldlm_resource_get); + +struct ldlm_resource *ldlm_resource_getref(struct ldlm_resource *res) +{ + LASSERT(res != NULL); + LASSERT(res != LP_POISON); + atomic_inc(&res->lr_refcount); + CDEBUG(D_INFO, "getref res: %p count: %d\n", res, + atomic_read(&res->lr_refcount)); + return res; +} + +static void __ldlm_resource_putref_final(struct cfs_hash_bd *bd, + struct ldlm_resource *res) +{ + struct ldlm_ns_bucket *nsb = res->lr_ns_bucket; + + if (!list_empty(&res->lr_granted)) { + ldlm_resource_dump(D_ERROR, res); + LBUG(); + } + + if (!list_empty(&res->lr_converting)) { + ldlm_resource_dump(D_ERROR, res); + LBUG(); + } + + if (!list_empty(&res->lr_waiting)) { + ldlm_resource_dump(D_ERROR, res); + LBUG(); + } + + cfs_hash_bd_del_locked(nsb->nsb_namespace->ns_rs_hash, + bd, &res->lr_hash); + lu_ref_fini(&res->lr_reference); + if (cfs_hash_bd_count_get(bd) == 0) + ldlm_namespace_put(nsb->nsb_namespace); +} + +/* Returns 1 if the resource was freed, 0 if it remains. */ +int ldlm_resource_putref(struct ldlm_resource *res) +{ + struct ldlm_namespace *ns = ldlm_res_to_ns(res); + struct cfs_hash_bd bd; + + LASSERT_ATOMIC_GT_LT(&res->lr_refcount, 0, LI_POISON); + CDEBUG(D_INFO, "putref res: %p count: %d\n", + res, atomic_read(&res->lr_refcount) - 1); + + cfs_hash_bd_get(ns->ns_rs_hash, &res->lr_name, &bd); + if (cfs_hash_bd_dec_and_lock(ns->ns_rs_hash, &bd, &res->lr_refcount)) { + __ldlm_resource_putref_final(&bd, res); + cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 1); + if (ns->ns_lvbo && ns->ns_lvbo->lvbo_free) + ns->ns_lvbo->lvbo_free(res); + OBD_SLAB_FREE(res, ldlm_resource_slab, sizeof(*res)); + return 1; + } + return 0; +} +EXPORT_SYMBOL(ldlm_resource_putref); + +/* Returns 1 if the resource was freed, 0 if it remains. 
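 * Unlike ldlm_resource_putref(), the caller already holds the bucket
 * lock; it is dropped around the final free and re-taken before
 * returning (see the CFS_HASH_NO_ITEMREF note below).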
*/ +int ldlm_resource_putref_locked(struct ldlm_resource *res) +{ + struct ldlm_namespace *ns = ldlm_res_to_ns(res); + + LASSERT_ATOMIC_GT_LT(&res->lr_refcount, 0, LI_POISON); + CDEBUG(D_INFO, "putref res: %p count: %d\n", + res, atomic_read(&res->lr_refcount) - 1); + + if (atomic_dec_and_test(&res->lr_refcount)) { + struct cfs_hash_bd bd; + + cfs_hash_bd_get(ldlm_res_to_ns(res)->ns_rs_hash, + &res->lr_name, &bd); + __ldlm_resource_putref_final(&bd, res); + cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 1); + /* NB: ns_rs_hash is created with CFS_HASH_NO_ITEMREF, + * so we should never be here while calling cfs_hash_del, + * cfs_hash_for_each_nolock is the only case we can get + * here, which is safe to release cfs_hash_bd_lock. + */ + if (ns->ns_lvbo && ns->ns_lvbo->lvbo_free) + ns->ns_lvbo->lvbo_free(res); + OBD_SLAB_FREE(res, ldlm_resource_slab, sizeof(*res)); + + cfs_hash_bd_lock(ns->ns_rs_hash, &bd, 1); + return 1; + } + return 0; +} + +/** + * Add a lock into a given resource into specified lock list. + */ +void ldlm_resource_add_lock(struct ldlm_resource *res, struct list_head *head, + struct ldlm_lock *lock) +{ + check_res_locked(res); + + LDLM_DEBUG(lock, "About to add this lock:\n"); + + if (lock->l_flags & LDLM_FL_DESTROYED) { + CDEBUG(D_OTHER, "Lock destroyed, not adding to resource\n"); + return; + } + + LASSERT(list_empty(&lock->l_res_link)); + + list_add_tail(&lock->l_res_link, head); +} + +/** + * Insert a lock into resource after specified lock. + * + * Obtain resource description from the lock we are inserting after. + */ +void ldlm_resource_insert_lock_after(struct ldlm_lock *original, + struct ldlm_lock *new) +{ + struct ldlm_resource *res = original->l_resource; + + check_res_locked(res); + + ldlm_resource_dump(D_INFO, res); + LDLM_DEBUG(new, "About to insert this lock after %p:\n", original); + + if (new->l_flags & LDLM_FL_DESTROYED) { + CDEBUG(D_OTHER, "Lock destroyed, not adding to resource\n"); + goto out; + } + + LASSERT(list_empty(&new->l_res_link)); + + list_add(&new->l_res_link, &original->l_res_link); + out:; +} + +void ldlm_resource_unlink_lock(struct ldlm_lock *lock) +{ + int type = lock->l_resource->lr_type; + + check_res_locked(lock->l_resource); + if (type == LDLM_IBITS || type == LDLM_PLAIN) + ldlm_unlink_lock_skiplist(lock); + else if (type == LDLM_EXTENT) + ldlm_extent_unlink_lock(lock); + list_del_init(&lock->l_res_link); +} +EXPORT_SYMBOL(ldlm_resource_unlink_lock); + +void ldlm_res2desc(struct ldlm_resource *res, struct ldlm_resource_desc *desc) +{ + desc->lr_type = res->lr_type; + desc->lr_name = res->lr_name; +} + +/** + * Print information about all locks in all namespaces on this node to debug + * log. 
+ */ +void ldlm_dump_all_namespaces(ldlm_side_t client, int level) +{ + struct list_head *tmp; + + if (!((libcfs_debug | D_ERROR) & level)) + return; + + mutex_lock(ldlm_namespace_lock(client)); + + list_for_each(tmp, ldlm_namespace_list(client)) { + struct ldlm_namespace *ns; + + ns = list_entry(tmp, struct ldlm_namespace, ns_list_chain); + ldlm_namespace_dump(level, ns); + } + + mutex_unlock(ldlm_namespace_lock(client)); +} +EXPORT_SYMBOL(ldlm_dump_all_namespaces); + +static int ldlm_res_hash_dump(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *arg) +{ + struct ldlm_resource *res = cfs_hash_object(hs, hnode); + int level = (int)(unsigned long)arg; + + lock_res(res); + ldlm_resource_dump(level, res); + unlock_res(res); + + return 0; +} + +/** + * Print information about all locks in this namespace on this node to debug + * log. + */ +void ldlm_namespace_dump(int level, struct ldlm_namespace *ns) +{ + if (!((libcfs_debug | D_ERROR) & level)) + return; + + CDEBUG(level, "--- Namespace: %s (rc: %d, side: %s)\n", + ldlm_ns_name(ns), atomic_read(&ns->ns_bref), + ns_is_client(ns) ? "client" : "server"); + + if (time_before(cfs_time_current(), ns->ns_next_dump)) + return; + + cfs_hash_for_each_nolock(ns->ns_rs_hash, + ldlm_res_hash_dump, + (void *)(unsigned long)level); + spin_lock(&ns->ns_lock); + ns->ns_next_dump = cfs_time_shift(10); + spin_unlock(&ns->ns_lock); +} +EXPORT_SYMBOL(ldlm_namespace_dump); + +/** + * Print information about all locks in this resource to debug log. + */ +void ldlm_resource_dump(int level, struct ldlm_resource *res) +{ + struct ldlm_lock *lock; + unsigned int granted = 0; + + CLASSERT(RES_NAME_SIZE == 4); + + if (!((libcfs_debug | D_ERROR) & level)) + return; + + CDEBUG(level, "--- Resource: "DLDLMRES" (%p) refcount = %d\n", + PLDLMRES(res), res, atomic_read(&res->lr_refcount)); + + if (!list_empty(&res->lr_granted)) { + CDEBUG(level, "Granted locks (in reverse order):\n"); + list_for_each_entry_reverse(lock, &res->lr_granted, + l_res_link) { + LDLM_DEBUG_LIMIT(level, lock, "###"); + if (!(level & D_CANTMASK) && + ++granted > ldlm_dump_granted_max) { + CDEBUG(level, "only dump %d granted locks to avoid DDOS.\n", + granted); + break; + } + } + } + if (!list_empty(&res->lr_converting)) { + CDEBUG(level, "Converting locks:\n"); + list_for_each_entry(lock, &res->lr_converting, l_res_link) + LDLM_DEBUG_LIMIT(level, lock, "###"); + } + if (!list_empty(&res->lr_waiting)) { + CDEBUG(level, "Waiting locks:\n"); + list_for_each_entry(lock, &res->lr_waiting, l_res_link) + LDLM_DEBUG_LIMIT(level, lock, "###"); + } +} diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/Makefile b/kernel/drivers/staging/lustre/lustre/libcfs/Makefile new file mode 100644 index 000000000..2996a48a3 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/libcfs/Makefile @@ -0,0 +1,18 @@ +obj-$(CONFIG_LUSTRE_FS) += libcfs.o + +libcfs-linux-objs := linux-tracefile.o linux-debug.o +libcfs-linux-objs += linux-prim.o linux-cpu.o +libcfs-linux-objs += linux-tcpip.o +libcfs-linux-objs += linux-curproc.o +libcfs-linux-objs += linux-module.o +libcfs-linux-objs += linux-crypto.o +libcfs-linux-objs += linux-crypto-adler.o + +libcfs-linux-objs := $(addprefix linux/,$(libcfs-linux-objs)) + +libcfs-all-objs := debug.o fail.o nidstrings.o module.o tracefile.o \ + libcfs_string.o hash.o kernel_user_comm.o \ + prng.o workitem.o libcfs_cpu.o \ + libcfs_mem.o libcfs_lock.o + +libcfs-objs := $(libcfs-linux-objs) $(libcfs-all-objs) diff --git 
a/kernel/drivers/staging/lustre/lustre/libcfs/debug.c b/kernel/drivers/staging/lustre/lustre/libcfs/debug.c new file mode 100644 index 000000000..021c92fa0 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/libcfs/debug.c @@ -0,0 +1,460 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/libcfs/debug.c + * + * Author: Phil Schwan + * + */ + +# define DEBUG_SUBSYSTEM S_LNET + +#include "../../include/linux/libcfs/libcfs.h" +#include "tracefile.h" + +static char debug_file_name[1024]; + +unsigned int libcfs_subsystem_debug = ~0; +module_param(libcfs_subsystem_debug, int, 0644); +MODULE_PARM_DESC(libcfs_subsystem_debug, "Lustre kernel debug subsystem mask"); +EXPORT_SYMBOL(libcfs_subsystem_debug); + +unsigned int libcfs_debug = (D_CANTMASK | + D_NETERROR | D_HA | D_CONFIG | D_IOCTL); +module_param(libcfs_debug, int, 0644); +MODULE_PARM_DESC(libcfs_debug, "Lustre kernel debug mask"); +EXPORT_SYMBOL(libcfs_debug); + +static unsigned int libcfs_debug_mb; +module_param(libcfs_debug_mb, uint, 0644); +MODULE_PARM_DESC(libcfs_debug_mb, "Total debug buffer size."); +EXPORT_SYMBOL(libcfs_debug_mb); + +unsigned int libcfs_printk = D_CANTMASK; +module_param(libcfs_printk, uint, 0644); +MODULE_PARM_DESC(libcfs_printk, "Lustre kernel debug console mask"); +EXPORT_SYMBOL(libcfs_printk); + +unsigned int libcfs_console_ratelimit = 1; +module_param(libcfs_console_ratelimit, uint, 0644); +MODULE_PARM_DESC(libcfs_console_ratelimit, "Lustre kernel debug console ratelimit (0 to disable)"); +EXPORT_SYMBOL(libcfs_console_ratelimit); + +unsigned int libcfs_console_max_delay; +module_param(libcfs_console_max_delay, uint, 0644); +MODULE_PARM_DESC(libcfs_console_max_delay, "Lustre kernel debug console max delay (jiffies)"); +EXPORT_SYMBOL(libcfs_console_max_delay); + +unsigned int libcfs_console_min_delay; +module_param(libcfs_console_min_delay, uint, 0644); +MODULE_PARM_DESC(libcfs_console_min_delay, "Lustre kernel debug console min delay (jiffies)"); +EXPORT_SYMBOL(libcfs_console_min_delay); + +unsigned int libcfs_console_backoff = CDEBUG_DEFAULT_BACKOFF; +module_param(libcfs_console_backoff, uint, 0644); +MODULE_PARM_DESC(libcfs_console_backoff, "Lustre kernel debug console backoff factor"); +EXPORT_SYMBOL(libcfs_console_backoff); + +unsigned int 
libcfs_debug_binary = 1; +EXPORT_SYMBOL(libcfs_debug_binary); + +unsigned int libcfs_stack = 3 * THREAD_SIZE / 4; +EXPORT_SYMBOL(libcfs_stack); + +static unsigned int portal_enter_debugger; +EXPORT_SYMBOL(portal_enter_debugger); + +unsigned int libcfs_catastrophe; +EXPORT_SYMBOL(libcfs_catastrophe); + +unsigned int libcfs_watchdog_ratelimit = 300; +EXPORT_SYMBOL(libcfs_watchdog_ratelimit); + +unsigned int libcfs_panic_on_lbug = 1; +module_param(libcfs_panic_on_lbug, uint, 0644); +MODULE_PARM_DESC(libcfs_panic_on_lbug, "Lustre kernel panic on LBUG"); +EXPORT_SYMBOL(libcfs_panic_on_lbug); + +atomic_t libcfs_kmemory = ATOMIC_INIT(0); +EXPORT_SYMBOL(libcfs_kmemory); + +static wait_queue_head_t debug_ctlwq; + +char libcfs_debug_file_path_arr[PATH_MAX] = LIBCFS_DEBUG_FILE_PATH_DEFAULT; + +/* We need to pass a pointer here, but elsewhere this must be a const */ +static char *libcfs_debug_file_path; +module_param(libcfs_debug_file_path, charp, 0644); +MODULE_PARM_DESC(libcfs_debug_file_path, + "Path for dumping debug logs, set 'NONE' to prevent log dumping"); + +int libcfs_panic_in_progress; + +/* libcfs_debug_token2mask() expects the returned + * string in lower-case */ +static const char * +libcfs_debug_subsys2str(int subsys) +{ + switch (1 << subsys) { + default: + return NULL; + case S_UNDEFINED: + return "undefined"; + case S_MDC: + return "mdc"; + case S_MDS: + return "mds"; + case S_OSC: + return "osc"; + case S_OST: + return "ost"; + case S_CLASS: + return "class"; + case S_LOG: + return "log"; + case S_LLITE: + return "llite"; + case S_RPC: + return "rpc"; + case S_LNET: + return "lnet"; + case S_LND: + return "lnd"; + case S_PINGER: + return "pinger"; + case S_FILTER: + return "filter"; + case S_ECHO: + return "echo"; + case S_LDLM: + return "ldlm"; + case S_LOV: + return "lov"; + case S_LQUOTA: + return "lquota"; + case S_OSD: + return "osd"; + case S_LMV: + return "lmv"; + case S_SEC: + return "sec"; + case S_GSS: + return "gss"; + case S_MGC: + return "mgc"; + case S_MGS: + return "mgs"; + case S_FID: + return "fid"; + case S_FLD: + return "fld"; + } +} + +/* libcfs_debug_token2mask() expects the returned + * string in lower-case */ +static const char * +libcfs_debug_dbg2str(int debug) +{ + switch (1 << debug) { + default: + return NULL; + case D_TRACE: + return "trace"; + case D_INODE: + return "inode"; + case D_SUPER: + return "super"; + case D_EXT2: + return "ext2"; + case D_MALLOC: + return "malloc"; + case D_CACHE: + return "cache"; + case D_INFO: + return "info"; + case D_IOCTL: + return "ioctl"; + case D_NETERROR: + return "neterror"; + case D_NET: + return "net"; + case D_WARNING: + return "warning"; + case D_BUFFS: + return "buffs"; + case D_OTHER: + return "other"; + case D_DENTRY: + return "dentry"; + case D_NETTRACE: + return "nettrace"; + case D_PAGE: + return "page"; + case D_DLMTRACE: + return "dlmtrace"; + case D_ERROR: + return "error"; + case D_EMERG: + return "emerg"; + case D_HA: + return "ha"; + case D_RPCTRACE: + return "rpctrace"; + case D_VFSTRACE: + return "vfstrace"; + case D_READA: + return "reada"; + case D_MMAP: + return "mmap"; + case D_CONFIG: + return "config"; + case D_CONSOLE: + return "console"; + case D_QUOTA: + return "quota"; + case D_SEC: + return "sec"; + case D_LFSCK: + return "lfsck"; + } +} + +int +libcfs_debug_mask2str(char *str, int size, int mask, int is_subsys) +{ + const char *(*fn)(int bit) = is_subsys ? 
libcfs_debug_subsys2str : + libcfs_debug_dbg2str; + int len = 0; + const char *token; + int i; + + if (mask == 0) { /* "0" */ + if (size > 0) + str[0] = '0'; + len = 1; + } else { /* space-separated tokens */ + for (i = 0; i < 32; i++) { + if ((mask & (1 << i)) == 0) + continue; + + token = fn(i); + if (token == NULL) /* unused bit */ + continue; + + if (len > 0) { /* separator? */ + if (len < size) + str[len] = ' '; + len++; + } + + while (*token != 0) { + if (len < size) + str[len] = *token; + token++; + len++; + } + } + } + + /* terminate 'str' */ + if (len < size) + str[len] = 0; + else + str[size - 1] = 0; + + return len; +} + +int +libcfs_debug_str2mask(int *mask, const char *str, int is_subsys) +{ + const char *(*fn)(int bit) = is_subsys ? libcfs_debug_subsys2str : + libcfs_debug_dbg2str; + int m = 0; + int matched; + int n; + int t; + + /* Allow a number for backwards compatibility */ + + for (n = strlen(str); n > 0; n--) + if (!isspace(str[n-1])) + break; + matched = n; + t = sscanf(str, "%i%n", &m, &matched); + if (t >= 1 && matched == n) { + /* don't print warning for lctl set_param debug=0 or -1 */ + if (m != 0 && m != -1) + CWARN("You are trying to use a numerical value for the mask - this will be deprecated in a future release.\n"); + *mask = m; + return 0; + } + + return cfs_str2mask(str, fn, mask, is_subsys ? 0 : D_CANTMASK, + 0xffffffff); +} + +/** + * Dump Lustre log to ::debug_file_path by calling tracefile_dump_all_pages() + */ +void libcfs_debug_dumplog_internal(void *arg) +{ + void *journal_info; + + journal_info = current->journal_info; + current->journal_info = NULL; + + if (strncmp(libcfs_debug_file_path_arr, "NONE", 4) != 0) { + snprintf(debug_file_name, sizeof(debug_file_name) - 1, + "%s.%ld.%ld", libcfs_debug_file_path_arr, + get_seconds(), (long_ptr_t)arg); + pr_alert("LustreError: dumping log to %s\n", + debug_file_name); + cfs_tracefile_dump_all_pages(debug_file_name); + libcfs_run_debug_log_upcall(debug_file_name); + } + + current->journal_info = journal_info; +} + +static int libcfs_debug_dumplog_thread(void *arg) +{ + libcfs_debug_dumplog_internal(arg); + wake_up(&debug_ctlwq); + return 0; +} + +void libcfs_debug_dumplog(void) +{ + wait_queue_t wait; + struct task_struct *dumper; + + /* we're being careful to ensure that the kernel thread is + * able to set our state to running as it exits before we + * get to schedule() */ + init_waitqueue_entry(&wait, current); + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&debug_ctlwq, &wait); + + dumper = kthread_run(libcfs_debug_dumplog_thread, + (void *)(long)current_pid(), + "libcfs_debug_dumper"); + if (IS_ERR(dumper)) + pr_err("LustreError: cannot start log dump thread: %ld\n", + PTR_ERR(dumper)); + else + schedule(); + + /* be sure to teardown if cfs_create_thread() failed */ + remove_wait_queue(&debug_ctlwq, &wait); + set_current_state(TASK_RUNNING); +} +EXPORT_SYMBOL(libcfs_debug_dumplog); + +int libcfs_debug_init(unsigned long bufsize) +{ + int rc = 0; + unsigned int max = libcfs_debug_mb; + + init_waitqueue_head(&debug_ctlwq); + + if (libcfs_console_max_delay <= 0 || /* not set by user or */ + libcfs_console_min_delay <= 0 || /* set to invalid values */ + libcfs_console_min_delay >= libcfs_console_max_delay) { + libcfs_console_max_delay = CDEBUG_DEFAULT_MAX_DELAY; + libcfs_console_min_delay = CDEBUG_DEFAULT_MIN_DELAY; + } + + if (libcfs_debug_file_path != NULL) { + strncpy(libcfs_debug_file_path_arr, + libcfs_debug_file_path, PATH_MAX-1); + libcfs_debug_file_path_arr[PATH_MAX - 1] = '\0'; + } + + 
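	/*
	 * Worked example for the sizing below (assuming 4 KB pages, i.e.
	 * PAGE_CACHE_SHIFT == 12): libcfs_debug_mb = 256 on an 8-CPU machine
	 * gives 256 / 8 = 32 MB per CPU, i.e. 32 << (20 - 12) = 8192 trace
	 * pages per CPU passed to cfs_tracefile_init().
	 */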
/* If libcfs_debug_mb is set to an invalid value or uninitialized + * then just make the total buffers smp_num_cpus * TCD_MAX_PAGES */ + if (max > cfs_trace_max_debug_mb() || max < num_possible_cpus()) { + max = TCD_MAX_PAGES; + } else { + max = max / num_possible_cpus(); + max <<= (20 - PAGE_CACHE_SHIFT); + } + rc = cfs_tracefile_init(max); + + if (rc == 0) + libcfs_register_panic_notifier(); + + return rc; +} + +int libcfs_debug_cleanup(void) +{ + libcfs_unregister_panic_notifier(); + cfs_tracefile_exit(); + return 0; +} + +int libcfs_debug_clear_buffer(void) +{ + cfs_trace_flush_pages(); + return 0; +} + +/* Debug markers, although printed by S_LNET + * should not be be marked as such. */ +#undef DEBUG_SUBSYSTEM +#define DEBUG_SUBSYSTEM S_UNDEFINED +int libcfs_debug_mark_buffer(const char *text) +{ + CDEBUG(D_TRACE, + "***************************************************\n"); + LCONSOLE(D_WARNING, "DEBUG MARKER: %s\n", text); + CDEBUG(D_TRACE, + "***************************************************\n"); + + return 0; +} +#undef DEBUG_SUBSYSTEM +#define DEBUG_SUBSYSTEM S_LNET + +void libcfs_debug_set_level(unsigned int debug_level) +{ + pr_warn("Lustre: Setting portals debug level to %08x\n", + debug_level); + libcfs_debug = debug_level; +} + +EXPORT_SYMBOL(libcfs_debug_set_level); diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/fail.c b/kernel/drivers/staging/lustre/lustre/libcfs/fail.c new file mode 100644 index 000000000..92444b0fe --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/libcfs/fail.c @@ -0,0 +1,138 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please contact Oracle Corporation, Inc., 500 Oracle Parkway, Redwood Shores, + * CA 94065 USA or visit www.oracle.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Oracle Corporation, Inc. 
+ */ + +#include "../../include/linux/libcfs/libcfs.h" + +unsigned long cfs_fail_loc = 0; +EXPORT_SYMBOL(cfs_fail_loc); + +unsigned int cfs_fail_val = 0; +EXPORT_SYMBOL(cfs_fail_val); + +wait_queue_head_t cfs_race_waitq; +EXPORT_SYMBOL(cfs_race_waitq); + +int cfs_race_state; +EXPORT_SYMBOL(cfs_race_state); + +int __cfs_fail_check_set(__u32 id, __u32 value, int set) +{ + static atomic_t cfs_fail_count = ATOMIC_INIT(0); + + LASSERT(!(id & CFS_FAIL_ONCE)); + + if ((cfs_fail_loc & (CFS_FAILED | CFS_FAIL_ONCE)) == + (CFS_FAILED | CFS_FAIL_ONCE)) { + atomic_set(&cfs_fail_count, 0); /* paranoia */ + return 0; + } + + /* Fail 1/cfs_fail_val times */ + if (cfs_fail_loc & CFS_FAIL_RAND) { + if (cfs_fail_val < 2 || cfs_rand() % cfs_fail_val > 0) + return 0; + } + + /* Skip the first cfs_fail_val, then fail */ + if (cfs_fail_loc & CFS_FAIL_SKIP) { + if (atomic_inc_return(&cfs_fail_count) <= cfs_fail_val) + return 0; + } + + /* check cfs_fail_val... */ + if (set == CFS_FAIL_LOC_VALUE) { + if (cfs_fail_val != -1 && cfs_fail_val != value) + return 0; + } + + /* Fail cfs_fail_val times, overridden by FAIL_ONCE */ + if (cfs_fail_loc & CFS_FAIL_SOME && + (!(cfs_fail_loc & CFS_FAIL_ONCE) || cfs_fail_val <= 1)) { + int count = atomic_inc_return(&cfs_fail_count); + + if (count >= cfs_fail_val) { + set_bit(CFS_FAIL_ONCE_BIT, &cfs_fail_loc); + atomic_set(&cfs_fail_count, 0); + /* we are lost race to increase */ + if (count > cfs_fail_val) + return 0; + } + } + + if ((set == CFS_FAIL_LOC_ORSET || set == CFS_FAIL_LOC_RESET) && + (value & CFS_FAIL_ONCE)) + set_bit(CFS_FAIL_ONCE_BIT, &cfs_fail_loc); + /* Lost race to set CFS_FAILED_BIT. */ + if (test_and_set_bit(CFS_FAILED_BIT, &cfs_fail_loc)) { + /* If CFS_FAIL_ONCE is valid, only one process can fail, + * otherwise multi-process can fail at the same time. */ + if (cfs_fail_loc & CFS_FAIL_ONCE) + return 0; + } + + switch (set) { + case CFS_FAIL_LOC_NOSET: + case CFS_FAIL_LOC_VALUE: + break; + case CFS_FAIL_LOC_ORSET: + cfs_fail_loc |= value & ~(CFS_FAILED | CFS_FAIL_ONCE); + break; + case CFS_FAIL_LOC_RESET: + cfs_fail_loc = value; + break; + default: + LASSERTF(0, "called with bad set %u\n", set); + break; + } + + return 1; +} +EXPORT_SYMBOL(__cfs_fail_check_set); + +int __cfs_fail_timeout_set(__u32 id, __u32 value, int ms, int set) +{ + int ret = 0; + + ret = __cfs_fail_check_set(id, value, set); + if (ret) { + CERROR("cfs_fail_timeout id %x sleeping for %dms\n", + id, ms); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(ms) / 1000); + CERROR("cfs_fail_timeout id %x awake\n", id); + } + return ret; +} +EXPORT_SYMBOL(__cfs_fail_timeout_set); diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/hash.c b/kernel/drivers/staging/lustre/lustre/libcfs/hash.c new file mode 100644 index 000000000..a55567e0d --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/libcfs/hash.c @@ -0,0 +1,2098 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/libcfs/hash.c + * + * Implement a hash class for hash process in lustre system. + * + * Author: YuZhangyong + * + * 2008-08-15: Brian Behlendorf + * - Simplified API and improved documentation + * - Added per-hash feature flags: + * * CFS_HASH_DEBUG additional validation + * * CFS_HASH_REHASH dynamic rehashing + * - Added per-hash statistics + * - General performance enhancements + * + * 2009-07-31: Liang Zhen + * - move all stuff to libcfs + * - don't allow cur_bits != max_bits without setting of CFS_HASH_REHASH + * - ignore hs_rwlock if without CFS_HASH_REHASH setting + * - buckets are allocated one by one(instead of contiguous memory), + * to avoid unnecessary cacheline conflict + * + * 2010-03-01: Liang Zhen + * - "bucket" is a group of hlist_head now, user can specify bucket size + * by bkt_bits of cfs_hash_create(), all hlist_heads in a bucket share + * one lock for reducing memory overhead. + * + * - support lockless hash, caller will take care of locks: + * avoid lock overhead for hash tables that are already protected + * by locking in the caller for another reason + * + * - support both spin_lock/rwlock for bucket: + * overhead of spinlock contention is lower than read/write + * contention of rwlock, so using spinlock to serialize operations on + * bucket is more reasonable for those frequently changed hash tables + * + * - support one-single lock mode: + * one lock to protect all hash operations to avoid overhead of + * multiple locks if hash table is always small + * + * - removed a lot of unnecessary addref & decref on hash element: + * addref & decref are atomic operations in many use-cases which + * are expensive. + * + * - support non-blocking cfs_hash_add() and cfs_hash_findadd(): + * some lustre use-cases require these functions to be strictly + * non-blocking, we need to schedule required rehash on a different + * thread on those cases. + * + * - safer rehash on large hash table + * In old implementation, rehash function will exclusively lock the + * hash table and finish rehash in one batch, it's dangerous on SMP + * system because rehash millions of elements could take long time. + * New implemented rehash can release lock and relax CPU in middle + * of rehash, it's safe for another thread to search/change on the + * hash table even it's in rehasing. + * + * - support two different refcount modes + * . hash table has refcount on element + * . hash table doesn't change refcount on adding/removing element + * + * - support long name hash table (for param-tree) + * + * - fix a bug for cfs_hash_rehash_key: + * in old implementation, cfs_hash_rehash_key could screw up the + * hash-table because @key is overwritten without any protection. 
+ * Now we need user to define hs_keycpy for those rehash enabled + * hash tables, cfs_hash_rehash_key will overwrite hash-key + * inside lock by calling hs_keycpy. + * + * - better hash iteration: + * Now we support both locked iteration & lockless iteration of hash + * table. Also, user can break the iteration by return 1 in callback. + */ + +#include "../../include/linux/libcfs/libcfs.h" +#include + +#if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1 +static unsigned int warn_on_depth = 8; +module_param(warn_on_depth, uint, 0644); +MODULE_PARM_DESC(warn_on_depth, "warning when hash depth is high."); +#endif + +struct cfs_wi_sched *cfs_sched_rehash; + +static inline void +cfs_hash_nl_lock(union cfs_hash_lock *lock, int exclusive) {} + +static inline void +cfs_hash_nl_unlock(union cfs_hash_lock *lock, int exclusive) {} + +static inline void +cfs_hash_spin_lock(union cfs_hash_lock *lock, int exclusive) + __acquires(&lock->spin) +{ + spin_lock(&lock->spin); +} + +static inline void +cfs_hash_spin_unlock(union cfs_hash_lock *lock, int exclusive) + __releases(&lock->spin) +{ + spin_unlock(&lock->spin); +} + +static inline void +cfs_hash_rw_lock(union cfs_hash_lock *lock, int exclusive) + __acquires(&lock->rw) +{ + if (!exclusive) + read_lock(&lock->rw); + else + write_lock(&lock->rw); +} + +static inline void +cfs_hash_rw_unlock(union cfs_hash_lock *lock, int exclusive) + __releases(&lock->rw) +{ + if (!exclusive) + read_unlock(&lock->rw); + else + write_unlock(&lock->rw); +} + +/** No lock hash */ +static cfs_hash_lock_ops_t cfs_hash_nl_lops = { + .hs_lock = cfs_hash_nl_lock, + .hs_unlock = cfs_hash_nl_unlock, + .hs_bkt_lock = cfs_hash_nl_lock, + .hs_bkt_unlock = cfs_hash_nl_unlock, +}; + +/** no bucket lock, one spinlock to protect everything */ +static cfs_hash_lock_ops_t cfs_hash_nbl_lops = { + .hs_lock = cfs_hash_spin_lock, + .hs_unlock = cfs_hash_spin_unlock, + .hs_bkt_lock = cfs_hash_nl_lock, + .hs_bkt_unlock = cfs_hash_nl_unlock, +}; + +/** spin bucket lock, rehash is enabled */ +static cfs_hash_lock_ops_t cfs_hash_bkt_spin_lops = { + .hs_lock = cfs_hash_rw_lock, + .hs_unlock = cfs_hash_rw_unlock, + .hs_bkt_lock = cfs_hash_spin_lock, + .hs_bkt_unlock = cfs_hash_spin_unlock, +}; + +/** rw bucket lock, rehash is enabled */ +static cfs_hash_lock_ops_t cfs_hash_bkt_rw_lops = { + .hs_lock = cfs_hash_rw_lock, + .hs_unlock = cfs_hash_rw_unlock, + .hs_bkt_lock = cfs_hash_rw_lock, + .hs_bkt_unlock = cfs_hash_rw_unlock, +}; + +/** spin bucket lock, rehash is disabled */ +static cfs_hash_lock_ops_t cfs_hash_nr_bkt_spin_lops = { + .hs_lock = cfs_hash_nl_lock, + .hs_unlock = cfs_hash_nl_unlock, + .hs_bkt_lock = cfs_hash_spin_lock, + .hs_bkt_unlock = cfs_hash_spin_unlock, +}; + +/** rw bucket lock, rehash is disabled */ +static cfs_hash_lock_ops_t cfs_hash_nr_bkt_rw_lops = { + .hs_lock = cfs_hash_nl_lock, + .hs_unlock = cfs_hash_nl_unlock, + .hs_bkt_lock = cfs_hash_rw_lock, + .hs_bkt_unlock = cfs_hash_rw_unlock, +}; + +static void +cfs_hash_lock_setup(struct cfs_hash *hs) +{ + if (cfs_hash_with_no_lock(hs)) { + hs->hs_lops = &cfs_hash_nl_lops; + + } else if (cfs_hash_with_no_bktlock(hs)) { + hs->hs_lops = &cfs_hash_nbl_lops; + spin_lock_init(&hs->hs_lock.spin); + + } else if (cfs_hash_with_rehash(hs)) { + rwlock_init(&hs->hs_lock.rw); + + if (cfs_hash_with_rw_bktlock(hs)) + hs->hs_lops = &cfs_hash_bkt_rw_lops; + else if (cfs_hash_with_spin_bktlock(hs)) + hs->hs_lops = &cfs_hash_bkt_spin_lops; + else + LBUG(); + } else { + if (cfs_hash_with_rw_bktlock(hs)) + hs->hs_lops = &cfs_hash_nr_bkt_rw_lops; + else 
if (cfs_hash_with_spin_bktlock(hs)) + hs->hs_lops = &cfs_hash_nr_bkt_spin_lops; + else + LBUG(); + } +} + +/** + * Simple hash head without depth tracking + * new element is always added to head of hlist + */ +typedef struct { + struct hlist_head hh_head; /**< entries list */ +} cfs_hash_head_t; + +static int +cfs_hash_hh_hhead_size(struct cfs_hash *hs) +{ + return sizeof(cfs_hash_head_t); +} + +static struct hlist_head * +cfs_hash_hh_hhead(struct cfs_hash *hs, struct cfs_hash_bd *bd) +{ + cfs_hash_head_t *head = (cfs_hash_head_t *)&bd->bd_bucket->hsb_head[0]; + + return &head[bd->bd_offset].hh_head; +} + +static int +cfs_hash_hh_hnode_add(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + hlist_add_head(hnode, cfs_hash_hh_hhead(hs, bd)); + return -1; /* unknown depth */ +} + +static int +cfs_hash_hh_hnode_del(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + hlist_del_init(hnode); + return -1; /* unknown depth */ +} + +/** + * Simple hash head with depth tracking + * new element is always added to head of hlist + */ +typedef struct { + struct hlist_head hd_head; /**< entries list */ + unsigned int hd_depth; /**< list length */ +} cfs_hash_head_dep_t; + +static int +cfs_hash_hd_hhead_size(struct cfs_hash *hs) +{ + return sizeof(cfs_hash_head_dep_t); +} + +static struct hlist_head * +cfs_hash_hd_hhead(struct cfs_hash *hs, struct cfs_hash_bd *bd) +{ + cfs_hash_head_dep_t *head; + + head = (cfs_hash_head_dep_t *)&bd->bd_bucket->hsb_head[0]; + return &head[bd->bd_offset].hd_head; +} + +static int +cfs_hash_hd_hnode_add(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + cfs_hash_head_dep_t *hh = container_of(cfs_hash_hd_hhead(hs, bd), + cfs_hash_head_dep_t, hd_head); + hlist_add_head(hnode, &hh->hd_head); + return ++hh->hd_depth; +} + +static int +cfs_hash_hd_hnode_del(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + cfs_hash_head_dep_t *hh = container_of(cfs_hash_hd_hhead(hs, bd), + cfs_hash_head_dep_t, hd_head); + hlist_del_init(hnode); + return --hh->hd_depth; +} + +/** + * double links hash head without depth tracking + * new element is always added to tail of hlist + */ +typedef struct { + struct hlist_head dh_head; /**< entries list */ + struct hlist_node *dh_tail; /**< the last entry */ +} cfs_hash_dhead_t; + +static int +cfs_hash_dh_hhead_size(struct cfs_hash *hs) +{ + return sizeof(cfs_hash_dhead_t); +} + +static struct hlist_head * +cfs_hash_dh_hhead(struct cfs_hash *hs, struct cfs_hash_bd *bd) +{ + cfs_hash_dhead_t *head; + + head = (cfs_hash_dhead_t *)&bd->bd_bucket->hsb_head[0]; + return &head[bd->bd_offset].dh_head; +} + +static int +cfs_hash_dh_hnode_add(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + cfs_hash_dhead_t *dh = container_of(cfs_hash_dh_hhead(hs, bd), + cfs_hash_dhead_t, dh_head); + + if (dh->dh_tail != NULL) /* not empty */ + hlist_add_behind(hnode, dh->dh_tail); + else /* empty list */ + hlist_add_head(hnode, &dh->dh_head); + dh->dh_tail = hnode; + return -1; /* unknown depth */ +} + +static int +cfs_hash_dh_hnode_del(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnd) +{ + cfs_hash_dhead_t *dh = container_of(cfs_hash_dh_hhead(hs, bd), + cfs_hash_dhead_t, dh_head); + + if (hnd->next == NULL) { /* it's the tail */ + dh->dh_tail = (hnd->pprev == &dh->dh_head.first) ? 
NULL : + container_of(hnd->pprev, struct hlist_node, next); + } + hlist_del_init(hnd); + return -1; /* unknown depth */ +} + +/** + * double links hash head with depth tracking + * new element is always added to tail of hlist + */ +typedef struct { + struct hlist_head dd_head; /**< entries list */ + struct hlist_node *dd_tail; /**< the last entry */ + unsigned int dd_depth; /**< list length */ +} cfs_hash_dhead_dep_t; + +static int +cfs_hash_dd_hhead_size(struct cfs_hash *hs) +{ + return sizeof(cfs_hash_dhead_dep_t); +} + +static struct hlist_head * +cfs_hash_dd_hhead(struct cfs_hash *hs, struct cfs_hash_bd *bd) +{ + cfs_hash_dhead_dep_t *head; + + head = (cfs_hash_dhead_dep_t *)&bd->bd_bucket->hsb_head[0]; + return &head[bd->bd_offset].dd_head; +} + +static int +cfs_hash_dd_hnode_add(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + cfs_hash_dhead_dep_t *dh = container_of(cfs_hash_dd_hhead(hs, bd), + cfs_hash_dhead_dep_t, dd_head); + + if (dh->dd_tail != NULL) /* not empty */ + hlist_add_behind(hnode, dh->dd_tail); + else /* empty list */ + hlist_add_head(hnode, &dh->dd_head); + dh->dd_tail = hnode; + return ++dh->dd_depth; +} + +static int +cfs_hash_dd_hnode_del(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnd) +{ + cfs_hash_dhead_dep_t *dh = container_of(cfs_hash_dd_hhead(hs, bd), + cfs_hash_dhead_dep_t, dd_head); + + if (hnd->next == NULL) { /* it's the tail */ + dh->dd_tail = (hnd->pprev == &dh->dd_head.first) ? NULL : + container_of(hnd->pprev, struct hlist_node, next); + } + hlist_del_init(hnd); + return --dh->dd_depth; +} + +static cfs_hash_hlist_ops_t cfs_hash_hh_hops = { + .hop_hhead = cfs_hash_hh_hhead, + .hop_hhead_size = cfs_hash_hh_hhead_size, + .hop_hnode_add = cfs_hash_hh_hnode_add, + .hop_hnode_del = cfs_hash_hh_hnode_del, +}; + +static cfs_hash_hlist_ops_t cfs_hash_hd_hops = { + .hop_hhead = cfs_hash_hd_hhead, + .hop_hhead_size = cfs_hash_hd_hhead_size, + .hop_hnode_add = cfs_hash_hd_hnode_add, + .hop_hnode_del = cfs_hash_hd_hnode_del, +}; + +static cfs_hash_hlist_ops_t cfs_hash_dh_hops = { + .hop_hhead = cfs_hash_dh_hhead, + .hop_hhead_size = cfs_hash_dh_hhead_size, + .hop_hnode_add = cfs_hash_dh_hnode_add, + .hop_hnode_del = cfs_hash_dh_hnode_del, +}; + +static cfs_hash_hlist_ops_t cfs_hash_dd_hops = { + .hop_hhead = cfs_hash_dd_hhead, + .hop_hhead_size = cfs_hash_dd_hhead_size, + .hop_hnode_add = cfs_hash_dd_hnode_add, + .hop_hnode_del = cfs_hash_dd_hnode_del, +}; + +static void +cfs_hash_hlist_setup(struct cfs_hash *hs) +{ + if (cfs_hash_with_add_tail(hs)) { + hs->hs_hops = cfs_hash_with_depth(hs) ? + &cfs_hash_dd_hops : &cfs_hash_dh_hops; + } else { + hs->hs_hops = cfs_hash_with_depth(hs) ? 
+ &cfs_hash_hd_hops : &cfs_hash_hh_hops; + } +} + +static void +cfs_hash_bd_from_key(struct cfs_hash *hs, struct cfs_hash_bucket **bkts, + unsigned int bits, const void *key, struct cfs_hash_bd *bd) +{ + unsigned int index = cfs_hash_id(hs, key, (1U << bits) - 1); + + LASSERT(bits == hs->hs_cur_bits || bits == hs->hs_rehash_bits); + + bd->bd_bucket = bkts[index & ((1U << (bits - hs->hs_bkt_bits)) - 1)]; + bd->bd_offset = index >> (bits - hs->hs_bkt_bits); +} + +void +cfs_hash_bd_get(struct cfs_hash *hs, const void *key, struct cfs_hash_bd *bd) +{ + /* NB: caller should hold hs->hs_rwlock if REHASH is set */ + if (likely(hs->hs_rehash_buckets == NULL)) { + cfs_hash_bd_from_key(hs, hs->hs_buckets, + hs->hs_cur_bits, key, bd); + } else { + LASSERT(hs->hs_rehash_bits != 0); + cfs_hash_bd_from_key(hs, hs->hs_rehash_buckets, + hs->hs_rehash_bits, key, bd); + } +} +EXPORT_SYMBOL(cfs_hash_bd_get); + +static inline void +cfs_hash_bd_dep_record(struct cfs_hash *hs, struct cfs_hash_bd *bd, int dep_cur) +{ + if (likely(dep_cur <= bd->bd_bucket->hsb_depmax)) + return; + + bd->bd_bucket->hsb_depmax = dep_cur; +# if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1 + if (likely(warn_on_depth == 0 || + max(warn_on_depth, hs->hs_dep_max) >= dep_cur)) + return; + + spin_lock(&hs->hs_dep_lock); + hs->hs_dep_max = dep_cur; + hs->hs_dep_bkt = bd->bd_bucket->hsb_index; + hs->hs_dep_off = bd->bd_offset; + hs->hs_dep_bits = hs->hs_cur_bits; + spin_unlock(&hs->hs_dep_lock); + + cfs_wi_schedule(cfs_sched_rehash, &hs->hs_dep_wi); +# endif +} + +void +cfs_hash_bd_add_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + int rc; + + rc = hs->hs_hops->hop_hnode_add(hs, bd, hnode); + cfs_hash_bd_dep_record(hs, bd, rc); + bd->bd_bucket->hsb_version++; + if (unlikely(bd->bd_bucket->hsb_version == 0)) + bd->bd_bucket->hsb_version++; + bd->bd_bucket->hsb_count++; + + if (cfs_hash_with_counter(hs)) + atomic_inc(&hs->hs_count); + if (!cfs_hash_with_no_itemref(hs)) + cfs_hash_get(hs, hnode); +} +EXPORT_SYMBOL(cfs_hash_bd_add_locked); + +void +cfs_hash_bd_del_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + hs->hs_hops->hop_hnode_del(hs, bd, hnode); + + LASSERT(bd->bd_bucket->hsb_count > 0); + bd->bd_bucket->hsb_count--; + bd->bd_bucket->hsb_version++; + if (unlikely(bd->bd_bucket->hsb_version == 0)) + bd->bd_bucket->hsb_version++; + + if (cfs_hash_with_counter(hs)) { + LASSERT(atomic_read(&hs->hs_count) > 0); + atomic_dec(&hs->hs_count); + } + if (!cfs_hash_with_no_itemref(hs)) + cfs_hash_put_locked(hs, hnode); +} +EXPORT_SYMBOL(cfs_hash_bd_del_locked); + +void +cfs_hash_bd_move_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd_old, + struct cfs_hash_bd *bd_new, struct hlist_node *hnode) +{ + struct cfs_hash_bucket *obkt = bd_old->bd_bucket; + struct cfs_hash_bucket *nbkt = bd_new->bd_bucket; + int rc; + + if (cfs_hash_bd_compare(bd_old, bd_new) == 0) + return; + + /* use cfs_hash_bd_hnode_add/del, to avoid atomic & refcount ops + * in cfs_hash_bd_del/add_locked */ + hs->hs_hops->hop_hnode_del(hs, bd_old, hnode); + rc = hs->hs_hops->hop_hnode_add(hs, bd_new, hnode); + cfs_hash_bd_dep_record(hs, bd_new, rc); + + LASSERT(obkt->hsb_count > 0); + obkt->hsb_count--; + obkt->hsb_version++; + if (unlikely(obkt->hsb_version == 0)) + obkt->hsb_version++; + nbkt->hsb_count++; + nbkt->hsb_version++; + if (unlikely(nbkt->hsb_version == 0)) + nbkt->hsb_version++; +} +EXPORT_SYMBOL(cfs_hash_bd_move_locked); + +enum { + /** always set, for sanity (avoid ZERO intent) */ + 
CFS_HS_LOOKUP_MASK_FIND = 1 << 0, + /** return entry with a ref */ + CFS_HS_LOOKUP_MASK_REF = 1 << 1, + /** add entry if not existing */ + CFS_HS_LOOKUP_MASK_ADD = 1 << 2, + /** delete entry, ignore other masks */ + CFS_HS_LOOKUP_MASK_DEL = 1 << 3, +}; + +typedef enum cfs_hash_lookup_intent { + /** return item w/o refcount */ + CFS_HS_LOOKUP_IT_PEEK = CFS_HS_LOOKUP_MASK_FIND, + /** return item with refcount */ + CFS_HS_LOOKUP_IT_FIND = (CFS_HS_LOOKUP_MASK_FIND | + CFS_HS_LOOKUP_MASK_REF), + /** return item w/o refcount if existed, otherwise add */ + CFS_HS_LOOKUP_IT_ADD = (CFS_HS_LOOKUP_MASK_FIND | + CFS_HS_LOOKUP_MASK_ADD), + /** return item with refcount if existed, otherwise add */ + CFS_HS_LOOKUP_IT_FINDADD = (CFS_HS_LOOKUP_IT_FIND | + CFS_HS_LOOKUP_MASK_ADD), + /** delete if existed */ + CFS_HS_LOOKUP_IT_FINDDEL = (CFS_HS_LOOKUP_MASK_FIND | + CFS_HS_LOOKUP_MASK_DEL) +} cfs_hash_lookup_intent_t; + +static struct hlist_node * +cfs_hash_bd_lookup_intent(struct cfs_hash *hs, struct cfs_hash_bd *bd, + const void *key, struct hlist_node *hnode, + cfs_hash_lookup_intent_t intent) + +{ + struct hlist_head *hhead = cfs_hash_bd_hhead(hs, bd); + struct hlist_node *ehnode; + struct hlist_node *match; + int intent_add = (intent & CFS_HS_LOOKUP_MASK_ADD) != 0; + + /* with this function, we can avoid a lot of useless refcount ops, + * which are expensive atomic operations most time. */ + match = intent_add ? NULL : hnode; + hlist_for_each(ehnode, hhead) { + if (!cfs_hash_keycmp(hs, key, ehnode)) + continue; + + if (match != NULL && match != ehnode) /* can't match */ + continue; + + /* match and ... */ + if ((intent & CFS_HS_LOOKUP_MASK_DEL) != 0) { + cfs_hash_bd_del_locked(hs, bd, ehnode); + return ehnode; + } + + /* caller wants refcount? */ + if ((intent & CFS_HS_LOOKUP_MASK_REF) != 0) + cfs_hash_get(hs, ehnode); + return ehnode; + } + /* no match item */ + if (!intent_add) + return NULL; + + LASSERT(hnode != NULL); + cfs_hash_bd_add_locked(hs, bd, hnode); + return hnode; +} + +struct hlist_node * +cfs_hash_bd_lookup_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, const void *key) +{ + return cfs_hash_bd_lookup_intent(hs, bd, key, NULL, + CFS_HS_LOOKUP_IT_FIND); +} +EXPORT_SYMBOL(cfs_hash_bd_lookup_locked); + +struct hlist_node * +cfs_hash_bd_peek_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, const void *key) +{ + return cfs_hash_bd_lookup_intent(hs, bd, key, NULL, + CFS_HS_LOOKUP_IT_PEEK); +} +EXPORT_SYMBOL(cfs_hash_bd_peek_locked); + +struct hlist_node * +cfs_hash_bd_findadd_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + const void *key, struct hlist_node *hnode, + int noref) +{ + return cfs_hash_bd_lookup_intent(hs, bd, key, hnode, + CFS_HS_LOOKUP_IT_ADD | + (!noref * CFS_HS_LOOKUP_MASK_REF)); +} +EXPORT_SYMBOL(cfs_hash_bd_findadd_locked); + +struct hlist_node * +cfs_hash_bd_finddel_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + const void *key, struct hlist_node *hnode) +{ + /* hnode can be NULL, we find the first item with @key */ + return cfs_hash_bd_lookup_intent(hs, bd, key, hnode, + CFS_HS_LOOKUP_IT_FINDDEL); +} +EXPORT_SYMBOL(cfs_hash_bd_finddel_locked); + +static void +cfs_hash_multi_bd_lock(struct cfs_hash *hs, struct cfs_hash_bd *bds, + unsigned n, int excl) +{ + struct cfs_hash_bucket *prev = NULL; + int i; + + /** + * bds must be ascendantly ordered by bd->bd_bucket->hsb_index. + * NB: it's possible that several bds point to the same bucket but + * have different bd::bd_offset, so need take care of deadlock. 
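	 * For example, if the bds resolve to buckets {3, 3, 7}, only buckets
	 * 3 and 7 are locked, in ascending hsb_index order, so two threads
	 * locking overlapping bucket sets cannot deadlock.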
+ */ + cfs_hash_for_each_bd(bds, n, i) { + if (prev == bds[i].bd_bucket) + continue; + + LASSERT(prev == NULL || + prev->hsb_index < bds[i].bd_bucket->hsb_index); + cfs_hash_bd_lock(hs, &bds[i], excl); + prev = bds[i].bd_bucket; + } +} + +static void +cfs_hash_multi_bd_unlock(struct cfs_hash *hs, struct cfs_hash_bd *bds, + unsigned n, int excl) +{ + struct cfs_hash_bucket *prev = NULL; + int i; + + cfs_hash_for_each_bd(bds, n, i) { + if (prev != bds[i].bd_bucket) { + cfs_hash_bd_unlock(hs, &bds[i], excl); + prev = bds[i].bd_bucket; + } + } +} + +static struct hlist_node * +cfs_hash_multi_bd_lookup_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, + unsigned n, const void *key) +{ + struct hlist_node *ehnode; + unsigned i; + + cfs_hash_for_each_bd(bds, n, i) { + ehnode = cfs_hash_bd_lookup_intent(hs, &bds[i], key, NULL, + CFS_HS_LOOKUP_IT_FIND); + if (ehnode != NULL) + return ehnode; + } + return NULL; +} + +static struct hlist_node * +cfs_hash_multi_bd_findadd_locked(struct cfs_hash *hs, + struct cfs_hash_bd *bds, unsigned n, const void *key, + struct hlist_node *hnode, int noref) +{ + struct hlist_node *ehnode; + int intent; + unsigned i; + + LASSERT(hnode != NULL); + intent = CFS_HS_LOOKUP_IT_PEEK | (!noref * CFS_HS_LOOKUP_MASK_REF); + + cfs_hash_for_each_bd(bds, n, i) { + ehnode = cfs_hash_bd_lookup_intent(hs, &bds[i], key, + NULL, intent); + if (ehnode != NULL) + return ehnode; + } + + if (i == 1) { /* only one bucket */ + cfs_hash_bd_add_locked(hs, &bds[0], hnode); + } else { + struct cfs_hash_bd mybd; + + cfs_hash_bd_get(hs, key, &mybd); + cfs_hash_bd_add_locked(hs, &mybd, hnode); + } + + return hnode; +} + +static struct hlist_node * +cfs_hash_multi_bd_finddel_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, + unsigned n, const void *key, + struct hlist_node *hnode) +{ + struct hlist_node *ehnode; + unsigned i; + + cfs_hash_for_each_bd(bds, n, i) { + ehnode = cfs_hash_bd_lookup_intent(hs, &bds[i], key, hnode, + CFS_HS_LOOKUP_IT_FINDDEL); + if (ehnode != NULL) + return ehnode; + } + return NULL; +} + +static void +cfs_hash_bd_order(struct cfs_hash_bd *bd1, struct cfs_hash_bd *bd2) +{ + int rc; + + if (bd2->bd_bucket == NULL) + return; + + if (bd1->bd_bucket == NULL) { + *bd1 = *bd2; + bd2->bd_bucket = NULL; + return; + } + + rc = cfs_hash_bd_compare(bd1, bd2); + if (rc == 0) { + bd2->bd_bucket = NULL; + + } else if (rc > 0) { /* swab bd1 and bd2 */ + struct cfs_hash_bd tmp; + + tmp = *bd2; + *bd2 = *bd1; + *bd1 = tmp; + } +} + +void +cfs_hash_dual_bd_get(struct cfs_hash *hs, const void *key, struct cfs_hash_bd *bds) +{ + /* NB: caller should hold hs_lock.rw if REHASH is set */ + cfs_hash_bd_from_key(hs, hs->hs_buckets, + hs->hs_cur_bits, key, &bds[0]); + if (likely(hs->hs_rehash_buckets == NULL)) { + /* no rehash or not rehashing */ + bds[1].bd_bucket = NULL; + return; + } + + LASSERT(hs->hs_rehash_bits != 0); + cfs_hash_bd_from_key(hs, hs->hs_rehash_buckets, + hs->hs_rehash_bits, key, &bds[1]); + + cfs_hash_bd_order(&bds[0], &bds[1]); +} +EXPORT_SYMBOL(cfs_hash_dual_bd_get); + +void +cfs_hash_dual_bd_lock(struct cfs_hash *hs, struct cfs_hash_bd *bds, int excl) +{ + cfs_hash_multi_bd_lock(hs, bds, 2, excl); +} +EXPORT_SYMBOL(cfs_hash_dual_bd_lock); + +void +cfs_hash_dual_bd_unlock(struct cfs_hash *hs, struct cfs_hash_bd *bds, int excl) +{ + cfs_hash_multi_bd_unlock(hs, bds, 2, excl); +} +EXPORT_SYMBOL(cfs_hash_dual_bd_unlock); + +struct hlist_node * +cfs_hash_dual_bd_lookup_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, + const void *key) +{ + return 
cfs_hash_multi_bd_lookup_locked(hs, bds, 2, key); +} +EXPORT_SYMBOL(cfs_hash_dual_bd_lookup_locked); + +struct hlist_node * +cfs_hash_dual_bd_findadd_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, + const void *key, struct hlist_node *hnode, + int noref) +{ + return cfs_hash_multi_bd_findadd_locked(hs, bds, 2, key, + hnode, noref); +} +EXPORT_SYMBOL(cfs_hash_dual_bd_findadd_locked); + +struct hlist_node * +cfs_hash_dual_bd_finddel_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, + const void *key, struct hlist_node *hnode) +{ + return cfs_hash_multi_bd_finddel_locked(hs, bds, 2, key, hnode); +} +EXPORT_SYMBOL(cfs_hash_dual_bd_finddel_locked); + +static void +cfs_hash_buckets_free(struct cfs_hash_bucket **buckets, + int bkt_size, int prev_size, int size) +{ + int i; + + for (i = prev_size; i < size; i++) { + if (buckets[i] != NULL) + LIBCFS_FREE(buckets[i], bkt_size); + } + + LIBCFS_FREE(buckets, sizeof(buckets[0]) * size); +} + +/* + * Create or grow bucket memory. Return old_buckets if no allocation was + * needed, the newly allocated buckets if allocation was needed and + * successful, and NULL on error. + */ +static struct cfs_hash_bucket ** +cfs_hash_buckets_realloc(struct cfs_hash *hs, struct cfs_hash_bucket **old_bkts, + unsigned int old_size, unsigned int new_size) +{ + struct cfs_hash_bucket **new_bkts; + int i; + + LASSERT(old_size == 0 || old_bkts != NULL); + + if (old_bkts != NULL && old_size == new_size) + return old_bkts; + + LIBCFS_ALLOC(new_bkts, sizeof(new_bkts[0]) * new_size); + if (new_bkts == NULL) + return NULL; + + if (old_bkts != NULL) { + memcpy(new_bkts, old_bkts, + min(old_size, new_size) * sizeof(*old_bkts)); + } + + for (i = old_size; i < new_size; i++) { + struct hlist_head *hhead; + struct cfs_hash_bd bd; + + LIBCFS_ALLOC(new_bkts[i], cfs_hash_bkt_size(hs)); + if (new_bkts[i] == NULL) { + cfs_hash_buckets_free(new_bkts, cfs_hash_bkt_size(hs), + old_size, new_size); + return NULL; + } + + new_bkts[i]->hsb_index = i; + new_bkts[i]->hsb_version = 1; /* shouldn't be zero */ + new_bkts[i]->hsb_depmax = -1; /* unknown */ + bd.bd_bucket = new_bkts[i]; + cfs_hash_bd_for_each_hlist(hs, &bd, hhead) + INIT_HLIST_HEAD(hhead); + + if (cfs_hash_with_no_lock(hs) || + cfs_hash_with_no_bktlock(hs)) + continue; + + if (cfs_hash_with_rw_bktlock(hs)) + rwlock_init(&new_bkts[i]->hsb_lock.rw); + else if (cfs_hash_with_spin_bktlock(hs)) + spin_lock_init(&new_bkts[i]->hsb_lock.spin); + else + LBUG(); /* invalid use-case */ + } + return new_bkts; +} + +/** + * Initialize new libcfs hash, where: + * @name - Descriptive hash name + * @cur_bits - Initial hash table size, in bits + * @max_bits - Maximum allowed hash table resize, in bits + * @ops - Registered hash table operations + * @flags - CFS_HASH_REHASH enable synamic hash resizing + * - CFS_HASH_SORT enable chained hash sort + */ +static int cfs_hash_rehash_worker(cfs_workitem_t *wi); + +#if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1 +static int cfs_hash_dep_print(cfs_workitem_t *wi) +{ + struct cfs_hash *hs = container_of(wi, struct cfs_hash, hs_dep_wi); + int dep; + int bkt; + int off; + int bits; + + spin_lock(&hs->hs_dep_lock); + dep = hs->hs_dep_max; + bkt = hs->hs_dep_bkt; + off = hs->hs_dep_off; + bits = hs->hs_dep_bits; + spin_unlock(&hs->hs_dep_lock); + + LCONSOLE_WARN("#### HASH %s (bits: %d): max depth %d at bucket %d/%d\n", + hs->hs_name, bits, dep, bkt, off); + spin_lock(&hs->hs_dep_lock); + hs->hs_dep_bits = 0; /* mark as workitem done */ + spin_unlock(&hs->hs_dep_lock); + return 0; +} + +static void 
cfs_hash_depth_wi_init(struct cfs_hash *hs) +{ + spin_lock_init(&hs->hs_dep_lock); + cfs_wi_init(&hs->hs_dep_wi, hs, cfs_hash_dep_print); +} + +static void cfs_hash_depth_wi_cancel(struct cfs_hash *hs) +{ + if (cfs_wi_deschedule(cfs_sched_rehash, &hs->hs_dep_wi)) + return; + + spin_lock(&hs->hs_dep_lock); + while (hs->hs_dep_bits != 0) { + spin_unlock(&hs->hs_dep_lock); + cond_resched(); + spin_lock(&hs->hs_dep_lock); + } + spin_unlock(&hs->hs_dep_lock); +} + +#else /* CFS_HASH_DEBUG_LEVEL < CFS_HASH_DEBUG_1 */ + +static inline void cfs_hash_depth_wi_init(struct cfs_hash *hs) {} +static inline void cfs_hash_depth_wi_cancel(struct cfs_hash *hs) {} + +#endif /* CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1 */ + +struct cfs_hash * +cfs_hash_create(char *name, unsigned cur_bits, unsigned max_bits, + unsigned bkt_bits, unsigned extra_bytes, + unsigned min_theta, unsigned max_theta, + cfs_hash_ops_t *ops, unsigned flags) +{ + struct cfs_hash *hs; + int len; + + CLASSERT(CFS_HASH_THETA_BITS < 15); + + LASSERT(name != NULL); + LASSERT(ops != NULL); + LASSERT(ops->hs_key); + LASSERT(ops->hs_hash); + LASSERT(ops->hs_object); + LASSERT(ops->hs_keycmp); + LASSERT(ops->hs_get != NULL); + LASSERT(ops->hs_put_locked != NULL); + + if ((flags & CFS_HASH_REHASH) != 0) + flags |= CFS_HASH_COUNTER; /* must have counter */ + + LASSERT(cur_bits > 0); + LASSERT(cur_bits >= bkt_bits); + LASSERT(max_bits >= cur_bits && max_bits < 31); + LASSERT(ergo((flags & CFS_HASH_REHASH) == 0, cur_bits == max_bits)); + LASSERT(ergo((flags & CFS_HASH_REHASH) != 0, + (flags & CFS_HASH_NO_LOCK) == 0)); + LASSERT(ergo((flags & CFS_HASH_REHASH_KEY) != 0, + ops->hs_keycpy != NULL)); + + len = (flags & CFS_HASH_BIGNAME) == 0 ? + CFS_HASH_NAME_LEN : CFS_HASH_BIGNAME_LEN; + LIBCFS_ALLOC(hs, offsetof(struct cfs_hash, hs_name[len])); + if (hs == NULL) + return NULL; + + strncpy(hs->hs_name, name, len); + hs->hs_name[len - 1] = '\0'; + hs->hs_flags = flags; + + atomic_set(&hs->hs_refcount, 1); + atomic_set(&hs->hs_count, 0); + + cfs_hash_lock_setup(hs); + cfs_hash_hlist_setup(hs); + + hs->hs_cur_bits = (__u8)cur_bits; + hs->hs_min_bits = (__u8)cur_bits; + hs->hs_max_bits = (__u8)max_bits; + hs->hs_bkt_bits = (__u8)bkt_bits; + + hs->hs_ops = ops; + hs->hs_extra_bytes = extra_bytes; + hs->hs_rehash_bits = 0; + cfs_wi_init(&hs->hs_rehash_wi, hs, cfs_hash_rehash_worker); + cfs_hash_depth_wi_init(hs); + + if (cfs_hash_with_rehash(hs)) + __cfs_hash_set_theta(hs, min_theta, max_theta); + + hs->hs_buckets = cfs_hash_buckets_realloc(hs, NULL, 0, + CFS_HASH_NBKT(hs)); + if (hs->hs_buckets != NULL) + return hs; + + LIBCFS_FREE(hs, offsetof(struct cfs_hash, hs_name[len])); + return NULL; +} +EXPORT_SYMBOL(cfs_hash_create); + +/** + * Cleanup libcfs hash @hs. + */ +static void +cfs_hash_destroy(struct cfs_hash *hs) +{ + struct hlist_node *hnode; + struct hlist_node *pos; + struct cfs_hash_bd bd; + int i; + + LASSERT(hs != NULL); + LASSERT(!cfs_hash_is_exiting(hs) && + !cfs_hash_is_iterating(hs)); + + /** + * prohibit further rehashes, don't need any lock because + * I'm the only (last) one can change it. 
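+ * (Destruction is only reached from cfs_hash_putref() once the last
+ * reference is dropped, which is why no further locking is needed here.)
+ *
+ * Rough lifecycle sketch, illustrative only and not part of the original
+ * driver code; the ops table "my_hash_ops" and the bit/theta arguments are
+ * placeholders, and the CFS_HASH_* constants are assumed to come from
+ * libcfs_hash.h:
+ *
+ *     struct cfs_hash *hs;
+ *
+ *     hs = cfs_hash_create("my_hash", 5, 10, 3, 0,
+ *                          CFS_HASH_MIN_THETA, CFS_HASH_MAX_THETA,
+ *                          &my_hash_ops, CFS_HASH_DEFAULT);
+ *     if (hs == NULL)
+ *             return -ENOMEM;
+ *     ... add, lookup and delete items ...
+ *     cfs_hash_putref(hs);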
+ */ + hs->hs_exiting = 1; + if (cfs_hash_with_rehash(hs)) + cfs_hash_rehash_cancel(hs); + + cfs_hash_depth_wi_cancel(hs); + /* rehash should be done/canceled */ + LASSERT(hs->hs_buckets != NULL && + hs->hs_rehash_buckets == NULL); + + cfs_hash_for_each_bucket(hs, &bd, i) { + struct hlist_head *hhead; + + LASSERT(bd.bd_bucket != NULL); + /* no need to take this lock, just for consistent code */ + cfs_hash_bd_lock(hs, &bd, 1); + + cfs_hash_bd_for_each_hlist(hs, &bd, hhead) { + hlist_for_each_safe(hnode, pos, hhead) { + LASSERTF(!cfs_hash_with_assert_empty(hs), + "hash %s bucket %u(%u) is not empty: %u items left\n", + hs->hs_name, bd.bd_bucket->hsb_index, + bd.bd_offset, bd.bd_bucket->hsb_count); + /* can't assert key valicate, because we + * can interrupt rehash */ + cfs_hash_bd_del_locked(hs, &bd, hnode); + cfs_hash_exit(hs, hnode); + } + } + LASSERT(bd.bd_bucket->hsb_count == 0); + cfs_hash_bd_unlock(hs, &bd, 1); + cond_resched(); + } + + LASSERT(atomic_read(&hs->hs_count) == 0); + + cfs_hash_buckets_free(hs->hs_buckets, cfs_hash_bkt_size(hs), + 0, CFS_HASH_NBKT(hs)); + i = cfs_hash_with_bigname(hs) ? + CFS_HASH_BIGNAME_LEN : CFS_HASH_NAME_LEN; + LIBCFS_FREE(hs, offsetof(struct cfs_hash, hs_name[i])); +} + +struct cfs_hash *cfs_hash_getref(struct cfs_hash *hs) +{ + if (atomic_inc_not_zero(&hs->hs_refcount)) + return hs; + return NULL; +} +EXPORT_SYMBOL(cfs_hash_getref); + +void cfs_hash_putref(struct cfs_hash *hs) +{ + if (atomic_dec_and_test(&hs->hs_refcount)) + cfs_hash_destroy(hs); +} +EXPORT_SYMBOL(cfs_hash_putref); + +static inline int +cfs_hash_rehash_bits(struct cfs_hash *hs) +{ + if (cfs_hash_with_no_lock(hs) || + !cfs_hash_with_rehash(hs)) + return -EOPNOTSUPP; + + if (unlikely(cfs_hash_is_exiting(hs))) + return -ESRCH; + + if (unlikely(cfs_hash_is_rehashing(hs))) + return -EALREADY; + + if (unlikely(cfs_hash_is_iterating(hs))) + return -EAGAIN; + + /* XXX: need to handle case with max_theta != 2.0 + * and the case with min_theta != 0.5 */ + if ((hs->hs_cur_bits < hs->hs_max_bits) && + (__cfs_hash_theta(hs) > hs->hs_max_theta)) + return hs->hs_cur_bits + 1; + + if (!cfs_hash_with_shrink(hs)) + return 0; + + if ((hs->hs_cur_bits > hs->hs_min_bits) && + (__cfs_hash_theta(hs) < hs->hs_min_theta)) + return hs->hs_cur_bits - 1; + + return 0; +} + +/** + * don't allow inline rehash if: + * - user wants non-blocking change (add/del) on hash table + * - too many elements + */ +static inline int +cfs_hash_rehash_inline(struct cfs_hash *hs) +{ + return !cfs_hash_with_nblk_change(hs) && + atomic_read(&hs->hs_count) < CFS_HASH_LOOP_HOG; +} + +/** + * Add item @hnode to libcfs hash @hs using @key. The registered + * ops->hs_get function will be called when the item is added. 
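+ *
+ * Usage sketch (illustrative only; "struct my_obj" and its fields are
+ * placeholders for an object with an embedded, unhashed hlist_node):
+ *
+ *     struct my_obj {
+ *             __u64                   mo_key;
+ *             struct hlist_node       mo_hnode;
+ *     };
+ *
+ *     INIT_HLIST_NODE(&obj->mo_hnode);
+ *     cfs_hash_add(hs, &obj->mo_key, &obj->mo_hnode);
+ *
+ * For collision-aware insertion see cfs_hash_add_unique() below, which
+ * returns -EALREADY instead of adding a duplicate key.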
+ */ +void +cfs_hash_add(struct cfs_hash *hs, const void *key, struct hlist_node *hnode) +{ + struct cfs_hash_bd bd; + int bits; + + LASSERT(hlist_unhashed(hnode)); + + cfs_hash_lock(hs, 0); + cfs_hash_bd_get_and_lock(hs, key, &bd, 1); + + cfs_hash_key_validate(hs, key, hnode); + cfs_hash_bd_add_locked(hs, &bd, hnode); + + cfs_hash_bd_unlock(hs, &bd, 1); + + bits = cfs_hash_rehash_bits(hs); + cfs_hash_unlock(hs, 0); + if (bits > 0) + cfs_hash_rehash(hs, cfs_hash_rehash_inline(hs)); +} +EXPORT_SYMBOL(cfs_hash_add); + +static struct hlist_node * +cfs_hash_find_or_add(struct cfs_hash *hs, const void *key, + struct hlist_node *hnode, int noref) +{ + struct hlist_node *ehnode; + struct cfs_hash_bd bds[2]; + int bits = 0; + + LASSERT(hlist_unhashed(hnode)); + + cfs_hash_lock(hs, 0); + cfs_hash_dual_bd_get_and_lock(hs, key, bds, 1); + + cfs_hash_key_validate(hs, key, hnode); + ehnode = cfs_hash_dual_bd_findadd_locked(hs, bds, key, + hnode, noref); + cfs_hash_dual_bd_unlock(hs, bds, 1); + + if (ehnode == hnode) /* new item added */ + bits = cfs_hash_rehash_bits(hs); + cfs_hash_unlock(hs, 0); + if (bits > 0) + cfs_hash_rehash(hs, cfs_hash_rehash_inline(hs)); + + return ehnode; +} + +/** + * Add item @hnode to libcfs hash @hs using @key. The registered + * ops->hs_get function will be called if the item was added. + * Returns 0 on success or -EALREADY on key collisions. + */ +int +cfs_hash_add_unique(struct cfs_hash *hs, const void *key, struct hlist_node *hnode) +{ + return cfs_hash_find_or_add(hs, key, hnode, 1) != hnode ? + -EALREADY : 0; +} +EXPORT_SYMBOL(cfs_hash_add_unique); + +/** + * Add item @hnode to libcfs hash @hs using @key. If this @key + * already exists in the hash then ops->hs_get will be called on the + * conflicting entry and that entry will be returned to the caller. + * Otherwise ops->hs_get is called on the item which was added. + */ +void * +cfs_hash_findadd_unique(struct cfs_hash *hs, const void *key, + struct hlist_node *hnode) +{ + hnode = cfs_hash_find_or_add(hs, key, hnode, 0); + + return cfs_hash_object(hs, hnode); +} +EXPORT_SYMBOL(cfs_hash_findadd_unique); + +/** + * Delete item @hnode from the libcfs hash @hs using @key. The @key + * is required to ensure the correct hash bucket is locked since there + * is no direct linkage from the item to the bucket. The object + * removed from the hash will be returned and obs->hs_put is called + * on the removed object. + */ +void * +cfs_hash_del(struct cfs_hash *hs, const void *key, struct hlist_node *hnode) +{ + void *obj = NULL; + int bits = 0; + struct cfs_hash_bd bds[2]; + + cfs_hash_lock(hs, 0); + cfs_hash_dual_bd_get_and_lock(hs, key, bds, 1); + + /* NB: do nothing if @hnode is not in hash table */ + if (hnode == NULL || !hlist_unhashed(hnode)) { + if (bds[1].bd_bucket == NULL && hnode != NULL) { + cfs_hash_bd_del_locked(hs, &bds[0], hnode); + } else { + hnode = cfs_hash_dual_bd_finddel_locked(hs, bds, + key, hnode); + } + } + + if (hnode != NULL) { + obj = cfs_hash_object(hs, hnode); + bits = cfs_hash_rehash_bits(hs); + } + + cfs_hash_dual_bd_unlock(hs, bds, 1); + cfs_hash_unlock(hs, 0); + if (bits > 0) + cfs_hash_rehash(hs, cfs_hash_rehash_inline(hs)); + + return obj; +} +EXPORT_SYMBOL(cfs_hash_del); + +/** + * Delete item given @key in libcfs hash @hs. The first @key found in + * the hash will be removed, if the key exists multiple times in the hash + * @hs this function must be called once per key. The removed object + * will be returned and ops->hs_put is called on the removed object. 
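+ *
+ * Usage sketch (illustrative; "struct my_obj" is a placeholder and the
+ * returned pointer is whatever ops->hs_object maps the removed node to):
+ *
+ *     struct my_obj *obj;
+ *
+ *     obj = cfs_hash_del_key(hs, &key);
+ *     if (obj == NULL)
+ *             ... no item with this key was found ...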
+ */ +void * +cfs_hash_del_key(struct cfs_hash *hs, const void *key) +{ + return cfs_hash_del(hs, key, NULL); +} +EXPORT_SYMBOL(cfs_hash_del_key); + +/** + * Lookup an item using @key in the libcfs hash @hs and return it. + * If the @key is found in the hash hs->hs_get() is called and the + * matching objects is returned. It is the callers responsibility + * to call the counterpart ops->hs_put using the cfs_hash_put() macro + * when when finished with the object. If the @key was not found + * in the hash @hs NULL is returned. + */ +void * +cfs_hash_lookup(struct cfs_hash *hs, const void *key) +{ + void *obj = NULL; + struct hlist_node *hnode; + struct cfs_hash_bd bds[2]; + + cfs_hash_lock(hs, 0); + cfs_hash_dual_bd_get_and_lock(hs, key, bds, 0); + + hnode = cfs_hash_dual_bd_lookup_locked(hs, bds, key); + if (hnode != NULL) + obj = cfs_hash_object(hs, hnode); + + cfs_hash_dual_bd_unlock(hs, bds, 0); + cfs_hash_unlock(hs, 0); + + return obj; +} +EXPORT_SYMBOL(cfs_hash_lookup); + +static void +cfs_hash_for_each_enter(struct cfs_hash *hs) { + LASSERT(!cfs_hash_is_exiting(hs)); + + if (!cfs_hash_with_rehash(hs)) + return; + /* + * NB: it's race on cfs_has_t::hs_iterating, but doesn't matter + * because it's just an unreliable signal to rehash-thread, + * rehash-thread will try to finish rehash ASAP when seeing this. + */ + hs->hs_iterating = 1; + + cfs_hash_lock(hs, 1); + hs->hs_iterators++; + + /* NB: iteration is mostly called by service thread, + * we tend to cancel pending rehash-request, instead of + * blocking service thread, we will relaunch rehash request + * after iteration */ + if (cfs_hash_is_rehashing(hs)) + cfs_hash_rehash_cancel_locked(hs); + cfs_hash_unlock(hs, 1); +} + +static void +cfs_hash_for_each_exit(struct cfs_hash *hs) { + int remained; + int bits; + + if (!cfs_hash_with_rehash(hs)) + return; + cfs_hash_lock(hs, 1); + remained = --hs->hs_iterators; + bits = cfs_hash_rehash_bits(hs); + cfs_hash_unlock(hs, 1); + /* NB: it's race on cfs_has_t::hs_iterating, see above */ + if (remained == 0) + hs->hs_iterating = 0; + if (bits > 0) { + cfs_hash_rehash(hs, atomic_read(&hs->hs_count) < + CFS_HASH_LOOP_HOG); + } +} + +/** + * For each item in the libcfs hash @hs call the passed callback @func + * and pass to it as an argument each hash item and the private @data. + * + * a) the function may sleep! + * b) during the callback: + * . the bucket lock is held so the callback must never sleep. + * . 
if @removal_safe is true, use can remove current item by + * cfs_hash_bd_del_locked + */ +static __u64 +cfs_hash_for_each_tight(struct cfs_hash *hs, cfs_hash_for_each_cb_t func, + void *data, int remove_safe) { + struct hlist_node *hnode; + struct hlist_node *pos; + struct cfs_hash_bd bd; + __u64 count = 0; + int excl = !!remove_safe; + int loop = 0; + int i; + + cfs_hash_for_each_enter(hs); + + cfs_hash_lock(hs, 0); + LASSERT(!cfs_hash_is_rehashing(hs)); + + cfs_hash_for_each_bucket(hs, &bd, i) { + struct hlist_head *hhead; + + cfs_hash_bd_lock(hs, &bd, excl); + if (func == NULL) { /* only glimpse size */ + count += bd.bd_bucket->hsb_count; + cfs_hash_bd_unlock(hs, &bd, excl); + continue; + } + + cfs_hash_bd_for_each_hlist(hs, &bd, hhead) { + hlist_for_each_safe(hnode, pos, hhead) { + cfs_hash_bucket_validate(hs, &bd, hnode); + count++; + loop++; + if (func(hs, &bd, hnode, data)) { + cfs_hash_bd_unlock(hs, &bd, excl); + goto out; + } + } + } + cfs_hash_bd_unlock(hs, &bd, excl); + if (loop < CFS_HASH_LOOP_HOG) + continue; + loop = 0; + cfs_hash_unlock(hs, 0); + cond_resched(); + cfs_hash_lock(hs, 0); + } + out: + cfs_hash_unlock(hs, 0); + + cfs_hash_for_each_exit(hs); + return count; +} + +typedef struct { + cfs_hash_cond_opt_cb_t func; + void *arg; +} cfs_hash_cond_arg_t; + +static int +cfs_hash_cond_del_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *data) +{ + cfs_hash_cond_arg_t *cond = data; + + if (cond->func(cfs_hash_object(hs, hnode), cond->arg)) + cfs_hash_bd_del_locked(hs, bd, hnode); + return 0; +} + +/** + * Delete item from the libcfs hash @hs when @func return true. + * The write lock being hold during loop for each bucket to avoid + * any object be reference. + */ +void +cfs_hash_cond_del(struct cfs_hash *hs, cfs_hash_cond_opt_cb_t func, void *data) +{ + cfs_hash_cond_arg_t arg = { + .func = func, + .arg = data, + }; + + cfs_hash_for_each_tight(hs, cfs_hash_cond_del_locked, &arg, 1); +} +EXPORT_SYMBOL(cfs_hash_cond_del); + +void +cfs_hash_for_each(struct cfs_hash *hs, + cfs_hash_for_each_cb_t func, void *data) +{ + cfs_hash_for_each_tight(hs, func, data, 0); +} +EXPORT_SYMBOL(cfs_hash_for_each); + +void +cfs_hash_for_each_safe(struct cfs_hash *hs, + cfs_hash_for_each_cb_t func, void *data) { + cfs_hash_for_each_tight(hs, func, data, 1); +} +EXPORT_SYMBOL(cfs_hash_for_each_safe); + +static int +cfs_hash_peek(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *data) +{ + *(int *)data = 0; + return 1; /* return 1 to break the loop */ +} + +int +cfs_hash_is_empty(struct cfs_hash *hs) +{ + int empty = 1; + + cfs_hash_for_each_tight(hs, cfs_hash_peek, &empty, 0); + return empty; +} +EXPORT_SYMBOL(cfs_hash_is_empty); + +__u64 +cfs_hash_size_get(struct cfs_hash *hs) +{ + return cfs_hash_with_counter(hs) ? + atomic_read(&hs->hs_count) : + cfs_hash_for_each_tight(hs, NULL, NULL, 0); +} +EXPORT_SYMBOL(cfs_hash_size_get); + +/* + * cfs_hash_for_each_relax: + * Iterate the hash table and call @func on each item without + * any lock. This function can't guarantee to finish iteration + * if these features are enabled: + * + * a. if rehash_key is enabled, an item can be moved from + * one bucket to another bucket + * b. user can remove non-zero-ref item from hash-table, + * so the item can be removed from hash-table, even worse, + * it's possible that user changed key and insert to another + * hash bucket. 
+ * there's no way for us to finish iteration correctly on previous + * two cases, so iteration has to be stopped on change. + */ +static int +cfs_hash_for_each_relax(struct cfs_hash *hs, cfs_hash_for_each_cb_t func, + void *data) { + struct hlist_node *hnode; + struct hlist_node *tmp; + struct cfs_hash_bd bd; + __u32 version; + int count = 0; + int stop_on_change; + int rc; + int i; + + stop_on_change = cfs_hash_with_rehash_key(hs) || + !cfs_hash_with_no_itemref(hs) || + hs->hs_ops->hs_put_locked == NULL; + cfs_hash_lock(hs, 0); + LASSERT(!cfs_hash_is_rehashing(hs)); + + cfs_hash_for_each_bucket(hs, &bd, i) { + struct hlist_head *hhead; + + cfs_hash_bd_lock(hs, &bd, 0); + version = cfs_hash_bd_version_get(&bd); + + cfs_hash_bd_for_each_hlist(hs, &bd, hhead) { + for (hnode = hhead->first; hnode != NULL;) { + cfs_hash_bucket_validate(hs, &bd, hnode); + cfs_hash_get(hs, hnode); + cfs_hash_bd_unlock(hs, &bd, 0); + cfs_hash_unlock(hs, 0); + + rc = func(hs, &bd, hnode, data); + if (stop_on_change) + cfs_hash_put(hs, hnode); + cond_resched(); + count++; + + cfs_hash_lock(hs, 0); + cfs_hash_bd_lock(hs, &bd, 0); + if (!stop_on_change) { + tmp = hnode->next; + cfs_hash_put_locked(hs, hnode); + hnode = tmp; + } else { /* bucket changed? */ + if (version != + cfs_hash_bd_version_get(&bd)) + break; + /* safe to continue because no change */ + hnode = hnode->next; + } + if (rc) /* callback wants to break iteration */ + break; + } + } + cfs_hash_bd_unlock(hs, &bd, 0); + } + cfs_hash_unlock(hs, 0); + + return count; +} + +int +cfs_hash_for_each_nolock(struct cfs_hash *hs, + cfs_hash_for_each_cb_t func, void *data) { + if (cfs_hash_with_no_lock(hs) || + cfs_hash_with_rehash_key(hs) || + !cfs_hash_with_no_itemref(hs)) + return -EOPNOTSUPP; + + if (hs->hs_ops->hs_get == NULL || + (hs->hs_ops->hs_put == NULL && + hs->hs_ops->hs_put_locked == NULL)) + return -EOPNOTSUPP; + + cfs_hash_for_each_enter(hs); + cfs_hash_for_each_relax(hs, func, data); + cfs_hash_for_each_exit(hs); + + return 0; +} +EXPORT_SYMBOL(cfs_hash_for_each_nolock); + +/** + * For each hash bucket in the libcfs hash @hs call the passed callback + * @func until all the hash buckets are empty. The passed callback @func + * or the previously registered callback hs->hs_put must remove the item + * from the hash. You may either use the cfs_hash_del() or hlist_del() + * functions. No rwlocks will be held during the callback @func it is + * safe to sleep if needed. This function will not terminate until the + * hash is empty. Note it is still possible to concurrently add new + * items in to the hash. It is the callers responsibility to ensure + * the required locking is in place to prevent concurrent insertions. 
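+ *
+ * Drain sketch (illustrative; "my_drain_cb" and "struct my_obj" are
+ * placeholders, and the callback removes each item as required):
+ *
+ *     static int my_drain_cb(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+ *                            struct hlist_node *hnode, void *data)
+ *     {
+ *             struct my_obj *obj = cfs_hash_object(hs, hnode);
+ *
+ *             cfs_hash_del(hs, &obj->mo_key, hnode);
+ *             return 0;
+ *     }
+ *
+ *     cfs_hash_for_each_empty(hs, my_drain_cb, NULL);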
+ */ +int +cfs_hash_for_each_empty(struct cfs_hash *hs, + cfs_hash_for_each_cb_t func, void *data) { + unsigned i = 0; + + if (cfs_hash_with_no_lock(hs)) + return -EOPNOTSUPP; + + if (hs->hs_ops->hs_get == NULL || + (hs->hs_ops->hs_put == NULL && + hs->hs_ops->hs_put_locked == NULL)) + return -EOPNOTSUPP; + + cfs_hash_for_each_enter(hs); + while (cfs_hash_for_each_relax(hs, func, data)) { + CDEBUG(D_INFO, "Try to empty hash: %s, loop: %u\n", + hs->hs_name, i++); + } + cfs_hash_for_each_exit(hs); + return 0; +} +EXPORT_SYMBOL(cfs_hash_for_each_empty); + +void +cfs_hash_hlist_for_each(struct cfs_hash *hs, unsigned hindex, + cfs_hash_for_each_cb_t func, void *data) +{ + struct hlist_head *hhead; + struct hlist_node *hnode; + struct cfs_hash_bd bd; + + cfs_hash_for_each_enter(hs); + cfs_hash_lock(hs, 0); + if (hindex >= CFS_HASH_NHLIST(hs)) + goto out; + + cfs_hash_bd_index_set(hs, hindex, &bd); + + cfs_hash_bd_lock(hs, &bd, 0); + hhead = cfs_hash_bd_hhead(hs, &bd); + hlist_for_each(hnode, hhead) { + if (func(hs, &bd, hnode, data)) + break; + } + cfs_hash_bd_unlock(hs, &bd, 0); + out: + cfs_hash_unlock(hs, 0); + cfs_hash_for_each_exit(hs); +} + +EXPORT_SYMBOL(cfs_hash_hlist_for_each); + +/* + * For each item in the libcfs hash @hs which matches the @key call + * the passed callback @func and pass to it as an argument each hash + * item and the private @data. During the callback the bucket lock + * is held so the callback must never sleep. + */ +void +cfs_hash_for_each_key(struct cfs_hash *hs, const void *key, + cfs_hash_for_each_cb_t func, void *data) { + struct hlist_node *hnode; + struct cfs_hash_bd bds[2]; + unsigned i; + + cfs_hash_lock(hs, 0); + + cfs_hash_dual_bd_get_and_lock(hs, key, bds, 0); + + cfs_hash_for_each_bd(bds, 2, i) { + struct hlist_head *hlist = cfs_hash_bd_hhead(hs, &bds[i]); + + hlist_for_each(hnode, hlist) { + cfs_hash_bucket_validate(hs, &bds[i], hnode); + + if (cfs_hash_keycmp(hs, key, hnode)) { + if (func(hs, &bds[i], hnode, data)) + break; + } + } + } + + cfs_hash_dual_bd_unlock(hs, bds, 0); + cfs_hash_unlock(hs, 0); +} +EXPORT_SYMBOL(cfs_hash_for_each_key); + +/** + * Rehash the libcfs hash @hs to the given @bits. This can be used + * to grow the hash size when excessive chaining is detected, or to + * shrink the hash when it is larger than needed. When the CFS_HASH_REHASH + * flag is set in @hs the libcfs hash may be dynamically rehashed + * during addition or removal if the hash's theta value exceeds + * either the hs->hs_min_theta or hs->max_theta values. By default + * these values are tuned to keep the chained hash depth small, and + * this approach assumes a reasonably uniform hashing function. The + * theta thresholds for @hs are tunable via cfs_hash_set_theta(). + */ +void +cfs_hash_rehash_cancel_locked(struct cfs_hash *hs) +{ + int i; + + /* need hold cfs_hash_lock(hs, 1) */ + LASSERT(cfs_hash_with_rehash(hs) && + !cfs_hash_with_no_lock(hs)); + + if (!cfs_hash_is_rehashing(hs)) + return; + + if (cfs_wi_deschedule(cfs_sched_rehash, &hs->hs_rehash_wi)) { + hs->hs_rehash_bits = 0; + return; + } + + for (i = 2; cfs_hash_is_rehashing(hs); i++) { + cfs_hash_unlock(hs, 1); + /* raise console warning while waiting too long */ + CDEBUG(IS_PO2(i >> 3) ? 
D_WARNING : D_INFO, + "hash %s is still rehashing, rescheded %d\n", + hs->hs_name, i - 1); + cond_resched(); + cfs_hash_lock(hs, 1); + } +} +EXPORT_SYMBOL(cfs_hash_rehash_cancel_locked); + +void +cfs_hash_rehash_cancel(struct cfs_hash *hs) +{ + cfs_hash_lock(hs, 1); + cfs_hash_rehash_cancel_locked(hs); + cfs_hash_unlock(hs, 1); +} +EXPORT_SYMBOL(cfs_hash_rehash_cancel); + +int +cfs_hash_rehash(struct cfs_hash *hs, int do_rehash) +{ + int rc; + + LASSERT(cfs_hash_with_rehash(hs) && !cfs_hash_with_no_lock(hs)); + + cfs_hash_lock(hs, 1); + + rc = cfs_hash_rehash_bits(hs); + if (rc <= 0) { + cfs_hash_unlock(hs, 1); + return rc; + } + + hs->hs_rehash_bits = rc; + if (!do_rehash) { + /* launch and return */ + cfs_wi_schedule(cfs_sched_rehash, &hs->hs_rehash_wi); + cfs_hash_unlock(hs, 1); + return 0; + } + + /* rehash right now */ + cfs_hash_unlock(hs, 1); + + return cfs_hash_rehash_worker(&hs->hs_rehash_wi); +} +EXPORT_SYMBOL(cfs_hash_rehash); + +static int +cfs_hash_rehash_bd(struct cfs_hash *hs, struct cfs_hash_bd *old) +{ + struct cfs_hash_bd new; + struct hlist_head *hhead; + struct hlist_node *hnode; + struct hlist_node *pos; + void *key; + int c = 0; + + /* hold cfs_hash_lock(hs, 1), so don't need any bucket lock */ + cfs_hash_bd_for_each_hlist(hs, old, hhead) { + hlist_for_each_safe(hnode, pos, hhead) { + key = cfs_hash_key(hs, hnode); + LASSERT(key != NULL); + /* Validate hnode is in the correct bucket. */ + cfs_hash_bucket_validate(hs, old, hnode); + /* + * Delete from old hash bucket; move to new bucket. + * ops->hs_key must be defined. + */ + cfs_hash_bd_from_key(hs, hs->hs_rehash_buckets, + hs->hs_rehash_bits, key, &new); + cfs_hash_bd_move_locked(hs, old, &new, hnode); + c++; + } + } + + return c; +} + +static int +cfs_hash_rehash_worker(cfs_workitem_t *wi) +{ + struct cfs_hash *hs = container_of(wi, struct cfs_hash, hs_rehash_wi); + struct cfs_hash_bucket **bkts; + struct cfs_hash_bd bd; + unsigned int old_size; + unsigned int new_size; + int bsize; + int count = 0; + int rc = 0; + int i; + + LASSERT (hs != NULL && cfs_hash_with_rehash(hs)); + + cfs_hash_lock(hs, 0); + LASSERT(cfs_hash_is_rehashing(hs)); + + old_size = CFS_HASH_NBKT(hs); + new_size = CFS_HASH_RH_NBKT(hs); + + cfs_hash_unlock(hs, 0); + + /* + * don't need hs::hs_rwlock for hs::hs_buckets, + * because nobody can change bkt-table except me. 
+ */ + bkts = cfs_hash_buckets_realloc(hs, hs->hs_buckets, + old_size, new_size); + cfs_hash_lock(hs, 1); + if (bkts == NULL) { + rc = -ENOMEM; + goto out; + } + + if (bkts == hs->hs_buckets) { + bkts = NULL; /* do nothing */ + goto out; + } + + rc = __cfs_hash_theta(hs); + if ((rc >= hs->hs_min_theta) && (rc <= hs->hs_max_theta)) { + /* free the new allocated bkt-table */ + old_size = new_size; + new_size = CFS_HASH_NBKT(hs); + rc = -EALREADY; + goto out; + } + + LASSERT(hs->hs_rehash_buckets == NULL); + hs->hs_rehash_buckets = bkts; + + rc = 0; + cfs_hash_for_each_bucket(hs, &bd, i) { + if (cfs_hash_is_exiting(hs)) { + rc = -ESRCH; + /* someone wants to destroy the hash, abort now */ + if (old_size < new_size) /* OK to free old bkt-table */ + break; + /* it's shrinking, need free new bkt-table */ + hs->hs_rehash_buckets = NULL; + old_size = new_size; + new_size = CFS_HASH_NBKT(hs); + goto out; + } + + count += cfs_hash_rehash_bd(hs, &bd); + if (count < CFS_HASH_LOOP_HOG || + cfs_hash_is_iterating(hs)) { /* need to finish ASAP */ + continue; + } + + count = 0; + cfs_hash_unlock(hs, 1); + cond_resched(); + cfs_hash_lock(hs, 1); + } + + hs->hs_rehash_count++; + + bkts = hs->hs_buckets; + hs->hs_buckets = hs->hs_rehash_buckets; + hs->hs_rehash_buckets = NULL; + + hs->hs_cur_bits = hs->hs_rehash_bits; + out: + hs->hs_rehash_bits = 0; + if (rc == -ESRCH) /* never be scheduled again */ + cfs_wi_exit(cfs_sched_rehash, wi); + bsize = cfs_hash_bkt_size(hs); + cfs_hash_unlock(hs, 1); + /* can't refer to @hs anymore because it could be destroyed */ + if (bkts != NULL) + cfs_hash_buckets_free(bkts, bsize, new_size, old_size); + if (rc != 0) + CDEBUG(D_INFO, "early quit of rehashing: %d\n", rc); + /* return 1 only if cfs_wi_exit is called */ + return rc == -ESRCH; +} + +/** + * Rehash the object referenced by @hnode in the libcfs hash @hs. The + * @old_key must be provided to locate the objects previous location + * in the hash, and the @new_key will be used to reinsert the object. + * Use this function instead of a cfs_hash_add() + cfs_hash_del() + * combo when it is critical that there is no window in time where the + * object is missing from the hash. When an object is being rehashed + * the registered cfs_hash_get() and cfs_hash_put() functions will + * not be called. 
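+ *
+ * Sketch (illustrative; the object layout is a placeholder, and a hash
+ * created with CFS_HASH_REHASH_KEY plus an ops->hs_keycpy method is
+ * assumed so the key stored in the object can be updated in place):
+ *
+ *     old_key = obj->mo_key;
+ *     new_key = my_compute_key(obj);
+ *     cfs_hash_rehash_key(hs, &old_key, &new_key, &obj->mo_hnode);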
+ */ +void cfs_hash_rehash_key(struct cfs_hash *hs, const void *old_key, + void *new_key, struct hlist_node *hnode) +{ + struct cfs_hash_bd bds[3]; + struct cfs_hash_bd old_bds[2]; + struct cfs_hash_bd new_bd; + + LASSERT(!hlist_unhashed(hnode)); + + cfs_hash_lock(hs, 0); + + cfs_hash_dual_bd_get(hs, old_key, old_bds); + cfs_hash_bd_get(hs, new_key, &new_bd); + + bds[0] = old_bds[0]; + bds[1] = old_bds[1]; + bds[2] = new_bd; + + /* NB: bds[0] and bds[1] are ordered already */ + cfs_hash_bd_order(&bds[1], &bds[2]); + cfs_hash_bd_order(&bds[0], &bds[1]); + + cfs_hash_multi_bd_lock(hs, bds, 3, 1); + if (likely(old_bds[1].bd_bucket == NULL)) { + cfs_hash_bd_move_locked(hs, &old_bds[0], &new_bd, hnode); + } else { + cfs_hash_dual_bd_finddel_locked(hs, old_bds, old_key, hnode); + cfs_hash_bd_add_locked(hs, &new_bd, hnode); + } + /* overwrite key inside locks, otherwise may screw up with + * other operations, i.e: rehash */ + cfs_hash_keycpy(hs, new_key, hnode); + + cfs_hash_multi_bd_unlock(hs, bds, 3, 1); + cfs_hash_unlock(hs, 0); +} +EXPORT_SYMBOL(cfs_hash_rehash_key); + +void cfs_hash_debug_header(struct seq_file *m) +{ + seq_printf(m, "%-*s cur min max theta t-min t-max flags rehash count maxdep maxdepb distribution\n", + CFS_HASH_BIGNAME_LEN, "name"); +} +EXPORT_SYMBOL(cfs_hash_debug_header); + +static struct cfs_hash_bucket ** +cfs_hash_full_bkts(struct cfs_hash *hs) +{ + /* NB: caller should hold hs->hs_rwlock if REHASH is set */ + if (hs->hs_rehash_buckets == NULL) + return hs->hs_buckets; + + LASSERT(hs->hs_rehash_bits != 0); + return hs->hs_rehash_bits > hs->hs_cur_bits ? + hs->hs_rehash_buckets : hs->hs_buckets; +} + +static unsigned int +cfs_hash_full_nbkt(struct cfs_hash *hs) +{ + /* NB: caller should hold hs->hs_rwlock if REHASH is set */ + if (hs->hs_rehash_buckets == NULL) + return CFS_HASH_NBKT(hs); + + LASSERT(hs->hs_rehash_bits != 0); + return hs->hs_rehash_bits > hs->hs_cur_bits ? + CFS_HASH_RH_NBKT(hs) : CFS_HASH_NBKT(hs); +} + +void cfs_hash_debug_str(struct cfs_hash *hs, struct seq_file *m) +{ + int dist[8] = { 0, }; + int maxdep = -1; + int maxdepb = -1; + int total = 0; + int theta; + int i; + + cfs_hash_lock(hs, 0); + theta = __cfs_hash_theta(hs); + + seq_printf(m, "%-*s %5d %5d %5d %d.%03d %d.%03d %d.%03d 0x%02x %6d ", + CFS_HASH_BIGNAME_LEN, hs->hs_name, + 1 << hs->hs_cur_bits, 1 << hs->hs_min_bits, + 1 << hs->hs_max_bits, + __cfs_hash_theta_int(theta), __cfs_hash_theta_frac(theta), + __cfs_hash_theta_int(hs->hs_min_theta), + __cfs_hash_theta_frac(hs->hs_min_theta), + __cfs_hash_theta_int(hs->hs_max_theta), + __cfs_hash_theta_frac(hs->hs_max_theta), + hs->hs_flags, hs->hs_rehash_count); + + /* + * The distribution is a summary of the chained hash depth in + * each of the libcfs hash buckets. Each buckets hsb_count is + * divided by the hash theta value and used to generate a + * histogram of the hash distribution. A uniform hash will + * result in all hash buckets being close to the average thus + * only the first few entries in the histogram will be non-zero. + * If you hash function results in a non-uniform hash the will + * be observable by outlier bucks in the distribution histogram. 
+ * + * Uniform hash distribution: 128/128/0/0/0/0/0/0 + * Non-Uniform hash distribution: 128/125/0/0/0/0/2/1 + */ + for (i = 0; i < cfs_hash_full_nbkt(hs); i++) { + struct cfs_hash_bd bd; + + bd.bd_bucket = cfs_hash_full_bkts(hs)[i]; + cfs_hash_bd_lock(hs, &bd, 0); + if (maxdep < bd.bd_bucket->hsb_depmax) { + maxdep = bd.bd_bucket->hsb_depmax; + maxdepb = ffz(~maxdep); + } + total += bd.bd_bucket->hsb_count; + dist[min(fls(bd.bd_bucket->hsb_count / max(theta, 1)), 7)]++; + cfs_hash_bd_unlock(hs, &bd, 0); + } + + seq_printf(m, "%7d %7d %7d ", total, maxdep, maxdepb); + for (i = 0; i < 8; i++) + seq_printf(m, "%d%c", dist[i], (i == 7) ? '\n' : '/'); + + cfs_hash_unlock(hs, 0); +} +EXPORT_SYMBOL(cfs_hash_debug_str); diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/kernel_user_comm.c b/kernel/drivers/staging/lustre/lustre/libcfs/kernel_user_comm.c new file mode 100644 index 000000000..d9b7c6b69 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/libcfs/kernel_user_comm.c @@ -0,0 +1,240 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Author: Nathan Rutman + * + * Kernel <-> userspace communication routines. + * Using pipes for all arches. + */ + +#define DEBUG_SUBSYSTEM S_CLASS +#define D_KUC D_OTHER + +#include "../../include/linux/libcfs/libcfs.h" + +/* This is the kernel side (liblustre as well). */ + +/** + * libcfs_kkuc_msg_put - send an message from kernel to userspace + * @param fp to send the message to + * @param payload Payload data. 
First field of payload is always + * struct kuc_hdr + */ +int libcfs_kkuc_msg_put(struct file *filp, void *payload) +{ + struct kuc_hdr *kuch = (struct kuc_hdr *)payload; + ssize_t count = kuch->kuc_msglen; + loff_t offset = 0; + mm_segment_t fs; + int rc = -ENOSYS; + + if (filp == NULL || IS_ERR(filp)) + return -EBADF; + + if (kuch->kuc_magic != KUC_MAGIC) { + CERROR("KernelComm: bad magic %x\n", kuch->kuc_magic); + return -ENOSYS; + } + + fs = get_fs(); + set_fs(KERNEL_DS); + while (count > 0) { + rc = vfs_write(filp, (void __force __user *)payload, + count, &offset); + if (rc < 0) + break; + count -= rc; + payload += rc; + rc = 0; + } + set_fs(fs); + + if (rc < 0) + CWARN("message send failed (%d)\n", rc); + else + CDEBUG(D_KUC, "Sent message rc=%d, fp=%p\n", rc, filp); + + return rc; +} +EXPORT_SYMBOL(libcfs_kkuc_msg_put); + +/* Broadcast groups are global across all mounted filesystems; + * i.e. registering for a group on 1 fs will get messages for that + * group from any fs */ +/** A single group registration has a uid and a file pointer */ +struct kkuc_reg { + struct list_head kr_chain; + int kr_uid; + struct file *kr_fp; + __u32 kr_data; +}; +static struct list_head kkuc_groups[KUC_GRP_MAX+1] = {}; +/* Protect message sending against remove and adds */ +static DECLARE_RWSEM(kg_sem); + +/** Add a receiver to a broadcast group + * @param filp pipe to write into + * @param uid identifier for this receiver + * @param group group number + */ +int libcfs_kkuc_group_add(struct file *filp, int uid, int group, __u32 data) +{ + struct kkuc_reg *reg; + + if (group > KUC_GRP_MAX) { + CDEBUG(D_WARNING, "Kernelcomm: bad group %d\n", group); + return -EINVAL; + } + + /* fput in group_rem */ + if (filp == NULL) + return -EBADF; + + /* freed in group_rem */ + reg = kmalloc(sizeof(*reg), 0); + if (reg == NULL) + return -ENOMEM; + + reg->kr_fp = filp; + reg->kr_uid = uid; + reg->kr_data = data; + + down_write(&kg_sem); + if (kkuc_groups[group].next == NULL) + INIT_LIST_HEAD(&kkuc_groups[group]); + list_add(®->kr_chain, &kkuc_groups[group]); + up_write(&kg_sem); + + CDEBUG(D_KUC, "Added uid=%d fp=%p to group %d\n", uid, filp, group); + + return 0; +} +EXPORT_SYMBOL(libcfs_kkuc_group_add); + +int libcfs_kkuc_group_rem(int uid, int group) +{ + struct kkuc_reg *reg, *next; + + if (kkuc_groups[group].next == NULL) + return 0; + + if (uid == 0) { + /* Broadcast a shutdown message */ + struct kuc_hdr lh; + + lh.kuc_magic = KUC_MAGIC; + lh.kuc_transport = KUC_TRANSPORT_GENERIC; + lh.kuc_msgtype = KUC_MSG_SHUTDOWN; + lh.kuc_msglen = sizeof(lh); + libcfs_kkuc_group_put(group, &lh); + } + + down_write(&kg_sem); + list_for_each_entry_safe(reg, next, &kkuc_groups[group], kr_chain) { + if ((uid == 0) || (uid == reg->kr_uid)) { + list_del(®->kr_chain); + CDEBUG(D_KUC, "Removed uid=%d fp=%p from group %d\n", + reg->kr_uid, reg->kr_fp, group); + if (reg->kr_fp != NULL) + fput(reg->kr_fp); + kfree(reg); + } + } + up_write(&kg_sem); + + return 0; +} +EXPORT_SYMBOL(libcfs_kkuc_group_rem); + +int libcfs_kkuc_group_put(int group, void *payload) +{ + struct kkuc_reg *reg; + int rc = 0; + int one_success = 0; + + down_read(&kg_sem); + list_for_each_entry(reg, &kkuc_groups[group], kr_chain) { + if (reg->kr_fp != NULL) { + rc = libcfs_kkuc_msg_put(reg->kr_fp, payload); + if (rc == 0) + one_success = 1; + else if (rc == -EPIPE) { + fput(reg->kr_fp); + reg->kr_fp = NULL; + } + } + } + up_read(&kg_sem); + + /* don't return an error if the message has been delivered + * at least to one agent */ + if (one_success) + rc = 0; + + 
return rc; +} +EXPORT_SYMBOL(libcfs_kkuc_group_put); + +/** + * Calls a callback function for each link of the given kuc group. + * @param group the group to call the function on. + * @param cb_func the function to be called. + * @param cb_arg iextra argument to be passed to the callback function. + */ +int libcfs_kkuc_group_foreach(int group, libcfs_kkuc_cb_t cb_func, + void *cb_arg) +{ + struct kkuc_reg *reg; + int rc = 0; + + if (group > KUC_GRP_MAX) { + CDEBUG(D_WARNING, "Kernelcomm: bad group %d\n", group); + return -EINVAL; + } + + /* no link for this group */ + if (kkuc_groups[group].next == NULL) + return 0; + + down_write(&kg_sem); + list_for_each_entry(reg, &kkuc_groups[group], kr_chain) { + if (reg->kr_fp != NULL) + rc = cb_func(reg->kr_data, cb_arg); + } + up_write(&kg_sem); + + return rc; +} +EXPORT_SYMBOL(libcfs_kkuc_group_foreach); diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/libcfs_cpu.c b/kernel/drivers/staging/lustre/lustre/libcfs/libcfs_cpu.c new file mode 100644 index 000000000..31a558115 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/libcfs/libcfs_cpu.c @@ -0,0 +1,224 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * Please see comments in libcfs/include/libcfs/libcfs_cpu.h for introduction + * + * Author: liang@whamcloud.com + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include "../../include/linux/libcfs/libcfs.h" + +/** Global CPU partition table */ +struct cfs_cpt_table *cfs_cpt_table __read_mostly; +EXPORT_SYMBOL(cfs_cpt_table); + +#ifndef HAVE_LIBCFS_CPT + +#define CFS_CPU_VERSION_MAGIC 0xbabecafe + +struct cfs_cpt_table * +cfs_cpt_table_alloc(unsigned int ncpt) +{ + struct cfs_cpt_table *cptab; + + if (ncpt != 1) { + CERROR("Can't support cpu partition number %d\n", ncpt); + return NULL; + } + + LIBCFS_ALLOC(cptab, sizeof(*cptab)); + if (cptab != NULL) { + cptab->ctb_version = CFS_CPU_VERSION_MAGIC; + cptab->ctb_nparts = ncpt; + } + + return cptab; +} +EXPORT_SYMBOL(cfs_cpt_table_alloc); + +void +cfs_cpt_table_free(struct cfs_cpt_table *cptab) +{ + LASSERT(cptab->ctb_version == CFS_CPU_VERSION_MAGIC); + + LIBCFS_FREE(cptab, sizeof(*cptab)); +} +EXPORT_SYMBOL(cfs_cpt_table_free); + +#ifdef CONFIG_SMP +int +cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len) +{ + int rc = 0; + + rc = snprintf(buf, len, "%d\t: %d\n", 0, 0); + len -= rc; + if (len <= 0) + return -EFBIG; + + return rc; +} +EXPORT_SYMBOL(cfs_cpt_table_print); +#endif /* CONFIG_SMP */ + +int +cfs_cpt_number(struct cfs_cpt_table *cptab) +{ + return 1; +} +EXPORT_SYMBOL(cfs_cpt_number); + +int +cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt) +{ + return 1; +} +EXPORT_SYMBOL(cfs_cpt_weight); + +int +cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt) +{ + return 1; +} +EXPORT_SYMBOL(cfs_cpt_online); + +int +cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) +{ + return 1; +} +EXPORT_SYMBOL(cfs_cpt_set_cpu); + +void +cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) +{ +} +EXPORT_SYMBOL(cfs_cpt_unset_cpu); + +int +cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask) +{ + return 1; +} +EXPORT_SYMBOL(cfs_cpt_set_cpumask); + +void +cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask) +{ +} +EXPORT_SYMBOL(cfs_cpt_unset_cpumask); + +int +cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node) +{ + return 1; +} +EXPORT_SYMBOL(cfs_cpt_set_node); + +void +cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node) +{ +} +EXPORT_SYMBOL(cfs_cpt_unset_node); + +int +cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask) +{ + return 1; +} +EXPORT_SYMBOL(cfs_cpt_set_nodemask); + +void +cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask) +{ +} +EXPORT_SYMBOL(cfs_cpt_unset_nodemask); + +void +cfs_cpt_clear(struct cfs_cpt_table *cptab, int cpt) +{ +} +EXPORT_SYMBOL(cfs_cpt_clear); + +int +cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt) +{ + return 0; +} +EXPORT_SYMBOL(cfs_cpt_spread_node); + +int +cfs_cpu_ht_nsiblings(int cpu) +{ + return 1; +} +EXPORT_SYMBOL(cfs_cpu_ht_nsiblings); + +int +cfs_cpt_current(struct cfs_cpt_table *cptab, int remap) +{ + return 0; +} +EXPORT_SYMBOL(cfs_cpt_current); + +int +cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu) +{ + return 0; +} +EXPORT_SYMBOL(cfs_cpt_of_cpu); + +int +cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt) +{ + return 0; +} +EXPORT_SYMBOL(cfs_cpt_bind); + +void +cfs_cpu_fini(void) +{ + if (cfs_cpt_table != NULL) { + cfs_cpt_table_free(cfs_cpt_table); + cfs_cpt_table = NULL; + } +} + +int +cfs_cpu_init(void) +{ + cfs_cpt_table = cfs_cpt_table_alloc(1); + + return cfs_cpt_table != NULL ? 
0 : -1; +} + +#endif /* HAVE_LIBCFS_CPT */ diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/libcfs_lock.c b/kernel/drivers/staging/lustre/lustre/libcfs/libcfs_lock.c new file mode 100644 index 000000000..2c199c725 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/libcfs/libcfs_lock.c @@ -0,0 +1,189 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Author: liang@whamcloud.com + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include "../../include/linux/libcfs/libcfs.h" + + +/** destroy cpu-partition lock, see libcfs_private.h for more detail */ +void +cfs_percpt_lock_free(struct cfs_percpt_lock *pcl) +{ + LASSERT(pcl->pcl_locks != NULL); + LASSERT(!pcl->pcl_locked); + + cfs_percpt_free(pcl->pcl_locks); + LIBCFS_FREE(pcl, sizeof(*pcl)); +} +EXPORT_SYMBOL(cfs_percpt_lock_free); + +/** + * create cpu-partition lock, see libcfs_private.h for more detail. + * + * cpu-partition lock is designed for large-scale SMP system, so we need to + * reduce cacheline conflict as possible as we can, that's the + * reason we always allocate cacheline-aligned memory block. 
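+ *
+ * Usage sketch (illustrative; "cpt" stands for a CPU partition id the
+ * caller already holds, and cfs_cpt_table is the global partition table):
+ *
+ *     struct cfs_percpt_lock *pcl;
+ *
+ *     pcl = cfs_percpt_lock_alloc(cfs_cpt_table);
+ *     if (pcl == NULL)
+ *             return -ENOMEM;
+ *
+ *     cfs_percpt_lock(pcl, cpt);
+ *     ... touch data private to partition "cpt" ...
+ *     cfs_percpt_unlock(pcl, cpt);
+ *
+ *     cfs_percpt_lock(pcl, CFS_PERCPT_LOCK_EX);
+ *     ... exclusive access across all partitions ...
+ *     cfs_percpt_unlock(pcl, CFS_PERCPT_LOCK_EX);
+ *
+ *     cfs_percpt_lock_free(pcl);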
+ */ +struct cfs_percpt_lock * +cfs_percpt_lock_alloc(struct cfs_cpt_table *cptab) +{ + struct cfs_percpt_lock *pcl; + spinlock_t *lock; + int i; + + /* NB: cptab can be NULL, pcl will be for HW CPUs on that case */ + LIBCFS_ALLOC(pcl, sizeof(*pcl)); + if (pcl == NULL) + return NULL; + + pcl->pcl_cptab = cptab; + pcl->pcl_locks = cfs_percpt_alloc(cptab, sizeof(*lock)); + if (pcl->pcl_locks == NULL) { + LIBCFS_FREE(pcl, sizeof(*pcl)); + return NULL; + } + + cfs_percpt_for_each(lock, i, pcl->pcl_locks) + spin_lock_init(lock); + + return pcl; +} +EXPORT_SYMBOL(cfs_percpt_lock_alloc); + +/** + * lock a CPU partition + * + * \a index != CFS_PERCPT_LOCK_EX + * hold private lock indexed by \a index + * + * \a index == CFS_PERCPT_LOCK_EX + * exclusively lock @pcl and nobody can take private lock + */ +void +cfs_percpt_lock(struct cfs_percpt_lock *pcl, int index) +{ + int ncpt = cfs_cpt_number(pcl->pcl_cptab); + int i; + + LASSERT(index >= CFS_PERCPT_LOCK_EX && index < ncpt); + + if (ncpt == 1) { + index = 0; + } else { /* serialize with exclusive lock */ + while (pcl->pcl_locked) + cpu_relax(); + } + + if (likely(index != CFS_PERCPT_LOCK_EX)) { + spin_lock(pcl->pcl_locks[index]); + return; + } + + /* exclusive lock request */ + for (i = 0; i < ncpt; i++) { + spin_lock(pcl->pcl_locks[i]); + if (i == 0) { + LASSERT(!pcl->pcl_locked); + /* nobody should take private lock after this + * so I wouldn't starve for too long time */ + pcl->pcl_locked = 1; + } + } +} +EXPORT_SYMBOL(cfs_percpt_lock); + +/** unlock a CPU partition */ +void +cfs_percpt_unlock(struct cfs_percpt_lock *pcl, int index) +{ + int ncpt = cfs_cpt_number(pcl->pcl_cptab); + int i; + + index = ncpt == 1 ? 0 : index; + + if (likely(index != CFS_PERCPT_LOCK_EX)) { + spin_unlock(pcl->pcl_locks[index]); + return; + } + + for (i = ncpt - 1; i >= 0; i--) { + if (i == 0) { + LASSERT(pcl->pcl_locked); + pcl->pcl_locked = 0; + } + spin_unlock(pcl->pcl_locks[i]); + } +} +EXPORT_SYMBOL(cfs_percpt_unlock); + + +/** free cpu-partition refcount */ +void +cfs_percpt_atomic_free(atomic_t **refs) +{ + cfs_percpt_free(refs); +} +EXPORT_SYMBOL(cfs_percpt_atomic_free); + +/** allocate cpu-partition refcount with initial value @init_val */ +atomic_t ** +cfs_percpt_atomic_alloc(struct cfs_cpt_table *cptab, int init_val) +{ + atomic_t **refs; + atomic_t *ref; + int i; + + refs = cfs_percpt_alloc(cptab, sizeof(*ref)); + if (refs == NULL) + return NULL; + + cfs_percpt_for_each(ref, i, refs) + atomic_set(ref, init_val); + return refs; +} +EXPORT_SYMBOL(cfs_percpt_atomic_alloc); + +/** return sum of cpu-partition refs */ +int +cfs_percpt_atomic_summary(atomic_t **refs) +{ + atomic_t *ref; + int i; + int val = 0; + + cfs_percpt_for_each(ref, i, refs) + val += atomic_read(ref); + + return val; +} +EXPORT_SYMBOL(cfs_percpt_atomic_summary); diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/libcfs_mem.c b/kernel/drivers/staging/lustre/lustre/libcfs/libcfs_mem.c new file mode 100644 index 000000000..1debdda72 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/libcfs/libcfs_mem.c @@ -0,0 +1,202 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Author: liang@whamcloud.com + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include "../../include/linux/libcfs/libcfs.h" + +struct cfs_var_array { + unsigned int va_count; /* # of buffers */ + unsigned int va_size; /* size of each var */ + struct cfs_cpt_table *va_cptab; /* cpu partition table */ + void *va_ptrs[0]; /* buffer addresses */ +}; + +/* + * free per-cpu data, see more detail in cfs_percpt_free + */ +void +cfs_percpt_free(void *vars) +{ + struct cfs_var_array *arr; + int i; + + arr = container_of(vars, struct cfs_var_array, va_ptrs[0]); + + for (i = 0; i < arr->va_count; i++) { + if (arr->va_ptrs[i] != NULL) + LIBCFS_FREE(arr->va_ptrs[i], arr->va_size); + } + + LIBCFS_FREE(arr, offsetof(struct cfs_var_array, + va_ptrs[arr->va_count])); +} +EXPORT_SYMBOL(cfs_percpt_free); + +/* + * allocate per cpu-partition variables, returned value is an array of pointers, + * variable can be indexed by CPU partition ID, i.e: + * + * arr = cfs_percpt_alloc(cfs_cpu_pt, size); + * then caller can access memory block for CPU 0 by arr[0], + * memory block for CPU 1 by arr[1]... + * memory block for CPU N by arr[N]... + * + * cacheline aligned. 
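+ *
+ * A slightly fuller sketch with initialization and teardown (illustrative;
+ * "struct my_counter" and its field are placeholders):
+ *
+ *     struct my_counter **cnts;
+ *     struct my_counter *c;
+ *     int i;
+ *
+ *     cnts = cfs_percpt_alloc(cfs_cpt_table, sizeof(**cnts));
+ *     if (cnts == NULL)
+ *             return -ENOMEM;
+ *     cfs_percpt_for_each(c, i, cnts)
+ *             c->mc_value = 0;
+ *     ...
+ *     cfs_percpt_free(cnts);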
+ */ +void * +cfs_percpt_alloc(struct cfs_cpt_table *cptab, unsigned int size) +{ + struct cfs_var_array *arr; + int count; + int i; + + count = cfs_cpt_number(cptab); + + LIBCFS_ALLOC(arr, offsetof(struct cfs_var_array, va_ptrs[count])); + if (arr == NULL) + return NULL; + + arr->va_size = size = L1_CACHE_ALIGN(size); + arr->va_count = count; + arr->va_cptab = cptab; + + for (i = 0; i < count; i++) { + LIBCFS_CPT_ALLOC(arr->va_ptrs[i], cptab, i, size); + if (arr->va_ptrs[i] == NULL) { + cfs_percpt_free((void *)&arr->va_ptrs[0]); + return NULL; + } + } + + return (void *)&arr->va_ptrs[0]; +} +EXPORT_SYMBOL(cfs_percpt_alloc); + +/* + * return number of CPUs (or number of elements in per-cpu data) + * according to cptab of @vars + */ +int +cfs_percpt_number(void *vars) +{ + struct cfs_var_array *arr; + + arr = container_of(vars, struct cfs_var_array, va_ptrs[0]); + + return arr->va_count; +} +EXPORT_SYMBOL(cfs_percpt_number); + +/* + * return memory block shadowed from current CPU + */ +void * +cfs_percpt_current(void *vars) +{ + struct cfs_var_array *arr; + int cpt; + + arr = container_of(vars, struct cfs_var_array, va_ptrs[0]); + cpt = cfs_cpt_current(arr->va_cptab, 0); + if (cpt < 0) + return NULL; + + return arr->va_ptrs[cpt]; +} +EXPORT_SYMBOL(cfs_percpt_current); + +void * +cfs_percpt_index(void *vars, int idx) +{ + struct cfs_var_array *arr; + + arr = container_of(vars, struct cfs_var_array, va_ptrs[0]); + + LASSERT(idx >= 0 && idx < arr->va_count); + return arr->va_ptrs[idx]; +} +EXPORT_SYMBOL(cfs_percpt_index); + +/* + * free variable array, see more detail in cfs_array_alloc + */ +void +cfs_array_free(void *vars) +{ + struct cfs_var_array *arr; + int i; + + arr = container_of(vars, struct cfs_var_array, va_ptrs[0]); + + for (i = 0; i < arr->va_count; i++) { + if (arr->va_ptrs[i] == NULL) + continue; + + LIBCFS_FREE(arr->va_ptrs[i], arr->va_size); + } + LIBCFS_FREE(arr, offsetof(struct cfs_var_array, + va_ptrs[arr->va_count])); +} +EXPORT_SYMBOL(cfs_array_free); + +/* + * allocate a variable array, returned value is an array of pointers. + * Caller can specify length of array by @count, @size is size of each + * memory block in array. + */ +void * +cfs_array_alloc(int count, unsigned int size) +{ + struct cfs_var_array *arr; + int i; + + LIBCFS_ALLOC(arr, offsetof(struct cfs_var_array, va_ptrs[count])); + if (arr == NULL) + return NULL; + + arr->va_count = count; + arr->va_size = size; + + for (i = 0; i < count; i++) { + LIBCFS_ALLOC(arr->va_ptrs[i], size); + + if (arr->va_ptrs[i] == NULL) { + cfs_array_free((void *)&arr->va_ptrs[0]); + return NULL; + } + } + + return (void *)&arr->va_ptrs[0]; +} +EXPORT_SYMBOL(cfs_array_alloc); diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/libcfs_string.c b/kernel/drivers/staging/lustre/lustre/libcfs/libcfs_string.c new file mode 100644 index 000000000..76d4392bd --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/libcfs/libcfs_string.c @@ -0,0 +1,562 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * String manipulation functions. + * + * libcfs/libcfs/libcfs_string.c + * + * Author: Nathan Rutman + */ + +#include "../../include/linux/libcfs/libcfs.h" + +/* Convert a text string to a bitmask */ +int cfs_str2mask(const char *str, const char *(*bit2str)(int bit), + int *oldmask, int minmask, int allmask) +{ + const char *debugstr; + char op = '\0'; + int newmask = minmask, i, len, found = 0; + + /* must be a list of tokens separated by whitespace + * and optionally an operator ('+' or '-'). If an operator + * appears first in , '*oldmask' is used as the starting point + * (relative), otherwise minmask is used (absolute). An operator + * applies to all following tokens up to the next operator. */ + while (*str != '\0') { + while (isspace(*str)) + str++; + if (*str == '\0') + break; + if (*str == '+' || *str == '-') { + op = *str++; + if (!found) + /* only if first token is relative */ + newmask = *oldmask; + while (isspace(*str)) + str++; + if (*str == '\0') /* trailing op */ + return -EINVAL; + } + + /* find token length */ + len = 0; + while (str[len] != '\0' && !isspace(str[len]) && + str[len] != '+' && str[len] != '-') + len++; + + /* match token */ + found = 0; + for (i = 0; i < 32; i++) { + debugstr = bit2str(i); + if (debugstr != NULL && + strlen(debugstr) == len && + strncasecmp(str, debugstr, len) == 0) { + if (op == '-') + newmask &= ~(1 << i); + else + newmask |= (1 << i); + found = 1; + break; + } + } + if (!found && len == 3 && + (strncasecmp(str, "ALL", len) == 0)) { + if (op == '-') + newmask = minmask; + else + newmask = allmask; + found = 1; + } + if (!found) { + CWARN("unknown mask '%.*s'.\n" + "mask usage: [+|-] ...\n", len, str); + return -EINVAL; + } + str += len; + } + + *oldmask = newmask; + return 0; +} + +/* get the first string out of @str */ +char *cfs_firststr(char *str, size_t size) +{ + size_t i = 0; + char *end; + + /* trim leading spaces */ + while (i < size && *str && isspace(*str)) { + ++i; + ++str; + } + + /* string with all spaces */ + if (*str == '\0') + goto out; + + end = str; + while (i < size && *end != '\0' && !isspace(*end)) { + ++i; + ++end; + } + + *end = '\0'; +out: + return str; +} +EXPORT_SYMBOL(cfs_firststr); + +char * +cfs_trimwhite(char *str) +{ + char *end; + + while (isspace(*str)) + str++; + + end = str + strlen(str); + while (end > str) { + if (!isspace(end[-1])) + break; + end--; + } + + *end = 0; + return str; +} +EXPORT_SYMBOL(cfs_trimwhite); + +/** + * Extracts tokens from strings. + * + * Looks for \a delim in string \a next, sets \a res to point to + * substring before the delimiter, sets \a next right after the found + * delimiter. 
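+ *
+ * For example (sketch), walking a comma-separated list held in "buf":
+ *
+ *     struct cfs_lstr next = { .ls_str = buf, .ls_len = strlen(buf) };
+ *     struct cfs_lstr tok;
+ *
+ *     while (cfs_gettok(&next, ',', &tok))
+ *             ... tok.ls_str/tok.ls_len hold one whitespace-trimmed token ...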
+ * + * \retval 1 if \a res points to a string of non-whitespace characters + * \retval 0 otherwise + */ +int +cfs_gettok(struct cfs_lstr *next, char delim, struct cfs_lstr *res) +{ + char *end; + + if (next->ls_str == NULL) + return 0; + + /* skip leading white spaces */ + while (next->ls_len) { + if (!isspace(*next->ls_str)) + break; + next->ls_str++; + next->ls_len--; + } + + if (next->ls_len == 0) /* whitespaces only */ + return 0; + + if (*next->ls_str == delim) { + /* first non-writespace is the delimiter */ + return 0; + } + + res->ls_str = next->ls_str; + end = memchr(next->ls_str, delim, next->ls_len); + if (end == NULL) { + /* there is no the delimeter in the string */ + end = next->ls_str + next->ls_len; + next->ls_str = NULL; + } else { + next->ls_str = end + 1; + next->ls_len -= (end - res->ls_str + 1); + } + + /* skip ending whitespaces */ + while (--end != res->ls_str) { + if (!isspace(*end)) + break; + } + + res->ls_len = end - res->ls_str + 1; + return 1; +} + +/** + * Converts string to integer. + * + * Accepts decimal and hexadecimal number recordings. + * + * \retval 1 if first \a nob chars of \a str convert to decimal or + * hexadecimal integer in the range [\a min, \a max] + * \retval 0 otherwise + */ +int +cfs_str2num_check(char *str, int nob, unsigned *num, + unsigned min, unsigned max) +{ + char *endp; + + str = cfs_trimwhite(str); + *num = strtoul(str, &endp, 0); + if (endp == str) + return 0; + + for (; endp < str + nob; endp++) { + if (!isspace(*endp)) + return 0; + } + + return (*num >= min && *num <= max); +} + +/** + * Parses \ token of the syntax. If \a bracketed is false, + * \a src should only have a single token which can be \ or \* + * + * \retval pointer to allocated range_expr and initialized + * range_expr::re_lo, range_expr::re_hi and range_expr:re_stride if \a + `* src parses to + * \ | + * \ '-' \ | + * \ '-' \ '/' \ + * \retval 0 will be returned if it can be parsed, otherwise -EINVAL or + * -ENOMEM will be returned. + */ +static int +cfs_range_expr_parse(struct cfs_lstr *src, unsigned min, unsigned max, + int bracketed, struct cfs_range_expr **expr) +{ + struct cfs_range_expr *re; + struct cfs_lstr tok; + + LIBCFS_ALLOC(re, sizeof(*re)); + if (re == NULL) + return -ENOMEM; + + if (src->ls_len == 1 && src->ls_str[0] == '*') { + re->re_lo = min; + re->re_hi = max; + re->re_stride = 1; + goto out; + } + + if (cfs_str2num_check(src->ls_str, src->ls_len, + &re->re_lo, min, max)) { + /* is parsed */ + re->re_hi = re->re_lo; + re->re_stride = 1; + goto out; + } + + if (!bracketed || !cfs_gettok(src, '-', &tok)) + goto failed; + + if (!cfs_str2num_check(tok.ls_str, tok.ls_len, + &re->re_lo, min, max)) + goto failed; + + /* - */ + if (cfs_str2num_check(src->ls_str, src->ls_len, + &re->re_hi, min, max)) { + /* - is parsed */ + re->re_stride = 1; + goto out; + } + + /* go to check '-' '/' */ + if (cfs_gettok(src, '/', &tok)) { + if (!cfs_str2num_check(tok.ls_str, tok.ls_len, + &re->re_hi, min, max)) + goto failed; + + /* - / ... */ + if (cfs_str2num_check(src->ls_str, src->ls_len, + &re->re_stride, min, max)) { + /* - / is parsed */ + goto out; + } + } + + out: + *expr = re; + return 0; + + failed: + LIBCFS_FREE(re, sizeof(*re)); + return -EINVAL; +} + +/** + * Matches value (\a value) against ranges expression list \a expr_list. 
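+ *
+ * For example (sketch), a list parsed from "[2-10/2]" with
+ * cfs_expr_list_parse() below expands to 2, 4, 6, 8 and 10, so 6 matches
+ * while 7 does not:
+ *
+ *     struct cfs_expr_list *el;
+ *
+ *     if (cfs_expr_list_parse("[2-10/2]", strlen("[2-10/2]"),
+ *                             0, 255, &el) == 0) {
+ *             matched = cfs_expr_list_match(6, el);
+ *             cfs_expr_list_free(el);
+ *     }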
+ * + * \retval 1 if \a value matches + * \retval 0 otherwise + */ +int +cfs_expr_list_match(__u32 value, struct cfs_expr_list *expr_list) +{ + struct cfs_range_expr *expr; + + list_for_each_entry(expr, &expr_list->el_exprs, re_link) { + if (value >= expr->re_lo && value <= expr->re_hi && + ((value - expr->re_lo) % expr->re_stride) == 0) + return 1; + } + + return 0; +} + +/** + * Convert express list (\a expr_list) to an array of all matched values + * + * \retval N N is total number of all matched values + * \retval 0 if expression list is empty + * \retval < 0 for failure + */ +int +cfs_expr_list_values(struct cfs_expr_list *expr_list, int max, __u32 **valpp) +{ + struct cfs_range_expr *expr; + __u32 *val; + int count = 0; + int i; + + list_for_each_entry(expr, &expr_list->el_exprs, re_link) { + for (i = expr->re_lo; i <= expr->re_hi; i++) { + if (((i - expr->re_lo) % expr->re_stride) == 0) + count++; + } + } + + if (count == 0) /* empty expression list */ + return 0; + + if (count > max) { + CERROR("Number of values %d exceeds max allowed %d\n", + max, count); + return -EINVAL; + } + + LIBCFS_ALLOC(val, sizeof(val[0]) * count); + if (val == NULL) + return -ENOMEM; + + count = 0; + list_for_each_entry(expr, &expr_list->el_exprs, re_link) { + for (i = expr->re_lo; i <= expr->re_hi; i++) { + if (((i - expr->re_lo) % expr->re_stride) == 0) + val[count++] = i; + } + } + + *valpp = val; + return count; +} +EXPORT_SYMBOL(cfs_expr_list_values); + +/** + * Frees cfs_range_expr structures of \a expr_list. + * + * \retval none + */ +void +cfs_expr_list_free(struct cfs_expr_list *expr_list) +{ + while (!list_empty(&expr_list->el_exprs)) { + struct cfs_range_expr *expr; + + expr = list_entry(expr_list->el_exprs.next, + struct cfs_range_expr, re_link), + list_del(&expr->re_link); + LIBCFS_FREE(expr, sizeof(*expr)); + } + + LIBCFS_FREE(expr_list, sizeof(*expr_list)); +} +EXPORT_SYMBOL(cfs_expr_list_free); + +/** + * Parses \ token of the syntax. + * + * \retval 1 if \a str parses to \ | \ + * \retval 0 otherwise + */ +int +cfs_expr_list_parse(char *str, int len, unsigned min, unsigned max, + struct cfs_expr_list **elpp) +{ + struct cfs_expr_list *expr_list; + struct cfs_range_expr *expr; + struct cfs_lstr src; + int rc; + + LIBCFS_ALLOC(expr_list, sizeof(*expr_list)); + if (expr_list == NULL) + return -ENOMEM; + + src.ls_str = str; + src.ls_len = len; + + INIT_LIST_HEAD(&expr_list->el_exprs); + + if (src.ls_str[0] == '[' && + src.ls_str[src.ls_len - 1] == ']') { + src.ls_str++; + src.ls_len -= 2; + + rc = -EINVAL; + while (src.ls_str != NULL) { + struct cfs_lstr tok; + + if (!cfs_gettok(&src, ',', &tok)) { + rc = -EINVAL; + break; + } + + rc = cfs_range_expr_parse(&tok, min, max, 1, &expr); + if (rc != 0) + break; + + list_add_tail(&expr->re_link, + &expr_list->el_exprs); + } + } else { + rc = cfs_range_expr_parse(&src, min, max, 0, &expr); + if (rc == 0) { + list_add_tail(&expr->re_link, + &expr_list->el_exprs); + } + } + + if (rc != 0) + cfs_expr_list_free(expr_list); + else + *elpp = expr_list; + + return rc; +} +EXPORT_SYMBOL(cfs_expr_list_parse); + +/** + * Frees cfs_expr_list structures of \a list. + * + * For each struct cfs_expr_list structure found on \a list it frees + * range_expr list attached to it and frees the cfs_expr_list itself. 
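/*
 * Editor's illustrative sketch, not part of the original patch: parsing a
 * bracketed range expression and testing values against it with the
 * cfs_expr_list_*() helpers defined above.  demo_expr_list() is a
 * hypothetical caller.
 */
static void demo_expr_list(void)
{
        char expr[] = "[0-7/2,9]";
        struct cfs_expr_list *el;

        if (cfs_expr_list_parse(expr, sizeof(expr) - 1, 0, 31, &el) != 0)
                return;

        /* matches 0, 2, 4, 6 (stride 2) and 9; everything else fails */
        LASSERT(cfs_expr_list_match(4, el));
        LASSERT(!cfs_expr_list_match(5, el));

        cfs_expr_list_free(el);
}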
+ * + * \retval none + */ +void +cfs_expr_list_free_list(struct list_head *list) +{ + struct cfs_expr_list *el; + + while (!list_empty(list)) { + el = list_entry(list->next, + struct cfs_expr_list, el_link); + list_del(&el->el_link); + cfs_expr_list_free(el); + } +} + +int +cfs_ip_addr_parse(char *str, int len, struct list_head *list) +{ + struct cfs_expr_list *el; + struct cfs_lstr src; + int rc; + int i; + + src.ls_str = str; + src.ls_len = len; + i = 0; + + while (src.ls_str != NULL) { + struct cfs_lstr res; + + if (!cfs_gettok(&src, '.', &res)) { + rc = -EINVAL; + goto out; + } + + rc = cfs_expr_list_parse(res.ls_str, res.ls_len, 0, 255, &el); + if (rc != 0) + goto out; + + list_add_tail(&el->el_link, list); + i++; + } + + if (i == 4) + return 0; + + rc = -EINVAL; + out: + cfs_expr_list_free_list(list); + + return rc; +} +EXPORT_SYMBOL(cfs_ip_addr_parse); + +/** + * Matches address (\a addr) against address set encoded in \a list. + * + * \retval 1 if \a addr matches + * \retval 0 otherwise + */ +int +cfs_ip_addr_match(__u32 addr, struct list_head *list) +{ + struct cfs_expr_list *el; + int i = 0; + + list_for_each_entry_reverse(el, list, el_link) { + if (!cfs_expr_list_match(addr & 0xff, el)) + return 0; + addr >>= 8; + i++; + } + + return i == 4; +} +EXPORT_SYMBOL(cfs_ip_addr_match); + +void +cfs_ip_addr_free(struct list_head *list) +{ + cfs_expr_list_free_list(list); +} +EXPORT_SYMBOL(cfs_ip_addr_free); diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c new file mode 100644 index 000000000..cc3ab3519 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c @@ -0,0 +1,1056 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
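/*
 * Editor's illustrative sketch, not part of the original patch: matching a
 * host-order IPv4 address against a pattern with the cfs_ip_addr_*()
 * helpers that close libcfs_string.c above.  demo_ip_match() and the
 * pattern below are hypothetical.
 */
static void demo_ip_match(void)
{
        char pat[] = "192.168.0.[1-100]";
        LIST_HEAD(addr_exprs);
        __u32 addr = (192 << 24) | (168 << 16) | (0 << 8) | 42;

        if (cfs_ip_addr_parse(pat, sizeof(pat) - 1, &addr_exprs) != 0)
                return;

        /* 192.168.0.42 falls inside [1-100], so this matches */
        if (cfs_ip_addr_match(addr, &addr_exprs))
                CDEBUG(D_NET, "address matches %s\n", pat);

        cfs_ip_addr_free(&addr_exprs);
}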
+ * + * Author: liang@whamcloud.com + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include +#include "../../../include/linux/libcfs/libcfs.h" + +#ifdef CONFIG_SMP + +/** + * modparam for setting number of partitions + * + * 0 : estimate best value based on cores or NUMA nodes + * 1 : disable multiple partitions + * >1 : specify number of partitions + */ +static int cpu_npartitions; +module_param(cpu_npartitions, int, 0444); +MODULE_PARM_DESC(cpu_npartitions, "# of CPU partitions"); + +/** + * modparam for setting CPU partitions patterns: + * + * i.e: "0[0,1,2,3] 1[4,5,6,7]", number before bracket is CPU partition ID, + * number in bracket is processor ID (core or HT) + * + * i.e: "N 0[0,1] 1[2,3]" the first character 'N' means numbers in bracket + * are NUMA node ID, number before bracket is CPU partition ID. + * + * NB: If user specified cpu_pattern, cpu_npartitions will be ignored + */ +static char *cpu_pattern = ""; +module_param(cpu_pattern, charp, 0444); +MODULE_PARM_DESC(cpu_pattern, "CPU partitions pattern"); + +struct cfs_cpt_data { + /* serialize hotplug etc */ + spinlock_t cpt_lock; + /* reserved for hotplug */ + unsigned long cpt_version; + /* mutex to protect cpt_cpumask */ + struct mutex cpt_mutex; + /* scratch buffer for set/unset_node */ + cpumask_t *cpt_cpumask; +}; + +static struct cfs_cpt_data cpt_data; + +static void cfs_cpu_core_siblings(int cpu, cpumask_t *mask) +{ + /* return cpumask of cores in the same socket */ + cpumask_copy(mask, topology_core_cpumask(cpu)); +} + +/* return cpumask of HTs in the same core */ +static void cfs_cpu_ht_siblings(int cpu, cpumask_t *mask) +{ + cpumask_copy(mask, topology_thread_cpumask(cpu)); +} + +static void cfs_node_to_cpumask(int node, cpumask_t *mask) +{ + cpumask_copy(mask, cpumask_of_node(node)); +} + +void +cfs_cpt_table_free(struct cfs_cpt_table *cptab) +{ + int i; + + if (cptab->ctb_cpu2cpt != NULL) { + LIBCFS_FREE(cptab->ctb_cpu2cpt, + num_possible_cpus() * + sizeof(cptab->ctb_cpu2cpt[0])); + } + + for (i = 0; cptab->ctb_parts != NULL && i < cptab->ctb_nparts; i++) { + struct cfs_cpu_partition *part = &cptab->ctb_parts[i]; + + if (part->cpt_nodemask != NULL) { + LIBCFS_FREE(part->cpt_nodemask, + sizeof(*part->cpt_nodemask)); + } + + if (part->cpt_cpumask != NULL) + LIBCFS_FREE(part->cpt_cpumask, cpumask_size()); + } + + if (cptab->ctb_parts != NULL) { + LIBCFS_FREE(cptab->ctb_parts, + cptab->ctb_nparts * sizeof(cptab->ctb_parts[0])); + } + + if (cptab->ctb_nodemask != NULL) + LIBCFS_FREE(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask)); + if (cptab->ctb_cpumask != NULL) + LIBCFS_FREE(cptab->ctb_cpumask, cpumask_size()); + + LIBCFS_FREE(cptab, sizeof(*cptab)); +} +EXPORT_SYMBOL(cfs_cpt_table_free); + +struct cfs_cpt_table * +cfs_cpt_table_alloc(unsigned int ncpt) +{ + struct cfs_cpt_table *cptab; + int i; + + LIBCFS_ALLOC(cptab, sizeof(*cptab)); + if (cptab == NULL) + return NULL; + + cptab->ctb_nparts = ncpt; + + LIBCFS_ALLOC(cptab->ctb_cpumask, cpumask_size()); + LIBCFS_ALLOC(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask)); + + if (cptab->ctb_cpumask == NULL || cptab->ctb_nodemask == NULL) + goto failed; + + LIBCFS_ALLOC(cptab->ctb_cpu2cpt, + num_possible_cpus() * sizeof(cptab->ctb_cpu2cpt[0])); + if (cptab->ctb_cpu2cpt == NULL) + goto failed; + + memset(cptab->ctb_cpu2cpt, -1, + num_possible_cpus() * sizeof(cptab->ctb_cpu2cpt[0])); + + LIBCFS_ALLOC(cptab->ctb_parts, ncpt * sizeof(cptab->ctb_parts[0])); + if (cptab->ctb_parts == NULL) + goto failed; + + for (i = 0; i < ncpt; i++) { + struct cfs_cpu_partition 
*part = &cptab->ctb_parts[i]; + + LIBCFS_ALLOC(part->cpt_cpumask, cpumask_size()); + LIBCFS_ALLOC(part->cpt_nodemask, sizeof(*part->cpt_nodemask)); + if (part->cpt_cpumask == NULL || part->cpt_nodemask == NULL) + goto failed; + } + + spin_lock(&cpt_data.cpt_lock); + /* Reserved for hotplug */ + cptab->ctb_version = cpt_data.cpt_version; + spin_unlock(&cpt_data.cpt_lock); + + return cptab; + + failed: + cfs_cpt_table_free(cptab); + return NULL; +} +EXPORT_SYMBOL(cfs_cpt_table_alloc); + +int +cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len) +{ + char *tmp = buf; + int rc = 0; + int i; + int j; + + for (i = 0; i < cptab->ctb_nparts; i++) { + if (len > 0) { + rc = snprintf(tmp, len, "%d\t: ", i); + len -= rc; + } + + if (len <= 0) { + rc = -EFBIG; + goto out; + } + + tmp += rc; + for_each_cpu(j, cptab->ctb_parts[i].cpt_cpumask) { + rc = snprintf(tmp, len, "%d ", j); + len -= rc; + if (len <= 0) { + rc = -EFBIG; + goto out; + } + tmp += rc; + } + + *tmp = '\n'; + tmp++; + len--; + } + + out: + if (rc < 0) + return rc; + + return tmp - buf; +} +EXPORT_SYMBOL(cfs_cpt_table_print); + +int +cfs_cpt_number(struct cfs_cpt_table *cptab) +{ + return cptab->ctb_nparts; +} +EXPORT_SYMBOL(cfs_cpt_number); + +int +cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt) +{ + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + return cpt == CFS_CPT_ANY ? + cpumask_weight(cptab->ctb_cpumask) : + cpumask_weight(cptab->ctb_parts[cpt].cpt_cpumask); +} +EXPORT_SYMBOL(cfs_cpt_weight); + +int +cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt) +{ + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + return cpt == CFS_CPT_ANY ? + cpumask_any_and(cptab->ctb_cpumask, + cpu_online_mask) < nr_cpu_ids : + cpumask_any_and(cptab->ctb_parts[cpt].cpt_cpumask, + cpu_online_mask) < nr_cpu_ids; +} +EXPORT_SYMBOL(cfs_cpt_online); + +cpumask_t * +cfs_cpt_cpumask(struct cfs_cpt_table *cptab, int cpt) +{ + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + return cpt == CFS_CPT_ANY ? + cptab->ctb_cpumask : cptab->ctb_parts[cpt].cpt_cpumask; +} +EXPORT_SYMBOL(cfs_cpt_cpumask); + +nodemask_t * +cfs_cpt_nodemask(struct cfs_cpt_table *cptab, int cpt) +{ + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + return cpt == CFS_CPT_ANY ? 
+ cptab->ctb_nodemask : cptab->ctb_parts[cpt].cpt_nodemask; +} +EXPORT_SYMBOL(cfs_cpt_nodemask); + +int +cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) +{ + int node; + + LASSERT(cpt >= 0 && cpt < cptab->ctb_nparts); + + if (cpu < 0 || cpu >= nr_cpu_ids || !cpu_online(cpu)) { + CDEBUG(D_INFO, "CPU %d is invalid or it's offline\n", cpu); + return 0; + } + + if (cptab->ctb_cpu2cpt[cpu] != -1) { + CDEBUG(D_INFO, "CPU %d is already in partition %d\n", + cpu, cptab->ctb_cpu2cpt[cpu]); + return 0; + } + + cptab->ctb_cpu2cpt[cpu] = cpt; + + LASSERT(!cpumask_test_cpu(cpu, cptab->ctb_cpumask)); + LASSERT(!cpumask_test_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask)); + + cpumask_set_cpu(cpu, cptab->ctb_cpumask); + cpumask_set_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask); + + node = cpu_to_node(cpu); + + /* first CPU of @node in this CPT table */ + if (!node_isset(node, *cptab->ctb_nodemask)) + node_set(node, *cptab->ctb_nodemask); + + /* first CPU of @node in this partition */ + if (!node_isset(node, *cptab->ctb_parts[cpt].cpt_nodemask)) + node_set(node, *cptab->ctb_parts[cpt].cpt_nodemask); + + return 1; +} +EXPORT_SYMBOL(cfs_cpt_set_cpu); + +void +cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) +{ + int node; + int i; + + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + if (cpu < 0 || cpu >= nr_cpu_ids) { + CDEBUG(D_INFO, "Invalid CPU id %d\n", cpu); + return; + } + + if (cpt == CFS_CPT_ANY) { + /* caller doesn't know the partition ID */ + cpt = cptab->ctb_cpu2cpt[cpu]; + if (cpt < 0) { /* not set in this CPT-table */ + CDEBUG(D_INFO, "Try to unset cpu %d which is not in CPT-table %p\n", + cpt, cptab); + return; + } + + } else if (cpt != cptab->ctb_cpu2cpt[cpu]) { + CDEBUG(D_INFO, + "CPU %d is not in cpu-partition %d\n", cpu, cpt); + return; + } + + LASSERT(cpumask_test_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask)); + LASSERT(cpumask_test_cpu(cpu, cptab->ctb_cpumask)); + + cpumask_clear_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask); + cpumask_clear_cpu(cpu, cptab->ctb_cpumask); + cptab->ctb_cpu2cpt[cpu] = -1; + + node = cpu_to_node(cpu); + + LASSERT(node_isset(node, *cptab->ctb_parts[cpt].cpt_nodemask)); + LASSERT(node_isset(node, *cptab->ctb_nodemask)); + + for_each_cpu(i, cptab->ctb_parts[cpt].cpt_cpumask) { + /* this CPT has other CPU belonging to this node? */ + if (cpu_to_node(i) == node) + break; + } + + if (i >= nr_cpu_ids) + node_clear(node, *cptab->ctb_parts[cpt].cpt_nodemask); + + for_each_cpu(i, cptab->ctb_cpumask) { + /* this CPT-table has other CPU belonging to this node? 
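/*
 * Editor's illustrative sketch, not part of the original patch: building a
 * small CPU partition table by hand with the helpers above and dumping it.
 * Assumes CPU 0 is online; demo_cpt_build() is a hypothetical caller.
 */
static void demo_cpt_build(void)
{
        struct cfs_cpt_table *cptab;
        char buf[128];
        int rc;

        cptab = cfs_cpt_table_alloc(2);
        if (cptab == NULL)
                return;

        /* place CPU 0 into partition 0; returns 1 on success */
        if (!cfs_cpt_set_cpu(cptab, 0, 0)) {
                cfs_cpt_table_free(cptab);
                return;
        }

        LASSERT(cfs_cpt_number(cptab) == 2);
        LASSERT(cfs_cpt_weight(cptab, 0) == 1);

        /* cfs_cpt_table_print() does not NUL-terminate, so print by length */
        rc = cfs_cpt_table_print(cptab, buf, sizeof(buf));
        if (rc > 0)
                CDEBUG(D_INFO, "cpt table:\n%.*s", rc, buf);

        cfs_cpt_table_free(cptab);
}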
*/ + if (cpu_to_node(i) == node) + break; + } + + if (i >= nr_cpu_ids) + node_clear(node, *cptab->ctb_nodemask); + + return; +} +EXPORT_SYMBOL(cfs_cpt_unset_cpu); + +int +cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask) +{ + int i; + + if (cpumask_weight(mask) == 0 || + cpumask_any_and(mask, cpu_online_mask) >= nr_cpu_ids) { + CDEBUG(D_INFO, "No online CPU is found in the CPU mask for CPU partition %d\n", + cpt); + return 0; + } + + for_each_cpu(i, mask) { + if (!cfs_cpt_set_cpu(cptab, cpt, i)) + return 0; + } + + return 1; +} +EXPORT_SYMBOL(cfs_cpt_set_cpumask); + +void +cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask) +{ + int i; + + for_each_cpu(i, mask) + cfs_cpt_unset_cpu(cptab, cpt, i); +} +EXPORT_SYMBOL(cfs_cpt_unset_cpumask); + +int +cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node) +{ + cpumask_t *mask; + int rc; + + if (node < 0 || node >= MAX_NUMNODES) { + CDEBUG(D_INFO, + "Invalid NUMA id %d for CPU partition %d\n", node, cpt); + return 0; + } + + mutex_lock(&cpt_data.cpt_mutex); + + mask = cpt_data.cpt_cpumask; + cfs_node_to_cpumask(node, mask); + + rc = cfs_cpt_set_cpumask(cptab, cpt, mask); + + mutex_unlock(&cpt_data.cpt_mutex); + + return rc; +} +EXPORT_SYMBOL(cfs_cpt_set_node); + +void +cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node) +{ + cpumask_t *mask; + + if (node < 0 || node >= MAX_NUMNODES) { + CDEBUG(D_INFO, + "Invalid NUMA id %d for CPU partition %d\n", node, cpt); + return; + } + + mutex_lock(&cpt_data.cpt_mutex); + + mask = cpt_data.cpt_cpumask; + cfs_node_to_cpumask(node, mask); + + cfs_cpt_unset_cpumask(cptab, cpt, mask); + + mutex_unlock(&cpt_data.cpt_mutex); +} +EXPORT_SYMBOL(cfs_cpt_unset_node); + +int +cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask) +{ + int i; + + for_each_node_mask(i, *mask) { + if (!cfs_cpt_set_node(cptab, cpt, i)) + return 0; + } + + return 1; +} +EXPORT_SYMBOL(cfs_cpt_set_nodemask); + +void +cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask) +{ + int i; + + for_each_node_mask(i, *mask) + cfs_cpt_unset_node(cptab, cpt, i); +} +EXPORT_SYMBOL(cfs_cpt_unset_nodemask); + +void +cfs_cpt_clear(struct cfs_cpt_table *cptab, int cpt) +{ + int last; + int i; + + if (cpt == CFS_CPT_ANY) { + last = cptab->ctb_nparts - 1; + cpt = 0; + } else { + last = cpt; + } + + for (; cpt <= last; cpt++) { + for_each_cpu(i, cptab->ctb_parts[cpt].cpt_cpumask) + cfs_cpt_unset_cpu(cptab, cpt, i); + } +} +EXPORT_SYMBOL(cfs_cpt_clear); + +int +cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt) +{ + nodemask_t *mask; + int weight; + int rotor; + int node; + + /* convert CPU partition ID to HW node id */ + + if (cpt < 0 || cpt >= cptab->ctb_nparts) { + mask = cptab->ctb_nodemask; + rotor = cptab->ctb_spread_rotor++; + } else { + mask = cptab->ctb_parts[cpt].cpt_nodemask; + rotor = cptab->ctb_parts[cpt].cpt_spread_rotor++; + } + + weight = nodes_weight(*mask); + LASSERT(weight > 0); + + rotor %= weight; + + for_each_node_mask(node, *mask) { + if (rotor-- == 0) + return node; + } + + LBUG(); + return 0; +} +EXPORT_SYMBOL(cfs_cpt_spread_node); + +int +cfs_cpt_current(struct cfs_cpt_table *cptab, int remap) +{ + int cpu = smp_processor_id(); + int cpt = cptab->ctb_cpu2cpt[cpu]; + + if (cpt < 0) { + if (!remap) + return cpt; + + /* don't return negative value for safety of upper layer, + * instead we shadow the unknown cpu to a valid partition ID */ + cpt = cpu % cptab->ctb_nparts; + } + + return cpt; +} 
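/*
 * Editor's illustrative sketch, not part of the original patch: a
 * NUMA-aware consumer typically fills a partition from a node and then
 * uses cfs_cpt_current()/cfs_cpt_spread_node() to decide where to run or
 * where to allocate.  Assumes NUMA node 0 has online CPUs;
 * demo_cpt_numa() is a hypothetical caller.
 */
static void demo_cpt_numa(void)
{
        struct cfs_cpt_table *cptab;
        int cpt;
        int node;

        cptab = cfs_cpt_table_alloc(1);
        if (cptab == NULL)
                return;

        /* pull all CPUs of NUMA node 0 into partition 0 */
        if (!cfs_cpt_set_node(cptab, 0, 0)) {
                cfs_cpt_table_free(cptab);
                return;
        }

        preempt_disable();
        cpt = cfs_cpt_current(cptab, 1);        /* partition of local CPU */
        preempt_enable();

        node = cfs_cpt_spread_node(cptab, cpt); /* node to allocate from */
        CDEBUG(D_INFO, "running in CPT %d, spreading to node %d\n",
               cpt, node);

        cfs_cpt_table_free(cptab);
}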
+EXPORT_SYMBOL(cfs_cpt_current); + +int +cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu) +{ + LASSERT(cpu >= 0 && cpu < nr_cpu_ids); + + return cptab->ctb_cpu2cpt[cpu]; +} +EXPORT_SYMBOL(cfs_cpt_of_cpu); + +int +cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt) +{ + cpumask_t *cpumask; + nodemask_t *nodemask; + int rc; + int i; + + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + if (cpt == CFS_CPT_ANY) { + cpumask = cptab->ctb_cpumask; + nodemask = cptab->ctb_nodemask; + } else { + cpumask = cptab->ctb_parts[cpt].cpt_cpumask; + nodemask = cptab->ctb_parts[cpt].cpt_nodemask; + } + + if (cpumask_any_and(cpumask, cpu_online_mask) >= nr_cpu_ids) { + CERROR("No online CPU found in CPU partition %d, did someone do CPU hotplug on system? You might need to reload Lustre modules to keep system working well.\n", + cpt); + return -EINVAL; + } + + for_each_online_cpu(i) { + if (cpumask_test_cpu(i, cpumask)) + continue; + + rc = set_cpus_allowed_ptr(current, cpumask); + set_mems_allowed(*nodemask); + if (rc == 0) + schedule(); /* switch to allowed CPU */ + + return rc; + } + + /* don't need to set affinity because all online CPUs are covered */ + return 0; +} +EXPORT_SYMBOL(cfs_cpt_bind); + +/** + * Choose max to \a number CPUs from \a node and set them in \a cpt. + * We always prefer to choose CPU in the same core/socket. + */ +static int +cfs_cpt_choose_ncpus(struct cfs_cpt_table *cptab, int cpt, + cpumask_t *node, int number) +{ + cpumask_t *socket = NULL; + cpumask_t *core = NULL; + int rc = 0; + int cpu; + + LASSERT(number > 0); + + if (number >= cpumask_weight(node)) { + while (!cpumask_empty(node)) { + cpu = cpumask_first(node); + + rc = cfs_cpt_set_cpu(cptab, cpt, cpu); + if (!rc) + return -EINVAL; + cpumask_clear_cpu(cpu, node); + } + return 0; + } + + /* allocate scratch buffer */ + LIBCFS_ALLOC(socket, cpumask_size()); + LIBCFS_ALLOC(core, cpumask_size()); + if (socket == NULL || core == NULL) { + rc = -ENOMEM; + goto out; + } + + while (!cpumask_empty(node)) { + cpu = cpumask_first(node); + + /* get cpumask for cores in the same socket */ + cfs_cpu_core_siblings(cpu, socket); + cpumask_and(socket, socket, node); + + LASSERT(!cpumask_empty(socket)); + + while (!cpumask_empty(socket)) { + int i; + + /* get cpumask for hts in the same core */ + cfs_cpu_ht_siblings(cpu, core); + cpumask_and(core, core, node); + + LASSERT(!cpumask_empty(core)); + + for_each_cpu(i, core) { + cpumask_clear_cpu(i, socket); + cpumask_clear_cpu(i, node); + + rc = cfs_cpt_set_cpu(cptab, cpt, i); + if (!rc) { + rc = -EINVAL; + goto out; + } + + if (--number == 0) + goto out; + } + cpu = cpumask_first(socket); + } + } + + out: + if (socket != NULL) + LIBCFS_FREE(socket, cpumask_size()); + if (core != NULL) + LIBCFS_FREE(core, cpumask_size()); + return rc; +} + +#define CPT_WEIGHT_MIN 4u + +static unsigned int +cfs_cpt_num_estimate(void) +{ + unsigned nnode = num_online_nodes(); + unsigned ncpu = num_online_cpus(); + unsigned ncpt; + + if (ncpu <= CPT_WEIGHT_MIN) { + ncpt = 1; + goto out; + } + + /* generate reasonable number of CPU partitions based on total number + * of CPUs, Preferred N should be power2 and match this condition: + * 2 * (N - 1)^2 < NCPUS <= 2 * N^2 */ + for (ncpt = 2; ncpu > 2 * ncpt * ncpt; ncpt <<= 1) {} + + if (ncpt <= nnode) { /* fat numa system */ + while (nnode > ncpt) + nnode >>= 1; + + } else { /* ncpt > nnode */ + while ((nnode << 1) <= ncpt) + nnode <<= 1; + } + + ncpt = nnode; + + out: +#if (BITS_PER_LONG == 32) + /* config many CPU partitions on 32-bit 
system could consume + * too much memory */ + ncpt = min(2U, ncpt); +#endif + while (ncpu % ncpt != 0) + ncpt--; /* worst case is 1 */ + + return ncpt; +} + +static struct cfs_cpt_table * +cfs_cpt_table_create(int ncpt) +{ + struct cfs_cpt_table *cptab = NULL; + cpumask_t *mask = NULL; + int cpt = 0; + int num; + int rc; + int i; + + rc = cfs_cpt_num_estimate(); + if (ncpt <= 0) + ncpt = rc; + + if (ncpt > num_online_cpus() || ncpt > 4 * rc) { + CWARN("CPU partition number %d is larger than suggested value (%d), your system may have performance issue or run out of memory while under pressure\n", + ncpt, rc); + } + + if (num_online_cpus() % ncpt != 0) { + CERROR("CPU number %d is not multiple of cpu_npartition %d, please try different cpu_npartitions value or set pattern string by cpu_pattern=STRING\n", + (int)num_online_cpus(), ncpt); + goto failed; + } + + cptab = cfs_cpt_table_alloc(ncpt); + if (cptab == NULL) { + CERROR("Failed to allocate CPU map(%d)\n", ncpt); + goto failed; + } + + num = num_online_cpus() / ncpt; + if (num == 0) { + CERROR("CPU changed while setting CPU partition\n"); + goto failed; + } + + LIBCFS_ALLOC(mask, cpumask_size()); + if (mask == NULL) { + CERROR("Failed to allocate scratch cpumask\n"); + goto failed; + } + + for_each_online_node(i) { + cfs_node_to_cpumask(i, mask); + + while (!cpumask_empty(mask)) { + struct cfs_cpu_partition *part; + int n; + + if (cpt >= ncpt) + goto failed; + + part = &cptab->ctb_parts[cpt]; + + n = num - cpumask_weight(part->cpt_cpumask); + LASSERT(n > 0); + + rc = cfs_cpt_choose_ncpus(cptab, cpt, mask, n); + if (rc < 0) + goto failed; + + LASSERT(num >= cpumask_weight(part->cpt_cpumask)); + if (num == cpumask_weight(part->cpt_cpumask)) + cpt++; + } + } + + if (cpt != ncpt || + num != cpumask_weight(cptab->ctb_parts[ncpt - 1].cpt_cpumask)) { + CERROR("Expect %d(%d) CPU partitions but got %d(%d), CPU hotplug/unplug while setting?\n", + cptab->ctb_nparts, num, cpt, + cpumask_weight(cptab->ctb_parts[ncpt - 1].cpt_cpumask)); + goto failed; + } + + LIBCFS_FREE(mask, cpumask_size()); + + return cptab; + + failed: + CERROR("Failed to setup CPU-partition-table with %d CPU-partitions, online HW nodes: %d, HW cpus: %d.\n", + ncpt, num_online_nodes(), num_online_cpus()); + + if (mask != NULL) + LIBCFS_FREE(mask, cpumask_size()); + + if (cptab != NULL) + cfs_cpt_table_free(cptab); + + return NULL; +} + +static struct cfs_cpt_table * +cfs_cpt_table_create_pattern(char *pattern) +{ + struct cfs_cpt_table *cptab; + char *str = pattern; + int node = 0; + int high; + int ncpt; + int c; + + for (ncpt = 0;; ncpt++) { /* quick scan bracket */ + str = strchr(str, '['); + if (str == NULL) + break; + str++; + } + + str = cfs_trimwhite(pattern); + if (*str == 'n' || *str == 'N') { + pattern = str + 1; + node = 1; + } + + if (ncpt == 0 || + (node && ncpt > num_online_nodes()) || + (!node && ncpt > num_online_cpus())) { + CERROR("Invalid pattern %s, or too many partitions %d\n", + pattern, ncpt); + return NULL; + } + + high = node ? 
MAX_NUMNODES - 1 : nr_cpu_ids - 1; + + cptab = cfs_cpt_table_alloc(ncpt); + if (cptab == NULL) { + CERROR("Failed to allocate cpu partition table\n"); + return NULL; + } + + for (str = cfs_trimwhite(pattern), c = 0;; c++) { + struct cfs_range_expr *range; + struct cfs_expr_list *el; + char *bracket = strchr(str, '['); + int cpt; + int rc; + int i; + int n; + + if (bracket == NULL) { + if (*str != 0) { + CERROR("Invalid pattern %s\n", str); + goto failed; + } else if (c != ncpt) { + CERROR("expect %d partitions but found %d\n", + ncpt, c); + goto failed; + } + break; + } + + if (sscanf(str, "%d%n", &cpt, &n) < 1) { + CERROR("Invalid cpu pattern %s\n", str); + goto failed; + } + + if (cpt < 0 || cpt >= ncpt) { + CERROR("Invalid partition id %d, total partitions %d\n", + cpt, ncpt); + goto failed; + } + + if (cfs_cpt_weight(cptab, cpt) != 0) { + CERROR("Partition %d has already been set.\n", cpt); + goto failed; + } + + str = cfs_trimwhite(str + n); + if (str != bracket) { + CERROR("Invalid pattern %s\n", str); + goto failed; + } + + bracket = strchr(str, ']'); + if (bracket == NULL) { + CERROR("missing right bracket for cpt %d, %s\n", + cpt, str); + goto failed; + } + + if (cfs_expr_list_parse(str, (bracket - str) + 1, + 0, high, &el) != 0) { + CERROR("Can't parse number range: %s\n", str); + goto failed; + } + + list_for_each_entry(range, &el->el_exprs, re_link) { + for (i = range->re_lo; i <= range->re_hi; i++) { + if ((i - range->re_lo) % range->re_stride != 0) + continue; + + rc = node ? cfs_cpt_set_node(cptab, cpt, i) : + cfs_cpt_set_cpu(cptab, cpt, i); + if (!rc) { + cfs_expr_list_free(el); + goto failed; + } + } + } + + cfs_expr_list_free(el); + + if (!cfs_cpt_online(cptab, cpt)) { + CERROR("No online CPU is found on partition %d\n", cpt); + goto failed; + } + + str = cfs_trimwhite(bracket + 1); + } + + return cptab; + + failed: + cfs_cpt_table_free(cptab); + return NULL; +} + +#ifdef CONFIG_HOTPLUG_CPU +static int +cfs_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + bool warn; + + switch (action) { + case CPU_DEAD: + case CPU_DEAD_FROZEN: + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + spin_lock(&cpt_data.cpt_lock); + cpt_data.cpt_version++; + spin_unlock(&cpt_data.cpt_lock); + default: + if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) { + CDEBUG(D_INFO, "CPU changed [cpu %u action %lx]\n", + cpu, action); + break; + } + + mutex_lock(&cpt_data.cpt_mutex); + /* if all HTs in a core are offline, it may break affinity */ + cfs_cpu_ht_siblings(cpu, cpt_data.cpt_cpumask); + warn = cpumask_any_and(cpt_data.cpt_cpumask, + cpu_online_mask) >= nr_cpu_ids; + mutex_unlock(&cpt_data.cpt_mutex); + CDEBUG(warn ? 
D_WARNING : D_INFO, + "Lustre: can't support CPU plug-out well now, performance and stability could be impacted [CPU %u action: %lx]\n", + cpu, action); + } + + return NOTIFY_OK; +} + +static struct notifier_block cfs_cpu_notifier = { + .notifier_call = cfs_cpu_notify, + .priority = 0 +}; + +#endif + +void +cfs_cpu_fini(void) +{ + if (cfs_cpt_table != NULL) + cfs_cpt_table_free(cfs_cpt_table); + +#ifdef CONFIG_HOTPLUG_CPU + unregister_hotcpu_notifier(&cfs_cpu_notifier); +#endif + if (cpt_data.cpt_cpumask != NULL) + LIBCFS_FREE(cpt_data.cpt_cpumask, cpumask_size()); +} + +int +cfs_cpu_init(void) +{ + LASSERT(cfs_cpt_table == NULL); + + memset(&cpt_data, 0, sizeof(cpt_data)); + + LIBCFS_ALLOC(cpt_data.cpt_cpumask, cpumask_size()); + if (cpt_data.cpt_cpumask == NULL) { + CERROR("Failed to allocate scratch buffer\n"); + return -1; + } + + spin_lock_init(&cpt_data.cpt_lock); + mutex_init(&cpt_data.cpt_mutex); + +#ifdef CONFIG_HOTPLUG_CPU + register_hotcpu_notifier(&cfs_cpu_notifier); +#endif + + if (*cpu_pattern != 0) { + cfs_cpt_table = cfs_cpt_table_create_pattern(cpu_pattern); + if (cfs_cpt_table == NULL) { + CERROR("Failed to create cptab from pattern %s\n", + cpu_pattern); + goto failed; + } + + } else { + cfs_cpt_table = cfs_cpt_table_create(cpu_npartitions); + if (cfs_cpt_table == NULL) { + CERROR("Failed to create ptable with npartitions %d\n", + cpu_npartitions); + goto failed; + } + } + + spin_lock(&cpt_data.cpt_lock); + if (cfs_cpt_table->ctb_version != cpt_data.cpt_version) { + spin_unlock(&cpt_data.cpt_lock); + CERROR("CPU hotplug/unplug during setup\n"); + goto failed; + } + spin_unlock(&cpt_data.cpt_lock); + + LCONSOLE(0, "HW CPU cores: %d, npartitions: %d\n", + num_online_cpus(), cfs_cpt_number(cfs_cpt_table)); + return 0; + + failed: + cfs_cpu_fini(); + return -1; +} + +#endif diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto-adler.c b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto-adler.c new file mode 100644 index 000000000..5e185fa59 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto-adler.c @@ -0,0 +1,141 @@ +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please visit http://www.xyratex.com/contact if you need additional + * information or have any questions. + * + * GPL HEADER END + */ + +/* + * Copyright 2012 Xyratex Technology Limited + */ + +/* + * This is crypto api shash wrappers to zlib_adler32. 
+ */ + +#include +#include +#include +#include "linux-crypto.h" + +#define CHKSUM_BLOCK_SIZE 1 +#define CHKSUM_DIGEST_SIZE 4 + +static u32 __adler32(u32 cksum, unsigned char const *p, size_t len) +{ + return zlib_adler32(cksum, p, len); +} + +static int adler32_cra_init(struct crypto_tfm *tfm) +{ + u32 *key = crypto_tfm_ctx(tfm); + + *key = 1; + + return 0; +} + +static int adler32_setkey(struct crypto_shash *hash, const u8 *key, + unsigned int keylen) +{ + u32 *mctx = crypto_shash_ctx(hash); + + if (keylen != sizeof(u32)) { + crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + *mctx = *(u32 *)key; + return 0; +} + +static int adler32_init(struct shash_desc *desc) +{ + u32 *mctx = crypto_shash_ctx(desc->tfm); + u32 *cksump = shash_desc_ctx(desc); + + *cksump = *mctx; + + return 0; +} + +static int adler32_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + u32 *cksump = shash_desc_ctx(desc); + + *cksump = __adler32(*cksump, data, len); + return 0; +} +static int __adler32_finup(u32 *cksump, const u8 *data, unsigned int len, + u8 *out) +{ + *(u32 *)out = __adler32(*cksump, data, len); + return 0; +} + +static int adler32_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __adler32_finup(shash_desc_ctx(desc), data, len, out); +} + +static int adler32_final(struct shash_desc *desc, u8 *out) +{ + u32 *cksump = shash_desc_ctx(desc); + + *(u32 *)out = *cksump; + return 0; +} + +static int adler32_digest(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __adler32_finup(crypto_shash_ctx(desc->tfm), data, len, + out); +} +static struct shash_alg alg = { + .setkey = adler32_setkey, + .init = adler32_init, + .update = adler32_update, + .final = adler32_final, + .finup = adler32_finup, + .digest = adler32_digest, + .descsize = sizeof(u32), + .digestsize = CHKSUM_DIGEST_SIZE, + .base = { + .cra_name = "adler32", + .cra_driver_name = "adler32-zlib", + .cra_priority = 100, + .cra_blocksize = CHKSUM_BLOCK_SIZE, + .cra_ctxsize = sizeof(u32), + .cra_module = THIS_MODULE, + .cra_init = adler32_cra_init, + } +}; + + +int cfs_crypto_adler32_register(void) +{ + return crypto_register_shash(&alg); +} + +void cfs_crypto_adler32_unregister(void) +{ + crypto_unregister_shash(&alg); +} diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto.c b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto.c new file mode 100644 index 000000000..aa3fffed1 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto.c @@ -0,0 +1,291 @@ +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please visit http://www.xyratex.com/contact if you need additional + * information or have any questions. 
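/*
 * Editor's illustrative sketch, not part of the original patch: once the
 * algorithm above is registered, any kernel user can reach it through the
 * generic shash API under the name "adler32" (driver "adler32-zlib").
 * demo_adler32() is a hypothetical caller.
 */
#include <crypto/hash.h>
#include <linux/slab.h>

static int demo_adler32(const void *data, unsigned int len, u32 *out)
{
        struct crypto_shash *tfm;
        struct shash_desc *desc;
        int rc;

        tfm = crypto_alloc_shash("adler32", 0, 0);
        if (IS_ERR(tfm))
                return PTR_ERR(tfm);

        desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(tfm),
                       GFP_KERNEL);
        if (desc == NULL) {
                crypto_free_shash(tfm);
                return -ENOMEM;
        }

        desc->tfm = tfm;
        desc->flags = 0;

        /* one-shot digest; the 4-byte result lands in *out */
        rc = crypto_shash_digest(desc, data, len, (u8 *)out);

        kfree(desc);
        crypto_free_shash(tfm);
        return rc;
}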
+ * + * GPL HEADER END + */ + +/* + * Copyright 2012 Xyratex Technology Limited + * + * Copyright (c) 2012, Intel Corporation. + */ + +#include +#include +#include "../../../include/linux/libcfs/libcfs.h" +#include "linux-crypto.h" +/** + * Array of hash algorithm speed in MByte per second + */ +static int cfs_crypto_hash_speeds[CFS_HASH_ALG_MAX]; + + + +static int cfs_crypto_hash_alloc(unsigned char alg_id, + const struct cfs_crypto_hash_type **type, + struct hash_desc *desc, unsigned char *key, + unsigned int key_len) +{ + int err = 0; + + *type = cfs_crypto_hash_type(alg_id); + + if (*type == NULL) { + CWARN("Unsupported hash algorithm id = %d, max id is %d\n", + alg_id, CFS_HASH_ALG_MAX); + return -EINVAL; + } + desc->tfm = crypto_alloc_hash((*type)->cht_name, 0, 0); + + if (desc->tfm == NULL) + return -EINVAL; + + if (IS_ERR(desc->tfm)) { + CDEBUG(D_INFO, "Failed to alloc crypto hash %s\n", + (*type)->cht_name); + return PTR_ERR(desc->tfm); + } + + desc->flags = 0; + + /** Shash have different logic for initialization then digest + * shash: crypto_hash_setkey, crypto_hash_init + * digest: crypto_digest_init, crypto_digest_setkey + * Skip this function for digest, because we use shash logic at + * cfs_crypto_hash_alloc. + */ + if (key != NULL) { + err = crypto_hash_setkey(desc->tfm, key, key_len); + } else if ((*type)->cht_key != 0) { + err = crypto_hash_setkey(desc->tfm, + (unsigned char *)&((*type)->cht_key), + (*type)->cht_size); + } + + if (err != 0) { + crypto_free_hash(desc->tfm); + return err; + } + + CDEBUG(D_INFO, "Using crypto hash: %s (%s) speed %d MB/s\n", + (crypto_hash_tfm(desc->tfm))->__crt_alg->cra_name, + (crypto_hash_tfm(desc->tfm))->__crt_alg->cra_driver_name, + cfs_crypto_hash_speeds[alg_id]); + + return crypto_hash_init(desc); +} + +int cfs_crypto_hash_digest(unsigned char alg_id, + const void *buf, unsigned int buf_len, + unsigned char *key, unsigned int key_len, + unsigned char *hash, unsigned int *hash_len) +{ + struct scatterlist sl; + struct hash_desc hdesc; + int err; + const struct cfs_crypto_hash_type *type; + + if (buf == NULL || buf_len == 0 || hash_len == NULL) + return -EINVAL; + + err = cfs_crypto_hash_alloc(alg_id, &type, &hdesc, key, key_len); + if (err != 0) + return err; + + if (hash == NULL || *hash_len < type->cht_size) { + *hash_len = type->cht_size; + crypto_free_hash(hdesc.tfm); + return -ENOSPC; + } + sg_init_one(&sl, (void *)buf, buf_len); + + hdesc.flags = 0; + err = crypto_hash_digest(&hdesc, &sl, sl.length, hash); + crypto_free_hash(hdesc.tfm); + + return err; +} +EXPORT_SYMBOL(cfs_crypto_hash_digest); + +struct cfs_crypto_hash_desc * + cfs_crypto_hash_init(unsigned char alg_id, + unsigned char *key, unsigned int key_len) +{ + + struct hash_desc *hdesc; + int err; + const struct cfs_crypto_hash_type *type; + + hdesc = kmalloc(sizeof(*hdesc), 0); + if (hdesc == NULL) + return ERR_PTR(-ENOMEM); + + err = cfs_crypto_hash_alloc(alg_id, &type, hdesc, key, key_len); + + if (err) { + kfree(hdesc); + return ERR_PTR(err); + } + return (struct cfs_crypto_hash_desc *)hdesc; +} +EXPORT_SYMBOL(cfs_crypto_hash_init); + +int cfs_crypto_hash_update_page(struct cfs_crypto_hash_desc *hdesc, + struct page *page, unsigned int offset, + unsigned int len) +{ + struct scatterlist sl; + + sg_init_table(&sl, 1); + sg_set_page(&sl, page, len, offset & ~CFS_PAGE_MASK); + + return crypto_hash_update((struct hash_desc *)hdesc, &sl, sl.length); +} +EXPORT_SYMBOL(cfs_crypto_hash_update_page); + +int cfs_crypto_hash_update(struct cfs_crypto_hash_desc *hdesc, + const 
void *buf, unsigned int buf_len) +{ + struct scatterlist sl; + + sg_init_one(&sl, (void *)buf, buf_len); + + return crypto_hash_update((struct hash_desc *)hdesc, &sl, sl.length); +} +EXPORT_SYMBOL(cfs_crypto_hash_update); + +/* If hash_len pointer is NULL - destroy descriptor. */ +int cfs_crypto_hash_final(struct cfs_crypto_hash_desc *hdesc, + unsigned char *hash, unsigned int *hash_len) +{ + int err; + int size = crypto_hash_digestsize(((struct hash_desc *)hdesc)->tfm); + + if (hash_len == NULL) { + crypto_free_hash(((struct hash_desc *)hdesc)->tfm); + kfree(hdesc); + return 0; + } + if (hash == NULL || *hash_len < size) { + *hash_len = size; + return -ENOSPC; + } + err = crypto_hash_final((struct hash_desc *) hdesc, hash); + + if (err < 0) { + /* May be caller can fix error */ + return err; + } + crypto_free_hash(((struct hash_desc *)hdesc)->tfm); + kfree(hdesc); + return err; +} +EXPORT_SYMBOL(cfs_crypto_hash_final); + +static void cfs_crypto_performance_test(unsigned char alg_id, + const unsigned char *buf, + unsigned int buf_len) +{ + unsigned long start, end; + int bcount, err = 0; + int sec = 1; /* do test only 1 sec */ + unsigned char hash[64]; + unsigned int hash_len = 64; + + for (start = jiffies, end = start + sec * HZ, bcount = 0; + time_before(jiffies, end); bcount++) { + err = cfs_crypto_hash_digest(alg_id, buf, buf_len, NULL, 0, + hash, &hash_len); + if (err) + break; + + } + end = jiffies; + + if (err) { + cfs_crypto_hash_speeds[alg_id] = -1; + CDEBUG(D_INFO, "Crypto hash algorithm %s, err = %d\n", + cfs_crypto_hash_name(alg_id), err); + } else { + unsigned long tmp; + tmp = ((bcount * buf_len / jiffies_to_msecs(end - start)) * + 1000) / (1024 * 1024); + cfs_crypto_hash_speeds[alg_id] = (int)tmp; + } + CDEBUG(D_INFO, "Crypto hash algorithm %s speed = %d MB/s\n", + cfs_crypto_hash_name(alg_id), cfs_crypto_hash_speeds[alg_id]); +} + +int cfs_crypto_hash_speed(unsigned char hash_alg) +{ + if (hash_alg < CFS_HASH_ALG_MAX) + return cfs_crypto_hash_speeds[hash_alg]; + else + return -1; +} +EXPORT_SYMBOL(cfs_crypto_hash_speed); + +/** + * Do performance test for all hash algorithms. + */ +static int cfs_crypto_test_hashes(void) +{ + unsigned char i; + unsigned char *data; + unsigned int j; + /* Data block size for testing hash. Maximum + * kmalloc size for 2.6.18 kernel is 128K */ + unsigned int data_len = 1 * 128 * 1024; + + data = kmalloc(data_len, 0); + if (data == NULL) { + CERROR("Failed to allocate mem\n"); + return -ENOMEM; + } + + for (j = 0; j < data_len; j++) + data[j] = j & 0xff; + + for (i = 0; i < CFS_HASH_ALG_MAX; i++) + cfs_crypto_performance_test(i, data, data_len); + + kfree(data); + return 0; +} + +static int adler32; + +int cfs_crypto_register(void) +{ + request_module("crc32c"); + + adler32 = cfs_crypto_adler32_register(); + + /* check all algorithms and do performance test */ + cfs_crypto_test_hashes(); + return 0; +} +void cfs_crypto_unregister(void) +{ + if (adler32 == 0) + cfs_crypto_adler32_unregister(); + + return; +} diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto.h b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto.h new file mode 100644 index 000000000..18e8cd4d8 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto.h @@ -0,0 +1,29 @@ + /* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
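/*
 * Editor's illustrative sketch, not part of the original patch: one-shot
 * and streaming use of the wrappers above.  CFS_HASH_ALG_ADLER32 is
 * assumed to be one of the algorithm ids declared in libcfs_crypto.h;
 * demo_cfs_hash() is a hypothetical caller.
 */
static int demo_cfs_hash(const void *buf, unsigned int len)
{
        struct cfs_crypto_hash_desc *desc;
        unsigned char digest[4];
        unsigned int digest_len = sizeof(digest);
        int rc;

        /* one-shot: allocate, hash and free in a single call */
        rc = cfs_crypto_hash_digest(CFS_HASH_ALG_ADLER32, buf, len,
                                    NULL, 0, digest, &digest_len);
        if (rc != 0)
                return rc;

        /* streaming: init / update / final */
        desc = cfs_crypto_hash_init(CFS_HASH_ALG_ADLER32, NULL, 0);
        if (IS_ERR(desc))
                return PTR_ERR(desc);

        rc = cfs_crypto_hash_update(desc, buf, len);
        if (rc == 0)
                rc = cfs_crypto_hash_final(desc, digest, &digest_len);
        else
                cfs_crypto_hash_final(desc, NULL, NULL); /* just free it */

        return rc;
}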
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please visit http://www.xyratex.com/contact if you need additional + * information or have any questions. + * + * GPL HEADER END + */ + +/** + * Functions for start/stop shash adler32 algorithm. + */ +int cfs_crypto_adler32_register(void); +void cfs_crypto_adler32_unregister(void); diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-curproc.c b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-curproc.c new file mode 100644 index 000000000..277f6b890 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-curproc.c @@ -0,0 +1,111 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/libcfs/linux/linux-curproc.c + * + * Lustre curproc API implementation for Linux kernel + * + * Author: Nikita Danilov + */ + +#include +#include + +#include +#include + +#define DEBUG_SUBSYSTEM S_LNET + +#include "../../../include/linux/libcfs/libcfs.h" + +/* + * Implementation of cfs_curproc API (see portals/include/libcfs/curproc.h) + * for Linux kernel. 
+ */ + +void cfs_cap_raise(cfs_cap_t cap) +{ + struct cred *cred; + + cred = prepare_creds(); + if (cred) { + cap_raise(cred->cap_effective, cap); + commit_creds(cred); + } +} + +void cfs_cap_lower(cfs_cap_t cap) +{ + struct cred *cred; + + cred = prepare_creds(); + if (cred) { + cap_lower(cred->cap_effective, cap); + commit_creds(cred); + } +} + +int cfs_cap_raised(cfs_cap_t cap) +{ + return cap_raised(current_cap(), cap); +} + +static void cfs_kernel_cap_pack(kernel_cap_t kcap, cfs_cap_t *cap) +{ + /* XXX lost high byte */ + *cap = kcap.cap[0]; +} + +cfs_cap_t cfs_curproc_cap_pack(void) +{ + cfs_cap_t cap; + cfs_kernel_cap_pack(current_cap(), &cap); + return cap; +} + +EXPORT_SYMBOL(cfs_cap_raise); +EXPORT_SYMBOL(cfs_cap_lower); +EXPORT_SYMBOL(cfs_cap_raised); +EXPORT_SYMBOL(cfs_curproc_cap_pack); + +/* + * Local variables: + * c-indentation-style: "K&R" + * c-basic-offset: 8 + * tab-width: 8 + * fill-column: 80 + * scroll-step: 1 + * End: + */ diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-debug.c b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-debug.c new file mode 100644 index 000000000..4545d54f7 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-debug.c @@ -0,0 +1,200 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/libcfs/linux/linux-debug.c + * + * Author: Phil Schwan + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +# define DEBUG_SUBSYSTEM S_LNET + +#include "../../../include/linux/libcfs/libcfs.h" + +#include "../tracefile.h" + +#include + +char lnet_upcall[1024] = "/usr/lib/lustre/lnet_upcall"; +char lnet_debug_log_upcall[1024] = "/usr/lib/lustre/lnet_debug_log_upcall"; + +/** + * Upcall function once a Lustre log has been dumped. 
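/*
 * Editor's illustrative sketch, not part of the original patch:
 * temporarily granting the current task an effective capability around a
 * privileged operation, using the helpers above.  demo_caps() is a
 * hypothetical caller.
 */
static void demo_caps(void)
{
        cfs_cap_t saved = cfs_curproc_cap_pack();

        if (!cfs_cap_raised(CAP_SYS_RESOURCE))
                cfs_cap_raise(CAP_SYS_RESOURCE);

        /* ... privileged work ... */

        /* drop the capability again only if we did not have it before */
        if (!(saved & (1 << CAP_SYS_RESOURCE)))
                cfs_cap_lower(CAP_SYS_RESOURCE);
}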
+ * + * \param file path of the dumped log + */ +void libcfs_run_debug_log_upcall(char *file) +{ + char *argv[3]; + int rc; + char *envp[] = { + "HOME=/", + "PATH=/sbin:/bin:/usr/sbin:/usr/bin", + NULL}; + + argv[0] = lnet_debug_log_upcall; + + LASSERTF(file != NULL, "called on a null filename\n"); + argv[1] = file; /* only need to pass the path of the file */ + + argv[2] = NULL; + + rc = call_usermodehelper(argv[0], argv, envp, 1); + if (rc < 0 && rc != -ENOENT) { + CERROR("Error %d invoking LNET debug log upcall %s %s; check /proc/sys/lnet/debug_log_upcall\n", + rc, argv[0], argv[1]); + } else { + CDEBUG(D_HA, "Invoked LNET debug log upcall %s %s\n", + argv[0], argv[1]); + } +} + +void libcfs_run_upcall(char **argv) +{ + int rc; + int argc; + char *envp[] = { + "HOME=/", + "PATH=/sbin:/bin:/usr/sbin:/usr/bin", + NULL}; + + argv[0] = lnet_upcall; + argc = 1; + while (argv[argc] != NULL) + argc++; + + LASSERT(argc >= 2); + + rc = call_usermodehelper(argv[0], argv, envp, 1); + if (rc < 0 && rc != -ENOENT) { + CERROR("Error %d invoking LNET upcall %s %s%s%s%s%s%s%s%s; check /proc/sys/lnet/upcall\n", + rc, argv[0], argv[1], + argc < 3 ? "" : ",", argc < 3 ? "" : argv[2], + argc < 4 ? "" : ",", argc < 4 ? "" : argv[3], + argc < 5 ? "" : ",", argc < 5 ? "" : argv[4], + argc < 6 ? "" : ",..."); + } else { + CDEBUG(D_HA, "Invoked LNET upcall %s %s%s%s%s%s%s%s%s\n", + argv[0], argv[1], + argc < 3 ? "" : ",", argc < 3 ? "" : argv[2], + argc < 4 ? "" : ",", argc < 4 ? "" : argv[3], + argc < 5 ? "" : ",", argc < 5 ? "" : argv[4], + argc < 6 ? "" : ",..."); + } +} + +void libcfs_run_lbug_upcall(struct libcfs_debug_msg_data *msgdata) +{ + char *argv[6]; + char buf[32]; + + snprintf(buf, sizeof(buf), "%d", msgdata->msg_line); + + argv[1] = "LBUG"; + argv[2] = (char *)msgdata->msg_file; + argv[3] = (char *)msgdata->msg_fn; + argv[4] = buf; + argv[5] = NULL; + + libcfs_run_upcall (argv); +} + +/* coverity[+kill] */ +void lbug_with_loc(struct libcfs_debug_msg_data *msgdata) +{ + libcfs_catastrophe = 1; + libcfs_debug_msg(msgdata, "LBUG\n"); + + if (in_interrupt()) { + panic("LBUG in interrupt.\n"); + /* not reached */ + } + + dump_stack(); + if (!libcfs_panic_on_lbug) + libcfs_debug_dumplog(); + libcfs_run_lbug_upcall(msgdata); + if (libcfs_panic_on_lbug) + panic("LBUG"); + set_task_state(current, TASK_UNINTERRUPTIBLE); + while (1) + schedule(); +} + +static int panic_notifier(struct notifier_block *self, unsigned long unused1, + void *unused2) +{ + if (libcfs_panic_in_progress) + return 0; + + libcfs_panic_in_progress = 1; + mb(); + + return 0; +} + +static struct notifier_block libcfs_panic_notifier = { + .notifier_call = panic_notifier, + .next = NULL, + .priority = 10000, +}; + +void libcfs_register_panic_notifier(void) +{ + atomic_notifier_chain_register(&panic_notifier_list, &libcfs_panic_notifier); +} + +void libcfs_unregister_panic_notifier(void) +{ + atomic_notifier_chain_unregister(&panic_notifier_list, &libcfs_panic_notifier); +} + +EXPORT_SYMBOL(libcfs_run_upcall); +EXPORT_SYMBOL(libcfs_run_lbug_upcall); +EXPORT_SYMBOL(lbug_with_loc); diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-module.c b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-module.c new file mode 100644 index 000000000..e962f8968 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-module.c @@ -0,0 +1,183 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
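/*
 * Editor's illustrative sketch, not part of the original patch: callers of
 * libcfs_run_upcall() leave argv[0] free (it is overwritten with
 * lnet_upcall) and terminate the vector with NULL, exactly as
 * libcfs_run_lbug_upcall() does above.  demo_run_upcall() and its
 * arguments are hypothetical.
 */
static void demo_run_upcall(void)
{
        char *argv[4];

        argv[1] = "TEST";               /* upcall reason */
        argv[2] = "extra-argument";
        argv[3] = NULL;

        libcfs_run_upcall(argv);        /* argv[0] is filled in by the callee */
}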
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include "../../../include/linux/libcfs/libcfs.h" + +#define LNET_MINOR 240 + +int libcfs_ioctl_getdata(char *buf, char *end, void *arg) +{ + struct libcfs_ioctl_hdr *hdr; + struct libcfs_ioctl_data *data; + int orig_len; + + hdr = (struct libcfs_ioctl_hdr *)buf; + data = (struct libcfs_ioctl_data *)buf; + + if (copy_from_user(buf, (void *)arg, sizeof(*hdr))) + return -EFAULT; + + if (hdr->ioc_version != LIBCFS_IOCTL_VERSION) { + CERROR("PORTALS: version mismatch kernel vs application\n"); + return -EINVAL; + } + + if (hdr->ioc_len >= end - buf) { + CERROR("PORTALS: user buffer exceeds kernel buffer\n"); + return -EINVAL; + } + + + if (hdr->ioc_len < sizeof(struct libcfs_ioctl_data)) { + CERROR("PORTALS: user buffer too small for ioctl\n"); + return -EINVAL; + } + + orig_len = hdr->ioc_len; + if (copy_from_user(buf, (void *)arg, hdr->ioc_len)) + return -EFAULT; + if (orig_len != data->ioc_len) + return -EINVAL; + + if (libcfs_ioctl_is_invalid(data)) { + CERROR("PORTALS: ioctl not correctly formatted\n"); + return -EINVAL; + } + + if (data->ioc_inllen1) + data->ioc_inlbuf1 = &data->ioc_bulk[0]; + + if (data->ioc_inllen2) + data->ioc_inlbuf2 = &data->ioc_bulk[0] + + cfs_size_round(data->ioc_inllen1); + + return 0; +} + +int libcfs_ioctl_popdata(void *arg, void *data, int size) +{ + if (copy_to_user((char *)arg, data, size)) + return -EFAULT; + return 0; +} + +extern struct cfs_psdev_ops libcfs_psdev_ops; + +static int +libcfs_psdev_open(struct inode *inode, struct file *file) +{ + struct libcfs_device_userstate **pdu = NULL; + int rc = 0; + + if (!inode) + return -EINVAL; + pdu = (struct libcfs_device_userstate **)&file->private_data; + if (libcfs_psdev_ops.p_open != NULL) + rc = libcfs_psdev_ops.p_open(0, (void *)pdu); + else + return -EPERM; + return rc; +} + +/* called when closing /dev/device */ +static int +libcfs_psdev_release(struct inode *inode, struct file *file) +{ + struct libcfs_device_userstate *pdu; + int rc = 0; + + if (!inode) + return -EINVAL; + pdu = file->private_data; + if (libcfs_psdev_ops.p_close != NULL) + rc = libcfs_psdev_ops.p_close(0, (void *)pdu); + else + rc = -EPERM; + return rc; +} + +static long libcfs_ioctl(struct file *file, + unsigned int cmd, unsigned long arg) +{ + struct cfs_psdev_file pfile; + int rc = 0; + + if 
(!capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (_IOC_TYPE(cmd) != IOC_LIBCFS_TYPE || + _IOC_NR(cmd) < IOC_LIBCFS_MIN_NR || + _IOC_NR(cmd) > IOC_LIBCFS_MAX_NR) { + CDEBUG(D_IOCTL, "invalid ioctl ( type %d, nr %d, size %d )\n", + _IOC_TYPE(cmd), _IOC_NR(cmd), _IOC_SIZE(cmd)); + return -EINVAL; + } + + /* Handle platform-dependent IOC requests */ + switch (cmd) { + case IOC_LIBCFS_PANIC: + if (!capable(CFS_CAP_SYS_BOOT)) + return -EPERM; + panic("debugctl-invoked panic"); + return 0; + case IOC_LIBCFS_MEMHOG: + if (!capable(CFS_CAP_SYS_ADMIN)) + return -EPERM; + /* go thought */ + } + + pfile.off = 0; + pfile.private_data = file->private_data; + if (libcfs_psdev_ops.p_ioctl != NULL) + rc = libcfs_psdev_ops.p_ioctl(&pfile, cmd, (void *)arg); + else + rc = -EPERM; + return rc; +} + +static const struct file_operations libcfs_fops = { + .unlocked_ioctl = libcfs_ioctl, + .open = libcfs_psdev_open, + .release = libcfs_psdev_release, +}; + +struct miscdevice libcfs_dev = { + .minor = LNET_MINOR, + .name = "lnet", + .fops = &libcfs_fops, +}; diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-prim.c b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-prim.c new file mode 100644 index 000000000..838f5f3bd --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-prim.c @@ -0,0 +1,217 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LNET +#include +#include +#include +#include + +#include "../../../include/linux/libcfs/libcfs.h" + +#if defined(CONFIG_KGDB) +#include +#endif + +/** + * wait_queue_t of Linux (version < 2.6.34) is a FIFO list for exclusively + * waiting threads, which is not always desirable because all threads will + * be waken up again and again, even user only needs a few of them to be + * active most time. This is not good for performance because cache can + * be polluted by different threads. + * + * LIFO list can resolve this problem because we always wakeup the most + * recent active thread by default. + * + * NB: please don't call non-exclusive & exclusive wait on the same + * waitq if add_wait_queue_exclusive_head is used. 
+ */ +void +add_wait_queue_exclusive_head(wait_queue_head_t *waitq, wait_queue_t *link) +{ + unsigned long flags; + + spin_lock_irqsave(&waitq->lock, flags); + __add_wait_queue_exclusive(waitq, link); + spin_unlock_irqrestore(&waitq->lock, flags); +} +EXPORT_SYMBOL(add_wait_queue_exclusive_head); + +void cfs_init_timer(struct timer_list *t) +{ + init_timer(t); +} +EXPORT_SYMBOL(cfs_init_timer); + +void cfs_timer_init(struct timer_list *t, cfs_timer_func_t *func, void *arg) +{ + init_timer(t); + t->function = func; + t->data = (unsigned long)arg; +} +EXPORT_SYMBOL(cfs_timer_init); + +void cfs_timer_done(struct timer_list *t) +{ + return; +} +EXPORT_SYMBOL(cfs_timer_done); + +void cfs_timer_arm(struct timer_list *t, unsigned long deadline) +{ + mod_timer(t, deadline); +} +EXPORT_SYMBOL(cfs_timer_arm); + +void cfs_timer_disarm(struct timer_list *t) +{ + del_timer(t); +} +EXPORT_SYMBOL(cfs_timer_disarm); + +int cfs_timer_is_armed(struct timer_list *t) +{ + return timer_pending(t); +} +EXPORT_SYMBOL(cfs_timer_is_armed); + +unsigned long cfs_timer_deadline(struct timer_list *t) +{ + return t->expires; +} +EXPORT_SYMBOL(cfs_timer_deadline); + +void cfs_enter_debugger(void) +{ +#if defined(CONFIG_KGDB) + /* BREAKPOINT(); */ +#else + /* nothing */ +#endif +} +EXPORT_SYMBOL(cfs_enter_debugger); + + +sigset_t +cfs_block_allsigs(void) +{ + unsigned long flags; + sigset_t old; + + spin_lock_irqsave(¤t->sighand->siglock, flags); + old = current->blocked; + sigfillset(¤t->blocked); + recalc_sigpending(); + spin_unlock_irqrestore(¤t->sighand->siglock, flags); + + return old; +} +EXPORT_SYMBOL(cfs_block_allsigs); + +sigset_t cfs_block_sigs(unsigned long sigs) +{ + unsigned long flags; + sigset_t old; + + spin_lock_irqsave(¤t->sighand->siglock, flags); + old = current->blocked; + sigaddsetmask(¤t->blocked, sigs); + recalc_sigpending(); + spin_unlock_irqrestore(¤t->sighand->siglock, flags); + return old; +} +EXPORT_SYMBOL(cfs_block_sigs); + +/* Block all signals except for the @sigs */ +sigset_t cfs_block_sigsinv(unsigned long sigs) +{ + unsigned long flags; + sigset_t old; + + spin_lock_irqsave(¤t->sighand->siglock, flags); + old = current->blocked; + sigaddsetmask(¤t->blocked, ~sigs); + recalc_sigpending(); + spin_unlock_irqrestore(¤t->sighand->siglock, flags); + + return old; +} +EXPORT_SYMBOL(cfs_block_sigsinv); + +void +cfs_restore_sigs(sigset_t old) +{ + unsigned long flags; + + spin_lock_irqsave(¤t->sighand->siglock, flags); + current->blocked = old; + recalc_sigpending(); + spin_unlock_irqrestore(¤t->sighand->siglock, flags); +} +EXPORT_SYMBOL(cfs_restore_sigs); + +int +cfs_signal_pending(void) +{ + return signal_pending(current); +} +EXPORT_SYMBOL(cfs_signal_pending); + +void +cfs_clear_sigpending(void) +{ + unsigned long flags; + + spin_lock_irqsave(¤t->sighand->siglock, flags); + clear_tsk_thread_flag(current, TIF_SIGPENDING); + spin_unlock_irqrestore(¤t->sighand->siglock, flags); +} +EXPORT_SYMBOL(cfs_clear_sigpending); + +int +libcfs_arch_init(void) +{ + return 0; +} +EXPORT_SYMBOL(libcfs_arch_init); + +void +libcfs_arch_cleanup(void) +{ + return; +} +EXPORT_SYMBOL(libcfs_arch_cleanup); + diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-tcpip.c b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-tcpip.c new file mode 100644 index 000000000..f2462e7f0 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-tcpip.c @@ -0,0 +1,623 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +#define DEBUG_SUBSYSTEM S_LNET + +#include "../../../include/linux/libcfs/libcfs.h" + +#include +#include +#include +/* For sys_open & sys_close */ +#include + +static int +libcfs_sock_ioctl(int cmd, unsigned long arg) +{ + mm_segment_t oldmm = get_fs(); + struct socket *sock; + int rc; + struct file *sock_filp; + + rc = sock_create (PF_INET, SOCK_STREAM, 0, &sock); + if (rc != 0) { + CERROR ("Can't create socket: %d\n", rc); + return rc; + } + + sock_filp = sock_alloc_file(sock, 0, NULL); + if (IS_ERR(sock_filp)) { + sock_release(sock); + rc = PTR_ERR(sock_filp); + goto out; + } + + set_fs(KERNEL_DS); + if (sock_filp->f_op->unlocked_ioctl) + rc = sock_filp->f_op->unlocked_ioctl(sock_filp, cmd, arg); + set_fs(oldmm); + + fput(sock_filp); +out: + return rc; +} + +int +libcfs_ipif_query (char *name, int *up, __u32 *ip, __u32 *mask) +{ + struct ifreq ifr; + int nob; + int rc; + __u32 val; + + nob = strnlen(name, IFNAMSIZ); + if (nob == IFNAMSIZ) { + CERROR("Interface name %s too long\n", name); + return -EINVAL; + } + + CLASSERT (sizeof(ifr.ifr_name) >= IFNAMSIZ); + + strcpy(ifr.ifr_name, name); + rc = libcfs_sock_ioctl(SIOCGIFFLAGS, (unsigned long)&ifr); + + if (rc != 0) { + CERROR("Can't get flags for interface %s\n", name); + return rc; + } + + if ((ifr.ifr_flags & IFF_UP) == 0) { + CDEBUG(D_NET, "Interface %s down\n", name); + *up = 0; + *ip = *mask = 0; + return 0; + } + + *up = 1; + + strcpy(ifr.ifr_name, name); + ifr.ifr_addr.sa_family = AF_INET; + rc = libcfs_sock_ioctl(SIOCGIFADDR, (unsigned long)&ifr); + + if (rc != 0) { + CERROR("Can't get IP address for interface %s\n", name); + return rc; + } + + val = ((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr.s_addr; + *ip = ntohl(val); + + strcpy(ifr.ifr_name, name); + ifr.ifr_addr.sa_family = AF_INET; + rc = libcfs_sock_ioctl(SIOCGIFNETMASK, (unsigned long)&ifr); + + if (rc != 0) { + CERROR("Can't get netmask for interface %s\n", name); + return rc; + } + + val = ((struct sockaddr_in *)&ifr.ifr_netmask)->sin_addr.s_addr; + *mask = ntohl(val); + + return 0; +} + +EXPORT_SYMBOL(libcfs_ipif_query); + +int +libcfs_ipif_enumerate (char ***namesp) +{ + /* Allocate and fill in 'names', returning # interfaces/error */ + char **names; + int toobig; + int nalloc; + int nfound; + struct ifreq *ifr; + struct ifconf ifc; + int rc; + int nob; + int i; 
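+	/*
+	 * Strategy of the loop below: start with room for 16 struct ifreq,
+	 * issue SIOCGIFCONF, and double the buffer until the kernel returns
+	 * fewer entries than were allocated.  The buffer is capped at one
+	 * page, so only the first page worth of interfaces is enumerated
+	 * when a host has more than that.
+	 */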
+ + + nalloc = 16; /* first guess at max interfaces */ + toobig = 0; + for (;;) { + if (nalloc * sizeof(*ifr) > PAGE_CACHE_SIZE) { + toobig = 1; + nalloc = PAGE_CACHE_SIZE/sizeof(*ifr); + CWARN("Too many interfaces: only enumerating first %d\n", + nalloc); + } + + LIBCFS_ALLOC(ifr, nalloc * sizeof(*ifr)); + if (ifr == NULL) { + CERROR ("ENOMEM enumerating up to %d interfaces\n", nalloc); + rc = -ENOMEM; + goto out0; + } + + ifc.ifc_buf = (char *)ifr; + ifc.ifc_len = nalloc * sizeof(*ifr); + + rc = libcfs_sock_ioctl(SIOCGIFCONF, (unsigned long)&ifc); + + if (rc < 0) { + CERROR ("Error %d enumerating interfaces\n", rc); + goto out1; + } + + LASSERT (rc == 0); + + nfound = ifc.ifc_len/sizeof(*ifr); + LASSERT (nfound <= nalloc); + + if (nfound < nalloc || toobig) + break; + + LIBCFS_FREE(ifr, nalloc * sizeof(*ifr)); + nalloc *= 2; + } + + if (nfound == 0) + goto out1; + + LIBCFS_ALLOC(names, nfound * sizeof(*names)); + if (names == NULL) { + rc = -ENOMEM; + goto out1; + } + + for (i = 0; i < nfound; i++) { + + nob = strnlen (ifr[i].ifr_name, IFNAMSIZ); + if (nob == IFNAMSIZ) { + /* no space for terminating NULL */ + CERROR("interface name %.*s too long (%d max)\n", + nob, ifr[i].ifr_name, IFNAMSIZ); + rc = -ENAMETOOLONG; + goto out2; + } + + LIBCFS_ALLOC(names[i], IFNAMSIZ); + if (names[i] == NULL) { + rc = -ENOMEM; + goto out2; + } + + memcpy(names[i], ifr[i].ifr_name, nob); + names[i][nob] = 0; + } + + *namesp = names; + rc = nfound; + + out2: + if (rc < 0) + libcfs_ipif_free_enumeration(names, nfound); + out1: + LIBCFS_FREE(ifr, nalloc * sizeof(*ifr)); + out0: + return rc; +} + +EXPORT_SYMBOL(libcfs_ipif_enumerate); + +void +libcfs_ipif_free_enumeration (char **names, int n) +{ + int i; + + LASSERT (n > 0); + + for (i = 0; i < n && names[i] != NULL; i++) + LIBCFS_FREE(names[i], IFNAMSIZ); + + LIBCFS_FREE(names, n * sizeof(*names)); +} + +EXPORT_SYMBOL(libcfs_ipif_free_enumeration); + +int +libcfs_sock_write (struct socket *sock, void *buffer, int nob, int timeout) +{ + int rc; + long ticks = timeout * HZ; + unsigned long then; + struct timeval tv; + + LASSERT (nob > 0); + /* Caller may pass a zero timeout if she thinks the socket buffer is + * empty enough to take the whole message immediately */ + + for (;;) { + struct kvec iov = { + .iov_base = buffer, + .iov_len = nob + }; + struct msghdr msg = { + .msg_flags = (timeout == 0) ? 
MSG_DONTWAIT : 0 + }; + + if (timeout != 0) { + /* Set send timeout to remaining time */ + tv = (struct timeval) { + .tv_sec = ticks / HZ, + .tv_usec = ((ticks % HZ) * 1000000) / HZ + }; + rc = kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, + (char *)&tv, sizeof(tv)); + if (rc != 0) { + CERROR("Can't set socket send timeout %ld.%06d: %d\n", + (long)tv.tv_sec, (int)tv.tv_usec, rc); + return rc; + } + } + + then = jiffies; + rc = kernel_sendmsg(sock, &msg, &iov, 1, nob); + ticks -= jiffies - then; + + if (rc == nob) + return 0; + + if (rc < 0) + return rc; + + if (rc == 0) { + CERROR ("Unexpected zero rc\n"); + return -ECONNABORTED; + } + + if (ticks <= 0) + return -EAGAIN; + + buffer = ((char *)buffer) + rc; + nob -= rc; + } + + return 0; +} +EXPORT_SYMBOL(libcfs_sock_write); + +int +libcfs_sock_read (struct socket *sock, void *buffer, int nob, int timeout) +{ + int rc; + long ticks = timeout * HZ; + unsigned long then; + struct timeval tv; + + LASSERT (nob > 0); + LASSERT (ticks > 0); + + for (;;) { + struct kvec iov = { + .iov_base = buffer, + .iov_len = nob + }; + struct msghdr msg = { + .msg_flags = 0 + }; + + /* Set receive timeout to remaining time */ + tv = (struct timeval) { + .tv_sec = ticks / HZ, + .tv_usec = ((ticks % HZ) * 1000000) / HZ + }; + rc = kernel_setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, + (char *)&tv, sizeof(tv)); + if (rc != 0) { + CERROR("Can't set socket recv timeout %ld.%06d: %d\n", + (long)tv.tv_sec, (int)tv.tv_usec, rc); + return rc; + } + + then = jiffies; + rc = kernel_recvmsg(sock, &msg, &iov, 1, nob, 0); + ticks -= jiffies - then; + + if (rc < 0) + return rc; + + if (rc == 0) + return -ECONNRESET; + + buffer = ((char *)buffer) + rc; + nob -= rc; + + if (nob == 0) + return 0; + + if (ticks <= 0) + return -ETIMEDOUT; + } +} + +EXPORT_SYMBOL(libcfs_sock_read); + +static int +libcfs_sock_create (struct socket **sockp, int *fatal, + __u32 local_ip, int local_port) +{ + struct sockaddr_in locaddr; + struct socket *sock; + int rc; + int option; + + /* All errors are fatal except bind failure if the port is in use */ + *fatal = 1; + + rc = sock_create (PF_INET, SOCK_STREAM, 0, &sock); + *sockp = sock; + if (rc != 0) { + CERROR ("Can't create socket: %d\n", rc); + return rc; + } + + option = 1; + rc = kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, + (char *)&option, sizeof (option)); + if (rc != 0) { + CERROR("Can't set SO_REUSEADDR for socket: %d\n", rc); + goto failed; + } + + if (local_ip != 0 || local_port != 0) { + memset(&locaddr, 0, sizeof(locaddr)); + locaddr.sin_family = AF_INET; + locaddr.sin_port = htons(local_port); + locaddr.sin_addr.s_addr = (local_ip == 0) ? 
+ INADDR_ANY : htonl(local_ip); + + rc = sock->ops->bind(sock, (struct sockaddr *)&locaddr, + sizeof(locaddr)); + if (rc == -EADDRINUSE) { + CDEBUG(D_NET, "Port %d already in use\n", local_port); + *fatal = 0; + goto failed; + } + if (rc != 0) { + CERROR("Error trying to bind to port %d: %d\n", + local_port, rc); + goto failed; + } + } + + return 0; + + failed: + sock_release(sock); + return rc; +} + +int +libcfs_sock_setbuf (struct socket *sock, int txbufsize, int rxbufsize) +{ + int option; + int rc; + + if (txbufsize != 0) { + option = txbufsize; + rc = kernel_setsockopt(sock, SOL_SOCKET, SO_SNDBUF, + (char *)&option, sizeof (option)); + if (rc != 0) { + CERROR ("Can't set send buffer %d: %d\n", + option, rc); + return rc; + } + } + + if (rxbufsize != 0) { + option = rxbufsize; + rc = kernel_setsockopt(sock, SOL_SOCKET, SO_RCVBUF, + (char *)&option, sizeof (option)); + if (rc != 0) { + CERROR ("Can't set receive buffer %d: %d\n", + option, rc); + return rc; + } + } + + return 0; +} + +EXPORT_SYMBOL(libcfs_sock_setbuf); + +int +libcfs_sock_getaddr (struct socket *sock, int remote, __u32 *ip, int *port) +{ + struct sockaddr_in sin; + int len = sizeof (sin); + int rc; + + rc = sock->ops->getname (sock, (struct sockaddr *)&sin, &len, + remote ? 2 : 0); + if (rc != 0) { + CERROR ("Error %d getting sock %s IP/port\n", + rc, remote ? "peer" : "local"); + return rc; + } + + if (ip != NULL) + *ip = ntohl (sin.sin_addr.s_addr); + + if (port != NULL) + *port = ntohs (sin.sin_port); + + return 0; +} + +EXPORT_SYMBOL(libcfs_sock_getaddr); + +int +libcfs_sock_getbuf (struct socket *sock, int *txbufsize, int *rxbufsize) +{ + + if (txbufsize != NULL) { + *txbufsize = sock->sk->sk_sndbuf; + } + + if (rxbufsize != NULL) { + *rxbufsize = sock->sk->sk_rcvbuf; + } + + return 0; +} + +EXPORT_SYMBOL(libcfs_sock_getbuf); + +int +libcfs_sock_listen (struct socket **sockp, + __u32 local_ip, int local_port, int backlog) +{ + int fatal; + int rc; + + rc = libcfs_sock_create(sockp, &fatal, local_ip, local_port); + if (rc != 0) { + if (!fatal) + CERROR("Can't create socket: port %d already in use\n", + local_port); + return rc; + } + + rc = (*sockp)->ops->listen(*sockp, backlog); + if (rc == 0) + return 0; + + CERROR("Can't set listen backlog %d: %d\n", backlog, rc); + sock_release(*sockp); + return rc; +} + +EXPORT_SYMBOL(libcfs_sock_listen); + +int +libcfs_sock_accept (struct socket **newsockp, struct socket *sock) +{ + wait_queue_t wait; + struct socket *newsock; + int rc; + + init_waitqueue_entry(&wait, current); + + /* XXX this should add a ref to sock->ops->owner, if + * TCP could be a module */ + rc = sock_create_lite(PF_PACKET, sock->type, IPPROTO_TCP, &newsock); + if (rc) { + CERROR("Can't allocate socket\n"); + return rc; + } + + newsock->ops = sock->ops; + + rc = sock->ops->accept(sock, newsock, O_NONBLOCK); + if (rc == -EAGAIN) { + /* Nothing ready, so wait for activity */ + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(sk_sleep(sock->sk), &wait); + schedule(); + remove_wait_queue(sk_sleep(sock->sk), &wait); + set_current_state(TASK_RUNNING); + rc = sock->ops->accept(sock, newsock, O_NONBLOCK); + } + + if (rc != 0) + goto failed; + + *newsockp = newsock; + return 0; + + failed: + sock_release(newsock); + return rc; +} + +EXPORT_SYMBOL(libcfs_sock_accept); + +void +libcfs_sock_abort_accept (struct socket *sock) +{ + wake_up_all(sk_sleep(sock->sk)); +} + +EXPORT_SYMBOL(libcfs_sock_abort_accept); + +int +libcfs_sock_connect (struct socket **sockp, int *fatal, + __u32 local_ip, int local_port, + 
__u32 peer_ip, int peer_port) +{ + struct sockaddr_in srvaddr; + int rc; + + rc = libcfs_sock_create(sockp, fatal, local_ip, local_port); + if (rc != 0) + return rc; + + memset (&srvaddr, 0, sizeof (srvaddr)); + srvaddr.sin_family = AF_INET; + srvaddr.sin_port = htons(peer_port); + srvaddr.sin_addr.s_addr = htonl(peer_ip); + + rc = (*sockp)->ops->connect(*sockp, + (struct sockaddr *)&srvaddr, sizeof(srvaddr), + 0); + if (rc == 0) + return 0; + + /* EADDRNOTAVAIL probably means we're already connected to the same + * peer/port on the same local port on a differently typed + * connection. Let our caller retry with a different local + * port... */ + *fatal = !(rc == -EADDRNOTAVAIL); + + CDEBUG_LIMIT(*fatal ? D_NETERROR : D_NET, + "Error %d connecting %pI4h/%d -> %pI4h/%d\n", rc, + &local_ip, local_port, &peer_ip, peer_port); + + sock_release(*sockp); + return rc; +} + +EXPORT_SYMBOL(libcfs_sock_connect); + +void +libcfs_sock_release (struct socket *sock) +{ + sock_release(sock); +} + +EXPORT_SYMBOL(libcfs_sock_release); diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.c b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.c new file mode 100644 index 000000000..c8e293002 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.c @@ -0,0 +1,275 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#define DEBUG_SUBSYSTEM S_LNET +#define LUSTRE_TRACEFILE_PRIVATE + +#include "../../../include/linux/libcfs/libcfs.h" +#include "../tracefile.h" + +/* percents to share the total debug memory for each type */ +static unsigned int pages_factor[CFS_TCD_TYPE_MAX] = { + 80, /* 80% pages for CFS_TCD_TYPE_PROC */ + 10, /* 10% pages for CFS_TCD_TYPE_SOFTIRQ */ + 10 /* 10% pages for CFS_TCD_TYPE_IRQ */ +}; + +char *cfs_trace_console_buffers[NR_CPUS][CFS_TCD_TYPE_MAX]; + +struct rw_semaphore cfs_tracefile_sem; + +int cfs_tracefile_init_arch(void) +{ + int i; + int j; + struct cfs_trace_cpu_data *tcd; + + init_rwsem(&cfs_tracefile_sem); + + /* initialize trace_data */ + memset(cfs_trace_data, 0, sizeof(cfs_trace_data)); + for (i = 0; i < CFS_TCD_TYPE_MAX; i++) { + cfs_trace_data[i] = + kmalloc(sizeof(union cfs_trace_data_union) * + num_possible_cpus(), GFP_KERNEL); + if (cfs_trace_data[i] == NULL) + goto out; + + } + + /* arch related info initialized */ + cfs_tcd_for_each(tcd, i, j) { + spin_lock_init(&tcd->tcd_lock); + tcd->tcd_pages_factor = pages_factor[i]; + tcd->tcd_type = i; + tcd->tcd_cpu = j; + } + + for (i = 0; i < num_possible_cpus(); i++) + for (j = 0; j < 3; j++) { + cfs_trace_console_buffers[i][j] = + kmalloc(CFS_TRACE_CONSOLE_BUFFER_SIZE, + GFP_KERNEL); + + if (cfs_trace_console_buffers[i][j] == NULL) + goto out; + } + + return 0; + +out: + cfs_tracefile_fini_arch(); + printk(KERN_ERR "lnet: Not enough memory\n"); + return -ENOMEM; +} + +void cfs_tracefile_fini_arch(void) +{ + int i; + int j; + + for (i = 0; i < num_possible_cpus(); i++) + for (j = 0; j < 3; j++) + if (cfs_trace_console_buffers[i][j] != NULL) { + kfree(cfs_trace_console_buffers[i][j]); + cfs_trace_console_buffers[i][j] = NULL; + } + + for (i = 0; cfs_trace_data[i] != NULL; i++) { + kfree(cfs_trace_data[i]); + cfs_trace_data[i] = NULL; + } +} + +void cfs_tracefile_read_lock(void) +{ + down_read(&cfs_tracefile_sem); +} + +void cfs_tracefile_read_unlock(void) +{ + up_read(&cfs_tracefile_sem); +} + +void cfs_tracefile_write_lock(void) +{ + down_write(&cfs_tracefile_sem); +} + +void cfs_tracefile_write_unlock(void) +{ + up_write(&cfs_tracefile_sem); +} + +cfs_trace_buf_type_t cfs_trace_buf_idx_get(void) +{ + if (in_irq()) + return CFS_TCD_TYPE_IRQ; + else if (in_softirq()) + return CFS_TCD_TYPE_SOFTIRQ; + else + return CFS_TCD_TYPE_PROC; +} + +/* + * The walking argument indicates the locking comes from all tcd types + * iterator and we must lock it and dissable local irqs to avoid deadlocks + * with other interrupt locks that might be happening. See LU-1311 + * for details. 
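+ *
+ * In practice (see the lock/unlock pair below): CFS_TCD_TYPE_IRQ buffers
+ * take the lock with spin_lock_irqsave(), CFS_TCD_TYPE_SOFTIRQ buffers
+ * with spin_lock_bh(), and CFS_TCD_TYPE_PROC buffers with a plain
+ * spin_lock() unless the caller is walking all types, in which case
+ * spin_lock_irq() is used.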
+ */ +int cfs_trace_lock_tcd(struct cfs_trace_cpu_data *tcd, int walking) + __acquires(&tcd->tc_lock) +{ + __LASSERT(tcd->tcd_type < CFS_TCD_TYPE_MAX); + if (tcd->tcd_type == CFS_TCD_TYPE_IRQ) + spin_lock_irqsave(&tcd->tcd_lock, tcd->tcd_lock_flags); + else if (tcd->tcd_type == CFS_TCD_TYPE_SOFTIRQ) + spin_lock_bh(&tcd->tcd_lock); + else if (unlikely(walking)) + spin_lock_irq(&tcd->tcd_lock); + else + spin_lock(&tcd->tcd_lock); + return 1; +} + +void cfs_trace_unlock_tcd(struct cfs_trace_cpu_data *tcd, int walking) + __releases(&tcd->tcd_lock) +{ + __LASSERT(tcd->tcd_type < CFS_TCD_TYPE_MAX); + if (tcd->tcd_type == CFS_TCD_TYPE_IRQ) + spin_unlock_irqrestore(&tcd->tcd_lock, tcd->tcd_lock_flags); + else if (tcd->tcd_type == CFS_TCD_TYPE_SOFTIRQ) + spin_unlock_bh(&tcd->tcd_lock); + else if (unlikely(walking)) + spin_unlock_irq(&tcd->tcd_lock); + else + spin_unlock(&tcd->tcd_lock); +} + +int cfs_tcd_owns_tage(struct cfs_trace_cpu_data *tcd, + struct cfs_trace_page *tage) +{ + /* + * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT) + * from here: this will lead to infinite recursion. + */ + return tcd->tcd_cpu == tage->cpu; +} + +void +cfs_set_ptldebug_header(struct ptldebug_header *header, + struct libcfs_debug_msg_data *msgdata, + unsigned long stack) +{ + struct timeval tv; + + do_gettimeofday(&tv); + + header->ph_subsys = msgdata->msg_subsys; + header->ph_mask = msgdata->msg_mask; + header->ph_cpu_id = smp_processor_id(); + header->ph_type = cfs_trace_buf_idx_get(); + header->ph_sec = (__u32)tv.tv_sec; + header->ph_usec = tv.tv_usec; + header->ph_stack = stack; + header->ph_pid = current->pid; + header->ph_line_num = msgdata->msg_line; + header->ph_extern_pid = 0; + return; +} + +static char * +dbghdr_to_err_string(struct ptldebug_header *hdr) +{ + switch (hdr->ph_subsys) { + + case S_LND: + case S_LNET: + return "LNetError"; + default: + return "LustreError"; + } +} + +static char * +dbghdr_to_info_string(struct ptldebug_header *hdr) +{ + switch (hdr->ph_subsys) { + + case S_LND: + case S_LNET: + return "LNet"; + default: + return "Lustre"; + } +} + +void cfs_print_to_console(struct ptldebug_header *hdr, int mask, + const char *buf, int len, const char *file, + const char *fn) +{ + char *prefix = "Lustre", *ptype = NULL; + + if ((mask & D_EMERG) != 0) { + prefix = dbghdr_to_err_string(hdr); + ptype = KERN_EMERG; + } else if ((mask & D_ERROR) != 0) { + prefix = dbghdr_to_err_string(hdr); + ptype = KERN_ERR; + } else if ((mask & D_WARNING) != 0) { + prefix = dbghdr_to_info_string(hdr); + ptype = KERN_WARNING; + } else if ((mask & (D_CONSOLE | libcfs_printk)) != 0) { + prefix = dbghdr_to_info_string(hdr); + ptype = KERN_INFO; + } + + if ((mask & D_CONSOLE) != 0) { + printk("%s%s: %.*s", ptype, prefix, len, buf); + } else { + printk("%s%s: %d:%d:(%s:%d:%s()) %.*s", ptype, prefix, + hdr->ph_pid, hdr->ph_extern_pid, file, hdr->ph_line_num, + fn, len, buf); + } + return; +} + +int cfs_trace_max_debug_mb(void) +{ + int total_mb = (totalram_pages >> (20 - PAGE_SHIFT)); + + return max(512, (total_mb * 80)/100); +} diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.h b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.h new file mode 100644 index 000000000..ba84e4ffd --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.h @@ -0,0 +1,48 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LIBCFS_LINUX_TRACEFILE_H__ +#define __LIBCFS_LINUX_TRACEFILE_H__ + +/** + * three types of trace_data in linux + */ +typedef enum { + CFS_TCD_TYPE_PROC = 0, + CFS_TCD_TYPE_SOFTIRQ, + CFS_TCD_TYPE_IRQ, + CFS_TCD_TYPE_MAX +} cfs_trace_buf_type_t; + +#endif diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/module.c b/kernel/drivers/staging/lustre/lustre/libcfs/module.c new file mode 100644 index 000000000..f0ee76abf --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/libcfs/module.c @@ -0,0 +1,976 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include +#include + +# define DEBUG_SUBSYSTEM S_LNET + +#include "../../include/linux/libcfs/libcfs.h" +#include + +#include "../../include/linux/libcfs/libcfs_crypto.h" +#include "../../include/linux/lnet/lib-lnet.h" +#include "../../include/linux/lnet/lnet.h" +#include "tracefile.h" + +MODULE_AUTHOR("Peter J. 
Braam "); +MODULE_DESCRIPTION("Portals v3.1"); +MODULE_LICENSE("GPL"); + +extern struct miscdevice libcfs_dev; +extern struct rw_semaphore cfs_tracefile_sem; +extern struct mutex cfs_trace_thread_mutex; +extern struct cfs_wi_sched *cfs_sched_rehash; +extern void libcfs_init_nidstrings(void); + +static int insert_proc(void); +static void remove_proc(void); + +static struct ctl_table_header *lnet_table_header; +extern char lnet_upcall[1024]; +/** + * The path of debug log dump upcall script. + */ +extern char lnet_debug_log_upcall[1024]; + +#define CTL_LNET (0x100) + +enum { + PSDEV_DEBUG = 1, /* control debugging */ + PSDEV_SUBSYSTEM_DEBUG, /* control debugging */ + PSDEV_PRINTK, /* force all messages to console */ + PSDEV_CONSOLE_RATELIMIT, /* ratelimit console messages */ + PSDEV_CONSOLE_MAX_DELAY_CS, /* maximum delay over which we skip messages */ + PSDEV_CONSOLE_MIN_DELAY_CS, /* initial delay over which we skip messages */ + PSDEV_CONSOLE_BACKOFF, /* delay increase factor */ + PSDEV_DEBUG_PATH, /* crashdump log location */ + PSDEV_DEBUG_DUMP_PATH, /* crashdump tracelog location */ + PSDEV_CPT_TABLE, /* information about cpu partitions */ + PSDEV_LNET_UPCALL, /* User mode upcall script */ + PSDEV_LNET_MEMUSED, /* bytes currently PORTAL_ALLOCated */ + PSDEV_LNET_CATASTROPHE, /* if we have LBUGged or panic'd */ + PSDEV_LNET_PANIC_ON_LBUG, /* flag to panic on LBUG */ + PSDEV_LNET_DUMP_KERNEL, /* snapshot kernel debug buffer to file */ + PSDEV_LNET_DAEMON_FILE, /* spool kernel debug buffer to file */ + PSDEV_LNET_DEBUG_MB, /* size of debug buffer */ + PSDEV_LNET_DEBUG_LOG_UPCALL, /* debug log upcall script */ + PSDEV_LNET_WATCHDOG_RATELIMIT, /* ratelimit watchdog messages */ + PSDEV_LNET_FORCE_LBUG, /* hook to force an LBUG */ + PSDEV_LNET_FAIL_LOC, /* control test failures instrumentation */ + PSDEV_LNET_FAIL_VAL, /* userdata for fail loc */ +}; + +static void kportal_memhog_free (struct libcfs_device_userstate *ldu) +{ + struct page **level0p = &ldu->ldu_memhog_root_page; + struct page **level1p; + struct page **level2p; + int count1; + int count2; + + if (*level0p != NULL) { + + level1p = (struct page **)page_address(*level0p); + count1 = 0; + + while (count1 < PAGE_CACHE_SIZE/sizeof(struct page *) && + *level1p != NULL) { + + level2p = (struct page **)page_address(*level1p); + count2 = 0; + + while (count2 < PAGE_CACHE_SIZE/sizeof(struct page *) && + *level2p != NULL) { + + __free_page(*level2p); + ldu->ldu_memhog_pages--; + level2p++; + count2++; + } + + __free_page(*level1p); + ldu->ldu_memhog_pages--; + level1p++; + count1++; + } + + __free_page(*level0p); + ldu->ldu_memhog_pages--; + + *level0p = NULL; + } + + LASSERT (ldu->ldu_memhog_pages == 0); +} + +static int kportal_memhog_alloc(struct libcfs_device_userstate *ldu, int npages, + gfp_t flags) +{ + struct page **level0p; + struct page **level1p; + struct page **level2p; + int count1; + int count2; + + LASSERT (ldu->ldu_memhog_pages == 0); + LASSERT (ldu->ldu_memhog_root_page == NULL); + + if (npages < 0) + return -EINVAL; + + if (npages == 0) + return 0; + + level0p = &ldu->ldu_memhog_root_page; + *level0p = alloc_page(flags); + if (*level0p == NULL) + return -ENOMEM; + ldu->ldu_memhog_pages++; + + level1p = (struct page **)page_address(*level0p); + count1 = 0; + memset(level1p, 0, PAGE_CACHE_SIZE); + + while (ldu->ldu_memhog_pages < npages && + count1 < PAGE_CACHE_SIZE/sizeof(struct page *)) { + + if (cfs_signal_pending()) + return -EINTR; + + *level1p = alloc_page(flags); + if (*level1p == NULL) + return -ENOMEM; + 
ldu->ldu_memhog_pages++; + + level2p = (struct page **)page_address(*level1p); + count2 = 0; + memset(level2p, 0, PAGE_CACHE_SIZE); + + while (ldu->ldu_memhog_pages < npages && + count2 < PAGE_CACHE_SIZE/sizeof(struct page *)) { + + if (cfs_signal_pending()) + return -EINTR; + + *level2p = alloc_page(flags); + if (*level2p == NULL) + return -ENOMEM; + ldu->ldu_memhog_pages++; + + level2p++; + count2++; + } + + level1p++; + count1++; + } + + return 0; +} + +/* called when opening /dev/device */ +static int libcfs_psdev_open(unsigned long flags, void *args) +{ + struct libcfs_device_userstate *ldu; + + try_module_get(THIS_MODULE); + + LIBCFS_ALLOC(ldu, sizeof(*ldu)); + if (ldu != NULL) { + ldu->ldu_memhog_pages = 0; + ldu->ldu_memhog_root_page = NULL; + } + *(struct libcfs_device_userstate **)args = ldu; + + return 0; +} + +/* called when closing /dev/device */ +static int libcfs_psdev_release(unsigned long flags, void *args) +{ + struct libcfs_device_userstate *ldu; + + ldu = (struct libcfs_device_userstate *)args; + if (ldu != NULL) { + kportal_memhog_free(ldu); + LIBCFS_FREE(ldu, sizeof(*ldu)); + } + + module_put(THIS_MODULE); + return 0; +} + +static struct rw_semaphore ioctl_list_sem; +static struct list_head ioctl_list; + +int libcfs_register_ioctl(struct libcfs_ioctl_handler *hand) +{ + int rc = 0; + + down_write(&ioctl_list_sem); + if (!list_empty(&hand->item)) + rc = -EBUSY; + else + list_add_tail(&hand->item, &ioctl_list); + up_write(&ioctl_list_sem); + + return rc; +} +EXPORT_SYMBOL(libcfs_register_ioctl); + +int libcfs_deregister_ioctl(struct libcfs_ioctl_handler *hand) +{ + int rc = 0; + + down_write(&ioctl_list_sem); + if (list_empty(&hand->item)) + rc = -ENOENT; + else + list_del_init(&hand->item); + up_write(&ioctl_list_sem); + + return rc; +} +EXPORT_SYMBOL(libcfs_deregister_ioctl); + +static int libcfs_ioctl_int(struct cfs_psdev_file *pfile, unsigned long cmd, + void *arg, struct libcfs_ioctl_data *data) +{ + int err = -EINVAL; + + switch (cmd) { + case IOC_LIBCFS_CLEAR_DEBUG: + libcfs_debug_clear_buffer(); + return 0; + /* + * case IOC_LIBCFS_PANIC: + * Handled in arch/cfs_module.c + */ + case IOC_LIBCFS_MARK_DEBUG: + if (data->ioc_inlbuf1 == NULL || + data->ioc_inlbuf1[data->ioc_inllen1 - 1] != '\0') + return -EINVAL; + libcfs_debug_mark_buffer(data->ioc_inlbuf1); + return 0; + case IOC_LIBCFS_MEMHOG: + if (pfile->private_data == NULL) { + err = -EINVAL; + } else { + kportal_memhog_free(pfile->private_data); + /* XXX The ioc_flags is not GFP flags now, need to be fixed */ + err = kportal_memhog_alloc(pfile->private_data, + data->ioc_count, + data->ioc_flags); + if (err != 0) + kportal_memhog_free(pfile->private_data); + } + break; + + case IOC_LIBCFS_PING_TEST: { + extern void (kping_client)(struct libcfs_ioctl_data *); + void (*ping)(struct libcfs_ioctl_data *); + + CDEBUG(D_IOCTL, "doing %d pings to nid %s (%s)\n", + data->ioc_count, libcfs_nid2str(data->ioc_nid), + libcfs_nid2str(data->ioc_nid)); + ping = symbol_get(kping_client); + if (!ping) + CERROR("symbol_get failed\n"); + else { + ping(data); + symbol_put(kping_client); + } + return 0; + } + + default: { + struct libcfs_ioctl_handler *hand; + err = -EINVAL; + down_read(&ioctl_list_sem); + list_for_each_entry(hand, &ioctl_list, item) { + err = hand->handle_ioctl(cmd, data); + if (err != -EINVAL) { + if (err == 0) + err = libcfs_ioctl_popdata(arg, + data, sizeof (*data)); + break; + } + } + up_read(&ioctl_list_sem); + break; + } + } + + return err; +} + +static int libcfs_ioctl(struct cfs_psdev_file *pfile, 
unsigned long cmd, void *arg) +{ + char *buf; + struct libcfs_ioctl_data *data; + int err = 0; + + LIBCFS_ALLOC_GFP(buf, 1024, GFP_IOFS); + if (buf == NULL) + return -ENOMEM; + + /* 'cmd' and permissions get checked in our arch-specific caller */ + if (libcfs_ioctl_getdata(buf, buf + 800, (void *)arg)) { + CERROR("PORTALS ioctl: data error\n"); + err = -EINVAL; + goto out; + } + data = (struct libcfs_ioctl_data *)buf; + + err = libcfs_ioctl_int(pfile, cmd, arg, data); + +out: + LIBCFS_FREE(buf, 1024); + return err; +} + + +struct cfs_psdev_ops libcfs_psdev_ops = { + libcfs_psdev_open, + libcfs_psdev_release, + NULL, + NULL, + libcfs_ioctl +}; + +static int init_libcfs_module(void) +{ + int rc; + + libcfs_arch_init(); + libcfs_init_nidstrings(); + init_rwsem(&cfs_tracefile_sem); + mutex_init(&cfs_trace_thread_mutex); + init_rwsem(&ioctl_list_sem); + INIT_LIST_HEAD(&ioctl_list); + init_waitqueue_head(&cfs_race_waitq); + + rc = libcfs_debug_init(5 * 1024 * 1024); + if (rc < 0) { + pr_err("LustreError: libcfs_debug_init: %d\n", rc); + return rc; + } + + rc = cfs_cpu_init(); + if (rc != 0) + goto cleanup_debug; + + rc = misc_register(&libcfs_dev); + if (rc) { + CERROR("misc_register: error %d\n", rc); + goto cleanup_cpu; + } + + rc = cfs_wi_startup(); + if (rc) { + CERROR("initialize workitem: error %d\n", rc); + goto cleanup_deregister; + } + + /* max to 4 threads, should be enough for rehash */ + rc = min(cfs_cpt_weight(cfs_cpt_table, CFS_CPT_ANY), 4); + rc = cfs_wi_sched_create("cfs_rh", cfs_cpt_table, CFS_CPT_ANY, + rc, &cfs_sched_rehash); + if (rc != 0) { + CERROR("Startup workitem scheduler: error: %d\n", rc); + goto cleanup_deregister; + } + + rc = cfs_crypto_register(); + if (rc) { + CERROR("cfs_crypto_register: error %d\n", rc); + goto cleanup_wi; + } + + + rc = insert_proc(); + if (rc) { + CERROR("insert_proc: error %d\n", rc); + goto cleanup_crypto; + } + + CDEBUG (D_OTHER, "portals setup OK\n"); + return 0; + cleanup_crypto: + cfs_crypto_unregister(); + cleanup_wi: + cfs_wi_shutdown(); + cleanup_deregister: + misc_deregister(&libcfs_dev); +cleanup_cpu: + cfs_cpu_fini(); + cleanup_debug: + libcfs_debug_cleanup(); + return rc; +} + +static void exit_libcfs_module(void) +{ + int rc; + + remove_proc(); + + CDEBUG(D_MALLOC, "before Portals cleanup: kmem %d\n", + atomic_read(&libcfs_kmemory)); + + if (cfs_sched_rehash != NULL) { + cfs_wi_sched_destroy(cfs_sched_rehash); + cfs_sched_rehash = NULL; + } + + cfs_crypto_unregister(); + cfs_wi_shutdown(); + + rc = misc_deregister(&libcfs_dev); + if (rc) + CERROR("misc_deregister error %d\n", rc); + + cfs_cpu_fini(); + + if (atomic_read(&libcfs_kmemory) != 0) + CERROR("Portals memory leaked: %d bytes\n", + atomic_read(&libcfs_kmemory)); + + rc = libcfs_debug_cleanup(); + if (rc) + pr_err("LustreError: libcfs_debug_cleanup: %d\n", rc); + + libcfs_arch_cleanup(); +} + +static int proc_call_handler(void *data, int write, loff_t *ppos, + void __user *buffer, size_t *lenp, + int (*handler)(void *data, int write, + loff_t pos, void __user *buffer, int len)) +{ + int rc = handler(data, write, *ppos, buffer, *lenp); + + if (rc < 0) + return rc; + + if (write) { + *ppos += *lenp; + } else { + *lenp = rc; + *ppos += rc; + } + return 0; +} + +static int __proc_dobitmasks(void *data, int write, + loff_t pos, void __user *buffer, int nob) +{ + const int tmpstrlen = 512; + char *tmpstr; + int rc; + unsigned int *mask = data; + int is_subsys = (mask == &libcfs_subsystem_debug) ? 1 : 0; + int is_printk = (mask == &libcfs_printk) ? 
1 : 0; + + rc = cfs_trace_allocate_string_buffer(&tmpstr, tmpstrlen); + if (rc < 0) + return rc; + + if (!write) { + libcfs_debug_mask2str(tmpstr, tmpstrlen, *mask, is_subsys); + rc = strlen(tmpstr); + + if (pos >= rc) { + rc = 0; + } else { + rc = cfs_trace_copyout_string(buffer, nob, + tmpstr + pos, "\n"); + } + } else { + rc = cfs_trace_copyin_string(tmpstr, tmpstrlen, buffer, nob); + if (rc < 0) { + cfs_trace_free_string_buffer(tmpstr, tmpstrlen); + return rc; + } + + rc = libcfs_debug_str2mask(mask, tmpstr, is_subsys); + /* Always print LBUG/LASSERT to console, so keep this mask */ + if (is_printk) + *mask |= D_EMERG; + } + + cfs_trace_free_string_buffer(tmpstr, tmpstrlen); + return rc; +} + +static int proc_dobitmasks(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return proc_call_handler(table->data, write, ppos, buffer, lenp, + __proc_dobitmasks); +} + +static int min_watchdog_ratelimit; /* disable ratelimiting */ +static int max_watchdog_ratelimit = (24*60*60); /* limit to once per day */ + +static int __proc_dump_kernel(void *data, int write, + loff_t pos, void __user *buffer, int nob) +{ + if (!write) + return 0; + + return cfs_trace_dump_debug_buffer_usrstr(buffer, nob); +} + +static int proc_dump_kernel(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return proc_call_handler(table->data, write, ppos, buffer, lenp, + __proc_dump_kernel); +} + +static int __proc_daemon_file(void *data, int write, + loff_t pos, void __user *buffer, int nob) +{ + if (!write) { + int len = strlen(cfs_tracefile); + + if (pos >= len) + return 0; + + return cfs_trace_copyout_string(buffer, nob, + cfs_tracefile + pos, "\n"); + } + + return cfs_trace_daemon_command_usrstr(buffer, nob); +} + +static int proc_daemon_file(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return proc_call_handler(table->data, write, ppos, buffer, lenp, + __proc_daemon_file); +} + +static int __proc_debug_mb(void *data, int write, + loff_t pos, void __user *buffer, int nob) +{ + if (!write) { + char tmpstr[32]; + int len = snprintf(tmpstr, sizeof(tmpstr), "%d", + cfs_trace_get_debug_mb()); + + if (pos >= len) + return 0; + + return cfs_trace_copyout_string(buffer, nob, tmpstr + pos, + "\n"); + } + + return cfs_trace_set_debug_mb_usrstr(buffer, nob); +} + +static int proc_debug_mb(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return proc_call_handler(table->data, write, ppos, buffer, lenp, + __proc_debug_mb); +} + +static int proc_console_max_delay_cs(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int rc, max_delay_cs; + struct ctl_table dummy = *table; + long d; + + dummy.data = &max_delay_cs; + dummy.proc_handler = &proc_dointvec; + + if (!write) { /* read */ + max_delay_cs = cfs_duration_sec(libcfs_console_max_delay * 100); + rc = proc_dointvec(&dummy, write, buffer, lenp, ppos); + return rc; + } + + /* write */ + max_delay_cs = 0; + rc = proc_dointvec(&dummy, write, buffer, lenp, ppos); + if (rc < 0) + return rc; + if (max_delay_cs <= 0) + return -EINVAL; + + d = cfs_time_seconds(max_delay_cs) / 100; + if (d == 0 || d < libcfs_console_min_delay) + return -EINVAL; + libcfs_console_max_delay = d; + + return rc; +} + +static int proc_console_min_delay_cs(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int rc, min_delay_cs; + struct ctl_table dummy = *table; + 
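+	/*
+	 * Same pattern as proc_console_max_delay_cs() above: expose the
+	 * value to userspace in centiseconds through a scratch ctl_table
+	 * entry handled by proc_dointvec(), then validate the written value
+	 * and convert it before storing it in libcfs_console_min_delay.
+	 */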
long d; + + dummy.data = &min_delay_cs; + dummy.proc_handler = &proc_dointvec; + + if (!write) { /* read */ + min_delay_cs = cfs_duration_sec(libcfs_console_min_delay * 100); + rc = proc_dointvec(&dummy, write, buffer, lenp, ppos); + return rc; + } + + /* write */ + min_delay_cs = 0; + rc = proc_dointvec(&dummy, write, buffer, lenp, ppos); + if (rc < 0) + return rc; + if (min_delay_cs <= 0) + return -EINVAL; + + d = cfs_time_seconds(min_delay_cs) / 100; + if (d == 0 || d > libcfs_console_max_delay) + return -EINVAL; + libcfs_console_min_delay = d; + + return rc; +} + +static int proc_console_backoff(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int rc, backoff; + struct ctl_table dummy = *table; + + dummy.data = &backoff; + dummy.proc_handler = &proc_dointvec; + + if (!write) { /* read */ + backoff = libcfs_console_backoff; + rc = proc_dointvec(&dummy, write, buffer, lenp, ppos); + return rc; + } + + /* write */ + backoff = 0; + rc = proc_dointvec(&dummy, write, buffer, lenp, ppos); + if (rc < 0) + return rc; + if (backoff <= 0) + return -EINVAL; + + libcfs_console_backoff = backoff; + + return rc; +} + +static int libcfs_force_lbug(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + if (write) + LBUG(); + return 0; +} + +static int proc_fail_loc(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int rc; + long old_fail_loc = cfs_fail_loc; + + rc = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); + if (old_fail_loc != cfs_fail_loc) + wake_up(&cfs_race_waitq); + return rc; +} + +static int __proc_cpt_table(void *data, int write, + loff_t pos, void __user *buffer, int nob) +{ + char *buf = NULL; + int len = 4096; + int rc = 0; + + if (write) + return -EPERM; + + LASSERT(cfs_cpt_table != NULL); + + while (1) { + LIBCFS_ALLOC(buf, len); + if (buf == NULL) + return -ENOMEM; + + rc = cfs_cpt_table_print(cfs_cpt_table, buf, len); + if (rc >= 0) + break; + + if (rc == -EFBIG) { + LIBCFS_FREE(buf, len); + len <<= 1; + continue; + } + goto out; + } + + if (pos >= rc) { + rc = 0; + goto out; + } + + rc = cfs_trace_copyout_string(buffer, nob, buf + pos, NULL); + out: + if (buf != NULL) + LIBCFS_FREE(buf, len); + return rc; +} + +static int proc_cpt_table(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return proc_call_handler(table->data, write, ppos, buffer, lenp, + __proc_cpt_table); +} + +static struct ctl_table lnet_table[] = { + /* + * NB No .strategy entries have been provided since sysctl(8) prefers + * to go via /proc for portability. 
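+ *
+ * These entries are registered below under an "lnet" parent directory,
+ * so they appear as /proc/sys/lnet/<name> (e.g. /proc/sys/lnet/debug
+ * and /proc/sys/lnet/debug_mb).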
+ */ + { + .procname = "debug", + .data = &libcfs_debug, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dobitmasks, + }, + { + .procname = "subsystem_debug", + .data = &libcfs_subsystem_debug, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dobitmasks, + }, + { + .procname = "printk", + .data = &libcfs_printk, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dobitmasks, + }, + { + .procname = "console_ratelimit", + .data = &libcfs_console_ratelimit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .procname = "console_max_delay_centisecs", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_console_max_delay_cs + }, + { + .procname = "console_min_delay_centisecs", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_console_min_delay_cs + }, + { + .procname = "console_backoff", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_console_backoff + }, + + { + .procname = "debug_path", + .data = libcfs_debug_file_path_arr, + .maxlen = sizeof(libcfs_debug_file_path_arr), + .mode = 0644, + .proc_handler = &proc_dostring, + }, + + { + .procname = "cpu_partition_table", + .maxlen = 128, + .mode = 0444, + .proc_handler = &proc_cpt_table, + }, + + { + .procname = "upcall", + .data = lnet_upcall, + .maxlen = sizeof(lnet_upcall), + .mode = 0644, + .proc_handler = &proc_dostring, + }, + { + .procname = "debug_log_upcall", + .data = lnet_debug_log_upcall, + .maxlen = sizeof(lnet_debug_log_upcall), + .mode = 0644, + .proc_handler = &proc_dostring, + }, + { + .procname = "lnet_memused", + .data = (int *)&libcfs_kmemory.counter, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, + { + .procname = "catastrophe", + .data = &libcfs_catastrophe, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, + { + .procname = "panic_on_lbug", + .data = &libcfs_panic_on_lbug, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .procname = "dump_kernel", + .maxlen = 256, + .mode = 0200, + .proc_handler = &proc_dump_kernel, + }, + { + .procname = "daemon_file", + .mode = 0644, + .maxlen = 256, + .proc_handler = &proc_daemon_file, + }, + { + .procname = "debug_mb", + .mode = 0644, + .proc_handler = &proc_debug_mb, + }, + { + .procname = "watchdog_ratelimit", + .data = &libcfs_watchdog_ratelimit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = &min_watchdog_ratelimit, + .extra2 = &max_watchdog_ratelimit, + }, + { + .procname = "force_lbug", + .data = NULL, + .maxlen = 0, + .mode = 0200, + .proc_handler = &libcfs_force_lbug + }, + { + .procname = "fail_loc", + .data = &cfs_fail_loc, + .maxlen = sizeof(cfs_fail_loc), + .mode = 0644, + .proc_handler = &proc_fail_loc + }, + { + .procname = "fail_val", + .data = &cfs_fail_val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + } +}; + +static struct ctl_table top_table[] = { + { + .procname = "lnet", + .mode = 0555, + .data = NULL, + .maxlen = 0, + .child = lnet_table, + }, + { + } +}; + +static int insert_proc(void) +{ + if (lnet_table_header == NULL) + lnet_table_header = register_sysctl_table(top_table); + return 0; +} + +static void remove_proc(void) +{ + if (lnet_table_header != NULL) + unregister_sysctl_table(lnet_table_header); + + lnet_table_header = NULL; +} + +MODULE_VERSION("1.0.0"); + +module_init(init_libcfs_module); +module_exit(exit_libcfs_module); diff --git 
a/kernel/drivers/staging/lustre/lustre/libcfs/nidstrings.c b/kernel/drivers/staging/lustre/lustre/libcfs/nidstrings.c new file mode 100644 index 000000000..087449f4e --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/libcfs/nidstrings.c @@ -0,0 +1,842 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/libcfs/nidstrings.c + * + * Author: Phil Schwan + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include "../../include/linux/libcfs/libcfs.h" +#include "../../include/linux/lnet/lnet.h" + +/* CAVEAT VENDITOR! Keep the canonical string representation of nets/nids + * consistent in all conversion functions. Some code fragments are copied + * around for the sake of clarity... + */ + +/* CAVEAT EMPTOR! Racey temporary buffer allocation! + * Choose the number of nidstrings to support the MAXIMUM expected number of + * concurrent users. If there are more, the returned string will be volatile. + * NB this number must allow for a process to be descheduled for a timeslice + * between getting its string and using it. + */ + +static char libcfs_nidstrings[LNET_NIDSTR_COUNT][LNET_NIDSTR_SIZE]; +static int libcfs_nidstring_idx; + +static spinlock_t libcfs_nidstring_lock; + +void libcfs_init_nidstrings(void) +{ + spin_lock_init(&libcfs_nidstring_lock); +} + +static char * +libcfs_next_nidstring(void) +{ + char *str; + unsigned long flags; + + spin_lock_irqsave(&libcfs_nidstring_lock, flags); + + str = libcfs_nidstrings[libcfs_nidstring_idx++]; + if (libcfs_nidstring_idx == ARRAY_SIZE(libcfs_nidstrings)) + libcfs_nidstring_idx = 0; + + spin_unlock_irqrestore(&libcfs_nidstring_lock, flags); + return str; +} + +static int libcfs_lo_str2addr(const char *str, int nob, __u32 *addr) +{ + *addr = 0; + return 1; +} + +static void libcfs_ip_addr2str(__u32 addr, char *str) +{ + snprintf(str, LNET_NIDSTR_SIZE, "%u.%u.%u.%u", + (addr >> 24) & 0xff, (addr >> 16) & 0xff, + (addr >> 8) & 0xff, addr & 0xff); +} + +static int libcfs_ip_str2addr(const char *str, int nob, __u32 *addr) +{ + unsigned int a; + unsigned int b; + unsigned int c; + unsigned int d; + int n = nob; /* XscanfX */ + + /* numeric IP? 
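+ * -- e.g. "10.1.2.3".  Note the trailing %n together with the n == nob
+ * check below: n is pre-initialised to nob, so the parse is accepted only
+ * when sscanf consumed the whole string (i.e. no trailing junk).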
*/ + if (sscanf(str, "%u.%u.%u.%u%n", &a, &b, &c, &d, &n) >= 4 && + n == nob && + (a & ~0xff) == 0 && (b & ~0xff) == 0 && + (c & ~0xff) == 0 && (d & ~0xff) == 0) { + *addr = ((a<<24)|(b<<16)|(c<<8)|d); + return 1; + } + + return 0; +} + +static void libcfs_decnum_addr2str(__u32 addr, char *str) +{ + snprintf(str, LNET_NIDSTR_SIZE, "%u", addr); +} + +static void libcfs_hexnum_addr2str(__u32 addr, char *str) +{ + snprintf(str, LNET_NIDSTR_SIZE, "0x%x", addr); +} + +static int libcfs_num_str2addr(const char *str, int nob, __u32 *addr) +{ + int n; + + n = nob; + if (sscanf(str, "0x%x%n", addr, &n) >= 1 && n == nob) + return 1; + + n = nob; + if (sscanf(str, "0X%x%n", addr, &n) >= 1 && n == nob) + return 1; + + n = nob; + if (sscanf(str, "%u%n", addr, &n) >= 1 && n == nob) + return 1; + + return 0; +} + +/** + * Nf_parse_addrlist method for networks using numeric addresses. + * + * Examples of such networks are gm and elan. + * + * \retval 0 if \a str parsed to numeric address + * \retval errno otherwise + */ +static int +libcfs_num_parse(char *str, int len, struct list_head *list) +{ + struct cfs_expr_list *el; + int rc; + + rc = cfs_expr_list_parse(str, len, 0, MAX_NUMERIC_VALUE, &el); + if (rc == 0) + list_add_tail(&el->el_link, list); + + return rc; +} + +/* + * Nf_match_addr method for networks using numeric addresses + * + * \retval 1 on match + * \retval 0 otherwise + */ +static int +libcfs_num_match(__u32 addr, struct list_head *numaddr) +{ + struct cfs_expr_list *el; + + LASSERT(!list_empty(numaddr)); + el = list_entry(numaddr->next, struct cfs_expr_list, el_link); + + return cfs_expr_list_match(addr, el); +} + +struct netstrfns { + int nf_type; + char *nf_name; + char *nf_modname; + void (*nf_addr2str)(__u32 addr, char *str); + int (*nf_str2addr)(const char *str, int nob, __u32 *addr); + int (*nf_parse_addrlist)(char *str, int len, + struct list_head *list); + int (*nf_match_addr)(__u32 addr, struct list_head *list); +}; + +static struct netstrfns libcfs_netstrfns[] = { + {/* .nf_type */ LOLND, + /* .nf_name */ "lo", + /* .nf_modname */ "klolnd", + /* .nf_addr2str */ libcfs_decnum_addr2str, + /* .nf_str2addr */ libcfs_lo_str2addr, + /* .nf_parse_addr*/ libcfs_num_parse, + /* .nf_match_addr*/ libcfs_num_match}, + {/* .nf_type */ SOCKLND, + /* .nf_name */ "tcp", + /* .nf_modname */ "ksocklnd", + /* .nf_addr2str */ libcfs_ip_addr2str, + /* .nf_str2addr */ libcfs_ip_str2addr, + /* .nf_parse_addrlist*/ cfs_ip_addr_parse, + /* .nf_match_addr*/ cfs_ip_addr_match}, + {/* .nf_type */ O2IBLND, + /* .nf_name */ "o2ib", + /* .nf_modname */ "ko2iblnd", + /* .nf_addr2str */ libcfs_ip_addr2str, + /* .nf_str2addr */ libcfs_ip_str2addr, + /* .nf_parse_addrlist*/ cfs_ip_addr_parse, + /* .nf_match_addr*/ cfs_ip_addr_match}, + {/* .nf_type */ CIBLND, + /* .nf_name */ "cib", + /* .nf_modname */ "kciblnd", + /* .nf_addr2str */ libcfs_ip_addr2str, + /* .nf_str2addr */ libcfs_ip_str2addr, + /* .nf_parse_addrlist*/ cfs_ip_addr_parse, + /* .nf_match_addr*/ cfs_ip_addr_match}, + {/* .nf_type */ OPENIBLND, + /* .nf_name */ "openib", + /* .nf_modname */ "kopeniblnd", + /* .nf_addr2str */ libcfs_ip_addr2str, + /* .nf_str2addr */ libcfs_ip_str2addr, + /* .nf_parse_addrlist*/ cfs_ip_addr_parse, + /* .nf_match_addr*/ cfs_ip_addr_match}, + {/* .nf_type */ IIBLND, + /* .nf_name */ "iib", + /* .nf_modname */ "kiiblnd", + /* .nf_addr2str */ libcfs_ip_addr2str, + /* .nf_str2addr */ libcfs_ip_str2addr, + /* .nf_parse_addrlist*/ cfs_ip_addr_parse, + /* .nf_match_addr*/ cfs_ip_addr_match}, + {/* .nf_type */ VIBLND, + /* 
.nf_name */ "vib", + /* .nf_modname */ "kviblnd", + /* .nf_addr2str */ libcfs_ip_addr2str, + /* .nf_str2addr */ libcfs_ip_str2addr, + /* .nf_parse_addrlist*/ cfs_ip_addr_parse, + /* .nf_match_addr*/ cfs_ip_addr_match}, + {/* .nf_type */ RALND, + /* .nf_name */ "ra", + /* .nf_modname */ "kralnd", + /* .nf_addr2str */ libcfs_ip_addr2str, + /* .nf_str2addr */ libcfs_ip_str2addr, + /* .nf_parse_addrlist*/ cfs_ip_addr_parse, + /* .nf_match_addr*/ cfs_ip_addr_match}, + {/* .nf_type */ QSWLND, + /* .nf_name */ "elan", + /* .nf_modname */ "kqswlnd", + /* .nf_addr2str */ libcfs_decnum_addr2str, + /* .nf_str2addr */ libcfs_num_str2addr, + /* .nf_parse_addrlist*/ libcfs_num_parse, + /* .nf_match_addr*/ libcfs_num_match}, + {/* .nf_type */ GMLND, + /* .nf_name */ "gm", + /* .nf_modname */ "kgmlnd", + /* .nf_addr2str */ libcfs_hexnum_addr2str, + /* .nf_str2addr */ libcfs_num_str2addr, + /* .nf_parse_addrlist*/ libcfs_num_parse, + /* .nf_match_addr*/ libcfs_num_match}, + {/* .nf_type */ MXLND, + /* .nf_name */ "mx", + /* .nf_modname */ "kmxlnd", + /* .nf_addr2str */ libcfs_ip_addr2str, + /* .nf_str2addr */ libcfs_ip_str2addr, + /* .nf_parse_addrlist*/ cfs_ip_addr_parse, + /* .nf_match_addr*/ cfs_ip_addr_match}, + {/* .nf_type */ PTLLND, + /* .nf_name */ "ptl", + /* .nf_modname */ "kptllnd", + /* .nf_addr2str */ libcfs_decnum_addr2str, + /* .nf_str2addr */ libcfs_num_str2addr, + /* .nf_parse_addrlist*/ libcfs_num_parse, + /* .nf_match_addr*/ libcfs_num_match}, + {/* .nf_type */ GNILND, + /* .nf_name */ "gni", + /* .nf_modname */ "kgnilnd", + /* .nf_addr2str */ libcfs_decnum_addr2str, + /* .nf_str2addr */ libcfs_num_str2addr, + /* .nf_parse_addrlist*/ libcfs_num_parse, + /* .nf_match_addr*/ libcfs_num_match}, + /* placeholder for net0 alias. It MUST BE THE LAST ENTRY */ + {/* .nf_type */ -1}, +}; + +static const int libcfs_nnetstrfns = ARRAY_SIZE(libcfs_netstrfns); + +/* CAVEAT EMPTOR XscanfX + * I use "%n" at the end of a sscanf format to detect trailing junk. However + * sscanf may return immediately if it sees the terminating '0' in a string, so + * I initialise the %n variable to the expected length. If sscanf sets it; + * fine, if it doesn't, then the scan ended at the end of the string, which is + * fine too :) */ + +static struct netstrfns * +libcfs_lnd2netstrfns(int lnd) +{ + int i; + + if (lnd >= 0) + for (i = 0; i < libcfs_nnetstrfns; i++) + if (lnd == libcfs_netstrfns[i].nf_type) + return &libcfs_netstrfns[i]; + + return NULL; +} + +static struct netstrfns * +libcfs_namenum2netstrfns(const char *name) +{ + struct netstrfns *nf; + int i; + + for (i = 0; i < libcfs_nnetstrfns; i++) { + nf = &libcfs_netstrfns[i]; + if (nf->nf_type >= 0 && + !strncmp(name, nf->nf_name, strlen(nf->nf_name))) + return nf; + } + return NULL; +} + +static struct netstrfns * +libcfs_name2netstrfns(const char *name) +{ + int i; + + for (i = 0; i < libcfs_nnetstrfns; i++) + if (libcfs_netstrfns[i].nf_type >= 0 && + !strcmp(libcfs_netstrfns[i].nf_name, name)) + return &libcfs_netstrfns[i]; + + return NULL; +} + +int +libcfs_isknown_lnd(int type) +{ + return libcfs_lnd2netstrfns(type) != NULL; +} +EXPORT_SYMBOL(libcfs_isknown_lnd); + +char * +libcfs_lnd2modname(int lnd) +{ + struct netstrfns *nf = libcfs_lnd2netstrfns(lnd); + + return (nf == NULL) ? 
NULL : nf->nf_modname; +} +EXPORT_SYMBOL(libcfs_lnd2modname); + +char * +libcfs_lnd2str(int lnd) +{ + char *str; + struct netstrfns *nf = libcfs_lnd2netstrfns(lnd); + + if (nf != NULL) + return nf->nf_name; + + str = libcfs_next_nidstring(); + snprintf(str, LNET_NIDSTR_SIZE, "?%d?", lnd); + return str; +} +EXPORT_SYMBOL(libcfs_lnd2str); + +int +libcfs_str2lnd(const char *str) +{ + struct netstrfns *nf = libcfs_name2netstrfns(str); + + if (nf != NULL) + return nf->nf_type; + + return -1; +} +EXPORT_SYMBOL(libcfs_str2lnd); + +char * +libcfs_net2str(__u32 net) +{ + int lnd = LNET_NETTYP(net); + int num = LNET_NETNUM(net); + struct netstrfns *nf = libcfs_lnd2netstrfns(lnd); + char *str = libcfs_next_nidstring(); + + if (nf == NULL) + snprintf(str, LNET_NIDSTR_SIZE, "<%d:%d>", lnd, num); + else if (num == 0) + snprintf(str, LNET_NIDSTR_SIZE, "%s", nf->nf_name); + else + snprintf(str, LNET_NIDSTR_SIZE, "%s%d", nf->nf_name, num); + + return str; +} +EXPORT_SYMBOL(libcfs_net2str); + +char * +libcfs_nid2str(lnet_nid_t nid) +{ + __u32 addr = LNET_NIDADDR(nid); + __u32 net = LNET_NIDNET(nid); + int lnd = LNET_NETTYP(net); + int nnum = LNET_NETNUM(net); + struct netstrfns *nf; + char *str; + int nob; + + if (nid == LNET_NID_ANY) + return ""; + + nf = libcfs_lnd2netstrfns(lnd); + str = libcfs_next_nidstring(); + + if (nf == NULL) + snprintf(str, LNET_NIDSTR_SIZE, "%x@<%d:%d>", addr, lnd, nnum); + else { + nf->nf_addr2str(addr, str); + nob = strlen(str); + if (nnum == 0) + snprintf(str + nob, LNET_NIDSTR_SIZE - nob, "@%s", + nf->nf_name); + else + snprintf(str + nob, LNET_NIDSTR_SIZE - nob, "@%s%d", + nf->nf_name, nnum); + } + + return str; +} +EXPORT_SYMBOL(libcfs_nid2str); + +static struct netstrfns * +libcfs_str2net_internal(const char *str, __u32 *net) +{ + struct netstrfns *uninitialized_var(nf); + int nob; + unsigned int netnum; + int i; + + for (i = 0; i < libcfs_nnetstrfns; i++) { + nf = &libcfs_netstrfns[i]; + if (nf->nf_type >= 0 && + !strncmp(str, nf->nf_name, strlen(nf->nf_name))) + break; + } + + if (i == libcfs_nnetstrfns) + return NULL; + + nob = strlen(nf->nf_name); + + if (strlen(str) == (unsigned int)nob) { + netnum = 0; + } else { + if (nf->nf_type == LOLND) /* net number not allowed */ + return NULL; + + str += nob; + i = strlen(str); + if (sscanf(str, "%u%n", &netnum, &i) < 1 || + i != (int)strlen(str)) + return NULL; + } + + *net = LNET_MKNET(nf->nf_type, netnum); + return nf; +} + +__u32 +libcfs_str2net(const char *str) +{ + __u32 net; + + if (libcfs_str2net_internal(str, &net) != NULL) + return net; + + return LNET_NIDNET(LNET_NID_ANY); +} +EXPORT_SYMBOL(libcfs_str2net); + +lnet_nid_t +libcfs_str2nid(const char *str) +{ + const char *sep = strchr(str, '@'); + struct netstrfns *nf; + __u32 net; + __u32 addr; + + if (sep != NULL) { + nf = libcfs_str2net_internal(sep + 1, &net); + if (nf == NULL) + return LNET_NID_ANY; + } else { + sep = str + strlen(str); + net = LNET_MKNET(SOCKLND, 0); + nf = libcfs_lnd2netstrfns(SOCKLND); + LASSERT(nf != NULL); + } + + if (!nf->nf_str2addr(str, (int)(sep - str), &addr)) + return LNET_NID_ANY; + + return LNET_MKNID(net, addr); +} +EXPORT_SYMBOL(libcfs_str2nid); + +char * +libcfs_id2str(lnet_process_id_t id) +{ + char *str = libcfs_next_nidstring(); + + if (id.pid == LNET_PID_ANY) { + snprintf(str, LNET_NIDSTR_SIZE, + "LNET_PID_ANY-%s", libcfs_nid2str(id.nid)); + return str; + } + + snprintf(str, LNET_NIDSTR_SIZE, "%s%u-%s", + ((id.pid & LNET_PID_USERFLAG) != 0) ? 
"U" : "", + (id.pid & ~LNET_PID_USERFLAG), libcfs_nid2str(id.nid)); + return str; +} +EXPORT_SYMBOL(libcfs_id2str); + +int +libcfs_str2anynid(lnet_nid_t *nidp, const char *str) +{ + if (!strcmp(str, "*")) { + *nidp = LNET_NID_ANY; + return 1; + } + + *nidp = libcfs_str2nid(str); + return *nidp != LNET_NID_ANY; +} +EXPORT_SYMBOL(libcfs_str2anynid); + +/** + * Nid range list syntax. + * \verbatim + * + * :== [ ' ' ] + * :== '@' + * :== '*' | + * | + * + * :== ... + * + * :== | + * + * :== '[' [ ',' ] ']' + * :== | + * '-' | + * '-' '/' + * :== | + * :== "lo" | "tcp" | "o2ib" | "cib" | "openib" | "iib" | + * "vib" | "ra" | "elan" | "mx" | "ptl" + * \endverbatim + */ + +/** + * Structure to represent \ token of the syntax. + * + * One of this is created for each \ parsed. + */ +struct nidrange { + /** + * Link to list of this structures which is built on nid range + * list parsing. + */ + struct list_head nr_link; + /** + * List head for addrrange::ar_link. + */ + struct list_head nr_addrranges; + /** + * Flag indicating that *@ is found. + */ + int nr_all; + /** + * Pointer to corresponding element of libcfs_netstrfns. + */ + struct netstrfns *nr_netstrfns; + /** + * Number of network. E.g. 5 if \ is "elan5". + */ + int nr_netnum; +}; + +/** + * Structure to represent \ token of the syntax. + */ +struct addrrange { + /** + * Link to nidrange::nr_addrranges. + */ + struct list_head ar_link; + /** + * List head for cfs_expr_list::el_list. + */ + struct list_head ar_numaddr_ranges; +}; + +/** + * Parses \ token on the syntax. + * + * Allocates struct addrrange and links to \a nidrange via + * (nidrange::nr_addrranges) + * + * \retval 1 if \a src parses to '*' | \ | \ + * \retval 0 otherwise + */ +static int +parse_addrange(const struct cfs_lstr *src, struct nidrange *nidrange) +{ + struct addrrange *addrrange; + + if (src->ls_len == 1 && src->ls_str[0] == '*') { + nidrange->nr_all = 1; + return 1; + } + + LIBCFS_ALLOC(addrrange, sizeof(struct addrrange)); + if (addrrange == NULL) + return 0; + list_add_tail(&addrrange->ar_link, &nidrange->nr_addrranges); + INIT_LIST_HEAD(&addrrange->ar_numaddr_ranges); + + return nidrange->nr_netstrfns->nf_parse_addrlist(src->ls_str, + src->ls_len, + &addrrange->ar_numaddr_ranges); +} + +/** + * Finds or creates struct nidrange. + * + * Checks if \a src is a valid network name, looks for corresponding + * nidrange on the ist of nidranges (\a nidlist), creates new struct + * nidrange if it is not found. + * + * \retval pointer to struct nidrange matching network specified via \a src + * \retval NULL if \a src does not match any network + */ +static struct nidrange * +add_nidrange(const struct cfs_lstr *src, + struct list_head *nidlist) +{ + struct netstrfns *nf; + struct nidrange *nr; + int endlen; + unsigned netnum; + + if (src->ls_len >= LNET_NIDSTR_SIZE) + return NULL; + + nf = libcfs_namenum2netstrfns(src->ls_str); + if (nf == NULL) + return NULL; + endlen = src->ls_len - strlen(nf->nf_name); + if (endlen == 0) + /* network name only, e.g. "elan" or "tcp" */ + netnum = 0; + else { + /* e.g. 
"elan25" or "tcp23", refuse to parse if + * network name is not appended with decimal or + * hexadecimal number */ + if (!cfs_str2num_check(src->ls_str + strlen(nf->nf_name), + endlen, &netnum, 0, MAX_NUMERIC_VALUE)) + return NULL; + } + + list_for_each_entry(nr, nidlist, nr_link) { + if (nr->nr_netstrfns != nf) + continue; + if (nr->nr_netnum != netnum) + continue; + return nr; + } + + LIBCFS_ALLOC(nr, sizeof(struct nidrange)); + if (nr == NULL) + return NULL; + list_add_tail(&nr->nr_link, nidlist); + INIT_LIST_HEAD(&nr->nr_addrranges); + nr->nr_netstrfns = nf; + nr->nr_all = 0; + nr->nr_netnum = netnum; + + return nr; +} + +/** + * Parses \ token of the syntax. + * + * \retval 1 if \a src parses to \ '@' \ + * \retval 0 otherwise + */ +static int +parse_nidrange(struct cfs_lstr *src, struct list_head *nidlist) +{ + struct cfs_lstr addrrange; + struct cfs_lstr net; + struct cfs_lstr tmp; + struct nidrange *nr; + + tmp = *src; + if (cfs_gettok(src, '@', &addrrange) == 0) + goto failed; + + if (cfs_gettok(src, '@', &net) == 0 || src->ls_str != NULL) + goto failed; + + nr = add_nidrange(&net, nidlist); + if (nr == NULL) + goto failed; + + if (parse_addrange(&addrrange, nr) != 0) + goto failed; + + return 1; + failed: + CWARN("can't parse nidrange: \"%.*s\"\n", tmp.ls_len, tmp.ls_str); + return 0; +} + +/** + * Frees addrrange structures of \a list. + * + * For each struct addrrange structure found on \a list it frees + * cfs_expr_list list attached to it and frees the addrrange itself. + * + * \retval none + */ +static void +free_addrranges(struct list_head *list) +{ + while (!list_empty(list)) { + struct addrrange *ar; + + ar = list_entry(list->next, struct addrrange, ar_link); + + cfs_expr_list_free_list(&ar->ar_numaddr_ranges); + list_del(&ar->ar_link); + LIBCFS_FREE(ar, sizeof(struct addrrange)); + } +} + +/** + * Frees nidrange strutures of \a list. + * + * For each struct nidrange structure found on \a list it frees + * addrrange list attached to it and frees the nidrange itself. + * + * \retval none + */ +void +cfs_free_nidlist(struct list_head *list) +{ + struct list_head *pos, *next; + struct nidrange *nr; + + list_for_each_safe(pos, next, list) { + nr = list_entry(pos, struct nidrange, nr_link); + free_addrranges(&nr->nr_addrranges); + list_del(pos); + LIBCFS_FREE(nr, sizeof(struct nidrange)); + } +} +EXPORT_SYMBOL(cfs_free_nidlist); + +/** + * Parses nid range list. + * + * Parses with rigorous syntax and overflow checking \a str into + * \ [ ' ' \ ], compiles \a str into set of + * structures and links that structure to \a nidlist. The resulting + * list can be used to match a NID againts set of NIDS defined by \a + * str. + * \see cfs_match_nid + * + * \retval 1 on success + * \retval 0 otherwise + */ +int +cfs_parse_nidlist(char *str, int len, struct list_head *nidlist) +{ + struct cfs_lstr src; + struct cfs_lstr res; + int rc; + + src.ls_str = str; + src.ls_len = len; + INIT_LIST_HEAD(nidlist); + while (src.ls_str) { + rc = cfs_gettok(&src, ' ', &res); + if (rc == 0) { + cfs_free_nidlist(nidlist); + return 0; + } + rc = parse_nidrange(&res, nidlist); + if (rc == 0) { + cfs_free_nidlist(nidlist); + return 0; + } + } + return 1; +} +EXPORT_SYMBOL(cfs_parse_nidlist); + +/** + * Matches a nid (\a nid) against the compiled list of nidranges (\a nidlist). 
+ * + * \see cfs_parse_nidlist() + * + * \retval 1 on match + * \retval 0 otherwises + */ +int cfs_match_nid(lnet_nid_t nid, struct list_head *nidlist) +{ + struct nidrange *nr; + struct addrrange *ar; + + list_for_each_entry(nr, nidlist, nr_link) { + if (nr->nr_netstrfns->nf_type != LNET_NETTYP(LNET_NIDNET(nid))) + continue; + if (nr->nr_netnum != LNET_NETNUM(LNET_NIDNET(nid))) + continue; + if (nr->nr_all) + return 1; + list_for_each_entry(ar, &nr->nr_addrranges, ar_link) + if (nr->nr_netstrfns->nf_match_addr(LNET_NIDADDR(nid), + &ar->ar_numaddr_ranges)) + return 1; + } + return 0; +} +EXPORT_SYMBOL(cfs_match_nid); diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/prng.c b/kernel/drivers/staging/lustre/lustre/libcfs/prng.c new file mode 100644 index 000000000..4147664ff --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/libcfs/prng.c @@ -0,0 +1,139 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/libcfs/prng.c + * + * concatenation of following two 16-bit multiply with carry generators + * x(n)=a*x(n-1)+carry mod 2^16 and y(n)=b*y(n-1)+carry mod 2^16, + * number and carry packed within the same 32 bit integer. 
+ * algorithm recommended by Marsaglia +*/ + +#include "../../include/linux/libcfs/libcfs.h" + +/* +From: George Marsaglia +Newsgroups: sci.math +Subject: Re: A RANDOM NUMBER GENERATOR FOR C +Date: Tue, 30 Sep 1997 05:29:35 -0700 + + * You may replace the two constants 36969 and 18000 by any + * pair of distinct constants from this list: + * 18000 18030 18273 18513 18879 19074 19098 19164 19215 19584 + * 19599 19950 20088 20508 20544 20664 20814 20970 21153 21243 + * 21423 21723 21954 22125 22188 22293 22860 22938 22965 22974 + * 23109 23124 23163 23208 23508 23520 23553 23658 23865 24114 + * 24219 24660 24699 24864 24948 25023 25308 25443 26004 26088 + * 26154 26550 26679 26838 27183 27258 27753 27795 27810 27834 + * 27960 28320 28380 28689 28710 28794 28854 28959 28980 29013 + * 29379 29889 30135 30345 30459 30714 30903 30963 31059 31083 + * (or any other 16-bit constants k for which both k*2^16-1 + * and k*2^15-1 are prime) */ + +#define RANDOM_CONST_A 18030 +#define RANDOM_CONST_B 29013 + +static unsigned int seed_x = 521288629; +static unsigned int seed_y = 362436069; + +/** + * cfs_rand - creates new seeds + * + * First it creates new seeds from the previous seeds. Then it generates a + * new pseudo random number for use. + * + * Returns a pseudo-random 32-bit integer + */ +unsigned int cfs_rand(void) +{ + seed_x = RANDOM_CONST_A * (seed_x & 65535) + (seed_x >> 16); + seed_y = RANDOM_CONST_B * (seed_y & 65535) + (seed_y >> 16); + + return ((seed_x << 16) + (seed_y & 65535)); +} +EXPORT_SYMBOL(cfs_rand); + +/** + * cfs_srand - sets the initial seed + * @seed1 : (seed_x) should have the most entropy in the low bits of the word + * @seed2 : (seed_y) should have the most entropy in the high bits of the word + * + * Replaces the original seeds with new values. Used to generate a new pseudo + * random numbers. + */ +void cfs_srand(unsigned int seed1, unsigned int seed2) +{ + if (seed1) + seed_x = seed1; /* use default seeds if parameter is 0 */ + if (seed2) + seed_y = seed2; +} +EXPORT_SYMBOL(cfs_srand); + +/** + * cfs_get_random_bytes - generate a bunch of random numbers + * @buf : buffer to fill with random numbers + * @size: size of passed in buffer + * + * Fills a buffer with random bytes + */ +void cfs_get_random_bytes(void *buf, int size) +{ + int *p = buf; + int rem, tmp; + + LASSERT(size >= 0); + + rem = min((int)((unsigned long)buf & (sizeof(int) - 1)), size); + if (rem) { + get_random_bytes(&tmp, sizeof(tmp)); + tmp ^= cfs_rand(); + memcpy(buf, &tmp, rem); + p = buf + rem; + size -= rem; + } + + while (size >= sizeof(int)) { + get_random_bytes(&tmp, sizeof(tmp)); + *p = cfs_rand() ^ tmp; + size -= sizeof(int); + p++; + } + buf = p; + if (size) { + get_random_bytes(&tmp, sizeof(tmp)); + tmp ^= cfs_rand(); + memcpy(buf, &tmp, size); + } +} +EXPORT_SYMBOL(cfs_get_random_bytes); diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/tracefile.c b/kernel/drivers/staging/lustre/lustre/libcfs/tracefile.c new file mode 100644 index 000000000..c86394f7f --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/libcfs/tracefile.c @@ -0,0 +1,1196 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
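cfs_rand() above concatenates two 16-bit multiply-with-carry generators, one per 32-bit seed word: the low 16 bits hold the current value and the high 16 bits the carry. A user-space sketch of the same update rule with the same constants (names are illustrative, not part of the patch; a 32-bit unsigned int is assumed, as in the kernel code):

#include <stdio.h>

#define MWC_CONST_A 18030       /* same pair as RANDOM_CONST_A/B above */
#define MWC_CONST_B 29013

static unsigned int mwc_x = 521288629; /* value in low 16 bits, carry in high 16 */
static unsigned int mwc_y = 362436069;

/* Same step as cfs_rand(): x = a*(x & 0xffff) + (x >> 16), likewise for y */
static unsigned int mwc_rand(void)
{
        mwc_x = MWC_CONST_A * (mwc_x & 65535) + (mwc_x >> 16);
        mwc_y = MWC_CONST_B * (mwc_y & 65535) + (mwc_y >> 16);

        /* pack 16 bits of x into the high half, 16 bits of y into the low half */
        return (mwc_x << 16) + (mwc_y & 65535);
}

int main(void)
{
        int i;

        for (i = 0; i < 4; i++)
                printf("%u\n", mwc_rand());
        return 0;
}

As the surrounding code shows, this is a statistical generator only; when stronger randomness is wanted, cfs_get_random_bytes() XORs its output with get_random_bytes().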
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/libcfs/tracefile.c + * + * Author: Zach Brown + * Author: Phil Schwan + */ + + +#define DEBUG_SUBSYSTEM S_LNET +#define LUSTRE_TRACEFILE_PRIVATE +#include "tracefile.h" + +#include "../../include/linux/libcfs/libcfs.h" + +/* XXX move things up to the top, comment */ +union cfs_trace_data_union (*cfs_trace_data[TCD_MAX_TYPES])[NR_CPUS] __cacheline_aligned; + +char cfs_tracefile[TRACEFILE_NAME_SIZE]; +long long cfs_tracefile_size = CFS_TRACEFILE_SIZE; +static struct tracefiled_ctl trace_tctl; +struct mutex cfs_trace_thread_mutex; +static int thread_running; + +static atomic_t cfs_tage_allocated = ATOMIC_INIT(0); + +static void put_pages_on_tcd_daemon_list(struct page_collection *pc, + struct cfs_trace_cpu_data *tcd); + +static inline struct cfs_trace_page * +cfs_tage_from_list(struct list_head *list) +{ + return list_entry(list, struct cfs_trace_page, linkage); +} + +static struct cfs_trace_page *cfs_tage_alloc(gfp_t gfp) +{ + struct page *page; + struct cfs_trace_page *tage; + + /* My caller is trying to free memory */ + if (!in_interrupt() && memory_pressure_get()) + return NULL; + + /* + * Don't spam console with allocation failures: they will be reported + * by upper layer anyway. + */ + gfp |= __GFP_NOWARN; + page = alloc_page(gfp); + if (page == NULL) + return NULL; + + tage = kmalloc(sizeof(*tage), gfp); + if (tage == NULL) { + __free_page(page); + return NULL; + } + + tage->page = page; + atomic_inc(&cfs_tage_allocated); + return tage; +} + +static void cfs_tage_free(struct cfs_trace_page *tage) +{ + __LASSERT(tage != NULL); + __LASSERT(tage->page != NULL); + + __free_page(tage->page); + kfree(tage); + atomic_dec(&cfs_tage_allocated); +} + +static void cfs_tage_to_tail(struct cfs_trace_page *tage, + struct list_head *queue) +{ + __LASSERT(tage != NULL); + __LASSERT(queue != NULL); + + list_move_tail(&tage->linkage, queue); +} + +int cfs_trace_refill_stock(struct cfs_trace_cpu_data *tcd, gfp_t gfp, + struct list_head *stock) +{ + int i; + + /* + * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT) + * from here: this will lead to infinite recursion. 
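cfs_tage_alloc() above allocates in two steps, a page and then its small descriptor, and unwinds the first step when the second fails so nothing leaks. A minimal user-space sketch of that allocate-then-unwind pattern (struct tage and PAGE_SZ here are simplified stand-ins, not the kernel types):

#include <stdlib.h>

#define PAGE_SZ 4096

struct tage {
        void *page;     /* the data page */
        size_t used;    /* bytes written so far */
};

/* Allocate page + descriptor; free the page if the descriptor allocation fails. */
static struct tage *tage_alloc(void)
{
        struct tage *t;
        void *page = malloc(PAGE_SZ);

        if (page == NULL)
                return NULL;

        t = malloc(sizeof(*t));
        if (t == NULL) {
                free(page);     /* undo step 1 on step-2 failure */
                return NULL;
        }

        t->page = page;
        t->used = 0;
        return t;
}

static void tage_free(struct tage *t)
{
        free(t->page);
        free(t);
}

int main(void)
{
        struct tage *t = tage_alloc();

        if (t != NULL)
                tage_free(t);
        return 0;
}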
+ */ + + for (i = 0; i + tcd->tcd_cur_stock_pages < TCD_STOCK_PAGES ; ++ i) { + struct cfs_trace_page *tage; + + tage = cfs_tage_alloc(gfp); + if (tage == NULL) + break; + list_add_tail(&tage->linkage, stock); + } + return i; +} + +/* return a page that has 'len' bytes left at the end */ +static struct cfs_trace_page * +cfs_trace_get_tage_try(struct cfs_trace_cpu_data *tcd, unsigned long len) +{ + struct cfs_trace_page *tage; + + if (tcd->tcd_cur_pages > 0) { + __LASSERT(!list_empty(&tcd->tcd_pages)); + tage = cfs_tage_from_list(tcd->tcd_pages.prev); + if (tage->used + len <= PAGE_CACHE_SIZE) + return tage; + } + + if (tcd->tcd_cur_pages < tcd->tcd_max_pages) { + if (tcd->tcd_cur_stock_pages > 0) { + tage = cfs_tage_from_list(tcd->tcd_stock_pages.prev); + --tcd->tcd_cur_stock_pages; + list_del_init(&tage->linkage); + } else { + tage = cfs_tage_alloc(GFP_ATOMIC); + if (unlikely(tage == NULL)) { + if ((!memory_pressure_get() || + in_interrupt()) && printk_ratelimit()) + printk(KERN_WARNING + "cannot allocate a tage (%ld)\n", + tcd->tcd_cur_pages); + return NULL; + } + } + + tage->used = 0; + tage->cpu = smp_processor_id(); + tage->type = tcd->tcd_type; + list_add_tail(&tage->linkage, &tcd->tcd_pages); + tcd->tcd_cur_pages++; + + if (tcd->tcd_cur_pages > 8 && thread_running) { + struct tracefiled_ctl *tctl = &trace_tctl; + /* + * wake up tracefiled to process some pages. + */ + wake_up(&tctl->tctl_waitq); + } + return tage; + } + return NULL; +} + +static void cfs_tcd_shrink(struct cfs_trace_cpu_data *tcd) +{ + int pgcount = tcd->tcd_cur_pages / 10; + struct page_collection pc; + struct cfs_trace_page *tage; + struct cfs_trace_page *tmp; + + /* + * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT) + * from here: this will lead to infinite recursion. + */ + + if (printk_ratelimit()) + printk(KERN_WARNING "debug daemon buffer overflowed; discarding 10%% of pages (%d of %ld)\n", + pgcount + 1, tcd->tcd_cur_pages); + + INIT_LIST_HEAD(&pc.pc_pages); + spin_lock_init(&pc.pc_lock); + + list_for_each_entry_safe(tage, tmp, &tcd->tcd_pages, linkage) { + if (pgcount-- == 0) + break; + + list_move_tail(&tage->linkage, &pc.pc_pages); + tcd->tcd_cur_pages--; + } + put_pages_on_tcd_daemon_list(&pc, tcd); +} + +/* return a page that has 'len' bytes left at the end */ +static struct cfs_trace_page *cfs_trace_get_tage(struct cfs_trace_cpu_data *tcd, + unsigned long len) +{ + struct cfs_trace_page *tage; + + /* + * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT) + * from here: this will lead to infinite recursion. + */ + + if (len > PAGE_CACHE_SIZE) { + pr_err("cowardly refusing to write %lu bytes in a page\n", len); + return NULL; + } + + tage = cfs_trace_get_tage_try(tcd, len); + if (tage != NULL) + return tage; + if (thread_running) + cfs_tcd_shrink(tcd); + if (tcd->tcd_cur_pages > 0) { + tage = cfs_tage_from_list(tcd->tcd_pages.next); + tage->used = 0; + cfs_tage_to_tail(tage, &tcd->tcd_pages); + } + return tage; +} + +int libcfs_debug_msg(struct libcfs_debug_msg_data *msgdata, + const char *format, ...) +{ + va_list args; + int rc; + + va_start(args, format); + rc = libcfs_debug_vmsg2(msgdata, format, args, NULL); + va_end(args); + + return rc; +} +EXPORT_SYMBOL(libcfs_debug_msg); + +int libcfs_debug_vmsg2(struct libcfs_debug_msg_data *msgdata, + const char *format1, va_list args, + const char *format2, ...) 
+{ + struct cfs_trace_cpu_data *tcd = NULL; + struct ptldebug_header header = {0}; + struct cfs_trace_page *tage; + /* string_buf is used only if tcd != NULL, and is always set then */ + char *string_buf = NULL; + char *debug_buf; + int known_size; + int needed = 85; /* average message length */ + int max_nob; + va_list ap; + int depth; + int i; + int remain; + int mask = msgdata->msg_mask; + const char *file = kbasename(msgdata->msg_file); + struct cfs_debug_limit_state *cdls = msgdata->msg_cdls; + + tcd = cfs_trace_get_tcd(); + + /* cfs_trace_get_tcd() grabs a lock, which disables preemption and + * pins us to a particular CPU. This avoids an smp_processor_id() + * warning on Linux when debugging is enabled. */ + cfs_set_ptldebug_header(&header, msgdata, CDEBUG_STACK()); + + if (tcd == NULL) /* arch may not log in IRQ context */ + goto console; + + if (tcd->tcd_cur_pages == 0) + header.ph_flags |= PH_FLAG_FIRST_RECORD; + + if (tcd->tcd_shutting_down) { + cfs_trace_put_tcd(tcd); + tcd = NULL; + goto console; + } + + depth = __current_nesting_level(); + known_size = strlen(file) + 1 + depth; + if (msgdata->msg_fn) + known_size += strlen(msgdata->msg_fn) + 1; + + if (libcfs_debug_binary) + known_size += sizeof(header); + + /*/ + * '2' used because vsnprintf return real size required for output + * _without_ terminating NULL. + * if needed is to small for this format. + */ + for (i = 0; i < 2; i++) { + tage = cfs_trace_get_tage(tcd, needed + known_size + 1); + if (tage == NULL) { + if (needed + known_size > PAGE_CACHE_SIZE) + mask |= D_ERROR; + + cfs_trace_put_tcd(tcd); + tcd = NULL; + goto console; + } + + string_buf = (char *)page_address(tage->page) + + tage->used + known_size; + + max_nob = PAGE_CACHE_SIZE - tage->used - known_size; + if (max_nob <= 0) { + printk(KERN_EMERG "negative max_nob: %d\n", + max_nob); + mask |= D_ERROR; + cfs_trace_put_tcd(tcd); + tcd = NULL; + goto console; + } + + needed = 0; + if (format1) { + va_copy(ap, args); + needed = vsnprintf(string_buf, max_nob, format1, ap); + va_end(ap); + } + + if (format2) { + remain = max_nob - needed; + if (remain < 0) + remain = 0; + + va_start(ap, format2); + needed += vsnprintf(string_buf + needed, remain, + format2, ap); + va_end(ap); + } + + if (needed < max_nob) /* well. printing ok.. 
*/ + break; + } + + if (*(string_buf+needed-1) != '\n') + printk(KERN_INFO "format at %s:%d:%s doesn't end in newline\n", + file, msgdata->msg_line, msgdata->msg_fn); + + header.ph_len = known_size + needed; + debug_buf = (char *)page_address(tage->page) + tage->used; + + if (libcfs_debug_binary) { + memcpy(debug_buf, &header, sizeof(header)); + tage->used += sizeof(header); + debug_buf += sizeof(header); + } + + /* indent message according to the nesting level */ + while (depth-- > 0) { + *(debug_buf++) = '.'; + ++ tage->used; + } + + strcpy(debug_buf, file); + tage->used += strlen(file) + 1; + debug_buf += strlen(file) + 1; + + if (msgdata->msg_fn) { + strcpy(debug_buf, msgdata->msg_fn); + tage->used += strlen(msgdata->msg_fn) + 1; + debug_buf += strlen(msgdata->msg_fn) + 1; + } + + __LASSERT(debug_buf == string_buf); + + tage->used += needed; + __LASSERT (tage->used <= PAGE_CACHE_SIZE); + +console: + if ((mask & libcfs_printk) == 0) { + /* no console output requested */ + if (tcd != NULL) + cfs_trace_put_tcd(tcd); + return 1; + } + + if (cdls != NULL) { + if (libcfs_console_ratelimit && + cdls->cdls_next != 0 && /* not first time ever */ + !cfs_time_after(cfs_time_current(), cdls->cdls_next)) { + /* skipping a console message */ + cdls->cdls_count++; + if (tcd != NULL) + cfs_trace_put_tcd(tcd); + return 1; + } + + if (cfs_time_after(cfs_time_current(), cdls->cdls_next + + libcfs_console_max_delay + + cfs_time_seconds(10))) { + /* last timeout was a long time ago */ + cdls->cdls_delay /= libcfs_console_backoff * 4; + } else { + cdls->cdls_delay *= libcfs_console_backoff; + } + + if (cdls->cdls_delay < libcfs_console_min_delay) + cdls->cdls_delay = libcfs_console_min_delay; + else if (cdls->cdls_delay > libcfs_console_max_delay) + cdls->cdls_delay = libcfs_console_max_delay; + + /* ensure cdls_next is never zero after it's been seen */ + cdls->cdls_next = (cfs_time_current() + cdls->cdls_delay) | 1; + } + + if (tcd != NULL) { + cfs_print_to_console(&header, mask, string_buf, needed, file, + msgdata->msg_fn); + cfs_trace_put_tcd(tcd); + } else { + string_buf = cfs_trace_get_console_buffer(); + + needed = 0; + if (format1 != NULL) { + va_copy(ap, args); + needed = vsnprintf(string_buf, + CFS_TRACE_CONSOLE_BUFFER_SIZE, + format1, ap); + va_end(ap); + } + if (format2 != NULL) { + remain = CFS_TRACE_CONSOLE_BUFFER_SIZE - needed; + if (remain > 0) { + va_start(ap, format2); + needed += vsnprintf(string_buf+needed, remain, + format2, ap); + va_end(ap); + } + } + cfs_print_to_console(&header, mask, + string_buf, needed, file, msgdata->msg_fn); + + cfs_trace_put_console_buffer(string_buf); + } + + if (cdls != NULL && cdls->cdls_count != 0) { + string_buf = cfs_trace_get_console_buffer(); + + needed = snprintf(string_buf, CFS_TRACE_CONSOLE_BUFFER_SIZE, + "Skipped %d previous similar message%s\n", + cdls->cdls_count, + (cdls->cdls_count > 1) ? 
"s" : ""); + + cfs_print_to_console(&header, mask, + string_buf, needed, file, msgdata->msg_fn); + + cfs_trace_put_console_buffer(string_buf); + cdls->cdls_count = 0; + } + + return 0; +} +EXPORT_SYMBOL(libcfs_debug_vmsg2); + +void +cfs_trace_assertion_failed(const char *str, + struct libcfs_debug_msg_data *msgdata) +{ + struct ptldebug_header hdr; + + libcfs_panic_in_progress = 1; + libcfs_catastrophe = 1; + mb(); + + cfs_set_ptldebug_header(&hdr, msgdata, CDEBUG_STACK()); + + cfs_print_to_console(&hdr, D_EMERG, str, strlen(str), + msgdata->msg_file, msgdata->msg_fn); + + panic("Lustre debug assertion failure\n"); + + /* not reached */ +} + +static void +panic_collect_pages(struct page_collection *pc) +{ + /* Do the collect_pages job on a single CPU: assumes that all other + * CPUs have been stopped during a panic. If this isn't true for some + * arch, this will have to be implemented separately in each arch. */ + int i; + int j; + struct cfs_trace_cpu_data *tcd; + + INIT_LIST_HEAD(&pc->pc_pages); + + cfs_tcd_for_each(tcd, i, j) { + list_splice_init(&tcd->tcd_pages, &pc->pc_pages); + tcd->tcd_cur_pages = 0; + + if (pc->pc_want_daemon_pages) { + list_splice_init(&tcd->tcd_daemon_pages, + &pc->pc_pages); + tcd->tcd_cur_daemon_pages = 0; + } + } +} + +static void collect_pages_on_all_cpus(struct page_collection *pc) +{ + struct cfs_trace_cpu_data *tcd; + int i, cpu; + + spin_lock(&pc->pc_lock); + for_each_possible_cpu(cpu) { + cfs_tcd_for_each_type_lock(tcd, i, cpu) { + list_splice_init(&tcd->tcd_pages, &pc->pc_pages); + tcd->tcd_cur_pages = 0; + if (pc->pc_want_daemon_pages) { + list_splice_init(&tcd->tcd_daemon_pages, + &pc->pc_pages); + tcd->tcd_cur_daemon_pages = 0; + } + } + } + spin_unlock(&pc->pc_lock); +} + +static void collect_pages(struct page_collection *pc) +{ + INIT_LIST_HEAD(&pc->pc_pages); + + if (libcfs_panic_in_progress) + panic_collect_pages(pc); + else + collect_pages_on_all_cpus(pc); +} + +static void put_pages_back_on_all_cpus(struct page_collection *pc) +{ + struct cfs_trace_cpu_data *tcd; + struct list_head *cur_head; + struct cfs_trace_page *tage; + struct cfs_trace_page *tmp; + int i, cpu; + + spin_lock(&pc->pc_lock); + for_each_possible_cpu(cpu) { + cfs_tcd_for_each_type_lock(tcd, i, cpu) { + cur_head = tcd->tcd_pages.next; + + list_for_each_entry_safe(tage, tmp, &pc->pc_pages, + linkage) { + + __LASSERT_TAGE_INVARIANT(tage); + + if (tage->cpu != cpu || tage->type != i) + continue; + + cfs_tage_to_tail(tage, cur_head); + tcd->tcd_cur_pages++; + } + } + } + spin_unlock(&pc->pc_lock); +} + +static void put_pages_back(struct page_collection *pc) +{ + if (!libcfs_panic_in_progress) + put_pages_back_on_all_cpus(pc); +} + +/* Add pages to a per-cpu debug daemon ringbuffer. This buffer makes sure that + * we have a good amount of data at all times for dumping during an LBUG, even + * if we have been steadily writing (and otherwise discarding) pages via the + * debug daemon. 
*/ +static void put_pages_on_tcd_daemon_list(struct page_collection *pc, + struct cfs_trace_cpu_data *tcd) +{ + struct cfs_trace_page *tage; + struct cfs_trace_page *tmp; + + spin_lock(&pc->pc_lock); + list_for_each_entry_safe(tage, tmp, &pc->pc_pages, linkage) { + + __LASSERT_TAGE_INVARIANT(tage); + + if (tage->cpu != tcd->tcd_cpu || tage->type != tcd->tcd_type) + continue; + + cfs_tage_to_tail(tage, &tcd->tcd_daemon_pages); + tcd->tcd_cur_daemon_pages++; + + if (tcd->tcd_cur_daemon_pages > tcd->tcd_max_pages) { + struct cfs_trace_page *victim; + + __LASSERT(!list_empty(&tcd->tcd_daemon_pages)); + victim = cfs_tage_from_list(tcd->tcd_daemon_pages.next); + + __LASSERT_TAGE_INVARIANT(victim); + + list_del(&victim->linkage); + cfs_tage_free(victim); + tcd->tcd_cur_daemon_pages--; + } + } + spin_unlock(&pc->pc_lock); +} + +static void put_pages_on_daemon_list(struct page_collection *pc) +{ + struct cfs_trace_cpu_data *tcd; + int i, cpu; + + for_each_possible_cpu(cpu) { + cfs_tcd_for_each_type_lock(tcd, i, cpu) + put_pages_on_tcd_daemon_list(pc, tcd); + } +} + +void cfs_trace_debug_print(void) +{ + struct page_collection pc; + struct cfs_trace_page *tage; + struct cfs_trace_page *tmp; + + spin_lock_init(&pc.pc_lock); + + pc.pc_want_daemon_pages = 1; + collect_pages(&pc); + list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) { + char *p, *file, *fn; + struct page *page; + + __LASSERT_TAGE_INVARIANT(tage); + + page = tage->page; + p = page_address(page); + while (p < ((char *)page_address(page) + tage->used)) { + struct ptldebug_header *hdr; + int len; + hdr = (void *)p; + p += sizeof(*hdr); + file = p; + p += strlen(file) + 1; + fn = p; + p += strlen(fn) + 1; + len = hdr->ph_len - (int)(p - (char *)hdr); + + cfs_print_to_console(hdr, D_EMERG, p, len, file, fn); + + p += len; + } + + list_del(&tage->linkage); + cfs_tage_free(tage); + } +} + +int cfs_tracefile_dump_all_pages(char *filename) +{ + struct page_collection pc; + struct file *filp; + struct cfs_trace_page *tage; + struct cfs_trace_page *tmp; + char *buf; + int rc; + + DECL_MMSPACE; + + cfs_tracefile_write_lock(); + + filp = filp_open(filename, O_CREAT|O_EXCL|O_WRONLY|O_LARGEFILE, 0600); + if (IS_ERR(filp)) { + rc = PTR_ERR(filp); + filp = NULL; + pr_err("LustreError: can't open %s for dump: rc %d\n", + filename, rc); + goto out; + } + + spin_lock_init(&pc.pc_lock); + pc.pc_want_daemon_pages = 1; + collect_pages(&pc); + if (list_empty(&pc.pc_pages)) { + rc = 0; + goto close; + } + + /* ok, for now, just write the pages. 
in the future we'll be building + * iobufs with the pages and calling generic_direct_IO */ + MMSPACE_OPEN; + list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) { + + __LASSERT_TAGE_INVARIANT(tage); + + buf = kmap(tage->page); + rc = vfs_write(filp, (__force const char __user *)buf, + tage->used, &filp->f_pos); + kunmap(tage->page); + + if (rc != (int)tage->used) { + printk(KERN_WARNING "wanted to write %u but wrote %d\n", + tage->used, rc); + put_pages_back(&pc); + __LASSERT(list_empty(&pc.pc_pages)); + break; + } + list_del(&tage->linkage); + cfs_tage_free(tage); + } + MMSPACE_CLOSE; + rc = vfs_fsync(filp, 1); + if (rc) + pr_err("sync returns %d\n", rc); +close: + filp_close(filp, NULL); +out: + cfs_tracefile_write_unlock(); + return rc; +} + +void cfs_trace_flush_pages(void) +{ + struct page_collection pc; + struct cfs_trace_page *tage; + struct cfs_trace_page *tmp; + + spin_lock_init(&pc.pc_lock); + + pc.pc_want_daemon_pages = 1; + collect_pages(&pc); + list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) { + + __LASSERT_TAGE_INVARIANT(tage); + + list_del(&tage->linkage); + cfs_tage_free(tage); + } +} + +int cfs_trace_copyin_string(char *knl_buffer, int knl_buffer_nob, + const char __user *usr_buffer, int usr_buffer_nob) +{ + int nob; + + if (usr_buffer_nob > knl_buffer_nob) + return -EOVERFLOW; + + if (copy_from_user((void *)knl_buffer, + usr_buffer, usr_buffer_nob)) + return -EFAULT; + + nob = strnlen(knl_buffer, usr_buffer_nob); + while (nob-- >= 0) /* strip trailing whitespace */ + if (!isspace(knl_buffer[nob])) + break; + + if (nob < 0) /* empty string */ + return -EINVAL; + + if (nob == knl_buffer_nob) /* no space to terminate */ + return -EOVERFLOW; + + knl_buffer[nob + 1] = 0; /* terminate */ + return 0; +} +EXPORT_SYMBOL(cfs_trace_copyin_string); + +int cfs_trace_copyout_string(char __user *usr_buffer, int usr_buffer_nob, + const char *knl_buffer, char *append) +{ + /* NB if 'append' != NULL, it's a single character to append to the + * copied out string - usually "\n", for /proc entries and "" (i.e. 
a + * terminating zero byte) for sysctl entries */ + int nob = strlen(knl_buffer); + + if (nob > usr_buffer_nob) + nob = usr_buffer_nob; + + if (copy_to_user(usr_buffer, knl_buffer, nob)) + return -EFAULT; + + if (append != NULL && nob < usr_buffer_nob) { + if (copy_to_user(usr_buffer + nob, append, 1)) + return -EFAULT; + + nob++; + } + + return nob; +} +EXPORT_SYMBOL(cfs_trace_copyout_string); + +int cfs_trace_allocate_string_buffer(char **str, int nob) +{ + if (nob > 2 * PAGE_CACHE_SIZE) /* string must be "sensible" */ + return -EINVAL; + + *str = kmalloc(nob, GFP_IOFS | __GFP_ZERO); + if (*str == NULL) + return -ENOMEM; + + return 0; +} + +void cfs_trace_free_string_buffer(char *str, int nob) +{ + kfree(str); +} + +int cfs_trace_dump_debug_buffer_usrstr(void __user *usr_str, int usr_str_nob) +{ + char *str; + int rc; + + rc = cfs_trace_allocate_string_buffer(&str, usr_str_nob + 1); + if (rc != 0) + return rc; + + rc = cfs_trace_copyin_string(str, usr_str_nob + 1, + usr_str, usr_str_nob); + if (rc != 0) + goto out; + + if (str[0] != '/') { + rc = -EINVAL; + goto out; + } + rc = cfs_tracefile_dump_all_pages(str); +out: + cfs_trace_free_string_buffer(str, usr_str_nob + 1); + return rc; +} + +int cfs_trace_daemon_command(char *str) +{ + int rc = 0; + + cfs_tracefile_write_lock(); + + if (strcmp(str, "stop") == 0) { + cfs_tracefile_write_unlock(); + cfs_trace_stop_thread(); + cfs_tracefile_write_lock(); + memset(cfs_tracefile, 0, sizeof(cfs_tracefile)); + + } else if (strncmp(str, "size=", 5) == 0) { + cfs_tracefile_size = simple_strtoul(str + 5, NULL, 0); + if (cfs_tracefile_size < 10 || cfs_tracefile_size > 20480) + cfs_tracefile_size = CFS_TRACEFILE_SIZE; + else + cfs_tracefile_size <<= 20; + + } else if (strlen(str) >= sizeof(cfs_tracefile)) { + rc = -ENAMETOOLONG; + } else if (str[0] != '/') { + rc = -EINVAL; + } else { + strcpy(cfs_tracefile, str); + + printk(KERN_INFO + "Lustre: debug daemon will attempt to start writing to %s (%lukB max)\n", + cfs_tracefile, + (long)(cfs_tracefile_size >> 10)); + + cfs_trace_start_thread(); + } + + cfs_tracefile_write_unlock(); + return rc; +} + +int cfs_trace_daemon_command_usrstr(void __user *usr_str, int usr_str_nob) +{ + char *str; + int rc; + + rc = cfs_trace_allocate_string_buffer(&str, usr_str_nob + 1); + if (rc != 0) + return rc; + + rc = cfs_trace_copyin_string(str, usr_str_nob + 1, + usr_str, usr_str_nob); + if (rc == 0) + rc = cfs_trace_daemon_command(str); + + cfs_trace_free_string_buffer(str, usr_str_nob + 1); + return rc; +} + +int cfs_trace_set_debug_mb(int mb) +{ + int i; + int j; + int pages; + int limit = cfs_trace_max_debug_mb(); + struct cfs_trace_cpu_data *tcd; + + if (mb < num_possible_cpus()) { + printk(KERN_WARNING + "Lustre: %d MB is too small for debug buffer size, setting it to %d MB.\n", + mb, num_possible_cpus()); + mb = num_possible_cpus(); + } + + if (mb > limit) { + printk(KERN_WARNING + "Lustre: %d MB is too large for debug buffer size, setting it to %d MB.\n", + mb, limit); + mb = limit; + } + + mb /= num_possible_cpus(); + pages = mb << (20 - PAGE_CACHE_SHIFT); + + cfs_tracefile_write_lock(); + + cfs_tcd_for_each(tcd, i, j) + tcd->tcd_max_pages = (pages * tcd->tcd_pages_factor) / 100; + + cfs_tracefile_write_unlock(); + + return 0; +} + +int cfs_trace_set_debug_mb_usrstr(void __user *usr_str, int usr_str_nob) +{ + char str[32]; + int rc; + + rc = cfs_trace_copyin_string(str, sizeof(str), usr_str, usr_str_nob); + if (rc < 0) + return rc; + + return cfs_trace_set_debug_mb(simple_strtoul(str, NULL, 0)); +} + +int 
cfs_trace_get_debug_mb(void) +{ + int i; + int j; + struct cfs_trace_cpu_data *tcd; + int total_pages = 0; + + cfs_tracefile_read_lock(); + + cfs_tcd_for_each(tcd, i, j) + total_pages += tcd->tcd_max_pages; + + cfs_tracefile_read_unlock(); + + return (total_pages >> (20 - PAGE_CACHE_SHIFT)) + 1; +} + +static int tracefiled(void *arg) +{ + struct page_collection pc; + struct tracefiled_ctl *tctl = arg; + struct cfs_trace_page *tage; + struct cfs_trace_page *tmp; + struct file *filp; + char *buf; + int last_loop = 0; + int rc; + + DECL_MMSPACE; + + /* we're started late enough that we pick up init's fs context */ + /* this is so broken in uml? what on earth is going on? */ + + spin_lock_init(&pc.pc_lock); + complete(&tctl->tctl_start); + + while (1) { + wait_queue_t __wait; + + pc.pc_want_daemon_pages = 0; + collect_pages(&pc); + if (list_empty(&pc.pc_pages)) + goto end_loop; + + filp = NULL; + cfs_tracefile_read_lock(); + if (cfs_tracefile[0] != 0) { + filp = filp_open(cfs_tracefile, + O_CREAT | O_RDWR | O_LARGEFILE, + 0600); + if (IS_ERR(filp)) { + rc = PTR_ERR(filp); + filp = NULL; + printk(KERN_WARNING "couldn't open %s: %d\n", + cfs_tracefile, rc); + } + } + cfs_tracefile_read_unlock(); + if (filp == NULL) { + put_pages_on_daemon_list(&pc); + __LASSERT(list_empty(&pc.pc_pages)); + goto end_loop; + } + + MMSPACE_OPEN; + + list_for_each_entry_safe(tage, tmp, &pc.pc_pages, + linkage) { + static loff_t f_pos; + + __LASSERT_TAGE_INVARIANT(tage); + + if (f_pos >= (off_t)cfs_tracefile_size) + f_pos = 0; + else if (f_pos > i_size_read(file_inode(filp))) + f_pos = i_size_read(file_inode(filp)); + + buf = kmap(tage->page); + rc = vfs_write(filp, (__force const char __user *)buf, + tage->used, &f_pos); + kunmap(tage->page); + + if (rc != (int)tage->used) { + printk(KERN_WARNING "wanted to write %u but wrote %d\n", + tage->used, rc); + put_pages_back(&pc); + __LASSERT(list_empty(&pc.pc_pages)); + break; + } + } + MMSPACE_CLOSE; + + filp_close(filp, NULL); + put_pages_on_daemon_list(&pc); + if (!list_empty(&pc.pc_pages)) { + int i; + + printk(KERN_ALERT "Lustre: trace pages aren't empty\n"); + pr_err("total cpus(%d): ", + num_possible_cpus()); + for (i = 0; i < num_possible_cpus(); i++) + if (cpu_online(i)) + pr_cont("%d(on) ", i); + else + pr_cont("%d(off) ", i); + pr_cont("\n"); + + i = 0; + list_for_each_entry_safe(tage, tmp, &pc.pc_pages, + linkage) + pr_err("page %d belongs to cpu %d\n", + ++i, tage->cpu); + pr_err("There are %d pages unwritten\n", i); + } + __LASSERT(list_empty(&pc.pc_pages)); +end_loop: + if (atomic_read(&tctl->tctl_shutdown)) { + if (last_loop == 0) { + last_loop = 1; + continue; + } else { + break; + } + } + init_waitqueue_entry(&__wait, current); + add_wait_queue(&tctl->tctl_waitq, &__wait); + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + remove_wait_queue(&tctl->tctl_waitq, &__wait); + } + complete(&tctl->tctl_stop); + return 0; +} + +int cfs_trace_start_thread(void) +{ + struct tracefiled_ctl *tctl = &trace_tctl; + int rc = 0; + + mutex_lock(&cfs_trace_thread_mutex); + if (thread_running) + goto out; + + init_completion(&tctl->tctl_start); + init_completion(&tctl->tctl_stop); + init_waitqueue_head(&tctl->tctl_waitq); + atomic_set(&tctl->tctl_shutdown, 0); + + if (IS_ERR(kthread_run(tracefiled, tctl, "ktracefiled"))) { + rc = -ECHILD; + goto out; + } + + wait_for_completion(&tctl->tctl_start); + thread_running = 1; +out: + mutex_unlock(&cfs_trace_thread_mutex); + return rc; +} + +void cfs_trace_stop_thread(void) +{ + struct 
tracefiled_ctl *tctl = &trace_tctl; + + mutex_lock(&cfs_trace_thread_mutex); + if (thread_running) { + printk(KERN_INFO + "Lustre: shutting down debug daemon thread...\n"); + atomic_set(&tctl->tctl_shutdown, 1); + wait_for_completion(&tctl->tctl_stop); + thread_running = 0; + } + mutex_unlock(&cfs_trace_thread_mutex); +} + +int cfs_tracefile_init(int max_pages) +{ + struct cfs_trace_cpu_data *tcd; + int i; + int j; + int rc; + int factor; + + rc = cfs_tracefile_init_arch(); + if (rc != 0) + return rc; + + cfs_tcd_for_each(tcd, i, j) { + /* tcd_pages_factor is initialized int tracefile_init_arch. */ + factor = tcd->tcd_pages_factor; + INIT_LIST_HEAD(&tcd->tcd_pages); + INIT_LIST_HEAD(&tcd->tcd_stock_pages); + INIT_LIST_HEAD(&tcd->tcd_daemon_pages); + tcd->tcd_cur_pages = 0; + tcd->tcd_cur_stock_pages = 0; + tcd->tcd_cur_daemon_pages = 0; + tcd->tcd_max_pages = (max_pages * factor) / 100; + LASSERT(tcd->tcd_max_pages > 0); + tcd->tcd_shutting_down = 0; + } + + return 0; +} + +static void trace_cleanup_on_all_cpus(void) +{ + struct cfs_trace_cpu_data *tcd; + struct cfs_trace_page *tage; + struct cfs_trace_page *tmp; + int i, cpu; + + for_each_possible_cpu(cpu) { + cfs_tcd_for_each_type_lock(tcd, i, cpu) { + tcd->tcd_shutting_down = 1; + + list_for_each_entry_safe(tage, tmp, &tcd->tcd_pages, + linkage) { + __LASSERT_TAGE_INVARIANT(tage); + + list_del(&tage->linkage); + cfs_tage_free(tage); + } + + tcd->tcd_cur_pages = 0; + } + } +} + +static void cfs_trace_cleanup(void) +{ + struct page_collection pc; + + INIT_LIST_HEAD(&pc.pc_pages); + spin_lock_init(&pc.pc_lock); + + trace_cleanup_on_all_cpus(); + + cfs_tracefile_fini_arch(); +} + +void cfs_tracefile_exit(void) +{ + cfs_trace_stop_thread(); + cfs_trace_cleanup(); +} diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/tracefile.h b/kernel/drivers/staging/lustre/lustre/libcfs/tracefile.h new file mode 100644 index 000000000..0601476e1 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/libcfs/tracefile.h @@ -0,0 +1,340 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
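The sizing helpers in tracefile.c above, cfs_trace_set_debug_mb() and cfs_trace_get_debug_mb(), turn a total megabyte budget into per-CPU page quotas: the budget is divided across possible CPUs, converted to pages with mb << (20 - PAGE_CACHE_SHIFT), and then split between buffer types by tcd_pages_factor (a percentage set in tracefile_init_arch(), outside this hunk). A small sketch of that arithmetic, assuming 4 KiB pages and an illustrative 80% factor:

#include <stdio.h>

#define PAGE_SHIFT_ASSUMED 12   /* 4 KiB pages assumed for the example */

/* pages one buffer type gets on one CPU for a given total budget */
static int pages_for(int total_mb, int ncpus, int pages_factor)
{
        int per_cpu_mb = total_mb / ncpus;                       /* mb /= num_possible_cpus() */
        int per_cpu_pages = per_cpu_mb << (20 - PAGE_SHIFT_ASSUMED);

        return per_cpu_pages * pages_factor / 100;               /* tcd_pages_factor percent */
}

int main(void)
{
        /* e.g. a 256 MB budget on 8 CPUs for a type that gets 80% of the share:
         * 256/8 = 32 MB -> 8192 pages -> 6553 pages */
        printf("max pages per CPU: %d\n", pages_for(256, 8, 80));
        return 0;
}

cfs_trace_get_debug_mb() runs the conversion the other way: it sums tcd_max_pages over all per-CPU buffers, shifts back to megabytes, and rounds up by one.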
+ */ + +#ifndef __LIBCFS_TRACEFILE_H__ +#define __LIBCFS_TRACEFILE_H__ + +#include "../../include/linux/libcfs/libcfs.h" + +#include "linux/linux-tracefile.h" + +/* trace file lock routines */ + +#define TRACEFILE_NAME_SIZE 1024 +extern char cfs_tracefile[TRACEFILE_NAME_SIZE]; +extern long long cfs_tracefile_size; + +extern void libcfs_run_debug_log_upcall(char *file); + +int cfs_tracefile_init_arch(void); +void cfs_tracefile_fini_arch(void); + +void cfs_tracefile_read_lock(void); +void cfs_tracefile_read_unlock(void); +void cfs_tracefile_write_lock(void); +void cfs_tracefile_write_unlock(void); + +int cfs_tracefile_dump_all_pages(char *filename); +void cfs_trace_debug_print(void); +void cfs_trace_flush_pages(void); +int cfs_trace_start_thread(void); +void cfs_trace_stop_thread(void); +int cfs_tracefile_init(int max_pages); +void cfs_tracefile_exit(void); + + + +int cfs_trace_copyin_string(char *knl_buffer, int knl_buffer_nob, + const char __user *usr_buffer, int usr_buffer_nob); +int cfs_trace_copyout_string(char __user *usr_buffer, int usr_buffer_nob, + const char *knl_str, char *append); +int cfs_trace_allocate_string_buffer(char **str, int nob); +void cfs_trace_free_string_buffer(char *str, int nob); +int cfs_trace_dump_debug_buffer_usrstr(void __user *usr_str, int usr_str_nob); +int cfs_trace_daemon_command(char *str); +int cfs_trace_daemon_command_usrstr(void __user *usr_str, int usr_str_nob); +int cfs_trace_set_debug_mb(int mb); +int cfs_trace_set_debug_mb_usrstr(void __user *usr_str, int usr_str_nob); +int cfs_trace_get_debug_mb(void); + +extern void libcfs_debug_dumplog_internal(void *arg); +extern void libcfs_register_panic_notifier(void); +extern void libcfs_unregister_panic_notifier(void); +extern int libcfs_panic_in_progress; +extern int cfs_trace_max_debug_mb(void); + +#define TCD_MAX_PAGES (5 << (20 - PAGE_CACHE_SHIFT)) +#define TCD_STOCK_PAGES (TCD_MAX_PAGES) +#define CFS_TRACEFILE_SIZE (500 << 20) + +#ifdef LUSTRE_TRACEFILE_PRIVATE + +/* + * Private declare for tracefile + */ +#define TCD_MAX_PAGES (5 << (20 - PAGE_CACHE_SHIFT)) +#define TCD_STOCK_PAGES (TCD_MAX_PAGES) + +#define CFS_TRACEFILE_SIZE (500 << 20) + +/* Size of a buffer for sprinting console messages if we can't get a page + * from system */ +#define CFS_TRACE_CONSOLE_BUFFER_SIZE 1024 + +union cfs_trace_data_union { + struct cfs_trace_cpu_data { + /* + * Even though this structure is meant to be per-CPU, locking + * is needed because in some places the data may be accessed + * from other CPUs. This lock is directly used in trace_get_tcd + * and trace_put_tcd, which are called in libcfs_debug_vmsg2 and + * tcd_for_each_type_lock + */ + spinlock_t tcd_lock; + unsigned long tcd_lock_flags; + + /* + * pages with trace records not yet processed by tracefiled. + */ + struct list_head tcd_pages; + /* number of pages on ->tcd_pages */ + unsigned long tcd_cur_pages; + + /* + * pages with trace records already processed by + * tracefiled. These pages are kept in memory, so that some + * portion of log can be written in the event of LBUG. This + * list is maintained in LRU order. + * + * Pages are moved to ->tcd_daemon_pages by tracefiled() + * (put_pages_on_daemon_list()). LRU pages from this list are + * discarded when list grows too large. + */ + struct list_head tcd_daemon_pages; + /* number of pages on ->tcd_daemon_pages */ + unsigned long tcd_cur_daemon_pages; + + /* + * Maximal number of pages allowed on ->tcd_pages and + * ->tcd_daemon_pages each. 
+ * Always TCD_MAX_PAGES * tcd_pages_factor / 100 in current + * implementation. + */ + unsigned long tcd_max_pages; + + /* + * preallocated pages to write trace records into. Pages from + * ->tcd_stock_pages are moved to ->tcd_pages by + * portals_debug_msg(). + * + * This list is necessary, because on some platforms it's + * impossible to perform efficient atomic page allocation in a + * non-blockable context. + * + * Such platforms fill ->tcd_stock_pages "on occasion", when + * tracing code is entered in blockable context. + * + * trace_get_tage_try() tries to get a page from + * ->tcd_stock_pages first and resorts to atomic page + * allocation only if this queue is empty. ->tcd_stock_pages + * is replenished when tracing code is entered in blocking + * context (darwin-tracefile.c:trace_get_tcd()). We try to + * maintain TCD_STOCK_PAGES (40 by default) pages in this + * queue. Atomic allocation is only required if more than + * TCD_STOCK_PAGES pagesful are consumed by trace records all + * emitted in non-blocking contexts. Which is quite unlikely. + */ + struct list_head tcd_stock_pages; + /* number of pages on ->tcd_stock_pages */ + unsigned long tcd_cur_stock_pages; + + unsigned short tcd_shutting_down; + unsigned short tcd_cpu; + unsigned short tcd_type; + /* The factors to share debug memory. */ + unsigned short tcd_pages_factor; + } tcd; + char __pad[L1_CACHE_ALIGN(sizeof(struct cfs_trace_cpu_data))]; +}; + +#define TCD_MAX_TYPES 8 +extern union cfs_trace_data_union (*cfs_trace_data[TCD_MAX_TYPES])[NR_CPUS]; + +#define cfs_tcd_for_each(tcd, i, j) \ + for (i = 0; cfs_trace_data[i] != NULL; i++) \ + for (j = 0, ((tcd) = &(*cfs_trace_data[i])[j].tcd); \ + j < num_possible_cpus(); \ + j++, (tcd) = &(*cfs_trace_data[i])[j].tcd) + +#define cfs_tcd_for_each_type_lock(tcd, i, cpu) \ + for (i = 0; cfs_trace_data[i] && \ + (tcd = &(*cfs_trace_data[i])[cpu].tcd) && \ + cfs_trace_lock_tcd(tcd, 1); cfs_trace_unlock_tcd(tcd, 1), i++) + +/* XXX nikita: this declaration is internal to tracefile.c and should probably + * be moved there */ +struct page_collection { + struct list_head pc_pages; + /* + * spin-lock protecting ->pc_pages. It is taken by smp_call_function() + * call-back functions. XXX nikita: Which is horrible: all processors + * receive NMI at the same time only to be serialized by this + * lock. Probably ->pc_pages should be replaced with an array of + * NR_CPUS elements accessed locklessly. + */ + spinlock_t pc_lock; + /* + * if this flag is set, collect_pages() will spill both + * ->tcd_daemon_pages and ->tcd_pages to the ->pc_pages. Otherwise, + * only ->tcd_pages are spilled. + */ + int pc_want_daemon_pages; +}; + +/* XXX nikita: this declaration is internal to tracefile.c and should probably + * be moved there */ +struct tracefiled_ctl { + struct completion tctl_start; + struct completion tctl_stop; + wait_queue_head_t tctl_waitq; + pid_t tctl_pid; + atomic_t tctl_shutdown; +}; + +/* + * small data-structure for each page owned by tracefiled. 
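The union cfs_trace_data_union above pads each per-CPU cfs_trace_cpu_data slot out to a cache-line multiple (char __pad[L1_CACHE_ALIGN(...)]) so that counters updated on one CPU never share a cache line with a neighbouring CPU's slot. A user-space sketch of the same padding idiom, with a hard-coded 64-byte line size standing in for L1_CACHE_ALIGN():

#include <stdio.h>

#define CACHELINE 64
#define CACHE_ALIGN(x) (((x) + CACHELINE - 1) & ~(CACHELINE - 1))

struct per_cpu_stats {
        unsigned long hits;
        unsigned long misses;
};

/* Each slot occupies a whole number of cache lines, like cfs_trace_data_union. */
union padded_stats {
        struct per_cpu_stats s;
        char pad[CACHE_ALIGN(sizeof(struct per_cpu_stats))];
};

static union padded_stats stats[8];     /* one slot per CPU, NR_CPUS-style */

int main(void)
{
        printf("raw size %zu, padded size %zu\n",
               sizeof(struct per_cpu_stats), sizeof(union padded_stats));
        stats[0].s.hits++;      /* updates on CPU 0 never touch CPU 1's line */
        return 0;
}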
+ */ +/* XXX nikita: this declaration is internal to tracefile.c and should probably + * be moved there */ +struct cfs_trace_page { + /* + * page itself + */ + struct page *page; + /* + * linkage into one of the lists in trace_data_union or + * page_collection + */ + struct list_head linkage; + /* + * number of bytes used within this page + */ + unsigned int used; + /* + * cpu that owns this page + */ + unsigned short cpu; + /* + * type(context) of this page + */ + unsigned short type; +}; + +extern void cfs_set_ptldebug_header(struct ptldebug_header *header, + struct libcfs_debug_msg_data *m, + unsigned long stack); +extern void cfs_print_to_console(struct ptldebug_header *hdr, int mask, + const char *buf, int len, const char *file, + const char *fn); + +extern int cfs_trace_lock_tcd(struct cfs_trace_cpu_data *tcd, int walking); +extern void cfs_trace_unlock_tcd(struct cfs_trace_cpu_data *tcd, int walking); + +/** + * trace_buf_type_t, trace_buf_idx_get() and trace_console_buffers[][] + * are not public libcfs API; they should be defined in + * platform-specific tracefile include files + * (see, for example, linux-tracefile.h). + */ + +extern char *cfs_trace_console_buffers[NR_CPUS][CFS_TCD_TYPE_MAX]; +extern cfs_trace_buf_type_t cfs_trace_buf_idx_get(void); + +static inline char * +cfs_trace_get_console_buffer(void) +{ + unsigned int i = get_cpu(); + unsigned int j = cfs_trace_buf_idx_get(); + + return cfs_trace_console_buffers[i][j]; +} + +static inline void +cfs_trace_put_console_buffer(char *buffer) +{ + put_cpu(); +} + +static inline struct cfs_trace_cpu_data * +cfs_trace_get_tcd(void) +{ + struct cfs_trace_cpu_data *tcd = + &(*cfs_trace_data[cfs_trace_buf_idx_get()])[get_cpu()].tcd; + + cfs_trace_lock_tcd(tcd, 0); + + return tcd; +} + +static inline void +cfs_trace_put_tcd (struct cfs_trace_cpu_data *tcd) +{ + cfs_trace_unlock_tcd(tcd, 0); + + put_cpu(); +} + +int cfs_trace_refill_stock(struct cfs_trace_cpu_data *tcd, gfp_t gfp, + struct list_head *stock); + + +int cfs_tcd_owns_tage(struct cfs_trace_cpu_data *tcd, + struct cfs_trace_page *tage); + +extern void cfs_trace_assertion_failed(const char *str, + struct libcfs_debug_msg_data *m); + +/* ASSERTION that is safe to use within the debug system */ +#define __LASSERT(cond) \ +do { \ + if (unlikely(!(cond))) { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_EMERG, NULL); \ + cfs_trace_assertion_failed("ASSERTION("#cond") failed", \ + &msgdata); \ + } \ +} while (0) + +#define __LASSERT_TAGE_INVARIANT(tage) \ +do { \ + __LASSERT(tage != NULL); \ + __LASSERT(tage->page != NULL); \ + __LASSERT(tage->used <= PAGE_CACHE_SIZE); \ + __LASSERT(page_count(tage->page) > 0); \ +} while (0) + +#endif /* LUSTRE_TRACEFILE_PRIVATE */ + +#endif /* __LIBCFS_TRACEFILE_H__ */ diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/workitem.c b/kernel/drivers/staging/lustre/lustre/libcfs/workitem.c new file mode 100644 index 000000000..48009b775 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/libcfs/workitem.c @@ -0,0 +1,479 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
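The __LASSERT() and __LASSERT_TAGE_INVARIANT() macros at the end of tracefile.h above wrap their bodies in do { ... } while (0) so that a multi-statement macro behaves as a single statement, for example inside an unbraced if/else. A minimal sketch of the same idiom with a plain abort-based assertion (names are illustrative):

#include <stdio.h>
#include <stdlib.h>

/*
 * do { ... } while (0) lets a multi-statement macro be used anywhere a
 * single statement (followed by a semicolon) is expected.
 */
#define MY_ASSERT(cond)                                         \
do {                                                            \
        if (!(cond)) {                                          \
                fprintf(stderr, "ASSERTION(%s) failed\n", #cond); \
                abort();                                        \
        }                                                       \
} while (0)

int main(void)
{
        int x = 1;

        if (x)
                MY_ASSERT(x > 0);       /* safe even without braces around the if body */
        else
                printf("never reached\n");

        return 0;
}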
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/libcfs/workitem.c + * + * Author: Isaac Huang + * Liang Zhen + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include "../../include/linux/libcfs/libcfs.h" + +#define CFS_WS_NAME_LEN 16 + +typedef struct cfs_wi_sched { + struct list_head ws_list; /* chain on global list */ + /** serialised workitems */ + spinlock_t ws_lock; + /** where schedulers sleep */ + wait_queue_head_t ws_waitq; + /** concurrent workitems */ + struct list_head ws_runq; + /** rescheduled running-workitems, a workitem can be rescheduled + * while running in wi_action(), but we don't to execute it again + * unless it returns from wi_action(), so we put it on ws_rerunq + * while rescheduling, and move it to runq after it returns + * from wi_action() */ + struct list_head ws_rerunq; + /** CPT-table for this scheduler */ + struct cfs_cpt_table *ws_cptab; + /** CPT id for affinity */ + int ws_cpt; + /** number of scheduled workitems */ + int ws_nscheduled; + /** started scheduler thread, protected by cfs_wi_data::wi_glock */ + unsigned int ws_nthreads:30; + /** shutting down, protected by cfs_wi_data::wi_glock */ + unsigned int ws_stopping:1; + /** serialize starting thread, protected by cfs_wi_data::wi_glock */ + unsigned int ws_starting:1; + /** scheduler name */ + char ws_name[CFS_WS_NAME_LEN]; +} cfs_wi_sched_t; + +static struct cfs_workitem_data { + /** serialize */ + spinlock_t wi_glock; + /** list of all schedulers */ + struct list_head wi_scheds; + /** WI module is initialized */ + int wi_init; + /** shutting down the whole WI module */ + int wi_stopping; +} cfs_wi_data; + +static inline void +cfs_wi_sched_lock(cfs_wi_sched_t *sched) +{ + spin_lock(&sched->ws_lock); +} + +static inline void +cfs_wi_sched_unlock(cfs_wi_sched_t *sched) +{ + spin_unlock(&sched->ws_lock); +} + +static inline int +cfs_wi_sched_cansleep(cfs_wi_sched_t *sched) +{ + cfs_wi_sched_lock(sched); + if (sched->ws_stopping) { + cfs_wi_sched_unlock(sched); + return 0; + } + + if (!list_empty(&sched->ws_runq)) { + cfs_wi_sched_unlock(sched); + return 0; + } + cfs_wi_sched_unlock(sched); + return 1; +} + + +/* XXX: + * 0. it only works when called from wi->wi_action. + * 1. when it returns no one shall try to schedule the workitem. 
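cfs_wi_schedule() and the scheduler loop below coordinate two flags per work item: wi_scheduled (queued and waiting) and wi_running (its action is executing right now). A work item scheduled while it is running goes onto the rerun queue and is only moved back to the run queue after its action returns, so the action is never re-entered concurrently. A compact user-space sketch of just that flag logic, with the locking and thread pool left out (the names are simplified stand-ins, not the kernel API):

#include <stdio.h>

struct workitem {
        int scheduled;  /* queued, like wi_scheduled */
        int running;    /* action in progress, like wi_running */
        int on_rerunq;  /* stands in for "linked on ws_rerunq" */
};

/* cfs_wi_schedule() logic: queue once; defer to the rerunq if currently running */
static void wi_schedule(struct workitem *wi)
{
        if (wi->scheduled)
                return;                 /* already queued, nothing to do */
        wi->scheduled = 1;
        if (wi->running)
                wi->on_rerunq = 1;      /* run again after the current action returns */
        else
                printf("queued on runq\n");
}

/* scheduler loop body: run the action, then promote a pending rerun request */
static void wi_run(struct workitem *wi, void (*action)(struct workitem *))
{
        wi->running = 1;
        wi->scheduled = 0;
        action(wi);                     /* may call wi_schedule() on itself */
        wi->running = 0;
        if (wi->on_rerunq) {            /* rescheduled while running: back to runq */
                wi->on_rerunq = 0;
                printf("moved from rerunq to runq\n");
        }
}

static void self_rescheduling_action(struct workitem *wi)
{
        wi_schedule(wi);                /* lands on the rerunq, not the runq */
}

int main(void)
{
        struct workitem wi = { 0, 0, 0 };

        wi_schedule(&wi);                       /* "queued on runq" */
        wi_run(&wi, self_rescheduling_action);  /* "moved from rerunq to runq" */
        return 0;
}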
+ */ +void +cfs_wi_exit(struct cfs_wi_sched *sched, cfs_workitem_t *wi) +{ + LASSERT(!in_interrupt()); /* because we use plain spinlock */ + LASSERT(!sched->ws_stopping); + + cfs_wi_sched_lock(sched); + + LASSERT(wi->wi_running); + if (wi->wi_scheduled) { /* cancel pending schedules */ + LASSERT(!list_empty(&wi->wi_list)); + list_del_init(&wi->wi_list); + + LASSERT(sched->ws_nscheduled > 0); + sched->ws_nscheduled--; + } + + LASSERT(list_empty(&wi->wi_list)); + + wi->wi_scheduled = 1; /* LBUG future schedule attempts */ + cfs_wi_sched_unlock(sched); + + return; +} +EXPORT_SYMBOL(cfs_wi_exit); + +/** + * cancel schedule request of workitem \a wi + */ +int +cfs_wi_deschedule(struct cfs_wi_sched *sched, cfs_workitem_t *wi) +{ + int rc; + + LASSERT(!in_interrupt()); /* because we use plain spinlock */ + LASSERT(!sched->ws_stopping); + + /* + * return 0 if it's running already, otherwise return 1, which + * means the workitem will not be scheduled and will not have + * any race with wi_action. + */ + cfs_wi_sched_lock(sched); + + rc = !(wi->wi_running); + + if (wi->wi_scheduled) { /* cancel pending schedules */ + LASSERT(!list_empty(&wi->wi_list)); + list_del_init(&wi->wi_list); + + LASSERT(sched->ws_nscheduled > 0); + sched->ws_nscheduled--; + + wi->wi_scheduled = 0; + } + + LASSERT (list_empty(&wi->wi_list)); + + cfs_wi_sched_unlock(sched); + return rc; +} +EXPORT_SYMBOL(cfs_wi_deschedule); + +/* + * Workitem scheduled with (serial == 1) is strictly serialised not only with + * itself, but also with others scheduled this way. + * + * Now there's only one static serialised queue, but in the future more might + * be added, and even dynamic creation of serialised queues might be supported. + */ +void +cfs_wi_schedule(struct cfs_wi_sched *sched, cfs_workitem_t *wi) +{ + LASSERT(!in_interrupt()); /* because we use plain spinlock */ + LASSERT(!sched->ws_stopping); + + cfs_wi_sched_lock(sched); + + if (!wi->wi_scheduled) { + LASSERT (list_empty(&wi->wi_list)); + + wi->wi_scheduled = 1; + sched->ws_nscheduled++; + if (!wi->wi_running) { + list_add_tail(&wi->wi_list, &sched->ws_runq); + wake_up(&sched->ws_waitq); + } else { + list_add(&wi->wi_list, &sched->ws_rerunq); + } + } + + LASSERT (!list_empty(&wi->wi_list)); + cfs_wi_sched_unlock(sched); + return; +} +EXPORT_SYMBOL(cfs_wi_schedule); + + +static int +cfs_wi_scheduler (void *arg) +{ + struct cfs_wi_sched *sched = (cfs_wi_sched_t *)arg; + + cfs_block_allsigs(); + + /* CPT affinity scheduler? */ + if (sched->ws_cptab != NULL) + cfs_cpt_bind(sched->ws_cptab, sched->ws_cpt); + + spin_lock(&cfs_wi_data.wi_glock); + + LASSERT(sched->ws_starting == 1); + sched->ws_starting--; + sched->ws_nthreads++; + + spin_unlock(&cfs_wi_data.wi_glock); + + cfs_wi_sched_lock(sched); + + while (!sched->ws_stopping) { + int nloops = 0; + int rc; + cfs_workitem_t *wi; + + while (!list_empty(&sched->ws_runq) && + nloops < CFS_WI_RESCHED) { + wi = list_entry(sched->ws_runq.next, + cfs_workitem_t, wi_list); + LASSERT(wi->wi_scheduled && !wi->wi_running); + + list_del_init(&wi->wi_list); + + LASSERT(sched->ws_nscheduled > 0); + sched->ws_nscheduled--; + + wi->wi_running = 1; + wi->wi_scheduled = 0; + + + cfs_wi_sched_unlock(sched); + nloops++; + + rc = (*wi->wi_action) (wi); + + cfs_wi_sched_lock(sched); + if (rc != 0) /* WI should be dead, even be freed! 
*/ + continue; + + wi->wi_running = 0; + if (list_empty(&wi->wi_list)) + continue; + + LASSERT(wi->wi_scheduled); + /* wi is rescheduled, should be on rerunq now, we + * move it to runq so it can run action now */ + list_move_tail(&wi->wi_list, &sched->ws_runq); + } + + if (!list_empty(&sched->ws_runq)) { + cfs_wi_sched_unlock(sched); + /* don't sleep because some workitems still + * expect me to come back soon */ + cond_resched(); + cfs_wi_sched_lock(sched); + continue; + } + + cfs_wi_sched_unlock(sched); + rc = wait_event_interruptible_exclusive(sched->ws_waitq, + !cfs_wi_sched_cansleep(sched)); + cfs_wi_sched_lock(sched); + } + + cfs_wi_sched_unlock(sched); + + spin_lock(&cfs_wi_data.wi_glock); + sched->ws_nthreads--; + spin_unlock(&cfs_wi_data.wi_glock); + + return 0; +} + + +void +cfs_wi_sched_destroy(struct cfs_wi_sched *sched) +{ + int i; + + LASSERT(cfs_wi_data.wi_init); + LASSERT(!cfs_wi_data.wi_stopping); + + spin_lock(&cfs_wi_data.wi_glock); + if (sched->ws_stopping) { + CDEBUG(D_INFO, "%s is in progress of stopping\n", + sched->ws_name); + spin_unlock(&cfs_wi_data.wi_glock); + return; + } + + LASSERT(!list_empty(&sched->ws_list)); + sched->ws_stopping = 1; + + spin_unlock(&cfs_wi_data.wi_glock); + + i = 2; + wake_up_all(&sched->ws_waitq); + + spin_lock(&cfs_wi_data.wi_glock); + while (sched->ws_nthreads > 0) { + CDEBUG(IS_PO2(++i) ? D_WARNING : D_NET, + "waiting for %d threads of WI sched[%s] to terminate\n", + sched->ws_nthreads, sched->ws_name); + + spin_unlock(&cfs_wi_data.wi_glock); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1) / 20); + spin_lock(&cfs_wi_data.wi_glock); + } + + list_del(&sched->ws_list); + + spin_unlock(&cfs_wi_data.wi_glock); + LASSERT(sched->ws_nscheduled == 0); + + LIBCFS_FREE(sched, sizeof(*sched)); +} +EXPORT_SYMBOL(cfs_wi_sched_destroy); + +int +cfs_wi_sched_create(char *name, struct cfs_cpt_table *cptab, + int cpt, int nthrs, struct cfs_wi_sched **sched_pp) +{ + struct cfs_wi_sched *sched; + int rc; + + LASSERT(cfs_wi_data.wi_init); + LASSERT(!cfs_wi_data.wi_stopping); + LASSERT(cptab == NULL || cpt == CFS_CPT_ANY || + (cpt >= 0 && cpt < cfs_cpt_number(cptab))); + + LIBCFS_ALLOC(sched, sizeof(*sched)); + if (sched == NULL) + return -ENOMEM; + + strncpy(sched->ws_name, name, CFS_WS_NAME_LEN); + sched->ws_name[CFS_WS_NAME_LEN - 1] = '\0'; + sched->ws_cptab = cptab; + sched->ws_cpt = cpt; + + spin_lock_init(&sched->ws_lock); + init_waitqueue_head(&sched->ws_waitq); + INIT_LIST_HEAD(&sched->ws_runq); + INIT_LIST_HEAD(&sched->ws_rerunq); + INIT_LIST_HEAD(&sched->ws_list); + + rc = 0; + while (nthrs > 0) { + char name[16]; + struct task_struct *task; + + spin_lock(&cfs_wi_data.wi_glock); + while (sched->ws_starting > 0) { + spin_unlock(&cfs_wi_data.wi_glock); + schedule(); + spin_lock(&cfs_wi_data.wi_glock); + } + + sched->ws_starting++; + spin_unlock(&cfs_wi_data.wi_glock); + + if (sched->ws_cptab != NULL && sched->ws_cpt >= 0) { + snprintf(name, sizeof(name), "%s_%02d_%02u", + sched->ws_name, sched->ws_cpt, + sched->ws_nthreads); + } else { + snprintf(name, sizeof(name), "%s_%02u", + sched->ws_name, sched->ws_nthreads); + } + + task = kthread_run(cfs_wi_scheduler, sched, "%s", name); + if (!IS_ERR(task)) { + nthrs--; + continue; + } + rc = PTR_ERR(task); + + CERROR("Failed to create thread for WI scheduler %s: %d\n", + name, rc); + + spin_lock(&cfs_wi_data.wi_glock); + + /* make up for cfs_wi_sched_destroy */ + list_add(&sched->ws_list, &cfs_wi_data.wi_scheds); + sched->ws_starting--; + + 
spin_unlock(&cfs_wi_data.wi_glock); + + cfs_wi_sched_destroy(sched); + return rc; + } + spin_lock(&cfs_wi_data.wi_glock); + list_add(&sched->ws_list, &cfs_wi_data.wi_scheds); + spin_unlock(&cfs_wi_data.wi_glock); + + *sched_pp = sched; + return 0; +} +EXPORT_SYMBOL(cfs_wi_sched_create); + +int +cfs_wi_startup(void) +{ + memset(&cfs_wi_data, 0, sizeof(cfs_wi_data)); + + spin_lock_init(&cfs_wi_data.wi_glock); + INIT_LIST_HEAD(&cfs_wi_data.wi_scheds); + cfs_wi_data.wi_init = 1; + + return 0; +} + +void +cfs_wi_shutdown(void) +{ + struct cfs_wi_sched *sched; + + spin_lock(&cfs_wi_data.wi_glock); + cfs_wi_data.wi_stopping = 1; + spin_unlock(&cfs_wi_data.wi_glock); + + /* nobody should contend on this list */ + list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) { + sched->ws_stopping = 1; + wake_up_all(&sched->ws_waitq); + } + + list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) { + spin_lock(&cfs_wi_data.wi_glock); + + while (sched->ws_nthreads != 0) { + spin_unlock(&cfs_wi_data.wi_glock); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1) / 20); + spin_lock(&cfs_wi_data.wi_glock); + } + spin_unlock(&cfs_wi_data.wi_glock); + } + while (!list_empty(&cfs_wi_data.wi_scheds)) { + sched = list_entry(cfs_wi_data.wi_scheds.next, + struct cfs_wi_sched, ws_list); + list_del(&sched->ws_list); + LIBCFS_FREE(sched, sizeof(*sched)); + } + + cfs_wi_data.wi_stopping = 0; + cfs_wi_data.wi_init = 0; +} diff --git a/kernel/drivers/staging/lustre/lustre/llite/Makefile b/kernel/drivers/staging/lustre/lustre/llite/Makefile new file mode 100644 index 000000000..7d70115d5 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/Makefile @@ -0,0 +1,11 @@ +obj-$(CONFIG_LUSTRE_FS) += lustre.o +obj-$(CONFIG_LUSTRE_LLITE_LLOOP) += llite_lloop.o +lustre-y := dcache.o dir.o file.o llite_close.o llite_lib.o llite_nfs.o \ + rw.o namei.o symlink.o llite_mmap.o \ + xattr.o xattr_cache.o remote_perm.o llite_rmtacl.o llite_capa.o \ + rw26.o super25.o statahead.o \ + ../lclient/glimpse.o ../lclient/lcommon_cl.o ../lclient/lcommon_misc.o \ + vvp_dev.o vvp_page.o vvp_lock.o vvp_io.o vvp_object.o + +lustre-$(CONFIG_PROC_FS) += lproc_llite.o +llite_lloop-y := lloop.o diff --git a/kernel/drivers/staging/lustre/lustre/llite/dcache.c b/kernel/drivers/staging/lustre/lustre/llite/dcache.c new file mode 100644 index 000000000..5af013513 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/dcache.c @@ -0,0 +1,363 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. 
+ * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_LLITE + +#include "../include/obd_support.h" +#include "../include/lustre_lite.h" +#include "../include/lustre/lustre_idl.h" +#include "../include/lustre_dlm.h" + +#include "llite_internal.h" + +static void free_dentry_data(struct rcu_head *head) +{ + struct ll_dentry_data *lld; + + lld = container_of(head, struct ll_dentry_data, lld_rcu_head); + OBD_FREE_PTR(lld); +} + +/* should NOT be called with the dcache lock, see fs/dcache.c */ +static void ll_release(struct dentry *de) +{ + struct ll_dentry_data *lld; + + LASSERT(de != NULL); + lld = ll_d2d(de); + if (lld == NULL) /* NFS copies the de->d_op methods (bug 4655) */ + return; + + if (lld->lld_it) { + ll_intent_release(lld->lld_it); + OBD_FREE(lld->lld_it, sizeof(*lld->lld_it)); + } + + de->d_fsdata = NULL; + call_rcu(&lld->lld_rcu_head, free_dentry_data); +} + +/* Compare if two dentries are the same. Don't match if the existing dentry + * is marked invalid. Returns 1 if different, 0 if the same. + * + * This avoids a race where ll_lookup_it() instantiates a dentry, but we get + * an AST before calling d_revalidate_it(). The dentry still exists (marked + * INVALID) so d_lookup() matches it, but we have no lock on it (so + * lock_match() fails) and we spin around real_lookup(). */ +static int ll_dcompare(const struct dentry *parent, const struct dentry *dentry, + unsigned int len, const char *str, + const struct qstr *name) +{ + if (len != name->len) + return 1; + + if (memcmp(str, name->name, len)) + return 1; + + CDEBUG(D_DENTRY, "found name %.*s(%p) flags %#x refc %d\n", + name->len, name->name, dentry, dentry->d_flags, + d_count(dentry)); + + /* mountpoint is always valid */ + if (d_mountpoint((struct dentry *)dentry)) + return 0; + + if (d_lustre_invalid(dentry)) + return 1; + + return 0; +} + +static inline int return_if_equal(struct ldlm_lock *lock, void *data) +{ + if ((lock->l_flags & + (LDLM_FL_CANCELING | LDLM_FL_DISCARD_DATA)) == + (LDLM_FL_CANCELING | LDLM_FL_DISCARD_DATA)) + return LDLM_ITER_CONTINUE; + return LDLM_ITER_STOP; +} + +/* find any ldlm lock of the inode in mdc and lov + * return 0 not find + * 1 find one + * < 0 error */ +static int find_cbdata(struct inode *inode) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct lov_stripe_md *lsm; + int rc = 0; + + LASSERT(inode); + rc = md_find_cbdata(sbi->ll_md_exp, ll_inode2fid(inode), + return_if_equal, NULL); + if (rc != 0) + return rc; + + lsm = ccc_inode_lsm_get(inode); + if (lsm == NULL) + return rc; + + rc = obd_find_cbdata(sbi->ll_dt_exp, lsm, return_if_equal, NULL); + ccc_inode_lsm_put(inode, lsm); + + return rc; +} + +/** + * Called when last reference to a dentry is dropped and dcache wants to know + * whether or not it should cache it: + * - return 1 to delete the dentry immediately + * - return 0 to cache the dentry + * Should NOT be called with the dcache lock, see fs/dcache.c + */ +static int ll_ddelete(const struct dentry *de) +{ + LASSERT(de); + + CDEBUG(D_DENTRY, "%s dentry %pd (%p, parent %p, inode %p) %s%s\n", + d_lustre_invalid((struct dentry *)de) ? "deleting" : "keeping", + de, de, de->d_parent, d_inode(de), + d_unhashed(de) ? 
"" : "hashed,", + list_empty(&de->d_subdirs) ? "" : "subdirs"); + + /* kernel >= 2.6.38 last refcount is decreased after this function. */ + LASSERT(d_count(de) == 1); + + /* Disable this piece of code temporarily because this is called + * inside dcache_lock so it's not appropriate to do lots of work + * here. ATTENTION: Before this piece of code enabling, LU-2487 must be + * resolved. */ +#if 0 + /* if not ldlm lock for this inode, set i_nlink to 0 so that + * this inode can be recycled later b=20433 */ + if (d_really_is_positive(de) && !find_cbdata(d_inode(de))) + clear_nlink(d_inode(de)); +#endif + + if (d_lustre_invalid((struct dentry *)de)) + return 1; + return 0; +} + +int ll_d_init(struct dentry *de) +{ + LASSERT(de != NULL); + + CDEBUG(D_DENTRY, "ldd on dentry %pd (%p) parent %p inode %p refc %d\n", + de, de, de->d_parent, d_inode(de), + d_count(de)); + + if (de->d_fsdata == NULL) { + struct ll_dentry_data *lld; + + lld = kzalloc(sizeof(*lld), GFP_NOFS); + if (likely(lld)) { + spin_lock(&de->d_lock); + if (likely(de->d_fsdata == NULL)) { + de->d_fsdata = lld; + __d_lustre_invalidate(de); + } else { + OBD_FREE_PTR(lld); + } + spin_unlock(&de->d_lock); + } else { + return -ENOMEM; + } + } + LASSERT(de->d_op == &ll_d_ops); + + return 0; +} + +void ll_intent_drop_lock(struct lookup_intent *it) +{ + if (it->it_op && it->d.lustre.it_lock_mode) { + struct lustre_handle handle; + + handle.cookie = it->d.lustre.it_lock_handle; + + CDEBUG(D_DLMTRACE, "releasing lock with cookie %#llx from it %p\n", + handle.cookie, it); + ldlm_lock_decref(&handle, it->d.lustre.it_lock_mode); + + /* bug 494: intent_release may be called multiple times, from + * this thread and we don't want to double-decref this lock */ + it->d.lustre.it_lock_mode = 0; + if (it->d.lustre.it_remote_lock_mode != 0) { + handle.cookie = it->d.lustre.it_remote_lock_handle; + + CDEBUG(D_DLMTRACE, "releasing remote lock with cookie%#llx from it %p\n", + handle.cookie, it); + ldlm_lock_decref(&handle, + it->d.lustre.it_remote_lock_mode); + it->d.lustre.it_remote_lock_mode = 0; + } + } +} + +void ll_intent_release(struct lookup_intent *it) +{ + CDEBUG(D_INFO, "intent %p released\n", it); + ll_intent_drop_lock(it); + /* We are still holding extra reference on a request, need to free it */ + if (it_disposition(it, DISP_ENQ_OPEN_REF)) + ptlrpc_req_finished(it->d.lustre.it_data); /* ll_file_open */ + + if (it_disposition(it, DISP_ENQ_CREATE_REF)) /* create rec */ + ptlrpc_req_finished(it->d.lustre.it_data); + + it->d.lustre.it_disposition = 0; + it->d.lustre.it_data = NULL; +} + +void ll_invalidate_aliases(struct inode *inode) +{ + struct dentry *dentry; + struct ll_d_hlist_node *p; + + LASSERT(inode != NULL); + + CDEBUG(D_INODE, "marking dentries for ino %lu/%u(%p) invalid\n", + inode->i_ino, inode->i_generation, inode); + + ll_lock_dcache(inode); + ll_d_hlist_for_each_entry(dentry, p, &inode->i_dentry, d_u.d_alias) { + CDEBUG(D_DENTRY, "dentry in drop %pd (%p) parent %p inode %p flags %d\n", + dentry, dentry, dentry->d_parent, + d_inode(dentry), dentry->d_flags); + + d_lustre_invalidate(dentry, 0); + } + ll_unlock_dcache(inode); +} + +int ll_revalidate_it_finish(struct ptlrpc_request *request, + struct lookup_intent *it, + struct inode *inode) +{ + int rc = 0; + + if (!request) + return 0; + + if (it_disposition(it, DISP_LOOKUP_NEG)) + return -ENOENT; + + rc = ll_prep_inode(&inode, request, NULL, it); + + return rc; +} + +void ll_lookup_finish_locks(struct lookup_intent *it, struct inode *inode) +{ + LASSERT(it != NULL); + + if 
(it->d.lustre.it_lock_mode && inode != NULL) { + struct ll_sb_info *sbi = ll_i2sbi(inode); + + CDEBUG(D_DLMTRACE, "setting l_data to inode %p (%lu/%u)\n", + inode, inode->i_ino, inode->i_generation); + ll_set_lock_data(sbi->ll_md_exp, inode, it, NULL); + } + + /* drop lookup or getattr locks immediately */ + if (it->it_op == IT_LOOKUP || it->it_op == IT_GETATTR) { + /* on 2.6 there are situation when several lookups and + * revalidations may be requested during single operation. + * therefore, we don't release intent here -bzzz */ + ll_intent_drop_lock(it); + } +} + +static int ll_revalidate_dentry(struct dentry *dentry, + unsigned int lookup_flags) +{ + struct inode *dir = d_inode(dentry->d_parent); + + /* + * if open&create is set, talk to MDS to make sure file is created if + * necessary, because we can't do this in ->open() later since that's + * called on an inode. return 0 here to let lookup to handle this. + */ + if ((lookup_flags & (LOOKUP_OPEN | LOOKUP_CREATE)) == + (LOOKUP_OPEN | LOOKUP_CREATE)) + return 0; + + if (lookup_flags & (LOOKUP_PARENT | LOOKUP_OPEN | LOOKUP_CREATE)) + return 1; + + if (d_need_statahead(dir, dentry) <= 0) + return 1; + + if (lookup_flags & LOOKUP_RCU) + return -ECHILD; + + do_statahead_enter(dir, &dentry, d_inode(dentry) == NULL); + ll_statahead_mark(dir, dentry); + return 1; +} + +/* + * Always trust cached dentries. Update statahead window if necessary. + */ +static int ll_revalidate_nd(struct dentry *dentry, unsigned int flags) +{ + CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, flags=%u\n", + dentry, flags); + + return ll_revalidate_dentry(dentry, flags); +} + + +static void ll_d_iput(struct dentry *de, struct inode *inode) +{ + LASSERT(inode); + if (!find_cbdata(inode)) + clear_nlink(inode); + iput(inode); +} + +const struct dentry_operations ll_d_ops = { + .d_revalidate = ll_revalidate_nd, + .d_release = ll_release, + .d_delete = ll_ddelete, + .d_iput = ll_d_iput, + .d_compare = ll_dcompare, +}; diff --git a/kernel/drivers/staging/lustre/lustre/llite/dir.c b/kernel/drivers/staging/lustre/lustre/llite/dir.c new file mode 100644 index 000000000..a5bc694dc --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/dir.c @@ -0,0 +1,1971 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/llite/dir.c + * + * Directory code for lustre client. + */ + +#include +#include +#include +#include +#include /* for wait_on_buffer */ +#include +#include + +#define DEBUG_SUBSYSTEM S_LLITE + +#include "../include/obd_support.h" +#include "../include/obd_class.h" +#include "../include/lustre_lib.h" +#include "../include/lustre/lustre_idl.h" +#include "../include/lustre_lite.h" +#include "../include/lustre_dlm.h" +#include "../include/lustre_fid.h" +#include "llite_internal.h" + +/* + * (new) readdir implementation overview. + * + * Original lustre readdir implementation cached exact copy of raw directory + * pages on the client. These pages were indexed in client page cache by + * logical offset in the directory file. This design, while very simple and + * intuitive had some inherent problems: + * + * . it implies that byte offset to the directory entry serves as a + * telldir(3)/seekdir(3) cookie, but that offset is not stable: in + * ext3/htree directory entries may move due to splits, and more + * importantly, + * + * . it is incompatible with the design of split directories for cmd3, + * that assumes that names are distributed across nodes based on their + * hash, and so readdir should be done in hash order. + * + * New readdir implementation does readdir in hash order, and uses hash of a + * file name as a telldir/seekdir cookie. This led to number of complications: + * + * . hash is not unique, so it cannot be used to index cached directory + * pages on the client (note, that it requires a whole pageful of hash + * collided entries to cause two pages to have identical hashes); + * + * . hash is not unique, so it cannot, strictly speaking, be used as an + * entry cookie. ext3/htree has the same problem and lustre implementation + * mimics their solution: seekdir(hash) positions directory at the first + * entry with the given hash. + * + * Client side. + * + * 0. caching + * + * Client caches directory pages using hash of the first entry as an index. As + * noted above hash is not unique, so this solution doesn't work as is: + * special processing is needed for "page hash chains" (i.e., sequences of + * pages filled with entries all having the same hash value). + * + * First, such chains have to be detected. To this end, server returns to the + * client the hash of the first entry on the page next to one returned. When + * client detects that this hash is the same as hash of the first entry on the + * returned page, page hash collision has to be handled. Pages in the + * hash chain, except first one, are termed "overflow pages". + * + * Solution to index uniqueness problem is to not cache overflow + * pages. Instead, when page hash collision is detected, all overflow pages + * from emerging chain are immediately requested from the server and placed in + * a special data structure (struct ll_dir_chain). This data structure is used + * by ll_readdir() to process entries from overflow pages. When readdir + * invocation finishes, overflow pages are discarded. If page hash collision + * chain weren't completely processed, next call to readdir will again detect + * page hash collision, again read overflow pages in, process next portion of + * entries and again discard the pages. This is not as wasteful as it looks, + * because, given reasonable hash, page hash collisions are extremely rare. + * + * 1. directory positioning + * + * When seekdir(hash) is called, original + * + * + * + * + * + * + * + * + * Server. 
+ * + * identification of and access to overflow pages + * + * page format + * + * Page in MDS_READPAGE RPC is packed in LU_PAGE_SIZE, and each page contains + * a header lu_dirpage which describes the start/end hash, and whether this + * page is empty (contains no dir entry) or hash collide with next page. + * After client receives reply, several pages will be integrated into dir page + * in PAGE_CACHE_SIZE (if PAGE_CACHE_SIZE greater than LU_PAGE_SIZE), and the + * lu_dirpage for this integrated page will be adjusted. See + * lmv_adjust_dirpages(). + * + */ + +/* returns the page unlocked, but with a reference */ +static int ll_dir_filler(void *_hash, struct page *page0) +{ + struct inode *inode = page0->mapping->host; + int hash64 = ll_i2sbi(inode)->ll_flags & LL_SBI_64BIT_HASH; + struct obd_export *exp = ll_i2sbi(inode)->ll_md_exp; + struct ptlrpc_request *request; + struct mdt_body *body; + struct md_op_data *op_data; + __u64 hash = *((__u64 *)_hash); + struct page **page_pool; + struct page *page; + struct lu_dirpage *dp; + int max_pages = ll_i2sbi(inode)->ll_md_brw_size >> PAGE_CACHE_SHIFT; + int nrdpgs = 0; /* number of pages read actually */ + int npages; + int i; + int rc; + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) hash %llu\n", + inode->i_ino, inode->i_generation, inode, hash); + + LASSERT(max_pages > 0 && max_pages <= MD_MAX_BRW_PAGES); + + page_pool = kcalloc(max_pages, sizeof(page), GFP_NOFS); + if (page_pool) { + page_pool[0] = page0; + } else { + page_pool = &page0; + max_pages = 1; + } + for (npages = 1; npages < max_pages; npages++) { + page = page_cache_alloc_cold(inode->i_mapping); + if (!page) + break; + page_pool[npages] = page; + } + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + op_data->op_npages = npages; + op_data->op_offset = hash; + rc = md_readpage(exp, op_data, page_pool, &request); + ll_finish_md_op_data(op_data); + if (rc < 0) { + /* page0 is special, which was added into page cache early */ + delete_from_page_cache(page0); + } else if (rc == 0) { + body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY); + /* Checked by mdc_readpage() */ + LASSERT(body != NULL); + + if (body->valid & OBD_MD_FLSIZE) + cl_isize_write(inode, body->size); + + nrdpgs = (request->rq_bulk->bd_nob_transferred+PAGE_CACHE_SIZE-1) + >> PAGE_CACHE_SHIFT; + SetPageUptodate(page0); + } + unlock_page(page0); + ptlrpc_req_finished(request); + + CDEBUG(D_VFSTRACE, "read %d/%d pages\n", nrdpgs, npages); + + ll_pagevec_init(&lru_pvec, 0); + for (i = 1; i < npages; i++) { + unsigned long offset; + int ret; + + page = page_pool[i]; + + if (rc < 0 || i >= nrdpgs) { + page_cache_release(page); + continue; + } + + SetPageUptodate(page); + + dp = kmap(page); + hash = le64_to_cpu(dp->ldp_hash_start); + kunmap(page); + + offset = hash_x_index(hash, hash64); + + prefetchw(&page->flags); + ret = add_to_page_cache_lru(page, inode->i_mapping, offset, + GFP_KERNEL); + if (ret == 0) { + unlock_page(page); + if (ll_pagevec_add(&lru_pvec, page) == 0) + ll_pagevec_lru_add_file(&lru_pvec); + } else { + CDEBUG(D_VFSTRACE, "page %lu add to page cache failed: %d\n", + offset, ret); + } + page_cache_release(page); + } + ll_pagevec_lru_add_file(&lru_pvec); + + if (page_pool != &page0) + OBD_FREE(page_pool, sizeof(struct page *) * max_pages); + return rc; +} + +static void ll_check_page(struct inode *dir, struct page *page) +{ + /* XXX: check page format later */ + SetPageChecked(page); +} + +void ll_release_page(struct page *page, int remove) +{ + 
kunmap(page); + if (remove) { + lock_page(page); + if (likely(page->mapping != NULL)) + truncate_complete_page(page->mapping, page); + unlock_page(page); + } + page_cache_release(page); +} + +/* + * Find, kmap and return page that contains given hash. + */ +static struct page *ll_dir_page_locate(struct inode *dir, __u64 *hash, + __u64 *start, __u64 *end) +{ + int hash64 = ll_i2sbi(dir)->ll_flags & LL_SBI_64BIT_HASH; + struct address_space *mapping = dir->i_mapping; + /* + * Complement of hash is used as an index so that + * radix_tree_gang_lookup() can be used to find a page with starting + * hash _smaller_ than one we are looking for. + */ + unsigned long offset = hash_x_index(*hash, hash64); + struct page *page; + int found; + + spin_lock_irq(&mapping->tree_lock); + found = radix_tree_gang_lookup(&mapping->page_tree, + (void **)&page, offset, 1); + if (found > 0 && !radix_tree_exceptional_entry(page)) { + struct lu_dirpage *dp; + + page_cache_get(page); + spin_unlock_irq(&mapping->tree_lock); + /* + * In contrast to find_lock_page() we are sure that directory + * page cannot be truncated (while DLM lock is held) and, + * hence, can avoid restart. + * + * In fact, page cannot be locked here at all, because + * ll_dir_filler() does synchronous io. + */ + wait_on_page_locked(page); + if (PageUptodate(page)) { + dp = kmap(page); + if (BITS_PER_LONG == 32 && hash64) { + *start = le64_to_cpu(dp->ldp_hash_start) >> 32; + *end = le64_to_cpu(dp->ldp_hash_end) >> 32; + *hash = *hash >> 32; + } else { + *start = le64_to_cpu(dp->ldp_hash_start); + *end = le64_to_cpu(dp->ldp_hash_end); + } + LASSERTF(*start <= *hash, "start = %#llx,end = %#llx,hash = %#llx\n", + *start, *end, *hash); + CDEBUG(D_VFSTRACE, "page %lu [%llu %llu], hash %llu\n", + offset, *start, *end, *hash); + if (*hash > *end) { + ll_release_page(page, 0); + page = NULL; + } else if (*end != *start && *hash == *end) { + /* + * upon hash collision, remove this page, + * otherwise put page reference, and + * ll_get_dir_page() will issue RPC to fetch + * the page we want. 
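+				 * (LDF_COLLIDE in ldp_flags tells us whether
+				 * entries with this hash spill over onto the
+				 * next page, i.e. whether the overflow chain
+				 * has to be re-read from the server.)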
+ */ + ll_release_page(page, + le32_to_cpu(dp->ldp_flags) & LDF_COLLIDE); + page = NULL; + } + } else { + page_cache_release(page); + page = ERR_PTR(-EIO); + } + + } else { + spin_unlock_irq(&mapping->tree_lock); + page = NULL; + } + return page; +} + +struct page *ll_get_dir_page(struct inode *dir, __u64 hash, + struct ll_dir_chain *chain) +{ + ldlm_policy_data_t policy = {.l_inodebits = {MDS_INODELOCK_UPDATE} }; + struct address_space *mapping = dir->i_mapping; + struct lustre_handle lockh; + struct lu_dirpage *dp; + struct page *page; + ldlm_mode_t mode; + int rc; + __u64 start = 0; + __u64 end = 0; + __u64 lhash = hash; + struct ll_inode_info *lli = ll_i2info(dir); + int hash64 = ll_i2sbi(dir)->ll_flags & LL_SBI_64BIT_HASH; + + mode = LCK_PR; + rc = md_lock_match(ll_i2sbi(dir)->ll_md_exp, LDLM_FL_BLOCK_GRANTED, + ll_inode2fid(dir), LDLM_IBITS, &policy, mode, &lockh); + if (!rc) { + struct ldlm_enqueue_info einfo = { + .ei_type = LDLM_IBITS, + .ei_mode = mode, + .ei_cb_bl = ll_md_blocking_ast, + .ei_cb_cp = ldlm_completion_ast, + }; + struct lookup_intent it = { .it_op = IT_READDIR }; + struct ptlrpc_request *request; + struct md_op_data *op_data; + + op_data = ll_prep_md_op_data(NULL, dir, dir, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + return (void *)op_data; + + rc = md_enqueue(ll_i2sbi(dir)->ll_md_exp, &einfo, &it, + op_data, &lockh, NULL, 0, NULL, 0); + + ll_finish_md_op_data(op_data); + + request = (struct ptlrpc_request *)it.d.lustre.it_data; + if (request) + ptlrpc_req_finished(request); + if (rc < 0) { + CERROR("lock enqueue: "DFID" at %llu: rc %d\n", + PFID(ll_inode2fid(dir)), hash, rc); + return ERR_PTR(rc); + } + + CDEBUG(D_INODE, "setting lr_lvb_inode to inode %p (%lu/%u)\n", + dir, dir->i_ino, dir->i_generation); + md_set_lock_data(ll_i2sbi(dir)->ll_md_exp, + &it.d.lustre.it_lock_handle, dir, NULL); + } else { + /* for cross-ref object, l_ast_data of the lock may not be set, + * we reset it here */ + md_set_lock_data(ll_i2sbi(dir)->ll_md_exp, &lockh.cookie, + dir, NULL); + } + ldlm_lock_dump_handle(D_OTHER, &lockh); + + mutex_lock(&lli->lli_readdir_mutex); + page = ll_dir_page_locate(dir, &lhash, &start, &end); + if (IS_ERR(page)) { + CERROR("dir page locate: "DFID" at %llu: rc %ld\n", + PFID(ll_inode2fid(dir)), lhash, PTR_ERR(page)); + goto out_unlock; + } else if (page != NULL) { + /* + * XXX nikita: not entirely correct handling of a corner case: + * suppose hash chain of entries with hash value HASH crosses + * border between pages P0 and P1. First both P0 and P1 are + * cached, seekdir() is called for some entry from the P0 part + * of the chain. Later P0 goes out of cache. telldir(HASH) + * happens and finds P1, as it starts with matching hash + * value. Remaining entries from P0 part of the chain are + * skipped. (Is that really a bug?) + * + * Possible solutions: 0. don't cache P1 is such case, handle + * it as an "overflow" page. 1. invalidate all pages at + * once. 2. use HASH|1 as an index for P1. 
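+		 * None of these is implemented here; the cached page found
+		 * above is simply handed to the hash_collision path below.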
+ */ + goto hash_collision; + } + + page = read_cache_page(mapping, hash_x_index(hash, hash64), + ll_dir_filler, &lhash); + if (IS_ERR(page)) { + CERROR("read cache page: "DFID" at %llu: rc %ld\n", + PFID(ll_inode2fid(dir)), hash, PTR_ERR(page)); + goto out_unlock; + } + + wait_on_page_locked(page); + (void)kmap(page); + if (!PageUptodate(page)) { + CERROR("page not updated: "DFID" at %llu: rc %d\n", + PFID(ll_inode2fid(dir)), hash, -5); + goto fail; + } + if (!PageChecked(page)) + ll_check_page(dir, page); + if (PageError(page)) { + CERROR("page error: "DFID" at %llu: rc %d\n", + PFID(ll_inode2fid(dir)), hash, -5); + goto fail; + } +hash_collision: + dp = page_address(page); + if (BITS_PER_LONG == 32 && hash64) { + start = le64_to_cpu(dp->ldp_hash_start) >> 32; + end = le64_to_cpu(dp->ldp_hash_end) >> 32; + lhash = hash >> 32; + } else { + start = le64_to_cpu(dp->ldp_hash_start); + end = le64_to_cpu(dp->ldp_hash_end); + lhash = hash; + } + if (end == start) { + LASSERT(start == lhash); + CWARN("Page-wide hash collision: %llu\n", end); + if (BITS_PER_LONG == 32 && hash64) + CWARN("Real page-wide hash collision at [%llu %llu] with hash %llu\n", + le64_to_cpu(dp->ldp_hash_start), + le64_to_cpu(dp->ldp_hash_end), hash); + /* + * Fetch whole overflow chain... + * + * XXX not yet. + */ + goto fail; + } +out_unlock: + mutex_unlock(&lli->lli_readdir_mutex); + ldlm_lock_decref(&lockh, mode); + return page; + +fail: + ll_release_page(page, 1); + page = ERR_PTR(-EIO); + goto out_unlock; +} + +int ll_dir_read(struct inode *inode, struct dir_context *ctx) +{ + struct ll_inode_info *info = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + __u64 pos = ctx->pos; + int api32 = ll_need_32bit_api(sbi); + int hash64 = sbi->ll_flags & LL_SBI_64BIT_HASH; + struct page *page; + struct ll_dir_chain chain; + int done = 0; + int rc = 0; + + ll_dir_chain_init(&chain); + + page = ll_get_dir_page(inode, pos, &chain); + + while (rc == 0 && !done) { + struct lu_dirpage *dp; + struct lu_dirent *ent; + + if (!IS_ERR(page)) { + /* + * If page is empty (end of directory is reached), + * use this value. + */ + __u64 hash = MDS_DIR_END_OFF; + __u64 next; + + dp = page_address(page); + for (ent = lu_dirent_start(dp); ent != NULL && !done; + ent = lu_dirent_next(ent)) { + __u16 type; + int namelen; + struct lu_fid fid; + __u64 lhash; + __u64 ino; + + /* + * XXX: implement correct swabbing here. + */ + + hash = le64_to_cpu(ent->lde_hash); + if (hash < pos) + /* + * Skip until we find target hash + * value. + */ + continue; + + namelen = le16_to_cpu(ent->lde_namelen); + if (namelen == 0) + /* + * Skip dummy record. + */ + continue; + + if (api32 && hash64) + lhash = hash >> 32; + else + lhash = hash; + fid_le_to_cpu(&fid, &ent->lde_fid); + ino = cl_fid_build_ino(&fid, api32); + type = ll_dirent_type_get(ent); + ctx->pos = lhash; + /* For 'll_nfs_get_name_filldir()', it will try + * to access the 'ent' through its 'lde_name', + * so the parameter 'name' for 'ctx->actor()' + * must be part of the 'ent'. + */ + done = !dir_emit(ctx, ent->lde_name, + namelen, ino, type); + } + next = le64_to_cpu(dp->ldp_hash_end); + if (!done) { + pos = next; + if (pos == MDS_DIR_END_OFF) { + /* + * End of directory reached. + */ + done = 1; + ll_release_page(page, 0); + } else if (1 /* chain is exhausted*/) { + /* + * Normal case: continue to the next + * page. 
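+					 * The page just processed is dropped
+					 * from the cache if it hash-collides
+					 * with the next one (LDF_COLLIDE).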
+ */ + ll_release_page(page, + le32_to_cpu(dp->ldp_flags) & + LDF_COLLIDE); + next = pos; + page = ll_get_dir_page(inode, pos, + &chain); + } else { + /* + * go into overflow page. + */ + LASSERT(le32_to_cpu(dp->ldp_flags) & + LDF_COLLIDE); + ll_release_page(page, 1); + } + } else { + pos = hash; + ll_release_page(page, 0); + } + } else { + rc = PTR_ERR(page); + CERROR("error reading dir "DFID" at %lu: rc %d\n", + PFID(&info->lli_fid), (unsigned long)pos, rc); + } + } + + ctx->pos = pos; + ll_dir_chain_fini(&chain); + return rc; +} + +static int ll_readdir(struct file *filp, struct dir_context *ctx) +{ + struct inode *inode = file_inode(filp); + struct ll_file_data *lfd = LUSTRE_FPRIVATE(filp); + struct ll_sb_info *sbi = ll_i2sbi(inode); + int hash64 = sbi->ll_flags & LL_SBI_64BIT_HASH; + int api32 = ll_need_32bit_api(sbi); + int rc; + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) pos %lu/%llu 32bit_api %d\n", + inode->i_ino, inode->i_generation, + inode, (unsigned long)lfd->lfd_pos, i_size_read(inode), api32); + + if (lfd->lfd_pos == MDS_DIR_END_OFF) { + /* + * end-of-file. + */ + rc = 0; + goto out; + } + + ctx->pos = lfd->lfd_pos; + rc = ll_dir_read(inode, ctx); + lfd->lfd_pos = ctx->pos; + if (ctx->pos == MDS_DIR_END_OFF) { + if (api32) + ctx->pos = LL_DIR_END_OFF_32BIT; + else + ctx->pos = LL_DIR_END_OFF; + } else { + if (api32 && hash64) + ctx->pos >>= 32; + } + filp->f_version = inode->i_version; + +out: + if (!rc) + ll_stats_ops_tally(sbi, LPROC_LL_READDIR, 1); + + return rc; +} + +static int ll_send_mgc_param(struct obd_export *mgc, char *string) +{ + struct mgs_send_param *msp; + int rc = 0; + + msp = kzalloc(sizeof(*msp), GFP_NOFS); + if (!msp) + return -ENOMEM; + + strncpy(msp->mgs_param, string, MGS_PARAM_MAXLEN); + rc = obd_set_info_async(NULL, mgc, sizeof(KEY_SET_INFO), KEY_SET_INFO, + sizeof(struct mgs_send_param), msp, NULL); + if (rc) + CERROR("Failed to set parameter: %d\n", rc); + OBD_FREE_PTR(msp); + + return rc; +} + +static int ll_dir_setdirstripe(struct inode *dir, struct lmv_user_md *lump, + char *filename) +{ + struct ptlrpc_request *request = NULL; + struct md_op_data *op_data; + struct ll_sb_info *sbi = ll_i2sbi(dir); + int mode; + int err; + + mode = (0755 & ~current_umask()) | S_IFDIR; + op_data = ll_prep_md_op_data(NULL, dir, NULL, filename, + strlen(filename), mode, LUSTRE_OPC_MKDIR, + lump); + if (IS_ERR(op_data)) { + err = PTR_ERR(op_data); + goto err_exit; + } + + op_data->op_cli_flags |= CLI_SET_MEA; + err = md_create(sbi->ll_md_exp, op_data, lump, sizeof(*lump), mode, + from_kuid(&init_user_ns, current_fsuid()), + from_kgid(&init_user_ns, current_fsgid()), + cfs_curproc_cap_pack(), 0, &request); + ll_finish_md_op_data(op_data); + if (err) + goto err_exit; +err_exit: + ptlrpc_req_finished(request); + return err; +} + +int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump, + int set_default) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct md_op_data *op_data; + struct ptlrpc_request *req = NULL; + int rc = 0; + struct lustre_sb_info *lsi = s2lsi(inode->i_sb); + struct obd_device *mgc = lsi->lsi_mgc; + int lum_size; + + if (lump != NULL) { + /* + * This is coming from userspace, so should be in + * local endian. But the MDS would like it in little + * endian, so we swab it before we send it. 
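+		 * On a big-endian host lmm_magic will not equal
+		 * cpu_to_le32(LOV_USER_MAGIC_Vx), and the matching
+		 * lustre_swab_lov_user_md_vx() below converts the
+		 * structure in place before it is packed into the RPC.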
+ */ + switch (lump->lmm_magic) { + case LOV_USER_MAGIC_V1: { + if (lump->lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V1)) + lustre_swab_lov_user_md_v1(lump); + lum_size = sizeof(struct lov_user_md_v1); + break; + } + case LOV_USER_MAGIC_V3: { + if (lump->lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V3)) + lustre_swab_lov_user_md_v3( + (struct lov_user_md_v3 *)lump); + lum_size = sizeof(struct lov_user_md_v3); + break; + } + default: { + CDEBUG(D_IOCTL, "bad userland LOV MAGIC: %#08x != %#08x nor %#08x\n", + lump->lmm_magic, LOV_USER_MAGIC_V1, + LOV_USER_MAGIC_V3); + return -EINVAL; + } + } + } else { + lum_size = sizeof(struct lov_user_md_v1); + } + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + return PTR_ERR(op_data); + + if (lump != NULL && lump->lmm_magic == cpu_to_le32(LMV_USER_MAGIC)) + op_data->op_cli_flags |= CLI_SET_MEA; + + /* swabbing is done in lov_setstripe() on server side */ + rc = md_setattr(sbi->ll_md_exp, op_data, lump, lum_size, + NULL, 0, &req, NULL); + ll_finish_md_op_data(op_data); + ptlrpc_req_finished(req); + if (rc) { + if (rc != -EPERM && rc != -EACCES) + CERROR("mdc_setattr fails: rc = %d\n", rc); + } + + /* In the following we use the fact that LOV_USER_MAGIC_V1 and + LOV_USER_MAGIC_V3 have the same initial fields so we do not + need to make the distinction between the 2 versions */ + if (set_default && mgc->u.cli.cl_mgc_mgsexp) { + char *param = NULL; + char *buf; + + param = kzalloc(MGS_PARAM_MAXLEN, GFP_NOFS); + if (!param) { + rc = -ENOMEM; + goto end; + } + + buf = param; + /* Get fsname and assume devname to be -MDT0000. */ + ll_get_fsname(inode->i_sb, buf, MTI_NAME_MAXLEN); + strcat(buf, "-MDT0000.lov"); + buf += strlen(buf); + + /* Set root stripesize */ + sprintf(buf, ".stripesize=%u", + lump ? le32_to_cpu(lump->lmm_stripe_size) : 0); + rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param); + if (rc) + goto end; + + /* Set root stripecount */ + sprintf(buf, ".stripecount=%hd", + lump ? le16_to_cpu(lump->lmm_stripe_count) : 0); + rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param); + if (rc) + goto end; + + /* Set root stripeoffset */ + sprintf(buf, ".stripeoffset=%hd", + lump ? 
le16_to_cpu(lump->lmm_stripe_offset) : + (typeof(lump->lmm_stripe_offset))(-1)); + rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param); + +end: + if (param != NULL) + OBD_FREE(param, MGS_PARAM_MAXLEN); + } + return rc; +} + +int ll_dir_getstripe(struct inode *inode, struct lov_mds_md **lmmp, + int *lmm_size, struct ptlrpc_request **request) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct mdt_body *body; + struct lov_mds_md *lmm = NULL; + struct ptlrpc_request *req = NULL; + int rc, lmmsize; + struct md_op_data *op_data; + + rc = ll_get_default_mdsize(sbi, &lmmsize); + if (rc) + return rc; + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, + 0, lmmsize, LUSTRE_OPC_ANY, + NULL); + if (IS_ERR(op_data)) + return PTR_ERR(op_data); + + op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA; + rc = md_getattr(sbi->ll_md_exp, op_data, &req); + ll_finish_md_op_data(op_data); + if (rc < 0) { + CDEBUG(D_INFO, "md_getattr failed on inode %lu/%u: rc %d\n", + inode->i_ino, + inode->i_generation, rc); + goto out; + } + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + LASSERT(body != NULL); + + lmmsize = body->eadatasize; + + if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) || + lmmsize == 0) { + rc = -ENODATA; + goto out; + } + + lmm = req_capsule_server_sized_get(&req->rq_pill, + &RMF_MDT_MD, lmmsize); + LASSERT(lmm != NULL); + + /* + * This is coming from the MDS, so is probably in + * little endian. We convert it to host endian before + * passing it to userspace. + */ + /* We don't swab objects for directories */ + switch (le32_to_cpu(lmm->lmm_magic)) { + case LOV_MAGIC_V1: + if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) + lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm); + break; + case LOV_MAGIC_V3: + if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) + lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm); + break; + default: + CERROR("unknown magic: %lX\n", (unsigned long)lmm->lmm_magic); + rc = -EPROTO; + } +out: + *lmmp = lmm; + *lmm_size = lmmsize; + *request = req; + return rc; +} + +/* + * Get MDT index for the inode. + */ +int ll_get_mdt_idx(struct inode *inode) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct md_op_data *op_data; + int rc, mdtidx; + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, + 0, LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + return PTR_ERR(op_data); + + op_data->op_flags |= MF_GET_MDT_IDX; + rc = md_getattr(sbi->ll_md_exp, op_data, NULL); + mdtidx = op_data->op_mds; + ll_finish_md_op_data(op_data); + if (rc < 0) { + CDEBUG(D_INFO, "md_getattr_name: %d\n", rc); + return rc; + } + return mdtidx; +} + +/** + * Generic handler to do any pre-copy work. + * + * It send a first hsm_progress (with extent length == 0) to coordinator as a + * first information for it that real work has started. + * + * Moreover, for a ARCHIVE request, it will sample the file data version and + * store it in \a copy. + * + * \return 0 on success. + */ +static int ll_ioc_copy_start(struct super_block *sb, struct hsm_copy *copy) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct hsm_progress_kernel hpk; + int rc; + + /* Forge a hsm_progress based on data from copy. */ + hpk.hpk_fid = copy->hc_hai.hai_fid; + hpk.hpk_cookie = copy->hc_hai.hai_cookie; + hpk.hpk_extent.offset = copy->hc_hai.hai_extent.offset; + hpk.hpk_extent.length = 0; + hpk.hpk_flags = 0; + hpk.hpk_errval = 0; + hpk.hpk_data_version = 0; + + + /* For archive request, we need to read the current file version. 
*/ + if (copy->hc_hai.hai_action == HSMA_ARCHIVE) { + struct inode *inode; + __u64 data_version = 0; + + /* Get inode for this fid */ + inode = search_inode_for_lustre(sb, ©->hc_hai.hai_fid); + if (IS_ERR(inode)) { + hpk.hpk_flags |= HP_FLAG_RETRY; + /* hpk_errval is >= 0 */ + hpk.hpk_errval = -PTR_ERR(inode); + rc = PTR_ERR(inode); + goto progress; + } + + /* Read current file data version */ + rc = ll_data_version(inode, &data_version, 1); + iput(inode); + if (rc != 0) { + CDEBUG(D_HSM, "Could not read file data version of " + DFID" (rc = %d). Archive request (%#llx) could not be done.\n", + PFID(©->hc_hai.hai_fid), rc, + copy->hc_hai.hai_cookie); + hpk.hpk_flags |= HP_FLAG_RETRY; + /* hpk_errval must be >= 0 */ + hpk.hpk_errval = -rc; + goto progress; + } + + /* Store it the hsm_copy for later copytool use. + * Always modified even if no lsm. */ + copy->hc_data_version = data_version; + } + +progress: + rc = obd_iocontrol(LL_IOC_HSM_PROGRESS, sbi->ll_md_exp, sizeof(hpk), + &hpk, NULL); + + return rc; +} + +/** + * Generic handler to do any post-copy work. + * + * It will send the last hsm_progress update to coordinator to inform it + * that copy is finished and whether it was successful or not. + * + * Moreover, + * - for ARCHIVE request, it will sample the file data version and compare it + * with the version saved in ll_ioc_copy_start(). If they do not match, copy + * will be considered as failed. + * - for RESTORE request, it will sample the file data version and send it to + * coordinator which is useful if the file was imported as 'released'. + * + * \return 0 on success. + */ +static int ll_ioc_copy_end(struct super_block *sb, struct hsm_copy *copy) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct hsm_progress_kernel hpk; + int rc; + + /* If you modify the logic here, also check llapi_hsm_copy_end(). */ + /* Take care: copy->hc_hai.hai_action, len, gid and data are not + * initialized if copy_end was called with copy == NULL. + */ + + /* Forge a hsm_progress based on data from copy. */ + hpk.hpk_fid = copy->hc_hai.hai_fid; + hpk.hpk_cookie = copy->hc_hai.hai_cookie; + hpk.hpk_extent = copy->hc_hai.hai_extent; + hpk.hpk_flags = copy->hc_flags | HP_FLAG_COMPLETED; + hpk.hpk_errval = copy->hc_errval; + hpk.hpk_data_version = 0; + + /* For archive request, we need to check the file data was not changed. + * + * For restore request, we need to send the file data version, this is + * useful when the file was created using hsm_import. + */ + if (((copy->hc_hai.hai_action == HSMA_ARCHIVE) || + (copy->hc_hai.hai_action == HSMA_RESTORE)) && + (copy->hc_errval == 0)) { + struct inode *inode; + __u64 data_version = 0; + + /* Get lsm for this fid */ + inode = search_inode_for_lustre(sb, ©->hc_hai.hai_fid); + if (IS_ERR(inode)) { + hpk.hpk_flags |= HP_FLAG_RETRY; + /* hpk_errval must be >= 0 */ + hpk.hpk_errval = -PTR_ERR(inode); + rc = PTR_ERR(inode); + goto progress; + } + + rc = ll_data_version(inode, &data_version, + copy->hc_hai.hai_action == HSMA_ARCHIVE); + iput(inode); + if (rc) { + CDEBUG(D_HSM, "Could not read file data version. Request could not be confirmed.\n"); + if (hpk.hpk_errval == 0) + hpk.hpk_errval = -rc; + goto progress; + } + + /* Store it the hsm_copy for later copytool use. + * Always modified even if no lsm. */ + hpk.hpk_data_version = data_version; + + /* File could have been stripped during archiving, so we need + * to check anyway. 
*/ + if ((copy->hc_hai.hai_action == HSMA_ARCHIVE) && + (copy->hc_data_version != data_version)) { + CDEBUG(D_HSM, "File data version mismatched. File content was changed during archiving. " + DFID", start:%#llx current:%#llx\n", + PFID(©->hc_hai.hai_fid), + copy->hc_data_version, data_version); + /* File was changed, send error to cdt. Do not ask for + * retry because if a file is modified frequently, + * the cdt will loop on retried archive requests. + * The policy engine will ask for a new archive later + * when the file will not be modified for some tunable + * time */ + /* we do not notify caller */ + hpk.hpk_flags &= ~HP_FLAG_RETRY; + /* hpk_errval must be >= 0 */ + hpk.hpk_errval = EBUSY; + } + + } + +progress: + rc = obd_iocontrol(LL_IOC_HSM_PROGRESS, sbi->ll_md_exp, sizeof(hpk), + &hpk, NULL); + + return rc; +} + + +static int copy_and_ioctl(int cmd, struct obd_export *exp, + const void __user *data, size_t size) +{ + void *copy; + int rc; + + copy = kzalloc(size, GFP_NOFS); + if (!copy) + return -ENOMEM; + + if (copy_from_user(copy, data, size)) { + rc = -EFAULT; + goto out; + } + + rc = obd_iocontrol(cmd, exp, size, copy, NULL); +out: + OBD_FREE(copy, size); + + return rc; +} + +static int quotactl_ioctl(struct ll_sb_info *sbi, struct if_quotactl *qctl) +{ + int cmd = qctl->qc_cmd; + int type = qctl->qc_type; + int id = qctl->qc_id; + int valid = qctl->qc_valid; + int rc = 0; + + switch (cmd) { + case LUSTRE_Q_INVALIDATE: + case LUSTRE_Q_FINVALIDATE: + case Q_QUOTAON: + case Q_QUOTAOFF: + case Q_SETQUOTA: + case Q_SETINFO: + if (!capable(CFS_CAP_SYS_ADMIN) || + sbi->ll_flags & LL_SBI_RMT_CLIENT) + return -EPERM; + break; + case Q_GETQUOTA: + if (((type == USRQUOTA && + !uid_eq(current_euid(), make_kuid(&init_user_ns, id))) || + (type == GRPQUOTA && + !in_egroup_p(make_kgid(&init_user_ns, id)))) && + (!capable(CFS_CAP_SYS_ADMIN) || + sbi->ll_flags & LL_SBI_RMT_CLIENT)) + return -EPERM; + break; + case Q_GETINFO: + break; + default: + CERROR("unsupported quotactl op: %#x\n", cmd); + return -ENOTTY; + } + + if (valid != QC_GENERAL) { + if (sbi->ll_flags & LL_SBI_RMT_CLIENT) + return -EOPNOTSUPP; + + if (cmd == Q_GETINFO) + qctl->qc_cmd = Q_GETOINFO; + else if (cmd == Q_GETQUOTA) + qctl->qc_cmd = Q_GETOQUOTA; + else + return -EINVAL; + + switch (valid) { + case QC_MDTIDX: + rc = obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_md_exp, + sizeof(*qctl), qctl, NULL); + break; + case QC_OSTIDX: + rc = obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_dt_exp, + sizeof(*qctl), qctl, NULL); + break; + case QC_UUID: + rc = obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_md_exp, + sizeof(*qctl), qctl, NULL); + if (rc == -EAGAIN) + rc = obd_iocontrol(OBD_IOC_QUOTACTL, + sbi->ll_dt_exp, + sizeof(*qctl), qctl, NULL); + break; + default: + rc = -EINVAL; + break; + } + + if (rc) + return rc; + + qctl->qc_cmd = cmd; + } else { + struct obd_quotactl *oqctl; + + oqctl = kzalloc(sizeof(*oqctl), GFP_NOFS); + if (!oqctl) + return -ENOMEM; + + QCTL_COPY(oqctl, qctl); + rc = obd_quotactl(sbi->ll_md_exp, oqctl); + if (rc) { + if (rc != -EALREADY && cmd == Q_QUOTAON) { + oqctl->qc_cmd = Q_QUOTAOFF; + obd_quotactl(sbi->ll_md_exp, oqctl); + } + OBD_FREE_PTR(oqctl); + return rc; + } + /* If QIF_SPACE is not set, client should collect the + * space usage from OSSs by itself */ + if (cmd == Q_GETQUOTA && + !(oqctl->qc_dqblk.dqb_valid & QIF_SPACE) && + !oqctl->qc_dqblk.dqb_curspace) { + struct obd_quotactl *oqctl_tmp; + + oqctl_tmp = kzalloc(sizeof(*oqctl_tmp), GFP_NOFS); + if (!oqctl_tmp) { + rc = -ENOMEM; + goto out; + } + + 
oqctl_tmp->qc_cmd = Q_GETOQUOTA; + oqctl_tmp->qc_id = oqctl->qc_id; + oqctl_tmp->qc_type = oqctl->qc_type; + + /* collect space usage from OSTs */ + oqctl_tmp->qc_dqblk.dqb_curspace = 0; + rc = obd_quotactl(sbi->ll_dt_exp, oqctl_tmp); + if (!rc || rc == -EREMOTEIO) { + oqctl->qc_dqblk.dqb_curspace = + oqctl_tmp->qc_dqblk.dqb_curspace; + oqctl->qc_dqblk.dqb_valid |= QIF_SPACE; + } + + /* collect space & inode usage from MDTs */ + oqctl_tmp->qc_dqblk.dqb_curspace = 0; + oqctl_tmp->qc_dqblk.dqb_curinodes = 0; + rc = obd_quotactl(sbi->ll_md_exp, oqctl_tmp); + if (!rc || rc == -EREMOTEIO) { + oqctl->qc_dqblk.dqb_curspace += + oqctl_tmp->qc_dqblk.dqb_curspace; + oqctl->qc_dqblk.dqb_curinodes = + oqctl_tmp->qc_dqblk.dqb_curinodes; + oqctl->qc_dqblk.dqb_valid |= QIF_INODES; + } else { + oqctl->qc_dqblk.dqb_valid &= ~QIF_SPACE; + } + + OBD_FREE_PTR(oqctl_tmp); + } +out: + QCTL_COPY(qctl, oqctl); + OBD_FREE_PTR(oqctl); + } + + return rc; +} + +static char * +ll_getname(const char __user *filename) +{ + int ret = 0, len; + char *tmp = __getname(); + + if (!tmp) + return ERR_PTR(-ENOMEM); + + len = strncpy_from_user(tmp, filename, PATH_MAX); + if (len == 0) + ret = -ENOENT; + else if (len > PATH_MAX) + ret = -ENAMETOOLONG; + + if (ret) { + __putname(tmp); + tmp = ERR_PTR(ret); + } + return tmp; +} + +#define ll_putname(filename) __putname(filename) + +static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = file_inode(file); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct obd_ioctl_data *data; + int rc = 0; + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), cmd=%#x\n", + inode->i_ino, inode->i_generation, inode, cmd); + + /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */ + if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */ + return -ENOTTY; + + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1); + switch (cmd) { + case FSFILT_IOC_GETFLAGS: + case FSFILT_IOC_SETFLAGS: + return ll_iocontrol(inode, file, cmd, arg); + case FSFILT_IOC_GETVERSION_OLD: + case FSFILT_IOC_GETVERSION: + return put_user(inode->i_generation, (int *)arg); + /* We need to special case any other ioctls we want to handle, + * to send them to the MDS/OST as appropriate and to properly + * network encode the arg field. 
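+	 * The cases below are examples of such ioctls and are left inside
+	 * this comment, i.e. unhandled: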
+ case FSFILT_IOC_SETVERSION_OLD: + case FSFILT_IOC_SETVERSION: + */ + case LL_IOC_GET_MDTIDX: { + int mdtidx; + + mdtidx = ll_get_mdt_idx(inode); + if (mdtidx < 0) + return mdtidx; + + if (put_user((int)mdtidx, (int *)arg)) + return -EFAULT; + + return 0; + } + case IOC_MDC_LOOKUP: { + struct ptlrpc_request *request = NULL; + int namelen, len = 0; + char *buf = NULL; + char *filename; + struct md_op_data *op_data; + + rc = obd_ioctl_getdata(&buf, &len, (void *)arg); + if (rc) + return rc; + data = (void *)buf; + + filename = data->ioc_inlbuf1; + namelen = strlen(filename); + + if (namelen < 1) { + CDEBUG(D_INFO, "IOC_MDC_LOOKUP missing filename\n"); + rc = -EINVAL; + goto out_free; + } + + op_data = ll_prep_md_op_data(NULL, inode, NULL, filename, namelen, + 0, LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) { + rc = PTR_ERR(op_data); + goto out_free; + } + + op_data->op_valid = OBD_MD_FLID; + rc = md_getattr_name(sbi->ll_md_exp, op_data, &request); + ll_finish_md_op_data(op_data); + if (rc < 0) { + CDEBUG(D_INFO, "md_getattr_name: %d\n", rc); + goto out_free; + } + ptlrpc_req_finished(request); +out_free: + obd_ioctl_freedata(buf, len); + return rc; + } + case LL_IOC_LMV_SETSTRIPE: { + struct lmv_user_md *lum; + char *buf = NULL; + char *filename; + int namelen = 0; + int lumlen = 0; + int len; + int rc; + + rc = obd_ioctl_getdata(&buf, &len, (void *)arg); + if (rc) + return rc; + + data = (void *)buf; + if (data->ioc_inlbuf1 == NULL || data->ioc_inlbuf2 == NULL || + data->ioc_inllen1 == 0 || data->ioc_inllen2 == 0) { + rc = -EINVAL; + goto lmv_out_free; + } + + filename = data->ioc_inlbuf1; + namelen = data->ioc_inllen1; + + if (namelen < 1) { + CDEBUG(D_INFO, "IOC_MDC_LOOKUP missing filename\n"); + rc = -EINVAL; + goto lmv_out_free; + } + lum = (struct lmv_user_md *)data->ioc_inlbuf2; + lumlen = data->ioc_inllen2; + + if (lum->lum_magic != LMV_USER_MAGIC || + lumlen != sizeof(*lum)) { + CERROR("%s: wrong lum magic %x or size %d: rc = %d\n", + filename, lum->lum_magic, lumlen, -EFAULT); + rc = -EINVAL; + goto lmv_out_free; + } + + /** + * ll_dir_setdirstripe will be used to set dir stripe + * mdc_create--->mdt_reint_create (with dirstripe) + */ + rc = ll_dir_setdirstripe(inode, lum, filename); +lmv_out_free: + obd_ioctl_freedata(buf, len); + return rc; + + } + case LL_IOC_LOV_SETSTRIPE: { + struct lov_user_md_v3 lumv3; + struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3; + struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg; + struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg; + + int set_default = 0; + + LASSERT(sizeof(lumv3) == sizeof(*lumv3p)); + LASSERT(sizeof(lumv3.lmm_objects[0]) == + sizeof(lumv3p->lmm_objects[0])); + /* first try with v1 which is smaller than v3 */ + if (copy_from_user(lumv1, lumv1p, sizeof(*lumv1))) + return -EFAULT; + + if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) { + if (copy_from_user(&lumv3, lumv3p, sizeof(lumv3))) + return -EFAULT; + } + + if (is_root_inode(inode)) + set_default = 1; + + /* in v1 and v3 cases lumv1 points to data */ + rc = ll_dir_setstripe(inode, lumv1, set_default); + + return rc; + } + case LL_IOC_LMV_GETSTRIPE: { + struct lmv_user_md *lump = (struct lmv_user_md *)arg; + struct lmv_user_md lum; + struct lmv_user_md *tmp; + int lum_size; + int rc = 0; + int mdtindex; + + if (copy_from_user(&lum, lump, sizeof(struct lmv_user_md))) + return -EFAULT; + + if (lum.lum_magic != LMV_MAGIC_V1) + return -EINVAL; + + lum_size = lmv_user_md_size(1, LMV_MAGIC_V1); + tmp = kzalloc(lum_size, GFP_NOFS); + if (!tmp) { + rc = 
-ENOMEM; + goto free_lmv; + } + + *tmp = lum; + tmp->lum_type = LMV_STRIPE_TYPE; + tmp->lum_stripe_count = 1; + mdtindex = ll_get_mdt_idx(inode); + if (mdtindex < 0) { + rc = -ENOMEM; + goto free_lmv; + } + + tmp->lum_stripe_offset = mdtindex; + tmp->lum_objects[0].lum_mds = mdtindex; + memcpy(&tmp->lum_objects[0].lum_fid, ll_inode2fid(inode), + sizeof(struct lu_fid)); + if (copy_to_user((void *)arg, tmp, lum_size)) { + rc = -EFAULT; + goto free_lmv; + } +free_lmv: + if (tmp) + OBD_FREE(tmp, lum_size); + return rc; + } + case LL_IOC_REMOVE_ENTRY: { + char *filename = NULL; + int namelen = 0; + int rc; + + /* Here is a little hack to avoid sending REINT_RMENTRY to + * unsupported server, which might crash the server(LU-2730), + * Because both LVB_TYPE and REINT_RMENTRY will be supported + * on 2.4, we use OBD_CONNECT_LVB_TYPE to detect whether the + * server will support REINT_RMENTRY XXX*/ + if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_LVB_TYPE)) + return -ENOTSUPP; + + filename = ll_getname((const char *)arg); + if (IS_ERR(filename)) + return PTR_ERR(filename); + + namelen = strlen(filename); + if (namelen < 1) { + rc = -EINVAL; + goto out_rmdir; + } + + rc = ll_rmdir_entry(inode, filename, namelen); +out_rmdir: + if (filename) + ll_putname(filename); + return rc; + } + case LL_IOC_LOV_SWAP_LAYOUTS: + return -EPERM; + case LL_IOC_OBD_STATFS: + return ll_obd_statfs(inode, (void *)arg); + case LL_IOC_LOV_GETSTRIPE: + case LL_IOC_MDC_GETINFO: + case IOC_MDC_GETFILEINFO: + case IOC_MDC_GETFILESTRIPE: { + struct ptlrpc_request *request = NULL; + struct lov_user_md *lump; + struct lov_mds_md *lmm = NULL; + struct mdt_body *body; + char *filename = NULL; + int lmmsize; + + if (cmd == IOC_MDC_GETFILEINFO || + cmd == IOC_MDC_GETFILESTRIPE) { + filename = ll_getname((const char *)arg); + if (IS_ERR(filename)) + return PTR_ERR(filename); + + rc = ll_lov_getstripe_ea_info(inode, filename, &lmm, + &lmmsize, &request); + } else { + rc = ll_dir_getstripe(inode, &lmm, &lmmsize, &request); + } + + if (request) { + body = req_capsule_server_get(&request->rq_pill, + &RMF_MDT_BODY); + LASSERT(body != NULL); + } else { + goto out_req; + } + + if (rc < 0) { + if (rc == -ENODATA && (cmd == IOC_MDC_GETFILEINFO || + cmd == LL_IOC_MDC_GETINFO)) { + rc = 0; + goto skip_lmm; + } else + goto out_req; + } + + if (cmd == IOC_MDC_GETFILESTRIPE || + cmd == LL_IOC_LOV_GETSTRIPE) { + lump = (struct lov_user_md *)arg; + } else { + struct lov_user_mds_data *lmdp; + + lmdp = (struct lov_user_mds_data *)arg; + lump = &lmdp->lmd_lmm; + } + if (copy_to_user(lump, lmm, lmmsize)) { + if (copy_to_user(lump, lmm, sizeof(*lump))) { + rc = -EFAULT; + goto out_req; + } + rc = -EOVERFLOW; + } +skip_lmm: + if (cmd == IOC_MDC_GETFILEINFO || cmd == LL_IOC_MDC_GETINFO) { + struct lov_user_mds_data *lmdp; + lstat_t st = { 0 }; + + st.st_dev = inode->i_sb->s_dev; + st.st_mode = body->mode; + st.st_nlink = body->nlink; + st.st_uid = body->uid; + st.st_gid = body->gid; + st.st_rdev = body->rdev; + st.st_size = body->size; + st.st_blksize = PAGE_CACHE_SIZE; + st.st_blocks = body->blocks; + st.st_atime = body->atime; + st.st_mtime = body->mtime; + st.st_ctime = body->ctime; + st.st_ino = inode->i_ino; + + lmdp = (struct lov_user_mds_data *)arg; + if (copy_to_user(&lmdp->lmd_st, &st, sizeof(st))) { + rc = -EFAULT; + goto out_req; + } + } + +out_req: + ptlrpc_req_finished(request); + if (filename) + ll_putname(filename); + return rc; + } + case IOC_LOV_GETINFO: { + struct lov_user_mds_data *lumd; + struct lov_stripe_md *lsm; + struct 
lov_user_md *lum; + struct lov_mds_md *lmm; + int lmmsize; + lstat_t st; + + lumd = (struct lov_user_mds_data *)arg; + lum = &lumd->lmd_lmm; + + rc = ll_get_max_mdsize(sbi, &lmmsize); + if (rc) + return rc; + + OBD_ALLOC_LARGE(lmm, lmmsize); + if (lmm == NULL) + return -ENOMEM; + if (copy_from_user(lmm, lum, lmmsize)) { + rc = -EFAULT; + goto free_lmm; + } + + switch (lmm->lmm_magic) { + case LOV_USER_MAGIC_V1: + if (LOV_USER_MAGIC_V1 == cpu_to_le32(LOV_USER_MAGIC_V1)) + break; + /* swab objects first so that stripes num will be sane */ + lustre_swab_lov_user_md_objects( + ((struct lov_user_md_v1 *)lmm)->lmm_objects, + ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count); + lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm); + break; + case LOV_USER_MAGIC_V3: + if (LOV_USER_MAGIC_V3 == cpu_to_le32(LOV_USER_MAGIC_V3)) + break; + /* swab objects first so that stripes num will be sane */ + lustre_swab_lov_user_md_objects( + ((struct lov_user_md_v3 *)lmm)->lmm_objects, + ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count); + lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm); + break; + default: + rc = -EINVAL; + goto free_lmm; + } + + rc = obd_unpackmd(sbi->ll_dt_exp, &lsm, lmm, lmmsize); + if (rc < 0) { + rc = -ENOMEM; + goto free_lmm; + } + + /* Perform glimpse_size operation. */ + memset(&st, 0, sizeof(st)); + + rc = ll_glimpse_ioctl(sbi, lsm, &st); + if (rc) + goto free_lsm; + + if (copy_to_user(&lumd->lmd_st, &st, sizeof(st))) { + rc = -EFAULT; + goto free_lsm; + } + +free_lsm: + obd_free_memmd(sbi->ll_dt_exp, &lsm); +free_lmm: + OBD_FREE_LARGE(lmm, lmmsize); + return rc; + } + case OBD_IOC_LLOG_CATINFO: { + return -EOPNOTSUPP; + } + case OBD_IOC_QUOTACHECK: { + struct obd_quotactl *oqctl; + int error = 0; + + if (!capable(CFS_CAP_SYS_ADMIN) || + sbi->ll_flags & LL_SBI_RMT_CLIENT) + return -EPERM; + + oqctl = kzalloc(sizeof(*oqctl), GFP_NOFS); + if (!oqctl) + return -ENOMEM; + oqctl->qc_type = arg; + rc = obd_quotacheck(sbi->ll_md_exp, oqctl); + if (rc < 0) { + CDEBUG(D_INFO, "md_quotacheck failed: rc %d\n", rc); + error = rc; + } + + rc = obd_quotacheck(sbi->ll_dt_exp, oqctl); + if (rc < 0) + CDEBUG(D_INFO, "obd_quotacheck failed: rc %d\n", rc); + + OBD_FREE_PTR(oqctl); + return error ?: rc; + } + case OBD_IOC_POLL_QUOTACHECK: { + struct if_quotacheck *check; + + if (!capable(CFS_CAP_SYS_ADMIN) || + sbi->ll_flags & LL_SBI_RMT_CLIENT) + return -EPERM; + + check = kzalloc(sizeof(*check), GFP_NOFS); + if (!check) + return -ENOMEM; + + rc = obd_iocontrol(cmd, sbi->ll_md_exp, 0, (void *)check, + NULL); + if (rc) { + CDEBUG(D_QUOTA, "mdc ioctl %d failed: %d\n", cmd, rc); + if (copy_to_user((void *)arg, check, + sizeof(*check))) + CDEBUG(D_QUOTA, "copy_to_user failed\n"); + goto out_poll; + } + + rc = obd_iocontrol(cmd, sbi->ll_dt_exp, 0, (void *)check, + NULL); + if (rc) { + CDEBUG(D_QUOTA, "osc ioctl %d failed: %d\n", cmd, rc); + if (copy_to_user((void *)arg, check, + sizeof(*check))) + CDEBUG(D_QUOTA, "copy_to_user failed\n"); + goto out_poll; + } +out_poll: + OBD_FREE_PTR(check); + return rc; + } + case LL_IOC_QUOTACTL: { + struct if_quotactl *qctl; + + qctl = kzalloc(sizeof(*qctl), GFP_NOFS); + if (!qctl) + return -ENOMEM; + + if (copy_from_user(qctl, (void *)arg, sizeof(*qctl))) { + rc = -EFAULT; + goto out_quotactl; + } + + rc = quotactl_ioctl(sbi, qctl); + + if (rc == 0 && copy_to_user((void *)arg, qctl, sizeof(*qctl))) + rc = -EFAULT; + +out_quotactl: + OBD_FREE_PTR(qctl); + return rc; + } + case OBD_IOC_GETDTNAME: + case OBD_IOC_GETMDNAME: + return ll_get_obd_name(inode, cmd, 
arg); + case LL_IOC_FLUSHCTX: + return ll_flush_ctx(inode); +#ifdef CONFIG_FS_POSIX_ACL + case LL_IOC_RMTACL: { + if (sbi->ll_flags & LL_SBI_RMT_CLIENT && is_root_inode(inode)) { + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + + LASSERT(fd != NULL); + rc = rct_add(&sbi->ll_rct, current_pid(), arg); + if (!rc) + fd->fd_flags |= LL_FILE_RMTACL; + return rc; + } else + return 0; + } +#endif + case LL_IOC_GETOBDCOUNT: { + int count, vallen; + struct obd_export *exp; + + if (copy_from_user(&count, (int *)arg, sizeof(int))) + return -EFAULT; + + /* get ost count when count is zero, get mdt count otherwise */ + exp = count ? sbi->ll_md_exp : sbi->ll_dt_exp; + vallen = sizeof(count); + rc = obd_get_info(NULL, exp, sizeof(KEY_TGT_COUNT), + KEY_TGT_COUNT, &vallen, &count, NULL); + if (rc) { + CERROR("get target count failed: %d\n", rc); + return rc; + } + + if (copy_to_user((int *)arg, &count, sizeof(int))) + return -EFAULT; + + return 0; + } + case LL_IOC_PATH2FID: + if (copy_to_user((void *)arg, ll_inode2fid(inode), + sizeof(struct lu_fid))) + return -EFAULT; + return 0; + case LL_IOC_GET_CONNECT_FLAGS: { + return obd_iocontrol(cmd, sbi->ll_md_exp, 0, NULL, (void *)arg); + } + case OBD_IOC_CHANGELOG_SEND: + case OBD_IOC_CHANGELOG_CLEAR: + rc = copy_and_ioctl(cmd, sbi->ll_md_exp, (void *)arg, + sizeof(struct ioc_changelog)); + return rc; + case OBD_IOC_FID2PATH: + return ll_fid2path(inode, (void *)arg); + case LL_IOC_HSM_REQUEST: { + struct hsm_user_request *hur; + ssize_t totalsize; + + hur = kzalloc(sizeof(*hur), GFP_NOFS); + if (!hur) + return -ENOMEM; + + /* We don't know the true size yet; copy the fixed-size part */ + if (copy_from_user(hur, (void *)arg, sizeof(*hur))) { + OBD_FREE_PTR(hur); + return -EFAULT; + } + + /* Compute the whole struct size */ + totalsize = hur_len(hur); + OBD_FREE_PTR(hur); + if (totalsize < 0) + return -E2BIG; + + /* Final size will be more than double totalsize */ + if (totalsize >= MDS_MAXREQSIZE / 3) + return -E2BIG; + + OBD_ALLOC_LARGE(hur, totalsize); + if (hur == NULL) + return -ENOMEM; + + /* Copy the whole struct */ + if (copy_from_user(hur, (void *)arg, totalsize)) { + OBD_FREE_LARGE(hur, totalsize); + return -EFAULT; + } + + if (hur->hur_request.hr_action == HUA_RELEASE) { + const struct lu_fid *fid; + struct inode *f; + int i; + + for (i = 0; i < hur->hur_request.hr_itemcount; i++) { + fid = &hur->hur_user_item[i].hui_fid; + f = search_inode_for_lustre(inode->i_sb, fid); + if (IS_ERR(f)) { + rc = PTR_ERR(f); + break; + } + + rc = ll_hsm_release(f); + iput(f); + if (rc != 0) + break; + } + } else { + rc = obd_iocontrol(cmd, ll_i2mdexp(inode), totalsize, + hur, NULL); + } + + OBD_FREE_LARGE(hur, totalsize); + + return rc; + } + case LL_IOC_HSM_PROGRESS: { + struct hsm_progress_kernel hpk; + struct hsm_progress hp; + + if (copy_from_user(&hp, (void *)arg, sizeof(hp))) + return -EFAULT; + + hpk.hpk_fid = hp.hp_fid; + hpk.hpk_cookie = hp.hp_cookie; + hpk.hpk_extent = hp.hp_extent; + hpk.hpk_flags = hp.hp_flags; + hpk.hpk_errval = hp.hp_errval; + hpk.hpk_data_version = 0; + + /* File may not exist in Lustre; all progress + * reported to Lustre root */ + rc = obd_iocontrol(cmd, sbi->ll_md_exp, sizeof(hpk), &hpk, + NULL); + return rc; + } + case LL_IOC_HSM_CT_START: + rc = copy_and_ioctl(cmd, sbi->ll_md_exp, (void *)arg, + sizeof(struct lustre_kernelcomm)); + return rc; + + case LL_IOC_HSM_COPY_START: { + struct hsm_copy *copy; + int rc; + + copy = kzalloc(sizeof(*copy), GFP_NOFS); + if (!copy) + return -ENOMEM; + if (copy_from_user(copy, (char *)arg, 
sizeof(*copy))) { + OBD_FREE_PTR(copy); + return -EFAULT; + } + + rc = ll_ioc_copy_start(inode->i_sb, copy); + if (copy_to_user((char *)arg, copy, sizeof(*copy))) + rc = -EFAULT; + + OBD_FREE_PTR(copy); + return rc; + } + case LL_IOC_HSM_COPY_END: { + struct hsm_copy *copy; + int rc; + + copy = kzalloc(sizeof(*copy), GFP_NOFS); + if (!copy) + return -ENOMEM; + if (copy_from_user(copy, (char *)arg, sizeof(*copy))) { + OBD_FREE_PTR(copy); + return -EFAULT; + } + + rc = ll_ioc_copy_end(inode->i_sb, copy); + if (copy_to_user((char *)arg, copy, sizeof(*copy))) + rc = -EFAULT; + + OBD_FREE_PTR(copy); + return rc; + } + default: + return obd_iocontrol(cmd, sbi->ll_dt_exp, 0, NULL, (void *)arg); + } +} + +static loff_t ll_dir_seek(struct file *file, loff_t offset, int origin) +{ + struct inode *inode = file->f_mapping->host; + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct ll_sb_info *sbi = ll_i2sbi(inode); + int api32 = ll_need_32bit_api(sbi); + loff_t ret = -EINVAL; + + mutex_lock(&inode->i_mutex); + switch (origin) { + case SEEK_SET: + break; + case SEEK_CUR: + offset += file->f_pos; + break; + case SEEK_END: + if (offset > 0) + goto out; + if (api32) + offset += LL_DIR_END_OFF_32BIT; + else + offset += LL_DIR_END_OFF; + break; + default: + goto out; + } + + if (offset >= 0 && + ((api32 && offset <= LL_DIR_END_OFF_32BIT) || + (!api32 && offset <= LL_DIR_END_OFF))) { + if (offset != file->f_pos) { + if ((api32 && offset == LL_DIR_END_OFF_32BIT) || + (!api32 && offset == LL_DIR_END_OFF)) + fd->lfd_pos = MDS_DIR_END_OFF; + else if (api32 && sbi->ll_flags & LL_SBI_64BIT_HASH) + fd->lfd_pos = offset << 32; + else + fd->lfd_pos = offset; + file->f_pos = offset; + file->f_version = 0; + } + ret = offset; + } + goto out; + +out: + mutex_unlock(&inode->i_mutex); + return ret; +} + +static int ll_dir_open(struct inode *inode, struct file *file) +{ + return ll_file_open(inode, file); +} + +static int ll_dir_release(struct inode *inode, struct file *file) +{ + return ll_file_release(inode, file); +} + +const struct file_operations ll_dir_operations = { + .llseek = ll_dir_seek, + .open = ll_dir_open, + .release = ll_dir_release, + .read = generic_read_dir, + .iterate = ll_readdir, + .unlocked_ioctl = ll_dir_ioctl, + .fsync = ll_fsync, +}; diff --git a/kernel/drivers/staging/lustre/lustre/llite/file.c b/kernel/drivers/staging/lustre/lustre/llite/file.c new file mode 100644 index 000000000..4b44c634f --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/file.c @@ -0,0 +1,3624 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. 
+ * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/llite/file.c + * + * Author: Peter Braam + * Author: Phil Schwan + * Author: Andreas Dilger + */ + +#define DEBUG_SUBSYSTEM S_LLITE +#include "../include/lustre_dlm.h" +#include "../include/lustre_lite.h" +#include +#include +#include "llite_internal.h" +#include "../include/lustre/ll_fiemap.h" + +#include "../include/cl_object.h" + +static int +ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg); + +static int ll_lease_close(struct obd_client_handle *och, struct inode *inode, + bool *lease_broken); + +static enum llioc_iter +ll_iocontrol_call(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg, int *rcp); + +static struct ll_file_data *ll_file_data_get(void) +{ + struct ll_file_data *fd; + + OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS); + if (fd == NULL) + return NULL; + fd->fd_write_failed = false; + return fd; +} + +static void ll_file_data_put(struct ll_file_data *fd) +{ + if (fd != NULL) + OBD_SLAB_FREE_PTR(fd, ll_file_data_slab); +} + +void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data, + struct lustre_handle *fh) +{ + op_data->op_fid1 = ll_i2info(inode)->lli_fid; + op_data->op_attr.ia_mode = inode->i_mode; + op_data->op_attr.ia_atime = inode->i_atime; + op_data->op_attr.ia_mtime = inode->i_mtime; + op_data->op_attr.ia_ctime = inode->i_ctime; + op_data->op_attr.ia_size = i_size_read(inode); + op_data->op_attr_blocks = inode->i_blocks; + ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags = + ll_inode_to_ext_flags(inode->i_flags); + op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch; + if (fh) + op_data->op_handle = *fh; + op_data->op_capa1 = ll_mdscapa_get(inode); + + if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags) + op_data->op_bias |= MDS_DATA_MODIFIED; +} + +/** + * Closes the IO epoch and packs all the attributes into @op_data for + * the CLOSE rpc. + */ +static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data, + struct obd_client_handle *och) +{ + op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET | + ATTR_MTIME | ATTR_MTIME_SET | + ATTR_CTIME | ATTR_CTIME_SET; + + if (!(och->och_flags & FMODE_WRITE)) + goto out; + + if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode)) + op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS; + else + ll_ioepoch_close(inode, op_data, &och, 0); + +out: + ll_pack_inode2opdata(inode, op_data, &och->och_fh); + ll_prep_md_op_data(op_data, inode, NULL, NULL, + 0, 0, LUSTRE_OPC_ANY, NULL); +} + +static int ll_close_inode_openhandle(struct obd_export *md_exp, + struct inode *inode, + struct obd_client_handle *och, + const __u64 *data_version) +{ + struct obd_export *exp = ll_i2mdexp(inode); + struct md_op_data *op_data; + struct ptlrpc_request *req = NULL; + struct obd_device *obd = class_exp2obd(exp); + int epoch_close = 1; + int rc; + + if (obd == NULL) { + /* + * XXX: in case of LMV, is this correct to access + * ->exp_handle? + */ + CERROR("Invalid MDC connection handle %#llx\n", + ll_i2mdexp(inode)->exp_handle.h_cookie); + rc = 0; + goto out; + } + + op_data = kzalloc(sizeof(*op_data), GFP_NOFS); + if (!op_data) { + /* XXX We leak openhandle and request here. 
*/ + rc = -ENOMEM; + goto out; + } + + ll_prepare_close(inode, op_data, och); + if (data_version != NULL) { + /* Pass in data_version implies release. */ + op_data->op_bias |= MDS_HSM_RELEASE; + op_data->op_data_version = *data_version; + op_data->op_lease_handle = och->och_lease_handle; + op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS; + } + epoch_close = op_data->op_flags & MF_EPOCH_CLOSE; + rc = md_close(md_exp, op_data, och->och_mod, &req); + if (rc == -EAGAIN) { + /* This close must have the epoch closed. */ + LASSERT(epoch_close); + /* MDS has instructed us to obtain Size-on-MDS attribute from + * OSTs and send setattr to back to MDS. */ + rc = ll_som_update(inode, op_data); + if (rc) { + CERROR("inode %lu mdc Size-on-MDS update failed: rc = %d\n", + inode->i_ino, rc); + rc = 0; + } + } else if (rc) { + CERROR("inode %lu mdc close failed: rc = %d\n", + inode->i_ino, rc); + } + + /* DATA_MODIFIED flag was successfully sent on close, cancel data + * modification flag. */ + if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) { + struct ll_inode_info *lli = ll_i2info(inode); + + spin_lock(&lli->lli_lock); + lli->lli_flags &= ~LLIF_DATA_MODIFIED; + spin_unlock(&lli->lli_lock); + } + + if (rc == 0) { + rc = ll_objects_destroy(req, inode); + if (rc) + CERROR("inode %lu ll_objects destroy: rc = %d\n", + inode->i_ino, rc); + } + if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) { + struct mdt_body *body; + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (!(body->valid & OBD_MD_FLRELEASED)) + rc = -EBUSY; + } + + ll_finish_md_op_data(op_data); + +out: + if (exp_connect_som(exp) && !epoch_close && + S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) { + ll_queue_done_writing(inode, LLIF_DONE_WRITING); + } else { + md_clear_open_replay_data(md_exp, och); + /* Free @och if it is not waiting for DONE_WRITING. */ + och->och_fh.cookie = DEAD_HANDLE_MAGIC; + OBD_FREE_PTR(och); + } + if (req) /* This is close request */ + ptlrpc_req_finished(req); + return rc; +} + +int ll_md_real_close(struct inode *inode, fmode_t fmode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct obd_client_handle **och_p; + struct obd_client_handle *och; + __u64 *och_usecount; + int rc = 0; + + if (fmode & FMODE_WRITE) { + och_p = &lli->lli_mds_write_och; + och_usecount = &lli->lli_open_fd_write_count; + } else if (fmode & FMODE_EXEC) { + och_p = &lli->lli_mds_exec_och; + och_usecount = &lli->lli_open_fd_exec_count; + } else { + LASSERT(fmode & FMODE_READ); + och_p = &lli->lli_mds_read_och; + och_usecount = &lli->lli_open_fd_read_count; + } + + mutex_lock(&lli->lli_och_mutex); + if (*och_usecount > 0) { + /* There are still users of this handle, so skip + * freeing it. */ + mutex_unlock(&lli->lli_och_mutex); + return 0; + } + + och = *och_p; + *och_p = NULL; + mutex_unlock(&lli->lli_och_mutex); + + if (och != NULL) { + /* There might be a race and this handle may already + be closed. 
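+		   (The handle was detached from the inode under
+		   lli_och_mutex above, and the close RPC below runs
+		   outside that mutex, so a concurrent open can install
+		   a fresh handle for the same mode in the meantime.)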
*/ + rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, + inode, och, NULL); + } + + return rc; +} + +static int ll_md_close(struct obd_export *md_exp, struct inode *inode, + struct file *file) +{ + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct ll_inode_info *lli = ll_i2info(inode); + int lockmode; + __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK; + struct lustre_handle lockh; + ldlm_policy_data_t policy = {.l_inodebits = {MDS_INODELOCK_OPEN}}; + int rc = 0; + + /* clear group lock, if present */ + if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED)) + ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid); + + if (fd->fd_lease_och != NULL) { + bool lease_broken; + + /* Usually the lease is not released when the + * application crashed, we need to release here. */ + rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken); + CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n", + PFID(&lli->lli_fid), rc, lease_broken); + + fd->fd_lease_och = NULL; + } + + if (fd->fd_och != NULL) { + rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL); + fd->fd_och = NULL; + goto out; + } + + /* Let's see if we have good enough OPEN lock on the file and if + we can skip talking to MDS */ + + mutex_lock(&lli->lli_och_mutex); + if (fd->fd_omode & FMODE_WRITE) { + lockmode = LCK_CW; + LASSERT(lli->lli_open_fd_write_count); + lli->lli_open_fd_write_count--; + } else if (fd->fd_omode & FMODE_EXEC) { + lockmode = LCK_PR; + LASSERT(lli->lli_open_fd_exec_count); + lli->lli_open_fd_exec_count--; + } else { + lockmode = LCK_CR; + LASSERT(lli->lli_open_fd_read_count); + lli->lli_open_fd_read_count--; + } + mutex_unlock(&lli->lli_och_mutex); + + if (!md_lock_match(md_exp, flags, ll_inode2fid(inode), + LDLM_IBITS, &policy, lockmode, &lockh)) + rc = ll_md_real_close(inode, fd->fd_omode); + +out: + LUSTRE_FPRIVATE(file) = NULL; + ll_file_data_put(fd); + ll_capa_close(inode); + + return rc; +} + +/* While this returns an error code, fput() the caller does not, so we need + * to make every effort to clean up all of our state here. Also, applications + * rarely check close errors and even if an error is returned they will not + * re-try the close call. + */ +int ll_file_release(struct inode *inode, struct file *file) +{ + struct ll_file_data *fd; + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ll_inode_info *lli = ll_i2info(inode); + int rc; + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino, + inode->i_generation, inode); + +#ifdef CONFIG_FS_POSIX_ACL + if (sbi->ll_flags & LL_SBI_RMT_CLIENT && is_root_inode(inode)) { + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + + LASSERT(fd != NULL); + if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) { + fd->fd_flags &= ~LL_FILE_RMTACL; + rct_del(&sbi->ll_rct, current_pid()); + et_search_free(&sbi->ll_et, current_pid()); + } + } +#endif + + if (!is_root_inode(inode)) + ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1); + fd = LUSTRE_FPRIVATE(file); + LASSERT(fd != NULL); + + /* The last ref on @file, maybe not the owner pid of statahead. + * Different processes can open the same dir, "ll_opendir_key" means: + * it is me that should stop the statahead thread. 
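+	 * (Only the process that installed lli_opendir_key in
+	 * ll_file_open() tears the statahead thread down; other openers
+	 * of the same directory skip ll_stop_statahead() here.)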
*/ + if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd && + lli->lli_opendir_pid != 0) + ll_stop_statahead(inode, lli->lli_opendir_key); + + if (is_root_inode(inode)) { + LUSTRE_FPRIVATE(file) = NULL; + ll_file_data_put(fd); + return 0; + } + + if (!S_ISDIR(inode->i_mode)) { + lov_read_and_clear_async_rc(lli->lli_clob); + lli->lli_async_rc = 0; + } + + rc = ll_md_close(sbi->ll_md_exp, inode, file); + + if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val)) + libcfs_debug_dumplog(); + + return rc; +} + +static int ll_intent_file_open(struct dentry *dentry, void *lmm, + int lmmsize, struct lookup_intent *itp) +{ + struct inode *inode = d_inode(dentry); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct dentry *parent = dentry->d_parent; + const char *name = dentry->d_name.name; + const int len = dentry->d_name.len; + struct md_op_data *op_data; + struct ptlrpc_request *req; + __u32 opc = LUSTRE_OPC_ANY; + int rc; + + /* Usually we come here only for NFSD, and we want open lock. + But we can also get here with pre 2.6.15 patchless kernels, and in + that case that lock is also ok */ + /* We can also get here if there was cached open handle in revalidate_it + * but it disappeared while we were getting from there to ll_file_open. + * But this means this file was closed and immediately opened which + * makes a good candidate for using OPEN lock */ + /* If lmmsize & lmm are not 0, we are just setting stripe info + * parameters. No need for the open lock */ + if (lmm == NULL && lmmsize == 0) { + itp->it_flags |= MDS_OPEN_LOCK; + if (itp->it_flags & FMODE_WRITE) + opc = LUSTRE_OPC_CREATE; + } + + op_data = ll_prep_md_op_data(NULL, d_inode(parent), + inode, name, len, + O_RDWR, opc, NULL); + if (IS_ERR(op_data)) + return PTR_ERR(op_data); + + itp->it_flags |= MDS_OPEN_BY_FID; + rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp, + 0 /*unused */, &req, ll_md_blocking_ast, 0); + ll_finish_md_op_data(op_data); + if (rc == -ESTALE) { + /* reason for keep own exit path - don`t flood log + * with messages with -ESTALE errors. + */ + if (!it_disposition(itp, DISP_OPEN_OPEN) || + it_open_error(DISP_OPEN_OPEN, itp)) + goto out; + ll_release_openhandle(inode, itp); + goto out; + } + + if (it_disposition(itp, DISP_LOOKUP_NEG)) { + rc = -ENOENT; + goto out; + } + + if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) { + rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp); + CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc); + goto out; + } + + rc = ll_prep_inode(&inode, req, NULL, itp); + if (!rc && itp->d.lustre.it_lock_mode) + ll_set_lock_data(sbi->ll_md_exp, inode, itp, NULL); + +out: + ptlrpc_req_finished(req); + ll_intent_drop_lock(itp); + + return rc; +} + +/** + * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does + * not believe attributes if a few ioepoch holders exist. Attributes for + * previous ioepoch if new one is opened are also skipped by MDS. 
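+ *
+ * The epoch arrives in the MDS open reply; ll_local_open() below passes
+ * it in as:
+ *
+ *	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+ *	ll_ioepoch_open(lli, body->ioepoch);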
+ */ +void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch) +{ + if (ioepoch && lli->lli_ioepoch != ioepoch) { + lli->lli_ioepoch = ioepoch; + CDEBUG(D_INODE, "Epoch %llu opened on "DFID"\n", + ioepoch, PFID(&lli->lli_fid)); + } +} + +static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it, + struct obd_client_handle *och) +{ + struct ptlrpc_request *req = it->d.lustre.it_data; + struct mdt_body *body; + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + och->och_fh = body->handle; + och->och_fid = body->fid1; + och->och_lease_handle.cookie = it->d.lustre.it_lock_handle; + och->och_magic = OBD_CLIENT_HANDLE_MAGIC; + och->och_flags = it->it_flags; + + return md_set_open_replay_data(md_exp, och, it); +} + +static int ll_local_open(struct file *file, struct lookup_intent *it, + struct ll_file_data *fd, struct obd_client_handle *och) +{ + struct inode *inode = file_inode(file); + struct ll_inode_info *lli = ll_i2info(inode); + + LASSERT(!LUSTRE_FPRIVATE(file)); + + LASSERT(fd != NULL); + + if (och) { + struct ptlrpc_request *req = it->d.lustre.it_data; + struct mdt_body *body; + int rc; + + rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och); + if (rc != 0) + return rc; + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + ll_ioepoch_open(lli, body->ioepoch); + } + + LUSTRE_FPRIVATE(file) = fd; + ll_readahead_init(inode, &fd->fd_ras); + fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC); + return 0; +} + +/* Open a file, and (for the very first open) create objects on the OSTs at + * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object + * creation or open until ll_lov_setstripe() ioctl is called. + * + * If we already have the stripe MD locally then we don't request it in + * md_open(), by passing a lmm_size = 0. + * + * It is up to the application to ensure no other processes open this file + * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be + * used. We might be able to avoid races of that sort by getting lli_open_sem + * before returning in the O_LOV_DELAY_CREATE case and dropping it here + * or in ll_file_release(), but I'm not sure that is desirable/necessary. + */ +int ll_file_open(struct inode *inode, struct file *file) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct lookup_intent *it, oit = { .it_op = IT_OPEN, + .it_flags = file->f_flags }; + struct obd_client_handle **och_p = NULL; + __u64 *och_usecount = NULL; + struct ll_file_data *fd; + int rc = 0, opendir_set = 0; + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino, + inode->i_generation, inode, file->f_flags); + + it = file->private_data; /* XXX: compat macro */ + file->private_data = NULL; /* prevent ll_local_open assertion */ + + fd = ll_file_data_get(); + if (fd == NULL) { + rc = -ENOMEM; + goto out_openerr; + } + + fd->fd_file = file; + if (S_ISDIR(inode->i_mode)) { + spin_lock(&lli->lli_sa_lock); + if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL && + lli->lli_opendir_pid == 0) { + lli->lli_opendir_key = fd; + lli->lli_opendir_pid = current_pid(); + opendir_set = 1; + } + spin_unlock(&lli->lli_sa_lock); + } + + if (is_root_inode(inode)) { + LUSTRE_FPRIVATE(file) = fd; + return 0; + } + + if (!it || !it->d.lustre.it_disposition) { + /* Convert f_flags into access mode. 
We cannot use file->f_mode, + * because everything but O_ACCMODE mask was stripped from + * there */ + if ((oit.it_flags + 1) & O_ACCMODE) + oit.it_flags++; + if (file->f_flags & O_TRUNC) + oit.it_flags |= FMODE_WRITE; + + /* kernel only call f_op->open in dentry_open. filp_open calls + * dentry_open after call to open_namei that checks permissions. + * Only nfsd_open call dentry_open directly without checking + * permissions and because of that this code below is safe. */ + if (oit.it_flags & (FMODE_WRITE | FMODE_READ)) + oit.it_flags |= MDS_OPEN_OWNEROVERRIDE; + + /* We do not want O_EXCL here, presumably we opened the file + * already? XXX - NFS implications? */ + oit.it_flags &= ~O_EXCL; + + /* bug20584, if "it_flags" contains O_CREAT, the file will be + * created if necessary, then "IT_CREAT" should be set to keep + * consistent with it */ + if (oit.it_flags & O_CREAT) + oit.it_op |= IT_CREAT; + + it = &oit; + } + +restart: + /* Let's see if we have file open on MDS already. */ + if (it->it_flags & FMODE_WRITE) { + och_p = &lli->lli_mds_write_och; + och_usecount = &lli->lli_open_fd_write_count; + } else if (it->it_flags & FMODE_EXEC) { + och_p = &lli->lli_mds_exec_och; + och_usecount = &lli->lli_open_fd_exec_count; + } else { + och_p = &lli->lli_mds_read_och; + och_usecount = &lli->lli_open_fd_read_count; + } + + mutex_lock(&lli->lli_och_mutex); + if (*och_p) { /* Open handle is present */ + if (it_disposition(it, DISP_OPEN_OPEN)) { + /* Well, there's extra open request that we do not need, + let's close it somehow. This will decref request. */ + rc = it_open_error(DISP_OPEN_OPEN, it); + if (rc) { + mutex_unlock(&lli->lli_och_mutex); + goto out_openerr; + } + + ll_release_openhandle(inode, it); + } + (*och_usecount)++; + + rc = ll_local_open(file, it, fd, NULL); + if (rc) { + (*och_usecount)--; + mutex_unlock(&lli->lli_och_mutex); + goto out_openerr; + } + } else { + LASSERT(*och_usecount == 0); + if (!it->d.lustre.it_disposition) { + /* We cannot just request lock handle now, new ELC code + means that one of other OPEN locks for this file + could be cancelled, and since blocking ast handler + would attempt to grab och_mutex as well, that would + result in a deadlock */ + mutex_unlock(&lli->lli_och_mutex); + it->it_create_mode |= M_CHECK_STALE; + rc = ll_intent_file_open(file->f_path.dentry, NULL, 0, it); + it->it_create_mode &= ~M_CHECK_STALE; + if (rc) + goto out_openerr; + + goto restart; + } + *och_p = kzalloc(sizeof(struct obd_client_handle), GFP_NOFS); + if (!*och_p) { + rc = -ENOMEM; + goto out_och_free; + } + + (*och_usecount)++; + + /* md_intent_lock() didn't get a request ref if there was an + * open error, so don't do cleanup on the request here + * (bug 3430) */ + /* XXX (green): Should not we bail out on any error here, not + * just open error? 
*/ + rc = it_open_error(DISP_OPEN_OPEN, it); + if (rc) + goto out_och_free; + + LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF)); + + rc = ll_local_open(file, it, fd, *och_p); + if (rc) + goto out_och_free; + } + mutex_unlock(&lli->lli_och_mutex); + fd = NULL; + + /* Must do this outside lli_och_mutex lock to prevent deadlock where + different kind of OPEN lock for this same inode gets cancelled + by ldlm_cancel_lru */ + if (!S_ISREG(inode->i_mode)) + goto out_och_free; + + ll_capa_open(inode); + + if (!lli->lli_has_smd && + (cl_is_lov_delay_create(file->f_flags) || + (file->f_mode & FMODE_WRITE) == 0)) { + CDEBUG(D_INODE, "object creation was delayed\n"); + goto out_och_free; + } + cl_lov_delay_create_clear(&file->f_flags); + goto out_och_free; + +out_och_free: + if (rc) { + if (och_p && *och_p) { + OBD_FREE(*och_p, sizeof(struct obd_client_handle)); + *och_p = NULL; /* OBD_FREE writes some magic there */ + (*och_usecount)--; + } + mutex_unlock(&lli->lli_och_mutex); + +out_openerr: + if (opendir_set != 0) + ll_stop_statahead(inode, lli->lli_opendir_key); + if (fd != NULL) + ll_file_data_put(fd); + } else { + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1); + } + + if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) { + ptlrpc_req_finished(it->d.lustre.it_data); + it_clear_disposition(it, DISP_ENQ_OPEN_REF); + } + + return rc; +} + +static int ll_md_blocking_lease_ast(struct ldlm_lock *lock, + struct ldlm_lock_desc *desc, void *data, int flag) +{ + int rc; + struct lustre_handle lockh; + + switch (flag) { + case LDLM_CB_BLOCKING: + ldlm_lock2handle(lock, &lockh); + rc = ldlm_cli_cancel(&lockh, LCF_ASYNC); + if (rc < 0) { + CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc); + return rc; + } + break; + case LDLM_CB_CANCELING: + /* do nothing */ + break; + } + return 0; +} + +/** + * Acquire a lease and open the file. 
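+ *
+ * Typical use (a sketch based on ll_hsm_release() below): take a write
+ * lease without a struct file, do the work, then give the lease back:
+ *
+ *	och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
+ *	if (IS_ERR(och))
+ *		return PTR_ERR(och);
+ *	...
+ *	ll_lease_close(och, inode, &lease_broken);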
+ */ +static struct obd_client_handle * +ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode, + __u64 open_flags) +{ + struct lookup_intent it = { .it_op = IT_OPEN }; + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct md_op_data *op_data; + struct ptlrpc_request *req; + struct lustre_handle old_handle = { 0 }; + struct obd_client_handle *och = NULL; + int rc; + int rc2; + + if (fmode != FMODE_WRITE && fmode != FMODE_READ) + return ERR_PTR(-EINVAL); + + if (file != NULL) { + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct obd_client_handle **och_p; + __u64 *och_usecount; + + if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC)) + return ERR_PTR(-EPERM); + + /* Get the openhandle of the file */ + rc = -EBUSY; + mutex_lock(&lli->lli_och_mutex); + if (fd->fd_lease_och != NULL) { + mutex_unlock(&lli->lli_och_mutex); + return ERR_PTR(rc); + } + + if (fd->fd_och == NULL) { + if (file->f_mode & FMODE_WRITE) { + LASSERT(lli->lli_mds_write_och != NULL); + och_p = &lli->lli_mds_write_och; + och_usecount = &lli->lli_open_fd_write_count; + } else { + LASSERT(lli->lli_mds_read_och != NULL); + och_p = &lli->lli_mds_read_och; + och_usecount = &lli->lli_open_fd_read_count; + } + if (*och_usecount == 1) { + fd->fd_och = *och_p; + *och_p = NULL; + *och_usecount = 0; + rc = 0; + } + } + mutex_unlock(&lli->lli_och_mutex); + if (rc < 0) /* more than 1 opener */ + return ERR_PTR(rc); + + LASSERT(fd->fd_och != NULL); + old_handle = fd->fd_och->och_fh; + } + + och = kzalloc(sizeof(*och), GFP_NOFS); + if (!och) + return ERR_PTR(-ENOMEM); + + op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) { + rc = PTR_ERR(op_data); + goto out; + } + + /* To tell the MDT this openhandle is from the same owner */ + op_data->op_handle = old_handle; + + it.it_flags = fmode | open_flags; + it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE; + rc = md_intent_lock(sbi->ll_md_exp, op_data, NULL, 0, &it, 0, &req, + ll_md_blocking_lease_ast, + /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise + * it can be cancelled which may mislead applications that the lease is + * broken; + * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal + * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast + * doesn't deal with openhandle, so normal openhandle will be leaked. */ + LDLM_FL_NO_LRU | LDLM_FL_EXCL); + ll_finish_md_op_data(op_data); + ptlrpc_req_finished(req); + if (rc < 0) + goto out_release_it; + + if (it_disposition(&it, DISP_LOOKUP_NEG)) { + rc = -ENOENT; + goto out_release_it; + } + + rc = it_open_error(DISP_OPEN_OPEN, &it); + if (rc) + goto out_release_it; + + LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF)); + ll_och_fill(sbi->ll_md_exp, &it, och); + + if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? 
*/ { + rc = -EOPNOTSUPP; + goto out_close; + } + + /* already get lease, handle lease lock */ + ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL); + if (it.d.lustre.it_lock_mode == 0 || + it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) { + /* open lock must return for lease */ + CERROR(DFID "lease granted but no open lock, %d/%llu.\n", + PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode, + it.d.lustre.it_lock_bits); + rc = -EPROTO; + goto out_close; + } + + ll_intent_release(&it); + return och; + +out_close: + rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL); + if (rc2) + CERROR("Close openhandle returned %d\n", rc2); + + /* cancel open lock */ + if (it.d.lustre.it_lock_mode != 0) { + ldlm_lock_decref_and_cancel(&och->och_lease_handle, + it.d.lustre.it_lock_mode); + it.d.lustre.it_lock_mode = 0; + } +out_release_it: + ll_intent_release(&it); +out: + OBD_FREE_PTR(och); + return ERR_PTR(rc); +} + +/** + * Release lease and close the file. + * It will check if the lease has ever broken. + */ +static int ll_lease_close(struct obd_client_handle *och, struct inode *inode, + bool *lease_broken) +{ + struct ldlm_lock *lock; + bool cancelled = true; + int rc; + + lock = ldlm_handle2lock(&och->och_lease_handle); + if (lock != NULL) { + lock_res_and_lock(lock); + cancelled = ldlm_is_cancel(lock); + unlock_res_and_lock(lock); + ldlm_lock_put(lock); + } + + CDEBUG(D_INODE, "lease for "DFID" broken? %d\n", + PFID(&ll_i2info(inode)->lli_fid), cancelled); + + if (!cancelled) + ldlm_cli_cancel(&och->och_lease_handle, 0); + if (lease_broken != NULL) + *lease_broken = cancelled; + + rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och, + NULL); + return rc; +} + +/* Fills the obdo with the attributes for the lsm */ +static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp, + struct obd_capa *capa, struct obdo *obdo, + __u64 ioepoch, int sync) +{ + struct ptlrpc_request_set *set; + struct obd_info oinfo = { { { 0 } } }; + int rc; + + LASSERT(lsm != NULL); + + oinfo.oi_md = lsm; + oinfo.oi_oa = obdo; + oinfo.oi_oa->o_oi = lsm->lsm_oi; + oinfo.oi_oa->o_mode = S_IFREG; + oinfo.oi_oa->o_ioepoch = ioepoch; + oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | + OBD_MD_FLSIZE | OBD_MD_FLBLOCKS | + OBD_MD_FLBLKSZ | OBD_MD_FLATIME | + OBD_MD_FLMTIME | OBD_MD_FLCTIME | + OBD_MD_FLGROUP | OBD_MD_FLEPOCH | + OBD_MD_FLDATAVERSION; + oinfo.oi_capa = capa; + if (sync) { + oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS; + oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK; + } + + set = ptlrpc_prep_set(); + if (set == NULL) { + CERROR("can't allocate ptlrpc set\n"); + rc = -ENOMEM; + } else { + rc = obd_getattr_async(exp, &oinfo, set); + if (rc == 0) + rc = ptlrpc_set_wait(set); + ptlrpc_set_destroy(set); + } + if (rc == 0) + oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ | + OBD_MD_FLATIME | OBD_MD_FLMTIME | + OBD_MD_FLCTIME | OBD_MD_FLSIZE | + OBD_MD_FLDATAVERSION); + return rc; +} + +/** + * Performs the getattr on the inode and updates its fields. + * If @sync != 0, perform the getattr under the server-side lock. + */ +int ll_inode_getattr(struct inode *inode, struct obdo *obdo, + __u64 ioepoch, int sync) +{ + struct obd_capa *capa = ll_mdscapa_get(inode); + struct lov_stripe_md *lsm; + int rc; + + lsm = ccc_inode_lsm_get(inode); + rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode), + capa, obdo, ioepoch, sync); + capa_put(capa); + if (rc == 0) { + struct ost_id *oi = lsm ? 
&lsm->lsm_oi : &obdo->o_oi; + + obdo_refresh_inode(inode, obdo, obdo->o_valid); + CDEBUG(D_INODE, "objid " DOSTID " size %llu, blocks %llu, blksize %lu\n", + POSTID(oi), i_size_read(inode), + (unsigned long long)inode->i_blocks, + 1UL << inode->i_blkbits); + } + ccc_inode_lsm_put(inode, lsm); + return rc; +} + +int ll_merge_lvb(const struct lu_env *env, struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct cl_object *obj = lli->lli_clob; + struct cl_attr *attr = ccc_env_thread_attr(env); + struct ost_lvb lvb; + int rc = 0; + + ll_inode_size_lock(inode); + /* merge timestamps the most recently obtained from mds with + timestamps obtained from osts */ + LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime; + LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime; + LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime; + + lvb.lvb_size = i_size_read(inode); + lvb.lvb_blocks = inode->i_blocks; + lvb.lvb_mtime = LTIME_S(inode->i_mtime); + lvb.lvb_atime = LTIME_S(inode->i_atime); + lvb.lvb_ctime = LTIME_S(inode->i_ctime); + + cl_object_attr_lock(obj); + rc = cl_object_attr_get(env, obj, attr); + cl_object_attr_unlock(obj); + + if (rc == 0) { + if (lvb.lvb_atime < attr->cat_atime) + lvb.lvb_atime = attr->cat_atime; + if (lvb.lvb_ctime < attr->cat_ctime) + lvb.lvb_ctime = attr->cat_ctime; + if (lvb.lvb_mtime < attr->cat_mtime) + lvb.lvb_mtime = attr->cat_mtime; + + CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n", + PFID(&lli->lli_fid), attr->cat_size); + cl_isize_write_nolock(inode, attr->cat_size); + + inode->i_blocks = attr->cat_blocks; + + LTIME_S(inode->i_mtime) = lvb.lvb_mtime; + LTIME_S(inode->i_atime) = lvb.lvb_atime; + LTIME_S(inode->i_ctime) = lvb.lvb_ctime; + } + ll_inode_size_unlock(inode); + + return rc; +} + +int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm, + lstat_t *st) +{ + struct obdo obdo = { 0 }; + int rc; + + rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0); + if (rc == 0) { + st->st_size = obdo.o_size; + st->st_blocks = obdo.o_blocks; + st->st_mtime = obdo.o_mtime; + st->st_atime = obdo.o_atime; + st->st_ctime = obdo.o_ctime; + } + return rc; +} + +static bool file_is_noatime(const struct file *file) +{ + const struct vfsmount *mnt = file->f_path.mnt; + const struct inode *inode = file_inode(file); + + /* Adapted from file_accessed() and touch_atime().*/ + if (file->f_flags & O_NOATIME) + return true; + + if (inode->i_flags & S_NOATIME) + return true; + + if (IS_NOATIME(inode)) + return true; + + if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY)) + return true; + + if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)) + return true; + + if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)) + return true; + + return false; +} + +void ll_io_init(struct cl_io *io, const struct file *file, int write) +{ + struct inode *inode = file_inode(file); + + io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK; + if (write) { + io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND); + io->u.ci_wr.wr_sync = file->f_flags & O_SYNC || + file->f_flags & O_DIRECT || + IS_SYNC(inode); + } + io->ci_obj = ll_i2info(inode)->lli_clob; + io->ci_lockreq = CILR_MAYBE; + if (ll_file_nolock(file)) { + io->ci_lockreq = CILR_NEVER; + io->ci_no_srvlock = 1; + } else if (file->f_flags & O_APPEND) { + io->ci_lockreq = CILR_MANDATORY; + } + + io->ci_noatime = file_is_noatime(file); +} + +static ssize_t +ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args, + struct file *file, enum cl_io_type iot, + loff_t *ppos, size_t 
count) +{ + struct ll_inode_info *lli = ll_i2info(file_inode(file)); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct cl_io *io; + ssize_t result; + +restart: + io = ccc_env_thread_io(env); + ll_io_init(io, file, iot == CIT_WRITE); + + if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) { + struct vvp_io *vio = vvp_env_io(env); + struct ccc_io *cio = ccc_env_io(env); + int write_mutex_locked = 0; + + cio->cui_fd = LUSTRE_FPRIVATE(file); + vio->cui_io_subtype = args->via_io_subtype; + + switch (vio->cui_io_subtype) { + case IO_NORMAL: + cio->cui_iter = args->u.normal.via_iter; + cio->cui_iocb = args->u.normal.via_iocb; + if ((iot == CIT_WRITE) && + !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) { + if (mutex_lock_interruptible(&lli-> + lli_write_mutex)) { + result = -ERESTARTSYS; + goto out; + } + write_mutex_locked = 1; + } else if (iot == CIT_READ) { + down_read(&lli->lli_trunc_sem); + } + break; + case IO_SPLICE: + vio->u.splice.cui_pipe = args->u.splice.via_pipe; + vio->u.splice.cui_flags = args->u.splice.via_flags; + break; + default: + CERROR("Unknown IO type - %u\n", vio->cui_io_subtype); + LBUG(); + } + result = cl_io_loop(env, io); + if (write_mutex_locked) + mutex_unlock(&lli->lli_write_mutex); + else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ) + up_read(&lli->lli_trunc_sem); + } else { + /* cl_io_rw_init() handled IO */ + result = io->ci_result; + } + + if (io->ci_nob > 0) { + result = io->ci_nob; + *ppos = io->u.ci_wr.wr.crw_pos; + } + goto out; +out: + cl_io_fini(env, io); + /* If any bit been read/written (result != 0), we just return + * short read/write instead of restart io. */ + if ((result == 0 || result == -ENODATA) && io->ci_need_restart) { + CDEBUG(D_VFSTRACE, "Restart %s on %pD from %lld, count:%zd\n", + iot == CIT_READ ? "read" : "write", + file, *ppos, count); + LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob); + goto restart; + } + + if (iot == CIT_READ) { + if (result >= 0) + ll_stats_ops_tally(ll_i2sbi(file_inode(file)), + LPROC_LL_READ_BYTES, result); + } else if (iot == CIT_WRITE) { + if (result >= 0) { + ll_stats_ops_tally(ll_i2sbi(file_inode(file)), + LPROC_LL_WRITE_BYTES, result); + fd->fd_write_failed = false; + } else if (result != -ERESTARTSYS) { + fd->fd_write_failed = true; + } + } + + return result; +} + +static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct lu_env *env; + struct vvp_io_args *args; + ssize_t result; + int refcheck; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + return PTR_ERR(env); + + args = vvp_env_args(env, IO_NORMAL); + args->u.normal.via_iter = to; + args->u.normal.via_iocb = iocb; + + result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ, + &iocb->ki_pos, iov_iter_count(to)); + cl_env_put(env, &refcheck); + return result; +} + +/* + * Write to a file (through the page cache). 
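+ * This mirrors ll_file_read_iter() above: both are thin wrappers around
+ * ll_file_io_generic(), differing only in CIT_READ vs. CIT_WRITE and
+ * the direction of the iov_iter.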
+ */ +static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct lu_env *env; + struct vvp_io_args *args; + ssize_t result; + int refcheck; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + return PTR_ERR(env); + + args = vvp_env_args(env, IO_NORMAL); + args->u.normal.via_iter = from; + args->u.normal.via_iocb = iocb; + + result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE, + &iocb->ki_pos, iov_iter_count(from)); + cl_env_put(env, &refcheck); + return result; +} + +/* + * Send file content (through pagecache) somewhere with helper + */ +static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos, + struct pipe_inode_info *pipe, size_t count, + unsigned int flags) +{ + struct lu_env *env; + struct vvp_io_args *args; + ssize_t result; + int refcheck; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + return PTR_ERR(env); + + args = vvp_env_args(env, IO_SPLICE); + args->u.splice.via_pipe = pipe; + args->u.splice.via_flags = flags; + + result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count); + cl_env_put(env, &refcheck); + return result; +} + +static int ll_lov_recreate(struct inode *inode, struct ost_id *oi, u32 ost_idx) +{ + struct obd_export *exp = ll_i2dtexp(inode); + struct obd_trans_info oti = { 0 }; + struct obdo *oa = NULL; + int lsm_size; + int rc = 0; + struct lov_stripe_md *lsm = NULL, *lsm2; + + OBDO_ALLOC(oa); + if (oa == NULL) + return -ENOMEM; + + lsm = ccc_inode_lsm_get(inode); + if (!lsm_has_objects(lsm)) { + rc = -ENOENT; + goto out; + } + + lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) * + (lsm->lsm_stripe_count)); + + OBD_ALLOC_LARGE(lsm2, lsm_size); + if (lsm2 == NULL) { + rc = -ENOMEM; + goto out; + } + + oa->o_oi = *oi; + oa->o_nlink = ost_idx; + oa->o_flags |= OBD_FL_RECREATE_OBJS; + oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP; + obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME | + OBD_MD_FLMTIME | OBD_MD_FLCTIME); + obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid); + memcpy(lsm2, lsm, lsm_size); + ll_inode_size_lock(inode); + rc = obd_create(NULL, exp, oa, &lsm2, &oti); + ll_inode_size_unlock(inode); + + OBD_FREE_LARGE(lsm2, lsm_size); + goto out; +out: + ccc_inode_lsm_put(inode, lsm); + OBDO_FREE(oa); + return rc; +} + +static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg) +{ + struct ll_recreate_obj ucreat; + struct ost_id oi; + + if (!capable(CFS_CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg, + sizeof(ucreat))) + return -EFAULT; + + ostid_set_seq_mdt0(&oi); + ostid_set_id(&oi, ucreat.lrc_id); + return ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx); +} + +static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg) +{ + struct lu_fid fid; + struct ost_id oi; + u32 ost_idx; + + if (!capable(CFS_CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid))) + return -EFAULT; + + fid_to_ostid(&fid, &oi); + ost_idx = (fid_seq(&fid) >> 16) & 0xffff; + return ll_lov_recreate(inode, &oi, ost_idx); +} + +int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry, + int flags, struct lov_user_md *lum, int lum_size) +{ + struct lov_stripe_md *lsm = NULL; + struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags}; + int rc = 0; + + lsm = ccc_inode_lsm_get(inode); + if (lsm != NULL) { + ccc_inode_lsm_put(inode, lsm); + CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n", + inode->i_ino); + rc = -EEXIST; + goto out; + } + + 
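+	/*
+	 * No layout exists yet: hand the user-supplied layout to the MDS
+	 * through an open intent -- ll_intent_file_open() treats a
+	 * non-NULL lmm with lmmsize != 0 as "just setting stripe info"
+	 * and skips the open lock; the update is done under
+	 * ll_inode_size_lock().
+	 */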
ll_inode_size_lock(inode); + rc = ll_intent_file_open(dentry, lum, lum_size, &oit); + if (rc) + goto out_unlock; + rc = oit.d.lustre.it_status; + if (rc < 0) + goto out_req_free; + + ll_release_openhandle(inode, &oit); + +out_unlock: + ll_inode_size_unlock(inode); + ll_intent_release(&oit); + ccc_inode_lsm_put(inode, lsm); +out: + return rc; +out_req_free: + ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data); + goto out; +} + +int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename, + struct lov_mds_md **lmmp, int *lmm_size, + struct ptlrpc_request **request) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct mdt_body *body; + struct lov_mds_md *lmm = NULL; + struct ptlrpc_request *req = NULL; + struct md_op_data *op_data; + int rc, lmmsize; + + rc = ll_get_default_mdsize(sbi, &lmmsize); + if (rc) + return rc; + + op_data = ll_prep_md_op_data(NULL, inode, NULL, filename, + strlen(filename), lmmsize, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + return PTR_ERR(op_data); + + op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA; + rc = md_getattr_name(sbi->ll_md_exp, op_data, &req); + ll_finish_md_op_data(op_data); + if (rc < 0) { + CDEBUG(D_INFO, "md_getattr_name failed on %s: rc %d\n", + filename, rc); + goto out; + } + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + LASSERT(body != NULL); /* checked by mdc_getattr_name */ + + lmmsize = body->eadatasize; + + if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) || + lmmsize == 0) { + rc = -ENODATA; + goto out; + } + + lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize); + LASSERT(lmm != NULL); + + if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) && + (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) { + rc = -EPROTO; + goto out; + } + + /* + * This is coming from the MDS, so is probably in + * little endian. We convert it to host endian before + * passing it to userspace. 
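+ *
+ * (The "LOV_MAGIC != cpu_to_le32(LOV_MAGIC)" check below is the usual
+ * idiom for "this host is big-endian": on a little-endian host the
+ * conversion is a no-op and no swabbing is required.)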
+ */ + if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) { + int stripe_count; + + stripe_count = le16_to_cpu(lmm->lmm_stripe_count); + if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED) + stripe_count = 0; + + /* if function called for directory - we should + * avoid swab not existent lsm objects */ + if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) { + lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm); + if (S_ISREG(body->mode)) + lustre_swab_lov_user_md_objects( + ((struct lov_user_md_v1 *)lmm)->lmm_objects, + stripe_count); + } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) { + lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm); + if (S_ISREG(body->mode)) + lustre_swab_lov_user_md_objects( + ((struct lov_user_md_v3 *)lmm)->lmm_objects, + stripe_count); + } + } + +out: + *lmmp = lmm; + *lmm_size = lmmsize; + *request = req; + return rc; +} + +static int ll_lov_setea(struct inode *inode, struct file *file, + unsigned long arg) +{ + int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE; + struct lov_user_md *lump; + int lum_size = sizeof(struct lov_user_md) + + sizeof(struct lov_user_ost_data); + int rc; + + if (!capable(CFS_CAP_SYS_ADMIN)) + return -EPERM; + + OBD_ALLOC_LARGE(lump, lum_size); + if (lump == NULL) + return -ENOMEM; + + if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) { + OBD_FREE_LARGE(lump, lum_size); + return -EFAULT; + } + + rc = ll_lov_setstripe_ea_info(inode, file->f_path.dentry, flags, lump, + lum_size); + cl_lov_delay_create_clear(&file->f_flags); + + OBD_FREE_LARGE(lump, lum_size); + return rc; +} + +static int ll_lov_setstripe(struct inode *inode, struct file *file, + unsigned long arg) +{ + struct lov_user_md_v3 lumv3; + struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3; + struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg; + struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg; + int lum_size, rc; + int flags = FMODE_WRITE; + + /* first try with v1 which is smaller than v3 */ + lum_size = sizeof(struct lov_user_md_v1); + if (copy_from_user(lumv1, lumv1p, lum_size)) + return -EFAULT; + + if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) { + lum_size = sizeof(struct lov_user_md_v3); + if (copy_from_user(&lumv3, lumv3p, lum_size)) + return -EFAULT; + } + + rc = ll_lov_setstripe_ea_info(inode, file->f_path.dentry, flags, lumv1, + lum_size); + cl_lov_delay_create_clear(&file->f_flags); + if (rc == 0) { + struct lov_stripe_md *lsm; + __u32 gen; + + put_user(0, &lumv1p->lmm_stripe_count); + + ll_layout_refresh(inode, &gen); + lsm = ccc_inode_lsm_get(inode); + rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), + 0, lsm, (void *)arg); + ccc_inode_lsm_put(inode, lsm); + } + return rc; +} + +static int ll_lov_getstripe(struct inode *inode, unsigned long arg) +{ + struct lov_stripe_md *lsm; + int rc = -ENODATA; + + lsm = ccc_inode_lsm_get(inode); + if (lsm != NULL) + rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0, + lsm, (void *)arg); + ccc_inode_lsm_put(inode, lsm); + return rc; +} + +static int +ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct ccc_grouplock grouplock; + int rc; + + if (arg == 0) { + CWARN("group id for group lock must not be 0\n"); + return -EINVAL; + } + + if (ll_file_nolock(file)) + return -EOPNOTSUPP; + + spin_lock(&lli->lli_lock); + if (fd->fd_flags & LL_FILE_GROUP_LOCKED) { + CWARN("group lock already existed with gid %lu\n", + 
fd->fd_grouplock.cg_gid); + spin_unlock(&lli->lli_lock); + return -EINVAL; + } + LASSERT(fd->fd_grouplock.cg_lock == NULL); + spin_unlock(&lli->lli_lock); + + rc = cl_get_grouplock(cl_i2info(inode)->lli_clob, + arg, (file->f_flags & O_NONBLOCK), &grouplock); + if (rc) + return rc; + + spin_lock(&lli->lli_lock); + if (fd->fd_flags & LL_FILE_GROUP_LOCKED) { + spin_unlock(&lli->lli_lock); + CERROR("another thread just won the race\n"); + cl_put_grouplock(&grouplock); + return -EINVAL; + } + + fd->fd_flags |= LL_FILE_GROUP_LOCKED; + fd->fd_grouplock = grouplock; + spin_unlock(&lli->lli_lock); + + CDEBUG(D_INFO, "group lock %lu obtained\n", arg); + return 0; +} + +static int ll_put_grouplock(struct inode *inode, struct file *file, + unsigned long arg) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct ccc_grouplock grouplock; + + spin_lock(&lli->lli_lock); + if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) { + spin_unlock(&lli->lli_lock); + CWARN("no group lock held\n"); + return -EINVAL; + } + LASSERT(fd->fd_grouplock.cg_lock != NULL); + + if (fd->fd_grouplock.cg_gid != arg) { + CWARN("group lock %lu doesn't match current id %lu\n", + arg, fd->fd_grouplock.cg_gid); + spin_unlock(&lli->lli_lock); + return -EINVAL; + } + + grouplock = fd->fd_grouplock; + memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock)); + fd->fd_flags &= ~LL_FILE_GROUP_LOCKED; + spin_unlock(&lli->lli_lock); + + cl_put_grouplock(&grouplock); + CDEBUG(D_INFO, "group lock %lu released\n", arg); + return 0; +} + +/** + * Close inode open handle + * + * \param inode [in] inode in question + * \param it [in,out] intent which contains open info and result + * + * \retval 0 success + * \retval <0 failure + */ +int ll_release_openhandle(struct inode *inode, struct lookup_intent *it) +{ + struct obd_client_handle *och; + int rc; + + LASSERT(inode); + + /* Root ? Do nothing. */ + if (is_root_inode(inode)) + return 0; + + /* No open handle to close? Move away */ + if (!it_disposition(it, DISP_OPEN_OPEN)) + return 0; + + LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0); + + och = kzalloc(sizeof(*och), GFP_NOFS); + if (!och) { + rc = -ENOMEM; + goto out; + } + + ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och); + + rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, + inode, och, NULL); +out: + /* this one is in place of ll_file_open */ + if (it_disposition(it, DISP_ENQ_OPEN_REF)) { + ptlrpc_req_finished(it->d.lustre.it_data); + it_clear_disposition(it, DISP_ENQ_OPEN_REF); + } + return rc; +} + +/** + * Get size for inode for which FIEMAP mapping is requested. + * Make the FIEMAP get_info call and returns the result. + */ +static int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap, + size_t num_bytes) +{ + struct obd_export *exp = ll_i2dtexp(inode); + struct lov_stripe_md *lsm = NULL; + struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, }; + __u32 vallen = num_bytes; + int rc; + + /* Checks for fiemap flags */ + if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) { + fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT; + return -EBADR; + } + + /* Check for FIEMAP_FLAG_SYNC */ + if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) { + rc = filemap_fdatawrite(inode->i_mapping); + if (rc) + return rc; + } + + lsm = ccc_inode_lsm_get(inode); + if (lsm == NULL) + return -ENOENT; + + /* If the stripe_count > 1 and the application does not understand + * DEVICE_ORDER flag, then it cannot interpret the extents correctly. 
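+	 * (FIEMAP_FLAG_DEVICE_ORDER is a Lustre extension; roughly, it lets
+	 * extents be returned grouped per stripe device rather than in a
+	 * single logical-offset order, which is what makes a multi-stripe
+	 * mapping representable.)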
+ */ + if (lsm->lsm_stripe_count > 1 && + !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER)) { + rc = -EOPNOTSUPP; + goto out; + } + + fm_key.oa.o_oi = lsm->lsm_oi; + fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP; + + if (i_size_read(inode) == 0) { + rc = ll_glimpse_size(inode); + if (rc) + goto out; + } + + obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE); + obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid); + /* If filesize is 0, then there would be no objects for mapping */ + if (fm_key.oa.o_size == 0) { + fiemap->fm_mapped_extents = 0; + rc = 0; + goto out; + } + + memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap)); + + rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen, + fiemap, lsm); + if (rc) + CERROR("obd_get_info failed: rc = %d\n", rc); + +out: + ccc_inode_lsm_put(inode, lsm); + return rc; +} + +int ll_fid2path(struct inode *inode, void __user *arg) +{ + struct obd_export *exp = ll_i2mdexp(inode); + const struct getinfo_fid2path __user *gfin = arg; + struct getinfo_fid2path *gfout; + u32 pathlen; + size_t outsize; + int rc; + + if (!capable(CFS_CAP_DAC_READ_SEARCH) && + !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH)) + return -EPERM; + + /* Only need to get the buflen */ + if (get_user(pathlen, &gfin->gf_pathlen)) + return -EFAULT; + + if (pathlen > PATH_MAX) + return -EINVAL; + + outsize = sizeof(*gfout) + pathlen; + + gfout = kzalloc(outsize, GFP_NOFS); + if (!gfout) + return -ENOMEM; + + if (copy_from_user(gfout, arg, sizeof(*gfout))) { + rc = -EFAULT; + goto gf_free; + } + + /* Call mdc_iocontrol */ + rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL); + if (rc != 0) + goto gf_free; + + if (copy_to_user(arg, gfout, outsize)) + rc = -EFAULT; + +gf_free: + OBD_FREE(gfout, outsize); + return rc; +} + +static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg) +{ + struct ll_user_fiemap *fiemap_s; + size_t num_bytes, ret_bytes; + unsigned int extent_count; + int rc = 0; + + /* Get the extent count so we can calculate the size of + * required fiemap buffer */ + if (get_user(extent_count, + &((struct ll_user_fiemap __user *)arg)->fm_extent_count)) + return -EFAULT; + + if (extent_count >= + (SIZE_MAX - sizeof(*fiemap_s)) / sizeof(struct ll_fiemap_extent)) + return -EINVAL; + num_bytes = sizeof(*fiemap_s) + (extent_count * + sizeof(struct ll_fiemap_extent)); + + OBD_ALLOC_LARGE(fiemap_s, num_bytes); + if (fiemap_s == NULL) + return -ENOMEM; + + /* get the fiemap value */ + if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg, + sizeof(*fiemap_s))) { + rc = -EFAULT; + goto error; + } + + /* If fm_extent_count is non-zero, read the first extent since + * it is used to calculate end_offset and device from previous + * fiemap call. */ + if (extent_count) { + if (copy_from_user(&fiemap_s->fm_extents[0], + (char __user *)arg + sizeof(*fiemap_s), + sizeof(struct ll_fiemap_extent))) { + rc = -EFAULT; + goto error; + } + } + + rc = ll_do_fiemap(inode, fiemap_s, num_bytes); + if (rc) + goto error; + + ret_bytes = sizeof(struct ll_user_fiemap); + + if (extent_count != 0) + ret_bytes += (fiemap_s->fm_mapped_extents * + sizeof(struct ll_fiemap_extent)); + + if (copy_to_user((void *)arg, fiemap_s, ret_bytes)) + rc = -EFAULT; + +error: + OBD_FREE_LARGE(fiemap_s, num_bytes); + return rc; +} + +/* + * Read the data_version for inode. + * + * This value is computed using stripe object version on OST. + * Version is computed using server side locking. + * + * @param extent_lock Take extent lock. 
Not needed if a process is already + * holding the OST object group locks. + */ +int ll_data_version(struct inode *inode, __u64 *data_version, + int extent_lock) +{ + struct lov_stripe_md *lsm = NULL; + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct obdo *obdo = NULL; + int rc; + + /* If no stripe, we consider version is 0. */ + lsm = ccc_inode_lsm_get(inode); + if (!lsm_has_objects(lsm)) { + *data_version = 0; + CDEBUG(D_INODE, "No object for inode\n"); + rc = 0; + goto out; + } + + obdo = kzalloc(sizeof(*obdo), GFP_NOFS); + if (!obdo) { + rc = -ENOMEM; + goto out; + } + + rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock); + if (rc == 0) { + if (!(obdo->o_valid & OBD_MD_FLDATAVERSION)) + rc = -EOPNOTSUPP; + else + *data_version = obdo->o_data_version; + } + + OBD_FREE_PTR(obdo); +out: + ccc_inode_lsm_put(inode, lsm); + return rc; +} + +/* + * Trigger a HSM release request for the provided inode. + */ +int ll_hsm_release(struct inode *inode) +{ + struct cl_env_nest nest; + struct lu_env *env; + struct obd_client_handle *och = NULL; + __u64 data_version = 0; + int rc; + + + CDEBUG(D_INODE, "%s: Releasing file "DFID".\n", + ll_get_fsname(inode->i_sb, NULL, 0), + PFID(&ll_i2info(inode)->lli_fid)); + + och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE); + if (IS_ERR(och)) { + rc = PTR_ERR(och); + goto out; + } + + /* Grab latest data_version and [am]time values */ + rc = ll_data_version(inode, &data_version, 1); + if (rc != 0) + goto out; + + env = cl_env_nested_get(&nest); + if (IS_ERR(env)) { + rc = PTR_ERR(env); + goto out; + } + + ll_merge_lvb(env, inode); + cl_env_nested_put(&nest, env); + + /* Release the file. + * NB: lease lock handle is released in mdc_hsm_release_pack() because + * we still need it to pack l_remote_handle to MDT. */ + rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och, + &data_version); + och = NULL; + + +out: + if (och != NULL && !IS_ERR(och)) /* close the file */ + ll_lease_close(och, inode, NULL); + + return rc; +} + +struct ll_swap_stack { + struct iattr ia1, ia2; + __u64 dv1, dv2; + struct inode *inode1, *inode2; + bool check_dv1, check_dv2; +}; + +static int ll_swap_layouts(struct file *file1, struct file *file2, + struct lustre_swap_layouts *lsl) +{ + struct mdc_swap_layouts msl; + struct md_op_data *op_data; + __u32 gid; + __u64 dv; + struct ll_swap_stack *llss = NULL; + int rc; + + llss = kzalloc(sizeof(*llss), GFP_NOFS); + if (!llss) + return -ENOMEM; + + llss->inode1 = file_inode(file1); + llss->inode2 = file_inode(file2); + + if (!S_ISREG(llss->inode2->i_mode)) { + rc = -EINVAL; + goto free; + } + + if (inode_permission(llss->inode1, MAY_WRITE) || + inode_permission(llss->inode2, MAY_WRITE)) { + rc = -EPERM; + goto free; + } + + if (llss->inode2->i_sb != llss->inode1->i_sb) { + rc = -EXDEV; + goto free; + } + + /* we use 2 bool because it is easier to swap than 2 bits */ + if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1) + llss->check_dv1 = true; + + if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2) + llss->check_dv2 = true; + + /* we cannot use lsl->sl_dvX directly because we may swap them */ + llss->dv1 = lsl->sl_dv1; + llss->dv2 = lsl->sl_dv2; + + rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2)); + if (rc == 0) /* same file, done! 
*/ { + rc = 0; + goto free; + } + + if (rc < 0) { /* sequentialize it */ + swap(llss->inode1, llss->inode2); + swap(file1, file2); + swap(llss->dv1, llss->dv2); + swap(llss->check_dv1, llss->check_dv2); + } + + gid = lsl->sl_gid; + if (gid != 0) { /* application asks to flush dirty cache */ + rc = ll_get_grouplock(llss->inode1, file1, gid); + if (rc < 0) + goto free; + + rc = ll_get_grouplock(llss->inode2, file2, gid); + if (rc < 0) { + ll_put_grouplock(llss->inode1, file1, gid); + goto free; + } + } + + /* to be able to restore mtime and atime after swap + * we need to first save them */ + if (lsl->sl_flags & + (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) { + llss->ia1.ia_mtime = llss->inode1->i_mtime; + llss->ia1.ia_atime = llss->inode1->i_atime; + llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME; + llss->ia2.ia_mtime = llss->inode2->i_mtime; + llss->ia2.ia_atime = llss->inode2->i_atime; + llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME; + } + + /* ultimate check, before swapping the layouts we check if + * dataversion has changed (if requested) */ + if (llss->check_dv1) { + rc = ll_data_version(llss->inode1, &dv, 0); + if (rc) + goto putgl; + if (dv != llss->dv1) { + rc = -EAGAIN; + goto putgl; + } + } + + if (llss->check_dv2) { + rc = ll_data_version(llss->inode2, &dv, 0); + if (rc) + goto putgl; + if (dv != llss->dv2) { + rc = -EAGAIN; + goto putgl; + } + } + + /* struct md_op_data is used to send the swap args to the mdt + * only flags is missing, so we use struct mdc_swap_layouts + * through the md_op_data->op_data */ + /* flags from user space have to be converted before they are send to + * server, no flag is sent today, they are only used on the client */ + msl.msl_flags = 0; + rc = -ENOMEM; + op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0, + 0, LUSTRE_OPC_ANY, &msl); + if (IS_ERR(op_data)) { + rc = PTR_ERR(op_data); + goto free; + } + + rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1), + sizeof(*op_data), op_data, NULL); + ll_finish_md_op_data(op_data); + +putgl: + if (gid != 0) { + ll_put_grouplock(llss->inode2, file2, gid); + ll_put_grouplock(llss->inode1, file1, gid); + } + + /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */ + if (rc != 0) + goto free; + + /* clear useless flags */ + if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) { + llss->ia1.ia_valid &= ~ATTR_MTIME; + llss->ia2.ia_valid &= ~ATTR_MTIME; + } + + if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) { + llss->ia1.ia_valid &= ~ATTR_ATIME; + llss->ia2.ia_valid &= ~ATTR_ATIME; + } + + /* update time if requested */ + rc = 0; + if (llss->ia2.ia_valid != 0) { + mutex_lock(&llss->inode1->i_mutex); + rc = ll_setattr(file1->f_path.dentry, &llss->ia2); + mutex_unlock(&llss->inode1->i_mutex); + } + + if (llss->ia1.ia_valid != 0) { + int rc1; + + mutex_lock(&llss->inode2->i_mutex); + rc1 = ll_setattr(file2->f_path.dentry, &llss->ia1); + mutex_unlock(&llss->inode2->i_mutex); + if (rc == 0) + rc = rc1; + } + +free: + if (llss != NULL) + OBD_FREE_PTR(llss); + + return rc; +} + +static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss) +{ + struct md_op_data *op_data; + int rc; + + /* Non-root users are forbidden to set or clear flags which are + * NOT defined in HSM_USER_MASK. 
*/ + if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) && + !capable(CFS_CAP_SYS_ADMIN)) + return -EPERM; + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, hss); + if (IS_ERR(op_data)) + return PTR_ERR(op_data); + + rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode), + sizeof(*op_data), op_data, NULL); + + ll_finish_md_op_data(op_data); + + return rc; +} + +static int ll_hsm_import(struct inode *inode, struct file *file, + struct hsm_user_import *hui) +{ + struct hsm_state_set *hss = NULL; + struct iattr *attr = NULL; + int rc; + + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + /* set HSM flags */ + hss = kzalloc(sizeof(*hss), GFP_NOFS); + if (!hss) { + rc = -ENOMEM; + goto out; + } + + hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID; + hss->hss_archive_id = hui->hui_archive_id; + hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED; + rc = ll_hsm_state_set(inode, hss); + if (rc != 0) + goto out; + + attr = kzalloc(sizeof(*attr), GFP_NOFS); + if (!attr) { + rc = -ENOMEM; + goto out; + } + + attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO); + attr->ia_mode |= S_IFREG; + attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid); + attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid); + attr->ia_size = hui->hui_size; + attr->ia_mtime.tv_sec = hui->hui_mtime; + attr->ia_mtime.tv_nsec = hui->hui_mtime_ns; + attr->ia_atime.tv_sec = hui->hui_atime; + attr->ia_atime.tv_nsec = hui->hui_atime_ns; + + attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE | + ATTR_UID | ATTR_GID | + ATTR_MTIME | ATTR_MTIME_SET | + ATTR_ATIME | ATTR_ATIME_SET; + + mutex_lock(&inode->i_mutex); + + rc = ll_setattr_raw(file->f_path.dentry, attr, true); + if (rc == -ENODATA) + rc = 0; + + mutex_unlock(&inode->i_mutex); + +out: + if (hss != NULL) + OBD_FREE_PTR(hss); + + if (attr != NULL) + OBD_FREE_PTR(attr); + + return rc; +} + +static long +ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = file_inode(file); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + int flags, rc; + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino, + inode->i_generation, inode, cmd); + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1); + + /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */ + if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */ + return -ENOTTY; + + switch (cmd) { + case LL_IOC_GETFLAGS: + /* Get the current value of the file flags */ + return put_user(fd->fd_flags, (int *)arg); + case LL_IOC_SETFLAGS: + case LL_IOC_CLRFLAGS: + /* Set or clear specific file flags */ + /* XXX This probably needs checks to ensure the flags are + * not abused, and to handle any flag side effects. 
+ */ + if (get_user(flags, (int *) arg)) + return -EFAULT; + + if (cmd == LL_IOC_SETFLAGS) { + if ((flags & LL_FILE_IGNORE_LOCK) && + !(file->f_flags & O_DIRECT)) { + CERROR("%s: unable to disable locking on non-O_DIRECT file\n", + current->comm); + return -EINVAL; + } + + fd->fd_flags |= flags; + } else { + fd->fd_flags &= ~flags; + } + return 0; + case LL_IOC_LOV_SETSTRIPE: + return ll_lov_setstripe(inode, file, arg); + case LL_IOC_LOV_SETEA: + return ll_lov_setea(inode, file, arg); + case LL_IOC_LOV_SWAP_LAYOUTS: { + struct file *file2; + struct lustre_swap_layouts lsl; + + if (copy_from_user(&lsl, (char *)arg, + sizeof(struct lustre_swap_layouts))) + return -EFAULT; + + if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */ + return -EPERM; + + file2 = fget(lsl.sl_fd); + if (file2 == NULL) + return -EBADF; + + rc = -EPERM; + if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */ + rc = ll_swap_layouts(file, file2, &lsl); + fput(file2); + return rc; + } + case LL_IOC_LOV_GETSTRIPE: + return ll_lov_getstripe(inode, arg); + case LL_IOC_RECREATE_OBJ: + return ll_lov_recreate_obj(inode, arg); + case LL_IOC_RECREATE_FID: + return ll_lov_recreate_fid(inode, arg); + case FSFILT_IOC_FIEMAP: + return ll_ioctl_fiemap(inode, arg); + case FSFILT_IOC_GETFLAGS: + case FSFILT_IOC_SETFLAGS: + return ll_iocontrol(inode, file, cmd, arg); + case FSFILT_IOC_GETVERSION_OLD: + case FSFILT_IOC_GETVERSION: + return put_user(inode->i_generation, (int *)arg); + case LL_IOC_GROUP_LOCK: + return ll_get_grouplock(inode, file, arg); + case LL_IOC_GROUP_UNLOCK: + return ll_put_grouplock(inode, file, arg); + case IOC_OBD_STATFS: + return ll_obd_statfs(inode, (void *)arg); + + /* We need to special case any other ioctls we want to handle, + * to send them to the MDS/OST as appropriate and to properly + * network encode the arg field. 
+ case FSFILT_IOC_SETVERSION_OLD: + case FSFILT_IOC_SETVERSION: + */ + case LL_IOC_FLUSHCTX: + return ll_flush_ctx(inode); + case LL_IOC_PATH2FID: { + if (copy_to_user((void *)arg, ll_inode2fid(inode), + sizeof(struct lu_fid))) + return -EFAULT; + + return 0; + } + case OBD_IOC_FID2PATH: + return ll_fid2path(inode, (void *)arg); + case LL_IOC_DATA_VERSION: { + struct ioc_data_version idv; + int rc; + + if (copy_from_user(&idv, (char *)arg, sizeof(idv))) + return -EFAULT; + + rc = ll_data_version(inode, &idv.idv_version, + !(idv.idv_flags & LL_DV_NOFLUSH)); + + if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv))) + return -EFAULT; + + return rc; + } + + case LL_IOC_GET_MDTIDX: { + int mdtidx; + + mdtidx = ll_get_mdt_idx(inode); + if (mdtidx < 0) + return mdtidx; + + if (put_user((int)mdtidx, (int *)arg)) + return -EFAULT; + + return 0; + } + case OBD_IOC_GETDTNAME: + case OBD_IOC_GETMDNAME: + return ll_get_obd_name(inode, cmd, arg); + case LL_IOC_HSM_STATE_GET: { + struct md_op_data *op_data; + struct hsm_user_state *hus; + int rc; + + hus = kzalloc(sizeof(*hus), GFP_NOFS); + if (!hus) + return -ENOMEM; + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, hus); + if (IS_ERR(op_data)) { + OBD_FREE_PTR(hus); + return PTR_ERR(op_data); + } + + rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data), + op_data, NULL); + + if (copy_to_user((void *)arg, hus, sizeof(*hus))) + rc = -EFAULT; + + ll_finish_md_op_data(op_data); + OBD_FREE_PTR(hus); + return rc; + } + case LL_IOC_HSM_STATE_SET: { + struct hsm_state_set *hss; + int rc; + + hss = kzalloc(sizeof(*hss), GFP_NOFS); + if (!hss) + return -ENOMEM; + + if (copy_from_user(hss, (char *)arg, sizeof(*hss))) { + OBD_FREE_PTR(hss); + return -EFAULT; + } + + rc = ll_hsm_state_set(inode, hss); + + OBD_FREE_PTR(hss); + return rc; + } + case LL_IOC_HSM_ACTION: { + struct md_op_data *op_data; + struct hsm_current_action *hca; + int rc; + + hca = kzalloc(sizeof(*hca), GFP_NOFS); + if (!hca) + return -ENOMEM; + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, hca); + if (IS_ERR(op_data)) { + OBD_FREE_PTR(hca); + return PTR_ERR(op_data); + } + + rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data), + op_data, NULL); + + if (copy_to_user((char *)arg, hca, sizeof(*hca))) + rc = -EFAULT; + + ll_finish_md_op_data(op_data); + OBD_FREE_PTR(hca); + return rc; + } + case LL_IOC_SET_LEASE: { + struct ll_inode_info *lli = ll_i2info(inode); + struct obd_client_handle *och = NULL; + bool lease_broken; + fmode_t mode = 0; + + switch (arg) { + case F_WRLCK: + if (!(file->f_mode & FMODE_WRITE)) + return -EPERM; + mode = FMODE_WRITE; + break; + case F_RDLCK: + if (!(file->f_mode & FMODE_READ)) + return -EPERM; + mode = FMODE_READ; + break; + case F_UNLCK: + mutex_lock(&lli->lli_och_mutex); + if (fd->fd_lease_och != NULL) { + och = fd->fd_lease_och; + fd->fd_lease_och = NULL; + } + mutex_unlock(&lli->lli_och_mutex); + + if (och != NULL) { + mode = och->och_flags & + (FMODE_READ|FMODE_WRITE); + rc = ll_lease_close(och, inode, &lease_broken); + if (rc == 0 && lease_broken) + mode = 0; + } else { + rc = -ENOLCK; + } + + /* return the type of lease or error */ + return rc < 0 ? 
rc : (int)mode; + default: + return -EINVAL; + } + + CDEBUG(D_INODE, "Set lease with mode %d\n", mode); + + /* apply for lease */ + och = ll_lease_open(inode, file, mode, 0); + if (IS_ERR(och)) + return PTR_ERR(och); + + rc = 0; + mutex_lock(&lli->lli_och_mutex); + if (fd->fd_lease_och == NULL) { + fd->fd_lease_och = och; + och = NULL; + } + mutex_unlock(&lli->lli_och_mutex); + if (och != NULL) { + /* impossible now that only excl is supported for now */ + ll_lease_close(och, inode, &lease_broken); + rc = -EBUSY; + } + return rc; + } + case LL_IOC_GET_LEASE: { + struct ll_inode_info *lli = ll_i2info(inode); + struct ldlm_lock *lock = NULL; + + rc = 0; + mutex_lock(&lli->lli_och_mutex); + if (fd->fd_lease_och != NULL) { + struct obd_client_handle *och = fd->fd_lease_och; + + lock = ldlm_handle2lock(&och->och_lease_handle); + if (lock != NULL) { + lock_res_and_lock(lock); + if (!ldlm_is_cancel(lock)) + rc = och->och_flags & + (FMODE_READ | FMODE_WRITE); + unlock_res_and_lock(lock); + ldlm_lock_put(lock); + } + } + mutex_unlock(&lli->lli_och_mutex); + return rc; + } + case LL_IOC_HSM_IMPORT: { + struct hsm_user_import *hui; + + hui = kzalloc(sizeof(*hui), GFP_NOFS); + if (!hui) + return -ENOMEM; + + if (copy_from_user(hui, (void *)arg, sizeof(*hui))) { + OBD_FREE_PTR(hui); + return -EFAULT; + } + + rc = ll_hsm_import(inode, file, hui); + + OBD_FREE_PTR(hui); + return rc; + } + default: { + int err; + + if (LLIOC_STOP == + ll_iocontrol_call(inode, file, cmd, arg, &err)) + return err; + + return obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL, + (void *)arg); + } + } +} + + +static loff_t ll_file_seek(struct file *file, loff_t offset, int origin) +{ + struct inode *inode = file_inode(file); + loff_t retval, eof = 0; + + retval = offset + ((origin == SEEK_END) ? i_size_read(inode) : + (origin == SEEK_CUR) ? file->f_pos : 0); + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n", + inode->i_ino, inode->i_generation, inode, retval, retval, + origin); + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1); + + if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) { + retval = ll_glimpse_size(inode); + if (retval != 0) + return retval; + eof = i_size_read(inode); + } + + retval = generic_file_llseek_size(file, offset, origin, + ll_file_maxbytes(inode), eof); + return retval; +} + +static int ll_flush(struct file *file, fl_owner_t id) +{ + struct inode *inode = file_inode(file); + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + int rc, err; + + LASSERT(!S_ISDIR(inode->i_mode)); + + /* catch async errors that were recorded back when async writeback + * failed for pages in this mapping. */ + rc = lli->lli_async_rc; + lli->lli_async_rc = 0; + err = lov_read_and_clear_async_rc(lli->lli_clob); + if (rc == 0) + rc = err; + + /* The application has been told write failure already. + * Do not report failure again. */ + if (fd->fd_write_failed) + return 0; + return rc ? -EIO : 0; +} + +/** + * Called to make sure a portion of file has been written out. + * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST. + * + * Return how many pages have been written. 
+ */ +int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end, + enum cl_fsync_mode mode, int ignore_layout) +{ + struct cl_env_nest nest; + struct lu_env *env; + struct cl_io *io; + struct obd_capa *capa = NULL; + struct cl_fsync_io *fio; + int result; + + if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL && + mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL) + return -EINVAL; + + env = cl_env_nested_get(&nest); + if (IS_ERR(env)) + return PTR_ERR(env); + + capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE); + + io = ccc_env_thread_io(env); + io->ci_obj = cl_i2info(inode)->lli_clob; + io->ci_ignore_layout = ignore_layout; + + /* initialize parameters for sync */ + fio = &io->u.ci_fsync; + fio->fi_capa = capa; + fio->fi_start = start; + fio->fi_end = end; + fio->fi_fid = ll_inode2fid(inode); + fio->fi_mode = mode; + fio->fi_nr_written = 0; + + if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0) + result = cl_io_loop(env, io); + else + result = io->ci_result; + if (result == 0) + result = fio->fi_nr_written; + cl_io_fini(env, io); + cl_env_nested_put(&nest, env); + + capa_put(capa); + + return result; +} + +int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync) +{ + struct inode *inode = file_inode(file); + struct ll_inode_info *lli = ll_i2info(inode); + struct ptlrpc_request *req; + struct obd_capa *oc; + int rc, err; + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino, + inode->i_generation, inode); + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1); + + rc = filemap_write_and_wait_range(inode->i_mapping, start, end); + mutex_lock(&inode->i_mutex); + + /* catch async errors that were recorded back when async writeback + * failed for pages in this mapping. */ + if (!S_ISDIR(inode->i_mode)) { + err = lli->lli_async_rc; + lli->lli_async_rc = 0; + if (rc == 0) + rc = err; + err = lov_read_and_clear_async_rc(lli->lli_clob); + if (rc == 0) + rc = err; + } + + oc = ll_mdscapa_get(inode); + err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc, + &req); + capa_put(oc); + if (!rc) + rc = err; + if (!err) + ptlrpc_req_finished(req); + + if (S_ISREG(inode->i_mode)) { + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + + err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0); + if (rc == 0 && err < 0) + rc = err; + if (rc < 0) + fd->fd_write_failed = true; + else + fd->fd_write_failed = false; + } + + mutex_unlock(&inode->i_mutex); + return rc; +} + +static int +ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock) +{ + struct inode *inode = file_inode(file); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ldlm_enqueue_info einfo = { + .ei_type = LDLM_FLOCK, + .ei_cb_cp = ldlm_flock_completion_ast, + .ei_cbdata = file_lock, + }; + struct md_op_data *op_data; + struct lustre_handle lockh = {0}; + ldlm_policy_data_t flock = {{0}}; + __u64 flags = 0; + int rc; + int rc2 = 0; + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n", + inode->i_ino, file_lock); + + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1); + + if (file_lock->fl_flags & FL_FLOCK) + LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK)); + else if (!(file_lock->fl_flags & FL_POSIX)) + return -EINVAL; + + flock.l_flock.owner = (unsigned long)file_lock->fl_owner; + flock.l_flock.pid = file_lock->fl_pid; + flock.l_flock.start = file_lock->fl_start; + flock.l_flock.end = file_lock->fl_end; + + /* Somewhat ugly workaround for svc lockd. 
+ * lockd installs custom fl_lmops->lm_compare_owner that checks + * for the fl_owner to be the same (which it always is on local node + * I guess between lockd processes) and then compares pid. + * As such we assign pid to the owner field to make it all work, + * conflict with normal locks is unlikely since pid space and + * pointer space for current->files are not intersecting */ + if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner) + flock.l_flock.owner = (unsigned long)file_lock->fl_pid; + + switch (file_lock->fl_type) { + case F_RDLCK: + einfo.ei_mode = LCK_PR; + break; + case F_UNLCK: + /* An unlock request may or may not have any relation to + * existing locks so we may not be able to pass a lock handle + * via a normal ldlm_lock_cancel() request. The request may even + * unlock a byte range in the middle of an existing lock. In + * order to process an unlock request we need all of the same + * information that is given with a normal read or write record + * lock request. To avoid creating another ldlm unlock (cancel) + * message we'll treat a LCK_NL flock request as an unlock. */ + einfo.ei_mode = LCK_NL; + break; + case F_WRLCK: + einfo.ei_mode = LCK_PW; + break; + default: + CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", + file_lock->fl_type); + return -ENOTSUPP; + } + + switch (cmd) { + case F_SETLKW: +#ifdef F_SETLKW64 + case F_SETLKW64: +#endif + flags = 0; + break; + case F_SETLK: +#ifdef F_SETLK64 + case F_SETLK64: +#endif + flags = LDLM_FL_BLOCK_NOWAIT; + break; + case F_GETLK: +#ifdef F_GETLK64 + case F_GETLK64: +#endif + flags = LDLM_FL_TEST_LOCK; + /* Save the old mode so that if the mode in the lock changes we + * can decrement the appropriate reader or writer refcount. */ + file_lock->fl_type = einfo.ei_mode; + break; + default: + CERROR("unknown fcntl lock command: %d\n", cmd); + return -EINVAL; + } + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + return PTR_ERR(op_data); + + CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#llx, mode=%u, start=%llu, end=%llu\n", + inode->i_ino, flock.l_flock.pid, flags, einfo.ei_mode, + flock.l_flock.start, flock.l_flock.end); + + rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL, + op_data, &lockh, &flock, 0, NULL /* req */, flags); + + if ((file_lock->fl_flags & FL_FLOCK) && + (rc == 0 || file_lock->fl_type == F_UNLCK)) + rc2 = flock_lock_file_wait(file, file_lock); + if ((file_lock->fl_flags & FL_POSIX) && + (rc == 0 || file_lock->fl_type == F_UNLCK) && + !(flags & LDLM_FL_TEST_LOCK)) + rc2 = posix_lock_file_wait(file, file_lock); + + if (rc2 && file_lock->fl_type != F_UNLCK) { + einfo.ei_mode = LCK_NL; + md_enqueue(sbi->ll_md_exp, &einfo, NULL, + op_data, &lockh, &flock, 0, NULL /* req */, flags); + rc = rc2; + } + + ll_finish_md_op_data(op_data); + + return rc; +} + +static int +ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock) +{ + return -ENOSYS; +} + +/** + * test if some locks matching bits and l_req_mode are acquired + * - bits can be in different locks + * - if found clear the common lock bits in *bits + * - the bits not found, are kept in *bits + * \param inode [IN] + * \param bits [IN] searched lock bits [IN] + * \param l_req_mode [IN] searched lock mode + * \retval boolean, true iff all bits are found + */ +int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode) +{ + struct lustre_handle lockh; + ldlm_policy_data_t policy; + ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ? 
+ (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode; + struct lu_fid *fid; + __u64 flags; + int i; + + if (!inode) + return 0; + + fid = &ll_i2info(inode)->lli_fid; + CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid), + ldlm_lockname[mode]); + + flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK; + for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) { + policy.l_inodebits.bits = *bits & (1 << i); + if (policy.l_inodebits.bits == 0) + continue; + + if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, + &policy, mode, &lockh)) { + struct ldlm_lock *lock; + + lock = ldlm_handle2lock(&lockh); + if (lock) { + *bits &= + ~(lock->l_policy_data.l_inodebits.bits); + LDLM_LOCK_PUT(lock); + } else { + *bits &= ~policy.l_inodebits.bits; + } + } + } + return *bits == 0; +} + +ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits, + struct lustre_handle *lockh, __u64 flags, + ldlm_mode_t mode) +{ + ldlm_policy_data_t policy = { .l_inodebits = {bits} }; + struct lu_fid *fid; + ldlm_mode_t rc; + + fid = &ll_i2info(inode)->lli_fid; + CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid)); + + rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags, + fid, LDLM_IBITS, &policy, mode, lockh); + + return rc; +} + +static int ll_inode_revalidate_fini(struct inode *inode, int rc) +{ + /* Already unlinked. Just update nlink and return success */ + if (rc == -ENOENT) { + clear_nlink(inode); + /* This path cannot be hit for regular files unless in + * case of obscure races, so no need to validate size. + */ + if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) + return 0; + } else if (rc != 0) { + CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR, + "%s: revalidate FID "DFID" error: rc = %d\n", + ll_get_fsname(inode->i_sb, NULL, 0), + PFID(ll_inode2fid(inode)), rc); + } + + return rc; +} + +static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits) +{ + struct inode *inode = d_inode(dentry); + struct ptlrpc_request *req = NULL; + struct obd_export *exp; + int rc = 0; + + LASSERT(inode != NULL); + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%pd\n", + inode->i_ino, inode->i_generation, inode, dentry); + + exp = ll_i2mdexp(inode); + + /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC. + * But under CMD case, it caused some lock issues, should be fixed + * with new CMD ibits lock. See bug 12718 */ + if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) { + struct lookup_intent oit = { .it_op = IT_GETATTR }; + struct md_op_data *op_data; + + if (ibits == MDS_INODELOCK_LOOKUP) + oit.it_op = IT_LOOKUP; + + /* Call getattr by fid, so do not provide name at all. */ + op_data = ll_prep_md_op_data(NULL, inode, + inode, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + return PTR_ERR(op_data); + + oit.it_create_mode |= M_CHECK_STALE; + rc = md_intent_lock(exp, op_data, NULL, 0, + /* we are not interested in name + based lookup */ + &oit, 0, &req, + ll_md_blocking_ast, 0); + ll_finish_md_op_data(op_data); + oit.it_create_mode &= ~M_CHECK_STALE; + if (rc < 0) { + rc = ll_inode_revalidate_fini(inode, rc); + goto out; + } + + rc = ll_revalidate_it_finish(req, &oit, inode); + if (rc != 0) { + ll_intent_release(&oit); + goto out; + } + + /* Unlinked? Unhash dentry, so it is not picked up later by + do_lookup() -> ll_revalidate_it(). We cannot use d_drop + here to preserve get_cwd functionality on 2.6. 
+ Bug 10503 */ + if (!d_inode(dentry)->i_nlink) + d_lustre_invalidate(dentry, 0); + + ll_lookup_finish_locks(&oit, inode); + } else if (!ll_have_md_lock(d_inode(dentry), &ibits, LCK_MINMODE)) { + struct ll_sb_info *sbi = ll_i2sbi(d_inode(dentry)); + u64 valid = OBD_MD_FLGETATTR; + struct md_op_data *op_data; + int ealen = 0; + + if (S_ISREG(inode->i_mode)) { + rc = ll_get_default_mdsize(sbi, &ealen); + if (rc) + return rc; + valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE; + } + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, + 0, ealen, LUSTRE_OPC_ANY, + NULL); + if (IS_ERR(op_data)) + return PTR_ERR(op_data); + + op_data->op_valid = valid; + /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one + * capa for this inode. Because we only keep capas of dirs + * fresh. */ + rc = md_getattr(sbi->ll_md_exp, op_data, &req); + ll_finish_md_op_data(op_data); + if (rc) { + rc = ll_inode_revalidate_fini(inode, rc); + return rc; + } + + rc = ll_prep_inode(&inode, req, NULL, NULL); + } +out: + ptlrpc_req_finished(req); + return rc; +} + +static int ll_inode_revalidate(struct dentry *dentry, __u64 ibits) +{ + struct inode *inode = d_inode(dentry); + int rc; + + rc = __ll_inode_revalidate(dentry, ibits); + if (rc != 0) + return rc; + + /* if object isn't regular file, don't validate size */ + if (!S_ISREG(inode->i_mode)) { + LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime; + LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime; + LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime; + } else { + /* In case of restore, the MDT has the right size and has + * already send it back without granting the layout lock, + * inode is up-to-date so glimpse is useless. + * Also to glimpse we need the layout, in case of a running + * restore the MDT holds the layout lock so the glimpse will + * block up to the end of restore (getattr will block) + */ + if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING)) + rc = ll_glimpse_size(inode); + } + return rc; +} + +int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat) +{ + struct inode *inode = d_inode(de); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ll_inode_info *lli = ll_i2info(inode); + int res = 0; + + res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE | + MDS_INODELOCK_LOOKUP); + ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1); + + if (res) + return res; + + stat->dev = inode->i_sb->s_dev; + if (ll_need_32bit_api(sbi)) + stat->ino = cl_fid_build_ino(&lli->lli_fid, 1); + else + stat->ino = inode->i_ino; + stat->mode = inode->i_mode; + stat->nlink = inode->i_nlink; + stat->uid = inode->i_uid; + stat->gid = inode->i_gid; + stat->rdev = inode->i_rdev; + stat->atime = inode->i_atime; + stat->mtime = inode->i_mtime; + stat->ctime = inode->i_ctime; + stat->blksize = 1 << inode->i_blkbits; + + stat->size = i_size_read(inode); + stat->blocks = inode->i_blocks; + + return 0; +} + +static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len) +{ + int rc; + size_t num_bytes; + struct ll_user_fiemap *fiemap; + unsigned int extent_count = fieinfo->fi_extents_max; + + num_bytes = sizeof(*fiemap) + (extent_count * + sizeof(struct ll_fiemap_extent)); + OBD_ALLOC_LARGE(fiemap, num_bytes); + + if (fiemap == NULL) + return -ENOMEM; + + fiemap->fm_flags = fieinfo->fi_flags; + fiemap->fm_extent_count = fieinfo->fi_extents_max; + fiemap->fm_start = start; + fiemap->fm_length = len; + if (extent_count > 0) + memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start, + 
sizeof(struct ll_fiemap_extent)); + + rc = ll_do_fiemap(inode, fiemap, num_bytes); + + fieinfo->fi_flags = fiemap->fm_flags; + fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents; + if (extent_count > 0) + memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0], + fiemap->fm_mapped_extents * + sizeof(struct ll_fiemap_extent)); + + OBD_FREE_LARGE(fiemap, num_bytes); + return rc; +} + +struct posix_acl *ll_get_acl(struct inode *inode, int type) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct posix_acl *acl = NULL; + + spin_lock(&lli->lli_lock); + /* VFS' acl_permission_check->check_acl will release the refcount */ + acl = posix_acl_dup(lli->lli_posix_acl); + spin_unlock(&lli->lli_lock); + + return acl; +} + + +int ll_inode_permission(struct inode *inode, int mask) +{ + int rc = 0; + +#ifdef MAY_NOT_BLOCK + if (mask & MAY_NOT_BLOCK) + return -ECHILD; +#endif + + /* as root inode are NOT getting validated in lookup operation, + * need to do it before permission check. */ + + if (is_root_inode(inode)) { + rc = __ll_inode_revalidate(inode->i_sb->s_root, + MDS_INODELOCK_LOOKUP); + if (rc) + return rc; + } + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n", + inode->i_ino, inode->i_generation, inode, inode->i_mode, mask); + + if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT) + return lustre_check_remote_perm(inode, mask); + + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1); + rc = generic_permission(inode, mask); + + return rc; +} + +/* -o localflock - only provides locally consistent flock locks */ +struct file_operations ll_file_operations = { + .read_iter = ll_file_read_iter, + .write_iter = ll_file_write_iter, + .unlocked_ioctl = ll_file_ioctl, + .open = ll_file_open, + .release = ll_file_release, + .mmap = ll_file_mmap, + .llseek = ll_file_seek, + .splice_read = ll_file_splice_read, + .fsync = ll_fsync, + .flush = ll_flush +}; + +struct file_operations ll_file_operations_flock = { + .read_iter = ll_file_read_iter, + .write_iter = ll_file_write_iter, + .unlocked_ioctl = ll_file_ioctl, + .open = ll_file_open, + .release = ll_file_release, + .mmap = ll_file_mmap, + .llseek = ll_file_seek, + .splice_read = ll_file_splice_read, + .fsync = ll_fsync, + .flush = ll_flush, + .flock = ll_file_flock, + .lock = ll_file_flock +}; + +/* These are for -o noflock - to return ENOSYS on flock calls */ +struct file_operations ll_file_operations_noflock = { + .read_iter = ll_file_read_iter, + .write_iter = ll_file_write_iter, + .unlocked_ioctl = ll_file_ioctl, + .open = ll_file_open, + .release = ll_file_release, + .mmap = ll_file_mmap, + .llseek = ll_file_seek, + .splice_read = ll_file_splice_read, + .fsync = ll_fsync, + .flush = ll_flush, + .flock = ll_file_noflock, + .lock = ll_file_noflock +}; + +struct inode_operations ll_file_inode_operations = { + .setattr = ll_setattr, + .getattr = ll_getattr, + .permission = ll_inode_permission, + .setxattr = ll_setxattr, + .getxattr = ll_getxattr, + .listxattr = ll_listxattr, + .removexattr = ll_removexattr, + .fiemap = ll_fiemap, + .get_acl = ll_get_acl, +}; + +/* dynamic ioctl number support routines */ +static struct llioc_ctl_data { + struct rw_semaphore ioc_sem; + struct list_head ioc_head; +} llioc = { + __RWSEM_INITIALIZER(llioc.ioc_sem), + LIST_HEAD_INIT(llioc.ioc_head) +}; + + +struct llioc_data { + struct list_head iocd_list; + unsigned int iocd_size; + llioc_callback_t iocd_cb; + unsigned int iocd_count; + unsigned int iocd_cmd[0]; +}; + +void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned 
int *cmd) +{ + unsigned int size; + struct llioc_data *in_data = NULL; + + if (cb == NULL || cmd == NULL || + count > LLIOC_MAX_CMD || count < 0) + return NULL; + + size = sizeof(*in_data) + count * sizeof(unsigned int); + in_data = kzalloc(size, GFP_NOFS); + if (!in_data) + return NULL; + + memset(in_data, 0, sizeof(*in_data)); + in_data->iocd_size = size; + in_data->iocd_cb = cb; + in_data->iocd_count = count; + memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count); + + down_write(&llioc.ioc_sem); + list_add_tail(&in_data->iocd_list, &llioc.ioc_head); + up_write(&llioc.ioc_sem); + + return in_data; +} +EXPORT_SYMBOL(ll_iocontrol_register); + +void ll_iocontrol_unregister(void *magic) +{ + struct llioc_data *tmp; + + if (magic == NULL) + return; + + down_write(&llioc.ioc_sem); + list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) { + if (tmp == magic) { + unsigned int size = tmp->iocd_size; + + list_del(&tmp->iocd_list); + up_write(&llioc.ioc_sem); + + OBD_FREE(tmp, size); + return; + } + } + up_write(&llioc.ioc_sem); + + CWARN("didn't find iocontrol register block with magic: %p\n", magic); +} +EXPORT_SYMBOL(ll_iocontrol_unregister); + +static enum llioc_iter +ll_iocontrol_call(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg, int *rcp) +{ + enum llioc_iter ret = LLIOC_CONT; + struct llioc_data *data; + int rc = -EINVAL, i; + + down_read(&llioc.ioc_sem); + list_for_each_entry(data, &llioc.ioc_head, iocd_list) { + for (i = 0; i < data->iocd_count; i++) { + if (cmd != data->iocd_cmd[i]) + continue; + + ret = data->iocd_cb(inode, file, cmd, arg, data, &rc); + break; + } + + if (ret == LLIOC_STOP) + break; + } + up_read(&llioc.ioc_sem); + + if (rcp) + *rcp = rc; + return ret; +} + +int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct cl_env_nest nest; + struct lu_env *env; + int result; + + if (lli->lli_clob == NULL) + return 0; + + env = cl_env_nested_get(&nest); + if (IS_ERR(env)) + return PTR_ERR(env); + + result = cl_conf_set(env, lli->lli_clob, conf); + cl_env_nested_put(&nest, env); + + if (conf->coc_opc == OBJECT_CONF_SET) { + struct ldlm_lock *lock = conf->coc_lock; + + LASSERT(lock != NULL); + LASSERT(ldlm_has_layout(lock)); + if (result == 0) { + /* it can only be allowed to match after layout is + * applied to inode otherwise false layout would be + * seen. Applying layout should happen before dropping + * the intent lock. */ + ldlm_lock_allow_match(lock); + } + } + return result; +} + +/* Fetch layout from MDT with getxattr request, if it's not ready yet */ +static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock) + +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct obd_capa *oc; + struct ptlrpc_request *req; + struct mdt_body *body; + void *lvbdata; + void *lmm; + int lmmsize; + int rc; + + CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n", + PFID(ll_inode2fid(inode)), !!(lock->l_flags & LDLM_FL_LVB_READY), + lock->l_lvb_data, lock->l_lvb_len); + + if ((lock->l_lvb_data != NULL) && (lock->l_flags & LDLM_FL_LVB_READY)) + return 0; + + /* if layout lock was granted right away, the layout is returned + * within DLM_LVB of dlm reply; otherwise if the lock was ever + * blocked and then granted via completion ast, we have to fetch + * layout here. 
Please note that we can't use the LVB buffer in + * completion AST because it doesn't have a large enough buffer */ + oc = ll_mdscapa_get(inode); + rc = ll_get_default_mdsize(sbi, &lmmsize); + if (rc == 0) + rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc, + OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0, + lmmsize, 0, &req); + capa_put(oc); + if (rc < 0) + return rc; + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (body == NULL) { + rc = -EPROTO; + goto out; + } + + lmmsize = body->eadatasize; + if (lmmsize == 0) /* empty layout */ { + rc = 0; + goto out; + } + + lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize); + if (lmm == NULL) { + rc = -EFAULT; + goto out; + } + + OBD_ALLOC_LARGE(lvbdata, lmmsize); + if (lvbdata == NULL) { + rc = -ENOMEM; + goto out; + } + + memcpy(lvbdata, lmm, lmmsize); + lock_res_and_lock(lock); + if (lock->l_lvb_data != NULL) + OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len); + + lock->l_lvb_data = lvbdata; + lock->l_lvb_len = lmmsize; + unlock_res_and_lock(lock); + +out: + ptlrpc_req_finished(req); + return rc; +} + +/** + * Apply the layout to the inode. Layout lock is held and will be released + * in this function. + */ +static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode, + struct inode *inode, __u32 *gen, bool reconf) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ldlm_lock *lock; + struct lustre_md md = { NULL }; + struct cl_object_conf conf; + int rc = 0; + bool lvb_ready; + bool wait_layout = false; + + LASSERT(lustre_handle_is_used(lockh)); + + lock = ldlm_handle2lock(lockh); + LASSERT(lock != NULL); + LASSERT(ldlm_has_layout(lock)); + + LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n", + inode, PFID(&lli->lli_fid), reconf); + + /* in case this is a caching lock and reinstate with new inode */ + md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL); + + lock_res_and_lock(lock); + lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY); + unlock_res_and_lock(lock); + /* checking lvb_ready is racy but this is okay. The worst case is + * that multi processes may configure the file on the same time. */ + if (lvb_ready || !reconf) { + rc = -ENODATA; + if (lvb_ready) { + /* layout_gen must be valid if layout lock is not + * cancelled and stripe has already set */ + *gen = ll_layout_version_get(lli); + rc = 0; + } + goto out; + } + + rc = ll_layout_fetch(inode, lock); + if (rc < 0) + goto out; + + /* for layout lock, lmm is returned in lock's lvb. + * lvb_data is immutable if the lock is held so it's safe to access it + * without res lock. See the description in ldlm_lock_decref_internal() + * for the condition to free lvb_data of layout lock */ + if (lock->l_lvb_data != NULL) { + rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm, + lock->l_lvb_data, lock->l_lvb_len); + if (rc >= 0) { + *gen = LL_LAYOUT_GEN_EMPTY; + if (md.lsm != NULL) + *gen = md.lsm->lsm_layout_gen; + rc = 0; + } else { + CERROR("%s: file "DFID" unpackmd error: %d\n", + ll_get_fsname(inode->i_sb, NULL, 0), + PFID(&lli->lli_fid), rc); + } + } + if (rc < 0) + goto out; + + /* set layout to file. 
Unlikely this will fail as old layout was + * surely eliminated */ + memset(&conf, 0, sizeof(conf)); + conf.coc_opc = OBJECT_CONF_SET; + conf.coc_inode = inode; + conf.coc_lock = lock; + conf.u.coc_md = &md; + rc = ll_layout_conf(inode, &conf); + + if (md.lsm != NULL) + obd_free_memmd(sbi->ll_dt_exp, &md.lsm); + + /* refresh layout failed, need to wait */ + wait_layout = rc == -EBUSY; + +out: + LDLM_LOCK_PUT(lock); + ldlm_lock_decref(lockh, mode); + + /* wait for IO to complete if it's still being used. */ + if (wait_layout) { + CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n", + ll_get_fsname(inode->i_sb, NULL, 0), + inode, PFID(&lli->lli_fid)); + + memset(&conf, 0, sizeof(conf)); + conf.coc_opc = OBJECT_CONF_WAIT; + conf.coc_inode = inode; + rc = ll_layout_conf(inode, &conf); + if (rc == 0) + rc = -EAGAIN; + + CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n", + PFID(&lli->lli_fid), rc); + } + return rc; +} + +/** + * This function checks if there exists a LAYOUT lock on the client side, + * or enqueues it if it doesn't have one in cache. + * + * This function will not hold layout lock so it may be revoked any time after + * this function returns. Any operations depend on layout should be redone + * in that case. + * + * This function should be called before lov_io_init() to get an uptodate + * layout version, the caller should save the version number and after IO + * is finished, this function should be called again to verify that layout + * is not changed during IO time. + */ +int ll_layout_refresh(struct inode *inode, __u32 *gen) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct md_op_data *op_data; + struct lookup_intent it; + struct lustre_handle lockh; + ldlm_mode_t mode; + struct ldlm_enqueue_info einfo = { + .ei_type = LDLM_IBITS, + .ei_mode = LCK_CR, + .ei_cb_bl = ll_md_blocking_ast, + .ei_cb_cp = ldlm_completion_ast, + }; + int rc; + + *gen = ll_layout_version_get(lli); + if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != LL_LAYOUT_GEN_NONE) + return 0; + + /* sanity checks */ + LASSERT(fid_is_sane(ll_inode2fid(inode))); + LASSERT(S_ISREG(inode->i_mode)); + + /* take layout lock mutex to enqueue layout lock exclusively. */ + mutex_lock(&lli->lli_layout_mutex); + +again: + /* mostly layout lock is caching on the local side, so try to match + * it before grabbing layout lock mutex. 
*/ + mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0, + LCK_CR | LCK_CW | LCK_PR | LCK_PW); + if (mode != 0) { /* hit cached lock */ + rc = ll_layout_lock_set(&lockh, mode, inode, gen, true); + if (rc == -EAGAIN) + goto again; + + mutex_unlock(&lli->lli_layout_mutex); + return rc; + } + + op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, + 0, 0, LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) { + mutex_unlock(&lli->lli_layout_mutex); + return PTR_ERR(op_data); + } + + /* have to enqueue one */ + memset(&it, 0, sizeof(it)); + it.it_op = IT_LAYOUT; + lockh.cookie = 0ULL; + + LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n", + ll_get_fsname(inode->i_sb, NULL, 0), inode, + PFID(&lli->lli_fid)); + + rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh, + NULL, 0, NULL, 0); + if (it.d.lustre.it_data != NULL) + ptlrpc_req_finished(it.d.lustre.it_data); + it.d.lustre.it_data = NULL; + + ll_finish_md_op_data(op_data); + + mode = it.d.lustre.it_lock_mode; + it.d.lustre.it_lock_mode = 0; + ll_intent_drop_lock(&it); + + if (rc == 0) { + /* set lock data in case this is a new lock */ + ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL); + rc = ll_layout_lock_set(&lockh, mode, inode, gen, true); + if (rc == -EAGAIN) + goto again; + } + mutex_unlock(&lli->lli_layout_mutex); + + return rc; +} + +/** + * This function send a restore request to the MDT + */ +int ll_layout_restore(struct inode *inode) +{ + struct hsm_user_request *hur; + int len, rc; + + len = sizeof(struct hsm_user_request) + + sizeof(struct hsm_user_item); + hur = kzalloc(len, GFP_NOFS); + if (!hur) + return -ENOMEM; + + hur->hur_request.hr_action = HUA_RESTORE; + hur->hur_request.hr_archive_id = 0; + hur->hur_request.hr_flags = 0; + memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid, + sizeof(hur->hur_user_item[0].hui_fid)); + hur->hur_user_item[0].hui_extent.length = -1; + hur->hur_request.hr_itemcount = 1; + rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp, + len, hur, NULL); + OBD_FREE(hur, len); + return rc; +} diff --git a/kernel/drivers/staging/lustre/lustre/llite/llite_capa.c b/kernel/drivers/staging/lustre/lustre/llite/llite_capa.c new file mode 100644 index 000000000..aec9a4412 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/llite_capa.c @@ -0,0 +1,654 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/llite/llite_capa.c + * + * Author: Lai Siyao + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include +#include + +#include "../include/lustre_lite.h" +#include "llite_internal.h" + +/* for obd_capa.c_list, client capa might stay in three places: + * 1. ll_capa_list. + * 2. ll_idle_capas. + * 3. stand alone: just allocated. + */ + +/* capas for oss writeback and those failed to renew */ +static LIST_HEAD(ll_idle_capas); +static struct ptlrpc_thread ll_capa_thread; +static struct list_head *ll_capa_list = &capa_list[CAPA_SITE_CLIENT]; + +/* llite capa renewal timer */ +struct timer_list ll_capa_timer; +/* for debug: indicate whether capa on llite is enabled or not */ +static atomic_t ll_capa_debug = ATOMIC_INIT(0); +static unsigned long long ll_capa_renewed; +static unsigned long long ll_capa_renewal_noent; +static unsigned long long ll_capa_renewal_failed; +static unsigned long long ll_capa_renewal_retries; + +static int ll_update_capa(struct obd_capa *ocapa, struct lustre_capa *capa); + +static inline void update_capa_timer(struct obd_capa *ocapa, unsigned long expiry) +{ + if (time_before(expiry, ll_capa_timer.expires) || + !timer_pending(&ll_capa_timer)) { + mod_timer(&ll_capa_timer, expiry); + DEBUG_CAPA(D_SEC, &ocapa->c_capa, + "ll_capa_timer update: %lu/%lu by", expiry, jiffies); + } +} + +static inline unsigned long capa_renewal_time(struct obd_capa *ocapa) +{ + return cfs_time_sub(ocapa->c_expiry, + cfs_time_seconds(ocapa->c_capa.lc_timeout) / 2); +} + +static inline int capa_is_to_expire(struct obd_capa *ocapa) +{ + return time_before_eq(capa_renewal_time(ocapa), cfs_time_current()); +} + +static inline int have_expired_capa(void) +{ + struct obd_capa *ocapa = NULL; + int expired = 0; + + /* if ll_capa_list has client capa to expire or ll_idle_capas has + * expired capa, return 1. 
+ */ + spin_lock(&capa_lock); + if (!list_empty(ll_capa_list)) { + ocapa = list_entry(ll_capa_list->next, struct obd_capa, + c_list); + expired = capa_is_to_expire(ocapa); + if (!expired) + update_capa_timer(ocapa, capa_renewal_time(ocapa)); + } else if (!list_empty(&ll_idle_capas)) { + ocapa = list_entry(ll_idle_capas.next, struct obd_capa, + c_list); + expired = capa_is_expired(ocapa); + if (!expired) + update_capa_timer(ocapa, ocapa->c_expiry); + } + spin_unlock(&capa_lock); + + if (expired) + DEBUG_CAPA(D_SEC, &ocapa->c_capa, "expired"); + return expired; +} + +static void sort_add_capa(struct obd_capa *ocapa, struct list_head *head) +{ + struct obd_capa *tmp; + struct list_head *before = NULL; + + /* TODO: client capa is sorted by expiry, this could be optimized */ + list_for_each_entry_reverse(tmp, head, c_list) { + if (cfs_time_aftereq(ocapa->c_expiry, tmp->c_expiry)) { + before = &tmp->c_list; + break; + } + } + + LASSERT(&ocapa->c_list != before); + list_add(&ocapa->c_list, before ?: head); +} + +static inline int obd_capa_open_count(struct obd_capa *oc) +{ + struct ll_inode_info *lli = ll_i2info(oc->u.cli.inode); + return atomic_read(&lli->lli_open_count); +} + +static void ll_delete_capa(struct obd_capa *ocapa) +{ + struct ll_inode_info *lli = ll_i2info(ocapa->u.cli.inode); + + if (capa_for_mds(&ocapa->c_capa)) { + LASSERT(lli->lli_mds_capa == ocapa); + lli->lli_mds_capa = NULL; + } else if (capa_for_oss(&ocapa->c_capa)) { + list_del_init(&ocapa->u.cli.lli_list); + } + + DEBUG_CAPA(D_SEC, &ocapa->c_capa, "free client"); + list_del_init(&ocapa->c_list); + capa_count[CAPA_SITE_CLIENT]--; + /* release the ref when alloc */ + capa_put(ocapa); +} + +/* three places where client capa is deleted: + * 1. capa_thread_main(), main place to delete expired capa. + * 2. ll_clear_inode_capas() in ll_clear_inode(). + * 3. ll_truncate_free_capa() delete truncate capa explicitly in ll_setattr_ost(). + */ +static int capa_thread_main(void *unused) +{ + struct obd_capa *ocapa, *tmp, *next; + struct inode *inode = NULL; + struct l_wait_info lwi = { 0 }; + int rc; + + thread_set_flags(&ll_capa_thread, SVC_RUNNING); + wake_up(&ll_capa_thread.t_ctl_waitq); + + while (1) { + l_wait_event(ll_capa_thread.t_ctl_waitq, + !thread_is_running(&ll_capa_thread) || + have_expired_capa(), + &lwi); + + if (!thread_is_running(&ll_capa_thread)) + break; + + next = NULL; + + spin_lock(&capa_lock); + list_for_each_entry_safe(ocapa, tmp, ll_capa_list, c_list) { + __u64 ibits; + + LASSERT(ocapa->c_capa.lc_opc != CAPA_OPC_OSS_TRUNC); + + if (!capa_is_to_expire(ocapa)) { + next = ocapa; + break; + } + + list_del_init(&ocapa->c_list); + + /* for MDS capability, only renew those which belong to + * dir, or its inode is opened, or client holds LOOKUP + * lock. + */ + /* ibits may be changed by ll_have_md_lock() so we have + * to set it each time */ + ibits = MDS_INODELOCK_LOOKUP; + if (capa_for_mds(&ocapa->c_capa) && + !S_ISDIR(ocapa->u.cli.inode->i_mode) && + obd_capa_open_count(ocapa) == 0 && + !ll_have_md_lock(ocapa->u.cli.inode, + &ibits, LCK_MINMODE)) { + DEBUG_CAPA(D_SEC, &ocapa->c_capa, + "skip renewal for"); + sort_add_capa(ocapa, &ll_idle_capas); + continue; + } + + /* for OSS capability, only renew those whose inode is + * opened. 
+ */ + if (capa_for_oss(&ocapa->c_capa) && + obd_capa_open_count(ocapa) == 0) { + /* oss capa with open count == 0 won't renew, + * move to idle list */ + sort_add_capa(ocapa, &ll_idle_capas); + continue; + } + + /* NB iput() is in ll_update_capa() */ + inode = igrab(ocapa->u.cli.inode); + if (inode == NULL) { + DEBUG_CAPA(D_ERROR, &ocapa->c_capa, + "igrab failed for"); + continue; + } + + capa_get(ocapa); + ll_capa_renewed++; + spin_unlock(&capa_lock); + rc = md_renew_capa(ll_i2mdexp(inode), ocapa, + ll_update_capa); + spin_lock(&capa_lock); + if (rc) { + DEBUG_CAPA(D_ERROR, &ocapa->c_capa, + "renew failed: %d", rc); + ll_capa_renewal_failed++; + } + } + + if (next) + update_capa_timer(next, capa_renewal_time(next)); + + list_for_each_entry_safe(ocapa, tmp, &ll_idle_capas, + c_list) { + if (!capa_is_expired(ocapa)) { + if (!next) + update_capa_timer(ocapa, + ocapa->c_expiry); + break; + } + + if (atomic_read(&ocapa->c_refc) > 1) { + DEBUG_CAPA(D_SEC, &ocapa->c_capa, + "expired(c_refc %d), don't release", + atomic_read(&ocapa->c_refc)); + /* don't try to renew any more */ + list_del_init(&ocapa->c_list); + continue; + } + + /* expired capa is released. */ + DEBUG_CAPA(D_SEC, &ocapa->c_capa, "release expired"); + ll_delete_capa(ocapa); + } + + spin_unlock(&capa_lock); + } + + thread_set_flags(&ll_capa_thread, SVC_STOPPED); + wake_up(&ll_capa_thread.t_ctl_waitq); + return 0; +} + +void ll_capa_timer_callback(unsigned long unused) +{ + wake_up(&ll_capa_thread.t_ctl_waitq); +} + +int ll_capa_thread_start(void) +{ + struct task_struct *task; + + init_waitqueue_head(&ll_capa_thread.t_ctl_waitq); + + task = kthread_run(capa_thread_main, NULL, "ll_capa"); + if (IS_ERR(task)) { + CERROR("cannot start expired capa thread: rc %ld\n", + PTR_ERR(task)); + return PTR_ERR(task); + } + wait_event(ll_capa_thread.t_ctl_waitq, + thread_is_running(&ll_capa_thread)); + + return 0; +} + +void ll_capa_thread_stop(void) +{ + thread_set_flags(&ll_capa_thread, SVC_STOPPING); + wake_up(&ll_capa_thread.t_ctl_waitq); + wait_event(ll_capa_thread.t_ctl_waitq, + thread_is_stopped(&ll_capa_thread)); +} + +struct obd_capa *ll_osscapa_get(struct inode *inode, __u64 opc) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct obd_capa *ocapa; + int found = 0; + + if ((ll_i2sbi(inode)->ll_flags & LL_SBI_OSS_CAPA) == 0) + return NULL; + + LASSERT(opc == CAPA_OPC_OSS_WRITE || opc == CAPA_OPC_OSS_RW || + opc == CAPA_OPC_OSS_TRUNC); + + spin_lock(&capa_lock); + list_for_each_entry(ocapa, &lli->lli_oss_capas, u.cli.lli_list) { + if (capa_is_expired(ocapa)) + continue; + if ((opc & CAPA_OPC_OSS_WRITE) && + capa_opc_supported(&ocapa->c_capa, CAPA_OPC_OSS_WRITE)) { + found = 1; + break; + } else if ((opc & CAPA_OPC_OSS_READ) && + capa_opc_supported(&ocapa->c_capa, + CAPA_OPC_OSS_READ)) { + found = 1; + break; + } else if ((opc & CAPA_OPC_OSS_TRUNC) && + capa_opc_supported(&ocapa->c_capa, opc)) { + found = 1; + break; + } + } + + if (found) { + LASSERT(lu_fid_eq(capa_fid(&ocapa->c_capa), + ll_inode2fid(inode))); + LASSERT(ocapa->c_site == CAPA_SITE_CLIENT); + + capa_get(ocapa); + + DEBUG_CAPA(D_SEC, &ocapa->c_capa, "found client"); + } else { + ocapa = NULL; + + if (atomic_read(&ll_capa_debug)) { + CERROR("no capability for "DFID" opc %#llx\n", + PFID(&lli->lli_fid), opc); + atomic_set(&ll_capa_debug, 0); + } + } + spin_unlock(&capa_lock); + + return ocapa; +} +EXPORT_SYMBOL(ll_osscapa_get); + +struct obd_capa *ll_mdscapa_get(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct obd_capa *ocapa; + + 
LASSERT(inode != NULL); + + if ((ll_i2sbi(inode)->ll_flags & LL_SBI_MDS_CAPA) == 0) + return NULL; + + spin_lock(&capa_lock); + ocapa = capa_get(lli->lli_mds_capa); + spin_unlock(&capa_lock); + if (!ocapa && atomic_read(&ll_capa_debug)) { + CERROR("no mds capability for "DFID"\n", PFID(&lli->lli_fid)); + atomic_set(&ll_capa_debug, 0); + } + + return ocapa; +} + +static struct obd_capa *do_add_mds_capa(struct inode *inode, + struct obd_capa *ocapa) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct obd_capa *old = lli->lli_mds_capa; + struct lustre_capa *capa = &ocapa->c_capa; + + if (!old) { + ocapa->u.cli.inode = inode; + lli->lli_mds_capa = ocapa; + capa_count[CAPA_SITE_CLIENT]++; + + DEBUG_CAPA(D_SEC, capa, "add MDS"); + } else { + spin_lock(&old->c_lock); + old->c_capa = *capa; + spin_unlock(&old->c_lock); + + DEBUG_CAPA(D_SEC, capa, "update MDS"); + + capa_put(ocapa); + ocapa = old; + } + return ocapa; +} + +static struct obd_capa *do_lookup_oss_capa(struct inode *inode, int opc) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct obd_capa *ocapa; + + /* inside capa_lock */ + list_for_each_entry(ocapa, &lli->lli_oss_capas, u.cli.lli_list) { + if ((capa_opc(&ocapa->c_capa) & opc) != opc) + continue; + + LASSERT(lu_fid_eq(capa_fid(&ocapa->c_capa), + ll_inode2fid(inode))); + LASSERT(ocapa->c_site == CAPA_SITE_CLIENT); + + DEBUG_CAPA(D_SEC, &ocapa->c_capa, "found client"); + return ocapa; + } + + return NULL; +} + +static inline void inode_add_oss_capa(struct inode *inode, + struct obd_capa *ocapa) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct obd_capa *tmp; + struct list_head *next = NULL; + + /* capa is sorted in lli_oss_capas so lookup can always find the + * latest one */ + list_for_each_entry(tmp, &lli->lli_oss_capas, u.cli.lli_list) { + if (cfs_time_after(ocapa->c_expiry, tmp->c_expiry)) { + next = &tmp->u.cli.lli_list; + break; + } + } + LASSERT(&ocapa->u.cli.lli_list != next); + list_move_tail(&ocapa->u.cli.lli_list, next ?: &lli->lli_oss_capas); +} + +static struct obd_capa *do_add_oss_capa(struct inode *inode, + struct obd_capa *ocapa) +{ + struct obd_capa *old; + struct lustre_capa *capa = &ocapa->c_capa; + + LASSERTF(S_ISREG(inode->i_mode), + "inode has oss capa, but not regular file, mode: %d\n", + inode->i_mode); + + /* FIXME: can't replace it so easily with fine-grained opc */ + old = do_lookup_oss_capa(inode, capa_opc(capa) & CAPA_OPC_OSS_ONLY); + if (!old) { + ocapa->u.cli.inode = inode; + INIT_LIST_HEAD(&ocapa->u.cli.lli_list); + capa_count[CAPA_SITE_CLIENT]++; + + DEBUG_CAPA(D_SEC, capa, "add OSS"); + } else { + spin_lock(&old->c_lock); + old->c_capa = *capa; + spin_unlock(&old->c_lock); + + DEBUG_CAPA(D_SEC, capa, "update OSS"); + + capa_put(ocapa); + ocapa = old; + } + + inode_add_oss_capa(inode, ocapa); + return ocapa; +} + +struct obd_capa *ll_add_capa(struct inode *inode, struct obd_capa *ocapa) +{ + spin_lock(&capa_lock); + ocapa = capa_for_mds(&ocapa->c_capa) ? 
do_add_mds_capa(inode, ocapa) : + do_add_oss_capa(inode, ocapa); + + /* truncate capa won't renew */ + if (ocapa->c_capa.lc_opc != CAPA_OPC_OSS_TRUNC) { + set_capa_expiry(ocapa); + list_del_init(&ocapa->c_list); + sort_add_capa(ocapa, ll_capa_list); + + update_capa_timer(ocapa, capa_renewal_time(ocapa)); + } + + spin_unlock(&capa_lock); + + atomic_set(&ll_capa_debug, 1); + return ocapa; +} + +static inline void delay_capa_renew(struct obd_capa *oc, unsigned long delay) +{ + /* NB: set a fake expiry for this capa to prevent it renew too soon */ + oc->c_expiry = cfs_time_add(oc->c_expiry, cfs_time_seconds(delay)); +} + +static int ll_update_capa(struct obd_capa *ocapa, struct lustre_capa *capa) +{ + struct inode *inode = ocapa->u.cli.inode; + int rc = 0; + + LASSERT(ocapa); + + if (IS_ERR(capa)) { + /* set error code */ + rc = PTR_ERR(capa); + spin_lock(&capa_lock); + if (rc == -ENOENT) { + DEBUG_CAPA(D_SEC, &ocapa->c_capa, + "renewal canceled because object removed"); + ll_capa_renewal_noent++; + } else { + ll_capa_renewal_failed++; + + /* failed capa won't be renewed any longer, but if -EIO, + * client might be doing recovery, retry in 2 min. */ + if (rc == -EIO && !capa_is_expired(ocapa)) { + delay_capa_renew(ocapa, 120); + DEBUG_CAPA(D_ERROR, &ocapa->c_capa, + "renewal failed: -EIO, retry in 2 mins"); + ll_capa_renewal_retries++; + goto retry; + } else { + DEBUG_CAPA(D_ERROR, &ocapa->c_capa, + "renewal failed(rc: %d) for", rc); + } + } + + list_del_init(&ocapa->c_list); + sort_add_capa(ocapa, &ll_idle_capas); + spin_unlock(&capa_lock); + + capa_put(ocapa); + iput(inode); + return rc; + } + + spin_lock(&ocapa->c_lock); + LASSERT(!memcmp(&ocapa->c_capa, capa, + offsetof(struct lustre_capa, lc_opc))); + ocapa->c_capa = *capa; + set_capa_expiry(ocapa); + spin_unlock(&ocapa->c_lock); + + spin_lock(&capa_lock); + if (capa_for_oss(capa)) + inode_add_oss_capa(inode, ocapa); + DEBUG_CAPA(D_SEC, capa, "renew"); +retry: + list_del_init(&ocapa->c_list); + sort_add_capa(ocapa, ll_capa_list); + update_capa_timer(ocapa, capa_renewal_time(ocapa)); + spin_unlock(&capa_lock); + + capa_put(ocapa); + iput(inode); + return rc; +} + +void ll_capa_open(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + + if ((ll_i2sbi(inode)->ll_flags & (LL_SBI_MDS_CAPA | LL_SBI_OSS_CAPA)) + == 0) + return; + + if (!S_ISREG(inode->i_mode)) + return; + + atomic_inc(&lli->lli_open_count); +} + +void ll_capa_close(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + + if ((ll_i2sbi(inode)->ll_flags & (LL_SBI_MDS_CAPA | LL_SBI_OSS_CAPA)) + == 0) + return; + + if (!S_ISREG(inode->i_mode)) + return; + + atomic_dec(&lli->lli_open_count); +} + +/* delete CAPA_OPC_OSS_TRUNC only */ +void ll_truncate_free_capa(struct obd_capa *ocapa) +{ + if (!ocapa) + return; + + LASSERT(ocapa->c_capa.lc_opc & CAPA_OPC_OSS_TRUNC); + DEBUG_CAPA(D_SEC, &ocapa->c_capa, "free truncate"); + + /* release ref when find */ + capa_put(ocapa); + if (likely(ocapa->c_capa.lc_opc == CAPA_OPC_OSS_TRUNC)) { + spin_lock(&capa_lock); + ll_delete_capa(ocapa); + spin_unlock(&capa_lock); + } +} + +void ll_clear_inode_capas(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct obd_capa *ocapa, *tmp; + + spin_lock(&capa_lock); + ocapa = lli->lli_mds_capa; + if (ocapa) + ll_delete_capa(ocapa); + + list_for_each_entry_safe(ocapa, tmp, &lli->lli_oss_capas, + u.cli.lli_list) + ll_delete_capa(ocapa); + spin_unlock(&capa_lock); +} + +void ll_print_capa_stat(struct ll_sb_info *sbi) +{ + if (sbi->ll_flags & 
(LL_SBI_MDS_CAPA | LL_SBI_OSS_CAPA)) + LCONSOLE_INFO("Fid capabilities renewed: %llu\n" + "Fid capabilities renewal ENOENT: %llu\n" + "Fid capabilities failed to renew: %llu\n" + "Fid capabilities renewal retries: %llu\n", + ll_capa_renewed, ll_capa_renewal_noent, + ll_capa_renewal_failed, ll_capa_renewal_retries); +} diff --git a/kernel/drivers/staging/lustre/lustre/llite/llite_close.c b/kernel/drivers/staging/lustre/lustre/llite/llite_close.c new file mode 100644 index 000000000..a94ba02cc --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/llite_close.c @@ -0,0 +1,393 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/llite/llite_close.c + * + * Lustre Lite routines to issue a secondary close after writeback + */ + +#include + +#define DEBUG_SUBSYSTEM S_LLITE + +#include "../include/lustre_lite.h" +#include "llite_internal.h" + +/** records that a write is in flight */ +void vvp_write_pending(struct ccc_object *club, struct ccc_page *page) +{ + struct ll_inode_info *lli = ll_i2info(club->cob_inode); + + spin_lock(&lli->lli_lock); + lli->lli_flags |= LLIF_SOM_DIRTY; + if (page != NULL && list_empty(&page->cpg_pending_linkage)) + list_add(&page->cpg_pending_linkage, + &club->cob_pending_list); + spin_unlock(&lli->lli_lock); +} + +/** records that a write has completed */ +void vvp_write_complete(struct ccc_object *club, struct ccc_page *page) +{ + struct ll_inode_info *lli = ll_i2info(club->cob_inode); + int rc = 0; + + spin_lock(&lli->lli_lock); + if (page != NULL && !list_empty(&page->cpg_pending_linkage)) { + list_del_init(&page->cpg_pending_linkage); + rc = 1; + } + spin_unlock(&lli->lli_lock); + if (rc) + ll_queue_done_writing(club->cob_inode, 0); +} + +/** Queues DONE_WRITING if + * - done writing is allowed; + * - inode has no no dirty pages; */ +void ll_queue_done_writing(struct inode *inode, unsigned long flags) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ccc_object *club = cl2ccc(ll_i2info(inode)->lli_clob); + + spin_lock(&lli->lli_lock); + lli->lli_flags |= flags; + + if ((lli->lli_flags & LLIF_DONE_WRITING) && + list_empty(&club->cob_pending_list)) { + struct ll_close_queue *lcq = ll_i2sbi(inode)->ll_lcq; + + if (lli->lli_flags & LLIF_MDS_SIZE_LOCK) + CWARN("ino %lu/%u(flags %u) som valid it just after recovery\n", + inode->i_ino, inode->i_generation, + lli->lli_flags); + /* DONE_WRITING is allowed and inode has no dirty page. */ + spin_lock(&lcq->lcq_lock); + + LASSERT(list_empty(&lli->lli_close_list)); + CDEBUG(D_INODE, "adding inode %lu/%u to close list\n", + inode->i_ino, inode->i_generation); + list_add_tail(&lli->lli_close_list, &lcq->lcq_head); + + /* Avoid a concurrent insertion into the close thread queue: + * an inode is already in the close thread, open(), write(), + * close() happen, epoch is closed as the inode is marked as + * LLIF_EPOCH_PENDING. When pages are written inode should not + * be inserted into the queue again, clear this flag to avoid + * it. */ + lli->lli_flags &= ~LLIF_DONE_WRITING; + + wake_up(&lcq->lcq_waitq); + spin_unlock(&lcq->lcq_lock); + } + spin_unlock(&lli->lli_lock); +} + +/** Pack SOM attributes info @opdata for CLOSE, DONE_WRITING rpc. */ +void ll_done_writing_attr(struct inode *inode, struct md_op_data *op_data) +{ + struct ll_inode_info *lli = ll_i2info(inode); + + op_data->op_flags |= MF_SOM_CHANGE; + /* Check if Size-on-MDS attributes are valid. */ + if (lli->lli_flags & LLIF_MDS_SIZE_LOCK) + CERROR("ino %lu/%u(flags %u) som valid it just after recovery\n", + inode->i_ino, inode->i_generation, + lli->lli_flags); + + if (!cl_local_size(inode)) { + /* Send Size-on-MDS Attributes if valid. */ + op_data->op_attr.ia_valid |= ATTR_MTIME_SET | ATTR_CTIME_SET | + ATTR_ATIME_SET | ATTR_SIZE | ATTR_BLOCKS; + } +} + +/** Closes ioepoch and packs Size-on-MDS attribute if needed into @op_data. 
*/ +void ll_ioepoch_close(struct inode *inode, struct md_op_data *op_data, + struct obd_client_handle **och, unsigned long flags) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ccc_object *club = cl2ccc(ll_i2info(inode)->lli_clob); + + spin_lock(&lli->lli_lock); + if (!(list_empty(&club->cob_pending_list))) { + if (!(lli->lli_flags & LLIF_EPOCH_PENDING)) { + LASSERT(*och != NULL); + LASSERT(lli->lli_pending_och == NULL); + /* Inode is dirty and there is no pending write done + * request yet, DONE_WRITE is to be sent later. */ + lli->lli_flags |= LLIF_EPOCH_PENDING; + lli->lli_pending_och = *och; + spin_unlock(&lli->lli_lock); + + inode = igrab(inode); + LASSERT(inode); + goto out; + } + if (flags & LLIF_DONE_WRITING) { + /* Some pages are still dirty, it is early to send + * DONE_WRITE. Wait until all pages will be flushed + * and try DONE_WRITE again later. */ + LASSERT(!(lli->lli_flags & LLIF_DONE_WRITING)); + lli->lli_flags |= LLIF_DONE_WRITING; + spin_unlock(&lli->lli_lock); + + inode = igrab(inode); + LASSERT(inode); + goto out; + } + } + CDEBUG(D_INODE, "Epoch %llu closed on "DFID"\n", + ll_i2info(inode)->lli_ioepoch, PFID(&lli->lli_fid)); + op_data->op_flags |= MF_EPOCH_CLOSE; + + if (flags & LLIF_DONE_WRITING) { + LASSERT(lli->lli_flags & LLIF_SOM_DIRTY); + LASSERT(!(lli->lli_flags & LLIF_DONE_WRITING)); + *och = lli->lli_pending_och; + lli->lli_pending_och = NULL; + lli->lli_flags &= ~LLIF_EPOCH_PENDING; + } else { + /* Pack Size-on-MDS inode attributes only if they has changed */ + if (!(lli->lli_flags & LLIF_SOM_DIRTY)) { + spin_unlock(&lli->lli_lock); + goto out; + } + + /* There is a pending DONE_WRITE -- close epoch with no + * attribute change. */ + if (lli->lli_flags & LLIF_EPOCH_PENDING) { + spin_unlock(&lli->lli_lock); + goto out; + } + } + + LASSERT(list_empty(&club->cob_pending_list)); + lli->lli_flags &= ~LLIF_SOM_DIRTY; + spin_unlock(&lli->lli_lock); + ll_done_writing_attr(inode, op_data); + +out: + return; +} + +/** + * Cliens updates SOM attributes on MDS (including llog cookies): + * obd_getattr with no lock and md_setattr. + */ +int ll_som_update(struct inode *inode, struct md_op_data *op_data) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ptlrpc_request *request = NULL; + __u32 old_flags; + struct obdo *oa; + int rc; + + LASSERT(op_data != NULL); + if (lli->lli_flags & LLIF_MDS_SIZE_LOCK) + CERROR("ino %lu/%u(flags %u) som valid it just after recovery\n", + inode->i_ino, inode->i_generation, + lli->lli_flags); + + OBDO_ALLOC(oa); + if (!oa) { + CERROR("can't allocate memory for Size-on-MDS update.\n"); + return -ENOMEM; + } + + old_flags = op_data->op_flags; + op_data->op_flags = MF_SOM_CHANGE; + + /* If inode is already in another epoch, skip getattr from OSTs. */ + if (lli->lli_ioepoch == op_data->op_ioepoch) { + rc = ll_inode_getattr(inode, oa, op_data->op_ioepoch, + old_flags & MF_GETATTR_LOCK); + if (rc) { + oa->o_valid = 0; + if (rc != -ENOENT) + CERROR("inode_getattr failed (%d): unable to send a Size-on-MDS attribute update for inode %lu/%u\n", + rc, inode->i_ino, + inode->i_generation); + } else { + CDEBUG(D_INODE, "Size-on-MDS update on "DFID"\n", + PFID(&lli->lli_fid)); + } + /* Install attributes into op_data. */ + md_from_obdo(op_data, oa, oa->o_valid); + } + + rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, + NULL, 0, NULL, 0, &request, NULL); + ptlrpc_req_finished(request); + + OBDO_FREE(oa); + return rc; +} + +/** + * Closes the ioepoch and packs all the attributes into @op_data for + * DONE_WRITING rpc. 
+ */ +static void ll_prepare_done_writing(struct inode *inode, + struct md_op_data *op_data, + struct obd_client_handle **och) +{ + ll_ioepoch_close(inode, op_data, och, LLIF_DONE_WRITING); + /* If there is no @och, we do not do D_W yet. */ + if (*och == NULL) + return; + + ll_pack_inode2opdata(inode, op_data, &(*och)->och_fh); + ll_prep_md_op_data(op_data, inode, NULL, NULL, + 0, 0, LUSTRE_OPC_ANY, NULL); +} + +/** Send a DONE_WRITING rpc. */ +static void ll_done_writing(struct inode *inode) +{ + struct obd_client_handle *och = NULL; + struct md_op_data *op_data; + int rc; + + LASSERT(exp_connect_som(ll_i2mdexp(inode))); + + op_data = kzalloc(sizeof(*op_data), GFP_NOFS); + if (!op_data) + return; + + ll_prepare_done_writing(inode, op_data, &och); + /* If there is no @och, we do not do D_W yet. */ + if (och == NULL) + goto out; + + rc = md_done_writing(ll_i2sbi(inode)->ll_md_exp, op_data, NULL); + if (rc == -EAGAIN) { + /* MDS has instructed us to obtain Size-on-MDS attribute from + * OSTs and send setattr to back to MDS. */ + rc = ll_som_update(inode, op_data); + } else if (rc) { + CERROR("inode %lu mdc done_writing failed: rc = %d\n", + inode->i_ino, rc); + } +out: + ll_finish_md_op_data(op_data); + if (och) { + md_clear_open_replay_data(ll_i2sbi(inode)->ll_md_exp, och); + OBD_FREE_PTR(och); + } +} + +static struct ll_inode_info *ll_close_next_lli(struct ll_close_queue *lcq) +{ + struct ll_inode_info *lli = NULL; + + spin_lock(&lcq->lcq_lock); + + if (!list_empty(&lcq->lcq_head)) { + lli = list_entry(lcq->lcq_head.next, struct ll_inode_info, + lli_close_list); + list_del_init(&lli->lli_close_list); + } else if (atomic_read(&lcq->lcq_stop)) + lli = ERR_PTR(-EALREADY); + + spin_unlock(&lcq->lcq_lock); + return lli; +} + +static int ll_close_thread(void *arg) +{ + struct ll_close_queue *lcq = arg; + + complete(&lcq->lcq_comp); + + while (1) { + struct l_wait_info lwi = { 0 }; + struct ll_inode_info *lli; + struct inode *inode; + + l_wait_event_exclusive(lcq->lcq_waitq, + (lli = ll_close_next_lli(lcq)) != NULL, + &lwi); + if (IS_ERR(lli)) + break; + + inode = ll_info2i(lli); + CDEBUG(D_INFO, "done_writing for inode %lu/%u\n", + inode->i_ino, inode->i_generation); + ll_done_writing(inode); + iput(inode); + } + + CDEBUG(D_INFO, "ll_close exiting\n"); + complete(&lcq->lcq_comp); + return 0; +} + +int ll_close_thread_start(struct ll_close_queue **lcq_ret) +{ + struct ll_close_queue *lcq; + struct task_struct *task; + + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CLOSE_THREAD)) + return -EINTR; + + lcq = kzalloc(sizeof(*lcq), GFP_NOFS); + if (!lcq) + return -ENOMEM; + + spin_lock_init(&lcq->lcq_lock); + INIT_LIST_HEAD(&lcq->lcq_head); + init_waitqueue_head(&lcq->lcq_waitq); + init_completion(&lcq->lcq_comp); + + task = kthread_run(ll_close_thread, lcq, "ll_close"); + if (IS_ERR(task)) { + OBD_FREE(lcq, sizeof(*lcq)); + return PTR_ERR(task); + } + + wait_for_completion(&lcq->lcq_comp); + *lcq_ret = lcq; + return 0; +} + +void ll_close_thread_shutdown(struct ll_close_queue *lcq) +{ + init_completion(&lcq->lcq_comp); + atomic_inc(&lcq->lcq_stop); + wake_up(&lcq->lcq_waitq); + wait_for_completion(&lcq->lcq_comp); + OBD_FREE(lcq, sizeof(*lcq)); +} diff --git a/kernel/drivers/staging/lustre/lustre/llite/llite_internal.h b/kernel/drivers/staging/lustre/lustre/llite/llite_internal.h new file mode 100644 index 000000000..5f918e3c4 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/llite_internal.h @@ -0,0 +1,1521 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE 
HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef LLITE_INTERNAL_H +#define LLITE_INTERNAL_H +#include "../include/lustre_debug.h" +#include "../include/lustre_ver.h" +#include "../include/lustre_disk.h" /* for s2sbi */ +#include "../include/lustre_eacl.h" + +/* for struct cl_lock_descr and struct cl_io */ +#include "../include/cl_object.h" +#include "../include/lclient.h" +#include "../include/lustre_mdc.h" +#include "../include/lustre_intent.h" +#include +#include + +#ifndef FMODE_EXEC +#define FMODE_EXEC 0 +#endif + +#ifndef VM_FAULT_RETRY +#define VM_FAULT_RETRY 0 +#endif + +/* Kernel 3.1 kills LOOKUP_CONTINUE, LOOKUP_PARENT is equivalent to it. + * seem kernel commit 49084c3bb2055c401f3493c13edae14d49128ca0 */ +#ifndef LOOKUP_CONTINUE +#define LOOKUP_CONTINUE LOOKUP_PARENT +#endif + +/** Only used on client-side for indicating the tail of dir hash/offset. */ +#define LL_DIR_END_OFF 0x7fffffffffffffffULL +#define LL_DIR_END_OFF_32BIT 0x7fffffffUL + +#define LL_IT2STR(it) ((it) ? ldlm_it2str((it)->it_op) : "0") +#define LUSTRE_FPRIVATE(file) ((file)->private_data) + +struct ll_dentry_data { + struct lookup_intent *lld_it; + unsigned int lld_sa_generation; + unsigned int lld_invalid:1; + struct rcu_head lld_rcu_head; +}; + +#define ll_d2d(de) ((struct ll_dentry_data*)((de)->d_fsdata)) + +#define LLI_INODE_MAGIC 0x111d0de5 +#define LLI_INODE_DEAD 0xdeadd00d + +/* remote client permission cache */ +#define REMOTE_PERM_HASHSIZE 16 + +struct ll_getname_data { + struct dir_context ctx; + char *lgd_name; /* points to a buffer with NAME_MAX+1 size */ + struct lu_fid lgd_fid; /* target fid we are looking for */ + int lgd_found; /* inode matched? */ +}; + +/* llite setxid/access permission for user on remote client */ +struct ll_remote_perm { + struct hlist_node lrp_list; + uid_t lrp_uid; + gid_t lrp_gid; + uid_t lrp_fsuid; + gid_t lrp_fsgid; + int lrp_access_perm; /* MAY_READ/WRITE/EXEC, this + is access permission with + lrp_fsuid/lrp_fsgid. */ +}; + +enum lli_flags { + /* MDS has an authority for the Size-on-MDS attributes. */ + LLIF_MDS_SIZE_LOCK = (1 << 0), + /* Epoch close is postponed. */ + LLIF_EPOCH_PENDING = (1 << 1), + /* DONE WRITING is allowed. */ + LLIF_DONE_WRITING = (1 << 2), + /* Sizeon-on-MDS attributes are changed. An attribute update needs to + * be sent to MDS. 
*/ + LLIF_SOM_DIRTY = (1 << 3), + /* File data is modified. */ + LLIF_DATA_MODIFIED = (1 << 4), + /* File is being restored */ + LLIF_FILE_RESTORING = (1 << 5), + /* Xattr cache is attached to the file */ + LLIF_XATTR_CACHE = (1 << 6), +}; + +struct ll_inode_info { + __u32 lli_inode_magic; + __u32 lli_flags; + __u64 lli_ioepoch; + + spinlock_t lli_lock; + struct posix_acl *lli_posix_acl; + + struct hlist_head *lli_remote_perms; + struct mutex lli_rmtperm_mutex; + + /* identifying fields for both metadata and data stacks. */ + struct lu_fid lli_fid; + /* Parent fid for accessing default stripe data on parent directory + * for allocating OST objects after a mknod() and later open-by-FID. */ + struct lu_fid lli_pfid; + + struct list_head lli_close_list; + struct list_head lli_oss_capas; + /* open count currently used by capability only, indicate whether + * capability needs renewal */ + atomic_t lli_open_count; + struct obd_capa *lli_mds_capa; + unsigned long lli_rmtperm_time; + + /* handle is to be sent to MDS later on done_writing and setattr. + * Open handle data are needed for the recovery to reconstruct + * the inode state on the MDS. XXX: recovery is not ready yet. */ + struct obd_client_handle *lli_pending_och; + + /* We need all three because every inode may be opened in different + * modes */ + struct obd_client_handle *lli_mds_read_och; + struct obd_client_handle *lli_mds_write_och; + struct obd_client_handle *lli_mds_exec_och; + __u64 lli_open_fd_read_count; + __u64 lli_open_fd_write_count; + __u64 lli_open_fd_exec_count; + /* Protects access to och pointers and their usage counters */ + struct mutex lli_och_mutex; + + struct inode lli_vfs_inode; + + /* the most recent timestamps obtained from mds */ + struct ost_lvb lli_lvb; + spinlock_t lli_agl_lock; + + /* Try to make the d::member and f::member are aligned. Before using + * these members, make clear whether it is directory or not. */ + union { + /* for directory */ + struct { + /* serialize normal readdir and statahead-readdir. */ + struct mutex d_readdir_mutex; + + /* metadata statahead */ + /* since parent-child threads can share the same @file + * struct, "opendir_key" is the token when dir close for + * case of parent exit before child -- it is me should + * cleanup the dir readahead. */ + void *d_opendir_key; + struct ll_statahead_info *d_sai; + /* protect statahead stuff. */ + spinlock_t d_sa_lock; + /* "opendir_pid" is the token when lookup/revalid + * -- I am the owner of dir statahead. */ + pid_t d_opendir_pid; + } d; + +#define lli_readdir_mutex u.d.d_readdir_mutex +#define lli_opendir_key u.d.d_opendir_key +#define lli_sai u.d.d_sai +#define lli_sa_lock u.d.d_sa_lock +#define lli_opendir_pid u.d.d_opendir_pid + + /* for non-directory */ + struct { + struct mutex f_size_mutex; + char *f_symlink_name; + __u64 f_maxbytes; + /* + * struct rw_semaphore { + * signed long count; // align d.d_def_acl + * spinlock_t wait_lock; // align d.d_sa_lock + * struct list_head wait_list; + * } + */ + struct rw_semaphore f_trunc_sem; + struct mutex f_write_mutex; + + struct rw_semaphore f_glimpse_sem; + unsigned long f_glimpse_time; + struct list_head f_agl_list; + __u64 f_agl_index; + + /* for writepage() only to communicate to fsync */ + int f_async_rc; + + /* + * whenever a process try to read/write the file, the + * jobid of the process will be saved here, and it'll + * be packed into the write PRC when flush later. + * + * so the read/write statistics for jobid will not be + * accurate if the file is shared by different jobs. 
+ */ + char f_jobid[JOBSTATS_JOBID_SIZE]; + } f; + +#define lli_size_mutex u.f.f_size_mutex +#define lli_symlink_name u.f.f_symlink_name +#define lli_maxbytes u.f.f_maxbytes +#define lli_trunc_sem u.f.f_trunc_sem +#define lli_write_mutex u.f.f_write_mutex +#define lli_glimpse_sem u.f.f_glimpse_sem +#define lli_glimpse_time u.f.f_glimpse_time +#define lli_agl_list u.f.f_agl_list +#define lli_agl_index u.f.f_agl_index +#define lli_async_rc u.f.f_async_rc +#define lli_jobid u.f.f_jobid + + } u; + + /* XXX: For following frequent used members, although they maybe special + * used for non-directory object, it is some time-wasting to check + * whether the object is directory or not before using them. On the + * other hand, currently, sizeof(f) > sizeof(d), it cannot reduce + * the "ll_inode_info" size even if moving those members into u.f. + * So keep them out side. + * + * In the future, if more members are added only for directory, + * some of the following members can be moved into u.f. + */ + bool lli_has_smd; + struct cl_object *lli_clob; + + /* mutex to request for layout lock exclusively. */ + struct mutex lli_layout_mutex; + /* Layout version, protected by lli_layout_lock */ + __u32 lli_layout_gen; + spinlock_t lli_layout_lock; + + struct rw_semaphore lli_xattrs_list_rwsem; + struct mutex lli_xattrs_enq_lock; + struct list_head lli_xattrs;/* ll_xattr_entry->xe_list */ +}; + +static inline __u32 ll_layout_version_get(struct ll_inode_info *lli) +{ + __u32 gen; + + spin_lock(&lli->lli_layout_lock); + gen = lli->lli_layout_gen; + spin_unlock(&lli->lli_layout_lock); + + return gen; +} + +static inline void ll_layout_version_set(struct ll_inode_info *lli, __u32 gen) +{ + spin_lock(&lli->lli_layout_lock); + lli->lli_layout_gen = gen; + spin_unlock(&lli->lli_layout_lock); +} + +int ll_xattr_cache_destroy(struct inode *inode); + +int ll_xattr_cache_get(struct inode *inode, + const char *name, + char *buffer, + size_t size, + __u64 valid); + +/* + * Locking to guarantee consistency of non-atomic updates to long long i_size, + * consistency between file size and KMS. + * + * Implemented by ->lli_size_mutex and ->lsm_lock, nested in that order. + */ + +void ll_inode_size_lock(struct inode *inode); +void ll_inode_size_unlock(struct inode *inode); + +/* FIXME: replace the name of this with LL_I to conform to kernel stuff */ +/* static inline struct ll_inode_info *LL_I(struct inode *inode) */ +static inline struct ll_inode_info *ll_i2info(struct inode *inode) +{ + return container_of(inode, struct ll_inode_info, lli_vfs_inode); +} + +/* default to about 40meg of readahead on a given system. That much tied + * up in 512k readahead requests serviced at 40ms each is about 1GB/s. 
*/ +#define SBI_DEFAULT_READAHEAD_MAX (40UL << (20 - PAGE_CACHE_SHIFT)) + +/* default to read-ahead full files smaller than 2MB on the second read */ +#define SBI_DEFAULT_READAHEAD_WHOLE_MAX (2UL << (20 - PAGE_CACHE_SHIFT)) + +enum ra_stat { + RA_STAT_HIT = 0, + RA_STAT_MISS, + RA_STAT_DISTANT_READPAGE, + RA_STAT_MISS_IN_WINDOW, + RA_STAT_FAILED_GRAB_PAGE, + RA_STAT_FAILED_MATCH, + RA_STAT_DISCARDED, + RA_STAT_ZERO_LEN, + RA_STAT_ZERO_WINDOW, + RA_STAT_EOF, + RA_STAT_MAX_IN_FLIGHT, + RA_STAT_WRONG_GRAB_PAGE, + _NR_RA_STAT, +}; + +struct ll_ra_info { + atomic_t ra_cur_pages; + unsigned long ra_max_pages; + unsigned long ra_max_pages_per_file; + unsigned long ra_max_read_ahead_whole_pages; +}; + +/* ra_io_arg will be filled in the beginning of ll_readahead with + * ras_lock, then the following ll_read_ahead_pages will read RA + * pages according to this arg, all the items in this structure are + * counted by page index. + */ +struct ra_io_arg { + unsigned long ria_start; /* start offset of read-ahead*/ + unsigned long ria_end; /* end offset of read-ahead*/ + /* If stride read pattern is detected, ria_stoff means where + * stride read is started. Note: for normal read-ahead, the + * value here is meaningless, and also it will not be accessed*/ + pgoff_t ria_stoff; + /* ria_length and ria_pages are the length and pages length in the + * stride I/O mode. And they will also be used to check whether + * it is stride I/O read-ahead in the read-ahead pages*/ + unsigned long ria_length; + unsigned long ria_pages; +}; + +/* LL_HIST_MAX=32 causes an overflow */ +#define LL_HIST_MAX 28 +#define LL_HIST_START 12 /* buckets start at 2^12 = 4k */ +#define LL_PROCESS_HIST_MAX 10 +struct per_process_info { + pid_t pid; + struct obd_histogram pp_r_hist; + struct obd_histogram pp_w_hist; +}; + +/* pp_extents[LL_PROCESS_HIST_MAX] will hold the combined process info */ +struct ll_rw_extents_info { + struct per_process_info pp_extents[LL_PROCESS_HIST_MAX + 1]; +}; + +#define LL_OFFSET_HIST_MAX 100 +struct ll_rw_process_info { + pid_t rw_pid; + int rw_op; + loff_t rw_range_start; + loff_t rw_range_end; + loff_t rw_last_file_pos; + loff_t rw_offset; + size_t rw_smallest_extent; + size_t rw_largest_extent; + struct ll_file_data *rw_last_file; +}; + +enum stats_track_type { + STATS_TRACK_ALL = 0, /* track all processes */ + STATS_TRACK_PID, /* track process with this pid */ + STATS_TRACK_PPID, /* track processes with this ppid */ + STATS_TRACK_GID, /* track processes with this gid */ + STATS_TRACK_LAST, +}; + +/* flags for sbi->ll_flags */ +#define LL_SBI_NOLCK 0x01 /* DLM locking disabled (directio-only) */ +#define LL_SBI_CHECKSUM 0x02 /* checksum each page as it's written */ +#define LL_SBI_FLOCK 0x04 +#define LL_SBI_USER_XATTR 0x08 /* support user xattr */ +#define LL_SBI_ACL 0x10 /* support ACL */ +#define LL_SBI_RMT_CLIENT 0x40 /* remote client */ +#define LL_SBI_MDS_CAPA 0x80 /* support mds capa */ +#define LL_SBI_OSS_CAPA 0x100 /* support oss capa */ +#define LL_SBI_LOCALFLOCK 0x200 /* Local flocks support by kernel */ +#define LL_SBI_LRU_RESIZE 0x400 /* lru resize support */ +#define LL_SBI_LAZYSTATFS 0x800 /* lazystatfs mount option */ +#define LL_SBI_SOM_PREVIEW 0x1000 /* SOM preview mount option */ +#define LL_SBI_32BIT_API 0x2000 /* generate 32 bit inodes. 
*/ +#define LL_SBI_64BIT_HASH 0x4000 /* support 64-bits dir hash/offset */ +#define LL_SBI_AGL_ENABLED 0x8000 /* enable agl */ +#define LL_SBI_VERBOSE 0x10000 /* verbose mount/umount */ +#define LL_SBI_LAYOUT_LOCK 0x20000 /* layout lock support */ +#define LL_SBI_USER_FID2PATH 0x40000 /* allow fid2path by unprivileged users */ +#define LL_SBI_XATTR_CACHE 0x80000 /* support for xattr cache */ + +#define LL_SBI_FLAGS { \ + "nolck", \ + "checksum", \ + "flock", \ + "xattr", \ + "acl", \ + "???", \ + "rmt_client", \ + "mds_capa", \ + "oss_capa", \ + "flock", \ + "lru_resize", \ + "lazy_statfs", \ + "som", \ + "32bit_api", \ + "64bit_hash", \ + "agl", \ + "verbose", \ + "layout", \ + "user_fid2path",\ + "xattr", \ +} + +#define RCE_HASHES 32 + +struct rmtacl_ctl_entry { + struct list_head rce_list; + pid_t rce_key; /* hash key */ + int rce_ops; /* acl operation type */ +}; + +struct rmtacl_ctl_table { + spinlock_t rct_lock; + struct list_head rct_entries[RCE_HASHES]; +}; + +#define EE_HASHES 32 + +struct eacl_table { + spinlock_t et_lock; + struct list_head et_entries[EE_HASHES]; +}; + +struct ll_sb_info { + struct list_head ll_list; + /* this protects pglist and ra_info. It isn't safe to + * grab from interrupt contexts */ + spinlock_t ll_lock; + spinlock_t ll_pp_extent_lock; /* pp_extent entry*/ + spinlock_t ll_process_lock; /* ll_rw_process_info */ + struct obd_uuid ll_sb_uuid; + struct obd_export *ll_md_exp; + struct obd_export *ll_dt_exp; + struct proc_dir_entry* ll_proc_root; + struct lu_fid ll_root_fid; /* root object fid */ + + int ll_flags; + unsigned int ll_umounting:1, + ll_xattr_cache_enabled:1; + struct list_head ll_conn_chain; /* per-conn chain of SBs */ + struct lustre_client_ocd ll_lco; + + struct list_head ll_orphan_dentry_list; /*please don't ask -p*/ + struct ll_close_queue *ll_lcq; + + struct lprocfs_stats *ll_stats; /* lprocfs stats counter */ + + struct cl_client_cache ll_cache; + + struct lprocfs_stats *ll_ra_stats; + + struct ll_ra_info ll_ra_info; + unsigned int ll_namelen; + struct file_operations *ll_fop; + + /* =0 - hold lock over whole read/write + * >0 - max. chunk to be read/written w/o lock re-acquiring */ + unsigned long ll_max_rw_chunk; + unsigned int ll_md_brw_size; /* used by readdir */ + + struct lu_site *ll_site; + struct cl_device *ll_cl; + /* Statistics */ + struct ll_rw_extents_info ll_rw_extents_info; + int ll_extent_process_count; + struct ll_rw_process_info ll_rw_process_info[LL_PROCESS_HIST_MAX]; + unsigned int ll_offset_process_count; + struct ll_rw_process_info ll_rw_offset_info[LL_OFFSET_HIST_MAX]; + unsigned int ll_rw_offset_entry_count; + int ll_stats_track_id; + enum stats_track_type ll_stats_track_type; + int ll_rw_stats_on; + + /* metadata stat-ahead */ + unsigned int ll_sa_max; /* max statahead RPCs */ + atomic_t ll_sa_total; /* statahead thread started + * count */ + atomic_t ll_sa_wrong; /* statahead thread stopped for + * low hit ratio */ + atomic_t ll_agl_total; /* AGL thread started count */ + + dev_t ll_sdev_orig; /* save s_dev before assign for + * clustered nfs */ + struct rmtacl_ctl_table ll_rct; + struct eacl_table ll_et; + __kernel_fsid_t ll_fsid; +}; + +#define LL_DEFAULT_MAX_RW_CHUNK (32 * 1024 * 1024) + +struct ll_ra_read { + pgoff_t lrr_start; + pgoff_t lrr_count; + struct task_struct *lrr_reader; + struct list_head lrr_linkage; +}; + +/* + * per file-descriptor read-ahead data. + */ +struct ll_readahead_state { + spinlock_t ras_lock; + /* + * index of the last page that read(2) needed and that wasn't in the + * cache. 
Used by ras_update() to detect seeks. + * + * XXX nikita: if access seeks into cached region, Lustre doesn't see + * this. + */ + unsigned long ras_last_readpage; + /* + * number of pages read after last read-ahead window reset. As window + * is reset on each seek, this is effectively a number of consecutive + * accesses. Maybe ->ras_accessed_in_window is better name. + * + * XXX nikita: window is also reset (by ras_update()) when Lustre + * believes that memory pressure evicts read-ahead pages. In that + * case, it probably doesn't make sense to expand window to + * PTLRPC_MAX_BRW_PAGES on the third access. + */ + unsigned long ras_consecutive_pages; + /* + * number of read requests after the last read-ahead window reset + * As window is reset on each seek, this is effectively the number + * on consecutive read request and is used to trigger read-ahead. + */ + unsigned long ras_consecutive_requests; + /* + * Parameters of current read-ahead window. Handled by + * ras_update(). On the initial access to the file or after a seek, + * window is reset to 0. After 3 consecutive accesses, window is + * expanded to PTLRPC_MAX_BRW_PAGES. Afterwards, window is enlarged by + * PTLRPC_MAX_BRW_PAGES chunks up to ->ra_max_pages. + */ + unsigned long ras_window_start, ras_window_len; + /* + * Where next read-ahead should start at. This lies within read-ahead + * window. Read-ahead window is read in pieces rather than at once + * because: 1. lustre limits total number of pages under read-ahead by + * ->ra_max_pages (see ll_ra_count_get()), 2. client cannot read pages + * not covered by DLM lock. + */ + unsigned long ras_next_readahead; + /* + * Total number of ll_file_read requests issued, reads originating + * due to mmap are not counted in this total. This value is used to + * trigger full file read-ahead after multiple reads to a small file. + */ + unsigned long ras_requests; + /* + * Page index with respect to the current request, these value + * will not be accurate when dealing with reads issued via mmap. + */ + unsigned long ras_request_index; + /* + * list of struct ll_ra_read's one per read(2) call current in + * progress against this file descriptor. Used by read-ahead code, + * protected by ->ras_lock. + */ + struct list_head ras_read_beads; + /* + * The following 3 items are used for detecting the stride I/O + * mode. + * In stride I/O mode, + * ...............|-----data-----|****gap*****|--------|******|.... + * offset |-stride_pages-|-stride_gap-| + * ras_stride_offset = offset; + * ras_stride_length = stride_pages + stride_gap; + * ras_stride_pages = stride_pages; + * Note: all these three items are counted by pages. + */ + unsigned long ras_stride_length; + unsigned long ras_stride_pages; + pgoff_t ras_stride_offset; + /* + * number of consecutive stride request count, and it is similar as + * ras_consecutive_requests, but used for stride I/O mode. + * Note: only more than 2 consecutive stride request are detected, + * stride read-ahead will be enable + */ + unsigned long ras_consecutive_stride_requests; +}; + +extern struct kmem_cache *ll_file_data_slab; +struct lustre_handle; +struct ll_file_data { + struct ll_readahead_state fd_ras; + struct ccc_grouplock fd_grouplock; + __u64 lfd_pos; + __u32 fd_flags; + fmode_t fd_omode; + /* openhandle if lease exists for this file. + * Borrow lli->lli_och_mutex to protect assignment */ + struct obd_client_handle *fd_lease_och; + struct obd_client_handle *fd_och; + struct file *fd_file; + /* Indicate whether need to report failure when close. 
+ * true: failure is known, not report again. + * false: unknown failure, should report. */ + bool fd_write_failed; +}; + +struct lov_stripe_md; + +extern spinlock_t inode_lock; + +extern struct proc_dir_entry *proc_lustre_fs_root; + +static inline struct inode *ll_info2i(struct ll_inode_info *lli) +{ + return &lli->lli_vfs_inode; +} + +__u32 ll_i2suppgid(struct inode *i); +void ll_i2gids(__u32 *suppgids, struct inode *i1, struct inode *i2); + +static inline int ll_need_32bit_api(struct ll_sb_info *sbi) +{ +#if BITS_PER_LONG == 32 + return 1; +#elif defined(CONFIG_COMPAT) + return unlikely(is_compat_task() || (sbi->ll_flags & LL_SBI_32BIT_API)); +#else + return unlikely(sbi->ll_flags & LL_SBI_32BIT_API); +#endif +} + +void ll_ra_read_in(struct file *f, struct ll_ra_read *rar); +void ll_ra_read_ex(struct file *f, struct ll_ra_read *rar); +struct ll_ra_read *ll_ra_read_get(struct file *f); + +/* llite/lproc_llite.c */ +#if defined (CONFIG_PROC_FS) +int lprocfs_register_mountpoint(struct proc_dir_entry *parent, + struct super_block *sb, char *osc, char *mdc); +void lprocfs_unregister_mountpoint(struct ll_sb_info *sbi); +void ll_stats_ops_tally(struct ll_sb_info *sbi, int op, int count); +void lprocfs_llite_init_vars(struct lprocfs_static_vars *lvars); +void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid, + struct ll_file_data *file, loff_t pos, + size_t count, int rw); +#else +static inline int lprocfs_register_mountpoint(struct proc_dir_entry *parent, + struct super_block *sb, char *osc, char *mdc){return 0;} +static inline void lprocfs_unregister_mountpoint(struct ll_sb_info *sbi) {} +static inline +void ll_stats_ops_tally(struct ll_sb_info *sbi, int op, int count) {} +static inline void lprocfs_llite_init_vars(struct lprocfs_static_vars *lvars) +{ + memset(lvars, 0, sizeof(*lvars)); +} +static inline void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid, + struct ll_file_data *file, loff_t pos, + size_t count, int rw) {} +#endif + + +/* llite/dir.c */ +void ll_release_page(struct page *page, int remove); +extern const struct file_operations ll_dir_operations; +extern const struct inode_operations ll_dir_inode_operations; +struct page *ll_get_dir_page(struct inode *dir, __u64 hash, + struct ll_dir_chain *chain); +int ll_dir_read(struct inode *inode, struct dir_context *ctx); + +int ll_get_mdt_idx(struct inode *inode); +/* llite/namei.c */ +extern const struct inode_operations ll_special_inode_operations; + +int ll_objects_destroy(struct ptlrpc_request *request, + struct inode *dir); +struct inode *ll_iget(struct super_block *sb, ino_t hash, + struct lustre_md *lic); +int ll_md_blocking_ast(struct ldlm_lock *, struct ldlm_lock_desc *, + void *data, int flag); +struct dentry *ll_splice_alias(struct inode *inode, struct dentry *de); +int ll_rmdir_entry(struct inode *dir, char *name, int namelen); + +/* llite/rw.c */ +int ll_prepare_write(struct file *, struct page *, unsigned from, unsigned to); +int ll_commit_write(struct file *, struct page *, unsigned from, unsigned to); +int ll_writepage(struct page *page, struct writeback_control *wbc); +int ll_writepages(struct address_space *, struct writeback_control *wbc); +int ll_readpage(struct file *file, struct page *page); +void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras); +int ll_readahead(const struct lu_env *env, struct cl_io *io, + struct ll_readahead_state *ras, struct address_space *mapping, + struct cl_page_list *queue, int flags); + +#ifndef MS_HAS_NEW_AOPS +extern const struct address_space_operations 
ll_aops; +#else +extern const struct address_space_operations_ext ll_aops; +#endif + +/* llite/file.c */ +extern struct file_operations ll_file_operations; +extern struct file_operations ll_file_operations_flock; +extern struct file_operations ll_file_operations_noflock; +extern struct inode_operations ll_file_inode_operations; +extern int ll_have_md_lock(struct inode *inode, __u64 *bits, + ldlm_mode_t l_req_mode); +extern ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits, + struct lustre_handle *lockh, __u64 flags, + ldlm_mode_t mode); +int ll_file_open(struct inode *inode, struct file *file); +int ll_file_release(struct inode *inode, struct file *file); +int ll_glimpse_ioctl(struct ll_sb_info *sbi, + struct lov_stripe_md *lsm, lstat_t *st); +void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch); +int ll_release_openhandle(struct inode *, struct lookup_intent *); +int ll_md_real_close(struct inode *inode, fmode_t fmode); +void ll_ioepoch_close(struct inode *inode, struct md_op_data *op_data, + struct obd_client_handle **och, unsigned long flags); +void ll_done_writing_attr(struct inode *inode, struct md_op_data *op_data); +int ll_som_update(struct inode *inode, struct md_op_data *op_data); +int ll_inode_getattr(struct inode *inode, struct obdo *obdo, + __u64 ioepoch, int sync); +void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data, + struct lustre_handle *fh); +int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat); +struct posix_acl *ll_get_acl(struct inode *inode, int type); + +int ll_inode_permission(struct inode *inode, int mask); + +int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry, + int flags, struct lov_user_md *lum, + int lum_size); +int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename, + struct lov_mds_md **lmm, int *lmm_size, + struct ptlrpc_request **request); +int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump, + int set_default); +int ll_dir_getstripe(struct inode *inode, struct lov_mds_md **lmmp, + int *lmm_size, struct ptlrpc_request **request); +int ll_fsync(struct file *file, loff_t start, loff_t end, int data); +int ll_merge_lvb(const struct lu_env *env, struct inode *inode); +int ll_fid2path(struct inode *inode, void __user *arg); +int ll_data_version(struct inode *inode, __u64 *data_version, int extent_lock); +int ll_hsm_release(struct inode *inode); + +/* llite/dcache.c */ + +int ll_d_init(struct dentry *de); +extern const struct dentry_operations ll_d_ops; +void ll_intent_drop_lock(struct lookup_intent *); +void ll_intent_release(struct lookup_intent *); +void ll_invalidate_aliases(struct inode *); +void ll_lookup_finish_locks(struct lookup_intent *it, struct inode *inode); +int ll_revalidate_it_finish(struct ptlrpc_request *request, + struct lookup_intent *it, struct inode *inode); + +/* llite/llite_lib.c */ +extern struct super_operations lustre_super_operations; + +void ll_lli_init(struct ll_inode_info *lli); +int ll_fill_super(struct super_block *sb, struct vfsmount *mnt); +void ll_put_super(struct super_block *sb); +void ll_kill_super(struct super_block *sb); +struct inode *ll_inode_from_resource_lock(struct ldlm_lock *lock); +void ll_clear_inode(struct inode *inode); +int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, bool hsm_import); +int ll_setattr(struct dentry *de, struct iattr *attr); +int ll_statfs(struct dentry *de, struct kstatfs *sfs); +int ll_statfs_internal(struct super_block *sb, struct obd_statfs *osfs, + __u64 max_age, 
__u32 flags); +void ll_update_inode(struct inode *inode, struct lustre_md *md); +void ll_read_inode2(struct inode *inode, void *opaque); +void ll_delete_inode(struct inode *inode); +int ll_iocontrol(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg); +int ll_flush_ctx(struct inode *inode); +void ll_umount_begin(struct super_block *sb); +int ll_remount_fs(struct super_block *sb, int *flags, char *data); +int ll_show_options(struct seq_file *seq, struct dentry *dentry); +void ll_dirty_page_discard_warn(struct page *page, int ioret); +int ll_prep_inode(struct inode **inode, struct ptlrpc_request *req, + struct super_block *, struct lookup_intent *); +int ll_obd_statfs(struct inode *inode, void *arg); +int ll_get_max_mdsize(struct ll_sb_info *sbi, int *max_mdsize); +int ll_get_default_mdsize(struct ll_sb_info *sbi, int *default_mdsize); +int ll_get_max_cookiesize(struct ll_sb_info *sbi, int *max_cookiesize); +int ll_get_default_cookiesize(struct ll_sb_info *sbi, int *default_cookiesize); +int ll_process_config(struct lustre_cfg *lcfg); +struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data, + struct inode *i1, struct inode *i2, + const char *name, int namelen, + int mode, __u32 opc, void *data); +void ll_finish_md_op_data(struct md_op_data *op_data); +int ll_get_obd_name(struct inode *inode, unsigned int cmd, unsigned long arg); +char *ll_get_fsname(struct super_block *sb, char *buf, int buflen); + +/* llite/llite_nfs.c */ +extern struct export_operations lustre_export_operations; +__u32 get_uuid2int(const char *name, int len); +void get_uuid2fsid(const char *name, int len, __kernel_fsid_t *fsid); +struct inode *search_inode_for_lustre(struct super_block *sb, + const struct lu_fid *fid); + +/* llite/symlink.c */ +extern struct inode_operations ll_fast_symlink_inode_operations; + +/* llite/llite_close.c */ +struct ll_close_queue { + spinlock_t lcq_lock; + struct list_head lcq_head; + wait_queue_head_t lcq_waitq; + struct completion lcq_comp; + atomic_t lcq_stop; +}; + +struct ccc_object *cl_inode2ccc(struct inode *inode); + + +void vvp_write_pending (struct ccc_object *club, struct ccc_page *page); +void vvp_write_complete(struct ccc_object *club, struct ccc_page *page); + +/* specific architecture can implement only part of this list */ +enum vvp_io_subtype { + /** normal IO */ + IO_NORMAL, + /** io started from splice_{read|write} */ + IO_SPLICE +}; + +/* IO subtypes */ +struct vvp_io { + /** io subtype */ + enum vvp_io_subtype cui_io_subtype; + + union { + struct { + struct pipe_inode_info *cui_pipe; + unsigned int cui_flags; + } splice; + struct vvp_fault_io { + /** + * Inode modification time that is checked across DLM + * lock request. + */ + time_t ft_mtime; + struct vm_area_struct *ft_vma; + /** + * locked page returned from vvp_io + */ + struct page *ft_vmpage; + struct vm_fault_api { + /** + * kernel fault info + */ + struct vm_fault *ft_vmf; + /** + * fault API used bitflags for return code. + */ + unsigned int ft_flags; + /** + * check that flags are from filemap_fault + */ + bool ft_flags_valid; + } fault; + } fault; + } u; + /** + * Read-ahead state used by read and page-fault IO contexts. + */ + struct ll_ra_read cui_bead; + /** + * Set when cui_bead has been initialized. + */ + int cui_ra_window_set; +}; + +/** + * IO arguments for various VFS I/O interfaces. 
+ */ +struct vvp_io_args { + /** normal/splice */ + enum vvp_io_subtype via_io_subtype; + + union { + struct { + struct kiocb *via_iocb; + struct iov_iter *via_iter; + } normal; + struct { + struct pipe_inode_info *via_pipe; + unsigned int via_flags; + } splice; + } u; +}; + +struct ll_cl_context { + void *lcc_cookie; + struct cl_io *lcc_io; + struct cl_page *lcc_page; + struct lu_env *lcc_env; + int lcc_refcheck; +}; + +struct vvp_thread_info { + struct vvp_io_args vti_args; + struct ra_io_arg vti_ria; + struct ll_cl_context vti_io_ctx; +}; + +static inline struct vvp_thread_info *vvp_env_info(const struct lu_env *env) +{ + extern struct lu_context_key vvp_key; + struct vvp_thread_info *info; + + info = lu_context_key_get(&env->le_ctx, &vvp_key); + LASSERT(info != NULL); + return info; +} + +static inline struct vvp_io_args *vvp_env_args(const struct lu_env *env, + enum vvp_io_subtype type) +{ + struct vvp_io_args *ret = &vvp_env_info(env)->vti_args; + + ret->via_io_subtype = type; + + return ret; +} + +struct vvp_session { + struct vvp_io vs_ios; +}; + +static inline struct vvp_session *vvp_env_session(const struct lu_env *env) +{ + extern struct lu_context_key vvp_session_key; + struct vvp_session *ses; + + ses = lu_context_key_get(env->le_ses, &vvp_session_key); + LASSERT(ses != NULL); + return ses; +} + +static inline struct vvp_io *vvp_env_io(const struct lu_env *env) +{ + return &vvp_env_session(env)->vs_ios; +} + +int vvp_global_init(void); +void vvp_global_fini(void); + +void ll_queue_done_writing(struct inode *inode, unsigned long flags); +void ll_close_thread_shutdown(struct ll_close_queue *lcq); +int ll_close_thread_start(struct ll_close_queue **lcq_ret); + +/* llite/llite_mmap.c */ + +int ll_teardown_mmaps(struct address_space *mapping, __u64 first, __u64 last); +int ll_file_mmap(struct file *file, struct vm_area_struct *vma); +void policy_from_vma(ldlm_policy_data_t *policy, + struct vm_area_struct *vma, unsigned long addr, size_t count); +struct vm_area_struct *our_vma(struct mm_struct *mm, unsigned long addr, + size_t count); + +static inline void ll_invalidate_page(struct page *vmpage) +{ + struct address_space *mapping = vmpage->mapping; + loff_t offset = vmpage->index << PAGE_CACHE_SHIFT; + + LASSERT(PageLocked(vmpage)); + if (mapping == NULL) + return; + + ll_teardown_mmaps(mapping, offset, offset + PAGE_CACHE_SIZE); + truncate_complete_page(mapping, vmpage); +} + +#define ll_s2sbi(sb) (s2lsi(sb)->lsi_llsbi) + +/* don't need an addref as the sb_info should be holding one */ +static inline struct obd_export *ll_s2dtexp(struct super_block *sb) +{ + return ll_s2sbi(sb)->ll_dt_exp; +} + +/* don't need an addref as the sb_info should be holding one */ +static inline struct obd_export *ll_s2mdexp(struct super_block *sb) +{ + return ll_s2sbi(sb)->ll_md_exp; +} + +static inline struct client_obd *sbi2mdc(struct ll_sb_info *sbi) +{ + struct obd_device *obd = sbi->ll_md_exp->exp_obd; + if (obd == NULL) + LBUG(); + return &obd->u.cli; +} + +/* FIXME: replace the name of this with LL_SB to conform to kernel stuff */ +static inline struct ll_sb_info *ll_i2sbi(struct inode *inode) +{ + return ll_s2sbi(inode->i_sb); +} + +static inline struct obd_export *ll_i2dtexp(struct inode *inode) +{ + return ll_s2dtexp(inode->i_sb); +} + +static inline struct obd_export *ll_i2mdexp(struct inode *inode) +{ + return ll_s2mdexp(inode->i_sb); +} + +static inline struct lu_fid *ll_inode2fid(struct inode *inode) +{ + struct lu_fid *fid; + + LASSERT(inode != NULL); + fid = &ll_i2info(inode)->lli_fid; + 
+ return fid; +} + +static inline __u64 ll_file_maxbytes(struct inode *inode) +{ + return ll_i2info(inode)->lli_maxbytes; +} + +/* llite/xattr.c */ +int ll_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags); +ssize_t ll_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size); +ssize_t ll_listxattr(struct dentry *dentry, char *buffer, size_t size); +int ll_removexattr(struct dentry *dentry, const char *name); + +/* llite/remote_perm.c */ +extern struct kmem_cache *ll_remote_perm_cachep; +extern struct kmem_cache *ll_rmtperm_hash_cachep; + +void free_rmtperm_hash(struct hlist_head *hash); +int ll_update_remote_perm(struct inode *inode, struct mdt_remote_perm *perm); +int lustre_check_remote_perm(struct inode *inode, int mask); + +/* llite/llite_capa.c */ +extern struct timer_list ll_capa_timer; + +int ll_capa_thread_start(void); +void ll_capa_thread_stop(void); +void ll_capa_timer_callback(unsigned long unused); + +struct obd_capa *ll_add_capa(struct inode *inode, struct obd_capa *ocapa); + +void ll_capa_open(struct inode *inode); +void ll_capa_close(struct inode *inode); + +struct obd_capa *ll_mdscapa_get(struct inode *inode); +struct obd_capa *ll_osscapa_get(struct inode *inode, __u64 opc); + +void ll_truncate_free_capa(struct obd_capa *ocapa); +void ll_clear_inode_capas(struct inode *inode); +void ll_print_capa_stat(struct ll_sb_info *sbi); + +/* llite/llite_cl.c */ +extern struct lu_device_type vvp_device_type; + +/** + * Common IO arguments for various VFS I/O interfaces. + */ +int cl_sb_init(struct super_block *sb); +int cl_sb_fini(struct super_block *sb); +void ll_io_init(struct cl_io *io, const struct file *file, int write); + +void ras_update(struct ll_sb_info *sbi, struct inode *inode, + struct ll_readahead_state *ras, unsigned long index, + unsigned hit); +void ll_ra_count_put(struct ll_sb_info *sbi, unsigned long len); +void ll_ra_stats_inc(struct address_space *mapping, enum ra_stat which); + +/* llite/llite_rmtacl.c */ +#ifdef CONFIG_FS_POSIX_ACL +struct eacl_entry { + struct list_head ee_list; + pid_t ee_key; /* hash key */ + struct lu_fid ee_fid; + int ee_type; /* ACL type for ACCESS or DEFAULT */ + ext_acl_xattr_header *ee_acl; +}; + +u64 rce_ops2valid(int ops); +struct rmtacl_ctl_entry *rct_search(struct rmtacl_ctl_table *rct, pid_t key); +int rct_add(struct rmtacl_ctl_table *rct, pid_t key, int ops); +int rct_del(struct rmtacl_ctl_table *rct, pid_t key); +void rct_init(struct rmtacl_ctl_table *rct); +void rct_fini(struct rmtacl_ctl_table *rct); + +void ee_free(struct eacl_entry *ee); +int ee_add(struct eacl_table *et, pid_t key, struct lu_fid *fid, int type, + ext_acl_xattr_header *header); +struct eacl_entry *et_search_del(struct eacl_table *et, pid_t key, + struct lu_fid *fid, int type); +void et_search_free(struct eacl_table *et, pid_t key); +void et_init(struct eacl_table *et); +void et_fini(struct eacl_table *et); +#else +static inline u64 rce_ops2valid(int ops) +{ + return 0; +} +#endif + +/* statahead.c */ + +#define LL_SA_RPC_MIN 2 +#define LL_SA_RPC_DEF 32 +#define LL_SA_RPC_MAX 8192 + +#define LL_SA_CACHE_BIT 5 +#define LL_SA_CACHE_SIZE (1 << LL_SA_CACHE_BIT) +#define LL_SA_CACHE_MASK (LL_SA_CACHE_SIZE - 1) + +/* per inode struct, for dir only */ +struct ll_statahead_info { + struct inode *sai_inode; + atomic_t sai_refcount; /* when access this struct, hold + * refcount */ + unsigned int sai_generation; /* generation for statahead */ + unsigned int sai_max; /* max ahead of lookup */ + __u64 
sai_sent; /* stat requests sent count */ + __u64 sai_replied; /* stat requests which received + * reply */ + __u64 sai_index; /* index of statahead entry */ + __u64 sai_index_wait; /* index of entry which is the + * caller is waiting for */ + __u64 sai_hit; /* hit count */ + __u64 sai_miss; /* miss count: + * for "ls -al" case, it includes + * hidden dentry miss; + * for "ls -l" case, it does not + * include hidden dentry miss. + * "sai_miss_hidden" is used for + * the later case. + */ + unsigned int sai_consecutive_miss; /* consecutive miss */ + unsigned int sai_miss_hidden;/* "ls -al", but first dentry + * is not a hidden one */ + unsigned int sai_skip_hidden;/* skipped hidden dentry count */ + unsigned int sai_ls_all:1, /* "ls -al", do stat-ahead for + * hidden entries */ + sai_agl_valid:1;/* AGL is valid for the dir */ + wait_queue_head_t sai_waitq; /* stat-ahead wait queue */ + struct ptlrpc_thread sai_thread; /* stat-ahead thread */ + struct ptlrpc_thread sai_agl_thread; /* AGL thread */ + struct list_head sai_entries; /* entry list */ + struct list_head sai_entries_received; /* entries returned */ + struct list_head sai_entries_stated; /* entries stated */ + struct list_head sai_entries_agl; /* AGL entries to be sent */ + struct list_head sai_cache[LL_SA_CACHE_SIZE]; + spinlock_t sai_cache_lock[LL_SA_CACHE_SIZE]; + atomic_t sai_cache_count; /* entry count in cache */ +}; + +int do_statahead_enter(struct inode *dir, struct dentry **dentry, + int only_unplug); +void ll_stop_statahead(struct inode *dir, void *key); + +static inline int ll_glimpse_size(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + int rc; + + down_read(&lli->lli_glimpse_sem); + rc = cl_glimpse_size(inode); + lli->lli_glimpse_time = cfs_time_current(); + up_read(&lli->lli_glimpse_sem); + return rc; +} + +static inline void +ll_statahead_mark(struct inode *dir, struct dentry *dentry) +{ + struct ll_inode_info *lli = ll_i2info(dir); + struct ll_statahead_info *sai = lli->lli_sai; + struct ll_dentry_data *ldd = ll_d2d(dentry); + + /* not the same process, don't mark */ + if (lli->lli_opendir_pid != current_pid()) + return; + + LASSERT(ldd != NULL); + if (sai != NULL) + ldd->lld_sa_generation = sai->sai_generation; +} + +static inline int +d_need_statahead(struct inode *dir, struct dentry *dentryp) +{ + struct ll_inode_info *lli; + struct ll_dentry_data *ldd; + + if (ll_i2sbi(dir)->ll_sa_max == 0) + return -EAGAIN; + + lli = ll_i2info(dir); + /* not the same process, don't statahead */ + if (lli->lli_opendir_pid != current_pid()) + return -EAGAIN; + + /* statahead has been stopped */ + if (lli->lli_opendir_key == NULL) + return -EAGAIN; + + ldd = ll_d2d(dentryp); + /* + * When stats a dentry, the system trigger more than once "revalidate" + * or "lookup", for "getattr", for "getxattr", and maybe for others. + * Under patchless client mode, the operation intent is not accurate, + * which maybe misguide the statahead thread. For example: + * The "revalidate" call for "getattr" and "getxattr" of a dentry maybe + * have the same operation intent -- "IT_GETATTR". + * In fact, one dentry should has only one chance to interact with the + * statahead thread, otherwise the statahead windows will be confused. 
+ * The solution is as following: + * Assign "lld_sa_generation" with "sai_generation" when a dentry + * "IT_GETATTR" for the first time, and the subsequent "IT_GETATTR" + * will bypass interacting with statahead thread for checking: + * "lld_sa_generation == lli_sai->sai_generation" + */ + if (ldd && lli->lli_sai && + ldd->lld_sa_generation == lli->lli_sai->sai_generation) + return -EAGAIN; + + return 1; +} + +static inline int +ll_statahead_enter(struct inode *dir, struct dentry **dentryp, int only_unplug) +{ + int ret; + + ret = d_need_statahead(dir, *dentryp); + if (ret <= 0) + return ret; + + return do_statahead_enter(dir, dentryp, only_unplug); +} + +/* llite ioctl register support routine */ +enum llioc_iter { + LLIOC_CONT = 0, + LLIOC_STOP +}; + +#define LLIOC_MAX_CMD 256 + +/* + * Rules to write a callback function: + * + * Parameters: + * @magic: Dynamic ioctl call routine will feed this value with the pointer + * returned to ll_iocontrol_register. Callback functions should use this + * data to check the potential collasion of ioctl cmd. If collasion is + * found, callback function should return LLIOC_CONT. + * @rcp: The result of ioctl command. + * + * Return values: + * If @magic matches the pointer returned by ll_iocontrol_data, the + * callback should return LLIOC_STOP; return LLIOC_STOP otherwise. + */ +typedef enum llioc_iter (*llioc_callback_t)(struct inode *inode, + struct file *file, unsigned int cmd, unsigned long arg, + void *magic, int *rcp); + +/* export functions */ +/* Register ioctl block dynamatically for a regular file. + * + * @cmd: the array of ioctl command set + * @count: number of commands in the @cmd + * @cb: callback function, it will be called if an ioctl command is found to + * belong to the command list @cmd. + * + * Return value: + * A magic pointer will be returned if success; + * otherwise, NULL will be returned. + * */ +void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd); +void ll_iocontrol_unregister(void *magic); + + +/* lclient compat stuff */ +#define cl_inode_info ll_inode_info +#define cl_i2info(info) ll_i2info(info) +#define cl_inode_mode(inode) ((inode)->i_mode) +#define cl_i2sbi ll_i2sbi + +static inline struct ll_file_data *cl_iattr2fd(struct inode *inode, + const struct iattr *attr) +{ + LASSERT(attr->ia_valid & ATTR_FILE); + return LUSTRE_FPRIVATE(attr->ia_file); +} + +static inline void cl_isize_lock(struct inode *inode) +{ + ll_inode_size_lock(inode); +} + +static inline void cl_isize_unlock(struct inode *inode) +{ + ll_inode_size_unlock(inode); +} + +static inline void cl_isize_write_nolock(struct inode *inode, loff_t kms) +{ + LASSERT(mutex_is_locked(&ll_i2info(inode)->lli_size_mutex)); + i_size_write(inode, kms); +} + +static inline void cl_isize_write(struct inode *inode, loff_t kms) +{ + ll_inode_size_lock(inode); + i_size_write(inode, kms); + ll_inode_size_unlock(inode); +} + +#define cl_isize_read(inode) i_size_read(inode) + +static inline int cl_merge_lvb(const struct lu_env *env, struct inode *inode) +{ + return ll_merge_lvb(env, inode); +} + +#define cl_inode_atime(inode) LTIME_S((inode)->i_atime) +#define cl_inode_ctime(inode) LTIME_S((inode)->i_ctime) +#define cl_inode_mtime(inode) LTIME_S((inode)->i_mtime) + +struct obd_capa *cl_capa_lookup(struct inode *inode, enum cl_req_type crt); + +int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end, + enum cl_fsync_mode mode, int ignore_layout); + +/** direct write pages */ +struct ll_dio_pages { + /** page array to be written. 
we don't support + * partial pages except the last one. */ + struct page **ldp_pages; + /* offset of each page */ + loff_t *ldp_offsets; + /** if ldp_offsets is NULL, it means a sequential + * pages to be written, then this is the file offset + * of the * first page. */ + loff_t ldp_start_offset; + /** how many bytes are to be written. */ + size_t ldp_size; + /** # of pages in the array. */ + int ldp_nr; +}; + +static inline void cl_stats_tally(struct cl_device *dev, enum cl_req_type crt, + int rc) +{ + int opc = (crt == CRT_READ) ? LPROC_LL_OSC_READ : + LPROC_LL_OSC_WRITE; + + ll_stats_ops_tally(ll_s2sbi(cl2ccc_dev(dev)->cdv_sb), opc, rc); +} + +extern ssize_t ll_direct_rw_pages(const struct lu_env *env, struct cl_io *io, + int rw, struct inode *inode, + struct ll_dio_pages *pv); + +static inline int ll_file_nolock(const struct file *file) +{ + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct inode *inode = file_inode(file); + + LASSERT(fd != NULL); + return ((fd->fd_flags & LL_FILE_IGNORE_LOCK) || + (ll_i2sbi(inode)->ll_flags & LL_SBI_NOLCK)); +} + +static inline void ll_set_lock_data(struct obd_export *exp, struct inode *inode, + struct lookup_intent *it, __u64 *bits) +{ + if (!it->d.lustre.it_lock_set) { + struct lustre_handle handle; + + /* If this inode is a remote object, it will get two + * separate locks in different namespaces, Master MDT, + * where the name entry is, will grant LOOKUP lock, + * remote MDT, where the object is, will grant + * UPDATE|PERM lock. The inode will be attached to both + * LOOKUP and PERM locks, so revoking either locks will + * case the dcache being cleared */ + if (it->d.lustre.it_remote_lock_mode) { + handle.cookie = it->d.lustre.it_remote_lock_handle; + CDEBUG(D_DLMTRACE, "setting l_data to inode %p(%lu/%u) for remote lock %#llx\n", + inode, + inode->i_ino, inode->i_generation, + handle.cookie); + md_set_lock_data(exp, &handle.cookie, inode, NULL); + } + + handle.cookie = it->d.lustre.it_lock_handle; + + CDEBUG(D_DLMTRACE, "setting l_data to inode %p (%lu/%u) for lock %#llx\n", + inode, inode->i_ino, + inode->i_generation, handle.cookie); + + md_set_lock_data(exp, &handle.cookie, inode, + &it->d.lustre.it_lock_bits); + it->d.lustre.it_lock_set = 1; + } + + if (bits != NULL) + *bits = it->d.lustre.it_lock_bits; +} + +static inline void ll_lock_dcache(struct inode *inode) +{ + spin_lock(&inode->i_lock); +} + +static inline void ll_unlock_dcache(struct inode *inode) +{ + spin_unlock(&inode->i_lock); +} + +static inline int d_lustre_invalid(const struct dentry *dentry) +{ + struct ll_dentry_data *lld = ll_d2d(dentry); + + return (lld == NULL) || lld->lld_invalid; +} + +static inline void __d_lustre_invalidate(struct dentry *dentry) +{ + struct ll_dentry_data *lld = ll_d2d(dentry); + + if (lld != NULL) + lld->lld_invalid = 1; +} + +/* + * Mark dentry INVALID, if dentry refcount is zero (this is normally case for + * ll_md_blocking_ast), unhash this dentry, and let dcache to reclaim it later; + * else dput() of the last refcount will unhash this dentry and kill it. + */ +static inline void d_lustre_invalidate(struct dentry *dentry, int nested) +{ + CDEBUG(D_DENTRY, "invalidate dentry %pd (%p) parent %p inode %p refc %d\n", + dentry, dentry, + dentry->d_parent, d_inode(dentry), d_count(dentry)); + + spin_lock_nested(&dentry->d_lock, + nested ? 
DENTRY_D_LOCK_NESTED : DENTRY_D_LOCK_NORMAL); + __d_lustre_invalidate(dentry); + if (d_count(dentry) == 0) + __d_drop(dentry); + spin_unlock(&dentry->d_lock); +} + +static inline void d_lustre_revalidate(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + LASSERT(ll_d2d(dentry) != NULL); + ll_d2d(dentry)->lld_invalid = 0; + spin_unlock(&dentry->d_lock); +} + +enum { + LL_LAYOUT_GEN_NONE = ((__u32)-2), /* layout lock was cancelled */ + LL_LAYOUT_GEN_EMPTY = ((__u32)-1) /* for empty layout */ +}; + +int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf); +int ll_layout_refresh(struct inode *inode, __u32 *gen); +int ll_layout_restore(struct inode *inode); + +int ll_xattr_init(void); +void ll_xattr_fini(void); + +#endif /* LLITE_INTERNAL_H */ diff --git a/kernel/drivers/staging/lustre/lustre/llite/llite_lib.c b/kernel/drivers/staging/lustre/lustre/llite/llite_lib.c new file mode 100644 index 000000000..a27af7882 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/llite_lib.c @@ -0,0 +1,2354 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/llite/llite_lib.c + * + * Lustre Light Super operations + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include +#include + +#include "../include/lustre_lite.h" +#include "../include/lustre_ha.h" +#include "../include/lustre_dlm.h" +#include "../include/lprocfs_status.h" +#include "../include/lustre_disk.h" +#include "../include/lustre_param.h" +#include "../include/lustre_log.h" +#include "../include/cl_object.h" +#include "../include/obd_cksum.h" +#include "llite_internal.h" + +struct kmem_cache *ll_file_data_slab; +struct proc_dir_entry *proc_lustre_fs_root; + +static LIST_HEAD(ll_super_blocks); +static DEFINE_SPINLOCK(ll_sb_lock); + +#ifndef log2 +#define log2(n) ffz(~(n)) +#endif + +static struct ll_sb_info *ll_init_sbi(void) +{ + struct ll_sb_info *sbi = NULL; + unsigned long pages; + unsigned long lru_page_max; + struct sysinfo si; + class_uuid_t uuid; + int i; + + sbi = kzalloc(sizeof(*sbi), GFP_NOFS); + if (!sbi) + return NULL; + + spin_lock_init(&sbi->ll_lock); + mutex_init(&sbi->ll_lco.lco_lock); + spin_lock_init(&sbi->ll_pp_extent_lock); + spin_lock_init(&sbi->ll_process_lock); + sbi->ll_rw_stats_on = 0; + + si_meminfo(&si); + pages = si.totalram - si.totalhigh; + if (pages >> (20 - PAGE_CACHE_SHIFT) < 512) + lru_page_max = pages / 2; + else + lru_page_max = (pages / 4) * 3; + + /* initialize lru data */ + atomic_set(&sbi->ll_cache.ccc_users, 0); + sbi->ll_cache.ccc_lru_max = lru_page_max; + atomic_set(&sbi->ll_cache.ccc_lru_left, lru_page_max); + spin_lock_init(&sbi->ll_cache.ccc_lru_lock); + INIT_LIST_HEAD(&sbi->ll_cache.ccc_lru); + + sbi->ll_ra_info.ra_max_pages_per_file = min(pages / 32, + SBI_DEFAULT_READAHEAD_MAX); + sbi->ll_ra_info.ra_max_pages = sbi->ll_ra_info.ra_max_pages_per_file; + sbi->ll_ra_info.ra_max_read_ahead_whole_pages = + SBI_DEFAULT_READAHEAD_WHOLE_MAX; + INIT_LIST_HEAD(&sbi->ll_conn_chain); + INIT_LIST_HEAD(&sbi->ll_orphan_dentry_list); + + ll_generate_random_uuid(uuid); + class_uuid_unparse(uuid, &sbi->ll_sb_uuid); + CDEBUG(D_CONFIG, "generated uuid: %s\n", sbi->ll_sb_uuid.uuid); + + spin_lock(&ll_sb_lock); + list_add_tail(&sbi->ll_list, &ll_super_blocks); + spin_unlock(&ll_sb_lock); + + sbi->ll_flags |= LL_SBI_VERBOSE; + sbi->ll_flags |= LL_SBI_CHECKSUM; + + sbi->ll_flags |= LL_SBI_LRU_RESIZE; + + for (i = 0; i <= LL_PROCESS_HIST_MAX; i++) { + spin_lock_init(&sbi->ll_rw_extents_info.pp_extents[i]. + pp_r_hist.oh_lock); + spin_lock_init(&sbi->ll_rw_extents_info.pp_extents[i]. 
+ pp_w_hist.oh_lock); + } + + /* metadata statahead is enabled by default */ + sbi->ll_sa_max = LL_SA_RPC_DEF; + atomic_set(&sbi->ll_sa_total, 0); + atomic_set(&sbi->ll_sa_wrong, 0); + atomic_set(&sbi->ll_agl_total, 0); + sbi->ll_flags |= LL_SBI_AGL_ENABLED; + + return sbi; +} + +static void ll_free_sbi(struct super_block *sb) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + + if (sbi != NULL) { + spin_lock(&ll_sb_lock); + list_del(&sbi->ll_list); + spin_unlock(&ll_sb_lock); + OBD_FREE(sbi, sizeof(*sbi)); + } +} + +static int client_common_fill_super(struct super_block *sb, char *md, char *dt, + struct vfsmount *mnt) +{ + struct inode *root = NULL; + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct obd_device *obd; + struct obd_capa *oc = NULL; + struct obd_statfs *osfs = NULL; + struct ptlrpc_request *request = NULL; + struct obd_connect_data *data = NULL; + struct obd_uuid *uuid; + struct md_op_data *op_data; + struct lustre_md lmd; + u64 valid; + int size, err, checksum; + + obd = class_name2obd(md); + if (!obd) { + CERROR("MD %s: not setup or attached\n", md); + return -EINVAL; + } + + data = kzalloc(sizeof(*data), GFP_NOFS); + if (!data) + return -ENOMEM; + + osfs = kzalloc(sizeof(*osfs), GFP_NOFS); + if (!osfs) { + OBD_FREE_PTR(data); + return -ENOMEM; + } + + if (proc_lustre_fs_root) { + err = lprocfs_register_mountpoint(proc_lustre_fs_root, sb, + dt, md); + if (err < 0) + CERROR("could not register mount in /proc/fs/lustre\n"); + } + + /* indicate the features supported by this client */ + data->ocd_connect_flags = OBD_CONNECT_IBITS | OBD_CONNECT_NODEVOH | + OBD_CONNECT_ATTRFID | + OBD_CONNECT_VERSION | OBD_CONNECT_BRW_SIZE | + OBD_CONNECT_MDS_CAPA | OBD_CONNECT_OSS_CAPA | + OBD_CONNECT_CANCELSET | OBD_CONNECT_FID | + OBD_CONNECT_AT | OBD_CONNECT_LOV_V3 | + OBD_CONNECT_RMT_CLIENT | OBD_CONNECT_VBR | + OBD_CONNECT_FULL20 | OBD_CONNECT_64BITHASH| + OBD_CONNECT_EINPROGRESS | + OBD_CONNECT_JOBSTATS | OBD_CONNECT_LVB_TYPE | + OBD_CONNECT_LAYOUTLOCK | + OBD_CONNECT_PINGLESS | + OBD_CONNECT_MAX_EASIZE | + OBD_CONNECT_FLOCK_DEAD | + OBD_CONNECT_DISP_STRIPE; + + if (sbi->ll_flags & LL_SBI_SOM_PREVIEW) + data->ocd_connect_flags |= OBD_CONNECT_SOM; + + if (sbi->ll_flags & LL_SBI_LRU_RESIZE) + data->ocd_connect_flags |= OBD_CONNECT_LRU_RESIZE; +#ifdef CONFIG_FS_POSIX_ACL + data->ocd_connect_flags |= OBD_CONNECT_ACL | OBD_CONNECT_UMASK; +#endif + + if (OBD_FAIL_CHECK(OBD_FAIL_MDC_LIGHTWEIGHT)) + /* flag mdc connection as lightweight, only used for test + * purpose, use with care */ + data->ocd_connect_flags |= OBD_CONNECT_LIGHTWEIGHT; + + data->ocd_ibits_known = MDS_INODELOCK_FULL; + data->ocd_version = LUSTRE_VERSION_CODE; + + if (sb->s_flags & MS_RDONLY) + data->ocd_connect_flags |= OBD_CONNECT_RDONLY; + if (sbi->ll_flags & LL_SBI_USER_XATTR) + data->ocd_connect_flags |= OBD_CONNECT_XATTR; + +#ifdef HAVE_MS_FLOCK_LOCK + /* force vfs to use lustre handler for flock() calls - bug 10743 */ + sb->s_flags |= MS_FLOCK_LOCK; +#endif +#ifdef MS_HAS_NEW_AOPS + sb->s_flags |= MS_HAS_NEW_AOPS; +#endif + + if (sbi->ll_flags & LL_SBI_FLOCK) + sbi->ll_fop = &ll_file_operations_flock; + else if (sbi->ll_flags & LL_SBI_LOCALFLOCK) + sbi->ll_fop = &ll_file_operations; + else + sbi->ll_fop = &ll_file_operations_noflock; + + /* real client */ + data->ocd_connect_flags |= OBD_CONNECT_REAL; + if (sbi->ll_flags & LL_SBI_RMT_CLIENT) + data->ocd_connect_flags |= OBD_CONNECT_RMT_CLIENT_FORCE; + + data->ocd_brw_size = MD_MAX_BRW_SIZE; + + err = obd_connect(NULL, &sbi->ll_md_exp, obd, &sbi->ll_sb_uuid, + data, NULL); + if 
(err == -EBUSY) { + LCONSOLE_ERROR_MSG(0x14f, "An MDT (md %s) is performing recovery, of which this client is not a part. Please wait for recovery to complete, abort, or time out.\n", + md); + goto out; + } else if (err) { + CERROR("cannot connect to %s: rc = %d\n", md, err); + goto out; + } + + sbi->ll_md_exp->exp_connect_data = *data; + + err = obd_fid_init(sbi->ll_md_exp->exp_obd, sbi->ll_md_exp, + LUSTRE_SEQ_METADATA); + if (err) { + CERROR("%s: Can't init metadata layer FID infrastructure, rc = %d\n", + sbi->ll_md_exp->exp_obd->obd_name, err); + goto out_md; + } + + /* For mount, we only need fs info from MDT0, and also in DNE, it + * can make sure the client can be mounted as long as MDT0 is + * available */ + err = obd_statfs(NULL, sbi->ll_md_exp, osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_FOR_MDT0); + if (err) + goto out_md_fid; + + /* This needs to be after statfs to ensure connect has finished. + * Note that "data" does NOT contain the valid connect reply. + * If connecting to a 1.8 server there will be no LMV device, so + * we can access the MDC export directly and exp_connect_flags will + * be non-zero, but if accessing an upgraded 2.1 server it will + * have the correct flags filled in. + * XXX: fill in the LMV exp_connect_flags from MDC(s). */ + valid = exp_connect_flags(sbi->ll_md_exp) & CLIENT_CONNECT_MDT_REQD; + if (exp_connect_flags(sbi->ll_md_exp) != 0 && + valid != CLIENT_CONNECT_MDT_REQD) { + char *buf; + + buf = kzalloc(PAGE_CACHE_SIZE, GFP_KERNEL); + obd_connect_flags2str(buf, PAGE_CACHE_SIZE, + valid ^ CLIENT_CONNECT_MDT_REQD, ","); + LCONSOLE_ERROR_MSG(0x170, "Server %s does not support feature(s) needed for correct operation of this client (%s). Please upgrade server or downgrade client.\n", + sbi->ll_md_exp->exp_obd->obd_name, buf); + OBD_FREE(buf, PAGE_CACHE_SIZE); + err = -EPROTO; + goto out_md_fid; + } + + size = sizeof(*data); + err = obd_get_info(NULL, sbi->ll_md_exp, sizeof(KEY_CONN_DATA), + KEY_CONN_DATA, &size, data, NULL); + if (err) { + CERROR("%s: Get connect data failed: rc = %d\n", + sbi->ll_md_exp->exp_obd->obd_name, err); + goto out_md_fid; + } + + LASSERT(osfs->os_bsize); + sb->s_blocksize = osfs->os_bsize; + sb->s_blocksize_bits = log2(osfs->os_bsize); + sb->s_magic = LL_SUPER_MAGIC; + sb->s_maxbytes = MAX_LFS_FILESIZE; + sbi->ll_namelen = osfs->os_namelen; + sbi->ll_max_rw_chunk = LL_DEFAULT_MAX_RW_CHUNK; + + if ((sbi->ll_flags & LL_SBI_USER_XATTR) && + !(data->ocd_connect_flags & OBD_CONNECT_XATTR)) { + LCONSOLE_INFO("Disabling user_xattr feature because it is not supported on the server\n"); + sbi->ll_flags &= ~LL_SBI_USER_XATTR; + } + + if (data->ocd_connect_flags & OBD_CONNECT_ACL) { +#ifdef MS_POSIXACL + sb->s_flags |= MS_POSIXACL; +#endif + sbi->ll_flags |= LL_SBI_ACL; + } else { + LCONSOLE_INFO("client wants to enable acl, but mdt not!\n"); +#ifdef MS_POSIXACL + sb->s_flags &= ~MS_POSIXACL; +#endif + sbi->ll_flags &= ~LL_SBI_ACL; + } + + if (data->ocd_connect_flags & OBD_CONNECT_RMT_CLIENT) { + if (!(sbi->ll_flags & LL_SBI_RMT_CLIENT)) { + sbi->ll_flags |= LL_SBI_RMT_CLIENT; + LCONSOLE_INFO("client is set as remote by default.\n"); + } + } else { + if (sbi->ll_flags & LL_SBI_RMT_CLIENT) { + sbi->ll_flags &= ~LL_SBI_RMT_CLIENT; + LCONSOLE_INFO("client claims to be remote, but server rejected, forced to be local.\n"); + } + } + + if (data->ocd_connect_flags & OBD_CONNECT_MDS_CAPA) { + LCONSOLE_INFO("client enabled MDS capability!\n"); + sbi->ll_flags |= LL_SBI_MDS_CAPA; + } + + if (data->ocd_connect_flags & 
OBD_CONNECT_OSS_CAPA) { + LCONSOLE_INFO("client enabled OSS capability!\n"); + sbi->ll_flags |= LL_SBI_OSS_CAPA; + } + + if (data->ocd_connect_flags & OBD_CONNECT_64BITHASH) + sbi->ll_flags |= LL_SBI_64BIT_HASH; + + if (data->ocd_connect_flags & OBD_CONNECT_BRW_SIZE) + sbi->ll_md_brw_size = data->ocd_brw_size; + else + sbi->ll_md_brw_size = PAGE_CACHE_SIZE; + + if (data->ocd_connect_flags & OBD_CONNECT_LAYOUTLOCK) { + LCONSOLE_INFO("Layout lock feature supported.\n"); + sbi->ll_flags |= LL_SBI_LAYOUT_LOCK; + } + + if (data->ocd_ibits_known & MDS_INODELOCK_XATTR) { + if (!(data->ocd_connect_flags & OBD_CONNECT_MAX_EASIZE)) { + LCONSOLE_INFO( + "%s: disabling xattr cache due to unknown maximum xattr size.\n", + dt); + } else { + sbi->ll_flags |= LL_SBI_XATTR_CACHE; + sbi->ll_xattr_cache_enabled = 1; + } + } + + obd = class_name2obd(dt); + if (!obd) { + CERROR("DT %s: not setup or attached\n", dt); + err = -ENODEV; + goto out_md_fid; + } + + data->ocd_connect_flags = OBD_CONNECT_GRANT | OBD_CONNECT_VERSION | + OBD_CONNECT_REQPORTAL | OBD_CONNECT_BRW_SIZE | + OBD_CONNECT_CANCELSET | OBD_CONNECT_FID | + OBD_CONNECT_SRVLOCK | OBD_CONNECT_TRUNCLOCK| + OBD_CONNECT_AT | OBD_CONNECT_RMT_CLIENT | + OBD_CONNECT_OSS_CAPA | OBD_CONNECT_VBR| + OBD_CONNECT_FULL20 | OBD_CONNECT_64BITHASH | + OBD_CONNECT_MAXBYTES | + OBD_CONNECT_EINPROGRESS | + OBD_CONNECT_JOBSTATS | OBD_CONNECT_LVB_TYPE | + OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_PINGLESS; + + if (sbi->ll_flags & LL_SBI_SOM_PREVIEW) + data->ocd_connect_flags |= OBD_CONNECT_SOM; + + if (!OBD_FAIL_CHECK(OBD_FAIL_OSC_CONNECT_CKSUM)) { + /* OBD_CONNECT_CKSUM should always be set, even if checksums are + * disabled by default, because it can still be enabled on the + * fly via /proc. As a consequence, we still need to come to an + * agreement on the supported algorithms at connect time */ + data->ocd_connect_flags |= OBD_CONNECT_CKSUM; + + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_CKSUM_ADLER_ONLY)) + data->ocd_cksum_types = OBD_CKSUM_ADLER; + else + data->ocd_cksum_types = cksum_types_supported_client(); + } + + data->ocd_connect_flags |= OBD_CONNECT_LRU_RESIZE; + if (sbi->ll_flags & LL_SBI_RMT_CLIENT) + data->ocd_connect_flags |= OBD_CONNECT_RMT_CLIENT_FORCE; + + CDEBUG(D_RPCTRACE, "ocd_connect_flags: %#llx ocd_version: %d ocd_grant: %d\n", + data->ocd_connect_flags, + data->ocd_version, data->ocd_grant); + + obd->obd_upcall.onu_owner = &sbi->ll_lco; + obd->obd_upcall.onu_upcall = cl_ocd_update; + + data->ocd_brw_size = DT_MAX_BRW_SIZE; + + err = obd_connect(NULL, &sbi->ll_dt_exp, obd, &sbi->ll_sb_uuid, data, + NULL); + if (err == -EBUSY) { + LCONSOLE_ERROR_MSG(0x150, "An OST (dt %s) is performing recovery, of which this client is not a part. 
Please wait for recovery to complete, abort, or time out.\n", + dt); + goto out_md; + } else if (err) { + CERROR("%s: Cannot connect to %s: rc = %d\n", + sbi->ll_dt_exp->exp_obd->obd_name, dt, err); + goto out_md; + } + + sbi->ll_dt_exp->exp_connect_data = *data; + + err = obd_fid_init(sbi->ll_dt_exp->exp_obd, sbi->ll_dt_exp, + LUSTRE_SEQ_METADATA); + if (err) { + CERROR("%s: Can't init data layer FID infrastructure, rc = %d\n", + sbi->ll_dt_exp->exp_obd->obd_name, err); + goto out_dt; + } + + mutex_lock(&sbi->ll_lco.lco_lock); + sbi->ll_lco.lco_flags = data->ocd_connect_flags; + sbi->ll_lco.lco_md_exp = sbi->ll_md_exp; + sbi->ll_lco.lco_dt_exp = sbi->ll_dt_exp; + mutex_unlock(&sbi->ll_lco.lco_lock); + + fid_zero(&sbi->ll_root_fid); + err = md_getstatus(sbi->ll_md_exp, &sbi->ll_root_fid, &oc); + if (err) { + CERROR("cannot mds_connect: rc = %d\n", err); + goto out_lock_cn_cb; + } + if (!fid_is_sane(&sbi->ll_root_fid)) { + CERROR("%s: Invalid root fid "DFID" during mount\n", + sbi->ll_md_exp->exp_obd->obd_name, + PFID(&sbi->ll_root_fid)); + err = -EINVAL; + goto out_lock_cn_cb; + } + CDEBUG(D_SUPER, "rootfid "DFID"\n", PFID(&sbi->ll_root_fid)); + + sb->s_op = &lustre_super_operations; +#if THREAD_SIZE >= 8192 /*b=17630*/ + sb->s_export_op = &lustre_export_operations; +#endif + + /* make root inode + * XXX: move this to after cbd setup? */ + valid = OBD_MD_FLGETATTR | OBD_MD_FLBLOCKS | OBD_MD_FLMDSCAPA; + if (sbi->ll_flags & LL_SBI_RMT_CLIENT) + valid |= OBD_MD_FLRMTPERM; + else if (sbi->ll_flags & LL_SBI_ACL) + valid |= OBD_MD_FLACL; + + op_data = kzalloc(sizeof(*op_data), GFP_NOFS); + if (!op_data) { + err = -ENOMEM; + goto out_lock_cn_cb; + } + + op_data->op_fid1 = sbi->ll_root_fid; + op_data->op_mode = 0; + op_data->op_capa1 = oc; + op_data->op_valid = valid; + + err = md_getattr(sbi->ll_md_exp, op_data, &request); + if (oc) + capa_put(oc); + OBD_FREE_PTR(op_data); + if (err) { + CERROR("%s: md_getattr failed for root: rc = %d\n", + sbi->ll_md_exp->exp_obd->obd_name, err); + goto out_lock_cn_cb; + } + + err = md_get_lustre_md(sbi->ll_md_exp, request, sbi->ll_dt_exp, + sbi->ll_md_exp, &lmd); + if (err) { + CERROR("failed to understand root inode md: rc = %d\n", err); + ptlrpc_req_finished(request); + goto out_lock_cn_cb; + } + + LASSERT(fid_is_sane(&sbi->ll_root_fid)); + root = ll_iget(sb, cl_fid_build_ino(&sbi->ll_root_fid, + sbi->ll_flags & LL_SBI_32BIT_API), + &lmd); + md_free_lustre_md(sbi->ll_md_exp, &lmd); + ptlrpc_req_finished(request); + + if (root == NULL || IS_ERR(root)) { + if (lmd.lsm) + obd_free_memmd(sbi->ll_dt_exp, &lmd.lsm); +#ifdef CONFIG_FS_POSIX_ACL + if (lmd.posix_acl) { + posix_acl_release(lmd.posix_acl); + lmd.posix_acl = NULL; + } +#endif + err = IS_ERR(root) ? 
PTR_ERR(root) : -EBADF; + root = NULL; + CERROR("lustre_lite: bad iget4 for root\n"); + goto out_root; + } + + err = ll_close_thread_start(&sbi->ll_lcq); + if (err) { + CERROR("cannot start close thread: rc %d\n", err); + goto out_root; + } + +#ifdef CONFIG_FS_POSIX_ACL + if (sbi->ll_flags & LL_SBI_RMT_CLIENT) { + rct_init(&sbi->ll_rct); + et_init(&sbi->ll_et); + } +#endif + + checksum = sbi->ll_flags & LL_SBI_CHECKSUM; + err = obd_set_info_async(NULL, sbi->ll_dt_exp, sizeof(KEY_CHECKSUM), + KEY_CHECKSUM, sizeof(checksum), &checksum, + NULL); + cl_sb_init(sb); + + err = obd_set_info_async(NULL, sbi->ll_dt_exp, sizeof(KEY_CACHE_SET), + KEY_CACHE_SET, sizeof(sbi->ll_cache), + &sbi->ll_cache, NULL); + + sb->s_root = d_make_root(root); + if (sb->s_root == NULL) { + CERROR("%s: can't make root dentry\n", + ll_get_fsname(sb, NULL, 0)); + err = -ENOMEM; + goto out_lock_cn_cb; + } + + sbi->ll_sdev_orig = sb->s_dev; + + /* We set sb->s_dev equal on all lustre clients in order to support + * NFS export clustering. NFSD requires that the FSID be the same + * on all clients. */ + /* s_dev is also used in lt_compare() to compare two fs, but that is + * only a node-local comparison. */ + uuid = obd_get_uuid(sbi->ll_md_exp); + if (uuid != NULL) { + sb->s_dev = get_uuid2int(uuid->uuid, strlen(uuid->uuid)); + get_uuid2fsid(uuid->uuid, strlen(uuid->uuid), &sbi->ll_fsid); + } + + if (data != NULL) + OBD_FREE_PTR(data); + if (osfs != NULL) + OBD_FREE_PTR(osfs); + + return err; +out_root: + iput(root); +out_lock_cn_cb: + obd_fid_fini(sbi->ll_dt_exp->exp_obd); +out_dt: + obd_disconnect(sbi->ll_dt_exp); + sbi->ll_dt_exp = NULL; + /* Make sure all OScs are gone, since cl_cache is accessing sbi. */ + obd_zombie_barrier(); +out_md_fid: + obd_fid_fini(sbi->ll_md_exp->exp_obd); +out_md: + obd_disconnect(sbi->ll_md_exp); + sbi->ll_md_exp = NULL; +out: + if (data != NULL) + OBD_FREE_PTR(data); + if (osfs != NULL) + OBD_FREE_PTR(osfs); + lprocfs_unregister_mountpoint(sbi); + return err; +} + +int ll_get_max_mdsize(struct ll_sb_info *sbi, int *lmmsize) +{ + int size, rc; + + *lmmsize = obd_size_diskmd(sbi->ll_dt_exp, NULL); + size = sizeof(int); + rc = obd_get_info(NULL, sbi->ll_md_exp, sizeof(KEY_MAX_EASIZE), + KEY_MAX_EASIZE, &size, lmmsize, NULL); + if (rc) + CERROR("Get max mdsize error rc %d\n", rc); + + return rc; +} + +int ll_get_default_mdsize(struct ll_sb_info *sbi, int *lmmsize) +{ + int size, rc; + + size = sizeof(int); + rc = obd_get_info(NULL, sbi->ll_md_exp, sizeof(KEY_DEFAULT_EASIZE), + KEY_DEFAULT_EASIZE, &size, lmmsize, NULL); + if (rc) + CERROR("Get default mdsize error rc %d\n", rc); + + return rc; +} + +int ll_get_max_cookiesize(struct ll_sb_info *sbi, int *lmmsize) +{ + int size, rc; + + size = sizeof(int); + rc = obd_get_info(NULL, sbi->ll_md_exp, sizeof(KEY_MAX_COOKIESIZE), + KEY_MAX_COOKIESIZE, &size, lmmsize, NULL); + if (rc) + CERROR("Get max cookiesize error rc %d\n", rc); + + return rc; +} + +int ll_get_default_cookiesize(struct ll_sb_info *sbi, int *lmmsize) +{ + int size, rc; + + size = sizeof(int); + rc = obd_get_info(NULL, sbi->ll_md_exp, sizeof(KEY_DEFAULT_COOKIESIZE), + KEY_DEFAULT_COOKIESIZE, &size, lmmsize, NULL); + if (rc) + CERROR("Get default cookiesize error rc %d\n", rc); + + return rc; +} + +static void client_common_put_super(struct super_block *sb) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + +#ifdef CONFIG_FS_POSIX_ACL + if (sbi->ll_flags & LL_SBI_RMT_CLIENT) { + et_fini(&sbi->ll_et); + rct_fini(&sbi->ll_rct); + } +#endif + + ll_close_thread_shutdown(sbi->ll_lcq); + + 
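/*
 * [Editorial note: illustrative sketch only, not part of the imported patch.]
 * client_common_fill_super() above acquires its resources in a fixed order
 * (MD connect, MD FID init, statfs, DT connect, DT FID init, root inode,
 * close thread) and unwinds them in reverse through the out_* labels;
 * client_common_put_super() here releases the same resources in that
 * reverse order on a clean unmount.  A minimal user-space sketch of the
 * same goto-unwind idiom, with hypothetical resource names:
 *
 *	#include <stdlib.h>
 *
 *	static int setup_connections(void **md, void **dt)
 *	{
 *		*md = malloc(64);		// "connect" metadata export
 *		if (!*md)
 *			goto out;
 *		*dt = malloc(64);		// "connect" data export
 *		if (!*dt)
 *			goto out_md;		// undo only what is already held
 *		return 0;
 *
 *	out_md:
 *		free(*md);
 *		*md = NULL;
 *	out:
 *		return -1;
 *	}
 *
 *	static void teardown_connections(void *md, void *dt)
 *	{
 *		free(dt);			// release in reverse order
 *		free(md);
 *	}
 */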
cl_sb_fini(sb); + + list_del(&sbi->ll_conn_chain); + + obd_fid_fini(sbi->ll_dt_exp->exp_obd); + obd_disconnect(sbi->ll_dt_exp); + sbi->ll_dt_exp = NULL; + /* wait till all OSCs are gone, since cl_cache is accessing sbi. + * see LU-2543. */ + obd_zombie_barrier(); + + lprocfs_unregister_mountpoint(sbi); + + obd_fid_fini(sbi->ll_md_exp->exp_obd); + obd_disconnect(sbi->ll_md_exp); + sbi->ll_md_exp = NULL; +} + +void ll_kill_super(struct super_block *sb) +{ + struct ll_sb_info *sbi; + + /* not init sb ?*/ + if (!(sb->s_flags & MS_ACTIVE)) + return; + + sbi = ll_s2sbi(sb); + /* we need to restore s_dev from changed for clustered NFS before + * put_super because new kernels have cached s_dev and change sb->s_dev + * in put_super not affected real removing devices */ + if (sbi) { + sb->s_dev = sbi->ll_sdev_orig; + sbi->ll_umounting = 1; + } +} + +static inline int ll_set_opt(const char *opt, char *data, int fl) +{ + if (strncmp(opt, data, strlen(opt)) != 0) + return 0; + else + return fl; +} + +/* non-client-specific mount options are parsed in lmd_parse */ +static int ll_options(char *options, int *flags) +{ + int tmp; + char *s1 = options, *s2; + + if (!options) + return 0; + + CDEBUG(D_CONFIG, "Parsing opts %s\n", options); + + while (*s1) { + CDEBUG(D_SUPER, "next opt=%s\n", s1); + tmp = ll_set_opt("nolock", s1, LL_SBI_NOLCK); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("flock", s1, LL_SBI_FLOCK); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("localflock", s1, LL_SBI_LOCALFLOCK); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("noflock", s1, LL_SBI_FLOCK|LL_SBI_LOCALFLOCK); + if (tmp) { + *flags &= ~tmp; + goto next; + } + tmp = ll_set_opt("user_xattr", s1, LL_SBI_USER_XATTR); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("nouser_xattr", s1, LL_SBI_USER_XATTR); + if (tmp) { + *flags &= ~tmp; + goto next; + } + tmp = ll_set_opt("remote_client", s1, LL_SBI_RMT_CLIENT); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("user_fid2path", s1, LL_SBI_USER_FID2PATH); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("nouser_fid2path", s1, LL_SBI_USER_FID2PATH); + if (tmp) { + *flags &= ~tmp; + goto next; + } + + tmp = ll_set_opt("checksum", s1, LL_SBI_CHECKSUM); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("nochecksum", s1, LL_SBI_CHECKSUM); + if (tmp) { + *flags &= ~tmp; + goto next; + } + tmp = ll_set_opt("lruresize", s1, LL_SBI_LRU_RESIZE); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("nolruresize", s1, LL_SBI_LRU_RESIZE); + if (tmp) { + *flags &= ~tmp; + goto next; + } + tmp = ll_set_opt("lazystatfs", s1, LL_SBI_LAZYSTATFS); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("nolazystatfs", s1, LL_SBI_LAZYSTATFS); + if (tmp) { + *flags &= ~tmp; + goto next; + } + tmp = ll_set_opt("som_preview", s1, LL_SBI_SOM_PREVIEW); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("32bitapi", s1, LL_SBI_32BIT_API); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("verbose", s1, LL_SBI_VERBOSE); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("noverbose", s1, LL_SBI_VERBOSE); + if (tmp) { + *flags &= ~tmp; + goto next; + } + LCONSOLE_ERROR_MSG(0x152, "Unknown option '%s', won't mount.\n", + s1); + return -EINVAL; + +next: + /* Find next opt */ + s2 = strchr(s1, ','); + if (s2 == NULL) + break; + s1 = s2 + 1; + } + return 0; +} + +void ll_lli_init(struct ll_inode_info *lli) +{ + 
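/*
 * [Editorial note: illustrative sketch only, not part of the imported patch.]
 * ll_options() above walks the comma-separated client mount-option string;
 * ll_set_opt() is a plain prefix match that returns the flag bit for a
 * recognised option, and the caller either ORs that bit into *flags or,
 * for the "no..." variants, clears it.  A self-contained user-space
 * analogue of that loop (flag values and option names hypothetical):
 *
 *	#include <stdio.h>
 *	#include <string.h>
 *
 *	#define OPT_FLOCK	0x01
 *	#define OPT_CHECKSUM	0x02
 *
 *	static int set_opt(const char *opt, const char *s, int fl)
 *	{
 *		return strncmp(opt, s, strlen(opt)) ? 0 : fl;
 *	}
 *
 *	static void parse_opts(char *s, int *flags)
 *	{
 *		while (*s) {
 *			char *next = strchr(s, ',');
 *
 *			if (set_opt("noflock", s, OPT_FLOCK))
 *				*flags &= ~OPT_FLOCK;
 *			else if (set_opt("flock", s, OPT_FLOCK))
 *				*flags |= OPT_FLOCK;
 *			else if (set_opt("nochecksum", s, OPT_CHECKSUM))
 *				*flags &= ~OPT_CHECKSUM;
 *			else if (set_opt("checksum", s, OPT_CHECKSUM))
 *				*flags |= OPT_CHECKSUM;
 *
 *			if (!next)
 *				break;
 *			s = next + 1;
 *		}
 *	}
 *
 *	int main(void)
 *	{
 *		int flags = 0;
 *		char opts[] = "flock,nochecksum";
 *
 *		parse_opts(opts, &flags);
 *		printf("flags=%#x\n", flags);	// prints "flags=0x1"
 *		return 0;
 *	}
 */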
lli->lli_inode_magic = LLI_INODE_MAGIC; + lli->lli_flags = 0; + lli->lli_ioepoch = 0; + lli->lli_maxbytes = MAX_LFS_FILESIZE; + spin_lock_init(&lli->lli_lock); + lli->lli_posix_acl = NULL; + lli->lli_remote_perms = NULL; + mutex_init(&lli->lli_rmtperm_mutex); + /* Do not set lli_fid, it has been initialized already. */ + fid_zero(&lli->lli_pfid); + INIT_LIST_HEAD(&lli->lli_close_list); + INIT_LIST_HEAD(&lli->lli_oss_capas); + atomic_set(&lli->lli_open_count, 0); + lli->lli_mds_capa = NULL; + lli->lli_rmtperm_time = 0; + lli->lli_pending_och = NULL; + lli->lli_mds_read_och = NULL; + lli->lli_mds_write_och = NULL; + lli->lli_mds_exec_och = NULL; + lli->lli_open_fd_read_count = 0; + lli->lli_open_fd_write_count = 0; + lli->lli_open_fd_exec_count = 0; + mutex_init(&lli->lli_och_mutex); + spin_lock_init(&lli->lli_agl_lock); + lli->lli_has_smd = false; + spin_lock_init(&lli->lli_layout_lock); + ll_layout_version_set(lli, LL_LAYOUT_GEN_NONE); + lli->lli_clob = NULL; + + init_rwsem(&lli->lli_xattrs_list_rwsem); + mutex_init(&lli->lli_xattrs_enq_lock); + + LASSERT(lli->lli_vfs_inode.i_mode != 0); + if (S_ISDIR(lli->lli_vfs_inode.i_mode)) { + mutex_init(&lli->lli_readdir_mutex); + lli->lli_opendir_key = NULL; + lli->lli_sai = NULL; + spin_lock_init(&lli->lli_sa_lock); + lli->lli_opendir_pid = 0; + } else { + mutex_init(&lli->lli_size_mutex); + lli->lli_symlink_name = NULL; + init_rwsem(&lli->lli_trunc_sem); + mutex_init(&lli->lli_write_mutex); + init_rwsem(&lli->lli_glimpse_sem); + lli->lli_glimpse_time = 0; + INIT_LIST_HEAD(&lli->lli_agl_list); + lli->lli_agl_index = 0; + lli->lli_async_rc = 0; + } + mutex_init(&lli->lli_layout_mutex); +} + +static inline int ll_bdi_register(struct backing_dev_info *bdi) +{ + static atomic_t ll_bdi_num = ATOMIC_INIT(0); + + bdi->name = "lustre"; + return bdi_register(bdi, NULL, "lustre-%d", + atomic_inc_return(&ll_bdi_num)); +} + +int ll_fill_super(struct super_block *sb, struct vfsmount *mnt) +{ + struct lustre_profile *lprof = NULL; + struct lustre_sb_info *lsi = s2lsi(sb); + struct ll_sb_info *sbi; + char *dt = NULL, *md = NULL; + char *profilenm = get_profile_name(sb); + struct config_llog_instance *cfg; + /* %p for void* in printf needs 16+2 characters: 0xffffffffffffffff */ + const int instlen = sizeof(cfg->cfg_instance) * 2 + 2; + int err; + + CDEBUG(D_VFSTRACE, "VFS Op: sb %p\n", sb); + + cfg = kzalloc(sizeof(*cfg), GFP_NOFS); + if (!cfg) + return -ENOMEM; + + try_module_get(THIS_MODULE); + + /* client additional sb info */ + lsi->lsi_llsbi = sbi = ll_init_sbi(); + if (!sbi) { + module_put(THIS_MODULE); + OBD_FREE_PTR(cfg); + return -ENOMEM; + } + + err = ll_options(lsi->lsi_lmd->lmd_opts, &sbi->ll_flags); + if (err) + goto out_free; + + err = bdi_init(&lsi->lsi_bdi); + if (err) + goto out_free; + lsi->lsi_flags |= LSI_BDI_INITIALIZED; + lsi->lsi_bdi.capabilities = 0; + err = ll_bdi_register(&lsi->lsi_bdi); + if (err) + goto out_free; + + sb->s_bdi = &lsi->lsi_bdi; + /* kernel >= 2.6.38 store dentry operations in sb->s_d_op. */ + sb->s_d_op = &ll_d_ops; + + /* Generate a string unique to this super, in case some joker tries + to mount the same fs at two mount points. 
+ Use the address of the super itself.*/ + cfg->cfg_instance = sb; + cfg->cfg_uuid = lsi->lsi_llsbi->ll_sb_uuid; + cfg->cfg_callback = class_config_llog_handler; + /* set up client obds */ + err = lustre_process_log(sb, profilenm, cfg); + if (err < 0) { + CERROR("Unable to process log: %d\n", err); + goto out_free; + } + + /* Profile set with LCFG_MOUNTOPT so we can find our mdc and osc obds */ + lprof = class_get_profile(profilenm); + if (lprof == NULL) { + LCONSOLE_ERROR_MSG(0x156, "The client profile '%s' could not be read from the MGS. Does that filesystem exist?\n", + profilenm); + err = -EINVAL; + goto out_free; + } + CDEBUG(D_CONFIG, "Found profile %s: mdc=%s osc=%s\n", profilenm, + lprof->lp_md, lprof->lp_dt); + + dt = kasprintf(GFP_NOFS, "%s-%p", lprof->lp_dt, cfg->cfg_instance); + if (!dt) { + err = -ENOMEM; + goto out_free; + } + + md = kasprintf(GFP_NOFS, "%s-%p", lprof->lp_md, cfg->cfg_instance); + if (!md) { + err = -ENOMEM; + goto out_free; + } + + /* connections, registrations, sb setup */ + err = client_common_fill_super(sb, md, dt, mnt); + +out_free: + if (md) + OBD_FREE(md, strlen(lprof->lp_md) + instlen + 2); + if (dt) + OBD_FREE(dt, strlen(lprof->lp_dt) + instlen + 2); + if (err) + ll_put_super(sb); + else if (sbi->ll_flags & LL_SBI_VERBOSE) + LCONSOLE_WARN("Mounted %s\n", profilenm); + + OBD_FREE_PTR(cfg); + return err; +} /* ll_fill_super */ + +void ll_put_super(struct super_block *sb) +{ + struct config_llog_instance cfg, params_cfg; + struct obd_device *obd; + struct lustre_sb_info *lsi = s2lsi(sb); + struct ll_sb_info *sbi = ll_s2sbi(sb); + char *profilenm = get_profile_name(sb); + int next, force = 1; + + CDEBUG(D_VFSTRACE, "VFS Op: sb %p - %s\n", sb, profilenm); + + ll_print_capa_stat(sbi); + + cfg.cfg_instance = sb; + lustre_end_log(sb, profilenm, &cfg); + + params_cfg.cfg_instance = sb; + lustre_end_log(sb, PARAMS_FILENAME, ¶ms_cfg); + + if (sbi->ll_md_exp) { + obd = class_exp2obd(sbi->ll_md_exp); + if (obd) + force = obd->obd_force; + } + + /* We need to set force before the lov_disconnect in + lustre_common_put_super, since l_d cleans up osc's as well. */ + if (force) { + next = 0; + while ((obd = class_devices_in_group(&sbi->ll_sb_uuid, + &next)) != NULL) { + obd->obd_force = force; + } + } + + if (sbi->ll_lcq) { + /* Only if client_common_fill_super succeeded */ + client_common_put_super(sb); + } + + next = 0; + while ((obd = class_devices_in_group(&sbi->ll_sb_uuid, &next))) + class_manual_cleanup(obd); + + if (sbi->ll_flags & LL_SBI_VERBOSE) + LCONSOLE_WARN("Unmounted %s\n", profilenm ? profilenm : ""); + + if (profilenm) + class_del_profile(profilenm); + + if (lsi->lsi_flags & LSI_BDI_INITIALIZED) { + bdi_destroy(&lsi->lsi_bdi); + lsi->lsi_flags &= ~LSI_BDI_INITIALIZED; + } + + ll_free_sbi(sb); + lsi->lsi_llsbi = NULL; + + lustre_common_put_super(sb); + + module_put(THIS_MODULE); +} /* client_put_super */ + +struct inode *ll_inode_from_resource_lock(struct ldlm_lock *lock) +{ + struct inode *inode = NULL; + + /* NOTE: we depend on atomic igrab() -bzzz */ + lock_res_and_lock(lock); + if (lock->l_resource->lr_lvb_inode) { + struct ll_inode_info *lli; + + lli = ll_i2info(lock->l_resource->lr_lvb_inode); + if (lli->lli_inode_magic == LLI_INODE_MAGIC) { + inode = igrab(lock->l_resource->lr_lvb_inode); + } else { + inode = lock->l_resource->lr_lvb_inode; + LDLM_DEBUG_LIMIT(inode->i_state & I_FREEING ? 
D_INFO : + D_WARNING, lock, "lr_lvb_inode %p is bogus: magic %08x", + lock->l_resource->lr_lvb_inode, + lli->lli_inode_magic); + inode = NULL; + } + } + unlock_res_and_lock(lock); + return inode; +} + +void ll_clear_inode(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino, + inode->i_generation, inode); + + if (S_ISDIR(inode->i_mode)) { + /* these should have been cleared in ll_file_release */ + LASSERT(lli->lli_opendir_key == NULL); + LASSERT(lli->lli_sai == NULL); + LASSERT(lli->lli_opendir_pid == 0); + } + + spin_lock(&lli->lli_lock); + ll_i2info(inode)->lli_flags &= ~LLIF_MDS_SIZE_LOCK; + spin_unlock(&lli->lli_lock); + md_null_inode(sbi->ll_md_exp, ll_inode2fid(inode)); + + LASSERT(!lli->lli_open_fd_write_count); + LASSERT(!lli->lli_open_fd_read_count); + LASSERT(!lli->lli_open_fd_exec_count); + + if (lli->lli_mds_write_och) + ll_md_real_close(inode, FMODE_WRITE); + if (lli->lli_mds_exec_och) + ll_md_real_close(inode, FMODE_EXEC); + if (lli->lli_mds_read_och) + ll_md_real_close(inode, FMODE_READ); + + if (S_ISLNK(inode->i_mode) && lli->lli_symlink_name) { + OBD_FREE(lli->lli_symlink_name, + strlen(lli->lli_symlink_name) + 1); + lli->lli_symlink_name = NULL; + } + + ll_xattr_cache_destroy(inode); + + if (sbi->ll_flags & LL_SBI_RMT_CLIENT) { + LASSERT(lli->lli_posix_acl == NULL); + if (lli->lli_remote_perms) { + free_rmtperm_hash(lli->lli_remote_perms); + lli->lli_remote_perms = NULL; + } + } +#ifdef CONFIG_FS_POSIX_ACL + else if (lli->lli_posix_acl) { + LASSERT(atomic_read(&lli->lli_posix_acl->a_refcount) == 1); + LASSERT(lli->lli_remote_perms == NULL); + posix_acl_release(lli->lli_posix_acl); + lli->lli_posix_acl = NULL; + } +#endif + lli->lli_inode_magic = LLI_INODE_DEAD; + + ll_clear_inode_capas(inode); + if (!S_ISDIR(inode->i_mode)) + LASSERT(list_empty(&lli->lli_agl_list)); + + /* + * XXX This has to be done before lsm is freed below, because + * cl_object still uses inode lsm. + */ + cl_inode_fini(inode); + lli->lli_has_smd = false; +} + +static int ll_md_setattr(struct dentry *dentry, struct md_op_data *op_data, + struct md_open_data **mod) +{ + struct lustre_md md; + struct inode *inode = d_inode(dentry); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *request = NULL; + int rc, ia_valid; + + op_data = ll_prep_md_op_data(op_data, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + return PTR_ERR(op_data); + + rc = md_setattr(sbi->ll_md_exp, op_data, NULL, 0, NULL, 0, + &request, mod); + if (rc) { + ptlrpc_req_finished(request); + if (rc == -ENOENT) { + clear_nlink(inode); + /* Unlinked special device node? Or just a race? + * Pretend we done everything. */ + if (!S_ISREG(inode->i_mode) && + !S_ISDIR(inode->i_mode)) { + ia_valid = op_data->op_attr.ia_valid; + op_data->op_attr.ia_valid &= ~TIMES_SET_FLAGS; + rc = simple_setattr(dentry, &op_data->op_attr); + op_data->op_attr.ia_valid = ia_valid; + } + } else if (rc != -EPERM && rc != -EACCES && rc != -ETXTBSY) { + CERROR("md_setattr fails: rc = %d\n", rc); + } + return rc; + } + + rc = md_get_lustre_md(sbi->ll_md_exp, request, sbi->ll_dt_exp, + sbi->ll_md_exp, &md); + if (rc) { + ptlrpc_req_finished(request); + return rc; + } + + ia_valid = op_data->op_attr.ia_valid; + /* inode size will be in ll_setattr_ost, can't do it now since dirty + * cache is not cleared yet. 
*/ + op_data->op_attr.ia_valid &= ~(TIMES_SET_FLAGS | ATTR_SIZE); + rc = simple_setattr(dentry, &op_data->op_attr); + op_data->op_attr.ia_valid = ia_valid; + + /* Extract epoch data if obtained. */ + op_data->op_handle = md.body->handle; + op_data->op_ioepoch = md.body->ioepoch; + + ll_update_inode(inode, &md); + ptlrpc_req_finished(request); + + return rc; +} + +/* Close IO epoch and send Size-on-MDS attribute update. */ +static int ll_setattr_done_writing(struct inode *inode, + struct md_op_data *op_data, + struct md_open_data *mod) +{ + struct ll_inode_info *lli = ll_i2info(inode); + int rc = 0; + + LASSERT(op_data != NULL); + if (!S_ISREG(inode->i_mode)) + return 0; + + CDEBUG(D_INODE, "Epoch %llu closed on "DFID" for truncate\n", + op_data->op_ioepoch, PFID(&lli->lli_fid)); + + op_data->op_flags = MF_EPOCH_CLOSE; + ll_done_writing_attr(inode, op_data); + ll_pack_inode2opdata(inode, op_data, NULL); + + rc = md_done_writing(ll_i2sbi(inode)->ll_md_exp, op_data, mod); + if (rc == -EAGAIN) { + /* MDS has instructed us to obtain Size-on-MDS attribute + * from OSTs and send setattr to back to MDS. */ + rc = ll_som_update(inode, op_data); + } else if (rc) { + CERROR("inode %lu mdc truncate failed: rc = %d\n", + inode->i_ino, rc); + } + return rc; +} + +static int ll_setattr_ost(struct inode *inode, struct iattr *attr) +{ + struct obd_capa *capa; + int rc; + + if (attr->ia_valid & ATTR_SIZE) + capa = ll_osscapa_get(inode, CAPA_OPC_OSS_TRUNC); + else + capa = ll_mdscapa_get(inode); + + rc = cl_setattr_ost(inode, attr, capa); + + if (attr->ia_valid & ATTR_SIZE) + ll_truncate_free_capa(capa); + else + capa_put(capa); + + return rc; +} + + +/* If this inode has objects allocated to it (lsm != NULL), then the OST + * object(s) determine the file size and mtime. Otherwise, the MDS will + * keep these values until such a time that objects are allocated for it. + * We do the MDS operations first, as it is checking permissions for us. + * We don't to the MDS RPC if there is nothing that we want to store there, + * otherwise there is no harm in updating mtime/atime on the MDS if we are + * going to do an RPC anyways. + * + * If we are doing a truncate, we will send the mtime and ctime updates + * to the OST with the punch RPC, otherwise we do an explicit setattr RPC. + * I don't believe it is possible to get e.g. ATTR_MTIME_SET and ATTR_SIZE + * at the same time. + * + * In case of HSMimport, we only set attr on MDS. + */ +int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, bool hsm_import) +{ + struct inode *inode = d_inode(dentry); + struct ll_inode_info *lli = ll_i2info(inode); + struct md_op_data *op_data = NULL; + struct md_open_data *mod = NULL; + bool file_is_released = false; + int rc = 0, rc1 = 0; + + CDEBUG(D_VFSTRACE, + "%s: setattr inode %p/fid:"DFID + " from %llu to %llu, valid %x, hsm_import %d\n", + ll_get_fsname(inode->i_sb, NULL, 0), inode, + PFID(&lli->lli_fid), i_size_read(inode), attr->ia_size, + attr->ia_valid, hsm_import); + + if (attr->ia_valid & ATTR_SIZE) { + /* Check new size against VFS/VM file size limit and rlimit */ + rc = inode_newsize_ok(inode, attr->ia_size); + if (rc) + return rc; + + /* The maximum Lustre file size is variable, based on the + * OST maximum object size and number of stripes. This + * needs another check in addition to the VFS check above. 
*/ + if (attr->ia_size > ll_file_maxbytes(inode)) { + CDEBUG(D_INODE, "file "DFID" too large %llu > %llu\n", + PFID(&lli->lli_fid), attr->ia_size, + ll_file_maxbytes(inode)); + return -EFBIG; + } + + attr->ia_valid |= ATTR_MTIME | ATTR_CTIME; + } + + /* POSIX: check before ATTR_*TIME_SET set (from inode_change_ok) */ + if (attr->ia_valid & TIMES_SET_FLAGS) { + if ((!uid_eq(current_fsuid(), inode->i_uid)) && + !capable(CFS_CAP_FOWNER)) + return -EPERM; + } + + /* We mark all of the fields "set" so MDS/OST does not re-set them */ + if (attr->ia_valid & ATTR_CTIME) { + attr->ia_ctime = CURRENT_TIME; + attr->ia_valid |= ATTR_CTIME_SET; + } + if (!(attr->ia_valid & ATTR_ATIME_SET) && + (attr->ia_valid & ATTR_ATIME)) { + attr->ia_atime = CURRENT_TIME; + attr->ia_valid |= ATTR_ATIME_SET; + } + if (!(attr->ia_valid & ATTR_MTIME_SET) && + (attr->ia_valid & ATTR_MTIME)) { + attr->ia_mtime = CURRENT_TIME; + attr->ia_valid |= ATTR_MTIME_SET; + } + + if (attr->ia_valid & (ATTR_MTIME | ATTR_CTIME)) + CDEBUG(D_INODE, "setting mtime %lu, ctime %lu, now = %lu\n", + LTIME_S(attr->ia_mtime), LTIME_S(attr->ia_ctime), + get_seconds()); + + /* If we are changing file size, file content is modified, flag it. */ + if (attr->ia_valid & ATTR_SIZE) { + attr->ia_valid |= MDS_OPEN_OWNEROVERRIDE; + spin_lock(&lli->lli_lock); + lli->lli_flags |= LLIF_DATA_MODIFIED; + spin_unlock(&lli->lli_lock); + } + + /* We always do an MDS RPC, even if we're only changing the size; + * only the MDS knows whether truncate() should fail with -ETXTBUSY */ + + op_data = kzalloc(sizeof(*op_data), GFP_NOFS); + if (!op_data) + return -ENOMEM; + + if (!S_ISDIR(inode->i_mode)) { + if (attr->ia_valid & ATTR_SIZE) + inode_dio_write_done(inode); + mutex_unlock(&inode->i_mutex); + } + + memcpy(&op_data->op_attr, attr, sizeof(*attr)); + + /* Open epoch for truncate. 
*/ + if (exp_connect_som(ll_i2mdexp(inode)) && + (attr->ia_valid & (ATTR_SIZE | ATTR_MTIME | ATTR_MTIME_SET))) + op_data->op_flags = MF_EPOCH_OPEN; + + /* truncate on a released file must failed with -ENODATA, + * so size must not be set on MDS for released file + * but other attributes must be set + */ + if (S_ISREG(inode->i_mode)) { + struct lov_stripe_md *lsm; + __u32 gen; + + ll_layout_refresh(inode, &gen); + lsm = ccc_inode_lsm_get(inode); + if (lsm && lsm->lsm_pattern & LOV_PATTERN_F_RELEASED) + file_is_released = true; + ccc_inode_lsm_put(inode, lsm); + } + + /* if not in HSM import mode, clear size attr for released file + * we clear the attribute send to MDT in op_data, not the original + * received from caller in attr which is used later to + * decide return code */ + if (file_is_released && (attr->ia_valid & ATTR_SIZE) && !hsm_import) + op_data->op_attr.ia_valid &= ~ATTR_SIZE; + + rc = ll_md_setattr(dentry, op_data, &mod); + if (rc) + goto out; + + /* truncate failed (only when non HSM import), others succeed */ + if (file_is_released) { + if ((attr->ia_valid & ATTR_SIZE) && !hsm_import) + rc = -ENODATA; + else + rc = 0; + goto out; + } + + /* RPC to MDT is sent, cancel data modification flag */ + if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) { + spin_lock(&lli->lli_lock); + lli->lli_flags &= ~LLIF_DATA_MODIFIED; + spin_unlock(&lli->lli_lock); + } + + ll_ioepoch_open(lli, op_data->op_ioepoch); + if (!S_ISREG(inode->i_mode)) { + rc = 0; + goto out; + } + + if (attr->ia_valid & (ATTR_SIZE | + ATTR_ATIME | ATTR_ATIME_SET | + ATTR_MTIME | ATTR_MTIME_SET)) { + /* For truncate and utimes sending attributes to OSTs, setting + * mtime/atime to the past will be performed under PW [0:EOF] + * extent lock (new_size:EOF for truncate). It may seem + * excessive to send mtime/atime updates to OSTs when not + * setting times to past, but it is necessary due to possible + * time de-synchronization between MDT inode and OST objects */ + if (attr->ia_valid & ATTR_SIZE) + down_write(&lli->lli_trunc_sem); + rc = ll_setattr_ost(inode, attr); + if (attr->ia_valid & ATTR_SIZE) + up_write(&lli->lli_trunc_sem); + } +out: + if (op_data) { + if (op_data->op_ioepoch) { + rc1 = ll_setattr_done_writing(inode, op_data, mod); + if (!rc) + rc = rc1; + } + ll_finish_md_op_data(op_data); + } + if (!S_ISDIR(inode->i_mode)) { + mutex_lock(&inode->i_mutex); + if ((attr->ia_valid & ATTR_SIZE) && !hsm_import) + inode_dio_wait(inode); + } + + ll_stats_ops_tally(ll_i2sbi(inode), (attr->ia_valid & ATTR_SIZE) ? 
+ LPROC_LL_TRUNC : LPROC_LL_SETATTR, 1); + + return rc; +} + +int ll_setattr(struct dentry *de, struct iattr *attr) +{ + int mode = d_inode(de)->i_mode; + + if ((attr->ia_valid & (ATTR_CTIME|ATTR_SIZE|ATTR_MODE)) == + (ATTR_CTIME|ATTR_SIZE|ATTR_MODE)) + attr->ia_valid |= MDS_OPEN_OWNEROVERRIDE; + + if (((attr->ia_valid & (ATTR_MODE|ATTR_FORCE|ATTR_SIZE)) == + (ATTR_SIZE|ATTR_MODE)) && + (((mode & S_ISUID) && !(attr->ia_mode & S_ISUID)) || + (((mode & (S_ISGID|S_IXGRP)) == (S_ISGID|S_IXGRP)) && + !(attr->ia_mode & S_ISGID)))) + attr->ia_valid |= ATTR_FORCE; + + if ((attr->ia_valid & ATTR_MODE) && + (mode & S_ISUID) && + !(attr->ia_mode & S_ISUID) && + !(attr->ia_valid & ATTR_KILL_SUID)) + attr->ia_valid |= ATTR_KILL_SUID; + + if ((attr->ia_valid & ATTR_MODE) && + ((mode & (S_ISGID|S_IXGRP)) == (S_ISGID|S_IXGRP)) && + !(attr->ia_mode & S_ISGID) && + !(attr->ia_valid & ATTR_KILL_SGID)) + attr->ia_valid |= ATTR_KILL_SGID; + + return ll_setattr_raw(de, attr, false); +} + +int ll_statfs_internal(struct super_block *sb, struct obd_statfs *osfs, + __u64 max_age, __u32 flags) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct obd_statfs obd_osfs; + int rc; + + rc = obd_statfs(NULL, sbi->ll_md_exp, osfs, max_age, flags); + if (rc) { + CERROR("md_statfs fails: rc = %d\n", rc); + return rc; + } + + osfs->os_type = sb->s_magic; + + CDEBUG(D_SUPER, "MDC blocks %llu/%llu objects %llu/%llu\n", + osfs->os_bavail, osfs->os_blocks, osfs->os_ffree, + osfs->os_files); + + if (sbi->ll_flags & LL_SBI_LAZYSTATFS) + flags |= OBD_STATFS_NODELAY; + + rc = obd_statfs_rqset(sbi->ll_dt_exp, &obd_osfs, max_age, flags); + if (rc) { + CERROR("obd_statfs fails: rc = %d\n", rc); + return rc; + } + + CDEBUG(D_SUPER, "OSC blocks %llu/%llu objects %llu/%llu\n", + obd_osfs.os_bavail, obd_osfs.os_blocks, obd_osfs.os_ffree, + obd_osfs.os_files); + + osfs->os_bsize = obd_osfs.os_bsize; + osfs->os_blocks = obd_osfs.os_blocks; + osfs->os_bfree = obd_osfs.os_bfree; + osfs->os_bavail = obd_osfs.os_bavail; + + /* If we don't have as many objects free on the OST as inodes + * on the MDS, we reduce the total number of inodes to + * compensate, so that the "inodes in use" number is correct. + */ + if (obd_osfs.os_ffree < osfs->os_ffree) { + osfs->os_files = (osfs->os_files - osfs->os_ffree) + + obd_osfs.os_ffree; + osfs->os_ffree = obd_osfs.os_ffree; + } + + return rc; +} +int ll_statfs(struct dentry *de, struct kstatfs *sfs) +{ + struct super_block *sb = de->d_sb; + struct obd_statfs osfs; + int rc; + + CDEBUG(D_VFSTRACE, "VFS Op: at %llu jiffies\n", get_jiffies_64()); + ll_stats_ops_tally(ll_s2sbi(sb), LPROC_LL_STAFS, 1); + + /* Some amount of caching on the client is allowed */ + rc = ll_statfs_internal(sb, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + 0); + if (rc) + return rc; + + statfs_unpack(sfs, &osfs); + + /* We need to downshift for all 32-bit kernels, because we can't + * tell if the kernel is being called via sys_statfs64() or not. + * Stop before overflowing f_bsize - in which case it is better + * to just risk EOVERFLOW if caller is using old sys_statfs(). 
*/ + if (sizeof(long) < 8) { + while (osfs.os_blocks > ~0UL && sfs->f_bsize < 0x40000000) { + sfs->f_bsize <<= 1; + + osfs.os_blocks >>= 1; + osfs.os_bfree >>= 1; + osfs.os_bavail >>= 1; + } + } + + sfs->f_blocks = osfs.os_blocks; + sfs->f_bfree = osfs.os_bfree; + sfs->f_bavail = osfs.os_bavail; + sfs->f_fsid = ll_s2sbi(sb)->ll_fsid; + return 0; +} + +void ll_inode_size_lock(struct inode *inode) +{ + struct ll_inode_info *lli; + + LASSERT(!S_ISDIR(inode->i_mode)); + + lli = ll_i2info(inode); + mutex_lock(&lli->lli_size_mutex); +} + +void ll_inode_size_unlock(struct inode *inode) +{ + struct ll_inode_info *lli; + + lli = ll_i2info(inode); + mutex_unlock(&lli->lli_size_mutex); +} + +void ll_update_inode(struct inode *inode, struct lustre_md *md) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct mdt_body *body = md->body; + struct lov_stripe_md *lsm = md->lsm; + struct ll_sb_info *sbi = ll_i2sbi(inode); + + LASSERT((lsm != NULL) == ((body->valid & OBD_MD_FLEASIZE) != 0)); + if (lsm != NULL) { + if (!lli->lli_has_smd && + !(sbi->ll_flags & LL_SBI_LAYOUT_LOCK)) + cl_file_inode_init(inode, md); + + lli->lli_maxbytes = lsm->lsm_maxbytes; + if (lli->lli_maxbytes > MAX_LFS_FILESIZE) + lli->lli_maxbytes = MAX_LFS_FILESIZE; + } + + if (sbi->ll_flags & LL_SBI_RMT_CLIENT) { + if (body->valid & OBD_MD_FLRMTPERM) + ll_update_remote_perm(inode, md->remote_perm); + } +#ifdef CONFIG_FS_POSIX_ACL + else if (body->valid & OBD_MD_FLACL) { + spin_lock(&lli->lli_lock); + if (lli->lli_posix_acl) + posix_acl_release(lli->lli_posix_acl); + lli->lli_posix_acl = md->posix_acl; + spin_unlock(&lli->lli_lock); + } +#endif + inode->i_ino = cl_fid_build_ino(&body->fid1, + sbi->ll_flags & LL_SBI_32BIT_API); + inode->i_generation = cl_fid_build_gen(&body->fid1); + + if (body->valid & OBD_MD_FLATIME) { + if (body->atime > LTIME_S(inode->i_atime)) + LTIME_S(inode->i_atime) = body->atime; + lli->lli_lvb.lvb_atime = body->atime; + } + if (body->valid & OBD_MD_FLMTIME) { + if (body->mtime > LTIME_S(inode->i_mtime)) { + CDEBUG(D_INODE, "setting ino %lu mtime from %lu to %llu\n", + inode->i_ino, LTIME_S(inode->i_mtime), + body->mtime); + LTIME_S(inode->i_mtime) = body->mtime; + } + lli->lli_lvb.lvb_mtime = body->mtime; + } + if (body->valid & OBD_MD_FLCTIME) { + if (body->ctime > LTIME_S(inode->i_ctime)) + LTIME_S(inode->i_ctime) = body->ctime; + lli->lli_lvb.lvb_ctime = body->ctime; + } + if (body->valid & OBD_MD_FLMODE) + inode->i_mode = (inode->i_mode & S_IFMT)|(body->mode & ~S_IFMT); + if (body->valid & OBD_MD_FLTYPE) + inode->i_mode = (inode->i_mode & ~S_IFMT)|(body->mode & S_IFMT); + LASSERT(inode->i_mode != 0); + if (S_ISREG(inode->i_mode)) + inode->i_blkbits = min(PTLRPC_MAX_BRW_BITS + 1, + LL_MAX_BLKSIZE_BITS); + else + inode->i_blkbits = inode->i_sb->s_blocksize_bits; + if (body->valid & OBD_MD_FLUID) + inode->i_uid = make_kuid(&init_user_ns, body->uid); + if (body->valid & OBD_MD_FLGID) + inode->i_gid = make_kgid(&init_user_ns, body->gid); + if (body->valid & OBD_MD_FLFLAGS) + inode->i_flags = ll_ext_to_inode_flags(body->flags); + if (body->valid & OBD_MD_FLNLINK) + set_nlink(inode, body->nlink); + if (body->valid & OBD_MD_FLRDEV) + inode->i_rdev = old_decode_dev(body->rdev); + + if (body->valid & OBD_MD_FLID) { + /* FID shouldn't be changed! 
*/ + if (fid_is_sane(&lli->lli_fid)) { + LASSERTF(lu_fid_eq(&lli->lli_fid, &body->fid1), + "Trying to change FID "DFID + " to the "DFID", inode %lu/%u(%p)\n", + PFID(&lli->lli_fid), PFID(&body->fid1), + inode->i_ino, inode->i_generation, inode); + } else + lli->lli_fid = body->fid1; + } + + LASSERT(fid_seq(&lli->lli_fid) != 0); + + if (body->valid & OBD_MD_FLSIZE) { + if (exp_connect_som(ll_i2mdexp(inode)) && + S_ISREG(inode->i_mode)) { + struct lustre_handle lockh; + ldlm_mode_t mode; + + /* As it is possible a blocking ast has been processed + * by this time, we need to check there is an UPDATE + * lock on the client and set LLIF_MDS_SIZE_LOCK holding + * it. */ + mode = ll_take_md_lock(inode, MDS_INODELOCK_UPDATE, + &lockh, LDLM_FL_CBPENDING, + LCK_CR | LCK_CW | + LCK_PR | LCK_PW); + if (mode) { + if (lli->lli_flags & (LLIF_DONE_WRITING | + LLIF_EPOCH_PENDING | + LLIF_SOM_DIRTY)) { + CERROR("ino %lu flags %u still has size authority! do not trust the size got from MDS\n", + inode->i_ino, lli->lli_flags); + } else { + /* Use old size assignment to avoid + * deadlock bz14138 & bz14326 */ + i_size_write(inode, body->size); + spin_lock(&lli->lli_lock); + lli->lli_flags |= LLIF_MDS_SIZE_LOCK; + spin_unlock(&lli->lli_lock); + } + ldlm_lock_decref(&lockh, mode); + } + } else { + /* Use old size assignment to avoid + * deadlock bz14138 & bz14326 */ + i_size_write(inode, body->size); + + CDEBUG(D_VFSTRACE, "inode=%lu, updating i_size %llu\n", + inode->i_ino, (unsigned long long)body->size); + } + + if (body->valid & OBD_MD_FLBLOCKS) + inode->i_blocks = body->blocks; + } + + if (body->valid & OBD_MD_FLMDSCAPA) { + LASSERT(md->mds_capa); + ll_add_capa(inode, md->mds_capa); + } + if (body->valid & OBD_MD_FLOSSCAPA) { + LASSERT(md->oss_capa); + ll_add_capa(inode, md->oss_capa); + } + + if (body->valid & OBD_MD_TSTATE) { + if (body->t_state & MS_RESTORE) + lli->lli_flags |= LLIF_FILE_RESTORING; + } +} + +void ll_read_inode2(struct inode *inode, void *opaque) +{ + struct lustre_md *md = opaque; + struct ll_inode_info *lli = ll_i2info(inode); + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n", + PFID(&lli->lli_fid), inode); + + LASSERT(!lli->lli_has_smd); + + /* Core attributes from the MDS first. This is a new inode, and + * the VFS doesn't zero times in the core inode so we have to do + * it ourselves. They will be overwritten by either MDS or OST + * attributes - we just need to make sure they aren't newer. */ + LTIME_S(inode->i_mtime) = 0; + LTIME_S(inode->i_atime) = 0; + LTIME_S(inode->i_ctime) = 0; + inode->i_rdev = 0; + ll_update_inode(inode, md); + + /* OIDEBUG(inode); */ + + if (S_ISREG(inode->i_mode)) { + struct ll_sb_info *sbi = ll_i2sbi(inode); + + inode->i_op = &ll_file_inode_operations; + inode->i_fop = sbi->ll_fop; + inode->i_mapping->a_ops = (struct address_space_operations *)&ll_aops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &ll_dir_inode_operations; + inode->i_fop = &ll_dir_operations; + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = &ll_fast_symlink_inode_operations; + } else { + inode->i_op = &ll_special_inode_operations; + + init_special_inode(inode, inode->i_mode, + inode->i_rdev); + } +} + +void ll_delete_inode(struct inode *inode) +{ + struct cl_inode_info *lli = cl_i2info(inode); + + if (S_ISREG(inode->i_mode) && lli->lli_clob != NULL) + /* discard all dirty pages before truncating them, required by + * osc_extent implementation at LU-1030. 
*/ + cl_sync_file_range(inode, 0, OBD_OBJECT_EOF, + CL_FSYNC_DISCARD, 1); + + truncate_inode_pages_final(&inode->i_data); + + /* Workaround for LU-118 */ + if (inode->i_data.nrpages) { + spin_lock_irq(&inode->i_data.tree_lock); + spin_unlock_irq(&inode->i_data.tree_lock); + LASSERTF(inode->i_data.nrpages == 0, + "inode=%lu/%u(%p) nrpages=%lu, see http://jira.whamcloud.com/browse/LU-118\n", + inode->i_ino, inode->i_generation, inode, + inode->i_data.nrpages); + } + /* Workaround end */ + + ll_clear_inode(inode); + clear_inode(inode); +} + +int ll_iocontrol(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *req = NULL; + int rc, flags = 0; + + switch (cmd) { + case FSFILT_IOC_GETFLAGS: { + struct mdt_body *body; + struct md_op_data *op_data; + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, + 0, 0, LUSTRE_OPC_ANY, + NULL); + if (IS_ERR(op_data)) + return PTR_ERR(op_data); + + op_data->op_valid = OBD_MD_FLFLAGS; + rc = md_getattr(sbi->ll_md_exp, op_data, &req); + ll_finish_md_op_data(op_data); + if (rc) { + CERROR("failure %d inode %lu\n", rc, inode->i_ino); + return -abs(rc); + } + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + + flags = body->flags; + + ptlrpc_req_finished(req); + + return put_user(flags, (int *)arg); + } + case FSFILT_IOC_SETFLAGS: { + struct lov_stripe_md *lsm; + struct obd_info oinfo = { { { 0 } } }; + struct md_op_data *op_data; + + if (get_user(flags, (int *)arg)) + return -EFAULT; + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + return PTR_ERR(op_data); + + ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags = flags; + op_data->op_attr.ia_valid |= ATTR_ATTR_FLAG; + rc = md_setattr(sbi->ll_md_exp, op_data, + NULL, 0, NULL, 0, &req, NULL); + ll_finish_md_op_data(op_data); + ptlrpc_req_finished(req); + if (rc) + return rc; + + inode->i_flags = ll_ext_to_inode_flags(flags); + + lsm = ccc_inode_lsm_get(inode); + if (!lsm_has_objects(lsm)) { + ccc_inode_lsm_put(inode, lsm); + return 0; + } + + OBDO_ALLOC(oinfo.oi_oa); + if (!oinfo.oi_oa) { + ccc_inode_lsm_put(inode, lsm); + return -ENOMEM; + } + oinfo.oi_md = lsm; + oinfo.oi_oa->o_oi = lsm->lsm_oi; + oinfo.oi_oa->o_flags = flags; + oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | + OBD_MD_FLGROUP; + oinfo.oi_capa = ll_mdscapa_get(inode); + obdo_set_parent_fid(oinfo.oi_oa, &ll_i2info(inode)->lli_fid); + rc = obd_setattr_rqset(sbi->ll_dt_exp, &oinfo, NULL); + capa_put(oinfo.oi_capa); + OBDO_FREE(oinfo.oi_oa); + ccc_inode_lsm_put(inode, lsm); + + if (rc && rc != -EPERM && rc != -EACCES) + CERROR("osc_setattr_async fails: rc = %d\n", rc); + + return rc; + } + default: + return -ENOSYS; + } + + return 0; +} + +int ll_flush_ctx(struct inode *inode) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + + CDEBUG(D_SEC, "flush context for user %d\n", + from_kuid(&init_user_ns, current_uid())); + + obd_set_info_async(NULL, sbi->ll_md_exp, + sizeof(KEY_FLUSH_CTX), KEY_FLUSH_CTX, + 0, NULL, NULL); + obd_set_info_async(NULL, sbi->ll_dt_exp, + sizeof(KEY_FLUSH_CTX), KEY_FLUSH_CTX, + 0, NULL, NULL); + return 0; +} + +/* umount -f client means force down, don't save state */ +void ll_umount_begin(struct super_block *sb) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct obd_device *obd; + struct obd_ioctl_data *ioc_data; + + CDEBUG(D_VFSTRACE, "VFS Op: superblock %p count %d active %d\n", sb, + sb->s_count, atomic_read(&sb->s_active)); + + obd = 
class_exp2obd(sbi->ll_md_exp); + if (obd == NULL) { + CERROR("Invalid MDC connection handle %#llx\n", + sbi->ll_md_exp->exp_handle.h_cookie); + return; + } + obd->obd_force = 1; + + obd = class_exp2obd(sbi->ll_dt_exp); + if (obd == NULL) { + CERROR("Invalid LOV connection handle %#llx\n", + sbi->ll_dt_exp->exp_handle.h_cookie); + return; + } + obd->obd_force = 1; + + ioc_data = kzalloc(sizeof(*ioc_data), GFP_NOFS); + if (ioc_data) { + obd_iocontrol(IOC_OSC_SET_ACTIVE, sbi->ll_md_exp, + sizeof(*ioc_data), ioc_data, NULL); + + obd_iocontrol(IOC_OSC_SET_ACTIVE, sbi->ll_dt_exp, + sizeof(*ioc_data), ioc_data, NULL); + + OBD_FREE_PTR(ioc_data); + } + + /* Really, we'd like to wait until there are no requests outstanding, + * and then continue. For now, we just invalidate the requests, + * schedule() and sleep one second if needed, and hope. + */ + schedule(); +} + +int ll_remount_fs(struct super_block *sb, int *flags, char *data) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + char *profilenm = get_profile_name(sb); + int err; + __u32 read_only; + + if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { + read_only = *flags & MS_RDONLY; + err = obd_set_info_async(NULL, sbi->ll_md_exp, + sizeof(KEY_READ_ONLY), + KEY_READ_ONLY, sizeof(read_only), + &read_only, NULL); + if (err) { + LCONSOLE_WARN("Failed to remount %s %s (%d)\n", + profilenm, read_only ? + "read-only" : "read-write", err); + return err; + } + + if (read_only) + sb->s_flags |= MS_RDONLY; + else + sb->s_flags &= ~MS_RDONLY; + + if (sbi->ll_flags & LL_SBI_VERBOSE) + LCONSOLE_WARN("Remounted %s %s\n", profilenm, + read_only ? "read-only" : "read-write"); + } + return 0; +} + +int ll_prep_inode(struct inode **inode, struct ptlrpc_request *req, + struct super_block *sb, struct lookup_intent *it) +{ + struct ll_sb_info *sbi = NULL; + struct lustre_md md; + int rc; + + LASSERT(*inode || sb); + sbi = sb ? ll_s2sbi(sb) : ll_i2sbi(*inode); + rc = md_get_lustre_md(sbi->ll_md_exp, req, sbi->ll_dt_exp, + sbi->ll_md_exp, &md); + if (rc) + return rc; + + if (*inode) { + ll_update_inode(*inode, &md); + } else { + LASSERT(sb != NULL); + + /* + * At this point server returns to client's same fid as client + * generated for creating. So using ->fid1 is okay here. + */ + LASSERT(fid_is_sane(&md.body->fid1)); + + *inode = ll_iget(sb, cl_fid_build_ino(&md.body->fid1, + sbi->ll_flags & LL_SBI_32BIT_API), + &md); + if (*inode == NULL || IS_ERR(*inode)) { +#ifdef CONFIG_FS_POSIX_ACL + if (md.posix_acl) { + posix_acl_release(md.posix_acl); + md.posix_acl = NULL; + } +#endif + rc = IS_ERR(*inode) ? PTR_ERR(*inode) : -ENOMEM; + *inode = NULL; + CERROR("new_inode -fatal: rc %d\n", rc); + goto out; + } + } + + /* Handling piggyback layout lock. + * Layout lock can be piggybacked by getattr and open request. + * The lsm can be applied to inode only if it comes with a layout lock + * otherwise correct layout may be overwritten, for example: + * 1. proc1: mdt returns a lsm but not granting layout + * 2. layout was changed by another client + * 3. proc2: refresh layout and layout lock granted + * 4. 
proc1: to apply a stale layout */ + if (it != NULL && it->d.lustre.it_lock_mode != 0) { + struct lustre_handle lockh; + struct ldlm_lock *lock; + + lockh.cookie = it->d.lustre.it_lock_handle; + lock = ldlm_handle2lock(&lockh); + LASSERT(lock != NULL); + if (ldlm_has_layout(lock)) { + struct cl_object_conf conf; + + memset(&conf, 0, sizeof(conf)); + conf.coc_opc = OBJECT_CONF_SET; + conf.coc_inode = *inode; + conf.coc_lock = lock; + conf.u.coc_md = &md; + (void)ll_layout_conf(*inode, &conf); + } + LDLM_LOCK_PUT(lock); + } + +out: + if (md.lsm != NULL) + obd_free_memmd(sbi->ll_dt_exp, &md.lsm); + md_free_lustre_md(sbi->ll_md_exp, &md); + return rc; +} + +int ll_obd_statfs(struct inode *inode, void *arg) +{ + struct ll_sb_info *sbi = NULL; + struct obd_export *exp; + char *buf = NULL; + struct obd_ioctl_data *data = NULL; + __u32 type; + __u32 flags; + int len = 0, rc; + + if (!inode) { + rc = -EINVAL; + goto out_statfs; + } + + sbi = ll_i2sbi(inode); + if (!sbi) { + rc = -EINVAL; + goto out_statfs; + } + + rc = obd_ioctl_getdata(&buf, &len, arg); + if (rc) + goto out_statfs; + + data = (void *)buf; + if (!data->ioc_inlbuf1 || !data->ioc_inlbuf2 || + !data->ioc_pbuf1 || !data->ioc_pbuf2) { + rc = -EINVAL; + goto out_statfs; + } + + if (data->ioc_inllen1 != sizeof(__u32) || + data->ioc_inllen2 != sizeof(__u32) || + data->ioc_plen1 != sizeof(struct obd_statfs) || + data->ioc_plen2 != sizeof(struct obd_uuid)) { + rc = -EINVAL; + goto out_statfs; + } + + memcpy(&type, data->ioc_inlbuf1, sizeof(__u32)); + if (type & LL_STATFS_LMV) + exp = sbi->ll_md_exp; + else if (type & LL_STATFS_LOV) + exp = sbi->ll_dt_exp; + else { + rc = -ENODEV; + goto out_statfs; + } + + flags = (type & LL_STATFS_NODELAY) ? OBD_STATFS_NODELAY : 0; + rc = obd_iocontrol(IOC_OBD_STATFS, exp, len, buf, &flags); + if (rc) + goto out_statfs; +out_statfs: + if (buf) + obd_ioctl_freedata(buf, len); + return rc; +} + +int ll_process_config(struct lustre_cfg *lcfg) +{ + char *ptr; + void *sb; + struct lprocfs_static_vars lvars; + unsigned long x; + int rc = 0; + + lprocfs_llite_init_vars(&lvars); + + /* The instance name contains the sb: lustre-client-aacfe000 */ + ptr = strrchr(lustre_cfg_string(lcfg, 0), '-'); + if (!ptr || !*(++ptr)) + return -EINVAL; + rc = kstrtoul(ptr, 16, &x); + if (rc != 0) + return -EINVAL; + sb = (void *)x; + /* This better be a real Lustre superblock! */ + LASSERT(s2lsi((struct super_block *)sb)->lsi_lmd->lmd_magic == LMD_MAGIC); + + /* Note we have not called client_common_fill_super yet, so + proc fns must be able to handle that! */ + rc = class_process_proc_param(PARAM_LLITE, lvars.obd_vars, + lcfg, sb); + if (rc > 0) + rc = 0; + return rc; +} + +/* this function prepares md_op_data hint for passing ot down to MD stack. 
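/*
 * Illustrative sketch (not from the Lustre tree): ll_process_config() above
 * recovers the superblock cookie from a config instance name such as
 * "lustre-client-aacfe000" by taking the text after the last '-' and parsing
 * it as hexadecimal.  The userspace equivalent below uses strtoul() where the
 * kernel code uses kstrtoul(); the sample name is made up.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int parse_instance(const char *name, unsigned long *cookie)
{
	const char *ptr = strrchr(name, '-');

	if (!ptr || !*(++ptr))
		return -1;			/* no '-' or nothing after it */
	*cookie = strtoul(ptr, NULL, 16);
	return 0;
}

int main(void)
{
	unsigned long x;

	if (!parse_instance("lustre-client-aacfe000", &x))
		printf("sb cookie = 0x%lx\n", x);	/* 0xaacfe000 */
	return 0;
}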
*/ +struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data, + struct inode *i1, struct inode *i2, + const char *name, int namelen, + int mode, __u32 opc, void *data) +{ + LASSERT(i1 != NULL); + + if (namelen > ll_i2sbi(i1)->ll_namelen) + return ERR_PTR(-ENAMETOOLONG); + + if (op_data == NULL) + op_data = kzalloc(sizeof(*op_data), GFP_NOFS); + + if (op_data == NULL) + return ERR_PTR(-ENOMEM); + + ll_i2gids(op_data->op_suppgids, i1, i2); + op_data->op_fid1 = *ll_inode2fid(i1); + op_data->op_capa1 = ll_mdscapa_get(i1); + + if (i2) { + op_data->op_fid2 = *ll_inode2fid(i2); + op_data->op_capa2 = ll_mdscapa_get(i2); + } else { + fid_zero(&op_data->op_fid2); + op_data->op_capa2 = NULL; + } + + op_data->op_name = name; + op_data->op_namelen = namelen; + op_data->op_mode = mode; + op_data->op_mod_time = get_seconds(); + op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid()); + op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); + op_data->op_cap = cfs_curproc_cap_pack(); + op_data->op_bias = 0; + op_data->op_cli_flags = 0; + if ((opc == LUSTRE_OPC_CREATE) && (name != NULL) && + filename_is_volatile(name, namelen, NULL)) + op_data->op_bias |= MDS_CREATE_VOLATILE; + op_data->op_opc = opc; + op_data->op_mds = 0; + op_data->op_data = data; + + /* If the file is being opened after mknod() (normally due to NFS) + * try to use the default stripe data from parent directory for + * allocating OST objects. Try to pass the parent FID to MDS. */ + if (opc == LUSTRE_OPC_CREATE && i1 == i2 && S_ISREG(i2->i_mode) && + !ll_i2info(i2)->lli_has_smd) { + struct ll_inode_info *lli = ll_i2info(i2); + + spin_lock(&lli->lli_lock); + if (likely(!lli->lli_has_smd && !fid_is_zero(&lli->lli_pfid))) + op_data->op_fid1 = lli->lli_pfid; + spin_unlock(&lli->lli_lock); + /** We ignore parent's capability temporary. */ + } + + /* When called by ll_setattr_raw, file is i1. */ + if (LLIF_DATA_MODIFIED & ll_i2info(i1)->lli_flags) + op_data->op_bias |= MDS_DATA_MODIFIED; + + return op_data; +} + +void ll_finish_md_op_data(struct md_op_data *op_data) +{ + capa_put(op_data->op_capa1); + capa_put(op_data->op_capa2); + OBD_FREE_PTR(op_data); +} + +int ll_show_options(struct seq_file *seq, struct dentry *dentry) +{ + struct ll_sb_info *sbi; + + LASSERT((seq != NULL) && (dentry != NULL)); + sbi = ll_s2sbi(dentry->d_sb); + + if (sbi->ll_flags & LL_SBI_NOLCK) + seq_puts(seq, ",nolock"); + + if (sbi->ll_flags & LL_SBI_FLOCK) + seq_puts(seq, ",flock"); + + if (sbi->ll_flags & LL_SBI_LOCALFLOCK) + seq_puts(seq, ",localflock"); + + if (sbi->ll_flags & LL_SBI_USER_XATTR) + seq_puts(seq, ",user_xattr"); + + if (sbi->ll_flags & LL_SBI_LAZYSTATFS) + seq_puts(seq, ",lazystatfs"); + + if (sbi->ll_flags & LL_SBI_USER_FID2PATH) + seq_puts(seq, ",user_fid2path"); + + return 0; +} + +/** + * Get obd name by cmd, and copy out to user space + */ +int ll_get_obd_name(struct inode *inode, unsigned int cmd, unsigned long arg) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct obd_device *obd; + + if (cmd == OBD_IOC_GETDTNAME) + obd = class_exp2obd(sbi->ll_dt_exp); + else if (cmd == OBD_IOC_GETMDNAME) + obd = class_exp2obd(sbi->ll_md_exp); + else + return -EINVAL; + + if (!obd) + return -ENOENT; + + if (copy_to_user((void *)arg, obd->obd_name, + strlen(obd->obd_name) + 1)) + return -EFAULT; + + return 0; +} + +/** + * Get lustre file system name by \a sbi. If \a buf is provided(non-NULL), the + * fsname will be returned in this buffer; otherwise, a static buffer will be + * used to store the fsname and returned to caller. 
+ */ +char *ll_get_fsname(struct super_block *sb, char *buf, int buflen) +{ + static char fsname_static[MTI_NAME_MAXLEN]; + struct lustre_sb_info *lsi = s2lsi(sb); + char *ptr; + int len; + + if (buf == NULL) { + /* this means the caller wants to use static buffer + * and it doesn't care about race. Usually this is + * in error reporting path */ + buf = fsname_static; + buflen = sizeof(fsname_static); + } + + len = strlen(lsi->lsi_lmd->lmd_profile); + ptr = strrchr(lsi->lsi_lmd->lmd_profile, '-'); + if (ptr && (strcmp(ptr, "-client") == 0)) + len -= 7; + + if (unlikely(len >= buflen)) + len = buflen - 1; + strncpy(buf, lsi->lsi_lmd->lmd_profile, len); + buf[len] = '\0'; + + return buf; +} + +void ll_dirty_page_discard_warn(struct page *page, int ioret) +{ + char *buf, *path = NULL; + struct dentry *dentry = NULL; + struct ccc_object *obj = cl_inode2ccc(page->mapping->host); + + /* this can be called inside spin lock so use GFP_ATOMIC. */ + buf = (char *)__get_free_page(GFP_ATOMIC); + if (buf != NULL) { + dentry = d_find_alias(page->mapping->host); + if (dentry != NULL) + path = dentry_path_raw(dentry, buf, PAGE_SIZE); + } + + CDEBUG(D_WARNING, + "%s: dirty page discard: %s/fid: " DFID "/%s may get corrupted (rc %d)\n", + ll_get_fsname(page->mapping->host->i_sb, NULL, 0), + s2lsi(page->mapping->host->i_sb)->lsi_lmd->lmd_dev, + PFID(&obj->cob_header.coh_lu.loh_fid), + (path && !IS_ERR(path)) ? path : "", ioret); + + if (dentry != NULL) + dput(dentry); + + if (buf != NULL) + free_page((unsigned long)buf); +} diff --git a/kernel/drivers/staging/lustre/lustre/llite/llite_mmap.c b/kernel/drivers/staging/lustre/lustre/llite/llite_mmap.c new file mode 100644 index 000000000..a90214bb8 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/llite_mmap.c @@ -0,0 +1,492 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
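/*
 * Illustrative sketch (not from the Lustre tree): ll_get_fsname() above
 * derives the filesystem name from the mount profile by dropping a trailing
 * "-client" (7 characters), e.g. "lustre-client" -> "lustre".  Standalone
 * rendering of that string handling; the buffer size is arbitrary.
 */
#include <stdio.h>
#include <string.h>

static char *fsname_from_profile(const char *profile, char *buf, int buflen)
{
	int len = (int)strlen(profile);
	const char *ptr = strrchr(profile, '-');

	if (ptr && strcmp(ptr, "-client") == 0)
		len -= 7;
	if (len >= buflen)
		len = buflen - 1;
	memcpy(buf, profile, len);
	buf[len] = '\0';
	return buf;
}

int main(void)
{
	char buf[64];

	printf("%s\n", fsname_from_profile("lustre-client", buf, sizeof(buf)));
	return 0;
}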
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define DEBUG_SUBSYSTEM S_LLITE + +#include "../include/lustre_lite.h" +#include "llite_internal.h" +#include "../include/linux/lustre_compat25.h" + +static const struct vm_operations_struct ll_file_vm_ops; + +void policy_from_vma(ldlm_policy_data_t *policy, + struct vm_area_struct *vma, unsigned long addr, + size_t count) +{ + policy->l_extent.start = ((addr - vma->vm_start) & CFS_PAGE_MASK) + + (vma->vm_pgoff << PAGE_CACHE_SHIFT); + policy->l_extent.end = (policy->l_extent.start + count - 1) | + ~CFS_PAGE_MASK; +} + +struct vm_area_struct *our_vma(struct mm_struct *mm, unsigned long addr, + size_t count) +{ + struct vm_area_struct *vma, *ret = NULL; + + /* mmap_sem must have been held by caller. */ + LASSERT(!down_write_trylock(&mm->mmap_sem)); + + for (vma = find_vma(mm, addr); + vma != NULL && vma->vm_start < (addr + count); vma = vma->vm_next) { + if (vma->vm_ops && vma->vm_ops == &ll_file_vm_ops && + vma->vm_flags & VM_SHARED) { + ret = vma; + break; + } + } + return ret; +} + +/** + * API independent part for page fault initialization. + * \param vma - virtual memory area addressed to page fault + * \param env - corespondent lu_env to processing + * \param nest - nested level + * \param index - page index corespondent to fault. + * \parm ra_flags - vma readahead flags. + * + * \return allocated and initialized env for fault operation. + * \retval EINVAL if env can't allocated + * \return other error codes from cl_io_init. + */ +static struct cl_io * +ll_fault_io_init(struct vm_area_struct *vma, struct lu_env **env_ret, + struct cl_env_nest *nest, pgoff_t index, + unsigned long *ra_flags) +{ + struct file *file = vma->vm_file; + struct inode *inode = file_inode(file); + struct cl_io *io; + struct cl_fault_io *fio; + struct lu_env *env; + int rc; + + *env_ret = NULL; + if (ll_file_nolock(file)) + return ERR_PTR(-EOPNOTSUPP); + + /* + * page fault can be called when lustre IO is + * already active for the current thread, e.g., when doing read/write + * against user level buffer mapped from Lustre buffer. To avoid + * stomping on existing context, optionally force an allocation of a new + * one. + */ + env = cl_env_nested_get(nest); + if (IS_ERR(env)) + return ERR_PTR(-EINVAL); + + *env_ret = env; + + io = ccc_env_thread_io(env); + io->ci_obj = ll_i2info(inode)->lli_clob; + LASSERT(io->ci_obj != NULL); + + fio = &io->u.ci_fault; + fio->ft_index = index; + fio->ft_executable = vma->vm_flags&VM_EXEC; + + /* + * disable VM_SEQ_READ and use VM_RAND_READ to make sure that + * the kernel will not read other pages not covered by ldlm in + * filemap_nopage. we do our readahead in ll_readpage. + */ + if (ra_flags != NULL) + *ra_flags = vma->vm_flags & (VM_RAND_READ|VM_SEQ_READ); + vma->vm_flags &= ~VM_SEQ_READ; + vma->vm_flags |= VM_RAND_READ; + + CDEBUG(D_MMAP, "vm_flags: %lx (%lu %d)\n", vma->vm_flags, + fio->ft_index, fio->ft_executable); + + rc = cl_io_init(env, io, CIT_FAULT, io->ci_obj); + if (rc == 0) { + struct ccc_io *cio = ccc_env_io(env); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + + LASSERT(cio->cui_cl.cis_io == io); + + /* mmap lock must be MANDATORY it has to cache + * pages. 
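/*
 * Illustrative sketch (not from the Lustre tree): policy_from_vma() above
 * turns a faulting address range into a page-aligned file extent: the offset
 * within the mapping is rounded down to a page boundary and shifted by the
 * mapping's file offset (vm_pgoff, in pages); the end is rounded up to the
 * last byte of its page.  Worked example assuming 4 KiB pages and
 * CFS_PAGE_MASK == ~(page_size - 1).
 */
#include <stdio.h>
#include <stdint.h>

#define EX_PAGE_SHIFT	12
#define EX_PAGE_SIZE	(1ULL << EX_PAGE_SHIFT)
#define EX_PAGE_MASK	(~(EX_PAGE_SIZE - 1))

int main(void)
{
	uint64_t vm_start = 0x7f0000001000ULL;	/* start of the mapping           */
	uint64_t vm_pgoff = 16;			/* mapping starts at file page 16 */
	uint64_t addr	  = 0x7f0000003234ULL;	/* faulting address               */
	uint64_t count	  = EX_PAGE_SIZE;

	uint64_t start = ((addr - vm_start) & EX_PAGE_MASK) +
			 (vm_pgoff << EX_PAGE_SHIFT);
	uint64_t end   = (start + count - 1) | ~EX_PAGE_MASK;

	/* addr is 0x2234 into the mapping -> page 2 of the mapping, which is
	 * file page 18, so the extent is [0x12000, 0x12fff]. */
	printf("extent [%#llx, %#llx]\n",
	       (unsigned long long)start, (unsigned long long)end);
	return 0;
}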
*/ + io->ci_lockreq = CILR_MANDATORY; + cio->cui_fd = fd; + } else { + LASSERT(rc < 0); + cl_io_fini(env, io); + cl_env_nested_put(nest, env); + io = ERR_PTR(rc); + } + + return io; +} + +/* Sharing code of page_mkwrite method for rhel5 and rhel6 */ +static int ll_page_mkwrite0(struct vm_area_struct *vma, struct page *vmpage, + bool *retry) +{ + struct lu_env *env; + struct cl_io *io; + struct vvp_io *vio; + struct cl_env_nest nest; + int result; + sigset_t set; + struct inode *inode; + struct ll_inode_info *lli; + + LASSERT(vmpage != NULL); + + io = ll_fault_io_init(vma, &env, &nest, vmpage->index, NULL); + if (IS_ERR(io)) { + result = PTR_ERR(io); + goto out; + } + + result = io->ci_result; + if (result < 0) + goto out_io; + + io->u.ci_fault.ft_mkwrite = 1; + io->u.ci_fault.ft_writable = 1; + + vio = vvp_env_io(env); + vio->u.fault.ft_vma = vma; + vio->u.fault.ft_vmpage = vmpage; + + set = cfs_block_sigsinv(sigmask(SIGKILL) | sigmask(SIGTERM)); + + /* we grab lli_trunc_sem to exclude truncate case. + * Otherwise, we could add dirty pages into osc cache + * while truncate is on-going. */ + inode = ccc_object_inode(io->ci_obj); + lli = ll_i2info(inode); + down_read(&lli->lli_trunc_sem); + + result = cl_io_loop(env, io); + + up_read(&lli->lli_trunc_sem); + + cfs_restore_sigs(set); + + if (result == 0) { + struct inode *inode = file_inode(vma->vm_file); + struct ll_inode_info *lli = ll_i2info(inode); + + lock_page(vmpage); + if (vmpage->mapping == NULL) { + unlock_page(vmpage); + + /* page was truncated and lock was cancelled, return + * ENODATA so that VM_FAULT_NOPAGE will be returned + * to handle_mm_fault(). */ + if (result == 0) + result = -ENODATA; + } else if (!PageDirty(vmpage)) { + /* race, the page has been cleaned by ptlrpcd after + * it was unlocked, it has to be added into dirty + * cache again otherwise this soon-to-dirty page won't + * consume any grants, even worse if this page is being + * transferred because it will break RPC checksum. + */ + unlock_page(vmpage); + + CDEBUG(D_MMAP, "Race on page_mkwrite %p/%lu, page has been written out, retry.\n", + vmpage, vmpage->index); + + *retry = true; + result = -EAGAIN; + } + + if (result == 0) { + spin_lock(&lli->lli_lock); + lli->lli_flags |= LLIF_DATA_MODIFIED; + spin_unlock(&lli->lli_lock); + } + } + +out_io: + cl_io_fini(env, io); + cl_env_nested_put(&nest, env); +out: + CDEBUG(D_MMAP, "%s mkwrite with %d\n", current->comm, result); + LASSERT(ergo(result == 0, PageLocked(vmpage))); + + return result; +} + + + +static inline int to_fault_error(int result) +{ + switch (result) { + case 0: + result = VM_FAULT_LOCKED; + break; + case -EFAULT: + result = VM_FAULT_NOPAGE; + break; + case -ENOMEM: + result = VM_FAULT_OOM; + break; + default: + result = VM_FAULT_SIGBUS; + break; + } + return result; +} + +/** + * Lustre implementation of a vm_operations_struct::fault() method, called by + * VM to server page fault (both in kernel and user space). 
+ * + * \param vma - is virtual area struct related to page fault + * \param vmf - structure which describe type and address where hit fault + * + * \return allocated and filled _locked_ page for address + * \retval VM_FAULT_ERROR on general error + * \retval NOPAGE_OOM not have memory for allocate new page + */ +static int ll_fault0(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct lu_env *env; + struct cl_io *io; + struct vvp_io *vio = NULL; + struct page *vmpage; + unsigned long ra_flags; + struct cl_env_nest nest; + int result; + int fault_ret = 0; + + io = ll_fault_io_init(vma, &env, &nest, vmf->pgoff, &ra_flags); + if (IS_ERR(io)) + return to_fault_error(PTR_ERR(io)); + + result = io->ci_result; + if (result == 0) { + vio = vvp_env_io(env); + vio->u.fault.ft_vma = vma; + vio->u.fault.ft_vmpage = NULL; + vio->u.fault.fault.ft_vmf = vmf; + vio->u.fault.fault.ft_flags = 0; + vio->u.fault.fault.ft_flags_valid = false; + + result = cl_io_loop(env, io); + + /* ft_flags are only valid if we reached + * the call to filemap_fault */ + if (vio->u.fault.fault.ft_flags_valid) + fault_ret = vio->u.fault.fault.ft_flags; + + vmpage = vio->u.fault.ft_vmpage; + if (result != 0 && vmpage != NULL) { + page_cache_release(vmpage); + vmf->page = NULL; + } + } + cl_io_fini(env, io); + cl_env_nested_put(&nest, env); + + vma->vm_flags |= ra_flags; + if (result != 0 && !(fault_ret & VM_FAULT_RETRY)) + fault_ret |= to_fault_error(result); + + CDEBUG(D_MMAP, "%s fault %d/%d\n", + current->comm, fault_ret, result); + return fault_ret; +} + +static int ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + int count = 0; + bool printed = false; + int result; + sigset_t set; + + /* Only SIGKILL and SIGTERM is allowed for fault/nopage/mkwrite + * so that it can be killed by admin but not cause segfault by + * other signals. */ + set = cfs_block_sigsinv(sigmask(SIGKILL) | sigmask(SIGTERM)); + +restart: + result = ll_fault0(vma, vmf); + LASSERT(!(result & VM_FAULT_LOCKED)); + if (result == 0) { + struct page *vmpage = vmf->page; + + /* check if this page has been truncated */ + lock_page(vmpage); + if (unlikely(vmpage->mapping == NULL)) { /* unlucky */ + unlock_page(vmpage); + page_cache_release(vmpage); + vmf->page = NULL; + + if (!printed && ++count > 16) { + CWARN("the page is under heavy contention, maybe your app(%s) needs revising :-)\n", + current->comm); + printed = true; + } + + goto restart; + } + + result = VM_FAULT_LOCKED; + } + cfs_restore_sigs(set); + return result; +} + +static int ll_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + int count = 0; + bool printed = false; + bool retry; + int result; + + do { + retry = false; + result = ll_page_mkwrite0(vma, vmf->page, &retry); + + if (!printed && ++count > 16) { + CWARN("app(%s): the page %lu of file %lu is under heavy contention.\n", + current->comm, vmf->pgoff, + file_inode(vma->vm_file)->i_ino); + printed = true; + } + } while (retry); + + switch (result) { + case 0: + LASSERT(PageLocked(vmf->page)); + result = VM_FAULT_LOCKED; + break; + case -ENODATA: + case -EFAULT: + result = VM_FAULT_NOPAGE; + break; + case -ENOMEM: + result = VM_FAULT_OOM; + break; + case -EAGAIN: + result = VM_FAULT_RETRY; + break; + default: + result = VM_FAULT_SIGBUS; + break; + } + + return result; +} + +/** + * To avoid cancel the locks covering mmapped region for lock cache pressure, + * we track the mapped vma count in ccc_object::cob_mmap_cnt. 
+ */ +static void ll_vm_open(struct vm_area_struct *vma) +{ + struct inode *inode = file_inode(vma->vm_file); + struct ccc_object *vob = cl_inode2ccc(inode); + + LASSERT(vma->vm_file); + LASSERT(atomic_read(&vob->cob_mmap_cnt) >= 0); + atomic_inc(&vob->cob_mmap_cnt); +} + +/** + * Dual to ll_vm_open(). + */ +static void ll_vm_close(struct vm_area_struct *vma) +{ + struct inode *inode = file_inode(vma->vm_file); + struct ccc_object *vob = cl_inode2ccc(inode); + + LASSERT(vma->vm_file); + atomic_dec(&vob->cob_mmap_cnt); + LASSERT(atomic_read(&vob->cob_mmap_cnt) >= 0); +} + +/* XXX put nice comment here. talk about __free_pte -> dirty pages and + * nopage's reference passing to the pte */ +int ll_teardown_mmaps(struct address_space *mapping, __u64 first, __u64 last) +{ + int rc = -ENOENT; + + LASSERTF(last > first, "last %llu first %llu\n", last, first); + if (mapping_mapped(mapping)) { + rc = 0; + unmap_mapping_range(mapping, first + PAGE_CACHE_SIZE - 1, + last - first + 1, 0); + } + + return rc; +} + +static const struct vm_operations_struct ll_file_vm_ops = { + .fault = ll_fault, + .page_mkwrite = ll_page_mkwrite, + .open = ll_vm_open, + .close = ll_vm_close, +}; + +int ll_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct inode *inode = file_inode(file); + int rc; + + if (ll_file_nolock(file)) + return -EOPNOTSUPP; + + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_MAP, 1); + rc = generic_file_mmap(file, vma); + if (rc == 0) { + vma->vm_ops = &ll_file_vm_ops; + vma->vm_ops->open(vma); + /* update the inode's size and mtime */ + rc = ll_glimpse_size(inode); + } + + return rc; +} diff --git a/kernel/drivers/staging/lustre/lustre/llite/llite_nfs.c b/kernel/drivers/staging/lustre/lustre/llite/llite_nfs.c new file mode 100644 index 000000000..db43b8138 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/llite_nfs.c @@ -0,0 +1,335 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/lustre/llite/llite_nfs.c + * + * NFS export of Lustre Light File System + * + * Author: Yury Umanets + * Author: Huang Hua + */ + +#define DEBUG_SUBSYSTEM S_LLITE +#include "../include/lustre_lite.h" +#include "llite_internal.h" +#include + +__u32 get_uuid2int(const char *name, int len) +{ + __u32 key0 = 0x12a3fe2d, key1 = 0x37abe8f9; + while (len--) { + __u32 key = key1 + (key0 ^ (*name++ * 7152373)); + + if (key & 0x80000000) + key -= 0x7fffffff; + key1 = key0; + key0 = key; + } + return (key0 << 1); +} + +void get_uuid2fsid(const char *name, int len, __kernel_fsid_t *fsid) +{ + __u64 key = 0, key0 = 0x12a3fe2d, key1 = 0x37abe8f9; + + while (len--) { + key = key1 + (key0 ^ (*name++ * 7152373)); + if (key & 0x8000000000000000ULL) + key -= 0x7fffffffffffffffULL; + key1 = key0; + key0 = key; + } + + fsid->val[0] = key; + fsid->val[1] = key >> 32; +} + +static int ll_nfs_test_inode(struct inode *inode, void *opaque) +{ + return lu_fid_eq(&ll_i2info(inode)->lli_fid, + (struct lu_fid *)opaque); +} + +struct inode *search_inode_for_lustre(struct super_block *sb, + const struct lu_fid *fid) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct ptlrpc_request *req = NULL; + struct inode *inode = NULL; + int eadatalen = 0; + unsigned long hash = cl_fid_build_ino(fid, + ll_need_32bit_api(sbi)); + struct md_op_data *op_data; + int rc; + + CDEBUG(D_INFO, "searching inode for:(%lu,"DFID")\n", hash, PFID(fid)); + + inode = ilookup5(sb, hash, ll_nfs_test_inode, (void *)fid); + if (inode) + return inode; + + rc = ll_get_default_mdsize(sbi, &eadatalen); + if (rc) + return ERR_PTR(rc); + + /* Because inode is NULL, ll_prep_md_op_data can not + * be used here. So we allocate op_data ourselves */ + op_data = kzalloc(sizeof(*op_data), GFP_NOFS); + if (!op_data) + return ERR_PTR(-ENOMEM); + + op_data->op_fid1 = *fid; + op_data->op_mode = eadatalen; + op_data->op_valid = OBD_MD_FLEASIZE; + + /* mds_fid2dentry ignores f_type */ + rc = md_getattr(sbi->ll_md_exp, op_data, &req); + OBD_FREE_PTR(op_data); + if (rc) { + CERROR("can't get object attrs, fid "DFID", rc %d\n", + PFID(fid), rc); + return ERR_PTR(rc); + } + rc = ll_prep_inode(&inode, req, sb, NULL); + ptlrpc_req_finished(req); + if (rc) + return ERR_PTR(rc); + + return inode; +} + +struct lustre_nfs_fid { + struct lu_fid lnf_child; + struct lu_fid lnf_parent; +}; + +static struct dentry * +ll_iget_for_nfs(struct super_block *sb, struct lu_fid *fid, struct lu_fid *parent) +{ + struct inode *inode; + struct dentry *result; + + CDEBUG(D_INFO, "Get dentry for fid: "DFID"\n", PFID(fid)); + if (!fid_is_sane(fid)) + return ERR_PTR(-ESTALE); + + inode = search_inode_for_lustre(sb, fid); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + if (is_bad_inode(inode)) { + /* we didn't find the right inode.. */ + iput(inode); + return ERR_PTR(-ESTALE); + } + + /** + * It is an anonymous dentry without OST objects created yet. + * We have to find the parent to tell MDS how to init lov objects. 
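/*
 * Illustrative sketch (not from the Lustre tree): get_uuid2int() and
 * get_uuid2fsid() above reduce a client UUID string to a 32-bit key or a
 * 64-bit fsid with a small rolling hash.  Standalone copy of the 32-bit
 * variant, fed a made-up UUID, to show the intended call pattern.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

static uint32_t uuid2int(const char *name, int len)
{
	uint32_t key0 = 0x12a3fe2d, key1 = 0x37abe8f9;

	while (len--) {
		uint32_t key = key1 + (key0 ^ (*name++ * 7152373));

		if (key & 0x80000000)
			key -= 0x7fffffff;
		key1 = key0;
		key0 = key;
	}
	return key0 << 1;
}

int main(void)
{
	const char *uuid = "lustre-client-ffff8800deadbeef";	/* example only */

	printf("uuid key = %#x\n", uuid2int(uuid, (int)strlen(uuid)));
	return 0;
}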
+ */ + if (S_ISREG(inode->i_mode) && !ll_i2info(inode)->lli_has_smd && + parent != NULL) { + struct ll_inode_info *lli = ll_i2info(inode); + + spin_lock(&lli->lli_lock); + lli->lli_pfid = *parent; + spin_unlock(&lli->lli_lock); + } + + result = d_obtain_alias(inode); + if (IS_ERR(result)) { + iput(inode); + return result; + } + + return result; +} + +#define LUSTRE_NFS_FID 0x97 + +/** + * \a connectable - is nfsd will connect himself or this should be done + * at lustre + * + * The return value is file handle type: + * 1 -- contains child file handle; + * 2 -- contains child file handle and parent file handle; + * 255 -- error. + */ +static int ll_encode_fh(struct inode *inode, __u32 *fh, int *plen, + struct inode *parent) +{ + struct lustre_nfs_fid *nfs_fid = (void *)fh; + + CDEBUG(D_INFO, "encoding for (%lu,"DFID") maxlen=%d minlen=%d\n", + inode->i_ino, PFID(ll_inode2fid(inode)), *plen, + (int)sizeof(struct lustre_nfs_fid)); + + if (*plen < sizeof(struct lustre_nfs_fid) / 4) + return 255; + + nfs_fid->lnf_child = *ll_inode2fid(inode); + nfs_fid->lnf_parent = *ll_inode2fid(parent); + *plen = sizeof(struct lustre_nfs_fid) / 4; + + return LUSTRE_NFS_FID; +} + +static int ll_nfs_get_name_filldir(struct dir_context *ctx, const char *name, + int namelen, loff_t hash, u64 ino, + unsigned type) +{ + /* It is hack to access lde_fid for comparison with lgd_fid. + * So the input 'name' must be part of the 'lu_dirent'. */ + struct lu_dirent *lde = container_of0(name, struct lu_dirent, lde_name); + struct ll_getname_data *lgd = + container_of(ctx, struct ll_getname_data, ctx); + struct lu_fid fid; + + fid_le_to_cpu(&fid, &lde->lde_fid); + if (lu_fid_eq(&fid, &lgd->lgd_fid)) { + memcpy(lgd->lgd_name, name, namelen); + lgd->lgd_name[namelen] = 0; + lgd->lgd_found = 1; + } + return lgd->lgd_found; +} + +static int ll_get_name(struct dentry *dentry, char *name, + struct dentry *child) +{ + struct inode *dir = d_inode(dentry); + int rc; + struct ll_getname_data lgd = { + .lgd_name = name, + .lgd_fid = ll_i2info(d_inode(child))->lli_fid, + .ctx.actor = ll_nfs_get_name_filldir, + }; + + if (!dir || !S_ISDIR(dir->i_mode)) { + rc = -ENOTDIR; + goto out; + } + + if (!dir->i_fop) { + rc = -EINVAL; + goto out; + } + + mutex_lock(&dir->i_mutex); + rc = ll_dir_read(dir, &lgd.ctx); + mutex_unlock(&dir->i_mutex); + if (!rc && !lgd.lgd_found) + rc = -ENOENT; +out: + return rc; +} + +static struct dentry *ll_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + struct lustre_nfs_fid *nfs_fid = (struct lustre_nfs_fid *)fid; + + if (fh_type != LUSTRE_NFS_FID) + return ERR_PTR(-EPROTO); + + return ll_iget_for_nfs(sb, &nfs_fid->lnf_child, &nfs_fid->lnf_parent); +} + +static struct dentry *ll_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + struct lustre_nfs_fid *nfs_fid = (struct lustre_nfs_fid *)fid; + + if (fh_type != LUSTRE_NFS_FID) + return ERR_PTR(-EPROTO); + + return ll_iget_for_nfs(sb, &nfs_fid->lnf_parent, NULL); +} + +static struct dentry *ll_get_parent(struct dentry *dchild) +{ + struct ptlrpc_request *req = NULL; + struct inode *dir = d_inode(dchild); + struct ll_sb_info *sbi; + struct dentry *result = NULL; + struct mdt_body *body; + static char dotdot[] = ".."; + struct md_op_data *op_data; + int rc; + int lmmsize; + + LASSERT(dir && S_ISDIR(dir->i_mode)); + + sbi = ll_s2sbi(dir->i_sb); + + CDEBUG(D_INFO, "getting parent for (%lu,"DFID")\n", + dir->i_ino, PFID(ll_inode2fid(dir))); + + rc = ll_get_default_mdsize(sbi, &lmmsize); + if (rc != 
0) + return ERR_PTR(rc); + + op_data = ll_prep_md_op_data(NULL, dir, NULL, dotdot, + strlen(dotdot), lmmsize, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + return (void *)op_data; + + rc = md_getattr_name(sbi->ll_md_exp, op_data, &req); + ll_finish_md_op_data(op_data); + if (rc) { + CERROR("failure %d inode %lu get parent\n", rc, dir->i_ino); + return ERR_PTR(rc); + } + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + LASSERT(body->valid & OBD_MD_FLID); + + CDEBUG(D_INFO, "parent for "DFID" is "DFID"\n", + PFID(ll_inode2fid(dir)), PFID(&body->fid1)); + + result = ll_iget_for_nfs(dir->i_sb, &body->fid1, NULL); + + ptlrpc_req_finished(req); + return result; +} + +struct export_operations lustre_export_operations = { + .get_parent = ll_get_parent, + .encode_fh = ll_encode_fh, + .get_name = ll_get_name, + .fh_to_dentry = ll_fh_to_dentry, + .fh_to_parent = ll_fh_to_parent, +}; diff --git a/kernel/drivers/staging/lustre/lustre/llite/llite_rmtacl.c b/kernel/drivers/staging/lustre/lustre/llite/llite_rmtacl.c new file mode 100644 index 000000000..f4da156f3 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/llite_rmtacl.c @@ -0,0 +1,300 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/llite/llite_rmtacl.c + * + * Lustre Remote User Access Control List. 
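/*
 * Illustrative sketch (not from the Lustre tree): ll_encode_fh() in the NFS
 * export code above packs the child FID plus, when available, the parent FID
 * into the opaque file handle and reports its length in 32-bit words, as the
 * exportfs API expects.  Layout check below, assuming struct lu_fid has the
 * usual { __u64 f_seq; __u32 f_oid; __u32 f_ver; } shape.
 */
#include <stdio.h>
#include <stdint.h>

struct ex_lu_fid {			/* stand-in for struct lu_fid         */
	uint64_t f_seq;
	uint32_t f_oid;
	uint32_t f_ver;
};

struct ex_lustre_nfs_fid {		/* stand-in for struct lustre_nfs_fid */
	struct ex_lu_fid lnf_child;
	struct ex_lu_fid lnf_parent;
};

int main(void)
{
	/* 2 x 16 bytes = 32 bytes, so *plen is reported as 8 words. */
	printf("fh bytes = %zu, *plen = %zu\n",
	       sizeof(struct ex_lustre_nfs_fid),
	       sizeof(struct ex_lustre_nfs_fid) / 4);
	return 0;
}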
+ * + * Author: Fan Yong + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#ifdef CONFIG_FS_POSIX_ACL + +#include "../include/lustre_lite.h" +#include "../include/lustre_eacl.h" +#include "llite_internal.h" + +static inline __u32 rce_hashfunc(uid_t id) +{ + return id & (RCE_HASHES - 1); +} + +static inline __u32 ee_hashfunc(uid_t id) +{ + return id & (EE_HASHES - 1); +} + +u64 rce_ops2valid(int ops) +{ + switch (ops) { + case RMT_LSETFACL: + return OBD_MD_FLRMTLSETFACL; + case RMT_LGETFACL: + return OBD_MD_FLRMTLGETFACL; + case RMT_RSETFACL: + return OBD_MD_FLRMTRSETFACL; + case RMT_RGETFACL: + return OBD_MD_FLRMTRGETFACL; + default: + return 0; + } +} + +static struct rmtacl_ctl_entry *rce_alloc(pid_t key, int ops) +{ + struct rmtacl_ctl_entry *rce; + + rce = kzalloc(sizeof(*rce), GFP_NOFS); + if (!rce) + return NULL; + + INIT_LIST_HEAD(&rce->rce_list); + rce->rce_key = key; + rce->rce_ops = ops; + + return rce; +} + +static void rce_free(struct rmtacl_ctl_entry *rce) +{ + if (!list_empty(&rce->rce_list)) + list_del(&rce->rce_list); + + OBD_FREE_PTR(rce); +} + +static struct rmtacl_ctl_entry *__rct_search(struct rmtacl_ctl_table *rct, + pid_t key) +{ + struct rmtacl_ctl_entry *rce; + struct list_head *head = &rct->rct_entries[rce_hashfunc(key)]; + + list_for_each_entry(rce, head, rce_list) + if (rce->rce_key == key) + return rce; + + return NULL; +} + +struct rmtacl_ctl_entry *rct_search(struct rmtacl_ctl_table *rct, pid_t key) +{ + struct rmtacl_ctl_entry *rce; + + spin_lock(&rct->rct_lock); + rce = __rct_search(rct, key); + spin_unlock(&rct->rct_lock); + return rce; +} + +int rct_add(struct rmtacl_ctl_table *rct, pid_t key, int ops) +{ + struct rmtacl_ctl_entry *rce, *e; + + rce = rce_alloc(key, ops); + if (rce == NULL) + return -ENOMEM; + + spin_lock(&rct->rct_lock); + e = __rct_search(rct, key); + if (unlikely(e != NULL)) { + CWARN("Unexpected stale rmtacl_entry found: [key: %d] [ops: %d]\n", + (int)key, ops); + rce_free(e); + } + list_add_tail(&rce->rce_list, &rct->rct_entries[rce_hashfunc(key)]); + spin_unlock(&rct->rct_lock); + + return 0; +} + +int rct_del(struct rmtacl_ctl_table *rct, pid_t key) +{ + struct rmtacl_ctl_entry *rce; + + spin_lock(&rct->rct_lock); + rce = __rct_search(rct, key); + if (rce) + rce_free(rce); + spin_unlock(&rct->rct_lock); + + return rce ? 
0 : -ENOENT; +} + +void rct_init(struct rmtacl_ctl_table *rct) +{ + int i; + + spin_lock_init(&rct->rct_lock); + for (i = 0; i < RCE_HASHES; i++) + INIT_LIST_HEAD(&rct->rct_entries[i]); +} + +void rct_fini(struct rmtacl_ctl_table *rct) +{ + struct rmtacl_ctl_entry *rce; + int i; + + spin_lock(&rct->rct_lock); + for (i = 0; i < RCE_HASHES; i++) + while (!list_empty(&rct->rct_entries[i])) { + rce = list_entry(rct->rct_entries[i].next, + struct rmtacl_ctl_entry, rce_list); + rce_free(rce); + } + spin_unlock(&rct->rct_lock); +} + + +static struct eacl_entry *ee_alloc(pid_t key, struct lu_fid *fid, int type, + ext_acl_xattr_header *header) +{ + struct eacl_entry *ee; + + ee = kzalloc(sizeof(*ee), GFP_NOFS); + if (!ee) + return NULL; + + INIT_LIST_HEAD(&ee->ee_list); + ee->ee_key = key; + ee->ee_fid = *fid; + ee->ee_type = type; + ee->ee_acl = header; + + return ee; +} + +void ee_free(struct eacl_entry *ee) +{ + if (!list_empty(&ee->ee_list)) + list_del(&ee->ee_list); + + if (ee->ee_acl) + lustre_ext_acl_xattr_free(ee->ee_acl); + + OBD_FREE_PTR(ee); +} + +static struct eacl_entry *__et_search_del(struct eacl_table *et, pid_t key, + struct lu_fid *fid, int type) +{ + struct eacl_entry *ee; + struct list_head *head = &et->et_entries[ee_hashfunc(key)]; + + LASSERT(fid != NULL); + list_for_each_entry(ee, head, ee_list) + if (ee->ee_key == key) { + if (lu_fid_eq(&ee->ee_fid, fid) && + ee->ee_type == type) { + list_del_init(&ee->ee_list); + return ee; + } + } + + return NULL; +} + +struct eacl_entry *et_search_del(struct eacl_table *et, pid_t key, + struct lu_fid *fid, int type) +{ + struct eacl_entry *ee; + + spin_lock(&et->et_lock); + ee = __et_search_del(et, key, fid, type); + spin_unlock(&et->et_lock); + return ee; +} + +void et_search_free(struct eacl_table *et, pid_t key) +{ + struct eacl_entry *ee, *next; + struct list_head *head = &et->et_entries[ee_hashfunc(key)]; + + spin_lock(&et->et_lock); + list_for_each_entry_safe(ee, next, head, ee_list) + if (ee->ee_key == key) + ee_free(ee); + + spin_unlock(&et->et_lock); +} + +int ee_add(struct eacl_table *et, pid_t key, struct lu_fid *fid, int type, + ext_acl_xattr_header *header) +{ + struct eacl_entry *ee, *e; + + ee = ee_alloc(key, fid, type, header); + if (ee == NULL) + return -ENOMEM; + + spin_lock(&et->et_lock); + e = __et_search_del(et, key, fid, type); + if (unlikely(e != NULL)) { + CWARN("Unexpected stale eacl_entry found: [key: %d] [fid: " DFID "] [type: %d]\n", + (int)key, PFID(fid), type); + ee_free(e); + } + list_add_tail(&ee->ee_list, &et->et_entries[ee_hashfunc(key)]); + spin_unlock(&et->et_lock); + + return 0; +} + +void et_init(struct eacl_table *et) +{ + int i; + + spin_lock_init(&et->et_lock); + for (i = 0; i < EE_HASHES; i++) + INIT_LIST_HEAD(&et->et_entries[i]); +} + +void et_fini(struct eacl_table *et) +{ + struct eacl_entry *ee; + int i; + + spin_lock(&et->et_lock); + for (i = 0; i < EE_HASHES; i++) + while (!list_empty(&et->et_entries[i])) { + ee = list_entry(et->et_entries[i].next, + struct eacl_entry, ee_list); + ee_free(ee); + } + spin_unlock(&et->et_lock); +} + +#endif diff --git a/kernel/drivers/staging/lustre/lustre/llite/lloop.c b/kernel/drivers/staging/lustre/lustre/llite/lloop.c new file mode 100644 index 000000000..413a8408e --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/lloop.c @@ -0,0 +1,877 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +/* + * linux/drivers/block/loop.c + * + * Written by Theodore Ts'o, 3/29/93 + * + * Copyright 1993 by Theodore Ts'o. Redistribution of this file is + * permitted under the GNU General Public License. + * + * Modularized and updated for 1.1.16 kernel - Mitch Dsouza 28th May 1994 + * Adapted for 1.3.59 kernel - Andries Brouwer, 1 Feb 1996 + * + * Fixed do_loop_request() re-entrancy - Vincent.Renardias@waw.com Mar 20, 1997 + * + * Added devfs support - Richard Gooch 16-Jan-1998 + * + * Handle sparse backing files correctly - Kenn Humborg, Jun 28, 1998 + * + * Loadable modules and other fixes by AK, 1998 + * + * Maximum number of loop devices now dynamic via max_loop module parameter. + * Russell Kroll 19990701 + * + * Maximum number of loop devices when compiled-in now selectable by passing + * max_loop=<1-255> to the kernel on boot. + * Erik I. Bols?, , Oct 31, 1999 + * + * Completely rewrite request handling to be make_request_fn style and + * non blocking, pushing work to a helper thread. Lots of fixes from + * Al Viro too. + * Jens Axboe , Nov 2000 + * + * Support up to 256 loop devices + * Heinz Mauelshagen , Feb 2002 + * + * Support for falling back on the write file operation when the address space + * operations prepare_write and/or commit_write are not available on the + * backing filesystem. + * Anton Altaparmakov, 16 Feb 2005 + * + * Still To Fix: + * - Advisory locking is ignored here. 
+ * - Should use an own CAP_* category instead of CAP_SYS_ADMIN + * + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for invalidate_bdev() */ +#include +#include +#include +#include +#include + +#include "../include/lustre_lib.h" +#include "../include/lustre_lite.h" +#include "llite_internal.h" + +#define LLOOP_MAX_SEGMENTS LNET_MAX_IOV + +/* Possible states of device */ +enum { + LLOOP_UNBOUND, + LLOOP_BOUND, + LLOOP_RUNDOWN, +}; + +struct lloop_device { + int lo_number; + int lo_refcnt; + loff_t lo_offset; + loff_t lo_sizelimit; + int lo_flags; + struct file *lo_backing_file; + struct block_device *lo_device; + unsigned lo_blocksize; + + gfp_t old_gfp_mask; + + spinlock_t lo_lock; + struct bio *lo_bio; + struct bio *lo_biotail; + int lo_state; + struct semaphore lo_sem; + struct mutex lo_ctl_mutex; + atomic_t lo_pending; + wait_queue_head_t lo_bh_wait; + + struct request_queue *lo_queue; + + const struct lu_env *lo_env; + struct cl_io lo_io; + struct ll_dio_pages lo_pvec; + + /* data to handle bio for lustre. */ + struct lo_request_data { + struct page *lrd_pages[LLOOP_MAX_SEGMENTS]; + loff_t lrd_offsets[LLOOP_MAX_SEGMENTS]; + } lo_requests[1]; +}; + +/* + * Loop flags + */ +enum { + LO_FLAGS_READ_ONLY = 1, +}; + +static int lloop_major; +#define MAX_LOOP_DEFAULT 16 +static int max_loop = MAX_LOOP_DEFAULT; +static struct lloop_device *loop_dev; +static struct gendisk **disks; +static struct mutex lloop_mutex; +static void *ll_iocontrol_magic = NULL; + +static loff_t get_loop_size(struct lloop_device *lo, struct file *file) +{ + loff_t size, offset, loopsize; + + /* Compute loopsize in bytes */ + size = i_size_read(file->f_mapping->host); + offset = lo->lo_offset; + loopsize = size - offset; + if (lo->lo_sizelimit > 0 && lo->lo_sizelimit < loopsize) + loopsize = lo->lo_sizelimit; + + /* + * Unfortunately, if we want to do I/O on the device, + * the number of 512-byte sectors has to fit into a sector_t. + */ + return loopsize >> 9; +} + +static int do_bio_lustrebacked(struct lloop_device *lo, struct bio *head) +{ + const struct lu_env *env = lo->lo_env; + struct cl_io *io = &lo->lo_io; + struct inode *inode = file_inode(lo->lo_backing_file); + struct cl_object *obj = ll_i2info(inode)->lli_clob; + pgoff_t offset; + int ret; + int rw; + u32 page_count = 0; + struct bio_vec bvec; + struct bvec_iter iter; + struct bio *bio; + ssize_t bytes; + + struct ll_dio_pages *pvec = &lo->lo_pvec; + struct page **pages = pvec->ldp_pages; + loff_t *offsets = pvec->ldp_offsets; + + truncate_inode_pages(inode->i_mapping, 0); + + /* initialize the IO */ + memset(io, 0, sizeof(*io)); + io->ci_obj = obj; + ret = cl_io_init(env, io, CIT_MISC, obj); + if (ret) + return io->ci_result; + io->ci_lockreq = CILR_NEVER; + + LASSERT(head != NULL); + rw = head->bi_rw; + for (bio = head; bio != NULL; bio = bio->bi_next) { + LASSERT(rw == bio->bi_rw); + + offset = (pgoff_t)(bio->bi_iter.bi_sector << 9) + lo->lo_offset; + bio_for_each_segment(bvec, bio, iter) { + BUG_ON(bvec.bv_offset != 0); + BUG_ON(bvec.bv_len != PAGE_CACHE_SIZE); + + pages[page_count] = bvec.bv_page; + offsets[page_count] = offset; + page_count++; + offset += bvec.bv_len; + } + LASSERT(page_count <= LLOOP_MAX_SEGMENTS); + } + + ll_stats_ops_tally(ll_i2sbi(inode), + (rw == WRITE) ? 
LPROC_LL_BRW_WRITE : LPROC_LL_BRW_READ, + page_count); + + pvec->ldp_size = page_count << PAGE_CACHE_SHIFT; + pvec->ldp_nr = page_count; + + /* FIXME: in ll_direct_rw_pages, it has to allocate many cl_page{}s to + * write those pages into OST. Even worse case is that more pages + * would be asked to write out to swap space, and then finally get here + * again. + * Unfortunately this is NOT easy to fix. + * Thoughts on solution: + * 0. Define a reserved pool for cl_pages, which could be a list of + * pre-allocated cl_pages; + * 1. Define a new operation in cl_object_operations{}, says clo_depth, + * which measures how many layers for this lustre object. Generally + * speaking, the depth would be 2, one for llite, and one for lovsub. + * However, for SNS, there will be more since we need additional page + * to store parity; + * 2. Reserve the # of (page_count * depth) cl_pages from the reserved + * pool. Afterwards, the clio would allocate the pages from reserved + * pool, this guarantees we needn't allocate the cl_pages from + * generic cl_page slab cache. + * Of course, if there is NOT enough pages in the pool, we might + * be asked to write less pages once, this purely depends on + * implementation. Anyway, we should be careful to avoid deadlocking. + */ + mutex_lock(&inode->i_mutex); + bytes = ll_direct_rw_pages(env, io, rw, inode, pvec); + mutex_unlock(&inode->i_mutex); + cl_io_fini(env, io); + return (bytes == pvec->ldp_size) ? 0 : (int)bytes; +} + +/* + * Add bio to back of pending list + */ +static void loop_add_bio(struct lloop_device *lo, struct bio *bio) +{ + unsigned long flags; + + spin_lock_irqsave(&lo->lo_lock, flags); + if (lo->lo_biotail) { + lo->lo_biotail->bi_next = bio; + lo->lo_biotail = bio; + } else + lo->lo_bio = lo->lo_biotail = bio; + spin_unlock_irqrestore(&lo->lo_lock, flags); + + atomic_inc(&lo->lo_pending); + if (waitqueue_active(&lo->lo_bh_wait)) + wake_up(&lo->lo_bh_wait); +} + +/* + * Grab first pending buffer + */ +static unsigned int loop_get_bio(struct lloop_device *lo, struct bio **req) +{ + struct bio *first; + struct bio **bio; + unsigned int count = 0; + unsigned int page_count = 0; + int rw; + + spin_lock_irq(&lo->lo_lock); + first = lo->lo_bio; + if (unlikely(first == NULL)) { + spin_unlock_irq(&lo->lo_lock); + return 0; + } + + /* TODO: need to split the bio, too bad. */ + LASSERT(first->bi_vcnt <= LLOOP_MAX_SEGMENTS); + + rw = first->bi_rw; + bio = &lo->lo_bio; + while (*bio && (*bio)->bi_rw == rw) { + CDEBUG(D_INFO, "bio sector %llu size %u count %u vcnt%u \n", + (unsigned long long)(*bio)->bi_iter.bi_sector, + (*bio)->bi_iter.bi_size, + page_count, (*bio)->bi_vcnt); + if (page_count + (*bio)->bi_vcnt > LLOOP_MAX_SEGMENTS) + break; + + + page_count += (*bio)->bi_vcnt; + count++; + bio = &(*bio)->bi_next; + } + if (*bio) { + /* Some of bios can't be mergeable. 
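/*
 * Illustrative sketch (not from the Lustre tree): loop_add_bio() and
 * loop_get_bio() above keep pending bios on an intrusive singly linked list
 * with separate head (lo_bio) and tail (lo_biotail) pointers, so the
 * make_request path appends in O(1) and the worker later detaches a run of
 * same-direction bios.  Minimal userspace rendering of that list handling
 * with a toy bio type; the spinlock and segment-count limit are omitted.
 */
#include <stdio.h>
#include <stddef.h>

struct ex_bio {
	int rw;				/* request direction             */
	struct ex_bio *bi_next;		/* intrusive next pointer        */
};

struct ex_queue {
	struct ex_bio *head;		/* lo_bio in the driver          */
	struct ex_bio *tail;		/* lo_biotail in the driver      */
};

static void ex_add(struct ex_queue *q, struct ex_bio *bio)
{
	bio->bi_next = NULL;
	if (q->tail)
		q->tail->bi_next = bio;
	else
		q->head = bio;
	q->tail = bio;
}

static struct ex_bio *ex_get_run(struct ex_queue *q)
{
	struct ex_bio *first = q->head, **pos = &q->head;

	if (!first)
		return NULL;
	while (*pos && (*pos)->rw == first->rw)	/* keep one direction per run */
		pos = &(*pos)->bi_next;
	q->head = *pos;			/* remainder (possibly NULL) stays queued */
	if (!q->head)
		q->tail = NULL;
	*pos = NULL;			/* terminate the detached run             */
	return first;
}

int main(void)
{
	struct ex_bio a = { 0, NULL }, b = { 0, NULL }, c = { 1, NULL };
	struct ex_queue q = { NULL, NULL };
	struct ex_bio *bio;

	ex_add(&q, &a);
	ex_add(&q, &b);
	ex_add(&q, &c);			/* different direction, left queued */
	for (bio = ex_get_run(&q); bio; bio = bio->bi_next)
		printf("got bio rw=%d\n", bio->rw);
	return 0;
}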
*/ + lo->lo_bio = *bio; + *bio = NULL; + } else { + /* Hit the end of queue */ + lo->lo_biotail = NULL; + lo->lo_bio = NULL; + } + *req = first; + spin_unlock_irq(&lo->lo_lock); + return count; +} + +static void loop_make_request(struct request_queue *q, struct bio *old_bio) +{ + struct lloop_device *lo = q->queuedata; + int rw = bio_rw(old_bio); + int inactive; + + if (!lo) + goto err; + + CDEBUG(D_INFO, "submit bio sector %llu size %u\n", + (unsigned long long)old_bio->bi_iter.bi_sector, + old_bio->bi_iter.bi_size); + + spin_lock_irq(&lo->lo_lock); + inactive = lo->lo_state != LLOOP_BOUND; + spin_unlock_irq(&lo->lo_lock); + if (inactive) + goto err; + + if (rw == WRITE) { + if (lo->lo_flags & LO_FLAGS_READ_ONLY) + goto err; + } else if (rw == READA) { + rw = READ; + } else if (rw != READ) { + CERROR("lloop: unknown command (%x)\n", rw); + goto err; + } + loop_add_bio(lo, old_bio); + return; +err: + cfs_bio_io_error(old_bio, old_bio->bi_iter.bi_size); +} + + +static inline void loop_handle_bio(struct lloop_device *lo, struct bio *bio) +{ + int ret; + ret = do_bio_lustrebacked(lo, bio); + while (bio) { + struct bio *tmp = bio->bi_next; + bio->bi_next = NULL; + cfs_bio_endio(bio, bio->bi_iter.bi_size, ret); + bio = tmp; + } +} + +static inline int loop_active(struct lloop_device *lo) +{ + return atomic_read(&lo->lo_pending) || + (lo->lo_state == LLOOP_RUNDOWN); +} + +/* + * worker thread that handles reads/writes to file backed loop devices, + * to avoid blocking in our make_request_fn. + */ +static int loop_thread(void *data) +{ + struct lloop_device *lo = data; + struct bio *bio; + unsigned int count; + unsigned long times = 0; + unsigned long total_count = 0; + + struct lu_env *env; + int refcheck; + int ret = 0; + + set_user_nice(current, MIN_NICE); + + lo->lo_state = LLOOP_BOUND; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) { + ret = PTR_ERR(env); + goto out; + } + + lo->lo_env = env; + memset(&lo->lo_pvec, 0, sizeof(lo->lo_pvec)); + lo->lo_pvec.ldp_pages = lo->lo_requests[0].lrd_pages; + lo->lo_pvec.ldp_offsets = lo->lo_requests[0].lrd_offsets; + + /* + * up sem, we are running + */ + up(&lo->lo_sem); + + for (;;) { + wait_event(lo->lo_bh_wait, loop_active(lo)); + if (!atomic_read(&lo->lo_pending)) { + int exiting = 0; + spin_lock_irq(&lo->lo_lock); + exiting = (lo->lo_state == LLOOP_RUNDOWN); + spin_unlock_irq(&lo->lo_lock); + if (exiting) + break; + } + + bio = NULL; + count = loop_get_bio(lo, &bio); + if (!count) { + CWARN("lloop(minor: %d): missing bio\n", lo->lo_number); + continue; + } + + total_count += count; + if (total_count < count) { /* overflow */ + total_count = count; + times = 1; + } else { + times++; + } + if ((times & 127) == 0) { + CDEBUG(D_INFO, "total: %lu, count: %lu, avg: %lu\n", + total_count, times, total_count / times); + } + + LASSERT(bio != NULL); + LASSERT(count <= atomic_read(&lo->lo_pending)); + loop_handle_bio(lo, bio); + atomic_sub(count, &lo->lo_pending); + } + cl_env_put(env, &refcheck); + +out: + up(&lo->lo_sem); + return ret; +} + +static int loop_set_fd(struct lloop_device *lo, struct file *unused, + struct block_device *bdev, struct file *file) +{ + struct inode *inode; + struct address_space *mapping; + int lo_flags = 0; + int error; + loff_t size; + + if (!try_module_get(THIS_MODULE)) + return -ENODEV; + + error = -EBUSY; + if (lo->lo_state != LLOOP_UNBOUND) + goto out; + + mapping = file->f_mapping; + inode = mapping->host; + + error = -EINVAL; + if (!S_ISREG(inode->i_mode) || inode->i_sb->s_magic != LL_SUPER_MAGIC) + goto out; + + if 
(!(file->f_mode & FMODE_WRITE)) + lo_flags |= LO_FLAGS_READ_ONLY; + + size = get_loop_size(lo, file); + + if ((loff_t)(sector_t)size != size) { + error = -EFBIG; + goto out; + } + + /* remove all pages in cache so as dirty pages not to be existent. */ + truncate_inode_pages(mapping, 0); + + set_device_ro(bdev, (lo_flags & LO_FLAGS_READ_ONLY) != 0); + + lo->lo_blocksize = PAGE_CACHE_SIZE; + lo->lo_device = bdev; + lo->lo_flags = lo_flags; + lo->lo_backing_file = file; + lo->lo_sizelimit = 0; + lo->old_gfp_mask = mapping_gfp_mask(mapping); + mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); + + lo->lo_bio = lo->lo_biotail = NULL; + + /* + * set queue make_request_fn, and add limits based on lower level + * device + */ + blk_queue_make_request(lo->lo_queue, loop_make_request); + lo->lo_queue->queuedata = lo; + + /* queue parameters */ + CLASSERT(PAGE_CACHE_SIZE < (1 << (sizeof(unsigned short) * 8))); + blk_queue_logical_block_size(lo->lo_queue, + (unsigned short)PAGE_CACHE_SIZE); + blk_queue_max_hw_sectors(lo->lo_queue, + LLOOP_MAX_SEGMENTS << (PAGE_CACHE_SHIFT - 9)); + blk_queue_max_segments(lo->lo_queue, LLOOP_MAX_SEGMENTS); + + set_capacity(disks[lo->lo_number], size); + bd_set_size(bdev, size << 9); + + set_blocksize(bdev, lo->lo_blocksize); + + kthread_run(loop_thread, lo, "lloop%d", lo->lo_number); + down(&lo->lo_sem); + return 0; + +out: + /* This is safe: open() is still holding a reference. */ + module_put(THIS_MODULE); + return error; +} + +static int loop_clr_fd(struct lloop_device *lo, struct block_device *bdev, + int count) +{ + struct file *filp = lo->lo_backing_file; + gfp_t gfp = lo->old_gfp_mask; + + if (lo->lo_state != LLOOP_BOUND) + return -ENXIO; + + if (lo->lo_refcnt > count) /* we needed one fd for the ioctl */ + return -EBUSY; + + if (filp == NULL) + return -EINVAL; + + spin_lock_irq(&lo->lo_lock); + lo->lo_state = LLOOP_RUNDOWN; + spin_unlock_irq(&lo->lo_lock); + wake_up(&lo->lo_bh_wait); + + down(&lo->lo_sem); + lo->lo_backing_file = NULL; + lo->lo_device = NULL; + lo->lo_offset = 0; + lo->lo_sizelimit = 0; + lo->lo_flags = 0; + invalidate_bdev(bdev); + set_capacity(disks[lo->lo_number], 0); + bd_set_size(bdev, 0); + mapping_set_gfp_mask(filp->f_mapping, gfp); + lo->lo_state = LLOOP_UNBOUND; + fput(filp); + /* This is safe: open() is still holding a reference. */ + module_put(THIS_MODULE); + return 0; +} + +static int lo_open(struct block_device *bdev, fmode_t mode) +{ + struct lloop_device *lo = bdev->bd_disk->private_data; + + mutex_lock(&lo->lo_ctl_mutex); + lo->lo_refcnt++; + mutex_unlock(&lo->lo_ctl_mutex); + + return 0; +} + +static void lo_release(struct gendisk *disk, fmode_t mode) +{ + struct lloop_device *lo = disk->private_data; + + mutex_lock(&lo->lo_ctl_mutex); + --lo->lo_refcnt; + mutex_unlock(&lo->lo_ctl_mutex); +} + +/* lloop device node's ioctl function. 
*/ +static int lo_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + struct lloop_device *lo = bdev->bd_disk->private_data; + struct inode *inode = NULL; + int err = 0; + + mutex_lock(&lloop_mutex); + switch (cmd) { + case LL_IOC_LLOOP_DETACH: { + err = loop_clr_fd(lo, bdev, 2); + if (err == 0) + blkdev_put(bdev, 0); /* grabbed in LLOOP_ATTACH */ + break; + } + + case LL_IOC_LLOOP_INFO: { + struct lu_fid fid; + + if (lo->lo_backing_file == NULL) { + err = -ENOENT; + break; + } + if (inode == NULL) + inode = file_inode(lo->lo_backing_file); + if (lo->lo_state == LLOOP_BOUND) + fid = ll_i2info(inode)->lli_fid; + else + fid_zero(&fid); + + if (copy_to_user((struct lu_fid *)arg, &fid, sizeof(fid))) + err = -EFAULT; + break; + } + + default: + err = -EINVAL; + break; + } + mutex_unlock(&lloop_mutex); + + return err; +} + +static struct block_device_operations lo_fops = { + .owner = THIS_MODULE, + .open = lo_open, + .release = lo_release, + .ioctl = lo_ioctl, +}; + +/* dynamic iocontrol callback. + * This callback is registered in lloop_init and will be called by + * ll_iocontrol_call. + * + * This is a llite regular file ioctl function. It takes the responsibility + * of attaching or detaching a file by a lloop's device number. + */ +static enum llioc_iter lloop_ioctl(struct inode *unused, struct file *file, + unsigned int cmd, unsigned long arg, + void *magic, int *rcp) +{ + struct lloop_device *lo = NULL; + struct block_device *bdev = NULL; + int err = 0; + dev_t dev; + + if (magic != ll_iocontrol_magic) + return LLIOC_CONT; + + if (disks == NULL) { + err = -ENODEV; + goto out1; + } + + CWARN("Enter llop_ioctl\n"); + + mutex_lock(&lloop_mutex); + switch (cmd) { + case LL_IOC_LLOOP_ATTACH: { + struct lloop_device *lo_free = NULL; + int i; + + for (i = 0; i < max_loop; i++, lo = NULL) { + lo = &loop_dev[i]; + if (lo->lo_state == LLOOP_UNBOUND) { + if (!lo_free) + lo_free = lo; + continue; + } + if (file_inode(lo->lo_backing_file) == file_inode(file)) + break; + } + if (lo || !lo_free) { + err = -EBUSY; + goto out; + } + + lo = lo_free; + dev = MKDEV(lloop_major, lo->lo_number); + + /* quit if the used pointer is writable */ + if (put_user((long)old_encode_dev(dev), (long *)arg)) { + err = -EFAULT; + goto out; + } + + bdev = blkdev_get_by_dev(dev, file->f_mode, NULL); + if (IS_ERR(bdev)) { + err = PTR_ERR(bdev); + goto out; + } + + get_file(file); + err = loop_set_fd(lo, NULL, bdev, file); + if (err) { + fput(file); + blkdev_put(bdev, 0); + } + + break; + } + + case LL_IOC_LLOOP_DETACH_BYDEV: { + int minor; + + dev = old_decode_dev(arg); + if (MAJOR(dev) != lloop_major) { + err = -EINVAL; + goto out; + } + + minor = MINOR(dev); + if (minor > max_loop - 1) { + err = -EINVAL; + goto out; + } + + lo = &loop_dev[minor]; + if (lo->lo_state != LLOOP_BOUND) { + err = -EINVAL; + goto out; + } + + bdev = lo->lo_device; + err = loop_clr_fd(lo, bdev, 1); + if (err == 0) + blkdev_put(bdev, 0); /* grabbed in LLOOP_ATTACH */ + + break; + } + + default: + err = -EINVAL; + break; + } + +out: + mutex_unlock(&lloop_mutex); +out1: + if (rcp) + *rcp = err; + return LLIOC_STOP; +} + +static int __init lloop_init(void) +{ + int i; + unsigned int cmdlist[] = { + LL_IOC_LLOOP_ATTACH, + LL_IOC_LLOOP_DETACH_BYDEV, + }; + + if (max_loop < 1 || max_loop > 256) { + max_loop = MAX_LOOP_DEFAULT; + CWARN("lloop: invalid max_loop (must be between 1 and 256), using default (%u)\n", + max_loop); + } + + lloop_major = register_blkdev(0, "lloop"); + if (lloop_major < 0) + return -EIO; + + 
CDEBUG(D_CONFIG, "registered lloop major %d with %u minors\n", + lloop_major, max_loop); + + ll_iocontrol_magic = ll_iocontrol_register(lloop_ioctl, 2, cmdlist); + if (ll_iocontrol_magic == NULL) + goto out_mem1; + + loop_dev = kcalloc(max_loop, sizeof(*loop_dev), GFP_KERNEL); + if (!loop_dev) + goto out_mem1; + + disks = kcalloc(max_loop, sizeof(*disks), GFP_KERNEL); + if (!disks) + goto out_mem2; + + for (i = 0; i < max_loop; i++) { + disks[i] = alloc_disk(1); + if (!disks[i]) + goto out_mem3; + } + + mutex_init(&lloop_mutex); + + for (i = 0; i < max_loop; i++) { + struct lloop_device *lo = &loop_dev[i]; + struct gendisk *disk = disks[i]; + + lo->lo_queue = blk_alloc_queue(GFP_KERNEL); + if (!lo->lo_queue) + goto out_mem4; + + mutex_init(&lo->lo_ctl_mutex); + sema_init(&lo->lo_sem, 0); + init_waitqueue_head(&lo->lo_bh_wait); + lo->lo_number = i; + spin_lock_init(&lo->lo_lock); + disk->major = lloop_major; + disk->first_minor = i; + disk->fops = &lo_fops; + sprintf(disk->disk_name, "lloop%d", i); + disk->private_data = lo; + disk->queue = lo->lo_queue; + } + + /* We cannot fail after we call this, so another loop!*/ + for (i = 0; i < max_loop; i++) + add_disk(disks[i]); + return 0; + +out_mem4: + while (i--) + blk_cleanup_queue(loop_dev[i].lo_queue); + i = max_loop; +out_mem3: + while (i--) + put_disk(disks[i]); + OBD_FREE(disks, max_loop * sizeof(*disks)); +out_mem2: + OBD_FREE(loop_dev, max_loop * sizeof(*loop_dev)); +out_mem1: + unregister_blkdev(lloop_major, "lloop"); + ll_iocontrol_unregister(ll_iocontrol_magic); + CERROR("lloop: ran out of memory\n"); + return -ENOMEM; +} + +static void lloop_exit(void) +{ + int i; + + ll_iocontrol_unregister(ll_iocontrol_magic); + for (i = 0; i < max_loop; i++) { + del_gendisk(disks[i]); + blk_cleanup_queue(loop_dev[i].lo_queue); + put_disk(disks[i]); + } + + unregister_blkdev(lloop_major, "lloop"); + + OBD_FREE(disks, max_loop * sizeof(*disks)); + OBD_FREE(loop_dev, max_loop * sizeof(*loop_dev)); +} + +module_init(lloop_init); +module_exit(lloop_exit); + +module_param(max_loop, int, 0444); +MODULE_PARM_DESC(max_loop, "maximum of lloop_device"); +MODULE_AUTHOR("Sun Microsystems, Inc. "); +MODULE_DESCRIPTION("Lustre virtual block device"); +MODULE_LICENSE("GPL"); diff --git a/kernel/drivers/staging/lustre/lustre/llite/lproc_llite.c b/kernel/drivers/staging/lustre/lustre/llite/lproc_llite.c new file mode 100644 index 000000000..83a9b8547 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/lproc_llite.c @@ -0,0 +1,1536 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. 
+ * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +#define DEBUG_SUBSYSTEM S_LLITE + +#include "../include/lustre_lite.h" +#include "../include/lprocfs_status.h" +#include +#include "../include/obd_support.h" + +#include "llite_internal.h" +#include "vvp_internal.h" + +/* /proc/lustre/llite mount point registration */ +static struct file_operations ll_rw_extents_stats_fops; +static struct file_operations ll_rw_extents_stats_pp_fops; +static struct file_operations ll_rw_offset_stats_fops; + +static int ll_blksize_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = (struct super_block *)m->private; + struct obd_statfs osfs; + int rc; + + LASSERT(sb != NULL); + rc = ll_statfs_internal(sb, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) + seq_printf(m, "%u\n", osfs.os_bsize); + + return rc; +} +LPROC_SEQ_FOPS_RO(ll_blksize); + +static int ll_kbytestotal_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = (struct super_block *)m->private; + struct obd_statfs osfs; + int rc; + + LASSERT(sb != NULL); + rc = ll_statfs_internal(sb, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_blocks; + + while (blk_size >>= 1) + result <<= 1; + + seq_printf(m, "%llu\n", result); + } + + return rc; +} +LPROC_SEQ_FOPS_RO(ll_kbytestotal); + +static int ll_kbytesfree_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = (struct super_block *)m->private; + struct obd_statfs osfs; + int rc; + + LASSERT(sb != NULL); + rc = ll_statfs_internal(sb, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_bfree; + + while (blk_size >>= 1) + result <<= 1; + + seq_printf(m, "%llu\n", result); + } + + return rc; +} +LPROC_SEQ_FOPS_RO(ll_kbytesfree); + +static int ll_kbytesavail_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = (struct super_block *)m->private; + struct obd_statfs osfs; + int rc; + + LASSERT(sb != NULL); + rc = ll_statfs_internal(sb, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_bavail; + + while (blk_size >>= 1) + result <<= 1; + + seq_printf(m, "%llu\n", result); + } + + return rc; +} +LPROC_SEQ_FOPS_RO(ll_kbytesavail); + +static int ll_filestotal_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = (struct super_block *)m->private; + struct obd_statfs osfs; + int rc; + + LASSERT(sb != NULL); + rc = ll_statfs_internal(sb, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) + seq_printf(m, "%llu\n", osfs.os_files); + + return rc; +} +LPROC_SEQ_FOPS_RO(ll_filestotal); + +static int ll_filesfree_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = (struct super_block *)m->private; + struct obd_statfs osfs; + int rc; + + LASSERT(sb != NULL); + rc = ll_statfs_internal(sb, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) + seq_printf(m, "%llu\n", osfs.os_ffree); + + return rc; +} +LPROC_SEQ_FOPS_RO(ll_filesfree); + +static 
int ll_client_type_seq_show(struct seq_file *m, void *v) +{ + struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)m->private); + + LASSERT(sbi != NULL); + + if (sbi->ll_flags & LL_SBI_RMT_CLIENT) + seq_puts(m, "remote client\n"); + else + seq_puts(m, "local client\n"); + + return 0; +} +LPROC_SEQ_FOPS_RO(ll_client_type); + +static int ll_fstype_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = (struct super_block *)m->private; + + LASSERT(sb != NULL); + seq_printf(m, "%s\n", sb->s_type->name); + return 0; +} +LPROC_SEQ_FOPS_RO(ll_fstype); + +static int ll_sb_uuid_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = (struct super_block *)m->private; + + LASSERT(sb != NULL); + seq_printf(m, "%s\n", ll_s2sbi(sb)->ll_sb_uuid.uuid); + return 0; +} +LPROC_SEQ_FOPS_RO(ll_sb_uuid); + +static int ll_site_stats_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + + /* + * See description of statistical counters in struct cl_site, and + * struct lu_site. + */ + return cl_site_stats_print(lu2cl_site(ll_s2sbi(sb)->ll_site), m); +} +LPROC_SEQ_FOPS_RO(ll_site_stats); + +static int ll_max_readahead_mb_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + long pages_number; + int mult; + + spin_lock(&sbi->ll_lock); + pages_number = sbi->ll_ra_info.ra_max_pages; + spin_unlock(&sbi->ll_lock); + + mult = 1 << (20 - PAGE_CACHE_SHIFT); + return lprocfs_seq_read_frac_helper(m, pages_number, mult); +} + +static ssize_t ll_max_readahead_mb_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct super_block *sb = ((struct seq_file *)file->private_data)->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + int mult, rc, pages_number; + + mult = 1 << (20 - PAGE_CACHE_SHIFT); + rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult); + if (rc) + return rc; + + if (pages_number < 0 || pages_number > totalram_pages / 2) { + CERROR("can't set file readahead more than %lu MB\n", + totalram_pages >> (20 - PAGE_CACHE_SHIFT + 1)); /*1/2 of RAM*/ + return -ERANGE; + } + + spin_lock(&sbi->ll_lock); + sbi->ll_ra_info.ra_max_pages = pages_number; + spin_unlock(&sbi->ll_lock); + + return count; +} +LPROC_SEQ_FOPS(ll_max_readahead_mb); + +static int ll_max_readahead_per_file_mb_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + long pages_number; + int mult; + + spin_lock(&sbi->ll_lock); + pages_number = sbi->ll_ra_info.ra_max_pages_per_file; + spin_unlock(&sbi->ll_lock); + + mult = 1 << (20 - PAGE_CACHE_SHIFT); + return lprocfs_seq_read_frac_helper(m, pages_number, mult); +} + +static ssize_t ll_max_readahead_per_file_mb_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct super_block *sb = ((struct seq_file *)file->private_data)->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + int mult, rc, pages_number; + + mult = 1 << (20 - PAGE_CACHE_SHIFT); + rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult); + if (rc) + return rc; + + if (pages_number < 0 || + pages_number > sbi->ll_ra_info.ra_max_pages) { + CERROR("can't set file readahead more than max_read_ahead_mb %lu MB\n", + sbi->ll_ra_info.ra_max_pages); + return -ERANGE; + } + + spin_lock(&sbi->ll_lock); + sbi->ll_ra_info.ra_max_pages_per_file = pages_number; + spin_unlock(&sbi->ll_lock); + + return count; +} +LPROC_SEQ_FOPS(ll_max_readahead_per_file_mb); + 
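The read-ahead tunables above accept megabyte values from user space but store page counts internally, converting with a shift of (20 - PAGE_CACHE_SHIFT) via lprocfs_write_frac_helper / lprocfs_seq_read_frac_helper. The following is a minimal user-space sketch of just that unit conversion, not part of the patch; it assumes a 4 KiB page size for illustration, whereas the driver derives the shift from PAGE_CACHE_SHIFT, so the value may differ on other configurations.

#include <stdio.h>

/*
 * Sketch of the MB <-> pages conversion used by the llite read-ahead
 * tunables.  SKETCH_PAGE_SHIFT is an assumption (4 KiB pages); the
 * kernel code uses PAGE_CACHE_SHIFT instead.
 */
#define SKETCH_PAGE_SHIFT 12
#define MB_SHIFT          20

static unsigned long mb_to_pages(unsigned long mb)
{
	/* 1 MB corresponds to 1 << (20 - page shift) pages */
	return mb << (MB_SHIFT - SKETCH_PAGE_SHIFT);
}

static unsigned long pages_to_mb(unsigned long pages)
{
	return pages >> (MB_SHIFT - SKETCH_PAGE_SHIFT);
}

int main(void)
{
	unsigned long mb = 40;	/* e.g. echo 40 > max_read_ahead_mb */
	unsigned long pages = mb_to_pages(mb);

	printf("%lu MB == %lu pages, back to %lu MB\n",
	       mb, pages, pages_to_mb(pages));
	return 0;
}

With the assumed 4 KiB pages the shift is 8, so a value of 40 MB written to max_read_ahead_mb corresponds to 10240 pages stored in ra_max_pages.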
+static int ll_max_read_ahead_whole_mb_seq_show(struct seq_file *m, void *unused) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + long pages_number; + int mult; + + spin_lock(&sbi->ll_lock); + pages_number = sbi->ll_ra_info.ra_max_read_ahead_whole_pages; + spin_unlock(&sbi->ll_lock); + + mult = 1 << (20 - PAGE_CACHE_SHIFT); + return lprocfs_seq_read_frac_helper(m, pages_number, mult); +} + +static ssize_t ll_max_read_ahead_whole_mb_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct super_block *sb = ((struct seq_file *)file->private_data)->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + int mult, rc, pages_number; + + mult = 1 << (20 - PAGE_CACHE_SHIFT); + rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult); + if (rc) + return rc; + + /* Cap this at the current max readahead window size, the readahead + * algorithm does this anyway so it's pointless to set it larger. */ + if (pages_number < 0 || + pages_number > sbi->ll_ra_info.ra_max_pages_per_file) { + CERROR("can't set max_read_ahead_whole_mb more than max_read_ahead_per_file_mb: %lu\n", + sbi->ll_ra_info.ra_max_pages_per_file >> (20 - PAGE_CACHE_SHIFT)); + return -ERANGE; + } + + spin_lock(&sbi->ll_lock); + sbi->ll_ra_info.ra_max_read_ahead_whole_pages = pages_number; + spin_unlock(&sbi->ll_lock); + + return count; +} +LPROC_SEQ_FOPS(ll_max_read_ahead_whole_mb); + +static int ll_max_cached_mb_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct cl_client_cache *cache = &sbi->ll_cache; + int shift = 20 - PAGE_CACHE_SHIFT; + int max_cached_mb; + int unused_mb; + + max_cached_mb = cache->ccc_lru_max >> shift; + unused_mb = atomic_read(&cache->ccc_lru_left) >> shift; + seq_printf(m, + "users: %d\n" + "max_cached_mb: %d\n" + "used_mb: %d\n" + "unused_mb: %d\n" + "reclaim_count: %u\n", + atomic_read(&cache->ccc_users), + max_cached_mb, + max_cached_mb - unused_mb, + unused_mb, + cache->ccc_lru_shrinkers); + return 0; +} + +static ssize_t ll_max_cached_mb_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct super_block *sb = ((struct seq_file *)file->private_data)->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct cl_client_cache *cache = &sbi->ll_cache; + int mult, rc, pages_number; + int diff = 0; + int nrpages = 0; + char kernbuf[128]; + + if (count >= sizeof(kernbuf)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + kernbuf[count] = 0; + + mult = 1 << (20 - PAGE_CACHE_SHIFT); + buffer += lprocfs_find_named_value(kernbuf, "max_cached_mb:", &count) - + kernbuf; + rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult); + if (rc) + return rc; + + if (pages_number < 0 || pages_number > totalram_pages) { + CERROR("%s: can't set max cache more than %lu MB\n", + ll_get_fsname(sb, NULL, 0), + totalram_pages >> (20 - PAGE_CACHE_SHIFT)); + return -ERANGE; + } + + spin_lock(&sbi->ll_lock); + diff = pages_number - cache->ccc_lru_max; + spin_unlock(&sbi->ll_lock); + + /* easy - add more LRU slots. */ + if (diff >= 0) { + atomic_add(diff, &cache->ccc_lru_left); + rc = 0; + goto out; + } + + diff = -diff; + while (diff > 0) { + int tmp; + + /* reduce LRU budget from free slots. */ + do { + int ov, nv; + + ov = atomic_read(&cache->ccc_lru_left); + if (ov == 0) + break; + + nv = ov > diff ? 
ov - diff : 0; + rc = atomic_cmpxchg(&cache->ccc_lru_left, ov, nv); + if (likely(ov == rc)) { + diff -= ov - nv; + nrpages += ov - nv; + break; + } + } while (1); + + if (diff <= 0) + break; + + if (sbi->ll_dt_exp == NULL) { /* being initialized */ + rc = -ENODEV; + break; + } + + /* difficult - have to ask OSCs to drop LRU slots. */ + tmp = diff << 1; + rc = obd_set_info_async(NULL, sbi->ll_dt_exp, + sizeof(KEY_CACHE_LRU_SHRINK), + KEY_CACHE_LRU_SHRINK, + sizeof(tmp), &tmp, NULL); + if (rc < 0) + break; + } + +out: + if (rc >= 0) { + spin_lock(&sbi->ll_lock); + cache->ccc_lru_max = pages_number; + spin_unlock(&sbi->ll_lock); + rc = count; + } else { + atomic_add(nrpages, &cache->ccc_lru_left); + } + return rc; +} +LPROC_SEQ_FOPS(ll_max_cached_mb); + +static int ll_checksum_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + + seq_printf(m, "%u\n", (sbi->ll_flags & LL_SBI_CHECKSUM) ? 1 : 0); + return 0; +} + +static ssize_t ll_checksum_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct super_block *sb = ((struct seq_file *)file->private_data)->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + int val, rc; + + if (!sbi->ll_dt_exp) + /* Not set up yet */ + return -EAGAIN; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + if (val) + sbi->ll_flags |= LL_SBI_CHECKSUM; + else + sbi->ll_flags &= ~LL_SBI_CHECKSUM; + + rc = obd_set_info_async(NULL, sbi->ll_dt_exp, sizeof(KEY_CHECKSUM), + KEY_CHECKSUM, sizeof(val), &val, NULL); + if (rc) + CWARN("Failed to set OSC checksum flags: %d\n", rc); + + return count; +} +LPROC_SEQ_FOPS(ll_checksum); + +static int ll_max_rw_chunk_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + + seq_printf(m, "%lu\n", ll_s2sbi(sb)->ll_max_rw_chunk); + return 0; +} + +static ssize_t ll_max_rw_chunk_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct super_block *sb = ((struct seq_file *)file->private_data)->private; + int rc, val; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + ll_s2sbi(sb)->ll_max_rw_chunk = val; + return count; +} +LPROC_SEQ_FOPS(ll_max_rw_chunk); + +static int ll_rd_track_id(struct seq_file *m, enum stats_track_type type) +{ + struct super_block *sb = m->private; + + if (ll_s2sbi(sb)->ll_stats_track_type == type) + seq_printf(m, "%d\n", ll_s2sbi(sb)->ll_stats_track_id); + else if (ll_s2sbi(sb)->ll_stats_track_type == STATS_TRACK_ALL) + seq_puts(m, "0 (all)\n"); + else + seq_puts(m, "untracked\n"); + + return 0; +} + +static int ll_wr_track_id(const char __user *buffer, unsigned long count, + void *data, enum stats_track_type type) +{ + struct super_block *sb = data; + int rc, pid; + + rc = lprocfs_write_helper(buffer, count, &pid); + if (rc) + return rc; + ll_s2sbi(sb)->ll_stats_track_id = pid; + if (pid == 0) + ll_s2sbi(sb)->ll_stats_track_type = STATS_TRACK_ALL; + else + ll_s2sbi(sb)->ll_stats_track_type = type; + lprocfs_clear_stats(ll_s2sbi(sb)->ll_stats); + return count; +} + +static int ll_track_pid_seq_show(struct seq_file *m, void *v) +{ + return ll_rd_track_id(m, STATS_TRACK_PID); +} + +static ssize_t ll_track_pid_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *seq = file->private_data; + return ll_wr_track_id(buffer, count, seq->private, STATS_TRACK_PID); +} +LPROC_SEQ_FOPS(ll_track_pid); + +static int ll_track_ppid_seq_show(struct 
seq_file *m, void *v) +{ + return ll_rd_track_id(m, STATS_TRACK_PPID); +} + +static ssize_t ll_track_ppid_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *seq = file->private_data; + return ll_wr_track_id(buffer, count, seq->private, STATS_TRACK_PPID); +} +LPROC_SEQ_FOPS(ll_track_ppid); + +static int ll_track_gid_seq_show(struct seq_file *m, void *v) +{ + return ll_rd_track_id(m, STATS_TRACK_GID); +} + +static ssize_t ll_track_gid_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *seq = file->private_data; + return ll_wr_track_id(buffer, count, seq->private, STATS_TRACK_GID); +} +LPROC_SEQ_FOPS(ll_track_gid); + +static int ll_statahead_max_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + + seq_printf(m, "%u\n", sbi->ll_sa_max); + return 0; +} + +static ssize_t ll_statahead_max_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct super_block *sb = ((struct seq_file *)file->private_data)->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + int val, rc; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + if (val >= 0 && val <= LL_SA_RPC_MAX) + sbi->ll_sa_max = val; + else + CERROR("Bad statahead_max value %d. Valid values are in the range [0, %d]\n", + val, LL_SA_RPC_MAX); + + return count; +} +LPROC_SEQ_FOPS(ll_statahead_max); + +static int ll_statahead_agl_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + + seq_printf(m, "%u\n", sbi->ll_flags & LL_SBI_AGL_ENABLED ? 1 : 0); + return 0; +} + +static ssize_t ll_statahead_agl_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct super_block *sb = ((struct seq_file *)file->private_data)->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + int val, rc; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + if (val) + sbi->ll_flags |= LL_SBI_AGL_ENABLED; + else + sbi->ll_flags &= ~LL_SBI_AGL_ENABLED; + + return count; +} +LPROC_SEQ_FOPS(ll_statahead_agl); + +static int ll_statahead_stats_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + + seq_printf(m, + "statahead total: %u\n" + "statahead wrong: %u\n" + "agl total: %u\n", + atomic_read(&sbi->ll_sa_total), + atomic_read(&sbi->ll_sa_wrong), + atomic_read(&sbi->ll_agl_total)); + return 0; +} +LPROC_SEQ_FOPS_RO(ll_statahead_stats); + +static int ll_lazystatfs_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + + seq_printf(m, "%u\n", sbi->ll_flags & LL_SBI_LAZYSTATFS ? 
1 : 0); + return 0; +} + +static ssize_t ll_lazystatfs_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct super_block *sb = ((struct seq_file *)file->private_data)->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + int val, rc; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + if (val) + sbi->ll_flags |= LL_SBI_LAZYSTATFS; + else + sbi->ll_flags &= ~LL_SBI_LAZYSTATFS; + + return count; +} +LPROC_SEQ_FOPS(ll_lazystatfs); + +static int ll_max_easize_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + unsigned int ealen; + int rc; + + rc = ll_get_max_mdsize(sbi, &ealen); + if (rc) + return rc; + + seq_printf(m, "%u\n", ealen); + return 0; +} +LPROC_SEQ_FOPS_RO(ll_max_easize); + +static int ll_default_easize_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + unsigned int ealen; + int rc; + + rc = ll_get_default_mdsize(sbi, &ealen); + if (rc) + return rc; + + seq_printf(m, "%u\n", ealen); + return 0; +} +LPROC_SEQ_FOPS_RO(ll_default_easize); + +static int ll_max_cookiesize_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + unsigned int cookielen; + int rc; + + rc = ll_get_max_cookiesize(sbi, &cookielen); + if (rc) + return rc; + + seq_printf(m, "%u\n", cookielen); + return 0; +} +LPROC_SEQ_FOPS_RO(ll_max_cookiesize); + +static int ll_default_cookiesize_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + unsigned int cookielen; + int rc; + + rc = ll_get_default_cookiesize(sbi, &cookielen); + if (rc) + return rc; + + seq_printf(m, "%u\n", cookielen); + return 0; +} +LPROC_SEQ_FOPS_RO(ll_default_cookiesize); + +static int ll_sbi_flags_seq_show(struct seq_file *m, void *v) +{ + const char *str[] = LL_SBI_FLAGS; + struct super_block *sb = m->private; + int flags = ll_s2sbi(sb)->ll_flags; + int i = 0; + + while (flags != 0) { + if (ARRAY_SIZE(str) <= i) { + CERROR("%s: Revise array LL_SBI_FLAGS to match sbi flags please.\n", + ll_get_fsname(sb, NULL, 0)); + return -EINVAL; + } + + if (flags & 0x1) + seq_printf(m, "%s ", str[i]); + flags >>= 1; + ++i; + } + seq_printf(m, "\b\n"); + return 0; +} +LPROC_SEQ_FOPS_RO(ll_sbi_flags); + +static int ll_xattr_cache_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + + seq_printf(m, "%u\n", sbi->ll_xattr_cache_enabled); + + return 0; +} + +static ssize_t ll_xattr_cache_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct super_block *sb = seq->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + int val, rc; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + if (val != 0 && val != 1) + return -ERANGE; + + if (val == 1 && !(sbi->ll_flags & LL_SBI_XATTR_CACHE)) + return -ENOTSUPP; + + sbi->ll_xattr_cache_enabled = val; + + return count; +} +LPROC_SEQ_FOPS(ll_xattr_cache); + +static struct lprocfs_vars lprocfs_llite_obd_vars[] = { + { "uuid", &ll_sb_uuid_fops, NULL, 0 }, + /* { "mntpt_path", ll_rd_path, 0, 0 }, */ + { "fstype", &ll_fstype_fops, NULL, 0 }, + { "site", &ll_site_stats_fops, NULL, 0 }, + { "blocksize", &ll_blksize_fops, NULL, 0 }, + { "kbytestotal", &ll_kbytestotal_fops, NULL, 0 }, + { "kbytesfree", 
&ll_kbytesfree_fops, NULL, 0 }, + { "kbytesavail", &ll_kbytesavail_fops, NULL, 0 }, + { "filestotal", &ll_filestotal_fops, NULL, 0 }, + { "filesfree", &ll_filesfree_fops, NULL, 0 }, + { "client_type", &ll_client_type_fops, NULL, 0 }, + /* { "filegroups", lprocfs_rd_filegroups, 0, 0 }, */ + { "max_read_ahead_mb", &ll_max_readahead_mb_fops, NULL }, + { "max_read_ahead_per_file_mb", &ll_max_readahead_per_file_mb_fops, + NULL }, + { "max_read_ahead_whole_mb", &ll_max_read_ahead_whole_mb_fops, NULL }, + { "max_cached_mb", &ll_max_cached_mb_fops, NULL }, + { "checksum_pages", &ll_checksum_fops, NULL }, + { "max_rw_chunk", &ll_max_rw_chunk_fops, NULL }, + { "stats_track_pid", &ll_track_pid_fops, NULL }, + { "stats_track_ppid", &ll_track_ppid_fops, NULL }, + { "stats_track_gid", &ll_track_gid_fops, NULL }, + { "statahead_max", &ll_statahead_max_fops, NULL }, + { "statahead_agl", &ll_statahead_agl_fops, NULL }, + { "statahead_stats", &ll_statahead_stats_fops, NULL, 0 }, + { "lazystatfs", &ll_lazystatfs_fops, NULL }, + { "max_easize", &ll_max_easize_fops, NULL, 0 }, + { "default_easize", &ll_default_easize_fops, NULL, 0 }, + { "max_cookiesize", &ll_max_cookiesize_fops, NULL, 0 }, + { "default_cookiesize", &ll_default_cookiesize_fops, NULL, 0 }, + { "sbi_flags", &ll_sbi_flags_fops, NULL, 0 }, + { "xattr_cache", &ll_xattr_cache_fops, NULL, 0 }, + { NULL } +}; + +#define MAX_STRING_SIZE 128 + +static const struct llite_file_opcode { + __u32 opcode; + __u32 type; + const char *opname; +} llite_opcode_table[LPROC_LL_FILE_OPCODES] = { + /* file operation */ + { LPROC_LL_DIRTY_HITS, LPROCFS_TYPE_REGS, "dirty_pages_hits" }, + { LPROC_LL_DIRTY_MISSES, LPROCFS_TYPE_REGS, "dirty_pages_misses" }, + { LPROC_LL_READ_BYTES, LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_BYTES, + "read_bytes" }, + { LPROC_LL_WRITE_BYTES, LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_BYTES, + "write_bytes" }, + { LPROC_LL_BRW_READ, LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_PAGES, + "brw_read" }, + { LPROC_LL_BRW_WRITE, LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_PAGES, + "brw_write" }, + { LPROC_LL_OSC_READ, LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_BYTES, + "osc_read" }, + { LPROC_LL_OSC_WRITE, LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_BYTES, + "osc_write" }, + { LPROC_LL_IOCTL, LPROCFS_TYPE_REGS, "ioctl" }, + { LPROC_LL_OPEN, LPROCFS_TYPE_REGS, "open" }, + { LPROC_LL_RELEASE, LPROCFS_TYPE_REGS, "close" }, + { LPROC_LL_MAP, LPROCFS_TYPE_REGS, "mmap" }, + { LPROC_LL_LLSEEK, LPROCFS_TYPE_REGS, "seek" }, + { LPROC_LL_FSYNC, LPROCFS_TYPE_REGS, "fsync" }, + { LPROC_LL_READDIR, LPROCFS_TYPE_REGS, "readdir" }, + /* inode operation */ + { LPROC_LL_SETATTR, LPROCFS_TYPE_REGS, "setattr" }, + { LPROC_LL_TRUNC, LPROCFS_TYPE_REGS, "truncate" }, + { LPROC_LL_FLOCK, LPROCFS_TYPE_REGS, "flock" }, + { LPROC_LL_GETATTR, LPROCFS_TYPE_REGS, "getattr" }, + /* dir inode operation */ + { LPROC_LL_CREATE, LPROCFS_TYPE_REGS, "create" }, + { LPROC_LL_LINK, LPROCFS_TYPE_REGS, "link" }, + { LPROC_LL_UNLINK, LPROCFS_TYPE_REGS, "unlink" }, + { LPROC_LL_SYMLINK, LPROCFS_TYPE_REGS, "symlink" }, + { LPROC_LL_MKDIR, LPROCFS_TYPE_REGS, "mkdir" }, + { LPROC_LL_RMDIR, LPROCFS_TYPE_REGS, "rmdir" }, + { LPROC_LL_MKNOD, LPROCFS_TYPE_REGS, "mknod" }, + { LPROC_LL_RENAME, LPROCFS_TYPE_REGS, "rename" }, + /* special inode operation */ + { LPROC_LL_STAFS, LPROCFS_TYPE_REGS, "statfs" }, + { LPROC_LL_ALLOC_INODE, LPROCFS_TYPE_REGS, "alloc_inode" }, + { LPROC_LL_SETXATTR, LPROCFS_TYPE_REGS, "setxattr" }, + { LPROC_LL_GETXATTR, LPROCFS_TYPE_REGS, "getxattr" }, + { LPROC_LL_GETXATTR_HITS, LPROCFS_TYPE_REGS, "getxattr_hits" }, 
+ { LPROC_LL_LISTXATTR, LPROCFS_TYPE_REGS, "listxattr" }, + { LPROC_LL_REMOVEXATTR, LPROCFS_TYPE_REGS, "removexattr" }, + { LPROC_LL_INODE_PERM, LPROCFS_TYPE_REGS, "inode_permission" }, +}; + +void ll_stats_ops_tally(struct ll_sb_info *sbi, int op, int count) +{ + if (!sbi->ll_stats) + return; + if (sbi->ll_stats_track_type == STATS_TRACK_ALL) + lprocfs_counter_add(sbi->ll_stats, op, count); + else if (sbi->ll_stats_track_type == STATS_TRACK_PID && + sbi->ll_stats_track_id == current->pid) + lprocfs_counter_add(sbi->ll_stats, op, count); + else if (sbi->ll_stats_track_type == STATS_TRACK_PPID && + sbi->ll_stats_track_id == current->real_parent->pid) + lprocfs_counter_add(sbi->ll_stats, op, count); + else if (sbi->ll_stats_track_type == STATS_TRACK_GID && + sbi->ll_stats_track_id == + from_kgid(&init_user_ns, current_gid())) + lprocfs_counter_add(sbi->ll_stats, op, count); +} +EXPORT_SYMBOL(ll_stats_ops_tally); + +static const char *ra_stat_string[] = { + [RA_STAT_HIT] = "hits", + [RA_STAT_MISS] = "misses", + [RA_STAT_DISTANT_READPAGE] = "readpage not consecutive", + [RA_STAT_MISS_IN_WINDOW] = "miss inside window", + [RA_STAT_FAILED_GRAB_PAGE] = "failed grab_cache_page", + [RA_STAT_FAILED_MATCH] = "failed lock match", + [RA_STAT_DISCARDED] = "read but discarded", + [RA_STAT_ZERO_LEN] = "zero length file", + [RA_STAT_ZERO_WINDOW] = "zero size window", + [RA_STAT_EOF] = "read-ahead to EOF", + [RA_STAT_MAX_IN_FLIGHT] = "hit max r-a issue", + [RA_STAT_WRONG_GRAB_PAGE] = "wrong page from grab_cache_page", +}; + +LPROC_SEQ_FOPS_RO_TYPE(llite, name); +LPROC_SEQ_FOPS_RO_TYPE(llite, uuid); + +int lprocfs_register_mountpoint(struct proc_dir_entry *parent, + struct super_block *sb, char *osc, char *mdc) +{ + struct lprocfs_vars lvars[2]; + struct lustre_sb_info *lsi = s2lsi(sb); + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct obd_device *obd; + struct proc_dir_entry *dir; + char name[MAX_STRING_SIZE + 1], *ptr; + int err, id, len, rc; + + memset(lvars, 0, sizeof(lvars)); + + name[MAX_STRING_SIZE] = '\0'; + lvars[0].name = name; + + LASSERT(sbi != NULL); + LASSERT(mdc != NULL); + LASSERT(osc != NULL); + + /* Get fsname */ + len = strlen(lsi->lsi_lmd->lmd_profile); + ptr = strrchr(lsi->lsi_lmd->lmd_profile, '-'); + if (ptr && (strcmp(ptr, "-client") == 0)) + len -= 7; + + /* Mount info */ + snprintf(name, MAX_STRING_SIZE, "%.*s-%p", len, + lsi->lsi_lmd->lmd_profile, sb); + + sbi->ll_proc_root = lprocfs_register(name, parent, NULL, NULL); + if (IS_ERR(sbi->ll_proc_root)) { + err = PTR_ERR(sbi->ll_proc_root); + sbi->ll_proc_root = NULL; + return err; + } + + rc = lprocfs_seq_create(sbi->ll_proc_root, "dump_page_cache", 0444, + &vvp_dump_pgcache_file_ops, sbi); + if (rc) + CWARN("Error adding the dump_page_cache file\n"); + + rc = lprocfs_seq_create(sbi->ll_proc_root, "extents_stats", 0644, + &ll_rw_extents_stats_fops, sbi); + if (rc) + CWARN("Error adding the extent_stats file\n"); + + rc = lprocfs_seq_create(sbi->ll_proc_root, "extents_stats_per_process", + 0644, &ll_rw_extents_stats_pp_fops, sbi); + if (rc) + CWARN("Error adding the extents_stats_per_process file\n"); + + rc = lprocfs_seq_create(sbi->ll_proc_root, "offset_stats", 0644, + &ll_rw_offset_stats_fops, sbi); + if (rc) + CWARN("Error adding the offset_stats file\n"); + + /* File operations stats */ + sbi->ll_stats = lprocfs_alloc_stats(LPROC_LL_FILE_OPCODES, + LPROCFS_STATS_FLAG_NONE); + if (sbi->ll_stats == NULL) { + err = -ENOMEM; + goto out; + } + /* do counter init */ + for (id = 0; id < LPROC_LL_FILE_OPCODES; id++) { + __u32 type = 
llite_opcode_table[id].type; + void *ptr = NULL; + if (type & LPROCFS_TYPE_REGS) + ptr = "regs"; + else if (type & LPROCFS_TYPE_BYTES) + ptr = "bytes"; + else if (type & LPROCFS_TYPE_PAGES) + ptr = "pages"; + lprocfs_counter_init(sbi->ll_stats, + llite_opcode_table[id].opcode, + (type & LPROCFS_CNTR_AVGMINMAX), + llite_opcode_table[id].opname, ptr); + } + err = lprocfs_register_stats(sbi->ll_proc_root, "stats", sbi->ll_stats); + if (err) + goto out; + + sbi->ll_ra_stats = lprocfs_alloc_stats(ARRAY_SIZE(ra_stat_string), + LPROCFS_STATS_FLAG_NONE); + if (sbi->ll_ra_stats == NULL) { + err = -ENOMEM; + goto out; + } + + for (id = 0; id < ARRAY_SIZE(ra_stat_string); id++) + lprocfs_counter_init(sbi->ll_ra_stats, id, 0, + ra_stat_string[id], "pages"); + err = lprocfs_register_stats(sbi->ll_proc_root, "read_ahead_stats", + sbi->ll_ra_stats); + if (err) + goto out; + + + err = lprocfs_add_vars(sbi->ll_proc_root, lprocfs_llite_obd_vars, sb); + if (err) + goto out; + + /* MDC info */ + obd = class_name2obd(mdc); + + LASSERT(obd != NULL); + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + LASSERT(obd->obd_type->typ_name != NULL); + + dir = proc_mkdir(obd->obd_type->typ_name, sbi->ll_proc_root); + if (dir == NULL) { + err = -ENOMEM; + goto out; + } + + snprintf(name, MAX_STRING_SIZE, "common_name"); + lvars[0].fops = &llite_name_fops; + err = lprocfs_add_vars(dir, lvars, obd); + if (err) + goto out; + + snprintf(name, MAX_STRING_SIZE, "uuid"); + lvars[0].fops = &llite_uuid_fops; + err = lprocfs_add_vars(dir, lvars, obd); + if (err) + goto out; + + /* OSC */ + obd = class_name2obd(osc); + + LASSERT(obd != NULL); + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + LASSERT(obd->obd_type->typ_name != NULL); + + dir = proc_mkdir(obd->obd_type->typ_name, sbi->ll_proc_root); + if (dir == NULL) { + err = -ENOMEM; + goto out; + } + + snprintf(name, MAX_STRING_SIZE, "common_name"); + lvars[0].fops = &llite_name_fops; + err = lprocfs_add_vars(dir, lvars, obd); + if (err) + goto out; + + snprintf(name, MAX_STRING_SIZE, "uuid"); + lvars[0].fops = &llite_uuid_fops; + err = lprocfs_add_vars(dir, lvars, obd); +out: + if (err) { + lprocfs_remove(&sbi->ll_proc_root); + lprocfs_free_stats(&sbi->ll_ra_stats); + lprocfs_free_stats(&sbi->ll_stats); + } + return err; +} + +void lprocfs_unregister_mountpoint(struct ll_sb_info *sbi) +{ + if (sbi->ll_proc_root) { + lprocfs_remove(&sbi->ll_proc_root); + lprocfs_free_stats(&sbi->ll_ra_stats); + lprocfs_free_stats(&sbi->ll_stats); + } +} +#undef MAX_STRING_SIZE + +#define pct(a, b) (b ? a * 100 / b : 0) + +static void ll_display_extents_info(struct ll_rw_extents_info *io_extents, + struct seq_file *seq, int which) +{ + unsigned long read_tot = 0, write_tot = 0, read_cum, write_cum; + unsigned long start, end, r, w; + char *unitp = "KMGTPEZY"; + int i, units = 10; + struct per_process_info *pp_info = &io_extents->pp_extents[which]; + + read_cum = 0; + write_cum = 0; + start = 0; + + for (i = 0; i < LL_HIST_MAX; i++) { + read_tot += pp_info->pp_r_hist.oh_buckets[i]; + write_tot += pp_info->pp_w_hist.oh_buckets[i]; + } + + for (i = 0; i < LL_HIST_MAX; i++) { + r = pp_info->pp_r_hist.oh_buckets[i]; + w = pp_info->pp_w_hist.oh_buckets[i]; + read_cum += r; + write_cum += w; + end = 1 << (i + LL_HIST_START - units); + seq_printf(seq, "%4lu%c - %4lu%c%c: %14lu %4lu %4lu | %14lu %4lu %4lu\n", + start, *unitp, end, *unitp, + (i == LL_HIST_MAX - 1) ? 
'+' : ' ', + r, pct(r, read_tot), pct(read_cum, read_tot), + w, pct(w, write_tot), pct(write_cum, write_tot)); + start = end; + if (start == 1<<10) { + start = 1; + units += 10; + unitp++; + } + if (read_cum == read_tot && write_cum == write_tot) + break; + } +} + +static int ll_rw_extents_stats_pp_seq_show(struct seq_file *seq, void *v) +{ + struct timeval now; + struct ll_sb_info *sbi = seq->private; + struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info; + int k; + + do_gettimeofday(&now); + + if (!sbi->ll_rw_stats_on) { + seq_printf(seq, "disabled\n" + "write anything in this file to activate, then 0 or \"[D/d]isabled\" to deactivate\n"); + return 0; + } + seq_printf(seq, "snapshot_time: %lu.%lu (secs.usecs)\n", + now.tv_sec, (unsigned long)now.tv_usec); + seq_printf(seq, "%15s %19s | %20s\n", " ", "read", "write"); + seq_printf(seq, "%13s %14s %4s %4s | %14s %4s %4s\n", + "extents", "calls", "%", "cum%", + "calls", "%", "cum%"); + spin_lock(&sbi->ll_pp_extent_lock); + for (k = 0; k < LL_PROCESS_HIST_MAX; k++) { + if (io_extents->pp_extents[k].pid != 0) { + seq_printf(seq, "\nPID: %d\n", + io_extents->pp_extents[k].pid); + ll_display_extents_info(io_extents, seq, k); + } + } + spin_unlock(&sbi->ll_pp_extent_lock); + return 0; +} + +static ssize_t ll_rw_extents_stats_pp_seq_write(struct file *file, + const char __user *buf, + size_t len, + loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct ll_sb_info *sbi = seq->private; + struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info; + int i; + int value = 1, rc = 0; + + if (len == 0) + return -EINVAL; + + rc = lprocfs_write_helper(buf, len, &value); + if (rc < 0 && len < 16) { + char kernbuf[16]; + + if (copy_from_user(kernbuf, buf, len)) + return -EFAULT; + kernbuf[len] = 0; + + if (kernbuf[len - 1] == '\n') + kernbuf[len - 1] = 0; + + if (strcmp(kernbuf, "disabled") == 0 || + strcmp(kernbuf, "Disabled") == 0) + value = 0; + } + + if (value == 0) + sbi->ll_rw_stats_on = 0; + else + sbi->ll_rw_stats_on = 1; + + spin_lock(&sbi->ll_pp_extent_lock); + for (i = 0; i < LL_PROCESS_HIST_MAX; i++) { + io_extents->pp_extents[i].pid = 0; + lprocfs_oh_clear(&io_extents->pp_extents[i].pp_r_hist); + lprocfs_oh_clear(&io_extents->pp_extents[i].pp_w_hist); + } + spin_unlock(&sbi->ll_pp_extent_lock); + return len; +} + +LPROC_SEQ_FOPS(ll_rw_extents_stats_pp); + +static int ll_rw_extents_stats_seq_show(struct seq_file *seq, void *v) +{ + struct timeval now; + struct ll_sb_info *sbi = seq->private; + struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info; + + do_gettimeofday(&now); + + if (!sbi->ll_rw_stats_on) { + seq_printf(seq, "disabled\n" + "write anything in this file to activate, then 0 or \"[D/d]isabled\" to deactivate\n"); + return 0; + } + seq_printf(seq, "snapshot_time: %lu.%lu (secs.usecs)\n", + now.tv_sec, (unsigned long)now.tv_usec); + + seq_printf(seq, "%15s %19s | %20s\n", " ", "read", "write"); + seq_printf(seq, "%13s %14s %4s %4s | %14s %4s %4s\n", + "extents", "calls", "%", "cum%", + "calls", "%", "cum%"); + spin_lock(&sbi->ll_lock); + ll_display_extents_info(io_extents, seq, LL_PROCESS_HIST_MAX); + spin_unlock(&sbi->ll_lock); + + return 0; +} + +static ssize_t ll_rw_extents_stats_seq_write(struct file *file, + const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct ll_sb_info *sbi = seq->private; + struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info; + int i; + int value = 1, rc = 0; + + if (len == 0) + return -EINVAL; + + rc 
= lprocfs_write_helper(buf, len, &value); + if (rc < 0 && len < 16) { + char kernbuf[16]; + + if (copy_from_user(kernbuf, buf, len)) + return -EFAULT; + kernbuf[len] = 0; + + if (kernbuf[len - 1] == '\n') + kernbuf[len - 1] = 0; + + if (strcmp(kernbuf, "disabled") == 0 || + strcmp(kernbuf, "Disabled") == 0) + value = 0; + } + + if (value == 0) + sbi->ll_rw_stats_on = 0; + else + sbi->ll_rw_stats_on = 1; + + spin_lock(&sbi->ll_pp_extent_lock); + for (i = 0; i <= LL_PROCESS_HIST_MAX; i++) { + io_extents->pp_extents[i].pid = 0; + lprocfs_oh_clear(&io_extents->pp_extents[i].pp_r_hist); + lprocfs_oh_clear(&io_extents->pp_extents[i].pp_w_hist); + } + spin_unlock(&sbi->ll_pp_extent_lock); + + return len; +} +LPROC_SEQ_FOPS(ll_rw_extents_stats); + +void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid, + struct ll_file_data *file, loff_t pos, + size_t count, int rw) +{ + int i, cur = -1; + struct ll_rw_process_info *process; + struct ll_rw_process_info *offset; + int *off_count = &sbi->ll_rw_offset_entry_count; + int *process_count = &sbi->ll_offset_process_count; + struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info; + + if (!sbi->ll_rw_stats_on) + return; + process = sbi->ll_rw_process_info; + offset = sbi->ll_rw_offset_info; + + spin_lock(&sbi->ll_pp_extent_lock); + /* Extent statistics */ + for (i = 0; i < LL_PROCESS_HIST_MAX; i++) { + if (io_extents->pp_extents[i].pid == pid) { + cur = i; + break; + } + } + + if (cur == -1) { + /* new process */ + sbi->ll_extent_process_count = + (sbi->ll_extent_process_count + 1) % LL_PROCESS_HIST_MAX; + cur = sbi->ll_extent_process_count; + io_extents->pp_extents[cur].pid = pid; + lprocfs_oh_clear(&io_extents->pp_extents[cur].pp_r_hist); + lprocfs_oh_clear(&io_extents->pp_extents[cur].pp_w_hist); + } + + for(i = 0; (count >= (1 << LL_HIST_START << i)) && + (i < (LL_HIST_MAX - 1)); i++); + if (rw == 0) { + io_extents->pp_extents[cur].pp_r_hist.oh_buckets[i]++; + io_extents->pp_extents[LL_PROCESS_HIST_MAX].pp_r_hist.oh_buckets[i]++; + } else { + io_extents->pp_extents[cur].pp_w_hist.oh_buckets[i]++; + io_extents->pp_extents[LL_PROCESS_HIST_MAX].pp_w_hist.oh_buckets[i]++; + } + spin_unlock(&sbi->ll_pp_extent_lock); + + spin_lock(&sbi->ll_process_lock); + /* Offset statistics */ + for (i = 0; i < LL_PROCESS_HIST_MAX; i++) { + if (process[i].rw_pid == pid) { + if (process[i].rw_last_file != file) { + process[i].rw_range_start = pos; + process[i].rw_last_file_pos = pos + count; + process[i].rw_smallest_extent = count; + process[i].rw_largest_extent = count; + process[i].rw_offset = 0; + process[i].rw_last_file = file; + spin_unlock(&sbi->ll_process_lock); + return; + } + if (process[i].rw_last_file_pos != pos) { + *off_count = + (*off_count + 1) % LL_OFFSET_HIST_MAX; + offset[*off_count].rw_op = process[i].rw_op; + offset[*off_count].rw_pid = pid; + offset[*off_count].rw_range_start = + process[i].rw_range_start; + offset[*off_count].rw_range_end = + process[i].rw_last_file_pos; + offset[*off_count].rw_smallest_extent = + process[i].rw_smallest_extent; + offset[*off_count].rw_largest_extent = + process[i].rw_largest_extent; + offset[*off_count].rw_offset = + process[i].rw_offset; + process[i].rw_op = rw; + process[i].rw_range_start = pos; + process[i].rw_smallest_extent = count; + process[i].rw_largest_extent = count; + process[i].rw_offset = pos - + process[i].rw_last_file_pos; + } + if (process[i].rw_smallest_extent > count) + process[i].rw_smallest_extent = count; + if (process[i].rw_largest_extent < count) + process[i].rw_largest_extent = 
count; + process[i].rw_last_file_pos = pos + count; + spin_unlock(&sbi->ll_process_lock); + return; + } + } + *process_count = (*process_count + 1) % LL_PROCESS_HIST_MAX; + process[*process_count].rw_pid = pid; + process[*process_count].rw_op = rw; + process[*process_count].rw_range_start = pos; + process[*process_count].rw_last_file_pos = pos + count; + process[*process_count].rw_smallest_extent = count; + process[*process_count].rw_largest_extent = count; + process[*process_count].rw_offset = 0; + process[*process_count].rw_last_file = file; + spin_unlock(&sbi->ll_process_lock); +} + +static int ll_rw_offset_stats_seq_show(struct seq_file *seq, void *v) +{ + struct timeval now; + struct ll_sb_info *sbi = seq->private; + struct ll_rw_process_info *offset = sbi->ll_rw_offset_info; + struct ll_rw_process_info *process = sbi->ll_rw_process_info; + int i; + + do_gettimeofday(&now); + + if (!sbi->ll_rw_stats_on) { + seq_printf(seq, "disabled\n" + "write anything in this file to activate, then 0 or \"[D/d]isabled\" to deactivate\n"); + return 0; + } + spin_lock(&sbi->ll_process_lock); + + seq_printf(seq, "snapshot_time: %lu.%lu (secs.usecs)\n", + now.tv_sec, (unsigned long)now.tv_usec); + seq_printf(seq, "%3s %10s %14s %14s %17s %17s %14s\n", + "R/W", "PID", "RANGE START", "RANGE END", + "SMALLEST EXTENT", "LARGEST EXTENT", "OFFSET"); + /* We stored the discontiguous offsets here; print them first */ + for (i = 0; i < LL_OFFSET_HIST_MAX; i++) { + if (offset[i].rw_pid != 0) + seq_printf(seq, + "%3c %10d %14Lu %14Lu %17lu %17lu %14Lu", + offset[i].rw_op == READ ? 'R' : 'W', + offset[i].rw_pid, + offset[i].rw_range_start, + offset[i].rw_range_end, + (unsigned long)offset[i].rw_smallest_extent, + (unsigned long)offset[i].rw_largest_extent, + offset[i].rw_offset); + } + /* Then print the current offsets for each process */ + for (i = 0; i < LL_PROCESS_HIST_MAX; i++) { + if (process[i].rw_pid != 0) + seq_printf(seq, + "%3c %10d %14Lu %14Lu %17lu %17lu %14Lu", + process[i].rw_op == READ ? 
'R' : 'W', + process[i].rw_pid, + process[i].rw_range_start, + process[i].rw_last_file_pos, + (unsigned long)process[i].rw_smallest_extent, + (unsigned long)process[i].rw_largest_extent, + process[i].rw_offset); + } + spin_unlock(&sbi->ll_process_lock); + + return 0; +} + +static ssize_t ll_rw_offset_stats_seq_write(struct file *file, + const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct ll_sb_info *sbi = seq->private; + struct ll_rw_process_info *process_info = sbi->ll_rw_process_info; + struct ll_rw_process_info *offset_info = sbi->ll_rw_offset_info; + int value = 1, rc = 0; + + if (len == 0) + return -EINVAL; + + rc = lprocfs_write_helper(buf, len, &value); + + if (rc < 0 && len < 16) { + char kernbuf[16]; + + if (copy_from_user(kernbuf, buf, len)) + return -EFAULT; + kernbuf[len] = 0; + + if (kernbuf[len - 1] == '\n') + kernbuf[len - 1] = 0; + + if (strcmp(kernbuf, "disabled") == 0 || + strcmp(kernbuf, "Disabled") == 0) + value = 0; + } + + if (value == 0) + sbi->ll_rw_stats_on = 0; + else + sbi->ll_rw_stats_on = 1; + + spin_lock(&sbi->ll_process_lock); + sbi->ll_offset_process_count = 0; + sbi->ll_rw_offset_entry_count = 0; + memset(process_info, 0, sizeof(struct ll_rw_process_info) * + LL_PROCESS_HIST_MAX); + memset(offset_info, 0, sizeof(struct ll_rw_process_info) * + LL_OFFSET_HIST_MAX); + spin_unlock(&sbi->ll_process_lock); + + return len; +} + +LPROC_SEQ_FOPS(ll_rw_offset_stats); + +void lprocfs_llite_init_vars(struct lprocfs_static_vars *lvars) +{ + lvars->module_vars = NULL; + lvars->obd_vars = lprocfs_llite_obd_vars; +} diff --git a/kernel/drivers/staging/lustre/lustre/llite/namei.c b/kernel/drivers/staging/lustre/lustre/llite/namei.c new file mode 100644 index 000000000..5a25dcd10 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/namei.c @@ -0,0 +1,1178 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_LLITE + +#include "../include/obd_support.h" +#include "../include/lustre_fid.h" +#include "../include/lustre_lite.h" +#include "../include/lustre_dlm.h" +#include "../include/lustre_ver.h" +#include "llite_internal.h" + +static int ll_create_it(struct inode *, struct dentry *, + int, struct lookup_intent *); + +/* called from iget5_locked->find_inode() under inode_hash_lock spinlock */ +static int ll_test_inode(struct inode *inode, void *opaque) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct lustre_md *md = opaque; + + if (unlikely(!(md->body->valid & OBD_MD_FLID))) { + CERROR("MDS body missing FID\n"); + return 0; + } + + if (!lu_fid_eq(&lli->lli_fid, &md->body->fid1)) + return 0; + + return 1; +} + +static int ll_set_inode(struct inode *inode, void *opaque) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct mdt_body *body = ((struct lustre_md *)opaque)->body; + + if (unlikely(!(body->valid & OBD_MD_FLID))) { + CERROR("MDS body missing FID\n"); + return -EINVAL; + } + + lli->lli_fid = body->fid1; + if (unlikely(!(body->valid & OBD_MD_FLTYPE))) { + CERROR("Can not initialize inode " DFID + " without object type: valid = %#llx\n", + PFID(&lli->lli_fid), body->valid); + return -EINVAL; + } + + inode->i_mode = (inode->i_mode & ~S_IFMT) | (body->mode & S_IFMT); + if (unlikely(inode->i_mode == 0)) { + CERROR("Invalid inode "DFID" type\n", PFID(&lli->lli_fid)); + return -EINVAL; + } + + ll_lli_init(lli); + + return 0; +} + + +/* + * Get an inode by inode number (already instantiated by the intent lookup). + * Returns inode or NULL + */ +struct inode *ll_iget(struct super_block *sb, ino_t hash, + struct lustre_md *md) +{ + struct inode *inode; + + LASSERT(hash != 0); + inode = iget5_locked(sb, hash, ll_test_inode, ll_set_inode, md); + + if (inode) { + if (inode->i_state & I_NEW) { + int rc = 0; + + ll_read_inode2(inode, md); + if (S_ISREG(inode->i_mode) && + ll_i2info(inode)->lli_clob == NULL) { + CDEBUG(D_INODE, + "%s: apply lsm %p to inode "DFID".\n", + ll_get_fsname(sb, NULL, 0), md->lsm, + PFID(ll_inode2fid(inode))); + rc = cl_file_inode_init(inode, md); + } + if (rc != 0) { + make_bad_inode(inode); + unlock_new_inode(inode); + iput(inode); + inode = ERR_PTR(rc); + } else + unlock_new_inode(inode); + } else if (!(inode->i_state & (I_FREEING | I_CLEAR))) + ll_update_inode(inode, md); + CDEBUG(D_VFSTRACE, "got inode: %p for "DFID"\n", + inode, PFID(&md->body->fid1)); + } + return inode; +} + +static void ll_invalidate_negative_children(struct inode *dir) +{ + struct dentry *dentry, *tmp_subdir; + struct ll_d_hlist_node *p; + + ll_lock_dcache(dir); + ll_d_hlist_for_each_entry(dentry, p, &dir->i_dentry, d_u.d_alias) { + spin_lock(&dentry->d_lock); + if (!list_empty(&dentry->d_subdirs)) { + struct dentry *child; + + list_for_each_entry_safe(child, tmp_subdir, + &dentry->d_subdirs, + d_child) { + if (d_really_is_negative(child)) + d_lustre_invalidate(child, 1); + } + } + spin_unlock(&dentry->d_lock); + } + ll_unlock_dcache(dir); +} + +int ll_md_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, + void *data, int flag) +{ + struct lustre_handle lockh; + int rc; + + switch (flag) { + case LDLM_CB_BLOCKING: + ldlm_lock2handle(lock, &lockh); + rc = ldlm_cli_cancel(&lockh, LCF_ASYNC); + if (rc < 0) { + CDEBUG(D_INODE, "ldlm_cli_cancel: rc = %d\n", rc); + return rc; + } + break; + case LDLM_CB_CANCELING: { + struct inode *inode = 
ll_inode_from_resource_lock(lock); + __u64 bits = lock->l_policy_data.l_inodebits.bits; + + /* Inode is set to lock->l_resource->lr_lvb_inode + * for mdc - bug 24555 */ + LASSERT(lock->l_ast_data == NULL); + + if (inode == NULL) + break; + + /* Invalidate all dentries associated with this inode */ + LASSERT(lock->l_flags & LDLM_FL_CANCELING); + + if (!fid_res_name_eq(ll_inode2fid(inode), + &lock->l_resource->lr_name)) { + LDLM_ERROR(lock, "data mismatch with object "DFID"(%p)", + PFID(ll_inode2fid(inode)), inode); + LBUG(); + } + + if (bits & MDS_INODELOCK_XATTR) { + ll_xattr_cache_destroy(inode); + bits &= ~MDS_INODELOCK_XATTR; + } + + /* For OPEN locks we differentiate between lock modes + * LCK_CR, LCK_CW, LCK_PR - bug 22891 */ + if (bits & MDS_INODELOCK_OPEN) + ll_have_md_lock(inode, &bits, lock->l_req_mode); + + if (bits & MDS_INODELOCK_OPEN) { + fmode_t fmode; + + switch (lock->l_req_mode) { + case LCK_CW: + fmode = FMODE_WRITE; + break; + case LCK_PR: + fmode = FMODE_EXEC; + break; + case LCK_CR: + fmode = FMODE_READ; + break; + default: + LDLM_ERROR(lock, "bad lock mode for OPEN lock"); + LBUG(); + } + + ll_md_real_close(inode, fmode); + } + + if (bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_UPDATE | + MDS_INODELOCK_LAYOUT | MDS_INODELOCK_PERM)) + ll_have_md_lock(inode, &bits, LCK_MINMODE); + + if (bits & MDS_INODELOCK_LAYOUT) { + struct cl_object_conf conf = { + .coc_opc = OBJECT_CONF_INVALIDATE, + .coc_inode = inode, + }; + + rc = ll_layout_conf(inode, &conf); + if (rc < 0) + CDEBUG(D_INODE, "cannot invalidate layout of " + DFID": rc = %d\n", + PFID(ll_inode2fid(inode)), rc); + } + + if (bits & MDS_INODELOCK_UPDATE) { + struct ll_inode_info *lli = ll_i2info(inode); + + spin_lock(&lli->lli_lock); + lli->lli_flags &= ~LLIF_MDS_SIZE_LOCK; + spin_unlock(&lli->lli_lock); + } + + if ((bits & MDS_INODELOCK_UPDATE) && S_ISDIR(inode->i_mode)) { + CDEBUG(D_INODE, "invalidating inode %lu\n", + inode->i_ino); + truncate_inode_pages(inode->i_mapping, 0); + ll_invalidate_negative_children(inode); + } + + if ((bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_PERM)) && + inode->i_sb->s_root != NULL && + !is_root_inode(inode)) + ll_invalidate_aliases(inode); + + iput(inode); + break; + } + default: + LBUG(); + } + + return 0; +} + +__u32 ll_i2suppgid(struct inode *i) +{ + if (in_group_p(i->i_gid)) + return (__u32)from_kgid(&init_user_ns, i->i_gid); + else + return (__u32)(-1); +} + +/* Pack the required supplementary groups into the supplied groups array. + * If we don't need to use the groups from the target inode(s) then we + * instead pack one or more groups from the user's supplementary group + * array in case it might be useful. Not needed if doing an MDS-side upcall. */ +void ll_i2gids(__u32 *suppgids, struct inode *i1, struct inode *i2) +{ +#if 0 + int i; +#endif + + LASSERT(i1 != NULL); + LASSERT(suppgids != NULL); + + suppgids[0] = ll_i2suppgid(i1); + + if (i2) + suppgids[1] = ll_i2suppgid(i2); + else + suppgids[1] = -1; + +#if 0 + for (i = 0; i < current_ngroups; i++) { + if (suppgids[0] == -1) { + if (current_groups[i] != suppgids[1]) + suppgids[0] = current_groups[i]; + continue; + } + if (suppgids[1] == -1) { + if (current_groups[i] != suppgids[0]) + suppgids[1] = current_groups[i]; + continue; + } + break; + } +#endif +} + +/* + * try to reuse three types of dentry: + * 1. unhashed alias, this one is unhashed by d_invalidate (but it may be valid + * by concurrent .revalidate). + * 2. 
INVALID alias (common case for no valid ldlm lock held, but this flag may + * be cleared by others calling d_lustre_revalidate). + * 3. DISCONNECTED alias. + */ +static struct dentry *ll_find_alias(struct inode *inode, struct dentry *dentry) +{ + struct dentry *alias, *discon_alias, *invalid_alias; + struct ll_d_hlist_node *p; + + if (ll_d_hlist_empty(&inode->i_dentry)) + return NULL; + + discon_alias = invalid_alias = NULL; + + ll_lock_dcache(inode); + ll_d_hlist_for_each_entry(alias, p, &inode->i_dentry, d_u.d_alias) { + LASSERT(alias != dentry); + + spin_lock(&alias->d_lock); + if (alias->d_flags & DCACHE_DISCONNECTED) + /* LASSERT(last_discon == NULL); LU-405, bz 20055 */ + discon_alias = alias; + else if (alias->d_parent == dentry->d_parent && + alias->d_name.hash == dentry->d_name.hash && + alias->d_name.len == dentry->d_name.len && + memcmp(alias->d_name.name, dentry->d_name.name, + dentry->d_name.len) == 0) + invalid_alias = alias; + spin_unlock(&alias->d_lock); + + if (invalid_alias) + break; + } + alias = invalid_alias ?: discon_alias ?: NULL; + if (alias) { + spin_lock(&alias->d_lock); + dget_dlock(alias); + spin_unlock(&alias->d_lock); + } + ll_unlock_dcache(inode); + + return alias; +} + +/* + * Similar to d_splice_alias(), but lustre treats invalid alias + * similar to DCACHE_DISCONNECTED, and tries to use it anyway. + */ +struct dentry *ll_splice_alias(struct inode *inode, struct dentry *de) +{ + struct dentry *new; + int rc; + + if (inode) { + new = ll_find_alias(inode, de); + if (new) { + rc = ll_d_init(new); + if (rc < 0) { + dput(new); + return ERR_PTR(rc); + } + d_move(new, de); + iput(inode); + CDEBUG(D_DENTRY, + "Reuse dentry %p inode %p refc %d flags %#x\n", + new, d_inode(new), d_count(new), new->d_flags); + return new; + } + } + rc = ll_d_init(de); + if (rc < 0) + return ERR_PTR(rc); + d_add(de, inode); + CDEBUG(D_DENTRY, "Add dentry %p inode %p refc %d flags %#x\n", + de, d_inode(de), d_count(de), de->d_flags); + return de; +} + +static int ll_lookup_it_finish(struct ptlrpc_request *request, + struct lookup_intent *it, + struct inode *parent, struct dentry **de) +{ + struct inode *inode = NULL; + __u64 bits = 0; + int rc; + + /* NB 1 request reference will be taken away by ll_intent_lock() + * when I return */ + CDEBUG(D_DENTRY, "it %p it_disposition %x\n", it, + it->d.lustre.it_disposition); + if (!it_disposition(it, DISP_LOOKUP_NEG)) { + rc = ll_prep_inode(&inode, request, (*de)->d_sb, it); + if (rc) + return rc; + + ll_set_lock_data(ll_i2sbi(parent)->ll_md_exp, inode, it, &bits); + + /* We used to query real size from OSTs here, but actually + this is not needed. For stat() calls size would be updated + from subsequent do_revalidate()->ll_inode_revalidate_it() in + 2.4 and + vfs_getattr_it->ll_getattr()->ll_inode_revalidate_it() in 2.6 + Everybody else who needs correct file size would call + ll_glimpse_size or some equivalent themselves anyway. + Also see bug 7198. */ + } + + /* Only hash *de if it is unhashed (new dentry). + * Atoimc_open may passing hashed dentries for open. + */ + if (d_unhashed(*de)) { + struct dentry *alias; + + alias = ll_splice_alias(inode, *de); + if (IS_ERR(alias)) + return PTR_ERR(alias); + *de = alias; + } else if (!it_disposition(it, DISP_LOOKUP_NEG) && + !it_disposition(it, DISP_OPEN_CREATE)) { + /* With DISP_OPEN_CREATE dentry will + instantiated in ll_create_it. 
*/ + LASSERT(d_inode(*de) == NULL); + d_instantiate(*de, inode); + } + + if (!it_disposition(it, DISP_LOOKUP_NEG)) { + /* we have lookup look - unhide dentry */ + if (bits & MDS_INODELOCK_LOOKUP) + d_lustre_revalidate(*de); + } else if (!it_disposition(it, DISP_OPEN_CREATE)) { + /* If file created on server, don't depend on parent UPDATE + * lock to unhide it. It is left hidden and next lookup can + * find it in ll_splice_alias. + */ + /* Check that parent has UPDATE lock. */ + struct lookup_intent parent_it = { + .it_op = IT_GETATTR, + .d.lustre.it_lock_handle = 0 }; + + if (md_revalidate_lock(ll_i2mdexp(parent), &parent_it, + &ll_i2info(parent)->lli_fid, NULL)) { + d_lustre_revalidate(*de); + ll_intent_release(&parent_it); + } + } + + return 0; +} + +static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry, + struct lookup_intent *it, int lookup_flags) +{ + struct lookup_intent lookup_it = { .it_op = IT_LOOKUP }; + struct dentry *save = dentry, *retval; + struct ptlrpc_request *req = NULL; + struct inode *inode; + struct md_op_data *op_data; + __u32 opc; + int rc; + + if (dentry->d_name.len > ll_i2sbi(parent)->ll_namelen) + return ERR_PTR(-ENAMETOOLONG); + + CDEBUG(D_VFSTRACE, "VFS Op:name=%pd,dir=%lu/%u(%p),intent=%s\n", + dentry, parent->i_ino, + parent->i_generation, parent, LL_IT2STR(it)); + + if (d_mountpoint(dentry)) + CERROR("Tell Peter, lookup on mtpt, it %s\n", LL_IT2STR(it)); + + if (it == NULL || it->it_op == IT_GETXATTR) + it = &lookup_it; + + if (it->it_op == IT_GETATTR) { + rc = ll_statahead_enter(parent, &dentry, 0); + if (rc == 1) { + if (dentry == save) + retval = NULL; + else + retval = dentry; + goto out; + } + } + + if (it->it_op & IT_CREAT) + opc = LUSTRE_OPC_CREATE; + else + opc = LUSTRE_OPC_ANY; + + op_data = ll_prep_md_op_data(NULL, parent, NULL, dentry->d_name.name, + dentry->d_name.len, lookup_flags, opc, + NULL); + if (IS_ERR(op_data)) + return (void *)op_data; + + /* enforce umask if acl disabled or MDS doesn't support umask */ + if (!IS_POSIXACL(parent) || !exp_connect_umask(ll_i2mdexp(parent))) + it->it_create_mode &= ~current_umask(); + + rc = md_intent_lock(ll_i2mdexp(parent), op_data, NULL, 0, it, + lookup_flags, &req, ll_md_blocking_ast, 0); + ll_finish_md_op_data(op_data); + if (rc < 0) { + retval = ERR_PTR(rc); + goto out; + } + + rc = ll_lookup_it_finish(req, it, parent, &dentry); + if (rc != 0) { + ll_intent_release(it); + retval = ERR_PTR(rc); + goto out; + } + + inode = d_inode(dentry); + if ((it->it_op & IT_OPEN) && inode && + !S_ISREG(inode->i_mode) && + !S_ISDIR(inode->i_mode)) { + ll_release_openhandle(inode, it); + } + ll_lookup_finish_locks(it, inode); + + if (dentry == save) + retval = NULL; + else + retval = dentry; + goto out; + out: + if (req) + ptlrpc_req_finished(req); + if (it->it_op == IT_GETATTR && (retval == NULL || retval == dentry)) + ll_statahead_mark(parent, dentry); + return retval; +} + +static struct dentry *ll_lookup_nd(struct inode *parent, struct dentry *dentry, + unsigned int flags) +{ + struct lookup_intent *itp, it = { .it_op = IT_GETATTR }; + struct dentry *de; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%pd,dir=%lu/%u(%p),flags=%u\n", + dentry, parent->i_ino, + parent->i_generation, parent, flags); + + /* Optimize away (CREATE && !OPEN). Let .create handle the race. 
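+ * Returning NULL keeps the dentry negative, so the VFS falls back to
+ * ->create (ll_create_nd) or ->mknod on it and no lookup RPC is sent
+ * for a name that is about to be created anyway.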
*/ + if ((flags & LOOKUP_CREATE) && !(flags & LOOKUP_OPEN)) + return NULL; + + if (flags & (LOOKUP_PARENT|LOOKUP_OPEN|LOOKUP_CREATE)) + itp = NULL; + else + itp = ⁢ + de = ll_lookup_it(parent, dentry, itp, 0); + + if (itp != NULL) + ll_intent_release(itp); + + return de; +} + +/* + * For cached negative dentry and new dentry, handle lookup/create/open + * together. + */ +static int ll_atomic_open(struct inode *dir, struct dentry *dentry, + struct file *file, unsigned open_flags, + umode_t mode, int *opened) +{ + struct lookup_intent *it; + struct dentry *de; + long long lookup_flags = LOOKUP_OPEN; + int rc = 0; + + CDEBUG(D_VFSTRACE, + "VFS Op:name=%pd,dir=%lu/%u(%p),file %p,open_flags %x,mode %x opened %d\n", + dentry, dir->i_ino, + dir->i_generation, dir, file, open_flags, mode, *opened); + + it = kzalloc(sizeof(*it), GFP_NOFS); + if (!it) + return -ENOMEM; + + it->it_op = IT_OPEN; + if (open_flags & O_CREAT) { + it->it_op |= IT_CREAT; + lookup_flags |= LOOKUP_CREATE; + } + it->it_create_mode = (mode & S_IALLUGO) | S_IFREG; + it->it_flags = (open_flags & ~O_ACCMODE) | OPEN_FMODE(open_flags); + + /* Dentry added to dcache tree in ll_lookup_it */ + de = ll_lookup_it(dir, dentry, it, lookup_flags); + if (IS_ERR(de)) + rc = PTR_ERR(de); + else if (de != NULL) + dentry = de; + + if (!rc) { + if (it_disposition(it, DISP_OPEN_CREATE)) { + /* Dentry instantiated in ll_create_it. */ + rc = ll_create_it(dir, dentry, mode, it); + if (rc) { + /* We dget in ll_splice_alias. */ + if (de != NULL) + dput(de); + goto out_release; + } + + *opened |= FILE_CREATED; + } + if (d_really_is_positive(dentry) && it_disposition(it, DISP_OPEN_OPEN)) { + /* Open dentry. */ + if (S_ISFIFO(d_inode(dentry)->i_mode)) { + /* We cannot call open here as it would + * deadlock. + */ + if (it_disposition(it, DISP_ENQ_OPEN_REF)) + ptlrpc_req_finished( + (struct ptlrpc_request *) + it->d.lustre.it_data); + rc = finish_no_open(file, de); + } else { + file->private_data = it; + rc = finish_open(file, dentry, NULL, opened); + /* We dget in ll_splice_alias. finish_open takes + * care of dget for fd open. + */ + if (de != NULL) + dput(de); + } + } else { + rc = finish_no_open(file, de); + } + } + +out_release: + ll_intent_release(it); + OBD_FREE(it, sizeof(*it)); + + return rc; +} + + +/* We depend on "mode" being set with the proper file type/umask by now */ +static struct inode *ll_create_node(struct inode *dir, struct lookup_intent *it) +{ + struct inode *inode = NULL; + struct ptlrpc_request *request = NULL; + struct ll_sb_info *sbi = ll_i2sbi(dir); + int rc; + + LASSERT(it && it->d.lustre.it_disposition); + + LASSERT(it_disposition(it, DISP_ENQ_CREATE_REF)); + request = it->d.lustre.it_data; + it_clear_disposition(it, DISP_ENQ_CREATE_REF); + rc = ll_prep_inode(&inode, request, dir->i_sb, it); + if (rc) { + inode = ERR_PTR(rc); + goto out; + } + + LASSERT(ll_d_hlist_empty(&inode->i_dentry)); + + /* We asked for a lock on the directory, but were granted a + * lock on the inode. Since we finally have an inode pointer, + * stuff it in the lock. */ + CDEBUG(D_DLMTRACE, "setting l_ast_data to inode %p (%lu/%u)\n", + inode, inode->i_ino, inode->i_generation); + ll_set_lock_data(sbi->ll_md_exp, inode, it, NULL); + out: + ptlrpc_req_finished(request); + return inode; +} + +/* + * By the time this is called, we already have created the directory cache + * entry for the new file, but it is so far negative - it has no inode. 
+ * + * We defer creating the OBD object(s) until open, to keep the intent and + * non-intent code paths similar, and also because we do not have the MDS + * inode number before calling ll_create_node() (which is needed for LOV), + * so we would need to do yet another RPC to the MDS to store the LOV EA + * data on the MDS. If needed, we would pass the PACKED lmm as data and + * lmm_size in datalen (the MDS still has code which will handle that). + * + * If the create succeeds, we fill in the inode information + * with d_instantiate(). + */ +static int ll_create_it(struct inode *dir, struct dentry *dentry, int mode, + struct lookup_intent *it) +{ + struct inode *inode; + int rc = 0; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%pd,dir=%lu/%u(%p),intent=%s\n", + dentry, dir->i_ino, + dir->i_generation, dir, LL_IT2STR(it)); + + rc = it_open_error(DISP_OPEN_CREATE, it); + if (rc) + return rc; + + inode = ll_create_node(dir, it); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + d_instantiate(dentry, inode); + return 0; +} + +static void ll_update_times(struct ptlrpc_request *request, + struct inode *inode) +{ + struct mdt_body *body = req_capsule_server_get(&request->rq_pill, + &RMF_MDT_BODY); + + LASSERT(body); + if (body->valid & OBD_MD_FLMTIME && + body->mtime > LTIME_S(inode->i_mtime)) { + CDEBUG(D_INODE, "setting ino %lu mtime from %lu to %llu\n", + inode->i_ino, LTIME_S(inode->i_mtime), body->mtime); + LTIME_S(inode->i_mtime) = body->mtime; + } + if (body->valid & OBD_MD_FLCTIME && + body->ctime > LTIME_S(inode->i_ctime)) + LTIME_S(inode->i_ctime) = body->ctime; +} + +static int ll_new_node(struct inode *dir, struct dentry *dentry, + const char *tgt, int mode, int rdev, + __u32 opc) +{ + struct ptlrpc_request *request = NULL; + struct md_op_data *op_data; + struct inode *inode = NULL; + struct ll_sb_info *sbi = ll_i2sbi(dir); + int tgt_len = 0; + int err; + + if (unlikely(tgt != NULL)) + tgt_len = strlen(tgt) + 1; + + op_data = ll_prep_md_op_data(NULL, dir, NULL, + dentry->d_name.name, + dentry->d_name.len, + 0, opc, NULL); + if (IS_ERR(op_data)) { + err = PTR_ERR(op_data); + goto err_exit; + } + + err = md_create(sbi->ll_md_exp, op_data, tgt, tgt_len, mode, + from_kuid(&init_user_ns, current_fsuid()), + from_kgid(&init_user_ns, current_fsgid()), + cfs_curproc_cap_pack(), rdev, &request); + ll_finish_md_op_data(op_data); + if (err) + goto err_exit; + + ll_update_times(request, dir); + + err = ll_prep_inode(&inode, request, dir->i_sb, NULL); + if (err) + goto err_exit; + + d_instantiate(dentry, inode); +err_exit: + ptlrpc_req_finished(request); + + return err; +} + +static int ll_mknod(struct inode *dir, struct dentry *dchild, + umode_t mode, dev_t rdev) +{ + int err; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%pd,dir=%lu/%u(%p) mode %o dev %x\n", + dchild, dir->i_ino, dir->i_generation, dir, + mode, old_encode_dev(rdev)); + + if (!IS_POSIXACL(dir) || !exp_connect_umask(ll_i2mdexp(dir))) + mode &= ~current_umask(); + + switch (mode & S_IFMT) { + case 0: + mode |= S_IFREG; /* for mode = 0 case, fallthrough */ + case S_IFREG: + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + err = ll_new_node(dir, dchild, NULL, mode, + old_encode_dev(rdev), + LUSTRE_OPC_MKNOD); + break; + case S_IFDIR: + err = -EPERM; + break; + default: + err = -EINVAL; + } + + if (!err) + ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_MKNOD, 1); + + return err; +} + +/* + * Plain create. Intent create is handled in atomic_open. 
+ */ +static int ll_create_nd(struct inode *dir, struct dentry *dentry, + umode_t mode, bool want_excl) +{ + int rc; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%pd,dir=%lu/%u(%p),flags=%u, excl=%d\n", + dentry, dir->i_ino, + dir->i_generation, dir, mode, want_excl); + + rc = ll_mknod(dir, dentry, mode, 0); + + ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_CREATE, 1); + + CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, unhashed %d\n", + dentry, d_unhashed(dentry)); + + return rc; +} + +static inline void ll_get_child_fid(struct dentry *child, struct lu_fid *fid) +{ + if (d_really_is_positive(child)) + *fid = *ll_inode2fid(d_inode(child)); +} + +/** + * Remove dir entry + **/ +int ll_rmdir_entry(struct inode *dir, char *name, int namelen) +{ + struct ptlrpc_request *request = NULL; + struct md_op_data *op_data; + int rc; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p)\n", + namelen, name, dir->i_ino, dir->i_generation, dir); + + op_data = ll_prep_md_op_data(NULL, dir, NULL, name, strlen(name), + S_IFDIR, LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + return PTR_ERR(op_data); + op_data->op_cli_flags |= CLI_RM_ENTRY; + rc = md_unlink(ll_i2sbi(dir)->ll_md_exp, op_data, &request); + ll_finish_md_op_data(op_data); + if (rc == 0) { + ll_update_times(request, dir); + ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_RMDIR, 1); + } + + ptlrpc_req_finished(request); + return rc; +} + +int ll_objects_destroy(struct ptlrpc_request *request, struct inode *dir) +{ + struct mdt_body *body; + struct lov_mds_md *eadata; + struct lov_stripe_md *lsm = NULL; + struct obd_trans_info oti = { 0 }; + struct obdo *oa; + struct obd_capa *oc = NULL; + int rc; + + /* req is swabbed so this is safe */ + body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY); + if (!(body->valid & OBD_MD_FLEASIZE)) + return 0; + + if (body->eadatasize == 0) { + CERROR("OBD_MD_FLEASIZE set but eadatasize zero\n"); + rc = -EPROTO; + goto out; + } + + /* The MDS sent back the EA because we unlinked the last reference + * to this file. Use this EA to unlink the objects on the OST. + * It's opaque so we don't swab here; we leave it to obd_unpackmd() to + * check it is complete and sensible. */ + eadata = req_capsule_server_sized_get(&request->rq_pill, &RMF_MDT_MD, + body->eadatasize); + LASSERT(eadata != NULL); + + rc = obd_unpackmd(ll_i2dtexp(dir), &lsm, eadata, body->eadatasize); + if (rc < 0) { + CERROR("obd_unpackmd: %d\n", rc); + goto out; + } + LASSERT(rc >= sizeof(*lsm)); + + OBDO_ALLOC(oa); + if (oa == NULL) { + rc = -ENOMEM; + goto out_free_memmd; + } + + oa->o_oi = lsm->lsm_oi; + oa->o_mode = body->mode & S_IFMT; + oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLGROUP; + + if (body->valid & OBD_MD_FLCOOKIE) { + oa->o_valid |= OBD_MD_FLCOOKIE; + oti.oti_logcookies = + req_capsule_server_sized_get(&request->rq_pill, + &RMF_LOGCOOKIES, + sizeof(struct llog_cookie) * + lsm->lsm_stripe_count); + if (oti.oti_logcookies == NULL) { + oa->o_valid &= ~OBD_MD_FLCOOKIE; + body->valid &= ~OBD_MD_FLCOOKIE; + } + } + + if (body->valid & OBD_MD_FLOSSCAPA) { + rc = md_unpack_capa(ll_i2mdexp(dir), request, &RMF_CAPA2, &oc); + if (rc) + goto out_free_memmd; + } + + rc = obd_destroy(NULL, ll_i2dtexp(dir), oa, lsm, &oti, + ll_i2mdexp(dir), oc); + capa_put(oc); + if (rc) + CERROR("obd destroy objid "DOSTID" error %d\n", + POSTID(&lsm->lsm_oi), rc); +out_free_memmd: + obd_free_memmd(ll_i2dtexp(dir), &lsm); + OBDO_FREE(oa); +out: + return rc; +} + +/* ll_unlink() doesn't update the inode with the new link count. 
+ * Instead, ll_ddelete() and ll_d_iput() will update it based upon if there + * is any lock existing. They will recycle dentries and inodes based upon locks + * too. b=20433 */ +static int ll_unlink(struct inode *dir, struct dentry *dentry) +{ + struct ptlrpc_request *request = NULL; + struct md_op_data *op_data; + int rc; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%pd,dir=%lu/%u(%p)\n", + dentry, dir->i_ino, dir->i_generation, dir); + + op_data = ll_prep_md_op_data(NULL, dir, NULL, + dentry->d_name.name, + dentry->d_name.len, + 0, LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + return PTR_ERR(op_data); + + ll_get_child_fid(dentry, &op_data->op_fid3); + op_data->op_fid2 = op_data->op_fid3; + rc = md_unlink(ll_i2sbi(dir)->ll_md_exp, op_data, &request); + ll_finish_md_op_data(op_data); + if (rc) + goto out; + + ll_update_times(request, dir); + ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_UNLINK, 1); + + rc = ll_objects_destroy(request, dir); + out: + ptlrpc_req_finished(request); + return rc; +} + +static int ll_mkdir(struct inode *dir, struct dentry *dentry, ll_umode_t mode) +{ + int err; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%pd,dir=%lu/%u(%p)\n", + dentry, dir->i_ino, dir->i_generation, dir); + + if (!IS_POSIXACL(dir) || !exp_connect_umask(ll_i2mdexp(dir))) + mode &= ~current_umask(); + mode = (mode & (S_IRWXUGO|S_ISVTX)) | S_IFDIR; + err = ll_new_node(dir, dentry, NULL, mode, 0, LUSTRE_OPC_MKDIR); + + if (!err) + ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_MKDIR, 1); + + return err; +} + +static int ll_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct ptlrpc_request *request = NULL; + struct md_op_data *op_data; + int rc; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%pd,dir=%lu/%u(%p)\n", + dentry, dir->i_ino, dir->i_generation, dir); + + op_data = ll_prep_md_op_data(NULL, dir, NULL, + dentry->d_name.name, + dentry->d_name.len, + S_IFDIR, LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + return PTR_ERR(op_data); + + ll_get_child_fid(dentry, &op_data->op_fid3); + op_data->op_fid2 = op_data->op_fid3; + rc = md_unlink(ll_i2sbi(dir)->ll_md_exp, op_data, &request); + ll_finish_md_op_data(op_data); + if (rc == 0) { + ll_update_times(request, dir); + ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_RMDIR, 1); + } + + ptlrpc_req_finished(request); + return rc; +} + +static int ll_symlink(struct inode *dir, struct dentry *dentry, + const char *oldname) +{ + int err; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%pd,dir=%lu/%u(%p),target=%.*s\n", + dentry, dir->i_ino, dir->i_generation, + dir, 3000, oldname); + + err = ll_new_node(dir, dentry, oldname, S_IFLNK | S_IRWXUGO, + 0, LUSTRE_OPC_SYMLINK); + + if (!err) + ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_SYMLINK, 1); + + return err; +} + +static int ll_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *new_dentry) +{ + struct inode *src = d_inode(old_dentry); + struct ll_sb_info *sbi = ll_i2sbi(dir); + struct ptlrpc_request *request = NULL; + struct md_op_data *op_data; + int err; + + CDEBUG(D_VFSTRACE, + "VFS Op: inode=%lu/%u(%p), dir=%lu/%u(%p), target=%pd\n", + src->i_ino, src->i_generation, src, dir->i_ino, + dir->i_generation, dir, new_dentry); + + op_data = ll_prep_md_op_data(NULL, src, dir, new_dentry->d_name.name, + new_dentry->d_name.len, + 0, LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + return PTR_ERR(op_data); + + err = md_link(sbi->ll_md_exp, op_data, &request); + ll_finish_md_op_data(op_data); + if (err) + goto out; + + ll_update_times(request, dir); + ll_stats_ops_tally(sbi, LPROC_LL_LINK, 1); +out: + ptlrpc_req_finished(request); + return 
err; +} + +static int ll_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct ptlrpc_request *request = NULL; + struct ll_sb_info *sbi = ll_i2sbi(old_dir); + struct md_op_data *op_data; + int err; + + CDEBUG(D_VFSTRACE, + "VFS Op:oldname=%pd,src_dir=%lu/%u(%p),newname=%pd,tgt_dir=%lu/%u(%p)\n", + old_dentry, old_dir->i_ino, old_dir->i_generation, old_dir, + new_dentry, new_dir->i_ino, new_dir->i_generation, new_dir); + + op_data = ll_prep_md_op_data(NULL, old_dir, new_dir, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + return PTR_ERR(op_data); + + ll_get_child_fid(old_dentry, &op_data->op_fid3); + ll_get_child_fid(new_dentry, &op_data->op_fid4); + err = md_rename(sbi->ll_md_exp, op_data, + old_dentry->d_name.name, + old_dentry->d_name.len, + new_dentry->d_name.name, + new_dentry->d_name.len, &request); + ll_finish_md_op_data(op_data); + if (!err) { + ll_update_times(request, old_dir); + ll_update_times(request, new_dir); + ll_stats_ops_tally(sbi, LPROC_LL_RENAME, 1); + err = ll_objects_destroy(request, old_dir); + } + + ptlrpc_req_finished(request); + if (!err) + d_move(old_dentry, new_dentry); + return err; +} + +const struct inode_operations ll_dir_inode_operations = { + .mknod = ll_mknod, + .atomic_open = ll_atomic_open, + .lookup = ll_lookup_nd, + .create = ll_create_nd, + /* We need all these non-raw things for NFSD, to not patch it. */ + .unlink = ll_unlink, + .mkdir = ll_mkdir, + .rmdir = ll_rmdir, + .symlink = ll_symlink, + .link = ll_link, + .rename = ll_rename, + .setattr = ll_setattr, + .getattr = ll_getattr, + .permission = ll_inode_permission, + .setxattr = ll_setxattr, + .getxattr = ll_getxattr, + .listxattr = ll_listxattr, + .removexattr = ll_removexattr, + .get_acl = ll_get_acl, +}; + +const struct inode_operations ll_special_inode_operations = { + .setattr = ll_setattr, + .getattr = ll_getattr, + .permission = ll_inode_permission, + .setxattr = ll_setxattr, + .getxattr = ll_getxattr, + .listxattr = ll_listxattr, + .removexattr = ll_removexattr, + .get_acl = ll_get_acl, +}; diff --git a/kernel/drivers/staging/lustre/lustre/llite/remote_perm.c b/kernel/drivers/staging/lustre/lustre/llite/remote_perm.c new file mode 100644 index 000000000..a58182600 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/remote_perm.c @@ -0,0 +1,331 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. 
+ * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/llite/remote_perm.c + * + * Lustre Permission Cache for Remote Client + * + * Author: Lai Siyao + * Author: Fan Yong + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include + +#include "../include/lustre_lite.h" +#include "../include/lustre_ha.h" +#include "../include/lustre_dlm.h" +#include "../include/lprocfs_status.h" +#include "../include/lustre_disk.h" +#include "../include/lustre_param.h" +#include "llite_internal.h" + +struct kmem_cache *ll_remote_perm_cachep = NULL; +struct kmem_cache *ll_rmtperm_hash_cachep = NULL; + +static inline struct ll_remote_perm *alloc_ll_remote_perm(void) +{ + struct ll_remote_perm *lrp; + + OBD_SLAB_ALLOC_PTR_GFP(lrp, ll_remote_perm_cachep, GFP_KERNEL); + if (lrp) + INIT_HLIST_NODE(&lrp->lrp_list); + return lrp; +} + +static inline void free_ll_remote_perm(struct ll_remote_perm *lrp) +{ + if (!lrp) + return; + + if (!hlist_unhashed(&lrp->lrp_list)) + hlist_del(&lrp->lrp_list); + OBD_SLAB_FREE(lrp, ll_remote_perm_cachep, sizeof(*lrp)); +} + +static struct hlist_head *alloc_rmtperm_hash(void) +{ + struct hlist_head *hash; + int i; + + OBD_SLAB_ALLOC_GFP(hash, ll_rmtperm_hash_cachep, + REMOTE_PERM_HASHSIZE * sizeof(*hash), + GFP_IOFS); + if (!hash) + return NULL; + + for (i = 0; i < REMOTE_PERM_HASHSIZE; i++) + INIT_HLIST_HEAD(hash + i); + + return hash; +} + +void free_rmtperm_hash(struct hlist_head *hash) +{ + int i; + struct ll_remote_perm *lrp; + struct hlist_node *next; + + if (!hash) + return; + + for (i = 0; i < REMOTE_PERM_HASHSIZE; i++) + hlist_for_each_entry_safe(lrp, next, hash + i, + lrp_list) + free_ll_remote_perm(lrp); + OBD_SLAB_FREE(hash, ll_rmtperm_hash_cachep, + REMOTE_PERM_HASHSIZE * sizeof(*hash)); +} + +static inline int remote_perm_hashfunc(uid_t uid) +{ + return uid & (REMOTE_PERM_HASHSIZE - 1); +} + +/* NB: setxid permission is not checked here, instead it's done on + * MDT when client get remote permission. */ +static int do_check_remote_perm(struct ll_inode_info *lli, int mask) +{ + struct hlist_head *head; + struct ll_remote_perm *lrp; + int found = 0, rc; + + if (!lli->lli_remote_perms) + return -ENOENT; + + head = lli->lli_remote_perms + + remote_perm_hashfunc(from_kuid(&init_user_ns, current_uid())); + + spin_lock(&lli->lli_lock); + hlist_for_each_entry(lrp, head, lrp_list) { + if (lrp->lrp_uid != from_kuid(&init_user_ns, current_uid())) + continue; + if (lrp->lrp_gid != from_kgid(&init_user_ns, current_gid())) + continue; + if (lrp->lrp_fsuid != from_kuid(&init_user_ns, current_fsuid())) + continue; + if (lrp->lrp_fsgid != from_kgid(&init_user_ns, current_fsgid())) + continue; + found = 1; + break; + } + + if (!found) { + rc = -ENOENT; + goto out; + } + + CDEBUG(D_SEC, "found remote perm: %u/%u/%u/%u - %#x\n", + lrp->lrp_uid, lrp->lrp_gid, lrp->lrp_fsuid, lrp->lrp_fsgid, + lrp->lrp_access_perm); + rc = ((lrp->lrp_access_perm & mask) == mask) ? 
0 : -EACCES; + +out: + spin_unlock(&lli->lli_lock); + return rc; +} + +int ll_update_remote_perm(struct inode *inode, struct mdt_remote_perm *perm) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_remote_perm *lrp = NULL, *tmp = NULL; + struct hlist_head *head, *perm_hash = NULL; + + LASSERT(ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT); + +#if 0 + if (perm->rp_uid != current->uid || + perm->rp_gid != current->gid || + perm->rp_fsuid != current->fsuid || + perm->rp_fsgid != current->fsgid) { + /* user might setxid in this small period */ + CDEBUG(D_SEC, + "remote perm user %u/%u/%u/%u != current %u/%u/%u/%u\n", + perm->rp_uid, perm->rp_gid, perm->rp_fsuid, + perm->rp_fsgid, current->uid, current->gid, + current->fsuid, current->fsgid); + return -EAGAIN; + } +#endif + + if (!lli->lli_remote_perms) { + perm_hash = alloc_rmtperm_hash(); + if (perm_hash == NULL) { + CERROR("alloc lli_remote_perms failed!\n"); + return -ENOMEM; + } + } + + spin_lock(&lli->lli_lock); + + if (!lli->lli_remote_perms) + lli->lli_remote_perms = perm_hash; + else + free_rmtperm_hash(perm_hash); + + head = lli->lli_remote_perms + remote_perm_hashfunc(perm->rp_uid); + +again: + hlist_for_each_entry(tmp, head, lrp_list) { + if (tmp->lrp_uid != perm->rp_uid) + continue; + if (tmp->lrp_gid != perm->rp_gid) + continue; + if (tmp->lrp_fsuid != perm->rp_fsuid) + continue; + if (tmp->lrp_fsgid != perm->rp_fsgid) + continue; + free_ll_remote_perm(lrp); + lrp = tmp; + break; + } + + if (!lrp) { + spin_unlock(&lli->lli_lock); + lrp = alloc_ll_remote_perm(); + if (!lrp) { + CERROR("alloc memory for ll_remote_perm failed!\n"); + return -ENOMEM; + } + spin_lock(&lli->lli_lock); + goto again; + } + + lrp->lrp_access_perm = perm->rp_access_perm; + if (lrp != tmp) { + lrp->lrp_uid = perm->rp_uid; + lrp->lrp_gid = perm->rp_gid; + lrp->lrp_fsuid = perm->rp_fsuid; + lrp->lrp_fsgid = perm->rp_fsgid; + hlist_add_head(&lrp->lrp_list, head); + } + lli->lli_rmtperm_time = cfs_time_current(); + spin_unlock(&lli->lli_lock); + + CDEBUG(D_SEC, "new remote perm@%p: %u/%u/%u/%u - %#x\n", + lrp, lrp->lrp_uid, lrp->lrp_gid, lrp->lrp_fsuid, lrp->lrp_fsgid, + lrp->lrp_access_perm); + + return 0; +} + +int lustre_check_remote_perm(struct inode *inode, int mask) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *req = NULL; + struct mdt_remote_perm *perm; + struct obd_capa *oc; + unsigned long save; + int i = 0, rc; + + do { + save = lli->lli_rmtperm_time; + rc = do_check_remote_perm(lli, mask); + if (!rc || (rc != -ENOENT && i)) + break; + + might_sleep(); + + mutex_lock(&lli->lli_rmtperm_mutex); + /* check again */ + if (save != lli->lli_rmtperm_time) { + rc = do_check_remote_perm(lli, mask); + if (!rc || (rc != -ENOENT && i)) { + mutex_unlock(&lli->lli_rmtperm_mutex); + break; + } + } + + if (i++ > 5) { + CERROR("check remote perm falls in dead loop!\n"); + LBUG(); + } + + oc = ll_mdscapa_get(inode); + rc = md_get_remote_perm(sbi->ll_md_exp, ll_inode2fid(inode), oc, + ll_i2suppgid(inode), &req); + capa_put(oc); + if (rc) { + mutex_unlock(&lli->lli_rmtperm_mutex); + break; + } + + perm = req_capsule_server_swab_get(&req->rq_pill, &RMF_ACL, + lustre_swab_mdt_remote_perm); + if (unlikely(perm == NULL)) { + mutex_unlock(&lli->lli_rmtperm_mutex); + rc = -EPROTO; + break; + } + + rc = ll_update_remote_perm(inode, perm); + mutex_unlock(&lli->lli_rmtperm_mutex); + if (rc == -ENOMEM) + break; + + ptlrpc_req_finished(req); + req = NULL; + } while (1); + ptlrpc_req_finished(req); 
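+ /* req may still hold a reply from an iteration that broke out early
+ * (e.g. -EPROTO or -ENOMEM); the call above releases it, and it is a
+ * no-op when the loop already dropped the request and set req to NULL. */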
+ return rc; +} + +#if 0 /* NB: remote perms can't be freed in ll_mdc_blocking_ast of UPDATE lock, + * because it will fail sanity test 48. + */ +void ll_free_remote_perms(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct hlist_head *hash = lli->lli_remote_perms; + struct ll_remote_perm *lrp; + struct hlist_node *node, *next; + int i; + + LASSERT(hash); + + spin_lock(&lli->lli_lock); + + for (i = 0; i < REMOTE_PERM_HASHSIZE; i++) { + hlist_for_each_entry_safe(lrp, node, next, hash + i, + lrp_list) + free_ll_remote_perm(lrp); + } + + spin_unlock(&lli->lli_lock); +} +#endif diff --git a/kernel/drivers/staging/lustre/lustre/llite/rw.c b/kernel/drivers/staging/lustre/lustre/llite/rw.c new file mode 100644 index 000000000..991d20c50 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/rw.c @@ -0,0 +1,1289 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/llite/rw.c + * + * Lustre Lite I/O page cache routines shared by different kernel revs + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +/* current_is_kswapd() */ +#include + +#define DEBUG_SUBSYSTEM S_LLITE + +#include "../include/lustre_lite.h" +#include "../include/obd_cksum.h" +#include "llite_internal.h" +#include "../include/linux/lustre_compat25.h" + +/** + * Finalizes cl-data before exiting typical address_space operation. Dual to + * ll_cl_init(). + */ +static void ll_cl_fini(struct ll_cl_context *lcc) +{ + struct lu_env *env = lcc->lcc_env; + struct cl_io *io = lcc->lcc_io; + struct cl_page *page = lcc->lcc_page; + + LASSERT(lcc->lcc_cookie == current); + LASSERT(env != NULL); + + if (page != NULL) { + lu_ref_del(&page->cp_reference, "cl_io", io); + cl_page_put(env, page); + } + + cl_env_put(env, &lcc->lcc_refcheck); +} + +/** + * Initializes common cl-data at the typical address_space operation entry + * point. 
+ */ +static struct ll_cl_context *ll_cl_init(struct file *file, + struct page *vmpage, int create) +{ + struct ll_cl_context *lcc; + struct lu_env *env; + struct cl_io *io; + struct cl_object *clob; + struct ccc_io *cio; + + int refcheck; + int result = 0; + + clob = ll_i2info(vmpage->mapping->host)->lli_clob; + LASSERT(clob != NULL); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + return ERR_CAST(env); + + lcc = &vvp_env_info(env)->vti_io_ctx; + memset(lcc, 0, sizeof(*lcc)); + lcc->lcc_env = env; + lcc->lcc_refcheck = refcheck; + lcc->lcc_cookie = current; + + cio = ccc_env_io(env); + io = cio->cui_cl.cis_io; + if (io == NULL && create) { + struct inode *inode = vmpage->mapping->host; + loff_t pos; + + if (mutex_trylock(&inode->i_mutex)) { + mutex_unlock(&(inode)->i_mutex); + + /* this is too bad. Someone is trying to write the + * page w/o holding inode mutex. This means we can + * add dirty pages into cache during truncate */ + CERROR("Proc %s is dirtying page w/o inode lock, this will break truncate\n", + current->comm); + dump_stack(); + LBUG(); + return ERR_PTR(-EIO); + } + + /* + * Loop-back driver calls ->prepare_write(). + * methods directly, bypassing file system ->write() operation, + * so cl_io has to be created here. + */ + io = ccc_env_thread_io(env); + ll_io_init(io, file, 1); + + /* No lock at all for this kind of IO - we can't do it because + * we have held page lock, it would cause deadlock. + * XXX: This causes poor performance to loop device - One page + * per RPC. + * In order to get better performance, users should use + * lloop driver instead. + */ + io->ci_lockreq = CILR_NEVER; + + pos = vmpage->index << PAGE_CACHE_SHIFT; + + /* Create a temp IO to serve write. */ + result = cl_io_rw_init(env, io, CIT_WRITE, pos, PAGE_CACHE_SIZE); + if (result == 0) { + cio->cui_fd = LUSTRE_FPRIVATE(file); + cio->cui_iter = NULL; + result = cl_io_iter_init(env, io); + if (result == 0) { + result = cl_io_lock(env, io); + if (result == 0) + result = cl_io_start(env, io); + } + } else + result = io->ci_result; + } + + lcc->lcc_io = io; + if (io == NULL) + result = -EIO; + if (result == 0) { + struct cl_page *page; + + LASSERT(io != NULL); + LASSERT(io->ci_state == CIS_IO_GOING); + LASSERT(cio->cui_fd == LUSTRE_FPRIVATE(file)); + page = cl_page_find(env, clob, vmpage->index, vmpage, + CPT_CACHEABLE); + if (!IS_ERR(page)) { + lcc->lcc_page = page; + lu_ref_add(&page->cp_reference, "cl_io", io); + result = 0; + } else + result = PTR_ERR(page); + } + if (result) { + ll_cl_fini(lcc); + lcc = ERR_PTR(result); + } + + CDEBUG(D_VFSTRACE, "%lu@"DFID" -> %d %p %p\n", + vmpage->index, PFID(lu_object_fid(&clob->co_lu)), result, + env, io); + return lcc; +} + +static struct ll_cl_context *ll_cl_get(void) +{ + struct ll_cl_context *lcc; + struct lu_env *env; + int refcheck; + + env = cl_env_get(&refcheck); + LASSERT(!IS_ERR(env)); + lcc = &vvp_env_info(env)->vti_io_ctx; + LASSERT(env == lcc->lcc_env); + LASSERT(current == lcc->lcc_cookie); + cl_env_put(env, &refcheck); + + /* env has got in ll_cl_init, so it is still usable. */ + return lcc; +} + +/** + * ->prepare_write() address space operation called by generic_file_write() + * for every page during write. 
+ */ +int ll_prepare_write(struct file *file, struct page *vmpage, unsigned from, + unsigned to) +{ + struct ll_cl_context *lcc; + int result; + + lcc = ll_cl_init(file, vmpage, 1); + if (!IS_ERR(lcc)) { + struct lu_env *env = lcc->lcc_env; + struct cl_io *io = lcc->lcc_io; + struct cl_page *page = lcc->lcc_page; + + cl_page_assume(env, io, page); + + result = cl_io_prepare_write(env, io, page, from, to); + if (result == 0) { + /* + * Add a reference, so that page is not evicted from + * the cache until ->commit_write() is called. + */ + cl_page_get(page); + lu_ref_add(&page->cp_reference, "prepare_write", + current); + } else { + cl_page_unassume(env, io, page); + ll_cl_fini(lcc); + } + /* returning 0 in prepare assumes commit must be called + * afterwards */ + } else { + result = PTR_ERR(lcc); + } + return result; +} + +int ll_commit_write(struct file *file, struct page *vmpage, unsigned from, + unsigned to) +{ + struct ll_cl_context *lcc; + struct lu_env *env; + struct cl_io *io; + struct cl_page *page; + int result = 0; + + lcc = ll_cl_get(); + env = lcc->lcc_env; + page = lcc->lcc_page; + io = lcc->lcc_io; + + LASSERT(cl_page_is_owned(page, io)); + LASSERT(from <= to); + if (from != to) /* handle short write case. */ + result = cl_io_commit_write(env, io, page, from, to); + if (cl_page_is_owned(page, io)) + cl_page_unassume(env, io, page); + + /* + * Release reference acquired by ll_prepare_write(). + */ + lu_ref_del(&page->cp_reference, "prepare_write", current); + cl_page_put(env, page); + ll_cl_fini(lcc); + return result; +} + +struct obd_capa *cl_capa_lookup(struct inode *inode, enum cl_req_type crt) +{ + __u64 opc; + + opc = crt == CRT_WRITE ? CAPA_OPC_OSS_WRITE : CAPA_OPC_OSS_RW; + return ll_osscapa_get(inode, opc); +} + +static void ll_ra_stats_inc_sbi(struct ll_sb_info *sbi, enum ra_stat which); + +/** + * Get readahead pages from the filesystem readahead pool of the client for a + * thread. + * + * /param sbi superblock for filesystem readahead state ll_ra_info + * /param ria per-thread readahead state + * /param pages number of pages requested for readahead for the thread. + * + * WARNING: This algorithm is used to reduce contention on sbi->ll_lock. + * It should work well if the ra_max_pages is much greater than the single + * file's read-ahead window, and not too many threads contending for + * these readahead pages. + * + * TODO: There may be a 'global sync problem' if many threads are trying + * to get an ra budget that is larger than the remaining readahead pages + * and reach here at exactly the same time. They will compute /a ret to + * consume the remaining pages, but will fail at atomic_add_return() and + * get a zero ra window, although there is still ra space remaining. - Jay */ + +static unsigned long ll_ra_count_get(struct ll_sb_info *sbi, + struct ra_io_arg *ria, + unsigned long pages) +{ + struct ll_ra_info *ra = &sbi->ll_ra_info; + long ret; + + /* If read-ahead pages left are less than 1M, do not do read-ahead, + * otherwise it will form small read RPC(< 1M), which hurt server + * performance a lot. */ + ret = min(ra->ra_max_pages - atomic_read(&ra->ra_cur_pages), pages); + if (ret < 0 || ret < min_t(long, PTLRPC_MAX_BRW_PAGES, pages)) { + ret = 0; + goto out; + } + + /* If the non-strided (ria_pages == 0) readahead window + * (ria_start + ret) has grown across an RPC boundary, then trim + * readahead size by the amount beyond the RPC so it ends on an + * RPC boundary. 
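+ * As a worked example, with PTLRPC_MAX_BRW_PAGES = 256, ria_start = 100
+ * and ret = 200, (100 + 200) % 256 = 44 pages would spill past the RPC
+ * boundary, so ret is trimmed to 156 and the window ends at page 255.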
If the readahead window is already ending on + * an RPC boundary (beyond_rpc == 0), or smaller than a full + * RPC (beyond_rpc < ret) the readahead size is unchanged. + * The (beyond_rpc != 0) check is skipped since the conditional + * branch is more expensive than subtracting zero from the result. + * + * Strided read is left unaligned to avoid small fragments beyond + * the RPC boundary from needing an extra read RPC. */ + if (ria->ria_pages == 0) { + long beyond_rpc = (ria->ria_start + ret) % PTLRPC_MAX_BRW_PAGES; + if (/* beyond_rpc != 0 && */ beyond_rpc < ret) + ret -= beyond_rpc; + } + + if (atomic_add_return(ret, &ra->ra_cur_pages) > ra->ra_max_pages) { + atomic_sub(ret, &ra->ra_cur_pages); + ret = 0; + } + +out: + return ret; +} + +void ll_ra_count_put(struct ll_sb_info *sbi, unsigned long len) +{ + struct ll_ra_info *ra = &sbi->ll_ra_info; + atomic_sub(len, &ra->ra_cur_pages); +} + +static void ll_ra_stats_inc_sbi(struct ll_sb_info *sbi, enum ra_stat which) +{ + LASSERTF(which >= 0 && which < _NR_RA_STAT, "which: %u\n", which); + lprocfs_counter_incr(sbi->ll_ra_stats, which); +} + +void ll_ra_stats_inc(struct address_space *mapping, enum ra_stat which) +{ + struct ll_sb_info *sbi = ll_i2sbi(mapping->host); + ll_ra_stats_inc_sbi(sbi, which); +} + +#define RAS_CDEBUG(ras) \ + CDEBUG(D_READA, \ + "lrp %lu cr %lu cp %lu ws %lu wl %lu nra %lu r %lu ri %lu" \ + "csr %lu sf %lu sp %lu sl %lu \n", \ + ras->ras_last_readpage, ras->ras_consecutive_requests, \ + ras->ras_consecutive_pages, ras->ras_window_start, \ + ras->ras_window_len, ras->ras_next_readahead, \ + ras->ras_requests, ras->ras_request_index, \ + ras->ras_consecutive_stride_requests, ras->ras_stride_offset, \ + ras->ras_stride_pages, ras->ras_stride_length) + +static int index_in_window(unsigned long index, unsigned long point, + unsigned long before, unsigned long after) +{ + unsigned long start = point - before, end = point + after; + + if (start > point) + start = 0; + if (end < point) + end = ~0; + + return start <= index && index <= end; +} + +static struct ll_readahead_state *ll_ras_get(struct file *f) +{ + struct ll_file_data *fd; + + fd = LUSTRE_FPRIVATE(f); + return &fd->fd_ras; +} + +void ll_ra_read_in(struct file *f, struct ll_ra_read *rar) +{ + struct ll_readahead_state *ras; + + ras = ll_ras_get(f); + + spin_lock(&ras->ras_lock); + ras->ras_requests++; + ras->ras_request_index = 0; + ras->ras_consecutive_requests++; + rar->lrr_reader = current; + + list_add(&rar->lrr_linkage, &ras->ras_read_beads); + spin_unlock(&ras->ras_lock); +} + +void ll_ra_read_ex(struct file *f, struct ll_ra_read *rar) +{ + struct ll_readahead_state *ras; + + ras = ll_ras_get(f); + + spin_lock(&ras->ras_lock); + list_del_init(&rar->lrr_linkage); + spin_unlock(&ras->ras_lock); +} + +static struct ll_ra_read *ll_ra_read_get_locked(struct ll_readahead_state *ras) +{ + struct ll_ra_read *scan; + + list_for_each_entry(scan, &ras->ras_read_beads, lrr_linkage) { + if (scan->lrr_reader == current) + return scan; + } + return NULL; +} + +struct ll_ra_read *ll_ra_read_get(struct file *f) +{ + struct ll_readahead_state *ras; + struct ll_ra_read *bead; + + ras = ll_ras_get(f); + + spin_lock(&ras->ras_lock); + bead = ll_ra_read_get_locked(ras); + spin_unlock(&ras->ras_lock); + return bead; +} + +static int cl_read_ahead_page(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *queue, struct cl_page *page, + struct page *vmpage) +{ + struct ccc_page *cp; + int rc; + + rc = 0; + cl_page_assume(env, io, page); + 
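+ /* Only a page that is not yet uptodate and is still covered by an
+ * extent lock (cl_page_is_under_lock() returns -EBUSY) is queued for
+ * read-ahead below; otherwise the page is deleted and -ENOLCK stops
+ * the read-ahead scan, while an already completed page is skipped. */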
lu_ref_add(&page->cp_reference, "ra", current); + cp = cl2ccc_page(cl_page_at(page, &vvp_device_type)); + if (!cp->cpg_defer_uptodate && !PageUptodate(vmpage)) { + rc = cl_page_is_under_lock(env, io, page); + if (rc == -EBUSY) { + cp->cpg_defer_uptodate = 1; + cp->cpg_ra_used = 0; + cl_page_list_add(queue, page); + rc = 1; + } else { + cl_page_delete(env, page); + rc = -ENOLCK; + } + } else { + /* skip completed pages */ + cl_page_unassume(env, io, page); + } + lu_ref_del(&page->cp_reference, "ra", current); + cl_page_put(env, page); + return rc; +} + +/** + * Initiates read-ahead of a page with given index. + * + * \retval +ve: page was added to \a queue. + * + * \retval -ENOLCK: there is no extent lock for this part of a file, stop + * read-ahead. + * + * \retval -ve, 0: page wasn't added to \a queue for other reason. + */ +static int ll_read_ahead_page(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *queue, + pgoff_t index, struct address_space *mapping) +{ + struct page *vmpage; + struct cl_object *clob = ll_i2info(mapping->host)->lli_clob; + struct cl_page *page; + enum ra_stat which = _NR_RA_STAT; /* keep gcc happy */ + int rc = 0; + const char *msg = NULL; + + vmpage = grab_cache_page_nowait(mapping, index); + if (vmpage != NULL) { + /* Check if vmpage was truncated or reclaimed */ + if (vmpage->mapping == mapping) { + page = cl_page_find(env, clob, vmpage->index, + vmpage, CPT_CACHEABLE); + if (!IS_ERR(page)) { + rc = cl_read_ahead_page(env, io, queue, + page, vmpage); + if (rc == -ENOLCK) { + which = RA_STAT_FAILED_MATCH; + msg = "lock match failed"; + } + } else { + which = RA_STAT_FAILED_GRAB_PAGE; + msg = "cl_page_find failed"; + } + } else { + which = RA_STAT_WRONG_GRAB_PAGE; + msg = "g_c_p_n returned invalid page"; + } + if (rc != 1) + unlock_page(vmpage); + page_cache_release(vmpage); + } else { + which = RA_STAT_FAILED_GRAB_PAGE; + msg = "g_c_p_n failed"; + } + if (msg != NULL) { + ll_ra_stats_inc(mapping, which); + CDEBUG(D_READA, "%s\n", msg); + } + return rc; +} + +#define RIA_DEBUG(ria) \ + CDEBUG(D_READA, "rs %lu re %lu ro %lu rl %lu rp %lu\n", \ + ria->ria_start, ria->ria_end, ria->ria_stoff, ria->ria_length,\ + ria->ria_pages) + +/* Limit this to the blocksize instead of PTLRPC_BRW_MAX_SIZE, since we don't + * know what the actual RPC size is. If this needs to change, it makes more + * sense to tune the i_blkbits value for the file based on the OSTs it is + * striped over, rather than having a constant value for all files here. */ + +/* RAS_INCREASE_STEP should be (1UL << (inode->i_blkbits - PAGE_CACHE_SHIFT)). + * Temporarily set RAS_INCREASE_STEP to 1MB. After 4MB RPC is enabled + * by default, this should be adjusted corresponding with max_read_ahead_mb + * and max_read_ahead_per_file_mb otherwise the readahead budget can be used + * up quickly which will affect read performance significantly. See LU-2816 */ +#define RAS_INCREASE_STEP(inode) (ONE_MB_BRW_SIZE >> PAGE_CACHE_SHIFT) + +static inline int stride_io_mode(struct ll_readahead_state *ras) +{ + return ras->ras_consecutive_stride_requests > 1; +} +/* The function calculates how much pages will be read in + * [off, off + length], in such stride IO area, + * stride_offset = st_off, stride_length = st_len, + * stride_pages = st_pgs + * + * |------------------|*****|------------------|*****|------------|*****|.... 
+ * st_off + * |--- st_pgs ---| + * |----- st_len -----| + * + * How many pages it should read in such pattern + * |-------------------------------------------------------------| + * off + * |<------ length ------->| + * + * = |<----->| + |-------------------------------------| + |---| + * start_left st_pgs * i end_left + */ +static unsigned long +stride_pg_count(pgoff_t st_off, unsigned long st_len, unsigned long st_pgs, + unsigned long off, unsigned long length) +{ + __u64 start = off > st_off ? off - st_off : 0; + __u64 end = off + length > st_off ? off + length - st_off : 0; + unsigned long start_left = 0; + unsigned long end_left = 0; + unsigned long pg_count; + + if (st_len == 0 || length == 0 || end == 0) + return length; + + start_left = do_div(start, st_len); + if (start_left < st_pgs) + start_left = st_pgs - start_left; + else + start_left = 0; + + end_left = do_div(end, st_len); + if (end_left > st_pgs) + end_left = st_pgs; + + CDEBUG(D_READA, "start %llu, end %llu start_left %lu end_left %lu \n", + start, end, start_left, end_left); + + if (start == end) + pg_count = end_left - (st_pgs - start_left); + else + pg_count = start_left + st_pgs * (end - start - 1) + end_left; + + CDEBUG(D_READA, "st_off %lu, st_len %lu st_pgs %lu off %lu length %lu pgcount %lu\n", + st_off, st_len, st_pgs, off, length, pg_count); + + return pg_count; +} + +static int ria_page_count(struct ra_io_arg *ria) +{ + __u64 length = ria->ria_end >= ria->ria_start ? + ria->ria_end - ria->ria_start + 1 : 0; + + return stride_pg_count(ria->ria_stoff, ria->ria_length, + ria->ria_pages, ria->ria_start, + length); +} + +/*Check whether the index is in the defined ra-window */ +static int ras_inside_ra_window(unsigned long idx, struct ra_io_arg *ria) +{ + /* If ria_length == ria_pages, it means non-stride I/O mode, + * idx should always inside read-ahead window in this case + * For stride I/O mode, just check whether the idx is inside + * the ria_pages. 
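+ * For example, with ria_stoff = 0, ria_length = 16 and ria_pages = 4,
+ * idx = 3 and idx = 19 are inside the window (3 % 16 and 19 % 16 are
+ * both 3, which is < 4), while idx = 20 is not (20 % 16 = 4).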
*/ + return ria->ria_length == 0 || ria->ria_length == ria->ria_pages || + (idx >= ria->ria_stoff && (idx - ria->ria_stoff) % + ria->ria_length < ria->ria_pages); +} + +static int ll_read_ahead_pages(const struct lu_env *env, + struct cl_io *io, struct cl_page_list *queue, + struct ra_io_arg *ria, + unsigned long *reserved_pages, + struct address_space *mapping, + unsigned long *ra_end) +{ + int rc, count = 0, stride_ria; + unsigned long page_idx; + + LASSERT(ria != NULL); + RIA_DEBUG(ria); + + stride_ria = ria->ria_length > ria->ria_pages && ria->ria_pages > 0; + for (page_idx = ria->ria_start; page_idx <= ria->ria_end && + *reserved_pages > 0; page_idx++) { + if (ras_inside_ra_window(page_idx, ria)) { + /* If the page is inside the read-ahead window*/ + rc = ll_read_ahead_page(env, io, queue, + page_idx, mapping); + if (rc == 1) { + (*reserved_pages)--; + count ++; + } else if (rc == -ENOLCK) + break; + } else if (stride_ria) { + /* If it is not in the read-ahead window, and it is + * read-ahead mode, then check whether it should skip + * the stride gap */ + pgoff_t offset; + /* FIXME: This assertion only is valid when it is for + * forward read-ahead, it will be fixed when backward + * read-ahead is implemented */ + LASSERTF(page_idx > ria->ria_stoff, "Invalid page_idx %lu rs %lu re %lu ro %lu rl %lu rp %lu\n", + page_idx, + ria->ria_start, ria->ria_end, ria->ria_stoff, + ria->ria_length, ria->ria_pages); + offset = page_idx - ria->ria_stoff; + offset = offset % (ria->ria_length); + if (offset > ria->ria_pages) { + page_idx += ria->ria_length - offset; + CDEBUG(D_READA, "i %lu skip %lu \n", page_idx, + ria->ria_length - offset); + continue; + } + } + } + *ra_end = page_idx; + return count; +} + +int ll_readahead(const struct lu_env *env, struct cl_io *io, + struct ll_readahead_state *ras, struct address_space *mapping, + struct cl_page_list *queue, int flags) +{ + struct vvp_io *vio = vvp_env_io(env); + struct vvp_thread_info *vti = vvp_env_info(env); + struct cl_attr *attr = ccc_env_thread_attr(env); + unsigned long start = 0, end = 0, reserved; + unsigned long ra_end, len; + struct inode *inode; + struct ll_ra_read *bead; + struct ra_io_arg *ria = &vti->vti_ria; + struct ll_inode_info *lli; + struct cl_object *clob; + int ret = 0; + __u64 kms; + + inode = mapping->host; + lli = ll_i2info(inode); + clob = lli->lli_clob; + + memset(ria, 0, sizeof(*ria)); + + cl_object_attr_lock(clob); + ret = cl_object_attr_get(env, clob, attr); + cl_object_attr_unlock(clob); + + if (ret != 0) + return ret; + kms = attr->cat_kms; + if (kms == 0) { + ll_ra_stats_inc(mapping, RA_STAT_ZERO_LEN); + return 0; + } + + spin_lock(&ras->ras_lock); + if (vio->cui_ra_window_set) + bead = &vio->cui_bead; + else + bead = NULL; + + /* Enlarge the RA window to encompass the full read */ + if (bead != NULL && ras->ras_window_start + ras->ras_window_len < + bead->lrr_start + bead->lrr_count) { + ras->ras_window_len = bead->lrr_start + bead->lrr_count - + ras->ras_window_start; + } + /* Reserve a part of the read-ahead window that we'll be issuing */ + if (ras->ras_window_len) { + start = ras->ras_next_readahead; + end = ras->ras_window_start + ras->ras_window_len - 1; + } + if (end != 0) { + unsigned long rpc_boundary; + /* + * Align RA window to an optimal boundary. + * + * XXX This would be better to align to cl_max_pages_per_rpc + * instead of PTLRPC_MAX_BRW_PAGES, because the RPC size may + * be aligned to the RAID stripe size in the future and that + * is more important than the RPC size. 
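+ * As a worked example with PTLRPC_MAX_BRW_PAGES = 256: for a window
+ * ending at page 300, rpc_boundary below becomes ((300 + 1) & ~255) - 1
+ * = 255, so (when start is below that) the window is trimmed to end
+ * exactly on the 256-page RPC boundary.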
+ */ + /* Note: we only trim the RPC, instead of extending the RPC + * to the boundary, so to avoid reading too much pages during + * random reading. */ + rpc_boundary = (end + 1) & (~(PTLRPC_MAX_BRW_PAGES - 1)); + if (rpc_boundary > 0) + rpc_boundary--; + + if (rpc_boundary > start) + end = rpc_boundary; + + /* Truncate RA window to end of file */ + end = min(end, (unsigned long)((kms - 1) >> PAGE_CACHE_SHIFT)); + + ras->ras_next_readahead = max(end, end + 1); + RAS_CDEBUG(ras); + } + ria->ria_start = start; + ria->ria_end = end; + /* If stride I/O mode is detected, get stride window*/ + if (stride_io_mode(ras)) { + ria->ria_stoff = ras->ras_stride_offset; + ria->ria_length = ras->ras_stride_length; + ria->ria_pages = ras->ras_stride_pages; + } + spin_unlock(&ras->ras_lock); + + if (end == 0) { + ll_ra_stats_inc(mapping, RA_STAT_ZERO_WINDOW); + return 0; + } + len = ria_page_count(ria); + if (len == 0) + return 0; + + reserved = ll_ra_count_get(ll_i2sbi(inode), ria, len); + if (reserved < len) + ll_ra_stats_inc(mapping, RA_STAT_MAX_IN_FLIGHT); + + CDEBUG(D_READA, "reserved page %lu ra_cur %d ra_max %lu\n", reserved, + atomic_read(&ll_i2sbi(inode)->ll_ra_info.ra_cur_pages), + ll_i2sbi(inode)->ll_ra_info.ra_max_pages); + + ret = ll_read_ahead_pages(env, io, queue, + ria, &reserved, mapping, &ra_end); + + LASSERTF(reserved >= 0, "reserved %lu\n", reserved); + if (reserved != 0) + ll_ra_count_put(ll_i2sbi(inode), reserved); + + if (ra_end == end + 1 && ra_end == (kms >> PAGE_CACHE_SHIFT)) + ll_ra_stats_inc(mapping, RA_STAT_EOF); + + /* if we didn't get to the end of the region we reserved from + * the ras we need to go back and update the ras so that the + * next read-ahead tries from where we left off. we only do so + * if the region we failed to issue read-ahead on is still ahead + * of the app and behind the next index to start read-ahead from */ + CDEBUG(D_READA, "ra_end %lu end %lu stride end %lu \n", + ra_end, end, ria->ria_end); + + if (ra_end != end + 1) { + spin_lock(&ras->ras_lock); + if (ra_end < ras->ras_next_readahead && + index_in_window(ra_end, ras->ras_window_start, 0, + ras->ras_window_len)) { + ras->ras_next_readahead = ra_end; + RAS_CDEBUG(ras); + } + spin_unlock(&ras->ras_lock); + } + + return ret; +} + +static void ras_set_start(struct inode *inode, struct ll_readahead_state *ras, + unsigned long index) +{ + ras->ras_window_start = index & (~(RAS_INCREASE_STEP(inode) - 1)); +} + +/* called with the ras_lock held or from places where it doesn't matter */ +static void ras_reset(struct inode *inode, struct ll_readahead_state *ras, + unsigned long index) +{ + ras->ras_last_readpage = index; + ras->ras_consecutive_requests = 0; + ras->ras_consecutive_pages = 0; + ras->ras_window_len = 0; + ras_set_start(inode, ras, index); + ras->ras_next_readahead = max(ras->ras_window_start, index); + + RAS_CDEBUG(ras); +} + +/* called with the ras_lock held or from places where it doesn't matter */ +static void ras_stride_reset(struct ll_readahead_state *ras) +{ + ras->ras_consecutive_stride_requests = 0; + ras->ras_stride_length = 0; + ras->ras_stride_pages = 0; + RAS_CDEBUG(ras); +} + +void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras) +{ + spin_lock_init(&ras->ras_lock); + ras_reset(inode, ras, 0); + ras->ras_requests = 0; + INIT_LIST_HEAD(&ras->ras_read_beads); +} + +/* + * Check whether the read request is in the stride window. + * If it is in the stride window, return 1, otherwise return 0. 
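+ * For example, with ras_stride_length = 16 and ras_stride_pages = 4, a
+ * read at ras_last_readpage + 13 (a gap of 12 = 16 - 4 pages) matches
+ * the stride pattern, provided the previous run covered exactly 4
+ * consecutive pages.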
+ */ +static int index_in_stride_window(struct ll_readahead_state *ras, + unsigned long index) +{ + unsigned long stride_gap; + + if (ras->ras_stride_length == 0 || ras->ras_stride_pages == 0 || + ras->ras_stride_pages == ras->ras_stride_length) + return 0; + + stride_gap = index - ras->ras_last_readpage - 1; + + /* If it is contiguous read */ + if (stride_gap == 0) + return ras->ras_consecutive_pages + 1 <= ras->ras_stride_pages; + + /* Otherwise check the stride by itself */ + return (ras->ras_stride_length - ras->ras_stride_pages) == stride_gap && + ras->ras_consecutive_pages == ras->ras_stride_pages; +} + +static void ras_update_stride_detector(struct ll_readahead_state *ras, + unsigned long index) +{ + unsigned long stride_gap = index - ras->ras_last_readpage - 1; + + if (!stride_io_mode(ras) && (stride_gap != 0 || + ras->ras_consecutive_stride_requests == 0)) { + ras->ras_stride_pages = ras->ras_consecutive_pages; + ras->ras_stride_length = stride_gap +ras->ras_consecutive_pages; + } + LASSERT(ras->ras_request_index == 0); + LASSERT(ras->ras_consecutive_stride_requests == 0); + + if (index <= ras->ras_last_readpage) { + /*Reset stride window for forward read*/ + ras_stride_reset(ras); + return; + } + + ras->ras_stride_pages = ras->ras_consecutive_pages; + ras->ras_stride_length = stride_gap +ras->ras_consecutive_pages; + + RAS_CDEBUG(ras); + return; +} + +static unsigned long +stride_page_count(struct ll_readahead_state *ras, unsigned long len) +{ + return stride_pg_count(ras->ras_stride_offset, ras->ras_stride_length, + ras->ras_stride_pages, ras->ras_stride_offset, + len); +} + +/* Stride Read-ahead window will be increased inc_len according to + * stride I/O pattern */ +static void ras_stride_increase_window(struct ll_readahead_state *ras, + struct ll_ra_info *ra, + unsigned long inc_len) +{ + unsigned long left, step, window_len; + unsigned long stride_len; + + LASSERT(ras->ras_stride_length > 0); + LASSERTF(ras->ras_window_start + ras->ras_window_len + >= ras->ras_stride_offset, "window_start %lu, window_len %lu stride_offset %lu\n", + ras->ras_window_start, + ras->ras_window_len, ras->ras_stride_offset); + + stride_len = ras->ras_window_start + ras->ras_window_len - + ras->ras_stride_offset; + + left = stride_len % ras->ras_stride_length; + window_len = ras->ras_window_len - left; + + if (left < ras->ras_stride_pages) + left += inc_len; + else + left = ras->ras_stride_pages + inc_len; + + LASSERT(ras->ras_stride_pages != 0); + + step = left / ras->ras_stride_pages; + left %= ras->ras_stride_pages; + + window_len += step * ras->ras_stride_length + left; + + if (stride_page_count(ras, window_len) <= ra->ra_max_pages_per_file) + ras->ras_window_len = window_len; + + RAS_CDEBUG(ras); +} + +static void ras_increase_window(struct inode *inode, + struct ll_readahead_state *ras, + struct ll_ra_info *ra) +{ + /* The stretch of ra-window should be aligned with max rpc_size + * but current clio architecture does not support retrieve such + * information from lower layer. FIXME later + */ + if (stride_io_mode(ras)) + ras_stride_increase_window(ras, ra, RAS_INCREASE_STEP(inode)); + else + ras->ras_window_len = min(ras->ras_window_len + + RAS_INCREASE_STEP(inode), + ra->ra_max_pages_per_file); +} + +void ras_update(struct ll_sb_info *sbi, struct inode *inode, + struct ll_readahead_state *ras, unsigned long index, + unsigned hit) +{ + struct ll_ra_info *ra = &sbi->ll_ra_info; + int zero = 0, stride_detect = 0, ra_miss = 0; + + spin_lock(&ras->ras_lock); + + ll_ra_stats_inc_sbi(sbi, hit ? 
RA_STAT_HIT : RA_STAT_MISS); + + /* reset the read-ahead window in two cases. First when the app seeks + * or reads to some other part of the file. Secondly if we get a + * read-ahead miss that we think we've previously issued. This can + * be a symptom of there being so many read-ahead pages that the VM is + * reclaiming it before we get to it. */ + if (!index_in_window(index, ras->ras_last_readpage, 8, 8)) { + zero = 1; + ll_ra_stats_inc_sbi(sbi, RA_STAT_DISTANT_READPAGE); + } else if (!hit && ras->ras_window_len && + index < ras->ras_next_readahead && + index_in_window(index, ras->ras_window_start, 0, + ras->ras_window_len)) { + ra_miss = 1; + ll_ra_stats_inc_sbi(sbi, RA_STAT_MISS_IN_WINDOW); + } + + /* On the second access to a file smaller than the tunable + * ra_max_read_ahead_whole_pages trigger RA on all pages in the + * file up to ra_max_pages_per_file. This is simply a best effort + * and only occurs once per open file. Normal RA behavior is reverted + * to for subsequent IO. The mmap case does not increment + * ras_requests and thus can never trigger this behavior. */ + if (ras->ras_requests == 2 && !ras->ras_request_index) { + __u64 kms_pages; + + kms_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + + CDEBUG(D_READA, "kmsp %llu mwp %lu mp %lu\n", kms_pages, + ra->ra_max_read_ahead_whole_pages, ra->ra_max_pages_per_file); + + if (kms_pages && + kms_pages <= ra->ra_max_read_ahead_whole_pages) { + ras->ras_window_start = 0; + ras->ras_last_readpage = 0; + ras->ras_next_readahead = 0; + ras->ras_window_len = min(ra->ra_max_pages_per_file, + ra->ra_max_read_ahead_whole_pages); + goto out_unlock; + } + } + if (zero) { + /* check whether it is in stride I/O mode*/ + if (!index_in_stride_window(ras, index)) { + if (ras->ras_consecutive_stride_requests == 0 && + ras->ras_request_index == 0) { + ras_update_stride_detector(ras, index); + ras->ras_consecutive_stride_requests++; + } else { + ras_stride_reset(ras); + } + ras_reset(inode, ras, index); + ras->ras_consecutive_pages++; + goto out_unlock; + } else { + ras->ras_consecutive_pages = 0; + ras->ras_consecutive_requests = 0; + if (++ras->ras_consecutive_stride_requests > 1) + stride_detect = 1; + RAS_CDEBUG(ras); + } + } else { + if (ra_miss) { + if (index_in_stride_window(ras, index) && + stride_io_mode(ras)) { + /*If stride-RA hit cache miss, the stride dector + *will not be reset to avoid the overhead of + *redetecting read-ahead mode */ + if (index != ras->ras_last_readpage + 1) + ras->ras_consecutive_pages = 0; + ras_reset(inode, ras, index); + RAS_CDEBUG(ras); + } else { + /* Reset both stride window and normal RA + * window */ + ras_reset(inode, ras, index); + ras->ras_consecutive_pages++; + ras_stride_reset(ras); + goto out_unlock; + } + } else if (stride_io_mode(ras)) { + /* If this is contiguous read but in stride I/O mode + * currently, check whether stride step still is valid, + * if invalid, it will reset the stride ra window*/ + if (!index_in_stride_window(ras, index)) { + /* Shrink stride read-ahead window to be zero */ + ras_stride_reset(ras); + ras->ras_window_len = 0; + ras->ras_next_readahead = index; + } + } + } + ras->ras_consecutive_pages++; + ras->ras_last_readpage = index; + ras_set_start(inode, ras, index); + + if (stride_io_mode(ras)) + /* Since stride readahead is sensitive to the offset + * of read-ahead, so we use original offset here, + * instead of ras_window_start, which is RPC aligned */ + ras->ras_next_readahead = max(index, ras->ras_next_readahead); + else + 
ras->ras_next_readahead = max(ras->ras_window_start, + ras->ras_next_readahead); + RAS_CDEBUG(ras); + + /* Trigger RA in the mmap case where ras_consecutive_requests + * is not incremented and thus can't be used to trigger RA */ + if (!ras->ras_window_len && ras->ras_consecutive_pages == 4) { + ras->ras_window_len = RAS_INCREASE_STEP(inode); + goto out_unlock; + } + + /* Initially reset the stride window offset to next_readahead*/ + if (ras->ras_consecutive_stride_requests == 2 && stride_detect) { + /** + * Once stride IO mode is detected, next_readahead should be + * reset to make sure next_readahead > stride offset + */ + ras->ras_next_readahead = max(index, ras->ras_next_readahead); + ras->ras_stride_offset = index; + ras->ras_window_len = RAS_INCREASE_STEP(inode); + } + + /* The initial ras_window_len is set to the request size. To avoid + * uselessly reading and discarding pages for random IO the window is + * only increased once per consecutive request received. */ + if ((ras->ras_consecutive_requests > 1 || stride_detect) && + !ras->ras_request_index) + ras_increase_window(inode, ras, ra); +out_unlock: + RAS_CDEBUG(ras); + ras->ras_request_index++; + spin_unlock(&ras->ras_lock); + return; +} + +int ll_writepage(struct page *vmpage, struct writeback_control *wbc) +{ + struct inode *inode = vmpage->mapping->host; + struct ll_inode_info *lli = ll_i2info(inode); + struct lu_env *env; + struct cl_io *io; + struct cl_page *page; + struct cl_object *clob; + struct cl_env_nest nest; + bool redirtied = false; + bool unlocked = false; + int result; + + LASSERT(PageLocked(vmpage)); + LASSERT(!PageWriteback(vmpage)); + + LASSERT(ll_i2dtexp(inode) != NULL); + + env = cl_env_nested_get(&nest); + if (IS_ERR(env)) { + result = PTR_ERR(env); + goto out; + } + + clob = ll_i2info(inode)->lli_clob; + LASSERT(clob != NULL); + + io = ccc_env_thread_io(env); + io->ci_obj = clob; + io->ci_ignore_layout = 1; + result = cl_io_init(env, io, CIT_MISC, clob); + if (result == 0) { + page = cl_page_find(env, clob, vmpage->index, + vmpage, CPT_CACHEABLE); + if (!IS_ERR(page)) { + lu_ref_add(&page->cp_reference, "writepage", + current); + cl_page_assume(env, io, page); + result = cl_page_flush(env, io, page); + if (result != 0) { + /* + * Re-dirty page on error so it retries write, + * but not in case when IO has actually + * occurred and completed with an error. + */ + if (!PageError(vmpage)) { + redirty_page_for_writepage(wbc, vmpage); + result = 0; + redirtied = true; + } + } + cl_page_disown(env, io, page); + unlocked = true; + lu_ref_del(&page->cp_reference, + "writepage", current); + cl_page_put(env, page); + } else { + result = PTR_ERR(page); + } + } + cl_io_fini(env, io); + + if (redirtied && wbc->sync_mode == WB_SYNC_ALL) { + loff_t offset = cl_offset(clob, vmpage->index); + + /* Flush page failed because the extent is being written out. + * Wait for the write of extent to be finished to avoid + * breaking kernel which assumes ->writepage should mark + * PageWriteback or clean the page. */ + result = cl_sync_file_range(inode, offset, + offset + PAGE_CACHE_SIZE - 1, + CL_FSYNC_LOCAL, 1); + if (result > 0) { + /* actually we may have written more than one page. + * decreasing this page because the caller will count + * it. 
*/ + wbc->nr_to_write -= result - 1; + result = 0; + } + } + + cl_env_nested_put(&nest, env); + goto out; + +out: + if (result < 0) { + if (!lli->lli_async_rc) + lli->lli_async_rc = result; + SetPageError(vmpage); + if (!unlocked) + unlock_page(vmpage); + } + return result; +} + +int ll_writepages(struct address_space *mapping, struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + struct ll_sb_info *sbi = ll_i2sbi(inode); + loff_t start; + loff_t end; + enum cl_fsync_mode mode; + int range_whole = 0; + int result; + int ignore_layout = 0; + + if (wbc->range_cyclic) { + start = mapping->writeback_index << PAGE_CACHE_SHIFT; + end = OBD_OBJECT_EOF; + } else { + start = wbc->range_start; + end = wbc->range_end; + if (end == LLONG_MAX) { + end = OBD_OBJECT_EOF; + range_whole = start == 0; + } + } + + mode = CL_FSYNC_NONE; + if (wbc->sync_mode == WB_SYNC_ALL) + mode = CL_FSYNC_LOCAL; + + if (sbi->ll_umounting) + /* if the mountpoint is being umounted, all pages have to be + * evicted to avoid hitting LBUG when truncate_inode_pages() + * is called later on. */ + ignore_layout = 1; + result = cl_sync_file_range(inode, start, end, mode, ignore_layout); + if (result > 0) { + wbc->nr_to_write -= result; + result = 0; + } + + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) { + if (end == OBD_OBJECT_EOF) + end = i_size_read(inode); + mapping->writeback_index = (end >> PAGE_CACHE_SHIFT) + 1; + } + return result; +} + +int ll_readpage(struct file *file, struct page *vmpage) +{ + struct ll_cl_context *lcc; + int result; + + lcc = ll_cl_init(file, vmpage, 0); + if (!IS_ERR(lcc)) { + struct lu_env *env = lcc->lcc_env; + struct cl_io *io = lcc->lcc_io; + struct cl_page *page = lcc->lcc_page; + + LASSERT(page->cp_type == CPT_CACHEABLE); + if (likely(!PageUptodate(vmpage))) { + cl_page_assume(env, io, page); + result = cl_io_read_page(env, io, page); + } else { + /* Page from a non-object file. */ + unlock_page(vmpage); + result = 0; + } + ll_cl_fini(lcc); + } else { + unlock_page(vmpage); + result = PTR_ERR(lcc); + } + return result; +} diff --git a/kernel/drivers/staging/lustre/lustre/llite/rw26.c b/kernel/drivers/staging/lustre/lustre/llite/rw26.c new file mode 100644 index 000000000..c6c824356 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/rw26.c @@ -0,0 +1,553 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/lustre/llite/rw26.c + * + * Lustre Lite I/O page cache routines for the 2.5/2.6 kernel version + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_LLITE + +#include "../include/lustre_lite.h" +#include "llite_internal.h" +#include "../include/linux/lustre_compat25.h" + +/** + * Implements Linux VM address_space::invalidatepage() method. This method is + * called when the page is truncate from a file, either as a result of + * explicit truncate, or when inode is removed from memory (as a result of + * final iput(), umount, or memory pressure induced icache shrinking). + * + * [0, offset] bytes of the page remain valid (this is for a case of not-page + * aligned truncate). Lustre leaves partially truncated page in the cache, + * relying on struct inode::i_size to limit further accesses. + */ +static void ll_invalidatepage(struct page *vmpage, unsigned int offset, + unsigned int length) +{ + struct inode *inode; + struct lu_env *env; + struct cl_page *page; + struct cl_object *obj; + + int refcheck; + + LASSERT(PageLocked(vmpage)); + LASSERT(!PageWriteback(vmpage)); + + /* + * It is safe to not check anything in invalidatepage/releasepage + * below because they are run with page locked and all our io is + * happening with locked page too + */ + if (offset == 0 && length == PAGE_CACHE_SIZE) { + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + inode = vmpage->mapping->host; + obj = ll_i2info(inode)->lli_clob; + if (obj != NULL) { + page = cl_vmpage_page(vmpage, obj); + if (page != NULL) { + lu_ref_add(&page->cp_reference, + "delete", vmpage); + cl_page_delete(env, page); + lu_ref_del(&page->cp_reference, + "delete", vmpage); + cl_page_put(env, page); + } + } else + LASSERT(vmpage->private == 0); + cl_env_put(env, &refcheck); + } + } +} + +#ifdef HAVE_RELEASEPAGE_WITH_INT +#define RELEASEPAGE_ARG_TYPE int +#else +#define RELEASEPAGE_ARG_TYPE gfp_t +#endif +static int ll_releasepage(struct page *vmpage, RELEASEPAGE_ARG_TYPE gfp_mask) +{ + struct cl_env_nest nest; + struct lu_env *env; + struct cl_object *obj; + struct cl_page *page; + struct address_space *mapping; + int result; + + LASSERT(PageLocked(vmpage)); + if (PageWriteback(vmpage) || PageDirty(vmpage)) + return 0; + + mapping = vmpage->mapping; + if (mapping == NULL) + return 1; + + obj = ll_i2info(mapping->host)->lli_clob; + if (obj == NULL) + return 1; + + /* 1 for page allocator, 1 for cl_page and 1 for page cache */ + if (page_count(vmpage) > 3) + return 0; + + /* TODO: determine what gfp should be used by @gfp_mask. */ + env = cl_env_nested_get(&nest); + if (IS_ERR(env)) + /* If we can't allocate an env we won't call cl_page_put() + * later on which further means it's impossible to drop + * page refcount by cl_page, so ask kernel to not free + * this page. */ + return 0; + + page = cl_vmpage_page(vmpage, obj); + result = page == NULL; + if (page != NULL) { + if (!cl_page_in_use(page)) { + result = 1; + cl_page_delete(env, page); + } + cl_page_put(env, page); + } + cl_env_nested_put(&nest, env); + return result; +} + +static int ll_set_page_dirty(struct page *vmpage) +{ +#if 0 + struct cl_page *page = vvp_vmpage_page_transient(vmpage); + struct vvp_object *obj = cl_inode2vvp(vmpage->mapping->host); + struct vvp_page *cpg; + + /* + * XXX should page method be called here? 
+ */ + LASSERT(&obj->co_cl == page->cp_obj); + cpg = cl2vvp_page(cl_page_at(page, &vvp_device_type)); + /* + * XXX cannot do much here, because page is possibly not locked: + * sys_munmap()->... + * ->unmap_page_range()->zap_pte_range()->set_page_dirty(). + */ + vvp_write_pending(obj, cpg); +#endif + return __set_page_dirty_nobuffers(vmpage); +} + +#define MAX_DIRECTIO_SIZE (2*1024*1024*1024UL) + +static inline int ll_get_user_pages(int rw, unsigned long user_addr, + size_t size, struct page ***pages, + int *max_pages) +{ + int result = -ENOMEM; + + /* set an arbitrary limit to prevent arithmetic overflow */ + if (size > MAX_DIRECTIO_SIZE) { + *pages = NULL; + return -EFBIG; + } + + *max_pages = (user_addr + size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + *max_pages -= user_addr >> PAGE_CACHE_SHIFT; + + OBD_ALLOC_LARGE(*pages, *max_pages * sizeof(**pages)); + if (*pages) { + result = get_user_pages_fast(user_addr, *max_pages, + (rw == READ), *pages); + if (unlikely(result <= 0)) + OBD_FREE_LARGE(*pages, *max_pages * sizeof(**pages)); + } + + return result; +} + +/* ll_free_user_pages - tear down page struct array + * @pages: array of page struct pointers underlying target buffer */ +static void ll_free_user_pages(struct page **pages, int npages, int do_dirty) +{ + int i; + + for (i = 0; i < npages; i++) { + if (do_dirty) + set_page_dirty_lock(pages[i]); + page_cache_release(pages[i]); + } + kvfree(pages); +} + +ssize_t ll_direct_rw_pages(const struct lu_env *env, struct cl_io *io, + int rw, struct inode *inode, + struct ll_dio_pages *pv) +{ + struct cl_page *clp; + struct cl_2queue *queue; + struct cl_object *obj = io->ci_obj; + int i; + ssize_t rc = 0; + loff_t file_offset = pv->ldp_start_offset; + long size = pv->ldp_size; + int page_count = pv->ldp_nr; + struct page **pages = pv->ldp_pages; + long page_size = cl_page_size(obj); + bool do_io; + int io_pages = 0; + + queue = &io->ci_queue; + cl_2queue_init(queue); + for (i = 0; i < page_count; i++) { + if (pv->ldp_offsets) + file_offset = pv->ldp_offsets[i]; + + LASSERT(!(file_offset & (page_size - 1))); + clp = cl_page_find(env, obj, cl_index(obj, file_offset), + pv->ldp_pages[i], CPT_TRANSIENT); + if (IS_ERR(clp)) { + rc = PTR_ERR(clp); + break; + } + + rc = cl_page_own(env, io, clp); + if (rc) { + LASSERT(clp->cp_state == CPS_FREEING); + cl_page_put(env, clp); + break; + } + + do_io = true; + + /* check the page type: if the page is a host page, then do + * write directly */ + if (clp->cp_type == CPT_CACHEABLE) { + struct page *vmpage = cl_page_vmpage(env, clp); + struct page *src_page; + struct page *dst_page; + void *src; + void *dst; + + src_page = (rw == WRITE) ? pages[i] : vmpage; + dst_page = (rw == WRITE) ? vmpage : pages[i]; + + src = kmap_atomic(src_page); + dst = kmap_atomic(dst_page); + memcpy(dst, src, min(page_size, size)); + kunmap_atomic(dst); + kunmap_atomic(src); + + /* make sure page will be added to the transfer by + * cl_io_submit()->...->vvp_page_prep_write(). */ + if (rw == WRITE) + set_page_dirty(vmpage); + + if (rw == READ) { + /* do not issue the page for read, since it + * may reread a ra page which has NOT uptodate + * bit set. */ + cl_page_disown(env, io, clp); + do_io = false; + } + } + + if (likely(do_io)) { + cl_2queue_add(queue, clp); + + /* + * Set page clip to tell transfer formation engine + * that page has to be sent even if it is beyond KMS. 
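ll_get_user_pages() above sizes its page array from the number of pages a user buffer touches, which depends on the buffer's offset within its first page: round the end up to a page boundary, round the start down, and take the difference. A small sketch of that calculation, with the 4 KiB page size as an assumption (PAGE_SHIFT_SKETCH stands in for PAGE_CACHE_SHIFT):

#include <stdio.h>

#define PAGE_SHIFT_SKETCH 12	/* assumed 4 KiB pages */
#define PAGE_SIZE_SKETCH  (1UL << PAGE_SHIFT_SKETCH)

/* Number of pages spanned by `size` bytes starting at user address `addr`. */
static unsigned long pages_spanned(unsigned long addr, unsigned long size)
{
	unsigned long last = (addr + size + PAGE_SIZE_SKETCH - 1) >> PAGE_SHIFT_SKETCH;

	return last - (addr >> PAGE_SHIFT_SKETCH);
}

int main(void)
{
	printf("%lu\n", pages_spanned(0x1000, 4096));	/* 1: page-aligned buffer */
	printf("%lu\n", pages_spanned(0x1800, 4096));	/* 2: misaligned buffer crosses a page */
	return 0;
}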
+ */ + cl_page_clip(env, clp, 0, min(size, page_size)); + + ++io_pages; + } + + /* drop the reference count for cl_page_find */ + cl_page_put(env, clp); + size -= page_size; + file_offset += page_size; + } + + if (rc == 0 && io_pages) { + rc = cl_io_submit_sync(env, io, + rw == READ ? CRT_READ : CRT_WRITE, + queue, 0); + } + if (rc == 0) + rc = pv->ldp_size; + + cl_2queue_discard(env, io, queue); + cl_2queue_disown(env, io, queue); + cl_2queue_fini(env, queue); + return rc; +} +EXPORT_SYMBOL(ll_direct_rw_pages); + +static ssize_t ll_direct_IO_26_seg(const struct lu_env *env, struct cl_io *io, + int rw, struct inode *inode, + struct address_space *mapping, + size_t size, loff_t file_offset, + struct page **pages, int page_count) +{ + struct ll_dio_pages pvec = { .ldp_pages = pages, + .ldp_nr = page_count, + .ldp_size = size, + .ldp_offsets = NULL, + .ldp_start_offset = file_offset + }; + + return ll_direct_rw_pages(env, io, rw, inode, &pvec); +} + +#ifdef KMALLOC_MAX_SIZE +#define MAX_MALLOC KMALLOC_MAX_SIZE +#else +#define MAX_MALLOC (128 * 1024) +#endif + +/* This is the maximum size of a single O_DIRECT request, based on the + * kmalloc limit. We need to fit all of the brw_page structs, each one + * representing PAGE_SIZE worth of user data, into a single buffer, and + * then truncate this to be a full-sized RPC. For 4kB PAGE_SIZE this is + * up to 22MB for 128kB kmalloc and up to 682MB for 4MB kmalloc. */ +#define MAX_DIO_SIZE ((MAX_MALLOC / sizeof(struct brw_page) * PAGE_CACHE_SIZE) & \ + ~(DT_MAX_BRW_SIZE - 1)) +static ssize_t ll_direct_IO_26(struct kiocb *iocb, struct iov_iter *iter, + loff_t file_offset) +{ + struct lu_env *env; + struct cl_io *io; + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct ccc_object *obj = cl_inode2ccc(inode); + ssize_t count = iov_iter_count(iter); + ssize_t tot_bytes = 0, result = 0; + struct ll_inode_info *lli = ll_i2info(inode); + long size = MAX_DIO_SIZE; + int refcheck; + + if (!lli->lli_has_smd) + return -EBADF; + + /* FIXME: io smaller than PAGE_SIZE is broken on ia64 ??? */ + if ((file_offset & ~CFS_PAGE_MASK) || (count & ~CFS_PAGE_MASK)) + return -EINVAL; + + CDEBUG(D_VFSTRACE, + "VFS Op:inode=%lu/%u(%p), size=%zd (max %lu), offset=%lld=%llx, pages %zd (max %lu)\n", + inode->i_ino, inode->i_generation, inode, count, MAX_DIO_SIZE, + file_offset, file_offset, count >> PAGE_CACHE_SHIFT, + MAX_DIO_SIZE >> PAGE_CACHE_SHIFT); + + /* Check that all user buffers are aligned as well */ + if (iov_iter_alignment(iter) & ~CFS_PAGE_MASK) + return -EINVAL; + + env = cl_env_get(&refcheck); + LASSERT(!IS_ERR(env)); + io = ccc_env_io(env)->cui_cl.cis_io; + LASSERT(io != NULL); + + /* 0. Need locking between buffered and direct access. and race with + * size changing by concurrent truncates and writes. + * 1. Need inode mutex to operate transient pages. 
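The MAX_DIO_SIZE comment above quotes concrete limits ("up to 22MB for 128kB kmalloc and up to 682MB for 4MB kmalloc"). Those numbers follow from the macro if one assumes a 24-byte struct brw_page, 4 KiB pages, and a 1 MiB RPC size to round down to; all three values below are assumptions used only to reproduce the arithmetic:

#include <stdio.h>

#define BRW_PAGE_SIZE	24UL		/* assumed sizeof(struct brw_page) */
#define PAGE_SIZE_SK	4096UL		/* assumed page size */
#define MAX_BRW_SIZE	(1UL << 20)	/* assumed RPC size to truncate to */

static unsigned long max_dio_size(unsigned long kmalloc_max)
{
	/* one brw_page descriptor per data page, all fitting in one kmalloc,
	 * then truncated to a whole number of RPCs */
	return (kmalloc_max / BRW_PAGE_SIZE * PAGE_SIZE_SK) & ~(MAX_BRW_SIZE - 1);
}

int main(void)
{
	printf("%lu MiB\n", max_dio_size(128UL << 10) >> 20);	/* 21 MiB, the "22MB" quoted above */
	printf("%lu MiB\n", max_dio_size(4UL << 20) >> 20);	/* 682 MiB */
	return 0;
}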
+ */ + if (iov_iter_rw(iter) == READ) + mutex_lock(&inode->i_mutex); + + LASSERT(obj->cob_transient_pages == 0); + while (iov_iter_count(iter)) { + struct page **pages; + size_t offs; + + count = min_t(size_t, iov_iter_count(iter), size); + if (iov_iter_rw(iter) == READ) { + if (file_offset >= i_size_read(inode)) + break; + if (file_offset + count > i_size_read(inode)) + count = i_size_read(inode) - file_offset; + } + + result = iov_iter_get_pages_alloc(iter, &pages, count, &offs); + if (likely(result > 0)) { + int n = DIV_ROUND_UP(result + offs, PAGE_SIZE); + result = ll_direct_IO_26_seg(env, io, iov_iter_rw(iter), + inode, file->f_mapping, + result, file_offset, pages, + n); + ll_free_user_pages(pages, n, iov_iter_rw(iter) == READ); + } + if (unlikely(result <= 0)) { + /* If we can't allocate a large enough buffer + * for the request, shrink it to a smaller + * PAGE_SIZE multiple and try again. + * We should always be able to kmalloc for a + * page worth of page pointers = 4MB on i386. */ + if (result == -ENOMEM && + size > (PAGE_CACHE_SIZE / sizeof(*pages)) * + PAGE_CACHE_SIZE) { + size = ((((size / 2) - 1) | + ~CFS_PAGE_MASK) + 1) & + CFS_PAGE_MASK; + CDEBUG(D_VFSTRACE, "DIO size now %lu\n", + size); + continue; + } + + goto out; + } + iov_iter_advance(iter, result); + tot_bytes += result; + file_offset += result; + } +out: + LASSERT(obj->cob_transient_pages == 0); + if (iov_iter_rw(iter) == READ) + mutex_unlock(&inode->i_mutex); + + if (tot_bytes > 0) { + if (iov_iter_rw(iter) == WRITE) { + struct lov_stripe_md *lsm; + + lsm = ccc_inode_lsm_get(inode); + LASSERT(lsm != NULL); + lov_stripe_lock(lsm); + obd_adjust_kms(ll_i2dtexp(inode), lsm, file_offset, 0); + lov_stripe_unlock(lsm); + ccc_inode_lsm_put(inode, lsm); + } + } + + cl_env_put(env, &refcheck); + return tot_bytes ? 
: result; +} + +static int ll_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + struct page *page; + int rc; + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + return -ENOMEM; + + *pagep = page; + + rc = ll_prepare_write(file, page, from, from + len); + if (rc) { + unlock_page(page); + page_cache_release(page); + } + return rc; +} + +static int ll_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + int rc; + + rc = ll_commit_write(file, page, from, from + copied); + unlock_page(page); + page_cache_release(page); + + return rc ?: copied; +} + +#ifdef CONFIG_MIGRATION +static int ll_migratepage(struct address_space *mapping, + struct page *newpage, struct page *page, + enum migrate_mode mode + ) +{ + /* Always fail page migration until we have a proper implementation */ + return -EIO; +} +#endif + +#ifndef MS_HAS_NEW_AOPS +const struct address_space_operations ll_aops = { + .readpage = ll_readpage, + .direct_IO = ll_direct_IO_26, + .writepage = ll_writepage, + .writepages = ll_writepages, + .set_page_dirty = ll_set_page_dirty, + .write_begin = ll_write_begin, + .write_end = ll_write_end, + .invalidatepage = ll_invalidatepage, + .releasepage = (void *)ll_releasepage, +#ifdef CONFIG_MIGRATION + .migratepage = ll_migratepage, +#endif +}; +#else +const struct address_space_operations_ext ll_aops = { + .orig_aops.readpage = ll_readpage, +/* .orig_aops.readpages = ll_readpages, */ + .orig_aops.direct_IO = ll_direct_IO_26, + .orig_aops.writepage = ll_writepage, + .orig_aops.writepages = ll_writepages, + .orig_aops.set_page_dirty = ll_set_page_dirty, + .orig_aops.prepare_write = ll_prepare_write, + .orig_aops.commit_write = ll_commit_write, + .orig_aops.invalidatepage = ll_invalidatepage, + .orig_aops.releasepage = ll_releasepage, +#ifdef CONFIG_MIGRATION + .orig_aops.migratepage = ll_migratepage, +#endif + .write_begin = ll_write_begin, + .write_end = ll_write_end +}; +#endif diff --git a/kernel/drivers/staging/lustre/lustre/llite/statahead.c b/kernel/drivers/staging/lustre/lustre/llite/statahead.c new file mode 100644 index 000000000..7f8071242 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/statahead.c @@ -0,0 +1,1729 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. 
+ * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#include +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_LLITE + +#include "../include/obd_support.h" +#include "../include/lustre_lite.h" +#include "../include/lustre_dlm.h" +#include "llite_internal.h" + +#define SA_OMITTED_ENTRY_MAX 8ULL + +typedef enum { + /** negative values are for error cases */ + SA_ENTRY_INIT = 0, /** init entry */ + SA_ENTRY_SUCC = 1, /** stat succeed */ + SA_ENTRY_INVA = 2, /** invalid entry */ + SA_ENTRY_DEST = 3, /** entry to be destroyed */ +} se_stat_t; + +struct ll_sa_entry { + /* link into sai->sai_entries */ + struct list_head se_link; + /* link into sai->sai_entries_{received,stated} */ + struct list_head se_list; + /* link into sai hash table locally */ + struct list_head se_hash; + /* entry reference count */ + atomic_t se_refcount; + /* entry index in the sai */ + __u64 se_index; + /* low layer ldlm lock handle */ + __u64 se_handle; + /* entry status */ + se_stat_t se_stat; + /* entry size, contains name */ + int se_size; + /* pointer to async getattr enqueue info */ + struct md_enqueue_info *se_minfo; + /* pointer to the async getattr request */ + struct ptlrpc_request *se_req; + /* pointer to the target inode */ + struct inode *se_inode; + /* entry name */ + struct qstr se_qstr; +}; + +static unsigned int sai_generation; +static DEFINE_SPINLOCK(sai_generation_lock); + +static inline int ll_sa_entry_unhashed(struct ll_sa_entry *entry) +{ + return list_empty(&entry->se_hash); +} + +/* + * The entry only can be released by the caller, it is necessary to hold lock. + */ +static inline int ll_sa_entry_stated(struct ll_sa_entry *entry) +{ + smp_rmb(); + return (entry->se_stat != SA_ENTRY_INIT); +} + +static inline int ll_sa_entry_hash(int val) +{ + return val & LL_SA_CACHE_MASK; +} + +/* + * Insert entry to hash SA table. + */ +static inline void +ll_sa_entry_enhash(struct ll_statahead_info *sai, struct ll_sa_entry *entry) +{ + int i = ll_sa_entry_hash(entry->se_qstr.hash); + + spin_lock(&sai->sai_cache_lock[i]); + list_add_tail(&entry->se_hash, &sai->sai_cache[i]); + spin_unlock(&sai->sai_cache_lock[i]); +} + +/* + * Remove entry from SA table. 
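The statahead entries defined above live both on a global list and in a small hash table whose bucket is just the low bits of the name hash (ll_sa_entry_hash() masks with LL_SA_CACHE_MASK). A toy sketch of that bucketing scheme; the bucket count and the hash function are stand-ins, not the kernel's full_name_hash():

#include <stdio.h>
#include <string.h>

#define CACHE_SIZE 32			/* stand-in bucket count; power of two */
#define CACHE_MASK (CACHE_SIZE - 1)

/* A trivial string hash standing in for full_name_hash(). */
static unsigned int name_hash(const char *name, size_t len)
{
	unsigned int h = 0;

	while (len--)
		h = h * 31 + (unsigned char)*name++;
	return h;
}

static int bucket_of(const char *name)
{
	return name_hash(name, strlen(name)) & CACHE_MASK;
}

int main(void)
{
	/* Entries hash into one of CACHE_SIZE buckets; a name lookup only
	 * scans the short list in its own bucket. */
	printf("%d %d\n", bucket_of("fileA"), bucket_of("fileB"));
	return 0;
}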
+ */ +static inline void +ll_sa_entry_unhash(struct ll_statahead_info *sai, struct ll_sa_entry *entry) +{ + int i = ll_sa_entry_hash(entry->se_qstr.hash); + + spin_lock(&sai->sai_cache_lock[i]); + list_del_init(&entry->se_hash); + spin_unlock(&sai->sai_cache_lock[i]); +} + +static inline int agl_should_run(struct ll_statahead_info *sai, + struct inode *inode) +{ + return (inode != NULL && S_ISREG(inode->i_mode) && sai->sai_agl_valid); +} + +static inline struct ll_sa_entry * +sa_first_received_entry(struct ll_statahead_info *sai) +{ + return list_entry(sai->sai_entries_received.next, + struct ll_sa_entry, se_list); +} + +static inline struct ll_inode_info * +agl_first_entry(struct ll_statahead_info *sai) +{ + return list_entry(sai->sai_entries_agl.next, + struct ll_inode_info, lli_agl_list); +} + +static inline int sa_sent_full(struct ll_statahead_info *sai) +{ + return atomic_read(&sai->sai_cache_count) >= sai->sai_max; +} + +static inline int sa_received_empty(struct ll_statahead_info *sai) +{ + return list_empty(&sai->sai_entries_received); +} + +static inline int agl_list_empty(struct ll_statahead_info *sai) +{ + return list_empty(&sai->sai_entries_agl); +} + +/** + * (1) hit ratio less than 80% + * or + * (2) consecutive miss more than 8 + * then means low hit. + */ +static inline int sa_low_hit(struct ll_statahead_info *sai) +{ + return ((sai->sai_hit > 7 && sai->sai_hit < 4 * sai->sai_miss) || + (sai->sai_consecutive_miss > 8)); +} + +/* + * If the given index is behind of statahead window more than + * SA_OMITTED_ENTRY_MAX, then it is old. + */ +static inline int is_omitted_entry(struct ll_statahead_info *sai, __u64 index) +{ + return ((__u64)sai->sai_max + index + SA_OMITTED_ENTRY_MAX < + sai->sai_index); +} + +/* + * Insert it into sai_entries tail when init. + */ +static struct ll_sa_entry * +ll_sa_entry_alloc(struct ll_statahead_info *sai, __u64 index, + const char *name, int len) +{ + struct ll_inode_info *lli; + struct ll_sa_entry *entry; + int entry_size; + char *dname; + + entry_size = sizeof(struct ll_sa_entry) + (len & ~3) + 4; + entry = kzalloc(entry_size, GFP_NOFS); + if (unlikely(!entry)) + return ERR_PTR(-ENOMEM); + + CDEBUG(D_READA, "alloc sa entry %.*s(%p) index %llu\n", + len, name, entry, index); + + entry->se_index = index; + + /* + * Statahead entry reference rules: + * + * 1) When statahead entry is initialized, its reference is set as 2. + * One reference is used by the directory scanner. When the scanner + * searches the statahead cache for the given name, it can perform + * lockless hash lookup (only the scanner can remove entry from hash + * list), and once found, it needn't to call "atomic_inc()" for the + * entry reference. So the performance is improved. After using the + * statahead entry, the scanner will call "atomic_dec()" to drop the + * reference held when initialization. If it is the last reference, + * the statahead entry will be freed. + * + * 2) All other threads, including statahead thread and ptlrpcd thread, + * when they process the statahead entry, the reference for target + * should be held to guarantee the entry will not be released by the + * directory scanner. After processing the entry, these threads will + * drop the entry reference. If it is the last reference, the entry + * will be freed. + * + * The second reference when initializes the statahead entry is used + * by the statahead thread, following the rule 2). 
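sa_low_hit() above encodes "hit ratio below 80%" as hit < 4 * miss, since hit / (hit + miss) < 0.8 is algebraically the same as hit < 4 * miss. A tiny sketch of the same predicate with a few worked cases:

#include <stdio.h>

/* Low hit rate: ratio under 80% (once there are enough samples),
 * or more than 8 consecutive misses. */
static int low_hit(unsigned long long hit, unsigned long long miss,
		   unsigned long long consecutive_miss)
{
	return (hit > 7 && hit < 4 * miss) || consecutive_miss > 8;
}

int main(void)
{
	printf("%d\n", low_hit(79, 21, 0));	/* 1: 79% hit rate is below 80% */
	printf("%d\n", low_hit(80, 20, 0));	/* 0: exactly 80% is not "low" */
	printf("%d\n", low_hit(100, 1, 9));	/* 1: too many consecutive misses */
	return 0;
}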
+ */ + atomic_set(&entry->se_refcount, 2); + entry->se_stat = SA_ENTRY_INIT; + entry->se_size = entry_size; + dname = (char *)entry + sizeof(struct ll_sa_entry); + memcpy(dname, name, len); + dname[len] = 0; + entry->se_qstr.hash = full_name_hash(name, len); + entry->se_qstr.len = len; + entry->se_qstr.name = dname; + + lli = ll_i2info(sai->sai_inode); + spin_lock(&lli->lli_sa_lock); + list_add_tail(&entry->se_link, &sai->sai_entries); + INIT_LIST_HEAD(&entry->se_list); + ll_sa_entry_enhash(sai, entry); + spin_unlock(&lli->lli_sa_lock); + + atomic_inc(&sai->sai_cache_count); + + return entry; +} + +/* + * Used by the directory scanner to search entry with name. + * + * Only the caller can remove the entry from hash, so it is unnecessary to hold + * hash lock. It is caller's duty to release the init refcount on the entry, so + * it is also unnecessary to increase refcount on the entry. + */ +static struct ll_sa_entry * +ll_sa_entry_get_byname(struct ll_statahead_info *sai, const struct qstr *qstr) +{ + struct ll_sa_entry *entry; + int i = ll_sa_entry_hash(qstr->hash); + + list_for_each_entry(entry, &sai->sai_cache[i], se_hash) { + if (entry->se_qstr.hash == qstr->hash && + entry->se_qstr.len == qstr->len && + memcmp(entry->se_qstr.name, qstr->name, qstr->len) == 0) + return entry; + } + return NULL; +} + +/* + * Used by the async getattr request callback to find entry with index. + * + * Inside lli_sa_lock to prevent others to change the list during the search. + * It needs to increase entry refcount before returning to guarantee that the + * entry cannot be freed by others. + */ +static struct ll_sa_entry * +ll_sa_entry_get_byindex(struct ll_statahead_info *sai, __u64 index) +{ + struct ll_sa_entry *entry; + + list_for_each_entry(entry, &sai->sai_entries, se_link) { + if (entry->se_index == index) { + LASSERT(atomic_read(&entry->se_refcount) > 0); + atomic_inc(&entry->se_refcount); + return entry; + } + if (entry->se_index > index) + break; + } + return NULL; +} + +static void ll_sa_entry_cleanup(struct ll_statahead_info *sai, + struct ll_sa_entry *entry) +{ + struct md_enqueue_info *minfo = entry->se_minfo; + struct ptlrpc_request *req = entry->se_req; + + if (minfo) { + entry->se_minfo = NULL; + ll_intent_release(&minfo->mi_it); + iput(minfo->mi_dir); + OBD_FREE_PTR(minfo); + } + + if (req) { + entry->se_req = NULL; + ptlrpc_req_finished(req); + } +} + +static void ll_sa_entry_put(struct ll_statahead_info *sai, + struct ll_sa_entry *entry) +{ + if (atomic_dec_and_test(&entry->se_refcount)) { + CDEBUG(D_READA, "free sa entry %.*s(%p) index %llu\n", + entry->se_qstr.len, entry->se_qstr.name, entry, + entry->se_index); + + LASSERT(list_empty(&entry->se_link)); + LASSERT(list_empty(&entry->se_list)); + LASSERT(ll_sa_entry_unhashed(entry)); + + ll_sa_entry_cleanup(sai, entry); + iput(entry->se_inode); + + OBD_FREE(entry, entry->se_size); + atomic_dec(&sai->sai_cache_count); + } +} + +static inline void +do_sa_entry_fini(struct ll_statahead_info *sai, struct ll_sa_entry *entry) +{ + struct ll_inode_info *lli = ll_i2info(sai->sai_inode); + + LASSERT(!ll_sa_entry_unhashed(entry)); + LASSERT(!list_empty(&entry->se_link)); + + ll_sa_entry_unhash(sai, entry); + + spin_lock(&lli->lli_sa_lock); + entry->se_stat = SA_ENTRY_DEST; + list_del_init(&entry->se_link); + if (likely(!list_empty(&entry->se_list))) + list_del_init(&entry->se_list); + spin_unlock(&lli->lli_sa_lock); + + ll_sa_entry_put(sai, entry); +} + +/* + * Delete it from sai_entries_stated list when fini. 
+ */ +static void +ll_sa_entry_fini(struct ll_statahead_info *sai, struct ll_sa_entry *entry) +{ + struct ll_sa_entry *pos, *next; + + if (entry) + do_sa_entry_fini(sai, entry); + + /* drop old entry, only 'scanner' process does this, no need to lock */ + list_for_each_entry_safe(pos, next, &sai->sai_entries, se_link) { + if (!is_omitted_entry(sai, pos->se_index)) + break; + do_sa_entry_fini(sai, pos); + } +} + +/* + * Inside lli_sa_lock. + */ +static void +do_sa_entry_to_stated(struct ll_statahead_info *sai, + struct ll_sa_entry *entry, se_stat_t stat) +{ + struct ll_sa_entry *se; + struct list_head *pos = &sai->sai_entries_stated; + + if (!list_empty(&entry->se_list)) + list_del_init(&entry->se_list); + + list_for_each_entry_reverse(se, &sai->sai_entries_stated, se_list) { + if (se->se_index < entry->se_index) { + pos = &se->se_list; + break; + } + } + + list_add(&entry->se_list, pos); + entry->se_stat = stat; +} + +/* + * Move entry to sai_entries_stated and sort with the index. + * \retval 1 -- entry to be destroyed. + * \retval 0 -- entry is inserted into stated list. + */ +static int +ll_sa_entry_to_stated(struct ll_statahead_info *sai, + struct ll_sa_entry *entry, se_stat_t stat) +{ + struct ll_inode_info *lli = ll_i2info(sai->sai_inode); + int ret = 1; + + ll_sa_entry_cleanup(sai, entry); + + spin_lock(&lli->lli_sa_lock); + if (likely(entry->se_stat != SA_ENTRY_DEST)) { + do_sa_entry_to_stated(sai, entry, stat); + ret = 0; + } + spin_unlock(&lli->lli_sa_lock); + + return ret; +} + +/* + * Insert inode into the list of sai_entries_agl. + */ +static void ll_agl_add(struct ll_statahead_info *sai, + struct inode *inode, int index) +{ + struct ll_inode_info *child = ll_i2info(inode); + struct ll_inode_info *parent = ll_i2info(sai->sai_inode); + int added = 0; + + spin_lock(&child->lli_agl_lock); + if (child->lli_agl_index == 0) { + child->lli_agl_index = index; + spin_unlock(&child->lli_agl_lock); + + LASSERT(list_empty(&child->lli_agl_list)); + + igrab(inode); + spin_lock(&parent->lli_agl_lock); + if (agl_list_empty(sai)) + added = 1; + list_add_tail(&child->lli_agl_list, &sai->sai_entries_agl); + spin_unlock(&parent->lli_agl_lock); + } else { + spin_unlock(&child->lli_agl_lock); + } + + if (added > 0) + wake_up(&sai->sai_agl_thread.t_ctl_waitq); +} + +static struct ll_statahead_info *ll_sai_alloc(void) +{ + struct ll_statahead_info *sai; + int i; + + sai = kzalloc(sizeof(*sai), GFP_NOFS); + if (!sai) + return NULL; + + atomic_set(&sai->sai_refcount, 1); + + spin_lock(&sai_generation_lock); + sai->sai_generation = ++sai_generation; + if (unlikely(sai_generation == 0)) + sai->sai_generation = ++sai_generation; + spin_unlock(&sai_generation_lock); + + sai->sai_max = LL_SA_RPC_MIN; + sai->sai_index = 1; + init_waitqueue_head(&sai->sai_waitq); + init_waitqueue_head(&sai->sai_thread.t_ctl_waitq); + init_waitqueue_head(&sai->sai_agl_thread.t_ctl_waitq); + + INIT_LIST_HEAD(&sai->sai_entries); + INIT_LIST_HEAD(&sai->sai_entries_received); + INIT_LIST_HEAD(&sai->sai_entries_stated); + INIT_LIST_HEAD(&sai->sai_entries_agl); + + for (i = 0; i < LL_SA_CACHE_SIZE; i++) { + INIT_LIST_HEAD(&sai->sai_cache[i]); + spin_lock_init(&sai->sai_cache_lock[i]); + } + atomic_set(&sai->sai_cache_count, 0); + + return sai; +} + +static inline struct ll_statahead_info * +ll_sai_get(struct ll_statahead_info *sai) +{ + atomic_inc(&sai->sai_refcount); + return sai; +} + +static void ll_sai_put(struct ll_statahead_info *sai) +{ + struct inode *inode = sai->sai_inode; + struct ll_inode_info *lli = ll_i2info(inode); + 
+ if (atomic_dec_and_lock(&sai->sai_refcount, &lli->lli_sa_lock)) { + struct ll_sa_entry *entry, *next; + + if (unlikely(atomic_read(&sai->sai_refcount) > 0)) { + /* It is race case, the interpret callback just hold + * a reference count */ + spin_unlock(&lli->lli_sa_lock); + return; + } + + LASSERT(lli->lli_opendir_key == NULL); + LASSERT(thread_is_stopped(&sai->sai_thread)); + LASSERT(thread_is_stopped(&sai->sai_agl_thread)); + + lli->lli_sai = NULL; + lli->lli_opendir_pid = 0; + spin_unlock(&lli->lli_sa_lock); + + if (sai->sai_sent > sai->sai_replied) + CDEBUG(D_READA, "statahead for dir "DFID + " does not finish: [sent:%llu] [replied:%llu]\n", + PFID(&lli->lli_fid), + sai->sai_sent, sai->sai_replied); + + list_for_each_entry_safe(entry, next, + &sai->sai_entries, se_link) + do_sa_entry_fini(sai, entry); + + LASSERT(list_empty(&sai->sai_entries)); + LASSERT(sa_received_empty(sai)); + LASSERT(list_empty(&sai->sai_entries_stated)); + + LASSERT(atomic_read(&sai->sai_cache_count) == 0); + LASSERT(agl_list_empty(sai)); + + iput(inode); + OBD_FREE_PTR(sai); + } +} + +/* Do NOT forget to drop inode refcount when into sai_entries_agl. */ +static void ll_agl_trigger(struct inode *inode, struct ll_statahead_info *sai) +{ + struct ll_inode_info *lli = ll_i2info(inode); + __u64 index = lli->lli_agl_index; + int rc; + + LASSERT(list_empty(&lli->lli_agl_list)); + + /* AGL maybe fall behind statahead with one entry */ + if (is_omitted_entry(sai, index + 1)) { + lli->lli_agl_index = 0; + iput(inode); + return; + } + + /* Someone is in glimpse (sync or async), do nothing. */ + rc = down_write_trylock(&lli->lli_glimpse_sem); + if (rc == 0) { + lli->lli_agl_index = 0; + iput(inode); + return; + } + + /* + * Someone triggered glimpse within 1 sec before. + * 1) The former glimpse succeeded with glimpse lock granted by OST, and + * if the lock is still cached on client, AGL needs to do nothing. If + * it is cancelled by other client, AGL maybe cannot obtain new lock + * for no glimpse callback triggered by AGL. + * 2) The former glimpse succeeded, but OST did not grant glimpse lock. + * Under such case, it is quite possible that the OST will not grant + * glimpse lock for AGL also. + * 3) The former glimpse failed, compared with other two cases, it is + * relative rare. AGL can ignore such case, and it will not muchly + * affect the performance. 
+ */ + if (lli->lli_glimpse_time != 0 && + time_before(cfs_time_shift(-1), lli->lli_glimpse_time)) { + up_write(&lli->lli_glimpse_sem); + lli->lli_agl_index = 0; + iput(inode); + return; + } + + CDEBUG(D_READA, "Handling (init) async glimpse: inode = " + DFID", idx = %llu\n", PFID(&lli->lli_fid), index); + + cl_agl(inode); + lli->lli_agl_index = 0; + lli->lli_glimpse_time = cfs_time_current(); + up_write(&lli->lli_glimpse_sem); + + CDEBUG(D_READA, "Handled (init) async glimpse: inode= " + DFID", idx = %llu, rc = %d\n", + PFID(&lli->lli_fid), index, rc); + + iput(inode); +} + +static void ll_post_statahead(struct ll_statahead_info *sai) +{ + struct inode *dir = sai->sai_inode; + struct inode *child; + struct ll_inode_info *lli = ll_i2info(dir); + struct ll_sa_entry *entry; + struct md_enqueue_info *minfo; + struct lookup_intent *it; + struct ptlrpc_request *req; + struct mdt_body *body; + int rc = 0; + + spin_lock(&lli->lli_sa_lock); + if (unlikely(sa_received_empty(sai))) { + spin_unlock(&lli->lli_sa_lock); + return; + } + entry = sa_first_received_entry(sai); + atomic_inc(&entry->se_refcount); + list_del_init(&entry->se_list); + spin_unlock(&lli->lli_sa_lock); + + LASSERT(entry->se_handle != 0); + + minfo = entry->se_minfo; + it = &minfo->mi_it; + req = entry->se_req; + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (body == NULL) { + rc = -EFAULT; + goto out; + } + + child = entry->se_inode; + if (child == NULL) { + /* + * lookup. + */ + LASSERT(fid_is_zero(&minfo->mi_data.op_fid2)); + + /* XXX: No fid in reply, this is probably cross-ref case. + * SA can't handle it yet. */ + if (body->valid & OBD_MD_MDS) { + rc = -EAGAIN; + goto out; + } + } else { + /* + * revalidate. + */ + /* unlinked and re-created with the same name */ + if (unlikely(!lu_fid_eq(&minfo->mi_data.op_fid2, &body->fid1))){ + entry->se_inode = NULL; + iput(child); + child = NULL; + } + } + + it->d.lustre.it_lock_handle = entry->se_handle; + rc = md_revalidate_lock(ll_i2mdexp(dir), it, ll_inode2fid(dir), NULL); + if (rc != 1) { + rc = -EAGAIN; + goto out; + } + + rc = ll_prep_inode(&child, req, dir->i_sb, it); + if (rc) + goto out; + + CDEBUG(D_DLMTRACE, "setting l_data to inode %p (%lu/%u)\n", + child, child->i_ino, child->i_generation); + ll_set_lock_data(ll_i2sbi(dir)->ll_md_exp, child, it, NULL); + + entry->se_inode = child; + + if (agl_should_run(sai, child)) + ll_agl_add(sai, child, entry->se_index); + +out: + /* The "ll_sa_entry_to_stated()" will drop related ldlm ibits lock + * reference count by calling "ll_intent_drop_lock()" in spite of the + * above operations failed or not. Do not worry about calling + * "ll_intent_drop_lock()" more than once. */ + rc = ll_sa_entry_to_stated(sai, entry, + rc < 0 ? SA_ENTRY_INVA : SA_ENTRY_SUCC); + if (rc == 0 && entry->se_index == sai->sai_index_wait) + wake_up(&sai->sai_waitq); + ll_sa_entry_put(sai, entry); +} + +static int ll_statahead_interpret(struct ptlrpc_request *req, + struct md_enqueue_info *minfo, int rc) +{ + struct lookup_intent *it = &minfo->mi_it; + struct inode *dir = minfo->mi_dir; + struct ll_inode_info *lli = ll_i2info(dir); + struct ll_statahead_info *sai = NULL; + struct ll_sa_entry *entry; + __u64 handle = 0; + int wakeup; + + if (it_disposition(it, DISP_LOOKUP_NEG)) + rc = -ENOENT; + + if (rc == 0) { + /* release ibits lock ASAP to avoid deadlock when statahead + * thread enqueues lock on parent in readdir and another + * process enqueues lock on child with parent lock held, eg. + * unlink. 
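ll_agl_trigger() above skips the async glimpse when another glimpse already ran within the last second, recording the time of the last one in lli_glimpse_time. A userspace sketch of that rate-limit pattern, using wall-clock seconds rather than the kernel's jiffies helpers (cfs_time_shift()/time_before()):

#include <stdio.h>
#include <time.h>

static time_t last_glimpse;	/* 0 means "never glimpsed" */

/* Return 1 and record the time if a glimpse should run now,
 * 0 if one already ran within the last second. */
static int glimpse_allowed(time_t now)
{
	if (last_glimpse != 0 && now - last_glimpse < 1)
		return 0;
	last_glimpse = now;
	return 1;
}

int main(void)
{
	time_t now = time(NULL);

	printf("%d\n", glimpse_allowed(now));		/* 1: first glimpse runs */
	printf("%d\n", glimpse_allowed(now));		/* 0: suppressed, within 1 second */
	printf("%d\n", glimpse_allowed(now + 2));	/* 1: old enough to glimpse again */
	return 0;
}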
*/ + handle = it->d.lustre.it_lock_handle; + ll_intent_drop_lock(it); + } + + spin_lock(&lli->lli_sa_lock); + /* stale entry */ + if (unlikely(lli->lli_sai == NULL || + lli->lli_sai->sai_generation != minfo->mi_generation)) { + spin_unlock(&lli->lli_sa_lock); + rc = -ESTALE; + goto out; + } else { + sai = ll_sai_get(lli->lli_sai); + if (unlikely(!thread_is_running(&sai->sai_thread))) { + sai->sai_replied++; + spin_unlock(&lli->lli_sa_lock); + rc = -EBADFD; + goto out; + } + + entry = ll_sa_entry_get_byindex(sai, minfo->mi_cbdata); + if (entry == NULL) { + sai->sai_replied++; + spin_unlock(&lli->lli_sa_lock); + rc = -EIDRM; + goto out; + } + + if (rc != 0) { + do_sa_entry_to_stated(sai, entry, SA_ENTRY_INVA); + wakeup = (entry->se_index == sai->sai_index_wait); + } else { + entry->se_minfo = minfo; + entry->se_req = ptlrpc_request_addref(req); + /* Release the async ibits lock ASAP to avoid deadlock + * when statahead thread tries to enqueue lock on parent + * for readpage and other tries to enqueue lock on child + * with parent's lock held, for example: unlink. */ + entry->se_handle = handle; + wakeup = sa_received_empty(sai); + list_add_tail(&entry->se_list, + &sai->sai_entries_received); + } + sai->sai_replied++; + spin_unlock(&lli->lli_sa_lock); + + ll_sa_entry_put(sai, entry); + if (wakeup) + wake_up(&sai->sai_thread.t_ctl_waitq); + } + +out: + if (rc != 0) { + ll_intent_release(it); + iput(dir); + OBD_FREE_PTR(minfo); + } + if (sai != NULL) + ll_sai_put(sai); + return rc; +} + +static void sa_args_fini(struct md_enqueue_info *minfo, + struct ldlm_enqueue_info *einfo) +{ + LASSERT(minfo && einfo); + iput(minfo->mi_dir); + capa_put(minfo->mi_data.op_capa1); + capa_put(minfo->mi_data.op_capa2); + OBD_FREE_PTR(minfo); + OBD_FREE_PTR(einfo); +} + +/** + * There is race condition between "capa_put" and "ll_statahead_interpret" for + * accessing "op_data.op_capa[1,2]" as following: + * "capa_put" releases "op_data.op_capa[1,2]"'s reference count after calling + * "md_intent_getattr_async". But "ll_statahead_interpret" maybe run first, and + * fill "op_data.op_capa[1,2]" as POISON, then cause "capa_put" access invalid + * "ocapa". So here reserve "op_data.op_capa[1,2]" in "pcapa" before calling + * "md_intent_getattr_async". 
+ */ +static int sa_args_init(struct inode *dir, struct inode *child, + struct ll_sa_entry *entry, struct md_enqueue_info **pmi, + struct ldlm_enqueue_info **pei, + struct obd_capa **pcapa) +{ + struct qstr *qstr = &entry->se_qstr; + struct ll_inode_info *lli = ll_i2info(dir); + struct md_enqueue_info *minfo; + struct ldlm_enqueue_info *einfo; + struct md_op_data *op_data; + + einfo = kzalloc(sizeof(*einfo), GFP_NOFS); + if (!einfo) + return -ENOMEM; + + minfo = kzalloc(sizeof(*minfo), GFP_NOFS); + if (!minfo) { + OBD_FREE_PTR(einfo); + return -ENOMEM; + } + + op_data = ll_prep_md_op_data(&minfo->mi_data, dir, child, qstr->name, + qstr->len, 0, LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) { + OBD_FREE_PTR(einfo); + OBD_FREE_PTR(minfo); + return PTR_ERR(op_data); + } + + minfo->mi_it.it_op = IT_GETATTR; + minfo->mi_dir = igrab(dir); + minfo->mi_cb = ll_statahead_interpret; + minfo->mi_generation = lli->lli_sai->sai_generation; + minfo->mi_cbdata = entry->se_index; + + einfo->ei_type = LDLM_IBITS; + einfo->ei_mode = it_to_lock_mode(&minfo->mi_it); + einfo->ei_cb_bl = ll_md_blocking_ast; + einfo->ei_cb_cp = ldlm_completion_ast; + einfo->ei_cb_gl = NULL; + einfo->ei_cbdata = NULL; + + *pmi = minfo; + *pei = einfo; + pcapa[0] = op_data->op_capa1; + pcapa[1] = op_data->op_capa2; + + return 0; +} + +static int do_sa_lookup(struct inode *dir, struct ll_sa_entry *entry) +{ + struct md_enqueue_info *minfo; + struct ldlm_enqueue_info *einfo; + struct obd_capa *capas[2]; + int rc; + + rc = sa_args_init(dir, NULL, entry, &minfo, &einfo, capas); + if (rc) + return rc; + + rc = md_intent_getattr_async(ll_i2mdexp(dir), minfo, einfo); + if (!rc) { + capa_put(capas[0]); + capa_put(capas[1]); + } else { + sa_args_fini(minfo, einfo); + } + + return rc; +} + +/** + * similar to ll_revalidate_it(). 
+ * \retval 1 -- dentry valid + * \retval 0 -- will send stat-ahead request + * \retval others -- prepare stat-ahead request failed + */ +static int do_sa_revalidate(struct inode *dir, struct ll_sa_entry *entry, + struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + struct lookup_intent it = { .it_op = IT_GETATTR, + .d.lustre.it_lock_handle = 0 }; + struct md_enqueue_info *minfo; + struct ldlm_enqueue_info *einfo; + struct obd_capa *capas[2]; + int rc; + + if (unlikely(inode == NULL)) + return 1; + + if (d_mountpoint(dentry)) + return 1; + + entry->se_inode = igrab(inode); + rc = md_revalidate_lock(ll_i2mdexp(dir), &it, ll_inode2fid(inode), + NULL); + if (rc == 1) { + entry->se_handle = it.d.lustre.it_lock_handle; + ll_intent_release(&it); + return 1; + } + + rc = sa_args_init(dir, inode, entry, &minfo, &einfo, capas); + if (rc) { + entry->se_inode = NULL; + iput(inode); + return rc; + } + + rc = md_intent_getattr_async(ll_i2mdexp(dir), minfo, einfo); + if (!rc) { + capa_put(capas[0]); + capa_put(capas[1]); + } else { + entry->se_inode = NULL; + iput(inode); + sa_args_fini(minfo, einfo); + } + + return rc; +} + +static void ll_statahead_one(struct dentry *parent, const char *entry_name, + int entry_name_len) +{ + struct inode *dir = d_inode(parent); + struct ll_inode_info *lli = ll_i2info(dir); + struct ll_statahead_info *sai = lli->lli_sai; + struct dentry *dentry = NULL; + struct ll_sa_entry *entry; + int rc; + int rc1; + + entry = ll_sa_entry_alloc(sai, sai->sai_index, entry_name, + entry_name_len); + if (IS_ERR(entry)) + return; + + dentry = d_lookup(parent, &entry->se_qstr); + if (!dentry) { + rc = do_sa_lookup(dir, entry); + } else { + rc = do_sa_revalidate(dir, entry, dentry); + if (rc == 1 && agl_should_run(sai, d_inode(dentry))) + ll_agl_add(sai, d_inode(dentry), entry->se_index); + } + + if (dentry != NULL) + dput(dentry); + + if (rc) { + rc1 = ll_sa_entry_to_stated(sai, entry, + rc < 0 ? SA_ENTRY_INVA : SA_ENTRY_SUCC); + if (rc1 == 0 && entry->se_index == sai->sai_index_wait) + wake_up(&sai->sai_waitq); + } else { + sai->sai_sent++; + } + + sai->sai_index++; + /* drop one refcount on entry by ll_sa_entry_alloc */ + ll_sa_entry_put(sai, entry); +} + +static int ll_agl_thread(void *arg) +{ + struct dentry *parent = (struct dentry *)arg; + struct inode *dir = d_inode(parent); + struct ll_inode_info *plli = ll_i2info(dir); + struct ll_inode_info *clli; + struct ll_sb_info *sbi = ll_i2sbi(dir); + struct ll_statahead_info *sai = ll_sai_get(plli->lli_sai); + struct ptlrpc_thread *thread = &sai->sai_agl_thread; + struct l_wait_info lwi = { 0 }; + + thread->t_pid = current_pid(); + CDEBUG(D_READA, "agl thread started: sai %p, parent %pd\n", + sai, parent); + + atomic_inc(&sbi->ll_agl_total); + spin_lock(&plli->lli_agl_lock); + sai->sai_agl_valid = 1; + if (thread_is_init(thread)) + /* If someone else has changed the thread state + * (e.g. already changed to SVC_STOPPING), we can't just + * blindly overwrite that setting. */ + thread_set_flags(thread, SVC_RUNNING); + spin_unlock(&plli->lli_agl_lock); + wake_up(&thread->t_ctl_waitq); + + while (1) { + l_wait_event(thread->t_ctl_waitq, + !agl_list_empty(sai) || + !thread_is_running(thread), + &lwi); + + if (!thread_is_running(thread)) + break; + + spin_lock(&plli->lli_agl_lock); + /* The statahead thread maybe help to process AGL entries, + * so check whether list empty again. 
*/ + if (!agl_list_empty(sai)) { + clli = agl_first_entry(sai); + list_del_init(&clli->lli_agl_list); + spin_unlock(&plli->lli_agl_lock); + ll_agl_trigger(&clli->lli_vfs_inode, sai); + } else { + spin_unlock(&plli->lli_agl_lock); + } + } + + spin_lock(&plli->lli_agl_lock); + sai->sai_agl_valid = 0; + while (!agl_list_empty(sai)) { + clli = agl_first_entry(sai); + list_del_init(&clli->lli_agl_list); + spin_unlock(&plli->lli_agl_lock); + clli->lli_agl_index = 0; + iput(&clli->lli_vfs_inode); + spin_lock(&plli->lli_agl_lock); + } + thread_set_flags(thread, SVC_STOPPED); + spin_unlock(&plli->lli_agl_lock); + wake_up(&thread->t_ctl_waitq); + ll_sai_put(sai); + CDEBUG(D_READA, "agl thread stopped: sai %p, parent %pd\n", + sai, parent); + return 0; +} + +static void ll_start_agl(struct dentry *parent, struct ll_statahead_info *sai) +{ + struct ptlrpc_thread *thread = &sai->sai_agl_thread; + struct l_wait_info lwi = { 0 }; + struct ll_inode_info *plli; + struct task_struct *task; + + CDEBUG(D_READA, "start agl thread: sai %p, parent %pd\n", + sai, parent); + + plli = ll_i2info(d_inode(parent)); + task = kthread_run(ll_agl_thread, parent, + "ll_agl_%u", plli->lli_opendir_pid); + if (IS_ERR(task)) { + CERROR("can't start ll_agl thread, rc: %ld\n", PTR_ERR(task)); + thread_set_flags(thread, SVC_STOPPED); + return; + } + + l_wait_event(thread->t_ctl_waitq, + thread_is_running(thread) || thread_is_stopped(thread), + &lwi); +} + +static int ll_statahead_thread(void *arg) +{ + struct dentry *parent = (struct dentry *)arg; + struct inode *dir = d_inode(parent); + struct ll_inode_info *plli = ll_i2info(dir); + struct ll_inode_info *clli; + struct ll_sb_info *sbi = ll_i2sbi(dir); + struct ll_statahead_info *sai = ll_sai_get(plli->lli_sai); + struct ptlrpc_thread *thread = &sai->sai_thread; + struct ptlrpc_thread *agl_thread = &sai->sai_agl_thread; + struct page *page; + __u64 pos = 0; + int first = 0; + int rc = 0; + struct ll_dir_chain chain; + struct l_wait_info lwi = { 0 }; + + thread->t_pid = current_pid(); + CDEBUG(D_READA, "statahead thread starting: sai %p, parent %pd\n", + sai, parent); + + if (sbi->ll_flags & LL_SBI_AGL_ENABLED) + ll_start_agl(parent, sai); + + atomic_inc(&sbi->ll_sa_total); + spin_lock(&plli->lli_sa_lock); + if (thread_is_init(thread)) + /* If someone else has changed the thread state + * (e.g. already changed to SVC_STOPPING), we can't just + * blindly overwrite that setting. */ + thread_set_flags(thread, SVC_RUNNING); + spin_unlock(&plli->lli_sa_lock); + wake_up(&thread->t_ctl_waitq); + + ll_dir_chain_init(&chain); + page = ll_get_dir_page(dir, pos, &chain); + + while (1) { + struct lu_dirpage *dp; + struct lu_dirent *ent; + + if (IS_ERR(page)) { + rc = PTR_ERR(page); + CDEBUG(D_READA, "error reading dir "DFID" at %llu/%llu: [rc %d] [parent %u]\n", + PFID(ll_inode2fid(dir)), pos, sai->sai_index, + rc, plli->lli_opendir_pid); + goto out; + } + + dp = page_address(page); + for (ent = lu_dirent_start(dp); ent != NULL; + ent = lu_dirent_next(ent)) { + __u64 hash; + int namelen; + char *name; + + hash = le64_to_cpu(ent->lde_hash); + if (unlikely(hash < pos)) + /* + * Skip until we find target hash value. + */ + continue; + + namelen = le16_to_cpu(ent->lde_namelen); + if (unlikely(namelen == 0)) + /* + * Skip dummy record. + */ + continue; + + name = ent->lde_name; + if (name[0] == '.') { + if (namelen == 1) { + /* + * skip "." + */ + continue; + } else if (name[1] == '.' && namelen == 2) { + /* + * skip ".." 
+ */ + continue; + } else if (!sai->sai_ls_all) { + /* + * skip hidden files. + */ + sai->sai_skip_hidden++; + continue; + } + } + + /* + * don't stat-ahead first entry. + */ + if (unlikely(++first == 1)) + continue; + +keep_it: + l_wait_event(thread->t_ctl_waitq, + !sa_sent_full(sai) || + !sa_received_empty(sai) || + !agl_list_empty(sai) || + !thread_is_running(thread), + &lwi); + +interpret_it: + while (!sa_received_empty(sai)) + ll_post_statahead(sai); + + if (unlikely(!thread_is_running(thread))) { + ll_release_page(page, 0); + rc = 0; + goto out; + } + + /* If no window for metadata statahead, but there are + * some AGL entries to be triggered, then try to help + * to process the AGL entries. */ + if (sa_sent_full(sai)) { + spin_lock(&plli->lli_agl_lock); + while (!agl_list_empty(sai)) { + clli = agl_first_entry(sai); + list_del_init(&clli->lli_agl_list); + spin_unlock(&plli->lli_agl_lock); + ll_agl_trigger(&clli->lli_vfs_inode, + sai); + + if (!sa_received_empty(sai)) + goto interpret_it; + + if (unlikely( + !thread_is_running(thread))) { + ll_release_page(page, 0); + rc = 0; + goto out; + } + + if (!sa_sent_full(sai)) + goto do_it; + + spin_lock(&plli->lli_agl_lock); + } + spin_unlock(&plli->lli_agl_lock); + + goto keep_it; + } + +do_it: + ll_statahead_one(parent, name, namelen); + } + pos = le64_to_cpu(dp->ldp_hash_end); + if (pos == MDS_DIR_END_OFF) { + /* + * End of directory reached. + */ + ll_release_page(page, 0); + while (1) { + l_wait_event(thread->t_ctl_waitq, + !sa_received_empty(sai) || + sai->sai_sent == sai->sai_replied|| + !thread_is_running(thread), + &lwi); + + while (!sa_received_empty(sai)) + ll_post_statahead(sai); + + if (unlikely(!thread_is_running(thread))) { + rc = 0; + goto out; + } + + if (sai->sai_sent == sai->sai_replied && + sa_received_empty(sai)) + break; + } + + spin_lock(&plli->lli_agl_lock); + while (!agl_list_empty(sai) && + thread_is_running(thread)) { + clli = agl_first_entry(sai); + list_del_init(&clli->lli_agl_list); + spin_unlock(&plli->lli_agl_lock); + ll_agl_trigger(&clli->lli_vfs_inode, sai); + spin_lock(&plli->lli_agl_lock); + } + spin_unlock(&plli->lli_agl_lock); + + rc = 0; + goto out; + } else if (1) { + /* + * chain is exhausted. + * Normal case: continue to the next page. + */ + ll_release_page(page, le32_to_cpu(dp->ldp_flags) & + LDF_COLLIDE); + page = ll_get_dir_page(dir, pos, &chain); + } else { + LASSERT(le32_to_cpu(dp->ldp_flags) & LDF_COLLIDE); + ll_release_page(page, 1); + /* + * go into overflow page. + */ + } + } + +out: + if (sai->sai_agl_valid) { + spin_lock(&plli->lli_agl_lock); + thread_set_flags(agl_thread, SVC_STOPPING); + spin_unlock(&plli->lli_agl_lock); + wake_up(&agl_thread->t_ctl_waitq); + + CDEBUG(D_READA, "stop agl thread: sai %p pid %u\n", + sai, (unsigned int)agl_thread->t_pid); + l_wait_event(agl_thread->t_ctl_waitq, + thread_is_stopped(agl_thread), + &lwi); + } else { + /* Set agl_thread flags anyway. */ + thread_set_flags(&sai->sai_agl_thread, SVC_STOPPED); + } + ll_dir_chain_fini(&chain); + spin_lock(&plli->lli_sa_lock); + if (!sa_received_empty(sai)) { + thread_set_flags(thread, SVC_STOPPING); + spin_unlock(&plli->lli_sa_lock); + + /* To release the resources held by received entries. 
*/ + while (!sa_received_empty(sai)) + ll_post_statahead(sai); + + spin_lock(&plli->lli_sa_lock); + } + thread_set_flags(thread, SVC_STOPPED); + spin_unlock(&plli->lli_sa_lock); + wake_up(&sai->sai_waitq); + wake_up(&thread->t_ctl_waitq); + ll_sai_put(sai); + dput(parent); + CDEBUG(D_READA, "statahead thread stopped: sai %p, parent %pd\n", + sai, parent); + return rc; +} + +/** + * called in ll_file_release(). + */ +void ll_stop_statahead(struct inode *dir, void *key) +{ + struct ll_inode_info *lli = ll_i2info(dir); + + if (unlikely(key == NULL)) + return; + + spin_lock(&lli->lli_sa_lock); + if (lli->lli_opendir_key != key || lli->lli_opendir_pid == 0) { + spin_unlock(&lli->lli_sa_lock); + return; + } + + lli->lli_opendir_key = NULL; + + if (lli->lli_sai) { + struct l_wait_info lwi = { 0 }; + struct ptlrpc_thread *thread = &lli->lli_sai->sai_thread; + + if (!thread_is_stopped(thread)) { + thread_set_flags(thread, SVC_STOPPING); + spin_unlock(&lli->lli_sa_lock); + wake_up(&thread->t_ctl_waitq); + + CDEBUG(D_READA, "stop statahead thread: sai %p pid %u\n", + lli->lli_sai, (unsigned int)thread->t_pid); + l_wait_event(thread->t_ctl_waitq, + thread_is_stopped(thread), + &lwi); + } else { + spin_unlock(&lli->lli_sa_lock); + } + + /* + * Put the ref which was held when first statahead_enter. + * It maybe not the last ref for some statahead requests + * maybe inflight. + */ + ll_sai_put(lli->lli_sai); + } else { + lli->lli_opendir_pid = 0; + spin_unlock(&lli->lli_sa_lock); + } +} + +enum { + /** + * not first dirent, or is "." + */ + LS_NONE_FIRST_DE = 0, + /** + * the first non-hidden dirent + */ + LS_FIRST_DE, + /** + * the first hidden dirent, that is "." + */ + LS_FIRST_DOT_DE +}; + +static int is_first_dirent(struct inode *dir, struct dentry *dentry) +{ + struct ll_dir_chain chain; + struct qstr *target = &dentry->d_name; + struct page *page; + __u64 pos = 0; + int dot_de; + int rc = LS_NONE_FIRST_DE; + + ll_dir_chain_init(&chain); + page = ll_get_dir_page(dir, pos, &chain); + + while (1) { + struct lu_dirpage *dp; + struct lu_dirent *ent; + + if (IS_ERR(page)) { + struct ll_inode_info *lli = ll_i2info(dir); + + rc = PTR_ERR(page); + CERROR("error reading dir "DFID" at %llu: [rc %d] [parent %u]\n", + PFID(ll_inode2fid(dir)), pos, + rc, lli->lli_opendir_pid); + break; + } + + dp = page_address(page); + for (ent = lu_dirent_start(dp); ent != NULL; + ent = lu_dirent_next(ent)) { + __u64 hash; + int namelen; + char *name; + + hash = le64_to_cpu(ent->lde_hash); + /* The ll_get_dir_page() can return any page containing + * the given hash which may be not the start hash. */ + if (unlikely(hash < pos)) + continue; + + namelen = le16_to_cpu(ent->lde_namelen); + if (unlikely(namelen == 0)) + /* + * skip dummy record. + */ + continue; + + name = ent->lde_name; + if (name[0] == '.') { + if (namelen == 1) + /* + * skip "." + */ + continue; + else if (name[1] == '.' && namelen == 2) + /* + * skip ".." + */ + continue; + else + dot_de = 1; + } else { + dot_de = 0; + } + + if (dot_de && target->name[0] != '.') { + CDEBUG(D_READA, "%.*s skip hidden file %.*s\n", + target->len, target->name, + namelen, name); + continue; + } + + if (target->len != namelen || + memcmp(target->name, name, namelen) != 0) + rc = LS_NONE_FIRST_DE; + else if (!dot_de) + rc = LS_FIRST_DE; + else + rc = LS_FIRST_DOT_DE; + + ll_release_page(page, 0); + goto out; + } + pos = le64_to_cpu(dp->ldp_hash_end); + if (pos == MDS_DIR_END_OFF) { + /* + * End of directory reached. 
+ */ + ll_release_page(page, 0); + break; + } else if (1) { + /* + * chain is exhausted + * Normal case: continue to the next page. + */ + ll_release_page(page, le32_to_cpu(dp->ldp_flags) & + LDF_COLLIDE); + page = ll_get_dir_page(dir, pos, &chain); + } else { + /* + * go into overflow page. + */ + LASSERT(le32_to_cpu(dp->ldp_flags) & LDF_COLLIDE); + ll_release_page(page, 1); + } + } + +out: + ll_dir_chain_fini(&chain); + return rc; +} + +static void +ll_sai_unplug(struct ll_statahead_info *sai, struct ll_sa_entry *entry) +{ + struct ptlrpc_thread *thread = &sai->sai_thread; + struct ll_sb_info *sbi = ll_i2sbi(sai->sai_inode); + int hit; + + if (entry != NULL && entry->se_stat == SA_ENTRY_SUCC) + hit = 1; + else + hit = 0; + + ll_sa_entry_fini(sai, entry); + if (hit) { + sai->sai_hit++; + sai->sai_consecutive_miss = 0; + sai->sai_max = min(2 * sai->sai_max, sbi->ll_sa_max); + } else { + struct ll_inode_info *lli = ll_i2info(sai->sai_inode); + + sai->sai_miss++; + sai->sai_consecutive_miss++; + if (sa_low_hit(sai) && thread_is_running(thread)) { + atomic_inc(&sbi->ll_sa_wrong); + CDEBUG(D_READA, "Statahead for dir " DFID " hit ratio too low: hit/miss %llu/%llu, sent/replied %llu/%llu, stopping statahead thread\n", + PFID(&lli->lli_fid), sai->sai_hit, + sai->sai_miss, sai->sai_sent, + sai->sai_replied); + spin_lock(&lli->lli_sa_lock); + if (!thread_is_stopped(thread)) + thread_set_flags(thread, SVC_STOPPING); + spin_unlock(&lli->lli_sa_lock); + } + } + + if (!thread_is_stopped(thread)) + wake_up(&thread->t_ctl_waitq); +} + +/** + * Start statahead thread if this is the first dir entry. + * Otherwise if a thread is started already, wait it until it is ahead of me. + * \retval 1 -- find entry with lock in cache, the caller needs to do + * nothing. + * \retval 0 -- find entry in cache, but without lock, the caller needs + * refresh from MDS. + * \retval others -- the caller need to process as non-statahead. + */ +int do_statahead_enter(struct inode *dir, struct dentry **dentryp, + int only_unplug) +{ + struct ll_inode_info *lli = ll_i2info(dir); + struct ll_statahead_info *sai = lli->lli_sai; + struct dentry *parent; + struct ll_sa_entry *entry; + struct ptlrpc_thread *thread; + struct l_wait_info lwi = { 0 }; + int rc = 0; + struct ll_inode_info *plli; + + LASSERT(lli->lli_opendir_pid == current_pid()); + + if (sai) { + thread = &sai->sai_thread; + if (unlikely(thread_is_stopped(thread) && + list_empty(&sai->sai_entries_stated))) { + /* to release resource */ + ll_stop_statahead(dir, lli->lli_opendir_key); + return -EAGAIN; + } + + if ((*dentryp)->d_name.name[0] == '.') { + if (sai->sai_ls_all || + sai->sai_miss_hidden >= sai->sai_skip_hidden) { + /* + * Hidden dentry is the first one, or statahead + * thread does not skip so many hidden dentries + * before "sai_ls_all" enabled as below. + */ + } else { + if (!sai->sai_ls_all) + /* + * It maybe because hidden dentry is not + * the first one, "sai_ls_all" was not + * set, then "ls -al" missed. Enable + * "sai_ls_all" for such case. + */ + sai->sai_ls_all = 1; + + /* + * Such "getattr" has been skipped before + * "sai_ls_all" enabled as above. + */ + sai->sai_miss_hidden++; + return -EAGAIN; + } + } + + entry = ll_sa_entry_get_byname(sai, &(*dentryp)->d_name); + if (entry == NULL || only_unplug) { + ll_sai_unplug(sai, entry); + return entry ? 
1 : -EAGAIN; + } + + if (!ll_sa_entry_stated(entry)) { + sai->sai_index_wait = entry->se_index; + lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(30), NULL, + LWI_ON_SIGNAL_NOOP, NULL); + rc = l_wait_event(sai->sai_waitq, + ll_sa_entry_stated(entry) || + thread_is_stopped(thread), + &lwi); + if (rc < 0) { + ll_sai_unplug(sai, entry); + return -EAGAIN; + } + } + + if (entry->se_stat == SA_ENTRY_SUCC && + entry->se_inode != NULL) { + struct inode *inode = entry->se_inode; + struct lookup_intent it = { .it_op = IT_GETATTR, + .d.lustre.it_lock_handle = + entry->se_handle }; + __u64 bits; + + rc = md_revalidate_lock(ll_i2mdexp(dir), &it, + ll_inode2fid(inode), &bits); + if (rc == 1) { + if (d_inode(*dentryp) == NULL) { + struct dentry *alias; + + alias = ll_splice_alias(inode, + *dentryp); + if (IS_ERR(alias)) { + ll_sai_unplug(sai, entry); + return PTR_ERR(alias); + } + *dentryp = alias; + } else if (d_inode(*dentryp) != inode) { + /* revalidate, but inode is recreated */ + CDEBUG(D_READA, + "stale dentry %pd inode %lu/%u, statahead inode %lu/%u\n", + *dentryp, + d_inode(*dentryp)->i_ino, + d_inode(*dentryp)->i_generation, + inode->i_ino, + inode->i_generation); + ll_sai_unplug(sai, entry); + return -ESTALE; + } else { + iput(inode); + } + entry->se_inode = NULL; + + if ((bits & MDS_INODELOCK_LOOKUP) && + d_lustre_invalid(*dentryp)) + d_lustre_revalidate(*dentryp); + ll_intent_release(&it); + } + } + + ll_sai_unplug(sai, entry); + return rc; + } + + /* I am the "lli_opendir_pid" owner, only me can set "lli_sai". */ + rc = is_first_dirent(dir, *dentryp); + if (rc == LS_NONE_FIRST_DE) { + /* It is not "ls -{a}l" operation, no need statahead for it. */ + rc = -EAGAIN; + goto out; + } + + sai = ll_sai_alloc(); + if (sai == NULL) { + rc = -ENOMEM; + goto out; + } + + sai->sai_ls_all = (rc == LS_FIRST_DOT_DE); + sai->sai_inode = igrab(dir); + if (unlikely(sai->sai_inode == NULL)) { + CWARN("Do not start stat ahead on dying inode "DFID"\n", + PFID(&lli->lli_fid)); + rc = -ESTALE; + goto out; + } + + /* get parent reference count here, and put it in ll_statahead_thread */ + parent = dget((*dentryp)->d_parent); + if (unlikely(sai->sai_inode != d_inode(parent))) { + struct ll_inode_info *nlli = ll_i2info(d_inode(parent)); + + CWARN("Race condition, someone changed %pd just now: old parent "DFID", new parent "DFID"\n", + *dentryp, + PFID(&lli->lli_fid), PFID(&nlli->lli_fid)); + dput(parent); + iput(sai->sai_inode); + rc = -EAGAIN; + goto out; + } + + CDEBUG(D_READA, "start statahead thread: sai %p, parent %pd\n", + sai, parent); + + /* The sai buffer already has one reference taken at allocation time, + * but as soon as we expose the sai by attaching it to the lli that + * default reference can be dropped by another thread calling + * ll_stop_statahead. We need to take a local reference to protect + * the sai buffer while we intend to access it. */ + ll_sai_get(sai); + lli->lli_sai = sai; + + plli = ll_i2info(d_inode(parent)); + rc = PTR_ERR(kthread_run(ll_statahead_thread, parent, + "ll_sa_%u", plli->lli_opendir_pid)); + thread = &sai->sai_thread; + if (IS_ERR_VALUE(rc)) { + CERROR("can't start ll_sa thread, rc: %d\n", rc); + dput(parent); + lli->lli_opendir_key = NULL; + thread_set_flags(thread, SVC_STOPPED); + thread_set_flags(&sai->sai_agl_thread, SVC_STOPPED); + /* Drop both our own local reference and the default + * reference from allocation time. 
*/ + ll_sai_put(sai); + ll_sai_put(sai); + LASSERT(lli->lli_sai == NULL); + return -EAGAIN; + } + + l_wait_event(thread->t_ctl_waitq, + thread_is_running(thread) || thread_is_stopped(thread), + &lwi); + ll_sai_put(sai); + + /* + * We don't stat-ahead for the first dirent since we are already in + * lookup. + */ + return -EAGAIN; + +out: + if (sai != NULL) + OBD_FREE_PTR(sai); + spin_lock(&lli->lli_sa_lock); + lli->lli_opendir_key = NULL; + lli->lli_opendir_pid = 0; + spin_unlock(&lli->lli_sa_lock); + return rc; +} diff --git a/kernel/drivers/staging/lustre/lustre/llite/super25.c b/kernel/drivers/staging/lustre/lustre/llite/super25.c new file mode 100644 index 000000000..a494f6271 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/super25.c @@ -0,0 +1,226 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include "../include/lustre_lite.h" +#include "../include/lustre_ha.h" +#include "../include/lustre_dlm.h" +#include +#include +#include "../include/lprocfs_status.h" +#include "llite_internal.h" + +static struct kmem_cache *ll_inode_cachep; + +static struct inode *ll_alloc_inode(struct super_block *sb) +{ + struct ll_inode_info *lli; + ll_stats_ops_tally(ll_s2sbi(sb), LPROC_LL_ALLOC_INODE, 1); + OBD_SLAB_ALLOC_PTR_GFP(lli, ll_inode_cachep, GFP_NOFS); + if (lli == NULL) + return NULL; + + inode_init_once(&lli->lli_vfs_inode); + return &lli->lli_vfs_inode; +} + +static void ll_inode_destroy_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct ll_inode_info *ptr = ll_i2info(inode); + OBD_SLAB_FREE_PTR(ptr, ll_inode_cachep); +} + +static void ll_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, ll_inode_destroy_callback); +} + +/* exported operations */ +struct super_operations lustre_super_operations = { + .alloc_inode = ll_alloc_inode, + .destroy_inode = ll_destroy_inode, + .evict_inode = ll_delete_inode, + .put_super = ll_put_super, + .statfs = ll_statfs, + .umount_begin = ll_umount_begin, + .remount_fs = ll_remount_fs, + .show_options = ll_show_options, +}; +MODULE_ALIAS_FS("lustre"); + +void lustre_register_client_process_config(int (*cpc)(struct lustre_cfg *lcfg)); + +static int __init init_lustre_lite(void) +{ + struct proc_dir_entry *entry; + lnet_process_id_t lnet_id; + struct timeval tv; + int i, rc, seed[2]; + + CLASSERT(sizeof(LUSTRE_VOLATILE_HDR) == LUSTRE_VOLATILE_HDR_LEN + 1); + + /* print an address of _any_ initialized kernel symbol from this + * module, to allow debugging with gdb that doesn't support data + * symbols from modules.*/ + CDEBUG(D_INFO, "Lustre client module (%p).\n", + &lustre_super_operations); + + rc = -ENOMEM; + ll_inode_cachep = kmem_cache_create("lustre_inode_cache", + sizeof(struct ll_inode_info), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (ll_inode_cachep == NULL) + goto out_cache; + + ll_file_data_slab = kmem_cache_create("ll_file_data", + sizeof(struct ll_file_data), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (ll_file_data_slab == NULL) + goto out_cache; + + ll_remote_perm_cachep = kmem_cache_create("ll_remote_perm_cache", + sizeof(struct ll_remote_perm), + 0, 0, NULL); + if (ll_remote_perm_cachep == NULL) + goto out_cache; + + ll_rmtperm_hash_cachep = kmem_cache_create("ll_rmtperm_hash_cache", + REMOTE_PERM_HASHSIZE * + sizeof(struct list_head), + 0, 0, NULL); + if (ll_rmtperm_hash_cachep == NULL) + goto out_cache; + + entry = lprocfs_register("llite", proc_lustre_root, NULL, NULL); + if (IS_ERR(entry)) { + rc = PTR_ERR(entry); + CERROR("cannot register '/proc/fs/lustre/llite': rc = %d\n", + rc); + goto out_cache; + } + + proc_lustre_fs_root = entry; + + cfs_get_random_bytes(seed, sizeof(seed)); + + /* Nodes with small feet have little entropy. 
The NID for this + * node gives the most entropy in the low bits */ + for (i = 0;; i++) { + if (LNetGetId(i, &lnet_id) == -ENOENT) + break; + + if (LNET_NETTYP(LNET_NIDNET(lnet_id.nid)) != LOLND) + seed[0] ^= LNET_NIDADDR(lnet_id.nid); + } + + do_gettimeofday(&tv); + cfs_srand(tv.tv_sec ^ seed[0], tv.tv_usec ^ seed[1]); + setup_timer(&ll_capa_timer, ll_capa_timer_callback, 0); + rc = ll_capa_thread_start(); + if (rc != 0) + goto out_proc; + + rc = vvp_global_init(); + if (rc != 0) + goto out_capa; + + rc = ll_xattr_init(); + if (rc != 0) + goto out_vvp; + + lustre_register_client_fill_super(ll_fill_super); + lustre_register_kill_super_cb(ll_kill_super); + lustre_register_client_process_config(ll_process_config); + + return 0; + +out_vvp: + vvp_global_fini(); +out_capa: + del_timer(&ll_capa_timer); + ll_capa_thread_stop(); +out_proc: + lprocfs_remove(&proc_lustre_fs_root); +out_cache: + if (ll_inode_cachep != NULL) + kmem_cache_destroy(ll_inode_cachep); + + if (ll_file_data_slab != NULL) + kmem_cache_destroy(ll_file_data_slab); + + if (ll_remote_perm_cachep != NULL) + kmem_cache_destroy(ll_remote_perm_cachep); + + if (ll_rmtperm_hash_cachep != NULL) + kmem_cache_destroy(ll_rmtperm_hash_cachep); + + return rc; +} + +static void __exit exit_lustre_lite(void) +{ + lustre_register_client_fill_super(NULL); + lustre_register_kill_super_cb(NULL); + lustre_register_client_process_config(NULL); + + lprocfs_remove(&proc_lustre_fs_root); + + ll_xattr_fini(); + vvp_global_fini(); + del_timer(&ll_capa_timer); + ll_capa_thread_stop(); + LASSERTF(capa_count[CAPA_SITE_CLIENT] == 0, + "client remaining capa count %d\n", + capa_count[CAPA_SITE_CLIENT]); + + kmem_cache_destroy(ll_inode_cachep); + kmem_cache_destroy(ll_rmtperm_hash_cachep); + + kmem_cache_destroy(ll_remote_perm_cachep); + + kmem_cache_destroy(ll_file_data_slab); +} + +MODULE_AUTHOR("Sun Microsystems, Inc. "); +MODULE_DESCRIPTION("Lustre Lite Client File System"); +MODULE_LICENSE("GPL"); + +module_init(init_lustre_lite); +module_exit(exit_lustre_lite); diff --git a/kernel/drivers/staging/lustre/lustre/llite/symlink.c b/kernel/drivers/staging/lustre/lustre/llite/symlink.c new file mode 100644 index 000000000..3711e671a --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/symlink.c @@ -0,0 +1,170 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#include +#include +#include +#define DEBUG_SUBSYSTEM S_LLITE + +#include "../include/lustre_lite.h" +#include "llite_internal.h" + +static int ll_readlink_internal(struct inode *inode, + struct ptlrpc_request **request, char **symname) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + int rc, symlen = i_size_read(inode) + 1; + struct mdt_body *body; + struct md_op_data *op_data; + + *request = NULL; + + if (lli->lli_symlink_name) { + int print_limit = min_t(int, PAGE_SIZE - 128, symlen); + + *symname = lli->lli_symlink_name; + /* If the total CDEBUG() size is larger than a page, it + * will print a warning to the console, avoid this by + * printing just the last part of the symlink. */ + CDEBUG(D_INODE, "using cached symlink %s%.*s, len = %d\n", + print_limit < symlen ? "..." : "", print_limit, + (*symname) + symlen - print_limit, symlen); + return 0; + } + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, symlen, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + return PTR_ERR(op_data); + + op_data->op_valid = OBD_MD_LINKNAME; + rc = md_getattr(sbi->ll_md_exp, op_data, request); + ll_finish_md_op_data(op_data); + if (rc) { + if (rc != -ENOENT) + CERROR("inode %lu: rc = %d\n", inode->i_ino, rc); + goto failed; + } + + body = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_BODY); + LASSERT(body != NULL); + if ((body->valid & OBD_MD_LINKNAME) == 0) { + CERROR("OBD_MD_LINKNAME not set on reply\n"); + rc = -EPROTO; + goto failed; + } + + LASSERT(symlen != 0); + if (body->eadatasize != symlen) { + CERROR("inode %lu: symlink length %d not expected %d\n", + inode->i_ino, body->eadatasize - 1, symlen - 1); + rc = -EPROTO; + goto failed; + } + + *symname = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_MD); + if (*symname == NULL || + strnlen(*symname, symlen) != symlen - 1) { + /* not full/NULL terminated */ + CERROR("inode %lu: symlink not NULL terminated string of length %d\n", + inode->i_ino, symlen - 1); + rc = -EPROTO; + goto failed; + } + + lli->lli_symlink_name = kzalloc(symlen, GFP_NOFS); + /* do not return an error if we cannot cache the symlink locally */ + if (lli->lli_symlink_name) { + memcpy(lli->lli_symlink_name, *symname, symlen); + *symname = lli->lli_symlink_name; + } + return 0; + +failed: + return rc; +} + +static void *ll_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct inode *inode = d_inode(dentry); + struct ptlrpc_request *request = NULL; + int rc; + char *symname = NULL; + + CDEBUG(D_VFSTRACE, "VFS Op\n"); + /* Limit the recursive symlink depth to 5 instead of default + * 8 links when kernel has 4k stack to prevent stack overflow. + * For 8k stacks we need to limit it to 7 for local servers. */ + if (THREAD_SIZE < 8192 && current->link_count >= 6) { + rc = -ELOOP; + } else if (THREAD_SIZE == 8192 && current->link_count >= 8) { + rc = -ELOOP; + } else { + ll_inode_size_lock(inode); + rc = ll_readlink_internal(inode, &request, &symname); + ll_inode_size_unlock(inode); + } + if (rc) { + ptlrpc_req_finished(request); + request = NULL; + symname = ERR_PTR(rc); + } + + nd_set_link(nd, symname); + /* symname may contain a pointer to the request message buffer, + * we delay request releasing until ll_put_link then. 
+ */ + return request; +} + +static void ll_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) +{ + ptlrpc_req_finished(cookie); +} + +struct inode_operations ll_fast_symlink_inode_operations = { + .readlink = generic_readlink, + .setattr = ll_setattr, + .follow_link = ll_follow_link, + .put_link = ll_put_link, + .getattr = ll_getattr, + .permission = ll_inode_permission, + .setxattr = ll_setxattr, + .getxattr = ll_getxattr, + .listxattr = ll_listxattr, + .removexattr = ll_removexattr, +}; diff --git a/kernel/drivers/staging/lustre/lustre/llite/vvp_dev.c b/kernel/drivers/staging/lustre/lustre/llite/vvp_dev.c new file mode 100644 index 000000000..fde41d7c5 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/vvp_dev.c @@ -0,0 +1,547 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * cl_device and cl_device_type implementation for VVP layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LLITE + + +#include "../include/obd.h" +#include "../include/lustre_lite.h" +#include "llite_internal.h" +#include "vvp_internal.h" + +/***************************************************************************** + * + * Vvp device and device type functions. + * + */ + +/* + * vvp_ prefix stands for "Vfs Vm Posix". It corresponds to historical + * "llite_" (var. "ll_") prefix. 
+ */ + +static struct kmem_cache *vvp_thread_kmem; +static struct kmem_cache *vvp_session_kmem; +static struct lu_kmem_descr vvp_caches[] = { + { + .ckd_cache = &vvp_thread_kmem, + .ckd_name = "vvp_thread_kmem", + .ckd_size = sizeof(struct vvp_thread_info), + }, + { + .ckd_cache = &vvp_session_kmem, + .ckd_name = "vvp_session_kmem", + .ckd_size = sizeof(struct vvp_session) + }, + { + .ckd_cache = NULL + } +}; + +static void *vvp_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct vvp_thread_info *info; + + OBD_SLAB_ALLOC_PTR_GFP(info, vvp_thread_kmem, GFP_NOFS); + if (info == NULL) + info = ERR_PTR(-ENOMEM); + return info; +} + +static void vvp_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct vvp_thread_info *info = data; + + OBD_SLAB_FREE_PTR(info, vvp_thread_kmem); +} + +static void *vvp_session_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct vvp_session *session; + + OBD_SLAB_ALLOC_PTR_GFP(session, vvp_session_kmem, GFP_NOFS); + if (session == NULL) + session = ERR_PTR(-ENOMEM); + return session; +} + +static void vvp_session_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct vvp_session *session = data; + + OBD_SLAB_FREE_PTR(session, vvp_session_kmem); +} + + +struct lu_context_key vvp_key = { + .lct_tags = LCT_CL_THREAD, + .lct_init = vvp_key_init, + .lct_fini = vvp_key_fini +}; + +struct lu_context_key vvp_session_key = { + .lct_tags = LCT_SESSION, + .lct_init = vvp_session_key_init, + .lct_fini = vvp_session_key_fini +}; + +/* type constructor/destructor: vvp_type_{init,fini,start,stop}(). */ +LU_TYPE_INIT_FINI(vvp, &ccc_key, &ccc_session_key, &vvp_key, &vvp_session_key); + +static const struct lu_device_operations vvp_lu_ops = { + .ldo_object_alloc = vvp_object_alloc +}; + +static const struct cl_device_operations vvp_cl_ops = { + .cdo_req_init = ccc_req_init +}; + +static struct lu_device *vvp_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg) +{ + return ccc_device_alloc(env, t, cfg, &vvp_lu_ops, &vvp_cl_ops); +} + +static const struct lu_device_type_operations vvp_device_type_ops = { + .ldto_init = vvp_type_init, + .ldto_fini = vvp_type_fini, + + .ldto_start = vvp_type_start, + .ldto_stop = vvp_type_stop, + + .ldto_device_alloc = vvp_device_alloc, + .ldto_device_free = ccc_device_free, + .ldto_device_init = ccc_device_init, + .ldto_device_fini = ccc_device_fini +}; + +struct lu_device_type vvp_device_type = { + .ldt_tags = LU_DEVICE_CL, + .ldt_name = LUSTRE_VVP_NAME, + .ldt_ops = &vvp_device_type_ops, + .ldt_ctx_tags = LCT_CL_THREAD +}; + +/** + * A mutex serializing calls to vvp_inode_fini() under extreme memory + * pressure, when environments cannot be allocated. + */ +int vvp_global_init(void) +{ + int result; + + result = lu_kmem_init(vvp_caches); + if (result == 0) { + result = ccc_global_init(&vvp_device_type); + if (result != 0) + lu_kmem_fini(vvp_caches); + } + return result; +} + +void vvp_global_fini(void) +{ + ccc_global_fini(&vvp_device_type); + lu_kmem_fini(vvp_caches); +} + + +/***************************************************************************** + * + * mirror obd-devices into cl devices. 
+ * + */ + +int cl_sb_init(struct super_block *sb) +{ + struct ll_sb_info *sbi; + struct cl_device *cl; + struct lu_env *env; + int rc = 0; + int refcheck; + + sbi = ll_s2sbi(sb); + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + cl = cl_type_setup(env, NULL, &vvp_device_type, + sbi->ll_dt_exp->exp_obd->obd_lu_dev); + if (!IS_ERR(cl)) { + cl2ccc_dev(cl)->cdv_sb = sb; + sbi->ll_cl = cl; + sbi->ll_site = cl2lu_dev(cl)->ld_site; + } + cl_env_put(env, &refcheck); + } else + rc = PTR_ERR(env); + return rc; +} + +int cl_sb_fini(struct super_block *sb) +{ + struct ll_sb_info *sbi; + struct lu_env *env; + struct cl_device *cld; + int refcheck; + int result; + + sbi = ll_s2sbi(sb); + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + cld = sbi->ll_cl; + + if (cld != NULL) { + cl_stack_fini(env, cld); + sbi->ll_cl = NULL; + sbi->ll_site = NULL; + } + cl_env_put(env, &refcheck); + result = 0; + } else { + CERROR("Cannot cleanup cl-stack due to memory shortage.\n"); + result = PTR_ERR(env); + } + /* + * If mount failed (sbi->ll_cl == NULL), and this there are no other + * mounts, stop device types manually (this usually happens + * automatically when last device is destroyed). + */ + lu_types_stop(); + return result; +} + +/**************************************************************************** + * + * /proc/fs/lustre/llite/$MNT/dump_page_cache + * + ****************************************************************************/ + +/* + * To represent contents of a page cache as a byte stream, following + * information if encoded in 64bit offset: + * + * - file hash bucket in lu_site::ls_hash[] 28bits + * + * - how far file is from bucket head 4bits + * + * - page index 32bits + * + * First two data identify a file in the cache uniquely. + */ + +#define PGC_OBJ_SHIFT (32 + 4) +#define PGC_DEPTH_SHIFT (32) + +struct vvp_pgcache_id { + unsigned vpi_bucket; + unsigned vpi_depth; + uint32_t vpi_index; + + unsigned vpi_curdep; + struct lu_object_header *vpi_obj; +}; + +static void vvp_pgcache_id_unpack(loff_t pos, struct vvp_pgcache_id *id) +{ + CLASSERT(sizeof(pos) == sizeof(__u64)); + + id->vpi_index = pos & 0xffffffff; + id->vpi_depth = (pos >> PGC_DEPTH_SHIFT) & 0xf; + id->vpi_bucket = (unsigned long long)pos >> PGC_OBJ_SHIFT; +} + +static loff_t vvp_pgcache_id_pack(struct vvp_pgcache_id *id) +{ + return + ((__u64)id->vpi_index) | + ((__u64)id->vpi_depth << PGC_DEPTH_SHIFT) | + ((__u64)id->vpi_bucket << PGC_OBJ_SHIFT); +} + +static int vvp_pgcache_obj_get(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *data) +{ + struct vvp_pgcache_id *id = data; + struct lu_object_header *hdr = cfs_hash_object(hs, hnode); + + if (id->vpi_curdep-- > 0) + return 0; /* continue */ + + if (lu_object_is_dying(hdr)) + return 1; + + cfs_hash_get(hs, hnode); + id->vpi_obj = hdr; + return 1; +} + +static struct cl_object *vvp_pgcache_obj(const struct lu_env *env, + struct lu_device *dev, + struct vvp_pgcache_id *id) +{ + LASSERT(lu_device_is_cl(dev)); + + id->vpi_depth &= 0xf; + id->vpi_obj = NULL; + id->vpi_curdep = id->vpi_depth; + + cfs_hash_hlist_for_each(dev->ld_site->ls_obj_hash, id->vpi_bucket, + vvp_pgcache_obj_get, id); + if (id->vpi_obj != NULL) { + struct lu_object *lu_obj; + + lu_obj = lu_object_locate(id->vpi_obj, dev->ld_type); + if (lu_obj != NULL) { + lu_object_ref_add(lu_obj, "dump", current); + return lu2cl(lu_obj); + } + lu_object_put(env, lu_object_top(id->vpi_obj)); + + } else if (id->vpi_curdep > 0) { + id->vpi_depth = 0xf; + } + return NULL; +} + +static loff_t 
vvp_pgcache_find(const struct lu_env *env, + struct lu_device *dev, loff_t pos) +{ + struct cl_object *clob; + struct lu_site *site; + struct vvp_pgcache_id id; + + site = dev->ld_site; + vvp_pgcache_id_unpack(pos, &id); + + while (1) { + if (id.vpi_bucket >= CFS_HASH_NHLIST(site->ls_obj_hash)) + return ~0ULL; + clob = vvp_pgcache_obj(env, dev, &id); + if (clob != NULL) { + struct cl_object_header *hdr; + int nr; + struct cl_page *pg; + + /* got an object. Find next page. */ + hdr = cl_object_header(clob); + + spin_lock(&hdr->coh_page_guard); + nr = radix_tree_gang_lookup(&hdr->coh_tree, + (void **)&pg, + id.vpi_index, 1); + if (nr > 0) { + id.vpi_index = pg->cp_index; + /* Cant support over 16T file */ + nr = !(pg->cp_index > 0xffffffff); + } + spin_unlock(&hdr->coh_page_guard); + + lu_object_ref_del(&clob->co_lu, "dump", current); + cl_object_put(env, clob); + if (nr > 0) + return vvp_pgcache_id_pack(&id); + } + /* to the next object. */ + ++id.vpi_depth; + id.vpi_depth &= 0xf; + if (id.vpi_depth == 0 && ++id.vpi_bucket == 0) + return ~0ULL; + id.vpi_index = 0; + } +} + +#define seq_page_flag(seq, page, flag, has_flags) do { \ + if (test_bit(PG_##flag, &(page)->flags)) { \ + seq_printf(seq, "%s"#flag, has_flags ? "|" : ""); \ + has_flags = 1; \ + } \ +} while (0) + +static void vvp_pgcache_page_show(const struct lu_env *env, + struct seq_file *seq, struct cl_page *page) +{ + struct ccc_page *cpg; + struct page *vmpage; + int has_flags; + + cpg = cl2ccc_page(cl_page_at(page, &vvp_device_type)); + vmpage = cpg->cpg_page; + seq_printf(seq, " %5i | %p %p %s %s %s %s | %p %lu/%u(%p) %lu %u [", + 0 /* gen */, + cpg, page, + "none", + cpg->cpg_write_queued ? "wq" : "- ", + cpg->cpg_defer_uptodate ? "du" : "- ", + PageWriteback(vmpage) ? "wb" : "-", + vmpage, vmpage->mapping->host->i_ino, + vmpage->mapping->host->i_generation, + vmpage->mapping->host, vmpage->index, + page_count(vmpage)); + has_flags = 0; + seq_page_flag(seq, vmpage, locked, has_flags); + seq_page_flag(seq, vmpage, error, has_flags); + seq_page_flag(seq, vmpage, referenced, has_flags); + seq_page_flag(seq, vmpage, uptodate, has_flags); + seq_page_flag(seq, vmpage, dirty, has_flags); + seq_page_flag(seq, vmpage, writeback, has_flags); + seq_printf(seq, "%s]\n", has_flags ? 
"" : "-"); +} + +static int vvp_pgcache_show(struct seq_file *f, void *v) +{ + loff_t pos; + struct ll_sb_info *sbi; + struct cl_object *clob; + struct lu_env *env; + struct cl_page *page; + struct cl_object_header *hdr; + struct vvp_pgcache_id id; + int refcheck; + int result; + + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + pos = *(loff_t *) v; + vvp_pgcache_id_unpack(pos, &id); + sbi = f->private; + clob = vvp_pgcache_obj(env, &sbi->ll_cl->cd_lu_dev, &id); + if (clob != NULL) { + hdr = cl_object_header(clob); + + spin_lock(&hdr->coh_page_guard); + page = cl_page_lookup(hdr, id.vpi_index); + spin_unlock(&hdr->coh_page_guard); + + seq_printf(f, "%8x@"DFID": ", + id.vpi_index, PFID(&hdr->coh_lu.loh_fid)); + if (page != NULL) { + vvp_pgcache_page_show(env, f, page); + cl_page_put(env, page); + } else + seq_puts(f, "missing\n"); + lu_object_ref_del(&clob->co_lu, "dump", current); + cl_object_put(env, clob); + } else + seq_printf(f, "%llx missing\n", pos); + cl_env_put(env, &refcheck); + result = 0; + } else + result = PTR_ERR(env); + return result; +} + +static void *vvp_pgcache_start(struct seq_file *f, loff_t *pos) +{ + struct ll_sb_info *sbi; + struct lu_env *env; + int refcheck; + + sbi = f->private; + + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + sbi = f->private; + if (sbi->ll_site->ls_obj_hash->hs_cur_bits > 64 - PGC_OBJ_SHIFT) + pos = ERR_PTR(-EFBIG); + else { + *pos = vvp_pgcache_find(env, &sbi->ll_cl->cd_lu_dev, + *pos); + if (*pos == ~0ULL) + pos = NULL; + } + cl_env_put(env, &refcheck); + } + return pos; +} + +static void *vvp_pgcache_next(struct seq_file *f, void *v, loff_t *pos) +{ + struct ll_sb_info *sbi; + struct lu_env *env; + int refcheck; + + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + sbi = f->private; + *pos = vvp_pgcache_find(env, &sbi->ll_cl->cd_lu_dev, *pos + 1); + if (*pos == ~0ULL) + pos = NULL; + cl_env_put(env, &refcheck); + } + return pos; +} + +static void vvp_pgcache_stop(struct seq_file *f, void *v) +{ + /* Nothing to do */ +} + +static struct seq_operations vvp_pgcache_ops = { + .start = vvp_pgcache_start, + .next = vvp_pgcache_next, + .stop = vvp_pgcache_stop, + .show = vvp_pgcache_show +}; + +static int vvp_dump_pgcache_seq_open(struct inode *inode, struct file *filp) +{ + struct ll_sb_info *sbi = PDE_DATA(inode); + struct seq_file *seq; + int result; + + result = seq_open(filp, &vvp_pgcache_ops); + if (result == 0) { + seq = filp->private_data; + seq->private = sbi; + } + return result; +} + +const struct file_operations vvp_dump_pgcache_file_ops = { + .owner = THIS_MODULE, + .open = vvp_dump_pgcache_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; diff --git a/kernel/drivers/staging/lustre/lustre/llite/vvp_internal.h b/kernel/drivers/staging/lustre/lustre/llite/vvp_internal.h new file mode 100644 index 000000000..2162bf6c0 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/vvp_internal.h @@ -0,0 +1,62 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Internal definitions for VVP layer. + * + * Author: Nikita Danilov + */ + +#ifndef VVP_INTERNAL_H +#define VVP_INTERNAL_H + + +#include "../include/cl_object.h" +#include "llite_internal.h" + +int vvp_io_init (const struct lu_env *env, + struct cl_object *obj, struct cl_io *io); +int vvp_lock_init (const struct lu_env *env, + struct cl_object *obj, struct cl_lock *lock, + const struct cl_io *io); +int vvp_page_init (const struct lu_env *env, + struct cl_object *obj, + struct cl_page *page, struct page *vmpage); +struct lu_object *vvp_object_alloc(const struct lu_env *env, + const struct lu_object_header *hdr, + struct lu_device *dev); + +struct ccc_object *cl_inode2ccc(struct inode *inode); + +extern const struct file_operations vvp_dump_pgcache_file_ops; + +#endif /* VVP_INTERNAL_H */ diff --git a/kernel/drivers/staging/lustre/lustre/llite/vvp_io.c b/kernel/drivers/staging/lustre/lustre/llite/vvp_io.c new file mode 100644 index 000000000..91bba7967 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/vvp_io.c @@ -0,0 +1,1209 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_io for VVP layer. 
+ * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_LLITE + + +#include "../include/obd.h" +#include "../include/lustre_lite.h" + +#include "vvp_internal.h" + +static struct vvp_io *cl2vvp_io(const struct lu_env *env, + const struct cl_io_slice *slice); + +/** + * True, if \a io is a normal io, False for splice_{read,write} + */ +int cl_is_normalio(const struct lu_env *env, const struct cl_io *io) +{ + struct vvp_io *vio = vvp_env_io(env); + + LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); + + return vio->cui_io_subtype == IO_NORMAL; +} + +/** + * For swapping layout. The file's layout may have changed. + * To avoid populating pages to a wrong stripe, we have to verify the + * correctness of layout. It works because swapping layout processes + * have to acquire group lock. + */ +static bool can_populate_pages(const struct lu_env *env, struct cl_io *io, + struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ccc_io *cio = ccc_env_io(env); + bool rc = true; + + switch (io->ci_type) { + case CIT_READ: + case CIT_WRITE: + /* don't need lock here to check lli_layout_gen as we have held + * extent lock and GROUP lock has to hold to swap layout */ + if (ll_layout_version_get(lli) != cio->cui_layout_gen) { + io->ci_need_restart = 1; + /* this will return application a short read/write */ + io->ci_continue = 0; + rc = false; + } + case CIT_FAULT: + /* fault is okay because we've already had a page. */ + default: + break; + } + + return rc; +} + +/***************************************************************************** + * + * io operations. + * + */ + +static int vvp_io_fault_iter_init(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct vvp_io *vio = cl2vvp_io(env, ios); + struct inode *inode = ccc_object_inode(ios->cis_obj); + + LASSERT(inode == + file_inode(cl2ccc_io(env, ios)->cui_fd->fd_file)); + vio->u.fault.ft_mtime = LTIME_S(inode->i_mtime); + return 0; +} + +static void vvp_io_fini(const struct lu_env *env, const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct cl_object *obj = io->ci_obj; + struct ccc_io *cio = cl2ccc_io(env, ios); + + CLOBINVRNT(env, obj, ccc_object_invariant(obj)); + + CDEBUG(D_VFSTRACE, DFID + " ignore/verify layout %d/%d, layout version %d restore needed %d\n", + PFID(lu_object_fid(&obj->co_lu)), + io->ci_ignore_layout, io->ci_verify_layout, + cio->cui_layout_gen, io->ci_restore_needed); + + if (io->ci_restore_needed == 1) { + int rc; + + /* file was detected release, we need to restore it + * before finishing the io + */ + rc = ll_layout_restore(ccc_object_inode(obj)); + /* if restore registration failed, no restart, + * we will return -ENODATA */ + /* The layout will change after restore, so we need to + * block on layout lock hold by the MDT + * as MDT will not send new layout in lvb (see LU-3124) + * we have to explicitly fetch it, all this will be done + * by ll_layout_refresh() + */ + if (rc == 0) { + io->ci_restore_needed = 0; + io->ci_need_restart = 1; + io->ci_verify_layout = 1; + } else { + io->ci_restore_needed = 1; + io->ci_need_restart = 0; + io->ci_verify_layout = 0; + io->ci_result = rc; + } + } + + if (!io->ci_ignore_layout && io->ci_verify_layout) { + __u32 gen = 0; + + /* check layout version */ + ll_layout_refresh(ccc_object_inode(obj), &gen); + io->ci_need_restart = cio->cui_layout_gen != gen; + if (io->ci_need_restart) { + CDEBUG(D_VFSTRACE, + DFID" layout changed from %d to %d.\n", + PFID(lu_object_fid(&obj->co_lu)), + 
cio->cui_layout_gen, gen); + /* today successful restore is the only possible + * case */ + /* restore was done, clear restoring state */ + ll_i2info(ccc_object_inode(obj))->lli_flags &= + ~LLIF_FILE_RESTORING; + } + } +} + +static void vvp_io_fault_fini(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct cl_page *page = io->u.ci_fault.ft_page; + + CLOBINVRNT(env, io->ci_obj, ccc_object_invariant(io->ci_obj)); + + if (page != NULL) { + lu_ref_del(&page->cp_reference, "fault", io); + cl_page_put(env, page); + io->u.ci_fault.ft_page = NULL; + } + vvp_io_fini(env, ios); +} + +static enum cl_lock_mode vvp_mode_from_vma(struct vm_area_struct *vma) +{ + /* + * we only want to hold PW locks if the mmap() can generate + * writes back to the file and that only happens in shared + * writable vmas + */ + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) + return CLM_WRITE; + return CLM_READ; +} + +static int vvp_mmap_locks(const struct lu_env *env, + struct ccc_io *vio, struct cl_io *io) +{ + struct ccc_thread_info *cti = ccc_env_info(env); + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + struct cl_lock_descr *descr = &cti->cti_descr; + ldlm_policy_data_t policy; + unsigned long addr; + ssize_t count; + int result; + struct iov_iter i; + struct iovec iov; + + LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); + + if (!cl_is_normalio(env, io)) + return 0; + + if (vio->cui_iter == NULL) /* nfs or loop back device write */ + return 0; + + /* No MM (e.g. NFS)? No vmas too. */ + if (mm == NULL) + return 0; + + iov_for_each(iov, i, *(vio->cui_iter)) { + addr = (unsigned long)iov.iov_base; + count = iov.iov_len; + if (count == 0) + continue; + + count += addr & (~CFS_PAGE_MASK); + addr &= CFS_PAGE_MASK; + + down_read(&mm->mmap_sem); + while ((vma = our_vma(mm, addr, count)) != NULL) { + struct inode *inode = file_inode(vma->vm_file); + int flags = CEF_MUST; + + if (ll_file_nolock(vma->vm_file)) { + /* + * For no lock case, a lockless lock will be + * generated. + */ + flags = CEF_NEVER; + } + + /* + * XXX: Required lock mode can be weakened: CIT_WRITE + * io only ever reads user level buffer, and CIT_READ + * only writes on it. 
+ */ + policy_from_vma(&policy, vma, addr, count); + descr->cld_mode = vvp_mode_from_vma(vma); + descr->cld_obj = ll_i2info(inode)->lli_clob; + descr->cld_start = cl_index(descr->cld_obj, + policy.l_extent.start); + descr->cld_end = cl_index(descr->cld_obj, + policy.l_extent.end); + descr->cld_enq_flags = flags; + result = cl_io_lock_alloc_add(env, io, descr); + + CDEBUG(D_VFSTRACE, "lock: %d: [%lu, %lu]\n", + descr->cld_mode, descr->cld_start, + descr->cld_end); + + if (result < 0) { + up_read(&mm->mmap_sem); + return result; + } + + if (vma->vm_end - addr >= count) + break; + + count -= vma->vm_end - addr; + addr = vma->vm_end; + } + up_read(&mm->mmap_sem); + } + return 0; +} + +static int vvp_io_rw_lock(const struct lu_env *env, struct cl_io *io, + enum cl_lock_mode mode, loff_t start, loff_t end) +{ + struct ccc_io *cio = ccc_env_io(env); + int result; + int ast_flags = 0; + + LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); + + ccc_io_update_iov(env, cio, io); + + if (io->u.ci_rw.crw_nonblock) + ast_flags |= CEF_NONBLOCK; + result = vvp_mmap_locks(env, cio, io); + if (result == 0) + result = ccc_io_one_lock(env, io, ast_flags, mode, start, end); + return result; +} + +static int vvp_io_read_lock(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct cl_io_rw_common *rd = &io->u.ci_rd.rd; + int result; + + result = vvp_io_rw_lock(env, io, CLM_READ, rd->crw_pos, + rd->crw_pos + rd->crw_count - 1); + + return result; +} + +static int vvp_io_fault_lock(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct vvp_io *vio = cl2vvp_io(env, ios); + /* + * XXX LDLM_FL_CBPENDING + */ + return ccc_io_one_lock_index + (env, io, 0, vvp_mode_from_vma(vio->u.fault.ft_vma), + io->u.ci_fault.ft_index, io->u.ci_fault.ft_index); +} + +static int vvp_io_write_lock(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + loff_t start; + loff_t end; + + if (io->u.ci_wr.wr_append) { + start = 0; + end = OBD_OBJECT_EOF; + } else { + start = io->u.ci_wr.wr.crw_pos; + end = start + io->u.ci_wr.wr.crw_count - 1; + } + return vvp_io_rw_lock(env, io, CLM_WRITE, start, end); +} + +static int vvp_io_setattr_iter_init(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + return 0; +} + +/** + * Implementation of cl_io_operations::cio_lock() method for CIT_SETATTR io. + * + * Handles "lockless io" mode when extent locking is done by server. + */ +static int vvp_io_setattr_lock(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct ccc_io *cio = ccc_env_io(env); + struct cl_io *io = ios->cis_io; + __u64 new_size; + __u32 enqflags = 0; + + if (cl_io_is_trunc(io)) { + new_size = io->u.ci_setattr.sa_attr.lvb_size; + if (new_size == 0) + enqflags = CEF_DISCARD_DATA; + } else { + if ((io->u.ci_setattr.sa_attr.lvb_mtime >= + io->u.ci_setattr.sa_attr.lvb_ctime) || + (io->u.ci_setattr.sa_attr.lvb_atime >= + io->u.ci_setattr.sa_attr.lvb_ctime)) + return 0; + new_size = 0; + } + cio->u.setattr.cui_local_lock = SETATTR_EXTENT_LOCK; + return ccc_io_one_lock(env, io, enqflags, CLM_WRITE, + new_size, OBD_OBJECT_EOF); +} + +static int vvp_do_vmtruncate(struct inode *inode, size_t size) +{ + int result; + /* + * Only ll_inode_size_lock is taken at this level. 
+ */ + ll_inode_size_lock(inode); + result = inode_newsize_ok(inode, size); + if (result < 0) { + ll_inode_size_unlock(inode); + return result; + } + truncate_setsize(inode, size); + ll_inode_size_unlock(inode); + return result; +} + +static int vvp_io_setattr_trunc(const struct lu_env *env, + const struct cl_io_slice *ios, + struct inode *inode, loff_t size) +{ + inode_dio_wait(inode); + return 0; +} + +static int vvp_io_setattr_time(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct cl_object *obj = io->ci_obj; + struct cl_attr *attr = ccc_env_thread_attr(env); + int result; + unsigned valid = CAT_CTIME; + + cl_object_attr_lock(obj); + attr->cat_ctime = io->u.ci_setattr.sa_attr.lvb_ctime; + if (io->u.ci_setattr.sa_valid & ATTR_ATIME_SET) { + attr->cat_atime = io->u.ci_setattr.sa_attr.lvb_atime; + valid |= CAT_ATIME; + } + if (io->u.ci_setattr.sa_valid & ATTR_MTIME_SET) { + attr->cat_mtime = io->u.ci_setattr.sa_attr.lvb_mtime; + valid |= CAT_MTIME; + } + result = cl_object_attr_set(env, obj, attr, valid); + cl_object_attr_unlock(obj); + + return result; +} + +static int vvp_io_setattr_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct inode *inode = ccc_object_inode(io->ci_obj); + int result = 0; + + mutex_lock(&inode->i_mutex); + if (cl_io_is_trunc(io)) + result = vvp_io_setattr_trunc(env, ios, inode, + io->u.ci_setattr.sa_attr.lvb_size); + if (result == 0) + result = vvp_io_setattr_time(env, ios); + return result; +} + +static void vvp_io_setattr_end(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct inode *inode = ccc_object_inode(io->ci_obj); + + if (cl_io_is_trunc(io)) { + /* Truncate in memory pages - they must be clean pages + * because osc has already notified to destroy osc_extents. 
*/ + vvp_do_vmtruncate(inode, io->u.ci_setattr.sa_attr.lvb_size); + inode_dio_write_done(inode); + } + mutex_unlock(&inode->i_mutex); +} + +static void vvp_io_setattr_fini(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + vvp_io_fini(env, ios); +} + +static int vvp_io_read_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct vvp_io *vio = cl2vvp_io(env, ios); + struct ccc_io *cio = cl2ccc_io(env, ios); + struct cl_io *io = ios->cis_io; + struct cl_object *obj = io->ci_obj; + struct inode *inode = ccc_object_inode(obj); + struct ll_ra_read *bead = &vio->cui_bead; + struct file *file = cio->cui_fd->fd_file; + + int result; + loff_t pos = io->u.ci_rd.rd.crw_pos; + long cnt = io->u.ci_rd.rd.crw_count; + long tot = cio->cui_tot_count; + int exceed = 0; + + CLOBINVRNT(env, obj, ccc_object_invariant(obj)); + + CDEBUG(D_VFSTRACE, "read: -> [%lli, %lli)\n", pos, pos + cnt); + + if (!can_populate_pages(env, io, inode)) + return 0; + + result = ccc_prep_size(env, obj, io, pos, tot, &exceed); + if (result != 0) + return result; + else if (exceed != 0) + goto out; + + LU_OBJECT_HEADER(D_INODE, env, &obj->co_lu, + "Read ino %lu, %lu bytes, offset %lld, size %llu\n", + inode->i_ino, cnt, pos, i_size_read(inode)); + + /* turn off the kernel's read-ahead */ + cio->cui_fd->fd_file->f_ra.ra_pages = 0; + + /* initialize read-ahead window once per syscall */ + if (!vio->cui_ra_window_set) { + vio->cui_ra_window_set = 1; + bead->lrr_start = cl_index(obj, pos); + /* + * XXX: explicit PAGE_CACHE_SIZE + */ + bead->lrr_count = cl_index(obj, tot + PAGE_CACHE_SIZE - 1); + ll_ra_read_in(file, bead); + } + + /* BUG: 5972 */ + file_accessed(file); + switch (vio->cui_io_subtype) { + case IO_NORMAL: + LASSERT(cio->cui_iocb->ki_pos == pos); + result = generic_file_read_iter(cio->cui_iocb, cio->cui_iter); + break; + case IO_SPLICE: + result = generic_file_splice_read(file, &pos, + vio->u.splice.cui_pipe, cnt, + vio->u.splice.cui_flags); + /* LU-1109: do splice read stripe by stripe otherwise if it + * may make nfsd stuck if this read occupied all internal pipe + * buffers. */ + io->ci_continue = 0; + break; + default: + CERROR("Wrong IO type %u\n", vio->cui_io_subtype); + LBUG(); + } + +out: + if (result >= 0) { + if (result < cnt) + io->ci_continue = 0; + io->ci_nob += result; + ll_rw_stats_tally(ll_i2sbi(inode), current->pid, + cio->cui_fd, pos, result, READ); + result = 0; + } + return result; +} + +static void vvp_io_read_fini(const struct lu_env *env, const struct cl_io_slice *ios) +{ + struct vvp_io *vio = cl2vvp_io(env, ios); + struct ccc_io *cio = cl2ccc_io(env, ios); + + if (vio->cui_ra_window_set) + ll_ra_read_ex(cio->cui_fd->fd_file, &vio->cui_bead); + + vvp_io_fini(env, ios); +} + +static int vvp_io_write_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct ccc_io *cio = cl2ccc_io(env, ios); + struct cl_io *io = ios->cis_io; + struct cl_object *obj = io->ci_obj; + struct inode *inode = ccc_object_inode(obj); + ssize_t result = 0; + loff_t pos = io->u.ci_wr.wr.crw_pos; + size_t cnt = io->u.ci_wr.wr.crw_count; + + if (!can_populate_pages(env, io, inode)) + return 0; + + if (cl_io_is_append(io)) { + /* + * PARALLEL IO This has to be changed for parallel IO doing + * out-of-order writes. 
+ */ + pos = io->u.ci_wr.wr.crw_pos = i_size_read(inode); + cio->cui_iocb->ki_pos = pos; + } else { + LASSERT(cio->cui_iocb->ki_pos == pos); + } + + CDEBUG(D_VFSTRACE, "write: [%lli, %lli)\n", pos, pos + (long long)cnt); + + if (cio->cui_iter == NULL) /* from a temp io in ll_cl_init(). */ + result = 0; + else + result = generic_file_write_iter(cio->cui_iocb, cio->cui_iter); + + if (result > 0) { + if (result < cnt) + io->ci_continue = 0; + io->ci_nob += result; + ll_rw_stats_tally(ll_i2sbi(inode), current->pid, + cio->cui_fd, pos, result, WRITE); + result = 0; + } + return result; +} + +static int vvp_io_kernel_fault(struct vvp_fault_io *cfio) +{ + struct vm_fault *vmf = cfio->fault.ft_vmf; + + cfio->fault.ft_flags = filemap_fault(cfio->ft_vma, vmf); + cfio->fault.ft_flags_valid = 1; + + if (vmf->page) { + CDEBUG(D_PAGE, + "page %p map %p index %lu flags %lx count %u priv %0lx: got addr %p type NOPAGE\n", + vmf->page, vmf->page->mapping, vmf->page->index, + (long)vmf->page->flags, page_count(vmf->page), + page_private(vmf->page), vmf->virtual_address); + if (unlikely(!(cfio->fault.ft_flags & VM_FAULT_LOCKED))) { + lock_page(vmf->page); + cfio->fault.ft_flags |= VM_FAULT_LOCKED; + } + + cfio->ft_vmpage = vmf->page; + return 0; + } + + if (cfio->fault.ft_flags & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) { + CDEBUG(D_PAGE, "got addr %p - SIGBUS\n", vmf->virtual_address); + return -EFAULT; + } + + if (cfio->fault.ft_flags & VM_FAULT_OOM) { + CDEBUG(D_PAGE, "got addr %p - OOM\n", vmf->virtual_address); + return -ENOMEM; + } + + if (cfio->fault.ft_flags & VM_FAULT_RETRY) + return -EAGAIN; + + CERROR("Unknown error in page fault %d!\n", cfio->fault.ft_flags); + return -EINVAL; +} + + +static int vvp_io_fault_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct vvp_io *vio = cl2vvp_io(env, ios); + struct cl_io *io = ios->cis_io; + struct cl_object *obj = io->ci_obj; + struct inode *inode = ccc_object_inode(obj); + struct cl_fault_io *fio = &io->u.ci_fault; + struct vvp_fault_io *cfio = &vio->u.fault; + loff_t offset; + int result = 0; + struct page *vmpage = NULL; + struct cl_page *page; + loff_t size; + pgoff_t last; /* last page in a file data region */ + + if (fio->ft_executable && + LTIME_S(inode->i_mtime) != vio->u.fault.ft_mtime) + CWARN("binary "DFID + " changed while waiting for the page fault lock\n", + PFID(lu_object_fid(&obj->co_lu))); + + /* offset of the last byte on the page */ + offset = cl_offset(obj, fio->ft_index + 1) - 1; + LASSERT(cl_index(obj, offset) == fio->ft_index); + result = ccc_prep_size(env, obj, io, 0, offset + 1, NULL); + if (result != 0) + return result; + + /* must return locked page */ + if (fio->ft_mkwrite) { + LASSERT(cfio->ft_vmpage != NULL); + lock_page(cfio->ft_vmpage); + } else { + result = vvp_io_kernel_fault(cfio); + if (result != 0) + return result; + } + + vmpage = cfio->ft_vmpage; + LASSERT(PageLocked(vmpage)); + + if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_FAULT_TRUNC_RACE)) + ll_invalidate_page(vmpage); + + size = i_size_read(inode); + /* Though we have already held a cl_lock upon this page, but + * it still can be truncated locally. */ + if (unlikely((vmpage->mapping != inode->i_mapping) || + (page_offset(vmpage) > size))) { + CDEBUG(D_PAGE, "llite: fault and truncate race happened!\n"); + + /* return +1 to stop cl_io_loop() and ll_fault() will catch + * and retry. 
*/ + result = +1; + goto out; + } + + + if (fio->ft_mkwrite) { + pgoff_t last_index; + /* + * Capture the size while holding the lli_trunc_sem from above + * we want to make sure that we complete the mkwrite action + * while holding this lock. We need to make sure that we are + * not past the end of the file. + */ + last_index = cl_index(obj, size - 1); + if (last_index < fio->ft_index) { + CDEBUG(D_PAGE, + "llite: mkwrite and truncate race happened: %p: 0x%lx 0x%lx\n", + vmpage->mapping, fio->ft_index, last_index); + /* + * We need to return if we are + * passed the end of the file. This will propagate + * up the call stack to ll_page_mkwrite where + * we will return VM_FAULT_NOPAGE. Any non-negative + * value returned here will be silently + * converted to 0. If the vmpage->mapping is null + * the error code would be converted back to ENODATA + * in ll_page_mkwrite0. Thus we return -ENODATA + * to handle both cases + */ + result = -ENODATA; + goto out; + } + } + + page = cl_page_find(env, obj, fio->ft_index, vmpage, CPT_CACHEABLE); + if (IS_ERR(page)) { + result = PTR_ERR(page); + goto out; + } + + /* if page is going to be written, we should add this page into cache + * earlier. */ + if (fio->ft_mkwrite) { + wait_on_page_writeback(vmpage); + if (set_page_dirty(vmpage)) { + struct ccc_page *cp; + + /* vvp_page_assume() calls wait_on_page_writeback(). */ + cl_page_assume(env, io, page); + + cp = cl2ccc_page(cl_page_at(page, &vvp_device_type)); + vvp_write_pending(cl2ccc(obj), cp); + + /* Do not set Dirty bit here so that in case IO is + * started before the page is really made dirty, we + * still have chance to detect it. */ + result = cl_page_cache_add(env, io, page, CRT_WRITE); + LASSERT(cl_page_is_owned(page, io)); + + vmpage = NULL; + if (result < 0) { + cl_page_unmap(env, io, page); + cl_page_discard(env, io, page); + cl_page_disown(env, io, page); + + cl_page_put(env, page); + + /* we're in big trouble, what can we do now? */ + if (result == -EDQUOT) + result = -ENOSPC; + goto out; + } else + cl_page_disown(env, io, page); + } + } + + last = cl_index(obj, size - 1); + /* + * The ft_index is only used in the case of + * a mkwrite action. We need to check + * our assertions are correct, since + * we should have caught this above + */ + LASSERT(!fio->ft_mkwrite || fio->ft_index <= last); + if (fio->ft_index == last) + /* + * Last page is mapped partially. + */ + fio->ft_nob = size - cl_offset(obj, fio->ft_index); + else + fio->ft_nob = cl_page_size(obj); + + lu_ref_add(&page->cp_reference, "fault", io); + fio->ft_page = page; + +out: + /* return unlocked vmpage to avoid deadlocking */ + if (vmpage != NULL) + unlock_page(vmpage); + cfio->fault.ft_flags &= ~VM_FAULT_LOCKED; + return result; +} + +static int vvp_io_fsync_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + /* we should mark TOWRITE bit to each dirty page in radix tree to + * verify pages have been written, but this is difficult because of + * race. 
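The fault path above sizes the faulted region so that only the page containing i_size is treated as partial; every other page contributes a full page of data. Below is a small worked model of that calculation, an editor's aside rather than part of the imported patch, assuming 4 KiB pages and treating cl_index()/cl_offset() as the plain shifts they reduce to here.

/*
 * Editor's illustrative sketch, not part of the Lustre patch: the
 * "partial last page" size calculation from vvp_io_fault_start(),
 * with 4 KiB pages assumed.
 */
#include <assert.h>
#include <stddef.h>

#define MODEL_PAGE_SHIFT	12
#define MODEL_PAGE_SIZE		(1UL << MODEL_PAGE_SHIFT)

/* bytes covered by a fault on page 'index' of a file of 'size' bytes */
static size_t fault_nob(unsigned long index, unsigned long long size)
{
	unsigned long last = (size - 1) >> MODEL_PAGE_SHIFT;

	if (index == last)
		return size - ((unsigned long long)index << MODEL_PAGE_SHIFT);
	return MODEL_PAGE_SIZE;
}

int main(void)
{
	/* 10000-byte file: pages 0 and 1 are full, page 2 holds 1808 bytes */
	assert(fault_nob(0, 10000) == 4096);
	assert(fault_nob(2, 10000) == 10000 - 2 * 4096);
	return 0;
}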
*/ + return 0; +} + +static int vvp_io_read_page(const struct lu_env *env, + const struct cl_io_slice *ios, + const struct cl_page_slice *slice) +{ + struct cl_io *io = ios->cis_io; + struct cl_object *obj = slice->cpl_obj; + struct ccc_page *cp = cl2ccc_page(slice); + struct cl_page *page = slice->cpl_page; + struct inode *inode = ccc_object_inode(obj); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ll_file_data *fd = cl2ccc_io(env, ios)->cui_fd; + struct ll_readahead_state *ras = &fd->fd_ras; + struct page *vmpage = cp->cpg_page; + struct cl_2queue *queue = &io->ci_queue; + int rc; + + CLOBINVRNT(env, obj, ccc_object_invariant(obj)); + LASSERT(slice->cpl_obj == obj); + + if (sbi->ll_ra_info.ra_max_pages_per_file && + sbi->ll_ra_info.ra_max_pages) + ras_update(sbi, inode, ras, page->cp_index, + cp->cpg_defer_uptodate); + + /* Sanity check whether the page is protected by a lock. */ + rc = cl_page_is_under_lock(env, io, page); + if (rc != -EBUSY) { + CL_PAGE_HEADER(D_WARNING, env, page, "%s: %d\n", + rc == -ENODATA ? "without a lock" : + "match failed", rc); + if (rc != -ENODATA) + return rc; + } + + if (cp->cpg_defer_uptodate) { + cp->cpg_ra_used = 1; + cl_page_export(env, page, 1); + } + /* + * Add page into the queue even when it is marked uptodate above. + * this will unlock it automatically as part of cl_page_list_disown(). + */ + cl_2queue_add(queue, page); + if (sbi->ll_ra_info.ra_max_pages_per_file && + sbi->ll_ra_info.ra_max_pages) + ll_readahead(env, io, ras, + vmpage->mapping, &queue->c2_qin, fd->fd_flags); + + return 0; +} + +static int vvp_page_sync_io(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, struct ccc_page *cp, + enum cl_req_type crt) +{ + struct cl_2queue *queue; + int result; + + LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); + + queue = &io->ci_queue; + cl_2queue_init_page(queue, page); + + result = cl_io_submit_sync(env, io, crt, queue, 0); + LASSERT(cl_page_is_owned(page, io)); + + if (crt == CRT_READ) + /* + * in CRT_WRITE case page is left locked even in case of + * error. + */ + cl_page_list_disown(env, io, &queue->c2_qin); + cl_2queue_fini(env, queue); + + return result; +} + +/** + * Prepare partially written-to page for a write. + */ +static int vvp_io_prepare_partial(const struct lu_env *env, struct cl_io *io, + struct cl_object *obj, struct cl_page *pg, + struct ccc_page *cp, + unsigned from, unsigned to) +{ + struct cl_attr *attr = ccc_env_thread_attr(env); + loff_t offset = cl_offset(obj, pg->cp_index); + int result; + + cl_object_attr_lock(obj); + result = cl_object_attr_get(env, obj, attr); + cl_object_attr_unlock(obj); + if (result == 0) { + /* + * If are writing to a new page, no need to read old data. + * The extent locking will have updated the KMS, and for our + * purposes here we can treat it like i_size. + */ + if (attr->cat_kms <= offset) { + char *kaddr = kmap_atomic(cp->cpg_page); + + memset(kaddr, 0, cl_page_size(obj)); + kunmap_atomic(kaddr); + } else if (cp->cpg_defer_uptodate) + cp->cpg_ra_used = 1; + else + result = vvp_page_sync_io(env, io, pg, cp, CRT_READ); + /* + * In older implementations, obdo_refresh_inode is called here + * to update the inode because the write might modify the + * object info at OST. However, this has been proven useless, + * since LVB functions will be called when user space program + * tries to retrieve inode attribute. Also, see bug 15909 for + * details. 
-jay + */ + if (result == 0) + cl_page_export(env, pg, 1); + } + return result; +} + +static int vvp_io_prepare_write(const struct lu_env *env, + const struct cl_io_slice *ios, + const struct cl_page_slice *slice, + unsigned from, unsigned to) +{ + struct cl_object *obj = slice->cpl_obj; + struct ccc_page *cp = cl2ccc_page(slice); + struct cl_page *pg = slice->cpl_page; + struct page *vmpage = cp->cpg_page; + + int result; + + LINVRNT(cl_page_is_vmlocked(env, pg)); + LASSERT(vmpage->mapping->host == ccc_object_inode(obj)); + + result = 0; + + CL_PAGE_HEADER(D_PAGE, env, pg, "preparing: [%d, %d]\n", from, to); + if (!PageUptodate(vmpage)) { + /* + * We're completely overwriting an existing page, so _don't_ + * set it up to date until commit_write + */ + if (from == 0 && to == PAGE_CACHE_SIZE) { + CL_PAGE_HEADER(D_PAGE, env, pg, "full page write\n"); + POISON_PAGE(page, 0x11); + } else + result = vvp_io_prepare_partial(env, ios->cis_io, obj, + pg, cp, from, to); + } else + CL_PAGE_HEADER(D_PAGE, env, pg, "uptodate\n"); + return result; +} + +static int vvp_io_commit_write(const struct lu_env *env, + const struct cl_io_slice *ios, + const struct cl_page_slice *slice, + unsigned from, unsigned to) +{ + struct cl_object *obj = slice->cpl_obj; + struct cl_io *io = ios->cis_io; + struct ccc_page *cp = cl2ccc_page(slice); + struct cl_page *pg = slice->cpl_page; + struct inode *inode = ccc_object_inode(obj); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ll_inode_info *lli = ll_i2info(inode); + struct page *vmpage = cp->cpg_page; + + int result; + int tallyop; + loff_t size; + + LINVRNT(cl_page_is_vmlocked(env, pg)); + LASSERT(vmpage->mapping->host == inode); + + LU_OBJECT_HEADER(D_INODE, env, &obj->co_lu, "committing page write\n"); + CL_PAGE_HEADER(D_PAGE, env, pg, "committing: [%d, %d]\n", from, to); + + /* + * queue a write for some time in the future the first time we + * dirty the page. + * + * This is different from what other file systems do: they usually + * just mark page (and some of its buffers) dirty and rely on + * balance_dirty_pages() to start a write-back. Lustre wants write-back + * to be started earlier for the following reasons: + * + * (1) with a large number of clients we need to limit the amount + * of cached data on the clients a lot; + * + * (2) large compute jobs generally want compute-only then io-only + * and the IO should complete as quickly as possible; + * + * (3) IO is batched up to the RPC size and is async until the + * client max cache is hit + * (/proc/fs/lustre/osc/OSC.../max_dirty_mb) + * + */ + if (!PageDirty(vmpage)) { + tallyop = LPROC_LL_DIRTY_MISSES; + result = cl_page_cache_add(env, io, pg, CRT_WRITE); + if (result == 0) { + /* page was added into cache successfully. */ + set_page_dirty(vmpage); + vvp_write_pending(cl2ccc(obj), cp); + } else if (result == -EDQUOT) { + pgoff_t last_index = i_size_read(inode) >> PAGE_CACHE_SHIFT; + bool need_clip = true; + + /* + * Client ran out of disk space grant. Possible + * strategies are: + * + * (a) do a sync write, renewing grant; + * + * (b) stop writing on this stripe, switch to the + * next one. + * + * (b) is a part of "parallel io" design that is the + * ultimate goal. (a) is what "old" client did, and + * what the new code continues to do for the time + * being. 
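When the commit path below hits the -EDQUOT fallback, the dirty range is widened before the synchronous write: a page lying entirely below i_size is written whole, while the page containing i_size is written at least up to the in-page EOF offset. A compact model of that range adjustment follows, an editor's aside rather than part of the imported patch, assuming 4 KiB pages.

/*
 * Editor's illustrative sketch, not part of the Lustre patch: the range
 * widening done on the -EDQUOT fallback in vvp_io_commit_write(),
 * assuming 4 KiB pages.
 */
#include <assert.h>

#define MODEL_PAGE_SIZE	4096UL

/* returns the 'to' offset the synchronous write should cover */
static unsigned long clip_to(unsigned long page_index,
			     unsigned long long i_size, unsigned long to)
{
	unsigned long last_index = i_size / MODEL_PAGE_SIZE;

	if (last_index > page_index)
		return MODEL_PAGE_SIZE;		/* page fully below EOF */
	if (last_index == page_index) {
		unsigned long size_to = i_size % MODEL_PAGE_SIZE;

		if (to < size_to)
			to = size_to;		/* cover up to EOF within the page */
	}
	return to;
}

int main(void)
{
	/* i_size = 30000: page 5 lies entirely below EOF, write it whole */
	assert(clip_to(5, 30000, 100) == 4096);
	/* i_size = 22000 ends inside page 5 at in-page offset 1520 */
	assert(clip_to(5, 22000, 100) == 1520);
	return 0;
}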
+ */ + if (last_index > pg->cp_index) { + to = PAGE_CACHE_SIZE; + need_clip = false; + } else if (last_index == pg->cp_index) { + int size_to = i_size_read(inode) & ~CFS_PAGE_MASK; + if (to < size_to) + to = size_to; + } + if (need_clip) + cl_page_clip(env, pg, 0, to); + result = vvp_page_sync_io(env, io, pg, cp, CRT_WRITE); + if (result) + CERROR("Write page %lu of inode %p failed %d\n", + pg->cp_index, inode, result); + } + } else { + tallyop = LPROC_LL_DIRTY_HITS; + result = 0; + } + ll_stats_ops_tally(sbi, tallyop, 1); + + /* Inode should be marked DIRTY even if no new page was marked DIRTY + * because page could have been not flushed between 2 modifications. + * It is important the file is marked DIRTY as soon as the I/O is done + * Indeed, when cache is flushed, file could be already closed and it + * is too late to warn the MDT. + * It is acceptable that file is marked DIRTY even if I/O is dropped + * for some reasons before being flushed to OST. + */ + if (result == 0) { + spin_lock(&lli->lli_lock); + lli->lli_flags |= LLIF_DATA_MODIFIED; + spin_unlock(&lli->lli_lock); + } + + size = cl_offset(obj, pg->cp_index) + to; + + ll_inode_size_lock(inode); + if (result == 0) { + if (size > i_size_read(inode)) { + cl_isize_write_nolock(inode, size); + CDEBUG(D_VFSTRACE, DFID" updating i_size %lu\n", + PFID(lu_object_fid(&obj->co_lu)), + (unsigned long)size); + } + cl_page_export(env, pg, 1); + } else { + if (size > i_size_read(inode)) + cl_page_discard(env, io, pg); + } + ll_inode_size_unlock(inode); + return result; +} + +static const struct cl_io_operations vvp_io_ops = { + .op = { + [CIT_READ] = { + .cio_fini = vvp_io_read_fini, + .cio_lock = vvp_io_read_lock, + .cio_start = vvp_io_read_start, + .cio_advance = ccc_io_advance + }, + [CIT_WRITE] = { + .cio_fini = vvp_io_fini, + .cio_lock = vvp_io_write_lock, + .cio_start = vvp_io_write_start, + .cio_advance = ccc_io_advance + }, + [CIT_SETATTR] = { + .cio_fini = vvp_io_setattr_fini, + .cio_iter_init = vvp_io_setattr_iter_init, + .cio_lock = vvp_io_setattr_lock, + .cio_start = vvp_io_setattr_start, + .cio_end = vvp_io_setattr_end + }, + [CIT_FAULT] = { + .cio_fini = vvp_io_fault_fini, + .cio_iter_init = vvp_io_fault_iter_init, + .cio_lock = vvp_io_fault_lock, + .cio_start = vvp_io_fault_start, + .cio_end = ccc_io_end + }, + [CIT_FSYNC] = { + .cio_start = vvp_io_fsync_start, + .cio_fini = vvp_io_fini + }, + [CIT_MISC] = { + .cio_fini = vvp_io_fini + } + }, + .cio_read_page = vvp_io_read_page, + .cio_prepare_write = vvp_io_prepare_write, + .cio_commit_write = vvp_io_commit_write +}; + +int vvp_io_init(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io) +{ + struct vvp_io *vio = vvp_env_io(env); + struct ccc_io *cio = ccc_env_io(env); + struct inode *inode = ccc_object_inode(obj); + int result; + + CLOBINVRNT(env, obj, ccc_object_invariant(obj)); + + CDEBUG(D_VFSTRACE, DFID + " ignore/verify layout %d/%d, layout version %d restore needed %d\n", + PFID(lu_object_fid(&obj->co_lu)), + io->ci_ignore_layout, io->ci_verify_layout, + cio->cui_layout_gen, io->ci_restore_needed); + + CL_IO_SLICE_CLEAN(cio, cui_cl); + cl_io_slice_add(io, &cio->cui_cl, obj, &vvp_io_ops); + vio->cui_ra_window_set = 0; + result = 0; + if (io->ci_type == CIT_READ || io->ci_type == CIT_WRITE) { + size_t count; + struct ll_inode_info *lli = ll_i2info(inode); + + count = io->u.ci_rw.crw_count; + /* "If nbyte is 0, read() will return 0 and have no other + * results." 
-- Single Unix Spec */ + if (count == 0) + result = 1; + else + cio->cui_tot_count = count; + + /* for read/write, we store the jobid in the inode, and + * it'll be fetched by osc when building RPC. + * + * it's not accurate if the file is shared by different + * jobs. + */ + lustre_get_jobid(lli->lli_jobid); + } else if (io->ci_type == CIT_SETATTR) { + if (!cl_io_is_trunc(io)) + io->ci_lockreq = CILR_MANDATORY; + } + + /* ignore layout change for generic CIT_MISC but not for glimpse. + * io context for glimpse must set ci_verify_layout to true, + * see cl_glimpse_size0() for details. */ + if (io->ci_type == CIT_MISC && !io->ci_verify_layout) + io->ci_ignore_layout = 1; + + /* Enqueue layout lock and get layout version. We need to do this + * even for operations requiring to open file, such as read and write, + * because it might not grant layout lock in IT_OPEN. */ + if (result == 0 && !io->ci_ignore_layout) { + result = ll_layout_refresh(inode, &cio->cui_layout_gen); + if (result == -ENOENT) + /* If the inode on MDS has been removed, but the objects + * on OSTs haven't been destroyed (async unlink), layout + * fetch will return -ENOENT, we'd ignore this error + * and continue with dirty flush. LU-3230. */ + result = 0; + if (result < 0) + CERROR("%s: refresh file layout " DFID " error %d.\n", + ll_get_fsname(inode->i_sb, NULL, 0), + PFID(lu_object_fid(&obj->co_lu)), result); + } + + return result; +} + +static struct vvp_io *cl2vvp_io(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + /* Calling just for assertion */ + cl2ccc_io(env, slice); + return vvp_env_io(env); +} diff --git a/kernel/drivers/staging/lustre/lustre/llite/vvp_lock.c b/kernel/drivers/staging/lustre/lustre/llite/vvp_lock.c new file mode 100644 index 000000000..f354e82d4 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/vvp_lock.c @@ -0,0 +1,85 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_lock for VVP layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LLITE + + +#include "../include/obd.h" +#include "../include/lustre_lite.h" + +#include "vvp_internal.h" + +/***************************************************************************** + * + * Vvp lock functions. 
+ * + */ + +/** + * Estimates lock value for the purpose of managing the lock cache during + * memory shortages. + * + * Locks for memory mapped files are almost infinitely precious, others are + * junk. "Mapped locks" are heavy, but not infinitely heavy, so that they are + * ordered within themselves by weights assigned from other layers. + */ +static unsigned long vvp_lock_weigh(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct ccc_object *cob = cl2ccc(slice->cls_obj); + + return atomic_read(&cob->cob_mmap_cnt) > 0 ? ~0UL >> 2 : 0; +} + +static const struct cl_lock_operations vvp_lock_ops = { + .clo_delete = ccc_lock_delete, + .clo_fini = ccc_lock_fini, + .clo_enqueue = ccc_lock_enqueue, + .clo_wait = ccc_lock_wait, + .clo_use = ccc_lock_use, + .clo_unuse = ccc_lock_unuse, + .clo_fits_into = ccc_lock_fits_into, + .clo_state = ccc_lock_state, + .clo_weigh = vvp_lock_weigh +}; + +int vvp_lock_init(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io) +{ + return ccc_lock_init(env, obj, lock, io, &vvp_lock_ops); +} diff --git a/kernel/drivers/staging/lustre/lustre/llite/vvp_object.c b/kernel/drivers/staging/lustre/lustre/llite/vvp_object.c new file mode 100644 index 000000000..b6f6d4cb6 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/vvp_object.c @@ -0,0 +1,201 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * cl_object implementation for VVP layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LLITE + + +#include "../../include/linux/libcfs/libcfs.h" + +#include "../include/obd.h" +#include "../include/lustre_lite.h" + +#include "vvp_internal.h" + +/***************************************************************************** + * + * Object operations. + * + */ + +static int vvp_object_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o) +{ + struct ccc_object *obj = lu2ccc(o); + struct inode *inode = obj->cob_inode; + struct ll_inode_info *lli; + + (*p)(env, cookie, "(%s %d %d) inode: %p ", + list_empty(&obj->cob_pending_list) ? 
"-" : "+", + obj->cob_transient_pages, atomic_read(&obj->cob_mmap_cnt), + inode); + if (inode) { + lli = ll_i2info(inode); + (*p)(env, cookie, "%lu/%u %o %u %d %p "DFID, + inode->i_ino, inode->i_generation, inode->i_mode, + inode->i_nlink, atomic_read(&inode->i_count), + lli->lli_clob, PFID(&lli->lli_fid)); + } + return 0; +} + +static int vvp_attr_get(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr) +{ + struct inode *inode = ccc_object_inode(obj); + + /* + * lov overwrites most of these fields in + * lov_attr_get()->...lov_merge_lvb_kms(), except when inode + * attributes are newer. + */ + + attr->cat_size = i_size_read(inode); + attr->cat_mtime = LTIME_S(inode->i_mtime); + attr->cat_atime = LTIME_S(inode->i_atime); + attr->cat_ctime = LTIME_S(inode->i_ctime); + attr->cat_blocks = inode->i_blocks; + attr->cat_uid = from_kuid(&init_user_ns, inode->i_uid); + attr->cat_gid = from_kgid(&init_user_ns, inode->i_gid); + /* KMS is not known by this layer */ + return 0; /* layers below have to fill in the rest */ +} + +static int vvp_attr_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid) +{ + struct inode *inode = ccc_object_inode(obj); + + if (valid & CAT_UID) + inode->i_uid = make_kuid(&init_user_ns, attr->cat_uid); + if (valid & CAT_GID) + inode->i_gid = make_kgid(&init_user_ns, attr->cat_gid); + if (valid & CAT_ATIME) + LTIME_S(inode->i_atime) = attr->cat_atime; + if (valid & CAT_MTIME) + LTIME_S(inode->i_mtime) = attr->cat_mtime; + if (valid & CAT_CTIME) + LTIME_S(inode->i_ctime) = attr->cat_ctime; + if (0 && valid & CAT_SIZE) + cl_isize_write_nolock(inode, attr->cat_size); + /* not currently necessary */ + if (0 && valid & (CAT_UID|CAT_GID|CAT_SIZE)) + mark_inode_dirty(inode); + return 0; +} + +static int vvp_conf_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf) +{ + struct ll_inode_info *lli = ll_i2info(conf->coc_inode); + + if (conf->coc_opc == OBJECT_CONF_INVALIDATE) { + CDEBUG(D_VFSTRACE, DFID ": losing layout lock\n", + PFID(&lli->lli_fid)); + + ll_layout_version_set(lli, LL_LAYOUT_GEN_NONE); + + /* Clean up page mmap for this inode. + * The reason for us to do this is that if the page has + * already been installed into memory space, the process + * can access it without interacting with lustre, so this + * page may be stale due to layout change, and the process + * will never be notified. + * This operation is expensive but mmap processes have to pay + * a price themselves. 
*/ + unmap_mapping_range(conf->coc_inode->i_mapping, + 0, OBD_OBJECT_EOF, 0); + + return 0; + } + + if (conf->coc_opc != OBJECT_CONF_SET) + return 0; + + if (conf->u.coc_md != NULL && conf->u.coc_md->lsm != NULL) { + CDEBUG(D_VFSTRACE, DFID ": layout version change: %u -> %u\n", + PFID(&lli->lli_fid), lli->lli_layout_gen, + conf->u.coc_md->lsm->lsm_layout_gen); + + lli->lli_has_smd = lsm_has_objects(conf->u.coc_md->lsm); + ll_layout_version_set(lli, conf->u.coc_md->lsm->lsm_layout_gen); + } else { + CDEBUG(D_VFSTRACE, DFID ": layout nuked: %u.\n", + PFID(&lli->lli_fid), lli->lli_layout_gen); + + lli->lli_has_smd = false; + ll_layout_version_set(lli, LL_LAYOUT_GEN_EMPTY); + } + return 0; +} + +static const struct cl_object_operations vvp_ops = { + .coo_page_init = vvp_page_init, + .coo_lock_init = vvp_lock_init, + .coo_io_init = vvp_io_init, + .coo_attr_get = vvp_attr_get, + .coo_attr_set = vvp_attr_set, + .coo_conf_set = vvp_conf_set, + .coo_glimpse = ccc_object_glimpse +}; + +static const struct lu_object_operations vvp_lu_obj_ops = { + .loo_object_init = ccc_object_init, + .loo_object_free = ccc_object_free, + .loo_object_print = vvp_object_print +}; + +struct ccc_object *cl_inode2ccc(struct inode *inode) +{ + struct cl_inode_info *lli = cl_i2info(inode); + struct cl_object *obj = lli->lli_clob; + struct lu_object *lu; + + LASSERT(obj != NULL); + lu = lu_object_locate(obj->co_lu.lo_header, &vvp_device_type); + LASSERT(lu != NULL); + return lu2ccc(lu); +} + +struct lu_object *vvp_object_alloc(const struct lu_env *env, + const struct lu_object_header *hdr, + struct lu_device *dev) +{ + return ccc_object_alloc(env, hdr, dev, &vvp_ops, &vvp_lu_obj_ops); +} diff --git a/kernel/drivers/staging/lustre/lustre/llite/vvp_page.c b/kernel/drivers/staging/lustre/lustre/llite/vvp_page.c new file mode 100644 index 000000000..954ed08c6 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/vvp_page.c @@ -0,0 +1,551 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_page for VVP layer. 
+ * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_LLITE + + +#include "../include/obd.h" +#include "../include/lustre_lite.h" + +#include "vvp_internal.h" + +/***************************************************************************** + * + * Page operations. + * + */ + +static void vvp_page_fini_common(struct ccc_page *cp) +{ + struct page *vmpage = cp->cpg_page; + + LASSERT(vmpage != NULL); + page_cache_release(vmpage); +} + +static void vvp_page_fini(const struct lu_env *env, + struct cl_page_slice *slice) +{ + struct ccc_page *cp = cl2ccc_page(slice); + struct page *vmpage = cp->cpg_page; + + /* + * vmpage->private was already cleared when page was moved into + * VPG_FREEING state. + */ + LASSERT((struct cl_page *)vmpage->private != slice->cpl_page); + vvp_page_fini_common(cp); +} + +static int vvp_page_own(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *io, + int nonblock) +{ + struct ccc_page *vpg = cl2ccc_page(slice); + struct page *vmpage = vpg->cpg_page; + + LASSERT(vmpage != NULL); + if (nonblock) { + if (!trylock_page(vmpage)) + return -EAGAIN; + + if (unlikely(PageWriteback(vmpage))) { + unlock_page(vmpage); + return -EAGAIN; + } + + return 0; + } + + lock_page(vmpage); + wait_on_page_writeback(vmpage); + return 0; +} + +static void vvp_page_assume(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + struct page *vmpage = cl2vm_page(slice); + + LASSERT(vmpage != NULL); + LASSERT(PageLocked(vmpage)); + wait_on_page_writeback(vmpage); +} + +static void vvp_page_unassume(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + struct page *vmpage = cl2vm_page(slice); + + LASSERT(vmpage != NULL); + LASSERT(PageLocked(vmpage)); +} + +static void vvp_page_disown(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *io) +{ + struct page *vmpage = cl2vm_page(slice); + + LASSERT(vmpage != NULL); + LASSERT(PageLocked(vmpage)); + + unlock_page(cl2vm_page(slice)); +} + +static void vvp_page_discard(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + struct page *vmpage = cl2vm_page(slice); + struct address_space *mapping; + struct ccc_page *cpg = cl2ccc_page(slice); + + LASSERT(vmpage != NULL); + LASSERT(PageLocked(vmpage)); + + mapping = vmpage->mapping; + + if (cpg->cpg_defer_uptodate && !cpg->cpg_ra_used) + ll_ra_stats_inc(mapping, RA_STAT_DISCARDED); + + /* + * truncate_complete_page() calls + * a_ops->invalidatepage()->cl_page_delete()->vvp_page_delete(). + */ + truncate_complete_page(mapping, vmpage); +} + +static int vvp_page_unmap(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + struct page *vmpage = cl2vm_page(slice); + __u64 offset; + + LASSERT(vmpage != NULL); + LASSERT(PageLocked(vmpage)); + + offset = vmpage->index << PAGE_CACHE_SHIFT; + + /* + * XXX is it safe to call this with the page lock held? 
+ */ + ll_teardown_mmaps(vmpage->mapping, offset, offset + PAGE_CACHE_SIZE); + return 0; +} + +static void vvp_page_delete(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + struct page *vmpage = cl2vm_page(slice); + struct inode *inode = vmpage->mapping->host; + struct cl_object *obj = slice->cpl_obj; + + LASSERT(PageLocked(vmpage)); + LASSERT((struct cl_page *)vmpage->private == slice->cpl_page); + LASSERT(inode == ccc_object_inode(obj)); + + vvp_write_complete(cl2ccc(obj), cl2ccc_page(slice)); + ClearPagePrivate(vmpage); + vmpage->private = 0; + /* + * Reference from vmpage to cl_page is removed, but the reference back + * is still here. It is removed later in vvp_page_fini(). + */ +} + +static void vvp_page_export(const struct lu_env *env, + const struct cl_page_slice *slice, + int uptodate) +{ + struct page *vmpage = cl2vm_page(slice); + + LASSERT(vmpage != NULL); + LASSERT(PageLocked(vmpage)); + if (uptodate) + SetPageUptodate(vmpage); + else + ClearPageUptodate(vmpage); +} + +static int vvp_page_is_vmlocked(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + return PageLocked(cl2vm_page(slice)) ? -EBUSY : -ENODATA; +} + +static int vvp_page_prep_read(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + /* Skip the page already marked as PG_uptodate. */ + return PageUptodate(cl2vm_page(slice)) ? -EALREADY : 0; +} + +static int vvp_page_prep_write(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + struct page *vmpage = cl2vm_page(slice); + + LASSERT(PageLocked(vmpage)); + LASSERT(!PageDirty(vmpage)); + + set_page_writeback(vmpage); + vvp_write_pending(cl2ccc(slice->cpl_obj), cl2ccc_page(slice)); + + return 0; +} + +/** + * Handles page transfer errors at VM level. + * + * This takes inode as a separate argument, because inode on which error is to + * be set can be different from \a vmpage inode in case of direct-io. 
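As the comment above describes, transfer errors are parked on the inode's address space rather than returned to the writer directly, with -ENOSPC kept distinct from generic I/O errors and the dirty-page-discarded warning emitted at most once per object. The following is a simplified model of that bookkeeping, an editor's aside rather than part of the imported patch; the types and names are stand-ins.

/*
 * Editor's illustrative sketch, not part of the Lustre patch: the error
 * classification that vvp_vmpage_error() performs.  -ENOSPC is remembered
 * separately so a later fsync() can report the more specific errno, and
 * the "dirty page discarded" warning fires once per object.
 */
#include <errno.h>
#include <stdbool.h>

enum model_as_error { MODEL_AS_OK, MODEL_AS_ENOSPC, MODEL_AS_EIO };

struct model_object {
	enum model_as_error pending_error;	/* reported at fsync/close time */
	bool discard_warned;			/* warn only once per object */
};

/* returns true when the caller should log the discard warning */
static bool record_io_result(struct model_object *obj, int ioret)
{
	if (ioret == 0) {
		obj->discard_warned = false;	/* healthy again */
		return false;
	}
	obj->pending_error = (ioret == -ENOSPC) ? MODEL_AS_ENOSPC : MODEL_AS_EIO;
	if ((ioret == -ESHUTDOWN || ioret == -EINTR) && !obj->discard_warned) {
		obj->discard_warned = true;
		return true;
	}
	return false;
}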
+ */ +static void vvp_vmpage_error(struct inode *inode, struct page *vmpage, int ioret) +{ + struct ccc_object *obj = cl_inode2ccc(inode); + + if (ioret == 0) { + ClearPageError(vmpage); + obj->cob_discard_page_warned = 0; + } else { + SetPageError(vmpage); + if (ioret == -ENOSPC) + set_bit(AS_ENOSPC, &inode->i_mapping->flags); + else + set_bit(AS_EIO, &inode->i_mapping->flags); + + if ((ioret == -ESHUTDOWN || ioret == -EINTR) && + obj->cob_discard_page_warned == 0) { + obj->cob_discard_page_warned = 1; + ll_dirty_page_discard_warn(vmpage, ioret); + } + } +} + +static void vvp_page_completion_read(const struct lu_env *env, + const struct cl_page_slice *slice, + int ioret) +{ + struct ccc_page *cp = cl2ccc_page(slice); + struct page *vmpage = cp->cpg_page; + struct cl_page *page = cl_page_top(slice->cpl_page); + struct inode *inode = ccc_object_inode(page->cp_obj); + + LASSERT(PageLocked(vmpage)); + CL_PAGE_HEADER(D_PAGE, env, page, "completing READ with %d\n", ioret); + + if (cp->cpg_defer_uptodate) + ll_ra_count_put(ll_i2sbi(inode), 1); + + if (ioret == 0) { + if (!cp->cpg_defer_uptodate) + cl_page_export(env, page, 1); + } else + cp->cpg_defer_uptodate = 0; + + if (page->cp_sync_io == NULL) + unlock_page(vmpage); +} + +static void vvp_page_completion_write(const struct lu_env *env, + const struct cl_page_slice *slice, + int ioret) +{ + struct ccc_page *cp = cl2ccc_page(slice); + struct cl_page *pg = slice->cpl_page; + struct page *vmpage = cp->cpg_page; + + LASSERT(ergo(pg->cp_sync_io != NULL, PageLocked(vmpage))); + LASSERT(PageWriteback(vmpage)); + + CL_PAGE_HEADER(D_PAGE, env, pg, "completing WRITE with %d\n", ioret); + + /* + * TODO: Actually it makes sense to add the page into oap pending + * list again and so that we don't need to take the page out from + * SoM write pending list, if we just meet a recoverable error, + * -ENOMEM, etc. + * To implement this, we just need to return a non zero value in + * ->cpo_completion method. The underlying transfer should be notified + * and then re-add the page into pending transfer queue. -jay + */ + + cp->cpg_write_queued = 0; + vvp_write_complete(cl2ccc(slice->cpl_obj), cp); + + /* + * Only mark the page error only when it's an async write because + * applications won't wait for IO to finish. + */ + if (pg->cp_sync_io == NULL) + vvp_vmpage_error(ccc_object_inode(pg->cp_obj), vmpage, ioret); + + end_page_writeback(vmpage); +} + +/** + * Implements cl_page_operations::cpo_make_ready() method. + * + * This is called to yank a page from the transfer cache and to send it out as + * a part of transfer. This function try-locks the page. If try-lock failed, + * page is owned by some concurrent IO, and should be skipped (this is bad, + * but hopefully rare situation, as it usually results in transfer being + * shorter than possible). + * + * \retval 0 success, page can be placed into transfer + * + * \retval -EAGAIN page is either used by concurrent IO has been + * truncated. Skip it. + */ +static int vvp_page_make_ready(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + struct page *vmpage = cl2vm_page(slice); + struct cl_page *pg = slice->cpl_page; + int result = 0; + + lock_page(vmpage); + if (clear_page_dirty_for_io(vmpage)) { + LASSERT(pg->cp_state == CPS_CACHED); + /* This actually clears the dirty bit in the radix + * tree. 
*/ + set_page_writeback(vmpage); + vvp_write_pending(cl2ccc(slice->cpl_obj), + cl2ccc_page(slice)); + CL_PAGE_HEADER(D_PAGE, env, pg, "readied\n"); + } else if (pg->cp_state == CPS_PAGEOUT) { + /* is it possible for osc_flush_async_page() to already + * make it ready? */ + result = -EALREADY; + } else { + CL_PAGE_DEBUG(D_ERROR, env, pg, "Unexpecting page state %d.\n", + pg->cp_state); + LBUG(); + } + unlock_page(vmpage); + return result; +} + +static int vvp_page_print(const struct lu_env *env, + const struct cl_page_slice *slice, + void *cookie, lu_printer_t printer) +{ + struct ccc_page *vp = cl2ccc_page(slice); + struct page *vmpage = vp->cpg_page; + + (*printer)(env, cookie, LUSTRE_VVP_NAME "-page@%p(%d:%d:%d) vm@%p ", + vp, vp->cpg_defer_uptodate, vp->cpg_ra_used, + vp->cpg_write_queued, vmpage); + if (vmpage != NULL) { + (*printer)(env, cookie, "%lx %d:%d %lx %lu %slru", + (long)vmpage->flags, page_count(vmpage), + page_mapcount(vmpage), vmpage->private, + page_index(vmpage), + list_empty(&vmpage->lru) ? "not-" : ""); + } + (*printer)(env, cookie, "\n"); + return 0; +} + +static const struct cl_page_operations vvp_page_ops = { + .cpo_own = vvp_page_own, + .cpo_assume = vvp_page_assume, + .cpo_unassume = vvp_page_unassume, + .cpo_disown = vvp_page_disown, + .cpo_vmpage = ccc_page_vmpage, + .cpo_discard = vvp_page_discard, + .cpo_delete = vvp_page_delete, + .cpo_unmap = vvp_page_unmap, + .cpo_export = vvp_page_export, + .cpo_is_vmlocked = vvp_page_is_vmlocked, + .cpo_fini = vvp_page_fini, + .cpo_print = vvp_page_print, + .cpo_is_under_lock = ccc_page_is_under_lock, + .io = { + [CRT_READ] = { + .cpo_prep = vvp_page_prep_read, + .cpo_completion = vvp_page_completion_read, + .cpo_make_ready = ccc_fail, + }, + [CRT_WRITE] = { + .cpo_prep = vvp_page_prep_write, + .cpo_completion = vvp_page_completion_write, + .cpo_make_ready = vvp_page_make_ready, + } + } +}; + +static void vvp_transient_page_verify(const struct cl_page *page) +{ + struct inode *inode = ccc_object_inode(page->cp_obj); + + LASSERT(!mutex_trylock(&inode->i_mutex)); +} + +static int vvp_transient_page_own(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused, int nonblock) +{ + vvp_transient_page_verify(slice->cpl_page); + return 0; +} + +static void vvp_transient_page_assume(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + vvp_transient_page_verify(slice->cpl_page); +} + +static void vvp_transient_page_unassume(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + vvp_transient_page_verify(slice->cpl_page); +} + +static void vvp_transient_page_disown(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + vvp_transient_page_verify(slice->cpl_page); +} + +static void vvp_transient_page_discard(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + struct cl_page *page = slice->cpl_page; + + vvp_transient_page_verify(slice->cpl_page); + + /* + * For transient pages, remove it from the radix tree. + */ + cl_page_delete(env, page); +} + +static int vvp_transient_page_is_vmlocked(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + struct inode *inode = ccc_object_inode(slice->cpl_obj); + int locked; + + locked = !mutex_trylock(&inode->i_mutex); + if (!locked) + mutex_unlock(&inode->i_mutex); + return locked ? 
-EBUSY : -ENODATA; +} + +static void +vvp_transient_page_completion(const struct lu_env *env, + const struct cl_page_slice *slice, + int ioret) +{ + vvp_transient_page_verify(slice->cpl_page); +} + +static void vvp_transient_page_fini(const struct lu_env *env, + struct cl_page_slice *slice) +{ + struct ccc_page *cp = cl2ccc_page(slice); + struct cl_page *clp = slice->cpl_page; + struct ccc_object *clobj = cl2ccc(clp->cp_obj); + + vvp_page_fini_common(cp); + LASSERT(!mutex_trylock(&clobj->cob_inode->i_mutex)); + clobj->cob_transient_pages--; +} + +static const struct cl_page_operations vvp_transient_page_ops = { + .cpo_own = vvp_transient_page_own, + .cpo_assume = vvp_transient_page_assume, + .cpo_unassume = vvp_transient_page_unassume, + .cpo_disown = vvp_transient_page_disown, + .cpo_discard = vvp_transient_page_discard, + .cpo_vmpage = ccc_page_vmpage, + .cpo_fini = vvp_transient_page_fini, + .cpo_is_vmlocked = vvp_transient_page_is_vmlocked, + .cpo_print = vvp_page_print, + .cpo_is_under_lock = ccc_page_is_under_lock, + .io = { + [CRT_READ] = { + .cpo_prep = ccc_transient_page_prep, + .cpo_completion = vvp_transient_page_completion, + }, + [CRT_WRITE] = { + .cpo_prep = ccc_transient_page_prep, + .cpo_completion = vvp_transient_page_completion, + } + } +}; + +int vvp_page_init(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, struct page *vmpage) +{ + struct ccc_page *cpg = cl_object_page_slice(obj, page); + + CLOBINVRNT(env, obj, ccc_object_invariant(obj)); + + cpg->cpg_page = vmpage; + page_cache_get(vmpage); + + INIT_LIST_HEAD(&cpg->cpg_pending_linkage); + if (page->cp_type == CPT_CACHEABLE) { + SetPagePrivate(vmpage); + vmpage->private = (unsigned long)page; + cl_page_slice_add(page, &cpg->cpg_cl, obj, + &vvp_page_ops); + } else { + struct ccc_object *clobj = cl2ccc(obj); + + LASSERT(!mutex_trylock(&clobj->cob_inode->i_mutex)); + cl_page_slice_add(page, &cpg->cpg_cl, obj, + &vvp_transient_page_ops); + clobj->cob_transient_pages++; + } + return 0; +} diff --git a/kernel/drivers/staging/lustre/lustre/llite/xattr.c b/kernel/drivers/staging/lustre/lustre/llite/xattr.c new file mode 100644 index 000000000..e0fcbe139 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/xattr.c @@ -0,0 +1,621 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_LLITE + +#include "../include/obd_support.h" +#include "../include/lustre_lite.h" +#include "../include/lustre_dlm.h" +#include "../include/lustre_ver.h" +#include "../include/lustre_eacl.h" + +#include "llite_internal.h" + +#define XATTR_USER_T (1) +#define XATTR_TRUSTED_T (2) +#define XATTR_SECURITY_T (3) +#define XATTR_ACL_ACCESS_T (4) +#define XATTR_ACL_DEFAULT_T (5) +#define XATTR_LUSTRE_T (6) +#define XATTR_OTHER_T (7) + +static +int get_xattr_type(const char *name) +{ + if (!strcmp(name, POSIX_ACL_XATTR_ACCESS)) + return XATTR_ACL_ACCESS_T; + + if (!strcmp(name, POSIX_ACL_XATTR_DEFAULT)) + return XATTR_ACL_DEFAULT_T; + + if (!strncmp(name, XATTR_USER_PREFIX, + sizeof(XATTR_USER_PREFIX) - 1)) + return XATTR_USER_T; + + if (!strncmp(name, XATTR_TRUSTED_PREFIX, + sizeof(XATTR_TRUSTED_PREFIX) - 1)) + return XATTR_TRUSTED_T; + + if (!strncmp(name, XATTR_SECURITY_PREFIX, + sizeof(XATTR_SECURITY_PREFIX) - 1)) + return XATTR_SECURITY_T; + + if (!strncmp(name, XATTR_LUSTRE_PREFIX, + sizeof(XATTR_LUSTRE_PREFIX) - 1)) + return XATTR_LUSTRE_T; + + return XATTR_OTHER_T; +} + +static +int xattr_type_filter(struct ll_sb_info *sbi, int xattr_type) +{ + if ((xattr_type == XATTR_ACL_ACCESS_T || + xattr_type == XATTR_ACL_DEFAULT_T) && + !(sbi->ll_flags & LL_SBI_ACL)) + return -EOPNOTSUPP; + + if (xattr_type == XATTR_USER_T && !(sbi->ll_flags & LL_SBI_USER_XATTR)) + return -EOPNOTSUPP; + if (xattr_type == XATTR_TRUSTED_T && !capable(CFS_CAP_SYS_ADMIN)) + return -EPERM; + if (xattr_type == XATTR_OTHER_T) + return -EOPNOTSUPP; + + return 0; +} + +static +int ll_setxattr_common(struct inode *inode, const char *name, + const void *value, size_t size, + int flags, __u64 valid) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *req = NULL; + int xattr_type, rc; + struct obd_capa *oc; +#ifdef CONFIG_FS_POSIX_ACL + struct rmtacl_ctl_entry *rce = NULL; + posix_acl_xattr_header *new_value = NULL; + ext_acl_xattr_header *acl = NULL; +#endif + const char *pv = value; + + xattr_type = get_xattr_type(name); + rc = xattr_type_filter(sbi, xattr_type); + if (rc) + return rc; + + if ((xattr_type == XATTR_ACL_ACCESS_T || + xattr_type == XATTR_ACL_DEFAULT_T) && + !inode_owner_or_capable(inode)) + return -EPERM; + + /* b10667: ignore lustre special xattr for now */ + if ((xattr_type == XATTR_TRUSTED_T && strcmp(name, "trusted.lov") == 0) || + (xattr_type == XATTR_LUSTRE_T && strcmp(name, "lustre.lov") == 0)) + return 0; + + /* b15587: ignore security.capability xattr for now */ + if ((xattr_type == XATTR_SECURITY_T && + strcmp(name, "security.capability") == 0)) + return 0; + + /* LU-549: Disable security.selinux when selinux is disabled */ + if (xattr_type == XATTR_SECURITY_T && !selinux_is_enabled() && + strcmp(name, "security.selinux") == 0) + return -EOPNOTSUPP; + +#ifdef CONFIG_FS_POSIX_ACL + if (sbi->ll_flags & LL_SBI_RMT_CLIENT && + (xattr_type == XATTR_ACL_ACCESS_T || + xattr_type == XATTR_ACL_DEFAULT_T)) { + rce = rct_search(&sbi->ll_rct, current_pid()); + if (rce == NULL || + (rce->rce_ops != RMT_LSETFACL && + rce->rce_ops != RMT_RSETFACL)) + return -EOPNOTSUPP; + + if (rce->rce_ops == RMT_LSETFACL) { + struct eacl_entry *ee; + + ee = et_search_del(&sbi->ll_et, current_pid(), + ll_inode2fid(inode), xattr_type); + LASSERT(ee != NULL); + if (valid & OBD_MD_FLXATTR) { + acl = lustre_acl_xattr_merge2ext( + 
(posix_acl_xattr_header *)value, + size, ee->ee_acl); + if (IS_ERR(acl)) { + ee_free(ee); + return PTR_ERR(acl); + } + size = CFS_ACL_XATTR_SIZE(\ + le32_to_cpu(acl->a_count), \ + ext_acl_xattr); + pv = (const char *)acl; + } + ee_free(ee); + } else if (rce->rce_ops == RMT_RSETFACL) { + size = lustre_posix_acl_xattr_filter( + (posix_acl_xattr_header *)value, + size, &new_value); + if (unlikely(size < 0)) + return size; + + pv = (const char *)new_value; + } else + return -EOPNOTSUPP; + + valid |= rce_ops2valid(rce->rce_ops); + } +#endif + oc = ll_mdscapa_get(inode); + rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc, + valid, name, pv, size, 0, flags, + ll_i2suppgid(inode), &req); + capa_put(oc); +#ifdef CONFIG_FS_POSIX_ACL + if (new_value != NULL) + lustre_posix_acl_xattr_free(new_value, size); + if (acl != NULL) + lustre_ext_acl_xattr_free(acl); +#endif + if (rc) { + if (rc == -EOPNOTSUPP && xattr_type == XATTR_USER_T) { + LCONSOLE_INFO("Disabling user_xattr feature because it is not supported on the server\n"); + sbi->ll_flags &= ~LL_SBI_USER_XATTR; + } + return rc; + } + + ptlrpc_req_finished(req); + return 0; +} + +int ll_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + struct inode *inode = d_inode(dentry); + + LASSERT(inode); + LASSERT(name); + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), xattr %s\n", + inode->i_ino, inode->i_generation, inode, name); + + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_SETXATTR, 1); + + if ((strncmp(name, XATTR_TRUSTED_PREFIX, + sizeof(XATTR_TRUSTED_PREFIX) - 1) == 0 && + strcmp(name + sizeof(XATTR_TRUSTED_PREFIX) - 1, "lov") == 0) || + (strncmp(name, XATTR_LUSTRE_PREFIX, + sizeof(XATTR_LUSTRE_PREFIX) - 1) == 0 && + strcmp(name + sizeof(XATTR_LUSTRE_PREFIX) - 1, "lov") == 0)) { + struct lov_user_md *lump = (struct lov_user_md *)value; + int rc = 0; + + if (size != 0 && size < sizeof(struct lov_user_md)) + return -EINVAL; + + /* Attributes that are saved via getxattr will always have + * the stripe_offset as 0. Instead, the MDS should be + * allowed to pick the starting OST index. b=17846 */ + if (lump != NULL && lump->lmm_stripe_offset == 0) + lump->lmm_stripe_offset = -1; + + if (lump != NULL && S_ISREG(inode->i_mode)) { + int flags = FMODE_WRITE; + int lum_size = (lump->lmm_magic == LOV_USER_MAGIC_V1) ? 
+ sizeof(*lump) : sizeof(struct lov_user_md_v3); + + rc = ll_lov_setstripe_ea_info(inode, dentry, flags, lump, + lum_size); + /* b10667: rc always be 0 here for now */ + rc = 0; + } else if (S_ISDIR(inode->i_mode)) { + rc = ll_dir_setstripe(inode, lump, 0); + } + + return rc; + + } else if (strcmp(name, XATTR_NAME_LMA) == 0 || + strcmp(name, XATTR_NAME_LINK) == 0) + return 0; + + return ll_setxattr_common(inode, name, value, size, flags, + OBD_MD_FLXATTR); +} + +int ll_removexattr(struct dentry *dentry, const char *name) +{ + struct inode *inode = d_inode(dentry); + + LASSERT(inode); + LASSERT(name); + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), xattr %s\n", + inode->i_ino, inode->i_generation, inode, name); + + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_REMOVEXATTR, 1); + return ll_setxattr_common(inode, name, NULL, 0, 0, + OBD_MD_FLXATTRRM); +} + +static +int ll_getxattr_common(struct inode *inode, const char *name, + void *buffer, size_t size, __u64 valid) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *req = NULL; + struct mdt_body *body; + int xattr_type, rc; + void *xdata; + struct obd_capa *oc; + struct rmtacl_ctl_entry *rce = NULL; + struct ll_inode_info *lli = ll_i2info(inode); + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", + inode->i_ino, inode->i_generation, inode); + + /* listxattr have slightly different behavior from of ext3: + * without 'user_xattr' ext3 will list all xattr names but + * filtered out "^user..*"; we list them all for simplicity. + */ + if (!name) { + xattr_type = XATTR_OTHER_T; + goto do_getxattr; + } + + xattr_type = get_xattr_type(name); + rc = xattr_type_filter(sbi, xattr_type); + if (rc) + return rc; + + /* b15587: ignore security.capability xattr for now */ + if ((xattr_type == XATTR_SECURITY_T && + strcmp(name, "security.capability") == 0)) + return -ENODATA; + + /* LU-549: Disable security.selinux when selinux is disabled */ + if (xattr_type == XATTR_SECURITY_T && !selinux_is_enabled() && + strcmp(name, "security.selinux") == 0) + return -EOPNOTSUPP; + +#ifdef CONFIG_FS_POSIX_ACL + if (sbi->ll_flags & LL_SBI_RMT_CLIENT && + (xattr_type == XATTR_ACL_ACCESS_T || + xattr_type == XATTR_ACL_DEFAULT_T)) { + rce = rct_search(&sbi->ll_rct, current_pid()); + if (rce == NULL || + (rce->rce_ops != RMT_LSETFACL && + rce->rce_ops != RMT_LGETFACL && + rce->rce_ops != RMT_RSETFACL && + rce->rce_ops != RMT_RGETFACL)) + return -EOPNOTSUPP; + } + + /* posix acl is under protection of LOOKUP lock. when calling to this, + * we just have path resolution to the target inode, so we have great + * chance that cached ACL is uptodate. 
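Every entry point in this file funnels attribute names through the same namespace classification and policy filter shown earlier (get_xattr_type() and xattr_type_filter()): "user." names need the user_xattr mount option, "trusted." names need CAP_SYS_ADMIN, ACL names need ACL support, and unknown namespaces are rejected. As an editor's aside, not part of the imported patch, that classification reduces to prefix matching over the standard xattr namespaces.

/*
 * Editor's illustrative sketch, not part of the Lustre patch: the xattr
 * namespace classification used throughout this file, restated over the
 * literal prefixes.
 */
#include <string.h>

enum model_xattr_type {
	MODEL_XATTR_USER, MODEL_XATTR_TRUSTED, MODEL_XATTR_SECURITY,
	MODEL_XATTR_ACL_ACCESS, MODEL_XATTR_ACL_DEFAULT,
	MODEL_XATTR_LUSTRE, MODEL_XATTR_OTHER,
};

static enum model_xattr_type classify_xattr(const char *name)
{
	if (strcmp(name, "system.posix_acl_access") == 0)
		return MODEL_XATTR_ACL_ACCESS;
	if (strcmp(name, "system.posix_acl_default") == 0)
		return MODEL_XATTR_ACL_DEFAULT;
	if (strncmp(name, "user.", 5) == 0)
		return MODEL_XATTR_USER;
	if (strncmp(name, "trusted.", 8) == 0)
		return MODEL_XATTR_TRUSTED;
	if (strncmp(name, "security.", 9) == 0)
		return MODEL_XATTR_SECURITY;
	if (strncmp(name, "lustre.", 7) == 0)
		return MODEL_XATTR_LUSTRE;
	return MODEL_XATTR_OTHER;	/* rejected with -EOPNOTSUPP */
}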
+ */ + if (xattr_type == XATTR_ACL_ACCESS_T && + !(sbi->ll_flags & LL_SBI_RMT_CLIENT)) { + + struct posix_acl *acl; + + spin_lock(&lli->lli_lock); + acl = posix_acl_dup(lli->lli_posix_acl); + spin_unlock(&lli->lli_lock); + + if (!acl) + return -ENODATA; + + rc = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); + posix_acl_release(acl); + return rc; + } + if (xattr_type == XATTR_ACL_DEFAULT_T && !S_ISDIR(inode->i_mode)) + return -ENODATA; +#endif + +do_getxattr: + if (sbi->ll_xattr_cache_enabled && xattr_type != XATTR_ACL_ACCESS_T) { + rc = ll_xattr_cache_get(inode, name, buffer, size, valid); + if (rc == -EAGAIN) + goto getxattr_nocache; + if (rc < 0) + goto out_xattr; + + /* Add "system.posix_acl_access" to the list */ + if (lli->lli_posix_acl != NULL && valid & OBD_MD_FLXATTRLS) { + if (size == 0) { + rc += sizeof(XATTR_NAME_ACL_ACCESS); + } else if (size - rc >= sizeof(XATTR_NAME_ACL_ACCESS)) { + memcpy(buffer + rc, XATTR_NAME_ACL_ACCESS, + sizeof(XATTR_NAME_ACL_ACCESS)); + rc += sizeof(XATTR_NAME_ACL_ACCESS); + } else { + rc = -ERANGE; + goto out_xattr; + } + } + } else { +getxattr_nocache: + oc = ll_mdscapa_get(inode); + rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc, + valid | (rce ? rce_ops2valid(rce->rce_ops) : 0), + name, NULL, 0, size, 0, &req); + capa_put(oc); + + if (rc < 0) + goto out_xattr; + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + LASSERT(body); + + /* only detect the xattr size */ + if (size == 0) { + rc = body->eadatasize; + goto out; + } + + if (size < body->eadatasize) { + CERROR("server bug: replied size %u > %u\n", + body->eadatasize, (int)size); + rc = -ERANGE; + goto out; + } + + if (body->eadatasize == 0) { + rc = -ENODATA; + goto out; + } + + /* do not need swab xattr data */ + xdata = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, + body->eadatasize); + if (!xdata) { + rc = -EFAULT; + goto out; + } + + memcpy(buffer, xdata, body->eadatasize); + rc = body->eadatasize; + } + +#ifdef CONFIG_FS_POSIX_ACL + if (rce && rce->rce_ops == RMT_LSETFACL) { + ext_acl_xattr_header *acl; + + acl = lustre_posix_acl_xattr_2ext( + (posix_acl_xattr_header *)buffer, rc); + if (IS_ERR(acl)) { + rc = PTR_ERR(acl); + goto out; + } + + rc = ee_add(&sbi->ll_et, current_pid(), ll_inode2fid(inode), + xattr_type, acl); + if (unlikely(rc < 0)) { + lustre_ext_acl_xattr_free(acl); + goto out; + } + } +#endif + +out_xattr: + if (rc == -EOPNOTSUPP && xattr_type == XATTR_USER_T) { + LCONSOLE_INFO( + "%s: disabling user_xattr feature because it is not supported on the server: rc = %d\n", + ll_get_fsname(inode->i_sb, NULL, 0), rc); + sbi->ll_flags &= ~LL_SBI_USER_XATTR; + } +out: + ptlrpc_req_finished(req); + return rc; +} + +ssize_t ll_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size) +{ + struct inode *inode = d_inode(dentry); + + LASSERT(inode); + LASSERT(name); + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), xattr %s\n", + inode->i_ino, inode->i_generation, inode, name); + + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETXATTR, 1); + + if ((strncmp(name, XATTR_TRUSTED_PREFIX, + sizeof(XATTR_TRUSTED_PREFIX) - 1) == 0 && + strcmp(name + sizeof(XATTR_TRUSTED_PREFIX) - 1, "lov") == 0) || + (strncmp(name, XATTR_LUSTRE_PREFIX, + sizeof(XATTR_LUSTRE_PREFIX) - 1) == 0 && + strcmp(name + sizeof(XATTR_LUSTRE_PREFIX) - 1, "lov") == 0)) { + struct lov_stripe_md *lsm; + struct lov_user_md *lump; + struct lov_mds_md *lmm = NULL; + struct ptlrpc_request *request = NULL; + int rc = 0, lmmsize = 0; + + if (!S_ISREG(inode->i_mode) && 
!S_ISDIR(inode->i_mode)) + return -ENODATA; + + if (size == 0 && S_ISDIR(inode->i_mode)) { + /* XXX directory EA is fix for now, optimize to save + * RPC transfer */ + rc = sizeof(struct lov_user_md); + goto out; + } + + lsm = ccc_inode_lsm_get(inode); + if (lsm == NULL) { + if (S_ISDIR(inode->i_mode)) { + rc = ll_dir_getstripe(inode, &lmm, + &lmmsize, &request); + } else { + rc = -ENODATA; + } + } else { + /* LSM is present already after lookup/getattr call. + * we need to grab layout lock once it is implemented */ + rc = obd_packmd(ll_i2dtexp(inode), &lmm, lsm); + lmmsize = rc; + } + ccc_inode_lsm_put(inode, lsm); + + if (rc < 0) + goto out; + + if (size == 0) { + /* used to call ll_get_max_mdsize() forward to get + * the maximum buffer size, while some apps (such as + * rsync 3.0.x) care much about the exact xattr value + * size */ + rc = lmmsize; + goto out; + } + + if (size < lmmsize) { + CERROR("server bug: replied size %d > %d for %pd (%s)\n", + lmmsize, (int)size, dentry, name); + rc = -ERANGE; + goto out; + } + + lump = (struct lov_user_md *)buffer; + memcpy(lump, lmm, lmmsize); + /* do not return layout gen for getxattr otherwise it would + * confuse tar --xattr by recognizing layout gen as stripe + * offset when the file is restored. See LU-2809. */ + lump->lmm_layout_gen = 0; + + rc = lmmsize; +out: + if (request) + ptlrpc_req_finished(request); + else if (lmm) + obd_free_diskmd(ll_i2dtexp(inode), &lmm); + return rc; + } + + return ll_getxattr_common(inode, name, buffer, size, OBD_MD_FLXATTR); +} + +ssize_t ll_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + struct inode *inode = d_inode(dentry); + int rc = 0, rc2 = 0; + struct lov_mds_md *lmm = NULL; + struct ptlrpc_request *request = NULL; + int lmmsize; + + LASSERT(inode); + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", + inode->i_ino, inode->i_generation, inode); + + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LISTXATTR, 1); + + rc = ll_getxattr_common(inode, NULL, buffer, size, OBD_MD_FLXATTRLS); + if (rc < 0) + goto out; + + if (buffer != NULL) { + struct ll_sb_info *sbi = ll_i2sbi(inode); + char *xattr_name = buffer; + int xlen, rem = rc; + + while (rem > 0) { + xlen = strnlen(xattr_name, rem - 1) + 1; + rem -= xlen; + if (xattr_type_filter(sbi, + get_xattr_type(xattr_name)) == 0) { + /* skip OK xattr type + * leave it in buffer + */ + xattr_name += xlen; + continue; + } + /* move up remaining xattrs in buffer + * removing the xattr that is not OK + */ + memmove(xattr_name, xattr_name + xlen, rem); + rc -= xlen; + } + } + if (S_ISREG(inode->i_mode)) { + if (!ll_i2info(inode)->lli_has_smd) + rc2 = -1; + } else if (S_ISDIR(inode->i_mode)) { + rc2 = ll_dir_getstripe(inode, &lmm, &lmmsize, &request); + } + + if (rc2 < 0) { + rc2 = 0; + goto out; + } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)) { + const int prefix_len = sizeof(XATTR_LUSTRE_PREFIX) - 1; + const size_t name_len = sizeof("lov") - 1; + const size_t total_len = prefix_len + name_len + 1; + + if (((rc + total_len) > size) && (buffer != NULL)) { + ptlrpc_req_finished(request); + return -ERANGE; + } + + if (buffer != NULL) { + buffer += rc; + memcpy(buffer, XATTR_LUSTRE_PREFIX, prefix_len); + memcpy(buffer + prefix_len, "lov", name_len); + buffer[prefix_len + name_len] = '\0'; + } + rc2 = total_len; + } +out: + ptlrpc_req_finished(request); + rc = rc + rc2; + + return rc; +} diff --git a/kernel/drivers/staging/lustre/lustre/llite/xattr_cache.c b/kernel/drivers/staging/lustre/lustre/llite/xattr_cache.c new file mode 100644 index 
000000000..69ea92adf --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/llite/xattr_cache.c @@ -0,0 +1,538 @@ +/* + * Copyright 2012 Xyratex Technology Limited + * + * Author: Andrew Perepechko + * + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include +#include "../include/obd_support.h" +#include "../include/lustre_lite.h" +#include "../include/lustre_dlm.h" +#include "../include/lustre_ver.h" +#include "llite_internal.h" + +/* If we ever have hundreds of extended attributes, we might want to consider + * using a hash or a tree structure instead of list for faster lookups. + */ +struct ll_xattr_entry { + struct list_head xe_list; /* protected with + * lli_xattrs_list_rwsem */ + char *xe_name; /* xattr name, \0-terminated */ + char *xe_value; /* xattr value */ + unsigned xe_namelen; /* strlen(xe_name) + 1 */ + unsigned xe_vallen; /* xattr value length */ +}; + +static struct kmem_cache *xattr_kmem; +static struct lu_kmem_descr xattr_caches[] = { + { + .ckd_cache = &xattr_kmem, + .ckd_name = "xattr_kmem", + .ckd_size = sizeof(struct ll_xattr_entry) + }, + { + .ckd_cache = NULL + } +}; + +int ll_xattr_init(void) +{ + return lu_kmem_init(xattr_caches); +} + +void ll_xattr_fini(void) +{ + lu_kmem_fini(xattr_caches); +} + +/** + * Initializes xattr cache for an inode. + * + * This initializes the xattr list and marks cache presence. + */ +static void ll_xattr_cache_init(struct ll_inode_info *lli) +{ + + + LASSERT(lli != NULL); + + INIT_LIST_HEAD(&lli->lli_xattrs); + lli->lli_flags |= LLIF_XATTR_CACHE; +} + +/** + * This looks for a specific extended attribute. + * + * Find in @cache and return @xattr_name attribute in @xattr, + * for the NULL @xattr_name return the first cached @xattr. + * + * \retval 0 success + * \retval -ENODATA if not found + */ +static int ll_xattr_cache_find(struct list_head *cache, + const char *xattr_name, + struct ll_xattr_entry **xattr) +{ + struct ll_xattr_entry *entry; + + + + list_for_each_entry(entry, cache, xe_list) { + /* xattr_name == NULL means look for any entry */ + if (xattr_name == NULL || + strcmp(xattr_name, entry->xe_name) == 0) { + *xattr = entry; + CDEBUG(D_CACHE, "find: [%s]=%.*s\n", + entry->xe_name, entry->xe_vallen, + entry->xe_value); + return 0; + } + } + + return -ENODATA; +} + +/** + * This adds an xattr. 
+ * + * Add @xattr_name attr with @xattr_val value and @xattr_val_len length. + * + * \retval 0 success + * \retval -ENOMEM if no memory could be allocated for the cached attr + * \retval -EPROTO if duplicate xattr is being added + */ +static int ll_xattr_cache_add(struct list_head *cache, + const char *xattr_name, + const char *xattr_val, + unsigned xattr_val_len) +{ + struct ll_xattr_entry *xattr; + + + + if (ll_xattr_cache_find(cache, xattr_name, &xattr) == 0) { + CDEBUG(D_CACHE, "duplicate xattr: [%s]\n", xattr_name); + return -EPROTO; + } + + OBD_SLAB_ALLOC_PTR_GFP(xattr, xattr_kmem, GFP_NOFS); + if (xattr == NULL) { + CDEBUG(D_CACHE, "failed to allocate xattr\n"); + return -ENOMEM; + } + + xattr->xe_namelen = strlen(xattr_name) + 1; /* includes the trailing NUL */ + + xattr->xe_name = kstrdup(xattr_name, GFP_NOFS); + if (!xattr->xe_name) { + CDEBUG(D_CACHE, "failed to alloc xattr name %u\n", + xattr->xe_namelen); + goto err_name; + } + xattr->xe_value = kmemdup(xattr_val, xattr_val_len, GFP_NOFS); + if (!xattr->xe_value) + goto err_value; + + xattr->xe_vallen = xattr_val_len; + list_add(&xattr->xe_list, cache); + + CDEBUG(D_CACHE, "set: [%s]=%.*s\n", xattr_name, + xattr_val_len, xattr_val); + + return 0; +err_value: + OBD_FREE(xattr->xe_name, xattr->xe_namelen); +err_name: + OBD_SLAB_FREE_PTR(xattr, xattr_kmem); + + return -ENOMEM; +} + +/** + * This removes an extended attribute from cache. + * + * Remove @xattr_name attribute from @cache. + * + * \retval 0 success + * \retval -ENODATA if @xattr_name is not cached + */ +static int ll_xattr_cache_del(struct list_head *cache, + const char *xattr_name) +{ + struct ll_xattr_entry *xattr; + + + + CDEBUG(D_CACHE, "del xattr: %s\n", xattr_name); + + if (ll_xattr_cache_find(cache, xattr_name, &xattr) == 0) { + list_del(&xattr->xe_list); + OBD_FREE(xattr->xe_name, xattr->xe_namelen); + OBD_FREE(xattr->xe_value, xattr->xe_vallen); + OBD_SLAB_FREE_PTR(xattr, xattr_kmem); + + return 0; + } + + return -ENODATA; +} + +/** + * This iterates cached extended attributes. + * + * Walk over cached attributes in @cache and + * fill in @xld_buffer or only calculate buffer + * size if @xld_buffer is NULL. + * + * \retval >= 0 buffer list size + * \retval -ERANGE if the list does not fit into @xld_size buffer + */ +static int ll_xattr_cache_list(struct list_head *cache, + char *xld_buffer, + int xld_size) +{ + struct ll_xattr_entry *xattr, *tmp; + int xld_tail = 0; + + + + list_for_each_entry_safe(xattr, tmp, cache, xe_list) { + CDEBUG(D_CACHE, "list: buffer=%p[%d] name=%s\n", + xld_buffer, xld_tail, xattr->xe_name); + + if (xld_buffer) { + xld_size -= xattr->xe_namelen; + if (xld_size < 0) + break; + memcpy(&xld_buffer[xld_tail], + xattr->xe_name, xattr->xe_namelen); + } + xld_tail += xattr->xe_namelen; + } + + if (xld_size < 0) + return -ERANGE; + + return xld_tail; +} + +/** + * Check if the xattr cache is initialized (filled). + * + * \retval 0 @cache is not initialized + * \retval 1 @cache is initialized + */ +static int ll_xattr_cache_valid(struct ll_inode_info *lli) +{ + return !!(lli->lli_flags & LLIF_XATTR_CACHE); +} + +/** + * This finalizes the xattr cache. + * + * Free all xattr memory. @lli is the inode info pointer.
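+ * The caller must hold lli_xattrs_list_rwsem for writing.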
+ * + * \retval 0 no error occurred + */ +static int ll_xattr_cache_destroy_locked(struct ll_inode_info *lli) +{ + + + if (!ll_xattr_cache_valid(lli)) + return 0; + + while (ll_xattr_cache_del(&lli->lli_xattrs, NULL) == 0) + ; /* empty loop */ + lli->lli_flags &= ~LLIF_XATTR_CACHE; + + return 0; +} + +int ll_xattr_cache_destroy(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + int rc; + + + + down_write(&lli->lli_xattrs_list_rwsem); + rc = ll_xattr_cache_destroy_locked(lli); + up_write(&lli->lli_xattrs_list_rwsem); + + return rc; +} + +/** + * Match or enqueue a PR lock. + * + * Find or request an LDLM lock with xattr data. + * Since LDLM does not provide API for atomic match_or_enqueue, + * the function handles it with a separate enq lock. + * If successful, the function exits with the list lock held. + * + * \retval 0 no error occurred + * \retval -ENOMEM not enough memory + */ +static int ll_xattr_find_get_lock(struct inode *inode, + struct lookup_intent *oit, + struct ptlrpc_request **req) +{ + ldlm_mode_t mode; + struct lustre_handle lockh = { 0 }; + struct md_op_data *op_data; + struct ll_inode_info *lli = ll_i2info(inode); + struct ldlm_enqueue_info einfo = { .ei_type = LDLM_IBITS, + .ei_mode = it_to_lock_mode(oit), + .ei_cb_bl = ll_md_blocking_ast, + .ei_cb_cp = ldlm_completion_ast }; + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct obd_export *exp = sbi->ll_md_exp; + int rc; + + + + mutex_lock(&lli->lli_xattrs_enq_lock); + /* inode may have been shrunk and recreated, so data is gone, match lock + * only when data exists. */ + if (ll_xattr_cache_valid(lli)) { + /* Try matching first. */ + mode = ll_take_md_lock(inode, MDS_INODELOCK_XATTR, &lockh, 0, + LCK_PR); + if (mode != 0) { + /* fake oit in mdc_revalidate_lock() manner */ + oit->d.lustre.it_lock_handle = lockh.cookie; + oit->d.lustre.it_lock_mode = mode; + goto out; + } + } + + /* Enqueue if the lock isn't cached locally. */ + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) { + mutex_unlock(&lli->lli_xattrs_enq_lock); + return PTR_ERR(op_data); + } + + op_data->op_valid = OBD_MD_FLXATTR | OBD_MD_FLXATTRLS; + + rc = md_enqueue(exp, &einfo, oit, op_data, &lockh, NULL, 0, NULL, 0); + ll_finish_md_op_data(op_data); + + if (rc < 0) { + CDEBUG(D_CACHE, + "md_intent_lock failed with %d for fid "DFID"\n", + rc, PFID(ll_inode2fid(inode))); + mutex_unlock(&lli->lli_xattrs_enq_lock); + return rc; + } + + *req = (struct ptlrpc_request *)oit->d.lustre.it_data; +out: + down_write(&lli->lli_xattrs_list_rwsem); + mutex_unlock(&lli->lli_xattrs_enq_lock); + + return 0; +} + +/** + * Refill the xattr cache. + * + * Fetch and cache the whole of xattrs for @inode, acquiring + * a read or a write xattr lock depending on operation in @oit. + * Intent is dropped on exit unless the operation is setxattr. + * + * \retval 0 no error occurred + * \retval -EPROTO network protocol error + * \retval -ENOMEM not enough memory for the cache + */ +static int ll_xattr_cache_refill(struct inode *inode, struct lookup_intent *oit) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *req = NULL; + const char *xdata, *xval, *xtail, *xvtail; + struct ll_inode_info *lli = ll_i2info(inode); + struct mdt_body *body; + __u32 *xsizes; + int rc = 0, i; + + + + rc = ll_xattr_find_get_lock(inode, oit, &req); + if (rc) + goto out_no_unlock; + + /* Do we have the data at this point? 
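+ * A parallel refill may have populated the cache while we were waiting for the lock.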
*/ + if (ll_xattr_cache_valid(lli)) { + ll_stats_ops_tally(sbi, LPROC_LL_GETXATTR_HITS, 1); + rc = 0; + goto out_maybe_drop; + } + + /* Matched but no cache? Cancelled on error by a parallel refill. */ + if (unlikely(req == NULL)) { + CDEBUG(D_CACHE, "cancelled by a parallel getxattr\n"); + rc = -EIO; + goto out_maybe_drop; + } + + if (oit->d.lustre.it_status < 0) { + CDEBUG(D_CACHE, "getxattr intent returned %d for fid "DFID"\n", + oit->d.lustre.it_status, PFID(ll_inode2fid(inode))); + rc = oit->d.lustre.it_status; + /* xattr data is so large that we don't want to cache it */ + if (rc == -ERANGE) + rc = -EAGAIN; + goto out_destroy; + } + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (body == NULL) { + CERROR("no MDT BODY in the refill xattr reply\n"); + rc = -EPROTO; + goto out_destroy; + } + /* do not need swab xattr data */ + xdata = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, + body->eadatasize); + xval = req_capsule_server_sized_get(&req->rq_pill, &RMF_EAVALS, + body->aclsize); + xsizes = req_capsule_server_sized_get(&req->rq_pill, &RMF_EAVALS_LENS, + body->max_mdsize * sizeof(__u32)); + if (xdata == NULL || xval == NULL || xsizes == NULL) { + CERROR("wrong setxattr reply\n"); + rc = -EPROTO; + goto out_destroy; + } + + xtail = xdata + body->eadatasize; + xvtail = xval + body->aclsize; + + CDEBUG(D_CACHE, "caching: xdata=%p xtail=%p\n", xdata, xtail); + + ll_xattr_cache_init(lli); + + for (i = 0; i < body->max_mdsize; i++) { + CDEBUG(D_CACHE, "caching [%s]=%.*s\n", xdata, *xsizes, xval); + /* Perform consistency checks: attr names and vals in pill */ + if (memchr(xdata, 0, xtail - xdata) == NULL) { + CERROR("xattr protocol violation (names are broken)\n"); + rc = -EPROTO; + } else if (xval + *xsizes > xvtail) { + CERROR("xattr protocol violation (vals are broken)\n"); + rc = -EPROTO; + } else if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_XATTR_ENOMEM)) { + rc = -ENOMEM; + } else if (!strcmp(xdata, XATTR_NAME_ACL_ACCESS)) { + /* Filter out ACL ACCESS since it's cached separately */ + CDEBUG(D_CACHE, "not caching %s\n", + XATTR_NAME_ACL_ACCESS); + rc = 0; + } else { + rc = ll_xattr_cache_add(&lli->lli_xattrs, xdata, xval, + *xsizes); + } + if (rc < 0) { + ll_xattr_cache_destroy_locked(lli); + goto out_destroy; + } + xdata += strlen(xdata) + 1; + xval += *xsizes; + xsizes++; + } + + if (xdata != xtail || xval != xvtail) + CERROR("a hole in xattr data\n"); + + ll_set_lock_data(sbi->ll_md_exp, inode, oit, NULL); + + goto out_maybe_drop; +out_maybe_drop: + + ll_intent_drop_lock(oit); + + if (rc != 0) + up_write(&lli->lli_xattrs_list_rwsem); +out_no_unlock: + ptlrpc_req_finished(req); + + return rc; + +out_destroy: + up_write(&lli->lli_xattrs_list_rwsem); + + ldlm_lock_decref_and_cancel((struct lustre_handle *) + &oit->d.lustre.it_lock_handle, + oit->d.lustre.it_lock_mode); + + goto out_no_unlock; +} + +/** + * Get an xattr value or list xattrs using the write-through cache. + * + * Get the xattr value (@valid has OBD_MD_FLXATTR set) of @name or + * list xattr names (@valid has OBD_MD_FLXATTRLS set) for @inode. + * The resulting value/list is stored in @buffer if the former + * is not larger than @size. 
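+ * When @size is zero only the required buffer size is returned and no data is copied.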
+ * + * \retval 0 no error occurred + * \retval -EPROTO network protocol error + * \retval -ENOMEM not enough memory for the cache + * \retval -ERANGE the buffer is not large enough + * \retval -ENODATA no such attr or the list is empty + */ +int ll_xattr_cache_get(struct inode *inode, + const char *name, + char *buffer, + size_t size, + __u64 valid) +{ + struct lookup_intent oit = { .it_op = IT_GETXATTR }; + struct ll_inode_info *lli = ll_i2info(inode); + int rc = 0; + + + + LASSERT(!!(valid & OBD_MD_FLXATTR) ^ !!(valid & OBD_MD_FLXATTRLS)); + + down_read(&lli->lli_xattrs_list_rwsem); + if (!ll_xattr_cache_valid(lli)) { + up_read(&lli->lli_xattrs_list_rwsem); + rc = ll_xattr_cache_refill(inode, &oit); + if (rc) + return rc; + downgrade_write(&lli->lli_xattrs_list_rwsem); + } else { + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETXATTR_HITS, 1); + } + + if (valid & OBD_MD_FLXATTR) { + struct ll_xattr_entry *xattr; + + rc = ll_xattr_cache_find(&lli->lli_xattrs, name, &xattr); + if (rc == 0) { + rc = xattr->xe_vallen; + /* zero size means we are only requested size in rc */ + if (size != 0) { + if (size >= xattr->xe_vallen) + memcpy(buffer, xattr->xe_value, + xattr->xe_vallen); + else + rc = -ERANGE; + } + } + } else if (valid & OBD_MD_FLXATTRLS) { + rc = ll_xattr_cache_list(&lli->lli_xattrs, + size ? buffer : NULL, size); + } + + goto out; +out: + up_read(&lli->lli_xattrs_list_rwsem); + + return rc; +} diff --git a/kernel/drivers/staging/lustre/lustre/lmv/Makefile b/kernel/drivers/staging/lustre/lustre/lmv/Makefile new file mode 100644 index 000000000..a7a15369a --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lmv/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_LUSTRE_FS) += lmv.o +lmv-y := lmv_obd.o lmv_intent.o lmv_fld.o +lmv-$(CONFIG_PROC_FS) += lproc_lmv.o diff --git a/kernel/drivers/staging/lustre/lustre/lmv/lmv_fld.c b/kernel/drivers/staging/lustre/lustre/lmv/lmv_fld.c new file mode 100644 index 000000000..ee235926f --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lmv/lmv_fld.c @@ -0,0 +1,83 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#define DEBUG_SUBSYSTEM S_LMV +#include +#include +#include +#include +#include + +#include "../include/obd_support.h" +#include "../include/lustre/lustre_idl.h" +#include "../include/lustre_fid.h" +#include "../include/lustre_lib.h" +#include "../include/lustre_net.h" +#include "../include/lustre_dlm.h" +#include "../include/obd_class.h" +#include "../include/lprocfs_status.h" +#include "lmv_internal.h" + +int lmv_fld_lookup(struct lmv_obd *lmv, + const struct lu_fid *fid, + u32 *mds) +{ + int rc; + + /* FIXME: Currently ZFS still use local seq for ROOT unfortunately, and + * this fid_is_local check should be removed once LU-2240 is fixed */ + LASSERTF((fid_seq_in_fldb(fid_seq(fid)) || + fid_seq_is_local_file(fid_seq(fid))) && + fid_is_sane(fid), DFID" is insane!\n", PFID(fid)); + + rc = fld_client_lookup(&lmv->lmv_fld, fid_seq(fid), mds, + LU_SEQ_RANGE_MDT, NULL); + if (rc) { + CERROR("Error while looking for mds number. Seq %#llx, err = %d\n", + fid_seq(fid), rc); + return rc; + } + + CDEBUG(D_INODE, "FLD lookup got mds #%x for fid="DFID"\n", + *mds, PFID(fid)); + + if (*mds >= lmv->desc.ld_tgt_count) { + CERROR("FLD lookup got invalid mds #%x (max: %x) for fid=" DFID "\n", *mds, lmv->desc.ld_tgt_count, + PFID(fid)); + rc = -EINVAL; + } + return rc; +} diff --git a/kernel/drivers/staging/lustre/lustre/lmv/lmv_intent.c b/kernel/drivers/staging/lustre/lustre/lmv/lmv_intent.c new file mode 100644 index 000000000..d22d57b4f --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lmv/lmv_intent.c @@ -0,0 +1,323 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#define DEBUG_SUBSYSTEM S_LMV +#include +#include +#include +#include +#include +#include +#include "../include/lustre_intent.h" +#include "../include/obd_support.h" +#include "../include/lustre/lustre_idl.h" +#include "../include/lustre_lib.h" +#include "../include/lustre_net.h" +#include "../include/lustre_dlm.h" +#include "../include/obd_class.h" +#include "../include/lprocfs_status.h" +#include "lmv_internal.h" + +static int lmv_intent_remote(struct obd_export *exp, void *lmm, + int lmmsize, struct lookup_intent *it, + const struct lu_fid *parent_fid, int flags, + struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct ptlrpc_request *req = NULL; + struct lustre_handle plock; + struct md_op_data *op_data; + struct lmv_tgt_desc *tgt; + struct mdt_body *body; + int pmode; + int rc = 0; + + body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + return -EPROTO; + + LASSERT((body->valid & OBD_MD_MDS)); + + /* + * Unfortunately, we have to lie to MDC/MDS to retrieve + * attributes llite needs and provide proper locking. + */ + if (it->it_op & IT_LOOKUP) + it->it_op = IT_GETATTR; + + /* + * We got LOOKUP lock, but we really need attrs. + */ + pmode = it->d.lustre.it_lock_mode; + if (pmode) { + plock.cookie = it->d.lustre.it_lock_handle; + it->d.lustre.it_lock_mode = 0; + it->d.lustre.it_data = NULL; + } + + LASSERT(fid_is_sane(&body->fid1)); + + tgt = lmv_find_target(lmv, &body->fid1); + if (IS_ERR(tgt)) { + rc = PTR_ERR(tgt); + goto out; + } + + OBD_ALLOC_PTR(op_data); + if (op_data == NULL) { + rc = -ENOMEM; + goto out; + } + + op_data->op_fid1 = body->fid1; + /* Send the parent FID to the remote MDT */ + if (parent_fid != NULL) { + /* The parent fid is only for remote open to + * check whether the open is from OBF, + * see mdt_cross_open */ + LASSERT(it->it_op & IT_OPEN); + op_data->op_fid2 = *parent_fid; + /* Add object FID to op_fid3, in case it needs to check stale + * (M_CHECK_STALE), see mdc_finish_intent_lock */ + op_data->op_fid3 = body->fid1; + } + + op_data->op_bias = MDS_CROSS_REF; + CDEBUG(D_INODE, "REMOTE_INTENT with fid="DFID" -> mds #%d\n", + PFID(&body->fid1), tgt->ltd_idx); + + rc = md_intent_lock(tgt->ltd_exp, op_data, lmm, lmmsize, it, + flags, &req, cb_blocking, extra_lock_flags); + if (rc) + goto out_free_op_data; + + /* + * LLite needs LOOKUP lock to track dentry revocation in order to + * maintain dcache consistency. Thus drop UPDATE|PERM lock here + * and put LOOKUP in request. + */ + if (it->d.lustre.it_lock_mode != 0) { + it->d.lustre.it_remote_lock_handle = + it->d.lustre.it_lock_handle; + it->d.lustre.it_remote_lock_mode = it->d.lustre.it_lock_mode; + } + + it->d.lustre.it_lock_handle = plock.cookie; + it->d.lustre.it_lock_mode = pmode; + +out_free_op_data: + OBD_FREE_PTR(op_data); +out: + if (rc && pmode) + ldlm_lock_decref(&plock, pmode); + + ptlrpc_req_finished(*reqp); + *reqp = req; + return rc; +} + +/* + * IT_OPEN is intended to open (and create, possibly) an object. Parent (pid) + * may be split dir.
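+ * The target MDT is chosen from the parent FID (op_fid1) via lmv_locate_mds().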
+ */ +int lmv_intent_open(struct obd_export *exp, struct md_op_data *op_data, + void *lmm, int lmmsize, struct lookup_intent *it, + int flags, struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + struct mdt_body *body; + int rc; + + tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + /* If it is ready to open the file by FID, do not need + * allocate FID at all, otherwise it will confuse MDT */ + if ((it->it_op & IT_CREAT) && + !(it->it_flags & MDS_OPEN_BY_FID)) { + /* + * For open with IT_CREATE and for IT_CREATE cases allocate new + * fid and setup FLD for it. + */ + op_data->op_fid3 = op_data->op_fid2; + rc = lmv_fid_alloc(exp, &op_data->op_fid2, op_data); + if (rc != 0) + return rc; + } + + CDEBUG(D_INODE, "OPEN_INTENT with fid1=" DFID ", fid2=" DFID ", name='%s' -> mds #%d\n", + PFID(&op_data->op_fid1), + PFID(&op_data->op_fid2), op_data->op_name, tgt->ltd_idx); + + rc = md_intent_lock(tgt->ltd_exp, op_data, lmm, lmmsize, it, flags, + reqp, cb_blocking, extra_lock_flags); + if (rc != 0) + return rc; + /* + * Nothing is found, do not access body->fid1 as it is zero and thus + * pointless. + */ + if ((it->d.lustre.it_disposition & DISP_LOOKUP_NEG) && + !(it->d.lustre.it_disposition & DISP_OPEN_CREATE) && + !(it->d.lustre.it_disposition & DISP_OPEN_OPEN)) + return rc; + + body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + return -EPROTO; + /* + * Not cross-ref case, just get out of here. + */ + if (likely(!(body->valid & OBD_MD_MDS))) + return 0; + + /* + * Okay, MDS has returned success. Probably name has been resolved in + * remote inode. + */ + rc = lmv_intent_remote(exp, lmm, lmmsize, it, &op_data->op_fid1, flags, + reqp, cb_blocking, extra_lock_flags); + if (rc != 0) { + LASSERT(rc < 0); + /* + * This is possible, that some userspace application will try to + * open file as directory and we will have -ENOTDIR here. As + * this is normal situation, we should not print error here, + * only debug info. + */ + CDEBUG(D_INODE, "Can't handle remote %s: dir " DFID "(" DFID "):%*s: %d\n", + LL_IT2STR(it), PFID(&op_data->op_fid2), + PFID(&op_data->op_fid1), op_data->op_namelen, + op_data->op_name, rc); + return rc; + } + + return rc; +} + +/* + * Handler for: getattr, lookup and revalidate cases. + */ +int lmv_intent_lookup(struct obd_export *exp, struct md_op_data *op_data, + void *lmm, int lmmsize, struct lookup_intent *it, + int flags, struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt = NULL; + struct mdt_body *body; + int rc = 0; + + tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + if (!fid_is_sane(&op_data->op_fid2)) + fid_zero(&op_data->op_fid2); + + CDEBUG(D_INODE, "LOOKUP_INTENT with fid1="DFID", fid2="DFID + ", name='%s' -> mds #%d\n", PFID(&op_data->op_fid1), + PFID(&op_data->op_fid2), + op_data->op_name ? op_data->op_name : "", + tgt->ltd_idx); + + op_data->op_bias &= ~MDS_CROSS_REF; + + rc = md_intent_lock(tgt->ltd_exp, op_data, lmm, lmmsize, it, + flags, reqp, cb_blocking, extra_lock_flags); + + if (rc < 0 || *reqp == NULL) + return rc; + + /* + * MDS has returned success. Probably name has been resolved in + * remote inode. Let's check this. 
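+ * A cross-ref reply (OBD_MD_MDS set in body->valid) is chased further via lmv_intent_remote().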
+ */ + body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + return -EPROTO; + /* Not cross-ref case, just get out of here. */ + if (likely(!(body->valid & OBD_MD_MDS))) + return 0; + + rc = lmv_intent_remote(exp, lmm, lmmsize, it, NULL, flags, reqp, + cb_blocking, extra_lock_flags); + + return rc; +} + +int lmv_intent_lock(struct obd_export *exp, struct md_op_data *op_data, + void *lmm, int lmmsize, struct lookup_intent *it, + int flags, struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags) +{ + struct obd_device *obd = exp->exp_obd; + int rc; + + LASSERT(it != NULL); + LASSERT(fid_is_sane(&op_data->op_fid1)); + + CDEBUG(D_INODE, "INTENT LOCK '%s' for '%*s' on "DFID"\n", + LL_IT2STR(it), op_data->op_namelen, op_data->op_name, + PFID(&op_data->op_fid1)); + + rc = lmv_check_connect(obd); + if (rc) + return rc; + + if (it->it_op & (IT_LOOKUP | IT_GETATTR | IT_LAYOUT)) + rc = lmv_intent_lookup(exp, op_data, lmm, lmmsize, it, + flags, reqp, cb_blocking, + extra_lock_flags); + else if (it->it_op & IT_OPEN) + rc = lmv_intent_open(exp, op_data, lmm, lmmsize, it, + flags, reqp, cb_blocking, + extra_lock_flags); + else + LBUG(); + return rc; +} diff --git a/kernel/drivers/staging/lustre/lustre/lmv/lmv_internal.h b/kernel/drivers/staging/lustre/lustre/lmv/lmv_internal.h new file mode 100644 index 000000000..852d78721 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lmv/lmv_internal.h @@ -0,0 +1,157 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _LMV_INTERNAL_H_ +#define _LMV_INTERNAL_H_ + +#include "../include/lustre/lustre_idl.h" +#include "../include/obd.h" + +#define LMV_MAX_TGT_COUNT 128 + +#define lmv_init_lock(lmv) mutex_lock(&lmv->init_mutex) +#define lmv_init_unlock(lmv) mutex_unlock(&lmv->init_mutex) + +#define LL_IT2STR(it) \ + ((it) ? 
ldlm_it2str((it)->it_op) : "0") + +int lmv_check_connect(struct obd_device *obd); + +int lmv_intent_lock(struct obd_export *exp, struct md_op_data *op_data, + void *lmm, int lmmsize, struct lookup_intent *it, + int flags, struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags); + +int lmv_intent_lookup(struct obd_export *exp, struct md_op_data *op_data, + void *lmm, int lmmsize, struct lookup_intent *it, + int flags, struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags); + +int lmv_intent_open(struct obd_export *exp, struct md_op_data *op_data, + void *lmm, int lmmsize, struct lookup_intent *it, + int flags, struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags); + +int lmv_blocking_ast(struct ldlm_lock *, struct ldlm_lock_desc *, + void *, int); +int lmv_fld_lookup(struct lmv_obd *lmv, const struct lu_fid *fid, u32 *mds); +int __lmv_fid_alloc(struct lmv_obd *lmv, struct lu_fid *fid, u32 mds); +int lmv_fid_alloc(struct obd_export *exp, struct lu_fid *fid, + struct md_op_data *op_data); + +static inline struct lmv_stripe_md *lmv_get_mea(struct ptlrpc_request *req) +{ + struct mdt_body *body; + struct lmv_stripe_md *mea; + + LASSERT(req != NULL); + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + + if (!body || !S_ISDIR(body->mode) || !body->eadatasize) + return NULL; + + mea = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, + body->eadatasize); + LASSERT(mea != NULL); + + if (mea->mea_count == 0) + return NULL; + if (mea->mea_magic != MEA_MAGIC_LAST_CHAR && + mea->mea_magic != MEA_MAGIC_ALL_CHARS && + mea->mea_magic != MEA_MAGIC_HASH_SEGMENT) + return NULL; + + return mea; +} + +static inline int lmv_get_easize(struct lmv_obd *lmv) +{ + return sizeof(struct lmv_stripe_md) + + lmv->desc.ld_tgt_count * + sizeof(struct lu_fid); +} + +static inline struct lmv_tgt_desc * +lmv_get_target(struct lmv_obd *lmv, u32 mds) +{ + int count = lmv->desc.ld_tgt_count; + int i; + + for (i = 0; i < count; i++) { + if (lmv->tgts[i] == NULL) + continue; + + if (lmv->tgts[i]->ltd_idx == mds) + return lmv->tgts[i]; + } + + return ERR_PTR(-ENODEV); +} + +static inline struct lmv_tgt_desc * +lmv_find_target(struct lmv_obd *lmv, const struct lu_fid *fid) +{ + u32 mds = 0; + int rc; + + if (lmv->desc.ld_tgt_count > 1) { + rc = lmv_fld_lookup(lmv, fid, &mds); + if (rc) + return ERR_PTR(rc); + } + + return lmv_get_target(lmv, mds); +} + +struct lmv_tgt_desc +*lmv_locate_mds(struct lmv_obd *lmv, struct md_op_data *op_data, + struct lu_fid *fid); +/* lproc_lmv.c */ +#if defined(CONFIG_PROC_FS) +void lprocfs_lmv_init_vars(struct lprocfs_static_vars *lvars); +#else +static inline void lprocfs_lmv_init_vars(struct lprocfs_static_vars *lvars) +{ + memset(lvars, 0, sizeof(*lvars)); +} +#endif +extern struct file_operations lmv_proc_target_fops; + +#endif diff --git a/kernel/drivers/staging/lustre/lustre/lmv/lmv_obd.c b/kernel/drivers/staging/lustre/lustre/lmv/lmv_obd.c new file mode 100644 index 000000000..b9459faf8 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lmv/lmv_obd.c @@ -0,0 +1,2892 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LMV +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../include/lustre/lustre_idl.h" +#include "../include/obd_support.h" +#include "../include/lustre_lib.h" +#include "../include/lustre_net.h" +#include "../include/obd_class.h" +#include "../include/lprocfs_status.h" +#include "../include/lustre_lite.h" +#include "../include/lustre_fid.h" +#include "lmv_internal.h" + +static void lmv_activate_target(struct lmv_obd *lmv, + struct lmv_tgt_desc *tgt, + int activate) +{ + if (tgt->ltd_active == activate) + return; + + tgt->ltd_active = activate; + lmv->desc.ld_active_tgt_count += (activate ? 1 : -1); +} + +/** + * Error codes: + * + * -EINVAL : UUID can't be found in the LMV's target list + * -ENOTCONN: The UUID is found, but the target connection is bad (!) + * -EBADF : The UUID is found, but the OBD of the wrong type (!) + */ +static int lmv_set_mdc_active(struct lmv_obd *lmv, struct obd_uuid *uuid, + int activate) +{ + struct lmv_tgt_desc *uninitialized_var(tgt); + struct obd_device *obd; + int i; + int rc = 0; + + CDEBUG(D_INFO, "Searching in lmv %p for uuid %s (activate=%d)\n", + lmv, uuid->uuid, activate); + + spin_lock(&lmv->lmv_lock); + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + tgt = lmv->tgts[i]; + if (tgt == NULL || tgt->ltd_exp == NULL) + continue; + + CDEBUG(D_INFO, "Target idx %d is %s conn %#llx\n", i, + tgt->ltd_uuid.uuid, tgt->ltd_exp->exp_handle.h_cookie); + + if (obd_uuid_equals(uuid, &tgt->ltd_uuid)) + break; + } + + if (i == lmv->desc.ld_tgt_count) { + rc = -EINVAL; + goto out_lmv_lock; + } + + obd = class_exp2obd(tgt->ltd_exp); + if (obd == NULL) { + rc = -ENOTCONN; + goto out_lmv_lock; + } + + CDEBUG(D_INFO, "Found OBD %s=%s device %d (%p) type %s at LMV idx %d\n", + obd->obd_name, obd->obd_uuid.uuid, obd->obd_minor, obd, + obd->obd_type->typ_name, i); + LASSERT(strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) == 0); + + if (tgt->ltd_active == activate) { + CDEBUG(D_INFO, "OBD %p already %sactive!\n", obd, + activate ? "" : "in"); + goto out_lmv_lock; + } + + CDEBUG(D_INFO, "Marking OBD %p %sactive\n", obd, + activate ? 
"" : "in"); + lmv_activate_target(lmv, tgt, activate); + + out_lmv_lock: + spin_unlock(&lmv->lmv_lock); + return rc; +} + +static struct obd_uuid *lmv_get_uuid(struct obd_export *exp) +{ + struct lmv_obd *lmv = &exp->exp_obd->u.lmv; + + return obd_get_uuid(lmv->tgts[0]->ltd_exp); +} + +static int lmv_notify(struct obd_device *obd, struct obd_device *watched, + enum obd_notify_event ev, void *data) +{ + struct obd_connect_data *conn_data; + struct lmv_obd *lmv = &obd->u.lmv; + struct obd_uuid *uuid; + int rc = 0; + + if (strcmp(watched->obd_type->typ_name, LUSTRE_MDC_NAME)) { + CERROR("unexpected notification of %s %s!\n", + watched->obd_type->typ_name, + watched->obd_name); + return -EINVAL; + } + + uuid = &watched->u.cli.cl_target_uuid; + if (ev == OBD_NOTIFY_ACTIVE || ev == OBD_NOTIFY_INACTIVE) { + /* + * Set MDC as active before notifying the observer, so the + * observer can use the MDC normally. + */ + rc = lmv_set_mdc_active(lmv, uuid, + ev == OBD_NOTIFY_ACTIVE); + if (rc) { + CERROR("%sactivation of %s failed: %d\n", + ev == OBD_NOTIFY_ACTIVE ? "" : "de", + uuid->uuid, rc); + return rc; + } + } else if (ev == OBD_NOTIFY_OCD) { + conn_data = &watched->u.cli.cl_import->imp_connect_data; + /* + * XXX: Make sure that ocd_connect_flags from all targets are + * the same. Otherwise one of MDTs runs wrong version or + * something like this. --umka + */ + obd->obd_self_export->exp_connect_data = *conn_data; + } +#if 0 + else if (ev == OBD_NOTIFY_DISCON) { + /* + * For disconnect event, flush fld cache for failout MDS case. + */ + fld_client_flush(&lmv->lmv_fld); + } +#endif + /* + * Pass the notification up the chain. + */ + if (obd->obd_observer) + rc = obd_notify(obd->obd_observer, watched, ev, data); + + return rc; +} + +/** + * This is fake connect function. Its purpose is to initialize lmv and say + * caller that everything is okay. Real connection will be performed later. + */ +static int lmv_connect(const struct lu_env *env, + struct obd_export **exp, struct obd_device *obd, + struct obd_uuid *cluuid, struct obd_connect_data *data, + void *localdata) +{ + struct proc_dir_entry *lmv_proc_dir; + struct lmv_obd *lmv = &obd->u.lmv; + struct lustre_handle conn = { 0 }; + int rc = 0; + + /* + * We don't want to actually do the underlying connections more than + * once, so keep track. + */ + lmv->refcount++; + if (lmv->refcount > 1) { + *exp = NULL; + return 0; + } + + rc = class_connect(&conn, obd, cluuid); + if (rc) { + CERROR("class_connection() returned %d\n", rc); + return rc; + } + + *exp = class_conn2export(&conn); + class_export_get(*exp); + + lmv->exp = *exp; + lmv->connected = 0; + lmv->cluuid = *cluuid; + + if (data) + lmv->conn_data = *data; + + if (obd->obd_proc_private != NULL) { + lmv_proc_dir = obd->obd_proc_private; + } else { + lmv_proc_dir = lprocfs_register("target_obds", obd->obd_proc_entry, + NULL, NULL); + if (IS_ERR(lmv_proc_dir)) { + CERROR("could not register /proc/fs/lustre/%s/%s/target_obds.", + obd->obd_type->typ_name, obd->obd_name); + lmv_proc_dir = NULL; + } + obd->obd_proc_private = lmv_proc_dir; + } + + /* + * All real clients should perform actual connection right away, because + * it is possible, that LMV will not have opportunity to connect targets + * and MDC stuff will be called directly, for instance while reading + * ../mdc/../kbytesfree procfs file, etc. 
+ */ + if (data->ocd_connect_flags & OBD_CONNECT_REAL) + rc = lmv_check_connect(obd); + + if (rc && lmv_proc_dir) { + lprocfs_remove(&lmv_proc_dir); + obd->obd_proc_private = NULL; + } + + return rc; +} + +static void lmv_set_timeouts(struct obd_device *obd) +{ + struct lmv_tgt_desc *tgt; + struct lmv_obd *lmv; + int i; + + lmv = &obd->u.lmv; + if (lmv->server_timeout == 0) + return; + + if (lmv->connected == 0) + return; + + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + tgt = lmv->tgts[i]; + if (tgt == NULL || tgt->ltd_exp == NULL || tgt->ltd_active == 0) + continue; + + obd_set_info_async(NULL, tgt->ltd_exp, sizeof(KEY_INTERMDS), + KEY_INTERMDS, 0, NULL, NULL); + } +} + +static int lmv_init_ea_size(struct obd_export *exp, int easize, + int def_easize, int cookiesize, int def_cookiesize) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + int i; + int rc = 0; + int change = 0; + + if (lmv->max_easize < easize) { + lmv->max_easize = easize; + change = 1; + } + if (lmv->max_def_easize < def_easize) { + lmv->max_def_easize = def_easize; + change = 1; + } + if (lmv->max_cookiesize < cookiesize) { + lmv->max_cookiesize = cookiesize; + change = 1; + } + if (lmv->max_def_cookiesize < def_cookiesize) { + lmv->max_def_cookiesize = def_cookiesize; + change = 1; + } + if (change == 0) + return 0; + + if (lmv->connected == 0) + return 0; + + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + if (lmv->tgts[i] == NULL || + lmv->tgts[i]->ltd_exp == NULL || + lmv->tgts[i]->ltd_active == 0) { + CWARN("%s: NULL export for %d\n", obd->obd_name, i); + continue; + } + + rc = md_init_ea_size(lmv->tgts[i]->ltd_exp, easize, def_easize, + cookiesize, def_cookiesize); + if (rc) { + CERROR("%s: obd_init_ea_size() failed on MDT target %d: rc = %d.\n", + obd->obd_name, i, rc); + break; + } + } + return rc; +} + +#define MAX_STRING_SIZE 128 + +static int lmv_connect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt) +{ + struct proc_dir_entry *lmv_proc_dir; + struct lmv_obd *lmv = &obd->u.lmv; + struct obd_uuid *cluuid = &lmv->cluuid; + struct obd_uuid lmv_mdc_uuid = { "LMV_MDC_UUID" }; + struct obd_device *mdc_obd; + struct obd_export *mdc_exp; + struct lu_fld_target target; + int rc; + + mdc_obd = class_find_client_obd(&tgt->ltd_uuid, LUSTRE_MDC_NAME, + &obd->obd_uuid); + if (!mdc_obd) { + CERROR("target %s not attached\n", tgt->ltd_uuid.uuid); + return -EINVAL; + } + + CDEBUG(D_CONFIG, "connect to %s(%s) - %s, %s FOR %s\n", + mdc_obd->obd_name, mdc_obd->obd_uuid.uuid, + tgt->ltd_uuid.uuid, obd->obd_uuid.uuid, + cluuid->uuid); + + if (!mdc_obd->obd_set_up) { + CERROR("target %s is not set up\n", tgt->ltd_uuid.uuid); + return -EINVAL; + } + + rc = obd_connect(NULL, &mdc_exp, mdc_obd, &lmv_mdc_uuid, + &lmv->conn_data, NULL); + if (rc) { + CERROR("target %s connect error %d\n", tgt->ltd_uuid.uuid, rc); + return rc; + } + + /* + * Init fid sequence client for this mdc and add new fld target. + */ + rc = obd_fid_init(mdc_obd, mdc_exp, LUSTRE_SEQ_METADATA); + if (rc) + return rc; + + target.ft_srv = NULL; + target.ft_exp = mdc_exp; + target.ft_idx = tgt->ltd_idx; + + fld_client_add_target(&lmv->lmv_fld, &target); + + rc = obd_register_observer(mdc_obd, obd); + if (rc) { + obd_disconnect(mdc_exp); + CERROR("target %s register_observer error %d\n", + tgt->ltd_uuid.uuid, rc); + return rc; + } + + if (obd->obd_observer) { + /* + * Tell the observer about the new target. 
+ */ + rc = obd_notify(obd->obd_observer, mdc_exp->exp_obd, + OBD_NOTIFY_ACTIVE, + (void *)(tgt - lmv->tgts[0])); + if (rc) { + obd_disconnect(mdc_exp); + return rc; + } + } + + tgt->ltd_active = 1; + tgt->ltd_exp = mdc_exp; + lmv->desc.ld_active_tgt_count++; + + md_init_ea_size(tgt->ltd_exp, lmv->max_easize, lmv->max_def_easize, + lmv->max_cookiesize, lmv->max_def_cookiesize); + + CDEBUG(D_CONFIG, "Connected to %s(%s) successfully (%d)\n", + mdc_obd->obd_name, mdc_obd->obd_uuid.uuid, + atomic_read(&obd->obd_refcount)); + + lmv_proc_dir = obd->obd_proc_private; + if (lmv_proc_dir) { + struct proc_dir_entry *mdc_symlink; + + LASSERT(mdc_obd->obd_type != NULL); + LASSERT(mdc_obd->obd_type->typ_name != NULL); + mdc_symlink = lprocfs_add_symlink(mdc_obd->obd_name, + lmv_proc_dir, + "../../../%s/%s", + mdc_obd->obd_type->typ_name, + mdc_obd->obd_name); + if (mdc_symlink == NULL) { + CERROR("Could not register LMV target /proc/fs/lustre/%s/%s/target_obds/%s.", + obd->obd_type->typ_name, obd->obd_name, + mdc_obd->obd_name); + lprocfs_remove(&lmv_proc_dir); + obd->obd_proc_private = NULL; + } + } + return 0; +} + +static void lmv_del_target(struct lmv_obd *lmv, int index) +{ + if (lmv->tgts[index] == NULL) + return; + + OBD_FREE_PTR(lmv->tgts[index]); + lmv->tgts[index] = NULL; + return; +} + +static int lmv_add_target(struct obd_device *obd, struct obd_uuid *uuidp, + __u32 index, int gen) +{ + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc = 0; + + CDEBUG(D_CONFIG, "Target uuid: %s. index %d\n", uuidp->uuid, index); + + lmv_init_lock(lmv); + + if (lmv->desc.ld_tgt_count == 0) { + struct obd_device *mdc_obd; + + mdc_obd = class_find_client_obd(uuidp, LUSTRE_MDC_NAME, + &obd->obd_uuid); + if (!mdc_obd) { + lmv_init_unlock(lmv); + CERROR("%s: Target %s not attached: rc = %d\n", + obd->obd_name, uuidp->uuid, -EINVAL); + return -EINVAL; + } + } + + if ((index < lmv->tgts_size) && (lmv->tgts[index] != NULL)) { + tgt = lmv->tgts[index]; + CERROR("%s: UUID %s already assigned at LOV target index %d: rc = %d\n", + obd->obd_name, + obd_uuid2str(&tgt->ltd_uuid), index, -EEXIST); + lmv_init_unlock(lmv); + return -EEXIST; + } + + if (index >= lmv->tgts_size) { + /* We need to reallocate the lmv target array. 
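+ * The array is grown in powers of two until the requested index fits.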
*/ + struct lmv_tgt_desc **newtgts, **old = NULL; + __u32 newsize = 1; + __u32 oldsize = 0; + + while (newsize < index + 1) + newsize <<= 1; + OBD_ALLOC(newtgts, sizeof(*newtgts) * newsize); + if (newtgts == NULL) { + lmv_init_unlock(lmv); + return -ENOMEM; + } + + if (lmv->tgts_size) { + memcpy(newtgts, lmv->tgts, + sizeof(*newtgts) * lmv->tgts_size); + old = lmv->tgts; + oldsize = lmv->tgts_size; + } + + lmv->tgts = newtgts; + lmv->tgts_size = newsize; + smp_rmb(); + if (old) + OBD_FREE(old, sizeof(*old) * oldsize); + + CDEBUG(D_CONFIG, "tgts: %p size: %d\n", lmv->tgts, + lmv->tgts_size); + } + + OBD_ALLOC_PTR(tgt); + if (!tgt) { + lmv_init_unlock(lmv); + return -ENOMEM; + } + + mutex_init(&tgt->ltd_fid_mutex); + tgt->ltd_idx = index; + tgt->ltd_uuid = *uuidp; + tgt->ltd_active = 0; + lmv->tgts[index] = tgt; + if (index >= lmv->desc.ld_tgt_count) + lmv->desc.ld_tgt_count = index + 1; + + if (lmv->connected) { + rc = lmv_connect_mdc(obd, tgt); + if (rc) { + spin_lock(&lmv->lmv_lock); + lmv->desc.ld_tgt_count--; + memset(tgt, 0, sizeof(*tgt)); + spin_unlock(&lmv->lmv_lock); + } else { + int easize = sizeof(struct lmv_stripe_md) + + lmv->desc.ld_tgt_count * sizeof(struct lu_fid); + lmv_init_ea_size(obd->obd_self_export, easize, 0, 0, 0); + } + } + + lmv_init_unlock(lmv); + return rc; +} + +int lmv_check_connect(struct obd_device *obd) +{ + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int i; + int rc; + int easize; + + if (lmv->connected) + return 0; + + lmv_init_lock(lmv); + if (lmv->connected) { + lmv_init_unlock(lmv); + return 0; + } + + if (lmv->desc.ld_tgt_count == 0) { + lmv_init_unlock(lmv); + CERROR("%s: no targets configured.\n", obd->obd_name); + return -EINVAL; + } + + CDEBUG(D_CONFIG, "Time to connect %s to %s\n", + lmv->cluuid.uuid, obd->obd_name); + + LASSERT(lmv->tgts != NULL); + + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + tgt = lmv->tgts[i]; + if (tgt == NULL) + continue; + rc = lmv_connect_mdc(obd, tgt); + if (rc) + goto out_disc; + } + + lmv_set_timeouts(obd); + class_export_put(lmv->exp); + lmv->connected = 1; + easize = lmv_get_easize(lmv); + lmv_init_ea_size(obd->obd_self_export, easize, 0, 0, 0); + lmv_init_unlock(lmv); + return 0; + + out_disc: + while (i-- > 0) { + int rc2; + tgt = lmv->tgts[i]; + if (tgt == NULL) + continue; + tgt->ltd_active = 0; + if (tgt->ltd_exp) { + --lmv->desc.ld_active_tgt_count; + rc2 = obd_disconnect(tgt->ltd_exp); + if (rc2) { + CERROR("LMV target %s disconnect on MDC idx %d: error %d\n", + tgt->ltd_uuid.uuid, i, rc2); + } + } + } + class_disconnect(lmv->exp); + lmv_init_unlock(lmv); + return rc; +} + +static int lmv_disconnect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt) +{ + struct proc_dir_entry *lmv_proc_dir; + struct lmv_obd *lmv = &obd->u.lmv; + struct obd_device *mdc_obd; + int rc; + + LASSERT(tgt != NULL); + LASSERT(obd != NULL); + + mdc_obd = class_exp2obd(tgt->ltd_exp); + + if (mdc_obd) { + mdc_obd->obd_force = obd->obd_force; + mdc_obd->obd_fail = obd->obd_fail; + mdc_obd->obd_no_recov = obd->obd_no_recov; + } + + lmv_proc_dir = obd->obd_proc_private; + if (lmv_proc_dir) + lprocfs_remove_proc_entry(mdc_obd->obd_name, lmv_proc_dir); + + rc = obd_fid_fini(tgt->ltd_exp->exp_obd); + if (rc) + CERROR("Can't finalize fids factory\n"); + + CDEBUG(D_INFO, "Disconnected from %s(%s) successfully\n", + tgt->ltd_exp->exp_obd->obd_name, + tgt->ltd_exp->exp_obd->obd_uuid.uuid); + + obd_register_observer(tgt->ltd_exp->exp_obd, NULL); + rc = obd_disconnect(tgt->ltd_exp); + if (rc) { + if (tgt->ltd_active) { + 
CERROR("Target %s disconnect error %d\n", + tgt->ltd_uuid.uuid, rc); + } + } + + lmv_activate_target(lmv, tgt, 0); + tgt->ltd_exp = NULL; + return 0; +} + +static int lmv_disconnect(struct obd_export *exp) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lmv_obd *lmv = &obd->u.lmv; + int rc; + int i; + + if (!lmv->tgts) + goto out_local; + + /* + * Only disconnect the underlying layers on the final disconnect. + */ + lmv->refcount--; + if (lmv->refcount != 0) + goto out_local; + + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + if (lmv->tgts[i] == NULL || lmv->tgts[i]->ltd_exp == NULL) + continue; + + lmv_disconnect_mdc(obd, lmv->tgts[i]); + } + + if (obd->obd_proc_private) + lprocfs_remove((struct proc_dir_entry **)&obd->obd_proc_private); + else + CERROR("/proc/fs/lustre/%s/%s/target_obds missing\n", + obd->obd_type->typ_name, obd->obd_name); + +out_local: + /* + * This is the case when no real connection is established by + * lmv_check_connect(). + */ + if (!lmv->connected) + class_export_put(exp); + rc = class_disconnect(exp); + if (lmv->refcount == 0) + lmv->connected = 0; + return rc; +} + +static int lmv_fid2path(struct obd_export *exp, int len, void *karg, void *uarg) +{ + struct obd_device *obddev = class_exp2obd(exp); + struct lmv_obd *lmv = &obddev->u.lmv; + struct getinfo_fid2path *gf; + struct lmv_tgt_desc *tgt; + struct getinfo_fid2path *remote_gf = NULL; + int remote_gf_size = 0; + int rc; + + gf = (struct getinfo_fid2path *)karg; + tgt = lmv_find_target(lmv, &gf->gf_fid); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + +repeat_fid2path: + rc = obd_iocontrol(OBD_IOC_FID2PATH, tgt->ltd_exp, len, gf, uarg); + if (rc != 0 && rc != -EREMOTE) + goto out_fid2path; + + /* If remote_gf != NULL, it means just building the + * path on the remote MDT, copy this path segment to gf */ + if (remote_gf != NULL) { + struct getinfo_fid2path *ori_gf; + char *ptr; + + ori_gf = (struct getinfo_fid2path *)karg; + if (strlen(ori_gf->gf_path) + + strlen(gf->gf_path) > ori_gf->gf_pathlen) { + rc = -EOVERFLOW; + goto out_fid2path; + } + + ptr = ori_gf->gf_path; + + memmove(ptr + strlen(gf->gf_path) + 1, ptr, + strlen(ori_gf->gf_path)); + + strncpy(ptr, gf->gf_path, strlen(gf->gf_path)); + ptr += strlen(gf->gf_path); + *ptr = '/'; + } + + CDEBUG(D_INFO, "%s: get path %s "DFID" rec: %llu ln: %u\n", + tgt->ltd_exp->exp_obd->obd_name, + gf->gf_path, PFID(&gf->gf_fid), gf->gf_recno, + gf->gf_linkno); + + if (rc == 0) + goto out_fid2path; + + /* sigh, has to go to another MDT to do path building further */ + if (remote_gf == NULL) { + remote_gf_size = sizeof(*remote_gf) + PATH_MAX; + OBD_ALLOC(remote_gf, remote_gf_size); + if (remote_gf == NULL) { + rc = -ENOMEM; + goto out_fid2path; + } + remote_gf->gf_pathlen = PATH_MAX; + } + + if (!fid_is_sane(&gf->gf_fid)) { + CERROR("%s: invalid FID "DFID": rc = %d\n", + tgt->ltd_exp->exp_obd->obd_name, + PFID(&gf->gf_fid), -EINVAL); + rc = -EINVAL; + goto out_fid2path; + } + + tgt = lmv_find_target(lmv, &gf->gf_fid); + if (IS_ERR(tgt)) { + rc = -EINVAL; + goto out_fid2path; + } + + remote_gf->gf_fid = gf->gf_fid; + remote_gf->gf_recno = -1; + remote_gf->gf_linkno = -1; + memset(remote_gf->gf_path, 0, remote_gf->gf_pathlen); + gf = remote_gf; + goto repeat_fid2path; + +out_fid2path: + if (remote_gf != NULL) + OBD_FREE(remote_gf, remote_gf_size); + return rc; +} + +static int lmv_hsm_req_count(struct lmv_obd *lmv, + const struct hsm_user_request *hur, + const struct lmv_tgt_desc *tgt_mds) +{ + int i, nr = 0; + struct lmv_tgt_desc *curr_tgt; + + /* count how 
many requests must be sent to the given target */ + for (i = 0; i < hur->hur_request.hr_itemcount; i++) { + curr_tgt = lmv_find_target(lmv, &hur->hur_user_item[i].hui_fid); + if (obd_uuid_equals(&curr_tgt->ltd_uuid, &tgt_mds->ltd_uuid)) + nr++; + } + return nr; +} + +static void lmv_hsm_req_build(struct lmv_obd *lmv, + struct hsm_user_request *hur_in, + const struct lmv_tgt_desc *tgt_mds, + struct hsm_user_request *hur_out) +{ + int i, nr_out; + struct lmv_tgt_desc *curr_tgt; + + /* build the hsm_user_request for the given target */ + hur_out->hur_request = hur_in->hur_request; + nr_out = 0; + for (i = 0; i < hur_in->hur_request.hr_itemcount; i++) { + curr_tgt = lmv_find_target(lmv, + &hur_in->hur_user_item[i].hui_fid); + if (obd_uuid_equals(&curr_tgt->ltd_uuid, &tgt_mds->ltd_uuid)) { + hur_out->hur_user_item[nr_out] = + hur_in->hur_user_item[i]; + nr_out++; + } + } + hur_out->hur_request.hr_itemcount = nr_out; + memcpy(hur_data(hur_out), hur_data(hur_in), + hur_in->hur_request.hr_data_len); +} + +static int lmv_hsm_ct_unregister(struct lmv_obd *lmv, unsigned int cmd, int len, + struct lustre_kernelcomm *lk, void *uarg) +{ + int i, rc = 0; + + /* unregister request (call from llapi_hsm_copytool_fini) */ + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + /* best effort: try to clean as much as possible + * (continue on error) */ + obd_iocontrol(cmd, lmv->tgts[i]->ltd_exp, len, lk, uarg); + } + + /* Whatever the result, remove copytool from kuc groups. + * Unreached coordinators will get EPIPE on next requests + * and will unregister automatically. + */ + rc = libcfs_kkuc_group_rem(lk->lk_uid, lk->lk_group); + return rc; +} + +static int lmv_hsm_ct_register(struct lmv_obd *lmv, unsigned int cmd, int len, + struct lustre_kernelcomm *lk, void *uarg) +{ + struct file *filp; + int i, j, err; + int rc = 0; + bool any_set = false; + + /* All or nothing: try to register to all MDS. + * In case of failure, unregister from previous MDS, + * except if it because of inactive target. */ + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + err = obd_iocontrol(cmd, lmv->tgts[i]->ltd_exp, + len, lk, uarg); + if (err) { + if (lmv->tgts[i]->ltd_active) { + /* permanent error */ + CERROR("error: iocontrol MDC %s on MDTidx %d cmd %x: err = %d\n", + lmv->tgts[i]->ltd_uuid.uuid, + i, cmd, err); + rc = err; + lk->lk_flags |= LK_FLG_STOP; + /* unregister from previous MDS */ + for (j = 0; j < i; j++) + obd_iocontrol(cmd, + lmv->tgts[j]->ltd_exp, + len, lk, uarg); + return rc; + } + /* else: transient error. 
+ * kuc will register to the missing MDT + * when it is back */ + } else { + any_set = true; + } + } + + if (!any_set) + /* no registration done: return error */ + return -ENOTCONN; + + /* at least one registration done, with no failure */ + filp = fget(lk->lk_wfd); + if (filp == NULL) { + return -EBADF; + } + rc = libcfs_kkuc_group_add(filp, lk->lk_uid, lk->lk_group, lk->lk_data); + if (rc != 0 && filp != NULL) + fput(filp); + return rc; +} + + + + +static int lmv_iocontrol(unsigned int cmd, struct obd_export *exp, + int len, void *karg, void *uarg) +{ + struct obd_device *obddev = class_exp2obd(exp); + struct lmv_obd *lmv = &obddev->u.lmv; + int i = 0; + int rc = 0; + int set = 0; + int count = lmv->desc.ld_tgt_count; + + if (count == 0) + return -ENOTTY; + + switch (cmd) { + case IOC_OBD_STATFS: { + struct obd_ioctl_data *data = karg; + struct obd_device *mdc_obd; + struct obd_statfs stat_buf = {0}; + __u32 index; + + memcpy(&index, data->ioc_inlbuf2, sizeof(__u32)); + if (index >= count) + return -ENODEV; + + if (lmv->tgts[index] == NULL || + lmv->tgts[index]->ltd_active == 0) + return -ENODATA; + + mdc_obd = class_exp2obd(lmv->tgts[index]->ltd_exp); + if (!mdc_obd) + return -EINVAL; + + /* copy UUID */ + if (copy_to_user(data->ioc_pbuf2, obd2cli_tgt(mdc_obd), + min((int) data->ioc_plen2, + (int) sizeof(struct obd_uuid)))) + return -EFAULT; + + rc = obd_statfs(NULL, lmv->tgts[index]->ltd_exp, &stat_buf, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + 0); + if (rc) + return rc; + if (copy_to_user(data->ioc_pbuf1, &stat_buf, + min((int) data->ioc_plen1, + (int) sizeof(stat_buf)))) + return -EFAULT; + break; + } + case OBD_IOC_QUOTACTL: { + struct if_quotactl *qctl = karg; + struct lmv_tgt_desc *tgt = NULL; + struct obd_quotactl *oqctl; + + if (qctl->qc_valid == QC_MDTIDX) { + if (qctl->qc_idx < 0 || count <= qctl->qc_idx) + return -EINVAL; + + tgt = lmv->tgts[qctl->qc_idx]; + if (tgt == NULL || tgt->ltd_exp == NULL) + return -EINVAL; + } else if (qctl->qc_valid == QC_UUID) { + for (i = 0; i < count; i++) { + tgt = lmv->tgts[i]; + if (tgt == NULL) + continue; + if (!obd_uuid_equals(&tgt->ltd_uuid, + &qctl->obd_uuid)) + continue; + + if (tgt->ltd_exp == NULL) + return -EINVAL; + + break; + } + } else { + return -EINVAL; + } + + if (i >= count) + return -EAGAIN; + + LASSERT(tgt && tgt->ltd_exp); + OBD_ALLOC_PTR(oqctl); + if (!oqctl) + return -ENOMEM; + + QCTL_COPY(oqctl, qctl); + rc = obd_quotactl(tgt->ltd_exp, oqctl); + if (rc == 0) { + QCTL_COPY(qctl, oqctl); + qctl->qc_valid = QC_MDTIDX; + qctl->obd_uuid = tgt->ltd_uuid; + } + OBD_FREE_PTR(oqctl); + break; + } + case OBD_IOC_CHANGELOG_SEND: + case OBD_IOC_CHANGELOG_CLEAR: { + struct ioc_changelog *icc = karg; + + if (icc->icc_mdtindex >= count) + return -ENODEV; + + if (lmv->tgts[icc->icc_mdtindex] == NULL || + lmv->tgts[icc->icc_mdtindex]->ltd_exp == NULL || + lmv->tgts[icc->icc_mdtindex]->ltd_active == 0) + return -ENODEV; + rc = obd_iocontrol(cmd, lmv->tgts[icc->icc_mdtindex]->ltd_exp, + sizeof(*icc), icc, NULL); + break; + } + case LL_IOC_GET_CONNECT_FLAGS: { + if (lmv->tgts[0] == NULL) + return -ENODATA; + rc = obd_iocontrol(cmd, lmv->tgts[0]->ltd_exp, len, karg, uarg); + break; + } + case OBD_IOC_FID2PATH: { + rc = lmv_fid2path(exp, len, karg, uarg); + break; + } + case LL_IOC_HSM_STATE_GET: + case LL_IOC_HSM_STATE_SET: + case LL_IOC_HSM_ACTION: { + struct md_op_data *op_data = karg; + struct lmv_tgt_desc *tgt; + + tgt = lmv_find_target(lmv, &op_data->op_fid1); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + if (tgt->ltd_exp == 
NULL) + return -EINVAL; + + rc = obd_iocontrol(cmd, tgt->ltd_exp, len, karg, uarg); + break; + } + case LL_IOC_HSM_PROGRESS: { + const struct hsm_progress_kernel *hpk = karg; + struct lmv_tgt_desc *tgt; + + tgt = lmv_find_target(lmv, &hpk->hpk_fid); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + rc = obd_iocontrol(cmd, tgt->ltd_exp, len, karg, uarg); + break; + } + case LL_IOC_HSM_REQUEST: { + struct hsm_user_request *hur = karg; + struct lmv_tgt_desc *tgt; + unsigned int reqcount = hur->hur_request.hr_itemcount; + + if (reqcount == 0) + return 0; + + /* if the request is about a single fid + * or if there is a single MDS, no need to split + * the request. */ + if (reqcount == 1 || count == 1) { + tgt = lmv_find_target(lmv, + &hur->hur_user_item[0].hui_fid); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + rc = obd_iocontrol(cmd, tgt->ltd_exp, len, karg, uarg); + } else { + /* split fid list to their respective MDS */ + for (i = 0; i < count; i++) { + unsigned int nr, reqlen; + int rc1; + struct hsm_user_request *req; + + nr = lmv_hsm_req_count(lmv, hur, lmv->tgts[i]); + if (nr == 0) /* nothing for this MDS */ + continue; + + /* build a request with fids for this MDS */ + reqlen = offsetof(typeof(*hur), + hur_user_item[nr]) + + hur->hur_request.hr_data_len; + OBD_ALLOC_LARGE(req, reqlen); + if (req == NULL) + return -ENOMEM; + + lmv_hsm_req_build(lmv, hur, lmv->tgts[i], req); + + rc1 = obd_iocontrol(cmd, lmv->tgts[i]->ltd_exp, + reqlen, req, uarg); + if (rc1 != 0 && rc == 0) + rc = rc1; + OBD_FREE_LARGE(req, reqlen); + } + } + break; + } + case LL_IOC_LOV_SWAP_LAYOUTS: { + struct md_op_data *op_data = karg; + struct lmv_tgt_desc *tgt1, *tgt2; + + tgt1 = lmv_find_target(lmv, &op_data->op_fid1); + if (IS_ERR(tgt1)) + return PTR_ERR(tgt1); + + tgt2 = lmv_find_target(lmv, &op_data->op_fid2); + if (IS_ERR(tgt2)) + return PTR_ERR(tgt2); + + if ((tgt1->ltd_exp == NULL) || (tgt2->ltd_exp == NULL)) + return -EINVAL; + + /* only files on same MDT can have their layouts swapped */ + if (tgt1->ltd_idx != tgt2->ltd_idx) + return -EPERM; + + rc = obd_iocontrol(cmd, tgt1->ltd_exp, len, karg, uarg); + break; + } + case LL_IOC_HSM_CT_START: { + struct lustre_kernelcomm *lk = karg; + if (lk->lk_flags & LK_FLG_STOP) + rc = lmv_hsm_ct_unregister(lmv, cmd, len, lk, uarg); + else + rc = lmv_hsm_ct_register(lmv, cmd, len, lk, uarg); + break; + } + default: + for (i = 0; i < count; i++) { + struct obd_device *mdc_obd; + int err; + + if (lmv->tgts[i] == NULL || + lmv->tgts[i]->ltd_exp == NULL) + continue; + /* ll_umount_begin() sets force flag but for lmv, not + * mdc. Let's pass it through */ + mdc_obd = class_exp2obd(lmv->tgts[i]->ltd_exp); + mdc_obd->obd_force = obddev->obd_force; + err = obd_iocontrol(cmd, lmv->tgts[i]->ltd_exp, len, + karg, uarg); + if (err == -ENODATA && cmd == OBD_IOC_POLL_QUOTACHECK) { + return err; + } else if (err) { + if (lmv->tgts[i]->ltd_active) { + CERROR("error: iocontrol MDC %s on MDTidx %d cmd %x: err = %d\n", + lmv->tgts[i]->ltd_uuid.uuid, + i, cmd, err); + if (!rc) + rc = err; + } + } else + set = 1; + } + if (!set && !rc) + rc = -EIO; + } + return rc; +} + +#if 0 +static int lmv_all_chars_policy(int count, const char *name, + int len) +{ + unsigned int c = 0; + + while (len > 0) + c += name[--len]; + c = c % count; + return c; +} + +static int lmv_nid_policy(struct lmv_obd *lmv) +{ + struct obd_import *imp; + __u32 id; + + /* + * XXX: To get nid we assume that underlying obd device is mdc. 
+ */ + imp = class_exp2cliimp(lmv->tgts[0].ltd_exp); + id = imp->imp_connection->c_self ^ (imp->imp_connection->c_self >> 32); + return id % lmv->desc.ld_tgt_count; +} + +static int lmv_choose_mds(struct lmv_obd *lmv, struct md_op_data *op_data, + enum placement_policy placement) +{ + switch (placement) { + case PLACEMENT_CHAR_POLICY: + return lmv_all_chars_policy(lmv->desc.ld_tgt_count, + op_data->op_name, + op_data->op_namelen); + case PLACEMENT_NID_POLICY: + return lmv_nid_policy(lmv); + + default: + break; + } + + CERROR("Unsupported placement policy %x\n", placement); + return -EINVAL; +} +#endif + +/** + * This is _inode_ placement policy function (not name). + */ +static int lmv_placement_policy(struct obd_device *obd, + struct md_op_data *op_data, u32 *mds) +{ + struct lmv_obd *lmv = &obd->u.lmv; + + LASSERT(mds != NULL); + + if (lmv->desc.ld_tgt_count == 1) { + *mds = 0; + return 0; + } + + /** + * If stripe_offset is provided during setdirstripe + * (setdirstripe -i xx), xx MDS will be chosen. + */ + if (op_data->op_cli_flags & CLI_SET_MEA) { + struct lmv_user_md *lum; + + lum = (struct lmv_user_md *)op_data->op_data; + if (lum->lum_type == LMV_STRIPE_TYPE && + lum->lum_stripe_offset != -1) { + if (lum->lum_stripe_offset >= lmv->desc.ld_tgt_count) { + CERROR("%s: Stripe_offset %d > MDT count %d: rc = %d\n", + obd->obd_name, + lum->lum_stripe_offset, + lmv->desc.ld_tgt_count, -ERANGE); + return -ERANGE; + } + *mds = lum->lum_stripe_offset; + return 0; + } + } + + /* Allocate new fid on target according to operation type and parent + * home mds. */ + *mds = op_data->op_mds; + return 0; +} + +int __lmv_fid_alloc(struct lmv_obd *lmv, struct lu_fid *fid, u32 mds) +{ + struct lmv_tgt_desc *tgt; + int rc; + + tgt = lmv_get_target(lmv, mds); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + /* + * New seq alloc and FLD setup should be atomic. Otherwise we may find + * on server that seq in new allocated fid is not yet known. + */ + mutex_lock(&tgt->ltd_fid_mutex); + + if (tgt->ltd_active == 0 || tgt->ltd_exp == NULL) { + rc = -ENODEV; + goto out; + } + + /* + * Asking underlaying tgt layer to allocate new fid. 
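+ *
+ * A positive return from obd_fid_alloc() is also treated as success
+ * below, once the fid passes the sanity check; holding ltd_fid_mutex
+ * across the call keeps the sequence allocation and the FLD setup
+ * atomic with respect to each other, as noted above.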
+ */ + rc = obd_fid_alloc(tgt->ltd_exp, fid, NULL); + if (rc > 0) { + LASSERT(fid_is_sane(fid)); + rc = 0; + } + +out: + mutex_unlock(&tgt->ltd_fid_mutex); + return rc; +} + +int lmv_fid_alloc(struct obd_export *exp, struct lu_fid *fid, + struct md_op_data *op_data) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lmv_obd *lmv = &obd->u.lmv; + u32 mds = 0; + int rc; + + LASSERT(op_data != NULL); + LASSERT(fid != NULL); + + rc = lmv_placement_policy(obd, op_data, &mds); + if (rc) { + CERROR("Can't get target for allocating fid, rc %d\n", + rc); + return rc; + } + + rc = __lmv_fid_alloc(lmv, fid, mds); + if (rc) { + CERROR("Can't alloc new fid, rc %d\n", rc); + return rc; + } + + return rc; +} + +static int lmv_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct lmv_obd *lmv = &obd->u.lmv; + struct lprocfs_static_vars lvars; + struct lmv_desc *desc; + int rc; + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) { + CERROR("LMV setup requires a descriptor\n"); + return -EINVAL; + } + + desc = (struct lmv_desc *)lustre_cfg_buf(lcfg, 1); + if (sizeof(*desc) > LUSTRE_CFG_BUFLEN(lcfg, 1)) { + CERROR("Lmv descriptor size wrong: %d > %d\n", + (int)sizeof(*desc), LUSTRE_CFG_BUFLEN(lcfg, 1)); + return -EINVAL; + } + + OBD_ALLOC(lmv->tgts, sizeof(*lmv->tgts) * 32); + if (lmv->tgts == NULL) + return -ENOMEM; + lmv->tgts_size = 32; + + obd_str2uuid(&lmv->desc.ld_uuid, desc->ld_uuid.uuid); + lmv->desc.ld_tgt_count = 0; + lmv->desc.ld_active_tgt_count = 0; + lmv->max_cookiesize = 0; + lmv->max_def_easize = 0; + lmv->max_easize = 0; + lmv->lmv_placement = PLACEMENT_CHAR_POLICY; + + spin_lock_init(&lmv->lmv_lock); + mutex_init(&lmv->init_mutex); + + lprocfs_lmv_init_vars(&lvars); + + lprocfs_obd_setup(obd, lvars.obd_vars); +#if defined (CONFIG_PROC_FS) + { + rc = lprocfs_seq_create(obd->obd_proc_entry, "target_obd", + 0444, &lmv_proc_target_fops, obd); + if (rc) + CWARN("%s: error adding LMV target_obd file: rc = %d\n", + obd->obd_name, rc); + } +#endif + rc = fld_client_init(&lmv->lmv_fld, obd->obd_name, + LUSTRE_CLI_FLD_HASH_DHT); + if (rc) { + CERROR("Can't init FLD, err %d\n", rc); + goto out; + } + + return 0; + +out: + return rc; +} + +static int lmv_cleanup(struct obd_device *obd) +{ + struct lmv_obd *lmv = &obd->u.lmv; + + fld_client_fini(&lmv->lmv_fld); + if (lmv->tgts != NULL) { + int i; + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + if (lmv->tgts[i] == NULL) + continue; + lmv_del_target(lmv, i); + } + OBD_FREE(lmv->tgts, sizeof(*lmv->tgts) * lmv->tgts_size); + lmv->tgts_size = 0; + } + return 0; +} + +static int lmv_process_config(struct obd_device *obd, u32 len, void *buf) +{ + struct lustre_cfg *lcfg = buf; + struct obd_uuid obd_uuid; + int gen; + __u32 index; + int rc; + + switch (lcfg->lcfg_command) { + case LCFG_ADD_MDC: + /* modify_mdc_tgts add 0:lustre-clilmv 1:lustre-MDT0000_UUID + * 2:0 3:1 4:lustre-MDT0000-mdc_UUID */ + if (LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(obd_uuid.uuid)) { + rc = -EINVAL; + goto out; + } + + obd_str2uuid(&obd_uuid, lustre_cfg_buf(lcfg, 1)); + + if (sscanf(lustre_cfg_buf(lcfg, 2), "%d", &index) != 1) { + rc = -EINVAL; + goto out; + } + if (sscanf(lustre_cfg_buf(lcfg, 3), "%d", &gen) != 1) { + rc = -EINVAL; + goto out; + } + rc = lmv_add_target(obd, &obd_uuid, index, gen); + goto out; + default: + CERROR("Unknown command: %d\n", lcfg->lcfg_command); + rc = -EINVAL; + goto out; + } +out: + return rc; +} + +static int lmv_statfs(const struct lu_env *env, struct obd_export *exp, + struct obd_statfs *osfs, __u64 max_age, __u32 flags) +{ + struct obd_device 
*obd = class_exp2obd(exp); + struct lmv_obd *lmv = &obd->u.lmv; + struct obd_statfs *temp; + int rc = 0; + int i; + + rc = lmv_check_connect(obd); + if (rc) + return rc; + + OBD_ALLOC(temp, sizeof(*temp)); + if (temp == NULL) + return -ENOMEM; + + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + if (lmv->tgts[i] == NULL || lmv->tgts[i]->ltd_exp == NULL) + continue; + + rc = obd_statfs(env, lmv->tgts[i]->ltd_exp, temp, + max_age, flags); + if (rc) { + CERROR("can't stat MDS #%d (%s), error %d\n", i, + lmv->tgts[i]->ltd_exp->exp_obd->obd_name, + rc); + goto out_free_temp; + } + + if (i == 0) { + *osfs = *temp; + /* If the statfs is from mount, it will needs + * retrieve necessary information from MDT0. + * i.e. mount does not need the merged osfs + * from all of MDT. + * And also clients can be mounted as long as + * MDT0 is in service*/ + if (flags & OBD_STATFS_FOR_MDT0) + goto out_free_temp; + } else { + osfs->os_bavail += temp->os_bavail; + osfs->os_blocks += temp->os_blocks; + osfs->os_ffree += temp->os_ffree; + osfs->os_files += temp->os_files; + } + } + +out_free_temp: + OBD_FREE(temp, sizeof(*temp)); + return rc; +} + +static int lmv_getstatus(struct obd_export *exp, + struct lu_fid *fid, + struct obd_capa **pc) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + int rc; + + rc = lmv_check_connect(obd); + if (rc) + return rc; + + rc = md_getstatus(lmv->tgts[0]->ltd_exp, fid, pc); + return rc; +} + +static int lmv_getxattr(struct obd_export *exp, const struct lu_fid *fid, + struct obd_capa *oc, u64 valid, const char *name, + const char *input, int input_size, int output_size, + int flags, struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + + rc = lmv_check_connect(obd); + if (rc) + return rc; + + tgt = lmv_find_target(lmv, fid); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + rc = md_getxattr(tgt->ltd_exp, fid, oc, valid, name, input, + input_size, output_size, flags, request); + + return rc; +} + +static int lmv_setxattr(struct obd_export *exp, const struct lu_fid *fid, + struct obd_capa *oc, u64 valid, const char *name, + const char *input, int input_size, int output_size, + int flags, __u32 suppgid, + struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + + rc = lmv_check_connect(obd); + if (rc) + return rc; + + tgt = lmv_find_target(lmv, fid); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + rc = md_setxattr(tgt->ltd_exp, fid, oc, valid, name, input, + input_size, output_size, flags, suppgid, + request); + + return rc; +} + +static int lmv_getattr(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + + rc = lmv_check_connect(obd); + if (rc) + return rc; + + tgt = lmv_find_target(lmv, &op_data->op_fid1); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + if (op_data->op_flags & MF_GET_MDT_IDX) { + op_data->op_mds = tgt->ltd_idx; + return 0; + } + + rc = md_getattr(tgt->ltd_exp, op_data, request); + + return rc; +} + +static int lmv_null_inode(struct obd_export *exp, const struct lu_fid *fid) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + int i; + int rc; + + rc = lmv_check_connect(obd); + if (rc) + return rc; + + CDEBUG(D_INODE, "CBDATA for "DFID"\n", PFID(fid)); + + /* + * With 
DNE every object can have two locks in different namespaces: + * lookup lock in space of MDT storing direntry and update/open lock in + * space of MDT storing inode. + */ + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + if (lmv->tgts[i] == NULL || lmv->tgts[i]->ltd_exp == NULL) + continue; + md_null_inode(lmv->tgts[i]->ltd_exp, fid); + } + + return 0; +} + +static int lmv_find_cbdata(struct obd_export *exp, const struct lu_fid *fid, + ldlm_iterator_t it, void *data) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + int i; + int rc; + + rc = lmv_check_connect(obd); + if (rc) + return rc; + + CDEBUG(D_INODE, "CBDATA for "DFID"\n", PFID(fid)); + + /* + * With DNE every object can have two locks in different namespaces: + * lookup lock in space of MDT storing direntry and update/open lock in + * space of MDT storing inode. + */ + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + if (lmv->tgts[i] == NULL || lmv->tgts[i]->ltd_exp == NULL) + continue; + rc = md_find_cbdata(lmv->tgts[i]->ltd_exp, fid, it, data); + if (rc) + return rc; + } + + return rc; +} + + +static int lmv_close(struct obd_export *exp, struct md_op_data *op_data, + struct md_open_data *mod, struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + + rc = lmv_check_connect(obd); + if (rc) + return rc; + + tgt = lmv_find_target(lmv, &op_data->op_fid1); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + CDEBUG(D_INODE, "CLOSE "DFID"\n", PFID(&op_data->op_fid1)); + rc = md_close(tgt->ltd_exp, op_data, mod, request); + return rc; +} + +struct lmv_tgt_desc +*lmv_locate_mds(struct lmv_obd *lmv, struct md_op_data *op_data, + struct lu_fid *fid) +{ + struct lmv_tgt_desc *tgt; + + tgt = lmv_find_target(lmv, fid); + if (IS_ERR(tgt)) + return tgt; + + op_data->op_mds = tgt->ltd_idx; + + return tgt; +} + +static int lmv_create(struct obd_export *exp, struct md_op_data *op_data, + const void *data, int datalen, int mode, __u32 uid, + __u32 gid, cfs_cap_t cap_effective, __u64 rdev, + struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + + rc = lmv_check_connect(obd); + if (rc) + return rc; + + if (!lmv->desc.ld_active_tgt_count) + return -EIO; + + tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + rc = lmv_fid_alloc(exp, &op_data->op_fid2, op_data); + if (rc) + return rc; + + CDEBUG(D_INODE, "CREATE '%*s' on "DFID" -> mds #%x\n", + op_data->op_namelen, op_data->op_name, PFID(&op_data->op_fid1), + op_data->op_mds); + + op_data->op_flags |= MF_MDC_CANCEL_FID1; + rc = md_create(tgt->ltd_exp, op_data, data, datalen, mode, uid, gid, + cap_effective, rdev, request); + + if (rc == 0) { + if (*request == NULL) + return rc; + CDEBUG(D_INODE, "Created - "DFID"\n", PFID(&op_data->op_fid2)); + } + return rc; +} + +static int lmv_done_writing(struct obd_export *exp, + struct md_op_data *op_data, + struct md_open_data *mod) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + + rc = lmv_check_connect(obd); + if (rc) + return rc; + + tgt = lmv_find_target(lmv, &op_data->op_fid1); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + rc = md_done_writing(tgt->ltd_exp, op_data, mod); + return rc; +} + +static int +lmv_enqueue_remote(struct obd_export *exp, struct ldlm_enqueue_info *einfo, + struct lookup_intent *it, struct 
md_op_data *op_data, + struct lustre_handle *lockh, void *lmm, int lmmsize, + __u64 extra_lock_flags) +{ + struct ptlrpc_request *req = it->d.lustre.it_data; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lustre_handle plock; + struct lmv_tgt_desc *tgt; + struct md_op_data *rdata; + struct lu_fid fid1; + struct mdt_body *body; + int rc = 0; + int pmode; + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + LASSERT(body != NULL); + + if (!(body->valid & OBD_MD_MDS)) + return 0; + + CDEBUG(D_INODE, "REMOTE_ENQUEUE '%s' on "DFID" -> "DFID"\n", + LL_IT2STR(it), PFID(&op_data->op_fid1), PFID(&body->fid1)); + + /* + * We got LOOKUP lock, but we really need attrs. + */ + pmode = it->d.lustre.it_lock_mode; + LASSERT(pmode != 0); + memcpy(&plock, lockh, sizeof(plock)); + it->d.lustre.it_lock_mode = 0; + it->d.lustre.it_data = NULL; + fid1 = body->fid1; + + ptlrpc_req_finished(req); + + tgt = lmv_find_target(lmv, &fid1); + if (IS_ERR(tgt)) { + rc = PTR_ERR(tgt); + goto out; + } + + OBD_ALLOC_PTR(rdata); + if (rdata == NULL) { + rc = -ENOMEM; + goto out; + } + + rdata->op_fid1 = fid1; + rdata->op_bias = MDS_CROSS_REF; + + rc = md_enqueue(tgt->ltd_exp, einfo, it, rdata, lockh, + lmm, lmmsize, NULL, extra_lock_flags); + OBD_FREE_PTR(rdata); +out: + ldlm_lock_decref(&plock, pmode); + return rc; +} + +static int +lmv_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo, + struct lookup_intent *it, struct md_op_data *op_data, + struct lustre_handle *lockh, void *lmm, int lmmsize, + struct ptlrpc_request **req, __u64 extra_lock_flags) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + + rc = lmv_check_connect(obd); + if (rc) + return rc; + + CDEBUG(D_INODE, "ENQUEUE '%s' on "DFID"\n", + LL_IT2STR(it), PFID(&op_data->op_fid1)); + + tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + CDEBUG(D_INODE, "ENQUEUE '%s' on "DFID" -> mds #%d\n", + LL_IT2STR(it), PFID(&op_data->op_fid1), tgt->ltd_idx); + + rc = md_enqueue(tgt->ltd_exp, einfo, it, op_data, lockh, + lmm, lmmsize, req, extra_lock_flags); + + if (rc == 0 && it && it->it_op == IT_OPEN) { + rc = lmv_enqueue_remote(exp, einfo, it, op_data, lockh, + lmm, lmmsize, extra_lock_flags); + } + return rc; +} + +static int +lmv_getattr_name(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + struct ptlrpc_request *req = NULL; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + struct mdt_body *body; + int rc; + + rc = lmv_check_connect(obd); + if (rc) + return rc; + + tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + CDEBUG(D_INODE, "GETATTR_NAME for %*s on "DFID" -> mds #%d\n", + op_data->op_namelen, op_data->op_name, PFID(&op_data->op_fid1), + tgt->ltd_idx); + + rc = md_getattr_name(tgt->ltd_exp, op_data, request); + if (rc != 0) + return rc; + + body = req_capsule_server_get(&(*request)->rq_pill, + &RMF_MDT_BODY); + LASSERT(body != NULL); + + if (body->valid & OBD_MD_MDS) { + struct lu_fid rid = body->fid1; + CDEBUG(D_INODE, "Request attrs for "DFID"\n", + PFID(&rid)); + + tgt = lmv_find_target(lmv, &rid); + if (IS_ERR(tgt)) { + ptlrpc_req_finished(*request); + return PTR_ERR(tgt); + } + + op_data->op_fid1 = rid; + op_data->op_valid |= OBD_MD_FLCROSSREF; + op_data->op_namelen = 0; + op_data->op_name = NULL; + rc = 
md_getattr_name(tgt->ltd_exp, op_data, &req); + ptlrpc_req_finished(*request); + *request = req; + } + + return rc; +} + +#define md_op_data_fid(op_data, fl) \ + (fl == MF_MDC_CANCEL_FID1 ? &op_data->op_fid1 : \ + fl == MF_MDC_CANCEL_FID2 ? &op_data->op_fid2 : \ + fl == MF_MDC_CANCEL_FID3 ? &op_data->op_fid3 : \ + fl == MF_MDC_CANCEL_FID4 ? &op_data->op_fid4 : \ + NULL) + +static int lmv_early_cancel(struct obd_export *exp, struct md_op_data *op_data, + int op_tgt, ldlm_mode_t mode, int bits, int flag) +{ + struct lu_fid *fid = md_op_data_fid(op_data, flag); + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + ldlm_policy_data_t policy = {{0}}; + int rc = 0; + + if (!fid_is_sane(fid)) + return 0; + + tgt = lmv_find_target(lmv, fid); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + if (tgt->ltd_idx != op_tgt) { + CDEBUG(D_INODE, "EARLY_CANCEL on "DFID"\n", PFID(fid)); + policy.l_inodebits.bits = bits; + rc = md_cancel_unused(tgt->ltd_exp, fid, &policy, + mode, LCF_ASYNC, NULL); + } else { + CDEBUG(D_INODE, + "EARLY_CANCEL skip operation target %d on "DFID"\n", + op_tgt, PFID(fid)); + op_data->op_flags |= flag; + rc = 0; + } + + return rc; +} + +/* + * llite passes fid of an target inode in op_data->op_fid1 and id of directory in + * op_data->op_fid2 + */ +static int lmv_link(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + + rc = lmv_check_connect(obd); + if (rc) + return rc; + + LASSERT(op_data->op_namelen != 0); + + CDEBUG(D_INODE, "LINK "DFID":%*s to "DFID"\n", + PFID(&op_data->op_fid2), op_data->op_namelen, + op_data->op_name, PFID(&op_data->op_fid1)); + + op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid()); + op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); + op_data->op_cap = cfs_curproc_cap_pack(); + tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid2); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + /* + * Cancel UPDATE lock on child (fid1). + */ + op_data->op_flags |= MF_MDC_CANCEL_FID2; + rc = lmv_early_cancel(exp, op_data, tgt->ltd_idx, LCK_EX, + MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID1); + if (rc != 0) + return rc; + + rc = md_link(tgt->ltd_exp, op_data, request); + + return rc; +} + +static int lmv_rename(struct obd_export *exp, struct md_op_data *op_data, + const char *old, int oldlen, const char *new, int newlen, + struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *src_tgt; + struct lmv_tgt_desc *tgt_tgt; + int rc; + + LASSERT(oldlen != 0); + + CDEBUG(D_INODE, "RENAME %*s in "DFID" to %*s in "DFID"\n", + oldlen, old, PFID(&op_data->op_fid1), + newlen, new, PFID(&op_data->op_fid2)); + + rc = lmv_check_connect(obd); + if (rc) + return rc; + + op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid()); + op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); + op_data->op_cap = cfs_curproc_cap_pack(); + src_tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); + if (IS_ERR(src_tgt)) + return PTR_ERR(src_tgt); + + tgt_tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid2); + if (IS_ERR(tgt_tgt)) + return PTR_ERR(tgt_tgt); + /* + * LOOKUP lock on src child (fid3) should also be cancelled for + * src_tgt in mdc_rename. 
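+ *
+ * For reference, the fids carried in md_op_data map as follows for a
+ * rename: op_fid1 is the source parent directory, op_fid2 the target
+ * parent directory, op_fid3 the source child and op_fid4 the existing
+ * target child (if any), which is why the flags and early
+ * cancellations below refer to FID1 through FID4.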
+ */ + op_data->op_flags |= MF_MDC_CANCEL_FID1 | MF_MDC_CANCEL_FID3; + + /* + * Cancel UPDATE locks on tgt parent (fid2), tgt_tgt is its + * own target. + */ + rc = lmv_early_cancel(exp, op_data, src_tgt->ltd_idx, + LCK_EX, MDS_INODELOCK_UPDATE, + MF_MDC_CANCEL_FID2); + + /* + * Cancel LOOKUP locks on tgt child (fid4) for parent tgt_tgt. + */ + if (rc == 0) { + rc = lmv_early_cancel(exp, op_data, src_tgt->ltd_idx, + LCK_EX, MDS_INODELOCK_LOOKUP, + MF_MDC_CANCEL_FID4); + } + + /* + * Cancel all the locks on tgt child (fid4). + */ + if (rc == 0) + rc = lmv_early_cancel(exp, op_data, src_tgt->ltd_idx, + LCK_EX, MDS_INODELOCK_FULL, + MF_MDC_CANCEL_FID4); + + if (rc == 0) + rc = md_rename(src_tgt->ltd_exp, op_data, old, oldlen, + new, newlen, request); + return rc; +} + +static int lmv_setattr(struct obd_export *exp, struct md_op_data *op_data, + void *ea, int ealen, void *ea2, int ea2len, + struct ptlrpc_request **request, + struct md_open_data **mod) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc = 0; + + rc = lmv_check_connect(obd); + if (rc) + return rc; + + CDEBUG(D_INODE, "SETATTR for "DFID", valid 0x%x\n", + PFID(&op_data->op_fid1), op_data->op_attr.ia_valid); + + op_data->op_flags |= MF_MDC_CANCEL_FID1; + tgt = lmv_find_target(lmv, &op_data->op_fid1); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + rc = md_setattr(tgt->ltd_exp, op_data, ea, ealen, ea2, + ea2len, request, mod); + + return rc; +} + +static int lmv_sync(struct obd_export *exp, const struct lu_fid *fid, + struct obd_capa *oc, struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + + rc = lmv_check_connect(obd); + if (rc) + return rc; + + tgt = lmv_find_target(lmv, fid); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + rc = md_sync(tgt->ltd_exp, fid, oc, request); + return rc; +} + +/* + * Adjust a set of pages, each page containing an array of lu_dirpages, + * so that each page can be used as a single logical lu_dirpage. + * + * A lu_dirpage is laid out as follows, where s = ldp_hash_start, + * e = ldp_hash_end, f = ldp_flags, p = padding, and each "ent" is a + * struct lu_dirent. It has size up to LU_PAGE_SIZE. The ldp_hash_end + * value is used as a cookie to request the next lu_dirpage in a + * directory listing that spans multiple pages (two in this example): + * ________ + * | | + * .|--------v------- -----. + * |s|e|f|p|ent|ent| ... |ent| + * '--|-------------- -----' Each CFS_PAGE contains a single + * '------. lu_dirpage. + * .---------v------- -----. + * |s|e|f|p|ent| 0 | ... | 0 | + * '----------------- -----' + * + * However, on hosts where the native VM page size (PAGE_CACHE_SIZE) is + * larger than LU_PAGE_SIZE, a single host page may contain multiple + * lu_dirpages. After reading the lu_dirpages from the MDS, the + * ldp_hash_end of the first lu_dirpage refers to the one immediately + * after it in the same CFS_PAGE (arrows simplified for brevity, but + * in general e0==s1, e1==s2, etc.): + * + * .-------------------- -----. + * |s0|e0|f0|p|ent|ent| ... |ent| + * |---v---------------- -----| + * |s1|e1|f1|p|ent|ent| ... |ent| + * |---v---------------- -----| Here, each CFS_PAGE contains + * ... multiple lu_dirpages. + * |---v---------------- -----| + * |s'|e'|f'|p|ent|ent| ... |ent| + * '---|---------------- -----' + * v + * .----------------------------. 
+ * | next CFS_PAGE | + * + * This structure is transformed into a single logical lu_dirpage as follows: + * + * - Replace e0 with e' so the request for the next lu_dirpage gets the page + * labeled 'next CFS_PAGE'. + * + * - Copy the LDF_COLLIDE flag from f' to f0 to correctly reflect whether + * a hash collision with the next page exists. + * + * - Adjust the lde_reclen of the ending entry of each lu_dirpage to span + * to the first entry of the next lu_dirpage. + */ +#if PAGE_CACHE_SIZE > LU_PAGE_SIZE +static void lmv_adjust_dirpages(struct page **pages, int ncfspgs, int nlupgs) +{ + int i; + + for (i = 0; i < ncfspgs; i++) { + struct lu_dirpage *dp = kmap(pages[i]); + struct lu_dirpage *first = dp; + struct lu_dirent *end_dirent = NULL; + struct lu_dirent *ent; + __u64 hash_end = dp->ldp_hash_end; + __u32 flags = dp->ldp_flags; + + while (--nlupgs > 0) { + ent = lu_dirent_start(dp); + for (end_dirent = ent; ent != NULL; + end_dirent = ent, ent = lu_dirent_next(ent)); + + /* Advance dp to next lu_dirpage. */ + dp = (struct lu_dirpage *)((char *)dp + LU_PAGE_SIZE); + + /* Check if we've reached the end of the CFS_PAGE. */ + if (!((unsigned long)dp & ~CFS_PAGE_MASK)) + break; + + /* Save the hash and flags of this lu_dirpage. */ + hash_end = dp->ldp_hash_end; + flags = dp->ldp_flags; + + /* Check if lu_dirpage contains no entries. */ + if (!end_dirent) + break; + + /* Enlarge the end entry lde_reclen from 0 to + * first entry of next lu_dirpage. */ + LASSERT(le16_to_cpu(end_dirent->lde_reclen) == 0); + end_dirent->lde_reclen = + cpu_to_le16((char *)(dp->ldp_entries) - + (char *)end_dirent); + } + + first->ldp_hash_end = hash_end; + first->ldp_flags &= ~cpu_to_le32(LDF_COLLIDE); + first->ldp_flags |= flags & cpu_to_le32(LDF_COLLIDE); + + kunmap(pages[i]); + } + LASSERTF(nlupgs == 0, "left = %d", nlupgs); +} +#else +#define lmv_adjust_dirpages(pages, ncfspgs, nlupgs) do {} while (0) +#endif /* PAGE_CACHE_SIZE > LU_PAGE_SIZE */ + +static int lmv_readpage(struct obd_export *exp, struct md_op_data *op_data, + struct page **pages, struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + __u64 offset = op_data->op_offset; + int rc; + int ncfspgs; /* pages read in PAGE_CACHE_SIZE */ + int nlupgs; /* pages read in LU_PAGE_SIZE */ + struct lmv_tgt_desc *tgt; + + rc = lmv_check_connect(obd); + if (rc) + return rc; + + CDEBUG(D_INODE, "READPAGE at %#llx from "DFID"\n", + offset, PFID(&op_data->op_fid1)); + + tgt = lmv_find_target(lmv, &op_data->op_fid1); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + rc = md_readpage(tgt->ltd_exp, op_data, pages, request); + if (rc != 0) + return rc; + + ncfspgs = ((*request)->rq_bulk->bd_nob_transferred + PAGE_CACHE_SIZE - 1) + >> PAGE_CACHE_SHIFT; + nlupgs = (*request)->rq_bulk->bd_nob_transferred >> LU_PAGE_SHIFT; + LASSERT(!((*request)->rq_bulk->bd_nob_transferred & ~LU_PAGE_MASK)); + LASSERT(ncfspgs > 0 && ncfspgs <= op_data->op_npages); + + CDEBUG(D_INODE, "read %d(%d)/%d pages\n", ncfspgs, nlupgs, + op_data->op_npages); + + lmv_adjust_dirpages(pages, ncfspgs, nlupgs); + + return rc; +} + +static int lmv_unlink(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt = NULL; + struct mdt_body *body; + int rc; + + rc = lmv_check_connect(obd); + if (rc) + return rc; +retry: + /* Send unlink requests to the MDT where the child is located */ + if 
(likely(!fid_is_zero(&op_data->op_fid2))) + tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid2); + else + tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid()); + op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); + op_data->op_cap = cfs_curproc_cap_pack(); + + /* + * If child's fid is given, cancel unused locks for it if it is from + * another export than parent. + * + * LOOKUP lock for child (fid3) should also be cancelled on parent + * tgt_tgt in mdc_unlink(). + */ + op_data->op_flags |= MF_MDC_CANCEL_FID1 | MF_MDC_CANCEL_FID3; + + /* + * Cancel FULL locks on child (fid3). + */ + rc = lmv_early_cancel(exp, op_data, tgt->ltd_idx, LCK_EX, + MDS_INODELOCK_FULL, MF_MDC_CANCEL_FID3); + + if (rc != 0) + return rc; + + CDEBUG(D_INODE, "unlink with fid="DFID"/"DFID" -> mds #%d\n", + PFID(&op_data->op_fid1), PFID(&op_data->op_fid2), tgt->ltd_idx); + + rc = md_unlink(tgt->ltd_exp, op_data, request); + if (rc != 0 && rc != -EREMOTE) + return rc; + + body = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + return -EPROTO; + + /* Not cross-ref case, just get out of here. */ + if (likely(!(body->valid & OBD_MD_MDS))) + return 0; + + CDEBUG(D_INODE, "%s: try unlink to another MDT for "DFID"\n", + exp->exp_obd->obd_name, PFID(&body->fid1)); + + /* This is a remote object, try remote MDT, Note: it may + * try more than 1 time here, Considering following case + * /mnt/lustre is root on MDT0, remote1 is on MDT1 + * 1. Initially A does not know where remote1 is, it send + * unlink RPC to MDT0, MDT0 return -EREMOTE, it will + * resend unlink RPC to MDT1 (retry 1st time). + * + * 2. During the unlink RPC in flight, + * client B mv /mnt/lustre/remote1 /mnt/lustre/remote2 + * and create new remote1, but on MDT0 + * + * 3. MDT1 get unlink RPC(from A), then do remote lock on + * /mnt/lustre, then lookup get fid of remote1, and find + * it is remote dir again, and replay -EREMOTE again. + * + * 4. Then A will resend unlink RPC to MDT0. (retry 2nd times). + * + * In theory, it might try unlimited time here, but it should + * be very rare case. */ + op_data->op_fid2 = body->fid1; + ptlrpc_req_finished(*request); + *request = NULL; + + goto retry; +} + +static int lmv_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage) +{ + struct lmv_obd *lmv = &obd->u.lmv; + + switch (stage) { + case OBD_CLEANUP_EARLY: + /* XXX: here should be calling obd_precleanup() down to + * stack. */ + break; + case OBD_CLEANUP_EXPORTS: + fld_client_proc_fini(&lmv->lmv_fld); + lprocfs_obd_cleanup(obd); + break; + default: + break; + } + return 0; +} + +static int lmv_get_info(const struct lu_env *env, struct obd_export *exp, + __u32 keylen, void *key, __u32 *vallen, void *val, + struct lov_stripe_md *lsm) +{ + struct obd_device *obd; + struct lmv_obd *lmv; + int rc = 0; + + obd = class_exp2obd(exp); + if (obd == NULL) { + CDEBUG(D_IOCTL, "Invalid client cookie %#llx\n", + exp->exp_handle.h_cookie); + return -EINVAL; + } + + lmv = &obd->u.lmv; + if (keylen >= strlen("remote_flag") && !strcmp(key, "remote_flag")) { + struct lmv_tgt_desc *tgt; + int i; + + rc = lmv_check_connect(obd); + if (rc) + return rc; + + LASSERT(*vallen == sizeof(__u32)); + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + tgt = lmv->tgts[i]; + /* + * All tgts should be connected when this gets called. 
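+ *
+ * Targets that are not (yet) connected are simply skipped; the loop
+ * returns success as soon as one target answers the "remote_flag"
+ * query and falls back to -EINVAL only if none of them does.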
+ */ + if (tgt == NULL || tgt->ltd_exp == NULL) + continue; + + if (!obd_get_info(env, tgt->ltd_exp, keylen, key, + vallen, val, NULL)) + return 0; + } + return -EINVAL; + } else if (KEY_IS(KEY_MAX_EASIZE) || + KEY_IS(KEY_DEFAULT_EASIZE) || + KEY_IS(KEY_MAX_COOKIESIZE) || + KEY_IS(KEY_DEFAULT_COOKIESIZE) || + KEY_IS(KEY_CONN_DATA)) { + rc = lmv_check_connect(obd); + if (rc) + return rc; + + /* + * Forwarding this request to first MDS, it should know LOV + * desc. + */ + rc = obd_get_info(env, lmv->tgts[0]->ltd_exp, keylen, key, + vallen, val, NULL); + if (!rc && KEY_IS(KEY_CONN_DATA)) + exp->exp_connect_data = *(struct obd_connect_data *)val; + return rc; + } else if (KEY_IS(KEY_TGT_COUNT)) { + *((int *)val) = lmv->desc.ld_tgt_count; + return 0; + } + + CDEBUG(D_IOCTL, "Invalid key\n"); + return -EINVAL; +} + +static int lmv_set_info_async(const struct lu_env *env, struct obd_export *exp, + u32 keylen, void *key, u32 vallen, + void *val, struct ptlrpc_request_set *set) +{ + struct lmv_tgt_desc *tgt; + struct obd_device *obd; + struct lmv_obd *lmv; + int rc = 0; + + obd = class_exp2obd(exp); + if (obd == NULL) { + CDEBUG(D_IOCTL, "Invalid client cookie %#llx\n", + exp->exp_handle.h_cookie); + return -EINVAL; + } + lmv = &obd->u.lmv; + + if (KEY_IS(KEY_READ_ONLY) || KEY_IS(KEY_FLUSH_CTX)) { + int i, err = 0; + + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + tgt = lmv->tgts[i]; + + if (tgt == NULL || tgt->ltd_exp == NULL) + continue; + + err = obd_set_info_async(env, tgt->ltd_exp, + keylen, key, vallen, val, set); + if (err && rc == 0) + rc = err; + } + + return rc; + } + + return -EINVAL; +} + +static int lmv_packmd(struct obd_export *exp, struct lov_mds_md **lmmp, + struct lov_stripe_md *lsm) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_stripe_md *meap; + struct lmv_stripe_md *lsmp; + int mea_size; + int i; + + mea_size = lmv_get_easize(lmv); + if (!lmmp) + return mea_size; + + if (*lmmp && !lsm) { + OBD_FREE_LARGE(*lmmp, mea_size); + *lmmp = NULL; + return 0; + } + + if (*lmmp == NULL) { + OBD_ALLOC_LARGE(*lmmp, mea_size); + if (*lmmp == NULL) + return -ENOMEM; + } + + if (!lsm) + return mea_size; + + lsmp = (struct lmv_stripe_md *)lsm; + meap = (struct lmv_stripe_md *)*lmmp; + + if (lsmp->mea_magic != MEA_MAGIC_LAST_CHAR && + lsmp->mea_magic != MEA_MAGIC_ALL_CHARS) + return -EINVAL; + + meap->mea_magic = cpu_to_le32(lsmp->mea_magic); + meap->mea_count = cpu_to_le32(lsmp->mea_count); + meap->mea_master = cpu_to_le32(lsmp->mea_master); + + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + meap->mea_ids[i] = lsmp->mea_ids[i]; + fid_cpu_to_le(&meap->mea_ids[i], &lsmp->mea_ids[i]); + } + + return mea_size; +} + +static int lmv_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp, + struct lov_mds_md *lmm, int lmm_size) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lmv_stripe_md **tmea = (struct lmv_stripe_md **)lsmp; + struct lmv_stripe_md *mea = (struct lmv_stripe_md *)lmm; + struct lmv_obd *lmv = &obd->u.lmv; + int mea_size; + int i; + __u32 magic; + + mea_size = lmv_get_easize(lmv); + if (lsmp == NULL) + return mea_size; + + if (*lsmp != NULL && lmm == NULL) { + OBD_FREE_LARGE(*tmea, mea_size); + *lsmp = NULL; + return 0; + } + + LASSERT(mea_size == lmm_size); + + OBD_ALLOC_LARGE(*tmea, mea_size); + if (*tmea == NULL) + return -ENOMEM; + + if (!lmm) + return mea_size; + + if (mea->mea_magic == MEA_MAGIC_LAST_CHAR || + mea->mea_magic == MEA_MAGIC_ALL_CHARS || + mea->mea_magic == MEA_MAGIC_HASH_SEGMENT) { + magic = 
le32_to_cpu(mea->mea_magic);
+ } else {
+ /*
+ * Old MEA formats are not handled here.
+ */
+ CERROR("unsupported old MEA format found\n");
+ LBUG();
+ }
+
+ (*tmea)->mea_magic = magic;
+ (*tmea)->mea_count = le32_to_cpu(mea->mea_count);
+ (*tmea)->mea_master = le32_to_cpu(mea->mea_master);
+
+ for (i = 0; i < (*tmea)->mea_count; i++) {
+ (*tmea)->mea_ids[i] = mea->mea_ids[i];
+ fid_le_to_cpu(&(*tmea)->mea_ids[i], &(*tmea)->mea_ids[i]);
+ }
+ return mea_size;
+}
+
+static int lmv_cancel_unused(struct obd_export *exp, const struct lu_fid *fid,
+ ldlm_policy_data_t *policy, ldlm_mode_t mode,
+ ldlm_cancel_flags_t flags, void *opaque)
+{
+ struct obd_device *obd = exp->exp_obd;
+ struct lmv_obd *lmv = &obd->u.lmv;
+ int rc = 0;
+ int err;
+ int i;
+
+ LASSERT(fid != NULL);
+
+ for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+ if (lmv->tgts[i] == NULL || lmv->tgts[i]->ltd_exp == NULL ||
+ lmv->tgts[i]->ltd_active == 0)
+ continue;
+
+ err = md_cancel_unused(lmv->tgts[i]->ltd_exp, fid,
+ policy, mode, flags, opaque);
+ if (!rc)
+ rc = err;
+ }
+ return rc;
+}
+
+static int lmv_set_lock_data(struct obd_export *exp, __u64 *lockh, void *data,
+ __u64 *bits)
+{
+ struct lmv_obd *lmv = &exp->exp_obd->u.lmv;
+ int rc;
+
+ rc = md_set_lock_data(lmv->tgts[0]->ltd_exp, lockh, data, bits);
+ return rc;
+}
+
+static ldlm_mode_t lmv_lock_match(struct obd_export *exp, __u64 flags,
+ const struct lu_fid *fid, ldlm_type_t type,
+ ldlm_policy_data_t *policy, ldlm_mode_t mode,
+ struct lustre_handle *lockh)
+{
+ struct obd_device *obd = exp->exp_obd;
+ struct lmv_obd *lmv = &obd->u.lmv;
+ ldlm_mode_t rc;
+ int i;
+
+ CDEBUG(D_INODE, "Lock match for "DFID"\n", PFID(fid));
+
+ /*
+ * With DNE every object can have two locks in different namespaces:
+ * a lookup lock in the space of the MDT storing the direntry and an
+ * update/open lock in the space of the MDT storing the inode. Thus
+ * we check all targets, not only the one the fid was created on.
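+ *
+ * The first non-zero mode returned by md_lock_match() wins; a return
+ * of 0 from this function therefore means no target had a matching
+ * lock.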
+ */ + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + if (lmv->tgts[i] == NULL || + lmv->tgts[i]->ltd_exp == NULL || + lmv->tgts[i]->ltd_active == 0) + continue; + + rc = md_lock_match(lmv->tgts[i]->ltd_exp, flags, fid, + type, policy, mode, lockh); + if (rc) + return rc; + } + + return 0; +} + +static int lmv_get_lustre_md(struct obd_export *exp, + struct ptlrpc_request *req, + struct obd_export *dt_exp, + struct obd_export *md_exp, + struct lustre_md *md) +{ + struct lmv_obd *lmv = &exp->exp_obd->u.lmv; + + return md_get_lustre_md(lmv->tgts[0]->ltd_exp, req, dt_exp, md_exp, md); +} + +static int lmv_free_lustre_md(struct obd_export *exp, struct lustre_md *md) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + + if (md->mea) + obd_free_memmd(exp, (void *)&md->mea); + return md_free_lustre_md(lmv->tgts[0]->ltd_exp, md); +} + +static int lmv_set_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och, + struct lookup_intent *it) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + + tgt = lmv_find_target(lmv, &och->och_fid); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + return md_set_open_replay_data(tgt->ltd_exp, och, it); +} + +static int lmv_clear_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + + tgt = lmv_find_target(lmv, &och->och_fid); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + return md_clear_open_replay_data(tgt->ltd_exp, och); +} + +static int lmv_get_remote_perm(struct obd_export *exp, + const struct lu_fid *fid, + struct obd_capa *oc, __u32 suppgid, + struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + + rc = lmv_check_connect(obd); + if (rc) + return rc; + + tgt = lmv_find_target(lmv, fid); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + rc = md_get_remote_perm(tgt->ltd_exp, fid, oc, suppgid, request); + return rc; +} + +static int lmv_renew_capa(struct obd_export *exp, struct obd_capa *oc, + renew_capa_cb_t cb) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + + rc = lmv_check_connect(obd); + if (rc) + return rc; + + tgt = lmv_find_target(lmv, &oc->c_capa.lc_fid); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + rc = md_renew_capa(tgt->ltd_exp, oc, cb); + return rc; +} + +static int lmv_unpack_capa(struct obd_export *exp, struct ptlrpc_request *req, + const struct req_msg_field *field, + struct obd_capa **oc) +{ + struct lmv_obd *lmv = &exp->exp_obd->u.lmv; + + return md_unpack_capa(lmv->tgts[0]->ltd_exp, req, field, oc); +} + +static int lmv_intent_getattr_async(struct obd_export *exp, + struct md_enqueue_info *minfo, + struct ldlm_enqueue_info *einfo) +{ + struct md_op_data *op_data = &minfo->mi_data; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt = NULL; + int rc; + + rc = lmv_check_connect(obd); + if (rc) + return rc; + + tgt = lmv_find_target(lmv, &op_data->op_fid1); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + rc = md_intent_getattr_async(tgt->ltd_exp, minfo, einfo); + return rc; +} + +static int lmv_revalidate_lock(struct obd_export *exp, struct lookup_intent *it, + struct lu_fid *fid, __u64 *bits) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc 
*tgt; + int rc; + + rc = lmv_check_connect(obd); + if (rc) + return rc; + + tgt = lmv_find_target(lmv, fid); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + rc = md_revalidate_lock(tgt->ltd_exp, it, fid, bits); + return rc; +} + +/** + * For lmv, only need to send request to master MDT, and the master MDT will + * process with other slave MDTs. The only exception is Q_GETOQUOTA for which + * we directly fetch data from the slave MDTs. + */ +static int lmv_quotactl(struct obd_device *unused, struct obd_export *exp, + struct obd_quotactl *oqctl) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt = lmv->tgts[0]; + int rc = 0, i; + __u64 curspace, curinodes; + + if (!lmv->desc.ld_tgt_count || !tgt->ltd_active) { + CERROR("master lmv inactive\n"); + return -EIO; + } + + if (oqctl->qc_cmd != Q_GETOQUOTA) { + rc = obd_quotactl(tgt->ltd_exp, oqctl); + return rc; + } + + curspace = curinodes = 0; + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + int err; + tgt = lmv->tgts[i]; + + if (tgt == NULL || tgt->ltd_exp == NULL || tgt->ltd_active == 0) + continue; + if (!tgt->ltd_active) { + CDEBUG(D_HA, "mdt %d is inactive.\n", i); + continue; + } + + err = obd_quotactl(tgt->ltd_exp, oqctl); + if (err) { + CERROR("getquota on mdt %d failed. %d\n", i, err); + if (!rc) + rc = err; + } else { + curspace += oqctl->qc_dqblk.dqb_curspace; + curinodes += oqctl->qc_dqblk.dqb_curinodes; + } + } + oqctl->qc_dqblk.dqb_curspace = curspace; + oqctl->qc_dqblk.dqb_curinodes = curinodes; + + return rc; +} + +static int lmv_quotacheck(struct obd_device *unused, struct obd_export *exp, + struct obd_quotactl *oqctl) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int i, rc = 0; + + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + int err; + tgt = lmv->tgts[i]; + if (tgt == NULL || tgt->ltd_exp == NULL || !tgt->ltd_active) { + CERROR("lmv idx %d inactive\n", i); + return -EIO; + } + + err = obd_quotacheck(tgt->ltd_exp, oqctl); + if (err && !rc) + rc = err; + } + + return rc; +} + +static struct obd_ops lmv_obd_ops = { + .o_owner = THIS_MODULE, + .o_setup = lmv_setup, + .o_cleanup = lmv_cleanup, + .o_precleanup = lmv_precleanup, + .o_process_config = lmv_process_config, + .o_connect = lmv_connect, + .o_disconnect = lmv_disconnect, + .o_statfs = lmv_statfs, + .o_get_info = lmv_get_info, + .o_set_info_async = lmv_set_info_async, + .o_packmd = lmv_packmd, + .o_unpackmd = lmv_unpackmd, + .o_notify = lmv_notify, + .o_get_uuid = lmv_get_uuid, + .o_iocontrol = lmv_iocontrol, + .o_quotacheck = lmv_quotacheck, + .o_quotactl = lmv_quotactl +}; + +static struct md_ops lmv_md_ops = { + .m_getstatus = lmv_getstatus, + .m_null_inode = lmv_null_inode, + .m_find_cbdata = lmv_find_cbdata, + .m_close = lmv_close, + .m_create = lmv_create, + .m_done_writing = lmv_done_writing, + .m_enqueue = lmv_enqueue, + .m_getattr = lmv_getattr, + .m_getxattr = lmv_getxattr, + .m_getattr_name = lmv_getattr_name, + .m_intent_lock = lmv_intent_lock, + .m_link = lmv_link, + .m_rename = lmv_rename, + .m_setattr = lmv_setattr, + .m_setxattr = lmv_setxattr, + .m_sync = lmv_sync, + .m_readpage = lmv_readpage, + .m_unlink = lmv_unlink, + .m_init_ea_size = lmv_init_ea_size, + .m_cancel_unused = lmv_cancel_unused, + .m_set_lock_data = lmv_set_lock_data, + .m_lock_match = lmv_lock_match, + .m_get_lustre_md = lmv_get_lustre_md, + .m_free_lustre_md = lmv_free_lustre_md, + .m_set_open_replay_data = lmv_set_open_replay_data, + 
.m_clear_open_replay_data = lmv_clear_open_replay_data, + .m_renew_capa = lmv_renew_capa, + .m_unpack_capa = lmv_unpack_capa, + .m_get_remote_perm = lmv_get_remote_perm, + .m_intent_getattr_async = lmv_intent_getattr_async, + .m_revalidate_lock = lmv_revalidate_lock +}; + +static int __init lmv_init(void) +{ + struct lprocfs_static_vars lvars; + int rc; + + lprocfs_lmv_init_vars(&lvars); + + rc = class_register_type(&lmv_obd_ops, &lmv_md_ops, + lvars.module_vars, LUSTRE_LMV_NAME, NULL); + return rc; +} + +static void lmv_exit(void) +{ + class_unregister_type(LUSTRE_LMV_NAME); +} + +MODULE_AUTHOR("Sun Microsystems, Inc. "); +MODULE_DESCRIPTION("Lustre Logical Metadata Volume OBD driver"); +MODULE_LICENSE("GPL"); + +module_init(lmv_init); +module_exit(lmv_exit); diff --git a/kernel/drivers/staging/lustre/lustre/lmv/lproc_lmv.c b/kernel/drivers/staging/lustre/lustre/lmv/lproc_lmv.c new file mode 100644 index 000000000..22e5c315f --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lmv/lproc_lmv.c @@ -0,0 +1,237 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include "../include/lprocfs_status.h" +#include "../include/obd_class.h" +#include "lmv_internal.h" + +static int lmv_numobd_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = (struct obd_device *)m->private; + struct lmv_desc *desc; + + LASSERT(dev != NULL); + desc = &dev->u.lmv.desc; + seq_printf(m, "%u\n", desc->ld_tgt_count); + return 0; +} +LPROC_SEQ_FOPS_RO(lmv_numobd); + +static const char *placement_name[] = { + [PLACEMENT_CHAR_POLICY] = "CHAR", + [PLACEMENT_NID_POLICY] = "NID", + [PLACEMENT_INVAL_POLICY] = "INVAL" +}; + +static enum placement_policy placement_name2policy(char *name, int len) +{ + int i; + + for (i = 0; i < PLACEMENT_MAX_POLICY; i++) { + if (!strncmp(placement_name[i], name, len)) + return i; + } + return PLACEMENT_INVAL_POLICY; +} + +static const char *placement_policy2name(enum placement_policy placement) +{ + LASSERT(placement < PLACEMENT_MAX_POLICY); + return placement_name[placement]; +} + +static int lmv_placement_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = (struct obd_device *)m->private; + struct lmv_obd *lmv; + + LASSERT(dev != NULL); + lmv = &dev->u.lmv; + seq_printf(m, "%s\n", placement_policy2name(lmv->lmv_placement)); + return 0; +} + +#define MAX_POLICY_STRING_SIZE 64 + +static ssize_t lmv_placement_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *dev = ((struct seq_file *)file->private_data)->private; + char dummy[MAX_POLICY_STRING_SIZE + 1]; + int len = count; + enum placement_policy policy; + struct lmv_obd *lmv; + + if (copy_from_user(dummy, buffer, MAX_POLICY_STRING_SIZE)) + return -EFAULT; + + LASSERT(dev != NULL); + lmv = &dev->u.lmv; + + if (len > MAX_POLICY_STRING_SIZE) + len = MAX_POLICY_STRING_SIZE; + + if (dummy[len - 1] == '\n') + len--; + dummy[len] = '\0'; + + policy = placement_name2policy(dummy, len); + if (policy != PLACEMENT_INVAL_POLICY) { + spin_lock(&lmv->lmv_lock); + lmv->lmv_placement = policy; + spin_unlock(&lmv->lmv_lock); + } else { + CERROR("Invalid placement policy \"%s\"!\n", dummy); + return -EINVAL; + } + return count; +} +LPROC_SEQ_FOPS(lmv_placement); + +static int lmv_activeobd_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = (struct obd_device *)m->private; + struct lmv_desc *desc; + + LASSERT(dev != NULL); + desc = &dev->u.lmv.desc; + seq_printf(m, "%u\n", desc->ld_active_tgt_count); + return 0; +} +LPROC_SEQ_FOPS_RO(lmv_activeobd); + +static int lmv_desc_uuid_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = (struct obd_device *)m->private; + struct lmv_obd *lmv; + + LASSERT(dev != NULL); + lmv = &dev->u.lmv; + seq_printf(m, "%s\n", lmv->desc.ld_uuid.uuid); + return 0; +} +LPROC_SEQ_FOPS_RO(lmv_desc_uuid); + +static void *lmv_tgt_seq_start(struct seq_file *p, loff_t *pos) +{ + struct obd_device *dev = p->private; + struct lmv_obd *lmv = &dev->u.lmv; + return (*pos >= lmv->desc.ld_tgt_count) ? NULL : lmv->tgts[*pos]; +} + +static void lmv_tgt_seq_stop(struct seq_file *p, void *v) +{ + return; +} + +static void *lmv_tgt_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + struct obd_device *dev = p->private; + struct lmv_obd *lmv = &dev->u.lmv; + ++*pos; + return (*pos >= lmv->desc.ld_tgt_count) ? 
NULL : lmv->tgts[*pos]; +} + +static int lmv_tgt_seq_show(struct seq_file *p, void *v) +{ + struct lmv_tgt_desc *tgt = v; + + if (tgt == NULL) + return 0; + seq_printf(p, "%d: %s %sACTIVE\n", + tgt->ltd_idx, tgt->ltd_uuid.uuid, + tgt->ltd_active ? "" : "IN"); + return 0; +} + +static struct seq_operations lmv_tgt_sops = { + .start = lmv_tgt_seq_start, + .stop = lmv_tgt_seq_stop, + .next = lmv_tgt_seq_next, + .show = lmv_tgt_seq_show, +}; + +static int lmv_target_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc; + + rc = seq_open(file, &lmv_tgt_sops); + if (rc) + return rc; + + seq = file->private_data; + seq->private = PDE_DATA(inode); + + return 0; +} + +LPROC_SEQ_FOPS_RO_TYPE(lmv, uuid); + +static struct lprocfs_vars lprocfs_lmv_obd_vars[] = { + { "numobd", &lmv_numobd_fops, NULL, 0 }, + { "placement", &lmv_placement_fops, NULL, 0 }, + { "activeobd", &lmv_activeobd_fops, NULL, 0 }, + { "uuid", &lmv_uuid_fops, NULL, 0 }, + { "desc_uuid", &lmv_desc_uuid_fops, NULL, 0 }, + { NULL } +}; + +LPROC_SEQ_FOPS_RO_TYPE(lmv, numrefs); + +static struct lprocfs_vars lprocfs_lmv_module_vars[] = { + { "num_refs", &lmv_numrefs_fops, NULL, 0 }, + { NULL } +}; + +struct file_operations lmv_proc_target_fops = { + .owner = THIS_MODULE, + .open = lmv_target_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +void lprocfs_lmv_init_vars(struct lprocfs_static_vars *lvars) +{ + lvars->module_vars = lprocfs_lmv_module_vars; + lvars->obd_vars = lprocfs_lmv_obd_vars; +} diff --git a/kernel/drivers/staging/lustre/lustre/lov/Makefile b/kernel/drivers/staging/lustre/lustre/lov/Makefile new file mode 100644 index 000000000..6fe56a24b --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lov/Makefile @@ -0,0 +1,6 @@ +obj-$(CONFIG_LUSTRE_FS) += lov.o +lov-y := lov_obd.o lov_pack.o lov_offset.o lov_merge.o \ + lov_request.o lov_ea.o lov_dev.o lov_object.o lov_page.o \ + lov_lock.o lov_io.o lovsub_dev.o lovsub_object.o lovsub_page.o \ + lovsub_lock.o lovsub_io.o lov_pool.o +lov-$(CONFIG_PROC_FS) += lproc_lov.o diff --git a/kernel/drivers/staging/lustre/lustre/lov/lov_cl_internal.h b/kernel/drivers/staging/lustre/lustre/lov/lov_cl_internal.h new file mode 100644 index 000000000..314ce8525 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lov/lov_cl_internal.h @@ -0,0 +1,839 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. 
+ * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Internal interfaces of LOV layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#ifndef LOV_CL_INTERNAL_H +#define LOV_CL_INTERNAL_H + +#include "../../include/linux/libcfs/libcfs.h" + +#include "../include/obd.h" +#include "../include/cl_object.h" +#include "lov_internal.h" + +/** \defgroup lov lov + * Logical object volume layer. This layer implements data striping (raid0). + * + * At the lov layer top-entity (object, page, lock, io) is connected to one or + * more sub-entities: top-object, representing a file is connected to a set of + * sub-objects, each representing a stripe, file-level top-lock is connected + * to a set of per-stripe sub-locks, top-page is connected to a (single) + * sub-page, and a top-level IO is connected to a set of (potentially + * concurrent) sub-IO's. + * + * Sub-object, sub-page, and sub-io have well-defined top-object and top-page + * respectively, while a single sub-lock can be part of multiple top-locks. + * + * Reference counting models are different for different types of entities: + * + * - top-object keeps a reference to its sub-objects, and destroys them + * when it is destroyed. + * + * - top-page keeps a reference to its sub-page, and destroys it when it + * is destroyed. + * + * - sub-lock keep a reference to its top-locks. Top-lock keeps a + * reference (and a hold, see cl_lock_hold()) on its sub-locks when it + * actively using them (that is, in cl_lock_state::CLS_QUEUING, + * cl_lock_state::CLS_ENQUEUED, cl_lock_state::CLS_HELD states). When + * moving into cl_lock_state::CLS_CACHED state, top-lock releases a + * hold. From this moment top-lock has only a 'weak' reference to its + * sub-locks. This reference is protected by top-lock + * cl_lock::cll_guard, and will be automatically cleared by the sub-lock + * when the latter is destroyed. When a sub-lock is canceled, a + * reference to it is removed from the top-lock array, and top-lock is + * moved into CLS_NEW state. It is guaranteed that all sub-locks exist + * while their top-lock is in CLS_HELD or CLS_CACHED states. + * + * - IO's are not reference counted. + * + * To implement a connection between top and sub entities, lov layer is split + * into two pieces: lov ("upper half"), and lovsub ("bottom half"), both + * implementing full set of cl-interfaces. For example, top-object has vvp and + * lov layers, and it's sub-object has lovsub and osc layers. lovsub layer is + * used to track child-parent relationship. + * + * @{ + */ + +struct lovsub_device; +struct lovsub_object; +struct lovsub_lock; + +enum lov_device_flags { + LOV_DEV_INITIALIZED = 1 << 0 +}; + +/* + * Upper half. + */ + +/** + * Resources that are used in memory-cleaning path, and whose allocation + * cannot fail even when memory is tight. They are preallocated in sufficient + * quantities in lov_device::ld_emerg[], and access to them is serialized + * lov_device::ld_mutex. + */ +struct lov_device_emerg { + /** + * Page list used to submit IO when memory is in pressure. + */ + struct cl_page_list emrg_page_list; + /** + * sub-io's shared by all threads accessing this device when memory is + * too low to allocate sub-io's dynamically. 
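+ *
+ * Allocating a fresh cl_io while the kernel is trying to free memory
+ * could itself require memory, so the memory-cleaning path reuses
+ * this preallocated sub-io (serialized by lov_device::ld_mutex)
+ * instead.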
+ */ + struct cl_io emrg_subio; + /** + * Environments used by sub-io's in + * lov_device_emerg::emrg_subio. + */ + struct lu_env *emrg_env; + /** + * Refchecks for lov_device_emerg::emrg_env. + * + * \see cl_env_get() + */ + int emrg_refcheck; +}; + +struct lov_device { + /* + * XXX Locking of lov-private data is missing. + */ + struct cl_device ld_cl; + struct lov_obd *ld_lov; + /** size of lov_device::ld_target[] array */ + __u32 ld_target_nr; + struct lovsub_device **ld_target; + __u32 ld_flags; + + /** Emergency resources used in memory-cleansing paths. */ + struct lov_device_emerg **ld_emrg; + /** + * Serializes access to lov_device::ld_emrg in low-memory + * conditions. + */ + struct mutex ld_mutex; +}; + +/** + * Layout type. + */ +enum lov_layout_type { + LLT_EMPTY, /** empty file without body (mknod + truncate) */ + LLT_RAID0, /** striped file */ + LLT_RELEASED, /** file with no objects (data in HSM) */ + LLT_NR +}; + +static inline char *llt2str(enum lov_layout_type llt) +{ + switch (llt) { + case LLT_EMPTY: + return "EMPTY"; + case LLT_RAID0: + return "RAID0"; + case LLT_RELEASED: + return "RELEASED"; + case LLT_NR: + LBUG(); + } + LBUG(); + return ""; +} + +/** + * lov-specific file state. + * + * lov object has particular layout type, determining how top-object is built + * on top of sub-objects. Layout type can change dynamically. When this + * happens, lov_object::lo_type_guard semaphore is taken in exclusive mode, + * all state pertaining to the old layout type is destroyed, and new state is + * constructed. All object methods take said semaphore in the shared mode, + * providing serialization against transition between layout types. + * + * To avoid multiple `if' or `switch' statements, selecting behavior for the + * current layout type, object methods perform double-dispatch, invoking + * function corresponding to the current layout type. + */ +struct lov_object { + struct cl_object lo_cl; + /** + * Serializes object operations with transitions between layout types. + * + * This semaphore is taken in shared mode by all object methods, and + * is taken in exclusive mode when object type is changed. + * + * \see lov_object::lo_type + */ + struct rw_semaphore lo_type_guard; + /** + * Type of an object. Protected by lov_object::lo_type_guard. + */ + enum lov_layout_type lo_type; + /** + * True if layout is invalid. This bit is cleared when layout lock + * is lost. + */ + bool lo_layout_invalid; + /** + * How many IOs are on going on this object. Layout can be changed + * only if there is no active IO. + */ + atomic_t lo_active_ios; + /** + * Waitq - wait for no one else is using lo_lsm + */ + wait_queue_head_t lo_waitq; + /** + * Layout metadata. NULL if empty layout. + */ + struct lov_stripe_md *lo_lsm; + + union lov_layout_state { + struct lov_layout_raid0 { + unsigned lo_nr; + /** + * When this is true, lov_object::lo_attr contains + * valid up to date attributes for a top-level + * object. This field is reset to 0 when attributes of + * any sub-object change. + */ + int lo_attr_valid; + /** + * Array of sub-objects. Allocated when top-object is + * created (lov_init_raid0()). + * + * Top-object is a strict master of its sub-objects: + * it is created before them, and outlives its + * children (this later is necessary so that basic + * functions like cl_object_top() always + * work). Top-object keeps a reference on every + * sub-object. 
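+ *
+ * For example, a file striped over three stripes has lo_nr == 3
+ * and three lovsub_object pointers in this array.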
+ * + * When top-object is destroyed (lov_delete_raid0()) + * it releases its reference to a sub-object and waits + * until the latter is finally destroyed. + */ + struct lovsub_object **lo_sub; + /** + * protect lo_sub + */ + spinlock_t lo_sub_lock; + /** + * Cached object attribute, built from sub-object + * attributes. + */ + struct cl_attr lo_attr; + } raid0; + struct lov_layout_state_empty { + } empty; + struct lov_layout_state_released { + } released; + } u; + /** + * Thread that acquired lov_object::lo_type_guard in an exclusive + * mode. + */ + struct task_struct *lo_owner; +}; + +/** + * Flags that top-lock can set on each of its sub-locks. + */ +enum lov_sub_flags { + /** Top-lock acquired a hold (cl_lock_hold()) on a sub-lock. */ + LSF_HELD = 1 << 0 +}; + +/** + * State lov_lock keeps for each sub-lock. + */ +struct lov_lock_sub { + /** sub-lock itself */ + struct lovsub_lock *sub_lock; + /** An array of per-sub-lock flags, taken from enum lov_sub_flags */ + unsigned sub_flags; + int sub_stripe; + struct cl_lock_descr sub_descr; + struct cl_lock_descr sub_got; +}; + +/** + * lov-specific lock state. + */ +struct lov_lock { + struct cl_lock_slice lls_cl; + /** Number of sub-locks in this lock */ + int lls_nr; + /** + * Number of existing sub-locks. + */ + unsigned lls_nr_filled; + /** + * Set when sub-lock was canceled, while top-lock was being + * used, or unused. + */ + unsigned int lls_cancel_race:1; + /** + * An array of sub-locks + * + * There are two issues with managing sub-locks: + * + * - sub-locks are concurrently canceled, and + * + * - sub-locks are shared with other top-locks. + * + * To manage cancellation, top-lock acquires a hold on a sublock + * (lov_sublock_adopt()) when the latter is inserted into + * lov_lock::lls_sub[]. This hold is released (lov_sublock_release()) + * when top-lock is going into CLS_CACHED state or destroyed. Hold + * prevents sub-lock from cancellation. + * + * Sub-lock sharing means, among other things, that top-lock that is + * in the process of creation (i.e., not yet inserted into lock list) + * is already accessible to other threads once at least one of its + * sub-locks is created, see lov_lock_sub_init(). + * + * Sub-lock can be in one of the following states: + * + * - doesn't exist, lov_lock::lls_sub[]::sub_lock == NULL. Such + * sub-lock was either never created (top-lock is in CLS_NEW + * state), or it was created, then canceled, then destroyed + * (lov_lock_unlink() cleared sub-lock pointer in the top-lock). + * + * - sub-lock exists and is on + * hold. (lov_lock::lls_sub[]::sub_flags & LSF_HELD). This is a + * normal state of a sub-lock in CLS_HELD and CLS_CACHED states + * of a top-lock. + * + * - sub-lock exists, but is not held by the top-lock. This + * happens after top-lock released a hold on sub-locks before + * going into cache (lov_lock_unuse()). + * + * \todo To support wide-striping, array has to be replaced with a set + * of queues to avoid scanning. + */ + struct lov_lock_sub *lls_sub; + /** + * Original description with which lock was enqueued. + */ + struct cl_lock_descr lls_orig; +}; + +struct lov_page { + struct cl_page_slice lps_cl; + int lps_invalid; +}; + +/* + * Bottom half. + */ + +struct lovsub_device { + struct cl_device acid_cl; + struct lov_device *acid_super; + int acid_idx; + struct cl_device *acid_next; +}; + +struct lovsub_object { + struct cl_object_header lso_header; + struct cl_object lso_cl; + struct lov_object *lso_super; + int lso_index; +}; + +/** + * A link between a top-lock and a sub-lock. 
Separate data-structure is + * necessary, because top-locks and sub-locks are in M:N relationship. + * + * \todo This can be optimized for a (by far) most frequent case of a single + * top-lock per sub-lock. + */ +struct lov_lock_link { + struct lov_lock *lll_super; + /** An index within parent lock. */ + int lll_idx; + /** + * A linkage into per sub-lock list of all corresponding top-locks, + * hanging off lovsub_lock::lss_parents. + */ + struct list_head lll_list; +}; + +/** + * Lock state at lovsub layer. + */ +struct lovsub_lock { + struct cl_lock_slice lss_cl; + /** + * List of top-locks that have given sub-lock as their part. Protected + * by cl_lock::cll_guard mutex. + */ + struct list_head lss_parents; + /** + * Top-lock that initiated current operation on this sub-lock. This is + * only set during top-to-bottom lock operations like enqueue, and is + * used to optimize state change notification. Protected by + * cl_lock::cll_guard mutex. + * + * \see lovsub_lock_state_one(). + */ + struct cl_lock *lss_active; +}; + +/** + * Describe the environment settings for sublocks. + */ +struct lov_sublock_env { + const struct lu_env *lse_env; + struct cl_io *lse_io; + struct lov_io_sub *lse_sub; +}; + +struct lovsub_page { + struct cl_page_slice lsb_cl; +}; + + +struct lov_thread_info { + struct cl_object_conf lti_stripe_conf; + struct lu_fid lti_fid; + struct cl_lock_descr lti_ldescr; + struct ost_lvb lti_lvb; + struct cl_2queue lti_cl2q; + struct cl_lock_closure lti_closure; + wait_queue_t lti_waiter; +}; + +/** + * State that lov_io maintains for every sub-io. + */ +struct lov_io_sub { + int sub_stripe; + /** + * sub-io for a stripe. Ideally sub-io's can be stopped and resumed + * independently, with lov acting as a scheduler to maximize overall + * throughput. + */ + struct cl_io *sub_io; + /** + * Linkage into a list (hanging off lov_io::lis_active) of all + * sub-io's active for the current IO iteration. + */ + struct list_head sub_linkage; + /** + * true, iff cl_io_init() was successfully executed against + * lov_io_sub::sub_io. + */ + int sub_io_initialized; + /** + * True, iff lov_io_sub::sub_io and lov_io_sub::sub_env weren't + * allocated, but borrowed from a per-device emergency pool. + */ + int sub_borrowed; + /** + * environment, in which sub-io executes. + */ + struct lu_env *sub_env; + /** + * environment's refcheck. + * + * \see cl_env_get() + */ + int sub_refcheck; + int sub_refcheck2; + int sub_reenter; + void *sub_cookie; +}; + +/** + * IO state private for LOV. + */ +struct lov_io { + /** super-class */ + struct cl_io_slice lis_cl; + /** + * Pointer to the object slice. This is a duplicate of + * lov_io::lis_cl::cis_object. + */ + struct lov_object *lis_object; + /** + * Original end-of-io position for this IO, set by the upper layer as + * cl_io::u::ci_rw::pos + cl_io::u::ci_rw::count. lov remembers this, + * changes pos and count to fit IO into a single stripe and uses saved + * value to determine when IO iterations have to stop. + * + * This is used only for CIT_READ and CIT_WRITE io's. + */ + loff_t lis_io_endpos; + + /** + * starting position within a file, for the current io loop iteration + * (stripe), used by ci_io_loop(). + */ + u64 lis_pos; + /** + * end position with in a file, for the current stripe io. This is + * exclusive (i.e., next offset after last byte affected by io). 
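+ *
+ * For example (illustrative): a 4096-byte write starting at file
+ * offset 0 has lis_pos == 0 and lis_endpos == 4096, while
+ * lis_io_endpos still holds the end position of the whole IO as
+ * issued by the upper layer, before it was clipped to one stripe.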
+ */ + u64 lis_endpos; + + int lis_mem_frozen; + int lis_stripe_count; + int lis_active_subios; + + /** + * the index of ls_single_subio in ls_subios array + */ + int lis_single_subio_index; + struct cl_io lis_single_subio; + + /** + * size of ls_subios array, actually the highest stripe # + */ + int lis_nr_subios; + struct lov_io_sub *lis_subs; + /** + * List of active sub-io's. + */ + struct list_head lis_active; +}; + +struct lov_session { + struct lov_io ls_io; + struct lov_sublock_env ls_subenv; +}; + +/** + * State of transfer for lov. + */ +struct lov_req { + struct cl_req_slice lr_cl; +}; + +/** + * State of transfer for lovsub. + */ +struct lovsub_req { + struct cl_req_slice lsrq_cl; +}; + +extern struct lu_device_type lov_device_type; +extern struct lu_device_type lovsub_device_type; + +extern struct lu_context_key lov_key; +extern struct lu_context_key lov_session_key; + +extern struct kmem_cache *lov_lock_kmem; +extern struct kmem_cache *lov_object_kmem; +extern struct kmem_cache *lov_thread_kmem; +extern struct kmem_cache *lov_session_kmem; +extern struct kmem_cache *lov_req_kmem; + +extern struct kmem_cache *lovsub_lock_kmem; +extern struct kmem_cache *lovsub_object_kmem; +extern struct kmem_cache *lovsub_req_kmem; + +extern struct kmem_cache *lov_lock_link_kmem; + +int lov_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf); +int lovsub_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf); +int lov_lock_init(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io); +int lov_io_init(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io); +int lovsub_lock_init(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io); + +int lov_lock_init_raid0(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io); +int lov_lock_init_empty(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io); +int lov_io_init_raid0(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io); +int lov_io_init_empty(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io); +int lov_io_init_released(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io); +void lov_lock_unlink(const struct lu_env *env, struct lov_lock_link *link, + struct lovsub_lock *sub); + +struct lov_io_sub *lov_sub_get(const struct lu_env *env, struct lov_io *lio, + int stripe); +void lov_sub_put(struct lov_io_sub *sub); +int lov_sublock_modify(const struct lu_env *env, struct lov_lock *lov, + struct lovsub_lock *sublock, + const struct cl_lock_descr *d, int idx); + + +int lov_page_init(const struct lu_env *env, struct cl_object *ob, + struct cl_page *page, struct page *vmpage); +int lovsub_page_init(const struct lu_env *env, struct cl_object *ob, + struct cl_page *page, struct page *vmpage); + +int lov_page_init_empty(const struct lu_env *env, + struct cl_object *obj, + struct cl_page *page, struct page *vmpage); +int lov_page_init_raid0(const struct lu_env *env, + struct cl_object *obj, + struct cl_page *page, struct page *vmpage); +struct lu_object *lov_object_alloc(const struct lu_env *env, + const struct lu_object_header *hdr, + struct lu_device *dev); +struct lu_object *lovsub_object_alloc(const struct lu_env *env, + const struct lu_object_header *hdr, + struct lu_device *dev); + +struct lov_lock_link *lov_lock_link_find(const struct 
lu_env *env, + struct lov_lock *lck, + struct lovsub_lock *sub); +struct lov_io_sub *lov_page_subio(const struct lu_env *env, + struct lov_io *lio, + const struct cl_page_slice *slice); + +void lov_lsm_decref(struct lov_object *lov, struct lov_stripe_md *lsm); +struct lov_stripe_md *lov_lsm_addref(struct lov_object *lov); + +#define lov_foreach_target(lov, var) \ + for (var = 0; var < lov_targets_nr(lov); ++var) + +/***************************************************************************** + * + * Type conversions. + * + * Accessors. + * + */ + +static inline struct lov_session *lov_env_session(const struct lu_env *env) +{ + struct lov_session *ses; + + ses = lu_context_key_get(env->le_ses, &lov_session_key); + LASSERT(ses != NULL); + return ses; +} + +static inline struct lov_io *lov_env_io(const struct lu_env *env) +{ + return &lov_env_session(env)->ls_io; +} + +static inline int lov_is_object(const struct lu_object *obj) +{ + return obj->lo_dev->ld_type == &lov_device_type; +} + +static inline int lovsub_is_object(const struct lu_object *obj) +{ + return obj->lo_dev->ld_type == &lovsub_device_type; +} + +static inline struct lu_device *lov2lu_dev(struct lov_device *lov) +{ + return &lov->ld_cl.cd_lu_dev; +} + +static inline struct lov_device *lu2lov_dev(const struct lu_device *d) +{ + LINVRNT(d->ld_type == &lov_device_type); + return container_of0(d, struct lov_device, ld_cl.cd_lu_dev); +} + +static inline struct cl_device *lovsub2cl_dev(struct lovsub_device *lovsub) +{ + return &lovsub->acid_cl; +} + +static inline struct lu_device *lovsub2lu_dev(struct lovsub_device *lovsub) +{ + return &lovsub2cl_dev(lovsub)->cd_lu_dev; +} + +static inline struct lovsub_device *lu2lovsub_dev(const struct lu_device *d) +{ + LINVRNT(d->ld_type == &lovsub_device_type); + return container_of0(d, struct lovsub_device, acid_cl.cd_lu_dev); +} + +static inline struct lovsub_device *cl2lovsub_dev(const struct cl_device *d) +{ + LINVRNT(d->cd_lu_dev.ld_type == &lovsub_device_type); + return container_of0(d, struct lovsub_device, acid_cl); +} + +static inline struct lu_object *lov2lu(struct lov_object *lov) +{ + return &lov->lo_cl.co_lu; +} + +static inline struct cl_object *lov2cl(struct lov_object *lov) +{ + return &lov->lo_cl; +} + +static inline struct lov_object *lu2lov(const struct lu_object *obj) +{ + LINVRNT(lov_is_object(obj)); + return container_of0(obj, struct lov_object, lo_cl.co_lu); +} + +static inline struct lov_object *cl2lov(const struct cl_object *obj) +{ + LINVRNT(lov_is_object(&obj->co_lu)); + return container_of0(obj, struct lov_object, lo_cl); +} + +static inline struct lu_object *lovsub2lu(struct lovsub_object *los) +{ + return &los->lso_cl.co_lu; +} + +static inline struct cl_object *lovsub2cl(struct lovsub_object *los) +{ + return &los->lso_cl; +} + +static inline struct lovsub_object *cl2lovsub(const struct cl_object *obj) +{ + LINVRNT(lovsub_is_object(&obj->co_lu)); + return container_of0(obj, struct lovsub_object, lso_cl); +} + +static inline struct lovsub_object *lu2lovsub(const struct lu_object *obj) +{ + LINVRNT(lovsub_is_object(obj)); + return container_of0(obj, struct lovsub_object, lso_cl.co_lu); +} + +static inline struct lovsub_lock * +cl2lovsub_lock(const struct cl_lock_slice *slice) +{ + LINVRNT(lovsub_is_object(&slice->cls_obj->co_lu)); + return container_of(slice, struct lovsub_lock, lss_cl); +} + +static inline struct lovsub_lock *cl2sub_lock(const struct cl_lock *lock) +{ + const struct cl_lock_slice *slice; + + slice = cl_lock_at(lock, &lovsub_device_type); + 
LASSERT(slice != NULL); + return cl2lovsub_lock(slice); +} + +static inline struct lov_lock *cl2lov_lock(const struct cl_lock_slice *slice) +{ + LINVRNT(lov_is_object(&slice->cls_obj->co_lu)); + return container_of(slice, struct lov_lock, lls_cl); +} + +static inline struct lov_page *cl2lov_page(const struct cl_page_slice *slice) +{ + LINVRNT(lov_is_object(&slice->cpl_obj->co_lu)); + return container_of0(slice, struct lov_page, lps_cl); +} + +static inline struct lov_req *cl2lov_req(const struct cl_req_slice *slice) +{ + return container_of0(slice, struct lov_req, lr_cl); +} + +static inline struct lovsub_page * +cl2lovsub_page(const struct cl_page_slice *slice) +{ + LINVRNT(lovsub_is_object(&slice->cpl_obj->co_lu)); + return container_of0(slice, struct lovsub_page, lsb_cl); +} + +static inline struct lovsub_req *cl2lovsub_req(const struct cl_req_slice *slice) +{ + return container_of0(slice, struct lovsub_req, lsrq_cl); +} + +static inline struct cl_page *lov_sub_page(const struct cl_page_slice *slice) +{ + return slice->cpl_page->cp_child; +} + +static inline struct lov_io *cl2lov_io(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct lov_io *lio; + + lio = container_of(ios, struct lov_io, lis_cl); + LASSERT(lio == lov_env_io(env)); + return lio; +} + +static inline int lov_targets_nr(const struct lov_device *lov) +{ + return lov->ld_lov->desc.ld_tgt_count; +} + +static inline struct lov_thread_info *lov_env_info(const struct lu_env *env) +{ + struct lov_thread_info *info; + + info = lu_context_key_get(&env->le_ctx, &lov_key); + LASSERT(info != NULL); + return info; +} + +static inline struct lov_layout_raid0 *lov_r0(struct lov_object *lov) +{ + LASSERT(lov->lo_type == LLT_RAID0); + LASSERT(lov->lo_lsm->lsm_wire.lw_magic == LOV_MAGIC || + lov->lo_lsm->lsm_wire.lw_magic == LOV_MAGIC_V3); + return &lov->u.raid0; +} + +/** @} lov */ + +#endif diff --git a/kernel/drivers/staging/lustre/lustre/lov/lov_dev.c b/kernel/drivers/staging/lustre/lustre/lov/lov_dev.c new file mode 100644 index 000000000..711b837dd --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lov/lov_dev.c @@ -0,0 +1,528 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_device and cl_device_type for LOV layer. 
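+ *
+ * The lov device fans out to one lovsub device per target;
+ * lov_device_init() below builds that stack by calling, per target
+ * (shown here only as an illustration):
+ *
+ *    cl = cl_type_setup(env, d->ld_site, &lovsub_device_type,
+ *                       desc->ltd_obd->obd_lu_dev);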
+ * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LOV + +/* class_name2obd() */ +#include "../include/obd_class.h" + +#include "lov_cl_internal.h" +#include "lov_internal.h" + + +struct kmem_cache *lov_lock_kmem; +struct kmem_cache *lov_object_kmem; +struct kmem_cache *lov_thread_kmem; +struct kmem_cache *lov_session_kmem; +struct kmem_cache *lov_req_kmem; + +struct kmem_cache *lovsub_lock_kmem; +struct kmem_cache *lovsub_object_kmem; +struct kmem_cache *lovsub_req_kmem; + +struct kmem_cache *lov_lock_link_kmem; + +/** Lock class of lov_device::ld_mutex. */ +static struct lock_class_key cl_lov_device_mutex_class; + +struct lu_kmem_descr lov_caches[] = { + { + .ckd_cache = &lov_lock_kmem, + .ckd_name = "lov_lock_kmem", + .ckd_size = sizeof(struct lov_lock) + }, + { + .ckd_cache = &lov_object_kmem, + .ckd_name = "lov_object_kmem", + .ckd_size = sizeof(struct lov_object) + }, + { + .ckd_cache = &lov_thread_kmem, + .ckd_name = "lov_thread_kmem", + .ckd_size = sizeof(struct lov_thread_info) + }, + { + .ckd_cache = &lov_session_kmem, + .ckd_name = "lov_session_kmem", + .ckd_size = sizeof(struct lov_session) + }, + { + .ckd_cache = &lov_req_kmem, + .ckd_name = "lov_req_kmem", + .ckd_size = sizeof(struct lov_req) + }, + { + .ckd_cache = &lovsub_lock_kmem, + .ckd_name = "lovsub_lock_kmem", + .ckd_size = sizeof(struct lovsub_lock) + }, + { + .ckd_cache = &lovsub_object_kmem, + .ckd_name = "lovsub_object_kmem", + .ckd_size = sizeof(struct lovsub_object) + }, + { + .ckd_cache = &lovsub_req_kmem, + .ckd_name = "lovsub_req_kmem", + .ckd_size = sizeof(struct lovsub_req) + }, + { + .ckd_cache = &lov_lock_link_kmem, + .ckd_name = "lov_lock_link_kmem", + .ckd_size = sizeof(struct lov_lock_link) + }, + { + .ckd_cache = NULL + } +}; + +/***************************************************************************** + * + * Lov transfer operations. + * + */ + +static void lov_req_completion(const struct lu_env *env, + const struct cl_req_slice *slice, int ioret) +{ + struct lov_req *lr; + + lr = cl2lov_req(slice); + OBD_SLAB_FREE_PTR(lr, lov_req_kmem); +} + +static const struct cl_req_operations lov_req_ops = { + .cro_completion = lov_req_completion +}; + +/***************************************************************************** + * + * Lov device and device type functions. 
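+ *
+ * lov_key and lov_session_key below follow the lu_context_key pattern:
+ * per-thread and per-session state is allocated in the *_init() hooks
+ * and later retrieved through the accessors in lov_cl_internal.h,
+ * e.g. (illustrative):
+ *
+ *    struct lov_thread_info *info = lov_env_info(env);
+ *    struct lov_io *lio = lov_env_io(env);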
+ * + */ + +static void *lov_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct lov_thread_info *info; + + OBD_SLAB_ALLOC_PTR_GFP(info, lov_thread_kmem, GFP_NOFS); + if (info != NULL) + INIT_LIST_HEAD(&info->lti_closure.clc_list); + else + info = ERR_PTR(-ENOMEM); + return info; +} + +static void lov_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct lov_thread_info *info = data; + LINVRNT(list_empty(&info->lti_closure.clc_list)); + OBD_SLAB_FREE_PTR(info, lov_thread_kmem); +} + +struct lu_context_key lov_key = { + .lct_tags = LCT_CL_THREAD, + .lct_init = lov_key_init, + .lct_fini = lov_key_fini +}; + +static void *lov_session_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct lov_session *info; + + OBD_SLAB_ALLOC_PTR_GFP(info, lov_session_kmem, GFP_NOFS); + if (info == NULL) + info = ERR_PTR(-ENOMEM); + return info; +} + +static void lov_session_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct lov_session *info = data; + OBD_SLAB_FREE_PTR(info, lov_session_kmem); +} + +struct lu_context_key lov_session_key = { + .lct_tags = LCT_SESSION, + .lct_init = lov_session_key_init, + .lct_fini = lov_session_key_fini +}; + +/* type constructor/destructor: lov_type_{init,fini,start,stop}() */ +LU_TYPE_INIT_FINI(lov, &lov_key, &lov_session_key); + +static struct lu_device *lov_device_fini(const struct lu_env *env, + struct lu_device *d) +{ + int i; + struct lov_device *ld = lu2lov_dev(d); + + LASSERT(ld->ld_lov != NULL); + if (ld->ld_target == NULL) + return NULL; + + lov_foreach_target(ld, i) { + struct lovsub_device *lsd; + + lsd = ld->ld_target[i]; + if (lsd != NULL) { + cl_stack_fini(env, lovsub2cl_dev(lsd)); + ld->ld_target[i] = NULL; + } + } + return NULL; +} + +static int lov_device_init(const struct lu_env *env, struct lu_device *d, + const char *name, struct lu_device *next) +{ + struct lov_device *ld = lu2lov_dev(d); + int i; + int rc = 0; + + LASSERT(d->ld_site != NULL); + if (ld->ld_target == NULL) + return rc; + + lov_foreach_target(ld, i) { + struct lovsub_device *lsd; + struct cl_device *cl; + struct lov_tgt_desc *desc; + + desc = ld->ld_lov->lov_tgts[i]; + if (desc == NULL) + continue; + + cl = cl_type_setup(env, d->ld_site, &lovsub_device_type, + desc->ltd_obd->obd_lu_dev); + if (IS_ERR(cl)) { + rc = PTR_ERR(cl); + break; + } + lsd = cl2lovsub_dev(cl); + lsd->acid_idx = i; + lsd->acid_super = ld; + ld->ld_target[i] = lsd; + } + + if (rc) + lov_device_fini(env, d); + else + ld->ld_flags |= LOV_DEV_INITIALIZED; + + return rc; +} + +static int lov_req_init(const struct lu_env *env, struct cl_device *dev, + struct cl_req *req) +{ + struct lov_req *lr; + int result; + + OBD_SLAB_ALLOC_PTR_GFP(lr, lov_req_kmem, GFP_NOFS); + if (lr != NULL) { + cl_req_slice_add(req, &lr->lr_cl, dev, &lov_req_ops); + result = 0; + } else + result = -ENOMEM; + return result; +} + +static const struct cl_device_operations lov_cl_ops = { + .cdo_req_init = lov_req_init +}; + +static void lov_emerg_free(struct lov_device_emerg **emrg, int nr) +{ + int i; + + for (i = 0; i < nr; ++i) { + struct lov_device_emerg *em; + + em = emrg[i]; + if (em != NULL) { + LASSERT(em->emrg_page_list.pl_nr == 0); + if (em->emrg_env != NULL) + cl_env_put(em->emrg_env, &em->emrg_refcheck); + OBD_FREE_PTR(em); + } + } + OBD_FREE(emrg, nr * sizeof(emrg[0])); +} + +static struct lu_device *lov_device_free(const struct lu_env *env, + struct lu_device *d) +{ + struct lov_device *ld = 
lu2lov_dev(d); + const int nr = ld->ld_target_nr; + + cl_device_fini(lu2cl_dev(d)); + if (ld->ld_target != NULL) + OBD_FREE(ld->ld_target, nr * sizeof(ld->ld_target[0])); + if (ld->ld_emrg != NULL) + lov_emerg_free(ld->ld_emrg, nr); + OBD_FREE_PTR(ld); + return NULL; +} + +static void lov_cl_del_target(const struct lu_env *env, struct lu_device *dev, + __u32 index) +{ + struct lov_device *ld = lu2lov_dev(dev); + + if (ld->ld_target[index] != NULL) { + cl_stack_fini(env, lovsub2cl_dev(ld->ld_target[index])); + ld->ld_target[index] = NULL; + } +} + +static struct lov_device_emerg **lov_emerg_alloc(int nr) +{ + struct lov_device_emerg **emerg; + int i; + int result; + + OBD_ALLOC(emerg, nr * sizeof(emerg[0])); + if (emerg == NULL) + return ERR_PTR(-ENOMEM); + for (result = i = 0; i < nr && result == 0; i++) { + struct lov_device_emerg *em; + + OBD_ALLOC_PTR(em); + if (em != NULL) { + emerg[i] = em; + cl_page_list_init(&em->emrg_page_list); + em->emrg_env = cl_env_alloc(&em->emrg_refcheck, + LCT_REMEMBER|LCT_NOREF); + if (!IS_ERR(em->emrg_env)) + em->emrg_env->le_ctx.lc_cookie = 0x2; + else { + result = PTR_ERR(em->emrg_env); + em->emrg_env = NULL; + } + } else + result = -ENOMEM; + } + if (result != 0) { + lov_emerg_free(emerg, nr); + emerg = ERR_PTR(result); + } + return emerg; +} + +static int lov_expand_targets(const struct lu_env *env, struct lov_device *dev) +{ + int result; + __u32 tgt_size; + __u32 sub_size; + + result = 0; + tgt_size = dev->ld_lov->lov_tgt_size; + sub_size = dev->ld_target_nr; + if (sub_size < tgt_size) { + struct lovsub_device **newd; + struct lov_device_emerg **emerg; + const size_t sz = sizeof(newd[0]); + + emerg = lov_emerg_alloc(tgt_size); + if (IS_ERR(emerg)) + return PTR_ERR(emerg); + + OBD_ALLOC(newd, tgt_size * sz); + if (newd != NULL) { + mutex_lock(&dev->ld_mutex); + if (sub_size > 0) { + memcpy(newd, dev->ld_target, sub_size * sz); + OBD_FREE(dev->ld_target, sub_size * sz); + } + dev->ld_target = newd; + dev->ld_target_nr = tgt_size; + + if (dev->ld_emrg != NULL) + lov_emerg_free(dev->ld_emrg, sub_size); + dev->ld_emrg = emerg; + mutex_unlock(&dev->ld_mutex); + } else { + lov_emerg_free(emerg, tgt_size); + result = -ENOMEM; + } + } + return result; +} + +static int lov_cl_add_target(const struct lu_env *env, struct lu_device *dev, + __u32 index) +{ + struct obd_device *obd = dev->ld_obd; + struct lov_device *ld = lu2lov_dev(dev); + struct lov_tgt_desc *tgt; + struct lovsub_device *lsd; + struct cl_device *cl; + int rc; + + obd_getref(obd); + + tgt = obd->u.lov.lov_tgts[index]; + LASSERT(tgt != NULL); + LASSERT(tgt->ltd_obd != NULL); + + if (!tgt->ltd_obd->obd_set_up) { + CERROR("Target %s not set up\n", obd_uuid2str(&tgt->ltd_uuid)); + return -EINVAL; + } + + rc = lov_expand_targets(env, ld); + if (rc == 0 && ld->ld_flags & LOV_DEV_INITIALIZED) { + LASSERT(dev->ld_site != NULL); + + cl = cl_type_setup(env, dev->ld_site, &lovsub_device_type, + tgt->ltd_obd->obd_lu_dev); + if (!IS_ERR(cl)) { + lsd = cl2lovsub_dev(cl); + lsd->acid_idx = index; + lsd->acid_super = ld; + ld->ld_target[index] = lsd; + } else { + CERROR("add failed (%d), deleting %s\n", rc, + obd_uuid2str(&tgt->ltd_uuid)); + lov_cl_del_target(env, dev, index); + rc = PTR_ERR(cl); + } + } + obd_putref(obd); + return rc; +} + +static int lov_process_config(const struct lu_env *env, + struct lu_device *d, struct lustre_cfg *cfg) +{ + struct obd_device *obd = d->ld_obd; + int cmd; + int rc; + int gen; + __u32 index; + + obd_getref(obd); + + cmd = cfg->lcfg_command; + rc = 
lov_process_config_base(d->ld_obd, cfg, &index, &gen); + if (rc == 0) { + switch (cmd) { + case LCFG_LOV_ADD_OBD: + case LCFG_LOV_ADD_INA: + rc = lov_cl_add_target(env, d, index); + if (rc != 0) + lov_del_target(d->ld_obd, index, NULL, 0); + break; + case LCFG_LOV_DEL_OBD: + lov_cl_del_target(env, d, index); + break; + } + } + obd_putref(obd); + return rc; +} + +static const struct lu_device_operations lov_lu_ops = { + .ldo_object_alloc = lov_object_alloc, + .ldo_process_config = lov_process_config, +}; + +static struct lu_device *lov_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg) +{ + struct lu_device *d; + struct lov_device *ld; + struct obd_device *obd; + int rc; + + OBD_ALLOC_PTR(ld); + if (ld == NULL) + return ERR_PTR(-ENOMEM); + + cl_device_init(&ld->ld_cl, t); + d = lov2lu_dev(ld); + d->ld_ops = &lov_lu_ops; + ld->ld_cl.cd_ops = &lov_cl_ops; + + mutex_init(&ld->ld_mutex); + lockdep_set_class(&ld->ld_mutex, &cl_lov_device_mutex_class); + + /* setup the LOV OBD */ + obd = class_name2obd(lustre_cfg_string(cfg, 0)); + LASSERT(obd != NULL); + rc = lov_setup(obd, cfg); + if (rc) { + lov_device_free(env, d); + return ERR_PTR(rc); + } + + ld->ld_lov = &obd->u.lov; + return d; +} + +static const struct lu_device_type_operations lov_device_type_ops = { + .ldto_init = lov_type_init, + .ldto_fini = lov_type_fini, + + .ldto_start = lov_type_start, + .ldto_stop = lov_type_stop, + + .ldto_device_alloc = lov_device_alloc, + .ldto_device_free = lov_device_free, + + .ldto_device_init = lov_device_init, + .ldto_device_fini = lov_device_fini +}; + +struct lu_device_type lov_device_type = { + .ldt_tags = LU_DEVICE_CL, + .ldt_name = LUSTRE_LOV_NAME, + .ldt_ops = &lov_device_type_ops, + .ldt_ctx_tags = LCT_CL_THREAD +}; +EXPORT_SYMBOL(lov_device_type); + +/** @} lov */ diff --git a/kernel/drivers/staging/lustre/lustre/lov/lov_ea.c b/kernel/drivers/staging/lustre/lustre/lov/lov_ea.c new file mode 100644 index 000000000..2bcfaeaff --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lov/lov_ea.c @@ -0,0 +1,363 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/lov/lov_ea.c + * + * Author: Wang Di + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include +#include "../../include/linux/libcfs/libcfs.h" + +#include "../include/obd_class.h" +#include "../include/lustre/lustre_idl.h" + +#include "lov_internal.h" + +struct lovea_unpack_args { + struct lov_stripe_md *lsm; + int cursor; +}; + +static int lsm_lmm_verify_common(struct lov_mds_md *lmm, int lmm_bytes, + __u16 stripe_count) +{ + if (stripe_count > LOV_V1_INSANE_STRIPE_COUNT) { + CERROR("bad stripe count %d\n", stripe_count); + lov_dump_lmm_common(D_WARNING, lmm); + return -EINVAL; + } + + if (lmm_oi_id(&lmm->lmm_oi) == 0) { + CERROR("zero object id\n"); + lov_dump_lmm_common(D_WARNING, lmm); + return -EINVAL; + } + + if (lov_pattern(le32_to_cpu(lmm->lmm_pattern)) != LOV_PATTERN_RAID0) { + CERROR("bad striping pattern\n"); + lov_dump_lmm_common(D_WARNING, lmm); + return -EINVAL; + } + + if (lmm->lmm_stripe_size == 0 || + (le32_to_cpu(lmm->lmm_stripe_size)&(LOV_MIN_STRIPE_SIZE-1)) != 0) { + CERROR("bad stripe size %u\n", + le32_to_cpu(lmm->lmm_stripe_size)); + lov_dump_lmm_common(D_WARNING, lmm); + return -EINVAL; + } + return 0; +} + +struct lov_stripe_md *lsm_alloc_plain(__u16 stripe_count, int *size) +{ + struct lov_stripe_md *lsm; + struct lov_oinfo *loi; + int i, oinfo_ptrs_size; + + LASSERT(stripe_count <= LOV_MAX_STRIPE_COUNT); + + oinfo_ptrs_size = sizeof(struct lov_oinfo *) * stripe_count; + *size = sizeof(struct lov_stripe_md) + oinfo_ptrs_size; + + OBD_ALLOC_LARGE(lsm, *size); + if (!lsm) + return NULL; + + for (i = 0; i < stripe_count; i++) { + OBD_SLAB_ALLOC_PTR_GFP(loi, lov_oinfo_slab, GFP_NOFS); + if (loi == NULL) + goto err; + lsm->lsm_oinfo[i] = loi; + } + lsm->lsm_stripe_count = stripe_count; + return lsm; + +err: + while (--i >= 0) + OBD_SLAB_FREE(lsm->lsm_oinfo[i], lov_oinfo_slab, sizeof(*loi)); + OBD_FREE_LARGE(lsm, *size); + return NULL; +} + +void lsm_free_plain(struct lov_stripe_md *lsm) +{ + __u16 stripe_count = lsm->lsm_stripe_count; + int i; + + for (i = 0; i < stripe_count; i++) + OBD_SLAB_FREE(lsm->lsm_oinfo[i], lov_oinfo_slab, + sizeof(struct lov_oinfo)); + OBD_FREE_LARGE(lsm, sizeof(struct lov_stripe_md) + + stripe_count * sizeof(struct lov_oinfo *)); +} + +static void lsm_unpackmd_common(struct lov_stripe_md *lsm, + struct lov_mds_md *lmm) +{ + /* + * This supposes lov_mds_md_v1/v3 first fields are + * are the same + */ + lmm_oi_le_to_cpu(&lsm->lsm_oi, &lmm->lmm_oi); + lsm->lsm_stripe_size = le32_to_cpu(lmm->lmm_stripe_size); + lsm->lsm_pattern = le32_to_cpu(lmm->lmm_pattern); + lsm->lsm_layout_gen = le16_to_cpu(lmm->lmm_layout_gen); + lsm->lsm_pool_name[0] = '\0'; +} + +static void +lsm_stripe_by_index_plain(struct lov_stripe_md *lsm, int *stripeno, + u64 *lov_off, u64 *swidth) +{ + if (swidth) + *swidth = (u64)lsm->lsm_stripe_size * lsm->lsm_stripe_count; +} + +static void +lsm_stripe_by_offset_plain(struct lov_stripe_md *lsm, int *stripeno, + u64 *lov_off, u64 *swidth) +{ + if (swidth) + *swidth = (u64)lsm->lsm_stripe_size * lsm->lsm_stripe_count; +} + +static int lsm_destroy_plain(struct lov_stripe_md *lsm, struct obdo *oa, + struct obd_export *md_exp) +{ + return 0; +} + +/* Find minimum stripe maxbytes value. For inactive or + * reconnecting targets use LUSTRE_STRIPE_MAXBYTES. 
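+ *
+ * The callers (lsm_unpackmd_v1/v3 below) then derive the per-file
+ * limit from the smallest per-target value, roughly (illustrative):
+ *
+ *    lsm->lsm_maxbytes = stripe_maxbytes * lsm->lsm_stripe_count;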
*/ +static void lov_tgt_maxbytes(struct lov_tgt_desc *tgt, __u64 *stripe_maxbytes) +{ + struct obd_import *imp = tgt->ltd_obd->u.cli.cl_import; + + if (imp == NULL || !tgt->ltd_active) { + *stripe_maxbytes = LUSTRE_STRIPE_MAXBYTES; + return; + } + + spin_lock(&imp->imp_lock); + if (imp->imp_state == LUSTRE_IMP_FULL && + (imp->imp_connect_data.ocd_connect_flags & OBD_CONNECT_MAXBYTES) && + imp->imp_connect_data.ocd_maxbytes > 0) { + if (*stripe_maxbytes > imp->imp_connect_data.ocd_maxbytes) + *stripe_maxbytes = imp->imp_connect_data.ocd_maxbytes; + } else { + *stripe_maxbytes = LUSTRE_STRIPE_MAXBYTES; + } + spin_unlock(&imp->imp_lock); +} + +static int lsm_lmm_verify_v1(struct lov_mds_md_v1 *lmm, int lmm_bytes, + __u16 *stripe_count) +{ + if (lmm_bytes < sizeof(*lmm)) { + CERROR("lov_mds_md_v1 too small: %d, need at least %d\n", + lmm_bytes, (int)sizeof(*lmm)); + return -EINVAL; + } + + *stripe_count = le16_to_cpu(lmm->lmm_stripe_count); + if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED) + *stripe_count = 0; + + if (lmm_bytes < lov_mds_md_size(*stripe_count, LOV_MAGIC_V1)) { + CERROR("LOV EA V1 too small: %d, need %d\n", + lmm_bytes, lov_mds_md_size(*stripe_count, LOV_MAGIC_V1)); + lov_dump_lmm_common(D_WARNING, lmm); + return -EINVAL; + } + + return lsm_lmm_verify_common(lmm, lmm_bytes, *stripe_count); +} + +static int lsm_unpackmd_v1(struct lov_obd *lov, struct lov_stripe_md *lsm, + struct lov_mds_md_v1 *lmm) +{ + struct lov_oinfo *loi; + int i; + int stripe_count; + __u64 stripe_maxbytes = OBD_OBJECT_EOF; + + lsm_unpackmd_common(lsm, lmm); + + stripe_count = lsm_is_released(lsm) ? 0 : lsm->lsm_stripe_count; + + for (i = 0; i < stripe_count; i++) { + /* XXX LOV STACKING call down to osc_unpackmd() */ + loi = lsm->lsm_oinfo[i]; + ostid_le_to_cpu(&lmm->lmm_objects[i].l_ost_oi, &loi->loi_oi); + loi->loi_ost_idx = le32_to_cpu(lmm->lmm_objects[i].l_ost_idx); + loi->loi_ost_gen = le32_to_cpu(lmm->lmm_objects[i].l_ost_gen); + if (lov_oinfo_is_dummy(loi)) + continue; + + if (loi->loi_ost_idx >= lov->desc.ld_tgt_count) { + CERROR("OST index %d more than OST count %d\n", + loi->loi_ost_idx, lov->desc.ld_tgt_count); + lov_dump_lmm_v1(D_WARNING, lmm); + return -EINVAL; + } + if (!lov->lov_tgts[loi->loi_ost_idx]) { + CERROR("OST index %d missing\n", loi->loi_ost_idx); + lov_dump_lmm_v1(D_WARNING, lmm); + return -EINVAL; + } + /* calculate the minimum stripe max bytes */ + lov_tgt_maxbytes(lov->lov_tgts[loi->loi_ost_idx], + &stripe_maxbytes); + } + + lsm->lsm_maxbytes = stripe_maxbytes * lsm->lsm_stripe_count; + if (lsm->lsm_stripe_count == 0) + lsm->lsm_maxbytes = stripe_maxbytes * lov->desc.ld_tgt_count; + + return 0; +} + +const struct lsm_operations lsm_v1_ops = { + .lsm_free = lsm_free_plain, + .lsm_destroy = lsm_destroy_plain, + .lsm_stripe_by_index = lsm_stripe_by_index_plain, + .lsm_stripe_by_offset = lsm_stripe_by_offset_plain, + .lsm_lmm_verify = lsm_lmm_verify_v1, + .lsm_unpackmd = lsm_unpackmd_v1, +}; + +static int lsm_lmm_verify_v3(struct lov_mds_md *lmmv1, int lmm_bytes, + __u16 *stripe_count) +{ + struct lov_mds_md_v3 *lmm; + + lmm = (struct lov_mds_md_v3 *)lmmv1; + + if (lmm_bytes < sizeof(*lmm)) { + CERROR("lov_mds_md_v3 too small: %d, need at least %d\n", + lmm_bytes, (int)sizeof(*lmm)); + return -EINVAL; + } + + *stripe_count = le16_to_cpu(lmm->lmm_stripe_count); + if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED) + *stripe_count = 0; + + if (lmm_bytes < lov_mds_md_size(*stripe_count, LOV_MAGIC_V3)) { + CERROR("LOV EA V3 too small: %d, need %d\n", + lmm_bytes, 
lov_mds_md_size(*stripe_count, LOV_MAGIC_V3)); + lov_dump_lmm_common(D_WARNING, lmm); + return -EINVAL; + } + + return lsm_lmm_verify_common((struct lov_mds_md_v1 *)lmm, lmm_bytes, + *stripe_count); +} + +static int lsm_unpackmd_v3(struct lov_obd *lov, struct lov_stripe_md *lsm, + struct lov_mds_md *lmmv1) +{ + struct lov_mds_md_v3 *lmm; + struct lov_oinfo *loi; + int i; + int stripe_count; + __u64 stripe_maxbytes = OBD_OBJECT_EOF; + int cplen = 0; + + lmm = (struct lov_mds_md_v3 *)lmmv1; + + lsm_unpackmd_common(lsm, (struct lov_mds_md_v1 *)lmm); + + stripe_count = lsm_is_released(lsm) ? 0 : lsm->lsm_stripe_count; + + cplen = strlcpy(lsm->lsm_pool_name, lmm->lmm_pool_name, + sizeof(lsm->lsm_pool_name)); + if (cplen >= sizeof(lsm->lsm_pool_name)) + return -E2BIG; + + for (i = 0; i < stripe_count; i++) { + /* XXX LOV STACKING call down to osc_unpackmd() */ + loi = lsm->lsm_oinfo[i]; + ostid_le_to_cpu(&lmm->lmm_objects[i].l_ost_oi, &loi->loi_oi); + loi->loi_ost_idx = le32_to_cpu(lmm->lmm_objects[i].l_ost_idx); + loi->loi_ost_gen = le32_to_cpu(lmm->lmm_objects[i].l_ost_gen); + if (lov_oinfo_is_dummy(loi)) + continue; + + if (loi->loi_ost_idx >= lov->desc.ld_tgt_count) { + CERROR("OST index %d more than OST count %d\n", + loi->loi_ost_idx, lov->desc.ld_tgt_count); + lov_dump_lmm_v3(D_WARNING, lmm); + return -EINVAL; + } + if (!lov->lov_tgts[loi->loi_ost_idx]) { + CERROR("OST index %d missing\n", loi->loi_ost_idx); + lov_dump_lmm_v3(D_WARNING, lmm); + return -EINVAL; + } + /* calculate the minimum stripe max bytes */ + lov_tgt_maxbytes(lov->lov_tgts[loi->loi_ost_idx], + &stripe_maxbytes); + } + + lsm->lsm_maxbytes = stripe_maxbytes * lsm->lsm_stripe_count; + if (lsm->lsm_stripe_count == 0) + lsm->lsm_maxbytes = stripe_maxbytes * lov->desc.ld_tgt_count; + + return 0; +} + +const struct lsm_operations lsm_v3_ops = { + .lsm_free = lsm_free_plain, + .lsm_destroy = lsm_destroy_plain, + .lsm_stripe_by_index = lsm_stripe_by_index_plain, + .lsm_stripe_by_offset = lsm_stripe_by_offset_plain, + .lsm_lmm_verify = lsm_lmm_verify_v3, + .lsm_unpackmd = lsm_unpackmd_v3, +}; + +void dump_lsm(unsigned int level, const struct lov_stripe_md *lsm) +{ + CDEBUG(level, "lsm %p, objid " DOSTID ", maxbytes %#llx, magic 0x%08X, stripe_size %u, stripe_count %u, refc: %d, layout_gen %u, pool [" LOV_POOLNAMEF "]\n", + lsm, + POSTID(&lsm->lsm_oi), lsm->lsm_maxbytes, lsm->lsm_magic, + lsm->lsm_stripe_size, lsm->lsm_stripe_count, + atomic_read(&lsm->lsm_refc), lsm->lsm_layout_gen, + lsm->lsm_pool_name); +} diff --git a/kernel/drivers/staging/lustre/lustre/lov/lov_internal.h b/kernel/drivers/staging/lustre/lustre/lov/lov_internal.h new file mode 100644 index 000000000..b644acc9b --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lov/lov_internal.h @@ -0,0 +1,319 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef LOV_INTERNAL_H +#define LOV_INTERNAL_H + +#include "../include/obd_class.h" +#include "../include/lustre/lustre_user.h" + +/* lov_do_div64(a, b) returns a % b, and a = a / b. + * The 32-bit code is LOV-specific due to knowing about stripe limits in + * order to reduce the divisor to a 32-bit number. If the divisor is + * already a 32-bit value the compiler handles this directly. */ +#if BITS_PER_LONG == 64 +# define lov_do_div64(n, base) ({ \ + uint64_t __base = (base); \ + uint64_t __rem; \ + __rem = ((uint64_t)(n)) % __base; \ + (n) = ((uint64_t)(n)) / __base; \ + __rem; \ +}) +#elif BITS_PER_LONG == 32 +# define lov_do_div64(n, base) ({ \ + uint64_t __rem; \ + if ((sizeof(base) > 4) && (((base) & 0xffffffff00000000ULL) != 0)) { \ + int __remainder; \ + LASSERTF(!((base) & (LOV_MIN_STRIPE_SIZE - 1)), "64 bit lov " \ + "division %llu / %llu\n", (n), (uint64_t)(base)); \ + __remainder = (n) & (LOV_MIN_STRIPE_SIZE - 1); \ + (n) >>= LOV_MIN_STRIPE_BITS; \ + __rem = do_div(n, (base) >> LOV_MIN_STRIPE_BITS); \ + __rem <<= LOV_MIN_STRIPE_BITS; \ + __rem += __remainder; \ + } else { \ + __rem = do_div(n, base); \ + } \ + __rem; \ +}) +#endif + +struct lov_lock_handles { + struct portals_handle llh_handle; + atomic_t llh_refcount; + int llh_stripe_count; + struct lustre_handle llh_handles[0]; +}; + +struct lov_request { + struct obd_info rq_oi; + struct lov_request_set *rq_rqset; + + struct list_head rq_link; + + int rq_idx; /* index in lov->tgts array */ + int rq_stripe; /* stripe number */ + int rq_complete; + int rq_rc; + int rq_buflen; /* length of sub_md */ + + u32 rq_oabufs; + u32 rq_pgaidx; +}; + +struct lov_request_set { + struct ldlm_enqueue_info *set_ei; + struct obd_info *set_oi; + atomic_t set_refcount; + struct obd_export *set_exp; + /* XXX: There is @set_exp already, however obd_statfs gets obd_device + only. 
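+ *
+ * Lifetime (illustrative): a set is reference counted via set_refcount;
+ * users pair lov_get_reqset()/lov_put_reqset(), and the final put
+ * frees the set through lov_finish_set():
+ *
+ *    lov_get_reqset(set);
+ *    ...
+ *    lov_put_reqset(set);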
*/ + struct obd_device *set_obd; + int set_count; + atomic_t set_completes; + atomic_t set_success; + atomic_t set_finish_checked; + struct llog_cookie *set_cookies; + int set_cookie_sent; + struct obd_trans_info *set_oti; + u32 set_oabufs; + struct brw_page *set_pga; + struct lov_lock_handles *set_lockh; + struct list_head set_list; + wait_queue_head_t set_waitq; + spinlock_t set_lock; +}; + +extern struct kmem_cache *lov_oinfo_slab; + +extern struct lu_kmem_descr lov_caches[]; + +void lov_finish_set(struct lov_request_set *set); + +static inline void lov_get_reqset(struct lov_request_set *set) +{ + LASSERT(set != NULL); + LASSERT(atomic_read(&set->set_refcount) > 0); + atomic_inc(&set->set_refcount); +} + +static inline void lov_put_reqset(struct lov_request_set *set) +{ + if (atomic_dec_and_test(&set->set_refcount)) + lov_finish_set(set); +} + +static inline struct lov_lock_handles * +lov_handle2llh(struct lustre_handle *handle) +{ + LASSERT(handle != NULL); + return class_handle2object(handle->cookie); +} + +static inline void lov_llh_put(struct lov_lock_handles *llh) +{ + CDEBUG(D_INFO, "PUTting llh %p : new refcount %d\n", llh, + atomic_read(&llh->llh_refcount) - 1); + LASSERT(atomic_read(&llh->llh_refcount) > 0 && + atomic_read(&llh->llh_refcount) < 0x5a5a); + if (atomic_dec_and_test(&llh->llh_refcount)) { + class_handle_unhash(&llh->llh_handle); + /* The structure may be held by other threads because RCU. + * -jxiong */ + if (atomic_read(&llh->llh_refcount)) + return; + + OBD_FREE_RCU(llh, sizeof(*llh) + + sizeof(*llh->llh_handles) * llh->llh_stripe_count, + &llh->llh_handle); + } +} + +#define lov_uuid2str(lv, index) \ + (char *)((lv)->lov_tgts[index]->ltd_uuid.uuid) + +/* lov_merge.c */ +void lov_merge_attrs(struct obdo *tgt, struct obdo *src, u64 valid, + struct lov_stripe_md *lsm, int stripeno, int *set); +int lov_adjust_kms(struct obd_export *exp, struct lov_stripe_md *lsm, + u64 size, int shrink); +int lov_merge_lvb_kms(struct lov_stripe_md *lsm, + struct ost_lvb *lvb, __u64 *kms_place); + +/* lov_offset.c */ +u64 lov_stripe_size(struct lov_stripe_md *lsm, u64 ost_size, + int stripeno); +int lov_stripe_offset(struct lov_stripe_md *lsm, u64 lov_off, + int stripeno, u64 *u64); +u64 lov_size_to_stripe(struct lov_stripe_md *lsm, u64 file_size, + int stripeno); +int lov_stripe_intersects(struct lov_stripe_md *lsm, int stripeno, + u64 start, u64 end, + u64 *obd_start, u64 *obd_end); +int lov_stripe_number(struct lov_stripe_md *lsm, u64 lov_off); + +/* lov_qos.c */ +#define LOV_USES_ASSIGNED_STRIPE 0 +#define LOV_USES_DEFAULT_STRIPE 1 +int qos_add_tgt(struct obd_device *obd, __u32 index); +int qos_del_tgt(struct obd_device *obd, struct lov_tgt_desc *tgt); +void qos_shrink_lsm(struct lov_request_set *set); +int qos_prep_create(struct obd_export *exp, struct lov_request_set *set); +void qos_update(struct lov_obd *lov); +void qos_statfs_done(struct lov_obd *lov); +void qos_statfs_update(struct obd_device *obd, __u64 max_age, int wait); +int qos_remedy_create(struct lov_request_set *set, struct lov_request *req); + +/* lov_request.c */ +void lov_set_add_req(struct lov_request *req, struct lov_request_set *set); +int lov_set_finished(struct lov_request_set *set, int idempotent); +void lov_update_set(struct lov_request_set *set, + struct lov_request *req, int rc); +int lov_update_common_set(struct lov_request_set *set, + struct lov_request *req, int rc); +int lov_check_and_wait_active(struct lov_obd *lov, int ost_idx); +int lov_prep_getattr_set(struct obd_export *exp, struct obd_info 
*oinfo, + struct lov_request_set **reqset); +int lov_fini_getattr_set(struct lov_request_set *set); +int lov_prep_destroy_set(struct obd_export *exp, struct obd_info *oinfo, + struct obdo *src_oa, struct lov_stripe_md *lsm, + struct obd_trans_info *oti, + struct lov_request_set **reqset); +int lov_fini_destroy_set(struct lov_request_set *set); +int lov_prep_setattr_set(struct obd_export *exp, struct obd_info *oinfo, + struct obd_trans_info *oti, + struct lov_request_set **reqset); +int lov_update_setattr_set(struct lov_request_set *set, + struct lov_request *req, int rc); +int lov_fini_setattr_set(struct lov_request_set *set); +int lov_prep_statfs_set(struct obd_device *obd, struct obd_info *oinfo, + struct lov_request_set **reqset); +void lov_update_statfs(struct obd_statfs *osfs, struct obd_statfs *lov_sfs, + int success); +int lov_fini_statfs(struct obd_device *obd, struct obd_statfs *osfs, + int success); +int lov_fini_statfs_set(struct lov_request_set *set); +int lov_statfs_interpret(struct ptlrpc_request_set *rqset, void *data, int rc); + +/* lov_obd.c */ +void lov_fix_desc(struct lov_desc *desc); +void lov_fix_desc_stripe_size(__u64 *val); +void lov_fix_desc_stripe_count(__u32 *val); +void lov_fix_desc_pattern(__u32 *val); +void lov_fix_desc_qos_maxage(__u32 *val); +__u16 lov_get_stripecnt(struct lov_obd *lov, __u32 magic, __u16 stripe_count); +int lov_connect_obd(struct obd_device *obd, __u32 index, int activate, + struct obd_connect_data *data); +int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg); +int lov_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg, + __u32 *indexp, int *genp); +int lov_del_target(struct obd_device *obd, __u32 index, + struct obd_uuid *uuidp, int gen); + +/* lov_pack.c */ +int lov_packmd(struct obd_export *exp, struct lov_mds_md **lmm, + struct lov_stripe_md *lsm); +int lov_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp, + struct lov_mds_md *lmm, int lmm_bytes); +int lov_getstripe(struct obd_export *exp, + struct lov_stripe_md *lsm, struct lov_user_md *lump); +int lov_alloc_memmd(struct lov_stripe_md **lsmp, __u16 stripe_count, + int pattern, int magic); +int lov_free_memmd(struct lov_stripe_md **lsmp); + +void lov_dump_lmm_v1(int level, struct lov_mds_md_v1 *lmm); +void lov_dump_lmm_v3(int level, struct lov_mds_md_v3 *lmm); +void lov_dump_lmm_common(int level, void *lmmp); +void lov_dump_lmm(int level, void *lmm); + +/* lov_ea.c */ +struct lov_stripe_md *lsm_alloc_plain(__u16 stripe_count, int *size); +void lsm_free_plain(struct lov_stripe_md *lsm); +void dump_lsm(unsigned int level, const struct lov_stripe_md *lsm); + +/* lproc_lov.c */ +#if defined (CONFIG_PROC_FS) +extern const struct file_operations lov_proc_target_fops; +void lprocfs_lov_init_vars(struct lprocfs_static_vars *lvars); +#else +static inline void lprocfs_lov_init_vars(struct lprocfs_static_vars *lvars) +{ + memset(lvars, 0, sizeof(*lvars)); +} +#endif + +/* lov_cl.c */ +extern struct lu_device_type lov_device_type; + +/* pools */ +extern cfs_hash_ops_t pool_hash_operations; +/* ost_pool methods */ +int lov_ost_pool_init(struct ost_pool *op, unsigned int count); +int lov_ost_pool_extend(struct ost_pool *op, unsigned int min_count); +int lov_ost_pool_add(struct ost_pool *op, __u32 idx, unsigned int min_count); +int lov_ost_pool_remove(struct ost_pool *op, __u32 idx); +int lov_ost_pool_free(struct ost_pool *op); + +/* high level pool methods */ +int lov_pool_new(struct obd_device *obd, char *poolname); +int lov_pool_del(struct obd_device *obd, char 
*poolname); +int lov_pool_add(struct obd_device *obd, char *poolname, char *ostname); +int lov_pool_remove(struct obd_device *obd, char *poolname, char *ostname); +void lov_dump_pool(int level, struct pool_desc *pool); +struct pool_desc *lov_find_pool(struct lov_obd *lov, char *poolname); +int lov_check_index_in_pool(__u32 idx, struct pool_desc *pool); +void lov_pool_putref(struct pool_desc *pool); + +static inline struct lov_stripe_md *lsm_addref(struct lov_stripe_md *lsm) +{ + LASSERT(atomic_read(&lsm->lsm_refc) > 0); + atomic_inc(&lsm->lsm_refc); + return lsm; +} + +static inline bool lov_oinfo_is_dummy(const struct lov_oinfo *loi) +{ + if (unlikely(loi->loi_oi.oi.oi_id == 0 && + loi->loi_oi.oi.oi_seq == 0 && + loi->loi_ost_idx == 0 && + loi->loi_ost_gen == 0)) + return true; + + return false; +} + + +#endif diff --git a/kernel/drivers/staging/lustre/lustre/lov/lov_io.c b/kernel/drivers/staging/lustre/lustre/lov/lov_io.c new file mode 100644 index 000000000..cf96e0d01 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lov/lov_io.c @@ -0,0 +1,990 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_io for LOV layer. 
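+ *
+ * A top-level cl_io is split into one sub-io per stripe touched by
+ * the request, each running against the corresponding lovsub object
+ * in its own environment.  The typical pattern used by the iteration
+ * code below is (illustrative):
+ *
+ *    sub = lov_sub_get(env, lio, stripe);
+ *    if (!IS_ERR(sub)) {
+ *            rc = cl_io_iter_init(sub->sub_env, sub->sub_io);
+ *            lov_sub_put(sub);
+ *    }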
+ * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov + * @{ + */ + +static inline void lov_sub_enter(struct lov_io_sub *sub) +{ + sub->sub_reenter++; +} +static inline void lov_sub_exit(struct lov_io_sub *sub) +{ + sub->sub_reenter--; +} + +static void lov_io_sub_fini(const struct lu_env *env, struct lov_io *lio, + struct lov_io_sub *sub) +{ + if (sub->sub_io != NULL) { + if (sub->sub_io_initialized) { + lov_sub_enter(sub); + cl_io_fini(sub->sub_env, sub->sub_io); + lov_sub_exit(sub); + sub->sub_io_initialized = 0; + lio->lis_active_subios--; + } + if (sub->sub_stripe == lio->lis_single_subio_index) + lio->lis_single_subio_index = -1; + else if (!sub->sub_borrowed) + OBD_FREE_PTR(sub->sub_io); + sub->sub_io = NULL; + } + if (sub->sub_env != NULL && !IS_ERR(sub->sub_env)) { + if (!sub->sub_borrowed) + cl_env_put(sub->sub_env, &sub->sub_refcheck); + sub->sub_env = NULL; + } +} + +static void lov_io_sub_inherit(struct cl_io *io, struct lov_io *lio, + int stripe, loff_t start, loff_t end) +{ + struct lov_stripe_md *lsm = lio->lis_object->lo_lsm; + struct cl_io *parent = lio->lis_cl.cis_io; + + switch (io->ci_type) { + case CIT_SETATTR: { + io->u.ci_setattr.sa_attr = parent->u.ci_setattr.sa_attr; + io->u.ci_setattr.sa_valid = parent->u.ci_setattr.sa_valid; + io->u.ci_setattr.sa_capa = parent->u.ci_setattr.sa_capa; + if (cl_io_is_trunc(io)) { + loff_t new_size = parent->u.ci_setattr.sa_attr.lvb_size; + + new_size = lov_size_to_stripe(lsm, new_size, stripe); + io->u.ci_setattr.sa_attr.lvb_size = new_size; + } + break; + } + case CIT_FAULT: { + struct cl_object *obj = parent->ci_obj; + loff_t off = cl_offset(obj, parent->u.ci_fault.ft_index); + + io->u.ci_fault = parent->u.ci_fault; + off = lov_size_to_stripe(lsm, off, stripe); + io->u.ci_fault.ft_index = cl_index(obj, off); + break; + } + case CIT_FSYNC: { + io->u.ci_fsync.fi_start = start; + io->u.ci_fsync.fi_end = end; + io->u.ci_fsync.fi_capa = parent->u.ci_fsync.fi_capa; + io->u.ci_fsync.fi_fid = parent->u.ci_fsync.fi_fid; + io->u.ci_fsync.fi_mode = parent->u.ci_fsync.fi_mode; + break; + } + case CIT_READ: + case CIT_WRITE: { + io->u.ci_wr.wr_sync = cl_io_is_sync_write(parent); + if (cl_io_is_append(parent)) { + io->u.ci_wr.wr_append = 1; + } else { + io->u.ci_rw.crw_pos = start; + io->u.ci_rw.crw_count = end - start; + } + break; + } + default: + break; + } +} + +static int lov_io_sub_init(const struct lu_env *env, struct lov_io *lio, + struct lov_io_sub *sub) +{ + struct lov_object *lov = lio->lis_object; + struct lov_device *ld = lu2lov_dev(lov2cl(lov)->co_lu.lo_dev); + struct cl_io *sub_io; + struct cl_object *sub_obj; + struct cl_io *io = lio->lis_cl.cis_io; + + int stripe = sub->sub_stripe; + int result; + + LASSERT(sub->sub_io == NULL); + LASSERT(sub->sub_env == NULL); + LASSERT(sub->sub_stripe < lio->lis_stripe_count); + + if (unlikely(lov_r0(lov)->lo_sub[stripe] == NULL)) + return -EIO; + + result = 0; + sub->sub_io_initialized = 0; + sub->sub_borrowed = 0; + + if (lio->lis_mem_frozen) { + LASSERT(mutex_is_locked(&ld->ld_mutex)); + sub->sub_io = &ld->ld_emrg[stripe]->emrg_subio; + sub->sub_env = ld->ld_emrg[stripe]->emrg_env; + sub->sub_borrowed = 1; + } else { + void *cookie; + + /* obtain new environment */ + cookie = cl_env_reenter(); + sub->sub_env = cl_env_get(&sub->sub_refcheck); + cl_env_reexit(cookie); + if (IS_ERR(sub->sub_env)) + result = PTR_ERR(sub->sub_env); + + if (result == 0) { + /* + * First sub-io. 
Use ->lis_single_subio to + * avoid dynamic allocation. + */ + if (lio->lis_active_subios == 0) { + sub->sub_io = &lio->lis_single_subio; + lio->lis_single_subio_index = stripe; + } else { + OBD_ALLOC_PTR(sub->sub_io); + if (sub->sub_io == NULL) + result = -ENOMEM; + } + } + } + + if (result == 0) { + sub_obj = lovsub2cl(lov_r0(lov)->lo_sub[stripe]); + sub_io = sub->sub_io; + + sub_io->ci_obj = sub_obj; + sub_io->ci_result = 0; + + sub_io->ci_parent = io; + sub_io->ci_lockreq = io->ci_lockreq; + sub_io->ci_type = io->ci_type; + sub_io->ci_no_srvlock = io->ci_no_srvlock; + sub_io->ci_noatime = io->ci_noatime; + + lov_sub_enter(sub); + result = cl_io_sub_init(sub->sub_env, sub_io, + io->ci_type, sub_obj); + lov_sub_exit(sub); + if (result >= 0) { + lio->lis_active_subios++; + sub->sub_io_initialized = 1; + result = 0; + } + } + if (result != 0) + lov_io_sub_fini(env, lio, sub); + return result; +} + +struct lov_io_sub *lov_sub_get(const struct lu_env *env, + struct lov_io *lio, int stripe) +{ + int rc; + struct lov_io_sub *sub = &lio->lis_subs[stripe]; + + LASSERT(stripe < lio->lis_stripe_count); + + if (!sub->sub_io_initialized) { + sub->sub_stripe = stripe; + rc = lov_io_sub_init(env, lio, sub); + } else + rc = 0; + if (rc == 0) + lov_sub_enter(sub); + else + sub = ERR_PTR(rc); + return sub; +} + +void lov_sub_put(struct lov_io_sub *sub) +{ + lov_sub_exit(sub); +} + +/***************************************************************************** + * + * Lov io operations. + * + */ + +static int lov_page_stripe(const struct cl_page *page) +{ + struct lovsub_object *subobj; + + subobj = lu2lovsub( + lu_object_locate(page->cp_child->cp_obj->co_lu.lo_header, + &lovsub_device_type)); + LASSERT(subobj != NULL); + return subobj->lso_index; +} + +struct lov_io_sub *lov_page_subio(const struct lu_env *env, struct lov_io *lio, + const struct cl_page_slice *slice) +{ + struct lov_stripe_md *lsm = lio->lis_object->lo_lsm; + struct cl_page *page = slice->cpl_page; + int stripe; + + LASSERT(lio->lis_cl.cis_io != NULL); + LASSERT(cl2lov(slice->cpl_obj) == lio->lis_object); + LASSERT(lsm != NULL); + LASSERT(lio->lis_nr_subios > 0); + + stripe = lov_page_stripe(page); + return lov_sub_get(env, lio, stripe); +} + + +static int lov_io_subio_init(const struct lu_env *env, struct lov_io *lio, + struct cl_io *io) +{ + struct lov_stripe_md *lsm = lio->lis_object->lo_lsm; + int result; + + LASSERT(lio->lis_object != NULL); + + /* + * Need to be optimized, we can't afford to allocate a piece of memory + * when writing a page. 
-jay + */ + OBD_ALLOC_LARGE(lio->lis_subs, + lsm->lsm_stripe_count * sizeof(lio->lis_subs[0])); + if (lio->lis_subs != NULL) { + lio->lis_nr_subios = lio->lis_stripe_count; + lio->lis_single_subio_index = -1; + lio->lis_active_subios = 0; + result = 0; + } else + result = -ENOMEM; + return result; +} + +static void lov_io_slice_init(struct lov_io *lio, + struct lov_object *obj, struct cl_io *io) +{ + io->ci_result = 0; + lio->lis_object = obj; + + LASSERT(obj->lo_lsm != NULL); + lio->lis_stripe_count = obj->lo_lsm->lsm_stripe_count; + + switch (io->ci_type) { + case CIT_READ: + case CIT_WRITE: + lio->lis_pos = io->u.ci_rw.crw_pos; + lio->lis_endpos = io->u.ci_rw.crw_pos + io->u.ci_rw.crw_count; + lio->lis_io_endpos = lio->lis_endpos; + if (cl_io_is_append(io)) { + LASSERT(io->ci_type == CIT_WRITE); + lio->lis_pos = 0; + lio->lis_endpos = OBD_OBJECT_EOF; + } + break; + + case CIT_SETATTR: + if (cl_io_is_trunc(io)) + lio->lis_pos = io->u.ci_setattr.sa_attr.lvb_size; + else + lio->lis_pos = 0; + lio->lis_endpos = OBD_OBJECT_EOF; + break; + + case CIT_FAULT: { + pgoff_t index = io->u.ci_fault.ft_index; + lio->lis_pos = cl_offset(io->ci_obj, index); + lio->lis_endpos = cl_offset(io->ci_obj, index + 1); + break; + } + + case CIT_FSYNC: { + lio->lis_pos = io->u.ci_fsync.fi_start; + lio->lis_endpos = io->u.ci_fsync.fi_end; + break; + } + + case CIT_MISC: + lio->lis_pos = 0; + lio->lis_endpos = OBD_OBJECT_EOF; + break; + + default: + LBUG(); + } +} + +static void lov_io_fini(const struct lu_env *env, const struct cl_io_slice *ios) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct lov_object *lov = cl2lov(ios->cis_obj); + int i; + + if (lio->lis_subs != NULL) { + for (i = 0; i < lio->lis_nr_subios; i++) + lov_io_sub_fini(env, lio, &lio->lis_subs[i]); + OBD_FREE_LARGE(lio->lis_subs, + lio->lis_nr_subios * sizeof(lio->lis_subs[0])); + lio->lis_nr_subios = 0; + } + + LASSERT(atomic_read(&lov->lo_active_ios) > 0); + if (atomic_dec_and_test(&lov->lo_active_ios)) + wake_up_all(&lov->lo_waitq); +} + +static u64 lov_offset_mod(u64 val, int delta) +{ + if (val != OBD_OBJECT_EOF) + val += delta; + return val; +} + +static int lov_io_iter_init(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct lov_stripe_md *lsm = lio->lis_object->lo_lsm; + struct lov_io_sub *sub; + u64 endpos; + u64 start; + u64 end; + int stripe; + int rc = 0; + + endpos = lov_offset_mod(lio->lis_endpos, -1); + for (stripe = 0; stripe < lio->lis_stripe_count; stripe++) { + if (!lov_stripe_intersects(lsm, stripe, lio->lis_pos, + endpos, &start, &end)) + continue; + + if (unlikely(lov_r0(lio->lis_object)->lo_sub[stripe] == NULL)) { + if (ios->cis_io->ci_type == CIT_READ || + ios->cis_io->ci_type == CIT_WRITE || + ios->cis_io->ci_type == CIT_FAULT) + return -EIO; + + continue; + } + + end = lov_offset_mod(end, 1); + sub = lov_sub_get(env, lio, stripe); + if (!IS_ERR(sub)) { + lov_io_sub_inherit(sub->sub_io, lio, stripe, + start, end); + rc = cl_io_iter_init(sub->sub_env, sub->sub_io); + lov_sub_put(sub); + CDEBUG(D_VFSTRACE, "shrink: %d [%llu, %llu)\n", + stripe, start, end); + } else + rc = PTR_ERR(sub); + + if (!rc) + list_add_tail(&sub->sub_linkage, &lio->lis_active); + else + break; + } + return rc; +} + +static int lov_io_rw_iter_init(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct cl_io *io = ios->cis_io; + struct lov_stripe_md *lsm = lio->lis_object->lo_lsm; + __u64 start = io->u.ci_rw.crw_pos; + loff_t 
next; + unsigned long ssize = lsm->lsm_stripe_size; + + LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); + + /* fast path for common case. */ + if (lio->lis_nr_subios != 1 && !cl_io_is_append(io)) { + + lov_do_div64(start, ssize); + next = (start + 1) * ssize; + if (next <= start * ssize) + next = ~0ull; + + io->ci_continue = next < lio->lis_io_endpos; + io->u.ci_rw.crw_count = min_t(loff_t, lio->lis_io_endpos, + next) - io->u.ci_rw.crw_pos; + lio->lis_pos = io->u.ci_rw.crw_pos; + lio->lis_endpos = io->u.ci_rw.crw_pos + io->u.ci_rw.crw_count; + CDEBUG(D_VFSTRACE, "stripe: %llu chunk: [%llu, %llu) %llu\n", + (__u64)start, lio->lis_pos, lio->lis_endpos, + (__u64)lio->lis_io_endpos); + } + /* + * XXX The following call should be optimized: we know, that + * [lio->lis_pos, lio->lis_endpos) intersects with exactly one stripe. + */ + return lov_io_iter_init(env, ios); +} + +static int lov_io_call(const struct lu_env *env, struct lov_io *lio, + int (*iofunc)(const struct lu_env *, struct cl_io *)) +{ + struct cl_io *parent = lio->lis_cl.cis_io; + struct lov_io_sub *sub; + int rc = 0; + + list_for_each_entry(sub, &lio->lis_active, sub_linkage) { + lov_sub_enter(sub); + rc = iofunc(sub->sub_env, sub->sub_io); + lov_sub_exit(sub); + if (rc) + break; + + if (parent->ci_result == 0) + parent->ci_result = sub->sub_io->ci_result; + } + return rc; +} + +static int lov_io_lock(const struct lu_env *env, const struct cl_io_slice *ios) +{ + return lov_io_call(env, cl2lov_io(env, ios), cl_io_lock); +} + +static int lov_io_start(const struct lu_env *env, const struct cl_io_slice *ios) +{ + return lov_io_call(env, cl2lov_io(env, ios), cl_io_start); +} + +static int lov_io_end_wrapper(const struct lu_env *env, struct cl_io *io) +{ + /* + * It's possible that lov_io_start() wasn't called against this + * sub-io, either because previous sub-io failed, or upper layer + * completed IO. + */ + if (io->ci_state == CIS_IO_GOING) + cl_io_end(env, io); + else + io->ci_state = CIS_IO_FINISHED; + return 0; +} + +static int lov_io_iter_fini_wrapper(const struct lu_env *env, struct cl_io *io) +{ + cl_io_iter_fini(env, io); + return 0; +} + +static int lov_io_unlock_wrapper(const struct lu_env *env, struct cl_io *io) +{ + cl_io_unlock(env, io); + return 0; +} + +static void lov_io_end(const struct lu_env *env, const struct cl_io_slice *ios) +{ + int rc; + + rc = lov_io_call(env, cl2lov_io(env, ios), lov_io_end_wrapper); + LASSERT(rc == 0); +} + +static void lov_io_iter_fini(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct lov_io *lio = cl2lov_io(env, ios); + int rc; + + rc = lov_io_call(env, lio, lov_io_iter_fini_wrapper); + LASSERT(rc == 0); + while (!list_empty(&lio->lis_active)) + list_del_init(lio->lis_active.next); +} + +static void lov_io_unlock(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + int rc; + + rc = lov_io_call(env, cl2lov_io(env, ios), lov_io_unlock_wrapper); + LASSERT(rc == 0); +} + + +static struct cl_page_list *lov_io_submit_qin(struct lov_device *ld, + struct cl_page_list *qin, + int idx, int alloc) +{ + return alloc ? &qin[idx] : &ld->ld_emrg[idx]->emrg_page_list; +} + +/** + * lov implementation of cl_operations::cio_submit() method. It takes a list + * of pages in \a queue, splits it into per-stripe sub-lists, invokes + * cl_io_submit() on underlying devices to submit sub-lists, and then splices + * everything back. 
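+ *
+ * In outline (a simplified sketch of the flow implemented below, leaving the
+ * emergency path aside):
+ *
+ *	cl_page_list_for_each_safe(page, tmp, qin)
+ *		cl_page_list_move(QIN(lov_page_stripe(page)), qin, page);
+ *	for each stripe with a non-empty QIN(stripe):
+ *		cl_page_list_splice(QIN(stripe), &cl2q->c2_qin);
+ *		cl_io_submit_rw(sub->sub_env, sub->sub_io, crt, cl2q);
+ *		splice cl2q->c2_qin and cl2q->c2_qout back into \a queue;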
+ * + * Major complication of this function is a need to handle memory cleansing: + * cl_io_submit() is called to write out pages as a part of VM memory + * reclamation, and hence it may not fail due to memory shortages (system + * dead-locks otherwise). To deal with this, some resources (sub-lists, + * sub-environment, etc.) are allocated per-device on "startup" (i.e., in a + * not-memory cleansing context), and in case of memory shortage, these + * pre-allocated resources are used by lov_io_submit() under + * lov_device::ld_mutex mutex. + */ +static int lov_io_submit(const struct lu_env *env, + const struct cl_io_slice *ios, + enum cl_req_type crt, struct cl_2queue *queue) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct lov_object *obj = lio->lis_object; + struct lov_device *ld = lu2lov_dev(lov2cl(obj)->co_lu.lo_dev); + struct cl_page_list *qin = &queue->c2_qin; + struct cl_2queue *cl2q = &lov_env_info(env)->lti_cl2q; + struct cl_page_list *stripes_qin = NULL; + struct cl_page *page; + struct cl_page *tmp; + int stripe; + +#define QIN(stripe) lov_io_submit_qin(ld, stripes_qin, stripe, alloc) + + int rc = 0; + int alloc = + !(current->flags & PF_MEMALLOC); + + if (lio->lis_active_subios == 1) { + int idx = lio->lis_single_subio_index; + struct lov_io_sub *sub; + + LASSERT(idx < lio->lis_nr_subios); + sub = lov_sub_get(env, lio, idx); + LASSERT(!IS_ERR(sub)); + LASSERT(sub->sub_io == &lio->lis_single_subio); + rc = cl_io_submit_rw(sub->sub_env, sub->sub_io, + crt, queue); + lov_sub_put(sub); + return rc; + } + + LASSERT(lio->lis_subs != NULL); + if (alloc) { + OBD_ALLOC_LARGE(stripes_qin, + sizeof(*stripes_qin) * lio->lis_nr_subios); + if (stripes_qin == NULL) + return -ENOMEM; + + for (stripe = 0; stripe < lio->lis_nr_subios; stripe++) + cl_page_list_init(&stripes_qin[stripe]); + } else { + /* + * If we get here, it means pageout & swap doesn't help. + * In order to not make things worse, even don't try to + * allocate the memory with __GFP_NOWARN. 
-jay + */ + mutex_lock(&ld->ld_mutex); + lio->lis_mem_frozen = 1; + } + + cl_2queue_init(cl2q); + cl_page_list_for_each_safe(page, tmp, qin) { + stripe = lov_page_stripe(page); + cl_page_list_move(QIN(stripe), qin, page); + } + + for (stripe = 0; stripe < lio->lis_nr_subios; stripe++) { + struct lov_io_sub *sub; + struct cl_page_list *sub_qin = QIN(stripe); + + if (list_empty(&sub_qin->pl_pages)) + continue; + + cl_page_list_splice(sub_qin, &cl2q->c2_qin); + sub = lov_sub_get(env, lio, stripe); + if (!IS_ERR(sub)) { + rc = cl_io_submit_rw(sub->sub_env, sub->sub_io, + crt, cl2q); + lov_sub_put(sub); + } else + rc = PTR_ERR(sub); + cl_page_list_splice(&cl2q->c2_qin, &queue->c2_qin); + cl_page_list_splice(&cl2q->c2_qout, &queue->c2_qout); + if (rc != 0) + break; + } + + for (stripe = 0; stripe < lio->lis_nr_subios; stripe++) { + struct cl_page_list *sub_qin = QIN(stripe); + + if (list_empty(&sub_qin->pl_pages)) + continue; + + cl_page_list_splice(sub_qin, qin); + } + + if (alloc) { + OBD_FREE_LARGE(stripes_qin, + sizeof(*stripes_qin) * lio->lis_nr_subios); + } else { + int i; + + for (i = 0; i < lio->lis_nr_subios; i++) { + struct cl_io *cio = lio->lis_subs[i].sub_io; + + if (cio && cio == &ld->ld_emrg[i]->emrg_subio) + lov_io_sub_fini(env, lio, &lio->lis_subs[i]); + } + lio->lis_mem_frozen = 0; + mutex_unlock(&ld->ld_mutex); + } + + return rc; +#undef QIN +} + +static int lov_io_prepare_write(const struct lu_env *env, + const struct cl_io_slice *ios, + const struct cl_page_slice *slice, + unsigned from, unsigned to) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct cl_page *sub_page = lov_sub_page(slice); + struct lov_io_sub *sub; + int result; + + sub = lov_page_subio(env, lio, slice); + if (!IS_ERR(sub)) { + result = cl_io_prepare_write(sub->sub_env, sub->sub_io, + sub_page, from, to); + lov_sub_put(sub); + } else + result = PTR_ERR(sub); + return result; +} + +static int lov_io_commit_write(const struct lu_env *env, + const struct cl_io_slice *ios, + const struct cl_page_slice *slice, + unsigned from, unsigned to) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct cl_page *sub_page = lov_sub_page(slice); + struct lov_io_sub *sub; + int result; + + sub = lov_page_subio(env, lio, slice); + if (!IS_ERR(sub)) { + result = cl_io_commit_write(sub->sub_env, sub->sub_io, + sub_page, from, to); + lov_sub_put(sub); + } else + result = PTR_ERR(sub); + return result; +} + +static int lov_io_fault_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_fault_io *fio; + struct lov_io *lio; + struct lov_io_sub *sub; + + fio = &ios->cis_io->u.ci_fault; + lio = cl2lov_io(env, ios); + sub = lov_sub_get(env, lio, lov_page_stripe(fio->ft_page)); + sub->sub_io->u.ci_fault.ft_nob = fio->ft_nob; + lov_sub_put(sub); + return lov_io_start(env, ios); +} + +static void lov_io_fsync_end(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct lov_io_sub *sub; + unsigned int *written = &ios->cis_io->u.ci_fsync.fi_nr_written; + + *written = 0; + list_for_each_entry(sub, &lio->lis_active, sub_linkage) { + struct cl_io *subio = sub->sub_io; + + lov_sub_enter(sub); + lov_io_end_wrapper(sub->sub_env, subio); + lov_sub_exit(sub); + + if (subio->ci_result == 0) + *written += subio->u.ci_fsync.fi_nr_written; + } +} + +static const struct cl_io_operations lov_io_ops = { + .op = { + [CIT_READ] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_rw_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + 
.cio_unlock = lov_io_unlock, + .cio_start = lov_io_start, + .cio_end = lov_io_end + }, + [CIT_WRITE] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_rw_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_start, + .cio_end = lov_io_end + }, + [CIT_SETATTR] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_start, + .cio_end = lov_io_end + }, + [CIT_FAULT] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_fault_start, + .cio_end = lov_io_end + }, + [CIT_FSYNC] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_start, + .cio_end = lov_io_fsync_end + }, + [CIT_MISC] = { + .cio_fini = lov_io_fini + } + }, + .req_op = { + [CRT_READ] = { + .cio_submit = lov_io_submit + }, + [CRT_WRITE] = { + .cio_submit = lov_io_submit + } + }, + .cio_prepare_write = lov_io_prepare_write, + .cio_commit_write = lov_io_commit_write +}; + +/***************************************************************************** + * + * Empty lov io operations. + * + */ + +static void lov_empty_io_fini(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct lov_object *lov = cl2lov(ios->cis_obj); + + if (atomic_dec_and_test(&lov->lo_active_ios)) + wake_up_all(&lov->lo_waitq); +} + +static void lov_empty_impossible(const struct lu_env *env, + struct cl_io_slice *ios) +{ + LBUG(); +} + +#define LOV_EMPTY_IMPOSSIBLE ((void *)lov_empty_impossible) + +/** + * An io operation vector for files without stripes. 
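+ *
+ * It is installed by lov_io_init_empty() and lov_io_init_released() below
+ * only when the io type can complete trivially at the lov layer; the
+ * LOV_EMPTY_IMPOSSIBLE entries are stubs that LBUG() if they are ever
+ * reached.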
+ */ +static const struct cl_io_operations lov_empty_io_ops = { + .op = { + [CIT_READ] = { + .cio_fini = lov_empty_io_fini, +#if 0 + .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, + .cio_lock = LOV_EMPTY_IMPOSSIBLE, + .cio_start = LOV_EMPTY_IMPOSSIBLE, + .cio_end = LOV_EMPTY_IMPOSSIBLE +#endif + }, + [CIT_WRITE] = { + .cio_fini = lov_empty_io_fini, + .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, + .cio_lock = LOV_EMPTY_IMPOSSIBLE, + .cio_start = LOV_EMPTY_IMPOSSIBLE, + .cio_end = LOV_EMPTY_IMPOSSIBLE + }, + [CIT_SETATTR] = { + .cio_fini = lov_empty_io_fini, + .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, + .cio_lock = LOV_EMPTY_IMPOSSIBLE, + .cio_start = LOV_EMPTY_IMPOSSIBLE, + .cio_end = LOV_EMPTY_IMPOSSIBLE + }, + [CIT_FAULT] = { + .cio_fini = lov_empty_io_fini, + .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, + .cio_lock = LOV_EMPTY_IMPOSSIBLE, + .cio_start = LOV_EMPTY_IMPOSSIBLE, + .cio_end = LOV_EMPTY_IMPOSSIBLE + }, + [CIT_FSYNC] = { + .cio_fini = lov_empty_io_fini + }, + [CIT_MISC] = { + .cio_fini = lov_empty_io_fini + } + }, + .req_op = { + [CRT_READ] = { + .cio_submit = LOV_EMPTY_IMPOSSIBLE + }, + [CRT_WRITE] = { + .cio_submit = LOV_EMPTY_IMPOSSIBLE + } + }, + .cio_commit_write = LOV_EMPTY_IMPOSSIBLE +}; + +int lov_io_init_raid0(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io) +{ + struct lov_io *lio = lov_env_io(env); + struct lov_object *lov = cl2lov(obj); + + INIT_LIST_HEAD(&lio->lis_active); + lov_io_slice_init(lio, lov, io); + if (io->ci_result == 0) { + io->ci_result = lov_io_subio_init(env, lio, io); + if (io->ci_result == 0) { + cl_io_slice_add(io, &lio->lis_cl, obj, &lov_io_ops); + atomic_inc(&lov->lo_active_ios); + } + } + return io->ci_result; +} + +int lov_io_init_empty(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io) +{ + struct lov_object *lov = cl2lov(obj); + struct lov_io *lio = lov_env_io(env); + int result; + + lio->lis_object = lov; + switch (io->ci_type) { + default: + LBUG(); + case CIT_MISC: + case CIT_READ: + result = 0; + break; + case CIT_FSYNC: + case CIT_SETATTR: + result = 1; + break; + case CIT_WRITE: + result = -EBADF; + break; + case CIT_FAULT: + result = -EFAULT; + CERROR("Page fault on a file without stripes: "DFID"\n", + PFID(lu_object_fid(&obj->co_lu))); + break; + } + if (result == 0) { + cl_io_slice_add(io, &lio->lis_cl, obj, &lov_empty_io_ops); + atomic_inc(&lov->lo_active_ios); + } + + io->ci_result = result < 0 ? result : 0; + return result != 0; +} + +int lov_io_init_released(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io) +{ + struct lov_object *lov = cl2lov(obj); + struct lov_io *lio = lov_env_io(env); + int result; + + LASSERT(lov->lo_lsm != NULL); + lio->lis_object = lov; + + switch (io->ci_type) { + default: + LASSERTF(0, "invalid type %d\n", io->ci_type); + case CIT_MISC: + case CIT_FSYNC: + result = 1; + break; + case CIT_SETATTR: + /* the truncate to 0 is managed by MDT: + * - in open, for open O_TRUNC + * - in setattr, for truncate + */ + /* the truncate is for size > 0 so triggers a restore */ + if (cl_io_is_trunc(io)) + io->ci_restore_needed = 1; + result = -ENODATA; + break; + case CIT_READ: + case CIT_WRITE: + case CIT_FAULT: + io->ci_restore_needed = 1; + result = -ENODATA; + break; + } + if (result == 0) { + cl_io_slice_add(io, &lio->lis_cl, obj, &lov_empty_io_ops); + atomic_inc(&lov->lo_active_ios); + } + + io->ci_result = result < 0 ? 
result : 0; + return result != 0; +} +/** @} lov */ diff --git a/kernel/drivers/staging/lustre/lustre/lov/lov_lock.c b/kernel/drivers/staging/lustre/lustre/lov/lov_lock.c new file mode 100644 index 000000000..f2eca565b --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lov/lov_lock.c @@ -0,0 +1,1198 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_lock for LOV layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov + * @{ + */ + +static struct cl_lock_closure *lov_closure_get(const struct lu_env *env, + struct cl_lock *parent); + +static int lov_lock_unuse(const struct lu_env *env, + const struct cl_lock_slice *slice); +/***************************************************************************** + * + * Lov lock operations. + * + */ + +static struct lov_sublock_env *lov_sublock_env_get(const struct lu_env *env, + struct cl_lock *parent, + struct lov_lock_sub *lls) +{ + struct lov_sublock_env *subenv; + struct lov_io *lio = lov_env_io(env); + struct cl_io *io = lio->lis_cl.cis_io; + struct lov_io_sub *sub; + + subenv = &lov_env_session(env)->ls_subenv; + + /* + * FIXME: We tend to use the subio's env & io to call the sublock + * lock operations because osc lock sometimes stores some control + * variables in thread's IO information(Now only lockless information). + * However, if the lock's host(object) is different from the object + * for current IO, we have no way to get the subenv and subio because + * they are not initialized at all. As a temp fix, in this case, + * we still borrow the parent's env to call sublock operations. 
+ */ + if (!io || !cl_object_same(io->ci_obj, parent->cll_descr.cld_obj)) { + subenv->lse_env = env; + subenv->lse_io = io; + subenv->lse_sub = NULL; + } else { + sub = lov_sub_get(env, lio, lls->sub_stripe); + if (!IS_ERR(sub)) { + subenv->lse_env = sub->sub_env; + subenv->lse_io = sub->sub_io; + subenv->lse_sub = sub; + } else { + subenv = (void *)sub; + } + } + return subenv; +} + +static void lov_sublock_env_put(struct lov_sublock_env *subenv) +{ + if (subenv && subenv->lse_sub) + lov_sub_put(subenv->lse_sub); +} + +static void lov_sublock_adopt(const struct lu_env *env, struct lov_lock *lck, + struct cl_lock *sublock, int idx, + struct lov_lock_link *link) +{ + struct lovsub_lock *lsl; + struct cl_lock *parent = lck->lls_cl.cls_lock; + int rc; + + LASSERT(cl_lock_is_mutexed(parent)); + LASSERT(cl_lock_is_mutexed(sublock)); + + lsl = cl2sub_lock(sublock); + /* + * check that sub-lock doesn't have lock link to this top-lock. + */ + LASSERT(lov_lock_link_find(env, lck, lsl) == NULL); + LASSERT(idx < lck->lls_nr); + + lck->lls_sub[idx].sub_lock = lsl; + lck->lls_nr_filled++; + LASSERT(lck->lls_nr_filled <= lck->lls_nr); + list_add_tail(&link->lll_list, &lsl->lss_parents); + link->lll_idx = idx; + link->lll_super = lck; + cl_lock_get(parent); + lu_ref_add(&parent->cll_reference, "lov-child", sublock); + lck->lls_sub[idx].sub_flags |= LSF_HELD; + cl_lock_user_add(env, sublock); + + rc = lov_sublock_modify(env, lck, lsl, &sublock->cll_descr, idx); + LASSERT(rc == 0); /* there is no way this can fail, currently */ +} + +static struct cl_lock *lov_sublock_alloc(const struct lu_env *env, + const struct cl_io *io, + struct lov_lock *lck, + int idx, struct lov_lock_link **out) +{ + struct cl_lock *sublock; + struct cl_lock *parent; + struct lov_lock_link *link; + + LASSERT(idx < lck->lls_nr); + + OBD_SLAB_ALLOC_PTR_GFP(link, lov_lock_link_kmem, GFP_NOFS); + if (link != NULL) { + struct lov_sublock_env *subenv; + struct lov_lock_sub *lls; + struct cl_lock_descr *descr; + + parent = lck->lls_cl.cls_lock; + lls = &lck->lls_sub[idx]; + descr = &lls->sub_got; + + subenv = lov_sublock_env_get(env, parent, lls); + if (!IS_ERR(subenv)) { + /* CAVEAT: Don't try to add a field in lov_lock_sub + * to remember the subio. This is because lock is able + * to be cached, but this is not true for IO. This + * further means a sublock might be referenced in + * different io context. -jay */ + + sublock = cl_lock_hold(subenv->lse_env, subenv->lse_io, + descr, "lov-parent", parent); + lov_sublock_env_put(subenv); + } else { + /* error occurs. 
*/ + sublock = (void *)subenv; + } + + if (!IS_ERR(sublock)) + *out = link; + else + OBD_SLAB_FREE_PTR(link, lov_lock_link_kmem); + } else + sublock = ERR_PTR(-ENOMEM); + return sublock; +} + +static void lov_sublock_unlock(const struct lu_env *env, + struct lovsub_lock *lsl, + struct cl_lock_closure *closure, + struct lov_sublock_env *subenv) +{ + lov_sublock_env_put(subenv); + lsl->lss_active = NULL; + cl_lock_disclosure(env, closure); +} + +static int lov_sublock_lock(const struct lu_env *env, + struct lov_lock *lck, + struct lov_lock_sub *lls, + struct cl_lock_closure *closure, + struct lov_sublock_env **lsep) +{ + struct lovsub_lock *sublock; + struct cl_lock *child; + int result = 0; + + LASSERT(list_empty(&closure->clc_list)); + + sublock = lls->sub_lock; + child = sublock->lss_cl.cls_lock; + result = cl_lock_closure_build(env, child, closure); + if (result == 0) { + struct cl_lock *parent = closure->clc_origin; + + LASSERT(cl_lock_is_mutexed(child)); + sublock->lss_active = parent; + + if (unlikely((child->cll_state == CLS_FREEING) || + (child->cll_flags & CLF_CANCELLED))) { + struct lov_lock_link *link; + /* + * we could race with lock deletion which temporarily + * put the lock in freeing state, bug 19080. + */ + LASSERT(!(lls->sub_flags & LSF_HELD)); + + link = lov_lock_link_find(env, lck, sublock); + LASSERT(link != NULL); + lov_lock_unlink(env, link, sublock); + lov_sublock_unlock(env, sublock, closure, NULL); + lck->lls_cancel_race = 1; + result = CLO_REPEAT; + } else if (lsep) { + struct lov_sublock_env *subenv; + subenv = lov_sublock_env_get(env, parent, lls); + if (IS_ERR(subenv)) { + lov_sublock_unlock(env, sublock, + closure, NULL); + result = PTR_ERR(subenv); + } else { + *lsep = subenv; + } + } + } + return result; +} + +/** + * Updates the result of a top-lock operation from a result of sub-lock + * sub-operations. Top-operations like lov_lock_{enqueue,use,unuse}() iterate + * over sub-locks and lov_subresult() is used to calculate return value of a + * top-operation. To this end, possible return values of sub-operations are + * ordered as + * + * - 0 success + * - CLO_WAIT wait for event + * - CLO_REPEAT repeat top-operation + * - -ne fundamental error + * + * Top-level return code can only go down through this list. CLO_REPEAT + * overwrites CLO_WAIT, because lock mutex was released and sleeping condition + * has to be rechecked by the upper layer. + */ +static int lov_subresult(int result, int rc) +{ + int result_rank; + int rc_rank; + + LASSERTF(result <= 0 || result == CLO_REPEAT || result == CLO_WAIT, + "result = %d", result); + LASSERTF(rc <= 0 || rc == CLO_REPEAT || rc == CLO_WAIT, + "rc = %d\n", rc); + CLASSERT(CLO_WAIT < CLO_REPEAT); + + /* calculate ranks in the ordering above */ + result_rank = result < 0 ? 1 + CLO_REPEAT : result; + rc_rank = rc < 0 ? 1 + CLO_REPEAT : rc; + + if (result_rank < rc_rank) + result = rc; + return result; +} + +/** + * Creates sub-locks for a given lov_lock for the first time. + * + * Goes through all sub-objects of top-object, and creates sub-locks on every + * sub-object intersecting with top-lock extent. This is complicated by the + * fact that top-lock (that is being created) can be accessed concurrently + * through already created sub-locks (possibly shared with other top-locks). 
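+ *
+ * As a concrete illustration (assuming a plain RAID0 layout with a 1 MiB
+ * stripe size and three stripes): a top-lock on file bytes [0, 4 MiB)
+ * intersects stripes 0, 1 and 2, so three entries of lls_sub[] are filled,
+ * each with the extent converted into the corresponding sub-object's own
+ * index space; a top-lock on [0, 512 KiB) intersects stripe 0 only and gets
+ * a single sub-lock description.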
+ */ +static int lov_lock_sub_init(const struct lu_env *env, + struct lov_lock *lck, const struct cl_io *io) +{ + int result = 0; + int i; + int nr; + u64 start; + u64 end; + u64 file_start; + u64 file_end; + + struct lov_object *loo = cl2lov(lck->lls_cl.cls_obj); + struct lov_layout_raid0 *r0 = lov_r0(loo); + struct cl_lock *parent = lck->lls_cl.cls_lock; + + lck->lls_orig = parent->cll_descr; + file_start = cl_offset(lov2cl(loo), parent->cll_descr.cld_start); + file_end = cl_offset(lov2cl(loo), parent->cll_descr.cld_end + 1) - 1; + + for (i = 0, nr = 0; i < r0->lo_nr; i++) { + /* + * XXX for wide striping smarter algorithm is desirable, + * breaking out of the loop, early. + */ + if (likely(r0->lo_sub[i] != NULL) && + lov_stripe_intersects(loo->lo_lsm, i, + file_start, file_end, &start, &end)) + nr++; + } + LASSERT(nr > 0); + OBD_ALLOC_LARGE(lck->lls_sub, nr * sizeof(lck->lls_sub[0])); + if (lck->lls_sub == NULL) + return -ENOMEM; + + lck->lls_nr = nr; + /* + * First, fill in sub-lock descriptions in + * lck->lls_sub[].sub_descr. They are used by lov_sublock_alloc() + * (called below in this function, and by lov_lock_enqueue()) to + * create sub-locks. At this moment, no other thread can access + * top-lock. + */ + for (i = 0, nr = 0; i < r0->lo_nr; ++i) { + if (likely(r0->lo_sub[i] != NULL) && + lov_stripe_intersects(loo->lo_lsm, i, + file_start, file_end, &start, &end)) { + struct cl_lock_descr *descr; + + descr = &lck->lls_sub[nr].sub_descr; + + LASSERT(descr->cld_obj == NULL); + descr->cld_obj = lovsub2cl(r0->lo_sub[i]); + descr->cld_start = cl_index(descr->cld_obj, start); + descr->cld_end = cl_index(descr->cld_obj, end); + descr->cld_mode = parent->cll_descr.cld_mode; + descr->cld_gid = parent->cll_descr.cld_gid; + descr->cld_enq_flags = parent->cll_descr.cld_enq_flags; + /* XXX has no effect */ + lck->lls_sub[nr].sub_got = *descr; + lck->lls_sub[nr].sub_stripe = i; + nr++; + } + } + LASSERT(nr == lck->lls_nr); + + /* + * Some sub-locks can be missing at this point. This is not a problem, + * because enqueue will create them anyway. Main duty of this function + * is to fill in sub-lock descriptions in a race free manner. + */ + return result; +} + +static int lov_sublock_release(const struct lu_env *env, struct lov_lock *lck, + int i, int deluser, int rc) +{ + struct cl_lock *parent = lck->lls_cl.cls_lock; + + LASSERT(cl_lock_is_mutexed(parent)); + + if (lck->lls_sub[i].sub_flags & LSF_HELD) { + struct cl_lock *sublock; + int dying; + + LASSERT(lck->lls_sub[i].sub_lock != NULL); + sublock = lck->lls_sub[i].sub_lock->lss_cl.cls_lock; + LASSERT(cl_lock_is_mutexed(sublock)); + + lck->lls_sub[i].sub_flags &= ~LSF_HELD; + if (deluser) + cl_lock_user_del(env, sublock); + /* + * If the last hold is released, and cancellation is pending + * for a sub-lock, release parent mutex, to avoid keeping it + * while sub-lock is being paged out. + */ + dying = (sublock->cll_descr.cld_mode == CLM_PHANTOM || + sublock->cll_descr.cld_mode == CLM_GROUP || + (sublock->cll_flags & (CLF_CANCELPEND|CLF_DOOMED))) && + sublock->cll_holds == 1; + if (dying) + cl_lock_mutex_put(env, parent); + cl_lock_unhold(env, sublock, "lov-parent", parent); + if (dying) { + cl_lock_mutex_get(env, parent); + rc = lov_subresult(rc, CLO_REPEAT); + } + /* + * From now on lck->lls_sub[i].sub_lock is a "weak" pointer, + * not backed by a reference on a + * sub-lock. lovsub_lock_delete() will clear + * lck->lls_sub[i].sub_lock under semaphores, just before + * sub-lock is destroyed. 
+ */ + } + return rc; +} + +static void lov_sublock_hold(const struct lu_env *env, struct lov_lock *lck, + int i) +{ + struct cl_lock *parent = lck->lls_cl.cls_lock; + + LASSERT(cl_lock_is_mutexed(parent)); + + if (!(lck->lls_sub[i].sub_flags & LSF_HELD)) { + struct cl_lock *sublock; + + LASSERT(lck->lls_sub[i].sub_lock != NULL); + sublock = lck->lls_sub[i].sub_lock->lss_cl.cls_lock; + LASSERT(cl_lock_is_mutexed(sublock)); + LASSERT(sublock->cll_state != CLS_FREEING); + + lck->lls_sub[i].sub_flags |= LSF_HELD; + + cl_lock_get_trust(sublock); + cl_lock_hold_add(env, sublock, "lov-parent", parent); + cl_lock_user_add(env, sublock); + cl_lock_put(env, sublock); + } +} + +static void lov_lock_fini(const struct lu_env *env, + struct cl_lock_slice *slice) +{ + struct lov_lock *lck; + int i; + + lck = cl2lov_lock(slice); + LASSERT(lck->lls_nr_filled == 0); + if (lck->lls_sub != NULL) { + for (i = 0; i < lck->lls_nr; ++i) + /* + * No sub-locks exists at this point, as sub-lock has + * a reference on its parent. + */ + LASSERT(lck->lls_sub[i].sub_lock == NULL); + OBD_FREE_LARGE(lck->lls_sub, + lck->lls_nr * sizeof(lck->lls_sub[0])); + } + OBD_SLAB_FREE_PTR(lck, lov_lock_kmem); +} + +static int lov_lock_enqueue_wait(const struct lu_env *env, + struct lov_lock *lck, + struct cl_lock *sublock) +{ + struct cl_lock *lock = lck->lls_cl.cls_lock; + int result; + + LASSERT(cl_lock_is_mutexed(lock)); + + cl_lock_mutex_put(env, lock); + result = cl_lock_enqueue_wait(env, sublock, 0); + cl_lock_mutex_get(env, lock); + return result ?: CLO_REPEAT; +} + +/** + * Tries to advance a state machine of a given sub-lock toward enqueuing of + * the top-lock. + * + * \retval 0 if state-transition can proceed + * \retval -ve otherwise. + */ +static int lov_lock_enqueue_one(const struct lu_env *env, struct lov_lock *lck, + struct cl_lock *sublock, + struct cl_io *io, __u32 enqflags, int last) +{ + int result; + + /* first, try to enqueue a sub-lock ... */ + result = cl_enqueue_try(env, sublock, io, enqflags); + if ((sublock->cll_state == CLS_ENQUEUED) && !(enqflags & CEF_AGL)) { + /* if it is enqueued, try to `wait' on it---maybe it's already + * granted */ + result = cl_wait_try(env, sublock); + if (result == CLO_REENQUEUED) + result = CLO_WAIT; + } + /* + * If CEF_ASYNC flag is set, then all sub-locks can be enqueued in + * parallel, otherwise---enqueue has to wait until sub-lock is granted + * before proceeding to the next one. + */ + if ((result == CLO_WAIT) && (sublock->cll_state <= CLS_HELD) && + (enqflags & CEF_ASYNC) && (!last || (enqflags & CEF_AGL))) + result = 0; + return result; +} + +/** + * Helper function for lov_lock_enqueue() that creates missing sub-lock. 
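+ *
+ * Roughly: the parent mutex is dropped (it is held with cll_depth == 1), a
+ * sub-lock is allocated and held via lov_sublock_alloc(), both mutexes are
+ * re-acquired, and the new sub-lock is adopted into lck->lls_sub[idx] only
+ * if the top-lock is still in CLS_QUEUING and the slot is still empty;
+ * otherwise it is released again. On success CLO_REPEAT is returned so the
+ * caller restarts its iteration, since the mutex was dropped in the
+ * meantime.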
+ */ +static int lov_sublock_fill(const struct lu_env *env, struct cl_lock *parent, + struct cl_io *io, struct lov_lock *lck, int idx) +{ + struct lov_lock_link *link = NULL; + struct cl_lock *sublock; + int result; + + LASSERT(parent->cll_depth == 1); + cl_lock_mutex_put(env, parent); + sublock = lov_sublock_alloc(env, io, lck, idx, &link); + if (!IS_ERR(sublock)) + cl_lock_mutex_get(env, sublock); + cl_lock_mutex_get(env, parent); + + if (!IS_ERR(sublock)) { + cl_lock_get_trust(sublock); + if (parent->cll_state == CLS_QUEUING && + lck->lls_sub[idx].sub_lock == NULL) { + lov_sublock_adopt(env, lck, sublock, idx, link); + } else { + OBD_SLAB_FREE_PTR(link, lov_lock_link_kmem); + /* other thread allocated sub-lock, or enqueue is no + * longer going on */ + cl_lock_mutex_put(env, parent); + cl_lock_unhold(env, sublock, "lov-parent", parent); + cl_lock_mutex_get(env, parent); + } + cl_lock_mutex_put(env, sublock); + cl_lock_put(env, sublock); + result = CLO_REPEAT; + } else + result = PTR_ERR(sublock); + return result; +} + +/** + * Implementation of cl_lock_operations::clo_enqueue() for lov layer. This + * function is rather subtle, as it enqueues top-lock (i.e., advances top-lock + * state machine from CLS_QUEUING to CLS_ENQUEUED states) by juggling sub-lock + * state machines in the face of sub-locks sharing (by multiple top-locks), + * and concurrent sub-lock cancellations. + */ +static int lov_lock_enqueue(const struct lu_env *env, + const struct cl_lock_slice *slice, + struct cl_io *io, __u32 enqflags) +{ + struct cl_lock *lock = slice->cls_lock; + struct lov_lock *lck = cl2lov_lock(slice); + struct cl_lock_closure *closure = lov_closure_get(env, lock); + int i; + int result; + enum cl_lock_state minstate; + + for (result = 0, minstate = CLS_FREEING, i = 0; i < lck->lls_nr; ++i) { + int rc; + struct lovsub_lock *sub; + struct lov_lock_sub *lls; + struct cl_lock *sublock; + struct lov_sublock_env *subenv; + + if (lock->cll_state != CLS_QUEUING) { + /* + * Lock might have left QUEUING state if previous + * iteration released its mutex. Stop enqueing in this + * case and let the upper layer to decide what to do. + */ + LASSERT(i > 0 && result != 0); + break; + } + + lls = &lck->lls_sub[i]; + sub = lls->sub_lock; + /* + * Sub-lock might have been canceled, while top-lock was + * cached. + */ + if (sub == NULL) { + result = lov_sublock_fill(env, lock, io, lck, i); + /* lov_sublock_fill() released @lock mutex, + * restart. */ + break; + } + sublock = sub->lss_cl.cls_lock; + rc = lov_sublock_lock(env, lck, lls, closure, &subenv); + if (rc == 0) { + lov_sublock_hold(env, lck, i); + rc = lov_lock_enqueue_one(subenv->lse_env, lck, sublock, + subenv->lse_io, enqflags, + i == lck->lls_nr - 1); + minstate = min(minstate, sublock->cll_state); + if (rc == CLO_WAIT) { + switch (sublock->cll_state) { + case CLS_QUEUING: + /* take recursive mutex, the lock is + * released in lov_lock_enqueue_wait. + */ + cl_lock_mutex_get(env, sublock); + lov_sublock_unlock(env, sub, closure, + subenv); + rc = lov_lock_enqueue_wait(env, lck, + sublock); + break; + case CLS_CACHED: + cl_lock_get(sublock); + /* take recursive mutex of sublock */ + cl_lock_mutex_get(env, sublock); + /* need to release all locks in closure + * otherwise it may deadlock. LU-2683.*/ + lov_sublock_unlock(env, sub, closure, + subenv); + /* sublock and parent are held. 
*/ + rc = lov_sublock_release(env, lck, i, + 1, rc); + cl_lock_mutex_put(env, sublock); + cl_lock_put(env, sublock); + break; + default: + lov_sublock_unlock(env, sub, closure, + subenv); + break; + } + } else { + LASSERT(sublock->cll_conflict == NULL); + lov_sublock_unlock(env, sub, closure, subenv); + } + } + result = lov_subresult(result, rc); + if (result != 0) + break; + } + cl_lock_closure_fini(closure); + return result ?: minstate >= CLS_ENQUEUED ? 0 : CLO_WAIT; +} + +static int lov_lock_unuse(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct lov_lock *lck = cl2lov_lock(slice); + struct cl_lock_closure *closure = lov_closure_get(env, slice->cls_lock); + int i; + int result; + + for (result = 0, i = 0; i < lck->lls_nr; ++i) { + int rc; + struct lovsub_lock *sub; + struct cl_lock *sublock; + struct lov_lock_sub *lls; + struct lov_sublock_env *subenv; + + /* top-lock state cannot change concurrently, because single + * thread (one that released the last hold) carries unlocking + * to the completion. */ + LASSERT(slice->cls_lock->cll_state == CLS_INTRANSIT); + lls = &lck->lls_sub[i]; + sub = lls->sub_lock; + if (sub == NULL) + continue; + + sublock = sub->lss_cl.cls_lock; + rc = lov_sublock_lock(env, lck, lls, closure, &subenv); + if (rc == 0) { + if (lls->sub_flags & LSF_HELD) { + LASSERT(sublock->cll_state == CLS_HELD || + sublock->cll_state == CLS_ENQUEUED); + rc = cl_unuse_try(subenv->lse_env, sublock); + rc = lov_sublock_release(env, lck, i, 0, rc); + } + lov_sublock_unlock(env, sub, closure, subenv); + } + result = lov_subresult(result, rc); + } + + if (result == 0 && lck->lls_cancel_race) { + lck->lls_cancel_race = 0; + result = -ESTALE; + } + cl_lock_closure_fini(closure); + return result; +} + + +static void lov_lock_cancel(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct lov_lock *lck = cl2lov_lock(slice); + struct cl_lock_closure *closure = lov_closure_get(env, slice->cls_lock); + int i; + int result; + + for (result = 0, i = 0; i < lck->lls_nr; ++i) { + int rc; + struct lovsub_lock *sub; + struct cl_lock *sublock; + struct lov_lock_sub *lls; + struct lov_sublock_env *subenv; + + /* top-lock state cannot change concurrently, because single + * thread (one that released the last hold) carries unlocking + * to the completion. 
*/ + lls = &lck->lls_sub[i]; + sub = lls->sub_lock; + if (sub == NULL) + continue; + + sublock = sub->lss_cl.cls_lock; + rc = lov_sublock_lock(env, lck, lls, closure, &subenv); + if (rc == 0) { + if (!(lls->sub_flags & LSF_HELD)) { + lov_sublock_unlock(env, sub, closure, subenv); + continue; + } + + switch (sublock->cll_state) { + case CLS_HELD: + rc = cl_unuse_try(subenv->lse_env, sublock); + lov_sublock_release(env, lck, i, 0, 0); + break; + default: + lov_sublock_release(env, lck, i, 1, 0); + break; + } + lov_sublock_unlock(env, sub, closure, subenv); + } + + if (rc == CLO_REPEAT) { + --i; + continue; + } + + result = lov_subresult(result, rc); + } + + if (result) + CL_LOCK_DEBUG(D_ERROR, env, slice->cls_lock, + "lov_lock_cancel fails with %d.\n", result); + + cl_lock_closure_fini(closure); +} + +static int lov_lock_wait(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct lov_lock *lck = cl2lov_lock(slice); + struct cl_lock_closure *closure = lov_closure_get(env, slice->cls_lock); + enum cl_lock_state minstate; + int reenqueued; + int result; + int i; + +again: + for (result = 0, minstate = CLS_FREEING, i = 0, reenqueued = 0; + i < lck->lls_nr; ++i) { + int rc; + struct lovsub_lock *sub; + struct cl_lock *sublock; + struct lov_lock_sub *lls; + struct lov_sublock_env *subenv; + + lls = &lck->lls_sub[i]; + sub = lls->sub_lock; + LASSERT(sub != NULL); + sublock = sub->lss_cl.cls_lock; + rc = lov_sublock_lock(env, lck, lls, closure, &subenv); + if (rc == 0) { + LASSERT(sublock->cll_state >= CLS_ENQUEUED); + if (sublock->cll_state < CLS_HELD) + rc = cl_wait_try(env, sublock); + + minstate = min(minstate, sublock->cll_state); + lov_sublock_unlock(env, sub, closure, subenv); + } + if (rc == CLO_REENQUEUED) { + reenqueued++; + rc = 0; + } + result = lov_subresult(result, rc); + if (result != 0) + break; + } + /* Each sublock only can be reenqueued once, so will not loop for + * ever. */ + if (result == 0 && reenqueued != 0) + goto again; + cl_lock_closure_fini(closure); + return result ?: minstate >= CLS_HELD ? 0 : CLO_WAIT; +} + +static int lov_lock_use(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct lov_lock *lck = cl2lov_lock(slice); + struct cl_lock_closure *closure = lov_closure_get(env, slice->cls_lock); + int result; + int i; + + LASSERT(slice->cls_lock->cll_state == CLS_INTRANSIT); + + for (result = 0, i = 0; i < lck->lls_nr; ++i) { + int rc; + struct lovsub_lock *sub; + struct cl_lock *sublock; + struct lov_lock_sub *lls; + struct lov_sublock_env *subenv; + + LASSERT(slice->cls_lock->cll_state == CLS_INTRANSIT); + + lls = &lck->lls_sub[i]; + sub = lls->sub_lock; + if (sub == NULL) { + /* + * Sub-lock might have been canceled, while top-lock was + * cached. + */ + result = -ESTALE; + break; + } + + sublock = sub->lss_cl.cls_lock; + rc = lov_sublock_lock(env, lck, lls, closure, &subenv); + if (rc == 0) { + LASSERT(sublock->cll_state != CLS_FREEING); + lov_sublock_hold(env, lck, i); + if (sublock->cll_state == CLS_CACHED) { + rc = cl_use_try(subenv->lse_env, sublock, 0); + if (rc != 0) + rc = lov_sublock_release(env, lck, + i, 1, rc); + } else if (sublock->cll_state == CLS_NEW) { + /* Sub-lock might have been canceled, while + * top-lock was cached. 
*/ + result = -ESTALE; + lov_sublock_release(env, lck, i, 1, result); + } + lov_sublock_unlock(env, sub, closure, subenv); + } + result = lov_subresult(result, rc); + if (result != 0) + break; + } + + if (lck->lls_cancel_race) { + /* + * If there is unlocking happened at the same time, then + * sublock_lock state should be FREEING, and lov_sublock_lock + * should return CLO_REPEAT. In this case, it should return + * ESTALE, and up layer should reset the lock state to be NEW. + */ + lck->lls_cancel_race = 0; + LASSERT(result != 0); + result = -ESTALE; + } + cl_lock_closure_fini(closure); + return result; +} + +#if 0 +static int lock_lock_multi_match() +{ + struct cl_lock *lock = slice->cls_lock; + struct cl_lock_descr *subneed = &lov_env_info(env)->lti_ldescr; + struct lov_object *loo = cl2lov(lov->lls_cl.cls_obj); + struct lov_layout_raid0 *r0 = lov_r0(loo); + struct lov_lock_sub *sub; + struct cl_object *subobj; + u64 fstart; + u64 fend; + u64 start; + u64 end; + int i; + + fstart = cl_offset(need->cld_obj, need->cld_start); + fend = cl_offset(need->cld_obj, need->cld_end + 1) - 1; + subneed->cld_mode = need->cld_mode; + cl_lock_mutex_get(env, lock); + for (i = 0; i < lov->lls_nr; ++i) { + sub = &lov->lls_sub[i]; + if (sub->sub_lock == NULL) + continue; + subobj = sub->sub_descr.cld_obj; + if (!lov_stripe_intersects(loo->lo_lsm, sub->sub_stripe, + fstart, fend, &start, &end)) + continue; + subneed->cld_start = cl_index(subobj, start); + subneed->cld_end = cl_index(subobj, end); + subneed->cld_obj = subobj; + if (!cl_lock_ext_match(&sub->sub_got, subneed)) { + result = 0; + break; + } + } + cl_lock_mutex_put(env, lock); +} +#endif + +/** + * Check if the extent region \a descr is covered by \a child against the + * specific \a stripe. + */ +static int lov_lock_stripe_is_matching(const struct lu_env *env, + struct lov_object *lov, int stripe, + const struct cl_lock_descr *child, + const struct cl_lock_descr *descr) +{ + struct lov_stripe_md *lsm = lov->lo_lsm; + u64 start; + u64 end; + int result; + + if (lov_r0(lov)->lo_nr == 1) + return cl_lock_ext_match(child, descr); + + /* + * For a multi-stripes object: + * - make sure the descr only covers child's stripe, and + * - check if extent is matching. + */ + start = cl_offset(&lov->lo_cl, descr->cld_start); + end = cl_offset(&lov->lo_cl, descr->cld_end + 1) - 1; + result = 0; + /* glimpse should work on the object with LOV EA hole. */ + if (end - start <= lsm->lsm_stripe_size) { + int idx; + + idx = lov_stripe_number(lsm, start); + if (idx == stripe || + unlikely(lov_r0(lov)->lo_sub[idx] == NULL)) { + idx = lov_stripe_number(lsm, end); + if (idx == stripe || + unlikely(lov_r0(lov)->lo_sub[idx] == NULL)) + result = 1; + } + } + + if (result != 0) { + struct cl_lock_descr *subd = &lov_env_info(env)->lti_ldescr; + u64 sub_start; + u64 sub_end; + + subd->cld_obj = NULL; /* don't need sub object at all */ + subd->cld_mode = descr->cld_mode; + subd->cld_gid = descr->cld_gid; + result = lov_stripe_intersects(lsm, stripe, start, end, + &sub_start, &sub_end); + LASSERT(result); + subd->cld_start = cl_index(child->cld_obj, sub_start); + subd->cld_end = cl_index(child->cld_obj, sub_end); + result = cl_lock_ext_match(child, subd); + } + return result; +} + +/** + * An implementation of cl_lock_operations::clo_fits_into() method. + * + * Checks whether a lock (given by \a slice) is suitable for \a + * io. Multi-stripe locks can be used only for "quick" io, like truncate, or + * O_APPEND write. + * + * \see ccc_lock_fits_into(). 
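+ *
+ * For example (assuming a plain RAID0 layout, two stripes of 1 MiB): a
+ * cached lock whose single sub-lock covers stripe 0 may match a read of
+ * file bytes [0, 1 MiB), which touches stripe 0 only, but not a read of
+ * [0, 2 MiB), which also needs stripe 1; a CLM_GROUP lock, or a
+ * multi-stripe lock used by truncate or O_APPEND write, is instead matched
+ * against the original top-lock extent (lls_orig).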
+ */ +static int lov_lock_fits_into(const struct lu_env *env, + const struct cl_lock_slice *slice, + const struct cl_lock_descr *need, + const struct cl_io *io) +{ + struct lov_lock *lov = cl2lov_lock(slice); + struct lov_object *obj = cl2lov(slice->cls_obj); + int result; + + LASSERT(cl_object_same(need->cld_obj, slice->cls_obj)); + LASSERT(lov->lls_nr > 0); + + /* for top lock, it's necessary to match enq flags otherwise it will + * run into problem if a sublock is missing and reenqueue. */ + if (need->cld_enq_flags != lov->lls_orig.cld_enq_flags) + return 0; + + if (need->cld_mode == CLM_GROUP) + /* + * always allow to match group lock. + */ + result = cl_lock_ext_match(&lov->lls_orig, need); + else if (lov->lls_nr == 1) { + struct cl_lock_descr *got = &lov->lls_sub[0].sub_got; + result = lov_lock_stripe_is_matching(env, + cl2lov(slice->cls_obj), + lov->lls_sub[0].sub_stripe, + got, need); + } else if (io->ci_type != CIT_SETATTR && io->ci_type != CIT_MISC && + !cl_io_is_append(io) && need->cld_mode != CLM_PHANTOM) + /* + * Multi-stripe locks are only suitable for `quick' IO and for + * glimpse. + */ + result = 0; + else + /* + * Most general case: multi-stripe existing lock, and + * (potentially) multi-stripe @need lock. Check that @need is + * covered by @lov's sub-locks. + * + * For now, ignore lock expansions made by the server, and + * match against original lock extent. + */ + result = cl_lock_ext_match(&lov->lls_orig, need); + CDEBUG(D_DLMTRACE, DDESCR"/"DDESCR" %d %d/%d: %d\n", + PDESCR(&lov->lls_orig), PDESCR(&lov->lls_sub[0].sub_got), + lov->lls_sub[0].sub_stripe, lov->lls_nr, lov_r0(obj)->lo_nr, + result); + return result; +} + +void lov_lock_unlink(const struct lu_env *env, + struct lov_lock_link *link, struct lovsub_lock *sub) +{ + struct lov_lock *lck = link->lll_super; + struct cl_lock *parent = lck->lls_cl.cls_lock; + + LASSERT(cl_lock_is_mutexed(parent)); + LASSERT(cl_lock_is_mutexed(sub->lss_cl.cls_lock)); + + list_del_init(&link->lll_list); + LASSERT(lck->lls_sub[link->lll_idx].sub_lock == sub); + /* yank this sub-lock from parent's array */ + lck->lls_sub[link->lll_idx].sub_lock = NULL; + LASSERT(lck->lls_nr_filled > 0); + lck->lls_nr_filled--; + lu_ref_del(&parent->cll_reference, "lov-child", sub->lss_cl.cls_lock); + cl_lock_put(env, parent); + OBD_SLAB_FREE_PTR(link, lov_lock_link_kmem); +} + +struct lov_lock_link *lov_lock_link_find(const struct lu_env *env, + struct lov_lock *lck, + struct lovsub_lock *sub) +{ + struct lov_lock_link *scan; + + LASSERT(cl_lock_is_mutexed(sub->lss_cl.cls_lock)); + + list_for_each_entry(scan, &sub->lss_parents, lll_list) { + if (scan->lll_super == lck) + return scan; + } + return NULL; +} + +/** + * An implementation of cl_lock_operations::clo_delete() method. This is + * invoked for "top-to-bottom" delete, when lock destruction starts from the + * top-lock, e.g., as a result of inode destruction. + * + * Unlinks top-lock from all its sub-locks. Sub-locks are not deleted there: + * this is done separately elsewhere: + * + * - for inode destruction, lov_object_delete() calls cl_object_kill() for + * each sub-object, purging its locks; + * + * - in other cases (e.g., a fatal error with a top-lock) sub-locks are + * left in the cache. 
+ */ +static void lov_lock_delete(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct lov_lock *lck = cl2lov_lock(slice); + struct cl_lock_closure *closure = lov_closure_get(env, slice->cls_lock); + struct lov_lock_link *link; + int rc; + int i; + + LASSERT(slice->cls_lock->cll_state == CLS_FREEING); + + for (i = 0; i < lck->lls_nr; ++i) { + struct lov_lock_sub *lls = &lck->lls_sub[i]; + struct lovsub_lock *lsl = lls->sub_lock; + + if (lsl == NULL) /* already removed */ + continue; + + rc = lov_sublock_lock(env, lck, lls, closure, NULL); + if (rc == CLO_REPEAT) { + --i; + continue; + } + + LASSERT(rc == 0); + LASSERT(lsl->lss_cl.cls_lock->cll_state < CLS_FREEING); + + if (lls->sub_flags & LSF_HELD) + lov_sublock_release(env, lck, i, 1, 0); + + link = lov_lock_link_find(env, lck, lsl); + LASSERT(link != NULL); + lov_lock_unlink(env, link, lsl); + LASSERT(lck->lls_sub[i].sub_lock == NULL); + + lov_sublock_unlock(env, lsl, closure, NULL); + } + + cl_lock_closure_fini(closure); +} + +static int lov_lock_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct cl_lock_slice *slice) +{ + struct lov_lock *lck = cl2lov_lock(slice); + int i; + + (*p)(env, cookie, "%d\n", lck->lls_nr); + for (i = 0; i < lck->lls_nr; ++i) { + struct lov_lock_sub *sub; + + sub = &lck->lls_sub[i]; + (*p)(env, cookie, " %d %x: ", i, sub->sub_flags); + if (sub->sub_lock != NULL) + cl_lock_print(env, cookie, p, + sub->sub_lock->lss_cl.cls_lock); + else + (*p)(env, cookie, "---\n"); + } + return 0; +} + +static const struct cl_lock_operations lov_lock_ops = { + .clo_fini = lov_lock_fini, + .clo_enqueue = lov_lock_enqueue, + .clo_wait = lov_lock_wait, + .clo_use = lov_lock_use, + .clo_unuse = lov_lock_unuse, + .clo_cancel = lov_lock_cancel, + .clo_fits_into = lov_lock_fits_into, + .clo_delete = lov_lock_delete, + .clo_print = lov_lock_print +}; + +int lov_lock_init_raid0(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io) +{ + struct lov_lock *lck; + int result; + + OBD_SLAB_ALLOC_PTR_GFP(lck, lov_lock_kmem, GFP_NOFS); + if (lck != NULL) { + cl_lock_slice_add(lock, &lck->lls_cl, obj, &lov_lock_ops); + result = lov_lock_sub_init(env, lck, io); + } else + result = -ENOMEM; + return result; +} + +static void lov_empty_lock_fini(const struct lu_env *env, + struct cl_lock_slice *slice) +{ + struct lov_lock *lck = cl2lov_lock(slice); + OBD_SLAB_FREE_PTR(lck, lov_lock_kmem); +} + +static int lov_empty_lock_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct cl_lock_slice *slice) +{ + (*p)(env, cookie, "empty\n"); + return 0; +} + +/* XXX: more methods will be added later. 
*/ +static const struct cl_lock_operations lov_empty_lock_ops = { + .clo_fini = lov_empty_lock_fini, + .clo_print = lov_empty_lock_print +}; + +int lov_lock_init_empty(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io) +{ + struct lov_lock *lck; + int result = -ENOMEM; + + OBD_SLAB_ALLOC_PTR_GFP(lck, lov_lock_kmem, GFP_NOFS); + if (lck != NULL) { + cl_lock_slice_add(lock, &lck->lls_cl, obj, &lov_empty_lock_ops); + lck->lls_orig = lock->cll_descr; + result = 0; + } + return result; +} + +static struct cl_lock_closure *lov_closure_get(const struct lu_env *env, + struct cl_lock *parent) +{ + struct cl_lock_closure *closure; + + closure = &lov_env_info(env)->lti_closure; + LASSERT(list_empty(&closure->clc_list)); + cl_lock_closure_init(env, closure, parent, 1); + return closure; +} + + +/** @} lov */ diff --git a/kernel/drivers/staging/lustre/lustre/lov/lov_merge.c b/kernel/drivers/staging/lustre/lustre/lov/lov_merge.c new file mode 100644 index 000000000..b7e7bfabe --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lov/lov_merge.c @@ -0,0 +1,186 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "../../include/linux/libcfs/libcfs.h" + +#include "../include/obd_class.h" +#include "lov_internal.h" + +/** Merge the lock value block(&lvb) attributes and KMS from each of the + * stripes in a file into a single lvb. It is expected that the caller + * initializes the current atime, mtime, ctime to avoid regressing a more + * uptodate time on the local client. 
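+ *
+ * As a worked example (assuming a plain RAID0 layout, two stripes of 1 MiB):
+ * a stripe-local KMS of 3 MiB on stripe 1 covers the file ranges [1, 2),
+ * [3, 4) and [5, 6) MiB, so it contributes a file-level KMS of 6 MiB. The
+ * merged KMS and size are the maxima of such per-stripe values, blocks are
+ * summed across stripes, and mtime/atime/ctime each keep the newest value
+ * seen.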
+ */ +int lov_merge_lvb_kms(struct lov_stripe_md *lsm, + struct ost_lvb *lvb, __u64 *kms_place) +{ + __u64 size = 0; + __u64 kms = 0; + __u64 blocks = 0; + s64 current_mtime = lvb->lvb_mtime; + s64 current_atime = lvb->lvb_atime; + s64 current_ctime = lvb->lvb_ctime; + int i; + int rc = 0; + + assert_spin_locked(&lsm->lsm_lock); + LASSERT(lsm->lsm_lock_owner == current_pid()); + + CDEBUG(D_INODE, "MDT ID "DOSTID" initial value: s=%llu m=%llu a=%llu c=%llu b=%llu\n", + POSTID(&lsm->lsm_oi), lvb->lvb_size, lvb->lvb_mtime, + lvb->lvb_atime, lvb->lvb_ctime, lvb->lvb_blocks); + for (i = 0; i < lsm->lsm_stripe_count; i++) { + struct lov_oinfo *loi = lsm->lsm_oinfo[i]; + u64 lov_size, tmpsize; + + if (OST_LVB_IS_ERR(loi->loi_lvb.lvb_blocks)) { + rc = OST_LVB_GET_ERR(loi->loi_lvb.lvb_blocks); + continue; + } + + tmpsize = loi->loi_kms; + lov_size = lov_stripe_size(lsm, tmpsize, i); + if (lov_size > kms) + kms = lov_size; + + if (loi->loi_lvb.lvb_size > tmpsize) + tmpsize = loi->loi_lvb.lvb_size; + + lov_size = lov_stripe_size(lsm, tmpsize, i); + if (lov_size > size) + size = lov_size; + /* merge blocks, mtime, atime */ + blocks += loi->loi_lvb.lvb_blocks; + if (loi->loi_lvb.lvb_mtime > current_mtime) + current_mtime = loi->loi_lvb.lvb_mtime; + if (loi->loi_lvb.lvb_atime > current_atime) + current_atime = loi->loi_lvb.lvb_atime; + if (loi->loi_lvb.lvb_ctime > current_ctime) + current_ctime = loi->loi_lvb.lvb_ctime; + + CDEBUG(D_INODE, "MDT ID "DOSTID" on OST[%u]: s=%llu m=%llu a=%llu c=%llu b=%llu\n", + POSTID(&lsm->lsm_oi), loi->loi_ost_idx, + loi->loi_lvb.lvb_size, loi->loi_lvb.lvb_mtime, + loi->loi_lvb.lvb_atime, loi->loi_lvb.lvb_ctime, + loi->loi_lvb.lvb_blocks); + } + + *kms_place = kms; + lvb->lvb_size = size; + lvb->lvb_blocks = blocks; + lvb->lvb_mtime = current_mtime; + lvb->lvb_atime = current_atime; + lvb->lvb_ctime = current_ctime; + return rc; +} + +/* Must be called under the lov_stripe_lock() */ +int lov_adjust_kms(struct obd_export *exp, struct lov_stripe_md *lsm, + u64 size, int shrink) +{ + struct lov_oinfo *loi; + int stripe = 0; + __u64 kms; + + assert_spin_locked(&lsm->lsm_lock); + LASSERT(lsm->lsm_lock_owner == current_pid()); + + if (shrink) { + for (; stripe < lsm->lsm_stripe_count; stripe++) { + struct lov_oinfo *loi = lsm->lsm_oinfo[stripe]; + kms = lov_size_to_stripe(lsm, size, stripe); + CDEBUG(D_INODE, + "stripe %d KMS %sing %llu->%llu\n", + stripe, kms > loi->loi_kms ? "increase":"shrink", + loi->loi_kms, kms); + loi_kms_set(loi, loi->loi_lvb.lvb_size = kms); + } + return 0; + } + + if (size > 0) + stripe = lov_stripe_number(lsm, size - 1); + kms = lov_size_to_stripe(lsm, size, stripe); + loi = lsm->lsm_oinfo[stripe]; + + CDEBUG(D_INODE, "stripe %d KMS %sincreasing %llu->%llu\n", + stripe, kms > loi->loi_kms ? 
"" : "not ", loi->loi_kms, kms); + if (kms > loi->loi_kms) + loi_kms_set(loi, kms); + + return 0; +} + +void lov_merge_attrs(struct obdo *tgt, struct obdo *src, u64 valid, + struct lov_stripe_md *lsm, int stripeno, int *set) +{ + valid &= src->o_valid; + + if (*set) { + if (valid & OBD_MD_FLSIZE) { + /* this handles sparse files properly */ + u64 lov_size; + + lov_size = lov_stripe_size(lsm, src->o_size, stripeno); + if (lov_size > tgt->o_size) + tgt->o_size = lov_size; + } + if (valid & OBD_MD_FLBLOCKS) + tgt->o_blocks += src->o_blocks; + if (valid & OBD_MD_FLBLKSZ) + tgt->o_blksize += src->o_blksize; + if (valid & OBD_MD_FLCTIME && tgt->o_ctime < src->o_ctime) + tgt->o_ctime = src->o_ctime; + if (valid & OBD_MD_FLMTIME && tgt->o_mtime < src->o_mtime) + tgt->o_mtime = src->o_mtime; + if (valid & OBD_MD_FLDATAVERSION) + tgt->o_data_version += src->o_data_version; + } else { + memcpy(tgt, src, sizeof(*tgt)); + tgt->o_oi = lsm->lsm_oi; + if (valid & OBD_MD_FLSIZE) + tgt->o_size = lov_stripe_size(lsm, src->o_size, + stripeno); + } + + /* data_version needs to be valid on all stripes to be correct! */ + if (!(valid & OBD_MD_FLDATAVERSION)) + tgt->o_valid &= ~OBD_MD_FLDATAVERSION; + + *set += 1; +} diff --git a/kernel/drivers/staging/lustre/lustre/lov/lov_obd.c b/kernel/drivers/staging/lustre/lustre/lov/lov_obd.c new file mode 100644 index 000000000..027815766 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lov/lov_obd.c @@ -0,0 +1,2395 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/lov/lov_obd.c + * + * Author: Phil Schwan + * Author: Peter Braam + * Author: Mike Shaver + * Author: Nathan Rutman + */ + +#define DEBUG_SUBSYSTEM S_LOV +#include "../../include/linux/libcfs/libcfs.h" + +#include "../include/obd_support.h" +#include "../include/lustre_lib.h" +#include "../include/lustre_net.h" +#include "../include/lustre/lustre_idl.h" +#include "../include/lustre_dlm.h" +#include "../include/lustre_mds.h" +#include "../include/obd_class.h" +#include "../include/lprocfs_status.h" +#include "../include/lustre_param.h" +#include "../include/cl_object.h" +#include "../include/lclient.h" /* for cl_client_lru */ +#include "../include/lustre/ll_fiemap.h" +#include "../include/lustre_fid.h" + +#include "lov_internal.h" + +/* Keep a refcount of lov->tgt usage to prevent racing with addition/deletion. + Any function that expects lov_tgts to remain stationary must take a ref. */ +static void lov_getref(struct obd_device *obd) +{ + struct lov_obd *lov = &obd->u.lov; + + /* nobody gets through here until lov_putref is done */ + mutex_lock(&lov->lov_lock); + atomic_inc(&lov->lov_refcount); + mutex_unlock(&lov->lov_lock); + return; +} + +static void __lov_del_obd(struct obd_device *obd, struct lov_tgt_desc *tgt); + +static void lov_putref(struct obd_device *obd) +{ + struct lov_obd *lov = &obd->u.lov; + + mutex_lock(&lov->lov_lock); + /* ok to dec to 0 more than once -- ltd_exp's will be null */ + if (atomic_dec_and_test(&lov->lov_refcount) && lov->lov_death_row) { + LIST_HEAD(kill); + int i; + struct lov_tgt_desc *tgt, *n; + CDEBUG(D_CONFIG, "destroying %d lov targets\n", + lov->lov_death_row); + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + tgt = lov->lov_tgts[i]; + + if (!tgt || !tgt->ltd_reap) + continue; + list_add(&tgt->ltd_kill, &kill); + /* XXX - right now there is a dependency on ld_tgt_count + * being the maximum tgt index for computing the + * mds_max_easize. So we can't shrink it. */ + lov_ost_pool_remove(&lov->lov_packed, i); + lov->lov_tgts[i] = NULL; + lov->lov_death_row--; + } + mutex_unlock(&lov->lov_lock); + + list_for_each_entry_safe(tgt, n, &kill, ltd_kill) { + list_del(&tgt->ltd_kill); + /* Disconnect */ + __lov_del_obd(obd, tgt); + } + } else { + mutex_unlock(&lov->lov_lock); + } +} + +static int lov_set_osc_active(struct obd_device *obd, struct obd_uuid *uuid, + enum obd_notify_event ev); +static int lov_notify(struct obd_device *obd, struct obd_device *watched, + enum obd_notify_event ev, void *data); + + +#define MAX_STRING_SIZE 128 +int lov_connect_obd(struct obd_device *obd, __u32 index, int activate, + struct obd_connect_data *data) +{ + struct lov_obd *lov = &obd->u.lov; + struct obd_uuid *tgt_uuid; + struct obd_device *tgt_obd; + static struct obd_uuid lov_osc_uuid = { "LOV_OSC_UUID" }; + struct obd_import *imp; + struct proc_dir_entry *lov_proc_dir; + int rc; + + if (!lov->lov_tgts[index]) + return -EINVAL; + + tgt_uuid = &lov->lov_tgts[index]->ltd_uuid; + tgt_obd = lov->lov_tgts[index]->ltd_obd; + + if (!tgt_obd->obd_set_up) { + CERROR("Target %s not set up\n", obd_uuid2str(tgt_uuid)); + return -EINVAL; + } + + /* override the sp_me from lov */ + tgt_obd->u.cli.cl_sp_me = lov->lov_sp_me; + + if (data && (data->ocd_connect_flags & OBD_CONNECT_INDEX)) + data->ocd_index = index; + + /* + * Divine LOV knows that OBDs under it are OSCs. + */ + imp = tgt_obd->u.cli.cl_import; + + if (activate) { + tgt_obd->obd_no_recov = 0; + /* FIXME this is probably supposed to be + ptlrpc_set_import_active. Horrible naming. 
*/ + ptlrpc_activate_import(imp); + } + + rc = obd_register_observer(tgt_obd, obd); + if (rc) { + CERROR("Target %s register_observer error %d\n", + obd_uuid2str(tgt_uuid), rc); + return rc; + } + + + if (imp->imp_invalid) { + CDEBUG(D_CONFIG, "not connecting OSC %s; administratively disabled\n", + obd_uuid2str(tgt_uuid)); + return 0; + } + + rc = obd_connect(NULL, &lov->lov_tgts[index]->ltd_exp, tgt_obd, + &lov_osc_uuid, data, NULL); + if (rc || !lov->lov_tgts[index]->ltd_exp) { + CERROR("Target %s connect error %d\n", + obd_uuid2str(tgt_uuid), rc); + return -ENODEV; + } + + lov->lov_tgts[index]->ltd_reap = 0; + + CDEBUG(D_CONFIG, "Connected tgt idx %d %s (%s) %sactive\n", index, + obd_uuid2str(tgt_uuid), tgt_obd->obd_name, activate ? "":"in"); + + lov_proc_dir = obd->obd_proc_private; + if (lov_proc_dir) { + struct obd_device *osc_obd = lov->lov_tgts[index]->ltd_exp->exp_obd; + struct proc_dir_entry *osc_symlink; + + LASSERT(osc_obd != NULL); + LASSERT(osc_obd->obd_magic == OBD_DEVICE_MAGIC); + LASSERT(osc_obd->obd_type->typ_name != NULL); + + osc_symlink = lprocfs_add_symlink(osc_obd->obd_name, + lov_proc_dir, + "../../../%s/%s", + osc_obd->obd_type->typ_name, + osc_obd->obd_name); + if (osc_symlink == NULL) { + CERROR("could not register LOV target /proc/fs/lustre/%s/%s/target_obds/%s.", + obd->obd_type->typ_name, obd->obd_name, + osc_obd->obd_name); + lprocfs_remove(&lov_proc_dir); + obd->obd_proc_private = NULL; + } + } + + return 0; +} + +static int lov_connect(const struct lu_env *env, + struct obd_export **exp, struct obd_device *obd, + struct obd_uuid *cluuid, struct obd_connect_data *data, + void *localdata) +{ + struct lov_obd *lov = &obd->u.lov; + struct lov_tgt_desc *tgt; + struct lustre_handle conn; + int i, rc; + + CDEBUG(D_CONFIG, "connect #%d\n", lov->lov_connects); + + rc = class_connect(&conn, obd, cluuid); + if (rc) + return rc; + + *exp = class_conn2export(&conn); + + /* Why should there ever be more than 1 connect? */ + lov->lov_connects++; + LASSERT(lov->lov_connects == 1); + + memset(&lov->lov_ocd, 0, sizeof(lov->lov_ocd)); + if (data) + lov->lov_ocd = *data; + + obd_getref(obd); + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + tgt = lov->lov_tgts[i]; + if (!tgt || obd_uuid_empty(&tgt->ltd_uuid)) + continue; + /* Flags will be lowest common denominator */ + rc = lov_connect_obd(obd, i, tgt->ltd_activate, &lov->lov_ocd); + if (rc) { + CERROR("%s: lov connect tgt %d failed: %d\n", + obd->obd_name, i, rc); + continue; + } + /* connect to administrative disabled ost */ + if (!lov->lov_tgts[i]->ltd_exp) + continue; + + rc = lov_notify(obd, lov->lov_tgts[i]->ltd_exp->exp_obd, + OBD_NOTIFY_CONNECT, (void *)&i); + if (rc) { + CERROR("%s error sending notify %d\n", + obd->obd_name, rc); + } + } + obd_putref(obd); + + return 0; +} + +static int lov_disconnect_obd(struct obd_device *obd, struct lov_tgt_desc *tgt) +{ + struct proc_dir_entry *lov_proc_dir; + struct lov_obd *lov = &obd->u.lov; + struct obd_device *osc_obd; + int rc; + + osc_obd = class_exp2obd(tgt->ltd_exp); + CDEBUG(D_CONFIG, "%s: disconnecting target %s\n", + obd->obd_name, osc_obd ? osc_obd->obd_name : "NULL"); + + if (tgt->ltd_active) { + tgt->ltd_active = 0; + lov->desc.ld_active_tgt_count--; + tgt->ltd_exp->exp_obd->obd_inactive = 1; + } + + if (osc_obd) { + lov_proc_dir = obd->obd_proc_private; + if (lov_proc_dir) { + lprocfs_remove_proc_entry(osc_obd->obd_name, lov_proc_dir); + } + /* Pass it on to our clients. + * XXX This should be an argument to disconnect, + * XXX not a back-door flag on the OBD. 
Ah well. + */ + osc_obd->obd_force = obd->obd_force; + osc_obd->obd_fail = obd->obd_fail; + osc_obd->obd_no_recov = obd->obd_no_recov; + } + + obd_register_observer(osc_obd, NULL); + + rc = obd_disconnect(tgt->ltd_exp); + if (rc) { + CERROR("Target %s disconnect error %d\n", + tgt->ltd_uuid.uuid, rc); + rc = 0; + } + + tgt->ltd_exp = NULL; + return 0; +} + +static int lov_disconnect(struct obd_export *exp) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lov_obd *lov = &obd->u.lov; + int i, rc; + + if (!lov->lov_tgts) + goto out; + + /* Only disconnect the underlying layers on the final disconnect. */ + lov->lov_connects--; + if (lov->lov_connects != 0) { + /* why should there be more than 1 connect? */ + CERROR("disconnect #%d\n", lov->lov_connects); + goto out; + } + + /* Let's hold another reference so lov_del_obd doesn't spin through + putref every time */ + obd_getref(obd); + + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + if (lov->lov_tgts[i] && lov->lov_tgts[i]->ltd_exp) { + /* Disconnection is the last we know about an obd */ + lov_del_target(obd, i, NULL, lov->lov_tgts[i]->ltd_gen); + } + } + obd_putref(obd); + +out: + rc = class_disconnect(exp); /* bz 9811 */ + return rc; +} + +/* Error codes: + * + * -EINVAL : UUID can't be found in the LOV's target list + * -ENOTCONN: The UUID is found, but the target connection is bad (!) + * -EBADF : The UUID is found, but the OBD is the wrong type (!) + * any >= 0 : is log target index + */ +static int lov_set_osc_active(struct obd_device *obd, struct obd_uuid *uuid, + enum obd_notify_event ev) +{ + struct lov_obd *lov = &obd->u.lov; + struct lov_tgt_desc *tgt; + int index, activate, active; + + CDEBUG(D_INFO, "Searching in lov %p for uuid %s event(%d)\n", + lov, uuid->uuid, ev); + + obd_getref(obd); + for (index = 0; index < lov->desc.ld_tgt_count; index++) { + tgt = lov->lov_tgts[index]; + if (!tgt) + continue; + /* + * LU-642, initially inactive OSC could miss the obd_connect, + * we make up for it here. + */ + if (ev == OBD_NOTIFY_ACTIVATE && tgt->ltd_exp == NULL && + obd_uuid_equals(uuid, &tgt->ltd_uuid)) { + struct obd_uuid lov_osc_uuid = {"LOV_OSC_UUID"}; + + obd_connect(NULL, &tgt->ltd_exp, tgt->ltd_obd, + &lov_osc_uuid, &lov->lov_ocd, NULL); + } + if (!tgt->ltd_exp) + continue; + + CDEBUG(D_INFO, "lov idx %d is %s conn %#llx\n", + index, obd_uuid2str(&tgt->ltd_uuid), + tgt->ltd_exp->exp_handle.h_cookie); + if (obd_uuid_equals(uuid, &tgt->ltd_uuid)) + break; + } + + if (index == lov->desc.ld_tgt_count) { + index = -EINVAL; + goto out; + } + + if (ev == OBD_NOTIFY_DEACTIVATE || ev == OBD_NOTIFY_ACTIVATE) { + activate = (ev == OBD_NOTIFY_ACTIVATE) ? 1 : 0; + + if (lov->lov_tgts[index]->ltd_activate == activate) { + CDEBUG(D_INFO, "OSC %s already %sactivate!\n", + uuid->uuid, activate ? "" : "de"); + } else { + lov->lov_tgts[index]->ltd_activate = activate; + CDEBUG(D_CONFIG, "%sactivate OSC %s\n", + activate ? "" : "de", obd_uuid2str(uuid)); + } + + } else if (ev == OBD_NOTIFY_INACTIVE || ev == OBD_NOTIFY_ACTIVE) { + active = (ev == OBD_NOTIFY_ACTIVE) ? 1 : 0; + + if (lov->lov_tgts[index]->ltd_active == active) { + CDEBUG(D_INFO, "OSC %s already %sactive!\n", + uuid->uuid, active ? "" : "in"); + goto out; + } else { + CDEBUG(D_CONFIG, "Marking OSC %s %sactive\n", + obd_uuid2str(uuid), active ? 
"" : "in"); + } + + lov->lov_tgts[index]->ltd_active = active; + if (active) { + lov->desc.ld_active_tgt_count++; + lov->lov_tgts[index]->ltd_exp->exp_obd->obd_inactive = 0; + } else { + lov->desc.ld_active_tgt_count--; + lov->lov_tgts[index]->ltd_exp->exp_obd->obd_inactive = 1; + } + } else { + CERROR("Unknown event(%d) for uuid %s", ev, uuid->uuid); + } + + out: + obd_putref(obd); + return index; +} + +static int lov_notify(struct obd_device *obd, struct obd_device *watched, + enum obd_notify_event ev, void *data) +{ + int rc = 0; + struct lov_obd *lov = &obd->u.lov; + + down_read(&lov->lov_notify_lock); + if (!lov->lov_connects) { + up_read(&lov->lov_notify_lock); + return rc; + } + + if (ev == OBD_NOTIFY_ACTIVE || ev == OBD_NOTIFY_INACTIVE || + ev == OBD_NOTIFY_ACTIVATE || ev == OBD_NOTIFY_DEACTIVATE) { + struct obd_uuid *uuid; + + LASSERT(watched); + + if (strcmp(watched->obd_type->typ_name, LUSTRE_OSC_NAME)) { + up_read(&lov->lov_notify_lock); + CERROR("unexpected notification of %s %s!\n", + watched->obd_type->typ_name, + watched->obd_name); + return -EINVAL; + } + uuid = &watched->u.cli.cl_target_uuid; + + /* Set OSC as active before notifying the observer, so the + * observer can use the OSC normally. + */ + rc = lov_set_osc_active(obd, uuid, ev); + if (rc < 0) { + up_read(&lov->lov_notify_lock); + CERROR("event(%d) of %s failed: %d\n", ev, + obd_uuid2str(uuid), rc); + return rc; + } + /* active event should be pass lov target index as data */ + data = &rc; + } + + /* Pass the notification up the chain. */ + if (watched) { + rc = obd_notify_observer(obd, watched, ev, data); + } else { + /* NULL watched means all osc's in the lov (only for syncs) */ + /* sync event should be send lov idx as data */ + struct lov_obd *lov = &obd->u.lov; + int i, is_sync; + + data = &i; + is_sync = (ev == OBD_NOTIFY_SYNC) || + (ev == OBD_NOTIFY_SYNC_NONBLOCK); + + obd_getref(obd); + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + if (!lov->lov_tgts[i]) + continue; + + /* don't send sync event if target not + * connected/activated */ + if (is_sync && !lov->lov_tgts[i]->ltd_active) + continue; + + rc = obd_notify_observer(obd, lov->lov_tgts[i]->ltd_obd, + ev, data); + if (rc) { + CERROR("%s: notify %s of %s failed %d\n", + obd->obd_name, + obd->obd_observer->obd_name, + lov->lov_tgts[i]->ltd_obd->obd_name, + rc); + } + } + obd_putref(obd); + } + + up_read(&lov->lov_notify_lock); + return rc; +} + +static int lov_add_target(struct obd_device *obd, struct obd_uuid *uuidp, + __u32 index, int gen, int active) +{ + struct lov_obd *lov = &obd->u.lov; + struct lov_tgt_desc *tgt; + struct obd_device *tgt_obd; + int rc; + + CDEBUG(D_CONFIG, "uuid:%s idx:%d gen:%d active:%d\n", + uuidp->uuid, index, gen, active); + + if (gen <= 0) { + CERROR("request to add OBD %s with invalid generation: %d\n", + uuidp->uuid, gen); + return -EINVAL; + } + + tgt_obd = class_find_client_obd(uuidp, LUSTRE_OSC_NAME, + &obd->obd_uuid); + if (tgt_obd == NULL) + return -EINVAL; + + mutex_lock(&lov->lov_lock); + + if ((index < lov->lov_tgt_size) && (lov->lov_tgts[index] != NULL)) { + tgt = lov->lov_tgts[index]; + CERROR("UUID %s already assigned at LOV target index %d\n", + obd_uuid2str(&tgt->ltd_uuid), index); + mutex_unlock(&lov->lov_lock); + return -EEXIST; + } + + if (index >= lov->lov_tgt_size) { + /* We need to reallocate the lov target array. 
*/ + struct lov_tgt_desc **newtgts, **old = NULL; + __u32 newsize, oldsize = 0; + + newsize = max_t(__u32, lov->lov_tgt_size, 2); + while (newsize < index + 1) + newsize <<= 1; + OBD_ALLOC(newtgts, sizeof(*newtgts) * newsize); + if (newtgts == NULL) { + mutex_unlock(&lov->lov_lock); + return -ENOMEM; + } + + if (lov->lov_tgt_size) { + memcpy(newtgts, lov->lov_tgts, sizeof(*newtgts) * + lov->lov_tgt_size); + old = lov->lov_tgts; + oldsize = lov->lov_tgt_size; + } + + lov->lov_tgts = newtgts; + lov->lov_tgt_size = newsize; + smp_rmb(); + if (old) + OBD_FREE(old, sizeof(*old) * oldsize); + + CDEBUG(D_CONFIG, "tgts: %p size: %d\n", + lov->lov_tgts, lov->lov_tgt_size); + } + + OBD_ALLOC_PTR(tgt); + if (!tgt) { + mutex_unlock(&lov->lov_lock); + return -ENOMEM; + } + + rc = lov_ost_pool_add(&lov->lov_packed, index, lov->lov_tgt_size); + if (rc) { + mutex_unlock(&lov->lov_lock); + OBD_FREE_PTR(tgt); + return rc; + } + + tgt->ltd_uuid = *uuidp; + tgt->ltd_obd = tgt_obd; + /* XXX - add a sanity check on the generation number. */ + tgt->ltd_gen = gen; + tgt->ltd_index = index; + tgt->ltd_activate = active; + lov->lov_tgts[index] = tgt; + if (index >= lov->desc.ld_tgt_count) + lov->desc.ld_tgt_count = index + 1; + + mutex_unlock(&lov->lov_lock); + + CDEBUG(D_CONFIG, "idx=%d ltd_gen=%d ld_tgt_count=%d\n", + index, tgt->ltd_gen, lov->desc.ld_tgt_count); + + rc = obd_notify(obd, tgt_obd, OBD_NOTIFY_CREATE, &index); + + if (lov->lov_connects == 0) { + /* lov_connect hasn't been called yet. We'll do the + lov_connect_obd on this target when that fn first runs, + because we don't know the connect flags yet. */ + return 0; + } + + obd_getref(obd); + + rc = lov_connect_obd(obd, index, active, &lov->lov_ocd); + if (rc) + goto out; + + /* connect to administrative disabled ost */ + if (!tgt->ltd_exp) { + rc = 0; + goto out; + } + + if (lov->lov_cache != NULL) { + rc = obd_set_info_async(NULL, tgt->ltd_exp, + sizeof(KEY_CACHE_SET), KEY_CACHE_SET, + sizeof(struct cl_client_cache), lov->lov_cache, + NULL); + if (rc < 0) + goto out; + } + + rc = lov_notify(obd, tgt->ltd_exp->exp_obd, + active ? 
OBD_NOTIFY_CONNECT : OBD_NOTIFY_INACTIVE, + (void *)&index); + +out: + if (rc) { + CERROR("add failed (%d), deleting %s\n", rc, + obd_uuid2str(&tgt->ltd_uuid)); + lov_del_target(obd, index, NULL, 0); + } + obd_putref(obd); + return rc; +} + +/* Schedule a target for deletion */ +int lov_del_target(struct obd_device *obd, __u32 index, + struct obd_uuid *uuidp, int gen) +{ + struct lov_obd *lov = &obd->u.lov; + int count = lov->desc.ld_tgt_count; + int rc = 0; + + if (index >= count) { + CERROR("LOV target index %d >= number of LOV OBDs %d.\n", + index, count); + return -EINVAL; + } + + /* to make sure there's no ongoing lov_notify() now */ + down_write(&lov->lov_notify_lock); + obd_getref(obd); + + if (!lov->lov_tgts[index]) { + CERROR("LOV target at index %d is not setup.\n", index); + rc = -EINVAL; + goto out; + } + + if (uuidp && !obd_uuid_equals(uuidp, &lov->lov_tgts[index]->ltd_uuid)) { + CERROR("LOV target UUID %s at index %d doesn't match %s.\n", + lov_uuid2str(lov, index), index, + obd_uuid2str(uuidp)); + rc = -EINVAL; + goto out; + } + + CDEBUG(D_CONFIG, "uuid: %s idx: %d gen: %d exp: %p active: %d\n", + lov_uuid2str(lov, index), index, + lov->lov_tgts[index]->ltd_gen, lov->lov_tgts[index]->ltd_exp, + lov->lov_tgts[index]->ltd_active); + + lov->lov_tgts[index]->ltd_reap = 1; + lov->lov_death_row++; + /* we really delete it from obd_putref */ +out: + obd_putref(obd); + up_write(&lov->lov_notify_lock); + + return rc; +} + +static void __lov_del_obd(struct obd_device *obd, struct lov_tgt_desc *tgt) +{ + struct obd_device *osc_obd; + + LASSERT(tgt); + LASSERT(tgt->ltd_reap); + + osc_obd = class_exp2obd(tgt->ltd_exp); + + CDEBUG(D_CONFIG, "Removing tgt %s : %s\n", + tgt->ltd_uuid.uuid, + osc_obd ? osc_obd->obd_name : ""); + + if (tgt->ltd_exp) + lov_disconnect_obd(obd, tgt); + + OBD_FREE_PTR(tgt); + + /* Manual cleanup - no cleanup logs to clean up the osc's. We must + do it ourselves. And we can't do it from lov_cleanup, + because we just lost our only reference to it. 
*/ + if (osc_obd) + class_manual_cleanup(osc_obd); +} + +void lov_fix_desc_stripe_size(__u64 *val) +{ + if (*val < LOV_MIN_STRIPE_SIZE) { + if (*val != 0) + LCONSOLE_INFO("Increasing default stripe size to minimum %u\n", + LOV_DESC_STRIPE_SIZE_DEFAULT); + *val = LOV_DESC_STRIPE_SIZE_DEFAULT; + } else if (*val & (LOV_MIN_STRIPE_SIZE - 1)) { + *val &= ~(LOV_MIN_STRIPE_SIZE - 1); + LCONSOLE_WARN("Changing default stripe size to %llu (a multiple of %u)\n", + *val, LOV_MIN_STRIPE_SIZE); + } +} + +void lov_fix_desc_stripe_count(__u32 *val) +{ + if (*val == 0) + *val = 1; +} + +void lov_fix_desc_pattern(__u32 *val) +{ + /* from lov_setstripe */ + if ((*val != 0) && (*val != LOV_PATTERN_RAID0)) { + LCONSOLE_WARN("Unknown stripe pattern: %#x\n", *val); + *val = 0; + } +} + +void lov_fix_desc_qos_maxage(__u32 *val) +{ + if (*val == 0) + *val = LOV_DESC_QOS_MAXAGE_DEFAULT; +} + +void lov_fix_desc(struct lov_desc *desc) +{ + lov_fix_desc_stripe_size(&desc->ld_default_stripe_size); + lov_fix_desc_stripe_count(&desc->ld_default_stripe_count); + lov_fix_desc_pattern(&desc->ld_pattern); + lov_fix_desc_qos_maxage(&desc->ld_qos_maxage); +} + +int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct lprocfs_static_vars lvars = { NULL }; + struct lov_desc *desc; + struct lov_obd *lov = &obd->u.lov; + int rc; + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) { + CERROR("LOV setup requires a descriptor\n"); + return -EINVAL; + } + + desc = (struct lov_desc *)lustre_cfg_buf(lcfg, 1); + + if (sizeof(*desc) > LUSTRE_CFG_BUFLEN(lcfg, 1)) { + CERROR("descriptor size wrong: %d > %d\n", + (int)sizeof(*desc), LUSTRE_CFG_BUFLEN(lcfg, 1)); + return -EINVAL; + } + + if (desc->ld_magic != LOV_DESC_MAGIC) { + if (desc->ld_magic == __swab32(LOV_DESC_MAGIC)) { + CDEBUG(D_OTHER, "%s: Swabbing lov desc %p\n", + obd->obd_name, desc); + lustre_swab_lov_desc(desc); + } else { + CERROR("%s: Bad lov desc magic: %#x\n", + obd->obd_name, desc->ld_magic); + return -EINVAL; + } + } + + lov_fix_desc(desc); + + desc->ld_active_tgt_count = 0; + lov->desc = *desc; + lov->lov_tgt_size = 0; + + mutex_init(&lov->lov_lock); + atomic_set(&lov->lov_refcount, 0); + lov->lov_sp_me = LUSTRE_SP_CLI; + + init_rwsem(&lov->lov_notify_lock); + + lov->lov_pools_hash_body = cfs_hash_create("POOLS", HASH_POOLS_CUR_BITS, + HASH_POOLS_MAX_BITS, + HASH_POOLS_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &pool_hash_operations, + CFS_HASH_DEFAULT); + INIT_LIST_HEAD(&lov->lov_pool_list); + lov->lov_pool_count = 0; + rc = lov_ost_pool_init(&lov->lov_packed, 0); + if (rc) + goto out; + + lprocfs_lov_init_vars(&lvars); + lprocfs_obd_setup(obd, lvars.obd_vars); +#if defined (CONFIG_PROC_FS) + { + int rc1; + + rc1 = lprocfs_seq_create(obd->obd_proc_entry, "target_obd", + 0444, &lov_proc_target_fops, obd); + if (rc1) + CWARN("Error adding the target_obd file\n"); + } +#endif + lov->lov_pool_proc_entry = lprocfs_register("pools", + obd->obd_proc_entry, + NULL, NULL); + + return 0; + +out: + return rc; +} + +static int lov_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage) +{ + struct lov_obd *lov = &obd->u.lov; + + switch (stage) { + case OBD_CLEANUP_EARLY: { + int i; + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_active) + continue; + obd_precleanup(class_exp2obd(lov->lov_tgts[i]->ltd_exp), + OBD_CLEANUP_EARLY); + } + break; + } + default: + break; + } + + return 0; +} + +static int lov_cleanup(struct obd_device *obd) +{ + struct lov_obd *lov = &obd->u.lov; + struct list_head *pos, 
*tmp; + struct pool_desc *pool; + + list_for_each_safe(pos, tmp, &lov->lov_pool_list) { + pool = list_entry(pos, struct pool_desc, pool_list); + /* free pool structs */ + CDEBUG(D_INFO, "delete pool %p\n", pool); + /* In the function below, .hs_keycmp resolves to + * pool_hashkey_keycmp() */ + /* coverity[overrun-buffer-val] */ + lov_pool_del(obd, pool->pool_name); + } + cfs_hash_putref(lov->lov_pools_hash_body); + lov_ost_pool_free(&lov->lov_packed); + + lprocfs_obd_cleanup(obd); + if (lov->lov_tgts) { + int i; + obd_getref(obd); + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + if (!lov->lov_tgts[i]) + continue; + + /* Inactive targets may never have connected */ + if (lov->lov_tgts[i]->ltd_active || + atomic_read(&lov->lov_refcount)) + /* We should never get here - these + should have been removed in the + disconnect. */ + CERROR("lov tgt %d not cleaned! deathrow=%d, lovrc=%d\n", + i, lov->lov_death_row, + atomic_read(&lov->lov_refcount)); + lov_del_target(obd, i, NULL, 0); + } + obd_putref(obd); + OBD_FREE(lov->lov_tgts, sizeof(*lov->lov_tgts) * + lov->lov_tgt_size); + lov->lov_tgt_size = 0; + } + return 0; +} + +int lov_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg, + __u32 *indexp, int *genp) +{ + struct obd_uuid obd_uuid; + int cmd; + int rc = 0; + + switch (cmd = lcfg->lcfg_command) { + case LCFG_LOV_ADD_OBD: + case LCFG_LOV_ADD_INA: + case LCFG_LOV_DEL_OBD: { + __u32 index; + int gen; + /* lov_modify_tgts add 0:lov_mdsA 1:ost1_UUID 2:0 3:1 */ + if (LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(obd_uuid.uuid)) { + rc = -EINVAL; + goto out; + } + + obd_str2uuid(&obd_uuid, lustre_cfg_buf(lcfg, 1)); + + if (sscanf(lustre_cfg_buf(lcfg, 2), "%d", indexp) != 1) { + rc = -EINVAL; + goto out; + } + if (sscanf(lustre_cfg_buf(lcfg, 3), "%d", genp) != 1) { + rc = -EINVAL; + goto out; + } + index = *indexp; + gen = *genp; + if (cmd == LCFG_LOV_ADD_OBD) + rc = lov_add_target(obd, &obd_uuid, index, gen, 1); + else if (cmd == LCFG_LOV_ADD_INA) + rc = lov_add_target(obd, &obd_uuid, index, gen, 0); + else + rc = lov_del_target(obd, index, &obd_uuid, gen); + goto out; + } + case LCFG_PARAM: { + struct lprocfs_static_vars lvars = { NULL }; + struct lov_desc *desc = &(obd->u.lov.desc); + + if (!desc) { + rc = -EINVAL; + goto out; + } + + lprocfs_lov_init_vars(&lvars); + + rc = class_process_proc_param(PARAM_LOV, lvars.obd_vars, + lcfg, obd); + if (rc > 0) + rc = 0; + goto out; + } + case LCFG_POOL_NEW: + case LCFG_POOL_ADD: + case LCFG_POOL_DEL: + case LCFG_POOL_REM: + goto out; + + default: { + CERROR("Unknown command: %d\n", lcfg->lcfg_command); + rc = -EINVAL; + goto out; + + } + } +out: + return rc; +} + +static int lov_recreate(struct obd_export *exp, struct obdo *src_oa, + struct lov_stripe_md **ea, struct obd_trans_info *oti) +{ + struct lov_stripe_md *obj_mdp, *lsm; + struct lov_obd *lov = &exp->exp_obd->u.lov; + unsigned ost_idx; + int rc, i; + + LASSERT(src_oa->o_valid & OBD_MD_FLFLAGS && + src_oa->o_flags & OBD_FL_RECREATE_OBJS); + + OBD_ALLOC(obj_mdp, sizeof(*obj_mdp)); + if (obj_mdp == NULL) + return -ENOMEM; + + ost_idx = src_oa->o_nlink; + lsm = *ea; + if (lsm == NULL) { + rc = -EINVAL; + goto out; + } + if (ost_idx >= lov->desc.ld_tgt_count || + !lov->lov_tgts[ost_idx]) { + rc = -EINVAL; + goto out; + } + + for (i = 0; i < lsm->lsm_stripe_count; i++) { + struct lov_oinfo *loi = lsm->lsm_oinfo[i]; + + if (lov_oinfo_is_dummy(loi)) + continue; + + if (loi->loi_ost_idx == ost_idx) { + if (ostid_id(&loi->loi_oi) != ostid_id(&src_oa->o_oi)) { + rc = -EINVAL; + goto out; + } + 
break; + } + } + if (i == lsm->lsm_stripe_count) { + rc = -EINVAL; + goto out; + } + + rc = obd_create(NULL, lov->lov_tgts[ost_idx]->ltd_exp, + src_oa, &obj_mdp, oti); +out: + OBD_FREE(obj_mdp, sizeof(*obj_mdp)); + return rc; +} + +/* the LOV expects oa->o_id to be set to the LOV object id */ +static int lov_create(const struct lu_env *env, struct obd_export *exp, + struct obdo *src_oa, struct lov_stripe_md **ea, + struct obd_trans_info *oti) +{ + struct lov_obd *lov; + int rc = 0; + + LASSERT(ea != NULL); + if (exp == NULL) + return -EINVAL; + + if ((src_oa->o_valid & OBD_MD_FLFLAGS) && + src_oa->o_flags == OBD_FL_DELORPHAN) { + /* should be used with LOV anymore */ + LBUG(); + } + + lov = &exp->exp_obd->u.lov; + if (!lov->desc.ld_active_tgt_count) + return -EIO; + + obd_getref(exp->exp_obd); + /* Recreate a specific object id at the given OST index */ + if ((src_oa->o_valid & OBD_MD_FLFLAGS) && + (src_oa->o_flags & OBD_FL_RECREATE_OBJS)) { + rc = lov_recreate(exp, src_oa, ea, oti); + } + + obd_putref(exp->exp_obd); + return rc; +} + +#define ASSERT_LSM_MAGIC(lsmp) \ +do { \ + LASSERT((lsmp) != NULL); \ + LASSERTF(((lsmp)->lsm_magic == LOV_MAGIC_V1 || \ + (lsmp)->lsm_magic == LOV_MAGIC_V3), \ + "%p->lsm_magic=%x\n", (lsmp), (lsmp)->lsm_magic); \ +} while (0) + +static int lov_destroy(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa, struct lov_stripe_md *lsm, + struct obd_trans_info *oti, struct obd_export *md_exp, + void *capa) +{ + struct lov_request_set *set; + struct obd_info oinfo; + struct lov_request *req; + struct list_head *pos; + struct lov_obd *lov; + int rc = 0, err = 0; + + ASSERT_LSM_MAGIC(lsm); + + if (!exp || !exp->exp_obd) + return -ENODEV; + + if (oa->o_valid & OBD_MD_FLCOOKIE) { + LASSERT(oti); + LASSERT(oti->oti_logcookies); + } + + lov = &exp->exp_obd->u.lov; + obd_getref(exp->exp_obd); + rc = lov_prep_destroy_set(exp, &oinfo, oa, lsm, oti, &set); + if (rc) + goto out; + + list_for_each(pos, &set->set_list) { + req = list_entry(pos, struct lov_request, rq_link); + + if (oa->o_valid & OBD_MD_FLCOOKIE) + oti->oti_logcookies = set->set_cookies + req->rq_stripe; + + err = obd_destroy(env, lov->lov_tgts[req->rq_idx]->ltd_exp, + req->rq_oi.oi_oa, NULL, oti, NULL, capa); + err = lov_update_common_set(set, req, err); + if (err) { + CERROR("%s: destroying objid "DOSTID" subobj " + DOSTID" on OST idx %d: rc = %d\n", + exp->exp_obd->obd_name, POSTID(&oa->o_oi), + POSTID(&req->rq_oi.oi_oa->o_oi), + req->rq_idx, err); + if (!rc) + rc = err; + } + } + + if (rc == 0) { + LASSERT(lsm_op_find(lsm->lsm_magic) != NULL); + rc = lsm_op_find(lsm->lsm_magic)->lsm_destroy(lsm, oa, md_exp); + } + err = lov_fini_destroy_set(set); +out: + obd_putref(exp->exp_obd); + return rc ? rc : err; +} + +static int lov_getattr_interpret(struct ptlrpc_request_set *rqset, + void *data, int rc) +{ + struct lov_request_set *lovset = (struct lov_request_set *)data; + int err; + + /* don't do attribute merge if this async op failed */ + if (rc) + atomic_set(&lovset->set_completes, 0); + err = lov_fini_getattr_set(lovset); + return rc ? 
rc : err; +} + +static int lov_getattr_async(struct obd_export *exp, struct obd_info *oinfo, + struct ptlrpc_request_set *rqset) +{ + struct lov_request_set *lovset; + struct lov_obd *lov; + struct list_head *pos; + struct lov_request *req; + int rc = 0, err; + + LASSERT(oinfo); + ASSERT_LSM_MAGIC(oinfo->oi_md); + + if (!exp || !exp->exp_obd) + return -ENODEV; + + lov = &exp->exp_obd->u.lov; + + rc = lov_prep_getattr_set(exp, oinfo, &lovset); + if (rc) + return rc; + + CDEBUG(D_INFO, "objid "DOSTID": %ux%u byte stripes\n", + POSTID(&oinfo->oi_md->lsm_oi), oinfo->oi_md->lsm_stripe_count, + oinfo->oi_md->lsm_stripe_size); + + list_for_each(pos, &lovset->set_list) { + req = list_entry(pos, struct lov_request, rq_link); + + CDEBUG(D_INFO, "objid " DOSTID "[%d] has subobj " DOSTID " at idx%u\n", + POSTID(&oinfo->oi_oa->o_oi), req->rq_stripe, + POSTID(&req->rq_oi.oi_oa->o_oi), req->rq_idx); + rc = obd_getattr_async(lov->lov_tgts[req->rq_idx]->ltd_exp, + &req->rq_oi, rqset); + if (rc) { + CERROR("%s: getattr objid "DOSTID" subobj" + DOSTID" on OST idx %d: rc = %d\n", + exp->exp_obd->obd_name, + POSTID(&oinfo->oi_oa->o_oi), + POSTID(&req->rq_oi.oi_oa->o_oi), + req->rq_idx, rc); + goto out; + } + } + + if (!list_empty(&rqset->set_requests)) { + LASSERT(rc == 0); + LASSERT(rqset->set_interpret == NULL); + rqset->set_interpret = lov_getattr_interpret; + rqset->set_arg = (void *)lovset; + return rc; + } +out: + if (rc) + atomic_set(&lovset->set_completes, 0); + err = lov_fini_getattr_set(lovset); + return rc ? rc : err; +} + +static int lov_setattr_interpret(struct ptlrpc_request_set *rqset, + void *data, int rc) +{ + struct lov_request_set *lovset = (struct lov_request_set *)data; + int err; + + if (rc) + atomic_set(&lovset->set_completes, 0); + err = lov_fini_setattr_set(lovset); + return rc ? rc : err; +} + +/* If @oti is given, the request goes from MDS and responses from OSTs are not + needed. Otherwise, a client is waiting for responses. */ +static int lov_setattr_async(struct obd_export *exp, struct obd_info *oinfo, + struct obd_trans_info *oti, + struct ptlrpc_request_set *rqset) +{ + struct lov_request_set *set; + struct lov_request *req; + struct list_head *pos; + struct lov_obd *lov; + int rc = 0; + + LASSERT(oinfo); + ASSERT_LSM_MAGIC(oinfo->oi_md); + if (oinfo->oi_oa->o_valid & OBD_MD_FLCOOKIE) { + LASSERT(oti); + LASSERT(oti->oti_logcookies); + } + + if (!exp || !exp->exp_obd) + return -ENODEV; + + lov = &exp->exp_obd->u.lov; + rc = lov_prep_setattr_set(exp, oinfo, oti, &set); + if (rc) + return rc; + + CDEBUG(D_INFO, "objid "DOSTID": %ux%u byte stripes\n", + POSTID(&oinfo->oi_md->lsm_oi), + oinfo->oi_md->lsm_stripe_count, + oinfo->oi_md->lsm_stripe_size); + + list_for_each(pos, &set->set_list) { + req = list_entry(pos, struct lov_request, rq_link); + + if (oinfo->oi_oa->o_valid & OBD_MD_FLCOOKIE) + oti->oti_logcookies = set->set_cookies + req->rq_stripe; + + CDEBUG(D_INFO, "objid " DOSTID "[%d] has subobj " DOSTID " at idx%u\n", + POSTID(&oinfo->oi_oa->o_oi), req->rq_stripe, + POSTID(&req->rq_oi.oi_oa->o_oi), req->rq_idx); + + rc = obd_setattr_async(lov->lov_tgts[req->rq_idx]->ltd_exp, + &req->rq_oi, oti, rqset); + if (rc) { + CERROR("error: setattr objid "DOSTID" subobj" + DOSTID" on OST idx %d: rc = %d\n", + POSTID(&set->set_oi->oi_oa->o_oi), + POSTID(&req->rq_oi.oi_oa->o_oi), + req->rq_idx, rc); + break; + } + } + + /* If we are not waiting for responses on async requests, return. 
*/ + if (rc || !rqset || list_empty(&rqset->set_requests)) { + int err; + if (rc) + atomic_set(&set->set_completes, 0); + err = lov_fini_setattr_set(set); + return rc ? rc : err; + } + + LASSERT(rqset->set_interpret == NULL); + rqset->set_interpret = lov_setattr_interpret; + rqset->set_arg = (void *)set; + + return 0; +} + +/* find any ldlm lock of the inode in lov + * return 0 not find + * 1 find one + * < 0 error */ +static int lov_find_cbdata(struct obd_export *exp, + struct lov_stripe_md *lsm, ldlm_iterator_t it, + void *data) +{ + struct lov_obd *lov; + int rc = 0, i; + + ASSERT_LSM_MAGIC(lsm); + + if (!exp || !exp->exp_obd) + return -ENODEV; + + lov = &exp->exp_obd->u.lov; + for (i = 0; i < lsm->lsm_stripe_count; i++) { + struct lov_stripe_md submd; + struct lov_oinfo *loi = lsm->lsm_oinfo[i]; + + if (lov_oinfo_is_dummy(loi)) + continue; + + if (!lov->lov_tgts[loi->loi_ost_idx]) { + CDEBUG(D_HA, "lov idx %d NULL\n", loi->loi_ost_idx); + continue; + } + + submd.lsm_oi = loi->loi_oi; + submd.lsm_stripe_count = 0; + rc = obd_find_cbdata(lov->lov_tgts[loi->loi_ost_idx]->ltd_exp, + &submd, it, data); + if (rc != 0) + return rc; + } + return rc; +} + +int lov_statfs_interpret(struct ptlrpc_request_set *rqset, void *data, int rc) +{ + struct lov_request_set *lovset = (struct lov_request_set *)data; + int err; + + if (rc) + atomic_set(&lovset->set_completes, 0); + + err = lov_fini_statfs_set(lovset); + return rc ? rc : err; +} + +static int lov_statfs_async(struct obd_export *exp, struct obd_info *oinfo, + __u64 max_age, struct ptlrpc_request_set *rqset) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lov_request_set *set; + struct lov_request *req; + struct list_head *pos; + struct lov_obd *lov; + int rc = 0; + + LASSERT(oinfo != NULL); + LASSERT(oinfo->oi_osfs != NULL); + + lov = &obd->u.lov; + rc = lov_prep_statfs_set(obd, oinfo, &set); + if (rc) + return rc; + + list_for_each(pos, &set->set_list) { + req = list_entry(pos, struct lov_request, rq_link); + rc = obd_statfs_async(lov->lov_tgts[req->rq_idx]->ltd_exp, + &req->rq_oi, max_age, rqset); + if (rc) + break; + } + + if (rc || list_empty(&rqset->set_requests)) { + int err; + if (rc) + atomic_set(&set->set_completes, 0); + err = lov_fini_statfs_set(set); + return rc ? 
rc : err; + } + + LASSERT(rqset->set_interpret == NULL); + rqset->set_interpret = lov_statfs_interpret; + rqset->set_arg = (void *)set; + return 0; +} + +static int lov_statfs(const struct lu_env *env, struct obd_export *exp, + struct obd_statfs *osfs, __u64 max_age, __u32 flags) +{ + struct ptlrpc_request_set *set = NULL; + struct obd_info oinfo = { { { 0 } } }; + int rc = 0; + + /* for obdclass we forbid using obd_statfs_rqset, but prefer using async + * statfs requests */ + set = ptlrpc_prep_set(); + if (set == NULL) + return -ENOMEM; + + oinfo.oi_osfs = osfs; + oinfo.oi_flags = flags; + rc = lov_statfs_async(exp, &oinfo, max_age, set); + if (rc == 0) + rc = ptlrpc_set_wait(set); + ptlrpc_set_destroy(set); + + return rc; +} + +static int lov_iocontrol(unsigned int cmd, struct obd_export *exp, int len, + void *karg, void *uarg) +{ + struct obd_device *obddev = class_exp2obd(exp); + struct lov_obd *lov = &obddev->u.lov; + int i = 0, rc = 0, count = lov->desc.ld_tgt_count; + struct obd_uuid *uuidp; + + switch (cmd) { + case IOC_OBD_STATFS: { + struct obd_ioctl_data *data = karg; + struct obd_device *osc_obd; + struct obd_statfs stat_buf = {0}; + __u32 index; + __u32 flags; + + memcpy(&index, data->ioc_inlbuf2, sizeof(__u32)); + if ((index >= count)) + return -ENODEV; + + if (!lov->lov_tgts[index]) + /* Try again with the next index */ + return -EAGAIN; + if (!lov->lov_tgts[index]->ltd_active) + return -ENODATA; + + osc_obd = class_exp2obd(lov->lov_tgts[index]->ltd_exp); + if (!osc_obd) + return -EINVAL; + + /* copy UUID */ + if (copy_to_user(data->ioc_pbuf2, obd2cli_tgt(osc_obd), + min((int) data->ioc_plen2, + (int) sizeof(struct obd_uuid)))) + return -EFAULT; + + flags = uarg ? *(__u32 *)uarg : 0; + /* got statfs data */ + rc = obd_statfs(NULL, lov->lov_tgts[index]->ltd_exp, &stat_buf, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + flags); + if (rc) + return rc; + if (copy_to_user(data->ioc_pbuf1, &stat_buf, + min((int) data->ioc_plen1, + (int) sizeof(stat_buf)))) + return -EFAULT; + break; + } + case OBD_IOC_LOV_GET_CONFIG: { + struct obd_ioctl_data *data; + struct lov_desc *desc; + char *buf = NULL; + __u32 *genp; + + len = 0; + if (obd_ioctl_getdata(&buf, &len, (void *)uarg)) + return -EINVAL; + + data = (struct obd_ioctl_data *)buf; + + if (sizeof(*desc) > data->ioc_inllen1) { + obd_ioctl_freedata(buf, len); + return -EINVAL; + } + + if (sizeof(uuidp->uuid) * count > data->ioc_inllen2) { + obd_ioctl_freedata(buf, len); + return -EINVAL; + } + + if (sizeof(__u32) * count > data->ioc_inllen3) { + obd_ioctl_freedata(buf, len); + return -EINVAL; + } + + desc = (struct lov_desc *)data->ioc_inlbuf1; + memcpy(desc, &(lov->desc), sizeof(*desc)); + + uuidp = (struct obd_uuid *)data->ioc_inlbuf2; + genp = (__u32 *)data->ioc_inlbuf3; + /* the uuid will be empty for deleted OSTs */ + for (i = 0; i < count; i++, uuidp++, genp++) { + if (!lov->lov_tgts[i]) + continue; + *uuidp = lov->lov_tgts[i]->ltd_uuid; + *genp = lov->lov_tgts[i]->ltd_gen; + } + + if (copy_to_user((void *)uarg, buf, len)) + rc = -EFAULT; + obd_ioctl_freedata(buf, len); + break; + } + case LL_IOC_LOV_GETSTRIPE: + rc = lov_getstripe(exp, karg, uarg); + break; + case OBD_IOC_QUOTACTL: { + struct if_quotactl *qctl = karg; + struct lov_tgt_desc *tgt = NULL; + struct obd_quotactl *oqctl; + + if (qctl->qc_valid == QC_OSTIDX) { + if (qctl->qc_idx < 0 || count <= qctl->qc_idx) + return -EINVAL; + + tgt = lov->lov_tgts[qctl->qc_idx]; + if (!tgt || !tgt->ltd_exp) + return -EINVAL; + } else if (qctl->qc_valid == QC_UUID) { + for (i = 0; 
i < count; i++) { + tgt = lov->lov_tgts[i]; + if (!tgt || + !obd_uuid_equals(&tgt->ltd_uuid, + &qctl->obd_uuid)) + continue; + + if (tgt->ltd_exp == NULL) + return -EINVAL; + + break; + } + } else { + return -EINVAL; + } + + if (i >= count) + return -EAGAIN; + + LASSERT(tgt && tgt->ltd_exp); + OBD_ALLOC_PTR(oqctl); + if (!oqctl) + return -ENOMEM; + + QCTL_COPY(oqctl, qctl); + rc = obd_quotactl(tgt->ltd_exp, oqctl); + if (rc == 0) { + QCTL_COPY(qctl, oqctl); + qctl->qc_valid = QC_OSTIDX; + qctl->obd_uuid = tgt->ltd_uuid; + } + OBD_FREE_PTR(oqctl); + break; + } + default: { + int set = 0; + + if (count == 0) + return -ENOTTY; + + for (i = 0; i < count; i++) { + int err; + struct obd_device *osc_obd; + + /* OST was disconnected */ + if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_exp) + continue; + + /* ll_umount_begin() sets force flag but for lov, not + * osc. Let's pass it through */ + osc_obd = class_exp2obd(lov->lov_tgts[i]->ltd_exp); + osc_obd->obd_force = obddev->obd_force; + err = obd_iocontrol(cmd, lov->lov_tgts[i]->ltd_exp, + len, karg, uarg); + if (err == -ENODATA && cmd == OBD_IOC_POLL_QUOTACHECK) { + return err; + } else if (err) { + if (lov->lov_tgts[i]->ltd_active) { + CDEBUG(err == -ENOTTY ? + D_IOCTL : D_WARNING, + "iocontrol OSC %s on OST idx %d cmd %x: err = %d\n", + lov_uuid2str(lov, i), + i, cmd, err); + if (!rc) + rc = err; + } + } else { + set = 1; + } + } + if (!set && !rc) + rc = -EIO; + } + } + + return rc; +} + +#define FIEMAP_BUFFER_SIZE 4096 + +/** + * Non-zero fe_logical indicates that this is a continuation FIEMAP + * call. The local end offset and the device are sent in the first + * fm_extent. This function calculates the stripe number from the index. + * This function returns a stripe_no on which mapping is to be restarted. + * + * This function returns fm_end_offset which is the in-OST offset at which + * mapping should be restarted. If fm_end_offset=0 is returned then caller + * will re-calculate proper offset in next stripe. + * Note that the first extent is passed to lov_get_info via the value field. + * + * \param fiemap fiemap request header + * \param lsm striping information for the file + * \param fm_start logical start of mapping + * \param fm_end logical end of mapping + * \param start_stripe starting stripe will be returned in this + */ +static u64 fiemap_calc_fm_end_offset(struct ll_user_fiemap *fiemap, + struct lov_stripe_md *lsm, u64 fm_start, + u64 fm_end, int *start_stripe) +{ + u64 local_end = fiemap->fm_extents[0].fe_logical; + u64 lun_start, lun_end; + u64 fm_end_offset; + int stripe_no = -1, i; + + if (fiemap->fm_extent_count == 0 || + fiemap->fm_extents[0].fe_logical == 0) + return 0; + + /* Find out stripe_no from ost_index saved in the fe_device */ + for (i = 0; i < lsm->lsm_stripe_count; i++) { + struct lov_oinfo *oinfo = lsm->lsm_oinfo[i]; + + if (lov_oinfo_is_dummy(oinfo)) + continue; + + if (oinfo->loi_ost_idx == fiemap->fm_extents[0].fe_device) { + stripe_no = i; + break; + } + } + if (stripe_no == -1) + return -EINVAL; + + /* If we have finished mapping on previous device, shift logical + * offset to start of next device */ + if ((lov_stripe_intersects(lsm, stripe_no, fm_start, fm_end, + &lun_start, &lun_end)) != 0 && + local_end < lun_end) { + fm_end_offset = local_end; + *start_stripe = stripe_no; + } else { + /* This is a special value to indicate that caller should + * calculate offset in next stripe. 
*/ + fm_end_offset = 0; + *start_stripe = (stripe_no + 1) % lsm->lsm_stripe_count; + } + + return fm_end_offset; +} + +/** + * We calculate on which OST the mapping will end. If the length of mapping + * is greater than (stripe_size * stripe_count) then the last_stripe will + * will be one just before start_stripe. Else we check if the mapping + * intersects each OST and find last_stripe. + * This function returns the last_stripe and also sets the stripe_count + * over which the mapping is spread + * + * \param lsm striping information for the file + * \param fm_start logical start of mapping + * \param fm_end logical end of mapping + * \param start_stripe starting stripe of the mapping + * \param stripe_count the number of stripes across which to map is returned + * + * \retval last_stripe return the last stripe of the mapping + */ +static int fiemap_calc_last_stripe(struct lov_stripe_md *lsm, u64 fm_start, + u64 fm_end, int start_stripe, + int *stripe_count) +{ + int last_stripe; + u64 obd_start, obd_end; + int i, j; + + if (fm_end - fm_start > lsm->lsm_stripe_size * lsm->lsm_stripe_count) { + last_stripe = start_stripe < 1 ? lsm->lsm_stripe_count - 1 : + start_stripe - 1; + *stripe_count = lsm->lsm_stripe_count; + } else { + for (j = 0, i = start_stripe; j < lsm->lsm_stripe_count; + i = (i + 1) % lsm->lsm_stripe_count, j++) { + if ((lov_stripe_intersects(lsm, i, fm_start, fm_end, + &obd_start, &obd_end)) == 0) + break; + } + *stripe_count = j; + last_stripe = (start_stripe + j - 1) %lsm->lsm_stripe_count; + } + + return last_stripe; +} + +/** + * Set fe_device and copy extents from local buffer into main return buffer. + * + * \param fiemap fiemap request header + * \param lcl_fm_ext array of local fiemap extents to be copied + * \param ost_index OST index to be written into the fm_device field for each + extent + * \param ext_count number of extents to be copied + * \param current_extent where to start copying in main extent array + */ +static void fiemap_prepare_and_copy_exts(struct ll_user_fiemap *fiemap, + struct ll_fiemap_extent *lcl_fm_ext, + int ost_index, unsigned int ext_count, + int current_extent) +{ + char *to; + int ext; + + for (ext = 0; ext < ext_count; ext++) { + lcl_fm_ext[ext].fe_device = ost_index; + lcl_fm_ext[ext].fe_flags |= FIEMAP_EXTENT_NET; + } + + /* Copy fm_extent's from fm_local to return buffer */ + to = (char *)fiemap + fiemap_count_to_size(current_extent); + memcpy(to, lcl_fm_ext, ext_count * sizeof(struct ll_fiemap_extent)); +} + +/** + * Break down the FIEMAP request and send appropriate calls to individual OSTs. + * This also handles the restarting of FIEMAP calls in case mapping overflows + * the available number of extents in single call. + */ +static int lov_fiemap(struct lov_obd *lov, __u32 keylen, void *key, + __u32 *vallen, void *val, struct lov_stripe_md *lsm) +{ + struct ll_fiemap_info_key *fm_key = key; + struct ll_user_fiemap *fiemap = val; + struct ll_user_fiemap *fm_local = NULL; + struct ll_fiemap_extent *lcl_fm_ext; + int count_local; + unsigned int get_num_extents = 0; + int ost_index = 0, actual_start_stripe, start_stripe; + u64 fm_start, fm_end, fm_length, fm_end_offset; + u64 curr_loc; + int current_extent = 0, rc = 0, i; + int ost_eof = 0; /* EOF for object */ + int ost_done = 0; /* done with required mapping for this OST? 
*/ + int last_stripe; + int cur_stripe = 0, cur_stripe_wrap = 0, stripe_count; + unsigned int buffer_size = FIEMAP_BUFFER_SIZE; + + if (!lsm_has_objects(lsm)) { + rc = 0; + goto out; + } + + if (fiemap_count_to_size(fm_key->fiemap.fm_extent_count) < buffer_size) + buffer_size = fiemap_count_to_size(fm_key->fiemap.fm_extent_count); + + OBD_ALLOC_LARGE(fm_local, buffer_size); + if (fm_local == NULL) { + rc = -ENOMEM; + goto out; + } + lcl_fm_ext = &fm_local->fm_extents[0]; + + count_local = fiemap_size_to_count(buffer_size); + + memcpy(fiemap, &fm_key->fiemap, sizeof(*fiemap)); + fm_start = fiemap->fm_start; + fm_length = fiemap->fm_length; + /* Calculate start stripe, last stripe and length of mapping */ + actual_start_stripe = start_stripe = lov_stripe_number(lsm, fm_start); + fm_end = (fm_length == ~0ULL ? fm_key->oa.o_size : + fm_start + fm_length - 1); + /* If fm_length != ~0ULL but fm_start+fm_length-1 exceeds file size */ + if (fm_end > fm_key->oa.o_size) + fm_end = fm_key->oa.o_size; + + last_stripe = fiemap_calc_last_stripe(lsm, fm_start, fm_end, + actual_start_stripe, &stripe_count); + + fm_end_offset = fiemap_calc_fm_end_offset(fiemap, lsm, fm_start, + fm_end, &start_stripe); + if (fm_end_offset == -EINVAL) { + rc = -EINVAL; + goto out; + } + + if (fiemap_count_to_size(fiemap->fm_extent_count) > *vallen) + fiemap->fm_extent_count = fiemap_size_to_count(*vallen); + if (fiemap->fm_extent_count == 0) { + get_num_extents = 1; + count_local = 0; + } + /* Check each stripe */ + for (cur_stripe = start_stripe, i = 0; i < stripe_count; + i++, cur_stripe = (cur_stripe + 1) % lsm->lsm_stripe_count) { + u64 req_fm_len; /* Stores length of required mapping */ + u64 len_mapped_single_call; + u64 lun_start, lun_end, obd_object_end; + unsigned int ext_count; + + cur_stripe_wrap = cur_stripe; + + /* Find out range of mapping on this stripe */ + if ((lov_stripe_intersects(lsm, cur_stripe, fm_start, fm_end, + &lun_start, &obd_object_end)) == 0) + continue; + + if (lov_oinfo_is_dummy(lsm->lsm_oinfo[cur_stripe])) { + rc = -EIO; + goto out; + } + + /* If this is a continuation FIEMAP call and we are on + * starting stripe then lun_start needs to be set to + * fm_end_offset */ + if (fm_end_offset != 0 && cur_stripe == start_stripe) + lun_start = fm_end_offset; + + if (fm_length != ~0ULL) { + /* Handle fm_start + fm_length overflow */ + if (fm_start + fm_length < fm_start) + fm_length = ~0ULL - fm_start; + lun_end = lov_size_to_stripe(lsm, fm_start + fm_length, + cur_stripe); + } else { + lun_end = ~0ULL; + } + + if (lun_start == lun_end) + continue; + + req_fm_len = obd_object_end - lun_start; + fm_local->fm_length = 0; + len_mapped_single_call = 0; + + /* If the output buffer is very large and the objects have many + * extents we may need to loop on a single OST repeatedly */ + ost_eof = 0; + ost_done = 0; + do { + if (get_num_extents == 0) { + /* Don't get too many extents. 
*/ + if (current_extent + count_local > + fiemap->fm_extent_count) + count_local = fiemap->fm_extent_count - + current_extent; + } + + lun_start += len_mapped_single_call; + fm_local->fm_length = req_fm_len - len_mapped_single_call; + req_fm_len = fm_local->fm_length; + fm_local->fm_extent_count = count_local; + fm_local->fm_mapped_extents = 0; + fm_local->fm_flags = fiemap->fm_flags; + + fm_key->oa.o_oi = lsm->lsm_oinfo[cur_stripe]->loi_oi; + ost_index = lsm->lsm_oinfo[cur_stripe]->loi_ost_idx; + + if (ost_index < 0 || + ost_index >= lov->desc.ld_tgt_count) { + rc = -EINVAL; + goto out; + } + + /* If OST is inactive, return extent with UNKNOWN flag */ + if (!lov->lov_tgts[ost_index]->ltd_active) { + fm_local->fm_flags |= FIEMAP_EXTENT_LAST; + fm_local->fm_mapped_extents = 1; + + lcl_fm_ext[0].fe_logical = lun_start; + lcl_fm_ext[0].fe_length = obd_object_end - + lun_start; + lcl_fm_ext[0].fe_flags |= FIEMAP_EXTENT_UNKNOWN; + + goto inactive_tgt; + } + + fm_local->fm_start = lun_start; + fm_local->fm_flags &= ~FIEMAP_FLAG_DEVICE_ORDER; + memcpy(&fm_key->fiemap, fm_local, sizeof(*fm_local)); + *vallen=fiemap_count_to_size(fm_local->fm_extent_count); + rc = obd_get_info(NULL, + lov->lov_tgts[ost_index]->ltd_exp, + keylen, key, vallen, fm_local, lsm); + if (rc != 0) + goto out; + +inactive_tgt: + ext_count = fm_local->fm_mapped_extents; + if (ext_count == 0) { + ost_done = 1; + /* If last stripe has hole at the end, + * then we need to return */ + if (cur_stripe_wrap == last_stripe) { + fiemap->fm_mapped_extents = 0; + goto finish; + } + break; + } + + /* If we just need num of extents then go to next device */ + if (get_num_extents) { + current_extent += ext_count; + break; + } + + len_mapped_single_call = lcl_fm_ext[ext_count-1].fe_logical - + lun_start + lcl_fm_ext[ext_count - 1].fe_length; + + /* Have we finished mapping on this device? */ + if (req_fm_len <= len_mapped_single_call) + ost_done = 1; + + /* Clear the EXTENT_LAST flag which can be present on + * last extent */ + if (lcl_fm_ext[ext_count-1].fe_flags & FIEMAP_EXTENT_LAST) + lcl_fm_ext[ext_count - 1].fe_flags &= + ~FIEMAP_EXTENT_LAST; + + curr_loc = lov_stripe_size(lsm, + lcl_fm_ext[ext_count - 1].fe_logical+ + lcl_fm_ext[ext_count - 1].fe_length, + cur_stripe); + if (curr_loc >= fm_key->oa.o_size) + ost_eof = 1; + + fiemap_prepare_and_copy_exts(fiemap, lcl_fm_ext, + ost_index, ext_count, + current_extent); + + current_extent += ext_count; + + /* Ran out of available extents? */ + if (current_extent >= fiemap->fm_extent_count) + goto finish; + } while (ost_done == 0 && ost_eof == 0); + + if (cur_stripe_wrap == last_stripe) + goto finish; + } + +finish: + /* Indicate that we are returning device offsets unless file just has + * single stripe */ + if (lsm->lsm_stripe_count > 1) + fiemap->fm_flags |= FIEMAP_FLAG_DEVICE_ORDER; + + if (get_num_extents) + goto skip_last_device_calc; + + /* Check if we have reached the last stripe and whether mapping for that + * stripe is done. 
*/ + if (cur_stripe_wrap == last_stripe) { + if (ost_done || ost_eof) + fiemap->fm_extents[current_extent - 1].fe_flags |= + FIEMAP_EXTENT_LAST; + } + +skip_last_device_calc: + fiemap->fm_mapped_extents = current_extent; + +out: + OBD_FREE_LARGE(fm_local, buffer_size); + return rc; +} + +static int lov_get_info(const struct lu_env *env, struct obd_export *exp, + __u32 keylen, void *key, __u32 *vallen, void *val, + struct lov_stripe_md *lsm) +{ + struct obd_device *obddev = class_exp2obd(exp); + struct lov_obd *lov = &obddev->u.lov; + int i, rc; + + if (!vallen || !val) + return -EFAULT; + + obd_getref(obddev); + + if (KEY_IS(KEY_LOCK_TO_STRIPE)) { + struct { + char name[16]; + struct ldlm_lock *lock; + } *data = key; + struct ldlm_res_id *res_id = &data->lock->l_resource->lr_name; + struct lov_oinfo *loi; + __u32 *stripe = val; + + if (*vallen < sizeof(*stripe)) { + rc = -EFAULT; + goto out; + } + *vallen = sizeof(*stripe); + + /* XXX This is another one of those bits that will need to + * change if we ever actually support nested LOVs. It uses + * the lock's export to find out which stripe it is. */ + /* XXX - it's assumed all the locks for deleted OSTs have + * been cancelled. Also, the export for deleted OSTs will + * be NULL and won't match the lock's export. */ + for (i = 0; i < lsm->lsm_stripe_count; i++) { + loi = lsm->lsm_oinfo[i]; + if (lov_oinfo_is_dummy(loi)) + continue; + + if (!lov->lov_tgts[loi->loi_ost_idx]) + continue; + if (lov->lov_tgts[loi->loi_ost_idx]->ltd_exp == + data->lock->l_conn_export && + ostid_res_name_eq(&loi->loi_oi, res_id)) { + *stripe = i; + rc = 0; + goto out; + } + } + LDLM_ERROR(data->lock, "lock on inode without such object"); + dump_lsm(D_ERROR, lsm); + rc = -ENXIO; + goto out; + } else if (KEY_IS(KEY_LAST_ID)) { + struct obd_id_info *info = val; + __u32 size = sizeof(u64); + struct lov_tgt_desc *tgt; + + LASSERT(*vallen == sizeof(struct obd_id_info)); + tgt = lov->lov_tgts[info->idx]; + + if (!tgt || !tgt->ltd_active) { + rc = -ESRCH; + goto out; + } + + rc = obd_get_info(env, tgt->ltd_exp, keylen, key, + &size, info->data, NULL); + rc = 0; + goto out; + } else if (KEY_IS(KEY_LOVDESC)) { + struct lov_desc *desc_ret = val; + *desc_ret = lov->desc; + + rc = 0; + goto out; + } else if (KEY_IS(KEY_FIEMAP)) { + rc = lov_fiemap(lov, keylen, key, vallen, val, lsm); + goto out; + } else if (KEY_IS(KEY_CONNECT_FLAG)) { + struct lov_tgt_desc *tgt; + __u64 ost_idx = *((__u64 *)val); + + LASSERT(*vallen == sizeof(__u64)); + LASSERT(ost_idx < lov->desc.ld_tgt_count); + tgt = lov->lov_tgts[ost_idx]; + + if (!tgt || !tgt->ltd_exp) { + rc = -ESRCH; + goto out; + } + + *((__u64 *)val) = exp_connect_flags(tgt->ltd_exp); + rc = 0; + goto out; + } else if (KEY_IS(KEY_TGT_COUNT)) { + *((int *)val) = lov->desc.ld_tgt_count; + rc = 0; + goto out; + } + + rc = -EINVAL; + +out: + obd_putref(obddev); + return rc; +} + +static int lov_set_info_async(const struct lu_env *env, struct obd_export *exp, + u32 keylen, void *key, u32 vallen, + void *val, struct ptlrpc_request_set *set) +{ + struct obd_device *obddev = class_exp2obd(exp); + struct lov_obd *lov = &obddev->u.lov; + u32 count; + int i, rc = 0, err; + struct lov_tgt_desc *tgt; + unsigned incr, check_uuid, + do_inactive, no_set; + unsigned next_id = 0, mds_con = 0, capa = 0; + + incr = check_uuid = do_inactive = no_set = 0; + if (set == NULL) { + no_set = 1; + set = ptlrpc_prep_set(); + if (!set) + return -ENOMEM; + } + + obd_getref(obddev); + count = lov->desc.ld_tgt_count; + + if (KEY_IS(KEY_NEXT_ID)) { + count = vallen 
/ sizeof(struct obd_id_info); + vallen = sizeof(u64); + incr = sizeof(struct obd_id_info); + do_inactive = 1; + next_id = 1; + } else if (KEY_IS(KEY_CHECKSUM)) { + do_inactive = 1; + } else if (KEY_IS(KEY_EVICT_BY_NID)) { + /* use defaults: do_inactive = incr = 0; */ + } else if (KEY_IS(KEY_MDS_CONN)) { + mds_con = 1; + } else if (KEY_IS(KEY_CAPA_KEY)) { + capa = 1; + } else if (KEY_IS(KEY_CACHE_SET)) { + LASSERT(lov->lov_cache == NULL); + lov->lov_cache = val; + do_inactive = 1; + } + + for (i = 0; i < count; i++, val = (char *)val + incr) { + if (next_id) { + tgt = lov->lov_tgts[((struct obd_id_info *)val)->idx]; + } else { + tgt = lov->lov_tgts[i]; + } + /* OST was disconnected */ + if (!tgt || !tgt->ltd_exp) + continue; + + /* OST is inactive and we don't want inactive OSCs */ + if (!tgt->ltd_active && !do_inactive) + continue; + + if (mds_con) { + struct mds_group_info *mgi; + + LASSERT(vallen == sizeof(*mgi)); + mgi = (struct mds_group_info *)val; + + /* Only want a specific OSC */ + if (mgi->uuid && !obd_uuid_equals(mgi->uuid, + &tgt->ltd_uuid)) + continue; + + err = obd_set_info_async(env, tgt->ltd_exp, + keylen, key, sizeof(int), + &mgi->group, set); + } else if (next_id) { + err = obd_set_info_async(env, tgt->ltd_exp, + keylen, key, vallen, + ((struct obd_id_info *)val)->data, set); + } else if (capa) { + struct mds_capa_info *info = (struct mds_capa_info *)val; + + LASSERT(vallen == sizeof(*info)); + + /* Only want a specific OSC */ + if (info->uuid && + !obd_uuid_equals(info->uuid, &tgt->ltd_uuid)) + continue; + + err = obd_set_info_async(env, tgt->ltd_exp, keylen, + key, sizeof(*info->capa), + info->capa, set); + } else { + /* Only want a specific OSC */ + if (check_uuid && + !obd_uuid_equals(val, &tgt->ltd_uuid)) + continue; + + err = obd_set_info_async(env, tgt->ltd_exp, + keylen, key, vallen, val, set); + } + + if (!rc) + rc = err; + } + + obd_putref(obddev); + if (no_set) { + err = ptlrpc_set_wait(set); + if (!rc) + rc = err; + ptlrpc_set_destroy(set); + } + return rc; +} + +void lov_stripe_lock(struct lov_stripe_md *md) + __acquires(&md->lsm_lock) +{ + LASSERT(md->lsm_lock_owner != current_pid()); + spin_lock(&md->lsm_lock); + LASSERT(md->lsm_lock_owner == 0); + md->lsm_lock_owner = current_pid(); +} +EXPORT_SYMBOL(lov_stripe_lock); + +void lov_stripe_unlock(struct lov_stripe_md *md) + __releases(&md->lsm_lock) +{ + LASSERT(md->lsm_lock_owner == current_pid()); + md->lsm_lock_owner = 0; + spin_unlock(&md->lsm_lock); +} +EXPORT_SYMBOL(lov_stripe_unlock); + +static int lov_quotactl(struct obd_device *obd, struct obd_export *exp, + struct obd_quotactl *oqctl) +{ + struct lov_obd *lov = &obd->u.lov; + struct lov_tgt_desc *tgt; + __u64 curspace = 0; + __u64 bhardlimit = 0; + int i, rc = 0; + + if (oqctl->qc_cmd != LUSTRE_Q_QUOTAON && + oqctl->qc_cmd != LUSTRE_Q_QUOTAOFF && + oqctl->qc_cmd != Q_GETOQUOTA && + oqctl->qc_cmd != Q_INITQUOTA && + oqctl->qc_cmd != LUSTRE_Q_SETQUOTA && + oqctl->qc_cmd != Q_FINVALIDATE) { + CERROR("bad quota opc %x for lov obd", oqctl->qc_cmd); + return -EFAULT; + } + + /* for lov tgt */ + obd_getref(obd); + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + int err; + + tgt = lov->lov_tgts[i]; + + if (!tgt) + continue; + + if (!tgt->ltd_active || tgt->ltd_reap) { + if (oqctl->qc_cmd == Q_GETOQUOTA && + lov->lov_tgts[i]->ltd_activate) { + rc = -EREMOTEIO; + CERROR("ost %d is inactive\n", i); + } else { + CDEBUG(D_HA, "ost %d is inactive\n", i); + } + continue; + } + + err = obd_quotactl(tgt->ltd_exp, oqctl); + if (err) { + if (tgt->ltd_active && !rc) + 
rc = err; + continue; + } + + if (oqctl->qc_cmd == Q_GETOQUOTA) { + curspace += oqctl->qc_dqblk.dqb_curspace; + bhardlimit += oqctl->qc_dqblk.dqb_bhardlimit; + } + } + obd_putref(obd); + + if (oqctl->qc_cmd == Q_GETOQUOTA) { + oqctl->qc_dqblk.dqb_curspace = curspace; + oqctl->qc_dqblk.dqb_bhardlimit = bhardlimit; + } + return rc; +} + +static int lov_quotacheck(struct obd_device *obd, struct obd_export *exp, + struct obd_quotactl *oqctl) +{ + struct lov_obd *lov = &obd->u.lov; + int i, rc = 0; + + obd_getref(obd); + + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + if (!lov->lov_tgts[i]) + continue; + + /* Skip quota check on the administratively disabled OSTs. */ + if (!lov->lov_tgts[i]->ltd_activate) { + CWARN("lov idx %d was administratively disabled, skip quotacheck on it.\n", + i); + continue; + } + + if (!lov->lov_tgts[i]->ltd_active) { + CERROR("lov idx %d inactive\n", i); + rc = -EIO; + goto out; + } + } + + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + int err; + + if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_activate) + continue; + + err = obd_quotacheck(lov->lov_tgts[i]->ltd_exp, oqctl); + if (err && !rc) + rc = err; + } + +out: + obd_putref(obd); + + return rc; +} + +static struct obd_ops lov_obd_ops = { + .o_owner = THIS_MODULE, + .o_setup = lov_setup, + .o_precleanup = lov_precleanup, + .o_cleanup = lov_cleanup, + /*.o_process_config = lov_process_config,*/ + .o_connect = lov_connect, + .o_disconnect = lov_disconnect, + .o_statfs = lov_statfs, + .o_statfs_async = lov_statfs_async, + .o_packmd = lov_packmd, + .o_unpackmd = lov_unpackmd, + .o_create = lov_create, + .o_destroy = lov_destroy, + .o_getattr_async = lov_getattr_async, + .o_setattr_async = lov_setattr_async, + .o_adjust_kms = lov_adjust_kms, + .o_find_cbdata = lov_find_cbdata, + .o_iocontrol = lov_iocontrol, + .o_get_info = lov_get_info, + .o_set_info_async = lov_set_info_async, + .o_notify = lov_notify, + .o_pool_new = lov_pool_new, + .o_pool_rem = lov_pool_remove, + .o_pool_add = lov_pool_add, + .o_pool_del = lov_pool_del, + .o_getref = lov_getref, + .o_putref = lov_putref, + .o_quotactl = lov_quotactl, + .o_quotacheck = lov_quotacheck, +}; + +struct kmem_cache *lov_oinfo_slab; + +static int __init lov_init(void) +{ + struct lprocfs_static_vars lvars = { NULL }; + int rc; + + /* print an address of _any_ initialized kernel symbol from this + * module, to allow debugging with gdb that doesn't support data + * symbols from modules.*/ + CDEBUG(D_INFO, "Lustre LOV module (%p).\n", &lov_caches); + + rc = lu_kmem_init(lov_caches); + if (rc) + return rc; + + lov_oinfo_slab = kmem_cache_create("lov_oinfo", + sizeof(struct lov_oinfo), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (lov_oinfo_slab == NULL) { + lu_kmem_fini(lov_caches); + return -ENOMEM; + } + lprocfs_lov_init_vars(&lvars); + + rc = class_register_type(&lov_obd_ops, NULL, lvars.module_vars, + LUSTRE_LOV_NAME, &lov_device_type); + + if (rc) { + kmem_cache_destroy(lov_oinfo_slab); + lu_kmem_fini(lov_caches); + } + + return rc; +} + +static void /*__exit*/ lov_exit(void) +{ + class_unregister_type(LUSTRE_LOV_NAME); + kmem_cache_destroy(lov_oinfo_slab); + + lu_kmem_fini(lov_caches); +} + +MODULE_AUTHOR("Sun Microsystems, Inc. 
"); +MODULE_DESCRIPTION("Lustre Logical Object Volume OBD driver"); +MODULE_LICENSE("GPL"); +MODULE_VERSION(LUSTRE_VERSION_STRING); + +module_init(lov_init); +module_exit(lov_exit); diff --git a/kernel/drivers/staging/lustre/lustre/lov/lov_object.c b/kernel/drivers/staging/lustre/lustre/lov/lov_object.c new file mode 100644 index 000000000..a22342fa7 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lov/lov_object.c @@ -0,0 +1,1001 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_object for LOV layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" +#include "../include/lclient.h" + +/** \addtogroup lov + * @{ + */ + +/***************************************************************************** + * + * Layout operations. + * + */ + +struct lov_layout_operations { + int (*llo_init)(const struct lu_env *env, struct lov_device *dev, + struct lov_object *lov, + const struct cl_object_conf *conf, + union lov_layout_state *state); + int (*llo_delete)(const struct lu_env *env, struct lov_object *lov, + union lov_layout_state *state); + void (*llo_fini)(const struct lu_env *env, struct lov_object *lov, + union lov_layout_state *state); + void (*llo_install)(const struct lu_env *env, struct lov_object *lov, + union lov_layout_state *state); + int (*llo_print)(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o); + int (*llo_page_init)(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, struct page *vmpage); + int (*llo_lock_init)(const struct lu_env *env, + struct cl_object *obj, struct cl_lock *lock, + const struct cl_io *io); + int (*llo_io_init)(const struct lu_env *env, + struct cl_object *obj, struct cl_io *io); + int (*llo_getattr)(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr); +}; + +static int lov_layout_wait(const struct lu_env *env, struct lov_object *lov); + +/***************************************************************************** + * + * Lov object layout operations. + * + */ + +static void lov_install_empty(const struct lu_env *env, + struct lov_object *lov, + union lov_layout_state *state) +{ + /* + * File without objects. 
+ */ +} + +static int lov_init_empty(const struct lu_env *env, + struct lov_device *dev, struct lov_object *lov, + const struct cl_object_conf *conf, + union lov_layout_state *state) +{ + return 0; +} + +static void lov_install_raid0(const struct lu_env *env, + struct lov_object *lov, + union lov_layout_state *state) +{ +} + +static struct cl_object *lov_sub_find(const struct lu_env *env, + struct cl_device *dev, + const struct lu_fid *fid, + const struct cl_object_conf *conf) +{ + struct lu_object *o; + + o = lu_object_find_at(env, cl2lu_dev(dev), fid, &conf->coc_lu); + LASSERT(ergo(!IS_ERR(o), o->lo_dev->ld_type == &lovsub_device_type)); + return lu2cl(o); +} + +static int lov_init_sub(const struct lu_env *env, struct lov_object *lov, + struct cl_object *stripe, struct lov_layout_raid0 *r0, + int idx) +{ + struct cl_object_header *hdr; + struct cl_object_header *subhdr; + struct cl_object_header *parent; + struct lov_oinfo *oinfo; + int result; + + if (OBD_FAIL_CHECK(OBD_FAIL_LOV_INIT)) { + /* For sanity:test_206. + * Do not leave the object in cache to avoid accessing + * freed memory. This is because osc_object is referring to + * lov_oinfo of lsm_stripe_data which will be freed due to + * this failure. */ + cl_object_kill(env, stripe); + cl_object_put(env, stripe); + return -EIO; + } + + hdr = cl_object_header(lov2cl(lov)); + subhdr = cl_object_header(stripe); + + oinfo = lov->lo_lsm->lsm_oinfo[idx]; + CDEBUG(D_INODE, DFID"@%p[%d] -> "DFID"@%p: ostid: "DOSTID + " idx: %d gen: %d\n", + PFID(&subhdr->coh_lu.loh_fid), subhdr, idx, + PFID(&hdr->coh_lu.loh_fid), hdr, POSTID(&oinfo->loi_oi), + oinfo->loi_ost_idx, oinfo->loi_ost_gen); + + /* reuse ->coh_attr_guard to protect coh_parent change */ + spin_lock(&subhdr->coh_attr_guard); + parent = subhdr->coh_parent; + if (parent == NULL) { + subhdr->coh_parent = hdr; + spin_unlock(&subhdr->coh_attr_guard); + subhdr->coh_nesting = hdr->coh_nesting + 1; + lu_object_ref_add(&stripe->co_lu, "lov-parent", lov); + r0->lo_sub[idx] = cl2lovsub(stripe); + r0->lo_sub[idx]->lso_super = lov; + r0->lo_sub[idx]->lso_index = idx; + result = 0; + } else { + struct lu_object *old_obj; + struct lov_object *old_lov; + unsigned int mask = D_INODE; + + spin_unlock(&subhdr->coh_attr_guard); + old_obj = lu_object_locate(&parent->coh_lu, &lov_device_type); + LASSERT(old_obj != NULL); + old_lov = cl2lov(lu2cl(old_obj)); + if (old_lov->lo_layout_invalid) { + /* the object's layout has already changed but isn't + * refreshed */ + lu_object_unhash(env, &stripe->co_lu); + result = -EAGAIN; + } else { + mask = D_ERROR; + result = -EIO; + } + + LU_OBJECT_DEBUG(mask, env, &stripe->co_lu, + "stripe %d is already owned.\n", idx); + LU_OBJECT_DEBUG(mask, env, old_obj, "owned.\n"); + LU_OBJECT_HEADER(mask, env, lov2lu(lov), "try to own.\n"); + cl_object_put(env, stripe); + } + return result; +} + +static int lov_init_raid0(const struct lu_env *env, + struct lov_device *dev, struct lov_object *lov, + const struct cl_object_conf *conf, + union lov_layout_state *state) +{ + int result; + int i; + + struct cl_object *stripe; + struct lov_thread_info *lti = lov_env_info(env); + struct cl_object_conf *subconf = <i->lti_stripe_conf; + struct lov_stripe_md *lsm = conf->u.coc_md->lsm; + struct lu_fid *ofid = <i->lti_fid; + struct lov_layout_raid0 *r0 = &state->raid0; + + if (lsm->lsm_magic != LOV_MAGIC_V1 && lsm->lsm_magic != LOV_MAGIC_V3) { + dump_lsm(D_ERROR, lsm); + LASSERTF(0, "magic mismatch, expected %d/%d, actual %d.\n", + LOV_MAGIC_V1, LOV_MAGIC_V3, lsm->lsm_magic); + } + + 
LASSERT(lov->lo_lsm == NULL); + lov->lo_lsm = lsm_addref(lsm); + r0->lo_nr = lsm->lsm_stripe_count; + LASSERT(r0->lo_nr <= lov_targets_nr(dev)); + + OBD_ALLOC_LARGE(r0->lo_sub, r0->lo_nr * sizeof(r0->lo_sub[0])); + if (r0->lo_sub != NULL) { + result = 0; + subconf->coc_inode = conf->coc_inode; + spin_lock_init(&r0->lo_sub_lock); + /* + * Create stripe cl_objects. + */ + for (i = 0; i < r0->lo_nr && result == 0; ++i) { + struct cl_device *subdev; + struct lov_oinfo *oinfo = lsm->lsm_oinfo[i]; + int ost_idx = oinfo->loi_ost_idx; + + if (lov_oinfo_is_dummy(oinfo)) + continue; + + result = ostid_to_fid(ofid, &oinfo->loi_oi, + oinfo->loi_ost_idx); + if (result != 0) + goto out; + + subdev = lovsub2cl_dev(dev->ld_target[ost_idx]); + subconf->u.coc_oinfo = oinfo; + LASSERTF(subdev != NULL, "not init ost %d\n", ost_idx); + /* In the function below, .hs_keycmp resolves to + * lu_obj_hop_keycmp() */ + /* coverity[overrun-buffer-val] */ + stripe = lov_sub_find(env, subdev, ofid, subconf); + if (!IS_ERR(stripe)) { + result = lov_init_sub(env, lov, stripe, r0, i); + if (result == -EAGAIN) { /* try again */ + --i; + result = 0; + } + } else { + result = PTR_ERR(stripe); + } + } + } else + result = -ENOMEM; +out: + return result; +} + +static int lov_init_released(const struct lu_env *env, + struct lov_device *dev, struct lov_object *lov, + const struct cl_object_conf *conf, + union lov_layout_state *state) +{ + struct lov_stripe_md *lsm = conf->u.coc_md->lsm; + + LASSERT(lsm != NULL); + LASSERT(lsm_is_released(lsm)); + LASSERT(lov->lo_lsm == NULL); + + lov->lo_lsm = lsm_addref(lsm); + return 0; +} + +static int lov_delete_empty(const struct lu_env *env, struct lov_object *lov, + union lov_layout_state *state) +{ + LASSERT(lov->lo_type == LLT_EMPTY || lov->lo_type == LLT_RELEASED); + + lov_layout_wait(env, lov); + + cl_object_prune(env, &lov->lo_cl); + return 0; +} + +static void lov_subobject_kill(const struct lu_env *env, struct lov_object *lov, + struct lovsub_object *los, int idx) +{ + struct cl_object *sub; + struct lov_layout_raid0 *r0; + struct lu_site *site; + struct lu_site_bkt_data *bkt; + wait_queue_t *waiter; + + r0 = &lov->u.raid0; + LASSERT(r0->lo_sub[idx] == los); + + sub = lovsub2cl(los); + site = sub->co_lu.lo_dev->ld_site; + bkt = lu_site_bkt_from_fid(site, &sub->co_lu.lo_header->loh_fid); + + cl_object_kill(env, sub); + /* release a reference to the sub-object and ... */ + lu_object_ref_del(&sub->co_lu, "lov-parent", lov); + cl_object_put(env, sub); + + /* ... wait until it is actually destroyed---sub-object clears its + * ->lo_sub[] slot in lovsub_object_fini() */ + if (r0->lo_sub[idx] == los) { + waiter = &lov_env_info(env)->lti_waiter; + init_waitqueue_entry(waiter, current); + add_wait_queue(&bkt->lsb_marche_funebre, waiter); + set_current_state(TASK_UNINTERRUPTIBLE); + while (1) { + /* this wait-queue is signaled at the end of + * lu_object_free(). 
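+ * In short (a summary of the loop that follows, not in the original
+ * comment): r0->lo_sub[idx] is re-checked under lo_sub_lock and
+ * schedule() is called until lovsub_object_fini() has cleared the
+ * slot; only then is the waiter removed from the queue.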
*/ + set_current_state(TASK_UNINTERRUPTIBLE); + spin_lock(&r0->lo_sub_lock); + if (r0->lo_sub[idx] == los) { + spin_unlock(&r0->lo_sub_lock); + schedule(); + } else { + spin_unlock(&r0->lo_sub_lock); + set_current_state(TASK_RUNNING); + break; + } + } + remove_wait_queue(&bkt->lsb_marche_funebre, waiter); + } + LASSERT(r0->lo_sub[idx] == NULL); +} + +static int lov_delete_raid0(const struct lu_env *env, struct lov_object *lov, + union lov_layout_state *state) +{ + struct lov_layout_raid0 *r0 = &state->raid0; + struct lov_stripe_md *lsm = lov->lo_lsm; + int i; + + dump_lsm(D_INODE, lsm); + + lov_layout_wait(env, lov); + if (r0->lo_sub != NULL) { + for (i = 0; i < r0->lo_nr; ++i) { + struct lovsub_object *los = r0->lo_sub[i]; + + if (los != NULL) { + cl_locks_prune(env, &los->lso_cl, 1); + /* + * If top-level object is to be evicted from + * the cache, so are its sub-objects. + */ + lov_subobject_kill(env, lov, los, i); + } + } + } + cl_object_prune(env, &lov->lo_cl); + return 0; +} + +static void lov_fini_empty(const struct lu_env *env, struct lov_object *lov, + union lov_layout_state *state) +{ + LASSERT(lov->lo_type == LLT_EMPTY || lov->lo_type == LLT_RELEASED); +} + +static void lov_fini_raid0(const struct lu_env *env, struct lov_object *lov, + union lov_layout_state *state) +{ + struct lov_layout_raid0 *r0 = &state->raid0; + + if (r0->lo_sub != NULL) { + OBD_FREE_LARGE(r0->lo_sub, r0->lo_nr * sizeof(r0->lo_sub[0])); + r0->lo_sub = NULL; + } + + dump_lsm(D_INODE, lov->lo_lsm); + lov_free_memmd(&lov->lo_lsm); +} + +static void lov_fini_released(const struct lu_env *env, struct lov_object *lov, + union lov_layout_state *state) +{ + dump_lsm(D_INODE, lov->lo_lsm); + lov_free_memmd(&lov->lo_lsm); +} + +static int lov_print_empty(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o) +{ + (*p)(env, cookie, "empty %d\n", lu2lov(o)->lo_layout_invalid); + return 0; +} + +static int lov_print_raid0(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o) +{ + struct lov_object *lov = lu2lov(o); + struct lov_layout_raid0 *r0 = lov_r0(lov); + struct lov_stripe_md *lsm = lov->lo_lsm; + int i; + + (*p)(env, cookie, "stripes: %d, %s, lsm{%p 0x%08X %d %u %u}:\n", + r0->lo_nr, lov->lo_layout_invalid ? "invalid" : "valid", lsm, + lsm->lsm_magic, atomic_read(&lsm->lsm_refc), + lsm->lsm_stripe_count, lsm->lsm_layout_gen); + for (i = 0; i < r0->lo_nr; ++i) { + struct lu_object *sub; + + if (r0->lo_sub[i] != NULL) { + sub = lovsub2lu(r0->lo_sub[i]); + lu_object_print(env, cookie, p, sub); + } else { + (*p)(env, cookie, "sub %d absent\n", i); + } + } + return 0; +} + +static int lov_print_released(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o) +{ + struct lov_object *lov = lu2lov(o); + struct lov_stripe_md *lsm = lov->lo_lsm; + + (*p)(env, cookie, + "released: %s, lsm{%p 0x%08X %d %u %u}:\n", + lov->lo_layout_invalid ? "invalid" : "valid", lsm, + lsm->lsm_magic, atomic_read(&lsm->lsm_refc), + lsm->lsm_stripe_count, lsm->lsm_layout_gen); + return 0; +} + +/** + * Implements cl_object_operations::coo_attr_get() method for an object + * without stripes (LLT_EMPTY layout type). + * + * The only attributes this layer is authoritative in this case is + * cl_attr::cat_blocks---it's 0. 
+ */ +static int lov_attr_get_empty(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr) +{ + attr->cat_blocks = 0; + return 0; +} + +static int lov_attr_get_raid0(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr) +{ + struct lov_object *lov = cl2lov(obj); + struct lov_layout_raid0 *r0 = lov_r0(lov); + struct cl_attr *lov_attr = &r0->lo_attr; + int result = 0; + + /* this is called w/o holding type guard mutex, so it must be inside + * an on going IO otherwise lsm may be replaced. + * LU-2117: it turns out there exists one exception. For mmaped files, + * the lock of those files may be requested in the other file's IO + * context, and this function is called in ccc_lock_state(), it will + * hit this assertion. + * Anyway, it's still okay to call attr_get w/o type guard as layout + * can't go if locks exist. */ + /* LASSERT(atomic_read(&lsm->lsm_refc) > 1); */ + + if (!r0->lo_attr_valid) { + struct lov_stripe_md *lsm = lov->lo_lsm; + struct ost_lvb *lvb = &lov_env_info(env)->lti_lvb; + __u64 kms = 0; + + memset(lvb, 0, sizeof(*lvb)); + /* XXX: timestamps can be negative by sanity:test_39m, + * how can it be? */ + lvb->lvb_atime = LLONG_MIN; + lvb->lvb_ctime = LLONG_MIN; + lvb->lvb_mtime = LLONG_MIN; + + /* + * XXX that should be replaced with a loop over sub-objects, + * doing cl_object_attr_get() on them. But for now, let's + * reuse old lov code. + */ + + /* + * XXX take lsm spin-lock to keep lov_merge_lvb_kms() + * happy. It's not needed, because new code uses + * ->coh_attr_guard spin-lock to protect consistency of + * sub-object attributes. + */ + lov_stripe_lock(lsm); + result = lov_merge_lvb_kms(lsm, lvb, &kms); + lov_stripe_unlock(lsm); + if (result == 0) { + cl_lvb2attr(lov_attr, lvb); + lov_attr->cat_kms = kms; + r0->lo_attr_valid = 1; + } + } + if (result == 0) { /* merge results */ + attr->cat_blocks = lov_attr->cat_blocks; + attr->cat_size = lov_attr->cat_size; + attr->cat_kms = lov_attr->cat_kms; + if (attr->cat_atime < lov_attr->cat_atime) + attr->cat_atime = lov_attr->cat_atime; + if (attr->cat_ctime < lov_attr->cat_ctime) + attr->cat_ctime = lov_attr->cat_ctime; + if (attr->cat_mtime < lov_attr->cat_mtime) + attr->cat_mtime = lov_attr->cat_mtime; + } + return result; +} + +static const struct lov_layout_operations lov_dispatch[] = { + [LLT_EMPTY] = { + .llo_init = lov_init_empty, + .llo_delete = lov_delete_empty, + .llo_fini = lov_fini_empty, + .llo_install = lov_install_empty, + .llo_print = lov_print_empty, + .llo_page_init = lov_page_init_empty, + .llo_lock_init = lov_lock_init_empty, + .llo_io_init = lov_io_init_empty, + .llo_getattr = lov_attr_get_empty + }, + [LLT_RAID0] = { + .llo_init = lov_init_raid0, + .llo_delete = lov_delete_raid0, + .llo_fini = lov_fini_raid0, + .llo_install = lov_install_raid0, + .llo_print = lov_print_raid0, + .llo_page_init = lov_page_init_raid0, + .llo_lock_init = lov_lock_init_raid0, + .llo_io_init = lov_io_init_raid0, + .llo_getattr = lov_attr_get_raid0 + }, + [LLT_RELEASED] = { + .llo_init = lov_init_released, + .llo_delete = lov_delete_empty, + .llo_fini = lov_fini_released, + .llo_install = lov_install_empty, + .llo_print = lov_print_released, + .llo_page_init = lov_page_init_empty, + .llo_lock_init = lov_lock_init_empty, + .llo_io_init = lov_io_init_released, + .llo_getattr = lov_attr_get_empty + } +}; + +/** + * Performs a double-dispatch based on the layout type of an object. + */ +#define LOV_2DISPATCH_NOLOCK(obj, op, ...) 
\ +({ \ + struct lov_object *__obj = (obj); \ + enum lov_layout_type __llt; \ + \ + __llt = __obj->lo_type; \ + LASSERT(0 <= __llt && __llt < ARRAY_SIZE(lov_dispatch)); \ + lov_dispatch[__llt].op(__VA_ARGS__); \ +}) + +/** + * Return lov_layout_type associated with a given lsm + */ +static enum lov_layout_type lov_type(struct lov_stripe_md *lsm) +{ + if (lsm == NULL) + return LLT_EMPTY; + if (lsm_is_released(lsm)) + return LLT_RELEASED; + return LLT_RAID0; +} + +static inline void lov_conf_freeze(struct lov_object *lov) +{ + if (lov->lo_owner != current) + down_read(&lov->lo_type_guard); +} + +static inline void lov_conf_thaw(struct lov_object *lov) +{ + if (lov->lo_owner != current) + up_read(&lov->lo_type_guard); +} + +#define LOV_2DISPATCH_MAYLOCK(obj, op, lock, ...) \ +({ \ + struct lov_object *__obj = (obj); \ + int __lock = !!(lock); \ + typeof(lov_dispatch[0].op(__VA_ARGS__)) __result; \ + \ + if (__lock) \ + lov_conf_freeze(__obj); \ + __result = LOV_2DISPATCH_NOLOCK(obj, op, __VA_ARGS__); \ + if (__lock) \ + lov_conf_thaw(__obj); \ + __result; \ +}) + +/** + * Performs a locked double-dispatch based on the layout type of an object. + */ +#define LOV_2DISPATCH(obj, op, ...) \ + LOV_2DISPATCH_MAYLOCK(obj, op, 1, __VA_ARGS__) + +#define LOV_2DISPATCH_VOID(obj, op, ...) \ +do { \ + struct lov_object *__obj = (obj); \ + enum lov_layout_type __llt; \ + \ + lov_conf_freeze(__obj); \ + __llt = __obj->lo_type; \ + LASSERT(0 <= __llt && __llt < ARRAY_SIZE(lov_dispatch)); \ + lov_dispatch[__llt].op(__VA_ARGS__); \ + lov_conf_thaw(__obj); \ +} while (0) + +static void lov_conf_lock(struct lov_object *lov) +{ + LASSERT(lov->lo_owner != current); + down_write(&lov->lo_type_guard); + LASSERT(lov->lo_owner == NULL); + lov->lo_owner = current; +} + +static void lov_conf_unlock(struct lov_object *lov) +{ + lov->lo_owner = NULL; + up_write(&lov->lo_type_guard); +} + +static int lov_layout_wait(const struct lu_env *env, struct lov_object *lov) +{ + struct l_wait_info lwi = { 0 }; + + while (atomic_read(&lov->lo_active_ios) > 0) { + CDEBUG(D_INODE, "file:"DFID" wait for active IO, now: %d.\n", + PFID(lu_object_fid(lov2lu(lov))), + atomic_read(&lov->lo_active_ios)); + + l_wait_event(lov->lo_waitq, + atomic_read(&lov->lo_active_ios) == 0, &lwi); + } + return 0; +} + +static int lov_layout_change(const struct lu_env *unused, + struct lov_object *lov, + const struct cl_object_conf *conf) +{ + int result; + enum lov_layout_type llt = LLT_EMPTY; + union lov_layout_state *state = &lov->u; + const struct lov_layout_operations *old_ops; + const struct lov_layout_operations *new_ops; + + struct cl_object_header *hdr = cl_object_header(&lov->lo_cl); + void *cookie; + struct lu_env *env; + int refcheck; + + LASSERT(0 <= lov->lo_type && lov->lo_type < ARRAY_SIZE(lov_dispatch)); + + if (conf->u.coc_md != NULL) + llt = lov_type(conf->u.coc_md->lsm); + LASSERT(0 <= llt && llt < ARRAY_SIZE(lov_dispatch)); + + cookie = cl_env_reenter(); + env = cl_env_get(&refcheck); + if (IS_ERR(env)) { + cl_env_reexit(cookie); + return PTR_ERR(env); + } + + CDEBUG(D_INODE, DFID" from %s to %s\n", + PFID(lu_object_fid(lov2lu(lov))), + llt2str(lov->lo_type), llt2str(llt)); + + old_ops = &lov_dispatch[lov->lo_type]; + new_ops = &lov_dispatch[llt]; + + result = old_ops->llo_delete(env, lov, &lov->u); + if (result == 0) { + old_ops->llo_fini(env, lov, &lov->u); + + LASSERT(atomic_read(&lov->lo_active_ios) == 0); + LASSERT(hdr->coh_tree.rnode == NULL); + LASSERT(hdr->coh_pages == 0); + + lov->lo_type = LLT_EMPTY; + result = 
new_ops->llo_init(env, + lu2lov_dev(lov->lo_cl.co_lu.lo_dev), + lov, conf, state); + if (result == 0) { + new_ops->llo_install(env, lov, state); + lov->lo_type = llt; + } else { + new_ops->llo_delete(env, lov, state); + new_ops->llo_fini(env, lov, state); + /* this file becomes an EMPTY file. */ + } + } + + cl_env_put(env, &refcheck); + cl_env_reexit(cookie); + return result; +} + +/***************************************************************************** + * + * Lov object operations. + * + */ +int lov_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf) +{ + struct lov_device *dev = lu2lov_dev(obj->lo_dev); + struct lov_object *lov = lu2lov(obj); + const struct cl_object_conf *cconf = lu2cl_conf(conf); + union lov_layout_state *set = &lov->u; + const struct lov_layout_operations *ops; + int result; + + init_rwsem(&lov->lo_type_guard); + atomic_set(&lov->lo_active_ios, 0); + init_waitqueue_head(&lov->lo_waitq); + + cl_object_page_init(lu2cl(obj), sizeof(struct lov_page)); + + /* no locking is necessary, as object is being created */ + lov->lo_type = lov_type(cconf->u.coc_md->lsm); + ops = &lov_dispatch[lov->lo_type]; + result = ops->llo_init(env, dev, lov, cconf, set); + if (result == 0) + ops->llo_install(env, lov, set); + return result; +} + +static int lov_conf_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf) +{ + struct lov_stripe_md *lsm = NULL; + struct lov_object *lov = cl2lov(obj); + int result = 0; + + lov_conf_lock(lov); + if (conf->coc_opc == OBJECT_CONF_INVALIDATE) { + lov->lo_layout_invalid = true; + result = 0; + goto out; + } + + if (conf->coc_opc == OBJECT_CONF_WAIT) { + if (lov->lo_layout_invalid && + atomic_read(&lov->lo_active_ios) > 0) { + lov_conf_unlock(lov); + result = lov_layout_wait(env, lov); + lov_conf_lock(lov); + } + goto out; + } + + LASSERT(conf->coc_opc == OBJECT_CONF_SET); + + if (conf->u.coc_md != NULL) + lsm = conf->u.coc_md->lsm; + if ((lsm == NULL && lov->lo_lsm == NULL) || + ((lsm != NULL && lov->lo_lsm != NULL) && + (lov->lo_lsm->lsm_layout_gen == lsm->lsm_layout_gen) && + (lov->lo_lsm->lsm_pattern == lsm->lsm_pattern))) { + /* same version of layout */ + lov->lo_layout_invalid = false; + result = 0; + goto out; + } + + /* will change layout - check if there still exists active IO. 
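+ * If there is, we only mark the layout invalid and return -EBUSY
+ * below; a later OBJECT_CONF_WAIT (the branch handled above) lets
+ * the caller wait for those IOs to drain so the change can be
+ * retried.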
*/ + if (atomic_read(&lov->lo_active_ios) > 0) { + lov->lo_layout_invalid = true; + result = -EBUSY; + goto out; + } + + lov->lo_layout_invalid = lov_layout_change(env, lov, conf); + +out: + lov_conf_unlock(lov); + CDEBUG(D_INODE, DFID" lo_layout_invalid=%d\n", + PFID(lu_object_fid(lov2lu(lov))), lov->lo_layout_invalid); + return result; +} + +static void lov_object_delete(const struct lu_env *env, struct lu_object *obj) +{ + struct lov_object *lov = lu2lov(obj); + + LOV_2DISPATCH_VOID(lov, llo_delete, env, lov, &lov->u); +} + +static void lov_object_free(const struct lu_env *env, struct lu_object *obj) +{ + struct lov_object *lov = lu2lov(obj); + + LOV_2DISPATCH_VOID(lov, llo_fini, env, lov, &lov->u); + lu_object_fini(obj); + OBD_SLAB_FREE_PTR(lov, lov_object_kmem); +} + +static int lov_object_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o) +{ + return LOV_2DISPATCH_NOLOCK(lu2lov(o), llo_print, env, cookie, p, o); +} + +int lov_page_init(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, struct page *vmpage) +{ + return LOV_2DISPATCH_NOLOCK(cl2lov(obj), + llo_page_init, env, obj, page, vmpage); +} + +/** + * Implements cl_object_operations::clo_io_init() method for lov + * layer. Dispatches to the appropriate layout io initialization method. + */ +int lov_io_init(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io) +{ + CL_IO_SLICE_CLEAN(lov_env_io(env), lis_cl); + return LOV_2DISPATCH_MAYLOCK(cl2lov(obj), llo_io_init, + !io->ci_ignore_layout, env, obj, io); +} + +/** + * An implementation of cl_object_operations::clo_attr_get() method for lov + * layer. For raid0 layout this collects and merges attributes of all + * sub-objects. + */ +static int lov_attr_get(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr) +{ + /* do not take lock, as this function is called under a + * spin-lock. Layout is protected from changing by ongoing IO. */ + return LOV_2DISPATCH_NOLOCK(cl2lov(obj), llo_getattr, env, obj, attr); +} + +static int lov_attr_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid) +{ + /* + * No dispatch is required here, as no layout implements this. + */ + return 0; +} + +int lov_lock_init(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io) +{ + /* No need to lock because we've taken one refcount of layout. 
*/ + return LOV_2DISPATCH_NOLOCK(cl2lov(obj), llo_lock_init, env, obj, lock, + io); +} + +static const struct cl_object_operations lov_ops = { + .coo_page_init = lov_page_init, + .coo_lock_init = lov_lock_init, + .coo_io_init = lov_io_init, + .coo_attr_get = lov_attr_get, + .coo_attr_set = lov_attr_set, + .coo_conf_set = lov_conf_set +}; + +static const struct lu_object_operations lov_lu_obj_ops = { + .loo_object_init = lov_object_init, + .loo_object_delete = lov_object_delete, + .loo_object_release = NULL, + .loo_object_free = lov_object_free, + .loo_object_print = lov_object_print, + .loo_object_invariant = NULL +}; + +struct lu_object *lov_object_alloc(const struct lu_env *env, + const struct lu_object_header *unused, + struct lu_device *dev) +{ + struct lov_object *lov; + struct lu_object *obj; + + OBD_SLAB_ALLOC_PTR_GFP(lov, lov_object_kmem, GFP_NOFS); + if (lov != NULL) { + obj = lov2lu(lov); + lu_object_init(obj, NULL, dev); + lov->lo_cl.co_ops = &lov_ops; + lov->lo_type = -1; /* invalid, to catch uninitialized type */ + /* + * object io operation vector (cl_object::co_iop) is installed + * later in lov_object_init(), as different vectors are used + * for object with different layouts. + */ + obj->lo_ops = &lov_lu_obj_ops; + } else + obj = NULL; + return obj; +} + +struct lov_stripe_md *lov_lsm_addref(struct lov_object *lov) +{ + struct lov_stripe_md *lsm = NULL; + + lov_conf_freeze(lov); + if (lov->lo_lsm != NULL) { + lsm = lsm_addref(lov->lo_lsm); + CDEBUG(D_INODE, "lsm %p addref %d/%d by %p.\n", + lsm, atomic_read(&lsm->lsm_refc), + lov->lo_layout_invalid, current); + } + lov_conf_thaw(lov); + return lsm; +} + +void lov_lsm_decref(struct lov_object *lov, struct lov_stripe_md *lsm) +{ + if (lsm == NULL) + return; + + CDEBUG(D_INODE, "lsm %p decref %d by %p.\n", + lsm, atomic_read(&lsm->lsm_refc), current); + + lov_free_memmd(&lsm); +} + +struct lov_stripe_md *lov_lsm_get(struct cl_object *clobj) +{ + struct lu_object *luobj; + struct lov_stripe_md *lsm = NULL; + + if (clobj == NULL) + return NULL; + + luobj = lu_object_locate(&cl_object_header(clobj)->coh_lu, + &lov_device_type); + if (luobj != NULL) + lsm = lov_lsm_addref(lu2lov(luobj)); + return lsm; +} +EXPORT_SYMBOL(lov_lsm_get); + +void lov_lsm_put(struct cl_object *unused, struct lov_stripe_md *lsm) +{ + if (lsm != NULL) + lov_free_memmd(&lsm); +} +EXPORT_SYMBOL(lov_lsm_put); + +int lov_read_and_clear_async_rc(struct cl_object *clob) +{ + struct lu_object *luobj; + int rc = 0; + + luobj = lu_object_locate(&cl_object_header(clob)->coh_lu, + &lov_device_type); + if (luobj != NULL) { + struct lov_object *lov = lu2lov(luobj); + + lov_conf_freeze(lov); + switch (lov->lo_type) { + case LLT_RAID0: { + struct lov_stripe_md *lsm; + int i; + + lsm = lov->lo_lsm; + LASSERT(lsm != NULL); + for (i = 0; i < lsm->lsm_stripe_count; i++) { + struct lov_oinfo *loi = lsm->lsm_oinfo[i]; + + if (lov_oinfo_is_dummy(loi)) + continue; + + if (loi->loi_ar.ar_rc && !rc) + rc = loi->loi_ar.ar_rc; + loi->loi_ar.ar_rc = 0; + } + } + case LLT_RELEASED: + case LLT_EMPTY: + break; + default: + LBUG(); + } + lov_conf_thaw(lov); + } + return rc; +} +EXPORT_SYMBOL(lov_read_and_clear_async_rc); + +/** @} lov */ diff --git a/kernel/drivers/staging/lustre/lustre/lov/lov_offset.c b/kernel/drivers/staging/lustre/lustre/lov/lov_offset.c new file mode 100644 index 000000000..9c8c77c05 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lov/lov_offset.c @@ -0,0 +1,264 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE 
HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "../../include/linux/libcfs/libcfs.h" + +#include "../include/obd_class.h" + +#include "lov_internal.h" + +/* compute object size given "stripeno" and the ost size */ +u64 lov_stripe_size(struct lov_stripe_md *lsm, u64 ost_size, + int stripeno) +{ + unsigned long ssize = lsm->lsm_stripe_size; + unsigned long stripe_size; + u64 swidth; + u64 lov_size; + int magic = lsm->lsm_magic; + + if (ost_size == 0) + return 0; + + LASSERT(lsm_op_find(magic) != NULL); + lsm_op_find(magic)->lsm_stripe_by_index(lsm, &stripeno, NULL, &swidth); + + /* lov_do_div64(a, b) returns a % b, and a = a / b */ + stripe_size = lov_do_div64(ost_size, ssize); + if (stripe_size) + lov_size = ost_size * swidth + stripeno * ssize + stripe_size; + else + lov_size = (ost_size - 1) * swidth + (stripeno + 1) * ssize; + + return lov_size; +} + +/* we have an offset in file backed by an lov and want to find out where + * that offset lands in our given stripe of the file. for the easy + * case where the offset is within the stripe, we just have to scale the + * offset down to make it relative to the stripe instead of the lov. + * + * the harder case is what to do when the offset doesn't intersect the + * stripe. callers will want start offsets clamped ahead to the start + * of the nearest stripe in the file. end offsets similarly clamped to the + * nearest ending byte of a stripe in the file: + * + * all this function does is move offsets to the nearest region of the + * stripe, and it does its work "mod" the full length of all the stripes. 
+ * consider a file with 3 stripes: + * + * S E + * --------------------------------------------------------------------- + * | 0 | 1 | 2 | 0 | 1 | 2 | + * --------------------------------------------------------------------- + * + * to find stripe 1's offsets for S and E, it divides by the full stripe + * width and does its math in the context of a single set of stripes: + * + * S E + * ----------------------------------- + * | 0 | 1 | 2 | + * ----------------------------------- + * + * it'll notice that E is outside stripe 1 and clamp it to the end of the + * stripe, then multiply it back out by lov_off to give the real offsets in + * the stripe: + * + * S E + * --------------------------------------------------------------------- + * | 1 | 1 | 1 | 1 | 1 | 1 | + * --------------------------------------------------------------------- + * + * it would have done similarly and pulled S forward to the start of a 1 + * stripe if, say, S had landed in a 0 stripe. + * + * this rounding isn't always correct. consider an E lov offset that lands + * on a 0 stripe, the "mod stripe width" math will pull it forward to the + * start of a 1 stripe, when in fact it wanted to be rounded back to the end + * of a previous 1 stripe. this logic is handled by callers and this is why: + * + * this function returns < 0 when the offset was "before" the stripe and + * was moved forward to the start of the stripe in question; 0 when it + * falls in the stripe and no shifting was done; > 0 when the offset + * was outside the stripe and was pulled back to its final byte. */ +int lov_stripe_offset(struct lov_stripe_md *lsm, u64 lov_off, + int stripeno, u64 *obdoff) +{ + unsigned long ssize = lsm->lsm_stripe_size; + u64 stripe_off, this_stripe, swidth; + int magic = lsm->lsm_magic; + int ret = 0; + + if (lov_off == OBD_OBJECT_EOF) { + *obdoff = OBD_OBJECT_EOF; + return 0; + } + + LASSERT(lsm_op_find(magic) != NULL); + + lsm_op_find(magic)->lsm_stripe_by_index(lsm, &stripeno, &lov_off, + &swidth); + + /* lov_do_div64(a, b) returns a % b, and a = a / b */ + stripe_off = lov_do_div64(lov_off, swidth); + + this_stripe = (u64)stripeno * ssize; + if (stripe_off < this_stripe) { + stripe_off = 0; + ret = -1; + } else { + stripe_off -= this_stripe; + + if (stripe_off >= ssize) { + stripe_off = ssize; + ret = 1; + } + } + + *obdoff = lov_off * ssize + stripe_off; + return ret; +} + +/* Given a whole-file size and a stripe number, give the file size which + * corresponds to the individual object of that stripe. 
+ * + * This behaves basically in the same was as lov_stripe_offset, except that + * file sizes falling before the beginning of a stripe are clamped to the end + * of the previous stripe, not the beginning of the next: + * + * S + * --------------------------------------------------------------------- + * | 0 | 1 | 2 | 0 | 1 | 2 | + * --------------------------------------------------------------------- + * + * if clamped to stripe 2 becomes: + * + * S + * --------------------------------------------------------------------- + * | 0 | 1 | 2 | 0 | 1 | 2 | + * --------------------------------------------------------------------- + */ +u64 lov_size_to_stripe(struct lov_stripe_md *lsm, u64 file_size, + int stripeno) +{ + unsigned long ssize = lsm->lsm_stripe_size; + u64 stripe_off, this_stripe, swidth; + int magic = lsm->lsm_magic; + + if (file_size == OBD_OBJECT_EOF) + return OBD_OBJECT_EOF; + + LASSERT(lsm_op_find(magic) != NULL); + lsm_op_find(magic)->lsm_stripe_by_index(lsm, &stripeno, &file_size, + &swidth); + + /* lov_do_div64(a, b) returns a % b, and a = a / b */ + stripe_off = lov_do_div64(file_size, swidth); + + this_stripe = (u64)stripeno * ssize; + if (stripe_off < this_stripe) { + /* Move to end of previous stripe, or zero */ + if (file_size > 0) { + file_size--; + stripe_off = ssize; + } else { + stripe_off = 0; + } + } else { + stripe_off -= this_stripe; + + if (stripe_off >= ssize) { + /* Clamp to end of this stripe */ + stripe_off = ssize; + } + } + + return (file_size * ssize + stripe_off); +} + +/* given an extent in an lov and a stripe, calculate the extent of the stripe + * that is contained within the lov extent. this returns true if the given + * stripe does intersect with the lov extent. */ +int lov_stripe_intersects(struct lov_stripe_md *lsm, int stripeno, + u64 start, u64 end, u64 *obd_start, u64 *obd_end) +{ + int start_side, end_side; + + start_side = lov_stripe_offset(lsm, start, stripeno, obd_start); + end_side = lov_stripe_offset(lsm, end, stripeno, obd_end); + + CDEBUG(D_INODE, "[%llu->%llu] -> [(%d) %llu->%llu (%d)]\n", + start, end, start_side, *obd_start, *obd_end, end_side); + + /* this stripe doesn't intersect the file extent when neither + * start or the end intersected the stripe and obd_start and + * obd_end got rounded up to the save value. */ + if (start_side != 0 && end_side != 0 && *obd_start == *obd_end) + return 0; + + /* as mentioned in the lov_stripe_offset commentary, end + * might have been shifted in the wrong direction. This + * happens when an end offset is before the stripe when viewed + * through the "mod stripe size" math. we detect it being shifted + * in the wrong direction and touch it up. + * interestingly, this can't underflow since end must be > start + * if we passed through the previous check. + * (should we assert for that somewhere?) 
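+ *
+ * A worked example with illustrative numbers (not from the original
+ * comment): assume 3 stripes of 1MB each, so the stripe width is
+ * 3MB, and take stripe 1 with the file extent [3.5MB, 5.5MB].
+ * lov_stripe_offset(3.5MB, stripe 1) returns -1 and sets obd_start
+ * to 1MB, since 3.5MB falls in a stripe-0 chunk, while
+ * lov_stripe_offset(5.5MB, stripe 1) returns 1 and sets obd_end to
+ * 2MB, since 5.5MB falls in a stripe-2 chunk.  Because end_side is
+ * non-zero, obd_end is decremented below to 2MB - 1, and the extent
+ * is reported as intersecting stripe 1 over [1MB, 2MB - 1] of its
+ * object, i.e. the chunk backing file range [4MB, 5MB).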
*/ + if (end_side != 0) + (*obd_end)--; + + return 1; +} + +/* compute which stripe number "lov_off" will be written into */ +int lov_stripe_number(struct lov_stripe_md *lsm, u64 lov_off) +{ + unsigned long ssize = lsm->lsm_stripe_size; + u64 stripe_off, swidth; + int magic = lsm->lsm_magic; + + LASSERT(lsm_op_find(magic) != NULL); + lsm_op_find(magic)->lsm_stripe_by_offset(lsm, NULL, &lov_off, &swidth); + + stripe_off = lov_do_div64(lov_off, swidth); + + /* Puts stripe_off/ssize result into stripe_off */ + lov_do_div64(stripe_off, ssize); + + return stripe_off; +} diff --git a/kernel/drivers/staging/lustre/lustre/lov/lov_pack.c b/kernel/drivers/staging/lustre/lustre/lov/lov_pack.c new file mode 100644 index 000000000..5356d5324 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lov/lov_pack.c @@ -0,0 +1,511 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/lov/lov_pack.c + * + * (Un)packing of OST/MDS requests + * + * Author: Andreas Dilger + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "../include/lustre_net.h" +#include "../include/obd.h" +#include "../include/obd_class.h" +#include "../include/obd_support.h" +#include "../include/lustre/lustre_user.h" + +#include "lov_internal.h" + +void lov_dump_lmm_common(int level, void *lmmp) +{ + struct lov_mds_md *lmm = lmmp; + struct ost_id oi; + + lmm_oi_le_to_cpu(&oi, &lmm->lmm_oi); + CDEBUG(level, "objid "DOSTID", magic 0x%08x, pattern %#x\n", + POSTID(&oi), le32_to_cpu(lmm->lmm_magic), + le32_to_cpu(lmm->lmm_pattern)); + CDEBUG(level, "stripe_size %u, stripe_count %u, layout_gen %u\n", + le32_to_cpu(lmm->lmm_stripe_size), + le16_to_cpu(lmm->lmm_stripe_count), + le16_to_cpu(lmm->lmm_layout_gen)); +} + +static void lov_dump_lmm_objects(int level, struct lov_ost_data *lod, + int stripe_count) +{ + int i; + + if (stripe_count > LOV_V1_INSANE_STRIPE_COUNT) { + CDEBUG(level, "bad stripe_count %u > max_stripe_count %u\n", + stripe_count, LOV_V1_INSANE_STRIPE_COUNT); + return; + } + + for (i = 0; i < stripe_count; ++i, ++lod) { + struct ost_id oi; + + ostid_le_to_cpu(&lod->l_ost_oi, &oi); + CDEBUG(level, "stripe %u idx %u subobj "DOSTID"\n", i, + le32_to_cpu(lod->l_ost_idx), POSTID(&oi)); + } +} + +void lov_dump_lmm_v1(int level, struct lov_mds_md_v1 *lmm) +{ + lov_dump_lmm_common(level, lmm); + lov_dump_lmm_objects(level, lmm->lmm_objects, + le16_to_cpu(lmm->lmm_stripe_count)); +} + +void lov_dump_lmm_v3(int level, struct lov_mds_md_v3 *lmm) +{ + lov_dump_lmm_common(level, lmm); + CDEBUG(level, "pool_name "LOV_POOLNAMEF"\n", lmm->lmm_pool_name); + lov_dump_lmm_objects(level, lmm->lmm_objects, + le16_to_cpu(lmm->lmm_stripe_count)); +} + +void lov_dump_lmm(int level, void *lmm) +{ + int magic; + + magic = le32_to_cpu(((struct lov_mds_md *)lmm)->lmm_magic); + switch (magic) { + case LOV_MAGIC_V1: + lov_dump_lmm_v1(level, (struct lov_mds_md_v1 *)lmm); + break; + case LOV_MAGIC_V3: + lov_dump_lmm_v3(level, (struct lov_mds_md_v3 *)lmm); + break; + default: + CDEBUG(level, "unrecognized lmm_magic %x, assuming %x\n", + magic, LOV_MAGIC_V1); + lov_dump_lmm_common(level, lmm); + break; + } +} + +/* Pack LOV object metadata for disk storage. It is packed in LE byte + * order and is opaque to the networking layer. + * + * XXX In the future, this will be enhanced to get the EA size from the + * underlying OSC device(s) to get their EA sizes so we can stack + * LOVs properly. For now lov_mds_md_size() just assumes one u64 + * per stripe. + */ +int lov_packmd(struct obd_export *exp, struct lov_mds_md **lmmp, + struct lov_stripe_md *lsm) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lov_obd *lov = &obd->u.lov; + struct lov_mds_md_v1 *lmmv1; + struct lov_mds_md_v3 *lmmv3; + __u16 stripe_count; + struct lov_ost_data_v1 *lmm_objects; + int lmm_size, lmm_magic; + int i; + int cplen = 0; + + if (lsm) { + lmm_magic = lsm->lsm_magic; + } else { + if (lmmp && *lmmp) + lmm_magic = le32_to_cpu((*lmmp)->lmm_magic); + else + /* lsm == NULL and lmmp == NULL */ + lmm_magic = LOV_MAGIC; + } + + if ((lmm_magic != LOV_MAGIC_V1) && + (lmm_magic != LOV_MAGIC_V3)) { + CERROR("bad mem LOV MAGIC: 0x%08X != 0x%08X nor 0x%08X\n", + lmm_magic, LOV_MAGIC_V1, LOV_MAGIC_V3); + return -EINVAL; + + } + + if (lsm) { + /* If we are just sizing the EA, limit the stripe count + * to the actual number of OSTs in this filesystem. 
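+ * (That is, when lov_packmd() is called with lmmp == NULL purely to
+ * compute the EA size, the stripe count is first adjusted via
+ * lov_get_stripecnt() so that the size returned further below
+ * reflects the count that will actually be used.)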
*/ + if (!lmmp) { + stripe_count = lov_get_stripecnt(lov, lmm_magic, + lsm->lsm_stripe_count); + lsm->lsm_stripe_count = stripe_count; + } else if (!lsm_is_released(lsm)) { + stripe_count = lsm->lsm_stripe_count; + } else { + stripe_count = 0; + } + } else { + /* No need to allocate more than maximum supported stripes. + * Anyway, this is pretty inaccurate since ld_tgt_count now + * represents max index and we should rely on the actual number + * of OSTs instead */ + stripe_count = lov_mds_md_max_stripe_count( + lov->lov_ocd.ocd_max_easize, lmm_magic); + + if (stripe_count > lov->desc.ld_tgt_count) + stripe_count = lov->desc.ld_tgt_count; + } + + /* XXX LOV STACKING call into osc for sizes */ + lmm_size = lov_mds_md_size(stripe_count, lmm_magic); + + if (!lmmp) + return lmm_size; + + if (*lmmp && !lsm) { + stripe_count = le16_to_cpu((*lmmp)->lmm_stripe_count); + lmm_size = lov_mds_md_size(stripe_count, lmm_magic); + OBD_FREE_LARGE(*lmmp, lmm_size); + *lmmp = NULL; + return 0; + } + + if (!*lmmp) { + OBD_ALLOC_LARGE(*lmmp, lmm_size); + if (!*lmmp) + return -ENOMEM; + } + + CDEBUG(D_INFO, "lov_packmd: LOV_MAGIC 0x%08X, lmm_size = %d \n", + lmm_magic, lmm_size); + + lmmv1 = *lmmp; + lmmv3 = (struct lov_mds_md_v3 *)*lmmp; + if (lmm_magic == LOV_MAGIC_V3) + lmmv3->lmm_magic = cpu_to_le32(LOV_MAGIC_V3); + else + lmmv1->lmm_magic = cpu_to_le32(LOV_MAGIC_V1); + + if (!lsm) + return lmm_size; + + /* lmmv1 and lmmv3 point to the same struct and have the + * same first fields + */ + lmm_oi_cpu_to_le(&lmmv1->lmm_oi, &lsm->lsm_oi); + lmmv1->lmm_stripe_size = cpu_to_le32(lsm->lsm_stripe_size); + lmmv1->lmm_stripe_count = cpu_to_le16(stripe_count); + lmmv1->lmm_pattern = cpu_to_le32(lsm->lsm_pattern); + lmmv1->lmm_layout_gen = cpu_to_le16(lsm->lsm_layout_gen); + if (lsm->lsm_magic == LOV_MAGIC_V3) { + cplen = strlcpy(lmmv3->lmm_pool_name, lsm->lsm_pool_name, + sizeof(lmmv3->lmm_pool_name)); + if (cplen >= sizeof(lmmv3->lmm_pool_name)) + return -E2BIG; + lmm_objects = lmmv3->lmm_objects; + } else { + lmm_objects = lmmv1->lmm_objects; + } + + for (i = 0; i < stripe_count; i++) { + struct lov_oinfo *loi = lsm->lsm_oinfo[i]; + /* XXX LOV STACKING call down to osc_packmd() to do packing */ + LASSERTF(ostid_id(&loi->loi_oi) != 0, "lmm_oi "DOSTID + " stripe %u/%u idx %u\n", POSTID(&lmmv1->lmm_oi), + i, stripe_count, loi->loi_ost_idx); + ostid_cpu_to_le(&loi->loi_oi, &lmm_objects[i].l_ost_oi); + lmm_objects[i].l_ost_gen = cpu_to_le32(loi->loi_ost_gen); + lmm_objects[i].l_ost_idx = cpu_to_le32(loi->loi_ost_idx); + } + + return lmm_size; +} + +/* Find the max stripecount we should use */ +__u16 lov_get_stripecnt(struct lov_obd *lov, __u32 magic, __u16 stripe_count) +{ + __u32 max_stripes = LOV_MAX_STRIPE_COUNT_OLD; + + if (!stripe_count) + stripe_count = lov->desc.ld_default_stripe_count; + if (stripe_count > lov->desc.ld_active_tgt_count) + stripe_count = lov->desc.ld_active_tgt_count; + if (!stripe_count) + stripe_count = 1; + + /* stripe count is based on whether ldiskfs can handle + * larger EA sizes */ + if (lov->lov_ocd.ocd_connect_flags & OBD_CONNECT_MAX_EASIZE && + lov->lov_ocd.ocd_max_easize) + max_stripes = lov_mds_md_max_stripe_count( + lov->lov_ocd.ocd_max_easize, magic); + + if (stripe_count > max_stripes) + stripe_count = max_stripes; + + return stripe_count; +} + + +static int lov_verify_lmm(void *lmm, int lmm_bytes, __u16 *stripe_count) +{ + int rc; + + if (lsm_op_find(le32_to_cpu(*(__u32 *)lmm)) == NULL) { + char *buffer; + int sz; + + CERROR("bad disk LOV MAGIC: 0x%08X; dumping LMM (size=%d):\n", 
+ le32_to_cpu(*(__u32 *)lmm), lmm_bytes); + sz = lmm_bytes * 2 + 1; + OBD_ALLOC_LARGE(buffer, sz); + if (buffer != NULL) { + int i; + + for (i = 0; i < lmm_bytes; i++) + sprintf(buffer+2*i, "%.2X", ((char *)lmm)[i]); + buffer[sz - 1] = '\0'; + CERROR("%s\n", buffer); + OBD_FREE_LARGE(buffer, sz); + } + return -EINVAL; + } + rc = lsm_op_find(le32_to_cpu(*(__u32 *)lmm))->lsm_lmm_verify(lmm, + lmm_bytes, stripe_count); + return rc; +} + +int lov_alloc_memmd(struct lov_stripe_md **lsmp, __u16 stripe_count, + int pattern, int magic) +{ + int i, lsm_size; + + CDEBUG(D_INFO, "alloc lsm, stripe_count %d\n", stripe_count); + + *lsmp = lsm_alloc_plain(stripe_count, &lsm_size); + if (!*lsmp) { + CERROR("can't allocate lsmp stripe_count %d\n", stripe_count); + return -ENOMEM; + } + + atomic_set(&(*lsmp)->lsm_refc, 1); + spin_lock_init(&(*lsmp)->lsm_lock); + (*lsmp)->lsm_magic = magic; + (*lsmp)->lsm_stripe_count = stripe_count; + (*lsmp)->lsm_maxbytes = LUSTRE_STRIPE_MAXBYTES * stripe_count; + (*lsmp)->lsm_pattern = pattern; + (*lsmp)->lsm_pool_name[0] = '\0'; + (*lsmp)->lsm_layout_gen = 0; + if (stripe_count > 0) + (*lsmp)->lsm_oinfo[0]->loi_ost_idx = ~0; + + for (i = 0; i < stripe_count; i++) + loi_init((*lsmp)->lsm_oinfo[i]); + + return lsm_size; +} + +int lov_free_memmd(struct lov_stripe_md **lsmp) +{ + struct lov_stripe_md *lsm = *lsmp; + int refc; + + *lsmp = NULL; + LASSERT(atomic_read(&lsm->lsm_refc) > 0); + refc = atomic_dec_return(&lsm->lsm_refc); + if (refc == 0) { + LASSERT(lsm_op_find(lsm->lsm_magic) != NULL); + lsm_op_find(lsm->lsm_magic)->lsm_free(lsm); + } + return refc; +} + + +/* Unpack LOV object metadata from disk storage. It is packed in LE byte + * order and is opaque to the networking layer. + */ +int lov_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp, + struct lov_mds_md *lmm, int lmm_bytes) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lov_obd *lov = &obd->u.lov; + int rc = 0, lsm_size; + __u16 stripe_count; + __u32 magic; + __u32 pattern; + + /* If passed an MDS struct use values from there, otherwise defaults */ + if (lmm) { + rc = lov_verify_lmm(lmm, lmm_bytes, &stripe_count); + if (rc) + return rc; + magic = le32_to_cpu(lmm->lmm_magic); + } else { + magic = LOV_MAGIC; + stripe_count = lov_get_stripecnt(lov, magic, 0); + } + + /* If we aren't passed an lsmp struct, we just want the size */ + if (!lsmp) { + /* XXX LOV STACKING call into osc for sizes */ + LBUG(); + return lov_stripe_md_size(stripe_count); + } + /* If we are passed an allocated struct but nothing to unpack, free */ + if (*lsmp && !lmm) { + lov_free_memmd(lsmp); + return 0; + } + + pattern = le32_to_cpu(lmm->lmm_pattern); + lsm_size = lov_alloc_memmd(lsmp, stripe_count, pattern, magic); + if (lsm_size < 0) + return lsm_size; + + /* If we are passed a pointer but nothing to unpack, we only alloc */ + if (!lmm) + return lsm_size; + + LASSERT(lsm_op_find(magic) != NULL); + rc = lsm_op_find(magic)->lsm_unpackmd(lov, *lsmp, lmm); + if (rc) { + lov_free_memmd(lsmp); + return rc; + } + + return lsm_size; +} + +/* Retrieve object striping information. + * + * @lump is a pointer to an in-core struct with lmm_ost_count indicating + * the maximum number of OST indices which will fit in the user buffer. + * lmm_magic must be LOV_USER_MAGIC. + */ +int lov_getstripe(struct obd_export *exp, struct lov_stripe_md *lsm, + struct lov_user_md *lump) +{ + /* + * XXX huge struct allocated on stack. 
+ */ + /* we use lov_user_md_v3 because it is larger than lov_user_md_v1 */ + struct lov_user_md_v3 lum; + struct lov_mds_md *lmmk = NULL; + int rc, lmm_size; + int lum_size; + mm_segment_t seg; + + if (!lsm) + return -ENODATA; + + /* + * "Switch to kernel segment" to allow copying from kernel space by + * copy_{to,from}_user(). + */ + seg = get_fs(); + set_fs(KERNEL_DS); + + /* we only need the header part from user space to get lmm_magic and + * lmm_stripe_count, (the header part is common to v1 and v3) */ + lum_size = sizeof(struct lov_user_md_v1); + if (copy_from_user(&lum, lump, lum_size)) { + rc = -EFAULT; + goto out_set; + } else if ((lum.lmm_magic != LOV_USER_MAGIC) && + (lum.lmm_magic != LOV_USER_MAGIC_V3)) { + rc = -EINVAL; + goto out_set; + } + + if (lum.lmm_stripe_count && + (lum.lmm_stripe_count < lsm->lsm_stripe_count)) { + /* Return right size of stripe to user */ + lum.lmm_stripe_count = lsm->lsm_stripe_count; + rc = copy_to_user(lump, &lum, lum_size); + rc = -EOVERFLOW; + goto out_set; + } + rc = lov_packmd(exp, &lmmk, lsm); + if (rc < 0) + goto out_set; + lmm_size = rc; + rc = 0; + + /* FIXME: Bug 1185 - copy fields properly when structs change */ + /* struct lov_user_md_v3 and struct lov_mds_md_v3 must be the same */ + CLASSERT(sizeof(lum) == sizeof(struct lov_mds_md_v3)); + CLASSERT(sizeof(lum.lmm_objects[0]) == sizeof(lmmk->lmm_objects[0])); + + if ((cpu_to_le32(LOV_MAGIC) != LOV_MAGIC) && + ((lmmk->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) || + (lmmk->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)))) { + lustre_swab_lov_mds_md(lmmk); + lustre_swab_lov_user_md_objects( + (struct lov_user_ost_data *)lmmk->lmm_objects, + lmmk->lmm_stripe_count); + } + if (lum.lmm_magic == LOV_USER_MAGIC) { + /* User request for v1, we need skip lmm_pool_name */ + if (lmmk->lmm_magic == LOV_MAGIC_V3) { + memmove((char *)(&lmmk->lmm_stripe_count) + + sizeof(lmmk->lmm_stripe_count), + ((struct lov_mds_md_v3 *)lmmk)->lmm_objects, + lmmk->lmm_stripe_count * + sizeof(struct lov_ost_data_v1)); + lmm_size -= LOV_MAXPOOLNAME; + } + } else { + /* if v3 we just have to update the lum_size */ + lum_size = sizeof(struct lov_user_md_v3); + } + + /* User wasn't expecting this many OST entries */ + if (lum.lmm_stripe_count == 0) + lmm_size = lum_size; + else if (lum.lmm_stripe_count < lmmk->lmm_stripe_count) { + rc = -EOVERFLOW; + goto out_set; + } + /* + * Have a difference between lov_mds_md & lov_user_md. + * So we have to re-order the data before copy to user. + */ + lum.lmm_stripe_count = lmmk->lmm_stripe_count; + lum.lmm_layout_gen = lmmk->lmm_layout_gen; + ((struct lov_user_md *)lmmk)->lmm_layout_gen = lum.lmm_layout_gen; + ((struct lov_user_md *)lmmk)->lmm_stripe_count = lum.lmm_stripe_count; + if (copy_to_user(lump, lmmk, lmm_size)) + rc = -EFAULT; + + obd_free_diskmd(exp, &lmmk); +out_set: + set_fs(seg); + return rc; +} diff --git a/kernel/drivers/staging/lustre/lustre/lov/lov_page.c b/kernel/drivers/staging/lustre/lustre/lov/lov_page.c new file mode 100644 index 000000000..c4596e8e5 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lov/lov_page.c @@ -0,0 +1,232 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_page for LOV layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov + * @{ + */ + +/***************************************************************************** + * + * Lov page operations. + * + */ + +static int lov_page_invariant(const struct cl_page_slice *slice) +{ + const struct cl_page *page = slice->cpl_page; + const struct cl_page *sub = lov_sub_page(slice); + + return ergo(sub != NULL, + page->cp_child == sub && + sub->cp_parent == page && + page->cp_state == sub->cp_state); +} + +static void lov_page_fini(const struct lu_env *env, + struct cl_page_slice *slice) +{ + struct cl_page *sub = lov_sub_page(slice); + + LINVRNT(lov_page_invariant(slice)); + + if (sub != NULL) { + LASSERT(sub->cp_state == CPS_FREEING); + lu_ref_del(&sub->cp_reference, "lov", sub->cp_parent); + sub->cp_parent = NULL; + slice->cpl_page->cp_child = NULL; + cl_page_put(env, sub); + } +} + +static int lov_page_own(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *io, + int nonblock) +{ + struct lov_io *lio = lov_env_io(env); + struct lov_io_sub *sub; + + LINVRNT(lov_page_invariant(slice)); + LINVRNT(!cl2lov_page(slice)->lps_invalid); + + sub = lov_page_subio(env, lio, slice); + if (!IS_ERR(sub)) { + lov_sub_page(slice)->cp_owner = sub->sub_io; + lov_sub_put(sub); + } else + LBUG(); /* Arrgh */ + return 0; +} + +static void lov_page_assume(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *io) +{ + lov_page_own(env, slice, io, 0); +} + +static int lov_page_cache_add(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io) +{ + struct lov_io *lio = lov_env_io(env); + struct lov_io_sub *sub; + int rc = 0; + + LINVRNT(lov_page_invariant(slice)); + LINVRNT(!cl2lov_page(slice)->lps_invalid); + + sub = lov_page_subio(env, lio, slice); + if (!IS_ERR(sub)) { + rc = cl_page_cache_add(sub->sub_env, sub->sub_io, + slice->cpl_page->cp_child, CRT_WRITE); + lov_sub_put(sub); + } else { + rc = PTR_ERR(sub); + CL_PAGE_DEBUG(D_ERROR, env, slice->cpl_page, "rc = %d\n", rc); + } + return rc; +} + +static int lov_page_print(const struct lu_env *env, + const struct cl_page_slice *slice, + void *cookie, lu_printer_t printer) +{ + struct lov_page *lp = cl2lov_page(slice); + + return (*printer)(env, cookie, LUSTRE_LOV_NAME"-page@%p\n", lp); +} + +static const struct cl_page_operations lov_page_ops = { + .cpo_fini = lov_page_fini, + .cpo_own = 
lov_page_own, + .cpo_assume = lov_page_assume, + .io = { + [CRT_WRITE] = { + .cpo_cache_add = lov_page_cache_add + } + }, + .cpo_print = lov_page_print +}; + +static void lov_empty_page_fini(const struct lu_env *env, + struct cl_page_slice *slice) +{ + LASSERT(slice->cpl_page->cp_child == NULL); +} + +int lov_page_init_raid0(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, struct page *vmpage) +{ + struct lov_object *loo = cl2lov(obj); + struct lov_layout_raid0 *r0 = lov_r0(loo); + struct lov_io *lio = lov_env_io(env); + struct cl_page *subpage; + struct cl_object *subobj; + struct lov_io_sub *sub; + struct lov_page *lpg = cl_object_page_slice(obj, page); + loff_t offset; + u64 suboff; + int stripe; + int rc; + + offset = cl_offset(obj, page->cp_index); + stripe = lov_stripe_number(loo->lo_lsm, offset); + LASSERT(stripe < r0->lo_nr); + rc = lov_stripe_offset(loo->lo_lsm, offset, stripe, + &suboff); + LASSERT(rc == 0); + + lpg->lps_invalid = 1; + cl_page_slice_add(page, &lpg->lps_cl, obj, &lov_page_ops); + + sub = lov_sub_get(env, lio, stripe); + if (IS_ERR(sub)) { + rc = PTR_ERR(sub); + goto out; + } + + subobj = lovsub2cl(r0->lo_sub[stripe]); + subpage = cl_page_find_sub(sub->sub_env, subobj, + cl_index(subobj, suboff), vmpage, page); + lov_sub_put(sub); + if (IS_ERR(subpage)) { + rc = PTR_ERR(subpage); + goto out; + } + + if (likely(subpage->cp_parent == page)) { + lu_ref_add(&subpage->cp_reference, "lov", page); + lpg->lps_invalid = 0; + rc = 0; + } else { + CL_PAGE_DEBUG(D_ERROR, env, page, "parent page\n"); + CL_PAGE_DEBUG(D_ERROR, env, subpage, "child page\n"); + LASSERT(0); + } + +out: + return rc; +} + + +static const struct cl_page_operations lov_empty_page_ops = { + .cpo_fini = lov_empty_page_fini, + .cpo_print = lov_page_print +}; + +int lov_page_init_empty(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, struct page *vmpage) +{ + struct lov_page *lpg = cl_object_page_slice(obj, page); + void *addr; + + cl_page_slice_add(page, &lpg->lps_cl, obj, &lov_empty_page_ops); + addr = kmap(vmpage); + memset(addr, 0, cl_page_size(obj)); + kunmap(vmpage); + cl_page_export(env, page, 1); + return 0; +} + + +/** @} lov */ diff --git a/kernel/drivers/staging/lustre/lustre/lov/lov_pool.c b/kernel/drivers/staging/lustre/lustre/lov/lov_pool.c new file mode 100644 index 000000000..d96163de7 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lov/lov_pool.c @@ -0,0 +1,673 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see [sun.com URL with a + * copy of GPLv2]. + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. 
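lov_page_init_raid0() above relies on lov_stripe_number()/lov_stripe_offset(), defined elsewhere in the LOV code, to turn a file offset into a stripe index and an offset inside that stripe's object. For the plain RAID0 pattern this is the usual round-robin mapping; a small standalone sketch with example parameters (the stripe size and count here are arbitrary, not taken from a real lsm):

	#include <stdio.h>

	int main(void)
	{
		unsigned long long stripe_size  = 1ULL << 20;	/* 1 MiB */
		unsigned int       stripe_count = 4;
		unsigned long long offsets[] = { 0, 123456, 5ULL << 20, 42ULL << 20 };

		for (unsigned i = 0; i < sizeof(offsets) / sizeof(offsets[0]); i++) {
			unsigned long long off    = offsets[i];
			unsigned long long chunk  = off / stripe_size;
			unsigned int       stripe = chunk % stripe_count;
			/* offset inside the chosen stripe object */
			unsigned long long suboff =
				(chunk / stripe_count) * stripe_size +
				off % stripe_size;

			printf("file off %12llu -> stripe %u, sub off %llu\n",
			       off, stripe, suboff);
		}
		return 0;
	}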
+ * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/lov/lov_pool.c + * + * OST pool methods + * + * Author: Jacques-Charles LAFOUCRIERE + * Author: Alex Lyashkov + * Author: Nathaniel Rutman + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "../../include/linux/libcfs/libcfs.h" + +#include "../include/obd.h" +#include "lov_internal.h" + +#define pool_tgt(_p, _i) \ + _p->pool_lobd->u.lov.lov_tgts[_p->pool_obds.op_array[_i]] + +static void lov_pool_getref(struct pool_desc *pool) +{ + CDEBUG(D_INFO, "pool %p\n", pool); + atomic_inc(&pool->pool_refcount); +} + +void lov_pool_putref(struct pool_desc *pool) +{ + CDEBUG(D_INFO, "pool %p\n", pool); + if (atomic_dec_and_test(&pool->pool_refcount)) { + LASSERT(hlist_unhashed(&pool->pool_hash)); + LASSERT(list_empty(&pool->pool_list)); + LASSERT(pool->pool_proc_entry == NULL); + lov_ost_pool_free(&(pool->pool_rr.lqr_pool)); + lov_ost_pool_free(&(pool->pool_obds)); + OBD_FREE_PTR(pool); + } +} + +static void lov_pool_putref_locked(struct pool_desc *pool) +{ + CDEBUG(D_INFO, "pool %p\n", pool); + LASSERT(atomic_read(&pool->pool_refcount) > 1); + + atomic_dec(&pool->pool_refcount); +} + +/* + * hash function using a Rotating Hash algorithm + * Knuth, D. The Art of Computer Programming, + * Volume 3: Sorting and Searching, + * Chapter 6.4. + * Addison Wesley, 1973 + */ +static __u32 pool_hashfn(struct cfs_hash *hash_body, const void *key, unsigned mask) +{ + int i; + __u32 result; + char *poolname; + + result = 0; + poolname = (char *)key; + for (i = 0; i < LOV_MAXPOOLNAME; i++) { + if (poolname[i] == '\0') + break; + result = (result << 4)^(result >> 28) ^ poolname[i]; + } + return (result % mask); +} + +static void *pool_key(struct hlist_node *hnode) +{ + struct pool_desc *pool; + + pool = hlist_entry(hnode, struct pool_desc, pool_hash); + return pool->pool_name; +} + +static int pool_hashkey_keycmp(const void *key, struct hlist_node *compared_hnode) +{ + char *pool_name; + struct pool_desc *pool; + + pool_name = (char *)key; + pool = hlist_entry(compared_hnode, struct pool_desc, pool_hash); + return !strncmp(pool_name, pool->pool_name, LOV_MAXPOOLNAME); +} + +static void *pool_hashobject(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct pool_desc, pool_hash); +} + +static void pool_hashrefcount_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct pool_desc *pool; + + pool = hlist_entry(hnode, struct pool_desc, pool_hash); + lov_pool_getref(pool); +} + +static void pool_hashrefcount_put_locked(struct cfs_hash *hs, + struct hlist_node *hnode) +{ + struct pool_desc *pool; + + pool = hlist_entry(hnode, struct pool_desc, pool_hash); + lov_pool_putref_locked(pool); +} + +cfs_hash_ops_t pool_hash_operations = { + .hs_hash = pool_hashfn, + .hs_key = pool_key, + .hs_keycmp = pool_hashkey_keycmp, + .hs_object = pool_hashobject, + .hs_get = pool_hashrefcount_get, + .hs_put_locked = pool_hashrefcount_put_locked, + +}; + +#if defined (CONFIG_PROC_FS) +/* ifdef needed for liblustre support */ +/* + * pool /proc seq_file methods + */ +/* + * iterator is used to go through the target pool entries + * index is the current entry index in the lp_array[] array + * index >= pos returned to the seq_file interface + * pos is from 0 to (pool->pool_obds.op_count - 1) + */ +#define POOL_IT_MAGIC 0xB001CEA0 +struct pool_iterator { + int magic; + struct pool_desc *pool; + int idx; /* from 0 to 
pool_tgt_size - 1 */ +}; + +static void *pool_proc_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct pool_iterator *iter = (struct pool_iterator *)s->private; + int prev_idx; + + LASSERTF(iter->magic == POOL_IT_MAGIC, "%08X", iter->magic); + + /* test if end of file */ + if (*pos >= pool_tgt_count(iter->pool)) + return NULL; + + /* iterate to find a non empty entry */ + prev_idx = iter->idx; + down_read(&pool_tgt_rw_sem(iter->pool)); + iter->idx++; + if (iter->idx == pool_tgt_count(iter->pool)) { + iter->idx = prev_idx; /* we stay on the last entry */ + up_read(&pool_tgt_rw_sem(iter->pool)); + return NULL; + } + up_read(&pool_tgt_rw_sem(iter->pool)); + (*pos)++; + /* return != NULL to continue */ + return iter; +} + +static void *pool_proc_start(struct seq_file *s, loff_t *pos) +{ + struct pool_desc *pool = (struct pool_desc *)s->private; + struct pool_iterator *iter; + + lov_pool_getref(pool); + if ((pool_tgt_count(pool) == 0) || + (*pos >= pool_tgt_count(pool))) { + /* iter is not created, so stop() has no way to + * find pool to dec ref */ + lov_pool_putref(pool); + return NULL; + } + + OBD_ALLOC_PTR(iter); + if (!iter) + return ERR_PTR(-ENOMEM); + iter->magic = POOL_IT_MAGIC; + iter->pool = pool; + iter->idx = 0; + + /* we use seq_file private field to memorized iterator so + * we can free it at stop() */ + /* /!\ do not forget to restore it to pool before freeing it */ + s->private = iter; + if (*pos > 0) { + loff_t i; + void *ptr; + + i = 0; + do { + ptr = pool_proc_next(s, &iter, &i); + } while ((i < *pos) && (ptr != NULL)); + return ptr; + } + return iter; +} + +static void pool_proc_stop(struct seq_file *s, void *v) +{ + struct pool_iterator *iter = (struct pool_iterator *)s->private; + + /* in some cases stop() method is called 2 times, without + * calling start() method (see seq_read() from fs/seq_file.c) + * we have to free only if s->private is an iterator */ + if ((iter) && (iter->magic == POOL_IT_MAGIC)) { + /* we restore s->private so next call to pool_proc_start() + * will work */ + s->private = iter->pool; + lov_pool_putref(iter->pool); + OBD_FREE_PTR(iter); + } + return; +} + +static int pool_proc_show(struct seq_file *s, void *v) +{ + struct pool_iterator *iter = (struct pool_iterator *)v; + struct lov_tgt_desc *tgt; + + LASSERTF(iter->magic == POOL_IT_MAGIC, "%08X", iter->magic); + LASSERT(iter->pool != NULL); + LASSERT(iter->idx <= pool_tgt_count(iter->pool)); + + down_read(&pool_tgt_rw_sem(iter->pool)); + tgt = pool_tgt(iter->pool, iter->idx); + up_read(&pool_tgt_rw_sem(iter->pool)); + if (tgt) + seq_printf(s, "%s\n", obd_uuid2str(&(tgt->ltd_uuid))); + + return 0; +} + +static struct seq_operations pool_proc_ops = { + .start = pool_proc_start, + .next = pool_proc_next, + .stop = pool_proc_stop, + .show = pool_proc_show, +}; + +static int pool_proc_open(struct inode *inode, struct file *file) +{ + int rc; + + rc = seq_open(file, &pool_proc_ops); + if (!rc) { + struct seq_file *s = file->private_data; + s->private = PDE_DATA(inode); + } + return rc; +} + +static struct file_operations pool_proc_operations = { + .open = pool_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; +#endif /* CONFIG_PROC_FS */ + +void lov_dump_pool(int level, struct pool_desc *pool) +{ + int i; + + lov_pool_getref(pool); + + CDEBUG(level, "pool "LOV_POOLNAMEF" has %d members\n", + pool->pool_name, pool->pool_obds.op_count); + down_read(&pool_tgt_rw_sem(pool)); + + for (i = 0; i < pool_tgt_count(pool) ; i++) { + if (!pool_tgt(pool, i) || 
!(pool_tgt(pool, i))->ltd_exp) + continue; + CDEBUG(level, "pool "LOV_POOLNAMEF"[%d] = %s\n", + pool->pool_name, i, + obd_uuid2str(&((pool_tgt(pool, i))->ltd_uuid))); + } + + up_read(&pool_tgt_rw_sem(pool)); + lov_pool_putref(pool); +} + +#define LOV_POOL_INIT_COUNT 2 +int lov_ost_pool_init(struct ost_pool *op, unsigned int count) +{ + if (count == 0) + count = LOV_POOL_INIT_COUNT; + op->op_array = NULL; + op->op_count = 0; + init_rwsem(&op->op_rw_sem); + op->op_size = count; + OBD_ALLOC(op->op_array, op->op_size * sizeof(op->op_array[0])); + if (op->op_array == NULL) { + op->op_size = 0; + return -ENOMEM; + } + return 0; +} + +/* Caller must hold write op_rwlock */ +int lov_ost_pool_extend(struct ost_pool *op, unsigned int min_count) +{ + __u32 *new; + int new_size; + + LASSERT(min_count != 0); + + if (op->op_count < op->op_size) + return 0; + + new_size = max(min_count, 2 * op->op_size); + OBD_ALLOC(new, new_size * sizeof(op->op_array[0])); + if (new == NULL) + return -ENOMEM; + + /* copy old array to new one */ + memcpy(new, op->op_array, op->op_size * sizeof(op->op_array[0])); + OBD_FREE(op->op_array, op->op_size * sizeof(op->op_array[0])); + op->op_array = new; + op->op_size = new_size; + return 0; +} + +int lov_ost_pool_add(struct ost_pool *op, __u32 idx, unsigned int min_count) +{ + int rc = 0, i; + + down_write(&op->op_rw_sem); + + rc = lov_ost_pool_extend(op, min_count); + if (rc) + goto out; + + /* search ost in pool array */ + for (i = 0; i < op->op_count; i++) { + if (op->op_array[i] == idx) { + rc = -EEXIST; + goto out; + } + } + /* ost not found we add it */ + op->op_array[op->op_count] = idx; + op->op_count++; +out: + up_write(&op->op_rw_sem); + return rc; +} + +int lov_ost_pool_remove(struct ost_pool *op, __u32 idx) +{ + int i; + + down_write(&op->op_rw_sem); + + for (i = 0; i < op->op_count; i++) { + if (op->op_array[i] == idx) { + memmove(&op->op_array[i], &op->op_array[i + 1], + (op->op_count - i - 1) * sizeof(op->op_array[0])); + op->op_count--; + up_write(&op->op_rw_sem); + return 0; + } + } + + up_write(&op->op_rw_sem); + return -EINVAL; +} + +int lov_ost_pool_free(struct ost_pool *op) +{ + if (op->op_size == 0) + return 0; + + down_write(&op->op_rw_sem); + + OBD_FREE(op->op_array, op->op_size * sizeof(op->op_array[0])); + op->op_array = NULL; + op->op_count = 0; + op->op_size = 0; + + up_write(&op->op_rw_sem); + return 0; +} + + +int lov_pool_new(struct obd_device *obd, char *poolname) +{ + struct lov_obd *lov; + struct pool_desc *new_pool; + int rc; + + lov = &(obd->u.lov); + + if (strlen(poolname) > LOV_MAXPOOLNAME) + return -ENAMETOOLONG; + + OBD_ALLOC_PTR(new_pool); + if (new_pool == NULL) + return -ENOMEM; + + strncpy(new_pool->pool_name, poolname, LOV_MAXPOOLNAME); + new_pool->pool_name[LOV_MAXPOOLNAME] = '\0'; + new_pool->pool_lobd = obd; + /* ref count init to 1 because when created a pool is always used + * up to deletion + */ + atomic_set(&new_pool->pool_refcount, 1); + rc = lov_ost_pool_init(&new_pool->pool_obds, 0); + if (rc) + goto out_err; + + memset(&(new_pool->pool_rr), 0, sizeof(struct lov_qos_rr)); + rc = lov_ost_pool_init(&new_pool->pool_rr.lqr_pool, 0); + if (rc) + goto out_free_pool_obds; + + INIT_HLIST_NODE(&new_pool->pool_hash); + +#if defined (CONFIG_PROC_FS) + /* we need this assert seq_file is not implemented for liblustre */ + /* get ref for /proc file */ + lov_pool_getref(new_pool); + new_pool->pool_proc_entry = lprocfs_add_simple(lov->lov_pool_proc_entry, + poolname, new_pool, + &pool_proc_operations); + if 
(IS_ERR(new_pool->pool_proc_entry)) { + CWARN("Cannot add proc pool entry "LOV_POOLNAMEF"\n", poolname); + new_pool->pool_proc_entry = NULL; + lov_pool_putref(new_pool); + } + CDEBUG(D_INFO, "pool %p - proc %p\n", new_pool, new_pool->pool_proc_entry); +#endif + + spin_lock(&obd->obd_dev_lock); + list_add_tail(&new_pool->pool_list, &lov->lov_pool_list); + lov->lov_pool_count++; + spin_unlock(&obd->obd_dev_lock); + + /* add to find only when it fully ready */ + rc = cfs_hash_add_unique(lov->lov_pools_hash_body, poolname, + &new_pool->pool_hash); + if (rc) { + rc = -EEXIST; + goto out_err; + } + + CDEBUG(D_CONFIG, LOV_POOLNAMEF" is pool #%d\n", + poolname, lov->lov_pool_count); + + return 0; + +out_err: + spin_lock(&obd->obd_dev_lock); + list_del_init(&new_pool->pool_list); + lov->lov_pool_count--; + spin_unlock(&obd->obd_dev_lock); + + lprocfs_remove(&new_pool->pool_proc_entry); + + lov_ost_pool_free(&new_pool->pool_rr.lqr_pool); +out_free_pool_obds: + lov_ost_pool_free(&new_pool->pool_obds); + OBD_FREE_PTR(new_pool); + return rc; +} + +int lov_pool_del(struct obd_device *obd, char *poolname) +{ + struct lov_obd *lov; + struct pool_desc *pool; + + lov = &(obd->u.lov); + + /* lookup and kill hash reference */ + pool = cfs_hash_del_key(lov->lov_pools_hash_body, poolname); + if (pool == NULL) + return -ENOENT; + + if (pool->pool_proc_entry != NULL) { + CDEBUG(D_INFO, "proc entry %p\n", pool->pool_proc_entry); + lprocfs_remove(&pool->pool_proc_entry); + lov_pool_putref(pool); + } + + spin_lock(&obd->obd_dev_lock); + list_del_init(&pool->pool_list); + lov->lov_pool_count--; + spin_unlock(&obd->obd_dev_lock); + + /* release last reference */ + lov_pool_putref(pool); + + return 0; +} + + +int lov_pool_add(struct obd_device *obd, char *poolname, char *ostname) +{ + struct obd_uuid ost_uuid; + struct lov_obd *lov; + struct pool_desc *pool; + unsigned int lov_idx; + int rc; + + lov = &(obd->u.lov); + + pool = cfs_hash_lookup(lov->lov_pools_hash_body, poolname); + if (pool == NULL) + return -ENOENT; + + obd_str2uuid(&ost_uuid, ostname); + + + /* search ost in lov array */ + obd_getref(obd); + for (lov_idx = 0; lov_idx < lov->desc.ld_tgt_count; lov_idx++) { + if (!lov->lov_tgts[lov_idx]) + continue; + if (obd_uuid_equals(&ost_uuid, + &(lov->lov_tgts[lov_idx]->ltd_uuid))) + break; + } + /* test if ost found in lov */ + if (lov_idx == lov->desc.ld_tgt_count) { + rc = -EINVAL; + goto out; + } + + rc = lov_ost_pool_add(&pool->pool_obds, lov_idx, lov->lov_tgt_size); + if (rc) + goto out; + + pool->pool_rr.lqr_dirty = 1; + + CDEBUG(D_CONFIG, "Added %s to "LOV_POOLNAMEF" as member %d\n", + ostname, poolname, pool_tgt_count(pool)); + +out: + obd_putref(obd); + lov_pool_putref(pool); + return rc; +} + +int lov_pool_remove(struct obd_device *obd, char *poolname, char *ostname) +{ + struct obd_uuid ost_uuid; + struct lov_obd *lov; + struct pool_desc *pool; + unsigned int lov_idx; + int rc = 0; + + lov = &(obd->u.lov); + + pool = cfs_hash_lookup(lov->lov_pools_hash_body, poolname); + if (pool == NULL) + return -ENOENT; + + obd_str2uuid(&ost_uuid, ostname); + + obd_getref(obd); + /* search ost in lov array, to get index */ + for (lov_idx = 0; lov_idx < lov->desc.ld_tgt_count; lov_idx++) { + if (!lov->lov_tgts[lov_idx]) + continue; + + if (obd_uuid_equals(&ost_uuid, + &(lov->lov_tgts[lov_idx]->ltd_uuid))) + break; + } + + /* test if ost found in lov */ + if (lov_idx == lov->desc.ld_tgt_count) { + rc = -EINVAL; + goto out; + } + + lov_ost_pool_remove(&pool->pool_obds, lov_idx); + + pool->pool_rr.lqr_dirty = 1; + + 
CDEBUG(D_CONFIG, "%s removed from "LOV_POOLNAMEF"\n", ostname, + poolname); + +out: + obd_putref(obd); + lov_pool_putref(pool); + return rc; +} + +int lov_check_index_in_pool(__u32 idx, struct pool_desc *pool) +{ + int i, rc; + + /* caller may no have a ref on pool if it got the pool + * without calling lov_find_pool() (e.g. go through the lov pool + * list) + */ + lov_pool_getref(pool); + + down_read(&pool_tgt_rw_sem(pool)); + + for (i = 0; i < pool_tgt_count(pool); i++) { + if (pool_tgt_array(pool)[i] == idx) { + rc = 0; + goto out; + } + } + rc = -ENOENT; +out: + up_read(&pool_tgt_rw_sem(pool)); + + lov_pool_putref(pool); + return rc; +} + +struct pool_desc *lov_find_pool(struct lov_obd *lov, char *poolname) +{ + struct pool_desc *pool; + + pool = NULL; + if (poolname[0] != '\0') { + pool = cfs_hash_lookup(lov->lov_pools_hash_body, poolname); + if (pool == NULL) + CWARN("Request for an unknown pool ("LOV_POOLNAMEF")\n", + poolname); + if ((pool != NULL) && (pool_tgt_count(pool) == 0)) { + CWARN("Request for an empty pool ("LOV_POOLNAMEF")\n", + poolname); + /* pool is ignored, so we remove ref on it */ + lov_pool_putref(pool); + pool = NULL; + } + } + return pool; +} diff --git a/kernel/drivers/staging/lustre/lustre/lov/lov_request.c b/kernel/drivers/staging/lustre/lustre/lov/lov_request.c new file mode 100644 index 000000000..933e2d1f8 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lov/lov_request.c @@ -0,0 +1,773 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "../../include/linux/libcfs/libcfs.h" + +#include "../include/obd_class.h" +#include "../include/lustre/lustre_idl.h" +#include "lov_internal.h" + +static void lov_init_set(struct lov_request_set *set) +{ + set->set_count = 0; + atomic_set(&set->set_completes, 0); + atomic_set(&set->set_success, 0); + atomic_set(&set->set_finish_checked, 0); + set->set_cookies = NULL; + INIT_LIST_HEAD(&set->set_list); + atomic_set(&set->set_refcount, 1); + init_waitqueue_head(&set->set_waitq); + spin_lock_init(&set->set_lock); +} + +void lov_finish_set(struct lov_request_set *set) +{ + struct list_head *pos, *n; + + LASSERT(set); + list_for_each_safe(pos, n, &set->set_list) { + struct lov_request *req = list_entry(pos, + struct lov_request, + rq_link); + list_del_init(&req->rq_link); + + if (req->rq_oi.oi_oa) + OBDO_FREE(req->rq_oi.oi_oa); + if (req->rq_oi.oi_md) + OBD_FREE_LARGE(req->rq_oi.oi_md, req->rq_buflen); + if (req->rq_oi.oi_osfs) + OBD_FREE(req->rq_oi.oi_osfs, + sizeof(*req->rq_oi.oi_osfs)); + OBD_FREE(req, sizeof(*req)); + } + + if (set->set_pga) { + int len = set->set_oabufs * sizeof(*set->set_pga); + OBD_FREE_LARGE(set->set_pga, len); + } + if (set->set_lockh) + lov_llh_put(set->set_lockh); + + OBD_FREE(set, sizeof(*set)); +} + +int lov_set_finished(struct lov_request_set *set, int idempotent) +{ + int completes = atomic_read(&set->set_completes); + + CDEBUG(D_INFO, "check set %d/%d\n", completes, set->set_count); + + if (completes == set->set_count) { + if (idempotent) + return 1; + if (atomic_inc_return(&set->set_finish_checked) == 1) + return 1; + } + return 0; +} + +void lov_update_set(struct lov_request_set *set, + struct lov_request *req, int rc) +{ + req->rq_complete = 1; + req->rq_rc = rc; + + atomic_inc(&set->set_completes); + if (rc == 0) + atomic_inc(&set->set_success); + + wake_up(&set->set_waitq); +} + +int lov_update_common_set(struct lov_request_set *set, + struct lov_request *req, int rc) +{ + struct lov_obd *lov = &set->set_exp->exp_obd->u.lov; + + lov_update_set(set, req, rc); + + /* grace error on inactive ost */ + if (rc && !(lov->lov_tgts[req->rq_idx] && + lov->lov_tgts[req->rq_idx]->ltd_active)) + rc = 0; + + /* FIXME in raid1 regime, should return 0 */ + return rc; +} + +void lov_set_add_req(struct lov_request *req, struct lov_request_set *set) +{ + list_add_tail(&req->rq_link, &set->set_list); + set->set_count++; + req->rq_rqset = set; +} + +static int lov_check_set(struct lov_obd *lov, int idx) +{ + int rc; + struct lov_tgt_desc *tgt; + + mutex_lock(&lov->lov_lock); + tgt = lov->lov_tgts[idx]; + rc = !tgt || tgt->ltd_active || + (tgt->ltd_exp && + class_exp2cliimp(tgt->ltd_exp)->imp_connect_tried); + mutex_unlock(&lov->lov_lock); + + return rc; +} + +/* Check if the OSC connection exists and is active. + * If the OSC has not yet had a chance to connect to the OST the first time, + * wait once for it to connect instead of returning an error. 
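lov_init_set(), lov_update_set() and lov_set_finished() above implement the completion accounting for a request set: each finished sub-request bumps set_completes, and once all set_count of them have completed, set_finish_checked ensures that only one caller finalizes a non-idempotent set. A compact sketch of that check with C11 atomics (single-threaded here just to show the counting; the field names mirror the kernel ones but the types are plain stand-ins):

	#include <stdatomic.h>
	#include <stdio.h>

	struct set { int count; atomic_int completes, finish_checked; };

	/* mirrors lov_set_finished(): when not idempotent, only the first
	 * caller that observes a fully completed set gets a non-zero result */
	static int set_finished(struct set *s, int idempotent)
	{
		if (atomic_load(&s->completes) != s->count)
			return 0;
		if (idempotent)
			return 1;
		return atomic_fetch_add(&s->finish_checked, 1) == 0;
	}

	int main(void)
	{
		struct set s = { 3, 0, 0 };

		for (int i = 0; i < 3; i++)
			atomic_fetch_add(&s.completes, 1);	/* as in lov_update_set() */

		printf("%d %d\n", set_finished(&s, 0), set_finished(&s, 0));	/* 1 0 */
		return 0;
	}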
+ */ +int lov_check_and_wait_active(struct lov_obd *lov, int ost_idx) +{ + wait_queue_head_t waitq; + struct l_wait_info lwi; + struct lov_tgt_desc *tgt; + int rc = 0; + + mutex_lock(&lov->lov_lock); + + tgt = lov->lov_tgts[ost_idx]; + + if (unlikely(tgt == NULL)) { + rc = 0; + goto out; + } + + if (likely(tgt->ltd_active)) { + rc = 1; + goto out; + } + + if (tgt->ltd_exp && class_exp2cliimp(tgt->ltd_exp)->imp_connect_tried) { + rc = 0; + goto out; + } + + mutex_unlock(&lov->lov_lock); + + init_waitqueue_head(&waitq); + lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(obd_timeout), + cfs_time_seconds(1), NULL, NULL); + + rc = l_wait_event(waitq, lov_check_set(lov, ost_idx), &lwi); + if (tgt != NULL && tgt->ltd_active) + return 1; + + return 0; + +out: + mutex_unlock(&lov->lov_lock); + return rc; +} + +static int common_attr_done(struct lov_request_set *set) +{ + struct list_head *pos; + struct lov_request *req; + struct obdo *tmp_oa; + int rc = 0, attrset = 0; + + LASSERT(set->set_oi != NULL); + + if (set->set_oi->oi_oa == NULL) + return 0; + + if (!atomic_read(&set->set_success)) + return -EIO; + + OBDO_ALLOC(tmp_oa); + if (tmp_oa == NULL) { + rc = -ENOMEM; + goto out; + } + + list_for_each(pos, &set->set_list) { + req = list_entry(pos, struct lov_request, rq_link); + + if (!req->rq_complete || req->rq_rc) + continue; + if (req->rq_oi.oi_oa->o_valid == 0) /* inactive stripe */ + continue; + lov_merge_attrs(tmp_oa, req->rq_oi.oi_oa, + req->rq_oi.oi_oa->o_valid, + set->set_oi->oi_md, req->rq_stripe, &attrset); + } + if (!attrset) { + CERROR("No stripes had valid attrs\n"); + rc = -EIO; + } + if ((set->set_oi->oi_oa->o_valid & OBD_MD_FLEPOCH) && + (set->set_oi->oi_md->lsm_stripe_count != attrset)) { + /* When we take attributes of some epoch, we require all the + * ost to be active. */ + CERROR("Not all the stripes had valid attrs\n"); + rc = -EIO; + goto out; + } + + tmp_oa->o_oi = set->set_oi->oi_oa->o_oi; + memcpy(set->set_oi->oi_oa, tmp_oa, sizeof(*set->set_oi->oi_oa)); +out: + if (tmp_oa) + OBDO_FREE(tmp_oa); + return rc; + +} + +int lov_fini_getattr_set(struct lov_request_set *set) +{ + int rc = 0; + + if (set == NULL) + return 0; + LASSERT(set->set_exp); + if (atomic_read(&set->set_completes)) + rc = common_attr_done(set); + + lov_put_reqset(set); + + return rc; +} + +/* The callback for osc_getattr_async that finalizes a request info when a + * response is received. */ +static int cb_getattr_update(void *cookie, int rc) +{ + struct obd_info *oinfo = cookie; + struct lov_request *lovreq; + + lovreq = container_of(oinfo, struct lov_request, rq_oi); + return lov_update_common_set(lovreq->rq_rqset, lovreq, rc); +} + +int lov_prep_getattr_set(struct obd_export *exp, struct obd_info *oinfo, + struct lov_request_set **reqset) +{ + struct lov_request_set *set; + struct lov_obd *lov = &exp->exp_obd->u.lov; + int rc = 0, i; + + OBD_ALLOC(set, sizeof(*set)); + if (set == NULL) + return -ENOMEM; + lov_init_set(set); + + set->set_exp = exp; + set->set_oi = oinfo; + + for (i = 0; i < oinfo->oi_md->lsm_stripe_count; i++) { + struct lov_oinfo *loi; + struct lov_request *req; + + loi = oinfo->oi_md->lsm_oinfo[i]; + if (lov_oinfo_is_dummy(loi)) + continue; + + if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) { + CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx); + if (oinfo->oi_oa->o_valid & OBD_MD_FLEPOCH) { + /* SOM requires all the OSTs to be active. 
*/ + rc = -EIO; + goto out_set; + } + continue; + } + + OBD_ALLOC(req, sizeof(*req)); + if (req == NULL) { + rc = -ENOMEM; + goto out_set; + } + + req->rq_stripe = i; + req->rq_idx = loi->loi_ost_idx; + + OBDO_ALLOC(req->rq_oi.oi_oa); + if (req->rq_oi.oi_oa == NULL) { + OBD_FREE(req, sizeof(*req)); + rc = -ENOMEM; + goto out_set; + } + memcpy(req->rq_oi.oi_oa, oinfo->oi_oa, + sizeof(*req->rq_oi.oi_oa)); + req->rq_oi.oi_oa->o_oi = loi->loi_oi; + req->rq_oi.oi_cb_up = cb_getattr_update; + req->rq_oi.oi_capa = oinfo->oi_capa; + + lov_set_add_req(req, set); + } + if (!set->set_count) { + rc = -EIO; + goto out_set; + } + *reqset = set; + return rc; +out_set: + lov_fini_getattr_set(set); + return rc; +} + +int lov_fini_destroy_set(struct lov_request_set *set) +{ + if (set == NULL) + return 0; + LASSERT(set->set_exp); + if (atomic_read(&set->set_completes)) { + /* FIXME update qos data here */ + } + + lov_put_reqset(set); + + return 0; +} + +int lov_prep_destroy_set(struct obd_export *exp, struct obd_info *oinfo, + struct obdo *src_oa, struct lov_stripe_md *lsm, + struct obd_trans_info *oti, + struct lov_request_set **reqset) +{ + struct lov_request_set *set; + struct lov_obd *lov = &exp->exp_obd->u.lov; + int rc = 0, i; + + OBD_ALLOC(set, sizeof(*set)); + if (set == NULL) + return -ENOMEM; + lov_init_set(set); + + set->set_exp = exp; + set->set_oi = oinfo; + set->set_oi->oi_md = lsm; + set->set_oi->oi_oa = src_oa; + set->set_oti = oti; + if (oti != NULL && src_oa->o_valid & OBD_MD_FLCOOKIE) + set->set_cookies = oti->oti_logcookies; + + for (i = 0; i < lsm->lsm_stripe_count; i++) { + struct lov_oinfo *loi; + struct lov_request *req; + + loi = lsm->lsm_oinfo[i]; + if (lov_oinfo_is_dummy(loi)) + continue; + + if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) { + CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx); + continue; + } + + OBD_ALLOC(req, sizeof(*req)); + if (req == NULL) { + rc = -ENOMEM; + goto out_set; + } + + req->rq_stripe = i; + req->rq_idx = loi->loi_ost_idx; + + OBDO_ALLOC(req->rq_oi.oi_oa); + if (req->rq_oi.oi_oa == NULL) { + OBD_FREE(req, sizeof(*req)); + rc = -ENOMEM; + goto out_set; + } + memcpy(req->rq_oi.oi_oa, src_oa, sizeof(*req->rq_oi.oi_oa)); + req->rq_oi.oi_oa->o_oi = loi->loi_oi; + lov_set_add_req(req, set); + } + if (!set->set_count) { + rc = -EIO; + goto out_set; + } + *reqset = set; + return rc; +out_set: + lov_fini_destroy_set(set); + return rc; +} + +int lov_fini_setattr_set(struct lov_request_set *set) +{ + int rc = 0; + + if (set == NULL) + return 0; + LASSERT(set->set_exp); + if (atomic_read(&set->set_completes)) { + rc = common_attr_done(set); + /* FIXME update qos data here */ + } + + lov_put_reqset(set); + return rc; +} + +int lov_update_setattr_set(struct lov_request_set *set, + struct lov_request *req, int rc) +{ + struct lov_obd *lov = &req->rq_rqset->set_exp->exp_obd->u.lov; + struct lov_stripe_md *lsm = req->rq_rqset->set_oi->oi_md; + + lov_update_set(set, req, rc); + + /* grace error on inactive ost */ + if (rc && !(lov->lov_tgts[req->rq_idx] && + lov->lov_tgts[req->rq_idx]->ltd_active)) + rc = 0; + + if (rc == 0) { + if (req->rq_oi.oi_oa->o_valid & OBD_MD_FLCTIME) + lsm->lsm_oinfo[req->rq_stripe]->loi_lvb.lvb_ctime = + req->rq_oi.oi_oa->o_ctime; + if (req->rq_oi.oi_oa->o_valid & OBD_MD_FLMTIME) + lsm->lsm_oinfo[req->rq_stripe]->loi_lvb.lvb_mtime = + req->rq_oi.oi_oa->o_mtime; + if (req->rq_oi.oi_oa->o_valid & OBD_MD_FLATIME) + lsm->lsm_oinfo[req->rq_stripe]->loi_lvb.lvb_atime = + req->rq_oi.oi_oa->o_atime; + } + + return rc; +} + +/* The 
callback for osc_setattr_async that finalizes a request info when a + * response is received. */ +static int cb_setattr_update(void *cookie, int rc) +{ + struct obd_info *oinfo = cookie; + struct lov_request *lovreq; + + lovreq = container_of(oinfo, struct lov_request, rq_oi); + return lov_update_setattr_set(lovreq->rq_rqset, lovreq, rc); +} + +int lov_prep_setattr_set(struct obd_export *exp, struct obd_info *oinfo, + struct obd_trans_info *oti, + struct lov_request_set **reqset) +{ + struct lov_request_set *set; + struct lov_obd *lov = &exp->exp_obd->u.lov; + int rc = 0, i; + + OBD_ALLOC(set, sizeof(*set)); + if (set == NULL) + return -ENOMEM; + lov_init_set(set); + + set->set_exp = exp; + set->set_oti = oti; + set->set_oi = oinfo; + if (oti != NULL && oinfo->oi_oa->o_valid & OBD_MD_FLCOOKIE) + set->set_cookies = oti->oti_logcookies; + + for (i = 0; i < oinfo->oi_md->lsm_stripe_count; i++) { + struct lov_oinfo *loi = oinfo->oi_md->lsm_oinfo[i]; + struct lov_request *req; + + if (lov_oinfo_is_dummy(loi)) + continue; + + if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) { + CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx); + continue; + } + + OBD_ALLOC(req, sizeof(*req)); + if (req == NULL) { + rc = -ENOMEM; + goto out_set; + } + req->rq_stripe = i; + req->rq_idx = loi->loi_ost_idx; + + OBDO_ALLOC(req->rq_oi.oi_oa); + if (req->rq_oi.oi_oa == NULL) { + OBD_FREE(req, sizeof(*req)); + rc = -ENOMEM; + goto out_set; + } + memcpy(req->rq_oi.oi_oa, oinfo->oi_oa, + sizeof(*req->rq_oi.oi_oa)); + req->rq_oi.oi_oa->o_oi = loi->loi_oi; + req->rq_oi.oi_oa->o_stripe_idx = i; + req->rq_oi.oi_cb_up = cb_setattr_update; + req->rq_oi.oi_capa = oinfo->oi_capa; + + if (oinfo->oi_oa->o_valid & OBD_MD_FLSIZE) { + int off = lov_stripe_offset(oinfo->oi_md, + oinfo->oi_oa->o_size, i, + &req->rq_oi.oi_oa->o_size); + + if (off < 0 && req->rq_oi.oi_oa->o_size) + req->rq_oi.oi_oa->o_size--; + + CDEBUG(D_INODE, "stripe %d has size %llu/%llu\n", + i, req->rq_oi.oi_oa->o_size, + oinfo->oi_oa->o_size); + } + lov_set_add_req(req, set); + } + if (!set->set_count) { + rc = -EIO; + goto out_set; + } + *reqset = set; + return rc; +out_set: + lov_fini_setattr_set(set); + return rc; +} + +#define LOV_U64_MAX ((__u64)~0ULL) +#define LOV_SUM_MAX(tot, add) \ + do { \ + if ((tot) + (add) < (tot)) \ + (tot) = LOV_U64_MAX; \ + else \ + (tot) += (add); \ + } while (0) + +int lov_fini_statfs(struct obd_device *obd, struct obd_statfs *osfs, + int success) +{ + if (success) { + __u32 expected_stripes = lov_get_stripecnt(&obd->u.lov, + LOV_MAGIC, 0); + if (osfs->os_files != LOV_U64_MAX) + lov_do_div64(osfs->os_files, expected_stripes); + if (osfs->os_ffree != LOV_U64_MAX) + lov_do_div64(osfs->os_ffree, expected_stripes); + + spin_lock(&obd->obd_osfs_lock); + memcpy(&obd->obd_osfs, osfs, sizeof(*osfs)); + obd->obd_osfs_age = cfs_time_current_64(); + spin_unlock(&obd->obd_osfs_lock); + return 0; + } + + return -EIO; +} + +int lov_fini_statfs_set(struct lov_request_set *set) +{ + int rc = 0; + + if (set == NULL) + return 0; + + if (atomic_read(&set->set_completes)) { + rc = lov_fini_statfs(set->set_obd, set->set_oi->oi_osfs, + atomic_read(&set->set_success)); + } + lov_put_reqset(set); + return rc; +} + +void lov_update_statfs(struct obd_statfs *osfs, struct obd_statfs *lov_sfs, + int success) +{ + int shift = 0, quit = 0; + __u64 tmp; + + if (success == 0) { + memcpy(osfs, lov_sfs, sizeof(*lov_sfs)); + } else { + if (osfs->os_bsize != lov_sfs->os_bsize) { + /* assume all block sizes are always powers of 2 */ + /* get the bits 
difference */ + tmp = osfs->os_bsize | lov_sfs->os_bsize; + for (shift = 0; shift <= 64; ++shift) { + if (tmp & 1) { + if (quit) + break; + else + quit = 1; + shift = 0; + } + tmp >>= 1; + } + } + + if (osfs->os_bsize < lov_sfs->os_bsize) { + osfs->os_bsize = lov_sfs->os_bsize; + + osfs->os_bfree >>= shift; + osfs->os_bavail >>= shift; + osfs->os_blocks >>= shift; + } else if (shift != 0) { + lov_sfs->os_bfree >>= shift; + lov_sfs->os_bavail >>= shift; + lov_sfs->os_blocks >>= shift; + } + osfs->os_bfree += lov_sfs->os_bfree; + osfs->os_bavail += lov_sfs->os_bavail; + osfs->os_blocks += lov_sfs->os_blocks; + /* XXX not sure about this one - depends on policy. + * - could be minimum if we always stripe on all OBDs + * (but that would be wrong for any other policy, + * if one of the OBDs has no more objects left) + * - could be sum if we stripe whole objects + * - could be average, just to give a nice number + * + * To give a "reasonable" (if not wholly accurate) + * number, we divide the total number of free objects + * by expected stripe count (watch out for overflow). + */ + LOV_SUM_MAX(osfs->os_files, lov_sfs->os_files); + LOV_SUM_MAX(osfs->os_ffree, lov_sfs->os_ffree); + } +} + +/* The callback for osc_statfs_async that finalizes a request info when a + * response is received. */ +static int cb_statfs_update(void *cookie, int rc) +{ + struct obd_info *oinfo = cookie; + struct lov_request *lovreq; + struct lov_request_set *set; + struct obd_statfs *osfs, *lov_sfs; + struct lov_obd *lov; + struct lov_tgt_desc *tgt; + struct obd_device *lovobd, *tgtobd; + int success; + + lovreq = container_of(oinfo, struct lov_request, rq_oi); + set = lovreq->rq_rqset; + lovobd = set->set_obd; + lov = &lovobd->u.lov; + osfs = set->set_oi->oi_osfs; + lov_sfs = oinfo->oi_osfs; + success = atomic_read(&set->set_success); + /* XXX: the same is done in lov_update_common_set, however + lovset->set_exp is not initialized. 
*/ + lov_update_set(set, lovreq, rc); + if (rc) + goto out; + + obd_getref(lovobd); + tgt = lov->lov_tgts[lovreq->rq_idx]; + if (!tgt || !tgt->ltd_active) + goto out_update; + + tgtobd = class_exp2obd(tgt->ltd_exp); + spin_lock(&tgtobd->obd_osfs_lock); + memcpy(&tgtobd->obd_osfs, lov_sfs, sizeof(*lov_sfs)); + if ((oinfo->oi_flags & OBD_STATFS_FROM_CACHE) == 0) + tgtobd->obd_osfs_age = cfs_time_current_64(); + spin_unlock(&tgtobd->obd_osfs_lock); + +out_update: + lov_update_statfs(osfs, lov_sfs, success); + obd_putref(lovobd); + +out: + if (set->set_oi->oi_flags & OBD_STATFS_PTLRPCD && + lov_set_finished(set, 0)) { + lov_statfs_interpret(NULL, set, set->set_count != + atomic_read(&set->set_success)); + } + + return 0; +} + +int lov_prep_statfs_set(struct obd_device *obd, struct obd_info *oinfo, + struct lov_request_set **reqset) +{ + struct lov_request_set *set; + struct lov_obd *lov = &obd->u.lov; + int rc = 0, i; + + OBD_ALLOC(set, sizeof(*set)); + if (set == NULL) + return -ENOMEM; + lov_init_set(set); + + set->set_obd = obd; + set->set_oi = oinfo; + + /* We only get block data from the OBD */ + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + struct lov_request *req; + + if (lov->lov_tgts[i] == NULL || + (!lov_check_and_wait_active(lov, i) && + (oinfo->oi_flags & OBD_STATFS_NODELAY))) { + CDEBUG(D_HA, "lov idx %d inactive\n", i); + continue; + } + + /* skip targets that have been explicitly disabled by the + * administrator */ + if (!lov->lov_tgts[i]->ltd_exp) { + CDEBUG(D_HA, "lov idx %d administratively disabled\n", i); + continue; + } + + OBD_ALLOC(req, sizeof(*req)); + if (req == NULL) { + rc = -ENOMEM; + goto out_set; + } + + OBD_ALLOC(req->rq_oi.oi_osfs, sizeof(*req->rq_oi.oi_osfs)); + if (req->rq_oi.oi_osfs == NULL) { + OBD_FREE(req, sizeof(*req)); + rc = -ENOMEM; + goto out_set; + } + + req->rq_idx = i; + req->rq_oi.oi_cb_up = cb_statfs_update; + req->rq_oi.oi_flags = oinfo->oi_flags; + + lov_set_add_req(req, set); + } + if (!set->set_count) { + rc = -EIO; + goto out_set; + } + *reqset = set; + return rc; +out_set: + lov_fini_statfs_set(set); + return rc; +} diff --git a/kernel/drivers/staging/lustre/lustre/lov/lovsub_dev.c b/kernel/drivers/staging/lustre/lustre/lov/lovsub_dev.c new file mode 100644 index 000000000..42336f13a --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lov/lovsub_dev.c @@ -0,0 +1,209 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. 
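lov_update_statfs() above merges per-OST statfs replies; because OSTs may report different (power-of-two) block sizes, the counts expressed in the smaller block size are shifted down so that everything is summed in the larger block size, and file counts are capped via LOV_SUM_MAX to avoid overflow. A sketch of the block-size normalization with example numbers (the kernel derives the shift with a bit-scanning loop; here it is computed directly, which should give the same result for power-of-two sizes):

	#include <stdio.h>

	struct sfs { unsigned long long bsize, blocks, bfree; };

	/* merge b into a, normalizing to the larger power-of-two block size */
	static void merge(struct sfs *a, const struct sfs *b_in)
	{
		struct sfs b = *b_in;
		unsigned long long big   = a->bsize > b.bsize ? a->bsize : b.bsize;
		unsigned long long small = a->bsize > b.bsize ? b.bsize : a->bsize;
		int shift = 0;

		while ((small << shift) < big)
			shift++;

		if (a->bsize < b.bsize) {	/* express a's counts in b's blocks */
			a->bsize = b.bsize;
			a->blocks >>= shift;
			a->bfree >>= shift;
		} else {			/* express b's counts in a's blocks */
			b.blocks >>= shift;
			b.bfree >>= shift;
		}
		a->blocks += b.blocks;
		a->bfree += b.bfree;
	}

	int main(void)
	{
		struct sfs a = { 4096, 1000, 500 };	/* 4 KiB blocks */
		struct sfs b = { 65536, 100, 60 };	/* 64 KiB blocks */

		merge(&a, &b);
		/* 64 KiB = 16 * 4 KiB, so a's counts were divided by 16 */
		printf("bsize=%llu blocks=%llu bfree=%llu\n",
		       a.bsize, a.blocks, a.bfree);
		return 0;
	}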
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_device and cl_device_type for LOVSUB layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov + * @{ + */ + +/***************************************************************************** + * + * Lovsub transfer operations. + * + */ + +static void lovsub_req_completion(const struct lu_env *env, + const struct cl_req_slice *slice, int ioret) +{ + struct lovsub_req *lsr; + + lsr = cl2lovsub_req(slice); + OBD_SLAB_FREE_PTR(lsr, lovsub_req_kmem); +} + +/** + * Implementation of struct cl_req_operations::cro_attr_set() for lovsub + * layer. Lov and lovsub are responsible only for struct obdo::o_stripe_idx + * field, which is filled there. + */ +static void lovsub_req_attr_set(const struct lu_env *env, + const struct cl_req_slice *slice, + const struct cl_object *obj, + struct cl_req_attr *attr, u64 flags) +{ + struct lovsub_object *subobj; + + subobj = cl2lovsub(obj); + /* + * There is no OBD_MD_* flag for obdo::o_stripe_idx, so set it + * unconditionally. It never changes anyway. + */ + attr->cra_oa->o_stripe_idx = subobj->lso_index; +} + +static const struct cl_req_operations lovsub_req_ops = { + .cro_attr_set = lovsub_req_attr_set, + .cro_completion = lovsub_req_completion +}; + +/***************************************************************************** + * + * Lov-sub device and device type functions. + * + */ + +static int lovsub_device_init(const struct lu_env *env, struct lu_device *d, + const char *name, struct lu_device *next) +{ + struct lovsub_device *lsd = lu2lovsub_dev(d); + struct lu_device_type *ldt; + int rc; + + next->ld_site = d->ld_site; + ldt = next->ld_type; + LASSERT(ldt != NULL); + rc = ldt->ldt_ops->ldto_device_init(env, next, ldt->ldt_name, NULL); + if (rc) { + next->ld_site = NULL; + return rc; + } + + lu_device_get(next); + lu_ref_add(&next->ld_reference, "lu-stack", &lu_site_init); + lsd->acid_next = lu2cl_dev(next); + return rc; +} + +static struct lu_device *lovsub_device_fini(const struct lu_env *env, + struct lu_device *d) +{ + struct lu_device *next; + struct lovsub_device *lsd; + + lsd = lu2lovsub_dev(d); + next = cl2lu_dev(lsd->acid_next); + lsd->acid_super = NULL; + lsd->acid_next = NULL; + return next; +} + +static struct lu_device *lovsub_device_free(const struct lu_env *env, + struct lu_device *d) +{ + struct lovsub_device *lsd = lu2lovsub_dev(d); + struct lu_device *next = cl2lu_dev(lsd->acid_next); + + if (atomic_read(&d->ld_ref) && d->ld_site) { + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_ERROR, NULL); + lu_site_print(env, d->ld_site, &msgdata, lu_cdebug_printer); + } + cl_device_fini(lu2cl_dev(d)); + OBD_FREE_PTR(lsd); + return next; +} + +static int lovsub_req_init(const struct lu_env *env, struct cl_device *dev, + struct cl_req *req) +{ + struct lovsub_req *lsr; + int result; + + OBD_SLAB_ALLOC_PTR_GFP(lsr, lovsub_req_kmem, GFP_NOFS); + if (lsr != NULL) { + cl_req_slice_add(req, &lsr->lsrq_cl, dev, &lovsub_req_ops); + result = 0; + } else + result = -ENOMEM; + return result; +} + +static const struct lu_device_operations lovsub_lu_ops = { + .ldo_object_alloc = lovsub_object_alloc, + .ldo_process_config = NULL, + .ldo_recovery_complete = NULL +}; + +static const struct cl_device_operations lovsub_cl_ops = { + .cdo_req_init = lovsub_req_init +}; + +static struct lu_device *lovsub_device_alloc(const struct lu_env *env, + struct 
lu_device_type *t, + struct lustre_cfg *cfg) +{ + struct lu_device *d; + struct lovsub_device *lsd; + + OBD_ALLOC_PTR(lsd); + if (lsd != NULL) { + int result; + + result = cl_device_init(&lsd->acid_cl, t); + if (result == 0) { + d = lovsub2lu_dev(lsd); + d->ld_ops = &lovsub_lu_ops; + lsd->acid_cl.cd_ops = &lovsub_cl_ops; + } else + d = ERR_PTR(result); + } else + d = ERR_PTR(-ENOMEM); + return d; +} + +static const struct lu_device_type_operations lovsub_device_type_ops = { + .ldto_device_alloc = lovsub_device_alloc, + .ldto_device_free = lovsub_device_free, + + .ldto_device_init = lovsub_device_init, + .ldto_device_fini = lovsub_device_fini +}; + +#define LUSTRE_LOVSUB_NAME "lovsub" + +struct lu_device_type lovsub_device_type = { + .ldt_tags = LU_DEVICE_CL, + .ldt_name = LUSTRE_LOVSUB_NAME, + .ldt_ops = &lovsub_device_type_ops, + .ldt_ctx_tags = LCT_CL_THREAD +}; + + +/** @} lov */ diff --git a/kernel/drivers/staging/lustre/lustre/lov/lovsub_io.c b/kernel/drivers/staging/lustre/lustre/lov/lovsub_io.c new file mode 100644 index 000000000..783ec687a --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lov/lovsub_io.c @@ -0,0 +1,55 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_io for LOVSUB layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov + * @{ + */ + +/***************************************************************************** + * + * Lovsub io operations. + * + */ + +/* All trivial */ + +/** @} lov */ diff --git a/kernel/drivers/staging/lustre/lustre/lov/lovsub_lock.c b/kernel/drivers/staging/lustre/lustre/lov/lovsub_lock.c new file mode 100644 index 000000000..62b696d25 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lov/lovsub_lock.c @@ -0,0 +1,466 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_lock for LOVSUB layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov + * @{ + */ + +/***************************************************************************** + * + * Lovsub lock operations. + * + */ + +static void lovsub_lock_fini(const struct lu_env *env, + struct cl_lock_slice *slice) +{ + struct lovsub_lock *lsl; + + lsl = cl2lovsub_lock(slice); + LASSERT(list_empty(&lsl->lss_parents)); + OBD_SLAB_FREE_PTR(lsl, lovsub_lock_kmem); +} + +static void lovsub_parent_lock(const struct lu_env *env, struct lov_lock *lov) +{ + struct cl_lock *parent; + + parent = lov->lls_cl.cls_lock; + cl_lock_get(parent); + lu_ref_add(&parent->cll_reference, "lovsub-parent", current); + cl_lock_mutex_get(env, parent); +} + +static void lovsub_parent_unlock(const struct lu_env *env, struct lov_lock *lov) +{ + struct cl_lock *parent; + + parent = lov->lls_cl.cls_lock; + cl_lock_mutex_put(env, lov->lls_cl.cls_lock); + lu_ref_del(&parent->cll_reference, "lovsub-parent", current); + cl_lock_put(env, parent); +} + +/** + * Implements cl_lock_operations::clo_state() method for lovsub layer, which + * method is called whenever sub-lock state changes. Propagates state change + * to the top-locks. + */ +static void lovsub_lock_state(const struct lu_env *env, + const struct cl_lock_slice *slice, + enum cl_lock_state state) +{ + struct lovsub_lock *sub = cl2lovsub_lock(slice); + struct lov_lock_link *scan; + + LASSERT(cl_lock_is_mutexed(slice->cls_lock)); + + list_for_each_entry(scan, &sub->lss_parents, lll_list) { + struct lov_lock *lov = scan->lll_super; + struct cl_lock *parent = lov->lls_cl.cls_lock; + + if (sub->lss_active != parent) { + lovsub_parent_lock(env, lov); + cl_lock_signal(env, parent); + lovsub_parent_unlock(env, lov); + } + } +} + +/** + * Implementation of cl_lock_operation::clo_weigh() estimating lock weight by + * asking parent lock. + */ +static unsigned long lovsub_lock_weigh(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct lovsub_lock *lock = cl2lovsub_lock(slice); + struct lov_lock *lov; + unsigned long dumbbell; + + LASSERT(cl_lock_is_mutexed(slice->cls_lock)); + + if (!list_empty(&lock->lss_parents)) { + /* + * It is not clear whether all parents have to be asked and + * their estimations summed, or it is enough to ask one. For + * the current usages, one is always enough. 
+ */ + lov = container_of(lock->lss_parents.next, + struct lov_lock_link, lll_list)->lll_super; + + lovsub_parent_lock(env, lov); + dumbbell = cl_lock_weigh(env, lov->lls_cl.cls_lock); + lovsub_parent_unlock(env, lov); + } else + dumbbell = 0; + + return dumbbell; +} + +/** + * Maps start/end offsets within a stripe, to offsets within a file. + */ +static void lovsub_lock_descr_map(const struct cl_lock_descr *in, + struct lov_object *lov, + int stripe, struct cl_lock_descr *out) +{ + pgoff_t size; /* stripe size in pages */ + pgoff_t skip; /* how many pages in every stripe are occupied by + * "other" stripes */ + pgoff_t start; + pgoff_t end; + + start = in->cld_start; + end = in->cld_end; + + if (lov->lo_lsm->lsm_stripe_count > 1) { + size = cl_index(lov2cl(lov), lov->lo_lsm->lsm_stripe_size); + skip = (lov->lo_lsm->lsm_stripe_count - 1) * size; + + /* XXX overflow check here? */ + start += start/size * skip + stripe * size; + + if (end != CL_PAGE_EOF) { + end += end/size * skip + stripe * size; + /* + * And check for overflow... + */ + if (end < in->cld_end) + end = CL_PAGE_EOF; + } + } + out->cld_start = start; + out->cld_end = end; +} + +/** + * Adjusts parent lock extent when a sub-lock is attached to a parent. This is + * called in two ways: + * + * - as part of receive call-back, when server returns granted extent to + * the client, and + * + * - when top-lock finds existing sub-lock in the cache. + * + * Note, that lock mode is not propagated to the parent: i.e., if CLM_READ + * top-lock matches CLM_WRITE sub-lock, top-lock is still CLM_READ. + */ +int lov_sublock_modify(const struct lu_env *env, struct lov_lock *lov, + struct lovsub_lock *sublock, + const struct cl_lock_descr *d, int idx) +{ + struct cl_lock *parent; + struct lovsub_object *subobj; + struct cl_lock_descr *pd; + struct cl_lock_descr *parent_descr; + int result; + + parent = lov->lls_cl.cls_lock; + parent_descr = &parent->cll_descr; + LASSERT(cl_lock_mode_match(d->cld_mode, parent_descr->cld_mode)); + + subobj = cl2lovsub(sublock->lss_cl.cls_obj); + pd = &lov_env_info(env)->lti_ldescr; + + pd->cld_obj = parent_descr->cld_obj; + pd->cld_mode = parent_descr->cld_mode; + pd->cld_gid = parent_descr->cld_gid; + lovsub_lock_descr_map(d, subobj->lso_super, subobj->lso_index, pd); + lov->lls_sub[idx].sub_got = *d; + /* + * Notify top-lock about modification, if lock description changes + * materially. 
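lovsub_lock_descr_map() above goes in the opposite direction from the per-page byte-offset mapping sketched earlier: it takes a page range expressed inside one stripe and maps it back to file page indices, using skip = (stripe_count - 1) * stripe_size_in_pages. A sketch that round-trips a few file pages through the forward mapping and this inverse formula (the stripe geometry values are arbitrary examples):

	#include <stdio.h>

	int main(void)
	{
		unsigned long size  = 256;	/* stripe size in pages (example) */
		unsigned long count = 4;	/* stripe count (example) */
		unsigned long skip  = (count - 1) * size;

		for (unsigned long file = 0; file < 4000; file += 777) {
			/* forward: file page -> (stripe, in-stripe page) */
			unsigned long chunk  = file / size;
			unsigned long stripe = chunk % count;
			unsigned long sub    = (chunk / count) * size + file % size;
			/* inverse, as in lovsub_lock_descr_map() */
			unsigned long back   = sub + sub / size * skip + stripe * size;

			printf("file %4lu -> stripe %lu sub %4lu -> file %4lu\n",
			       file, stripe, sub, back);
		}
		return 0;
	}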
+ */ + if (!cl_lock_ext_match(parent_descr, pd)) + result = cl_lock_modify(env, parent, pd); + else + result = 0; + return result; +} + +static int lovsub_lock_modify(const struct lu_env *env, + const struct cl_lock_slice *s, + const struct cl_lock_descr *d) +{ + struct lovsub_lock *lock = cl2lovsub_lock(s); + struct lov_lock_link *scan; + struct lov_lock *lov; + int result = 0; + + LASSERT(cl_lock_mode_match(d->cld_mode, + s->cls_lock->cll_descr.cld_mode)); + list_for_each_entry(scan, &lock->lss_parents, lll_list) { + int rc; + + lov = scan->lll_super; + lovsub_parent_lock(env, lov); + rc = lov_sublock_modify(env, lov, lock, d, scan->lll_idx); + lovsub_parent_unlock(env, lov); + result = result ?: rc; + } + return result; +} + +static int lovsub_lock_closure(const struct lu_env *env, + const struct cl_lock_slice *slice, + struct cl_lock_closure *closure) +{ + struct lovsub_lock *sub; + struct cl_lock *parent; + struct lov_lock_link *scan; + int result; + + LASSERT(cl_lock_is_mutexed(slice->cls_lock)); + + sub = cl2lovsub_lock(slice); + result = 0; + + list_for_each_entry(scan, &sub->lss_parents, lll_list) { + parent = scan->lll_super->lls_cl.cls_lock; + result = cl_lock_closure_build(env, parent, closure); + if (result != 0) + break; + } + return result; +} + +/** + * A helper function for lovsub_lock_delete() that deals with a given parent + * top-lock. + */ +static int lovsub_lock_delete_one(const struct lu_env *env, + struct cl_lock *child, struct lov_lock *lov) +{ + struct cl_lock *parent; + int result; + + parent = lov->lls_cl.cls_lock; + if (parent->cll_error) + return 0; + + result = 0; + switch (parent->cll_state) { + case CLS_ENQUEUED: + /* See LU-1355 for the case that a glimpse lock is + * interrupted by signal */ + LASSERT(parent->cll_flags & CLF_CANCELLED); + break; + case CLS_QUEUING: + case CLS_FREEING: + cl_lock_signal(env, parent); + break; + case CLS_INTRANSIT: + /* + * Here lies a problem: a sub-lock is canceled while top-lock + * is being unlocked. Top-lock cannot be moved into CLS_NEW + * state, because unlocking has to succeed eventually by + * placing lock into CLS_CACHED (or failing it), see + * cl_unuse_try(). Nor can top-lock be left in CLS_CACHED + * state, because lov maintains an invariant that all + * sub-locks exist in CLS_CACHED (this allows cached top-lock + * to be reused immediately). Nor can we wait for top-lock + * state to change, because this can be synchronous to the + * current thread. + * + * We know for sure that lov_lock_unuse() will be called at + * least one more time to finish un-using, so leave a mark on + * the top-lock, that will be seen by the next call to + * lov_lock_unuse(). + */ + if (cl_lock_is_intransit(parent)) + lov->lls_cancel_race = 1; + break; + case CLS_CACHED: + /* + * if a sub-lock is canceled move its top-lock into CLS_NEW + * state to preserve an invariant that a top-lock in + * CLS_CACHED is immediately ready for re-use (i.e., has all + * sub-locks), and so that next attempt to re-use the top-lock + * enqueues missing sub-lock. + */ + cl_lock_state_set(env, parent, CLS_NEW); + /* fall through */ + case CLS_NEW: + /* + * if last sub-lock is canceled, destroy the top-lock (which + * is now `empty') proactively. + */ + if (lov->lls_nr_filled == 0) { + /* ... but unfortunately, this cannot be done easily, + * as cancellation of a top-lock might acquire mutices + * of its other sub-locks, violating lock ordering, + * see cl_lock_{cancel,delete}() preconditions. 
+ * + * To work around this, the mutex of this sub-lock is + * released, top-lock is destroyed, and sub-lock mutex + * acquired again. The list of parents has to be + * re-scanned from the beginning after this. + * + * Only do this if no mutices other than on @child and + * @parent are held by the current thread. + * + * TODO: The lock modal here is too complex, because + * the lock may be canceled and deleted by voluntarily: + * cl_lock_request + * -> osc_lock_enqueue_wait + * -> osc_lock_cancel_wait + * -> cl_lock_delete + * -> lovsub_lock_delete + * -> cl_lock_cancel/delete + * -> ... + * + * The better choice is to spawn a kernel thread for + * this purpose. -jay + */ + if (cl_lock_nr_mutexed(env) == 2) { + cl_lock_mutex_put(env, child); + cl_lock_cancel(env, parent); + cl_lock_delete(env, parent); + result = 1; + } + } + break; + case CLS_HELD: + CL_LOCK_DEBUG(D_ERROR, env, parent, "Delete CLS_HELD lock\n"); + default: + CERROR("Impossible state: %d\n", parent->cll_state); + LBUG(); + break; + } + + return result; +} + +/** + * An implementation of cl_lock_operations::clo_delete() method. This is + * invoked in "bottom-to-top" delete, when lock destruction starts from the + * sub-lock (e.g, as a result of ldlm lock LRU policy). + */ +static void lovsub_lock_delete(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct cl_lock *child = slice->cls_lock; + struct lovsub_lock *sub = cl2lovsub_lock(slice); + int restart; + + LASSERT(cl_lock_is_mutexed(child)); + + /* + * Destruction of a sub-lock might take multiple iterations, because + * when the last sub-lock of a given top-lock is deleted, top-lock is + * canceled proactively, and this requires to release sub-lock + * mutex. Once sub-lock mutex has been released, list of its parents + * has to be re-scanned from the beginning. 
+ */ + do { + struct lov_lock *lov; + struct lov_lock_link *scan; + struct lov_lock_link *temp; + struct lov_lock_sub *subdata; + + restart = 0; + list_for_each_entry_safe(scan, temp, + &sub->lss_parents, lll_list) { + lov = scan->lll_super; + subdata = &lov->lls_sub[scan->lll_idx]; + lovsub_parent_lock(env, lov); + subdata->sub_got = subdata->sub_descr; + lov_lock_unlink(env, scan, sub); + restart = lovsub_lock_delete_one(env, child, lov); + lovsub_parent_unlock(env, lov); + + if (restart) { + cl_lock_mutex_get(env, child); + break; + } + } + } while (restart); +} + +static int lovsub_lock_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct cl_lock_slice *slice) +{ + struct lovsub_lock *sub = cl2lovsub_lock(slice); + struct lov_lock *lov; + struct lov_lock_link *scan; + + list_for_each_entry(scan, &sub->lss_parents, lll_list) { + lov = scan->lll_super; + (*p)(env, cookie, "[%d %p ", scan->lll_idx, lov); + if (lov != NULL) + cl_lock_descr_print(env, cookie, p, + &lov->lls_cl.cls_lock->cll_descr); + (*p)(env, cookie, "] "); + } + return 0; +} + +static const struct cl_lock_operations lovsub_lock_ops = { + .clo_fini = lovsub_lock_fini, + .clo_state = lovsub_lock_state, + .clo_delete = lovsub_lock_delete, + .clo_modify = lovsub_lock_modify, + .clo_closure = lovsub_lock_closure, + .clo_weigh = lovsub_lock_weigh, + .clo_print = lovsub_lock_print +}; + +int lovsub_lock_init(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io) +{ + struct lovsub_lock *lsk; + int result; + + OBD_SLAB_ALLOC_PTR_GFP(lsk, lovsub_lock_kmem, GFP_NOFS); + if (lsk != NULL) { + INIT_LIST_HEAD(&lsk->lss_parents); + cl_lock_slice_add(lock, &lsk->lss_cl, obj, &lovsub_lock_ops); + result = 0; + } else + result = -ENOMEM; + return result; +} + +/** @} lov */ diff --git a/kernel/drivers/staging/lustre/lustre/lov/lovsub_object.c b/kernel/drivers/staging/lustre/lustre/lov/lovsub_object.c new file mode 100644 index 000000000..57e3629fc --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lov/lovsub_object.c @@ -0,0 +1,164 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_object for LOVSUB layer. 
+ * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov + * @{ + */ + +/***************************************************************************** + * + * Lovsub object operations. + * + */ + +int lovsub_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf) +{ + struct lovsub_device *dev = lu2lovsub_dev(obj->lo_dev); + struct lu_object *below; + struct lu_device *under; + + int result; + + under = &dev->acid_next->cd_lu_dev; + below = under->ld_ops->ldo_object_alloc(env, obj->lo_header, under); + if (below != NULL) { + lu_object_add(obj, below); + cl_object_page_init(lu2cl(obj), sizeof(struct lovsub_page)); + result = 0; + } else + result = -ENOMEM; + return result; + +} + +static void lovsub_object_free(const struct lu_env *env, struct lu_object *obj) +{ + struct lovsub_object *los = lu2lovsub(obj); + struct lov_object *lov = los->lso_super; + + /* We can't assume lov was assigned here, because of the shadow + * object handling in lu_object_find. + */ + if (lov) { + LASSERT(lov->lo_type == LLT_RAID0); + LASSERT(lov->u.raid0.lo_sub[los->lso_index] == los); + spin_lock(&lov->u.raid0.lo_sub_lock); + lov->u.raid0.lo_sub[los->lso_index] = NULL; + spin_unlock(&lov->u.raid0.lo_sub_lock); + } + + lu_object_fini(obj); + lu_object_header_fini(&los->lso_header.coh_lu); + OBD_SLAB_FREE_PTR(los, lovsub_object_kmem); +} + +static int lovsub_object_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *obj) +{ + struct lovsub_object *los = lu2lovsub(obj); + + return (*p)(env, cookie, "[%d]", los->lso_index); +} + +static int lovsub_attr_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid) +{ + struct lov_object *lov = cl2lovsub(obj)->lso_super; + + lov_r0(lov)->lo_attr_valid = 0; + return 0; +} + +static int lovsub_object_glimpse(const struct lu_env *env, + const struct cl_object *obj, + struct ost_lvb *lvb) +{ + struct lovsub_object *los = cl2lovsub(obj); + + return cl_object_glimpse(env, &los->lso_super->lo_cl, lvb); +} + + + +static const struct cl_object_operations lovsub_ops = { + .coo_page_init = lovsub_page_init, + .coo_lock_init = lovsub_lock_init, + .coo_attr_set = lovsub_attr_set, + .coo_glimpse = lovsub_object_glimpse +}; + +static const struct lu_object_operations lovsub_lu_obj_ops = { + .loo_object_init = lovsub_object_init, + .loo_object_delete = NULL, + .loo_object_release = NULL, + .loo_object_free = lovsub_object_free, + .loo_object_print = lovsub_object_print, + .loo_object_invariant = NULL +}; + +struct lu_object *lovsub_object_alloc(const struct lu_env *env, + const struct lu_object_header *unused, + struct lu_device *dev) +{ + struct lovsub_object *los; + struct lu_object *obj; + + OBD_SLAB_ALLOC_PTR_GFP(los, lovsub_object_kmem, GFP_NOFS); + if (los != NULL) { + struct cl_object_header *hdr; + + obj = lovsub2lu(los); + hdr = &los->lso_header; + cl_object_header_init(hdr); + lu_object_init(obj, &hdr->coh_lu, dev); + lu_object_add_top(&hdr->coh_lu, obj); + los->lso_cl.co_ops = &lovsub_ops; + obj->lo_ops = &lovsub_lu_obj_ops; + } else + obj = NULL; + return obj; +} + +/** @} lov */ diff --git a/kernel/drivers/staging/lustre/lustre/lov/lovsub_page.c b/kernel/drivers/staging/lustre/lustre/lov/lovsub_page.c new file mode 100644 index 000000000..3f00ce967 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lov/lovsub_page.c @@ -0,0 +1,71 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR 
REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_page for LOVSUB layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov + * @{ + */ + +/***************************************************************************** + * + * Lovsub page operations. + * + */ + +static void lovsub_page_fini(const struct lu_env *env, + struct cl_page_slice *slice) +{ +} + +static const struct cl_page_operations lovsub_page_ops = { + .cpo_fini = lovsub_page_fini +}; + +int lovsub_page_init(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, struct page *unused) +{ + struct lovsub_page *lsb = cl_object_page_slice(obj, page); + + cl_page_slice_add(page, &lsb->lsb_cl, obj, &lovsub_page_ops); + return 0; +} + +/** @} lov */ diff --git a/kernel/drivers/staging/lustre/lustre/lov/lproc_lov.c b/kernel/drivers/staging/lustre/lustre/lov/lproc_lov.c new file mode 100644 index 000000000..174cbf5c1 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/lov/lproc_lov.c @@ -0,0 +1,311 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include "../include/lprocfs_status.h" +#include "../include/obd_class.h" +#include +#include "lov_internal.h" + +static int lov_stripesize_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = (struct obd_device *)m->private; + struct lov_desc *desc; + + LASSERT(dev != NULL); + desc = &dev->u.lov.desc; + seq_printf(m, "%llu\n", desc->ld_default_stripe_size); + return 0; +} + +static ssize_t lov_stripesize_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *dev = ((struct seq_file *)file->private_data)->private; + struct lov_desc *desc; + __u64 val; + int rc; + + LASSERT(dev != NULL); + desc = &dev->u.lov.desc; + rc = lprocfs_write_u64_helper(buffer, count, &val); + if (rc) + return rc; + + lov_fix_desc_stripe_size(&val); + desc->ld_default_stripe_size = val; + return count; +} +LPROC_SEQ_FOPS(lov_stripesize); + +static int lov_stripeoffset_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = (struct obd_device *)m->private; + struct lov_desc *desc; + + LASSERT(dev != NULL); + desc = &dev->u.lov.desc; + seq_printf(m, "%llu\n", desc->ld_default_stripe_offset); + return 0; +} + +static ssize_t lov_stripeoffset_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *dev = ((struct seq_file *)file->private_data)->private; + struct lov_desc *desc; + __u64 val; + int rc; + + LASSERT(dev != NULL); + desc = &dev->u.lov.desc; + rc = lprocfs_write_u64_helper(buffer, count, &val); + if (rc) + return rc; + + desc->ld_default_stripe_offset = val; + return count; +} +LPROC_SEQ_FOPS(lov_stripeoffset); + +static int lov_stripetype_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = (struct obd_device *)m->private; + struct lov_desc *desc; + + LASSERT(dev != NULL); + desc = &dev->u.lov.desc; + seq_printf(m, "%u\n", desc->ld_pattern); + return 0; +} + +static ssize_t lov_stripetype_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *dev = ((struct seq_file *)file->private_data)->private; + struct lov_desc *desc; + int val, rc; + + LASSERT(dev != NULL); + desc = &dev->u.lov.desc; + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + lov_fix_desc_pattern(&val); + desc->ld_pattern = val; + return count; +} +LPROC_SEQ_FOPS(lov_stripetype); + +static int lov_stripecount_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = (struct obd_device *)m->private; + struct lov_desc *desc; + + LASSERT(dev != NULL); + desc = &dev->u.lov.desc; + seq_printf(m, "%d\n", (__s16)(desc->ld_default_stripe_count + 1) - 1); + return 0; +} + +static ssize_t lov_stripecount_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *dev = ((struct seq_file *)file->private_data)->private; + struct lov_desc *desc; + int val, rc; + + LASSERT(dev != NULL); + desc = &dev->u.lov.desc; + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + lov_fix_desc_stripe_count(&val); + desc->ld_default_stripe_count = val; + return count; +} +LPROC_SEQ_FOPS(lov_stripecount); + +static int lov_numobd_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = (struct obd_device *)m->private; + struct lov_desc *desc; + + LASSERT(dev != NULL); + desc = 
&dev->u.lov.desc; + seq_printf(m, "%u\n", desc->ld_tgt_count); + return 0; +} +LPROC_SEQ_FOPS_RO(lov_numobd); + +static int lov_activeobd_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = (struct obd_device *)m->private; + struct lov_desc *desc; + + LASSERT(dev != NULL); + desc = &dev->u.lov.desc; + seq_printf(m, "%u\n", desc->ld_active_tgt_count); + return 0; +} +LPROC_SEQ_FOPS_RO(lov_activeobd); + +static int lov_desc_uuid_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = (struct obd_device *)m->private; + struct lov_obd *lov; + + LASSERT(dev != NULL); + lov = &dev->u.lov; + seq_printf(m, "%s\n", lov->desc.ld_uuid.uuid); + return 0; +} +LPROC_SEQ_FOPS_RO(lov_desc_uuid); + +static void *lov_tgt_seq_start(struct seq_file *p, loff_t *pos) +{ + struct obd_device *dev = p->private; + struct lov_obd *lov = &dev->u.lov; + + while (*pos < lov->desc.ld_tgt_count) { + if (lov->lov_tgts[*pos]) + return lov->lov_tgts[*pos]; + ++*pos; + } + return NULL; +} + +static void lov_tgt_seq_stop(struct seq_file *p, void *v) +{ +} + +static void *lov_tgt_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + struct obd_device *dev = p->private; + struct lov_obd *lov = &dev->u.lov; + + while (++*pos < lov->desc.ld_tgt_count) { + if (lov->lov_tgts[*pos]) + return lov->lov_tgts[*pos]; + } + return NULL; +} + +static int lov_tgt_seq_show(struct seq_file *p, void *v) +{ + struct lov_tgt_desc *tgt = v; + + seq_printf(p, "%d: %s %sACTIVE\n", + tgt->ltd_index, obd_uuid2str(&tgt->ltd_uuid), + tgt->ltd_active ? "" : "IN"); + return 0; +} + +static const struct seq_operations lov_tgt_sops = { + .start = lov_tgt_seq_start, + .stop = lov_tgt_seq_stop, + .next = lov_tgt_seq_next, + .show = lov_tgt_seq_show, +}; + +static int lov_target_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc; + + rc = seq_open(file, &lov_tgt_sops); + if (rc) + return rc; + + seq = file->private_data; + seq->private = PDE_DATA(inode); + return 0; +} + +LPROC_SEQ_FOPS_RO_TYPE(lov, uuid); +LPROC_SEQ_FOPS_RO_TYPE(lov, filestotal); +LPROC_SEQ_FOPS_RO_TYPE(lov, filesfree); +LPROC_SEQ_FOPS_RO_TYPE(lov, blksize); +LPROC_SEQ_FOPS_RO_TYPE(lov, kbytestotal); +LPROC_SEQ_FOPS_RO_TYPE(lov, kbytesfree); +LPROC_SEQ_FOPS_RO_TYPE(lov, kbytesavail); + +static struct lprocfs_vars lprocfs_lov_obd_vars[] = { + { "uuid", &lov_uuid_fops, NULL, 0 }, + { "stripesize", &lov_stripesize_fops, NULL }, + { "stripeoffset", &lov_stripeoffset_fops, NULL }, + { "stripecount", &lov_stripecount_fops, NULL }, + { "stripetype", &lov_stripetype_fops, NULL }, + { "numobd", &lov_numobd_fops, NULL, 0 }, + { "activeobd", &lov_activeobd_fops, NULL, 0 }, + { "filestotal", &lov_filestotal_fops, NULL, 0 }, + { "filesfree", &lov_filesfree_fops, NULL, 0 }, + /*{ "filegroups", lprocfs_rd_filegroups, NULL, 0 },*/ + { "blocksize", &lov_blksize_fops, NULL, 0 }, + { "kbytestotal", &lov_kbytestotal_fops, NULL, 0 }, + { "kbytesfree", &lov_kbytesfree_fops, NULL, 0 }, + { "kbytesavail", &lov_kbytesavail_fops, NULL, 0 }, + { "desc_uuid", &lov_desc_uuid_fops, NULL, 0 }, + { NULL } +}; + +LPROC_SEQ_FOPS_RO_TYPE(lov, numrefs); + +static struct lprocfs_vars lprocfs_lov_module_vars[] = { + { "num_refs", &lov_numrefs_fops, NULL, 0 }, + { NULL } +}; + +void lprocfs_lov_init_vars(struct lprocfs_static_vars *lvars) +{ + lvars->module_vars = lprocfs_lov_module_vars; + lvars->obd_vars = lprocfs_lov_obd_vars; +} + +const struct file_operations lov_proc_target_fops = { + .owner = THIS_MODULE, + .open = lov_target_seq_open, + .read = seq_read, + 
.llseek = seq_lseek, + .release = lprocfs_seq_release, +}; diff --git a/kernel/drivers/staging/lustre/lustre/mdc/Makefile b/kernel/drivers/staging/lustre/lustre/mdc/Makefile new file mode 100644 index 000000000..2516551a6 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/mdc/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_LUSTRE_FS) += mdc.o +mdc-y := mdc_request.o mdc_reint.o mdc_lib.o mdc_locks.o +mdc-$(CONFIG_PROC_FS) += lproc_mdc.o diff --git a/kernel/drivers/staging/lustre/lustre/mdc/lproc_mdc.c b/kernel/drivers/staging/lustre/lustre/mdc/lproc_mdc.c new file mode 100644 index 000000000..acfe08e45 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/mdc/lproc_mdc.c @@ -0,0 +1,220 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include "../include/obd_class.h" +#include "../include/lprocfs_status.h" +#include "mdc_internal.h" + +static int mdc_max_rpcs_in_flight_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = m->private; + struct client_obd *cli = &dev->u.cli; + + client_obd_list_lock(&cli->cl_loi_list_lock); + seq_printf(m, "%u\n", cli->cl_max_rpcs_in_flight); + client_obd_list_unlock(&cli->cl_loi_list_lock); + + return 0; +} + +static ssize_t mdc_max_rpcs_in_flight_seq_write(struct file *file, + const char __user *buffer, + size_t count, + loff_t *off) +{ + struct obd_device *dev = + ((struct seq_file *)file->private_data)->private; + struct client_obd *cli = &dev->u.cli; + int val, rc; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + if (val < 1 || val > MDC_MAX_RIF_MAX) + return -ERANGE; + + client_obd_list_lock(&cli->cl_loi_list_lock); + cli->cl_max_rpcs_in_flight = val; + client_obd_list_unlock(&cli->cl_loi_list_lock); + + return count; +} +LPROC_SEQ_FOPS(mdc_max_rpcs_in_flight); + +static int mdc_kuc_open(struct inode *inode, struct file *file) +{ + return single_open(file, NULL, PDE_DATA(inode)); +} + +/* temporary for testing */ +static ssize_t mdc_kuc_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *obd = + ((struct seq_file *)file->private_data)->private; + struct kuc_hdr *lh; + struct hsm_action_list *hal; + struct hsm_action_item *hai; + int len; + int fd, rc; + + rc = lprocfs_write_helper(buffer, count, &fd); + if (rc) + return rc; + + if (fd < 0) + return -ERANGE; + CWARN("message to fd %d\n", fd); + + len = sizeof(*lh) + sizeof(*hal) + MTI_NAME_MAXLEN + + /* for mockup below */ 2 * cfs_size_round(sizeof(*hai)); + + OBD_ALLOC(lh, len); + if (!lh) + return -ENOMEM; + + lh->kuc_magic = KUC_MAGIC; + lh->kuc_transport = KUC_TRANSPORT_HSM; + lh->kuc_msgtype = HMT_ACTION_LIST; + lh->kuc_msglen = len; + + hal = (struct hsm_action_list *)(lh + 1); + hal->hal_version = HAL_VERSION; + hal->hal_archive_id = 1; + hal->hal_flags = 0; + obd_uuid2fsname(hal->hal_fsname, obd->obd_name, MTI_NAME_MAXLEN); + + /* mock up an action list */ + hal->hal_count = 2; + hai = hai_zero(hal); + hai->hai_action = HSMA_ARCHIVE; + hai->hai_fid.f_oid = 5; + hai->hai_len = sizeof(*hai); + hai = hai_next(hai); + hai->hai_action = HSMA_RESTORE; + hai->hai_fid.f_oid = 10; + hai->hai_len = sizeof(*hai); + + /* This works for either broadcast or unicast to a single fd */ + if (fd == 0) { + rc = libcfs_kkuc_group_put(KUC_GRP_HSM, lh); + } else { + struct file *fp = fget(fd); + + rc = libcfs_kkuc_msg_put(fp, lh); + fput(fp); + } + OBD_FREE(lh, len); + if (rc < 0) + return rc; + return count; +} + +struct file_operations mdc_kuc_fops = { + .open = mdc_kuc_open, + .write = mdc_kuc_write, + .release = single_release, +}; + +LPROC_SEQ_FOPS_WR_ONLY(mdc, ping); + +LPROC_SEQ_FOPS_RO_TYPE(mdc, uuid); +LPROC_SEQ_FOPS_RO_TYPE(mdc, connect_flags); +LPROC_SEQ_FOPS_RO_TYPE(mdc, blksize); +LPROC_SEQ_FOPS_RO_TYPE(mdc, kbytestotal); +LPROC_SEQ_FOPS_RO_TYPE(mdc, kbytesfree); +LPROC_SEQ_FOPS_RO_TYPE(mdc, kbytesavail); +LPROC_SEQ_FOPS_RO_TYPE(mdc, filestotal); +LPROC_SEQ_FOPS_RO_TYPE(mdc, filesfree); +LPROC_SEQ_FOPS_RO_TYPE(mdc, server_uuid); +LPROC_SEQ_FOPS_RO_TYPE(mdc, conn_uuid); +LPROC_SEQ_FOPS_RO_TYPE(mdc, timeouts); +LPROC_SEQ_FOPS_RO_TYPE(mdc, state); + +static int mdc_obd_max_pages_per_rpc_seq_show(struct seq_file *m, void *v) +{ + return lprocfs_obd_rd_max_pages_per_rpc(m, m->private); +} 
+LPROC_SEQ_FOPS_RO(mdc_obd_max_pages_per_rpc); + +LPROC_SEQ_FOPS_RW_TYPE(mdc, import); +LPROC_SEQ_FOPS_RW_TYPE(mdc, pinger_recov); + +static struct lprocfs_vars lprocfs_mdc_obd_vars[] = { + { "uuid", &mdc_uuid_fops, NULL, 0 }, + { "ping", &mdc_ping_fops, NULL, 0222 }, + { "connect_flags", &mdc_connect_flags_fops, NULL, 0 }, + { "blocksize", &mdc_blksize_fops, NULL, 0 }, + { "kbytestotal", &mdc_kbytestotal_fops, NULL, 0 }, + { "kbytesfree", &mdc_kbytesfree_fops, NULL, 0 }, + { "kbytesavail", &mdc_kbytesavail_fops, NULL, 0 }, + { "filestotal", &mdc_filestotal_fops, NULL, 0 }, + { "filesfree", &mdc_filesfree_fops, NULL, 0 }, + /*{ "filegroups", lprocfs_rd_filegroups, NULL, 0 },*/ + { "mds_server_uuid", &mdc_server_uuid_fops, NULL, 0 }, + { "mds_conn_uuid", &mdc_conn_uuid_fops, NULL, 0 }, + /* + * FIXME: below proc entry is provided, but not in used, instead + * sbi->sb_md_brw_size is used, the per obd variable should be used + * when CMD is enabled, and dir pages are managed in MDC layer. + * Remember to enable proc write function. + */ + { "max_pages_per_rpc", &mdc_obd_max_pages_per_rpc_fops, NULL, 0 }, + { "max_rpcs_in_flight", &mdc_max_rpcs_in_flight_fops, NULL, 0 }, + { "timeouts", &mdc_timeouts_fops, NULL, 0 }, + { "import", &mdc_import_fops, NULL, 0 }, + { "state", &mdc_state_fops, NULL, 0 }, + { "hsm_nl", &mdc_kuc_fops, NULL, 0200 }, + { "pinger_recov", &mdc_pinger_recov_fops, NULL, 0 }, + { NULL } +}; + +LPROC_SEQ_FOPS_RO_TYPE(mdc, numrefs); + +static struct lprocfs_vars lprocfs_mdc_module_vars[] = { + { "num_refs", &mdc_numrefs_fops, NULL, 0 }, + { NULL } +}; + +void lprocfs_mdc_init_vars(struct lprocfs_static_vars *lvars) +{ + lvars->module_vars = lprocfs_mdc_module_vars; + lvars->obd_vars = lprocfs_mdc_obd_vars; +} diff --git a/kernel/drivers/staging/lustre/lustre/mdc/mdc_internal.h b/kernel/drivers/staging/lustre/lustre/mdc/mdc_internal.h new file mode 100644 index 000000000..81780c943 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/mdc/mdc_internal.h @@ -0,0 +1,181 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#ifndef _MDC_INTERNAL_H +#define _MDC_INTERNAL_H + +#include "../include/lustre_mdc.h" +#include "../include/lustre_mds.h" + +#if defined CONFIG_PROC_FS +void lprocfs_mdc_init_vars(struct lprocfs_static_vars *lvars); +#else +static inline void lprocfs_mdc_init_vars(struct lprocfs_static_vars *lvars) +{ + memset(lvars, 0, sizeof(*lvars)); +} +#endif + +void mdc_pack_body(struct ptlrpc_request *req, const struct lu_fid *fid, + struct obd_capa *oc, __u64 valid, int ea_size, + __u32 suppgid, int flags); +void mdc_pack_capa(struct ptlrpc_request *req, + const struct req_msg_field *field, struct obd_capa *oc); +int mdc_pack_req(struct ptlrpc_request *req, int version, int opc); +void mdc_is_subdir_pack(struct ptlrpc_request *req, const struct lu_fid *pfid, + const struct lu_fid *cfid, int flags); +void mdc_swap_layouts_pack(struct ptlrpc_request *req, + struct md_op_data *op_data); +void mdc_readdir_pack(struct ptlrpc_request *req, __u64 pgoff, __u32 size, + const struct lu_fid *fid, struct obd_capa *oc); +void mdc_getattr_pack(struct ptlrpc_request *req, __u64 valid, int flags, + struct md_op_data *data, int ea_size); +void mdc_setattr_pack(struct ptlrpc_request *req, struct md_op_data *op_data, + void *ea, int ealen, void *ea2, int ea2len); +void mdc_create_pack(struct ptlrpc_request *req, struct md_op_data *op_data, + const void *data, int datalen, __u32 mode, __u32 uid, + __u32 gid, cfs_cap_t capability, __u64 rdev); +void mdc_open_pack(struct ptlrpc_request *req, struct md_op_data *op_data, + __u32 mode, __u64 rdev, __u64 flags, const void *data, + int datalen); +void mdc_unlink_pack(struct ptlrpc_request *req, struct md_op_data *op_data); +void mdc_getxattr_pack(struct ptlrpc_request *req, struct md_op_data *op_data); +void mdc_link_pack(struct ptlrpc_request *req, struct md_op_data *op_data); +void mdc_rename_pack(struct ptlrpc_request *req, struct md_op_data *op_data, + const char *old, int oldlen, const char *new, int newlen); +void mdc_close_pack(struct ptlrpc_request *req, struct md_op_data *op_data); +int mdc_enter_request(struct client_obd *cli); +void mdc_exit_request(struct client_obd *cli); + +/* mdc/mdc_locks.c */ +int mdc_set_lock_data(struct obd_export *exp, + __u64 *lockh, void *data, __u64 *bits); + +int mdc_null_inode(struct obd_export *exp, const struct lu_fid *fid); + +int mdc_find_cbdata(struct obd_export *exp, const struct lu_fid *fid, + ldlm_iterator_t it, void *data); + +int mdc_intent_lock(struct obd_export *exp, + struct md_op_data *, + void *lmm, int lmmsize, + struct lookup_intent *, int, + struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags); +int mdc_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo, + struct lookup_intent *it, struct md_op_data *op_data, + struct lustre_handle *lockh, void *lmm, int lmmsize, + struct ptlrpc_request **req, __u64 extra_lock_flags); + +int mdc_resource_get_unused(struct obd_export *exp, const struct lu_fid *fid, + struct list_head *cancels, ldlm_mode_t mode, + __u64 bits); +/* mdc/mdc_request.c */ +int mdc_fid_alloc(struct obd_export *exp, struct lu_fid *fid, + struct md_op_data *op_data); + +int mdc_open(struct obd_export *exp, u64 ino, int type, int flags, + struct lov_mds_md *lmm, int lmm_size, struct lustre_handle *fh, + struct ptlrpc_request **); + +struct obd_client_handle; + +int mdc_get_lustre_md(struct obd_export *md_exp, struct ptlrpc_request *req, + struct obd_export *dt_exp, struct obd_export *lmv_exp, + struct lustre_md *md); + +int 
mdc_free_lustre_md(struct obd_export *exp, struct lustre_md *md); + +int mdc_set_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och, + struct lookup_intent *it); + +int mdc_clear_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och); +void mdc_commit_open(struct ptlrpc_request *req); +void mdc_replay_open(struct ptlrpc_request *req); + +int mdc_create(struct obd_export *exp, struct md_op_data *op_data, + const void *data, int datalen, int mode, __u32 uid, __u32 gid, + cfs_cap_t capability, __u64 rdev, + struct ptlrpc_request **request); +int mdc_link(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request); +int mdc_rename(struct obd_export *exp, struct md_op_data *op_data, + const char *old, int oldlen, const char *new, int newlen, + struct ptlrpc_request **request); +int mdc_setattr(struct obd_export *exp, struct md_op_data *op_data, + void *ea, int ealen, void *ea2, int ea2len, + struct ptlrpc_request **request, struct md_open_data **mod); +int mdc_unlink(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request); +int mdc_cancel_unused(struct obd_export *exp, const struct lu_fid *fid, + ldlm_policy_data_t *policy, ldlm_mode_t mode, + ldlm_cancel_flags_t flags, void *opaque); + +static inline void mdc_set_capa_size(struct ptlrpc_request *req, + const struct req_msg_field *field, + struct obd_capa *oc) +{ + if (oc == NULL) + req_capsule_set_size(&req->rq_pill, field, RCL_CLIENT, 0); + else + /* it is already calculated as sizeof struct obd_capa */ + ; +} + +int mdc_revalidate_lock(struct obd_export *exp, struct lookup_intent *it, + struct lu_fid *fid, __u64 *bits); + +int mdc_intent_getattr_async(struct obd_export *exp, + struct md_enqueue_info *minfo, + struct ldlm_enqueue_info *einfo); + +ldlm_mode_t mdc_lock_match(struct obd_export *exp, __u64 flags, + const struct lu_fid *fid, ldlm_type_t type, + ldlm_policy_data_t *policy, ldlm_mode_t mode, + struct lustre_handle *lockh); + +static inline int mdc_prep_elc_req(struct obd_export *exp, + struct ptlrpc_request *req, int opc, + struct list_head *cancels, int count) +{ + return ldlm_prep_elc_req(exp, req, LUSTRE_MDS_VERSION, opc, 0, cancels, + count); +} + +#endif diff --git a/kernel/drivers/staging/lustre/lustre/mdc/mdc_lib.c b/kernel/drivers/staging/lustre/lustre/mdc/mdc_lib.c new file mode 100644 index 000000000..d3234cb1e --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/mdc/mdc_lib.c @@ -0,0 +1,593 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. 
+ * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_MDC +#include "../include/lustre_net.h" +#include "../include/lustre/lustre_idl.h" +#include "mdc_internal.h" + + +static void __mdc_pack_body(struct mdt_body *b, __u32 suppgid) +{ + LASSERT(b != NULL); + + b->suppgid = suppgid; + b->uid = from_kuid(&init_user_ns, current_uid()); + b->gid = from_kgid(&init_user_ns, current_gid()); + b->fsuid = from_kuid(&init_user_ns, current_fsuid()); + b->fsgid = from_kgid(&init_user_ns, current_fsgid()); + b->capability = cfs_curproc_cap_pack(); +} + +void mdc_pack_capa(struct ptlrpc_request *req, + const struct req_msg_field *field, + struct obd_capa *oc) +{ + struct req_capsule *pill = &req->rq_pill; + struct lustre_capa *c; + + if (oc == NULL) { + LASSERT(req_capsule_get_size(pill, field, RCL_CLIENT) == 0); + return; + } + + c = req_capsule_client_get(pill, field); + LASSERT(c != NULL); + capa_cpy(c, oc); + DEBUG_CAPA(D_SEC, c, "pack"); +} + +void mdc_is_subdir_pack(struct ptlrpc_request *req, const struct lu_fid *pfid, + const struct lu_fid *cfid, int flags) +{ + struct mdt_body *b = req_capsule_client_get(&req->rq_pill, + &RMF_MDT_BODY); + + if (pfid) { + b->fid1 = *pfid; + b->valid = OBD_MD_FLID; + } + if (cfid) + b->fid2 = *cfid; + b->flags = flags; +} + +void mdc_swap_layouts_pack(struct ptlrpc_request *req, + struct md_op_data *op_data) +{ + struct mdt_body *b = req_capsule_client_get(&req->rq_pill, + &RMF_MDT_BODY); + + __mdc_pack_body(b, op_data->op_suppgids[0]); + b->fid1 = op_data->op_fid1; + b->fid2 = op_data->op_fid2; + b->valid |= OBD_MD_FLID; + + mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1); + mdc_pack_capa(req, &RMF_CAPA2, op_data->op_capa2); +} + +void mdc_pack_body(struct ptlrpc_request *req, + const struct lu_fid *fid, struct obd_capa *oc, + __u64 valid, int ea_size, __u32 suppgid, int flags) +{ + struct mdt_body *b = req_capsule_client_get(&req->rq_pill, + &RMF_MDT_BODY); + LASSERT(b != NULL); + b->valid = valid; + b->eadatasize = ea_size; + b->flags = flags; + __mdc_pack_body(b, suppgid); + if (fid) { + b->fid1 = *fid; + b->valid |= OBD_MD_FLID; + mdc_pack_capa(req, &RMF_CAPA1, oc); + } +} + +void mdc_readdir_pack(struct ptlrpc_request *req, __u64 pgoff, + __u32 size, const struct lu_fid *fid, struct obd_capa *oc) +{ + struct mdt_body *b = req_capsule_client_get(&req->rq_pill, + &RMF_MDT_BODY); + b->fid1 = *fid; + b->valid |= OBD_MD_FLID; + b->size = pgoff; /* !! */ + b->nlink = size; /* !! 
*/ + __mdc_pack_body(b, -1); + b->mode = LUDA_FID | LUDA_TYPE; + + mdc_pack_capa(req, &RMF_CAPA1, oc); +} + +/* packing of MDS records */ +void mdc_create_pack(struct ptlrpc_request *req, struct md_op_data *op_data, + const void *data, int datalen, __u32 mode, + __u32 uid, __u32 gid, cfs_cap_t cap_effective, __u64 rdev) +{ + struct mdt_rec_create *rec; + char *tmp; + __u64 flags; + + CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_create)); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + + + rec->cr_opcode = REINT_CREATE; + rec->cr_fsuid = uid; + rec->cr_fsgid = gid; + rec->cr_cap = cap_effective; + rec->cr_fid1 = op_data->op_fid1; + rec->cr_fid2 = op_data->op_fid2; + rec->cr_mode = mode; + rec->cr_rdev = rdev; + rec->cr_time = op_data->op_mod_time; + rec->cr_suppgid1 = op_data->op_suppgids[0]; + rec->cr_suppgid2 = op_data->op_suppgids[1]; + flags = op_data->op_flags & MF_SOM_LOCAL_FLAGS; + if (op_data->op_bias & MDS_CREATE_VOLATILE) + flags |= MDS_OPEN_VOLATILE; + set_mrc_cr_flags(rec, flags); + rec->cr_bias = op_data->op_bias; + rec->cr_umask = current_umask(); + + mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1); + + tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME); + LOGL0(op_data->op_name, op_data->op_namelen, tmp); + + if (data) { + tmp = req_capsule_client_get(&req->rq_pill, &RMF_EADATA); + memcpy(tmp, data, datalen); + } +} + +static __u64 mds_pack_open_flags(__u64 flags, __u32 mode) +{ + __u64 cr_flags = (flags & (FMODE_READ | FMODE_WRITE | + MDS_OPEN_HAS_EA | MDS_OPEN_HAS_OBJS | + MDS_OPEN_OWNEROVERRIDE | MDS_OPEN_LOCK | + MDS_OPEN_BY_FID | MDS_OPEN_LEASE | + MDS_OPEN_RELEASE)); + if (flags & O_CREAT) + cr_flags |= MDS_OPEN_CREAT; + if (flags & O_EXCL) + cr_flags |= MDS_OPEN_EXCL; + if (flags & O_TRUNC) + cr_flags |= MDS_OPEN_TRUNC; + if (flags & O_APPEND) + cr_flags |= MDS_OPEN_APPEND; + if (flags & O_SYNC) + cr_flags |= MDS_OPEN_SYNC; + if (flags & O_DIRECTORY) + cr_flags |= MDS_OPEN_DIRECTORY; + if (flags & __FMODE_EXEC) + cr_flags |= MDS_FMODE_EXEC; + if (cl_is_lov_delay_create(flags)) + cr_flags |= MDS_OPEN_DELAY_CREATE; + + if (flags & O_NONBLOCK) + cr_flags |= MDS_OPEN_NORESTORE; + + return cr_flags; +} + +/* packing of MDS records */ +void mdc_open_pack(struct ptlrpc_request *req, struct md_op_data *op_data, + __u32 mode, __u64 rdev, __u64 flags, const void *lmm, + int lmmlen) +{ + struct mdt_rec_create *rec; + char *tmp; + __u64 cr_flags; + + CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_create)); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + + /* XXX do something about time, uid, gid */ + rec->cr_opcode = REINT_OPEN; + rec->cr_fsuid = from_kuid(&init_user_ns, current_fsuid()); + rec->cr_fsgid = from_kgid(&init_user_ns, current_fsgid()); + rec->cr_cap = cfs_curproc_cap_pack(); + rec->cr_fid1 = op_data->op_fid1; + rec->cr_fid2 = op_data->op_fid2; + + rec->cr_mode = mode; + cr_flags = mds_pack_open_flags(flags, mode); + rec->cr_rdev = rdev; + rec->cr_time = op_data->op_mod_time; + rec->cr_suppgid1 = op_data->op_suppgids[0]; + rec->cr_suppgid2 = op_data->op_suppgids[1]; + rec->cr_bias = op_data->op_bias; + rec->cr_umask = current_umask(); + rec->cr_old_handle = op_data->op_handle; + + mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1); + /* the next buffer is child capa, which is used for replay, + * will be packed from the data in reply message. 
*/ + + if (op_data->op_name) { + tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME); + LOGL0(op_data->op_name, op_data->op_namelen, tmp); + if (op_data->op_bias & MDS_CREATE_VOLATILE) + cr_flags |= MDS_OPEN_VOLATILE; + } + + if (lmm) { + cr_flags |= MDS_OPEN_HAS_EA; + tmp = req_capsule_client_get(&req->rq_pill, &RMF_EADATA); + memcpy(tmp, lmm, lmmlen); + } + set_mrc_cr_flags(rec, cr_flags); +} + +static inline __u64 attr_pack(unsigned int ia_valid) +{ + __u64 sa_valid = 0; + + if (ia_valid & ATTR_MODE) + sa_valid |= MDS_ATTR_MODE; + if (ia_valid & ATTR_UID) + sa_valid |= MDS_ATTR_UID; + if (ia_valid & ATTR_GID) + sa_valid |= MDS_ATTR_GID; + if (ia_valid & ATTR_SIZE) + sa_valid |= MDS_ATTR_SIZE; + if (ia_valid & ATTR_ATIME) + sa_valid |= MDS_ATTR_ATIME; + if (ia_valid & ATTR_MTIME) + sa_valid |= MDS_ATTR_MTIME; + if (ia_valid & ATTR_CTIME) + sa_valid |= MDS_ATTR_CTIME; + if (ia_valid & ATTR_ATIME_SET) + sa_valid |= MDS_ATTR_ATIME_SET; + if (ia_valid & ATTR_MTIME_SET) + sa_valid |= MDS_ATTR_MTIME_SET; + if (ia_valid & ATTR_FORCE) + sa_valid |= MDS_ATTR_FORCE; + if (ia_valid & ATTR_ATTR_FLAG) + sa_valid |= MDS_ATTR_ATTR_FLAG; + if (ia_valid & ATTR_KILL_SUID) + sa_valid |= MDS_ATTR_KILL_SUID; + if (ia_valid & ATTR_KILL_SGID) + sa_valid |= MDS_ATTR_KILL_SGID; + if (ia_valid & ATTR_CTIME_SET) + sa_valid |= MDS_ATTR_CTIME_SET; + if (ia_valid & ATTR_FROM_OPEN) + sa_valid |= MDS_ATTR_FROM_OPEN; + if (ia_valid & ATTR_BLOCKS) + sa_valid |= MDS_ATTR_BLOCKS; + if (ia_valid & MDS_OPEN_OWNEROVERRIDE) + /* NFSD hack (see bug 5781) */ + sa_valid |= MDS_OPEN_OWNEROVERRIDE; + return sa_valid; +} + +static void mdc_setattr_pack_rec(struct mdt_rec_setattr *rec, + struct md_op_data *op_data) +{ + rec->sa_opcode = REINT_SETATTR; + rec->sa_fsuid = from_kuid(&init_user_ns, current_fsuid()); + rec->sa_fsgid = from_kgid(&init_user_ns, current_fsgid()); + rec->sa_cap = cfs_curproc_cap_pack(); + rec->sa_suppgid = -1; + + rec->sa_fid = op_data->op_fid1; + rec->sa_valid = attr_pack(op_data->op_attr.ia_valid); + rec->sa_mode = op_data->op_attr.ia_mode; + rec->sa_uid = from_kuid(&init_user_ns, op_data->op_attr.ia_uid); + rec->sa_gid = from_kgid(&init_user_ns, op_data->op_attr.ia_gid); + rec->sa_size = op_data->op_attr.ia_size; + rec->sa_blocks = op_data->op_attr_blocks; + rec->sa_atime = LTIME_S(op_data->op_attr.ia_atime); + rec->sa_mtime = LTIME_S(op_data->op_attr.ia_mtime); + rec->sa_ctime = LTIME_S(op_data->op_attr.ia_ctime); + rec->sa_attr_flags = + ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags; + if ((op_data->op_attr.ia_valid & ATTR_GID) && + in_group_p(op_data->op_attr.ia_gid)) + rec->sa_suppgid = + from_kgid(&init_user_ns, op_data->op_attr.ia_gid); + else + rec->sa_suppgid = op_data->op_suppgids[0]; + + rec->sa_bias = op_data->op_bias; +} + +static void mdc_ioepoch_pack(struct mdt_ioepoch *epoch, + struct md_op_data *op_data) +{ + memcpy(&epoch->handle, &op_data->op_handle, sizeof(epoch->handle)); + epoch->ioepoch = op_data->op_ioepoch; + epoch->flags = op_data->op_flags & MF_SOM_LOCAL_FLAGS; +} + +void mdc_setattr_pack(struct ptlrpc_request *req, struct md_op_data *op_data, + void *ea, int ealen, void *ea2, int ea2len) +{ + struct mdt_rec_setattr *rec; + struct mdt_ioepoch *epoch; + struct lov_user_md *lum = NULL; + + CLASSERT(sizeof(struct mdt_rec_reint) == + sizeof(struct mdt_rec_setattr)); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + mdc_setattr_pack_rec(rec, op_data); + + mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1); + + if (op_data->op_flags & (MF_SOM_CHANGE | 
MF_EPOCH_OPEN)) { + epoch = req_capsule_client_get(&req->rq_pill, &RMF_MDT_EPOCH); + mdc_ioepoch_pack(epoch, op_data); + } + + if (ealen == 0) + return; + + lum = req_capsule_client_get(&req->rq_pill, &RMF_EADATA); + if (ea == NULL) { /* Remove LOV EA */ + lum->lmm_magic = LOV_USER_MAGIC_V1; + lum->lmm_stripe_size = 0; + lum->lmm_stripe_count = 0; + lum->lmm_stripe_offset = (typeof(lum->lmm_stripe_offset))(-1); + } else { + memcpy(lum, ea, ealen); + } + + if (ea2len == 0) + return; + + memcpy(req_capsule_client_get(&req->rq_pill, &RMF_LOGCOOKIES), ea2, + ea2len); +} + +void mdc_unlink_pack(struct ptlrpc_request *req, struct md_op_data *op_data) +{ + struct mdt_rec_unlink *rec; + char *tmp; + + CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_unlink)); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + LASSERT(rec != NULL); + + rec->ul_opcode = op_data->op_cli_flags & CLI_RM_ENTRY ? + REINT_RMENTRY : REINT_UNLINK; + rec->ul_fsuid = op_data->op_fsuid; + rec->ul_fsgid = op_data->op_fsgid; + rec->ul_cap = op_data->op_cap; + rec->ul_mode = op_data->op_mode; + rec->ul_suppgid1 = op_data->op_suppgids[0]; + rec->ul_suppgid2 = -1; + rec->ul_fid1 = op_data->op_fid1; + rec->ul_fid2 = op_data->op_fid2; + rec->ul_time = op_data->op_mod_time; + rec->ul_bias = op_data->op_bias; + + mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1); + + tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME); + LASSERT(tmp != NULL); + LOGL0(op_data->op_name, op_data->op_namelen, tmp); +} + +void mdc_link_pack(struct ptlrpc_request *req, struct md_op_data *op_data) +{ + struct mdt_rec_link *rec; + char *tmp; + + CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_link)); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + LASSERT(rec != NULL); + + rec->lk_opcode = REINT_LINK; + rec->lk_fsuid = op_data->op_fsuid; /* current->fsuid; */ + rec->lk_fsgid = op_data->op_fsgid; /* current->fsgid; */ + rec->lk_cap = op_data->op_cap; /* current->cap_effective; */ + rec->lk_suppgid1 = op_data->op_suppgids[0]; + rec->lk_suppgid2 = op_data->op_suppgids[1]; + rec->lk_fid1 = op_data->op_fid1; + rec->lk_fid2 = op_data->op_fid2; + rec->lk_time = op_data->op_mod_time; + rec->lk_bias = op_data->op_bias; + + mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1); + mdc_pack_capa(req, &RMF_CAPA2, op_data->op_capa2); + + tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME); + LOGL0(op_data->op_name, op_data->op_namelen, tmp); +} + +void mdc_rename_pack(struct ptlrpc_request *req, struct md_op_data *op_data, + const char *old, int oldlen, const char *new, int newlen) +{ + struct mdt_rec_rename *rec; + char *tmp; + + CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_rename)); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + + /* XXX do something about time, uid, gid */ + rec->rn_opcode = REINT_RENAME; + rec->rn_fsuid = op_data->op_fsuid; + rec->rn_fsgid = op_data->op_fsgid; + rec->rn_cap = op_data->op_cap; + rec->rn_suppgid1 = op_data->op_suppgids[0]; + rec->rn_suppgid2 = op_data->op_suppgids[1]; + rec->rn_fid1 = op_data->op_fid1; + rec->rn_fid2 = op_data->op_fid2; + rec->rn_time = op_data->op_mod_time; + rec->rn_mode = op_data->op_mode; + rec->rn_bias = op_data->op_bias; + + mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1); + mdc_pack_capa(req, &RMF_CAPA2, op_data->op_capa2); + + tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME); + LOGL0(old, oldlen, tmp); + + if (new) { + tmp = req_capsule_client_get(&req->rq_pill, &RMF_SYMTGT); + LOGL0(new, newlen, tmp); + } +} 
+ +void mdc_getattr_pack(struct ptlrpc_request *req, __u64 valid, int flags, + struct md_op_data *op_data, int ea_size) +{ + struct mdt_body *b = req_capsule_client_get(&req->rq_pill, + &RMF_MDT_BODY); + + b->valid = valid; + if (op_data->op_bias & MDS_CHECK_SPLIT) + b->valid |= OBD_MD_FLCKSPLIT; + if (op_data->op_bias & MDS_CROSS_REF) + b->valid |= OBD_MD_FLCROSSREF; + b->eadatasize = ea_size; + b->flags = flags; + __mdc_pack_body(b, op_data->op_suppgids[0]); + + b->fid1 = op_data->op_fid1; + b->fid2 = op_data->op_fid2; + b->valid |= OBD_MD_FLID; + + mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1); + + if (op_data->op_name) { + char *tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME); + + LOGL0(op_data->op_name, op_data->op_namelen, tmp); + + } +} + +static void mdc_hsm_release_pack(struct ptlrpc_request *req, + struct md_op_data *op_data) +{ + if (op_data->op_bias & MDS_HSM_RELEASE) { + struct close_data *data; + struct ldlm_lock *lock; + + data = req_capsule_client_get(&req->rq_pill, &RMF_CLOSE_DATA); + LASSERT(data != NULL); + + lock = ldlm_handle2lock(&op_data->op_lease_handle); + if (lock != NULL) { + data->cd_handle = lock->l_remote_handle; + ldlm_lock_put(lock); + } + ldlm_cli_cancel(&op_data->op_lease_handle, LCF_LOCAL); + + data->cd_data_version = op_data->op_data_version; + data->cd_fid = op_data->op_fid2; + } +} + +void mdc_close_pack(struct ptlrpc_request *req, struct md_op_data *op_data) +{ + struct mdt_ioepoch *epoch; + struct mdt_rec_setattr *rec; + + epoch = req_capsule_client_get(&req->rq_pill, &RMF_MDT_EPOCH); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + + mdc_setattr_pack_rec(rec, op_data); + mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1); + mdc_ioepoch_pack(epoch, op_data); + mdc_hsm_release_pack(req, op_data); +} + +static int mdc_req_avail(struct client_obd *cli, struct mdc_cache_waiter *mcw) +{ + int rc; + + client_obd_list_lock(&cli->cl_loi_list_lock); + rc = list_empty(&mcw->mcw_entry); + client_obd_list_unlock(&cli->cl_loi_list_lock); + return rc; +}; + +/* We record requests in flight in cli->cl_r_in_flight here. + * There is only one write rpc possible in mdc anyway. If this to change + * in the future - the code may need to be revisited. 
*/ +int mdc_enter_request(struct client_obd *cli) +{ + int rc = 0; + struct mdc_cache_waiter mcw; + struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); + + client_obd_list_lock(&cli->cl_loi_list_lock); + if (cli->cl_r_in_flight >= cli->cl_max_rpcs_in_flight) { + list_add_tail(&mcw.mcw_entry, &cli->cl_cache_waiters); + init_waitqueue_head(&mcw.mcw_waitq); + client_obd_list_unlock(&cli->cl_loi_list_lock); + rc = l_wait_event(mcw.mcw_waitq, mdc_req_avail(cli, &mcw), + &lwi); + if (rc) { + client_obd_list_lock(&cli->cl_loi_list_lock); + if (list_empty(&mcw.mcw_entry)) + cli->cl_r_in_flight--; + list_del_init(&mcw.mcw_entry); + client_obd_list_unlock(&cli->cl_loi_list_lock); + } + } else { + cli->cl_r_in_flight++; + client_obd_list_unlock(&cli->cl_loi_list_lock); + } + return rc; +} + +void mdc_exit_request(struct client_obd *cli) +{ + struct list_head *l, *tmp; + struct mdc_cache_waiter *mcw; + + client_obd_list_lock(&cli->cl_loi_list_lock); + cli->cl_r_in_flight--; + list_for_each_safe(l, tmp, &cli->cl_cache_waiters) { + if (cli->cl_r_in_flight >= cli->cl_max_rpcs_in_flight) { + /* No free request slots anymore */ + break; + } + + mcw = list_entry(l, struct mdc_cache_waiter, mcw_entry); + list_del_init(&mcw->mcw_entry); + cli->cl_r_in_flight++; + wake_up(&mcw->mcw_waitq); + } + /* Empty waiting list? Decrease reqs in-flight number */ + + client_obd_list_unlock(&cli->cl_loi_list_lock); +} diff --git a/kernel/drivers/staging/lustre/lustre/mdc/mdc_locks.c b/kernel/drivers/staging/lustre/lustre/mdc/mdc_locks.c new file mode 100644 index 000000000..d1c224ecd --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/mdc/mdc_locks.c @@ -0,0 +1,1313 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#define DEBUG_SUBSYSTEM S_MDC + +# include + +#include "../include/lustre_intent.h" +#include "../include/obd.h" +#include "../include/obd_class.h" +#include "../include/lustre_dlm.h" +#include "../include/lustre_fid.h" /* fid_res_name_eq() */ +#include "../include/lustre_mdc.h" +#include "../include/lustre_net.h" +#include "../include/lustre_req_layout.h" +#include "mdc_internal.h" + +struct mdc_getattr_args { + struct obd_export *ga_exp; + struct md_enqueue_info *ga_minfo; + struct ldlm_enqueue_info *ga_einfo; +}; + +int it_disposition(struct lookup_intent *it, int flag) +{ + return it->d.lustre.it_disposition & flag; +} +EXPORT_SYMBOL(it_disposition); + +void it_set_disposition(struct lookup_intent *it, int flag) +{ + it->d.lustre.it_disposition |= flag; +} +EXPORT_SYMBOL(it_set_disposition); + +void it_clear_disposition(struct lookup_intent *it, int flag) +{ + it->d.lustre.it_disposition &= ~flag; +} +EXPORT_SYMBOL(it_clear_disposition); + +int it_open_error(int phase, struct lookup_intent *it) +{ + if (it_disposition(it, DISP_OPEN_LEASE)) { + if (phase >= DISP_OPEN_LEASE) + return it->d.lustre.it_status; + else + return 0; + } + if (it_disposition(it, DISP_OPEN_OPEN)) { + if (phase >= DISP_OPEN_OPEN) + return it->d.lustre.it_status; + else + return 0; + } + + if (it_disposition(it, DISP_OPEN_CREATE)) { + if (phase >= DISP_OPEN_CREATE) + return it->d.lustre.it_status; + else + return 0; + } + + if (it_disposition(it, DISP_LOOKUP_EXECD)) { + if (phase >= DISP_LOOKUP_EXECD) + return it->d.lustre.it_status; + else + return 0; + } + + if (it_disposition(it, DISP_IT_EXECD)) { + if (phase >= DISP_IT_EXECD) + return it->d.lustre.it_status; + else + return 0; + } + CERROR("it disp: %X, status: %d\n", it->d.lustre.it_disposition, + it->d.lustre.it_status); + LBUG(); + return 0; +} +EXPORT_SYMBOL(it_open_error); + +/* this must be called on a lockh that is known to have a referenced lock */ +int mdc_set_lock_data(struct obd_export *exp, __u64 *lockh, void *data, + __u64 *bits) +{ + struct ldlm_lock *lock; + struct inode *new_inode = data; + + if (bits) + *bits = 0; + + if (!*lockh) + return 0; + + lock = ldlm_handle2lock((struct lustre_handle *)lockh); + + LASSERT(lock != NULL); + lock_res_and_lock(lock); + if (lock->l_resource->lr_lvb_inode && + lock->l_resource->lr_lvb_inode != data) { + struct inode *old_inode = lock->l_resource->lr_lvb_inode; + + LASSERTF(old_inode->i_state & I_FREEING, + "Found existing inode %p/%lu/%u state %lu in lock: setting data to %p/%lu/%u\n", + old_inode, old_inode->i_ino, old_inode->i_generation, + old_inode->i_state, new_inode, new_inode->i_ino, + new_inode->i_generation); + } + lock->l_resource->lr_lvb_inode = new_inode; + if (bits) + *bits = lock->l_policy_data.l_inodebits.bits; + + unlock_res_and_lock(lock); + LDLM_LOCK_PUT(lock); + + return 0; +} + +ldlm_mode_t mdc_lock_match(struct obd_export *exp, __u64 flags, + const struct lu_fid *fid, ldlm_type_t type, + ldlm_policy_data_t *policy, ldlm_mode_t mode, + struct lustre_handle *lockh) +{ + struct ldlm_res_id res_id; + ldlm_mode_t rc; + + fid_build_reg_res_name(fid, &res_id); + /* LU-4405: Clear bits not supported by server */ + policy->l_inodebits.bits &= exp_connect_ibits(exp); + rc = ldlm_lock_match(class_exp2obd(exp)->obd_namespace, flags, + &res_id, type, policy, mode, lockh, 0); + return rc; +} + +int mdc_cancel_unused(struct obd_export *exp, + const struct lu_fid *fid, + ldlm_policy_data_t *policy, + ldlm_mode_t mode, + ldlm_cancel_flags_t flags, + void *opaque) +{ + struct ldlm_res_id res_id; + 
struct obd_device *obd = class_exp2obd(exp); + int rc; + + fid_build_reg_res_name(fid, &res_id); + rc = ldlm_cli_cancel_unused_resource(obd->obd_namespace, &res_id, + policy, mode, flags, opaque); + return rc; +} + +int mdc_null_inode(struct obd_export *exp, + const struct lu_fid *fid) +{ + struct ldlm_res_id res_id; + struct ldlm_resource *res; + struct ldlm_namespace *ns = class_exp2obd(exp)->obd_namespace; + + LASSERTF(ns != NULL, "no namespace passed\n"); + + fid_build_reg_res_name(fid, &res_id); + + res = ldlm_resource_get(ns, NULL, &res_id, 0, 0); + if (res == NULL) + return 0; + + lock_res(res); + res->lr_lvb_inode = NULL; + unlock_res(res); + + ldlm_resource_putref(res); + return 0; +} + +/* find any ldlm lock of the inode in mdc + * return 0 not find + * 1 find one + * < 0 error */ +int mdc_find_cbdata(struct obd_export *exp, + const struct lu_fid *fid, + ldlm_iterator_t it, void *data) +{ + struct ldlm_res_id res_id; + int rc = 0; + + fid_build_reg_res_name((struct lu_fid *)fid, &res_id); + rc = ldlm_resource_iterate(class_exp2obd(exp)->obd_namespace, &res_id, + it, data); + if (rc == LDLM_ITER_STOP) + return 1; + else if (rc == LDLM_ITER_CONTINUE) + return 0; + return rc; +} + +static inline void mdc_clear_replay_flag(struct ptlrpc_request *req, int rc) +{ + /* Don't hold error requests for replay. */ + if (req->rq_replay) { + spin_lock(&req->rq_lock); + req->rq_replay = 0; + spin_unlock(&req->rq_lock); + } + if (rc && req->rq_transno != 0) { + DEBUG_REQ(D_ERROR, req, "transno returned on error rc %d", rc); + LBUG(); + } +} + +/* Save a large LOV EA into the request buffer so that it is available + * for replay. We don't do this in the initial request because the + * original request doesn't need this buffer (at most it sends just the + * lov_mds_md) and it is a waste of RAM/bandwidth to send the empty + * buffer and may also be difficult to allocate and save a very large + * request buffer for each open. (bug 5707) + * + * OOM here may cause recovery failure if lmm is needed (only for the + * original open if the MDS crashed just when this client also OOM'd) + * but this is incredibly unlikely, and questionable whether the client + * could do MDS recovery under OOM anyways... */ +static void mdc_realloc_openmsg(struct ptlrpc_request *req, + struct mdt_body *body) +{ + int rc; + + /* FIXME: remove this explicit offset. */ + rc = sptlrpc_cli_enlarge_reqbuf(req, DLM_INTENT_REC_OFF + 4, + body->eadatasize); + if (rc) { + CERROR("Can't enlarge segment %d size to %d\n", + DLM_INTENT_REC_OFF + 4, body->eadatasize); + body->valid &= ~OBD_MD_FLEASIZE; + body->eadatasize = 0; + } +} + +static struct ptlrpc_request *mdc_intent_open_pack(struct obd_export *exp, + struct lookup_intent *it, + struct md_op_data *op_data, + void *lmm, int lmmsize, + void *cb_data) +{ + struct ptlrpc_request *req; + struct obd_device *obddev = class_exp2obd(exp); + struct ldlm_intent *lit; + LIST_HEAD(cancels); + int count = 0; + int mode; + int rc; + + it->it_create_mode = (it->it_create_mode & ~S_IFMT) | S_IFREG; + + /* XXX: openlock is not cancelled for cross-refs. */ + /* If inode is known, cancel conflicting OPEN locks. 
*/ + if (fid_is_sane(&op_data->op_fid2)) { + if (it->it_flags & MDS_OPEN_LEASE) { /* try to get lease */ + if (it->it_flags & FMODE_WRITE) + mode = LCK_EX; + else + mode = LCK_PR; + } else { + if (it->it_flags & (FMODE_WRITE|MDS_OPEN_TRUNC)) + mode = LCK_CW; + else if (it->it_flags & __FMODE_EXEC) + mode = LCK_PR; + else + mode = LCK_CR; + } + count = mdc_resource_get_unused(exp, &op_data->op_fid2, + &cancels, mode, + MDS_INODELOCK_OPEN); + } + + /* If CREATE, cancel parent's UPDATE lock. */ + if (it->it_op & IT_CREAT) + mode = LCK_EX; + else + mode = LCK_CR; + count += mdc_resource_get_unused(exp, &op_data->op_fid1, + &cancels, mode, + MDS_INODELOCK_UPDATE); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_LDLM_INTENT_OPEN); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + return ERR_PTR(-ENOMEM); + } + + /* parent capability */ + mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1); + /* child capability, reserve the size according to parent capa, it will + * be filled after we get the reply */ + mdc_set_capa_size(req, &RMF_CAPA2, op_data->op_capa1); + + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + op_data->op_namelen + 1); + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, + max(lmmsize, obddev->u.cli.cl_default_mds_easize)); + + rc = ldlm_prep_enqueue_req(exp, req, &cancels, count); + if (rc < 0) { + ptlrpc_request_free(req); + return ERR_PTR(rc); + } + + spin_lock(&req->rq_lock); + req->rq_replay = req->rq_import->imp_replayable; + spin_unlock(&req->rq_lock); + + /* pack the intent */ + lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); + lit->opc = (__u64)it->it_op; + + /* pack the intended request */ + mdc_open_pack(req, op_data, it->it_create_mode, 0, it->it_flags, lmm, + lmmsize); + + /* for remote client, fetch remote perm for current user */ + if (client_is_remote(exp)) + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, + sizeof(struct mdt_remote_perm)); + ptlrpc_request_set_replen(req); + return req; +} + +static struct ptlrpc_request * +mdc_intent_getxattr_pack(struct obd_export *exp, + struct lookup_intent *it, + struct md_op_data *op_data) +{ + struct ptlrpc_request *req; + struct ldlm_intent *lit; + int rc, count = 0, maxdata; + LIST_HEAD(cancels); + + + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_LDLM_INTENT_GETXATTR); + if (req == NULL) + return ERR_PTR(-ENOMEM); + + mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1); + + rc = ldlm_prep_enqueue_req(exp, req, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + return ERR_PTR(rc); + } + + /* pack the intent */ + lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); + lit->opc = IT_GETXATTR; + + maxdata = class_exp2cliimp(exp)->imp_connect_data.ocd_max_easize; + + /* pack the intended request */ + mdc_pack_body(req, &op_data->op_fid1, op_data->op_capa1, + op_data->op_valid, maxdata, -1, 0); + + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, + RCL_SERVER, maxdata); + + req_capsule_set_size(&req->rq_pill, &RMF_EAVALS, + RCL_SERVER, maxdata); + + req_capsule_set_size(&req->rq_pill, &RMF_EAVALS_LENS, + RCL_SERVER, maxdata); + + ptlrpc_request_set_replen(req); + + return req; +} + +static struct ptlrpc_request *mdc_intent_unlink_pack(struct obd_export *exp, + struct lookup_intent *it, + struct md_op_data *op_data) +{ + struct ptlrpc_request *req; + struct obd_device *obddev = class_exp2obd(exp); + struct ldlm_intent *lit; + int rc; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + 
&RQF_LDLM_INTENT_UNLINK); + if (req == NULL) + return ERR_PTR(-ENOMEM); + + mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1); + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + op_data->op_namelen + 1); + + rc = ldlm_prep_enqueue_req(exp, req, NULL, 0); + if (rc) { + ptlrpc_request_free(req); + return ERR_PTR(rc); + } + + /* pack the intent */ + lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); + lit->opc = (__u64)it->it_op; + + /* pack the intended request */ + mdc_unlink_pack(req, op_data); + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + obddev->u.cli.cl_default_mds_easize); + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, + obddev->u.cli.cl_default_mds_cookiesize); + ptlrpc_request_set_replen(req); + return req; +} + +static struct ptlrpc_request *mdc_intent_getattr_pack(struct obd_export *exp, + struct lookup_intent *it, + struct md_op_data *op_data) +{ + struct ptlrpc_request *req; + struct obd_device *obddev = class_exp2obd(exp); + u64 valid = OBD_MD_FLGETATTR | OBD_MD_FLEASIZE | + OBD_MD_FLMODEASIZE | OBD_MD_FLDIREA | + OBD_MD_FLMDSCAPA | OBD_MD_MEA | + (client_is_remote(exp) ? + OBD_MD_FLRMTPERM : OBD_MD_FLACL); + struct ldlm_intent *lit; + int rc; + int easize; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_LDLM_INTENT_GETATTR); + if (req == NULL) + return ERR_PTR(-ENOMEM); + + mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1); + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + op_data->op_namelen + 1); + + rc = ldlm_prep_enqueue_req(exp, req, NULL, 0); + if (rc) { + ptlrpc_request_free(req); + return ERR_PTR(rc); + } + + /* pack the intent */ + lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); + lit->opc = (__u64)it->it_op; + + if (obddev->u.cli.cl_default_mds_easize > 0) + easize = obddev->u.cli.cl_default_mds_easize; + else + easize = obddev->u.cli.cl_max_mds_easize; + + /* pack the intended request */ + mdc_getattr_pack(req, valid, it->it_flags, op_data, easize); + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, easize); + if (client_is_remote(exp)) + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, + sizeof(struct mdt_remote_perm)); + ptlrpc_request_set_replen(req); + return req; +} + +static struct ptlrpc_request *mdc_intent_layout_pack(struct obd_export *exp, + struct lookup_intent *it, + struct md_op_data *unused) +{ + struct obd_device *obd = class_exp2obd(exp); + struct ptlrpc_request *req; + struct ldlm_intent *lit; + struct layout_intent *layout; + int rc; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_LDLM_INTENT_LAYOUT); + if (req == NULL) + return ERR_PTR(-ENOMEM); + + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, 0); + rc = ldlm_prep_enqueue_req(exp, req, NULL, 0); + if (rc) { + ptlrpc_request_free(req); + return ERR_PTR(rc); + } + + /* pack the intent */ + lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); + lit->opc = (__u64)it->it_op; + + /* pack the layout intent request */ + layout = req_capsule_client_get(&req->rq_pill, &RMF_LAYOUT_INTENT); + /* LAYOUT_INTENT_ACCESS is generic, specific operation will be + * set for replication */ + layout->li_opc = LAYOUT_INTENT_ACCESS; + + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, + obd->u.cli.cl_default_mds_easize); + ptlrpc_request_set_replen(req); + return req; +} + +static struct ptlrpc_request * +mdc_enqueue_pack(struct obd_export *exp, int lvb_len) +{ + struct ptlrpc_request *req; + int rc; + + req = 
ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LDLM_ENQUEUE); + if (req == NULL) + return ERR_PTR(-ENOMEM); + + rc = ldlm_prep_enqueue_req(exp, req, NULL, 0); + if (rc) { + ptlrpc_request_free(req); + return ERR_PTR(rc); + } + + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, lvb_len); + ptlrpc_request_set_replen(req); + return req; +} + +static int mdc_finish_enqueue(struct obd_export *exp, + struct ptlrpc_request *req, + struct ldlm_enqueue_info *einfo, + struct lookup_intent *it, + struct lustre_handle *lockh, + int rc) +{ + struct req_capsule *pill = &req->rq_pill; + struct ldlm_request *lockreq; + struct ldlm_reply *lockrep; + struct lustre_intent_data *intent = &it->d.lustre; + struct ldlm_lock *lock; + void *lvb_data = NULL; + int lvb_len = 0; + + LASSERT(rc >= 0); + /* Similarly, if we're going to replay this request, we don't want to + * actually get a lock, just perform the intent. */ + if (req->rq_transno || req->rq_replay) { + lockreq = req_capsule_client_get(pill, &RMF_DLM_REQ); + lockreq->lock_flags |= ldlm_flags_to_wire(LDLM_FL_INTENT_ONLY); + } + + if (rc == ELDLM_LOCK_ABORTED) { + einfo->ei_mode = 0; + memset(lockh, 0, sizeof(*lockh)); + rc = 0; + } else { /* rc = 0 */ + lock = ldlm_handle2lock(lockh); + LASSERT(lock != NULL); + + /* If the server gave us back a different lock mode, we should + * fix up our variables. */ + if (lock->l_req_mode != einfo->ei_mode) { + ldlm_lock_addref(lockh, lock->l_req_mode); + ldlm_lock_decref(lockh, einfo->ei_mode); + einfo->ei_mode = lock->l_req_mode; + } + LDLM_LOCK_PUT(lock); + } + + lockrep = req_capsule_server_get(pill, &RMF_DLM_REP); + LASSERT(lockrep != NULL); /* checked by ldlm_cli_enqueue() */ + + intent->it_disposition = (int)lockrep->lock_policy_res1; + intent->it_status = (int)lockrep->lock_policy_res2; + intent->it_lock_mode = einfo->ei_mode; + intent->it_lock_handle = lockh->cookie; + intent->it_data = req; + + /* Technically speaking rq_transno must already be zero if + * it_status is in error, so the check is a bit redundant */ + if ((!req->rq_transno || intent->it_status < 0) && req->rq_replay) + mdc_clear_replay_flag(req, intent->it_status); + + /* If we're doing an IT_OPEN which did not result in an actual + * successful open, then we need to remove the bit which saves + * this request for unconditional replay. + * + * It's important that we do this first! Otherwise we might exit the + * function without doing so, and try to replay a failed create + * (bug 3440) */ + if (it->it_op & IT_OPEN && req->rq_replay && + (!it_disposition(it, DISP_OPEN_OPEN) || intent->it_status != 0)) + mdc_clear_replay_flag(req, intent->it_status); + + DEBUG_REQ(D_RPCTRACE, req, "op: %d disposition: %x, status: %d", + it->it_op, intent->it_disposition, intent->it_status); + + /* We know what to expect, so we do any byte flipping required here */ + if (it->it_op & (IT_OPEN | IT_UNLINK | IT_LOOKUP | IT_GETATTR)) { + struct mdt_body *body; + + body = req_capsule_server_get(pill, &RMF_MDT_BODY); + if (body == NULL) { + CERROR("Can't swab mdt_body\n"); + return -EPROTO; + } + + if (it_disposition(it, DISP_OPEN_OPEN) && + !it_open_error(DISP_OPEN_OPEN, it)) { + /* + * If this is a successful OPEN request, we need to set + * replay handler and data early, so that if replay + * happens immediately after swabbing below, new reply + * is swabbed by that handler correctly. 
+ */ + mdc_set_open_replay_data(NULL, NULL, it); + } + + if ((body->valid & (OBD_MD_FLDIREA | OBD_MD_FLEASIZE)) != 0) { + void *eadata; + + mdc_update_max_ea_from_body(exp, body); + + /* + * The eadata is opaque; just check that it is there. + * Eventually, obd_unpackmd() will check the contents. + */ + eadata = req_capsule_server_sized_get(pill, &RMF_MDT_MD, + body->eadatasize); + if (eadata == NULL) + return -EPROTO; + + /* save lvb data and length in case this is for layout + * lock */ + lvb_data = eadata; + lvb_len = body->eadatasize; + + /* + * We save the reply LOV EA in case we have to replay a + * create for recovery. If we didn't allocate a large + * enough request buffer above we need to reallocate it + * here to hold the actual LOV EA. + * + * To not save LOV EA if request is not going to replay + * (for example error one). + */ + if ((it->it_op & IT_OPEN) && req->rq_replay) { + void *lmm; + + if (req_capsule_get_size(pill, &RMF_EADATA, + RCL_CLIENT) < + body->eadatasize) + mdc_realloc_openmsg(req, body); + else + req_capsule_shrink(pill, &RMF_EADATA, + body->eadatasize, + RCL_CLIENT); + + req_capsule_set_size(pill, &RMF_EADATA, + RCL_CLIENT, + body->eadatasize); + + lmm = req_capsule_client_get(pill, &RMF_EADATA); + if (lmm) + memcpy(lmm, eadata, body->eadatasize); + } + } + + if (body->valid & OBD_MD_FLRMTPERM) { + struct mdt_remote_perm *perm; + + LASSERT(client_is_remote(exp)); + perm = req_capsule_server_swab_get(pill, &RMF_ACL, + lustre_swab_mdt_remote_perm); + if (perm == NULL) + return -EPROTO; + } + if (body->valid & OBD_MD_FLMDSCAPA) { + struct lustre_capa *capa, *p; + + capa = req_capsule_server_get(pill, &RMF_CAPA1); + if (capa == NULL) + return -EPROTO; + + if (it->it_op & IT_OPEN) { + /* client fid capa will be checked in replay */ + p = req_capsule_client_get(pill, &RMF_CAPA2); + LASSERT(p); + *p = *capa; + } + } + if (body->valid & OBD_MD_FLOSSCAPA) { + struct lustre_capa *capa; + + capa = req_capsule_server_get(pill, &RMF_CAPA2); + if (capa == NULL) + return -EPROTO; + } + } else if (it->it_op & IT_LAYOUT) { + /* maybe the lock was granted right away and layout + * is packed into RMF_DLM_LVB of req */ + lvb_len = req_capsule_get_size(pill, &RMF_DLM_LVB, RCL_SERVER); + if (lvb_len > 0) { + lvb_data = req_capsule_server_sized_get(pill, + &RMF_DLM_LVB, lvb_len); + if (lvb_data == NULL) + return -EPROTO; + } + } + + /* fill in stripe data for layout lock */ + lock = ldlm_handle2lock(lockh); + if (lock != NULL && ldlm_has_layout(lock) && lvb_data != NULL) { + void *lmm; + + LDLM_DEBUG(lock, "layout lock returned by: %s, lvb_len: %d\n", + ldlm_it2str(it->it_op), lvb_len); + + OBD_ALLOC_LARGE(lmm, lvb_len); + if (lmm == NULL) { + LDLM_LOCK_PUT(lock); + return -ENOMEM; + } + memcpy(lmm, lvb_data, lvb_len); + + /* install lvb_data */ + lock_res_and_lock(lock); + if (lock->l_lvb_data == NULL) { + lock->l_lvb_type = LVB_T_LAYOUT; + lock->l_lvb_data = lmm; + lock->l_lvb_len = lvb_len; + lmm = NULL; + } + unlock_res_and_lock(lock); + if (lmm != NULL) + OBD_FREE_LARGE(lmm, lvb_len); + } + if (lock != NULL) + LDLM_LOCK_PUT(lock); + + return rc; +} + +/* We always reserve enough space in the reply packet for a stripe MD, because + * we don't know in advance the file type. 
*/ +int mdc_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo, + struct lookup_intent *it, struct md_op_data *op_data, + struct lustre_handle *lockh, void *lmm, int lmmsize, + struct ptlrpc_request **reqp, u64 extra_lock_flags) +{ + static const ldlm_policy_data_t lookup_policy = { + .l_inodebits = { MDS_INODELOCK_LOOKUP } + }; + static const ldlm_policy_data_t update_policy = { + .l_inodebits = { MDS_INODELOCK_UPDATE } + }; + static const ldlm_policy_data_t layout_policy = { + .l_inodebits = { MDS_INODELOCK_LAYOUT } + }; + static const ldlm_policy_data_t getxattr_policy = { + .l_inodebits = { MDS_INODELOCK_XATTR } + }; + ldlm_policy_data_t const *policy = &lookup_policy; + struct obd_device *obddev = class_exp2obd(exp); + struct ptlrpc_request *req; + u64 flags, saved_flags = extra_lock_flags; + struct ldlm_res_id res_id; + int generation, resends = 0; + struct ldlm_reply *lockrep; + enum lvb_type lvb_type = LVB_T_NONE; + int rc; + + LASSERTF(!it || einfo->ei_type == LDLM_IBITS, "lock type %d\n", + einfo->ei_type); + + fid_build_reg_res_name(&op_data->op_fid1, &res_id); + + if (it) { + saved_flags |= LDLM_FL_HAS_INTENT; + if (it->it_op & (IT_UNLINK | IT_GETATTR | IT_READDIR)) + policy = &update_policy; + else if (it->it_op & IT_LAYOUT) + policy = &layout_policy; + else if (it->it_op & (IT_GETXATTR | IT_SETXATTR)) + policy = &getxattr_policy; + } + + LASSERT(reqp == NULL); + + generation = obddev->u.cli.cl_import->imp_generation; +resend: + flags = saved_flags; + if (!it) { + /* The only way right now is FLOCK, in this case we hide flock + policy as lmm, but lmmsize is 0 */ + LASSERT(lmm && lmmsize == 0); + LASSERTF(einfo->ei_type == LDLM_FLOCK, "lock type %d\n", + einfo->ei_type); + policy = (ldlm_policy_data_t *)lmm; + res_id.name[3] = LDLM_FLOCK; + req = NULL; + } else if (it->it_op & IT_OPEN) { + req = mdc_intent_open_pack(exp, it, op_data, lmm, lmmsize, + einfo->ei_cbdata); + policy = &update_policy; + einfo->ei_cbdata = NULL; + lmm = NULL; + } else if (it->it_op & IT_UNLINK) { + req = mdc_intent_unlink_pack(exp, it, op_data); + } else if (it->it_op & (IT_GETATTR | IT_LOOKUP)) { + req = mdc_intent_getattr_pack(exp, it, op_data); + } else if (it->it_op & IT_READDIR) { + req = mdc_enqueue_pack(exp, 0); + } else if (it->it_op & IT_LAYOUT) { + if (!imp_connect_lvb_type(class_exp2cliimp(exp))) + return -EOPNOTSUPP; + req = mdc_intent_layout_pack(exp, it, op_data); + lvb_type = LVB_T_LAYOUT; + } else if (it->it_op & IT_GETXATTR) { + req = mdc_intent_getxattr_pack(exp, it, op_data); + } else { + LBUG(); + return -EINVAL; + } + + if (IS_ERR(req)) + return PTR_ERR(req); + + if (req != NULL && it && it->it_op & IT_CREAT) + /* ask ptlrpc not to resend on EINPROGRESS since we have our own + * retry logic */ + req->rq_no_retry_einprogress = 1; + + if (resends) { + req->rq_generation_set = 1; + req->rq_import_generation = generation; + req->rq_sent = get_seconds() + resends; + } + + /* It is important to obtain rpc_lock first (if applicable), so that + * threads that are serialised with rpc_lock are not polluting our + * rpcs in flight counter. 
We do not do flock request limiting, though*/ + if (it) { + mdc_get_rpc_lock(obddev->u.cli.cl_rpc_lock, it); + rc = mdc_enter_request(&obddev->u.cli); + if (rc != 0) { + mdc_put_rpc_lock(obddev->u.cli.cl_rpc_lock, it); + mdc_clear_replay_flag(req, 0); + ptlrpc_req_finished(req); + return rc; + } + } + + rc = ldlm_cli_enqueue(exp, &req, einfo, &res_id, policy, &flags, NULL, + 0, lvb_type, lockh, 0); + if (!it) { + /* For flock requests we immediately return without further + delay and let caller deal with the rest, since rest of + this function metadata processing makes no sense for flock + requests anyway. But in case of problem during comms with + Server (ETIMEDOUT) or any signal/kill attempt (EINTR), we + can not rely on caller and this mainly for F_UNLCKs + (explicits or automatically generated by Kernel to clean + current FLocks upon exit) that can't be trashed */ + if ((rc == -EINTR) || (rc == -ETIMEDOUT)) + goto resend; + return rc; + } + + mdc_exit_request(&obddev->u.cli); + mdc_put_rpc_lock(obddev->u.cli.cl_rpc_lock, it); + + if (rc < 0) { + CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR, + "%s: ldlm_cli_enqueue failed: rc = %d\n", + obddev->obd_name, rc); + + mdc_clear_replay_flag(req, rc); + ptlrpc_req_finished(req); + return rc; + } + + lockrep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + LASSERT(lockrep != NULL); + + lockrep->lock_policy_res2 = + ptlrpc_status_ntoh(lockrep->lock_policy_res2); + + /* Retry the create infinitely when we get -EINPROGRESS from + * server. This is required by the new quota design. */ + if (it && it->it_op & IT_CREAT && + (int)lockrep->lock_policy_res2 == -EINPROGRESS) { + mdc_clear_replay_flag(req, rc); + ptlrpc_req_finished(req); + resends++; + + CDEBUG(D_HA, "%s: resend:%d op:%d "DFID"/"DFID"\n", + obddev->obd_name, resends, it->it_op, + PFID(&op_data->op_fid1), PFID(&op_data->op_fid2)); + + if (generation == obddev->u.cli.cl_import->imp_generation) { + goto resend; + } else { + CDEBUG(D_HA, "resend cross eviction\n"); + return -EIO; + } + } + + rc = mdc_finish_enqueue(exp, req, einfo, it, lockh, rc); + if (rc < 0) { + if (lustre_handle_is_used(lockh)) { + ldlm_lock_decref(lockh, einfo->ei_mode); + memset(lockh, 0, sizeof(*lockh)); + } + ptlrpc_req_finished(req); + + it->d.lustre.it_lock_handle = 0; + it->d.lustre.it_lock_mode = 0; + it->d.lustre.it_data = NULL; + } + + return rc; +} + +static int mdc_finish_intent_lock(struct obd_export *exp, + struct ptlrpc_request *request, + struct md_op_data *op_data, + struct lookup_intent *it, + struct lustre_handle *lockh) +{ + struct lustre_handle old_lock; + struct mdt_body *mdt_body; + struct ldlm_lock *lock; + int rc; + + LASSERT(request != NULL); + LASSERT(request != LP_POISON); + LASSERT(request->rq_repmsg != LP_POISON); + + if (!it_disposition(it, DISP_IT_EXECD)) { + /* The server failed before it even started executing the + * intent, i.e. because it couldn't unpack the request. */ + LASSERT(it->d.lustre.it_status != 0); + return it->d.lustre.it_status; + } + rc = it_open_error(DISP_IT_EXECD, it); + if (rc) + return rc; + + mdt_body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY); + LASSERT(mdt_body != NULL); /* mdc_enqueue checked */ + + /* If we were revalidating a fid/name pair, mark the intent in + * case we fail and get called again from lookup */ + if (fid_is_sane(&op_data->op_fid2) && + it->it_create_mode & M_CHECK_STALE && + it->it_op != IT_GETATTR) { + + /* Also: did we find the same inode? 
*/ + /* sever can return one of two fids: + * op_fid2 - new allocated fid - if file is created. + * op_fid3 - existent fid - if file only open. + * op_fid3 is saved in lmv_intent_open */ + if ((!lu_fid_eq(&op_data->op_fid2, &mdt_body->fid1)) && + (!lu_fid_eq(&op_data->op_fid3, &mdt_body->fid1))) { + CDEBUG(D_DENTRY, "Found stale data "DFID"("DFID")/"DFID + "\n", PFID(&op_data->op_fid2), + PFID(&op_data->op_fid2), PFID(&mdt_body->fid1)); + return -ESTALE; + } + } + + rc = it_open_error(DISP_LOOKUP_EXECD, it); + if (rc) + return rc; + + /* keep requests around for the multiple phases of the call + * this shows the DISP_XX must guarantee we make it into the call + */ + if (!it_disposition(it, DISP_ENQ_CREATE_REF) && + it_disposition(it, DISP_OPEN_CREATE) && + !it_open_error(DISP_OPEN_CREATE, it)) { + it_set_disposition(it, DISP_ENQ_CREATE_REF); + ptlrpc_request_addref(request); /* balanced in ll_create_node */ + } + if (!it_disposition(it, DISP_ENQ_OPEN_REF) && + it_disposition(it, DISP_OPEN_OPEN) && + !it_open_error(DISP_OPEN_OPEN, it)) { + it_set_disposition(it, DISP_ENQ_OPEN_REF); + ptlrpc_request_addref(request); /* balanced in ll_file_open */ + /* BUG 11546 - eviction in the middle of open rpc processing */ + OBD_FAIL_TIMEOUT(OBD_FAIL_MDC_ENQUEUE_PAUSE, obd_timeout); + } + + if (it->it_op & IT_CREAT) { + /* XXX this belongs in ll_create_it */ + } else if (it->it_op == IT_OPEN) { + LASSERT(!it_disposition(it, DISP_OPEN_CREATE)); + } else { + LASSERT(it->it_op & (IT_GETATTR | IT_LOOKUP | IT_LAYOUT)); + } + + /* If we already have a matching lock, then cancel the new + * one. We have to set the data here instead of in + * mdc_enqueue, because we need to use the child's inode as + * the l_ast_data to match, and that's not available until + * intent_finish has performed the iget().) */ + lock = ldlm_handle2lock(lockh); + if (lock) { + ldlm_policy_data_t policy = lock->l_policy_data; + + LDLM_DEBUG(lock, "matching against this"); + + LASSERTF(fid_res_name_eq(&mdt_body->fid1, + &lock->l_resource->lr_name), + "Lock res_id: "DLDLMRES", fid: "DFID"\n", + PLDLMRES(lock->l_resource), PFID(&mdt_body->fid1)); + LDLM_LOCK_PUT(lock); + + memcpy(&old_lock, lockh, sizeof(*lockh)); + if (ldlm_lock_match(NULL, LDLM_FL_BLOCK_GRANTED, NULL, + LDLM_IBITS, &policy, LCK_NL, + &old_lock, 0)) { + ldlm_lock_decref_and_cancel(lockh, + it->d.lustre.it_lock_mode); + memcpy(lockh, &old_lock, sizeof(old_lock)); + it->d.lustre.it_lock_handle = lockh->cookie; + } + } + CDEBUG(D_DENTRY, + "D_IT dentry %.*s intent: %s status %d disp %x rc %d\n", + op_data->op_namelen, op_data->op_name, ldlm_it2str(it->it_op), + it->d.lustre.it_status, it->d.lustre.it_disposition, rc); + return rc; +} + +int mdc_revalidate_lock(struct obd_export *exp, struct lookup_intent *it, + struct lu_fid *fid, __u64 *bits) +{ + /* We could just return 1 immediately, but since we should only + * be called in revalidate_it if we already have a lock, let's + * verify that. */ + struct ldlm_res_id res_id; + struct lustre_handle lockh; + ldlm_policy_data_t policy; + ldlm_mode_t mode; + + if (it->d.lustre.it_lock_handle) { + lockh.cookie = it->d.lustre.it_lock_handle; + mode = ldlm_revalidate_lock_handle(&lockh, bits); + } else { + fid_build_reg_res_name(fid, &res_id); + switch (it->it_op) { + case IT_GETATTR: + /* File attributes are held under multiple bits: + * nlink is under lookup lock, size and times are + * under UPDATE lock and recently we've also got + * a separate permissions lock for owner/group/acl that + * were protected by lookup lock before. 
+ * Getattr must provide all of that information, + * so we need to ensure we have all of those locks. + * Unfortunately, if the bits are split across multiple + * locks, there's no easy way to match all of them here, + * so an extra RPC would be performed to fetch all + * of those bits at once for now. */ + /* For new MDTs(> 2.4), UPDATE|PERM should be enough, + * but for old MDTs (< 2.4), permission is covered + * by LOOKUP lock, so it needs to match all bits here.*/ + policy.l_inodebits.bits = MDS_INODELOCK_UPDATE | + MDS_INODELOCK_LOOKUP | + MDS_INODELOCK_PERM; + break; + case IT_LAYOUT: + policy.l_inodebits.bits = MDS_INODELOCK_LAYOUT; + break; + default: + policy.l_inodebits.bits = MDS_INODELOCK_LOOKUP; + break; + } + + mode = mdc_lock_match(exp, LDLM_FL_BLOCK_GRANTED, fid, + LDLM_IBITS, &policy, + LCK_CR | LCK_CW | LCK_PR | LCK_PW, + &lockh); + } + + if (mode) { + it->d.lustre.it_lock_handle = lockh.cookie; + it->d.lustre.it_lock_mode = mode; + } else { + it->d.lustre.it_lock_handle = 0; + it->d.lustre.it_lock_mode = 0; + } + + return !!mode; +} + +/* + * This long block is all about fixing up the lock and request state + * so that it is correct as of the moment _before_ the operation was + * applied; that way, the VFS will think that everything is normal and + * call Lustre's regular VFS methods. + * + * If we're performing a creation, that means that unless the creation + * failed with EEXIST, we should fake up a negative dentry. + * + * For everything else, we want to lookup to succeed. + * + * One additional note: if CREATE or OPEN succeeded, we add an extra + * reference to the request because we need to keep it around until + * ll_create/ll_open gets called. + * + * The server will return to us, in it_disposition, an indication of + * exactly what d.lustre.it_status refers to. + * + * If DISP_OPEN_OPEN is set, then d.lustre.it_status refers to the open() call, + * otherwise if DISP_OPEN_CREATE is set, then it status is the + * creation failure mode. In either case, one of DISP_LOOKUP_NEG or + * DISP_LOOKUP_POS will be set, indicating whether the child lookup + * was successful. + * + * Else, if DISP_LOOKUP_EXECD then d.lustre.it_status is the rc of the + * child lookup. + */ +int mdc_intent_lock(struct obd_export *exp, struct md_op_data *op_data, + void *lmm, int lmmsize, struct lookup_intent *it, + int lookup_flags, struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags) +{ + struct ldlm_enqueue_info einfo = { + .ei_type = LDLM_IBITS, + .ei_mode = it_to_lock_mode(it), + .ei_cb_bl = cb_blocking, + .ei_cb_cp = ldlm_completion_ast, + }; + struct lustre_handle lockh; + int rc = 0; + + LASSERT(it); + + CDEBUG(D_DLMTRACE, "(name: %.*s,"DFID") in obj "DFID + ", intent: %s flags %#Lo\n", op_data->op_namelen, + op_data->op_name, PFID(&op_data->op_fid2), + PFID(&op_data->op_fid1), ldlm_it2str(it->it_op), + it->it_flags); + + lockh.cookie = 0; + if (fid_is_sane(&op_data->op_fid2) && + (it->it_op & (IT_LOOKUP | IT_GETATTR))) { + /* We could just return 1 immediately, but since we should only + * be called in revalidate_it if we already have a lock, let's + * verify that. */ + it->d.lustre.it_lock_handle = 0; + rc = mdc_revalidate_lock(exp, it, &op_data->op_fid2, NULL); + /* Only return failure if it was not GETATTR by cfid + (from inode_revalidate) */ + if (rc || op_data->op_namelen != 0) + return rc; + } + + /* For case if upper layer did not alloc fid, do it now. 
*/ + if (!fid_is_sane(&op_data->op_fid2) && it->it_op & IT_CREAT) { + rc = mdc_fid_alloc(exp, &op_data->op_fid2, op_data); + if (rc < 0) { + CERROR("Can't alloc new fid, rc %d\n", rc); + return rc; + } + } + rc = mdc_enqueue(exp, &einfo, it, op_data, &lockh, lmm, lmmsize, NULL, + extra_lock_flags); + if (rc < 0) + return rc; + + *reqp = it->d.lustre.it_data; + rc = mdc_finish_intent_lock(exp, *reqp, op_data, it, &lockh); + return rc; +} + +static int mdc_intent_getattr_async_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + void *args, int rc) +{ + struct mdc_getattr_args *ga = args; + struct obd_export *exp = ga->ga_exp; + struct md_enqueue_info *minfo = ga->ga_minfo; + struct ldlm_enqueue_info *einfo = ga->ga_einfo; + struct lookup_intent *it; + struct lustre_handle *lockh; + struct obd_device *obddev; + struct ldlm_reply *lockrep; + __u64 flags = LDLM_FL_HAS_INTENT; + + it = &minfo->mi_it; + lockh = &minfo->mi_lockh; + + obddev = class_exp2obd(exp); + + mdc_exit_request(&obddev->u.cli); + if (OBD_FAIL_CHECK(OBD_FAIL_MDC_GETATTR_ENQUEUE)) + rc = -ETIMEDOUT; + + rc = ldlm_cli_enqueue_fini(exp, req, einfo->ei_type, 1, einfo->ei_mode, + &flags, NULL, 0, lockh, rc); + if (rc < 0) { + CERROR("ldlm_cli_enqueue_fini: %d\n", rc); + mdc_clear_replay_flag(req, rc); + goto out; + } + + lockrep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + LASSERT(lockrep != NULL); + + lockrep->lock_policy_res2 = + ptlrpc_status_ntoh(lockrep->lock_policy_res2); + + rc = mdc_finish_enqueue(exp, req, einfo, it, lockh, rc); + if (rc) + goto out; + + rc = mdc_finish_intent_lock(exp, req, &minfo->mi_data, it, lockh); + +out: + OBD_FREE_PTR(einfo); + minfo->mi_cb(req, minfo, rc); + return 0; +} + +int mdc_intent_getattr_async(struct obd_export *exp, + struct md_enqueue_info *minfo, + struct ldlm_enqueue_info *einfo) +{ + struct md_op_data *op_data = &minfo->mi_data; + struct lookup_intent *it = &minfo->mi_it; + struct ptlrpc_request *req; + struct mdc_getattr_args *ga; + struct obd_device *obddev = class_exp2obd(exp); + struct ldlm_res_id res_id; + /*XXX: Both MDS_INODELOCK_LOOKUP and MDS_INODELOCK_UPDATE are needed + * for statahead currently. Consider CMD in future, such two bits + * maybe managed by different MDS, should be adjusted then. 
*/ + ldlm_policy_data_t policy = { + .l_inodebits = { MDS_INODELOCK_LOOKUP | + MDS_INODELOCK_UPDATE } + }; + int rc = 0; + __u64 flags = LDLM_FL_HAS_INTENT; + + CDEBUG(D_DLMTRACE, + "name: %.*s in inode "DFID", intent: %s flags %#Lo\n", + op_data->op_namelen, op_data->op_name, PFID(&op_data->op_fid1), + ldlm_it2str(it->it_op), it->it_flags); + + fid_build_reg_res_name(&op_data->op_fid1, &res_id); + req = mdc_intent_getattr_pack(exp, it, op_data); + if (IS_ERR(req)) + return PTR_ERR(req); + + rc = mdc_enter_request(&obddev->u.cli); + if (rc != 0) { + ptlrpc_req_finished(req); + return rc; + } + + rc = ldlm_cli_enqueue(exp, &req, einfo, &res_id, &policy, &flags, NULL, + 0, LVB_T_NONE, &minfo->mi_lockh, 1); + if (rc < 0) { + mdc_exit_request(&obddev->u.cli); + ptlrpc_req_finished(req); + return rc; + } + + CLASSERT(sizeof(*ga) <= sizeof(req->rq_async_args)); + ga = ptlrpc_req_async_args(req); + ga->ga_exp = exp; + ga->ga_minfo = minfo; + ga->ga_einfo = einfo; + + req->rq_interpret_reply = mdc_intent_getattr_async_interpret; + ptlrpcd_add_req(req, PDL_POLICY_LOCAL, -1); + + return 0; +} diff --git a/kernel/drivers/staging/lustre/lustre/mdc/mdc_reint.c b/kernel/drivers/staging/lustre/lustre/mdc/mdc_reint.c new file mode 100644 index 000000000..5e9c6296c --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/mdc/mdc_reint.c @@ -0,0 +1,483 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_MDC + +# include +# include + +#include "../include/obd_class.h" +#include "mdc_internal.h" +#include "../include/lustre_fid.h" + +/* mdc_setattr does its own semaphore handling */ +static int mdc_reint(struct ptlrpc_request *request, + struct mdc_rpc_lock *rpc_lock, + int level) +{ + int rc; + + request->rq_send_state = level; + + mdc_get_rpc_lock(rpc_lock, NULL); + rc = ptlrpc_queue_wait(request); + mdc_put_rpc_lock(rpc_lock, NULL); + if (rc) + CDEBUG(D_INFO, "error in handling %d\n", rc); + else if (!req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY)) + rc = -EPROTO; + + return rc; +} + +/* Find and cancel locally locks matched by inode @bits & @mode in the resource + * found by @fid. Found locks are added into @cancel list. Returns the amount of + * locks added to @cancels list. 
*/ +int mdc_resource_get_unused(struct obd_export *exp, const struct lu_fid *fid, + struct list_head *cancels, ldlm_mode_t mode, + __u64 bits) +{ + struct ldlm_namespace *ns = exp->exp_obd->obd_namespace; + ldlm_policy_data_t policy = {}; + struct ldlm_res_id res_id; + struct ldlm_resource *res; + int count; + + /* Return, i.e. cancel nothing, only if ELC is supported (flag in + * export) but disabled through procfs (flag in NS). + * + * This distinguishes from a case when ELC is not supported originally, + * when we still want to cancel locks in advance and just cancel them + * locally, without sending any RPC. */ + if (exp_connect_cancelset(exp) && !ns_connect_cancelset(ns)) + return 0; + + fid_build_reg_res_name(fid, &res_id); + res = ldlm_resource_get(exp->exp_obd->obd_namespace, + NULL, &res_id, 0, 0); + if (res == NULL) + return 0; + LDLM_RESOURCE_ADDREF(res); + /* Initialize ibits lock policy. */ + policy.l_inodebits.bits = bits; + count = ldlm_cancel_resource_local(res, cancels, &policy, + mode, 0, 0, NULL); + LDLM_RESOURCE_DELREF(res); + ldlm_resource_putref(res); + return count; +} + +int mdc_setattr(struct obd_export *exp, struct md_op_data *op_data, + void *ea, int ealen, void *ea2, int ea2len, + struct ptlrpc_request **request, struct md_open_data **mod) +{ + LIST_HEAD(cancels); + struct ptlrpc_request *req; + struct mdc_rpc_lock *rpc_lock; + struct obd_device *obd = exp->exp_obd; + int count = 0, rc; + __u64 bits; + + LASSERT(op_data != NULL); + + bits = MDS_INODELOCK_UPDATE; + if (op_data->op_attr.ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) + bits |= MDS_INODELOCK_LOOKUP; + if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && + (fid_is_sane(&op_data->op_fid1)) && + !OBD_FAIL_CHECK(OBD_FAIL_LDLM_BL_CALLBACK_NET)) + count = mdc_resource_get_unused(exp, &op_data->op_fid1, + &cancels, LCK_EX, bits); + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_REINT_SETATTR); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + return -ENOMEM; + } + mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1); + if ((op_data->op_flags & (MF_SOM_CHANGE | MF_EPOCH_OPEN)) == 0) + req_capsule_set_size(&req->rq_pill, &RMF_MDT_EPOCH, RCL_CLIENT, + 0); + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, ealen); + req_capsule_set_size(&req->rq_pill, &RMF_LOGCOOKIES, RCL_CLIENT, + ea2len); + + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + + rpc_lock = obd->u.cli.cl_rpc_lock; + + if (op_data->op_attr.ia_valid & (ATTR_MTIME | ATTR_CTIME)) + CDEBUG(D_INODE, "setting mtime "CFS_TIME_T + ", ctime "CFS_TIME_T"\n", + LTIME_S(op_data->op_attr.ia_mtime), + LTIME_S(op_data->op_attr.ia_ctime)); + mdc_setattr_pack(req, op_data, ea, ealen, ea2, ea2len); + + ptlrpc_request_set_replen(req); + if (mod && (op_data->op_flags & MF_EPOCH_OPEN) && + req->rq_import->imp_replayable) { + LASSERT(*mod == NULL); + + *mod = obd_mod_alloc(); + if (*mod == NULL) { + DEBUG_REQ(D_ERROR, req, "Can't allocate md_open_data"); + } else { + req->rq_replay = 1; + req->rq_cb_data = *mod; + (*mod)->mod_open_req = req; + req->rq_commit_cb = mdc_commit_open; + (*mod)->mod_is_create = true; + /** + * Take an extra reference on \var mod, it protects \var + * mod from being freed on eviction (commit callback is + * called despite rq_replay flag). + * Will be put on mdc_done_writing(). + */ + obd_mod_get(*mod); + } + } + + rc = mdc_reint(req, rpc_lock, LUSTRE_IMP_FULL); + + /* Save the obtained info in the original RPC for the replay case. 
*/ + if (rc == 0 && (op_data->op_flags & MF_EPOCH_OPEN)) { + struct mdt_ioepoch *epoch; + struct mdt_body *body; + + epoch = req_capsule_client_get(&req->rq_pill, &RMF_MDT_EPOCH); + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + LASSERT(epoch != NULL); + LASSERT(body != NULL); + epoch->handle = body->handle; + epoch->ioepoch = body->ioepoch; + req->rq_replay_cb = mdc_replay_open; + /** bug 3633, open may be committed and estale answer is not error */ + } else if (rc == -ESTALE && (op_data->op_flags & MF_SOM_CHANGE)) { + rc = 0; + } else if (rc == -ERESTARTSYS) { + rc = 0; + } + *request = req; + if (rc && req->rq_commit_cb) { + /* Put an extra reference on \var mod on error case. */ + if (mod != NULL && *mod != NULL) + obd_mod_put(*mod); + req->rq_commit_cb(req); + } + return rc; +} + +int mdc_create(struct obd_export *exp, struct md_op_data *op_data, + const void *data, int datalen, int mode, __u32 uid, __u32 gid, + cfs_cap_t cap_effective, __u64 rdev, + struct ptlrpc_request **request) +{ + struct ptlrpc_request *req; + int level, rc; + int count, resends = 0; + struct obd_import *import = exp->exp_obd->u.cli.cl_import; + int generation = import->imp_generation; + LIST_HEAD(cancels); + + /* For case if upper layer did not alloc fid, do it now. */ + if (!fid_is_sane(&op_data->op_fid2)) { + /* + * mdc_fid_alloc() may return errno 1 in case of switch to new + * sequence, handle this. + */ + rc = mdc_fid_alloc(exp, &op_data->op_fid2, op_data); + if (rc < 0) { + CERROR("Can't alloc new fid, rc %d\n", rc); + return rc; + } + } + +rebuild: + count = 0; + if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && + (fid_is_sane(&op_data->op_fid1))) + count = mdc_resource_get_unused(exp, &op_data->op_fid1, + &cancels, LCK_EX, + MDS_INODELOCK_UPDATE); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_REINT_CREATE_RMT_ACL); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + return -ENOMEM; + } + mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1); + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + op_data->op_namelen + 1); + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, + data && datalen ? datalen : 0); + + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + + /* + * mdc_create_pack() fills msg->bufs[1] with name and msg->bufs[2] with + * tgt, for symlinks or lov MD data. + */ + mdc_create_pack(req, op_data, data, datalen, mode, uid, + gid, cap_effective, rdev); + + ptlrpc_request_set_replen(req); + + /* ask ptlrpc not to resend on EINPROGRESS since we have our own retry + * logic here */ + req->rq_no_retry_einprogress = 1; + + if (resends) { + req->rq_generation_set = 1; + req->rq_import_generation = generation; + req->rq_sent = get_seconds() + resends; + } + level = LUSTRE_IMP_FULL; + resend: + rc = mdc_reint(req, exp->exp_obd->u.cli.cl_rpc_lock, level); + + /* Resend if we were told to. */ + if (rc == -ERESTARTSYS) { + level = LUSTRE_IMP_RECOVER; + goto resend; + } else if (rc == -EINPROGRESS) { + /* Retry create infinitely until succeed or get other + * error code. 
*/ + ptlrpc_req_finished(req); + resends++; + + CDEBUG(D_HA, "%s: resend:%d create on "DFID"/"DFID"\n", + exp->exp_obd->obd_name, resends, + PFID(&op_data->op_fid1), PFID(&op_data->op_fid2)); + + if (generation == import->imp_generation) { + goto rebuild; + } else { + CDEBUG(D_HA, "resend cross eviction\n"); + return -EIO; + } + } else if (rc == 0) { + struct mdt_body *body; + struct lustre_capa *capa; + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + LASSERT(body); + if (body->valid & OBD_MD_FLMDSCAPA) { + capa = req_capsule_server_get(&req->rq_pill, + &RMF_CAPA1); + if (capa == NULL) + rc = -EPROTO; + } + } + + *request = req; + return rc; +} + +int mdc_unlink(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + LIST_HEAD(cancels); + struct obd_device *obd = class_exp2obd(exp); + struct ptlrpc_request *req = *request; + int count = 0, rc; + + LASSERT(req == NULL); + + if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && + (fid_is_sane(&op_data->op_fid1)) && + !OBD_FAIL_CHECK(OBD_FAIL_LDLM_BL_CALLBACK_NET)) + count = mdc_resource_get_unused(exp, &op_data->op_fid1, + &cancels, LCK_EX, + MDS_INODELOCK_UPDATE); + if ((op_data->op_flags & MF_MDC_CANCEL_FID3) && + (fid_is_sane(&op_data->op_fid3)) && + !OBD_FAIL_CHECK(OBD_FAIL_LDLM_BL_CALLBACK_NET)) + count += mdc_resource_get_unused(exp, &op_data->op_fid3, + &cancels, LCK_EX, + MDS_INODELOCK_FULL); + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_REINT_UNLINK); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + return -ENOMEM; + } + mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1); + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + op_data->op_namelen + 1); + + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + + mdc_unlink_pack(req, op_data); + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + obd->u.cli.cl_default_mds_easize); + req_capsule_set_size(&req->rq_pill, &RMF_LOGCOOKIES, RCL_SERVER, + obd->u.cli.cl_default_mds_cookiesize); + ptlrpc_request_set_replen(req); + + *request = req; + + rc = mdc_reint(req, obd->u.cli.cl_rpc_lock, LUSTRE_IMP_FULL); + if (rc == -ERESTARTSYS) + rc = 0; + return rc; +} + +int mdc_link(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + LIST_HEAD(cancels); + struct obd_device *obd = exp->exp_obd; + struct ptlrpc_request *req; + int count = 0, rc; + + if ((op_data->op_flags & MF_MDC_CANCEL_FID2) && + (fid_is_sane(&op_data->op_fid2))) + count = mdc_resource_get_unused(exp, &op_data->op_fid2, + &cancels, LCK_EX, + MDS_INODELOCK_UPDATE); + if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && + (fid_is_sane(&op_data->op_fid1))) + count += mdc_resource_get_unused(exp, &op_data->op_fid1, + &cancels, LCK_EX, + MDS_INODELOCK_UPDATE); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_REINT_LINK); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + return -ENOMEM; + } + mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1); + mdc_set_capa_size(req, &RMF_CAPA2, op_data->op_capa2); + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + op_data->op_namelen + 1); + + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + + mdc_link_pack(req, op_data); + ptlrpc_request_set_replen(req); + + rc = mdc_reint(req, obd->u.cli.cl_rpc_lock, LUSTRE_IMP_FULL); + *request = req; + if (rc == -ERESTARTSYS) + rc = 0; 
+ + return rc; +} + +int mdc_rename(struct obd_export *exp, struct md_op_data *op_data, + const char *old, int oldlen, const char *new, int newlen, + struct ptlrpc_request **request) +{ + LIST_HEAD(cancels); + struct obd_device *obd = exp->exp_obd; + struct ptlrpc_request *req; + int count = 0, rc; + + if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && + (fid_is_sane(&op_data->op_fid1))) + count = mdc_resource_get_unused(exp, &op_data->op_fid1, + &cancels, LCK_EX, + MDS_INODELOCK_UPDATE); + if ((op_data->op_flags & MF_MDC_CANCEL_FID2) && + (fid_is_sane(&op_data->op_fid2))) + count += mdc_resource_get_unused(exp, &op_data->op_fid2, + &cancels, LCK_EX, + MDS_INODELOCK_UPDATE); + if ((op_data->op_flags & MF_MDC_CANCEL_FID3) && + (fid_is_sane(&op_data->op_fid3))) + count += mdc_resource_get_unused(exp, &op_data->op_fid3, + &cancels, LCK_EX, + MDS_INODELOCK_LOOKUP); + if ((op_data->op_flags & MF_MDC_CANCEL_FID4) && + (fid_is_sane(&op_data->op_fid4))) + count += mdc_resource_get_unused(exp, &op_data->op_fid4, + &cancels, LCK_EX, + MDS_INODELOCK_FULL); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_REINT_RENAME); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + return -ENOMEM; + } + + mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1); + mdc_set_capa_size(req, &RMF_CAPA2, op_data->op_capa2); + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, oldlen + 1); + req_capsule_set_size(&req->rq_pill, &RMF_SYMTGT, RCL_CLIENT, newlen+1); + + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + + if (exp_connect_cancelset(exp) && req) + ldlm_cli_cancel_list(&cancels, count, req, 0); + + mdc_rename_pack(req, op_data, old, oldlen, new, newlen); + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + obd->u.cli.cl_default_mds_easize); + req_capsule_set_size(&req->rq_pill, &RMF_LOGCOOKIES, RCL_SERVER, + obd->u.cli.cl_default_mds_cookiesize); + ptlrpc_request_set_replen(req); + + rc = mdc_reint(req, obd->u.cli.cl_rpc_lock, LUSTRE_IMP_FULL); + *request = req; + if (rc == -ERESTARTSYS) + rc = 0; + + return rc; +} diff --git a/kernel/drivers/staging/lustre/lustre/mdc/mdc_request.c b/kernel/drivers/staging/lustre/lustre/mdc/mdc_request.c new file mode 100644 index 000000000..f8ef5fe5e --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/mdc/mdc_request.c @@ -0,0 +1,2731 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved. 
+ * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_MDC + +# include +# include +# include +# include +# include + +#include "../include/lustre_acl.h" +#include "../include/obd_class.h" +#include "../include/lustre_fid.h" +#include "../include/lprocfs_status.h" +#include "../include/lustre_param.h" +#include "../include/lustre_log.h" + +#include "mdc_internal.h" + +#define REQUEST_MINOR 244 + +struct mdc_renew_capa_args { + struct obd_capa *ra_oc; + renew_capa_cb_t ra_cb; +}; + +static int mdc_cleanup(struct obd_device *obd); + +static int mdc_unpack_capa(struct obd_export *exp, struct ptlrpc_request *req, + const struct req_msg_field *field, struct obd_capa **oc) +{ + struct lustre_capa *capa; + struct obd_capa *c; + + /* swabbed already in mdc_enqueue */ + capa = req_capsule_server_get(&req->rq_pill, field); + if (capa == NULL) + return -EPROTO; + + c = alloc_capa(CAPA_SITE_CLIENT); + if (IS_ERR(c)) { + CDEBUG(D_INFO, "alloc capa failed!\n"); + return PTR_ERR(c); + } else { + c->c_capa = *capa; + *oc = c; + return 0; + } +} + +static inline int mdc_queue_wait(struct ptlrpc_request *req) +{ + struct client_obd *cli = &req->rq_import->imp_obd->u.cli; + int rc; + + /* mdc_enter_request() ensures that this client has no more + * than cl_max_rpcs_in_flight RPCs simultaneously inf light + * against an MDT. */ + rc = mdc_enter_request(cli); + if (rc != 0) + return rc; + + rc = ptlrpc_queue_wait(req); + mdc_exit_request(cli); + + return rc; +} + +/* Helper that implements most of mdc_getstatus and signal_completed_replay. */ +/* XXX this should become mdc_get_info("key"), sending MDS_GET_INFO RPC */ +static int send_getstatus(struct obd_import *imp, struct lu_fid *rootfid, + struct obd_capa **pc, int level, int msg_flags) +{ + struct ptlrpc_request *req; + struct mdt_body *body; + int rc; + + req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_GETSTATUS, + LUSTRE_MDS_VERSION, MDS_GETSTATUS); + if (req == NULL) + return -ENOMEM; + + mdc_pack_body(req, NULL, NULL, 0, 0, -1, 0); + lustre_msg_add_flags(req->rq_reqmsg, msg_flags); + req->rq_send_state = level; + + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) + goto out; + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (body == NULL) { + rc = -EPROTO; + goto out; + } + + if (body->valid & OBD_MD_FLMDSCAPA) { + rc = mdc_unpack_capa(NULL, req, &RMF_CAPA1, pc); + if (rc) + goto out; + } + + *rootfid = body->fid1; + CDEBUG(D_NET, + "root fid="DFID", last_committed=%llu\n", + PFID(rootfid), + lustre_msg_get_last_committed(req->rq_repmsg)); +out: + ptlrpc_req_finished(req); + return rc; +} + +/* This should be mdc_get_info("rootfid") */ +static int mdc_getstatus(struct obd_export *exp, struct lu_fid *rootfid, + struct obd_capa **pc) +{ + return send_getstatus(class_exp2cliimp(exp), rootfid, pc, + LUSTRE_IMP_FULL, 0); +} + +/* + * This function now is known to always saying that it will receive 4 buffers + * from server. Even for cases when acl_size and md_size is zero, RPC header + * will contain 4 fields and RPC itself will contain zero size fields. This is + * because mdt_getattr*() _always_ returns 4 fields, but if acl is not needed + * and thus zero, it shrinks it, making zero size. The same story about + * md_size. And this is course of problem when client waits for smaller number + * of fields. 
This issue will be fixed later when client gets aware of RPC + * layouts. --umka + */ +static int mdc_getattr_common(struct obd_export *exp, + struct ptlrpc_request *req) +{ + struct req_capsule *pill = &req->rq_pill; + struct mdt_body *body; + void *eadata; + int rc; + + /* Request message already built. */ + rc = ptlrpc_queue_wait(req); + if (rc != 0) + return rc; + + /* sanity check for the reply */ + body = req_capsule_server_get(pill, &RMF_MDT_BODY); + if (body == NULL) + return -EPROTO; + + CDEBUG(D_NET, "mode: %o\n", body->mode); + + if (body->eadatasize != 0) { + mdc_update_max_ea_from_body(exp, body); + + eadata = req_capsule_server_sized_get(pill, &RMF_MDT_MD, + body->eadatasize); + if (eadata == NULL) + return -EPROTO; + } + + if (body->valid & OBD_MD_FLRMTPERM) { + struct mdt_remote_perm *perm; + + LASSERT(client_is_remote(exp)); + perm = req_capsule_server_swab_get(pill, &RMF_ACL, + lustre_swab_mdt_remote_perm); + if (perm == NULL) + return -EPROTO; + } + + if (body->valid & OBD_MD_FLMDSCAPA) { + struct lustre_capa *capa; + + capa = req_capsule_server_get(pill, &RMF_CAPA1); + if (capa == NULL) + return -EPROTO; + } + + return 0; +} + +static int mdc_getattr(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + struct ptlrpc_request *req; + int rc; + + /* Single MDS without an LMV case */ + if (op_data->op_flags & MF_GET_MDT_IDX) { + op_data->op_mds = 0; + return 0; + } + *request = NULL; + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_GETATTR); + if (req == NULL) + return -ENOMEM; + + mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GETATTR); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + + mdc_pack_body(req, &op_data->op_fid1, op_data->op_capa1, + op_data->op_valid, op_data->op_mode, -1, 0); + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + op_data->op_mode); + if (op_data->op_valid & OBD_MD_FLRMTPERM) { + LASSERT(client_is_remote(exp)); + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, + sizeof(struct mdt_remote_perm)); + } + ptlrpc_request_set_replen(req); + + rc = mdc_getattr_common(exp, req); + if (rc) + ptlrpc_req_finished(req); + else + *request = req; + return rc; +} + +static int mdc_getattr_name(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + struct ptlrpc_request *req; + int rc; + + *request = NULL; + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_GETATTR_NAME); + if (req == NULL) + return -ENOMEM; + + mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1); + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + op_data->op_namelen + 1); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GETATTR_NAME); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + + mdc_pack_body(req, &op_data->op_fid1, op_data->op_capa1, + op_data->op_valid, op_data->op_mode, + op_data->op_suppgids[0], 0); + + if (op_data->op_name) { + char *name = req_capsule_client_get(&req->rq_pill, &RMF_NAME); + + LASSERT(strnlen(op_data->op_name, op_data->op_namelen) == + op_data->op_namelen); + memcpy(name, op_data->op_name, op_data->op_namelen); + } + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + op_data->op_mode); + ptlrpc_request_set_replen(req); + + rc = mdc_getattr_common(exp, req); + if (rc) + ptlrpc_req_finished(req); + else + *request = req; + return rc; +} + +static int mdc_is_subdir(struct obd_export *exp, + const struct lu_fid *pfid, + const 
struct lu_fid *cfid, + struct ptlrpc_request **request) +{ + struct ptlrpc_request *req; + int rc; + + *request = NULL; + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), + &RQF_MDS_IS_SUBDIR, LUSTRE_MDS_VERSION, + MDS_IS_SUBDIR); + if (req == NULL) + return -ENOMEM; + + mdc_is_subdir_pack(req, pfid, cfid, 0); + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc && rc != -EREMOTE) + ptlrpc_req_finished(req); + else + *request = req; + return rc; +} + +static int mdc_xattr_common(struct obd_export *exp, + const struct req_format *fmt, + const struct lu_fid *fid, + struct obd_capa *oc, int opcode, u64 valid, + const char *xattr_name, const char *input, + int input_size, int output_size, int flags, + __u32 suppgid, struct ptlrpc_request **request) +{ + struct ptlrpc_request *req; + int xattr_namelen = 0; + char *tmp; + int rc; + + *request = NULL; + req = ptlrpc_request_alloc(class_exp2cliimp(exp), fmt); + if (req == NULL) + return -ENOMEM; + + mdc_set_capa_size(req, &RMF_CAPA1, oc); + if (xattr_name) { + xattr_namelen = strlen(xattr_name) + 1; + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + xattr_namelen); + } + if (input_size) { + LASSERT(input); + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, + input_size); + } + + /* Flush local XATTR locks to get rid of a possible cancel RPC */ + if (opcode == MDS_REINT && fid_is_sane(fid) && + exp->exp_connect_data.ocd_ibits_known & MDS_INODELOCK_XATTR) { + LIST_HEAD(cancels); + int count; + + /* Without that packing would fail */ + if (input_size == 0) + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, + RCL_CLIENT, 0); + + count = mdc_resource_get_unused(exp, fid, + &cancels, LCK_EX, + MDS_INODELOCK_XATTR); + + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + } else { + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, opcode); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + } + + if (opcode == MDS_REINT) { + struct mdt_rec_setxattr *rec; + + CLASSERT(sizeof(struct mdt_rec_setxattr) == + sizeof(struct mdt_rec_reint)); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + rec->sx_opcode = REINT_SETXATTR; + rec->sx_fsuid = from_kuid(&init_user_ns, current_fsuid()); + rec->sx_fsgid = from_kgid(&init_user_ns, current_fsgid()); + rec->sx_cap = cfs_curproc_cap_pack(); + rec->sx_suppgid1 = suppgid; + rec->sx_suppgid2 = -1; + rec->sx_fid = *fid; + rec->sx_valid = valid | OBD_MD_FLCTIME; + rec->sx_time = get_seconds(); + rec->sx_size = output_size; + rec->sx_flags = flags; + + mdc_pack_capa(req, &RMF_CAPA1, oc); + } else { + mdc_pack_body(req, fid, oc, valid, output_size, suppgid, flags); + } + + if (xattr_name) { + tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME); + memcpy(tmp, xattr_name, xattr_namelen); + } + if (input_size) { + tmp = req_capsule_client_get(&req->rq_pill, &RMF_EADATA); + memcpy(tmp, input, input_size); + } + + if (req_capsule_has_field(&req->rq_pill, &RMF_EADATA, RCL_SERVER)) + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, + RCL_SERVER, output_size); + ptlrpc_request_set_replen(req); + + /* make rpc */ + if (opcode == MDS_REINT) + mdc_get_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL); + + rc = ptlrpc_queue_wait(req); + + if (opcode == MDS_REINT) + mdc_put_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL); + + if (rc) + ptlrpc_req_finished(req); + else + *request = req; + return rc; +} + +static int mdc_setxattr(struct obd_export *exp, const struct lu_fid *fid, + struct obd_capa *oc, 
u64 valid, const char *xattr_name, + const char *input, int input_size, int output_size, + int flags, __u32 suppgid, struct ptlrpc_request **request) +{ + return mdc_xattr_common(exp, &RQF_MDS_REINT_SETXATTR, + fid, oc, MDS_REINT, valid, xattr_name, + input, input_size, output_size, flags, + suppgid, request); +} + +static int mdc_getxattr(struct obd_export *exp, const struct lu_fid *fid, + struct obd_capa *oc, u64 valid, const char *xattr_name, + const char *input, int input_size, int output_size, + int flags, struct ptlrpc_request **request) +{ + return mdc_xattr_common(exp, &RQF_MDS_GETXATTR, + fid, oc, MDS_GETXATTR, valid, xattr_name, + input, input_size, output_size, flags, + -1, request); +} + +#ifdef CONFIG_FS_POSIX_ACL +static int mdc_unpack_acl(struct ptlrpc_request *req, struct lustre_md *md) +{ + struct req_capsule *pill = &req->rq_pill; + struct mdt_body *body = md->body; + struct posix_acl *acl; + void *buf; + int rc; + + if (!body->aclsize) + return 0; + + buf = req_capsule_server_sized_get(pill, &RMF_ACL, body->aclsize); + + if (!buf) + return -EPROTO; + + acl = posix_acl_from_xattr(&init_user_ns, buf, body->aclsize); + if (acl == NULL) + return 0; + + if (IS_ERR(acl)) { + rc = PTR_ERR(acl); + CERROR("convert xattr to acl: %d\n", rc); + return rc; + } + + rc = posix_acl_valid(acl); + if (rc) { + CERROR("validate acl: %d\n", rc); + posix_acl_release(acl); + return rc; + } + + md->posix_acl = acl; + return 0; +} +#else +#define mdc_unpack_acl(req, md) 0 +#endif + +int mdc_get_lustre_md(struct obd_export *exp, struct ptlrpc_request *req, + struct obd_export *dt_exp, struct obd_export *md_exp, + struct lustre_md *md) +{ + struct req_capsule *pill = &req->rq_pill; + int rc; + + LASSERT(md); + memset(md, 0, sizeof(*md)); + + md->body = req_capsule_server_get(pill, &RMF_MDT_BODY); + LASSERT(md->body != NULL); + + if (md->body->valid & OBD_MD_FLEASIZE) { + int lmmsize; + struct lov_mds_md *lmm; + + if (!S_ISREG(md->body->mode)) { + CDEBUG(D_INFO, + "OBD_MD_FLEASIZE set, should be a regular file, but is not\n"); + rc = -EPROTO; + goto out; + } + + if (md->body->eadatasize == 0) { + CDEBUG(D_INFO, + "OBD_MD_FLEASIZE set, but eadatasize 0\n"); + rc = -EPROTO; + goto out; + } + lmmsize = md->body->eadatasize; + lmm = req_capsule_server_sized_get(pill, &RMF_MDT_MD, lmmsize); + if (!lmm) { + rc = -EPROTO; + goto out; + } + + rc = obd_unpackmd(dt_exp, &md->lsm, lmm, lmmsize); + if (rc < 0) + goto out; + + if (rc < sizeof(*md->lsm)) { + CDEBUG(D_INFO, + "lsm size too small: rc < sizeof (*md->lsm) (%d < %d)\n", + rc, (int)sizeof(*md->lsm)); + rc = -EPROTO; + goto out; + } + + } else if (md->body->valid & OBD_MD_FLDIREA) { + int lmvsize; + struct lov_mds_md *lmv; + + if (!S_ISDIR(md->body->mode)) { + CDEBUG(D_INFO, + "OBD_MD_FLDIREA set, should be a directory, but is not\n"); + rc = -EPROTO; + goto out; + } + + if (md->body->eadatasize == 0) { + CDEBUG(D_INFO, + "OBD_MD_FLDIREA is set, but eadatasize 0\n"); + return -EPROTO; + } + if (md->body->valid & OBD_MD_MEA) { + lmvsize = md->body->eadatasize; + lmv = req_capsule_server_sized_get(pill, &RMF_MDT_MD, + lmvsize); + if (!lmv) { + rc = -EPROTO; + goto out; + } + + rc = obd_unpackmd(md_exp, (void *)&md->mea, lmv, + lmvsize); + if (rc < 0) + goto out; + + if (rc < sizeof(*md->mea)) { + CDEBUG(D_INFO, + "size too small: rc < sizeof(*md->mea) (%d < %d)\n", + rc, (int)sizeof(*md->mea)); + rc = -EPROTO; + goto out; + } + } + } + rc = 0; + + if (md->body->valid & OBD_MD_FLRMTPERM) { + /* remote permission */ + LASSERT(client_is_remote(exp)); + 
md->remote_perm = req_capsule_server_swab_get(pill, &RMF_ACL, + lustre_swab_mdt_remote_perm); + if (!md->remote_perm) { + rc = -EPROTO; + goto out; + } + } else if (md->body->valid & OBD_MD_FLACL) { + /* for ACL, it's possible that FLACL is set but aclsize is zero. + * only when aclsize != 0 there's an actual segment for ACL + * in reply buffer. + */ + if (md->body->aclsize) { + rc = mdc_unpack_acl(req, md); + if (rc) + goto out; +#ifdef CONFIG_FS_POSIX_ACL + } else { + md->posix_acl = NULL; +#endif + } + } + if (md->body->valid & OBD_MD_FLMDSCAPA) { + struct obd_capa *oc = NULL; + + rc = mdc_unpack_capa(NULL, req, &RMF_CAPA1, &oc); + if (rc) + goto out; + md->mds_capa = oc; + } + + if (md->body->valid & OBD_MD_FLOSSCAPA) { + struct obd_capa *oc = NULL; + + rc = mdc_unpack_capa(NULL, req, &RMF_CAPA2, &oc); + if (rc) + goto out; + md->oss_capa = oc; + } + +out: + if (rc) { + if (md->oss_capa) { + capa_put(md->oss_capa); + md->oss_capa = NULL; + } + if (md->mds_capa) { + capa_put(md->mds_capa); + md->mds_capa = NULL; + } +#ifdef CONFIG_FS_POSIX_ACL + posix_acl_release(md->posix_acl); +#endif + if (md->lsm) + obd_free_memmd(dt_exp, &md->lsm); + } + return rc; +} + +int mdc_free_lustre_md(struct obd_export *exp, struct lustre_md *md) +{ + return 0; +} + +/** + * Handles both OPEN and SETATTR RPCs for OPEN-CLOSE and SETATTR-DONE_WRITING + * RPC chains. + */ +void mdc_replay_open(struct ptlrpc_request *req) +{ + struct md_open_data *mod = req->rq_cb_data; + struct ptlrpc_request *close_req; + struct obd_client_handle *och; + struct lustre_handle old; + struct mdt_body *body; + + if (mod == NULL) { + DEBUG_REQ(D_ERROR, req, + "Can't properly replay without open data."); + return; + } + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + LASSERT(body != NULL); + + och = mod->mod_och; + if (och != NULL) { + struct lustre_handle *file_fh; + + LASSERT(och->och_magic == OBD_CLIENT_HANDLE_MAGIC); + + file_fh = &och->och_fh; + CDEBUG(D_HA, "updating handle from %#llx to %#llx\n", + file_fh->cookie, body->handle.cookie); + old = *file_fh; + *file_fh = body->handle; + } + close_req = mod->mod_close_req; + if (close_req != NULL) { + __u32 opc = lustre_msg_get_opc(close_req->rq_reqmsg); + struct mdt_ioepoch *epoch; + + LASSERT(opc == MDS_CLOSE || opc == MDS_DONE_WRITING); + epoch = req_capsule_client_get(&close_req->rq_pill, + &RMF_MDT_EPOCH); + LASSERT(epoch); + + if (och != NULL) + LASSERT(!memcmp(&old, &epoch->handle, sizeof(old))); + DEBUG_REQ(D_HA, close_req, "updating close body with new fh"); + epoch->handle = body->handle; + } +} + +void mdc_commit_open(struct ptlrpc_request *req) +{ + struct md_open_data *mod = req->rq_cb_data; + + if (mod == NULL) + return; + + /** + * No need to touch md_open_data::mod_och, it holds a reference on + * \var mod and will zero references to each other, \var mod will be + * freed after that when md_open_data::mod_och will put the reference. + */ + + /** + * Do not let open request to disappear as it still may be needed + * for close rpc to happen (it may happen on evict only, otherwise + * ptlrpc_request::rq_replay does not let mdc_commit_open() to be + * called), just mark this rpc as committed to distinguish these 2 + * cases, see mdc_close() for details. The open request reference will + * be put along with freeing \var mod. 
+ */ + ptlrpc_request_addref(req); + spin_lock(&req->rq_lock); + req->rq_committed = 1; + spin_unlock(&req->rq_lock); + req->rq_cb_data = NULL; + obd_mod_put(mod); +} + +int mdc_set_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och, + struct lookup_intent *it) +{ + struct md_open_data *mod; + struct mdt_rec_create *rec; + struct mdt_body *body; + struct ptlrpc_request *open_req = it->d.lustre.it_data; + struct obd_import *imp = open_req->rq_import; + + if (!open_req->rq_replay) + return 0; + + rec = req_capsule_client_get(&open_req->rq_pill, &RMF_REC_REINT); + body = req_capsule_server_get(&open_req->rq_pill, &RMF_MDT_BODY); + LASSERT(rec != NULL); + /* Incoming message in my byte order (it's been swabbed). */ + /* Outgoing messages always in my byte order. */ + LASSERT(body != NULL); + + /* Only if the import is replayable, we set replay_open data */ + if (och && imp->imp_replayable) { + mod = obd_mod_alloc(); + if (mod == NULL) { + DEBUG_REQ(D_ERROR, open_req, + "Can't allocate md_open_data"); + return 0; + } + + /** + * Take a reference on \var mod, to be freed on mdc_close(). + * It protects \var mod from being freed on eviction (commit + * callback is called despite rq_replay flag). + * Another reference for \var och. + */ + obd_mod_get(mod); + obd_mod_get(mod); + + spin_lock(&open_req->rq_lock); + och->och_mod = mod; + mod->mod_och = och; + mod->mod_is_create = it_disposition(it, DISP_OPEN_CREATE) || + it_disposition(it, DISP_OPEN_STRIPE); + mod->mod_open_req = open_req; + open_req->rq_cb_data = mod; + open_req->rq_commit_cb = mdc_commit_open; + spin_unlock(&open_req->rq_lock); + } + + rec->cr_fid2 = body->fid1; + rec->cr_ioepoch = body->ioepoch; + rec->cr_old_handle.cookie = body->handle.cookie; + open_req->rq_replay_cb = mdc_replay_open; + if (!fid_is_sane(&body->fid1)) { + DEBUG_REQ(D_ERROR, open_req, + "Saving replay request with insane fid"); + LBUG(); + } + + DEBUG_REQ(D_RPCTRACE, open_req, "Set up open replay data"); + return 0; +} + +static void mdc_free_open(struct md_open_data *mod) +{ + int committed = 0; + + if (mod->mod_is_create == 0 && + imp_connect_disp_stripe(mod->mod_open_req->rq_import)) + committed = 1; + + LASSERT(mod->mod_open_req->rq_replay == 0); + + DEBUG_REQ(D_RPCTRACE, mod->mod_open_req, "free open request\n"); + + ptlrpc_request_committed(mod->mod_open_req, committed); + if (mod->mod_close_req) + ptlrpc_request_committed(mod->mod_close_req, committed); +} + +int mdc_clear_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och) +{ + struct md_open_data *mod = och->och_mod; + + /** + * It is possible to not have \var mod in a case of eviction between + * lookup and ll_file_open(). 
+ **/ + if (mod == NULL) + return 0; + + LASSERT(mod != LP_POISON); + LASSERT(mod->mod_open_req != NULL); + mdc_free_open(mod); + + mod->mod_och = NULL; + och->och_mod = NULL; + obd_mod_put(mod); + + return 0; +} + +/* Prepares the request for the replay by the given reply */ +static void mdc_close_handle_reply(struct ptlrpc_request *req, + struct md_op_data *op_data, int rc) { + struct mdt_body *repbody; + struct mdt_ioepoch *epoch; + + if (req && rc == -EAGAIN) { + repbody = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + epoch = req_capsule_client_get(&req->rq_pill, &RMF_MDT_EPOCH); + + epoch->flags |= MF_SOM_AU; + if (repbody->valid & OBD_MD_FLGETATTRLOCK) + op_data->op_flags |= MF_GETATTR_LOCK; + } +} + +static int mdc_close(struct obd_export *exp, struct md_op_data *op_data, + struct md_open_data *mod, struct ptlrpc_request **request) +{ + struct obd_device *obd = class_exp2obd(exp); + struct ptlrpc_request *req; + struct req_format *req_fmt; + int rc; + int saved_rc = 0; + + + req_fmt = &RQF_MDS_CLOSE; + if (op_data->op_bias & MDS_HSM_RELEASE) { + req_fmt = &RQF_MDS_RELEASE_CLOSE; + + /* allocate a FID for volatile file */ + rc = mdc_fid_alloc(exp, &op_data->op_fid2, op_data); + if (rc < 0) { + CERROR("%s: "DFID" failed to allocate FID: %d\n", + obd->obd_name, PFID(&op_data->op_fid1), rc); + /* save the errcode and proceed to close */ + saved_rc = rc; + } + } + + *request = NULL; + req = ptlrpc_request_alloc(class_exp2cliimp(exp), req_fmt); + if (req == NULL) + return -ENOMEM; + + mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_CLOSE); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + + /* To avoid a livelock (bug 7034), we need to send CLOSE RPCs to a + * portal whose threads are not taking any DLM locks and are therefore + * always progressing */ + req->rq_request_portal = MDS_READPAGE_PORTAL; + ptlrpc_at_set_req_timeout(req); + + /* Ensure that this close's handle is fixed up during replay. */ + if (likely(mod != NULL)) { + LASSERTF(mod->mod_open_req != NULL && + mod->mod_open_req->rq_type != LI_POISON, + "POISONED open %p!\n", mod->mod_open_req); + + mod->mod_close_req = req; + + DEBUG_REQ(D_HA, mod->mod_open_req, "matched open"); + /* We no longer want to preserve this open for replay even + * though the open was committed. 
b=3632, b=3633 */ + spin_lock(&mod->mod_open_req->rq_lock); + mod->mod_open_req->rq_replay = 0; + spin_unlock(&mod->mod_open_req->rq_lock); + } else { + CDEBUG(D_HA, + "couldn't find open req; expecting close error\n"); + } + + mdc_close_pack(req, op_data); + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + obd->u.cli.cl_default_mds_easize); + req_capsule_set_size(&req->rq_pill, &RMF_LOGCOOKIES, RCL_SERVER, + obd->u.cli.cl_default_mds_cookiesize); + + ptlrpc_request_set_replen(req); + + mdc_get_rpc_lock(obd->u.cli.cl_close_lock, NULL); + rc = ptlrpc_queue_wait(req); + mdc_put_rpc_lock(obd->u.cli.cl_close_lock, NULL); + + if (req->rq_repmsg == NULL) { + CDEBUG(D_RPCTRACE, "request failed to send: %p, %d\n", req, + req->rq_status); + if (rc == 0) + rc = req->rq_status ?: -EIO; + } else if (rc == 0 || rc == -EAGAIN) { + struct mdt_body *body; + + rc = lustre_msg_get_status(req->rq_repmsg); + if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR) { + DEBUG_REQ(D_ERROR, req, + "type == PTL_RPC_MSG_ERR, err = %d", rc); + if (rc > 0) + rc = -rc; + } + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + rc = -EPROTO; + } else if (rc == -ESTALE) { + /** + * it can be allowed error after 3633 if open was committed and + * server failed before close was sent. Let's check if mod + * exists and return no error in that case + */ + if (mod) { + DEBUG_REQ(D_HA, req, "Reset ESTALE = %d", rc); + LASSERT(mod->mod_open_req != NULL); + if (mod->mod_open_req->rq_committed) + rc = 0; + } + } + + if (mod) { + if (rc != 0) + mod->mod_close_req = NULL; + /* Since now, mod is accessed through open_req only, + * thus close req does not keep a reference on mod anymore. */ + obd_mod_put(mod); + } + *request = req; + mdc_close_handle_reply(req, op_data, rc); + return rc < 0 ? rc : saved_rc; +} + +static int mdc_done_writing(struct obd_export *exp, struct md_op_data *op_data, + struct md_open_data *mod) +{ + struct obd_device *obd = class_exp2obd(exp); + struct ptlrpc_request *req; + int rc; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_DONE_WRITING); + if (req == NULL) + return -ENOMEM; + + mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1); + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_DONE_WRITING); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + + if (mod != NULL) { + LASSERTF(mod->mod_open_req != NULL && + mod->mod_open_req->rq_type != LI_POISON, + "POISONED setattr %p!\n", mod->mod_open_req); + + mod->mod_close_req = req; + DEBUG_REQ(D_HA, mod->mod_open_req, "matched setattr"); + /* We no longer want to preserve this setattr for replay even + * though the open was committed. b=3632, b=3633 */ + spin_lock(&mod->mod_open_req->rq_lock); + mod->mod_open_req->rq_replay = 0; + spin_unlock(&mod->mod_open_req->rq_lock); + } + + mdc_close_pack(req, op_data); + ptlrpc_request_set_replen(req); + + mdc_get_rpc_lock(obd->u.cli.cl_close_lock, NULL); + rc = ptlrpc_queue_wait(req); + mdc_put_rpc_lock(obd->u.cli.cl_close_lock, NULL); + + if (rc == -ESTALE) { + /** + * it can be allowed error after 3633 if open or setattr were + * committed and server failed before close was sent. 
+ * Let's check if mod exists and return no error in that case + */ + if (mod) { + LASSERT(mod->mod_open_req != NULL); + if (mod->mod_open_req->rq_committed) + rc = 0; + } + } + + if (mod) { + if (rc != 0) + mod->mod_close_req = NULL; + LASSERT(mod->mod_open_req != NULL); + mdc_free_open(mod); + + /* Since now, mod is accessed through setattr req only, + * thus DW req does not keep a reference on mod anymore. */ + obd_mod_put(mod); + } + + mdc_close_handle_reply(req, op_data, rc); + ptlrpc_req_finished(req); + return rc; +} + + +static int mdc_readpage(struct obd_export *exp, struct md_op_data *op_data, + struct page **pages, struct ptlrpc_request **request) +{ + struct ptlrpc_request *req; + struct ptlrpc_bulk_desc *desc; + int i; + wait_queue_head_t waitq; + int resends = 0; + struct l_wait_info lwi; + int rc; + + *request = NULL; + init_waitqueue_head(&waitq); + +restart_bulk: + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_READPAGE); + if (req == NULL) + return -ENOMEM; + + mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_READPAGE); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + + req->rq_request_portal = MDS_READPAGE_PORTAL; + ptlrpc_at_set_req_timeout(req); + + desc = ptlrpc_prep_bulk_imp(req, op_data->op_npages, 1, BULK_PUT_SINK, + MDS_BULK_PORTAL); + if (desc == NULL) { + ptlrpc_request_free(req); + return -ENOMEM; + } + + /* NB req now owns desc and will free it when it gets freed */ + for (i = 0; i < op_data->op_npages; i++) + ptlrpc_prep_bulk_page_pin(desc, pages[i], 0, PAGE_CACHE_SIZE); + + mdc_readdir_pack(req, op_data->op_offset, + PAGE_CACHE_SIZE * op_data->op_npages, + &op_data->op_fid1, op_data->op_capa1); + + ptlrpc_request_set_replen(req); + rc = ptlrpc_queue_wait(req); + if (rc) { + ptlrpc_req_finished(req); + if (rc != -ETIMEDOUT) + return rc; + + resends++; + if (!client_should_resend(resends, &exp->exp_obd->u.cli)) { + CERROR("too many resend retries, returning error\n"); + return -EIO; + } + lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(resends), + NULL, NULL, NULL); + l_wait_event(waitq, 0, &lwi); + + goto restart_bulk; + } + + rc = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, + req->rq_bulk->bd_nob_transferred); + if (rc < 0) { + ptlrpc_req_finished(req); + return rc; + } + + if (req->rq_bulk->bd_nob_transferred & ~LU_PAGE_MASK) { + CERROR("Unexpected # bytes transferred: %d (%ld expected)\n", + req->rq_bulk->bd_nob_transferred, + PAGE_CACHE_SIZE * op_data->op_npages); + ptlrpc_req_finished(req); + return -EPROTO; + } + + *request = req; + return 0; +} + +static int mdc_statfs(const struct lu_env *env, + struct obd_export *exp, struct obd_statfs *osfs, + __u64 max_age, __u32 flags) +{ + struct obd_device *obd = class_exp2obd(exp); + struct ptlrpc_request *req; + struct obd_statfs *msfs; + struct obd_import *imp = NULL; + int rc; + + /* + * Since the request might also come from lprocfs, so we need + * sync this with client_disconnect_export Bug15684 + */ + down_read(&obd->u.cli.cl_sem); + if (obd->u.cli.cl_import) + imp = class_import_get(obd->u.cli.cl_import); + up_read(&obd->u.cli.cl_sem); + if (!imp) + return -ENODEV; + + req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_STATFS, + LUSTRE_MDS_VERSION, MDS_STATFS); + if (req == NULL) { + rc = -ENOMEM; + goto output; + } + + ptlrpc_request_set_replen(req); + + if (flags & OBD_STATFS_NODELAY) { + /* procfs requests not want stay in wait for avoid deadlock */ + req->rq_no_resend = 1; + req->rq_no_delay = 1; + } + + rc = 
ptlrpc_queue_wait(req); + if (rc) { + /* check connection error first */ + if (imp->imp_connect_error) + rc = imp->imp_connect_error; + goto out; + } + + msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS); + if (msfs == NULL) { + rc = -EPROTO; + goto out; + } + + *osfs = *msfs; +out: + ptlrpc_req_finished(req); +output: + class_import_put(imp); + return rc; +} + +static int mdc_ioc_fid2path(struct obd_export *exp, struct getinfo_fid2path *gf) +{ + __u32 keylen, vallen; + void *key; + int rc; + + if (gf->gf_pathlen > PATH_MAX) + return -ENAMETOOLONG; + if (gf->gf_pathlen < 2) + return -EOVERFLOW; + + /* Key is KEY_FID2PATH + getinfo_fid2path description */ + keylen = cfs_size_round(sizeof(KEY_FID2PATH)) + sizeof(*gf); + OBD_ALLOC(key, keylen); + if (key == NULL) + return -ENOMEM; + memcpy(key, KEY_FID2PATH, sizeof(KEY_FID2PATH)); + memcpy(key + cfs_size_round(sizeof(KEY_FID2PATH)), gf, sizeof(*gf)); + + CDEBUG(D_IOCTL, "path get "DFID" from %llu #%d\n", + PFID(&gf->gf_fid), gf->gf_recno, gf->gf_linkno); + + if (!fid_is_sane(&gf->gf_fid)) { + rc = -EINVAL; + goto out; + } + + /* Val is struct getinfo_fid2path result plus path */ + vallen = sizeof(*gf) + gf->gf_pathlen; + + rc = obd_get_info(NULL, exp, keylen, key, &vallen, gf, NULL); + if (rc != 0 && rc != -EREMOTE) + goto out; + + if (vallen <= sizeof(*gf)) { + rc = -EPROTO; + goto out; + } else if (vallen > sizeof(*gf) + gf->gf_pathlen) { + rc = -EOVERFLOW; + goto out; + } + + CDEBUG(D_IOCTL, "path get "DFID" from %llu #%d\n%s\n", + PFID(&gf->gf_fid), gf->gf_recno, gf->gf_linkno, gf->gf_path); + +out: + OBD_FREE(key, keylen); + return rc; +} + +static int mdc_ioc_hsm_progress(struct obd_export *exp, + struct hsm_progress_kernel *hpk) +{ + struct obd_import *imp = class_exp2cliimp(exp); + struct hsm_progress_kernel *req_hpk; + struct ptlrpc_request *req; + int rc; + + req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_HSM_PROGRESS, + LUSTRE_MDS_VERSION, MDS_HSM_PROGRESS); + if (req == NULL) { + rc = -ENOMEM; + goto out; + } + + mdc_pack_body(req, NULL, NULL, OBD_MD_FLRMTPERM, 0, 0, 0); + + /* Copy hsm_progress struct */ + req_hpk = req_capsule_client_get(&req->rq_pill, &RMF_MDS_HSM_PROGRESS); + if (req_hpk == NULL) { + rc = -EPROTO; + goto out; + } + + *req_hpk = *hpk; + req_hpk->hpk_errval = lustre_errno_hton(hpk->hpk_errval); + + ptlrpc_request_set_replen(req); + + rc = mdc_queue_wait(req); + goto out; +out: + ptlrpc_req_finished(req); + return rc; +} + +static int mdc_ioc_hsm_ct_register(struct obd_import *imp, __u32 archives) +{ + __u32 *archive_mask; + struct ptlrpc_request *req; + int rc; + + req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_HSM_CT_REGISTER, + LUSTRE_MDS_VERSION, + MDS_HSM_CT_REGISTER); + if (req == NULL) { + rc = -ENOMEM; + goto out; + } + + mdc_pack_body(req, NULL, NULL, OBD_MD_FLRMTPERM, 0, 0, 0); + + /* Copy hsm_progress struct */ + archive_mask = req_capsule_client_get(&req->rq_pill, + &RMF_MDS_HSM_ARCHIVE); + if (archive_mask == NULL) { + rc = -EPROTO; + goto out; + } + + *archive_mask = archives; + + ptlrpc_request_set_replen(req); + + rc = mdc_queue_wait(req); + goto out; +out: + ptlrpc_req_finished(req); + return rc; +} + +static int mdc_ioc_hsm_current_action(struct obd_export *exp, + struct md_op_data *op_data) +{ + struct hsm_current_action *hca = op_data->op_data; + struct hsm_current_action *req_hca; + struct ptlrpc_request *req; + int rc; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_HSM_ACTION); + if (req == NULL) + return -ENOMEM; + + mdc_set_capa_size(req, &RMF_CAPA1, 
op_data->op_capa1); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_ACTION); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + + mdc_pack_body(req, &op_data->op_fid1, op_data->op_capa1, + OBD_MD_FLRMTPERM, 0, op_data->op_suppgids[0], 0); + + ptlrpc_request_set_replen(req); + + rc = mdc_queue_wait(req); + if (rc) + goto out; + + req_hca = req_capsule_server_get(&req->rq_pill, + &RMF_MDS_HSM_CURRENT_ACTION); + if (req_hca == NULL) { + rc = -EPROTO; + goto out; + } + + *hca = *req_hca; + +out: + ptlrpc_req_finished(req); + return rc; +} + +static int mdc_ioc_hsm_ct_unregister(struct obd_import *imp) +{ + struct ptlrpc_request *req; + int rc; + + req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_HSM_CT_UNREGISTER, + LUSTRE_MDS_VERSION, + MDS_HSM_CT_UNREGISTER); + if (req == NULL) { + rc = -ENOMEM; + goto out; + } + + mdc_pack_body(req, NULL, NULL, OBD_MD_FLRMTPERM, 0, 0, 0); + + ptlrpc_request_set_replen(req); + + rc = mdc_queue_wait(req); + goto out; +out: + ptlrpc_req_finished(req); + return rc; +} + +static int mdc_ioc_hsm_state_get(struct obd_export *exp, + struct md_op_data *op_data) +{ + struct hsm_user_state *hus = op_data->op_data; + struct hsm_user_state *req_hus; + struct ptlrpc_request *req; + int rc; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_HSM_STATE_GET); + if (req == NULL) + return -ENOMEM; + + mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_STATE_GET); + if (rc != 0) { + ptlrpc_request_free(req); + return rc; + } + + mdc_pack_body(req, &op_data->op_fid1, op_data->op_capa1, + OBD_MD_FLRMTPERM, 0, op_data->op_suppgids[0], 0); + + ptlrpc_request_set_replen(req); + + rc = mdc_queue_wait(req); + if (rc) + goto out; + + req_hus = req_capsule_server_get(&req->rq_pill, &RMF_HSM_USER_STATE); + if (req_hus == NULL) { + rc = -EPROTO; + goto out; + } + + *hus = *req_hus; + +out: + ptlrpc_req_finished(req); + return rc; +} + +static int mdc_ioc_hsm_state_set(struct obd_export *exp, + struct md_op_data *op_data) +{ + struct hsm_state_set *hss = op_data->op_data; + struct hsm_state_set *req_hss; + struct ptlrpc_request *req; + int rc; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_HSM_STATE_SET); + if (req == NULL) + return -ENOMEM; + + mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_STATE_SET); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + + mdc_pack_body(req, &op_data->op_fid1, op_data->op_capa1, + OBD_MD_FLRMTPERM, 0, op_data->op_suppgids[0], 0); + + /* Copy states */ + req_hss = req_capsule_client_get(&req->rq_pill, &RMF_HSM_STATE_SET); + if (req_hss == NULL) { + rc = -EPROTO; + goto out; + } + *req_hss = *hss; + + ptlrpc_request_set_replen(req); + + rc = mdc_queue_wait(req); + goto out; + +out: + ptlrpc_req_finished(req); + return rc; +} + +static int mdc_ioc_hsm_request(struct obd_export *exp, + struct hsm_user_request *hur) +{ + struct obd_import *imp = class_exp2cliimp(exp); + struct ptlrpc_request *req; + struct hsm_request *req_hr; + struct hsm_user_item *req_hui; + char *req_opaque; + int rc; + + req = ptlrpc_request_alloc(imp, &RQF_MDS_HSM_REQUEST); + if (req == NULL) { + rc = -ENOMEM; + goto out; + } + + req_capsule_set_size(&req->rq_pill, &RMF_MDS_HSM_USER_ITEM, RCL_CLIENT, + hur->hur_request.hr_itemcount + * sizeof(struct hsm_user_item)); + req_capsule_set_size(&req->rq_pill, &RMF_GENERIC_DATA, RCL_CLIENT, + hur->hur_request.hr_data_len); + + rc = 
ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_REQUEST); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + + mdc_pack_body(req, NULL, NULL, OBD_MD_FLRMTPERM, 0, 0, 0); + + /* Copy hsm_request struct */ + req_hr = req_capsule_client_get(&req->rq_pill, &RMF_MDS_HSM_REQUEST); + if (req_hr == NULL) { + rc = -EPROTO; + goto out; + } + *req_hr = hur->hur_request; + + /* Copy hsm_user_item structs */ + req_hui = req_capsule_client_get(&req->rq_pill, &RMF_MDS_HSM_USER_ITEM); + if (req_hui == NULL) { + rc = -EPROTO; + goto out; + } + memcpy(req_hui, hur->hur_user_item, + hur->hur_request.hr_itemcount * sizeof(struct hsm_user_item)); + + /* Copy opaque field */ + req_opaque = req_capsule_client_get(&req->rq_pill, &RMF_GENERIC_DATA); + if (req_opaque == NULL) { + rc = -EPROTO; + goto out; + } + memcpy(req_opaque, hur_data(hur), hur->hur_request.hr_data_len); + + ptlrpc_request_set_replen(req); + + rc = mdc_queue_wait(req); + goto out; + +out: + ptlrpc_req_finished(req); + return rc; +} + +static struct kuc_hdr *changelog_kuc_hdr(char *buf, int len, int flags) +{ + struct kuc_hdr *lh = (struct kuc_hdr *)buf; + + LASSERT(len <= KUC_CHANGELOG_MSG_MAXSIZE); + + lh->kuc_magic = KUC_MAGIC; + lh->kuc_transport = KUC_TRANSPORT_CHANGELOG; + lh->kuc_flags = flags; + lh->kuc_msgtype = CL_RECORD; + lh->kuc_msglen = len; + return lh; +} + +#define D_CHANGELOG 0 + +struct changelog_show { + __u64 cs_startrec; + __u32 cs_flags; + struct file *cs_fp; + char *cs_buf; + struct obd_device *cs_obd; +}; + +static int changelog_kkuc_cb(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *hdr, void *data) +{ + struct changelog_show *cs = data; + struct llog_changelog_rec *rec = (struct llog_changelog_rec *)hdr; + struct kuc_hdr *lh; + int len, rc; + + if (rec->cr_hdr.lrh_type != CHANGELOG_REC) { + rc = -EINVAL; + CERROR("%s: not a changelog rec %x/%d: rc = %d\n", + cs->cs_obd->obd_name, rec->cr_hdr.lrh_type, + rec->cr.cr_type, rc); + return rc; + } + + if (rec->cr.cr_index < cs->cs_startrec) { + /* Skip entries earlier than what we are interested in */ + CDEBUG(D_CHANGELOG, "rec=%llu start=%llu\n", + rec->cr.cr_index, cs->cs_startrec); + return 0; + } + + CDEBUG(D_CHANGELOG, "%llu %02d%-5s %llu 0x%x t="DFID" p="DFID + " %.*s\n", rec->cr.cr_index, rec->cr.cr_type, + changelog_type2str(rec->cr.cr_type), rec->cr.cr_time, + rec->cr.cr_flags & CLF_FLAGMASK, + PFID(&rec->cr.cr_tfid), PFID(&rec->cr.cr_pfid), + rec->cr.cr_namelen, changelog_rec_name(&rec->cr)); + + len = sizeof(*lh) + changelog_rec_size(&rec->cr) + rec->cr.cr_namelen; + + /* Set up the message */ + lh = changelog_kuc_hdr(cs->cs_buf, len, cs->cs_flags); + memcpy(lh + 1, &rec->cr, len - sizeof(*lh)); + + rc = libcfs_kkuc_msg_put(cs->cs_fp, lh); + CDEBUG(D_CHANGELOG, "kucmsg fp %p len %d rc %d\n", cs->cs_fp, len, rc); + + return rc; +} + +static int mdc_changelog_send_thread(void *csdata) +{ + struct changelog_show *cs = csdata; + struct llog_ctxt *ctxt = NULL; + struct llog_handle *llh = NULL; + struct kuc_hdr *kuch; + int rc; + + CDEBUG(D_CHANGELOG, "changelog to fp=%p start %llu\n", + cs->cs_fp, cs->cs_startrec); + + OBD_ALLOC(cs->cs_buf, KUC_CHANGELOG_MSG_MAXSIZE); + if (cs->cs_buf == NULL) { + rc = -ENOMEM; + goto out; + } + + /* Set up the remote catalog handle */ + ctxt = llog_get_context(cs->cs_obd, LLOG_CHANGELOG_REPL_CTXT); + if (ctxt == NULL) { + rc = -ENOENT; + goto out; + } + rc = llog_open(NULL, ctxt, &llh, NULL, CHANGELOG_CATALOG, + LLOG_OPEN_EXISTS); + if (rc) { + CERROR("%s: fail to open changelog catalog: rc = 
%d\n", + cs->cs_obd->obd_name, rc); + goto out; + } + rc = llog_init_handle(NULL, llh, LLOG_F_IS_CAT, NULL); + if (rc) { + CERROR("llog_init_handle failed %d\n", rc); + goto out; + } + + rc = llog_cat_process(NULL, llh, changelog_kkuc_cb, cs, 0, 0); + + /* Send EOF no matter what our result */ + kuch = changelog_kuc_hdr(cs->cs_buf, sizeof(*kuch), cs->cs_flags); + if (kuch) { + kuch->kuc_msgtype = CL_EOF; + libcfs_kkuc_msg_put(cs->cs_fp, kuch); + } + +out: + fput(cs->cs_fp); + if (llh) + llog_cat_close(NULL, llh); + if (ctxt) + llog_ctxt_put(ctxt); + if (cs->cs_buf) + OBD_FREE(cs->cs_buf, KUC_CHANGELOG_MSG_MAXSIZE); + OBD_FREE_PTR(cs); + return rc; +} + +static int mdc_ioc_changelog_send(struct obd_device *obd, + struct ioc_changelog *icc) +{ + struct changelog_show *cs; + int rc; + + /* Freed in mdc_changelog_send_thread */ + OBD_ALLOC_PTR(cs); + if (!cs) + return -ENOMEM; + + cs->cs_obd = obd; + cs->cs_startrec = icc->icc_recno; + /* matching fput in mdc_changelog_send_thread */ + cs->cs_fp = fget(icc->icc_id); + cs->cs_flags = icc->icc_flags; + + /* + * New thread because we should return to user app before + * writing into our pipe + */ + rc = PTR_ERR(kthread_run(mdc_changelog_send_thread, cs, + "mdc_clg_send_thread")); + if (!IS_ERR_VALUE(rc)) { + CDEBUG(D_CHANGELOG, "start changelog thread\n"); + return 0; + } + + CERROR("Failed to start changelog thread: %d\n", rc); + OBD_FREE_PTR(cs); + return rc; +} + +static int mdc_ioc_hsm_ct_start(struct obd_export *exp, + struct lustre_kernelcomm *lk); + +static int mdc_quotacheck(struct obd_device *unused, struct obd_export *exp, + struct obd_quotactl *oqctl) +{ + struct client_obd *cli = &exp->exp_obd->u.cli; + struct ptlrpc_request *req; + struct obd_quotactl *body; + int rc; + + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), + &RQF_MDS_QUOTACHECK, LUSTRE_MDS_VERSION, + MDS_QUOTACHECK); + if (req == NULL) + return -ENOMEM; + + body = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL); + *body = *oqctl; + + ptlrpc_request_set_replen(req); + + /* the next poll will find -ENODATA, that means quotacheck is + * going on */ + cli->cl_qchk_stat = -ENODATA; + rc = ptlrpc_queue_wait(req); + if (rc) + cli->cl_qchk_stat = rc; + ptlrpc_req_finished(req); + return rc; +} + +static int mdc_quota_poll_check(struct obd_export *exp, + struct if_quotacheck *qchk) +{ + struct client_obd *cli = &exp->exp_obd->u.cli; + int rc; + + qchk->obd_uuid = cli->cl_target_uuid; + memcpy(qchk->obd_type, LUSTRE_MDS_NAME, strlen(LUSTRE_MDS_NAME)); + + rc = cli->cl_qchk_stat; + /* the client is not the previous one */ + if (rc == CL_NOT_QUOTACHECKED) + rc = -EINTR; + return rc; +} + +static int mdc_quotactl(struct obd_device *unused, struct obd_export *exp, + struct obd_quotactl *oqctl) +{ + struct ptlrpc_request *req; + struct obd_quotactl *oqc; + int rc; + + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), + &RQF_MDS_QUOTACTL, LUSTRE_MDS_VERSION, + MDS_QUOTACTL); + if (req == NULL) + return -ENOMEM; + + oqc = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL); + *oqc = *oqctl; + + ptlrpc_request_set_replen(req); + ptlrpc_at_set_req_timeout(req); + req->rq_no_resend = 1; + + rc = ptlrpc_queue_wait(req); + if (rc) + CERROR("ptlrpc_queue_wait failed, rc: %d\n", rc); + + if (req->rq_repmsg) { + oqc = req_capsule_server_get(&req->rq_pill, &RMF_OBD_QUOTACTL); + if (oqc) { + *oqctl = *oqc; + } else if (!rc) { + CERROR("Can't unpack obd_quotactl\n"); + rc = -EPROTO; + } + } else if (!rc) { + CERROR("Can't unpack obd_quotactl\n"); + rc = -EPROTO; + } + 
ptlrpc_req_finished(req); + + return rc; +} + +static int mdc_ioc_swap_layouts(struct obd_export *exp, + struct md_op_data *op_data) +{ + LIST_HEAD(cancels); + struct ptlrpc_request *req; + int rc, count; + struct mdc_swap_layouts *msl, *payload; + + msl = op_data->op_data; + + /* When the MDT will get the MDS_SWAP_LAYOUTS RPC the + * first thing it will do is to cancel the 2 layout + * locks hold by this client. + * So the client must cancel its layout locks on the 2 fids + * with the request RPC to avoid extra RPC round trips + */ + count = mdc_resource_get_unused(exp, &op_data->op_fid1, &cancels, + LCK_CR, MDS_INODELOCK_LAYOUT); + count += mdc_resource_get_unused(exp, &op_data->op_fid2, &cancels, + LCK_CR, MDS_INODELOCK_LAYOUT); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_SWAP_LAYOUTS); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + return -ENOMEM; + } + + mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1); + mdc_set_capa_size(req, &RMF_CAPA2, op_data->op_capa2); + + rc = mdc_prep_elc_req(exp, req, MDS_SWAP_LAYOUTS, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + + mdc_swap_layouts_pack(req, op_data); + + payload = req_capsule_client_get(&req->rq_pill, &RMF_SWAP_LAYOUTS); + LASSERT(payload); + + *payload = *msl; + + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) + goto out; + +out: + ptlrpc_req_finished(req); + return rc; +} + +static int mdc_iocontrol(unsigned int cmd, struct obd_export *exp, int len, + void *karg, void *uarg) +{ + struct obd_device *obd = exp->exp_obd; + struct obd_ioctl_data *data = karg; + struct obd_import *imp = obd->u.cli.cl_import; + int rc; + + if (!try_module_get(THIS_MODULE)) { + CERROR("Can't get module. Is it alive?"); + return -EINVAL; + } + switch (cmd) { + case OBD_IOC_CHANGELOG_SEND: + rc = mdc_ioc_changelog_send(obd, karg); + goto out; + case OBD_IOC_CHANGELOG_CLEAR: { + struct ioc_changelog *icc = karg; + struct changelog_setinfo cs = { + .cs_recno = icc->icc_recno, + .cs_id = icc->icc_id + }; + + rc = obd_set_info_async(NULL, exp, strlen(KEY_CHANGELOG_CLEAR), + KEY_CHANGELOG_CLEAR, sizeof(cs), &cs, + NULL); + goto out; + } + case OBD_IOC_FID2PATH: + rc = mdc_ioc_fid2path(exp, karg); + goto out; + case LL_IOC_HSM_CT_START: + rc = mdc_ioc_hsm_ct_start(exp, karg); + /* ignore if it was already registered on this MDS. */ + if (rc == -EEXIST) + rc = 0; + goto out; + case LL_IOC_HSM_PROGRESS: + rc = mdc_ioc_hsm_progress(exp, karg); + goto out; + case LL_IOC_HSM_STATE_GET: + rc = mdc_ioc_hsm_state_get(exp, karg); + goto out; + case LL_IOC_HSM_STATE_SET: + rc = mdc_ioc_hsm_state_set(exp, karg); + goto out; + case LL_IOC_HSM_ACTION: + rc = mdc_ioc_hsm_current_action(exp, karg); + goto out; + case LL_IOC_HSM_REQUEST: + rc = mdc_ioc_hsm_request(exp, karg); + goto out; + case OBD_IOC_CLIENT_RECOVER: + rc = ptlrpc_recover_import(imp, data->ioc_inlbuf1, 0); + if (rc < 0) + goto out; + rc = 0; + goto out; + case IOC_OSC_SET_ACTIVE: + rc = ptlrpc_set_import_active(imp, data->ioc_offset); + goto out; + case OBD_IOC_POLL_QUOTACHECK: + rc = mdc_quota_poll_check(exp, (struct if_quotacheck *)karg); + goto out; + case OBD_IOC_PING_TARGET: + rc = ptlrpc_obd_ping(obd); + goto out; + /* + * Normally IOC_OBD_STATFS, OBD_IOC_QUOTACTL iocontrol are handled by + * LMV instead of MDC. But when the cluster is upgraded from 1.8, + * there'd be no LMV layer thus we might be called here. Eventually + * this code should be removed. + * bz20731, LU-592. 
+ */ + case IOC_OBD_STATFS: { + struct obd_statfs stat_buf = {0}; + + if (*((__u32 *) data->ioc_inlbuf2) != 0) { + rc = -ENODEV; + goto out; + } + + /* copy UUID */ + if (copy_to_user(data->ioc_pbuf2, obd2cli_tgt(obd), + min_t(size_t, data->ioc_plen2, + sizeof(struct obd_uuid)))) { + rc = -EFAULT; + goto out; + } + + rc = mdc_statfs(NULL, obd->obd_self_export, &stat_buf, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + 0); + if (rc != 0) + goto out; + + if (copy_to_user(data->ioc_pbuf1, &stat_buf, + min_t(size_t, data->ioc_plen1, + sizeof(stat_buf)))) { + rc = -EFAULT; + goto out; + } + + rc = 0; + goto out; + } + case OBD_IOC_QUOTACTL: { + struct if_quotactl *qctl = karg; + struct obd_quotactl *oqctl; + + OBD_ALLOC_PTR(oqctl); + if (oqctl == NULL) { + rc = -ENOMEM; + goto out; + } + + QCTL_COPY(oqctl, qctl); + rc = obd_quotactl(exp, oqctl); + if (rc == 0) { + QCTL_COPY(qctl, oqctl); + qctl->qc_valid = QC_MDTIDX; + qctl->obd_uuid = obd->u.cli.cl_target_uuid; + } + + OBD_FREE_PTR(oqctl); + goto out; + } + case LL_IOC_GET_CONNECT_FLAGS: + if (copy_to_user(uarg, exp_connect_flags_ptr(exp), + sizeof(*exp_connect_flags_ptr(exp)))) { + rc = -EFAULT; + goto out; + } + + rc = 0; + goto out; + case LL_IOC_LOV_SWAP_LAYOUTS: + rc = mdc_ioc_swap_layouts(exp, karg); + goto out; + default: + CERROR("unrecognised ioctl: cmd = %#x\n", cmd); + rc = -ENOTTY; + goto out; + } +out: + module_put(THIS_MODULE); + + return rc; +} + +static int mdc_get_info_rpc(struct obd_export *exp, + u32 keylen, void *key, + int vallen, void *val) +{ + struct obd_import *imp = class_exp2cliimp(exp); + struct ptlrpc_request *req; + char *tmp; + int rc = -EINVAL; + + req = ptlrpc_request_alloc(imp, &RQF_MDS_GET_INFO); + if (req == NULL) + return -ENOMEM; + + req_capsule_set_size(&req->rq_pill, &RMF_GETINFO_KEY, + RCL_CLIENT, keylen); + req_capsule_set_size(&req->rq_pill, &RMF_GETINFO_VALLEN, + RCL_CLIENT, sizeof(__u32)); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GET_INFO); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + + tmp = req_capsule_client_get(&req->rq_pill, &RMF_GETINFO_KEY); + memcpy(tmp, key, keylen); + tmp = req_capsule_client_get(&req->rq_pill, &RMF_GETINFO_VALLEN); + memcpy(tmp, &vallen, sizeof(__u32)); + + req_capsule_set_size(&req->rq_pill, &RMF_GETINFO_VAL, + RCL_SERVER, vallen); + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + /* -EREMOTE means the get_info result is partial, and it needs to + * continue on another MDT, see fid2path part in lmv_iocontrol */ + if (rc == 0 || rc == -EREMOTE) { + tmp = req_capsule_server_get(&req->rq_pill, &RMF_GETINFO_VAL); + memcpy(val, tmp, vallen); + if (ptlrpc_rep_need_swab(req)) { + if (KEY_IS(KEY_FID2PATH)) + lustre_swab_fid2path(val); + } + } + ptlrpc_req_finished(req); + + return rc; +} + +static void lustre_swab_hai(struct hsm_action_item *h) +{ + __swab32s(&h->hai_len); + __swab32s(&h->hai_action); + lustre_swab_lu_fid(&h->hai_fid); + lustre_swab_lu_fid(&h->hai_dfid); + __swab64s(&h->hai_cookie); + __swab64s(&h->hai_extent.offset); + __swab64s(&h->hai_extent.length); + __swab64s(&h->hai_gid); +} + +static void lustre_swab_hal(struct hsm_action_list *h) +{ + struct hsm_action_item *hai; + int i; + + __swab32s(&h->hal_version); + __swab32s(&h->hal_count); + __swab32s(&h->hal_archive_id); + __swab64s(&h->hal_flags); + hai = hai_zero(h); + for (i = 0; i < h->hal_count; i++, hai = hai_next(hai)) + lustre_swab_hai(hai); +} + +static void lustre_swab_kuch(struct kuc_hdr *l) +{ + __swab16s(&l->kuc_magic); + /* __u8 l->kuc_transport 
*/ + __swab16s(&l->kuc_msgtype); + __swab16s(&l->kuc_msglen); +} + +static int mdc_ioc_hsm_ct_start(struct obd_export *exp, + struct lustre_kernelcomm *lk) +{ + struct obd_import *imp = class_exp2cliimp(exp); + __u32 archive = lk->lk_data; + int rc = 0; + + if (lk->lk_group != KUC_GRP_HSM) { + CERROR("Bad copytool group %d\n", lk->lk_group); + return -EINVAL; + } + + CDEBUG(D_HSM, "CT start r%d w%d u%d g%d f%#x\n", lk->lk_rfd, lk->lk_wfd, + lk->lk_uid, lk->lk_group, lk->lk_flags); + + if (lk->lk_flags & LK_FLG_STOP) { + /* Unregister with the coordinator */ + rc = mdc_ioc_hsm_ct_unregister(imp); + } else { + rc = mdc_ioc_hsm_ct_register(imp, archive); + } + + return rc; +} + +/** + * Send a message to any listening copytools + * @param val KUC message (kuc_hdr + hsm_action_list) + * @param len total length of message + */ +static int mdc_hsm_copytool_send(int len, void *val) +{ + struct kuc_hdr *lh = (struct kuc_hdr *)val; + struct hsm_action_list *hal = (struct hsm_action_list *)(lh + 1); + int rc; + + if (len < sizeof(*lh) + sizeof(*hal)) { + CERROR("Short HSM message %d < %d\n", len, + (int) (sizeof(*lh) + sizeof(*hal))); + return -EPROTO; + } + if (lh->kuc_magic == __swab16(KUC_MAGIC)) { + lustre_swab_kuch(lh); + lustre_swab_hal(hal); + } else if (lh->kuc_magic != KUC_MAGIC) { + CERROR("Bad magic %x!=%x\n", lh->kuc_magic, KUC_MAGIC); + return -EPROTO; + } + + CDEBUG(D_HSM, + "Received message mg=%x t=%d m=%d l=%d actions=%d on %s\n", + lh->kuc_magic, lh->kuc_transport, lh->kuc_msgtype, + lh->kuc_msglen, hal->hal_count, hal->hal_fsname); + + /* Broadcast to HSM listeners */ + rc = libcfs_kkuc_group_put(KUC_GRP_HSM, lh); + + return rc; +} + +/** + * callback function passed to kuc for re-registering each HSM copytool + * running on MDC, after MDT shutdown/recovery. + * @param data archive id served by the copytool + * @param cb_arg callback argument (obd_import) + */ +static int mdc_hsm_ct_reregister(__u32 data, void *cb_arg) +{ + struct obd_import *imp = (struct obd_import *)cb_arg; + __u32 archive = data; + int rc; + + CDEBUG(D_HA, "recover copytool registration to MDT (archive=%#x)\n", + archive); + rc = mdc_ioc_hsm_ct_register(imp, archive); + + /* ignore error if the copytool is already registered */ + return ((rc != 0) && (rc != -EEXIST)) ? rc : 0; +} + +/** + * Re-establish all kuc contexts with MDT + * after MDT shutdown/recovery. 
+ */ +static int mdc_kuc_reregister(struct obd_import *imp) +{ + /* re-register HSM agents */ + return libcfs_kkuc_group_foreach(KUC_GRP_HSM, mdc_hsm_ct_reregister, + (void *)imp); +} + +static int mdc_set_info_async(const struct lu_env *env, + struct obd_export *exp, + u32 keylen, void *key, + u32 vallen, void *val, + struct ptlrpc_request_set *set) +{ + struct obd_import *imp = class_exp2cliimp(exp); + int rc; + + if (KEY_IS(KEY_READ_ONLY)) { + if (vallen != sizeof(int)) + return -EINVAL; + + spin_lock(&imp->imp_lock); + if (*((int *)val)) { + imp->imp_connect_flags_orig |= OBD_CONNECT_RDONLY; + imp->imp_connect_data.ocd_connect_flags |= + OBD_CONNECT_RDONLY; + } else { + imp->imp_connect_flags_orig &= ~OBD_CONNECT_RDONLY; + imp->imp_connect_data.ocd_connect_flags &= + ~OBD_CONNECT_RDONLY; + } + spin_unlock(&imp->imp_lock); + + rc = do_set_info_async(imp, MDS_SET_INFO, LUSTRE_MDS_VERSION, + keylen, key, vallen, val, set); + return rc; + } + if (KEY_IS(KEY_SPTLRPC_CONF)) { + sptlrpc_conf_client_adapt(exp->exp_obd); + return 0; + } + if (KEY_IS(KEY_FLUSH_CTX)) { + sptlrpc_import_flush_my_ctx(imp); + return 0; + } + if (KEY_IS(KEY_CHANGELOG_CLEAR)) { + rc = do_set_info_async(imp, MDS_SET_INFO, LUSTRE_MDS_VERSION, + keylen, key, vallen, val, set); + return rc; + } + if (KEY_IS(KEY_HSM_COPYTOOL_SEND)) { + rc = mdc_hsm_copytool_send(vallen, val); + return rc; + } + + CERROR("Unknown key %s\n", (char *)key); + return -EINVAL; +} + +static int mdc_get_info(const struct lu_env *env, struct obd_export *exp, + __u32 keylen, void *key, __u32 *vallen, void *val, + struct lov_stripe_md *lsm) +{ + int rc = -EINVAL; + + if (KEY_IS(KEY_MAX_EASIZE)) { + int mdsize, *max_easize; + + if (*vallen != sizeof(int)) + return -EINVAL; + mdsize = *(int *)val; + if (mdsize > exp->exp_obd->u.cli.cl_max_mds_easize) + exp->exp_obd->u.cli.cl_max_mds_easize = mdsize; + max_easize = val; + *max_easize = exp->exp_obd->u.cli.cl_max_mds_easize; + return 0; + } else if (KEY_IS(KEY_DEFAULT_EASIZE)) { + int *default_easize; + + if (*vallen != sizeof(int)) + return -EINVAL; + default_easize = val; + *default_easize = exp->exp_obd->u.cli.cl_default_mds_easize; + return 0; + } else if (KEY_IS(KEY_MAX_COOKIESIZE)) { + int mdsize, *max_cookiesize; + + if (*vallen != sizeof(int)) + return -EINVAL; + mdsize = *(int *)val; + if (mdsize > exp->exp_obd->u.cli.cl_max_mds_cookiesize) + exp->exp_obd->u.cli.cl_max_mds_cookiesize = mdsize; + max_cookiesize = val; + *max_cookiesize = exp->exp_obd->u.cli.cl_max_mds_cookiesize; + return 0; + } else if (KEY_IS(KEY_DEFAULT_COOKIESIZE)) { + int *default_cookiesize; + + if (*vallen != sizeof(int)) + return -EINVAL; + default_cookiesize = val; + *default_cookiesize = + exp->exp_obd->u.cli.cl_default_mds_cookiesize; + return 0; + } else if (KEY_IS(KEY_CONN_DATA)) { + struct obd_import *imp = class_exp2cliimp(exp); + struct obd_connect_data *data = val; + + if (*vallen != sizeof(*data)) + return -EINVAL; + + *data = imp->imp_connect_data; + return 0; + } else if (KEY_IS(KEY_TGT_COUNT)) { + *((int *)val) = 1; + return 0; + } + + rc = mdc_get_info_rpc(exp, keylen, key, *vallen, val); + + return rc; +} + +static int mdc_sync(struct obd_export *exp, const struct lu_fid *fid, + struct obd_capa *oc, struct ptlrpc_request **request) +{ + struct ptlrpc_request *req; + int rc; + + *request = NULL; + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_SYNC); + if (req == NULL) + return -ENOMEM; + + mdc_set_capa_size(req, &RMF_CAPA1, oc); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_SYNC); + 
if (rc) { + ptlrpc_request_free(req); + return rc; + } + + mdc_pack_body(req, fid, oc, 0, 0, -1, 0); + + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) + ptlrpc_req_finished(req); + else + *request = req; + return rc; +} + +static int mdc_import_event(struct obd_device *obd, struct obd_import *imp, + enum obd_import_event event) +{ + int rc = 0; + + LASSERT(imp->imp_obd == obd); + + switch (event) { + case IMP_EVENT_DISCON: { +#if 0 + /* XXX Pass event up to OBDs stack. used only for FLD now */ + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_DISCON, NULL); +#endif + break; + } + case IMP_EVENT_INACTIVE: { + struct client_obd *cli = &obd->u.cli; + /* + * Flush current sequence to make client obtain new one + * from server in case of disconnect/reconnect. + */ + if (cli->cl_seq != NULL) + seq_client_flush(cli->cl_seq); + + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_INACTIVE, NULL); + break; + } + case IMP_EVENT_INVALIDATE: { + struct ldlm_namespace *ns = obd->obd_namespace; + + ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY); + + break; + } + case IMP_EVENT_ACTIVE: + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVE, NULL); + /* redo the kuc registration after reconnecting */ + if (rc == 0) + rc = mdc_kuc_reregister(imp); + break; + case IMP_EVENT_OCD: + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_OCD, NULL); + break; + case IMP_EVENT_DEACTIVATE: + case IMP_EVENT_ACTIVATE: + break; + default: + CERROR("Unknown import event %x\n", event); + LBUG(); + } + return rc; +} + +int mdc_fid_alloc(struct obd_export *exp, struct lu_fid *fid, + struct md_op_data *op_data) +{ + struct client_obd *cli = &exp->exp_obd->u.cli; + struct lu_client_seq *seq = cli->cl_seq; + + return seq_client_alloc_fid(NULL, seq, fid); +} + +static struct obd_uuid *mdc_get_uuid(struct obd_export *exp) +{ + struct client_obd *cli = &exp->exp_obd->u.cli; + + return &cli->cl_target_uuid; +} + +/** + * Determine whether the lock can be canceled before replaying it during + * recovery, non zero value will be return if the lock can be canceled, + * or zero returned for not + */ +static int mdc_cancel_for_recovery(struct ldlm_lock *lock) +{ + if (lock->l_resource->lr_type != LDLM_IBITS) + return 0; + + /* FIXME: if we ever get into a situation where there are too many + * opened files with open locks on a single node, then we really + * should replay these open locks to reget it */ + if (lock->l_policy_data.l_inodebits.bits & MDS_INODELOCK_OPEN) + return 0; + + return 1; +} + +static int mdc_resource_inode_free(struct ldlm_resource *res) +{ + if (res->lr_lvb_inode) + res->lr_lvb_inode = NULL; + + return 0; +} + +static struct ldlm_valblock_ops inode_lvbo = { + .lvbo_free = mdc_resource_inode_free, +}; + +static int mdc_llog_init(struct obd_device *obd) +{ + struct obd_llog_group *olg = &obd->obd_olg; + struct llog_ctxt *ctxt; + int rc; + + rc = llog_setup(NULL, obd, olg, LLOG_CHANGELOG_REPL_CTXT, obd, + &llog_client_ops); + if (rc) + return rc; + + ctxt = llog_group_get_ctxt(olg, LLOG_CHANGELOG_REPL_CTXT); + llog_initiator_connect(ctxt); + llog_ctxt_put(ctxt); + + return 0; +} + +static void mdc_llog_finish(struct obd_device *obd) +{ + struct llog_ctxt *ctxt; + + ctxt = llog_get_context(obd, LLOG_CHANGELOG_REPL_CTXT); + if (ctxt) + llog_cleanup(NULL, ctxt); +} + +static int mdc_setup(struct obd_device *obd, struct lustre_cfg *cfg) +{ + struct client_obd *cli = &obd->u.cli; + struct lprocfs_static_vars lvars = { NULL }; + int rc; + + OBD_ALLOC(cli->cl_rpc_lock, sizeof(*cli->cl_rpc_lock)); + if 
(!cli->cl_rpc_lock) + return -ENOMEM; + mdc_init_rpc_lock(cli->cl_rpc_lock); + + ptlrpcd_addref(); + + OBD_ALLOC(cli->cl_close_lock, sizeof(*cli->cl_close_lock)); + if (!cli->cl_close_lock) { + rc = -ENOMEM; + goto err_rpc_lock; + } + mdc_init_rpc_lock(cli->cl_close_lock); + + rc = client_obd_setup(obd, cfg); + if (rc) + goto err_close_lock; + lprocfs_mdc_init_vars(&lvars); + lprocfs_obd_setup(obd, lvars.obd_vars); + sptlrpc_lprocfs_cliobd_attach(obd); + ptlrpc_lprocfs_register_obd(obd); + + ns_register_cancel(obd->obd_namespace, mdc_cancel_for_recovery); + + obd->obd_namespace->ns_lvbo = &inode_lvbo; + + rc = mdc_llog_init(obd); + if (rc) { + mdc_cleanup(obd); + CERROR("failed to setup llogging subsystems\n"); + } + + return rc; + +err_close_lock: + OBD_FREE(cli->cl_close_lock, sizeof(*cli->cl_close_lock)); +err_rpc_lock: + OBD_FREE(cli->cl_rpc_lock, sizeof(*cli->cl_rpc_lock)); + ptlrpcd_decref(); + return rc; +} + +/* Initialize the default and maximum LOV EA and cookie sizes. This allows + * us to make MDS RPCs with large enough reply buffers to hold a default + * sized EA and cookie without having to calculate this (via a call into the + * LOV + OSCs) each time we make an RPC. The maximum size is also tracked + * but not used to avoid wastefully vmalloc()'ing large reply buffers when + * a large number of stripes is possible. If a larger reply buffer is + * required it will be reallocated in the ptlrpc layer due to overflow. + */ +static int mdc_init_ea_size(struct obd_export *exp, int easize, + int def_easize, int cookiesize, int def_cookiesize) +{ + struct obd_device *obd = exp->exp_obd; + struct client_obd *cli = &obd->u.cli; + + if (cli->cl_max_mds_easize < easize) + cli->cl_max_mds_easize = easize; + + if (cli->cl_default_mds_easize < def_easize) + cli->cl_default_mds_easize = def_easize; + + if (cli->cl_max_mds_cookiesize < cookiesize) + cli->cl_max_mds_cookiesize = cookiesize; + + if (cli->cl_default_mds_cookiesize < def_cookiesize) + cli->cl_default_mds_cookiesize = def_cookiesize; + + return 0; +} + +static int mdc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage) +{ + switch (stage) { + case OBD_CLEANUP_EARLY: + break; + case OBD_CLEANUP_EXPORTS: + /* Failsafe, ok if racy */ + if (obd->obd_type->typ_refcnt <= 1) + libcfs_kkuc_group_rem(0, KUC_GRP_HSM); + + obd_cleanup_client_import(obd); + ptlrpc_lprocfs_unregister_obd(obd); + lprocfs_obd_cleanup(obd); + + mdc_llog_finish(obd); + break; + } + return 0; +} + +static int mdc_cleanup(struct obd_device *obd) +{ + struct client_obd *cli = &obd->u.cli; + + OBD_FREE(cli->cl_rpc_lock, sizeof(*cli->cl_rpc_lock)); + OBD_FREE(cli->cl_close_lock, sizeof(*cli->cl_close_lock)); + + ptlrpcd_decref(); + + return client_obd_cleanup(obd); +} + +static int mdc_process_config(struct obd_device *obd, u32 len, void *buf) +{ + struct lustre_cfg *lcfg = buf; + struct lprocfs_static_vars lvars = { NULL }; + int rc = 0; + + lprocfs_mdc_init_vars(&lvars); + switch (lcfg->lcfg_command) { + default: + rc = class_process_proc_param(PARAM_MDC, lvars.obd_vars, + lcfg, obd); + if (rc > 0) + rc = 0; + break; + } + return rc; +} + + +/* get remote permission for current user on fid */ +static int mdc_get_remote_perm(struct obd_export *exp, const struct lu_fid *fid, + struct obd_capa *oc, __u32 suppgid, + struct ptlrpc_request **request) +{ + struct ptlrpc_request *req; + int rc; + + LASSERT(client_is_remote(exp)); + + *request = NULL; + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_GETATTR); + if (req == NULL) + return -ENOMEM; + + 
mdc_set_capa_size(req, &RMF_CAPA1, oc); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GETATTR); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + + mdc_pack_body(req, fid, oc, OBD_MD_FLRMTPERM, 0, suppgid, 0); + + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, + sizeof(struct mdt_remote_perm)); + + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) + ptlrpc_req_finished(req); + else + *request = req; + return rc; +} + +static int mdc_interpret_renew_capa(const struct lu_env *env, + struct ptlrpc_request *req, void *args, + int status) +{ + struct mdc_renew_capa_args *ra = args; + struct mdt_body *body = NULL; + struct lustre_capa *capa; + + if (status) { + capa = ERR_PTR(status); + goto out; + } + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (body == NULL) { + capa = ERR_PTR(-EFAULT); + goto out; + } + + if ((body->valid & OBD_MD_FLOSSCAPA) == 0) { + capa = ERR_PTR(-ENOENT); + goto out; + } + + capa = req_capsule_server_get(&req->rq_pill, &RMF_CAPA2); + if (!capa) { + capa = ERR_PTR(-EFAULT); + goto out; + } +out: + ra->ra_cb(ra->ra_oc, capa); + return 0; +} + +static int mdc_renew_capa(struct obd_export *exp, struct obd_capa *oc, + renew_capa_cb_t cb) +{ + struct ptlrpc_request *req; + struct mdc_renew_capa_args *ra; + + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), &RQF_MDS_GETATTR, + LUSTRE_MDS_VERSION, MDS_GETATTR); + if (req == NULL) + return -ENOMEM; + + /* NB, OBD_MD_FLOSSCAPA is set here, but it doesn't necessarily mean the + * capa to renew is oss capa. + */ + mdc_pack_body(req, &oc->c_capa.lc_fid, oc, OBD_MD_FLOSSCAPA, 0, -1, 0); + ptlrpc_request_set_replen(req); + + CLASSERT(sizeof(*ra) <= sizeof(req->rq_async_args)); + ra = ptlrpc_req_async_args(req); + ra->ra_oc = oc; + ra->ra_cb = cb; + req->rq_interpret_reply = mdc_interpret_renew_capa; + ptlrpcd_add_req(req, PDL_POLICY_LOCAL, -1); + return 0; +} + +static struct obd_ops mdc_obd_ops = { + .o_owner = THIS_MODULE, + .o_setup = mdc_setup, + .o_precleanup = mdc_precleanup, + .o_cleanup = mdc_cleanup, + .o_add_conn = client_import_add_conn, + .o_del_conn = client_import_del_conn, + .o_connect = client_connect_import, + .o_disconnect = client_disconnect_export, + .o_iocontrol = mdc_iocontrol, + .o_set_info_async = mdc_set_info_async, + .o_statfs = mdc_statfs, + .o_fid_init = client_fid_init, + .o_fid_fini = client_fid_fini, + .o_fid_alloc = mdc_fid_alloc, + .o_import_event = mdc_import_event, + .o_get_info = mdc_get_info, + .o_process_config = mdc_process_config, + .o_get_uuid = mdc_get_uuid, + .o_quotactl = mdc_quotactl, + .o_quotacheck = mdc_quotacheck +}; + +static struct md_ops mdc_md_ops = { + .m_getstatus = mdc_getstatus, + .m_null_inode = mdc_null_inode, + .m_find_cbdata = mdc_find_cbdata, + .m_close = mdc_close, + .m_create = mdc_create, + .m_done_writing = mdc_done_writing, + .m_enqueue = mdc_enqueue, + .m_getattr = mdc_getattr, + .m_getattr_name = mdc_getattr_name, + .m_intent_lock = mdc_intent_lock, + .m_link = mdc_link, + .m_is_subdir = mdc_is_subdir, + .m_rename = mdc_rename, + .m_setattr = mdc_setattr, + .m_setxattr = mdc_setxattr, + .m_getxattr = mdc_getxattr, + .m_sync = mdc_sync, + .m_readpage = mdc_readpage, + .m_unlink = mdc_unlink, + .m_cancel_unused = mdc_cancel_unused, + .m_init_ea_size = mdc_init_ea_size, + .m_set_lock_data = mdc_set_lock_data, + .m_lock_match = mdc_lock_match, + .m_get_lustre_md = mdc_get_lustre_md, + .m_free_lustre_md = mdc_free_lustre_md, + .m_set_open_replay_data = mdc_set_open_replay_data, + 
.m_clear_open_replay_data = mdc_clear_open_replay_data, + .m_renew_capa = mdc_renew_capa, + .m_unpack_capa = mdc_unpack_capa, + .m_get_remote_perm = mdc_get_remote_perm, + .m_intent_getattr_async = mdc_intent_getattr_async, + .m_revalidate_lock = mdc_revalidate_lock +}; + +static int __init mdc_init(void) +{ + struct lprocfs_static_vars lvars = { NULL }; + + lprocfs_mdc_init_vars(&lvars); + + return class_register_type(&mdc_obd_ops, &mdc_md_ops, lvars.module_vars, + LUSTRE_MDC_NAME, NULL); +} + +static void /*__exit*/ mdc_exit(void) +{ + class_unregister_type(LUSTRE_MDC_NAME); +} + +MODULE_AUTHOR("Sun Microsystems, Inc. "); +MODULE_DESCRIPTION("Lustre Metadata Client"); +MODULE_LICENSE("GPL"); + +module_init(mdc_init); +module_exit(mdc_exit); diff --git a/kernel/drivers/staging/lustre/lustre/mgc/Makefile b/kernel/drivers/staging/lustre/lustre/mgc/Makefile new file mode 100644 index 000000000..cc6e9f51a --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/mgc/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_LUSTRE_FS) += mgc.o +mgc-y := mgc_request.o +mgc-$(CONFIG_PROC_FS) += lproc_mgc.o diff --git a/kernel/drivers/staging/lustre/lustre/mgc/lproc_mgc.c b/kernel/drivers/staging/lustre/lustre/mgc/lproc_mgc.c new file mode 100644 index 000000000..c4ea38e5f --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/mgc/lproc_mgc.c @@ -0,0 +1,80 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include "../include/obd_class.h" +#include "../include/lprocfs_status.h" +#include "mgc_internal.h" + +LPROC_SEQ_FOPS_RO_TYPE(mgc, uuid); +LPROC_SEQ_FOPS_RO_TYPE(mgc, connect_flags); +LPROC_SEQ_FOPS_RO_TYPE(mgc, server_uuid); +LPROC_SEQ_FOPS_RO_TYPE(mgc, conn_uuid); +LPROC_SEQ_FOPS_RO_TYPE(mgc, import); +LPROC_SEQ_FOPS_RO_TYPE(mgc, state); + +LPROC_SEQ_FOPS_WR_ONLY(mgc, ping); + +static int mgc_ir_state_seq_show(struct seq_file *m, void *v) +{ + return lprocfs_mgc_rd_ir_state(m, m->private); +} +LPROC_SEQ_FOPS_RO(mgc_ir_state); + +static struct lprocfs_vars lprocfs_mgc_obd_vars[] = { + { "uuid", &mgc_uuid_fops, NULL, 0 }, + { "ping", &mgc_ping_fops, NULL, 0222 }, + { "connect_flags", &mgc_connect_flags_fops, NULL, 0 }, + { "mgs_server_uuid", &mgc_server_uuid_fops, NULL, 0 }, + { "mgs_conn_uuid", &mgc_conn_uuid_fops, NULL, 0 }, + { "import", &mgc_import_fops, NULL, 0 }, + { "state", &mgc_state_fops, NULL, 0 }, + { "ir_state", &mgc_ir_state_fops, NULL, 0 }, + { NULL } +}; + +LPROC_SEQ_FOPS_RO_TYPE(mgc, numrefs); +static struct lprocfs_vars lprocfs_mgc_module_vars[] = { + { "num_refs", &mgc_numrefs_fops, NULL, 0 }, + { NULL } +}; + +void lprocfs_mgc_init_vars(struct lprocfs_static_vars *lvars) +{ + lvars->module_vars = lprocfs_mgc_module_vars; + lvars->obd_vars = lprocfs_mgc_obd_vars; +} diff --git a/kernel/drivers/staging/lustre/lustre/mgc/mgc_internal.h b/kernel/drivers/staging/lustre/lustre/mgc/mgc_internal.h new file mode 100644 index 000000000..a6f8b3ced --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/mgc/mgc_internal.h @@ -0,0 +1,73 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#ifndef _MGC_INTERNAL_H +#define _MGC_INTERNAL_H + +#include "../../include/linux/libcfs/libcfs.h" +#include "../include/lustre/lustre_idl.h" +#include "../include/lustre_lib.h" +#include "../include/lustre_dlm.h" +#include "../include/lustre_log.h" +#include "../include/lustre_export.h" + +#if defined (CONFIG_PROC_FS) +void lprocfs_mgc_init_vars(struct lprocfs_static_vars *lvars); +int lprocfs_mgc_rd_ir_state(struct seq_file *m, void *data); +#else +static inline void lprocfs_mgc_init_vars(struct lprocfs_static_vars *lvars) +{ + memset(lvars, 0, sizeof(*lvars)); +} +static inline int lprocfs_mgc_rd_ir_state(struct seq_file *m, void *data) +{ + return 0; +} +#endif /* CONFIG_PROC_FS */ + +int mgc_process_log(struct obd_device *mgc, struct config_llog_data *cld); + +static inline int cld_is_sptlrpc(struct config_llog_data *cld) +{ + return cld->cld_type == CONFIG_T_SPTLRPC; +} + +static inline int cld_is_recover(struct config_llog_data *cld) +{ + return cld->cld_type == CONFIG_T_RECOVER; +} + +#endif /* _MGC_INTERNAL_H */ diff --git a/kernel/drivers/staging/lustre/lustre/mgc/mgc_request.c b/kernel/drivers/staging/lustre/lustre/mgc/mgc_request.c new file mode 100644 index 000000000..7947aec5c --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/mgc/mgc_request.c @@ -0,0 +1,1762 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/mgc/mgc_request.c + * + * Author: Nathan Rutman + */ + +#define DEBUG_SUBSYSTEM S_MGC +#define D_MGC D_CONFIG /*|D_WARNING*/ + +#include +#include "../include/obd_class.h" +#include "../include/lustre_dlm.h" +#include "../include/lprocfs_status.h" +#include "../include/lustre_log.h" +#include "../include/lustre_disk.h" + +#include "mgc_internal.h" + +static int mgc_name2resid(char *name, int len, struct ldlm_res_id *res_id, + int type) +{ + __u64 resname = 0; + + if (len > sizeof(resname)) { + CERROR("name too long: %s\n", name); + return -EINVAL; + } + if (len <= 0) { + CERROR("missing name: %s\n", name); + return -EINVAL; + } + memcpy(&resname, name, len); + + /* Always use the same endianness for the resid */ + memset(res_id, 0, sizeof(*res_id)); + res_id->name[0] = cpu_to_le64(resname); + /* XXX: unfortunately, sptlprc and config llog share one lock */ + switch (type) { + case CONFIG_T_CONFIG: + case CONFIG_T_SPTLRPC: + resname = 0; + break; + case CONFIG_T_RECOVER: + case CONFIG_T_PARAMS: + resname = type; + break; + default: + LBUG(); + } + res_id->name[1] = cpu_to_le64(resname); + CDEBUG(D_MGC, "log %s to resid %#llx/%#llx (%.8s)\n", name, + res_id->name[0], res_id->name[1], (char *)&res_id->name[0]); + return 0; +} + +int mgc_fsname2resid(char *fsname, struct ldlm_res_id *res_id, int type) +{ + /* fsname is at most 8 chars long, maybe contain "-". + * e.g. "lustre", "SUN-000" */ + return mgc_name2resid(fsname, strlen(fsname), res_id, type); +} +EXPORT_SYMBOL(mgc_fsname2resid); + +static int mgc_logname2resid(char *logname, struct ldlm_res_id *res_id, int type) +{ + char *name_end; + int len; + + /* logname consists of "fsname-nodetype". + * e.g. "lustre-MDT0001", "SUN-000-client" + * there is an exception: llog "params" */ + name_end = strrchr(logname, '-'); + if (!name_end) + len = strlen(logname); + else + len = name_end - logname; + return mgc_name2resid(logname, len, res_id, type); +} + +/********************** config llog list **********************/ +static LIST_HEAD(config_llog_list); +static DEFINE_SPINLOCK(config_list_lock); + +/* Take a reference to a config log */ +static int config_log_get(struct config_llog_data *cld) +{ + atomic_inc(&cld->cld_refcount); + CDEBUG(D_INFO, "log %s refs %d\n", cld->cld_logname, + atomic_read(&cld->cld_refcount)); + return 0; +} + +/* Drop a reference to a config log. When no longer referenced, + we can free the config log data */ +static void config_log_put(struct config_llog_data *cld) +{ + CDEBUG(D_INFO, "log %s refs %d\n", cld->cld_logname, + atomic_read(&cld->cld_refcount)); + LASSERT(atomic_read(&cld->cld_refcount) > 0); + + /* spinlock to make sure no item with 0 refcount in the list */ + if (atomic_dec_and_lock(&cld->cld_refcount, &config_list_lock)) { + list_del(&cld->cld_list_chain); + spin_unlock(&config_list_lock); + + CDEBUG(D_MGC, "dropping config log %s\n", cld->cld_logname); + + if (cld->cld_recover) + config_log_put(cld->cld_recover); + if (cld->cld_sptlrpc) + config_log_put(cld->cld_sptlrpc); + if (cld->cld_params) + config_log_put(cld->cld_params); + if (cld_is_sptlrpc(cld)) + sptlrpc_conf_log_stop(cld->cld_logname); + + class_export_put(cld->cld_mgcexp); + OBD_FREE(cld, sizeof(*cld) + strlen(cld->cld_logname) + 1); + } +} + +/* Find a config log by name */ +static +struct config_llog_data *config_log_find(char *logname, + struct config_llog_instance *cfg) +{ + struct config_llog_data *cld; + struct config_llog_data *found = NULL; + void *instance; + + LASSERT(logname != NULL); + + instance = cfg ? 
cfg->cfg_instance : NULL; + spin_lock(&config_list_lock); + list_for_each_entry(cld, &config_llog_list, cld_list_chain) { + /* check if instance equals */ + if (instance != cld->cld_cfg.cfg_instance) + continue; + + /* instance may be NULL, should check name */ + if (strcmp(logname, cld->cld_logname) == 0) { + found = cld; + break; + } + } + if (found) { + atomic_inc(&found->cld_refcount); + LASSERT(found->cld_stopping == 0 || cld_is_sptlrpc(found) == 0); + } + spin_unlock(&config_list_lock); + return found; +} + +static +struct config_llog_data *do_config_log_add(struct obd_device *obd, + char *logname, + int type, + struct config_llog_instance *cfg, + struct super_block *sb) +{ + struct config_llog_data *cld; + int rc; + + CDEBUG(D_MGC, "do adding config log %s:%p\n", logname, + cfg ? cfg->cfg_instance : NULL); + + OBD_ALLOC(cld, sizeof(*cld) + strlen(logname) + 1); + if (!cld) + return ERR_PTR(-ENOMEM); + + strcpy(cld->cld_logname, logname); + if (cfg) + cld->cld_cfg = *cfg; + else + cld->cld_cfg.cfg_callback = class_config_llog_handler; + mutex_init(&cld->cld_lock); + cld->cld_cfg.cfg_last_idx = 0; + cld->cld_cfg.cfg_flags = 0; + cld->cld_cfg.cfg_sb = sb; + cld->cld_type = type; + atomic_set(&cld->cld_refcount, 1); + + /* Keep the mgc around until we are done */ + cld->cld_mgcexp = class_export_get(obd->obd_self_export); + + if (cld_is_sptlrpc(cld)) { + sptlrpc_conf_log_start(logname); + cld->cld_cfg.cfg_obdname = obd->obd_name; + } + + rc = mgc_logname2resid(logname, &cld->cld_resid, type); + + spin_lock(&config_list_lock); + list_add(&cld->cld_list_chain, &config_llog_list); + spin_unlock(&config_list_lock); + + if (rc) { + config_log_put(cld); + return ERR_PTR(rc); + } + + if (cld_is_sptlrpc(cld)) { + rc = mgc_process_log(obd, cld); + if (rc && rc != -ENOENT) + CERROR("failed processing sptlrpc log: %d\n", rc); + } + + return cld; +} + +static struct config_llog_data *config_recover_log_add(struct obd_device *obd, + char *fsname, + struct config_llog_instance *cfg, + struct super_block *sb) +{ + struct config_llog_instance lcfg = *cfg; + struct lustre_sb_info *lsi = s2lsi(sb); + struct config_llog_data *cld; + char logname[32]; + + if (IS_OST(lsi)) + return NULL; + + /* for osp-on-ost, see lustre_start_osp() */ + if (IS_MDT(lsi) && lcfg.cfg_instance) + return NULL; + + /* we have to use different llog for clients and mdts for cmd + * where only clients are notified if one of cmd server restarts */ + LASSERT(strlen(fsname) < sizeof(logname) / 2); + strcpy(logname, fsname); + if (IS_SERVER(lsi)) { /* mdt */ + LASSERT(lcfg.cfg_instance == NULL); + lcfg.cfg_instance = sb; + strcat(logname, "-mdtir"); + } else { + LASSERT(lcfg.cfg_instance != NULL); + strcat(logname, "-cliir"); + } + + cld = do_config_log_add(obd, logname, CONFIG_T_RECOVER, &lcfg, sb); + return cld; +} + +static struct config_llog_data *config_params_log_add(struct obd_device *obd, + struct config_llog_instance *cfg, struct super_block *sb) +{ + struct config_llog_instance lcfg = *cfg; + struct config_llog_data *cld; + + lcfg.cfg_instance = sb; + + cld = do_config_log_add(obd, PARAMS_FILENAME, CONFIG_T_PARAMS, + &lcfg, sb); + + return cld; +} + +/** Add this log to the list of active logs watched by an MGC. + * Active means we're watching for updates. + * We have one active log per "mount" - client instance or servername. + * Each instance may be at a different point in the log. 
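/*
 * Editor's note: illustrative sketch, not part of the patch. It models the
 * config-log registry used above (config_log_get/put/find) in plain
 * userspace C: entries live on one list, carry a reference count, and are
 * unlinked and freed only when the last reference is dropped while the
 * list lock is held, so a lookup can never return an entry whose count has
 * already reached zero. All "demo_" names are invented for the example.
 */
#include <pthread.h>
#include <stdlib.h>
#include <string.h>

struct demo_cld {
	struct demo_cld *next;
	int refcount;				/* stands in for cld_refcount */
	char logname[64];			/* stands in for cld_logname */
};

static struct demo_cld *demo_list;		/* stands in for config_llog_list */
static pthread_mutex_t demo_lock = PTHREAD_MUTEX_INITIALIZER;

/* take a reference, as config_log_get() does */
static void demo_log_get(struct demo_cld *cld)
{
	pthread_mutex_lock(&demo_lock);
	cld->refcount++;
	pthread_mutex_unlock(&demo_lock);
}

/* drop a reference; free the entry once nobody references it any more */
static void demo_log_put(struct demo_cld *cld)
{
	struct demo_cld **pp;

	pthread_mutex_lock(&demo_lock);
	if (--cld->refcount > 0) {
		pthread_mutex_unlock(&demo_lock);
		return;
	}
	/* unlink while still holding the list lock */
	for (pp = &demo_list; *pp; pp = &(*pp)->next) {
		if (*pp == cld) {
			*pp = cld->next;
			break;
		}
	}
	pthread_mutex_unlock(&demo_lock);
	free(cld);
}

/* look an entry up by name; on success it is returned with an extra reference */
static struct demo_cld *demo_log_find(const char *logname)
{
	struct demo_cld *cld;

	pthread_mutex_lock(&demo_lock);
	for (cld = demo_list; cld; cld = cld->next) {
		if (strcmp(cld->logname, logname) == 0) {
			cld->refcount++;
			break;
		}
	}
	pthread_mutex_unlock(&demo_lock);
	return cld;
}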
+ */ +static int config_log_add(struct obd_device *obd, char *logname, + struct config_llog_instance *cfg, + struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct config_llog_data *cld; + struct config_llog_data *sptlrpc_cld; + struct config_llog_data *params_cld; + char seclogname[32]; + char *ptr; + int rc; + + CDEBUG(D_MGC, "adding config log %s:%p\n", logname, cfg->cfg_instance); + + /* + * for each regular log, the depended sptlrpc log name is + * -sptlrpc. multiple regular logs may share one sptlrpc log. + */ + ptr = strrchr(logname, '-'); + if (ptr == NULL || ptr - logname > 8) { + CERROR("logname %s is too long\n", logname); + return -EINVAL; + } + + memcpy(seclogname, logname, ptr - logname); + strcpy(seclogname + (ptr - logname), "-sptlrpc"); + + sptlrpc_cld = config_log_find(seclogname, NULL); + if (sptlrpc_cld == NULL) { + sptlrpc_cld = do_config_log_add(obd, seclogname, + CONFIG_T_SPTLRPC, NULL, NULL); + if (IS_ERR(sptlrpc_cld)) { + CERROR("can't create sptlrpc log: %s\n", seclogname); + rc = PTR_ERR(sptlrpc_cld); + goto out_err; + } + } + params_cld = config_params_log_add(obd, cfg, sb); + if (IS_ERR(params_cld)) { + rc = PTR_ERR(params_cld); + CERROR("%s: can't create params log: rc = %d\n", + obd->obd_name, rc); + goto out_err1; + } + + cld = do_config_log_add(obd, logname, CONFIG_T_CONFIG, cfg, sb); + if (IS_ERR(cld)) { + CERROR("can't create log: %s\n", logname); + rc = PTR_ERR(cld); + goto out_err2; + } + + cld->cld_sptlrpc = sptlrpc_cld; + cld->cld_params = params_cld; + + LASSERT(lsi->lsi_lmd); + if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOIR)) { + struct config_llog_data *recover_cld; + *strrchr(seclogname, '-') = 0; + recover_cld = config_recover_log_add(obd, seclogname, cfg, sb); + if (IS_ERR(recover_cld)) { + rc = PTR_ERR(recover_cld); + goto out_err3; + } + cld->cld_recover = recover_cld; + } + + return 0; + +out_err3: + config_log_put(cld); + +out_err2: + config_log_put(params_cld); + +out_err1: + config_log_put(sptlrpc_cld); + +out_err: + return rc; +} + +DEFINE_MUTEX(llog_process_lock); + +/** Stop watching for updates on this log. + */ +static int config_log_end(char *logname, struct config_llog_instance *cfg) +{ + struct config_llog_data *cld; + struct config_llog_data *cld_sptlrpc = NULL; + struct config_llog_data *cld_params = NULL; + struct config_llog_data *cld_recover = NULL; + int rc = 0; + + cld = config_log_find(logname, cfg); + if (cld == NULL) + return -ENOENT; + + mutex_lock(&cld->cld_lock); + /* + * if cld_stopping is set, it means we didn't start the log thus + * not owning the start ref. this can happen after previous umount: + * the cld still hanging there waiting for lock cancel, and we + * remount again but failed in the middle and call log_end without + * calling start_log. 
+ */ + if (unlikely(cld->cld_stopping)) { + mutex_unlock(&cld->cld_lock); + /* drop the ref from the find */ + config_log_put(cld); + return rc; + } + + cld->cld_stopping = 1; + + cld_recover = cld->cld_recover; + cld->cld_recover = NULL; + mutex_unlock(&cld->cld_lock); + + if (cld_recover) { + mutex_lock(&cld_recover->cld_lock); + cld_recover->cld_stopping = 1; + mutex_unlock(&cld_recover->cld_lock); + config_log_put(cld_recover); + } + + spin_lock(&config_list_lock); + cld_sptlrpc = cld->cld_sptlrpc; + cld->cld_sptlrpc = NULL; + cld_params = cld->cld_params; + cld->cld_params = NULL; + spin_unlock(&config_list_lock); + + if (cld_sptlrpc) + config_log_put(cld_sptlrpc); + + if (cld_params) { + mutex_lock(&cld_params->cld_lock); + cld_params->cld_stopping = 1; + mutex_unlock(&cld_params->cld_lock); + config_log_put(cld_params); + } + + /* drop the ref from the find */ + config_log_put(cld); + /* drop the start ref */ + config_log_put(cld); + + CDEBUG(D_MGC, "end config log %s (%d)\n", logname ? logname : "client", + rc); + return rc; +} + +#if defined (CONFIG_PROC_FS) +int lprocfs_mgc_rd_ir_state(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + struct obd_import *imp; + struct obd_connect_data *ocd; + struct config_llog_data *cld; + + LPROCFS_CLIMP_CHECK(obd); + imp = obd->u.cli.cl_import; + ocd = &imp->imp_connect_data; + + seq_printf(m, "imperative_recovery: %s\n", + OCD_HAS_FLAG(ocd, IMP_RECOV) ? "ENABLED" : "DISABLED"); + seq_printf(m, "client_state:\n"); + + spin_lock(&config_list_lock); + list_for_each_entry(cld, &config_llog_list, cld_list_chain) { + if (cld->cld_recover == NULL) + continue; + seq_printf(m, " - { client: %s, nidtbl_version: %u }\n", + cld->cld_logname, + cld->cld_recover->cld_cfg.cfg_last_idx); + } + spin_unlock(&config_list_lock); + + LPROCFS_CLIMP_EXIT(obd); + return 0; +} +#endif + +/* reenqueue any lost locks */ +#define RQ_RUNNING 0x1 +#define RQ_NOW 0x2 +#define RQ_LATER 0x4 +#define RQ_STOP 0x8 +#define RQ_PRECLEANUP 0x10 +static int rq_state; +static wait_queue_head_t rq_waitq; +static DECLARE_COMPLETION(rq_exit); +static DECLARE_COMPLETION(rq_start); + +static void do_requeue(struct config_llog_data *cld) +{ + LASSERT(atomic_read(&cld->cld_refcount) > 0); + + /* Do not run mgc_process_log on a disconnected export or an + export which is being disconnected. Take the client + semaphore to make the check non-racy. */ + down_read(&cld->cld_mgcexp->exp_obd->u.cli.cl_sem); + if (cld->cld_mgcexp->exp_obd->u.cli.cl_conn_count != 0) { + CDEBUG(D_MGC, "updating log %s\n", cld->cld_logname); + mgc_process_log(cld->cld_mgcexp->exp_obd, cld); + } else { + CDEBUG(D_MGC, "disconnecting, won't update log %s\n", + cld->cld_logname); + } + up_read(&cld->cld_mgcexp->exp_obd->u.cli.cl_sem); +} + +/* this timeout represents how many seconds MGC should wait before + * requeue config and recover lock to the MGS. We need to randomize this + * in order to not flood the MGS. 
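/*
 * Editor's note: illustrative sketch, not part of the patch. The two macros
 * defined just below give a 5 s base delay plus up to 0x1ff centiseconds
 * (about 5.1 s) of random jitter before requeueing, so clients do not all
 * hammer the MGS at the same moment. The same arithmetic rendered in
 * userspace C, in milliseconds, with invented "demo_" names:
 */
#include <stdlib.h>

#define DEMO_TIMEOUT_MIN_SECONDS	5
#define DEMO_TIMEOUT_RAND_CENTISEC	0x1ff	/* ~500 centiseconds */

/* returns the requeue delay in milliseconds: 5000 ms .. ~10110 ms */
static unsigned int demo_requeue_delay_ms(void)
{
	unsigned int jitter_csec = (unsigned int)rand() & DEMO_TIMEOUT_RAND_CENTISEC;

	return DEMO_TIMEOUT_MIN_SECONDS * 1000 + jitter_csec * 10;
}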
+ */ +#define MGC_TIMEOUT_MIN_SECONDS 5 +#define MGC_TIMEOUT_RAND_CENTISEC 0x1ff /* ~500 */ + +static int mgc_requeue_thread(void *data) +{ + bool first = true; + + CDEBUG(D_MGC, "Starting requeue thread\n"); + + /* Keep trying failed locks periodically */ + spin_lock(&config_list_lock); + rq_state |= RQ_RUNNING; + while (1) { + struct l_wait_info lwi; + struct config_llog_data *cld, *cld_prev; + int rand = cfs_rand() & MGC_TIMEOUT_RAND_CENTISEC; + int stopped = !!(rq_state & RQ_STOP); + int to; + + /* Any new or requeued lostlocks will change the state */ + rq_state &= ~(RQ_NOW | RQ_LATER); + spin_unlock(&config_list_lock); + + if (first) { + first = false; + complete(&rq_start); + } + + /* Always wait a few seconds to allow the server who + caused the lock revocation to finish its setup, plus some + random so everyone doesn't try to reconnect at once. */ + to = MGC_TIMEOUT_MIN_SECONDS * HZ; + to += rand * HZ / 100; /* rand is centi-seconds */ + lwi = LWI_TIMEOUT(to, NULL, NULL); + l_wait_event(rq_waitq, rq_state & (RQ_STOP | RQ_PRECLEANUP), + &lwi); + + /* + * iterate & processing through the list. for each cld, process + * its depending sptlrpc cld firstly (if any) and then itself. + * + * it's guaranteed any item in the list must have + * reference > 0; and if cld_lostlock is set, at + * least one reference is taken by the previous enqueue. + */ + cld_prev = NULL; + + spin_lock(&config_list_lock); + rq_state &= ~RQ_PRECLEANUP; + list_for_each_entry(cld, &config_llog_list, + cld_list_chain) { + if (!cld->cld_lostlock) + continue; + + spin_unlock(&config_list_lock); + + LASSERT(atomic_read(&cld->cld_refcount) > 0); + + /* Whether we enqueued again or not in mgc_process_log, + * we're done with the ref from the old enqueue */ + if (cld_prev) + config_log_put(cld_prev); + cld_prev = cld; + + cld->cld_lostlock = 0; + if (likely(!stopped)) + do_requeue(cld); + + spin_lock(&config_list_lock); + } + spin_unlock(&config_list_lock); + if (cld_prev) + config_log_put(cld_prev); + + /* break after scanning the list so that we can drop + * refcount to losing lock clds */ + if (unlikely(stopped)) { + spin_lock(&config_list_lock); + break; + } + + /* Wait a bit to see if anyone else needs a requeue */ + lwi = (struct l_wait_info) { 0 }; + l_wait_event(rq_waitq, rq_state & (RQ_NOW | RQ_STOP), + &lwi); + spin_lock(&config_list_lock); + } + /* spinlock and while guarantee RQ_NOW and RQ_LATER are not set */ + rq_state &= ~RQ_RUNNING; + spin_unlock(&config_list_lock); + + complete(&rq_exit); + + CDEBUG(D_MGC, "Ending requeue thread\n"); + return 0; +} + +/* Add a cld to the list to requeue. Start the requeue thread if needed. + We are responsible for dropping the config log reference from here on out. */ +static void mgc_requeue_add(struct config_llog_data *cld) +{ + CDEBUG(D_INFO, "log %s: requeue (r=%d sp=%d st=%x)\n", + cld->cld_logname, atomic_read(&cld->cld_refcount), + cld->cld_stopping, rq_state); + LASSERT(atomic_read(&cld->cld_refcount) > 0); + + mutex_lock(&cld->cld_lock); + if (cld->cld_stopping || cld->cld_lostlock) { + mutex_unlock(&cld->cld_lock); + return; + } + /* this refcount will be released in mgc_requeue_thread. 
*/ + config_log_get(cld); + cld->cld_lostlock = 1; + mutex_unlock(&cld->cld_lock); + + /* Hold lock for rq_state */ + spin_lock(&config_list_lock); + if (rq_state & RQ_STOP) { + spin_unlock(&config_list_lock); + cld->cld_lostlock = 0; + config_log_put(cld); + } else { + rq_state |= RQ_NOW; + spin_unlock(&config_list_lock); + wake_up(&rq_waitq); + } +} + +static int mgc_llog_init(const struct lu_env *env, struct obd_device *obd) +{ + struct llog_ctxt *ctxt; + int rc; + + /* setup only remote ctxt, the local disk context is switched per each + * filesystem during mgc_fs_setup() */ + rc = llog_setup(env, obd, &obd->obd_olg, LLOG_CONFIG_REPL_CTXT, obd, + &llog_client_ops); + if (rc) + return rc; + + ctxt = llog_get_context(obd, LLOG_CONFIG_REPL_CTXT); + LASSERT(ctxt); + + llog_initiator_connect(ctxt); + llog_ctxt_put(ctxt); + + return 0; +} + +static int mgc_llog_fini(const struct lu_env *env, struct obd_device *obd) +{ + struct llog_ctxt *ctxt; + + ctxt = llog_get_context(obd, LLOG_CONFIG_REPL_CTXT); + if (ctxt) + llog_cleanup(env, ctxt); + + return 0; +} + +static atomic_t mgc_count = ATOMIC_INIT(0); +static int mgc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage) +{ + int rc = 0; + int temp; + + switch (stage) { + case OBD_CLEANUP_EARLY: + break; + case OBD_CLEANUP_EXPORTS: + if (atomic_dec_and_test(&mgc_count)) { + LASSERT(rq_state & RQ_RUNNING); + /* stop requeue thread */ + temp = RQ_STOP; + } else { + /* wakeup requeue thread to clean our cld */ + temp = RQ_NOW | RQ_PRECLEANUP; + } + spin_lock(&config_list_lock); + rq_state |= temp; + spin_unlock(&config_list_lock); + wake_up(&rq_waitq); + if (temp & RQ_STOP) + wait_for_completion(&rq_exit); + obd_cleanup_client_import(obd); + rc = mgc_llog_fini(NULL, obd); + if (rc != 0) + CERROR("failed to cleanup llogging subsystems\n"); + break; + } + return rc; +} + +static int mgc_cleanup(struct obd_device *obd) +{ + /* COMPAT_146 - old config logs may have added profiles we don't + know about */ + if (obd->obd_type->typ_refcnt <= 1) + /* Only for the last mgc */ + class_del_profiles(); + + lprocfs_obd_cleanup(obd); + ptlrpcd_decref(); + + return client_obd_cleanup(obd); +} + +static int mgc_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct lprocfs_static_vars lvars; + int rc; + + ptlrpcd_addref(); + + rc = client_obd_setup(obd, lcfg); + if (rc) + goto err_decref; + + rc = mgc_llog_init(NULL, obd); + if (rc) { + CERROR("failed to setup llogging subsystems\n"); + goto err_cleanup; + } + + lprocfs_mgc_init_vars(&lvars); + lprocfs_obd_setup(obd, lvars.obd_vars); + sptlrpc_lprocfs_cliobd_attach(obd); + + if (atomic_inc_return(&mgc_count) == 1) { + rq_state = 0; + init_waitqueue_head(&rq_waitq); + + /* start requeue thread */ + rc = PTR_ERR(kthread_run(mgc_requeue_thread, NULL, + "ll_cfg_requeue")); + if (IS_ERR_VALUE(rc)) { + CERROR("%s: Cannot start requeue thread (%d),no more log updates!\n", + obd->obd_name, rc); + goto err_cleanup; + } + /* rc is the task_struct pointer of mgc_requeue_thread. */ + rc = 0; + wait_for_completion(&rq_start); + } + + return rc; + +err_cleanup: + client_obd_cleanup(obd); +err_decref: + ptlrpcd_decref(); + return rc; +} + +/* based on ll_mdc_blocking_ast */ +static int mgc_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, + void *data, int flag) +{ + struct lustre_handle lockh; + struct config_llog_data *cld = (struct config_llog_data *)data; + int rc = 0; + + switch (flag) { + case LDLM_CB_BLOCKING: + /* mgs wants the lock, give it up... 
*/ + LDLM_DEBUG(lock, "MGC blocking CB"); + ldlm_lock2handle(lock, &lockh); + rc = ldlm_cli_cancel(&lockh, LCF_ASYNC); + break; + case LDLM_CB_CANCELING: + /* We've given up the lock, prepare ourselves to update. */ + LDLM_DEBUG(lock, "MGC cancel CB"); + + CDEBUG(D_MGC, "Lock res "DLDLMRES" (%.8s)\n", + PLDLMRES(lock->l_resource), + (char *)&lock->l_resource->lr_name.name[0]); + + if (!cld) { + CDEBUG(D_INFO, "missing data, won't requeue\n"); + break; + } + + /* held at mgc_process_log(). */ + LASSERT(atomic_read(&cld->cld_refcount) > 0); + /* Are we done with this log? */ + if (cld->cld_stopping) { + CDEBUG(D_MGC, "log %s: stopping, won't requeue\n", + cld->cld_logname); + config_log_put(cld); + break; + } + /* Make sure not to re-enqueue when the mgc is stopping + (we get called from client_disconnect_export) */ + if (!lock->l_conn_export || + !lock->l_conn_export->exp_obd->u.cli.cl_conn_count) { + CDEBUG(D_MGC, "log %.8s: disconnecting, won't requeue\n", + cld->cld_logname); + config_log_put(cld); + break; + } + + /* Re-enqueue now */ + mgc_requeue_add(cld); + config_log_put(cld); + break; + default: + LBUG(); + } + + return rc; +} + +/* Not sure where this should go... */ +/* This is the timeout value for MGS_CONNECT request plus a ping interval, such + * that we can have a chance to try the secondary MGS if any. */ +#define MGC_ENQUEUE_LIMIT (INITIAL_CONNECT_TIMEOUT + (AT_OFF ? 0 : at_min) \ + + PING_INTERVAL) +#define MGC_TARGET_REG_LIMIT 10 +#define MGC_SEND_PARAM_LIMIT 10 + +/* Send parameter to MGS*/ +static int mgc_set_mgs_param(struct obd_export *exp, + struct mgs_send_param *msp) +{ + struct ptlrpc_request *req; + struct mgs_send_param *req_msp, *rep_msp; + int rc; + + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), + &RQF_MGS_SET_INFO, LUSTRE_MGS_VERSION, + MGS_SET_INFO); + if (!req) + return -ENOMEM; + + req_msp = req_capsule_client_get(&req->rq_pill, &RMF_MGS_SEND_PARAM); + if (!req_msp) { + ptlrpc_req_finished(req); + return -ENOMEM; + } + + memcpy(req_msp, msp, sizeof(*req_msp)); + ptlrpc_request_set_replen(req); + + /* Limit how long we will wait for the enqueue to complete */ + req->rq_delay_limit = MGC_SEND_PARAM_LIMIT; + rc = ptlrpc_queue_wait(req); + if (!rc) { + rep_msp = req_capsule_server_get(&req->rq_pill, &RMF_MGS_SEND_PARAM); + memcpy(msp, rep_msp, sizeof(*rep_msp)); + } + + ptlrpc_req_finished(req); + + return rc; +} + +/* Take a config lock so we can get cancel notifications */ +static int mgc_enqueue(struct obd_export *exp, struct lov_stripe_md *lsm, + __u32 type, ldlm_policy_data_t *policy, __u32 mode, + __u64 *flags, void *bl_cb, void *cp_cb, void *gl_cb, + void *data, __u32 lvb_len, void *lvb_swabber, + struct lustre_handle *lockh) +{ + struct config_llog_data *cld = (struct config_llog_data *)data; + struct ldlm_enqueue_info einfo = { + .ei_type = type, + .ei_mode = mode, + .ei_cb_bl = mgc_blocking_ast, + .ei_cb_cp = ldlm_completion_ast, + }; + struct ptlrpc_request *req; + int short_limit = cld_is_sptlrpc(cld); + int rc; + + CDEBUG(D_MGC, "Enqueue for %s (res %#llx)\n", cld->cld_logname, + cld->cld_resid.name[0]); + + /* We need a callback for every lockholder, so don't try to + ldlm_lock_match (see rev 1.1.2.11.2.47) */ + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), + &RQF_LDLM_ENQUEUE, LUSTRE_DLM_VERSION, + LDLM_ENQUEUE); + if (req == NULL) + return -ENOMEM; + + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, 0); + ptlrpc_request_set_replen(req); + + /* check if this is server or client */ + if (cld->cld_cfg.cfg_sb) { + 
struct lustre_sb_info *lsi = s2lsi(cld->cld_cfg.cfg_sb); + if (lsi && IS_SERVER(lsi)) + short_limit = 1; + } + /* Limit how long we will wait for the enqueue to complete */ + req->rq_delay_limit = short_limit ? 5 : MGC_ENQUEUE_LIMIT; + rc = ldlm_cli_enqueue(exp, &req, &einfo, &cld->cld_resid, NULL, flags, + NULL, 0, LVB_T_NONE, lockh, 0); + /* A failed enqueue should still call the mgc_blocking_ast, + where it will be requeued if needed ("grant failed"). */ + ptlrpc_req_finished(req); + return rc; +} + +static void mgc_notify_active(struct obd_device *unused) +{ + /* wakeup mgc_requeue_thread to requeue mgc lock */ + spin_lock(&config_list_lock); + rq_state |= RQ_NOW; + spin_unlock(&config_list_lock); + wake_up(&rq_waitq); + + /* TODO: Help the MGS rebuild nidtbl. -jay */ +} + +/* Send target_reg message to MGS */ +static int mgc_target_register(struct obd_export *exp, + struct mgs_target_info *mti) +{ + struct ptlrpc_request *req; + struct mgs_target_info *req_mti, *rep_mti; + int rc; + + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), + &RQF_MGS_TARGET_REG, LUSTRE_MGS_VERSION, + MGS_TARGET_REG); + if (req == NULL) + return -ENOMEM; + + req_mti = req_capsule_client_get(&req->rq_pill, &RMF_MGS_TARGET_INFO); + if (!req_mti) { + ptlrpc_req_finished(req); + return -ENOMEM; + } + + memcpy(req_mti, mti, sizeof(*req_mti)); + ptlrpc_request_set_replen(req); + CDEBUG(D_MGC, "register %s\n", mti->mti_svname); + /* Limit how long we will wait for the enqueue to complete */ + req->rq_delay_limit = MGC_TARGET_REG_LIMIT; + + rc = ptlrpc_queue_wait(req); + if (!rc) { + rep_mti = req_capsule_server_get(&req->rq_pill, + &RMF_MGS_TARGET_INFO); + memcpy(mti, rep_mti, sizeof(*rep_mti)); + CDEBUG(D_MGC, "register %s got index = %d\n", + mti->mti_svname, mti->mti_stripe_index); + } + ptlrpc_req_finished(req); + + return rc; +} + +static int mgc_set_info_async(const struct lu_env *env, struct obd_export *exp, + u32 keylen, void *key, u32 vallen, + void *val, struct ptlrpc_request_set *set) +{ + int rc = -EINVAL; + + /* Turn off initial_recov after we try all backup servers once */ + if (KEY_IS(KEY_INIT_RECOV_BACKUP)) { + struct obd_import *imp = class_exp2cliimp(exp); + int value; + if (vallen != sizeof(int)) + return -EINVAL; + value = *(int *)val; + CDEBUG(D_MGC, "InitRecov %s %d/d%d:i%d:r%d:or%d:%s\n", + imp->imp_obd->obd_name, value, + imp->imp_deactive, imp->imp_invalid, + imp->imp_replayable, imp->imp_obd->obd_replayable, + ptlrpc_import_state_name(imp->imp_state)); + /* Resurrect if we previously died */ + if ((imp->imp_state != LUSTRE_IMP_FULL && + imp->imp_state != LUSTRE_IMP_NEW) || value > 1) + ptlrpc_reconnect_import(imp); + return 0; + } + if (KEY_IS(KEY_SET_INFO)) { + struct mgs_send_param *msp; + + msp = (struct mgs_send_param *)val; + rc = mgc_set_mgs_param(exp, msp); + return rc; + } + if (KEY_IS(KEY_MGSSEC)) { + struct client_obd *cli = &exp->exp_obd->u.cli; + struct sptlrpc_flavor flvr; + + /* + * empty string means using current flavor, if which haven't + * been set yet, set it as null. + * + * if flavor has been set previously, check the asking flavor + * must match the existing one. 
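/*
 * Editor's note: illustrative sketch, not part of the patch. It captures the
 * KEY_MGSSEC rule spelled out in the comment above: an empty request means
 * "keep the current flavor, or 'null' if none was ever set", the first
 * explicit request is remembered, and any later request must match it or
 * fail with -EPERM. Names and the string-based flavor store are invented
 * stand-ins for the sptlrpc flavor structures.
 */
#include <string.h>
#include <errno.h>

static char demo_mgc_flavor[20];	/* empty string == not set yet */

static int demo_set_mgs_flavor(const char *requested)
{
	if (requested == NULL || requested[0] == '\0')
		requested = demo_mgc_flavor[0] ? demo_mgc_flavor : "null";

	if (demo_mgc_flavor[0] == '\0') {
		/* first caller wins and records the flavor */
		strncpy(demo_mgc_flavor, requested, sizeof(demo_mgc_flavor) - 1);
		return 0;
	}

	/* later callers must ask for the same flavor */
	return strcmp(demo_mgc_flavor, requested) == 0 ? 0 : -EPERM;
}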
+ */ + if (vallen == 0) { + if (cli->cl_flvr_mgc.sf_rpc != SPTLRPC_FLVR_INVALID) + return 0; + val = "null"; + vallen = 4; + } + + rc = sptlrpc_parse_flavor(val, &flvr); + if (rc) { + CERROR("invalid sptlrpc flavor %s to MGS\n", + (char *) val); + return rc; + } + + /* + * caller already hold a mutex + */ + if (cli->cl_flvr_mgc.sf_rpc == SPTLRPC_FLVR_INVALID) { + cli->cl_flvr_mgc = flvr; + } else if (memcmp(&cli->cl_flvr_mgc, &flvr, + sizeof(flvr)) != 0) { + char str[20]; + + sptlrpc_flavor2name(&cli->cl_flvr_mgc, + str, sizeof(str)); + LCONSOLE_ERROR("asking sptlrpc flavor %s to MGS but currently %s is in use\n", + (char *) val, str); + rc = -EPERM; + } + return rc; + } + + return rc; +} + +static int mgc_get_info(const struct lu_env *env, struct obd_export *exp, + __u32 keylen, void *key, __u32 *vallen, void *val, + struct lov_stripe_md *unused) +{ + int rc = -EINVAL; + + if (KEY_IS(KEY_CONN_DATA)) { + struct obd_import *imp = class_exp2cliimp(exp); + struct obd_connect_data *data = val; + + if (*vallen == sizeof(*data)) { + *data = imp->imp_connect_data; + rc = 0; + } + } + + return rc; +} + +static int mgc_import_event(struct obd_device *obd, + struct obd_import *imp, + enum obd_import_event event) +{ + LASSERT(imp->imp_obd == obd); + CDEBUG(D_MGC, "import event %#x\n", event); + + switch (event) { + case IMP_EVENT_DISCON: + /* MGC imports should not wait for recovery */ + if (OCD_HAS_FLAG(&imp->imp_connect_data, IMP_RECOV)) + ptlrpc_pinger_ir_down(); + break; + case IMP_EVENT_INACTIVE: + break; + case IMP_EVENT_INVALIDATE: { + struct ldlm_namespace *ns = obd->obd_namespace; + ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY); + break; + } + case IMP_EVENT_ACTIVE: + CDEBUG(D_INFO, "%s: Reactivating import\n", obd->obd_name); + /* Clearing obd_no_recov allows us to continue pinging */ + obd->obd_no_recov = 0; + mgc_notify_active(obd); + if (OCD_HAS_FLAG(&imp->imp_connect_data, IMP_RECOV)) + ptlrpc_pinger_ir_up(); + break; + case IMP_EVENT_OCD: + break; + case IMP_EVENT_DEACTIVATE: + case IMP_EVENT_ACTIVATE: + break; + default: + CERROR("Unknown import event %#x\n", event); + LBUG(); + } + return 0; +} + +enum { + CONFIG_READ_NRPAGES_INIT = 1 << (20 - PAGE_CACHE_SHIFT), + CONFIG_READ_NRPAGES = 4 +}; + +static int mgc_apply_recover_logs(struct obd_device *mgc, + struct config_llog_data *cld, + __u64 max_version, + void *data, int datalen, bool mne_swab) +{ + struct config_llog_instance *cfg = &cld->cld_cfg; + struct lustre_sb_info *lsi = s2lsi(cfg->cfg_sb); + struct mgs_nidtbl_entry *entry; + struct lustre_cfg *lcfg; + struct lustre_cfg_bufs bufs; + u64 prev_version = 0; + char *inst; + char *buf; + int bufsz; + int pos; + int rc = 0; + int off = 0; + + LASSERT(cfg->cfg_instance != NULL); + LASSERT(cfg->cfg_sb == cfg->cfg_instance); + + OBD_ALLOC(inst, PAGE_CACHE_SIZE); + if (inst == NULL) + return -ENOMEM; + + if (!IS_SERVER(lsi)) { + pos = snprintf(inst, PAGE_CACHE_SIZE, "%p", cfg->cfg_instance); + if (pos >= PAGE_CACHE_SIZE) { + OBD_FREE(inst, PAGE_CACHE_SIZE); + return -E2BIG; + } + } else { + LASSERT(IS_MDT(lsi)); + rc = server_name2svname(lsi->lsi_svname, inst, NULL, + PAGE_CACHE_SIZE); + if (rc) { + OBD_FREE(inst, PAGE_CACHE_SIZE); + return -EINVAL; + } + pos = strlen(inst); + } + + ++pos; + buf = inst + pos; + bufsz = PAGE_CACHE_SIZE - pos; + + while (datalen > 0) { + int entry_len = sizeof(*entry); + int is_ost; + struct obd_device *obd; + char *obdname; + char *cname; + char *params; + char *uuid; + + rc = -EINVAL; + if (datalen < sizeof(*entry)) + break; + + entry = 
(typeof(entry))(data + off); + + /* sanity check */ + if (entry->mne_nid_type != 0) /* only support type 0 for ipv4 */ + break; + if (entry->mne_nid_count == 0) /* at least one nid entry */ + break; + if (entry->mne_nid_size != sizeof(lnet_nid_t)) + break; + + entry_len += entry->mne_nid_count * entry->mne_nid_size; + if (datalen < entry_len) /* must have entry_len at least */ + break; + + /* Keep this swab for normal mixed endian handling. LU-1644 */ + if (mne_swab) + lustre_swab_mgs_nidtbl_entry(entry); + if (entry->mne_length > PAGE_CACHE_SIZE) { + CERROR("MNE too large (%u)\n", entry->mne_length); + break; + } + + if (entry->mne_length < entry_len) + break; + + off += entry->mne_length; + datalen -= entry->mne_length; + if (datalen < 0) + break; + + if (entry->mne_version > max_version) { + CERROR("entry index(%lld) is over max_index(%lld)\n", + entry->mne_version, max_version); + break; + } + + if (prev_version >= entry->mne_version) { + CERROR("index unsorted, prev %lld, now %lld\n", + prev_version, entry->mne_version); + break; + } + prev_version = entry->mne_version; + + /* + * Write a string with format "nid::instance" to + * lustre//--/import. + */ + + is_ost = entry->mne_type == LDD_F_SV_TYPE_OST; + memset(buf, 0, bufsz); + obdname = buf; + pos = 0; + + /* lustre-OST0001-osc- */ + strcpy(obdname, cld->cld_logname); + cname = strrchr(obdname, '-'); + if (cname == NULL) { + CERROR("mgc %s: invalid logname %s\n", + mgc->obd_name, obdname); + break; + } + + pos = cname - obdname; + obdname[pos] = 0; + pos += sprintf(obdname + pos, "-%s%04x", + is_ost ? "OST" : "MDT", entry->mne_index); + + cname = is_ost ? "osc" : "mdc", + pos += sprintf(obdname + pos, "-%s-%s", cname, inst); + lustre_cfg_bufs_reset(&bufs, obdname); + + /* find the obd by obdname */ + obd = class_name2obd(obdname); + if (obd == NULL) { + CDEBUG(D_INFO, "mgc %s: cannot find obdname %s\n", + mgc->obd_name, obdname); + rc = 0; + /* this is a safe race, when the ost is starting up...*/ + continue; + } + + /* osc.import = "connection=::" */ + ++pos; + params = buf + pos; + pos += sprintf(params, "%s.import=%s", cname, "connection="); + uuid = buf + pos; + + down_read(&obd->u.cli.cl_sem); + if (obd->u.cli.cl_import == NULL) { + /* client does not connect to the OST yet */ + up_read(&obd->u.cli.cl_sem); + rc = 0; + continue; + } + + /* TODO: iterate all nids to find one */ + /* find uuid by nid */ + rc = client_import_find_conn(obd->u.cli.cl_import, + entry->u.nids[0], + (struct obd_uuid *)uuid); + up_read(&obd->u.cli.cl_sem); + if (rc < 0) { + CERROR("mgc: cannot find uuid by nid %s\n", + libcfs_nid2str(entry->u.nids[0])); + break; + } + + CDEBUG(D_INFO, "Find uuid %s by nid %s\n", + uuid, libcfs_nid2str(entry->u.nids[0])); + + pos += strlen(uuid); + pos += sprintf(buf + pos, "::%u", entry->mne_instance); + LASSERT(pos < bufsz); + + lustre_cfg_bufs_set_string(&bufs, 1, params); + + rc = -ENOMEM; + lcfg = lustre_cfg_new(LCFG_PARAM, &bufs); + if (lcfg == NULL) { + CERROR("mgc: cannot allocate memory\n"); + break; + } + + CDEBUG(D_INFO, "ir apply logs %lld/%lld for %s -> %s\n", + prev_version, max_version, obdname, params); + + rc = class_process_config(lcfg); + lustre_cfg_free(lcfg); + if (rc) + CDEBUG(D_INFO, "process config for %s error %d\n", + obdname, rc); + + /* continue, even one with error */ + } + + OBD_FREE(inst, PAGE_CACHE_SIZE); + return rc; +} + +/** + * This function is called if this client was notified for target restarting + * by the MGS. A CONFIG_READ RPC is going to send to fetch recovery logs. 
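/*
 * Editor's note: simplified sketch, not part of the patch. It shows the
 * record-walking pattern mgc_apply_recover_logs() above relies on: each
 * nidtbl record carries its own length and a strictly increasing version,
 * and parsing stops at the first malformed or out-of-order record. The
 * struct layout and field names here are invented stand-ins for
 * struct mgs_nidtbl_entry.
 */
#include <stdint.h>
#include <stddef.h>

struct demo_nidtbl_entry {
	uint64_t version;	/* must be > previous and <= max_version */
	uint32_t length;	/* total bytes of this record, incl. nids */
	uint32_t nid_count;	/* at least one nid must follow */
	/* nid_count * sizeof(uint64_t) bytes of nid data follow */
};

static int demo_apply_records(const void *data, size_t datalen,
			      uint64_t max_version)
{
	uint64_t prev_version = 0;
	size_t off = 0;

	while (datalen - off >= sizeof(struct demo_nidtbl_entry)) {
		const struct demo_nidtbl_entry *e =
			(const void *)((const char *)data + off);
		size_t room = datalen - off;
		size_t need;

		/* sanity checks mirror the kernel loop: counts, sizes, ordering */
		if (e->nid_count == 0 ||
		    e->nid_count > (room - sizeof(*e)) / sizeof(uint64_t))
			return -1;
		need = sizeof(*e) + (size_t)e->nid_count * sizeof(uint64_t);
		if (e->length < need || e->length > room)
			return -1;
		if (e->version > max_version || e->version <= prev_version)
			return -1;
		prev_version = e->version;

		/* ... here the real code builds an "<obd>.import=connection=<nid>"
		 * parameter and hands it to the config machinery ... */

		off += e->length;
	}
	return 0;
}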
+ */ +static int mgc_process_recover_log(struct obd_device *obd, + struct config_llog_data *cld) +{ + struct ptlrpc_request *req = NULL; + struct config_llog_instance *cfg = &cld->cld_cfg; + struct mgs_config_body *body; + struct mgs_config_res *res; + struct ptlrpc_bulk_desc *desc; + struct page **pages; + int nrpages; + bool eof = true; + bool mne_swab = false; + int i; + int ealen; + int rc; + + /* allocate buffer for bulk transfer. + * if this is the first time for this mgs to read logs, + * CONFIG_READ_NRPAGES_INIT will be used since it will read all logs + * once; otherwise, it only reads increment of logs, this should be + * small and CONFIG_READ_NRPAGES will be used. + */ + nrpages = CONFIG_READ_NRPAGES; + if (cfg->cfg_last_idx == 0) /* the first time */ + nrpages = CONFIG_READ_NRPAGES_INIT; + + OBD_ALLOC(pages, sizeof(*pages) * nrpages); + if (pages == NULL) { + rc = -ENOMEM; + goto out; + } + + for (i = 0; i < nrpages; i++) { + pages[i] = alloc_page(GFP_IOFS); + if (pages[i] == NULL) { + rc = -ENOMEM; + goto out; + } + } + +again: + LASSERT(cld_is_recover(cld)); + LASSERT(mutex_is_locked(&cld->cld_lock)); + req = ptlrpc_request_alloc(class_exp2cliimp(cld->cld_mgcexp), + &RQF_MGS_CONFIG_READ); + if (req == NULL) { + rc = -ENOMEM; + goto out; + } + + rc = ptlrpc_request_pack(req, LUSTRE_MGS_VERSION, MGS_CONFIG_READ); + if (rc) + goto out; + + /* pack request */ + body = req_capsule_client_get(&req->rq_pill, &RMF_MGS_CONFIG_BODY); + LASSERT(body != NULL); + LASSERT(sizeof(body->mcb_name) > strlen(cld->cld_logname)); + if (strlcpy(body->mcb_name, cld->cld_logname, sizeof(body->mcb_name)) + >= sizeof(body->mcb_name)) { + rc = -E2BIG; + goto out; + } + body->mcb_offset = cfg->cfg_last_idx + 1; + body->mcb_type = cld->cld_type; + body->mcb_bits = PAGE_CACHE_SHIFT; + body->mcb_units = nrpages; + + /* allocate bulk transfer descriptor */ + desc = ptlrpc_prep_bulk_imp(req, nrpages, 1, BULK_PUT_SINK, + MGS_BULK_PORTAL); + if (desc == NULL) { + rc = -ENOMEM; + goto out; + } + + for (i = 0; i < nrpages; i++) + ptlrpc_prep_bulk_page_pin(desc, pages[i], 0, PAGE_CACHE_SIZE); + + ptlrpc_request_set_replen(req); + rc = ptlrpc_queue_wait(req); + if (rc) + goto out; + + res = req_capsule_server_get(&req->rq_pill, &RMF_MGS_CONFIG_RES); + if (res->mcr_size < res->mcr_offset) { + rc = -EINVAL; + goto out; + } + + /* always update the index even though it might have errors with + * handling the recover logs */ + cfg->cfg_last_idx = res->mcr_offset; + eof = res->mcr_offset == res->mcr_size; + + CDEBUG(D_INFO, "Latest version %lld, more %d.\n", + res->mcr_offset, eof == false); + + ealen = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, 0); + if (ealen < 0) { + rc = ealen; + goto out; + } + + if (ealen > nrpages << PAGE_CACHE_SHIFT) { + rc = -EINVAL; + goto out; + } + + if (ealen == 0) { /* no logs transferred */ + if (!eof) + rc = -EINVAL; + goto out; + } + + mne_swab = !!ptlrpc_rep_need_swab(req); +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 2, 50, 0) + /* This import flag means the server did an extra swab of IR MNE + * records (fixed in LU-1252), reverse it here if needed. 
LU-1644 */ + if (unlikely(req->rq_import->imp_need_mne_swab)) + mne_swab = !mne_swab; +#else +#warning "LU-1644: Remove old OBD_CONNECT_MNE_SWAB fixup and imp_need_mne_swab" +#endif + + for (i = 0; i < nrpages && ealen > 0; i++) { + int rc2; + void *ptr; + + ptr = kmap(pages[i]); + rc2 = mgc_apply_recover_logs(obd, cld, res->mcr_offset, ptr, + min_t(int, ealen, PAGE_CACHE_SIZE), + mne_swab); + kunmap(pages[i]); + if (rc2 < 0) { + CWARN("Process recover log %s error %d\n", + cld->cld_logname, rc2); + break; + } + + ealen -= PAGE_CACHE_SIZE; + } + +out: + if (req) + ptlrpc_req_finished(req); + + if (rc == 0 && !eof) + goto again; + + if (pages) { + for (i = 0; i < nrpages; i++) { + if (pages[i] == NULL) + break; + __free_page(pages[i]); + } + OBD_FREE(pages, sizeof(*pages) * nrpages); + } + return rc; +} + +/* local_only means it cannot get remote llogs */ +static int mgc_process_cfg_log(struct obd_device *mgc, + struct config_llog_data *cld, int local_only) +{ + struct llog_ctxt *ctxt; + struct lustre_sb_info *lsi = NULL; + int rc = 0; + bool sptlrpc_started = false; + struct lu_env *env; + + LASSERT(cld); + LASSERT(mutex_is_locked(&cld->cld_lock)); + + /* + * local copy of sptlrpc log is controlled elsewhere, don't try to + * read it up here. + */ + if (cld_is_sptlrpc(cld) && local_only) + return 0; + + if (cld->cld_cfg.cfg_sb) + lsi = s2lsi(cld->cld_cfg.cfg_sb); + + OBD_ALLOC_PTR(env); + if (env == NULL) + return -ENOMEM; + + rc = lu_env_init(env, LCT_MG_THREAD); + if (rc) + goto out_free; + + ctxt = llog_get_context(mgc, LLOG_CONFIG_REPL_CTXT); + LASSERT(ctxt); + + if (local_only) /* no local log at client side */ { + rc = -EIO; + goto out_pop; + } + + if (cld_is_sptlrpc(cld)) { + sptlrpc_conf_log_update_begin(cld->cld_logname); + sptlrpc_started = true; + } + + /* logname and instance info should be the same, so use our + * copy of the instance for the update. The cfg_last_idx will + * be updated here. */ + rc = class_config_parse_llog(env, ctxt, cld->cld_logname, + &cld->cld_cfg); + +out_pop: + __llog_ctxt_put(env, ctxt); + + /* + * update settings on existing OBDs. doing it inside + * of llog_process_lock so no device is attaching/detaching + * in parallel. + * the logname must be -sptlrpc + */ + if (sptlrpc_started) { + LASSERT(cld_is_sptlrpc(cld)); + sptlrpc_conf_log_update_end(cld->cld_logname); + class_notify_sptlrpc_conf(cld->cld_logname, + strlen(cld->cld_logname) - + strlen("-sptlrpc")); + } + + lu_env_fini(env); +out_free: + OBD_FREE_PTR(env); + return rc; +} + +/** Get a config log from the MGS and process it. + * This func is called for both clients and servers. + * Copy the log locally before parsing it if appropriate (non-MGS server) + */ +int mgc_process_log(struct obd_device *mgc, struct config_llog_data *cld) +{ + struct lustre_handle lockh = { 0 }; + __u64 flags = LDLM_FL_NO_LRU; + int rc = 0, rcl; + + LASSERT(cld); + + /* I don't want multiple processes running process_log at once -- + sounds like badness. It actually might be fine, as long as + we're not trying to update from the same log + simultaneously (in which case we should use a per-log sem.) 
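/*
 * Editor's note: illustrative sketch, not part of the patch. It mirrors the
 * control flow of mgc_process_recover_log() above: ask the server for
 * records starting at last_idx + 1, always remember the offset it returns,
 * and repeat until the returned offset catches up with the log size (eof).
 * demo_fetch_t is an invented stand-in for the MGS_CONFIG_READ RPC.
 */
#include <stdint.h>

struct demo_read_res {
	uint64_t offset;	/* last record offset the server returned */
	uint64_t size;		/* current end of the recovery log */
};

typedef int (*demo_fetch_t)(uint64_t start, struct demo_read_res *res);

static int demo_pull_recover_log(demo_fetch_t fetch, uint64_t *last_idx)
{
	struct demo_read_res res;
	int rc;

	do {
		rc = fetch(*last_idx + 1, &res);
		if (rc)
			return rc;
		if (res.offset > res.size)	/* malformed reply */
			return -1;
		/* always advance the index, even if applying records failed */
		*last_idx = res.offset;
	} while (res.offset != res.size);	/* not eof yet: ask again */

	return 0;
}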
*/ + mutex_lock(&cld->cld_lock); + if (cld->cld_stopping) { + mutex_unlock(&cld->cld_lock); + return 0; + } + + OBD_FAIL_TIMEOUT(OBD_FAIL_MGC_PAUSE_PROCESS_LOG, 20); + + CDEBUG(D_MGC, "Process log %s:%p from %d\n", cld->cld_logname, + cld->cld_cfg.cfg_instance, cld->cld_cfg.cfg_last_idx + 1); + + /* Get the cfg lock on the llog */ + rcl = mgc_enqueue(mgc->u.cli.cl_mgc_mgsexp, NULL, LDLM_PLAIN, NULL, + LCK_CR, &flags, NULL, NULL, NULL, + cld, 0, NULL, &lockh); + if (rcl == 0) { + /* Get the cld, it will be released in mgc_blocking_ast. */ + config_log_get(cld); + rc = ldlm_lock_set_data(&lockh, (void *)cld); + LASSERT(rc == 0); + } else { + CDEBUG(D_MGC, "Can't get cfg lock: %d\n", rcl); + + /* mark cld_lostlock so that it will requeue + * after MGC becomes available. */ + cld->cld_lostlock = 1; + /* Get extra reference, it will be put in requeue thread */ + config_log_get(cld); + } + + + if (cld_is_recover(cld)) { + rc = 0; /* this is not a fatal error for recover log */ + if (rcl == 0) + rc = mgc_process_recover_log(mgc, cld); + } else { + rc = mgc_process_cfg_log(mgc, cld, rcl != 0); + } + + CDEBUG(D_MGC, "%s: configuration from log '%s' %sed (%d).\n", + mgc->obd_name, cld->cld_logname, rc ? "fail" : "succeed", rc); + + mutex_unlock(&cld->cld_lock); + + /* Now drop the lock so MGS can revoke it */ + if (!rcl) + ldlm_lock_decref(&lockh, LCK_CR); + + return rc; +} + + +/** Called from lustre_process_log. + * LCFG_LOG_START gets the config log from the MGS, processes it to start + * any services, and adds it to the list logs to watch (follow). + */ +static int mgc_process_config(struct obd_device *obd, u32 len, void *buf) +{ + struct lustre_cfg *lcfg = buf; + struct config_llog_instance *cfg = NULL; + char *logname; + int rc = 0; + + switch (lcfg->lcfg_command) { + case LCFG_LOV_ADD_OBD: { + /* Overloading this cfg command: register a new target */ + struct mgs_target_info *mti; + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) != + sizeof(struct mgs_target_info)) { + rc = -EINVAL; + goto out; + } + + mti = (struct mgs_target_info *)lustre_cfg_buf(lcfg, 1); + CDEBUG(D_MGC, "add_target %s %#x\n", + mti->mti_svname, mti->mti_flags); + rc = mgc_target_register(obd->u.cli.cl_mgc_mgsexp, mti); + break; + } + case LCFG_LOV_DEL_OBD: + /* Unregister has no meaning at the moment. */ + CERROR("lov_del_obd unimplemented\n"); + rc = -ENOSYS; + break; + case LCFG_SPTLRPC_CONF: { + rc = sptlrpc_process_config(lcfg); + break; + } + case LCFG_LOG_START: { + struct config_llog_data *cld; + struct super_block *sb; + + logname = lustre_cfg_string(lcfg, 1); + cfg = (struct config_llog_instance *)lustre_cfg_buf(lcfg, 2); + sb = *(struct super_block **)lustre_cfg_buf(lcfg, 3); + + CDEBUG(D_MGC, "parse_log %s from %d\n", logname, + cfg->cfg_last_idx); + + /* We're only called through here on the initial mount */ + rc = config_log_add(obd, logname, cfg, sb); + if (rc) + break; + cld = config_log_find(logname, cfg); + if (cld == NULL) { + rc = -ENOENT; + break; + } + + /* COMPAT_146 */ + /* FIXME only set this for old logs! 
Right now this forces + us to always skip the "inside markers" check */ + cld->cld_cfg.cfg_flags |= CFG_F_COMPAT146; + + rc = mgc_process_log(obd, cld); + if (rc == 0 && cld->cld_recover != NULL) { + if (OCD_HAS_FLAG(&obd->u.cli.cl_import-> + imp_connect_data, IMP_RECOV)) { + rc = mgc_process_log(obd, cld->cld_recover); + } else { + struct config_llog_data *cir = cld->cld_recover; + cld->cld_recover = NULL; + config_log_put(cir); + } + if (rc) + CERROR("Cannot process recover llog %d\n", rc); + } + + if (rc == 0 && cld->cld_params != NULL) { + rc = mgc_process_log(obd, cld->cld_params); + if (rc == -ENOENT) { + CDEBUG(D_MGC, + "There is no params config file yet\n"); + rc = 0; + } + /* params log is optional */ + if (rc) + CERROR( + "%s: can't process params llog: rc = %d\n", + obd->obd_name, rc); + } + config_log_put(cld); + + break; + } + case LCFG_LOG_END: { + logname = lustre_cfg_string(lcfg, 1); + + if (lcfg->lcfg_bufcount >= 2) + cfg = (struct config_llog_instance *)lustre_cfg_buf( + lcfg, 2); + rc = config_log_end(logname, cfg); + break; + } + default: { + CERROR("Unknown command: %d\n", lcfg->lcfg_command); + rc = -EINVAL; + goto out; + + } + } +out: + return rc; +} + +struct obd_ops mgc_obd_ops = { + .o_owner = THIS_MODULE, + .o_setup = mgc_setup, + .o_precleanup = mgc_precleanup, + .o_cleanup = mgc_cleanup, + .o_add_conn = client_import_add_conn, + .o_del_conn = client_import_del_conn, + .o_connect = client_connect_import, + .o_disconnect = client_disconnect_export, + /* .o_enqueue = mgc_enqueue, */ + /* .o_iocontrol = mgc_iocontrol, */ + .o_set_info_async = mgc_set_info_async, + .o_get_info = mgc_get_info, + .o_import_event = mgc_import_event, + .o_process_config = mgc_process_config, +}; + +static int __init mgc_init(void) +{ + return class_register_type(&mgc_obd_ops, NULL, NULL, + LUSTRE_MGC_NAME, NULL); +} + +static void /*__exit*/ mgc_exit(void) +{ + class_unregister_type(LUSTRE_MGC_NAME); +} + +MODULE_AUTHOR("Sun Microsystems, Inc. "); +MODULE_DESCRIPTION("Lustre Management Client"); +MODULE_LICENSE("GPL"); + +module_init(mgc_init); +module_exit(mgc_exit); diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/Makefile b/kernel/drivers/staging/lustre/lustre/obdclass/Makefile new file mode 100644 index 000000000..e89468179 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/Makefile @@ -0,0 +1,11 @@ +obj-$(CONFIG_LUSTRE_FS) += obdclass.o + +obdclass-y := linux/linux-module.o linux/linux-obdo.o linux/linux-sysctl.o \ + llog.o llog_cat.o llog_obd.o llog_swab.o class_obd.o debug.o \ + genops.o uuid.o lprocfs_status.o \ + lustre_handles.o lustre_peer.o \ + statfs_pack.o obdo.o obd_config.o obd_mount.o \ + lu_object.o dt_object.o capa.o cl_object.o \ + cl_page.o cl_lock.o cl_io.o lu_ref.o acl.o + +obdclass-$(CONFIG_PROC_FS) += lprocfs_counters.o diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/acl.c b/kernel/drivers/staging/lustre/lustre/obdclass/acl.c new file mode 100644 index 000000000..9a69f6b35 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/acl.c @@ -0,0 +1,548 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/acl.c + * + * Lustre Access Control List. + * + * Author: Fan Yong + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include "../include/lu_object.h" +#include "../include/lustre_acl.h" +#include "../include/lustre_eacl.h" +#include "../include/obd_support.h" + +#ifdef CONFIG_FS_POSIX_ACL + +#define CFS_ACL_XATTR_VERSION POSIX_ACL_XATTR_VERSION + +enum { + ES_UNK = 0, /* unknown stat */ + ES_UNC = 1, /* ACL entry is not changed */ + ES_MOD = 2, /* ACL entry is modified */ + ES_ADD = 3, /* ACL entry is added */ + ES_DEL = 4 /* ACL entry is deleted */ +}; + +static inline void lustre_ext_acl_le_to_cpu(ext_acl_xattr_entry *d, + ext_acl_xattr_entry *s) +{ + d->e_tag = le16_to_cpu(s->e_tag); + d->e_perm = le16_to_cpu(s->e_perm); + d->e_id = le32_to_cpu(s->e_id); + d->e_stat = le32_to_cpu(s->e_stat); +} + +static inline void lustre_ext_acl_cpu_to_le(ext_acl_xattr_entry *d, + ext_acl_xattr_entry *s) +{ + d->e_tag = cpu_to_le16(s->e_tag); + d->e_perm = cpu_to_le16(s->e_perm); + d->e_id = cpu_to_le32(s->e_id); + d->e_stat = cpu_to_le32(s->e_stat); +} + +static inline void lustre_posix_acl_le_to_cpu(posix_acl_xattr_entry *d, + posix_acl_xattr_entry *s) +{ + d->e_tag = le16_to_cpu(s->e_tag); + d->e_perm = le16_to_cpu(s->e_perm); + d->e_id = le32_to_cpu(s->e_id); +} + +static inline void lustre_posix_acl_cpu_to_le(posix_acl_xattr_entry *d, + posix_acl_xattr_entry *s) +{ + d->e_tag = cpu_to_le16(s->e_tag); + d->e_perm = cpu_to_le16(s->e_perm); + d->e_id = cpu_to_le32(s->e_id); +} + + +/* if "new_count == 0", then "new = {a_version, NULL}", NOT NULL. */ +static int lustre_posix_acl_xattr_reduce_space(posix_acl_xattr_header **header, + int old_count, int new_count) +{ + int old_size = CFS_ACL_XATTR_SIZE(old_count, posix_acl_xattr); + int new_size = CFS_ACL_XATTR_SIZE(new_count, posix_acl_xattr); + posix_acl_xattr_header *new; + + if (unlikely(old_count <= new_count)) + return old_size; + + OBD_ALLOC(new, new_size); + if (unlikely(new == NULL)) + return -ENOMEM; + + memcpy(new, *header, new_size); + OBD_FREE(*header, old_size); + *header = new; + return new_size; +} + +/* if "new_count == 0", then "new = {0, NULL}", NOT NULL. 
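/*
 * Editor's note: illustrative sketch, not part of the patch. The
 * CFS_ACL_XATTR_SIZE/COUNT macros used throughout this file presumably
 * follow the usual posix_acl_xattr layout: a fixed header followed by an
 * array of fixed-size entries. The helpers below spell out that size
 * arithmetic with invented struct names; the real header and entry layouts
 * differ.
 */
#include <stddef.h>
#include <stdint.h>

struct demo_acl_header { uint32_t version; };
struct demo_acl_entry  { uint16_t tag; uint16_t perm; uint32_t id; };

/* bytes needed to hold "count" entries plus the header */
static size_t demo_acl_size(size_t count)
{
	return sizeof(struct demo_acl_header) +
	       count * sizeof(struct demo_acl_entry);
}

/* number of entries that fit in a buffer of "size" bytes */
static size_t demo_acl_count(size_t size)
{
	return (size - sizeof(struct demo_acl_header)) /
	       sizeof(struct demo_acl_entry);
}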
*/ +static int lustre_ext_acl_xattr_reduce_space(ext_acl_xattr_header **header, + int old_count) +{ + int ext_count = le32_to_cpu((*header)->a_count); + int ext_size = CFS_ACL_XATTR_SIZE(ext_count, ext_acl_xattr); + int old_size = CFS_ACL_XATTR_SIZE(old_count, ext_acl_xattr); + ext_acl_xattr_header *new; + + if (unlikely(old_count <= ext_count)) + return 0; + + OBD_ALLOC(new, ext_size); + if (unlikely(new == NULL)) + return -ENOMEM; + + memcpy(new, *header, ext_size); + OBD_FREE(*header, old_size); + *header = new; + return 0; +} + +/* + * Generate new extended ACL based on the posix ACL. + */ +ext_acl_xattr_header * +lustre_posix_acl_xattr_2ext(posix_acl_xattr_header *header, int size) +{ + int count, i, esize; + ext_acl_xattr_header *new; + + if (unlikely(size < 0)) + return ERR_PTR(-EINVAL); + else if (!size) + count = 0; + else + count = CFS_ACL_XATTR_COUNT(size, posix_acl_xattr); + esize = CFS_ACL_XATTR_SIZE(count, ext_acl_xattr); + OBD_ALLOC(new, esize); + if (unlikely(new == NULL)) + return ERR_PTR(-ENOMEM); + + new->a_count = cpu_to_le32(count); + for (i = 0; i < count; i++) { + new->a_entries[i].e_tag = header->a_entries[i].e_tag; + new->a_entries[i].e_perm = header->a_entries[i].e_perm; + new->a_entries[i].e_id = header->a_entries[i].e_id; + new->a_entries[i].e_stat = cpu_to_le32(ES_UNK); + } + + return new; +} +EXPORT_SYMBOL(lustre_posix_acl_xattr_2ext); + +/* + * Filter out the "nobody" entries in the posix ACL. + */ +int lustre_posix_acl_xattr_filter(posix_acl_xattr_header *header, size_t size, + posix_acl_xattr_header **out) +{ + int count, i, j, rc = 0; + __u32 id; + posix_acl_xattr_header *new; + + if (!size) + return 0; + if (size < sizeof(*new)) + return -EINVAL; + + OBD_ALLOC(new, size); + if (unlikely(new == NULL)) + return -ENOMEM; + + new->a_version = cpu_to_le32(CFS_ACL_XATTR_VERSION); + count = CFS_ACL_XATTR_COUNT(size, posix_acl_xattr); + for (i = 0, j = 0; i < count; i++) { + id = le32_to_cpu(header->a_entries[i].e_id); + switch (le16_to_cpu(header->a_entries[i].e_tag)) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + if (id != ACL_UNDEFINED_ID) { + rc = -EIO; + goto _out; + } + + memcpy(&new->a_entries[j++], &header->a_entries[i], + sizeof(posix_acl_xattr_entry)); + break; + case ACL_USER: + if (id != NOBODY_UID) + memcpy(&new->a_entries[j++], + &header->a_entries[i], + sizeof(posix_acl_xattr_entry)); + break; + case ACL_GROUP: + if (id != NOBODY_GID) + memcpy(&new->a_entries[j++], + &header->a_entries[i], + sizeof(posix_acl_xattr_entry)); + break; + default: + rc = -EIO; + goto _out; + } + } + + /* free unused space. */ + rc = lustre_posix_acl_xattr_reduce_space(&new, count, j); + if (rc >= 0) { + size = rc; + *out = new; + rc = 0; + } + +_out: + if (rc) { + OBD_FREE(new, size); + size = rc; + } + return size; +} +EXPORT_SYMBOL(lustre_posix_acl_xattr_filter); + +/* + * Release the posix ACL space. + */ +void lustre_posix_acl_xattr_free(posix_acl_xattr_header *header, int size) +{ + OBD_FREE(header, size); +} +EXPORT_SYMBOL(lustre_posix_acl_xattr_free); + +/* + * Release the extended ACL space. 
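/*
 * Editor's note: simplified sketch, not part of the patch. It restates the
 * filtering rule implemented by lustre_posix_acl_xattr_filter() above: base
 * entries (USER_OBJ, GROUP_OBJ, MASK, OTHER) are kept and must carry the
 * undefined id, while ACL_USER/ACL_GROUP entries naming the "nobody"
 * uid/gid are dropped. The tag values, nobody ids, and types below are
 * invented stand-ins.
 */
#include <stdint.h>

#define DEMO_ACL_USER_OBJ	0x01
#define DEMO_ACL_USER		0x02
#define DEMO_ACL_GROUP_OBJ	0x04
#define DEMO_ACL_GROUP		0x08
#define DEMO_ACL_MASK		0x10
#define DEMO_ACL_OTHER		0x20
#define DEMO_UNDEFINED_ID	((uint32_t)-1)
#define DEMO_NOBODY_UID		99u
#define DEMO_NOBODY_GID		99u

struct demo_acl_ent { uint16_t tag; uint16_t perm; uint32_t id; };

/* copies wanted entries from src to dst (same capacity); returns the new
 * entry count, or -1 on a malformed base entry or unknown tag */
static int demo_filter_nobody(const struct demo_acl_ent *src, int count,
			      struct demo_acl_ent *dst)
{
	int i, j = 0;

	for (i = 0; i < count; i++) {
		switch (src[i].tag) {
		case DEMO_ACL_USER_OBJ:
		case DEMO_ACL_GROUP_OBJ:
		case DEMO_ACL_MASK:
		case DEMO_ACL_OTHER:
			if (src[i].id != DEMO_UNDEFINED_ID)
				return -1;
			dst[j++] = src[i];
			break;
		case DEMO_ACL_USER:
			if (src[i].id != DEMO_NOBODY_UID)
				dst[j++] = src[i];
			break;
		case DEMO_ACL_GROUP:
			if (src[i].id != DEMO_NOBODY_GID)
				dst[j++] = src[i];
			break;
		default:
			return -1;
		}
	}
	return j;
}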
+ */ +void lustre_ext_acl_xattr_free(ext_acl_xattr_header *header) +{ + OBD_FREE(header, CFS_ACL_XATTR_SIZE(le32_to_cpu(header->a_count), \ + ext_acl_xattr)); +} +EXPORT_SYMBOL(lustre_ext_acl_xattr_free); + +static ext_acl_xattr_entry * +lustre_ext_acl_xattr_search(ext_acl_xattr_header *header, + posix_acl_xattr_entry *entry, int *pos) +{ + int once, start, end, i, j, count = le32_to_cpu(header->a_count); + + once = 0; + start = *pos; + end = count; + +again: + for (i = start; i < end; i++) { + if (header->a_entries[i].e_tag == entry->e_tag && + header->a_entries[i].e_id == entry->e_id) { + j = i; + if (++i >= count) + i = 0; + *pos = i; + return &header->a_entries[j]; + } + } + + if (!once) { + once = 1; + start = 0; + end = *pos; + goto again; + } + + return NULL; +} + +/* + * Merge the posix ACL and the extended ACL into new posix ACL. + */ +int lustre_acl_xattr_merge2posix(posix_acl_xattr_header *posix_header, int size, + ext_acl_xattr_header *ext_header, + posix_acl_xattr_header **out) +{ + int posix_count, posix_size, i, j; + int ext_count = le32_to_cpu(ext_header->a_count), pos = 0, rc = 0; + posix_acl_xattr_entry pe = {ACL_MASK, 0, ACL_UNDEFINED_ID}; + posix_acl_xattr_header *new; + ext_acl_xattr_entry *ee, ae; + + lustre_posix_acl_cpu_to_le(&pe, &pe); + ee = lustre_ext_acl_xattr_search(ext_header, &pe, &pos); + if (ee == NULL || le32_to_cpu(ee->e_stat) == ES_DEL) { + /* there are only base ACL entries at most. */ + posix_count = 3; + posix_size = CFS_ACL_XATTR_SIZE(posix_count, posix_acl_xattr); + OBD_ALLOC(new, posix_size); + if (unlikely(new == NULL)) + return -ENOMEM; + + new->a_version = cpu_to_le32(CFS_ACL_XATTR_VERSION); + for (i = 0, j = 0; i < ext_count; i++) { + lustre_ext_acl_le_to_cpu(&ae, + &ext_header->a_entries[i]); + switch (ae.e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_OTHER: + if (ae.e_id != ACL_UNDEFINED_ID) { + rc = -EIO; + goto _out; + } + + if (ae.e_stat != ES_DEL) { + new->a_entries[j].e_tag = + ext_header->a_entries[i].e_tag; + new->a_entries[j].e_perm = + ext_header->a_entries[i].e_perm; + new->a_entries[j++].e_id = + ext_header->a_entries[i].e_id; + } + break; + case ACL_MASK: + case ACL_USER: + case ACL_GROUP: + if (ae.e_stat == ES_DEL) + break; + default: + rc = -EIO; + goto _out; + } + } + } else { + /* maybe there are valid ACL_USER or ACL_GROUP entries in the + * original server-side ACL, they are regarded as ES_UNC stat.*/ + int ori_posix_count; + + if (unlikely(size < 0)) + return -EINVAL; + else if (!size) + ori_posix_count = 0; + else + ori_posix_count = + CFS_ACL_XATTR_COUNT(size, posix_acl_xattr); + posix_count = ori_posix_count + ext_count; + posix_size = + CFS_ACL_XATTR_SIZE(posix_count, posix_acl_xattr); + OBD_ALLOC(new, posix_size); + if (unlikely(new == NULL)) + return -ENOMEM; + + new->a_version = cpu_to_le32(CFS_ACL_XATTR_VERSION); + /* 1. process the unchanged ACL entries + * in the original server-side ACL. */ + pos = 0; + for (i = 0, j = 0; i < ori_posix_count; i++) { + ee = lustre_ext_acl_xattr_search(ext_header, + &posix_header->a_entries[i], &pos); + if (ee == NULL) + memcpy(&new->a_entries[j++], + &posix_header->a_entries[i], + sizeof(posix_acl_xattr_entry)); + } + + /* 2. process the non-deleted entries + * from client-side extended ACL. 
*/ + for (i = 0; i < ext_count; i++) { + if (le16_to_cpu(ext_header->a_entries[i].e_stat) != + ES_DEL) { + new->a_entries[j].e_tag = + ext_header->a_entries[i].e_tag; + new->a_entries[j].e_perm = + ext_header->a_entries[i].e_perm; + new->a_entries[j++].e_id = + ext_header->a_entries[i].e_id; + } + } + } + + /* free unused space. */ + rc = lustre_posix_acl_xattr_reduce_space(&new, posix_count, j); + if (rc >= 0) { + posix_size = rc; + *out = new; + rc = 0; + } + +_out: + if (rc) { + OBD_FREE(new, posix_size); + posix_size = rc; + } + return posix_size; +} +EXPORT_SYMBOL(lustre_acl_xattr_merge2posix); + +/* + * Merge the posix ACL and the extended ACL into new extended ACL. + */ +ext_acl_xattr_header * +lustre_acl_xattr_merge2ext(posix_acl_xattr_header *posix_header, int size, + ext_acl_xattr_header *ext_header) +{ + int ori_ext_count, posix_count, ext_count, ext_size; + int i, j, pos = 0, rc = 0; + posix_acl_xattr_entry pae; + ext_acl_xattr_header *new; + ext_acl_xattr_entry *ee, eae; + + if (unlikely(size < 0)) + return ERR_PTR(-EINVAL); + else if (!size) + posix_count = 0; + else + posix_count = CFS_ACL_XATTR_COUNT(size, posix_acl_xattr); + ori_ext_count = le32_to_cpu(ext_header->a_count); + ext_count = posix_count + ori_ext_count; + ext_size = CFS_ACL_XATTR_SIZE(ext_count, ext_acl_xattr); + + OBD_ALLOC(new, ext_size); + if (unlikely(new == NULL)) + return ERR_PTR(-ENOMEM); + + for (i = 0, j = 0; i < posix_count; i++) { + lustre_posix_acl_le_to_cpu(&pae, &posix_header->a_entries[i]); + switch (pae.e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + if (pae.e_id != ACL_UNDEFINED_ID) { + rc = -EIO; + goto out; + } + case ACL_USER: + /* ignore "nobody" entry. */ + if (pae.e_id == NOBODY_UID) + break; + + new->a_entries[j].e_tag = + posix_header->a_entries[i].e_tag; + new->a_entries[j].e_perm = + posix_header->a_entries[i].e_perm; + new->a_entries[j].e_id = + posix_header->a_entries[i].e_id; + ee = lustre_ext_acl_xattr_search(ext_header, + &posix_header->a_entries[i], &pos); + if (ee) { + if (posix_header->a_entries[i].e_perm != + ee->e_perm) + /* entry modified. */ + ee->e_stat = + new->a_entries[j++].e_stat = + cpu_to_le32(ES_MOD); + else + /* entry unchanged. */ + ee->e_stat = + new->a_entries[j++].e_stat = + cpu_to_le32(ES_UNC); + } else { + /* new entry. */ + new->a_entries[j++].e_stat = + cpu_to_le32(ES_ADD); + } + break; + case ACL_GROUP: + /* ignore "nobody" entry. */ + if (pae.e_id == NOBODY_GID) + break; + new->a_entries[j].e_tag = + posix_header->a_entries[i].e_tag; + new->a_entries[j].e_perm = + posix_header->a_entries[i].e_perm; + new->a_entries[j].e_id = + posix_header->a_entries[i].e_id; + ee = lustre_ext_acl_xattr_search(ext_header, + &posix_header->a_entries[i], &pos); + if (ee) { + if (posix_header->a_entries[i].e_perm != + ee->e_perm) + /* entry modified. */ + ee->e_stat = + new->a_entries[j++].e_stat = + cpu_to_le32(ES_MOD); + else + /* entry unchanged. */ + ee->e_stat = + new->a_entries[j++].e_stat = + cpu_to_le32(ES_UNC); + } else { + /* new entry. */ + new->a_entries[j++].e_stat = + cpu_to_le32(ES_ADD); + } + break; + default: + rc = -EIO; + goto out; + } + } + + /* process deleted entries. */ + for (i = 0; i < ori_ext_count; i++) { + lustre_ext_acl_le_to_cpu(&eae, &ext_header->a_entries[i]); + if (eae.e_stat == ES_UNK) { + /* ignore "nobody" entry. 
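+ * Everything else still marked ES_UNK was not matched by any posix
+ * ACL entry above and is therefore emitted with ES_DEL status.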
*/ + if ((eae.e_tag == ACL_USER && eae.e_id == NOBODY_UID) || + (eae.e_tag == ACL_GROUP && eae.e_id == NOBODY_GID)) + continue; + + new->a_entries[j].e_tag = + ext_header->a_entries[i].e_tag; + new->a_entries[j].e_perm = + ext_header->a_entries[i].e_perm; + new->a_entries[j].e_id = ext_header->a_entries[i].e_id; + new->a_entries[j++].e_stat = cpu_to_le32(ES_DEL); + } + } + + new->a_count = cpu_to_le32(j); + /* free unused space. */ + rc = lustre_ext_acl_xattr_reduce_space(&new, ext_count); + +out: + if (rc) { + OBD_FREE(new, ext_size); + new = ERR_PTR(rc); + } + return new; +} +EXPORT_SYMBOL(lustre_acl_xattr_merge2ext); + +#endif diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/capa.c b/kernel/drivers/staging/lustre/lustre/obdclass/capa.c new file mode 100644 index 000000000..d206b1046 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/capa.c @@ -0,0 +1,421 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/obdclass/capa.c + * + * Lustre Capability Hash Management + * + * Author: Lai Siyao + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include +#include +#include +#include +#include + +#include "../include/obd_class.h" +#include "../include/lustre_debug.h" +#include "../include/lustre/lustre_idl.h" + +#include +#include "../include/lustre_capa.h" + +#define NR_CAPAHASH 32 +#define CAPA_HASH_SIZE 3000 /* for MDS & OSS */ + +struct kmem_cache *capa_cachep = NULL; + +/* lock for capa hash/capa_list/fo_capa_keys */ +DEFINE_SPINLOCK(capa_lock); + +struct list_head capa_list[CAPA_SITE_MAX]; + +static struct capa_hmac_alg capa_hmac_algs[] = { + DEF_CAPA_HMAC_ALG("sha1", SHA1, 20, 20), +}; +/* capa count */ +int capa_count[CAPA_SITE_MAX] = { 0, }; + +EXPORT_SYMBOL(capa_cachep); +EXPORT_SYMBOL(capa_list); +EXPORT_SYMBOL(capa_lock); +EXPORT_SYMBOL(capa_count); + +static inline +unsigned int ll_crypto_tfm_alg_min_keysize(struct crypto_blkcipher *tfm) +{ + return crypto_blkcipher_tfm(tfm)->__crt_alg->cra_blkcipher.min_keysize; +} + +struct hlist_head *init_capa_hash(void) +{ + struct hlist_head *hash; + int nr_hash, i; + + OBD_ALLOC(hash, PAGE_CACHE_SIZE); + if (!hash) + return NULL; + + nr_hash = PAGE_CACHE_SIZE / sizeof(struct hlist_head); + LASSERT(nr_hash > NR_CAPAHASH); + + for (i = 0; i < NR_CAPAHASH; i++) + INIT_HLIST_HEAD(hash + i); + return hash; +} +EXPORT_SYMBOL(init_capa_hash); + +static inline int capa_on_server(struct obd_capa *ocapa) +{ + return ocapa->c_site == CAPA_SITE_SERVER; +} + +static inline void capa_delete(struct obd_capa *ocapa) +{ + LASSERT(capa_on_server(ocapa)); + hlist_del_init(&ocapa->u.tgt.c_hash); + list_del_init(&ocapa->c_list); + capa_count[ocapa->c_site]--; + /* release the ref when alloc */ + capa_put(ocapa); +} + +void cleanup_capa_hash(struct hlist_head *hash) +{ + int i; + struct hlist_node *next; + struct obd_capa *oc; + + spin_lock(&capa_lock); + for (i = 0; i < NR_CAPAHASH; i++) { + hlist_for_each_entry_safe(oc, next, hash + i, + u.tgt.c_hash) + capa_delete(oc); + } + spin_unlock(&capa_lock); + + OBD_FREE(hash, PAGE_CACHE_SIZE); +} +EXPORT_SYMBOL(cleanup_capa_hash); + +static inline int capa_hashfn(struct lu_fid *fid) +{ + return (fid_oid(fid) ^ fid_ver(fid)) * + (unsigned long)(fid_seq(fid) + 1) % NR_CAPAHASH; +} + +/* capa renewal time check is earlier than that on client, which is to prevent + * client renew right after obtaining it. */ +static inline int capa_is_to_expire(struct obd_capa *oc) +{ + return time_before(cfs_time_sub(oc->c_expiry, + cfs_time_seconds(oc->c_capa.lc_timeout)*2/3), + cfs_time_current()); +} + +static struct obd_capa *find_capa(struct lustre_capa *capa, + struct hlist_head *head, int alive) +{ + struct obd_capa *ocapa; + int len = alive ? 
offsetof(struct lustre_capa, lc_keyid):sizeof(*capa); + + hlist_for_each_entry(ocapa, head, u.tgt.c_hash) { + if (memcmp(&ocapa->c_capa, capa, len)) + continue; + /* don't return one that will expire soon in this case */ + if (alive && capa_is_to_expire(ocapa)) + continue; + + LASSERT(capa_on_server(ocapa)); + + DEBUG_CAPA(D_SEC, &ocapa->c_capa, "found"); + return ocapa; + } + + return NULL; +} + +#define LRU_CAPA_DELETE_COUNT 12 +static inline void capa_delete_lru(struct list_head *head) +{ + struct obd_capa *ocapa; + struct list_head *node = head->next; + int count = 0; + + /* free LRU_CAPA_DELETE_COUNT unused capa from head */ + while (count++ < LRU_CAPA_DELETE_COUNT) { + ocapa = list_entry(node, struct obd_capa, c_list); + node = node->next; + if (atomic_read(&ocapa->c_refc)) + continue; + + DEBUG_CAPA(D_SEC, &ocapa->c_capa, "free lru"); + capa_delete(ocapa); + } +} + +/* add or update */ +struct obd_capa *capa_add(struct hlist_head *hash, struct lustre_capa *capa) +{ + struct hlist_head *head = hash + capa_hashfn(&capa->lc_fid); + struct obd_capa *ocapa, *old = NULL; + struct list_head *list = &capa_list[CAPA_SITE_SERVER]; + + ocapa = alloc_capa(CAPA_SITE_SERVER); + if (IS_ERR(ocapa)) + return NULL; + + spin_lock(&capa_lock); + old = find_capa(capa, head, 0); + if (!old) { + ocapa->c_capa = *capa; + set_capa_expiry(ocapa); + hlist_add_head(&ocapa->u.tgt.c_hash, head); + list_add_tail(&ocapa->c_list, list); + capa_get(ocapa); + capa_count[CAPA_SITE_SERVER]++; + if (capa_count[CAPA_SITE_SERVER] > CAPA_HASH_SIZE) + capa_delete_lru(list); + spin_unlock(&capa_lock); + return ocapa; + } + capa_get(old); + spin_unlock(&capa_lock); + capa_put(ocapa); + return old; +} +EXPORT_SYMBOL(capa_add); + +struct obd_capa *capa_lookup(struct hlist_head *hash, struct lustre_capa *capa, + int alive) +{ + struct obd_capa *ocapa; + + spin_lock(&capa_lock); + ocapa = find_capa(capa, hash + capa_hashfn(&capa->lc_fid), alive); + if (ocapa) { + list_move_tail(&ocapa->c_list, + &capa_list[CAPA_SITE_SERVER]); + capa_get(ocapa); + } + spin_unlock(&capa_lock); + + return ocapa; +} +EXPORT_SYMBOL(capa_lookup); + +static inline int ll_crypto_hmac(struct crypto_hash *tfm, + u8 *key, unsigned int *keylen, + struct scatterlist *sg, + unsigned int size, u8 *result) +{ + struct hash_desc desc; + int rv; + desc.tfm = tfm; + desc.flags = 0; + rv = crypto_hash_setkey(desc.tfm, key, *keylen); + if (rv) { + CERROR("failed to hash setkey: %d\n", rv); + return rv; + } + return crypto_hash_digest(&desc, sg, size, result); +} + +int capa_hmac(__u8 *hmac, struct lustre_capa *capa, __u8 *key) +{ + struct crypto_hash *tfm; + struct capa_hmac_alg *alg; + int keylen; + struct scatterlist sl; + + if (capa_alg(capa) != CAPA_HMAC_ALG_SHA1) { + CERROR("unknown capability hmac algorithm!\n"); + return -EFAULT; + } + + alg = &capa_hmac_algs[capa_alg(capa)]; + + tfm = crypto_alloc_hash(alg->ha_name, 0, 0); + if (IS_ERR(tfm)) { + CERROR("crypto_alloc_tfm failed, check whether your kernel has crypto support!\n"); + return PTR_ERR(tfm); + } + keylen = alg->ha_keylen; + + sg_init_table(&sl, 1); + sg_set_page(&sl, virt_to_page(capa), + offsetof(struct lustre_capa, lc_hmac), + (unsigned long)(capa) % PAGE_CACHE_SIZE); + + ll_crypto_hmac(tfm, key, &keylen, &sl, sl.length, hmac); + crypto_free_hash(tfm); + + return 0; +} +EXPORT_SYMBOL(capa_hmac); + +int capa_encrypt_id(__u32 *d, __u32 *s, __u8 *key, int keylen) +{ + struct crypto_blkcipher *tfm; + struct scatterlist sd; + struct scatterlist ss; + struct blkcipher_desc desc; + unsigned int min; + int 
rc; + char alg[CRYPTO_MAX_ALG_NAME+1] = "aes"; + + /* passing "aes" in a variable instead of a constant string keeps gcc + * 4.3.2 happy */ + tfm = crypto_alloc_blkcipher(alg, 0, 0); + if (IS_ERR(tfm)) { + CERROR("failed to load transform for aes\n"); + return PTR_ERR(tfm); + } + + min = ll_crypto_tfm_alg_min_keysize(tfm); + if (keylen < min) { + CERROR("keylen at least %d bits for aes\n", min * 8); + rc = -EINVAL; + goto out; + } + + rc = crypto_blkcipher_setkey(tfm, key, min); + if (rc) { + CERROR("failed to setting key for aes\n"); + goto out; + } + + sg_init_table(&sd, 1); + sg_set_page(&sd, virt_to_page(d), 16, + (unsigned long)(d) % PAGE_CACHE_SIZE); + + sg_init_table(&ss, 1); + sg_set_page(&ss, virt_to_page(s), 16, + (unsigned long)(s) % PAGE_CACHE_SIZE); + desc.tfm = tfm; + desc.info = NULL; + desc.flags = 0; + rc = crypto_blkcipher_encrypt(&desc, &sd, &ss, 16); + if (rc) { + CERROR("failed to encrypt for aes\n"); + goto out; + } + +out: + crypto_free_blkcipher(tfm); + return rc; +} +EXPORT_SYMBOL(capa_encrypt_id); + +int capa_decrypt_id(__u32 *d, __u32 *s, __u8 *key, int keylen) +{ + struct crypto_blkcipher *tfm; + struct scatterlist sd; + struct scatterlist ss; + struct blkcipher_desc desc; + unsigned int min; + int rc; + char alg[CRYPTO_MAX_ALG_NAME+1] = "aes"; + + /* passing "aes" in a variable instead of a constant string keeps gcc + * 4.3.2 happy */ + tfm = crypto_alloc_blkcipher(alg, 0, 0); + if (IS_ERR(tfm)) { + CERROR("failed to load transform for aes\n"); + return PTR_ERR(tfm); + } + + min = ll_crypto_tfm_alg_min_keysize(tfm); + if (keylen < min) { + CERROR("keylen at least %d bits for aes\n", min * 8); + rc = -EINVAL; + goto out; + } + + rc = crypto_blkcipher_setkey(tfm, key, min); + if (rc) { + CERROR("failed to setting key for aes\n"); + goto out; + } + + sg_init_table(&sd, 1); + sg_set_page(&sd, virt_to_page(d), 16, + (unsigned long)(d) % PAGE_CACHE_SIZE); + + sg_init_table(&ss, 1); + sg_set_page(&ss, virt_to_page(s), 16, + (unsigned long)(s) % PAGE_CACHE_SIZE); + + desc.tfm = tfm; + desc.info = NULL; + desc.flags = 0; + rc = crypto_blkcipher_decrypt(&desc, &sd, &ss, 16); + if (rc) { + CERROR("failed to decrypt for aes\n"); + goto out; + } + +out: + crypto_free_blkcipher(tfm); + return rc; +} +EXPORT_SYMBOL(capa_decrypt_id); + +void capa_cpy(void *capa, struct obd_capa *ocapa) +{ + spin_lock(&ocapa->c_lock); + *(struct lustre_capa *)capa = ocapa->c_capa; + spin_unlock(&ocapa->c_lock); +} +EXPORT_SYMBOL(capa_cpy); + +void _debug_capa(struct lustre_capa *c, + struct libcfs_debug_msg_data *msgdata, + const char *fmt, ...) +{ + va_list args; + va_start(args, fmt); + libcfs_debug_vmsg2(msgdata, fmt, args, + " capability@%p fid " DFID " opc %#llx uid %llu gid %llu flags %u alg %d keyid %u timeout %u expiry %u\n", + c, PFID(capa_fid(c)), capa_opc(c), + capa_uid(c), capa_gid(c), capa_flags(c), + capa_alg(c), capa_keyid(c), capa_timeout(c), + capa_expiry(c)); + va_end(args); +} +EXPORT_SYMBOL(_debug_capa); diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/cl_internal.h b/kernel/drivers/staging/lustre/lustre/obdclass/cl_internal.h new file mode 100644 index 000000000..7eb0ad7b3 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/cl_internal.h @@ -0,0 +1,121 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Internal cl interfaces. + * + * Author: Nikita Danilov + */ +#ifndef _CL_INTERNAL_H +#define _CL_INTERNAL_H + +#define CLT_PVEC_SIZE (14) + +/** + * Possible levels of the nesting. Currently this is 2: there are "top" + * entities (files, extent locks), and "sub" entities (stripes and stripe + * locks). This is used only for debugging counters right now. + */ +enum clt_nesting_level { + CNL_TOP, + CNL_SUB, + CNL_NR +}; + +/** + * Counters used to check correctness of cl_lock interface usage. + */ +struct cl_thread_counters { + /** + * Number of outstanding calls to cl_lock_mutex_get() made by the + * current thread. For debugging. + */ + int ctc_nr_locks_locked; + /** List of locked locks. */ + struct lu_ref ctc_locks_locked; + /** Number of outstanding holds on locks. */ + int ctc_nr_held; + /** Number of outstanding uses on locks. */ + int ctc_nr_used; + /** Number of held extent locks. */ + int ctc_nr_locks_acquired; +}; + +/** + * Thread local state internal for generic cl-code. + */ +struct cl_thread_info { + /* + * Common fields. + */ + struct cl_io clt_io; + struct cl_2queue clt_queue; + + /* + * Fields used by cl_lock.c + */ + struct cl_lock_descr clt_descr; + struct cl_page_list clt_list; + /** + * Counters for every level of lock nesting. + */ + struct cl_thread_counters clt_counters[CNL_NR]; + /** @} debugging */ + + /* + * Fields used by cl_page.c + */ + struct cl_page *clt_pvec[CLT_PVEC_SIZE]; + + /* + * Fields used by cl_io.c + */ + /** + * Pointer to the topmost ongoing IO in this thread. + */ + struct cl_io *clt_current_io; + /** + * Used for submitting a sync io. + */ + struct cl_sync_io clt_anchor; + /** + * Fields used by cl_lock_discard_pages(). + */ + pgoff_t clt_next_index; + pgoff_t clt_fn_index; /* first non-overlapped index */ +}; + +struct cl_thread_info *cl_env_info(const struct lu_env *env); + +#endif /* _CL_INTERNAL_H */ diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/cl_io.c b/kernel/drivers/staging/lustre/lustre/obdclass/cl_io.c new file mode 100644 index 000000000..3141b6043 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/cl_io.c @@ -0,0 +1,1669 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Client IO. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include "../include/obd_class.h" +#include "../include/obd_support.h" +#include "../include/lustre_fid.h" +#include +#include "../include/cl_object.h" +#include "cl_internal.h" + +/***************************************************************************** + * + * cl_io interface. + * + */ + +#define cl_io_for_each(slice, io) \ + list_for_each_entry((slice), &io->ci_layers, cis_linkage) +#define cl_io_for_each_reverse(slice, io) \ + list_for_each_entry_reverse((slice), &io->ci_layers, cis_linkage) + +static inline int cl_io_type_is_valid(enum cl_io_type type) +{ + return CIT_READ <= type && type < CIT_OP_NR; +} + +static inline int cl_io_is_loopable(const struct cl_io *io) +{ + return cl_io_type_is_valid(io->ci_type) && io->ci_type != CIT_MISC; +} + +/** + * Returns true iff there is an IO ongoing in the given environment. + */ +int cl_io_is_going(const struct lu_env *env) +{ + return cl_env_info(env)->clt_current_io != NULL; +} +EXPORT_SYMBOL(cl_io_is_going); + +/** + * cl_io invariant that holds at all times when exported cl_io_*() functions + * are entered and left. + */ +static int cl_io_invariant(const struct cl_io *io) +{ + struct cl_io *up; + + up = io->ci_parent; + return + /* + * io can own pages only when it is ongoing. Sub-io might + * still be in CIS_LOCKED state when top-io is in + * CIS_IO_GOING. + */ + ergo(io->ci_owned_nr > 0, io->ci_state == CIS_IO_GOING || + (io->ci_state == CIS_LOCKED && up != NULL)); +} + +/** + * Finalize \a io, by calling cl_io_operations::cio_fini() bottom-to-top. + */ +void cl_io_fini(const struct lu_env *env, struct cl_io *io) +{ + struct cl_io_slice *slice; + struct cl_thread_info *info; + + LINVRNT(cl_io_type_is_valid(io->ci_type)); + LINVRNT(cl_io_invariant(io)); + + while (!list_empty(&io->ci_layers)) { + slice = container_of(io->ci_layers.prev, struct cl_io_slice, + cis_linkage); + list_del_init(&slice->cis_linkage); + if (slice->cis_iop->op[io->ci_type].cio_fini != NULL) + slice->cis_iop->op[io->ci_type].cio_fini(env, slice); + /* + * Invalidate slice to catch use after free. This assumes that + * slices are allocated within session and can be touched + * after ->cio_fini() returns. 
+ */ + slice->cis_io = NULL; + } + io->ci_state = CIS_FINI; + info = cl_env_info(env); + if (info->clt_current_io == io) + info->clt_current_io = NULL; + + /* sanity check for layout change */ + switch (io->ci_type) { + case CIT_READ: + case CIT_WRITE: + break; + case CIT_FAULT: + case CIT_FSYNC: + LASSERT(!io->ci_need_restart); + break; + case CIT_SETATTR: + case CIT_MISC: + /* Check ignore layout change conf */ + LASSERT(ergo(io->ci_ignore_layout || !io->ci_verify_layout, + !io->ci_need_restart)); + break; + default: + LBUG(); + } +} +EXPORT_SYMBOL(cl_io_fini); + +static int cl_io_init0(const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, struct cl_object *obj) +{ + struct cl_object *scan; + int result; + + LINVRNT(io->ci_state == CIS_ZERO || io->ci_state == CIS_FINI); + LINVRNT(cl_io_type_is_valid(iot)); + LINVRNT(cl_io_invariant(io)); + + io->ci_type = iot; + INIT_LIST_HEAD(&io->ci_lockset.cls_todo); + INIT_LIST_HEAD(&io->ci_lockset.cls_curr); + INIT_LIST_HEAD(&io->ci_lockset.cls_done); + INIT_LIST_HEAD(&io->ci_layers); + + result = 0; + cl_object_for_each(scan, obj) { + if (scan->co_ops->coo_io_init != NULL) { + result = scan->co_ops->coo_io_init(env, scan, io); + if (result != 0) + break; + } + } + if (result == 0) + io->ci_state = CIS_INIT; + return result; +} + +/** + * Initialize sub-io, by calling cl_io_operations::cio_init() top-to-bottom. + * + * \pre obj != cl_object_top(obj) + */ +int cl_io_sub_init(const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, struct cl_object *obj) +{ + struct cl_thread_info *info = cl_env_info(env); + + LASSERT(obj != cl_object_top(obj)); + if (info->clt_current_io == NULL) + info->clt_current_io = io; + return cl_io_init0(env, io, iot, obj); +} +EXPORT_SYMBOL(cl_io_sub_init); + +/** + * Initialize \a io, by calling cl_io_operations::cio_init() top-to-bottom. + * + * Caller has to call cl_io_fini() after a call to cl_io_init(), no matter + * what the latter returned. + * + * \pre obj == cl_object_top(obj) + * \pre cl_io_type_is_valid(iot) + * \post cl_io_type_is_valid(io->ci_type) && io->ci_type == iot + */ +int cl_io_init(const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, struct cl_object *obj) +{ + struct cl_thread_info *info = cl_env_info(env); + + LASSERT(obj == cl_object_top(obj)); + LASSERT(info->clt_current_io == NULL); + + info->clt_current_io = io; + return cl_io_init0(env, io, iot, obj); +} +EXPORT_SYMBOL(cl_io_init); + +/** + * Initialize read or write io. 
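+ * Records the [pos, pos + count) byte range in io->u.ci_rw and then
+ * delegates to cl_io_init() on io->ci_obj.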
+ * + * \pre iot == CIT_READ || iot == CIT_WRITE + */ +int cl_io_rw_init(const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, loff_t pos, size_t count) +{ + LINVRNT(iot == CIT_READ || iot == CIT_WRITE); + LINVRNT(io->ci_obj != NULL); + + LU_OBJECT_HEADER(D_VFSTRACE, env, &io->ci_obj->co_lu, + "io range: %u [%llu, %llu) %u %u\n", + iot, (__u64)pos, (__u64)pos + count, + io->u.ci_rw.crw_nonblock, io->u.ci_wr.wr_append); + io->u.ci_rw.crw_pos = pos; + io->u.ci_rw.crw_count = count; + return cl_io_init(env, io, iot, io->ci_obj); +} +EXPORT_SYMBOL(cl_io_rw_init); + +static inline const struct lu_fid * +cl_lock_descr_fid(const struct cl_lock_descr *descr) +{ + return lu_object_fid(&descr->cld_obj->co_lu); +} + +static int cl_lock_descr_sort(const struct cl_lock_descr *d0, + const struct cl_lock_descr *d1) +{ + return lu_fid_cmp(cl_lock_descr_fid(d0), cl_lock_descr_fid(d1)) ?: + __diff_normalize(d0->cld_start, d1->cld_start); +} + +static int cl_lock_descr_cmp(const struct cl_lock_descr *d0, + const struct cl_lock_descr *d1) +{ + int ret; + + ret = lu_fid_cmp(cl_lock_descr_fid(d0), cl_lock_descr_fid(d1)); + if (ret) + return ret; + if (d0->cld_end < d1->cld_start) + return -1; + if (d0->cld_start > d0->cld_end) + return 1; + return 0; +} + +static void cl_lock_descr_merge(struct cl_lock_descr *d0, + const struct cl_lock_descr *d1) +{ + d0->cld_start = min(d0->cld_start, d1->cld_start); + d0->cld_end = max(d0->cld_end, d1->cld_end); + + if (d1->cld_mode == CLM_WRITE && d0->cld_mode != CLM_WRITE) + d0->cld_mode = CLM_WRITE; + + if (d1->cld_mode == CLM_GROUP && d0->cld_mode != CLM_GROUP) + d0->cld_mode = CLM_GROUP; +} + +/* + * Sort locks in lexicographical order of their (fid, start-offset) pairs. + */ +static void cl_io_locks_sort(struct cl_io *io) +{ + int done = 0; + + /* hidden treasure: bubble sort for now. */ + do { + struct cl_io_lock_link *curr; + struct cl_io_lock_link *prev; + struct cl_io_lock_link *temp; + + done = 1; + prev = NULL; + + list_for_each_entry_safe(curr, temp, + &io->ci_lockset.cls_todo, + cill_linkage) { + if (prev != NULL) { + switch (cl_lock_descr_sort(&prev->cill_descr, + &curr->cill_descr)) { + case 0: + /* + * IMPOSSIBLE: Identical locks are + * already removed at + * this point. + */ + default: + LBUG(); + case +1: + list_move_tail(&curr->cill_linkage, + &prev->cill_linkage); + done = 0; + continue; /* don't change prev: it's + * still "previous" */ + case -1: /* already in order */ + break; + } + } + prev = curr; + } + } while (!done); +} + +/** + * Check whether \a queue contains locks matching \a need. 
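+ * Only lock descriptors are compared; no locks are taken or referenced.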
+ * + * \retval +ve there is a matching lock in the \a queue + * \retval 0 there are no matching locks in the \a queue + */ +int cl_queue_match(const struct list_head *queue, + const struct cl_lock_descr *need) +{ + struct cl_io_lock_link *scan; + + list_for_each_entry(scan, queue, cill_linkage) { + if (cl_lock_descr_match(&scan->cill_descr, need)) + return +1; + } + return 0; +} +EXPORT_SYMBOL(cl_queue_match); + +static int cl_queue_merge(const struct list_head *queue, + const struct cl_lock_descr *need) +{ + struct cl_io_lock_link *scan; + + list_for_each_entry(scan, queue, cill_linkage) { + if (cl_lock_descr_cmp(&scan->cill_descr, need)) + continue; + cl_lock_descr_merge(&scan->cill_descr, need); + CDEBUG(D_VFSTRACE, "lock: %d: [%lu, %lu]\n", + scan->cill_descr.cld_mode, scan->cill_descr.cld_start, + scan->cill_descr.cld_end); + return +1; + } + return 0; + +} + +static int cl_lockset_match(const struct cl_lockset *set, + const struct cl_lock_descr *need) +{ + return cl_queue_match(&set->cls_curr, need) || + cl_queue_match(&set->cls_done, need); +} + +static int cl_lockset_merge(const struct cl_lockset *set, + const struct cl_lock_descr *need) +{ + return cl_queue_merge(&set->cls_todo, need) || + cl_lockset_match(set, need); +} + +static int cl_lockset_lock_one(const struct lu_env *env, + struct cl_io *io, struct cl_lockset *set, + struct cl_io_lock_link *link) +{ + struct cl_lock *lock; + int result; + + lock = cl_lock_request(env, io, &link->cill_descr, "io", io); + + if (!IS_ERR(lock)) { + link->cill_lock = lock; + list_move(&link->cill_linkage, &set->cls_curr); + if (!(link->cill_descr.cld_enq_flags & CEF_ASYNC)) { + result = cl_wait(env, lock); + if (result == 0) + list_move(&link->cill_linkage, + &set->cls_done); + } else + result = 0; + } else + result = PTR_ERR(lock); + return result; +} + +static void cl_lock_link_fini(const struct lu_env *env, struct cl_io *io, + struct cl_io_lock_link *link) +{ + struct cl_lock *lock = link->cill_lock; + + list_del_init(&link->cill_linkage); + if (lock != NULL) { + cl_lock_release(env, lock, "io", io); + link->cill_lock = NULL; + } + if (link->cill_fini != NULL) + link->cill_fini(env, link); +} + +static int cl_lockset_lock(const struct lu_env *env, struct cl_io *io, + struct cl_lockset *set) +{ + struct cl_io_lock_link *link; + struct cl_io_lock_link *temp; + struct cl_lock *lock; + int result; + + result = 0; + list_for_each_entry_safe(link, temp, &set->cls_todo, cill_linkage) { + if (!cl_lockset_match(set, &link->cill_descr)) { + /* XXX some locking to guarantee that locks aren't + * expanded in between. */ + result = cl_lockset_lock_one(env, io, set, link); + if (result != 0) + break; + } else + cl_lock_link_fini(env, io, link); + } + if (result == 0) { + list_for_each_entry_safe(link, temp, + &set->cls_curr, cill_linkage) { + lock = link->cill_lock; + result = cl_wait(env, lock); + if (result == 0) + list_move(&link->cill_linkage, + &set->cls_done); + else + break; + } + } + return result; +} + +/** + * Takes locks necessary for the current iteration of io. + * + * Calls cl_io_operations::cio_lock() top-to-bottom to collect locks required + * by layers for the current iteration. Then sort locks (to avoid dead-locks), + * and acquire them. 
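+ * On failure any locks already acquired are released via cl_io_unlock();
+ * on success the io moves to CIS_LOCKED.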
+ */ +int cl_io_lock(const struct lu_env *env, struct cl_io *io) +{ + const struct cl_io_slice *scan; + int result = 0; + + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(io->ci_state == CIS_IT_STARTED); + LINVRNT(cl_io_invariant(io)); + + cl_io_for_each(scan, io) { + if (scan->cis_iop->op[io->ci_type].cio_lock == NULL) + continue; + result = scan->cis_iop->op[io->ci_type].cio_lock(env, scan); + if (result != 0) + break; + } + if (result == 0) { + cl_io_locks_sort(io); + result = cl_lockset_lock(env, io, &io->ci_lockset); + } + if (result != 0) + cl_io_unlock(env, io); + else + io->ci_state = CIS_LOCKED; + return result; +} +EXPORT_SYMBOL(cl_io_lock); + +/** + * Release locks takes by io. + */ +void cl_io_unlock(const struct lu_env *env, struct cl_io *io) +{ + struct cl_lockset *set; + struct cl_io_lock_link *link; + struct cl_io_lock_link *temp; + const struct cl_io_slice *scan; + + LASSERT(cl_io_is_loopable(io)); + LASSERT(CIS_IT_STARTED <= io->ci_state && io->ci_state < CIS_UNLOCKED); + LINVRNT(cl_io_invariant(io)); + + set = &io->ci_lockset; + + list_for_each_entry_safe(link, temp, &set->cls_todo, cill_linkage) + cl_lock_link_fini(env, io, link); + + list_for_each_entry_safe(link, temp, &set->cls_curr, cill_linkage) + cl_lock_link_fini(env, io, link); + + list_for_each_entry_safe(link, temp, &set->cls_done, cill_linkage) { + cl_unuse(env, link->cill_lock); + cl_lock_link_fini(env, io, link); + } + cl_io_for_each_reverse(scan, io) { + if (scan->cis_iop->op[io->ci_type].cio_unlock != NULL) + scan->cis_iop->op[io->ci_type].cio_unlock(env, scan); + } + io->ci_state = CIS_UNLOCKED; + LASSERT(!cl_env_info(env)->clt_counters[CNL_TOP].ctc_nr_locks_acquired); +} +EXPORT_SYMBOL(cl_io_unlock); + +/** + * Prepares next iteration of io. + * + * Calls cl_io_operations::cio_iter_init() top-to-bottom. This exists to give + * layers a chance to modify io parameters, e.g., so that lov can restrict io + * to a single stripe. + */ +int cl_io_iter_init(const struct lu_env *env, struct cl_io *io) +{ + const struct cl_io_slice *scan; + int result; + + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(io->ci_state == CIS_INIT || io->ci_state == CIS_IT_ENDED); + LINVRNT(cl_io_invariant(io)); + + result = 0; + cl_io_for_each(scan, io) { + if (scan->cis_iop->op[io->ci_type].cio_iter_init == NULL) + continue; + result = scan->cis_iop->op[io->ci_type].cio_iter_init(env, + scan); + if (result != 0) + break; + } + if (result == 0) + io->ci_state = CIS_IT_STARTED; + return result; +} +EXPORT_SYMBOL(cl_io_iter_init); + +/** + * Finalizes io iteration. + * + * Calls cl_io_operations::cio_iter_fini() bottom-to-top. + */ +void cl_io_iter_fini(const struct lu_env *env, struct cl_io *io) +{ + const struct cl_io_slice *scan; + + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(io->ci_state == CIS_UNLOCKED); + LINVRNT(cl_io_invariant(io)); + + cl_io_for_each_reverse(scan, io) { + if (scan->cis_iop->op[io->ci_type].cio_iter_fini != NULL) + scan->cis_iop->op[io->ci_type].cio_iter_fini(env, scan); + } + io->ci_state = CIS_IT_ENDED; +} +EXPORT_SYMBOL(cl_io_iter_fini); + +/** + * Records that read or write io progressed \a nob bytes forward. + */ +void cl_io_rw_advance(const struct lu_env *env, struct cl_io *io, size_t nob) +{ + const struct cl_io_slice *scan; + + LINVRNT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE || + nob == 0); + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(cl_io_invariant(io)); + + io->u.ci_rw.crw_pos += nob; + io->u.ci_rw.crw_count -= nob; + + /* layers have to be notified. 
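+ * cio_advance() is called bottom-to-top on every layer that implements it.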
*/ + cl_io_for_each_reverse(scan, io) { + if (scan->cis_iop->op[io->ci_type].cio_advance != NULL) + scan->cis_iop->op[io->ci_type].cio_advance(env, scan, + nob); + } +} +EXPORT_SYMBOL(cl_io_rw_advance); + +/** + * Adds a lock to a lockset. + */ +int cl_io_lock_add(const struct lu_env *env, struct cl_io *io, + struct cl_io_lock_link *link) +{ + int result; + + if (cl_lockset_merge(&io->ci_lockset, &link->cill_descr)) + result = +1; + else { + list_add(&link->cill_linkage, &io->ci_lockset.cls_todo); + result = 0; + } + return result; +} +EXPORT_SYMBOL(cl_io_lock_add); + +static void cl_free_io_lock_link(const struct lu_env *env, + struct cl_io_lock_link *link) +{ + OBD_FREE_PTR(link); +} + +/** + * Allocates new lock link, and uses it to add a lock to a lockset. + */ +int cl_io_lock_alloc_add(const struct lu_env *env, struct cl_io *io, + struct cl_lock_descr *descr) +{ + struct cl_io_lock_link *link; + int result; + + OBD_ALLOC_PTR(link); + if (link != NULL) { + link->cill_descr = *descr; + link->cill_fini = cl_free_io_lock_link; + result = cl_io_lock_add(env, io, link); + if (result) /* lock match */ + link->cill_fini(env, link); + } else + result = -ENOMEM; + + return result; +} +EXPORT_SYMBOL(cl_io_lock_alloc_add); + +/** + * Starts io by calling cl_io_operations::cio_start() top-to-bottom. + */ +int cl_io_start(const struct lu_env *env, struct cl_io *io) +{ + const struct cl_io_slice *scan; + int result = 0; + + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(io->ci_state == CIS_LOCKED); + LINVRNT(cl_io_invariant(io)); + + io->ci_state = CIS_IO_GOING; + cl_io_for_each(scan, io) { + if (scan->cis_iop->op[io->ci_type].cio_start == NULL) + continue; + result = scan->cis_iop->op[io->ci_type].cio_start(env, scan); + if (result != 0) + break; + } + if (result >= 0) + result = 0; + return result; +} +EXPORT_SYMBOL(cl_io_start); + +/** + * Wait until current io iteration is finished by calling + * cl_io_operations::cio_end() bottom-to-top. + */ +void cl_io_end(const struct lu_env *env, struct cl_io *io) +{ + const struct cl_io_slice *scan; + + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(io->ci_state == CIS_IO_GOING); + LINVRNT(cl_io_invariant(io)); + + cl_io_for_each_reverse(scan, io) { + if (scan->cis_iop->op[io->ci_type].cio_end != NULL) + scan->cis_iop->op[io->ci_type].cio_end(env, scan); + /* TODO: error handling. */ + } + io->ci_state = CIS_IO_FINISHED; +} +EXPORT_SYMBOL(cl_io_end); + +static const struct cl_page_slice * +cl_io_slice_page(const struct cl_io_slice *ios, struct cl_page *page) +{ + const struct cl_page_slice *slice; + + slice = cl_page_at(page, ios->cis_obj->co_lu.lo_dev->ld_type); + LINVRNT(slice != NULL); + return slice; +} + +/** + * True iff \a page is within \a io range. + */ +static int cl_page_in_io(const struct cl_page *page, const struct cl_io *io) +{ + int result = 1; + loff_t start; + loff_t end; + pgoff_t idx; + + idx = page->cp_index; + switch (io->ci_type) { + case CIT_READ: + case CIT_WRITE: + /* + * check that [start, end) and [pos, pos + count) extents + * overlap. + */ + if (!cl_io_is_append(io)) { + const struct cl_io_rw_common *crw = &(io->u.ci_rw); + start = cl_offset(page->cp_obj, idx); + end = cl_offset(page->cp_obj, idx + 1); + result = crw->crw_pos < end && + start < crw->crw_pos + crw->crw_count; + } + break; + case CIT_FAULT: + result = io->u.ci_fault.ft_index == idx; + break; + default: + LBUG(); + } + return result; +} + +/** + * Called by read io, when page has to be read from the server. 
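+ * Every layer's cio_read_page() method is invoked and the resulting queue
+ * is submitted with cl_io_submit_rw(); pages left unsent are disowned
+ * before returning.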
+ * + * \see cl_io_operations::cio_read_page() + */ +int cl_io_read_page(const struct lu_env *env, struct cl_io *io, + struct cl_page *page) +{ + const struct cl_io_slice *scan; + struct cl_2queue *queue; + int result = 0; + + LINVRNT(io->ci_type == CIT_READ || io->ci_type == CIT_FAULT); + LINVRNT(cl_page_is_owned(page, io)); + LINVRNT(io->ci_state == CIS_IO_GOING || io->ci_state == CIS_LOCKED); + LINVRNT(cl_page_in_io(page, io)); + LINVRNT(cl_io_invariant(io)); + + queue = &io->ci_queue; + + cl_2queue_init(queue); + /* + * ->cio_read_page() methods called in the loop below are supposed to + * never block waiting for network (the only subtle point is the + * creation of new pages for read-ahead that might result in cache + * shrinking, but currently only clean pages are shrunk and this + * requires no network io). + * + * Should this ever starts blocking, retry loop would be needed for + * "parallel io" (see CLO_REPEAT loops in cl_lock.c). + */ + cl_io_for_each(scan, io) { + if (scan->cis_iop->cio_read_page != NULL) { + const struct cl_page_slice *slice; + + slice = cl_io_slice_page(scan, page); + LINVRNT(slice != NULL); + result = scan->cis_iop->cio_read_page(env, scan, slice); + if (result != 0) + break; + } + } + if (result == 0) + result = cl_io_submit_rw(env, io, CRT_READ, queue); + /* + * Unlock unsent pages in case of error. + */ + cl_page_list_disown(env, io, &queue->c2_qin); + cl_2queue_fini(env, queue); + return result; +} +EXPORT_SYMBOL(cl_io_read_page); + +/** + * Called by write io to prepare page to receive data from user buffer. + * + * \see cl_io_operations::cio_prepare_write() + */ +int cl_io_prepare_write(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, unsigned from, unsigned to) +{ + const struct cl_io_slice *scan; + int result = 0; + + LINVRNT(io->ci_type == CIT_WRITE); + LINVRNT(cl_page_is_owned(page, io)); + LINVRNT(io->ci_state == CIS_IO_GOING || io->ci_state == CIS_LOCKED); + LINVRNT(cl_io_invariant(io)); + LASSERT(cl_page_in_io(page, io)); + + cl_io_for_each_reverse(scan, io) { + if (scan->cis_iop->cio_prepare_write != NULL) { + const struct cl_page_slice *slice; + + slice = cl_io_slice_page(scan, page); + result = scan->cis_iop->cio_prepare_write(env, scan, + slice, + from, to); + if (result != 0) + break; + } + } + return result; +} +EXPORT_SYMBOL(cl_io_prepare_write); + +/** + * Called by write io after user data were copied into a page. + * + * \see cl_io_operations::cio_commit_write() + */ +int cl_io_commit_write(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, unsigned from, unsigned to) +{ + const struct cl_io_slice *scan; + int result = 0; + + LINVRNT(io->ci_type == CIT_WRITE); + LINVRNT(io->ci_state == CIS_IO_GOING || io->ci_state == CIS_LOCKED); + LINVRNT(cl_io_invariant(io)); + /* + * XXX Uh... not nice. Top level cl_io_commit_write() call (vvp->lov) + * already called cl_page_cache_add(), moving page into CPS_CACHED + * state. Better (and more general) way of dealing with such situation + * is needed. + */ + LASSERT(cl_page_is_owned(page, io) || page->cp_parent != NULL); + LASSERT(cl_page_in_io(page, io)); + + cl_io_for_each(scan, io) { + if (scan->cis_iop->cio_commit_write != NULL) { + const struct cl_page_slice *slice; + + slice = cl_io_slice_page(scan, page); + result = scan->cis_iop->cio_commit_write(env, scan, + slice, + from, to); + if (result != 0) + break; + } + } + LINVRNT(result <= 0); + return result; +} +EXPORT_SYMBOL(cl_io_commit_write); + +/** + * Submits a list of pages for immediate io. 
+ * + * After the function gets returned, The submitted pages are moved to + * queue->c2_qout queue, and queue->c2_qin contain both the pages don't need + * to be submitted, and the pages are errant to submit. + * + * \returns 0 if at least one page was submitted, error code otherwise. + * \see cl_io_operations::cio_submit() + */ +int cl_io_submit_rw(const struct lu_env *env, struct cl_io *io, + enum cl_req_type crt, struct cl_2queue *queue) +{ + const struct cl_io_slice *scan; + int result = 0; + + LINVRNT(crt < ARRAY_SIZE(scan->cis_iop->req_op)); + + cl_io_for_each(scan, io) { + if (scan->cis_iop->req_op[crt].cio_submit == NULL) + continue; + result = scan->cis_iop->req_op[crt].cio_submit(env, scan, crt, + queue); + if (result != 0) + break; + } + /* + * If ->cio_submit() failed, no pages were sent. + */ + LASSERT(ergo(result != 0, list_empty(&queue->c2_qout.pl_pages))); + return result; +} +EXPORT_SYMBOL(cl_io_submit_rw); + +/** + * Submit a sync_io and wait for the IO to be finished, or error happens. + * If \a timeout is zero, it means to wait for the IO unconditionally. + */ +int cl_io_submit_sync(const struct lu_env *env, struct cl_io *io, + enum cl_req_type iot, struct cl_2queue *queue, + long timeout) +{ + struct cl_sync_io *anchor = &cl_env_info(env)->clt_anchor; + struct cl_page *pg; + int rc; + + cl_page_list_for_each(pg, &queue->c2_qin) { + LASSERT(pg->cp_sync_io == NULL); + pg->cp_sync_io = anchor; + } + + cl_sync_io_init(anchor, queue->c2_qin.pl_nr); + rc = cl_io_submit_rw(env, io, iot, queue); + if (rc == 0) { + /* + * If some pages weren't sent for any reason (e.g., + * read found up-to-date pages in the cache, or write found + * clean pages), count them as completed to avoid infinite + * wait. + */ + cl_page_list_for_each(pg, &queue->c2_qin) { + pg->cp_sync_io = NULL; + cl_sync_io_note(anchor, +1); + } + + /* wait for the IO to be finished. */ + rc = cl_sync_io_wait(env, io, &queue->c2_qout, + anchor, timeout); + } else { + LASSERT(list_empty(&queue->c2_qout.pl_pages)); + cl_page_list_for_each(pg, &queue->c2_qin) + pg->cp_sync_io = NULL; + } + return rc; +} +EXPORT_SYMBOL(cl_io_submit_sync); + +/** + * Cancel an IO which has been submitted by cl_io_submit_rw. + */ +int cl_io_cancel(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *queue) +{ + struct cl_page *page; + int result = 0; + + CERROR("Canceling ongoing page transmission\n"); + cl_page_list_for_each(page, queue) { + int rc; + + LINVRNT(cl_page_in_io(page, io)); + rc = cl_page_cancel(env, page); + result = result ?: rc; + } + return result; +} +EXPORT_SYMBOL(cl_io_cancel); + +/** + * Main io loop. + * + * Pumps io through iterations calling + * + * - cl_io_iter_init() + * + * - cl_io_lock() + * + * - cl_io_start() + * + * - cl_io_end() + * + * - cl_io_unlock() + * + * - cl_io_iter_fini() + * + * repeatedly until there is no more io to do. + */ +int cl_io_loop(const struct lu_env *env, struct cl_io *io) +{ + int result = 0; + + LINVRNT(cl_io_is_loopable(io)); + + do { + size_t nob; + + io->ci_continue = 0; + result = cl_io_iter_init(env, io); + if (result == 0) { + nob = io->ci_nob; + result = cl_io_lock(env, io); + if (result == 0) { + /* + * Notify layers that locks has been taken, + * and do actual i/o. + * + * - llite: kms, short read; + * - llite: generic_file_read(); + */ + result = cl_io_start(env, io); + /* + * Send any remaining pending + * io, etc. + * + * - llite: ll_rw_stats_tally. 
+ */ + cl_io_end(env, io); + cl_io_unlock(env, io); + cl_io_rw_advance(env, io, io->ci_nob - nob); + } + } + cl_io_iter_fini(env, io); + } while (result == 0 && io->ci_continue); + if (result == 0) + result = io->ci_result; + return result < 0 ? result : 0; +} +EXPORT_SYMBOL(cl_io_loop); + +/** + * Adds io slice to the cl_io. + * + * This is called by cl_object_operations::coo_io_init() methods to add a + * per-layer state to the io. New state is added at the end of + * cl_io::ci_layers list, that is, it is at the bottom of the stack. + * + * \see cl_lock_slice_add(), cl_req_slice_add(), cl_page_slice_add() + */ +void cl_io_slice_add(struct cl_io *io, struct cl_io_slice *slice, + struct cl_object *obj, + const struct cl_io_operations *ops) +{ + struct list_head *linkage = &slice->cis_linkage; + + LASSERT((linkage->prev == NULL && linkage->next == NULL) || + list_empty(linkage)); + + list_add_tail(linkage, &io->ci_layers); + slice->cis_io = io; + slice->cis_obj = obj; + slice->cis_iop = ops; +} +EXPORT_SYMBOL(cl_io_slice_add); + + +/** + * Initializes page list. + */ +void cl_page_list_init(struct cl_page_list *plist) +{ + plist->pl_nr = 0; + INIT_LIST_HEAD(&plist->pl_pages); + plist->pl_owner = current; +} +EXPORT_SYMBOL(cl_page_list_init); + +/** + * Adds a page to a page list. + */ +void cl_page_list_add(struct cl_page_list *plist, struct cl_page *page) +{ + /* it would be better to check that page is owned by "current" io, but + * it is not passed here. */ + LASSERT(page->cp_owner != NULL); + LINVRNT(plist->pl_owner == current); + + lockdep_off(); + mutex_lock(&page->cp_mutex); + lockdep_on(); + LASSERT(list_empty(&page->cp_batch)); + list_add_tail(&page->cp_batch, &plist->pl_pages); + ++plist->pl_nr; + lu_ref_add_at(&page->cp_reference, &page->cp_queue_ref, "queue", plist); + cl_page_get(page); +} +EXPORT_SYMBOL(cl_page_list_add); + +/** + * Removes a page from a page list. + */ +void cl_page_list_del(const struct lu_env *env, + struct cl_page_list *plist, struct cl_page *page) +{ + LASSERT(plist->pl_nr > 0); + LINVRNT(plist->pl_owner == current); + + list_del_init(&page->cp_batch); + lockdep_off(); + mutex_unlock(&page->cp_mutex); + lockdep_on(); + --plist->pl_nr; + lu_ref_del_at(&page->cp_reference, &page->cp_queue_ref, "queue", plist); + cl_page_put(env, page); +} +EXPORT_SYMBOL(cl_page_list_del); + +/** + * Moves a page from one page list to another. + */ +void cl_page_list_move(struct cl_page_list *dst, struct cl_page_list *src, + struct cl_page *page) +{ + LASSERT(src->pl_nr > 0); + LINVRNT(dst->pl_owner == current); + LINVRNT(src->pl_owner == current); + + list_move_tail(&page->cp_batch, &dst->pl_pages); + --src->pl_nr; + ++dst->pl_nr; + lu_ref_set_at(&page->cp_reference, &page->cp_queue_ref, "queue", + src, dst); +} +EXPORT_SYMBOL(cl_page_list_move); + +/** + * splice the cl_page_list, just as list head does + */ +void cl_page_list_splice(struct cl_page_list *list, struct cl_page_list *head) +{ + struct cl_page *page; + struct cl_page *tmp; + + LINVRNT(list->pl_owner == current); + LINVRNT(head->pl_owner == current); + + cl_page_list_for_each_safe(page, tmp, list) + cl_page_list_move(head, list, page); +} +EXPORT_SYMBOL(cl_page_list_splice); + +void cl_page_disown0(const struct lu_env *env, + struct cl_io *io, struct cl_page *pg); + +/** + * Disowns pages in a queue. 
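+ * Pages are removed from the list and released; cl_page_disown0() is used
+ * because some of them may already be in CPS_FREEING state.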
+ */ +void cl_page_list_disown(const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist) +{ + struct cl_page *page; + struct cl_page *temp; + + LINVRNT(plist->pl_owner == current); + + cl_page_list_for_each_safe(page, temp, plist) { + LASSERT(plist->pl_nr > 0); + + list_del_init(&page->cp_batch); + lockdep_off(); + mutex_unlock(&page->cp_mutex); + lockdep_on(); + --plist->pl_nr; + /* + * cl_page_disown0 rather than usual cl_page_disown() is used, + * because pages are possibly in CPS_FREEING state already due + * to the call to cl_page_list_discard(). + */ + /* + * XXX cl_page_disown0() will fail if page is not locked. + */ + cl_page_disown0(env, io, page); + lu_ref_del_at(&page->cp_reference, &page->cp_queue_ref, "queue", + plist); + cl_page_put(env, page); + } +} +EXPORT_SYMBOL(cl_page_list_disown); + +/** + * Releases pages from queue. + */ +void cl_page_list_fini(const struct lu_env *env, struct cl_page_list *plist) +{ + struct cl_page *page; + struct cl_page *temp; + + LINVRNT(plist->pl_owner == current); + + cl_page_list_for_each_safe(page, temp, plist) + cl_page_list_del(env, plist, page); + LASSERT(plist->pl_nr == 0); +} +EXPORT_SYMBOL(cl_page_list_fini); + +/** + * Owns all pages in a queue. + */ +int cl_page_list_own(const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist) +{ + struct cl_page *page; + struct cl_page *temp; + pgoff_t index = 0; + int result; + + LINVRNT(plist->pl_owner == current); + + result = 0; + cl_page_list_for_each_safe(page, temp, plist) { + LASSERT(index <= page->cp_index); + index = page->cp_index; + if (cl_page_own(env, io, page) == 0) + result = result ?: page->cp_error; + else + cl_page_list_del(env, plist, page); + } + return result; +} +EXPORT_SYMBOL(cl_page_list_own); + +/** + * Assumes all pages in a queue. + */ +void cl_page_list_assume(const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist) +{ + struct cl_page *page; + + LINVRNT(plist->pl_owner == current); + + cl_page_list_for_each(page, plist) + cl_page_assume(env, io, page); +} +EXPORT_SYMBOL(cl_page_list_assume); + +/** + * Discards all pages in a queue. + */ +void cl_page_list_discard(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *plist) +{ + struct cl_page *page; + + LINVRNT(plist->pl_owner == current); + cl_page_list_for_each(page, plist) + cl_page_discard(env, io, page); +} +EXPORT_SYMBOL(cl_page_list_discard); + +/** + * Unmaps all pages in a queue from user virtual memory. + */ +int cl_page_list_unmap(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *plist) +{ + struct cl_page *page; + int result; + + LINVRNT(plist->pl_owner == current); + result = 0; + cl_page_list_for_each(page, plist) { + result = cl_page_unmap(env, io, page); + if (result != 0) + break; + } + return result; +} +EXPORT_SYMBOL(cl_page_list_unmap); + +/** + * Initialize dual page queue. + */ +void cl_2queue_init(struct cl_2queue *queue) +{ + cl_page_list_init(&queue->c2_qin); + cl_page_list_init(&queue->c2_qout); +} +EXPORT_SYMBOL(cl_2queue_init); + +/** + * Add a page to the incoming page list of 2-queue. + */ +void cl_2queue_add(struct cl_2queue *queue, struct cl_page *page) +{ + cl_page_list_add(&queue->c2_qin, page); +} +EXPORT_SYMBOL(cl_2queue_add); + +/** + * Disown pages in both lists of a 2-queue. 
+ */ +void cl_2queue_disown(const struct lu_env *env, + struct cl_io *io, struct cl_2queue *queue) +{ + cl_page_list_disown(env, io, &queue->c2_qin); + cl_page_list_disown(env, io, &queue->c2_qout); +} +EXPORT_SYMBOL(cl_2queue_disown); + +/** + * Discard (truncate) pages in both lists of a 2-queue. + */ +void cl_2queue_discard(const struct lu_env *env, + struct cl_io *io, struct cl_2queue *queue) +{ + cl_page_list_discard(env, io, &queue->c2_qin); + cl_page_list_discard(env, io, &queue->c2_qout); +} +EXPORT_SYMBOL(cl_2queue_discard); + +/** + * Assume to own the pages in cl_2queue + */ +void cl_2queue_assume(const struct lu_env *env, + struct cl_io *io, struct cl_2queue *queue) +{ + cl_page_list_assume(env, io, &queue->c2_qin); + cl_page_list_assume(env, io, &queue->c2_qout); +} +EXPORT_SYMBOL(cl_2queue_assume); + +/** + * Finalize both page lists of a 2-queue. + */ +void cl_2queue_fini(const struct lu_env *env, struct cl_2queue *queue) +{ + cl_page_list_fini(env, &queue->c2_qout); + cl_page_list_fini(env, &queue->c2_qin); +} +EXPORT_SYMBOL(cl_2queue_fini); + +/** + * Initialize a 2-queue to contain \a page in its incoming page list. + */ +void cl_2queue_init_page(struct cl_2queue *queue, struct cl_page *page) +{ + cl_2queue_init(queue); + cl_2queue_add(queue, page); +} +EXPORT_SYMBOL(cl_2queue_init_page); + +/** + * Returns top-level io. + * + * \see cl_object_top(), cl_page_top(). + */ +struct cl_io *cl_io_top(struct cl_io *io) +{ + while (io->ci_parent != NULL) + io = io->ci_parent; + return io; +} +EXPORT_SYMBOL(cl_io_top); + +/** + * Prints human readable representation of \a io to the \a f. + */ +void cl_io_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct cl_io *io) +{ +} + +/** + * Adds request slice to the compound request. + * + * This is called by cl_device_operations::cdo_req_init() methods to add a + * per-layer state to the request. New state is added at the end of + * cl_req::crq_layers list, that is, it is at the bottom of the stack. 
+ * + * \see cl_lock_slice_add(), cl_page_slice_add(), cl_io_slice_add() + */ +void cl_req_slice_add(struct cl_req *req, struct cl_req_slice *slice, + struct cl_device *dev, + const struct cl_req_operations *ops) +{ + list_add_tail(&slice->crs_linkage, &req->crq_layers); + slice->crs_dev = dev; + slice->crs_ops = ops; + slice->crs_req = req; +} +EXPORT_SYMBOL(cl_req_slice_add); + +static void cl_req_free(const struct lu_env *env, struct cl_req *req) +{ + unsigned i; + + LASSERT(list_empty(&req->crq_pages)); + LASSERT(req->crq_nrpages == 0); + LINVRNT(list_empty(&req->crq_layers)); + LINVRNT(equi(req->crq_nrobjs > 0, req->crq_o != NULL)); + + if (req->crq_o != NULL) { + for (i = 0; i < req->crq_nrobjs; ++i) { + struct cl_object *obj = req->crq_o[i].ro_obj; + if (obj != NULL) { + lu_object_ref_del_at(&obj->co_lu, + &req->crq_o[i].ro_obj_ref, + "cl_req", req); + cl_object_put(env, obj); + } + } + OBD_FREE(req->crq_o, req->crq_nrobjs * sizeof(req->crq_o[0])); + } + OBD_FREE_PTR(req); +} + +static int cl_req_init(const struct lu_env *env, struct cl_req *req, + struct cl_page *page) +{ + struct cl_device *dev; + struct cl_page_slice *slice; + int result; + + result = 0; + page = cl_page_top(page); + do { + list_for_each_entry(slice, &page->cp_layers, cpl_linkage) { + dev = lu2cl_dev(slice->cpl_obj->co_lu.lo_dev); + if (dev->cd_ops->cdo_req_init != NULL) { + result = dev->cd_ops->cdo_req_init(env, + dev, req); + if (result != 0) + break; + } + } + page = page->cp_child; + } while (page != NULL && result == 0); + return result; +} + +/** + * Invokes per-request transfer completion call-backs + * (cl_req_operations::cro_completion()) bottom-to-top. + */ +void cl_req_completion(const struct lu_env *env, struct cl_req *req, int rc) +{ + struct cl_req_slice *slice; + + /* + * for the lack of list_for_each_entry_reverse_safe()... + */ + while (!list_empty(&req->crq_layers)) { + slice = list_entry(req->crq_layers.prev, + struct cl_req_slice, crs_linkage); + list_del_init(&slice->crs_linkage); + if (slice->crs_ops->cro_completion != NULL) + slice->crs_ops->cro_completion(env, slice, rc); + } + cl_req_free(env, req); +} +EXPORT_SYMBOL(cl_req_completion); + +/** + * Allocates new transfer request. + */ +struct cl_req *cl_req_alloc(const struct lu_env *env, struct cl_page *page, + enum cl_req_type crt, int nr_objects) +{ + struct cl_req *req; + + LINVRNT(nr_objects > 0); + + OBD_ALLOC_PTR(req); + if (req != NULL) { + int result; + + req->crq_type = crt; + INIT_LIST_HEAD(&req->crq_pages); + INIT_LIST_HEAD(&req->crq_layers); + + OBD_ALLOC(req->crq_o, nr_objects * sizeof(req->crq_o[0])); + if (req->crq_o != NULL) { + req->crq_nrobjs = nr_objects; + result = cl_req_init(env, req, page); + } else + result = -ENOMEM; + if (result != 0) { + cl_req_completion(env, req, result); + req = ERR_PTR(result); + } + } else + req = ERR_PTR(-ENOMEM); + return req; +} +EXPORT_SYMBOL(cl_req_alloc); + +/** + * Adds a page to a request. 
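+ * The top-level page is linked onto cl_req::crq_pages and its top-level
+ * object is recorded (and referenced) in the request's crq_o[] array.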
+ */ +void cl_req_page_add(const struct lu_env *env, + struct cl_req *req, struct cl_page *page) +{ + struct cl_object *obj; + struct cl_req_obj *rqo; + int i; + + page = cl_page_top(page); + + LASSERT(list_empty(&page->cp_flight)); + LASSERT(page->cp_req == NULL); + + CL_PAGE_DEBUG(D_PAGE, env, page, "req %p, %d, %u\n", + req, req->crq_type, req->crq_nrpages); + + list_add_tail(&page->cp_flight, &req->crq_pages); + ++req->crq_nrpages; + page->cp_req = req; + obj = cl_object_top(page->cp_obj); + for (i = 0, rqo = req->crq_o; obj != rqo->ro_obj; ++i, ++rqo) { + if (rqo->ro_obj == NULL) { + rqo->ro_obj = obj; + cl_object_get(obj); + lu_object_ref_add_at(&obj->co_lu, &rqo->ro_obj_ref, + "cl_req", req); + break; + } + } + LASSERT(i < req->crq_nrobjs); +} +EXPORT_SYMBOL(cl_req_page_add); + +/** + * Removes a page from a request. + */ +void cl_req_page_done(const struct lu_env *env, struct cl_page *page) +{ + struct cl_req *req = page->cp_req; + + page = cl_page_top(page); + + LASSERT(!list_empty(&page->cp_flight)); + LASSERT(req->crq_nrpages > 0); + + list_del_init(&page->cp_flight); + --req->crq_nrpages; + page->cp_req = NULL; +} +EXPORT_SYMBOL(cl_req_page_done); + +/** + * Notifies layers that request is about to depart by calling + * cl_req_operations::cro_prep() top-to-bottom. + */ +int cl_req_prep(const struct lu_env *env, struct cl_req *req) +{ + int i; + int result; + const struct cl_req_slice *slice; + + /* + * Check that the caller of cl_req_alloc() didn't lie about the number + * of objects. + */ + for (i = 0; i < req->crq_nrobjs; ++i) + LASSERT(req->crq_o[i].ro_obj != NULL); + + result = 0; + list_for_each_entry(slice, &req->crq_layers, crs_linkage) { + if (slice->crs_ops->cro_prep != NULL) { + result = slice->crs_ops->cro_prep(env, slice); + if (result != 0) + break; + } + } + return result; +} +EXPORT_SYMBOL(cl_req_prep); + +/** + * Fills in attributes that are passed to server together with transfer. Only + * attributes from \a flags may be touched. This can be called multiple times + * for the same request. + */ +void cl_req_attr_set(const struct lu_env *env, struct cl_req *req, + struct cl_req_attr *attr, u64 flags) +{ + const struct cl_req_slice *slice; + struct cl_page *page; + int i; + + LASSERT(!list_empty(&req->crq_pages)); + + /* Take any page to use as a model. */ + page = list_entry(req->crq_pages.next, struct cl_page, cp_flight); + + for (i = 0; i < req->crq_nrobjs; ++i) { + list_for_each_entry(slice, &req->crq_layers, crs_linkage) { + const struct cl_page_slice *scan; + const struct cl_object *obj; + + scan = cl_page_at(page, + slice->crs_dev->cd_lu_dev.ld_type); + LASSERT(scan != NULL); + obj = scan->cpl_obj; + if (slice->crs_ops->cro_attr_set != NULL) + slice->crs_ops->cro_attr_set(env, slice, obj, + attr + i, flags); + } + } +} +EXPORT_SYMBOL(cl_req_attr_set); + +/* XXX complete(), init_completion(), and wait_for_completion(), until they are + * implemented in libcfs. */ +# include + +/** + * Initialize synchronous io wait anchor, for transfer of \a nrpages pages. + */ +void cl_sync_io_init(struct cl_sync_io *anchor, int nrpages) +{ + init_waitqueue_head(&anchor->csi_waitq); + atomic_set(&anchor->csi_sync_nr, nrpages); + atomic_set(&anchor->csi_barrier, nrpages > 0); + anchor->csi_sync_rc = 0; +} +EXPORT_SYMBOL(cl_sync_io_init); + +/** + * Wait until all transfer completes. Transfer completion routine has to call + * cl_sync_io_note() for every page. 
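+ *
+ * A minimal usage sketch (the variable names are illustrative, not taken
+ * from an actual caller):
+ *
+ *     struct cl_sync_io anchor;
+ *
+ *     cl_sync_io_init(&anchor, nr_pages);
+ *     ... submit the pages; every completion eventually calls
+ *         cl_sync_io_note(&anchor, ioret) ...
+ *     rc = cl_sync_io_wait(env, io, queue, &anchor, timeout);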
+ */ +int cl_sync_io_wait(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *queue, struct cl_sync_io *anchor, + long timeout) +{ + struct l_wait_info lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(timeout), + NULL, NULL, NULL); + int rc; + + LASSERT(timeout >= 0); + + rc = l_wait_event(anchor->csi_waitq, + atomic_read(&anchor->csi_sync_nr) == 0, + &lwi); + if (rc < 0) { + CERROR("SYNC IO failed with error: %d, try to cancel %d remaining pages\n", + rc, atomic_read(&anchor->csi_sync_nr)); + + (void)cl_io_cancel(env, io, queue); + + lwi = (struct l_wait_info) { 0 }; + (void)l_wait_event(anchor->csi_waitq, + atomic_read(&anchor->csi_sync_nr) == 0, + &lwi); + } else { + rc = anchor->csi_sync_rc; + } + LASSERT(atomic_read(&anchor->csi_sync_nr) == 0); + cl_page_list_assume(env, io, queue); + + /* wait until cl_sync_io_note() has done wakeup */ + while (unlikely(atomic_read(&anchor->csi_barrier) != 0)) { + cpu_relax(); + } + + POISON(anchor, 0x5a, sizeof(*anchor)); + return rc; +} +EXPORT_SYMBOL(cl_sync_io_wait); + +/** + * Indicate that transfer of a single page completed. + */ +void cl_sync_io_note(struct cl_sync_io *anchor, int ioret) +{ + if (anchor->csi_sync_rc == 0 && ioret < 0) + anchor->csi_sync_rc = ioret; + /* + * Synchronous IO done without releasing page lock (e.g., as a part of + * ->{prepare,commit}_write(). Completion is used to signal the end of + * IO. + */ + LASSERT(atomic_read(&anchor->csi_sync_nr) > 0); + if (atomic_dec_and_test(&anchor->csi_sync_nr)) { + wake_up_all(&anchor->csi_waitq); + /* it's safe to nuke or reuse anchor now */ + atomic_set(&anchor->csi_barrier, 0); + } +} +EXPORT_SYMBOL(cl_sync_io_note); diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/cl_lock.c b/kernel/drivers/staging/lustre/lustre/obdclass/cl_lock.c new file mode 100644 index 000000000..b081167f9 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/cl_lock.c @@ -0,0 +1,2239 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Client Extent Lock. 
+ *
+ * Author: Nikita Danilov
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include "../include/obd_class.h"
+#include "../include/obd_support.h"
+#include "../include/lustre_fid.h"
+#include <linux/list.h>
+#include "../include/cl_object.h"
+#include "cl_internal.h"
+
+/** Lock class of cl_lock::cll_guard */
+static struct lock_class_key cl_lock_guard_class;
+static struct kmem_cache *cl_lock_kmem;
+
+static struct lu_kmem_descr cl_lock_caches[] = {
+        {
+                .ckd_cache = &cl_lock_kmem,
+                .ckd_name = "cl_lock_kmem",
+                .ckd_size = sizeof (struct cl_lock)
+        },
+        {
+                .ckd_cache = NULL
+        }
+};
+
+#define CS_LOCK_INC(o, item)
+#define CS_LOCK_DEC(o, item)
+#define CS_LOCKSTATE_INC(o, state)
+#define CS_LOCKSTATE_DEC(o, state)
+
+/**
+ * Basic lock invariant that is maintained at all times. Caller either has a
+ * reference to \a lock, or somehow assures that \a lock cannot be freed.
+ *
+ * \see cl_lock_invariant()
+ */
+static int cl_lock_invariant_trusted(const struct lu_env *env,
+                                     const struct cl_lock *lock)
+{
+        return ergo(lock->cll_state == CLS_FREEING, lock->cll_holds == 0) &&
+                atomic_read(&lock->cll_ref) >= lock->cll_holds &&
+                lock->cll_holds >= lock->cll_users &&
+                lock->cll_holds >= 0 &&
+                lock->cll_users >= 0 &&
+                lock->cll_depth >= 0;
+}
+
+/**
+ * Stronger lock invariant, checking that caller has a reference on a lock.
+ *
+ * \see cl_lock_invariant_trusted()
+ */
+static int cl_lock_invariant(const struct lu_env *env,
+                             const struct cl_lock *lock)
+{
+        int result;
+
+        result = atomic_read(&lock->cll_ref) > 0 &&
+                cl_lock_invariant_trusted(env, lock);
+        if (!result && env != NULL)
+                CL_LOCK_DEBUG(D_ERROR, env, lock, "invariant broken");
+        return result;
+}
+
+/**
+ * Returns lock "nesting": 0 for a top-lock and 1 for a sub-lock.
+ */
+static enum clt_nesting_level cl_lock_nesting(const struct cl_lock *lock)
+{
+        return cl_object_header(lock->cll_descr.cld_obj)->coh_nesting;
+}
+
+/**
+ * Returns a set of counters for this lock, depending on a lock nesting.
+ */ +static struct cl_thread_counters *cl_lock_counters(const struct lu_env *env, + const struct cl_lock *lock) +{ + struct cl_thread_info *info; + enum clt_nesting_level nesting; + + info = cl_env_info(env); + nesting = cl_lock_nesting(lock); + LASSERT(nesting < ARRAY_SIZE(info->clt_counters)); + return &info->clt_counters[nesting]; +} + +static void cl_lock_trace0(int level, const struct lu_env *env, + const char *prefix, const struct cl_lock *lock, + const char *func, const int line) +{ + struct cl_object_header *h = cl_object_header(lock->cll_descr.cld_obj); + CDEBUG(level, "%s: %p@(%d %p %d %d %d %d %d %lx)(%p/%d/%d) at %s():%d\n", + prefix, lock, atomic_read(&lock->cll_ref), + lock->cll_guarder, lock->cll_depth, + lock->cll_state, lock->cll_error, lock->cll_holds, + lock->cll_users, lock->cll_flags, + env, h->coh_nesting, cl_lock_nr_mutexed(env), + func, line); +} +#define cl_lock_trace(level, env, prefix, lock) \ + cl_lock_trace0(level, env, prefix, lock, __func__, __LINE__) + +#define RETIP ((unsigned long)__builtin_return_address(0)) + +#ifdef CONFIG_LOCKDEP +static struct lock_class_key cl_lock_key; + +static void cl_lock_lockdep_init(struct cl_lock *lock) +{ + lockdep_set_class_and_name(lock, &cl_lock_key, "EXT"); +} + +static void cl_lock_lockdep_acquire(const struct lu_env *env, + struct cl_lock *lock, __u32 enqflags) +{ + cl_lock_counters(env, lock)->ctc_nr_locks_acquired++; + lock_map_acquire(&lock->dep_map); +} + +static void cl_lock_lockdep_release(const struct lu_env *env, + struct cl_lock *lock) +{ + cl_lock_counters(env, lock)->ctc_nr_locks_acquired--; + lock_release(&lock->dep_map, 0, RETIP); +} + +#else /* !CONFIG_LOCKDEP */ + +static void cl_lock_lockdep_init(struct cl_lock *lock) +{} +static void cl_lock_lockdep_acquire(const struct lu_env *env, + struct cl_lock *lock, __u32 enqflags) +{} +static void cl_lock_lockdep_release(const struct lu_env *env, + struct cl_lock *lock) +{} + +#endif /* !CONFIG_LOCKDEP */ + +/** + * Adds lock slice to the compound lock. + * + * This is called by cl_object_operations::coo_lock_init() methods to add a + * per-layer state to the lock. New state is added at the end of + * cl_lock::cll_layers list, that is, it is at the bottom of the stack. + * + * \see cl_req_slice_add(), cl_page_slice_add(), cl_io_slice_add() + */ +void cl_lock_slice_add(struct cl_lock *lock, struct cl_lock_slice *slice, + struct cl_object *obj, + const struct cl_lock_operations *ops) +{ + slice->cls_lock = lock; + list_add_tail(&slice->cls_linkage, &lock->cll_layers); + slice->cls_obj = obj; + slice->cls_ops = ops; +} +EXPORT_SYMBOL(cl_lock_slice_add); + +/** + * Returns true iff a lock with the mode \a has provides at least the same + * guarantees as a lock with the mode \a need. + */ +int cl_lock_mode_match(enum cl_lock_mode has, enum cl_lock_mode need) +{ + LINVRNT(need == CLM_READ || need == CLM_WRITE || + need == CLM_PHANTOM || need == CLM_GROUP); + LINVRNT(has == CLM_READ || has == CLM_WRITE || + has == CLM_PHANTOM || has == CLM_GROUP); + CLASSERT(CLM_PHANTOM < CLM_READ); + CLASSERT(CLM_READ < CLM_WRITE); + CLASSERT(CLM_WRITE < CLM_GROUP); + + if (has != CLM_GROUP) + return need <= has; + else + return need == has; +} +EXPORT_SYMBOL(cl_lock_mode_match); + +/** + * Returns true iff extent portions of lock descriptions match. 
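+ *
+ * For example (hypothetical values, mirroring the checks in the function
+ * body): a lock covering [0, 99] in CLM_WRITE mode matches a need for
+ * [10, 20] in CLM_READ mode, whereas a CLM_GROUP lock only matches a need
+ * of the same mode and the same gid.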
+ */ +int cl_lock_ext_match(const struct cl_lock_descr *has, + const struct cl_lock_descr *need) +{ + return + has->cld_start <= need->cld_start && + has->cld_end >= need->cld_end && + cl_lock_mode_match(has->cld_mode, need->cld_mode) && + (has->cld_mode != CLM_GROUP || has->cld_gid == need->cld_gid); +} +EXPORT_SYMBOL(cl_lock_ext_match); + +/** + * Returns true iff a lock with the description \a has provides at least the + * same guarantees as a lock with the description \a need. + */ +int cl_lock_descr_match(const struct cl_lock_descr *has, + const struct cl_lock_descr *need) +{ + return + cl_object_same(has->cld_obj, need->cld_obj) && + cl_lock_ext_match(has, need); +} +EXPORT_SYMBOL(cl_lock_descr_match); + +static void cl_lock_free(const struct lu_env *env, struct cl_lock *lock) +{ + struct cl_object *obj = lock->cll_descr.cld_obj; + + LINVRNT(!cl_lock_is_mutexed(lock)); + + cl_lock_trace(D_DLMTRACE, env, "free lock", lock); + might_sleep(); + while (!list_empty(&lock->cll_layers)) { + struct cl_lock_slice *slice; + + slice = list_entry(lock->cll_layers.next, + struct cl_lock_slice, cls_linkage); + list_del_init(lock->cll_layers.next); + slice->cls_ops->clo_fini(env, slice); + } + CS_LOCK_DEC(obj, total); + CS_LOCKSTATE_DEC(obj, lock->cll_state); + lu_object_ref_del_at(&obj->co_lu, &lock->cll_obj_ref, "cl_lock", lock); + cl_object_put(env, obj); + lu_ref_fini(&lock->cll_reference); + lu_ref_fini(&lock->cll_holders); + mutex_destroy(&lock->cll_guard); + OBD_SLAB_FREE_PTR(lock, cl_lock_kmem); +} + +/** + * Releases a reference on a lock. + * + * When last reference is released, lock is returned to the cache, unless it + * is in cl_lock_state::CLS_FREEING state, in which case it is destroyed + * immediately. + * + * \see cl_object_put(), cl_page_put() + */ +void cl_lock_put(const struct lu_env *env, struct cl_lock *lock) +{ + struct cl_object *obj; + + LINVRNT(cl_lock_invariant(env, lock)); + obj = lock->cll_descr.cld_obj; + LINVRNT(obj != NULL); + + CDEBUG(D_TRACE, "releasing reference: %d %p %lu\n", + atomic_read(&lock->cll_ref), lock, RETIP); + + if (atomic_dec_and_test(&lock->cll_ref)) { + if (lock->cll_state == CLS_FREEING) { + LASSERT(list_empty(&lock->cll_linkage)); + cl_lock_free(env, lock); + } + CS_LOCK_DEC(obj, busy); + } +} +EXPORT_SYMBOL(cl_lock_put); + +/** + * Acquires an additional reference to a lock. + * + * This can be called only by caller already possessing a reference to \a + * lock. + * + * \see cl_object_get(), cl_page_get() + */ +void cl_lock_get(struct cl_lock *lock) +{ + LINVRNT(cl_lock_invariant(NULL, lock)); + CDEBUG(D_TRACE, "acquiring reference: %d %p %lu\n", + atomic_read(&lock->cll_ref), lock, RETIP); + atomic_inc(&lock->cll_ref); +} +EXPORT_SYMBOL(cl_lock_get); + +/** + * Acquires a reference to a lock. + * + * This is much like cl_lock_get(), except that this function can be used to + * acquire initial reference to the cached lock. Caller has to deal with all + * possible races. Use with care! + * + * \see cl_page_get_trust() + */ +void cl_lock_get_trust(struct cl_lock *lock) +{ + CDEBUG(D_TRACE, "acquiring trusted reference: %d %p %lu\n", + atomic_read(&lock->cll_ref), lock, RETIP); + if (atomic_inc_return(&lock->cll_ref) == 1) + CS_LOCK_INC(lock->cll_descr.cld_obj, busy); +} +EXPORT_SYMBOL(cl_lock_get_trust); + +/** + * Helper function destroying the lock that wasn't completely initialized. + * + * Other threads can acquire references to the top-lock through its + * sub-locks. Hence, it cannot be cl_lock_free()-ed immediately. 
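+ *
+ * Instead, the lock is cancelled and deleted under its own mutex and the
+ * caller's reference is dropped, so the final cl_lock_put() (possibly from
+ * another thread) performs the actual freeing.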
+ */ +static void cl_lock_finish(const struct lu_env *env, struct cl_lock *lock) +{ + cl_lock_mutex_get(env, lock); + cl_lock_cancel(env, lock); + cl_lock_delete(env, lock); + cl_lock_mutex_put(env, lock); + cl_lock_put(env, lock); +} + +static struct cl_lock *cl_lock_alloc(const struct lu_env *env, + struct cl_object *obj, + const struct cl_io *io, + const struct cl_lock_descr *descr) +{ + struct cl_lock *lock; + struct lu_object_header *head; + + OBD_SLAB_ALLOC_PTR_GFP(lock, cl_lock_kmem, GFP_NOFS); + if (lock != NULL) { + atomic_set(&lock->cll_ref, 1); + lock->cll_descr = *descr; + lock->cll_state = CLS_NEW; + cl_object_get(obj); + lu_object_ref_add_at(&obj->co_lu, &lock->cll_obj_ref, "cl_lock", + lock); + INIT_LIST_HEAD(&lock->cll_layers); + INIT_LIST_HEAD(&lock->cll_linkage); + INIT_LIST_HEAD(&lock->cll_inclosure); + lu_ref_init(&lock->cll_reference); + lu_ref_init(&lock->cll_holders); + mutex_init(&lock->cll_guard); + lockdep_set_class(&lock->cll_guard, &cl_lock_guard_class); + init_waitqueue_head(&lock->cll_wq); + head = obj->co_lu.lo_header; + CS_LOCKSTATE_INC(obj, CLS_NEW); + CS_LOCK_INC(obj, total); + CS_LOCK_INC(obj, create); + cl_lock_lockdep_init(lock); + list_for_each_entry(obj, &head->loh_layers, + co_lu.lo_linkage) { + int err; + + err = obj->co_ops->coo_lock_init(env, obj, lock, io); + if (err != 0) { + cl_lock_finish(env, lock); + lock = ERR_PTR(err); + break; + } + } + } else + lock = ERR_PTR(-ENOMEM); + return lock; +} + +/** + * Transfer the lock into INTRANSIT state and return the original state. + * + * \pre state: CLS_CACHED, CLS_HELD or CLS_ENQUEUED + * \post state: CLS_INTRANSIT + * \see CLS_INTRANSIT + */ +enum cl_lock_state cl_lock_intransit(const struct lu_env *env, + struct cl_lock *lock) +{ + enum cl_lock_state state = lock->cll_state; + + LASSERT(cl_lock_is_mutexed(lock)); + LASSERT(state != CLS_INTRANSIT); + LASSERTF(state >= CLS_ENQUEUED && state <= CLS_CACHED, + "Malformed lock state %d.\n", state); + + cl_lock_state_set(env, lock, CLS_INTRANSIT); + lock->cll_intransit_owner = current; + cl_lock_hold_add(env, lock, "intransit", current); + return state; +} +EXPORT_SYMBOL(cl_lock_intransit); + +/** + * Exit the intransit state and restore the lock state to the original state + */ +void cl_lock_extransit(const struct lu_env *env, struct cl_lock *lock, + enum cl_lock_state state) +{ + LASSERT(cl_lock_is_mutexed(lock)); + LASSERT(lock->cll_state == CLS_INTRANSIT); + LASSERT(state != CLS_INTRANSIT); + LASSERT(lock->cll_intransit_owner == current); + + lock->cll_intransit_owner = NULL; + cl_lock_state_set(env, lock, state); + cl_lock_unhold(env, lock, "intransit", current); +} +EXPORT_SYMBOL(cl_lock_extransit); + +/** + * Checking whether the lock is intransit state + */ +int cl_lock_is_intransit(struct cl_lock *lock) +{ + LASSERT(cl_lock_is_mutexed(lock)); + return lock->cll_state == CLS_INTRANSIT && + lock->cll_intransit_owner != current; +} +EXPORT_SYMBOL(cl_lock_is_intransit); +/** + * Returns true iff lock is "suitable" for given io. E.g., locks acquired by + * truncate and O_APPEND cannot be reused for read/non-append-write, as they + * cover multiple stripes and can trigger cascading timeouts. 
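+ *
+ * The decision is delegated to the layers: each slice may veto reuse via
+ * its cl_lock_operations::clo_fits_into() method, and the lock fits only
+ * if no layer refuses.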
+ */ +static int cl_lock_fits_into(const struct lu_env *env, + const struct cl_lock *lock, + const struct cl_lock_descr *need, + const struct cl_io *io) +{ + const struct cl_lock_slice *slice; + + LINVRNT(cl_lock_invariant_trusted(env, lock)); + list_for_each_entry(slice, &lock->cll_layers, cls_linkage) { + if (slice->cls_ops->clo_fits_into != NULL && + !slice->cls_ops->clo_fits_into(env, slice, need, io)) + return 0; + } + return 1; +} + +static struct cl_lock *cl_lock_lookup(const struct lu_env *env, + struct cl_object *obj, + const struct cl_io *io, + const struct cl_lock_descr *need) +{ + struct cl_lock *lock; + struct cl_object_header *head; + + head = cl_object_header(obj); + assert_spin_locked(&head->coh_lock_guard); + CS_LOCK_INC(obj, lookup); + list_for_each_entry(lock, &head->coh_locks, cll_linkage) { + int matched; + + matched = cl_lock_ext_match(&lock->cll_descr, need) && + lock->cll_state < CLS_FREEING && + lock->cll_error == 0 && + !(lock->cll_flags & CLF_CANCELLED) && + cl_lock_fits_into(env, lock, need, io); + CDEBUG(D_DLMTRACE, "has: "DDESCR"(%d) need: "DDESCR": %d\n", + PDESCR(&lock->cll_descr), lock->cll_state, PDESCR(need), + matched); + if (matched) { + cl_lock_get_trust(lock); + CS_LOCK_INC(obj, hit); + return lock; + } + } + return NULL; +} + +/** + * Returns a lock matching description \a need. + * + * This is the main entry point into the cl_lock caching interface. First, a + * cache (implemented as a per-object linked list) is consulted. If lock is + * found there, it is returned immediately. Otherwise new lock is allocated + * and returned. In any case, additional reference to lock is acquired. + * + * \see cl_object_find(), cl_page_find() + */ +static struct cl_lock *cl_lock_find(const struct lu_env *env, + const struct cl_io *io, + const struct cl_lock_descr *need) +{ + struct cl_object_header *head; + struct cl_object *obj; + struct cl_lock *lock; + + obj = need->cld_obj; + head = cl_object_header(obj); + + spin_lock(&head->coh_lock_guard); + lock = cl_lock_lookup(env, obj, io, need); + spin_unlock(&head->coh_lock_guard); + + if (lock == NULL) { + lock = cl_lock_alloc(env, obj, io, need); + if (!IS_ERR(lock)) { + struct cl_lock *ghost; + + spin_lock(&head->coh_lock_guard); + ghost = cl_lock_lookup(env, obj, io, need); + if (ghost == NULL) { + cl_lock_get_trust(lock); + list_add_tail(&lock->cll_linkage, + &head->coh_locks); + spin_unlock(&head->coh_lock_guard); + CS_LOCK_INC(obj, busy); + } else { + spin_unlock(&head->coh_lock_guard); + /* + * Other threads can acquire references to the + * top-lock through its sub-locks. Hence, it + * cannot be cl_lock_free()-ed immediately. + */ + cl_lock_finish(env, lock); + lock = ghost; + } + } + } + return lock; +} + +/** + * Returns existing lock matching given description. This is similar to + * cl_lock_find() except that no new lock is created, and returned lock is + * guaranteed to be in enum cl_lock_state::CLS_HELD state. + */ +struct cl_lock *cl_lock_peek(const struct lu_env *env, const struct cl_io *io, + const struct cl_lock_descr *need, + const char *scope, const void *source) +{ + struct cl_object_header *head; + struct cl_object *obj; + struct cl_lock *lock; + + obj = need->cld_obj; + head = cl_object_header(obj); + + do { + spin_lock(&head->coh_lock_guard); + lock = cl_lock_lookup(env, obj, io, need); + spin_unlock(&head->coh_lock_guard); + if (lock == NULL) + return NULL; + + cl_lock_mutex_get(env, lock); + if (lock->cll_state == CLS_INTRANSIT) + /* Don't care return value. 
*/ + cl_lock_state_wait(env, lock); + if (lock->cll_state == CLS_FREEING) { + cl_lock_mutex_put(env, lock); + cl_lock_put(env, lock); + lock = NULL; + } + } while (lock == NULL); + + cl_lock_hold_add(env, lock, scope, source); + cl_lock_user_add(env, lock); + if (lock->cll_state == CLS_CACHED) + cl_use_try(env, lock, 1); + if (lock->cll_state == CLS_HELD) { + cl_lock_mutex_put(env, lock); + cl_lock_lockdep_acquire(env, lock, 0); + cl_lock_put(env, lock); + } else { + cl_unuse_try(env, lock); + cl_lock_unhold(env, lock, scope, source); + cl_lock_mutex_put(env, lock); + cl_lock_put(env, lock); + lock = NULL; + } + + return lock; +} +EXPORT_SYMBOL(cl_lock_peek); + +/** + * Returns a slice within a lock, corresponding to the given layer in the + * device stack. + * + * \see cl_page_at() + */ +const struct cl_lock_slice *cl_lock_at(const struct cl_lock *lock, + const struct lu_device_type *dtype) +{ + const struct cl_lock_slice *slice; + + LINVRNT(cl_lock_invariant_trusted(NULL, lock)); + + list_for_each_entry(slice, &lock->cll_layers, cls_linkage) { + if (slice->cls_obj->co_lu.lo_dev->ld_type == dtype) + return slice; + } + return NULL; +} +EXPORT_SYMBOL(cl_lock_at); + +static void cl_lock_mutex_tail(const struct lu_env *env, struct cl_lock *lock) +{ + struct cl_thread_counters *counters; + + counters = cl_lock_counters(env, lock); + lock->cll_depth++; + counters->ctc_nr_locks_locked++; + lu_ref_add(&counters->ctc_locks_locked, "cll_guard", lock); + cl_lock_trace(D_TRACE, env, "got mutex", lock); +} + +/** + * Locks cl_lock object. + * + * This is used to manipulate cl_lock fields, and to serialize state + * transitions in the lock state machine. + * + * \post cl_lock_is_mutexed(lock) + * + * \see cl_lock_mutex_put() + */ +void cl_lock_mutex_get(const struct lu_env *env, struct cl_lock *lock) +{ + LINVRNT(cl_lock_invariant(env, lock)); + + if (lock->cll_guarder == current) { + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(lock->cll_depth > 0); + } else { + struct cl_object_header *hdr; + struct cl_thread_info *info; + int i; + + LINVRNT(lock->cll_guarder != current); + hdr = cl_object_header(lock->cll_descr.cld_obj); + /* + * Check that mutices are taken in the bottom-to-top order. + */ + info = cl_env_info(env); + for (i = 0; i < hdr->coh_nesting; ++i) + LASSERT(info->clt_counters[i].ctc_nr_locks_locked == 0); + mutex_lock_nested(&lock->cll_guard, hdr->coh_nesting); + lock->cll_guarder = current; + LINVRNT(lock->cll_depth == 0); + } + cl_lock_mutex_tail(env, lock); +} +EXPORT_SYMBOL(cl_lock_mutex_get); + +/** + * Try-locks cl_lock object. + * + * \retval 0 \a lock was successfully locked + * + * \retval -EBUSY \a lock cannot be locked right now + * + * \post ergo(result == 0, cl_lock_is_mutexed(lock)) + * + * \see cl_lock_mutex_get() + */ +int cl_lock_mutex_try(const struct lu_env *env, struct cl_lock *lock) +{ + int result; + + LINVRNT(cl_lock_invariant_trusted(env, lock)); + + result = 0; + if (lock->cll_guarder == current) { + LINVRNT(lock->cll_depth > 0); + cl_lock_mutex_tail(env, lock); + } else if (mutex_trylock(&lock->cll_guard)) { + LINVRNT(lock->cll_depth == 0); + lock->cll_guarder = current; + cl_lock_mutex_tail(env, lock); + } else + result = -EBUSY; + return result; +} +EXPORT_SYMBOL(cl_lock_mutex_try); + +/** + {* Unlocks cl_lock object. 
+ * + * \pre cl_lock_is_mutexed(lock) + * + * \see cl_lock_mutex_get() + */ +void cl_lock_mutex_put(const struct lu_env *env, struct cl_lock *lock) +{ + struct cl_thread_counters *counters; + + LINVRNT(cl_lock_invariant(env, lock)); + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(lock->cll_guarder == current); + LINVRNT(lock->cll_depth > 0); + + counters = cl_lock_counters(env, lock); + LINVRNT(counters->ctc_nr_locks_locked > 0); + + cl_lock_trace(D_TRACE, env, "put mutex", lock); + lu_ref_del(&counters->ctc_locks_locked, "cll_guard", lock); + counters->ctc_nr_locks_locked--; + if (--lock->cll_depth == 0) { + lock->cll_guarder = NULL; + mutex_unlock(&lock->cll_guard); + } +} +EXPORT_SYMBOL(cl_lock_mutex_put); + +/** + * Returns true iff lock's mutex is owned by the current thread. + */ +int cl_lock_is_mutexed(struct cl_lock *lock) +{ + return lock->cll_guarder == current; +} +EXPORT_SYMBOL(cl_lock_is_mutexed); + +/** + * Returns number of cl_lock mutices held by the current thread (environment). + */ +int cl_lock_nr_mutexed(const struct lu_env *env) +{ + struct cl_thread_info *info; + int i; + int locked; + + /* + * NOTE: if summation across all nesting levels (currently 2) proves + * too expensive, a summary counter can be added to + * struct cl_thread_info. + */ + info = cl_env_info(env); + for (i = 0, locked = 0; i < ARRAY_SIZE(info->clt_counters); ++i) + locked += info->clt_counters[i].ctc_nr_locks_locked; + return locked; +} +EXPORT_SYMBOL(cl_lock_nr_mutexed); + +static void cl_lock_cancel0(const struct lu_env *env, struct cl_lock *lock) +{ + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + if (!(lock->cll_flags & CLF_CANCELLED)) { + const struct cl_lock_slice *slice; + + lock->cll_flags |= CLF_CANCELLED; + list_for_each_entry_reverse(slice, &lock->cll_layers, + cls_linkage) { + if (slice->cls_ops->clo_cancel != NULL) + slice->cls_ops->clo_cancel(env, slice); + } + } +} + +static void cl_lock_delete0(const struct lu_env *env, struct cl_lock *lock) +{ + struct cl_object_header *head; + const struct cl_lock_slice *slice; + + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + + if (lock->cll_state < CLS_FREEING) { + bool in_cache; + + LASSERT(lock->cll_state != CLS_INTRANSIT); + cl_lock_state_set(env, lock, CLS_FREEING); + + head = cl_object_header(lock->cll_descr.cld_obj); + + spin_lock(&head->coh_lock_guard); + in_cache = !list_empty(&lock->cll_linkage); + if (in_cache) + list_del_init(&lock->cll_linkage); + spin_unlock(&head->coh_lock_guard); + + if (in_cache) /* coh_locks cache holds a refcount. */ + cl_lock_put(env, lock); + + /* + * From now on, no new references to this lock can be acquired + * by cl_lock_lookup(). + */ + list_for_each_entry_reverse(slice, &lock->cll_layers, + cls_linkage) { + if (slice->cls_ops->clo_delete != NULL) + slice->cls_ops->clo_delete(env, slice); + } + /* + * From now on, no new references to this lock can be acquired + * by layer-specific means (like a pointer from struct + * ldlm_lock in osc, or a pointer from top-lock to sub-lock in + * lov). + * + * Lock will be finally freed in cl_lock_put() when last of + * existing references goes away. + */ + } +} + +/** + * Mod(ifie)s cl_lock::cll_holds counter for a given lock. Also, for a + * top-lock (nesting == 0) accounts for this modification in the per-thread + * debugging counters. Sub-lock holds can be released by a thread different + * from one that acquired it. 
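+ *
+ * Note the counter hierarchy maintained by cl_lock_invariant_trusted():
+ * cll_ref >= cll_holds >= cll_users >= 0, i.e. every user implies a hold
+ * and every hold implies a reference.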
+ */ +static void cl_lock_hold_mod(const struct lu_env *env, struct cl_lock *lock, + int delta) +{ + struct cl_thread_counters *counters; + enum clt_nesting_level nesting; + + lock->cll_holds += delta; + nesting = cl_lock_nesting(lock); + if (nesting == CNL_TOP) { + counters = &cl_env_info(env)->clt_counters[CNL_TOP]; + counters->ctc_nr_held += delta; + LASSERT(counters->ctc_nr_held >= 0); + } +} + +/** + * Mod(ifie)s cl_lock::cll_users counter for a given lock. See + * cl_lock_hold_mod() for the explanation of the debugging code. + */ +static void cl_lock_used_mod(const struct lu_env *env, struct cl_lock *lock, + int delta) +{ + struct cl_thread_counters *counters; + enum clt_nesting_level nesting; + + lock->cll_users += delta; + nesting = cl_lock_nesting(lock); + if (nesting == CNL_TOP) { + counters = &cl_env_info(env)->clt_counters[CNL_TOP]; + counters->ctc_nr_used += delta; + LASSERT(counters->ctc_nr_used >= 0); + } +} + +void cl_lock_hold_release(const struct lu_env *env, struct cl_lock *lock, + const char *scope, const void *source) +{ + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + LASSERT(lock->cll_holds > 0); + + cl_lock_trace(D_DLMTRACE, env, "hold release lock", lock); + lu_ref_del(&lock->cll_holders, scope, source); + cl_lock_hold_mod(env, lock, -1); + if (lock->cll_holds == 0) { + CL_LOCK_ASSERT(lock->cll_state != CLS_HELD, env, lock); + if (lock->cll_descr.cld_mode == CLM_PHANTOM || + lock->cll_descr.cld_mode == CLM_GROUP || + lock->cll_state != CLS_CACHED) + /* + * If lock is still phantom or grouplock when user is + * done with it---destroy the lock. + */ + lock->cll_flags |= CLF_CANCELPEND|CLF_DOOMED; + if (lock->cll_flags & CLF_CANCELPEND) { + lock->cll_flags &= ~CLF_CANCELPEND; + cl_lock_cancel0(env, lock); + } + if (lock->cll_flags & CLF_DOOMED) { + /* no longer doomed: it's dead... Jim. */ + lock->cll_flags &= ~CLF_DOOMED; + cl_lock_delete0(env, lock); + } + } +} +EXPORT_SYMBOL(cl_lock_hold_release); + +/** + * Waits until lock state is changed. + * + * This function is called with cl_lock mutex locked, atomically releases + * mutex and goes to sleep, waiting for a lock state change (signaled by + * cl_lock_signal()), and re-acquires the mutex before return. + * + * This function is used to wait until lock state machine makes some progress + * and to emulate synchronous operations on top of asynchronous lock + * interface. + * + * \retval -EINTR wait was interrupted + * + * \retval 0 wait wasn't interrupted + * + * \pre cl_lock_is_mutexed(lock) + * + * \see cl_lock_signal() + */ +int cl_lock_state_wait(const struct lu_env *env, struct cl_lock *lock) +{ + wait_queue_t waiter; + sigset_t blocked; + int result; + + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + LASSERT(lock->cll_depth == 1); + LASSERT(lock->cll_state != CLS_FREEING); /* too late to wait */ + + cl_lock_trace(D_DLMTRACE, env, "state wait lock", lock); + result = lock->cll_error; + if (result == 0) { + /* To avoid being interrupted by the 'non-fatal' signals + * (SIGCHLD, for instance), we'd block them temporarily. 
+ * LU-305 */ + blocked = cfs_block_sigsinv(LUSTRE_FATAL_SIGS); + + init_waitqueue_entry(&waiter, current); + add_wait_queue(&lock->cll_wq, &waiter); + set_current_state(TASK_INTERRUPTIBLE); + cl_lock_mutex_put(env, lock); + + LASSERT(cl_lock_nr_mutexed(env) == 0); + + /* Returning ERESTARTSYS instead of EINTR so syscalls + * can be restarted if signals are pending here */ + result = -ERESTARTSYS; + if (likely(!OBD_FAIL_CHECK(OBD_FAIL_LOCK_STATE_WAIT_INTR))) { + schedule(); + if (!cfs_signal_pending()) + result = 0; + } + + cl_lock_mutex_get(env, lock); + set_current_state(TASK_RUNNING); + remove_wait_queue(&lock->cll_wq, &waiter); + + /* Restore old blocked signals */ + cfs_restore_sigs(blocked); + } + return result; +} +EXPORT_SYMBOL(cl_lock_state_wait); + +static void cl_lock_state_signal(const struct lu_env *env, struct cl_lock *lock, + enum cl_lock_state state) +{ + const struct cl_lock_slice *slice; + + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + + list_for_each_entry(slice, &lock->cll_layers, cls_linkage) + if (slice->cls_ops->clo_state != NULL) + slice->cls_ops->clo_state(env, slice, state); + wake_up_all(&lock->cll_wq); +} + +/** + * Notifies waiters that lock state changed. + * + * Wakes up all waiters sleeping in cl_lock_state_wait(), also notifies all + * layers about state change by calling cl_lock_operations::clo_state() + * top-to-bottom. + */ +void cl_lock_signal(const struct lu_env *env, struct cl_lock *lock) +{ + cl_lock_trace(D_DLMTRACE, env, "state signal lock", lock); + cl_lock_state_signal(env, lock, lock->cll_state); +} +EXPORT_SYMBOL(cl_lock_signal); + +/** + * Changes lock state. + * + * This function is invoked to notify layers that lock state changed, possible + * as a result of an asynchronous event such as call-back reception. + * + * \post lock->cll_state == state + * + * \see cl_lock_operations::clo_state() + */ +void cl_lock_state_set(const struct lu_env *env, struct cl_lock *lock, + enum cl_lock_state state) +{ + LASSERT(lock->cll_state <= state || + (lock->cll_state == CLS_CACHED && + (state == CLS_HELD || /* lock found in cache */ + state == CLS_NEW || /* sub-lock canceled */ + state == CLS_INTRANSIT)) || + /* lock is in transit state */ + lock->cll_state == CLS_INTRANSIT); + + if (lock->cll_state != state) { + CS_LOCKSTATE_DEC(lock->cll_descr.cld_obj, lock->cll_state); + CS_LOCKSTATE_INC(lock->cll_descr.cld_obj, state); + + cl_lock_state_signal(env, lock, state); + lock->cll_state = state; + } +} +EXPORT_SYMBOL(cl_lock_state_set); + +static int cl_unuse_try_internal(const struct lu_env *env, struct cl_lock *lock) +{ + const struct cl_lock_slice *slice; + int result; + + do { + result = 0; + + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + LASSERT(lock->cll_state == CLS_INTRANSIT); + + result = -ENOSYS; + list_for_each_entry_reverse(slice, &lock->cll_layers, + cls_linkage) { + if (slice->cls_ops->clo_unuse != NULL) { + result = slice->cls_ops->clo_unuse(env, slice); + if (result != 0) + break; + } + } + LASSERT(result != -ENOSYS); + } while (result == CLO_REPEAT); + + return result; +} + +/** + * Yanks lock from the cache (cl_lock_state::CLS_CACHED state) by calling + * cl_lock_operations::clo_use() top-to-bottom to notify layers. 
+ * @atomic = 1, it must unuse the lock to recovery the lock to keep the + * use process atomic + */ +int cl_use_try(const struct lu_env *env, struct cl_lock *lock, int atomic) +{ + const struct cl_lock_slice *slice; + int result; + enum cl_lock_state state; + + cl_lock_trace(D_DLMTRACE, env, "use lock", lock); + + LASSERT(lock->cll_state == CLS_CACHED); + if (lock->cll_error) + return lock->cll_error; + + result = -ENOSYS; + state = cl_lock_intransit(env, lock); + list_for_each_entry(slice, &lock->cll_layers, cls_linkage) { + if (slice->cls_ops->clo_use != NULL) { + result = slice->cls_ops->clo_use(env, slice); + if (result != 0) + break; + } + } + LASSERT(result != -ENOSYS); + + LASSERTF(lock->cll_state == CLS_INTRANSIT, "Wrong state %d.\n", + lock->cll_state); + + if (result == 0) { + state = CLS_HELD; + } else { + if (result == -ESTALE) { + /* + * ESTALE means sublock being cancelled + * at this time, and set lock state to + * be NEW here and ask the caller to repeat. + */ + state = CLS_NEW; + result = CLO_REPEAT; + } + + /* @atomic means back-off-on-failure. */ + if (atomic) { + int rc; + rc = cl_unuse_try_internal(env, lock); + /* Vet the results. */ + if (rc < 0 && result > 0) + result = rc; + } + + } + cl_lock_extransit(env, lock, state); + return result; +} +EXPORT_SYMBOL(cl_use_try); + +/** + * Helper for cl_enqueue_try() that calls ->clo_enqueue() across all layers + * top-to-bottom. + */ +static int cl_enqueue_kick(const struct lu_env *env, + struct cl_lock *lock, + struct cl_io *io, __u32 flags) +{ + int result; + const struct cl_lock_slice *slice; + + result = -ENOSYS; + list_for_each_entry(slice, &lock->cll_layers, cls_linkage) { + if (slice->cls_ops->clo_enqueue != NULL) { + result = slice->cls_ops->clo_enqueue(env, + slice, io, flags); + if (result != 0) + break; + } + } + LASSERT(result != -ENOSYS); + return result; +} + +/** + * Tries to enqueue a lock. + * + * This function is called repeatedly by cl_enqueue() until either lock is + * enqueued, or error occurs. This function does not block waiting for + * networking communication to complete. + * + * \post ergo(result == 0, lock->cll_state == CLS_ENQUEUED || + * lock->cll_state == CLS_HELD) + * + * \see cl_enqueue() cl_lock_operations::clo_enqueue() + * \see cl_lock_state::CLS_ENQUEUED + */ +int cl_enqueue_try(const struct lu_env *env, struct cl_lock *lock, + struct cl_io *io, __u32 flags) +{ + int result; + + cl_lock_trace(D_DLMTRACE, env, "enqueue lock", lock); + do { + LINVRNT(cl_lock_is_mutexed(lock)); + + result = lock->cll_error; + if (result != 0) + break; + + switch (lock->cll_state) { + case CLS_NEW: + cl_lock_state_set(env, lock, CLS_QUEUING); + /* fall-through */ + case CLS_QUEUING: + /* kick layers. */ + result = cl_enqueue_kick(env, lock, io, flags); + /* For AGL case, the cl_lock::cll_state may + * become CLS_HELD already. */ + if (result == 0 && lock->cll_state == CLS_QUEUING) + cl_lock_state_set(env, lock, CLS_ENQUEUED); + break; + case CLS_INTRANSIT: + LASSERT(cl_lock_is_intransit(lock)); + result = CLO_WAIT; + break; + case CLS_CACHED: + /* yank lock from the cache. */ + result = cl_use_try(env, lock, 0); + break; + case CLS_ENQUEUED: + case CLS_HELD: + result = 0; + break; + default: + case CLS_FREEING: + /* + * impossible, only held locks with increased + * ->cll_holds can be enqueued, and they cannot be + * freed. + */ + LBUG(); + } + } while (result == CLO_REPEAT); + return result; +} +EXPORT_SYMBOL(cl_enqueue_try); + +/** + * Cancel the conflicting lock found during previous enqueue. 
+ * + * \retval 0 conflicting lock has been canceled. + * \retval -ve error code. + */ +int cl_lock_enqueue_wait(const struct lu_env *env, + struct cl_lock *lock, + int keep_mutex) +{ + struct cl_lock *conflict; + int rc = 0; + + LASSERT(cl_lock_is_mutexed(lock)); + LASSERT(lock->cll_state == CLS_QUEUING); + LASSERT(lock->cll_conflict != NULL); + + conflict = lock->cll_conflict; + lock->cll_conflict = NULL; + + cl_lock_mutex_put(env, lock); + LASSERT(cl_lock_nr_mutexed(env) == 0); + + cl_lock_mutex_get(env, conflict); + cl_lock_trace(D_DLMTRACE, env, "enqueue wait", conflict); + cl_lock_cancel(env, conflict); + cl_lock_delete(env, conflict); + + while (conflict->cll_state != CLS_FREEING) { + rc = cl_lock_state_wait(env, conflict); + if (rc != 0) + break; + } + cl_lock_mutex_put(env, conflict); + lu_ref_del(&conflict->cll_reference, "cancel-wait", lock); + cl_lock_put(env, conflict); + + if (keep_mutex) + cl_lock_mutex_get(env, lock); + + LASSERT(rc <= 0); + return rc; +} +EXPORT_SYMBOL(cl_lock_enqueue_wait); + +static int cl_enqueue_locked(const struct lu_env *env, struct cl_lock *lock, + struct cl_io *io, __u32 enqflags) +{ + int result; + + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + LASSERT(lock->cll_holds > 0); + + cl_lock_user_add(env, lock); + do { + result = cl_enqueue_try(env, lock, io, enqflags); + if (result == CLO_WAIT) { + if (lock->cll_conflict != NULL) + result = cl_lock_enqueue_wait(env, lock, 1); + else + result = cl_lock_state_wait(env, lock); + if (result == 0) + continue; + } + break; + } while (1); + if (result != 0) + cl_unuse_try(env, lock); + LASSERT(ergo(result == 0 && !(enqflags & CEF_AGL), + lock->cll_state == CLS_ENQUEUED || + lock->cll_state == CLS_HELD)); + return result; +} + +/** + * Enqueues a lock. + * + * \pre current thread or io owns a hold on lock. + * + * \post ergo(result == 0, lock->users increased) + * \post ergo(result == 0, lock->cll_state == CLS_ENQUEUED || + * lock->cll_state == CLS_HELD) + */ +int cl_enqueue(const struct lu_env *env, struct cl_lock *lock, + struct cl_io *io, __u32 enqflags) +{ + int result; + + cl_lock_lockdep_acquire(env, lock, enqflags); + cl_lock_mutex_get(env, lock); + result = cl_enqueue_locked(env, lock, io, enqflags); + cl_lock_mutex_put(env, lock); + if (result != 0) + cl_lock_lockdep_release(env, lock); + LASSERT(ergo(result == 0, lock->cll_state == CLS_ENQUEUED || + lock->cll_state == CLS_HELD)); + return result; +} +EXPORT_SYMBOL(cl_enqueue); + +/** + * Tries to unlock a lock. + * + * This function is called to release underlying resource: + * 1. for top lock, the resource is sublocks it held; + * 2. for sublock, the resource is the reference to dlmlock. + * + * cl_unuse_try is a one-shot operation, so it must NOT return CLO_WAIT. + * + * \see cl_unuse() cl_lock_operations::clo_unuse() + * \see cl_lock_state::CLS_CACHED + */ +int cl_unuse_try(const struct lu_env *env, struct cl_lock *lock) +{ + int result; + enum cl_lock_state state = CLS_NEW; + + cl_lock_trace(D_DLMTRACE, env, "unuse lock", lock); + + if (lock->cll_users > 1) { + cl_lock_user_del(env, lock); + return 0; + } + + /* Only if the lock is in CLS_HELD or CLS_ENQUEUED state, it can hold + * underlying resources. */ + if (!(lock->cll_state == CLS_HELD || lock->cll_state == CLS_ENQUEUED)) { + cl_lock_user_del(env, lock); + return 0; + } + + /* + * New lock users (->cll_users) are not protecting unlocking + * from proceeding. 
From this point, lock eventually reaches + * CLS_CACHED, is reinitialized to CLS_NEW or fails into + * CLS_FREEING. + */ + state = cl_lock_intransit(env, lock); + + result = cl_unuse_try_internal(env, lock); + LASSERT(lock->cll_state == CLS_INTRANSIT); + LASSERT(result != CLO_WAIT); + cl_lock_user_del(env, lock); + if (result == 0 || result == -ESTALE) { + /* + * Return lock back to the cache. This is the only + * place where lock is moved into CLS_CACHED state. + * + * If one of ->clo_unuse() methods returned -ESTALE, lock + * cannot be placed into cache and has to be + * re-initialized. This happens e.g., when a sub-lock was + * canceled while unlocking was in progress. + */ + if (state == CLS_HELD && result == 0) + state = CLS_CACHED; + else + state = CLS_NEW; + cl_lock_extransit(env, lock, state); + + /* + * Hide -ESTALE error. + * If the lock is a glimpse lock, and it has multiple + * stripes. Assuming that one of its sublock returned -ENAVAIL, + * and other sublocks are matched write locks. In this case, + * we can't set this lock to error because otherwise some of + * its sublocks may not be canceled. This causes some dirty + * pages won't be written to OSTs. -jay + */ + result = 0; + } else { + CERROR("result = %d, this is unlikely!\n", result); + state = CLS_NEW; + cl_lock_extransit(env, lock, state); + } + return result ?: lock->cll_error; +} +EXPORT_SYMBOL(cl_unuse_try); + +static void cl_unuse_locked(const struct lu_env *env, struct cl_lock *lock) +{ + int result; + + result = cl_unuse_try(env, lock); + if (result) + CL_LOCK_DEBUG(D_ERROR, env, lock, "unuse return %d\n", result); +} + +/** + * Unlocks a lock. + */ +void cl_unuse(const struct lu_env *env, struct cl_lock *lock) +{ + cl_lock_mutex_get(env, lock); + cl_unuse_locked(env, lock); + cl_lock_mutex_put(env, lock); + cl_lock_lockdep_release(env, lock); +} +EXPORT_SYMBOL(cl_unuse); + +/** + * Tries to wait for a lock. + * + * This function is called repeatedly by cl_wait() until either lock is + * granted, or error occurs. This function does not block waiting for network + * communication to complete. + * + * \see cl_wait() cl_lock_operations::clo_wait() + * \see cl_lock_state::CLS_HELD + */ +int cl_wait_try(const struct lu_env *env, struct cl_lock *lock) +{ + const struct cl_lock_slice *slice; + int result; + + cl_lock_trace(D_DLMTRACE, env, "wait lock try", lock); + do { + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + LASSERTF(lock->cll_state == CLS_QUEUING || + lock->cll_state == CLS_ENQUEUED || + lock->cll_state == CLS_HELD || + lock->cll_state == CLS_INTRANSIT, + "lock state: %d\n", lock->cll_state); + LASSERT(lock->cll_users > 0); + LASSERT(lock->cll_holds > 0); + + result = lock->cll_error; + if (result != 0) + break; + + if (cl_lock_is_intransit(lock)) { + result = CLO_WAIT; + break; + } + + if (lock->cll_state == CLS_HELD) + /* nothing to do */ + break; + + result = -ENOSYS; + list_for_each_entry(slice, &lock->cll_layers, cls_linkage) { + if (slice->cls_ops->clo_wait != NULL) { + result = slice->cls_ops->clo_wait(env, slice); + if (result != 0) + break; + } + } + LASSERT(result != -ENOSYS); + if (result == 0) { + LASSERT(lock->cll_state != CLS_INTRANSIT); + cl_lock_state_set(env, lock, CLS_HELD); + } + } while (result == CLO_REPEAT); + return result; +} +EXPORT_SYMBOL(cl_wait_try); + +/** + * Waits until enqueued lock is granted. 
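+ *
+ * On the happy path, a simplified top-level sequence using only interfaces
+ * from this file looks like (need, scope and source stand for
+ * caller-supplied values):
+ *
+ *     lock = cl_lock_hold(env, io, need, scope, source);
+ *     rc = cl_enqueue(env, lock, io, need->cld_enq_flags);
+ *     rc = cl_wait(env, lock);              (lock is now CLS_HELD)
+ *     ... I/O under the lock ...
+ *     cl_unuse(env, lock);                  (lock goes back to the cache)
+ *
+ * The hold and reference obtained from cl_lock_hold() are dropped
+ * separately when the caller is done with the lock; cl_lock_request()
+ * below bundles the lookup and enqueue steps for the common case.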
+ * + * \pre current thread or io owns a hold on the lock + * \pre ergo(result == 0, lock->cll_state == CLS_ENQUEUED || + * lock->cll_state == CLS_HELD) + * + * \post ergo(result == 0, lock->cll_state == CLS_HELD) + */ +int cl_wait(const struct lu_env *env, struct cl_lock *lock) +{ + int result; + + cl_lock_mutex_get(env, lock); + + LINVRNT(cl_lock_invariant(env, lock)); + LASSERTF(lock->cll_state == CLS_ENQUEUED || lock->cll_state == CLS_HELD, + "Wrong state %d \n", lock->cll_state); + LASSERT(lock->cll_holds > 0); + + do { + result = cl_wait_try(env, lock); + if (result == CLO_WAIT) { + result = cl_lock_state_wait(env, lock); + if (result == 0) + continue; + } + break; + } while (1); + if (result < 0) { + cl_unuse_try(env, lock); + cl_lock_lockdep_release(env, lock); + } + cl_lock_trace(D_DLMTRACE, env, "wait lock", lock); + cl_lock_mutex_put(env, lock); + LASSERT(ergo(result == 0, lock->cll_state == CLS_HELD)); + return result; +} +EXPORT_SYMBOL(cl_wait); + +/** + * Executes cl_lock_operations::clo_weigh(), and sums results to estimate lock + * value. + */ +unsigned long cl_lock_weigh(const struct lu_env *env, struct cl_lock *lock) +{ + const struct cl_lock_slice *slice; + unsigned long pound; + unsigned long ounce; + + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + + pound = 0; + list_for_each_entry_reverse(slice, &lock->cll_layers, cls_linkage) { + if (slice->cls_ops->clo_weigh != NULL) { + ounce = slice->cls_ops->clo_weigh(env, slice); + pound += ounce; + if (pound < ounce) /* over-weight^Wflow */ + pound = ~0UL; + } + } + return pound; +} +EXPORT_SYMBOL(cl_lock_weigh); + +/** + * Notifies layers that lock description changed. + * + * The server can grant client a lock different from one that was requested + * (e.g., larger in extent). This method is called when actually granted lock + * description becomes known to let layers to accommodate for changed lock + * description. + * + * \see cl_lock_operations::clo_modify() + */ +int cl_lock_modify(const struct lu_env *env, struct cl_lock *lock, + const struct cl_lock_descr *desc) +{ + const struct cl_lock_slice *slice; + struct cl_object *obj = lock->cll_descr.cld_obj; + struct cl_object_header *hdr = cl_object_header(obj); + int result; + + cl_lock_trace(D_DLMTRACE, env, "modify lock", lock); + /* don't allow object to change */ + LASSERT(obj == desc->cld_obj); + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + + list_for_each_entry_reverse(slice, &lock->cll_layers, cls_linkage) { + if (slice->cls_ops->clo_modify != NULL) { + result = slice->cls_ops->clo_modify(env, slice, desc); + if (result != 0) + return result; + } + } + CL_LOCK_DEBUG(D_DLMTRACE, env, lock, " -> "DDESCR"@"DFID"\n", + PDESCR(desc), PFID(lu_object_fid(&desc->cld_obj->co_lu))); + /* + * Just replace description in place. Nothing more is needed for + * now. If locks were indexed according to their extent and/or mode, + * that index would have to be updated here. + */ + spin_lock(&hdr->coh_lock_guard); + lock->cll_descr = *desc; + spin_unlock(&hdr->coh_lock_guard); + return 0; +} +EXPORT_SYMBOL(cl_lock_modify); + +/** + * Initializes lock closure with a given origin. 
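+ *
+ * A closure is normally driven as: cl_lock_closure_init() on a mutexed
+ * origin lock, one or more cl_lock_closure_build() / cl_lock_enclosure()
+ * calls to try-lock and collect related locks (repeating when CLO_REPEAT
+ * is returned), cl_lock_disclosure() to release the collected mutices, and
+ * finally cl_lock_closure_fini().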
+ * + * \see cl_lock_closure + */ +void cl_lock_closure_init(const struct lu_env *env, + struct cl_lock_closure *closure, + struct cl_lock *origin, int wait) +{ + LINVRNT(cl_lock_is_mutexed(origin)); + LINVRNT(cl_lock_invariant(env, origin)); + + INIT_LIST_HEAD(&closure->clc_list); + closure->clc_origin = origin; + closure->clc_wait = wait; + closure->clc_nr = 0; +} +EXPORT_SYMBOL(cl_lock_closure_init); + +/** + * Builds a closure of \a lock. + * + * Building of a closure consists of adding initial lock (\a lock) into it, + * and calling cl_lock_operations::clo_closure() methods of \a lock. These + * methods might call cl_lock_closure_build() recursively again, adding more + * locks to the closure, etc. + * + * \see cl_lock_closure + */ +int cl_lock_closure_build(const struct lu_env *env, struct cl_lock *lock, + struct cl_lock_closure *closure) +{ + const struct cl_lock_slice *slice; + int result; + + LINVRNT(cl_lock_is_mutexed(closure->clc_origin)); + LINVRNT(cl_lock_invariant(env, closure->clc_origin)); + + result = cl_lock_enclosure(env, lock, closure); + if (result == 0) { + list_for_each_entry(slice, &lock->cll_layers, cls_linkage) { + if (slice->cls_ops->clo_closure != NULL) { + result = slice->cls_ops->clo_closure(env, slice, + closure); + if (result != 0) + break; + } + } + } + if (result != 0) + cl_lock_disclosure(env, closure); + return result; +} +EXPORT_SYMBOL(cl_lock_closure_build); + +/** + * Adds new lock to a closure. + * + * Try-locks \a lock and if succeeded, adds it to the closure (never more than + * once). If try-lock failed, returns CLO_REPEAT, after optionally waiting + * until next try-lock is likely to succeed. + */ +int cl_lock_enclosure(const struct lu_env *env, struct cl_lock *lock, + struct cl_lock_closure *closure) +{ + int result = 0; + + cl_lock_trace(D_DLMTRACE, env, "enclosure lock", lock); + if (!cl_lock_mutex_try(env, lock)) { + /* + * If lock->cll_inclosure is not empty, lock is already in + * this closure. + */ + if (list_empty(&lock->cll_inclosure)) { + cl_lock_get_trust(lock); + lu_ref_add(&lock->cll_reference, "closure", closure); + list_add(&lock->cll_inclosure, &closure->clc_list); + closure->clc_nr++; + } else + cl_lock_mutex_put(env, lock); + result = 0; + } else { + cl_lock_disclosure(env, closure); + if (closure->clc_wait) { + cl_lock_get_trust(lock); + lu_ref_add(&lock->cll_reference, "closure-w", closure); + cl_lock_mutex_put(env, closure->clc_origin); + + LASSERT(cl_lock_nr_mutexed(env) == 0); + cl_lock_mutex_get(env, lock); + cl_lock_mutex_put(env, lock); + + cl_lock_mutex_get(env, closure->clc_origin); + lu_ref_del(&lock->cll_reference, "closure-w", closure); + cl_lock_put(env, lock); + } + result = CLO_REPEAT; + } + return result; +} +EXPORT_SYMBOL(cl_lock_enclosure); + +/** Releases mutices of enclosed locks. */ +void cl_lock_disclosure(const struct lu_env *env, + struct cl_lock_closure *closure) +{ + struct cl_lock *scan; + struct cl_lock *temp; + + cl_lock_trace(D_DLMTRACE, env, "disclosure lock", closure->clc_origin); + list_for_each_entry_safe(scan, temp, &closure->clc_list, + cll_inclosure){ + list_del_init(&scan->cll_inclosure); + cl_lock_mutex_put(env, scan); + lu_ref_del(&scan->cll_reference, "closure", closure); + cl_lock_put(env, scan); + closure->clc_nr--; + } + LASSERT(closure->clc_nr == 0); +} +EXPORT_SYMBOL(cl_lock_disclosure); + +/** Finalizes a closure. 
*/ +void cl_lock_closure_fini(struct cl_lock_closure *closure) +{ + LASSERT(closure->clc_nr == 0); + LASSERT(list_empty(&closure->clc_list)); +} +EXPORT_SYMBOL(cl_lock_closure_fini); + +/** + * Destroys this lock. Notifies layers (bottom-to-top) that lock is being + * destroyed, then destroy the lock. If there are holds on the lock, postpone + * destruction until all holds are released. This is called when a decision is + * made to destroy the lock in the future. E.g., when a blocking AST is + * received on it, or fatal communication error happens. + * + * Caller must have a reference on this lock to prevent a situation, when + * deleted lock lingers in memory for indefinite time, because nobody calls + * cl_lock_put() to finish it. + * + * \pre atomic_read(&lock->cll_ref) > 0 + * \pre ergo(cl_lock_nesting(lock) == CNL_TOP, + * cl_lock_nr_mutexed(env) == 1) + * [i.e., if a top-lock is deleted, mutices of no other locks can be + * held, as deletion of sub-locks might require releasing a top-lock + * mutex] + * + * \see cl_lock_operations::clo_delete() + * \see cl_lock::cll_holds + */ +void cl_lock_delete(const struct lu_env *env, struct cl_lock *lock) +{ + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + LASSERT(ergo(cl_lock_nesting(lock) == CNL_TOP, + cl_lock_nr_mutexed(env) == 1)); + + cl_lock_trace(D_DLMTRACE, env, "delete lock", lock); + if (lock->cll_holds == 0) + cl_lock_delete0(env, lock); + else + lock->cll_flags |= CLF_DOOMED; +} +EXPORT_SYMBOL(cl_lock_delete); + +/** + * Mark lock as irrecoverably failed, and mark it for destruction. This + * happens when, e.g., server fails to grant a lock to us, or networking + * time-out happens. + * + * \pre atomic_read(&lock->cll_ref) > 0 + * + * \see clo_lock_delete() + * \see cl_lock::cll_holds + */ +void cl_lock_error(const struct lu_env *env, struct cl_lock *lock, int error) +{ + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + + if (lock->cll_error == 0 && error != 0) { + cl_lock_trace(D_DLMTRACE, env, "set lock error", lock); + lock->cll_error = error; + cl_lock_signal(env, lock); + cl_lock_cancel(env, lock); + cl_lock_delete(env, lock); + } +} +EXPORT_SYMBOL(cl_lock_error); + +/** + * Cancels this lock. Notifies layers + * (bottom-to-top) that lock is being cancelled, then destroy the lock. If + * there are holds on the lock, postpone cancellation until + * all holds are released. + * + * Cancellation notification is delivered to layers at most once. + * + * \see cl_lock_operations::clo_cancel() + * \see cl_lock::cll_holds + */ +void cl_lock_cancel(const struct lu_env *env, struct cl_lock *lock) +{ + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + + cl_lock_trace(D_DLMTRACE, env, "cancel lock", lock); + if (lock->cll_holds == 0) + cl_lock_cancel0(env, lock); + else + lock->cll_flags |= CLF_CANCELPEND; +} +EXPORT_SYMBOL(cl_lock_cancel); + +/** + * Finds an existing lock covering given index and optionally different from a + * given \a except lock. 
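+ *
+ * \a pending and \a canceld widen the search: with \a pending set, locks
+ * with CLF_CANCELPEND are still considered; with \a canceld set, locks
+ * with CLF_CANCELLED are still considered (see the flag checks in the
+ * function body).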
+ */ +struct cl_lock *cl_lock_at_pgoff(const struct lu_env *env, + struct cl_object *obj, pgoff_t index, + struct cl_lock *except, + int pending, int canceld) +{ + struct cl_object_header *head; + struct cl_lock *scan; + struct cl_lock *lock; + struct cl_lock_descr *need; + + head = cl_object_header(obj); + need = &cl_env_info(env)->clt_descr; + lock = NULL; + + need->cld_mode = CLM_READ; /* CLM_READ matches both READ & WRITE, but + * not PHANTOM */ + need->cld_start = need->cld_end = index; + need->cld_enq_flags = 0; + + spin_lock(&head->coh_lock_guard); + /* It is fine to match any group lock since there could be only one + * with a uniq gid and it conflicts with all other lock modes too */ + list_for_each_entry(scan, &head->coh_locks, cll_linkage) { + if (scan != except && + (scan->cll_descr.cld_mode == CLM_GROUP || + cl_lock_ext_match(&scan->cll_descr, need)) && + scan->cll_state >= CLS_HELD && + scan->cll_state < CLS_FREEING && + /* + * This check is racy as the lock can be canceled right + * after it is done, but this is fine, because page exists + * already. + */ + (canceld || !(scan->cll_flags & CLF_CANCELLED)) && + (pending || !(scan->cll_flags & CLF_CANCELPEND))) { + /* Don't increase cs_hit here since this + * is just a helper function. */ + cl_lock_get_trust(scan); + lock = scan; + break; + } + } + spin_unlock(&head->coh_lock_guard); + return lock; +} +EXPORT_SYMBOL(cl_lock_at_pgoff); + +/** + * Calculate the page offset at the layer of @lock. + * At the time of this writing, @page is top page and @lock is sub lock. + */ +static pgoff_t pgoff_at_lock(struct cl_page *page, struct cl_lock *lock) +{ + struct lu_device_type *dtype; + const struct cl_page_slice *slice; + + dtype = lock->cll_descr.cld_obj->co_lu.lo_dev->ld_type; + slice = cl_page_at(page, dtype); + LASSERT(slice != NULL); + return slice->cpl_page->cp_index; +} + +/** + * Check if page @page is covered by an extra lock or discard it. + */ +static int check_and_discard_cb(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, void *cbdata) +{ + struct cl_thread_info *info = cl_env_info(env); + struct cl_lock *lock = cbdata; + pgoff_t index = pgoff_at_lock(page, lock); + + if (index >= info->clt_fn_index) { + struct cl_lock *tmp; + + /* refresh non-overlapped index */ + tmp = cl_lock_at_pgoff(env, lock->cll_descr.cld_obj, index, + lock, 1, 0); + if (tmp != NULL) { + /* Cache the first-non-overlapped index so as to skip + * all pages within [index, clt_fn_index). This + * is safe because if tmp lock is canceled, it will + * discard these pages. 
*/ + info->clt_fn_index = tmp->cll_descr.cld_end + 1; + if (tmp->cll_descr.cld_end == CL_PAGE_EOF) + info->clt_fn_index = CL_PAGE_EOF; + cl_lock_put(env, tmp); + } else if (cl_page_own(env, io, page) == 0) { + /* discard the page */ + cl_page_unmap(env, io, page); + cl_page_discard(env, io, page); + cl_page_disown(env, io, page); + } else { + LASSERT(page->cp_state == CPS_FREEING); + } + } + + info->clt_next_index = index + 1; + return CLP_GANG_OKAY; +} + +static int discard_cb(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, void *cbdata) +{ + struct cl_thread_info *info = cl_env_info(env); + struct cl_lock *lock = cbdata; + + LASSERT(lock->cll_descr.cld_mode >= CLM_WRITE); + KLASSERT(ergo(page->cp_type == CPT_CACHEABLE, + !PageWriteback(cl_page_vmpage(env, page)))); + KLASSERT(ergo(page->cp_type == CPT_CACHEABLE, + !PageDirty(cl_page_vmpage(env, page)))); + + info->clt_next_index = pgoff_at_lock(page, lock) + 1; + if (cl_page_own(env, io, page) == 0) { + /* discard the page */ + cl_page_unmap(env, io, page); + cl_page_discard(env, io, page); + cl_page_disown(env, io, page); + } else { + LASSERT(page->cp_state == CPS_FREEING); + } + + return CLP_GANG_OKAY; +} + +/** + * Discard pages protected by the given lock. This function traverses radix + * tree to find all covering pages and discard them. If a page is being covered + * by other locks, it should remain in cache. + * + * If error happens on any step, the process continues anyway (the reasoning + * behind this being that lock cancellation cannot be delayed indefinitely). + */ +int cl_lock_discard_pages(const struct lu_env *env, struct cl_lock *lock) +{ + struct cl_thread_info *info = cl_env_info(env); + struct cl_io *io = &info->clt_io; + struct cl_lock_descr *descr = &lock->cll_descr; + cl_page_gang_cb_t cb; + int res; + int result; + + LINVRNT(cl_lock_invariant(env, lock)); + + io->ci_obj = cl_object_top(descr->cld_obj); + io->ci_ignore_layout = 1; + result = cl_io_init(env, io, CIT_MISC, io->ci_obj); + if (result != 0) + goto out; + + cb = descr->cld_mode == CLM_READ ? check_and_discard_cb : discard_cb; + info->clt_fn_index = info->clt_next_index = descr->cld_start; + do { + res = cl_page_gang_lookup(env, descr->cld_obj, io, + info->clt_next_index, descr->cld_end, + cb, (void *)lock); + if (info->clt_next_index > descr->cld_end) + break; + + if (res == CLP_GANG_RESCHED) + cond_resched(); + } while (res != CLP_GANG_OKAY); +out: + cl_io_fini(env, io); + return result; +} +EXPORT_SYMBOL(cl_lock_discard_pages); + +/** + * Eliminate all locks for a given object. + * + * Caller has to guarantee that no lock is in active use. + * + * \param cancel when this is set, cl_locks_prune() cancels locks before + * destroying. + */ +void cl_locks_prune(const struct lu_env *env, struct cl_object *obj, int cancel) +{ + struct cl_object_header *head; + struct cl_lock *lock; + + head = cl_object_header(obj); + /* + * If locks are destroyed without cancellation, all pages must be + * already destroyed (as otherwise they will be left unprotected). 
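+	 * The assertion below checks exactly that: with \a cancel == 0 the
+	 * page radix tree must already be empty and coh_pages must be zero.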
+ */ + LASSERT(ergo(!cancel, + head->coh_tree.rnode == NULL && head->coh_pages == 0)); + + spin_lock(&head->coh_lock_guard); + while (!list_empty(&head->coh_locks)) { + lock = container_of(head->coh_locks.next, + struct cl_lock, cll_linkage); + cl_lock_get_trust(lock); + spin_unlock(&head->coh_lock_guard); + lu_ref_add(&lock->cll_reference, "prune", current); + +again: + cl_lock_mutex_get(env, lock); + if (lock->cll_state < CLS_FREEING) { + LASSERT(lock->cll_users <= 1); + if (unlikely(lock->cll_users == 1)) { + struct l_wait_info lwi = { 0 }; + + cl_lock_mutex_put(env, lock); + l_wait_event(lock->cll_wq, + lock->cll_users == 0, + &lwi); + goto again; + } + + if (cancel) + cl_lock_cancel(env, lock); + cl_lock_delete(env, lock); + } + cl_lock_mutex_put(env, lock); + lu_ref_del(&lock->cll_reference, "prune", current); + cl_lock_put(env, lock); + spin_lock(&head->coh_lock_guard); + } + spin_unlock(&head->coh_lock_guard); +} +EXPORT_SYMBOL(cl_locks_prune); + +static struct cl_lock *cl_lock_hold_mutex(const struct lu_env *env, + const struct cl_io *io, + const struct cl_lock_descr *need, + const char *scope, const void *source) +{ + struct cl_lock *lock; + + while (1) { + lock = cl_lock_find(env, io, need); + if (IS_ERR(lock)) + break; + cl_lock_mutex_get(env, lock); + if (lock->cll_state < CLS_FREEING && + !(lock->cll_flags & CLF_CANCELLED)) { + cl_lock_hold_mod(env, lock, +1); + lu_ref_add(&lock->cll_holders, scope, source); + lu_ref_add(&lock->cll_reference, scope, source); + break; + } + cl_lock_mutex_put(env, lock); + cl_lock_put(env, lock); + } + return lock; +} + +/** + * Returns a lock matching \a need description with a reference and a hold on + * it. + * + * This is much like cl_lock_find(), except that cl_lock_hold() additionally + * guarantees that lock is not in the CLS_FREEING state on return. + */ +struct cl_lock *cl_lock_hold(const struct lu_env *env, const struct cl_io *io, + const struct cl_lock_descr *need, + const char *scope, const void *source) +{ + struct cl_lock *lock; + + lock = cl_lock_hold_mutex(env, io, need, scope, source); + if (!IS_ERR(lock)) + cl_lock_mutex_put(env, lock); + return lock; +} +EXPORT_SYMBOL(cl_lock_hold); + +/** + * Main high-level entry point of cl_lock interface that finds existing or + * enqueues new lock matching given description. + */ +struct cl_lock *cl_lock_request(const struct lu_env *env, struct cl_io *io, + const struct cl_lock_descr *need, + const char *scope, const void *source) +{ + struct cl_lock *lock; + int rc; + __u32 enqflags = need->cld_enq_flags; + + do { + lock = cl_lock_hold_mutex(env, io, need, scope, source); + if (IS_ERR(lock)) + break; + + rc = cl_enqueue_locked(env, lock, io, enqflags); + if (rc == 0) { + if (cl_lock_fits_into(env, lock, need, io)) { + if (!(enqflags & CEF_AGL)) { + cl_lock_mutex_put(env, lock); + cl_lock_lockdep_acquire(env, lock, + enqflags); + break; + } + rc = 1; + } + cl_unuse_locked(env, lock); + } + cl_lock_trace(D_DLMTRACE, env, + rc <= 0 ? "enqueue failed" : "agl succeed", lock); + cl_lock_hold_release(env, lock, scope, source); + cl_lock_mutex_put(env, lock); + lu_ref_del(&lock->cll_reference, scope, source); + cl_lock_put(env, lock); + if (rc > 0) { + LASSERT(enqflags & CEF_AGL); + lock = NULL; + } else if (rc != 0) { + lock = ERR_PTR(rc); + } + } while (rc == 0); + return lock; +} +EXPORT_SYMBOL(cl_lock_request); + +/** + * Adds a hold to a known lock. 
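+ *
+ * The lock must be mutexed by the caller and must not be in CLS_FREEING
+ * (both asserted below). The hold and reference taken here are typically
+ * dropped with cl_lock_unhold(); \a scope and \a source are only debugging
+ * tags for lu_ref tracking.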
+ */ +void cl_lock_hold_add(const struct lu_env *env, struct cl_lock *lock, + const char *scope, const void *source) +{ + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + LASSERT(lock->cll_state != CLS_FREEING); + + cl_lock_hold_mod(env, lock, +1); + cl_lock_get(lock); + lu_ref_add(&lock->cll_holders, scope, source); + lu_ref_add(&lock->cll_reference, scope, source); +} +EXPORT_SYMBOL(cl_lock_hold_add); + +/** + * Releases a hold and a reference on a lock, on which caller acquired a + * mutex. + */ +void cl_lock_unhold(const struct lu_env *env, struct cl_lock *lock, + const char *scope, const void *source) +{ + LINVRNT(cl_lock_invariant(env, lock)); + cl_lock_hold_release(env, lock, scope, source); + lu_ref_del(&lock->cll_reference, scope, source); + cl_lock_put(env, lock); +} +EXPORT_SYMBOL(cl_lock_unhold); + +/** + * Releases a hold and a reference on a lock, obtained by cl_lock_hold(). + */ +void cl_lock_release(const struct lu_env *env, struct cl_lock *lock, + const char *scope, const void *source) +{ + LINVRNT(cl_lock_invariant(env, lock)); + cl_lock_trace(D_DLMTRACE, env, "release lock", lock); + cl_lock_mutex_get(env, lock); + cl_lock_hold_release(env, lock, scope, source); + cl_lock_mutex_put(env, lock); + lu_ref_del(&lock->cll_reference, scope, source); + cl_lock_put(env, lock); +} +EXPORT_SYMBOL(cl_lock_release); + +void cl_lock_user_add(const struct lu_env *env, struct cl_lock *lock) +{ + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + + cl_lock_used_mod(env, lock, +1); +} +EXPORT_SYMBOL(cl_lock_user_add); + +void cl_lock_user_del(const struct lu_env *env, struct cl_lock *lock) +{ + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + LASSERT(lock->cll_users > 0); + + cl_lock_used_mod(env, lock, -1); + if (lock->cll_users == 0) + wake_up_all(&lock->cll_wq); +} +EXPORT_SYMBOL(cl_lock_user_del); + +const char *cl_lock_mode_name(const enum cl_lock_mode mode) +{ + static const char *names[] = { + [CLM_PHANTOM] = "P", + [CLM_READ] = "R", + [CLM_WRITE] = "W", + [CLM_GROUP] = "G" + }; + if (0 <= mode && mode < ARRAY_SIZE(names)) + return names[mode]; + else + return "U"; +} +EXPORT_SYMBOL(cl_lock_mode_name); + +/** + * Prints human readable representation of a lock description. + */ +void cl_lock_descr_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, + const struct cl_lock_descr *descr) +{ + const struct lu_fid *fid; + + fid = lu_object_fid(&descr->cld_obj->co_lu); + (*printer)(env, cookie, DDESCR"@"DFID, PDESCR(descr), PFID(fid)); +} +EXPORT_SYMBOL(cl_lock_descr_print); + +/** + * Prints human readable representation of \a lock to the \a f. 
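+ *
+ * Output goes through the supplied \a printer callback; after the generic
+ * header each layer's cl_lock_operations::clo_print() method is invoked in
+ * turn.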
+ */ +void cl_lock_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct cl_lock *lock) +{ + const struct cl_lock_slice *slice; + (*printer)(env, cookie, "lock@%p[%d %d %d %d %d %08lx] ", + lock, atomic_read(&lock->cll_ref), + lock->cll_state, lock->cll_error, lock->cll_holds, + lock->cll_users, lock->cll_flags); + cl_lock_descr_print(env, cookie, printer, &lock->cll_descr); + (*printer)(env, cookie, " {\n"); + + list_for_each_entry(slice, &lock->cll_layers, cls_linkage) { + (*printer)(env, cookie, " %s@%p: ", + slice->cls_obj->co_lu.lo_dev->ld_type->ldt_name, + slice); + if (slice->cls_ops->clo_print != NULL) + slice->cls_ops->clo_print(env, cookie, printer, slice); + (*printer)(env, cookie, "\n"); + } + (*printer)(env, cookie, "} lock@%p\n", lock); +} +EXPORT_SYMBOL(cl_lock_print); + +int cl_lock_init(void) +{ + return lu_kmem_init(cl_lock_caches); +} + +void cl_lock_fini(void) +{ + lu_kmem_fini(cl_lock_caches); +} diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/cl_object.c b/kernel/drivers/staging/lustre/lustre/obdclass/cl_object.c new file mode 100644 index 000000000..f13d1fbff --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/cl_object.c @@ -0,0 +1,1139 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Client Lustre Object. + * + * Author: Nikita Danilov + */ + +/* + * Locking. 
+ * + * i_mutex + * PG_locked + * ->coh_page_guard + * ->coh_lock_guard + * ->coh_attr_guard + * ->ls_guard + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include "../../include/linux/libcfs/libcfs.h" +/* class_put_type() */ +#include "../include/obd_class.h" +#include "../include/obd_support.h" +#include "../include/lustre_fid.h" +#include +#include "../../include/linux/libcfs/libcfs_hash.h" /* for cfs_hash stuff */ +#include "../include/cl_object.h" +#include "cl_internal.h" + +static struct kmem_cache *cl_env_kmem; + +/** Lock class of cl_object_header::coh_page_guard */ +static struct lock_class_key cl_page_guard_class; +/** Lock class of cl_object_header::coh_lock_guard */ +static struct lock_class_key cl_lock_guard_class; +/** Lock class of cl_object_header::coh_attr_guard */ +static struct lock_class_key cl_attr_guard_class; + +extern __u32 lu_context_tags_default; +extern __u32 lu_session_tags_default; +/** + * Initialize cl_object_header. + */ +int cl_object_header_init(struct cl_object_header *h) +{ + int result; + + result = lu_object_header_init(&h->coh_lu); + if (result == 0) { + spin_lock_init(&h->coh_page_guard); + spin_lock_init(&h->coh_lock_guard); + spin_lock_init(&h->coh_attr_guard); + lockdep_set_class(&h->coh_page_guard, &cl_page_guard_class); + lockdep_set_class(&h->coh_lock_guard, &cl_lock_guard_class); + lockdep_set_class(&h->coh_attr_guard, &cl_attr_guard_class); + h->coh_pages = 0; + /* XXX hard coded GFP_* mask. */ + INIT_RADIX_TREE(&h->coh_tree, GFP_ATOMIC); + INIT_LIST_HEAD(&h->coh_locks); + h->coh_page_bufsize = ALIGN(sizeof(struct cl_page), 8); + } + return result; +} +EXPORT_SYMBOL(cl_object_header_init); + +/** + * Finalize cl_object_header. + */ +void cl_object_header_fini(struct cl_object_header *h) +{ + LASSERT(list_empty(&h->coh_locks)); + lu_object_header_fini(&h->coh_lu); +} +EXPORT_SYMBOL(cl_object_header_fini); + +/** + * Returns a cl_object with a given \a fid. + * + * Returns either cached or newly created object. Additional reference on the + * returned object is acquired. + * + * \see lu_object_find(), cl_page_find(), cl_lock_find() + */ +struct cl_object *cl_object_find(const struct lu_env *env, + struct cl_device *cd, const struct lu_fid *fid, + const struct cl_object_conf *c) +{ + might_sleep(); + return lu2cl(lu_object_find_slice(env, cl2lu_dev(cd), fid, &c->coc_lu)); +} +EXPORT_SYMBOL(cl_object_find); + +/** + * Releases a reference on \a o. + * + * When last reference is released object is returned to the cache, unless + * lu_object_header_flags::LU_OBJECT_HEARD_BANSHEE bit is set in its header. + * + * \see cl_page_put(), cl_lock_put(). + */ +void cl_object_put(const struct lu_env *env, struct cl_object *o) +{ + lu_object_put(env, &o->co_lu); +} +EXPORT_SYMBOL(cl_object_put); + +/** + * Acquire an additional reference to the object \a o. + * + * This can only be used to acquire _additional_ reference, i.e., caller + * already has to possess at least one reference to \a o before calling this. + * + * \see cl_page_get(), cl_lock_get(). + */ +void cl_object_get(struct cl_object *o) +{ + lu_object_get(&o->co_lu); +} +EXPORT_SYMBOL(cl_object_get); + +/** + * Returns the top-object for a given \a o. 
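+ *
+ * No additional reference is acquired on the returned top-object; it is a
+ * layer of the same object stack as \a o.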
+ * + * \see cl_page_top(), cl_io_top() + */ +struct cl_object *cl_object_top(struct cl_object *o) +{ + struct cl_object_header *hdr = cl_object_header(o); + struct cl_object *top; + + while (hdr->coh_parent != NULL) + hdr = hdr->coh_parent; + + top = lu2cl(lu_object_top(&hdr->coh_lu)); + CDEBUG(D_TRACE, "%p -> %p\n", o, top); + return top; +} +EXPORT_SYMBOL(cl_object_top); + +/** + * Returns pointer to the lock protecting data-attributes for the given object + * \a o. + * + * Data-attributes are protected by the cl_object_header::coh_attr_guard + * spin-lock in the top-object. + * + * \see cl_attr, cl_object_attr_lock(), cl_object_operations::coo_attr_get(). + */ +static spinlock_t *cl_object_attr_guard(struct cl_object *o) +{ + return &cl_object_header(cl_object_top(o))->coh_attr_guard; +} + +/** + * Locks data-attributes. + * + * Prevents data-attributes from changing, until lock is released by + * cl_object_attr_unlock(). This has to be called before calls to + * cl_object_attr_get(), cl_object_attr_set(). + */ +void cl_object_attr_lock(struct cl_object *o) + __acquires(cl_object_attr_guard(o)) +{ + spin_lock(cl_object_attr_guard(o)); +} +EXPORT_SYMBOL(cl_object_attr_lock); + +/** + * Releases data-attributes lock, acquired by cl_object_attr_lock(). + */ +void cl_object_attr_unlock(struct cl_object *o) + __releases(cl_object_attr_guard(o)) +{ + spin_unlock(cl_object_attr_guard(o)); +} +EXPORT_SYMBOL(cl_object_attr_unlock); + +/** + * Returns data-attributes of an object \a obj. + * + * Every layer is asked (by calling cl_object_operations::coo_attr_get()) + * top-to-bottom to fill in parts of \a attr that this layer is responsible + * for. + */ +int cl_object_attr_get(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr) +{ + struct lu_object_header *top; + int result; + + assert_spin_locked(cl_object_attr_guard(obj)); + + top = obj->co_lu.lo_header; + result = 0; + list_for_each_entry(obj, &top->loh_layers, co_lu.lo_linkage) { + if (obj->co_ops->coo_attr_get != NULL) { + result = obj->co_ops->coo_attr_get(env, obj, attr); + if (result != 0) { + if (result > 0) + result = 0; + break; + } + } + } + return result; +} +EXPORT_SYMBOL(cl_object_attr_get); + +/** + * Updates data-attributes of an object \a obj. + * + * Only attributes, mentioned in a validness bit-mask \a v are + * updated. Calls cl_object_operations::coo_attr_set() on every layer, bottom + * to top. + */ +int cl_object_attr_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned v) +{ + struct lu_object_header *top; + int result; + + assert_spin_locked(cl_object_attr_guard(obj)); + + top = obj->co_lu.lo_header; + result = 0; + list_for_each_entry_reverse(obj, &top->loh_layers, + co_lu.lo_linkage) { + if (obj->co_ops->coo_attr_set != NULL) { + result = obj->co_ops->coo_attr_set(env, obj, attr, v); + if (result != 0) { + if (result > 0) + result = 0; + break; + } + } + } + return result; +} +EXPORT_SYMBOL(cl_object_attr_set); + +/** + * Notifies layers (bottom-to-top) that glimpse AST was received. + * + * Layers have to fill \a lvb fields with information that will be shipped + * back to glimpse issuer. 
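+ * The fields in question are size, mtime, atime, ctime and blocks (see
+ * cl_attr2lvb()), which is also what the debug message below prints.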
+ * + * \see cl_lock_operations::clo_glimpse() + */ +int cl_object_glimpse(const struct lu_env *env, struct cl_object *obj, + struct ost_lvb *lvb) +{ + struct lu_object_header *top; + int result; + + top = obj->co_lu.lo_header; + result = 0; + list_for_each_entry_reverse(obj, &top->loh_layers, + co_lu.lo_linkage) { + if (obj->co_ops->coo_glimpse != NULL) { + result = obj->co_ops->coo_glimpse(env, obj, lvb); + if (result != 0) + break; + } + } + LU_OBJECT_HEADER(D_DLMTRACE, env, lu_object_top(top), + "size: %llu mtime: %llu atime: %llu ctime: %llu blocks: %llu\n", + lvb->lvb_size, lvb->lvb_mtime, lvb->lvb_atime, + lvb->lvb_ctime, lvb->lvb_blocks); + return result; +} +EXPORT_SYMBOL(cl_object_glimpse); + +/** + * Updates a configuration of an object \a obj. + */ +int cl_conf_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf) +{ + struct lu_object_header *top; + int result; + + top = obj->co_lu.lo_header; + result = 0; + list_for_each_entry(obj, &top->loh_layers, co_lu.lo_linkage) { + if (obj->co_ops->coo_conf_set != NULL) { + result = obj->co_ops->coo_conf_set(env, obj, conf); + if (result != 0) + break; + } + } + return result; +} +EXPORT_SYMBOL(cl_conf_set); + +/** + * Helper function removing all object locks, and marking object for + * deletion. All object pages must have been deleted at this point. + * + * This is called by cl_inode_fini() and lov_object_delete() to destroy top- + * and sub- objects respectively. + */ +void cl_object_kill(const struct lu_env *env, struct cl_object *obj) +{ + struct cl_object_header *hdr; + + hdr = cl_object_header(obj); + LASSERT(hdr->coh_tree.rnode == NULL); + LASSERT(hdr->coh_pages == 0); + + set_bit(LU_OBJECT_HEARD_BANSHEE, &hdr->coh_lu.loh_flags); + /* + * Destroy all locks. Object destruction (including cl_inode_fini()) + * cannot cancel the locks, because in the case of a local client, + * where client and server share the same thread running + * prune_icache(), this can dead-lock with ldlm_cancel_handler() + * waiting on __wait_on_freeing_inode(). + */ + cl_locks_prune(env, obj, 0); +} +EXPORT_SYMBOL(cl_object_kill); + +/** + * Prunes caches of pages and locks for this object. + */ +void cl_object_prune(const struct lu_env *env, struct cl_object *obj) +{ + cl_pages_prune(env, obj); + cl_locks_prune(env, obj, 1); +} +EXPORT_SYMBOL(cl_object_prune); + +/** + * Check if the object has locks. + */ +int cl_object_has_locks(struct cl_object *obj) +{ + struct cl_object_header *head = cl_object_header(obj); + int has; + + spin_lock(&head->coh_lock_guard); + has = list_empty(&head->coh_locks); + spin_unlock(&head->coh_lock_guard); + + return (has == 0); +} +EXPORT_SYMBOL(cl_object_has_locks); + +void cache_stats_init(struct cache_stats *cs, const char *name) +{ + int i; + + cs->cs_name = name; + for (i = 0; i < CS_NR; i++) + atomic_set(&cs->cs_stats[i], 0); +} + +int cache_stats_print(const struct cache_stats *cs, struct seq_file *m, int h) +{ + int i; + /* + * lookup hit total cached create + * env: ...... ...... ...... ...... ...... + */ + if (h) { + const char *names[CS_NR] = CS_NAMES; + + seq_printf(m, "%6s", " "); + for (i = 0; i < CS_NR; i++) + seq_printf(m, "%8s", names[i]); + seq_printf(m, "\n"); + } + + seq_printf(m, "%5.5s:", cs->cs_name); + for (i = 0; i < CS_NR; i++) + seq_printf(m, "%8u", atomic_read(&cs->cs_stats[i])); + return 0; +} + +/** + * Initialize client site. + * + * Perform common initialization (lu_site_init()), and initialize statistical + * counters. 
Also perform global initializations on the first call. + */ +int cl_site_init(struct cl_site *s, struct cl_device *d) +{ + int i; + int result; + + result = lu_site_init(&s->cs_lu, &d->cd_lu_dev); + if (result == 0) { + cache_stats_init(&s->cs_pages, "pages"); + cache_stats_init(&s->cs_locks, "locks"); + for (i = 0; i < ARRAY_SIZE(s->cs_pages_state); ++i) + atomic_set(&s->cs_pages_state[0], 0); + for (i = 0; i < ARRAY_SIZE(s->cs_locks_state); ++i) + atomic_set(&s->cs_locks_state[i], 0); + } + return result; +} +EXPORT_SYMBOL(cl_site_init); + +/** + * Finalize client site. Dual to cl_site_init(). + */ +void cl_site_fini(struct cl_site *s) +{ + lu_site_fini(&s->cs_lu); +} +EXPORT_SYMBOL(cl_site_fini); + +static struct cache_stats cl_env_stats = { + .cs_name = "envs", + .cs_stats = { ATOMIC_INIT(0), } +}; + +/** + * Outputs client site statistical counters into a buffer. Suitable for + * ll_rd_*()-style functions. + */ +int cl_site_stats_print(const struct cl_site *site, struct seq_file *m) +{ + int i; + static const char *pstate[] = { + [CPS_CACHED] = "c", + [CPS_OWNED] = "o", + [CPS_PAGEOUT] = "w", + [CPS_PAGEIN] = "r", + [CPS_FREEING] = "f" + }; + static const char *lstate[] = { + [CLS_NEW] = "n", + [CLS_QUEUING] = "q", + [CLS_ENQUEUED] = "e", + [CLS_HELD] = "h", + [CLS_INTRANSIT] = "t", + [CLS_CACHED] = "c", + [CLS_FREEING] = "f" + }; +/* + lookup hit total busy create +pages: ...... ...... ...... ...... ...... [...... ...... ...... ......] +locks: ...... ...... ...... ...... ...... [...... ...... ...... ...... ......] + env: ...... ...... ...... ...... ...... + */ + lu_site_stats_print(&site->cs_lu, m); + cache_stats_print(&site->cs_pages, m, 1); + seq_printf(m, " ["); + for (i = 0; i < ARRAY_SIZE(site->cs_pages_state); ++i) + seq_printf(m, "%s: %u ", pstate[i], + atomic_read(&site->cs_pages_state[i])); + seq_printf(m, "]\n"); + cache_stats_print(&site->cs_locks, m, 0); + seq_printf(m, " ["); + for (i = 0; i < ARRAY_SIZE(site->cs_locks_state); ++i) + seq_printf(m, "%s: %u ", lstate[i], + atomic_read(&site->cs_locks_state[i])); + seq_printf(m, "]\n"); + cache_stats_print(&cl_env_stats, m, 0); + seq_printf(m, "\n"); + return 0; +} +EXPORT_SYMBOL(cl_site_stats_print); + +/***************************************************************************** + * + * lu_env handling on client. + * + */ + +/** + * The most efficient way is to store cl_env pointer in task specific + * structures. On Linux, it wont' be easy to use task_struct->journal_info + * because Lustre code may call into other fs which has certain assumptions + * about journal_info. Currently following fields in task_struct are identified + * can be used for this purpose: + * - cl_env: for liblustre. + * - tux_info: only on RedHat kernel. + * - ... + * \note As long as we use task_struct to store cl_env, we assume that once + * called into Lustre, we'll never call into the other part of the kernel + * which will use those fields in task_struct without explicitly exiting + * Lustre. + * + * If there's no space in task_struct is available, hash will be used. + * bz20044, bz22683. + */ + +struct cl_env { + void *ce_magic; + struct lu_env ce_lu; + struct lu_context ce_ses; + + /** + * This allows cl_env to be entered into cl_env_hash which implements + * the current thread -> client environment lookup. + */ + struct hlist_node ce_node; + /** + * Owner for the current cl_env. 
+ * + * If LL_TASK_CL_ENV is defined, this point to the owning current, + * only for debugging purpose ; + * Otherwise hash is used, and this is the key for cfs_hash. + * Now current thread pid is stored. Note using thread pointer would + * lead to unbalanced hash because of its specific allocation locality + * and could be varied for different platforms and OSes, even different + * OS versions. + */ + void *ce_owner; + + /* + * Linkage into global list of all client environments. Used for + * garbage collection. + */ + struct list_head ce_linkage; + /* + * + */ + int ce_ref; + /* + * Debugging field: address of the caller who made original + * allocation. + */ + void *ce_debug; +}; + +#define CL_ENV_INC(counter) +#define CL_ENV_DEC(counter) + +static void cl_env_init0(struct cl_env *cle, void *debug) +{ + LASSERT(cle->ce_ref == 0); + LASSERT(cle->ce_magic == &cl_env_init0); + LASSERT(cle->ce_debug == NULL && cle->ce_owner == NULL); + + cle->ce_ref = 1; + cle->ce_debug = debug; + CL_ENV_INC(busy); +} + + +/* + * The implementation of using hash table to connect cl_env and thread + */ + +static struct cfs_hash *cl_env_hash; + +static unsigned cl_env_hops_hash(struct cfs_hash *lh, + const void *key, unsigned mask) +{ +#if BITS_PER_LONG == 64 + return cfs_hash_u64_hash((__u64)key, mask); +#else + return cfs_hash_u32_hash((__u32)key, mask); +#endif +} + +static void *cl_env_hops_obj(struct hlist_node *hn) +{ + struct cl_env *cle = hlist_entry(hn, struct cl_env, ce_node); + LASSERT(cle->ce_magic == &cl_env_init0); + return (void *)cle; +} + +static int cl_env_hops_keycmp(const void *key, struct hlist_node *hn) +{ + struct cl_env *cle = cl_env_hops_obj(hn); + + LASSERT(cle->ce_owner != NULL); + return (key == cle->ce_owner); +} + +static void cl_env_hops_noop(struct cfs_hash *hs, struct hlist_node *hn) +{ + struct cl_env *cle = hlist_entry(hn, struct cl_env, ce_node); + LASSERT(cle->ce_magic == &cl_env_init0); +} + +static cfs_hash_ops_t cl_env_hops = { + .hs_hash = cl_env_hops_hash, + .hs_key = cl_env_hops_obj, + .hs_keycmp = cl_env_hops_keycmp, + .hs_object = cl_env_hops_obj, + .hs_get = cl_env_hops_noop, + .hs_put_locked = cl_env_hops_noop, +}; + +static inline struct cl_env *cl_env_fetch(void) +{ + struct cl_env *cle; + + cle = cfs_hash_lookup(cl_env_hash, (void *) (long) current->pid); + LASSERT(ergo(cle, cle->ce_magic == &cl_env_init0)); + return cle; +} + +static inline void cl_env_attach(struct cl_env *cle) +{ + if (cle) { + int rc; + + LASSERT(cle->ce_owner == NULL); + cle->ce_owner = (void *) (long) current->pid; + rc = cfs_hash_add_unique(cl_env_hash, cle->ce_owner, + &cle->ce_node); + LASSERT(rc == 0); + } +} + +static inline void cl_env_do_detach(struct cl_env *cle) +{ + void *cookie; + + LASSERT(cle->ce_owner == (void *) (long) current->pid); + cookie = cfs_hash_del(cl_env_hash, cle->ce_owner, + &cle->ce_node); + LASSERT(cookie == cle); + cle->ce_owner = NULL; +} + +static int cl_env_store_init(void) { + cl_env_hash = cfs_hash_create("cl_env", + HASH_CL_ENV_BITS, HASH_CL_ENV_BITS, + HASH_CL_ENV_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &cl_env_hops, + CFS_HASH_RW_BKTLOCK); + return cl_env_hash != NULL ? 
0 :-ENOMEM; +} + +static void cl_env_store_fini(void) +{ + cfs_hash_putref(cl_env_hash); +} + + +static inline struct cl_env *cl_env_detach(struct cl_env *cle) +{ + if (cle == NULL) + cle = cl_env_fetch(); + + if (cle && cle->ce_owner) + cl_env_do_detach(cle); + + return cle; +} + +static struct lu_env *cl_env_new(__u32 ctx_tags, __u32 ses_tags, void *debug) +{ + struct lu_env *env; + struct cl_env *cle; + + OBD_SLAB_ALLOC_PTR_GFP(cle, cl_env_kmem, GFP_NOFS); + if (cle != NULL) { + int rc; + + INIT_LIST_HEAD(&cle->ce_linkage); + cle->ce_magic = &cl_env_init0; + env = &cle->ce_lu; + rc = lu_env_init(env, LCT_CL_THREAD|ctx_tags); + if (rc == 0) { + rc = lu_context_init(&cle->ce_ses, + LCT_SESSION | ses_tags); + if (rc == 0) { + lu_context_enter(&cle->ce_ses); + env->le_ses = &cle->ce_ses; + cl_env_init0(cle, debug); + } else + lu_env_fini(env); + } + if (rc != 0) { + OBD_SLAB_FREE_PTR(cle, cl_env_kmem); + env = ERR_PTR(rc); + } else { + CL_ENV_INC(create); + CL_ENV_INC(total); + } + } else + env = ERR_PTR(-ENOMEM); + return env; +} + +static void cl_env_fini(struct cl_env *cle) +{ + CL_ENV_DEC(total); + lu_context_fini(&cle->ce_lu.le_ctx); + lu_context_fini(&cle->ce_ses); + OBD_SLAB_FREE_PTR(cle, cl_env_kmem); +} + +static inline struct cl_env *cl_env_container(struct lu_env *env) +{ + return container_of(env, struct cl_env, ce_lu); +} + +struct lu_env *cl_env_peek(int *refcheck) +{ + struct lu_env *env; + struct cl_env *cle; + + CL_ENV_INC(lookup); + + /* check that we don't go far from untrusted pointer */ + CLASSERT(offsetof(struct cl_env, ce_magic) == 0); + + env = NULL; + cle = cl_env_fetch(); + if (cle != NULL) { + CL_ENV_INC(hit); + env = &cle->ce_lu; + *refcheck = ++cle->ce_ref; + } + CDEBUG(D_OTHER, "%d@%p\n", cle ? cle->ce_ref : 0, cle); + return env; +} +EXPORT_SYMBOL(cl_env_peek); + +/** + * Returns lu_env: if there already is an environment associated with the + * current thread, it is returned, otherwise, new environment is allocated. + * + * \param refcheck pointer to a counter used to detect environment leaks. In + * the usual case cl_env_get() and cl_env_put() are called in the same lexical + * scope and pointer to the same integer is passed as \a refcheck. This is + * used to detect missed cl_env_put(). + * + * \see cl_env_put() + */ +struct lu_env *cl_env_get(int *refcheck) +{ + struct lu_env *env; + + env = cl_env_peek(refcheck); + if (env == NULL) { + env = cl_env_new(lu_context_tags_default, + lu_session_tags_default, + __builtin_return_address(0)); + + if (!IS_ERR(env)) { + struct cl_env *cle; + + cle = cl_env_container(env); + cl_env_attach(cle); + *refcheck = cle->ce_ref; + CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle); + } + } + return env; +} +EXPORT_SYMBOL(cl_env_get); + +/** + * Forces an allocation of a fresh environment with given tags. + * + * \see cl_env_get() + */ +struct lu_env *cl_env_alloc(int *refcheck, __u32 tags) +{ + struct lu_env *env; + + LASSERT(cl_env_peek(refcheck) == NULL); + env = cl_env_new(tags, tags, __builtin_return_address(0)); + if (!IS_ERR(env)) { + struct cl_env *cle; + + cle = cl_env_container(env); + *refcheck = cle->ce_ref; + CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle); + } + return env; +} +EXPORT_SYMBOL(cl_env_alloc); + +static void cl_env_exit(struct cl_env *cle) +{ + LASSERT(cle->ce_owner == NULL); + lu_context_exit(&cle->ce_lu.le_ctx); + lu_context_exit(&cle->ce_ses); +} + +/** + * Release an environment. + * + * Decrement \a env reference counter. 
When counter drops to 0, nothing in + * this thread is using environment and it is returned to the allocation + * cache, or freed straight away, if cache is large enough. + */ +void cl_env_put(struct lu_env *env, int *refcheck) +{ + struct cl_env *cle; + + cle = cl_env_container(env); + + LASSERT(cle->ce_ref > 0); + LASSERT(ergo(refcheck != NULL, cle->ce_ref == *refcheck)); + + CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle); + if (--cle->ce_ref == 0) { + CL_ENV_DEC(busy); + cl_env_detach(cle); + cle->ce_debug = NULL; + cl_env_exit(cle); + cl_env_fini(cle); + } +} +EXPORT_SYMBOL(cl_env_put); + +/** + * Declares a point of re-entrancy. + * + * \see cl_env_reexit() + */ +void *cl_env_reenter(void) +{ + return cl_env_detach(NULL); +} +EXPORT_SYMBOL(cl_env_reenter); + +/** + * Exits re-entrancy. + */ +void cl_env_reexit(void *cookie) +{ + cl_env_detach(NULL); + cl_env_attach(cookie); +} +EXPORT_SYMBOL(cl_env_reexit); + +/** + * Setup user-supplied \a env as a current environment. This is to be used to + * guaranteed that environment exists even when cl_env_get() fails. It is up + * to user to ensure proper concurrency control. + * + * \see cl_env_unplant() + */ +void cl_env_implant(struct lu_env *env, int *refcheck) +{ + struct cl_env *cle = cl_env_container(env); + + LASSERT(cle->ce_ref > 0); + + cl_env_attach(cle); + cl_env_get(refcheck); + CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle); +} +EXPORT_SYMBOL(cl_env_implant); + +/** + * Detach environment installed earlier by cl_env_implant(). + */ +void cl_env_unplant(struct lu_env *env, int *refcheck) +{ + struct cl_env *cle = cl_env_container(env); + + LASSERT(cle->ce_ref > 1); + + CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle); + + cl_env_detach(cle); + cl_env_put(env, refcheck); +} +EXPORT_SYMBOL(cl_env_unplant); + +struct lu_env *cl_env_nested_get(struct cl_env_nest *nest) +{ + struct lu_env *env; + + nest->cen_cookie = NULL; + env = cl_env_peek(&nest->cen_refcheck); + if (env != NULL) { + if (!cl_io_is_going(env)) + return env; + else { + cl_env_put(env, &nest->cen_refcheck); + nest->cen_cookie = cl_env_reenter(); + } + } + env = cl_env_get(&nest->cen_refcheck); + if (IS_ERR(env)) { + cl_env_reexit(nest->cen_cookie); + return env; + } + + LASSERT(!cl_io_is_going(env)); + return env; +} +EXPORT_SYMBOL(cl_env_nested_get); + +void cl_env_nested_put(struct cl_env_nest *nest, struct lu_env *env) +{ + cl_env_put(env, &nest->cen_refcheck); + cl_env_reexit(nest->cen_cookie); +} +EXPORT_SYMBOL(cl_env_nested_put); + +/** + * Converts struct cl_attr to struct ost_lvb. + * + * \see cl_lvb2attr + */ +void cl_attr2lvb(struct ost_lvb *lvb, const struct cl_attr *attr) +{ + lvb->lvb_size = attr->cat_size; + lvb->lvb_mtime = attr->cat_mtime; + lvb->lvb_atime = attr->cat_atime; + lvb->lvb_ctime = attr->cat_ctime; + lvb->lvb_blocks = attr->cat_blocks; +} +EXPORT_SYMBOL(cl_attr2lvb); + +/** + * Converts struct ost_lvb to struct cl_attr. + * + * \see cl_attr2lvb + */ +void cl_lvb2attr(struct cl_attr *attr, const struct ost_lvb *lvb) +{ + attr->cat_size = lvb->lvb_size; + attr->cat_mtime = lvb->lvb_mtime; + attr->cat_atime = lvb->lvb_atime; + attr->cat_ctime = lvb->lvb_ctime; + attr->cat_blocks = lvb->lvb_blocks; +} +EXPORT_SYMBOL(cl_lvb2attr); + +/***************************************************************************** + * + * Temporary prototype thing: mirror obd-devices into cl devices. 
+ * + */ + +struct cl_device *cl_type_setup(const struct lu_env *env, struct lu_site *site, + struct lu_device_type *ldt, + struct lu_device *next) +{ + const char *typename; + struct lu_device *d; + + LASSERT(ldt != NULL); + + typename = ldt->ldt_name; + d = ldt->ldt_ops->ldto_device_alloc(env, ldt, NULL); + if (!IS_ERR(d)) { + int rc; + + if (site != NULL) + d->ld_site = site; + rc = ldt->ldt_ops->ldto_device_init(env, d, typename, next); + if (rc == 0) { + lu_device_get(d); + lu_ref_add(&d->ld_reference, + "lu-stack", &lu_site_init); + } else { + ldt->ldt_ops->ldto_device_free(env, d); + CERROR("can't init device '%s', %d\n", typename, rc); + d = ERR_PTR(rc); + } + } else + CERROR("Cannot allocate device: '%s'\n", typename); + return lu2cl_dev(d); +} +EXPORT_SYMBOL(cl_type_setup); + +/** + * Finalize device stack by calling lu_stack_fini(). + */ +void cl_stack_fini(const struct lu_env *env, struct cl_device *cl) +{ + lu_stack_fini(env, cl2lu_dev(cl)); +} +EXPORT_SYMBOL(cl_stack_fini); + +int cl_lock_init(void); +void cl_lock_fini(void); + +int cl_page_init(void); +void cl_page_fini(void); + +static struct lu_context_key cl_key; + +struct cl_thread_info *cl_env_info(const struct lu_env *env) +{ + return lu_context_key_get(&env->le_ctx, &cl_key); +} + +/* defines cl0_key_{init,fini}() */ +LU_KEY_INIT_FINI(cl0, struct cl_thread_info); + +static void *cl_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct cl_thread_info *info; + + info = cl0_key_init(ctx, key); + if (!IS_ERR(info)) { + int i; + + for (i = 0; i < ARRAY_SIZE(info->clt_counters); ++i) + lu_ref_init(&info->clt_counters[i].ctc_locks_locked); + } + return info; +} + +static void cl_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct cl_thread_info *info; + int i; + + info = data; + for (i = 0; i < ARRAY_SIZE(info->clt_counters); ++i) + lu_ref_fini(&info->clt_counters[i].ctc_locks_locked); + cl0_key_fini(ctx, key, data); +} + +static void cl_key_exit(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct cl_thread_info *info = data; + int i; + + for (i = 0; i < ARRAY_SIZE(info->clt_counters); ++i) { + LASSERT(info->clt_counters[i].ctc_nr_held == 0); + LASSERT(info->clt_counters[i].ctc_nr_used == 0); + LASSERT(info->clt_counters[i].ctc_nr_locks_acquired == 0); + LASSERT(info->clt_counters[i].ctc_nr_locks_locked == 0); + lu_ref_fini(&info->clt_counters[i].ctc_locks_locked); + lu_ref_init(&info->clt_counters[i].ctc_locks_locked); + } +} + +static struct lu_context_key cl_key = { + .lct_tags = LCT_CL_THREAD, + .lct_init = cl_key_init, + .lct_fini = cl_key_fini, + .lct_exit = cl_key_exit +}; + +static struct lu_kmem_descr cl_object_caches[] = { + { + .ckd_cache = &cl_env_kmem, + .ckd_name = "cl_env_kmem", + .ckd_size = sizeof (struct cl_env) + }, + { + .ckd_cache = NULL + } +}; + +/** + * Global initialization of cl-data. Create kmem caches, register + * lu_context_key's, etc. 
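+ *
+ * On failure the already-initialized pieces are unwound in reverse order
+ * (the out_* labels below); a successful call is undone by cl_global_fini().
+ * A hypothetical caller, e.g. module-init code, would simply do:
+ *
+ *	rc = cl_global_init();
+ *	if (rc != 0)
+ *		return rc;
+ *	...
+ *	cl_global_fini();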
+ * + * \see cl_global_fini() + */ +int cl_global_init(void) +{ + int result; + + result = cl_env_store_init(); + if (result) + return result; + + result = lu_kmem_init(cl_object_caches); + if (result) + goto out_store; + + LU_CONTEXT_KEY_INIT(&cl_key); + result = lu_context_key_register(&cl_key); + if (result) + goto out_kmem; + + result = cl_lock_init(); + if (result) + goto out_context; + + result = cl_page_init(); + if (result) + goto out_lock; + + return 0; +out_lock: + cl_lock_fini(); +out_context: + lu_context_key_degister(&cl_key); +out_kmem: + lu_kmem_fini(cl_object_caches); +out_store: + cl_env_store_fini(); + return result; +} + +/** + * Finalization of global cl-data. Dual to cl_global_init(). + */ +void cl_global_fini(void) +{ + cl_lock_fini(); + cl_page_fini(); + lu_context_key_degister(&cl_key); + lu_kmem_fini(cl_object_caches); + cl_env_store_fini(); +} diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/cl_page.c b/kernel/drivers/staging/lustre/lustre/obdclass/cl_page.c new file mode 100644 index 000000000..b7dd04808 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/cl_page.c @@ -0,0 +1,1553 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Client Lustre Page. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include "../../include/linux/libcfs/libcfs.h" +#include "../include/obd_class.h" +#include "../include/obd_support.h" +#include + +#include "../include/cl_object.h" +#include "cl_internal.h" + +static void cl_page_delete0(const struct lu_env *env, struct cl_page *pg, + int radix); + +# define PASSERT(env, page, expr) \ + do { \ + if (unlikely(!(expr))) { \ + CL_PAGE_DEBUG(D_ERROR, (env), (page), #expr "\n"); \ + LASSERT(0); \ + } \ + } while (0) + +# define PINVRNT(env, page, exp) \ + ((void)sizeof(env), (void)sizeof(page), (void)sizeof !!(exp)) + +/* Disable page statistic by default due to huge performance penalty. */ +#define CS_PAGE_INC(o, item) +#define CS_PAGE_DEC(o, item) +#define CS_PAGESTATE_INC(o, state) +#define CS_PAGESTATE_DEC(o, state) + +/** + * Internal version of cl_page_top, it should be called if the page is + * known to be not freed, says with page referenced, or radix tree lock held, + * or page owned. 
+ */ +static struct cl_page *cl_page_top_trusted(struct cl_page *page) +{ + while (page->cp_parent != NULL) + page = page->cp_parent; + return page; +} + +/** + * Internal version of cl_page_get(). + * + * This function can be used to obtain initial reference to previously + * unreferenced cached object. It can be called only if concurrent page + * reclamation is somehow prevented, e.g., by locking page radix-tree + * (cl_object_header::hdr->coh_page_guard), or by keeping a lock on a VM page, + * associated with \a page. + * + * Use with care! Not exported. + */ +static void cl_page_get_trust(struct cl_page *page) +{ + LASSERT(atomic_read(&page->cp_ref) > 0); + atomic_inc(&page->cp_ref); +} + +/** + * Returns a slice within a page, corresponding to the given layer in the + * device stack. + * + * \see cl_lock_at() + */ +static const struct cl_page_slice * +cl_page_at_trusted(const struct cl_page *page, + const struct lu_device_type *dtype) +{ + const struct cl_page_slice *slice; + + page = cl_page_top_trusted((struct cl_page *)page); + do { + list_for_each_entry(slice, &page->cp_layers, cpl_linkage) { + if (slice->cpl_obj->co_lu.lo_dev->ld_type == dtype) + return slice; + } + page = page->cp_child; + } while (page != NULL); + return NULL; +} + +/** + * Returns a page with given index in the given object, or NULL if no page is + * found. Acquires a reference on \a page. + * + * Locking: called under cl_object_header::coh_page_guard spin-lock. + */ +struct cl_page *cl_page_lookup(struct cl_object_header *hdr, pgoff_t index) +{ + struct cl_page *page; + + assert_spin_locked(&hdr->coh_page_guard); + + page = radix_tree_lookup(&hdr->coh_tree, index); + if (page != NULL) + cl_page_get_trust(page); + return page; +} +EXPORT_SYMBOL(cl_page_lookup); + +/** + * Returns a list of pages by a given [start, end] of \a obj. + * + * \param resched If not NULL, then we give up before hogging CPU for too + * long and set *resched = 1, in that case caller should implement a retry + * logic. + * + * Gang tree lookup (radix_tree_gang_lookup()) optimization is absolutely + * crucial in the face of [offset, EOF] locks. + * + * Return at least one page in @queue unless there is no covered page. + */ +int cl_page_gang_lookup(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io, pgoff_t start, pgoff_t end, + cl_page_gang_cb_t cb, void *cbdata) +{ + struct cl_object_header *hdr; + struct cl_page *page; + struct cl_page **pvec; + const struct cl_page_slice *slice; + const struct lu_device_type *dtype; + pgoff_t idx; + unsigned int nr; + unsigned int i; + unsigned int j; + int res = CLP_GANG_OKAY; + int tree_lock = 1; + + idx = start; + hdr = cl_object_header(obj); + pvec = cl_env_info(env)->clt_pvec; + dtype = cl_object_top(obj)->co_lu.lo_dev->ld_type; + spin_lock(&hdr->coh_page_guard); + while ((nr = radix_tree_gang_lookup(&hdr->coh_tree, (void **)pvec, + idx, CLT_PVEC_SIZE)) > 0) { + int end_of_region = 0; + idx = pvec[nr - 1]->cp_index + 1; + for (i = 0, j = 0; i < nr; ++i) { + page = pvec[i]; + pvec[i] = NULL; + + LASSERT(page->cp_type == CPT_CACHEABLE); + if (page->cp_index > end) { + end_of_region = 1; + break; + } + if (page->cp_state == CPS_FREEING) + continue; + + slice = cl_page_at_trusted(page, dtype); + /* + * Pages for lsm-less file has no underneath sub-page + * for osc, in case of ... + */ + PASSERT(env, page, slice != NULL); + + page = slice->cpl_page; + /* + * Can safely call cl_page_get_trust() under + * radix-tree spin-lock. 
+ * + * XXX not true, because @page is from object another + * than @hdr and protected by different tree lock. + */ + cl_page_get_trust(page); + lu_ref_add_atomic(&page->cp_reference, + "gang_lookup", current); + pvec[j++] = page; + } + + /* + * Here a delicate locking dance is performed. Current thread + * holds a reference to a page, but has to own it before it + * can be placed into queue. Owning implies waiting, so + * radix-tree lock is to be released. After a wait one has to + * check that pages weren't truncated (cl_page_own() returns + * error in the latter case). + */ + spin_unlock(&hdr->coh_page_guard); + tree_lock = 0; + + for (i = 0; i < j; ++i) { + page = pvec[i]; + if (res == CLP_GANG_OKAY) + res = (*cb)(env, io, page, cbdata); + lu_ref_del(&page->cp_reference, + "gang_lookup", current); + cl_page_put(env, page); + } + if (nr < CLT_PVEC_SIZE || end_of_region) + break; + + if (res == CLP_GANG_OKAY && need_resched()) + res = CLP_GANG_RESCHED; + if (res != CLP_GANG_OKAY) + break; + + spin_lock(&hdr->coh_page_guard); + tree_lock = 1; + } + if (tree_lock) + spin_unlock(&hdr->coh_page_guard); + return res; +} +EXPORT_SYMBOL(cl_page_gang_lookup); + +static void cl_page_free(const struct lu_env *env, struct cl_page *page) +{ + struct cl_object *obj = page->cp_obj; + int pagesize = cl_object_header(obj)->coh_page_bufsize; + + PASSERT(env, page, list_empty(&page->cp_batch)); + PASSERT(env, page, page->cp_owner == NULL); + PASSERT(env, page, page->cp_req == NULL); + PASSERT(env, page, page->cp_parent == NULL); + PASSERT(env, page, page->cp_state == CPS_FREEING); + + might_sleep(); + while (!list_empty(&page->cp_layers)) { + struct cl_page_slice *slice; + + slice = list_entry(page->cp_layers.next, + struct cl_page_slice, cpl_linkage); + list_del_init(page->cp_layers.next); + slice->cpl_ops->cpo_fini(env, slice); + } + CS_PAGE_DEC(obj, total); + CS_PAGESTATE_DEC(obj, page->cp_state); + lu_object_ref_del_at(&obj->co_lu, &page->cp_obj_ref, "cl_page", page); + cl_object_put(env, obj); + lu_ref_fini(&page->cp_reference); + OBD_FREE(page, pagesize); +} + +/** + * Helper function updating page state. This is the only place in the code + * where cl_page::cp_state field is mutated. + */ +static inline void cl_page_state_set_trust(struct cl_page *page, + enum cl_page_state state) +{ + /* bypass const. 
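+	 * cl_page::cp_state is declared const precisely so that layers cannot
+	 * update it directly; casting the const away here keeps this helper
+	 * the only writer of the field, as the comment above notes.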
*/ + *(enum cl_page_state *)&page->cp_state = state; +} + +static struct cl_page *cl_page_alloc(const struct lu_env *env, + struct cl_object *o, pgoff_t ind, struct page *vmpage, + enum cl_page_type type) +{ + struct cl_page *page; + struct lu_object_header *head; + + OBD_ALLOC_GFP(page, cl_object_header(o)->coh_page_bufsize, + GFP_NOFS); + if (page != NULL) { + int result = 0; + atomic_set(&page->cp_ref, 1); + if (type == CPT_CACHEABLE) /* for radix tree */ + atomic_inc(&page->cp_ref); + page->cp_obj = o; + cl_object_get(o); + lu_object_ref_add_at(&o->co_lu, &page->cp_obj_ref, "cl_page", + page); + page->cp_index = ind; + cl_page_state_set_trust(page, CPS_CACHED); + page->cp_type = type; + INIT_LIST_HEAD(&page->cp_layers); + INIT_LIST_HEAD(&page->cp_batch); + INIT_LIST_HEAD(&page->cp_flight); + mutex_init(&page->cp_mutex); + lu_ref_init(&page->cp_reference); + head = o->co_lu.lo_header; + list_for_each_entry(o, &head->loh_layers, + co_lu.lo_linkage) { + if (o->co_ops->coo_page_init != NULL) { + result = o->co_ops->coo_page_init(env, o, + page, vmpage); + if (result != 0) { + cl_page_delete0(env, page, 0); + cl_page_free(env, page); + page = ERR_PTR(result); + break; + } + } + } + if (result == 0) { + CS_PAGE_INC(o, total); + CS_PAGE_INC(o, create); + CS_PAGESTATE_DEC(o, CPS_CACHED); + } + } else { + page = ERR_PTR(-ENOMEM); + } + return page; +} + +/** + * Returns a cl_page with index \a idx at the object \a o, and associated with + * the VM page \a vmpage. + * + * This is the main entry point into the cl_page caching interface. First, a + * cache (implemented as a per-object radix tree) is consulted. If page is + * found there, it is returned immediately. Otherwise new page is allocated + * and returned. In any case, additional reference to page is acquired. + * + * \see cl_object_find(), cl_lock_find() + */ +static struct cl_page *cl_page_find0(const struct lu_env *env, + struct cl_object *o, + pgoff_t idx, struct page *vmpage, + enum cl_page_type type, + struct cl_page *parent) +{ + struct cl_page *page = NULL; + struct cl_page *ghost = NULL; + struct cl_object_header *hdr; + int err; + + LASSERT(type == CPT_CACHEABLE || type == CPT_TRANSIENT); + might_sleep(); + + hdr = cl_object_header(o); + CS_PAGE_INC(o, lookup); + + CDEBUG(D_PAGE, "%lu@"DFID" %p %lx %d\n", + idx, PFID(&hdr->coh_lu.loh_fid), vmpage, vmpage->private, type); + /* fast path. */ + if (type == CPT_CACHEABLE) { + /* vmpage lock is used to protect the child/parent + * relationship */ + KLASSERT(PageLocked(vmpage)); + /* + * cl_vmpage_page() can be called here without any locks as + * + * - "vmpage" is locked (which prevents ->private from + * concurrent updates), and + * + * - "o" cannot be destroyed while current thread holds a + * reference on it. + */ + page = cl_vmpage_page(vmpage, o); + PINVRNT(env, page, + ergo(page != NULL, + cl_page_vmpage(env, page) == vmpage && + (void *)radix_tree_lookup(&hdr->coh_tree, + idx) == page)); + } + + if (page != NULL) { + CS_PAGE_INC(o, hit); + return page; + } + + /* allocate and initialize cl_page */ + page = cl_page_alloc(env, o, idx, vmpage, type); + if (IS_ERR(page)) + return page; + + if (type == CPT_TRANSIENT) { + if (parent) { + LASSERT(page->cp_parent == NULL); + page->cp_parent = parent; + parent->cp_child = page; + } + return page; + } + + /* + * XXX optimization: use radix_tree_preload() here, and change tree + * gfp mask to GFP_KERNEL in cl_object_header_init(). 
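+	 *
+	 * A sketch of that optimization (not implemented here): with a
+	 * GFP_KERNEL tree the insert path would become
+	 *
+	 *	if (radix_tree_preload(GFP_KERNEL) == 0) {
+	 *		spin_lock(&hdr->coh_page_guard);
+	 *		err = radix_tree_insert(&hdr->coh_tree, idx, page);
+	 *		...
+	 *		spin_unlock(&hdr->coh_page_guard);
+	 *		radix_tree_preload_end();
+	 *	}
+	 *
+	 * so that tree-node allocation happens outside the spin-lock.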
+ */ + spin_lock(&hdr->coh_page_guard); + err = radix_tree_insert(&hdr->coh_tree, idx, page); + if (err != 0) { + ghost = page; + /* + * Noted by Jay: a lock on \a vmpage protects cl_page_find() + * from this race, but + * + * 0. it's better to have cl_page interface "locally + * consistent" so that its correctness can be reasoned + * about without appealing to the (obscure world of) VM + * locking. + * + * 1. handling this race allows ->coh_tree to remain + * consistent even when VM locking is somehow busted, + * which is very useful during diagnosing and debugging. + */ + page = ERR_PTR(err); + CL_PAGE_DEBUG(D_ERROR, env, ghost, + "fail to insert into radix tree: %d\n", err); + } else { + if (parent) { + LASSERT(page->cp_parent == NULL); + page->cp_parent = parent; + parent->cp_child = page; + } + hdr->coh_pages++; + } + spin_unlock(&hdr->coh_page_guard); + + if (unlikely(ghost != NULL)) { + cl_page_delete0(env, ghost, 0); + cl_page_free(env, ghost); + } + return page; +} + +struct cl_page *cl_page_find(const struct lu_env *env, struct cl_object *o, + pgoff_t idx, struct page *vmpage, + enum cl_page_type type) +{ + return cl_page_find0(env, o, idx, vmpage, type, NULL); +} +EXPORT_SYMBOL(cl_page_find); + + +struct cl_page *cl_page_find_sub(const struct lu_env *env, struct cl_object *o, + pgoff_t idx, struct page *vmpage, + struct cl_page *parent) +{ + return cl_page_find0(env, o, idx, vmpage, parent->cp_type, parent); +} +EXPORT_SYMBOL(cl_page_find_sub); + +static inline int cl_page_invariant(const struct cl_page *pg) +{ + struct cl_object_header *header; + struct cl_page *parent; + struct cl_page *child; + struct cl_io *owner; + + /* + * Page invariant is protected by a VM lock. + */ + LINVRNT(cl_page_is_vmlocked(NULL, pg)); + + header = cl_object_header(pg->cp_obj); + parent = pg->cp_parent; + child = pg->cp_child; + owner = pg->cp_owner; + + return cl_page_in_use(pg) && + ergo(parent != NULL, parent->cp_child == pg) && + ergo(child != NULL, child->cp_parent == pg) && + ergo(child != NULL, pg->cp_obj != child->cp_obj) && + ergo(parent != NULL, pg->cp_obj != parent->cp_obj) && + ergo(owner != NULL && parent != NULL, + parent->cp_owner == pg->cp_owner->ci_parent) && + ergo(owner != NULL && child != NULL, + child->cp_owner->ci_parent == owner) && + /* + * Either page is early in initialization (has neither child + * nor parent yet), or it is in the object radix tree. + */ + ergo(pg->cp_state < CPS_FREEING && pg->cp_type == CPT_CACHEABLE, + (void *)radix_tree_lookup(&header->coh_tree, + pg->cp_index) == pg || + (child == NULL && parent == NULL)); +} + +static void cl_page_state_set0(const struct lu_env *env, + struct cl_page *page, enum cl_page_state state) +{ + enum cl_page_state old; + + /* + * Matrix of allowed state transitions [old][new], for sanity + * checking. 
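+	 *
+	 * In words: PAGEIN is entered only from OWNED, PAGEOUT from OWNED or
+	 * directly from CACHED (write-out of a cached page), both complete
+	 * back to CACHED, and FREEING is terminal; the zero entries are the
+	 * forbidden transitions.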
+ */ + static const int allowed_transitions[CPS_NR][CPS_NR] = { + [CPS_CACHED] = { + [CPS_CACHED] = 0, + [CPS_OWNED] = 1, /* io finds existing cached page */ + [CPS_PAGEIN] = 0, + [CPS_PAGEOUT] = 1, /* write-out from the cache */ + [CPS_FREEING] = 1, /* eviction on the memory pressure */ + }, + [CPS_OWNED] = { + [CPS_CACHED] = 1, /* release to the cache */ + [CPS_OWNED] = 0, + [CPS_PAGEIN] = 1, /* start read immediately */ + [CPS_PAGEOUT] = 1, /* start write immediately */ + [CPS_FREEING] = 1, /* lock invalidation or truncate */ + }, + [CPS_PAGEIN] = { + [CPS_CACHED] = 1, /* io completion */ + [CPS_OWNED] = 0, + [CPS_PAGEIN] = 0, + [CPS_PAGEOUT] = 0, + [CPS_FREEING] = 0, + }, + [CPS_PAGEOUT] = { + [CPS_CACHED] = 1, /* io completion */ + [CPS_OWNED] = 0, + [CPS_PAGEIN] = 0, + [CPS_PAGEOUT] = 0, + [CPS_FREEING] = 0, + }, + [CPS_FREEING] = { + [CPS_CACHED] = 0, + [CPS_OWNED] = 0, + [CPS_PAGEIN] = 0, + [CPS_PAGEOUT] = 0, + [CPS_FREEING] = 0, + } + }; + + old = page->cp_state; + PASSERT(env, page, allowed_transitions[old][state]); + CL_PAGE_HEADER(D_TRACE, env, page, "%d -> %d\n", old, state); + for (; page != NULL; page = page->cp_child) { + PASSERT(env, page, page->cp_state == old); + PASSERT(env, page, + equi(state == CPS_OWNED, page->cp_owner != NULL)); + + CS_PAGESTATE_DEC(page->cp_obj, page->cp_state); + CS_PAGESTATE_INC(page->cp_obj, state); + cl_page_state_set_trust(page, state); + } +} + +static void cl_page_state_set(const struct lu_env *env, + struct cl_page *page, enum cl_page_state state) +{ + cl_page_state_set0(env, page, state); +} + +/** + * Acquires an additional reference to a page. + * + * This can be called only by caller already possessing a reference to \a + * page. + * + * \see cl_object_get(), cl_lock_get(). + */ +void cl_page_get(struct cl_page *page) +{ + cl_page_get_trust(page); +} +EXPORT_SYMBOL(cl_page_get); + +/** + * Releases a reference to a page. + * + * When last reference is released, page is returned to the cache, unless it + * is in cl_page_state::CPS_FREEING state, in which case it is immediately + * destroyed. + * + * \see cl_object_put(), cl_lock_put(). + */ +void cl_page_put(const struct lu_env *env, struct cl_page *page) +{ + PASSERT(env, page, atomic_read(&page->cp_ref) > !!page->cp_parent); + + CL_PAGE_HEADER(D_TRACE, env, page, "%d\n", + atomic_read(&page->cp_ref)); + + if (atomic_dec_and_test(&page->cp_ref)) { + LASSERT(page->cp_state == CPS_FREEING); + + LASSERT(atomic_read(&page->cp_ref) == 0); + PASSERT(env, page, page->cp_owner == NULL); + PASSERT(env, page, list_empty(&page->cp_batch)); + /* + * Page is no longer reachable by other threads. Tear + * it down. + */ + cl_page_free(env, page); + } +} +EXPORT_SYMBOL(cl_page_put); + +/** + * Returns a VM page associated with a given cl_page. + */ +struct page *cl_page_vmpage(const struct lu_env *env, struct cl_page *page) +{ + const struct cl_page_slice *slice; + + /* + * Find uppermost layer with ->cpo_vmpage() method, and return its + * result. + */ + page = cl_page_top(page); + do { + list_for_each_entry(slice, &page->cp_layers, cpl_linkage) { + if (slice->cpl_ops->cpo_vmpage != NULL) + return slice->cpl_ops->cpo_vmpage(env, slice); + } + page = page->cp_child; + } while (page != NULL); + LBUG(); /* ->cpo_vmpage() has to be defined somewhere in the stack */ +} +EXPORT_SYMBOL(cl_page_vmpage); + +/** + * Returns a cl_page associated with a VM page, and given cl_object. 
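+ *
+ * The caller must hold the lock on \a vmpage (asserted below). When a
+ * matching page is found a reference is acquired on it, to be dropped with
+ * cl_page_put(); NULL is returned if \a vmpage has no cl_page attached or
+ * none of its layers belongs to \a obj.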
+ */ +struct cl_page *cl_vmpage_page(struct page *vmpage, struct cl_object *obj) +{ + struct cl_page *top; + struct cl_page *page; + + KLASSERT(PageLocked(vmpage)); + + /* + * NOTE: absence of races and liveness of data are guaranteed by page + * lock on a "vmpage". That works because object destruction has + * bottom-to-top pass. + */ + + /* + * This loop assumes that ->private points to the top-most page. This + * can be rectified easily. + */ + top = (struct cl_page *)vmpage->private; + if (top == NULL) + return NULL; + + for (page = top; page != NULL; page = page->cp_child) { + if (cl_object_same(page->cp_obj, obj)) { + cl_page_get_trust(page); + break; + } + } + LASSERT(ergo(page, page->cp_type == CPT_CACHEABLE)); + return page; +} +EXPORT_SYMBOL(cl_vmpage_page); + +/** + * Returns the top-page for a given page. + * + * \see cl_object_top(), cl_io_top() + */ +struct cl_page *cl_page_top(struct cl_page *page) +{ + return cl_page_top_trusted(page); +} +EXPORT_SYMBOL(cl_page_top); + +const struct cl_page_slice *cl_page_at(const struct cl_page *page, + const struct lu_device_type *dtype) +{ + return cl_page_at_trusted(page, dtype); +} +EXPORT_SYMBOL(cl_page_at); + +#define CL_PAGE_OP(opname) offsetof(struct cl_page_operations, opname) + +#define CL_PAGE_INVOKE(_env, _page, _op, _proto, ...) \ +({ \ + const struct lu_env *__env = (_env); \ + struct cl_page *__page = (_page); \ + const struct cl_page_slice *__scan; \ + int __result; \ + ptrdiff_t __op = (_op); \ + int (*__method)_proto; \ + \ + __result = 0; \ + __page = cl_page_top(__page); \ + do { \ + list_for_each_entry(__scan, &__page->cp_layers, \ + cpl_linkage) { \ + __method = *(void **)((char *)__scan->cpl_ops + \ + __op); \ + if (__method != NULL) { \ + __result = (*__method)(__env, __scan, \ + ## __VA_ARGS__); \ + if (__result != 0) \ + break; \ + } \ + } \ + __page = __page->cp_child; \ + } while (__page != NULL && __result == 0); \ + if (__result > 0) \ + __result = 0; \ + __result; \ +}) + +#define CL_PAGE_INVOID(_env, _page, _op, _proto, ...) \ +do { \ + const struct lu_env *__env = (_env); \ + struct cl_page *__page = (_page); \ + const struct cl_page_slice *__scan; \ + ptrdiff_t __op = (_op); \ + void (*__method)_proto; \ + \ + __page = cl_page_top(__page); \ + do { \ + list_for_each_entry(__scan, &__page->cp_layers, \ + cpl_linkage) { \ + __method = *(void **)((char *)__scan->cpl_ops + \ + __op); \ + if (__method != NULL) \ + (*__method)(__env, __scan, \ + ## __VA_ARGS__); \ + } \ + __page = __page->cp_child; \ + } while (__page != NULL); \ +} while (0) + +#define CL_PAGE_INVOID_REVERSE(_env, _page, _op, _proto, ...) \ +do { \ + const struct lu_env *__env = (_env); \ + struct cl_page *__page = (_page); \ + const struct cl_page_slice *__scan; \ + ptrdiff_t __op = (_op); \ + void (*__method)_proto; \ + \ + /* get to the bottom page. 
*/ \ + while (__page->cp_child != NULL) \ + __page = __page->cp_child; \ + do { \ + list_for_each_entry_reverse(__scan, &__page->cp_layers, \ + cpl_linkage) { \ + __method = *(void **)((char *)__scan->cpl_ops + \ + __op); \ + if (__method != NULL) \ + (*__method)(__env, __scan, \ + ## __VA_ARGS__); \ + } \ + __page = __page->cp_parent; \ + } while (__page != NULL); \ +} while (0) + +static int cl_page_invoke(const struct lu_env *env, + struct cl_io *io, struct cl_page *page, ptrdiff_t op) + +{ + PINVRNT(env, page, cl_object_same(page->cp_obj, io->ci_obj)); + return CL_PAGE_INVOKE(env, page, op, + (const struct lu_env *, + const struct cl_page_slice *, struct cl_io *), + io); +} + +static void cl_page_invoid(const struct lu_env *env, + struct cl_io *io, struct cl_page *page, ptrdiff_t op) + +{ + PINVRNT(env, page, cl_object_same(page->cp_obj, io->ci_obj)); + CL_PAGE_INVOID(env, page, op, + (const struct lu_env *, + const struct cl_page_slice *, struct cl_io *), io); +} + +static void cl_page_owner_clear(struct cl_page *page) +{ + for (page = cl_page_top(page); page != NULL; page = page->cp_child) { + if (page->cp_owner != NULL) { + LASSERT(page->cp_owner->ci_owned_nr > 0); + page->cp_owner->ci_owned_nr--; + page->cp_owner = NULL; + page->cp_task = NULL; + } + } +} + +static void cl_page_owner_set(struct cl_page *page) +{ + for (page = cl_page_top(page); page != NULL; page = page->cp_child) { + LASSERT(page->cp_owner != NULL); + page->cp_owner->ci_owned_nr++; + } +} + +void cl_page_disown0(const struct lu_env *env, + struct cl_io *io, struct cl_page *pg) +{ + enum cl_page_state state; + + state = pg->cp_state; + PINVRNT(env, pg, state == CPS_OWNED || state == CPS_FREEING); + PINVRNT(env, pg, cl_page_invariant(pg)); + cl_page_owner_clear(pg); + + if (state == CPS_OWNED) + cl_page_state_set(env, pg, CPS_CACHED); + /* + * Completion call-backs are executed in the bottom-up order, so that + * uppermost layer (llite), responsible for VFS/VM interaction runs + * last and can release locks safely. + */ + CL_PAGE_INVOID_REVERSE(env, pg, CL_PAGE_OP(cpo_disown), + (const struct lu_env *, + const struct cl_page_slice *, struct cl_io *), + io); +} + +/** + * returns true, iff page is owned by the given io. + */ +int cl_page_is_owned(const struct cl_page *pg, const struct cl_io *io) +{ + LINVRNT(cl_object_same(pg->cp_obj, io->ci_obj)); + return pg->cp_state == CPS_OWNED && pg->cp_owner == io; +} +EXPORT_SYMBOL(cl_page_is_owned); + +/** + * Try to own a page by IO. + * + * Waits until page is in cl_page_state::CPS_CACHED state, and then switch it + * into cl_page_state::CPS_OWNED state. + * + * \pre !cl_page_is_owned(pg, io) + * \post result == 0 iff cl_page_is_owned(pg, io) + * + * \retval 0 success + * + * \retval -ve failure, e.g., page was destroyed (and landed in + * cl_page_state::CPS_FREEING instead of cl_page_state::CPS_CACHED). + * or, page was owned by another thread, or in IO. 
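+ *
+ * Typical usage (sketch; "io" and "pg" are assumed to come from the
+ * caller's cl_io and cl_page_find() context):
+ *
+ *      if (cl_page_own(env, io, pg) == 0) {
+ *              ... operate on the owned page ...
+ *              cl_page_disown(env, io, pg);
+ *      }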
+ * + * \see cl_page_disown() + * \see cl_page_operations::cpo_own() + * \see cl_page_own_try() + * \see cl_page_own + */ +static int cl_page_own0(const struct lu_env *env, struct cl_io *io, + struct cl_page *pg, int nonblock) +{ + int result; + + PINVRNT(env, pg, !cl_page_is_owned(pg, io)); + + pg = cl_page_top(pg); + io = cl_io_top(io); + + if (pg->cp_state == CPS_FREEING) { + result = -ENOENT; + } else { + result = CL_PAGE_INVOKE(env, pg, CL_PAGE_OP(cpo_own), + (const struct lu_env *, + const struct cl_page_slice *, + struct cl_io *, int), + io, nonblock); + if (result == 0) { + PASSERT(env, pg, pg->cp_owner == NULL); + PASSERT(env, pg, pg->cp_req == NULL); + pg->cp_owner = io; + pg->cp_task = current; + cl_page_owner_set(pg); + if (pg->cp_state != CPS_FREEING) { + cl_page_state_set(env, pg, CPS_OWNED); + } else { + cl_page_disown0(env, io, pg); + result = -ENOENT; + } + } + } + PINVRNT(env, pg, ergo(result == 0, cl_page_invariant(pg))); + return result; +} + +/** + * Own a page, might be blocked. + * + * \see cl_page_own0() + */ +int cl_page_own(const struct lu_env *env, struct cl_io *io, struct cl_page *pg) +{ + return cl_page_own0(env, io, pg, 0); +} +EXPORT_SYMBOL(cl_page_own); + +/** + * Nonblock version of cl_page_own(). + * + * \see cl_page_own0() + */ +int cl_page_own_try(const struct lu_env *env, struct cl_io *io, + struct cl_page *pg) +{ + return cl_page_own0(env, io, pg, 1); +} +EXPORT_SYMBOL(cl_page_own_try); + + +/** + * Assume page ownership. + * + * Called when page is already locked by the hosting VM. + * + * \pre !cl_page_is_owned(pg, io) + * \post cl_page_is_owned(pg, io) + * + * \see cl_page_operations::cpo_assume() + */ +void cl_page_assume(const struct lu_env *env, + struct cl_io *io, struct cl_page *pg) +{ + PINVRNT(env, pg, cl_object_same(pg->cp_obj, io->ci_obj)); + + pg = cl_page_top(pg); + io = cl_io_top(io); + + cl_page_invoid(env, io, pg, CL_PAGE_OP(cpo_assume)); + PASSERT(env, pg, pg->cp_owner == NULL); + pg->cp_owner = io; + pg->cp_task = current; + cl_page_owner_set(pg); + cl_page_state_set(env, pg, CPS_OWNED); +} +EXPORT_SYMBOL(cl_page_assume); + +/** + * Releases page ownership without unlocking the page. + * + * Moves page into cl_page_state::CPS_CACHED without releasing a lock on the + * underlying VM page (as VM is supposed to do this itself). + * + * \pre cl_page_is_owned(pg, io) + * \post !cl_page_is_owned(pg, io) + * + * \see cl_page_assume() + */ +void cl_page_unassume(const struct lu_env *env, + struct cl_io *io, struct cl_page *pg) +{ + PINVRNT(env, pg, cl_page_is_owned(pg, io)); + PINVRNT(env, pg, cl_page_invariant(pg)); + + pg = cl_page_top(pg); + io = cl_io_top(io); + cl_page_owner_clear(pg); + cl_page_state_set(env, pg, CPS_CACHED); + CL_PAGE_INVOID_REVERSE(env, pg, CL_PAGE_OP(cpo_unassume), + (const struct lu_env *, + const struct cl_page_slice *, struct cl_io *), + io); +} +EXPORT_SYMBOL(cl_page_unassume); + +/** + * Releases page ownership. + * + * Moves page into cl_page_state::CPS_CACHED. + * + * \pre cl_page_is_owned(pg, io) + * \post !cl_page_is_owned(pg, io) + * + * \see cl_page_own() + * \see cl_page_operations::cpo_disown() + */ +void cl_page_disown(const struct lu_env *env, + struct cl_io *io, struct cl_page *pg) +{ + PINVRNT(env, pg, cl_page_is_owned(pg, io)); + + pg = cl_page_top(pg); + io = cl_io_top(io); + cl_page_disown0(env, io, pg); +} +EXPORT_SYMBOL(cl_page_disown); + +/** + * Called when page is to be removed from the object, e.g., as a result of + * truncate. + * + * Calls cl_page_operations::cpo_discard() top-to-bottom. 
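+ *
+ * See page_prune_cb() below for the usual own -> unmap -> discard -> disown
+ * sequence used when an object's cached pages are pruned.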
+ *
+ * \pre cl_page_is_owned(pg, io)
+ *
+ * \see cl_page_operations::cpo_discard()
+ */
+void cl_page_discard(const struct lu_env *env,
+		     struct cl_io *io, struct cl_page *pg)
+{
+	PINVRNT(env, pg, cl_page_is_owned(pg, io));
+	PINVRNT(env, pg, cl_page_invariant(pg));
+
+	cl_page_invoid(env, io, pg, CL_PAGE_OP(cpo_discard));
+}
+EXPORT_SYMBOL(cl_page_discard);
+
+/**
+ * Version of cl_page_delete() that can be called for not fully constructed
+ * pages, e.g., in an error handling cl_page_find()->cl_page_delete0()
+ * path. Doesn't check page invariant.
+ */
+static void cl_page_delete0(const struct lu_env *env, struct cl_page *pg,
+			    int radix)
+{
+	struct cl_page *tmp = pg;
+
+	PASSERT(env, pg, pg == cl_page_top(pg));
+	PASSERT(env, pg, pg->cp_state != CPS_FREEING);
+
+	/*
+	 * Sever all ways to obtain new pointers to @pg.
+	 */
+	cl_page_owner_clear(pg);
+
+	/*
+	 * Unexport the page first, before freeing it, so that
+	 * the page content is considered to be invalid.
+	 * We have to do this because a CPS_FREEING cl_page may
+	 * be NOT under the protection of a cl_lock.
+	 * Afterwards, if this page is found by other threads, then this
+	 * page will be forced to reread.
+	 */
+	cl_page_export(env, pg, 0);
+	cl_page_state_set0(env, pg, CPS_FREEING);
+
+	CL_PAGE_INVOID(env, pg, CL_PAGE_OP(cpo_delete),
+		       (const struct lu_env *, const struct cl_page_slice *));
+
+	if (tmp->cp_type == CPT_CACHEABLE) {
+		if (!radix)
+			/* !radix means that @pg is not yet in the radix tree,
+			 * skip removing it.
+			 */
+			tmp = pg->cp_child;
+		for (; tmp != NULL; tmp = tmp->cp_child) {
+			void *value;
+			struct cl_object_header *hdr;
+
+			hdr = cl_object_header(tmp->cp_obj);
+			spin_lock(&hdr->coh_page_guard);
+			value = radix_tree_delete(&hdr->coh_tree,
+						  tmp->cp_index);
+			PASSERT(env, tmp, value == tmp);
+			PASSERT(env, tmp, hdr->coh_pages > 0);
+			hdr->coh_pages--;
+			spin_unlock(&hdr->coh_page_guard);
+			cl_page_put(env, tmp);
+		}
+	}
+}
+
+/**
+ * Called when a decision is made to throw page out of memory.
+ *
+ * Notifies all layers about page destruction by calling
+ * cl_page_operations::cpo_delete() method top-to-bottom.
+ *
+ * Moves page into cl_page_state::CPS_FREEING state (this is the only place
+ * where transition to this state happens).
+ *
+ * Eliminates all venues through which new references to the page can be
+ * obtained:
+ *
+ * - removes page from the radix trees,
+ *
+ * - breaks linkage from VM page to cl_page.
+ *
+ * Once page reaches cl_page_state::CPS_FREEING, all remaining references will
+ * drain after some time, at which point page will be recycled.
+ *
+ * \pre pg == cl_page_top(pg)
+ * \pre VM page is locked
+ * \post pg->cp_state == CPS_FREEING
+ *
+ * \see cl_page_operations::cpo_delete()
+ */
+void cl_page_delete(const struct lu_env *env, struct cl_page *pg)
+{
+	PINVRNT(env, pg, cl_page_invariant(pg));
+	cl_page_delete0(env, pg, 1);
+}
+EXPORT_SYMBOL(cl_page_delete);
+
+/**
+ * Unmaps page from user virtual memory.
+ *
+ * Calls cl_page_operations::cpo_unmap() through all layers top-to-bottom. The
+ * layer responsible for VM interaction has to unmap page from user space
+ * virtual memory.
+ *
+ * \see cl_page_operations::cpo_unmap()
+ */
+int cl_page_unmap(const struct lu_env *env,
+		  struct cl_io *io, struct cl_page *pg)
+{
+	PINVRNT(env, pg, cl_page_is_owned(pg, io));
+	PINVRNT(env, pg, cl_page_invariant(pg));
+
+	return cl_page_invoke(env, io, pg, CL_PAGE_OP(cpo_unmap));
+}
+EXPORT_SYMBOL(cl_page_unmap);
+
+/**
+ * Marks page up-to-date.
+ * + * Call cl_page_operations::cpo_export() through all layers top-to-bottom. The + * layer responsible for VM interaction has to mark/clear page as up-to-date + * by the \a uptodate argument. + * + * \see cl_page_operations::cpo_export() + */ +void cl_page_export(const struct lu_env *env, struct cl_page *pg, int uptodate) +{ + PINVRNT(env, pg, cl_page_invariant(pg)); + CL_PAGE_INVOID(env, pg, CL_PAGE_OP(cpo_export), + (const struct lu_env *, + const struct cl_page_slice *, int), uptodate); +} +EXPORT_SYMBOL(cl_page_export); + +/** + * Returns true, iff \a pg is VM locked in a suitable sense by the calling + * thread. + */ +int cl_page_is_vmlocked(const struct lu_env *env, const struct cl_page *pg) +{ + int result; + const struct cl_page_slice *slice; + + pg = cl_page_top_trusted((struct cl_page *)pg); + slice = container_of(pg->cp_layers.next, + const struct cl_page_slice, cpl_linkage); + PASSERT(env, pg, slice->cpl_ops->cpo_is_vmlocked != NULL); + /* + * Call ->cpo_is_vmlocked() directly instead of going through + * CL_PAGE_INVOKE(), because cl_page_is_vmlocked() is used by + * cl_page_invariant(). + */ + result = slice->cpl_ops->cpo_is_vmlocked(env, slice); + PASSERT(env, pg, result == -EBUSY || result == -ENODATA); + return result == -EBUSY; +} +EXPORT_SYMBOL(cl_page_is_vmlocked); + +static enum cl_page_state cl_req_type_state(enum cl_req_type crt) +{ + return crt == CRT_WRITE ? CPS_PAGEOUT : CPS_PAGEIN; +} + +static void cl_page_io_start(const struct lu_env *env, + struct cl_page *pg, enum cl_req_type crt) +{ + /* + * Page is queued for IO, change its state. + */ + cl_page_owner_clear(pg); + cl_page_state_set(env, pg, cl_req_type_state(crt)); +} + +/** + * Prepares page for immediate transfer. cl_page_operations::cpo_prep() is + * called top-to-bottom. Every layer either agrees to submit this page (by + * returning 0), or requests to omit this page (by returning -EALREADY). Layer + * handling interactions with the VM also has to inform VM that page is under + * transfer now. + */ +int cl_page_prep(const struct lu_env *env, struct cl_io *io, + struct cl_page *pg, enum cl_req_type crt) +{ + int result; + + PINVRNT(env, pg, cl_page_is_owned(pg, io)); + PINVRNT(env, pg, cl_page_invariant(pg)); + PINVRNT(env, pg, crt < CRT_NR); + + /* + * XXX this has to be called bottom-to-top, so that llite can set up + * PG_writeback without risking other layers deciding to skip this + * page. + */ + if (crt >= CRT_NR) + return -EINVAL; + result = cl_page_invoke(env, io, pg, CL_PAGE_OP(io[crt].cpo_prep)); + if (result == 0) + cl_page_io_start(env, pg, crt); + + KLASSERT(ergo(crt == CRT_WRITE && pg->cp_type == CPT_CACHEABLE, + equi(result == 0, + PageWriteback(cl_page_vmpage(env, pg))))); + CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, result); + return result; +} +EXPORT_SYMBOL(cl_page_prep); + +/** + * Notify layers about transfer completion. + * + * Invoked by transfer sub-system (which is a part of osc) to notify layers + * that a transfer, of which this page is a part of has completed. + * + * Completion call-backs are executed in the bottom-up order, so that + * uppermost layer (llite), responsible for the VFS/VM interaction runs last + * and can release locks safely. 
+ * + * \pre pg->cp_state == CPS_PAGEIN || pg->cp_state == CPS_PAGEOUT + * \post pg->cp_state == CPS_CACHED + * + * \see cl_page_operations::cpo_completion() + */ +void cl_page_completion(const struct lu_env *env, + struct cl_page *pg, enum cl_req_type crt, int ioret) +{ + struct cl_sync_io *anchor = pg->cp_sync_io; + + PASSERT(env, pg, crt < CRT_NR); + /* cl_page::cp_req already cleared by the caller (osc_completion()) */ + PASSERT(env, pg, pg->cp_req == NULL); + PASSERT(env, pg, pg->cp_state == cl_req_type_state(crt)); + + CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, ioret); + if (crt == CRT_READ && ioret == 0) { + PASSERT(env, pg, !(pg->cp_flags & CPF_READ_COMPLETED)); + pg->cp_flags |= CPF_READ_COMPLETED; + } + + cl_page_state_set(env, pg, CPS_CACHED); + if (crt >= CRT_NR) + return; + CL_PAGE_INVOID_REVERSE(env, pg, CL_PAGE_OP(io[crt].cpo_completion), + (const struct lu_env *, + const struct cl_page_slice *, int), ioret); + if (anchor) { + LASSERT(cl_page_is_vmlocked(env, pg)); + LASSERT(pg->cp_sync_io == anchor); + pg->cp_sync_io = NULL; + } + /* + * As page->cp_obj is pinned by a reference from page->cp_req, it is + * safe to call cl_page_put() without risking object destruction in a + * non-blocking context. + */ + cl_page_put(env, pg); + + if (anchor) + cl_sync_io_note(anchor, ioret); +} +EXPORT_SYMBOL(cl_page_completion); + +/** + * Notify layers that transfer formation engine decided to yank this page from + * the cache and to make it a part of a transfer. + * + * \pre pg->cp_state == CPS_CACHED + * \post pg->cp_state == CPS_PAGEIN || pg->cp_state == CPS_PAGEOUT + * + * \see cl_page_operations::cpo_make_ready() + */ +int cl_page_make_ready(const struct lu_env *env, struct cl_page *pg, + enum cl_req_type crt) +{ + int result; + + PINVRNT(env, pg, crt < CRT_NR); + + if (crt >= CRT_NR) + return -EINVAL; + result = CL_PAGE_INVOKE(env, pg, CL_PAGE_OP(io[crt].cpo_make_ready), + (const struct lu_env *, + const struct cl_page_slice *)); + if (result == 0) { + PASSERT(env, pg, pg->cp_state == CPS_CACHED); + cl_page_io_start(env, pg, crt); + } + CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, result); + return result; +} +EXPORT_SYMBOL(cl_page_make_ready); + +/** + * Notify layers that high level io decided to place this page into a cache + * for future transfer. + * + * The layer implementing transfer engine (osc) has to register this page in + * its queues. + * + * \pre cl_page_is_owned(pg, io) + * \post cl_page_is_owned(pg, io) + * + * \see cl_page_operations::cpo_cache_add() + */ +int cl_page_cache_add(const struct lu_env *env, struct cl_io *io, + struct cl_page *pg, enum cl_req_type crt) +{ + const struct cl_page_slice *scan; + int result = 0; + + PINVRNT(env, pg, crt < CRT_NR); + PINVRNT(env, pg, cl_page_is_owned(pg, io)); + PINVRNT(env, pg, cl_page_invariant(pg)); + + if (crt >= CRT_NR) + return -EINVAL; + + list_for_each_entry(scan, &pg->cp_layers, cpl_linkage) { + if (scan->cpl_ops->io[crt].cpo_cache_add == NULL) + continue; + + result = scan->cpl_ops->io[crt].cpo_cache_add(env, scan, io); + if (result != 0) + break; + } + CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, result); + return result; +} +EXPORT_SYMBOL(cl_page_cache_add); + +/** + * Called if a pge is being written back by kernel's intention. 
+ * + * \pre cl_page_is_owned(pg, io) + * \post ergo(result == 0, pg->cp_state == CPS_PAGEOUT) + * + * \see cl_page_operations::cpo_flush() + */ +int cl_page_flush(const struct lu_env *env, struct cl_io *io, + struct cl_page *pg) +{ + int result; + + PINVRNT(env, pg, cl_page_is_owned(pg, io)); + PINVRNT(env, pg, cl_page_invariant(pg)); + + result = cl_page_invoke(env, io, pg, CL_PAGE_OP(cpo_flush)); + + CL_PAGE_HEADER(D_TRACE, env, pg, "%d\n", result); + return result; +} +EXPORT_SYMBOL(cl_page_flush); + +/** + * Checks whether page is protected by any extent lock is at least required + * mode. + * + * \return the same as in cl_page_operations::cpo_is_under_lock() method. + * \see cl_page_operations::cpo_is_under_lock() + */ +int cl_page_is_under_lock(const struct lu_env *env, struct cl_io *io, + struct cl_page *page) +{ + int rc; + + PINVRNT(env, page, cl_page_invariant(page)); + + rc = CL_PAGE_INVOKE(env, page, CL_PAGE_OP(cpo_is_under_lock), + (const struct lu_env *, + const struct cl_page_slice *, struct cl_io *), + io); + PASSERT(env, page, rc != 0); + return rc; +} +EXPORT_SYMBOL(cl_page_is_under_lock); + +static int page_prune_cb(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, void *cbdata) +{ + cl_page_own(env, io, page); + cl_page_unmap(env, io, page); + cl_page_discard(env, io, page); + cl_page_disown(env, io, page); + return CLP_GANG_OKAY; +} + +/** + * Purges all cached pages belonging to the object \a obj. + */ +int cl_pages_prune(const struct lu_env *env, struct cl_object *clobj) +{ + struct cl_thread_info *info; + struct cl_object *obj = cl_object_top(clobj); + struct cl_io *io; + int result; + + info = cl_env_info(env); + io = &info->clt_io; + + /* + * initialize the io. This is ugly since we never do IO in this + * function, we just make cl_page_list functions happy. -jay + */ + io->ci_obj = obj; + io->ci_ignore_layout = 1; + result = cl_io_init(env, io, CIT_MISC, obj); + if (result != 0) { + cl_io_fini(env, io); + return io->ci_result; + } + + do { + result = cl_page_gang_lookup(env, obj, io, 0, CL_PAGE_EOF, + page_prune_cb, NULL); + if (result == CLP_GANG_RESCHED) + cond_resched(); + } while (result != CLP_GANG_OKAY); + + cl_io_fini(env, io); + return result; +} +EXPORT_SYMBOL(cl_pages_prune); + +/** + * Tells transfer engine that only part of a page is to be transmitted. + * + * \see cl_page_operations::cpo_clip() + */ +void cl_page_clip(const struct lu_env *env, struct cl_page *pg, + int from, int to) +{ + PINVRNT(env, pg, cl_page_invariant(pg)); + + CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", from, to); + CL_PAGE_INVOID(env, pg, CL_PAGE_OP(cpo_clip), + (const struct lu_env *, + const struct cl_page_slice *,int, int), + from, to); +} +EXPORT_SYMBOL(cl_page_clip); + +/** + * Prints human readable representation of \a pg to the \a f. + */ +void cl_page_header_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct cl_page *pg) +{ + (*printer)(env, cookie, + "page@%p[%d %p:%lu ^%p_%p %d %d %d %p %p %#x]\n", + pg, atomic_read(&pg->cp_ref), pg->cp_obj, + pg->cp_index, pg->cp_parent, pg->cp_child, + pg->cp_state, pg->cp_error, pg->cp_type, + pg->cp_owner, pg->cp_req, pg->cp_flags); +} +EXPORT_SYMBOL(cl_page_header_print); + +/** + * Prints human readable representation of \a pg to the \a f. 
+ */ +void cl_page_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct cl_page *pg) +{ + struct cl_page *scan; + + for (scan = cl_page_top((struct cl_page *)pg); + scan != NULL; scan = scan->cp_child) + cl_page_header_print(env, cookie, printer, scan); + CL_PAGE_INVOKE(env, (struct cl_page *)pg, CL_PAGE_OP(cpo_print), + (const struct lu_env *env, + const struct cl_page_slice *slice, + void *cookie, lu_printer_t p), cookie, printer); + (*printer)(env, cookie, "end page@%p\n", pg); +} +EXPORT_SYMBOL(cl_page_print); + +/** + * Cancel a page which is still in a transfer. + */ +int cl_page_cancel(const struct lu_env *env, struct cl_page *page) +{ + return CL_PAGE_INVOKE(env, page, CL_PAGE_OP(cpo_cancel), + (const struct lu_env *, + const struct cl_page_slice *)); +} +EXPORT_SYMBOL(cl_page_cancel); + +/** + * Converts a byte offset within object \a obj into a page index. + */ +loff_t cl_offset(const struct cl_object *obj, pgoff_t idx) +{ + /* + * XXX for now. + */ + return (loff_t)idx << PAGE_CACHE_SHIFT; +} +EXPORT_SYMBOL(cl_offset); + +/** + * Converts a page index into a byte offset within object \a obj. + */ +pgoff_t cl_index(const struct cl_object *obj, loff_t offset) +{ + /* + * XXX for now. + */ + return offset >> PAGE_CACHE_SHIFT; +} +EXPORT_SYMBOL(cl_index); + +int cl_page_size(const struct cl_object *obj) +{ + return 1 << PAGE_CACHE_SHIFT; +} +EXPORT_SYMBOL(cl_page_size); + +/** + * Adds page slice to the compound page. + * + * This is called by cl_object_operations::coo_page_init() methods to add a + * per-layer state to the page. New state is added at the end of + * cl_page::cp_layers list, that is, it is at the bottom of the stack. + * + * \see cl_lock_slice_add(), cl_req_slice_add(), cl_io_slice_add() + */ +void cl_page_slice_add(struct cl_page *page, struct cl_page_slice *slice, + struct cl_object *obj, + const struct cl_page_operations *ops) +{ + list_add_tail(&slice->cpl_linkage, &page->cp_layers); + slice->cpl_obj = obj; + slice->cpl_ops = ops; + slice->cpl_page = page; +} +EXPORT_SYMBOL(cl_page_slice_add); + +int cl_page_init(void) +{ + return 0; +} + +void cl_page_fini(void) +{ +} diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/class_obd.c b/kernel/drivers/staging/lustre/lustre/obdclass/class_obd.c new file mode 100644 index 000000000..d4b74b670 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/class_obd.c @@ -0,0 +1,704 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. 
+ * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_CLASS +# include + +#include "../include/obd_support.h" +#include "../include/obd_class.h" +#include "../../include/linux/lnet/lnetctl.h" +#include "../include/lustre_debug.h" +#include "../include/lprocfs_status.h" +#include "../include/lustre/lustre_build_version.h" +#include +#include "../include/cl_object.h" +#include "llog_internal.h" + + +struct obd_device *obd_devs[MAX_OBD_DEVICES]; +EXPORT_SYMBOL(obd_devs); +struct list_head obd_types; +DEFINE_RWLOCK(obd_dev_lock); + +__u64 obd_max_pages = 0; +EXPORT_SYMBOL(obd_max_pages); +__u64 obd_max_alloc = 0; +EXPORT_SYMBOL(obd_max_alloc); +__u64 obd_alloc; +EXPORT_SYMBOL(obd_alloc); +__u64 obd_pages; +EXPORT_SYMBOL(obd_pages); +static DEFINE_SPINLOCK(obd_updatemax_lock); + +/* The following are visible and mutable through /proc/sys/lustre/. */ +unsigned int obd_alloc_fail_rate = 0; +EXPORT_SYMBOL(obd_alloc_fail_rate); +unsigned int obd_debug_peer_on_timeout; +EXPORT_SYMBOL(obd_debug_peer_on_timeout); +unsigned int obd_dump_on_timeout; +EXPORT_SYMBOL(obd_dump_on_timeout); +unsigned int obd_dump_on_eviction; +EXPORT_SYMBOL(obd_dump_on_eviction); +unsigned int obd_max_dirty_pages = 256; +EXPORT_SYMBOL(obd_max_dirty_pages); +atomic_t obd_dirty_pages; +EXPORT_SYMBOL(obd_dirty_pages); +unsigned int obd_timeout = OBD_TIMEOUT_DEFAULT; /* seconds */ +EXPORT_SYMBOL(obd_timeout); +unsigned int ldlm_timeout = LDLM_TIMEOUT_DEFAULT; /* seconds */ +EXPORT_SYMBOL(ldlm_timeout); +unsigned int obd_timeout_set; +EXPORT_SYMBOL(obd_timeout_set); +unsigned int ldlm_timeout_set; +EXPORT_SYMBOL(ldlm_timeout_set); +/* Adaptive timeout defs here instead of ptlrpc module for /proc/sys/ access */ +unsigned int at_min = 0; +EXPORT_SYMBOL(at_min); +unsigned int at_max = 600; +EXPORT_SYMBOL(at_max); +unsigned int at_history = 600; +EXPORT_SYMBOL(at_history); +int at_early_margin = 5; +EXPORT_SYMBOL(at_early_margin); +int at_extra = 30; +EXPORT_SYMBOL(at_extra); + +atomic_t obd_dirty_transit_pages; +EXPORT_SYMBOL(obd_dirty_transit_pages); + +char obd_jobid_var[JOBSTATS_JOBID_VAR_MAX_LEN + 1] = JOBSTATS_DISABLE; +EXPORT_SYMBOL(obd_jobid_var); + +char obd_jobid_node[JOBSTATS_JOBID_SIZE + 1]; + +/* Get jobid of current process from stored variable or calculate + * it from pid and user_id. + * + * Historically this was also done by reading the environment variable + * stored in between the "env_start" & "env_end" of task struct. + * This is now deprecated. + */ +int lustre_get_jobid(char *jobid) +{ + memset(jobid, 0, JOBSTATS_JOBID_SIZE); + /* Jobstats isn't enabled */ + if (strcmp(obd_jobid_var, JOBSTATS_DISABLE) == 0) + return 0; + + /* Use process name + fsuid as jobid */ + if (strcmp(obd_jobid_var, JOBSTATS_PROCNAME_UID) == 0) { + snprintf(jobid, JOBSTATS_JOBID_SIZE, "%s.%u", + current_comm(), + from_kuid(&init_user_ns, current_fsuid())); + return 0; + } + + /* Whole node dedicated to single job */ + if (strcmp(obd_jobid_var, JOBSTATS_NODELOCAL) == 0) { + strcpy(jobid, obd_jobid_node); + return 0; + } + + return -ENOENT; +} +EXPORT_SYMBOL(lustre_get_jobid); + +int obd_alloc_fail(const void *ptr, const char *name, const char *type, + size_t size, const char *file, int line) +{ + if (ptr == NULL || + (cfs_rand() & OBD_ALLOC_FAIL_MASK) < obd_alloc_fail_rate) { + CERROR("%s%salloc of %s (%llu bytes) failed at %s:%d\n", + ptr ? 
"force " :"", type, name, (__u64)size, file, + line); + CERROR("%llu total bytes and %llu total pages (%llu bytes) allocated by Lustre, %d total bytes by LNET\n", + obd_memory_sum(), + obd_pages_sum() << PAGE_CACHE_SHIFT, + obd_pages_sum(), + atomic_read(&libcfs_kmemory)); + return 1; + } + return 0; +} +EXPORT_SYMBOL(obd_alloc_fail); + +static inline void obd_data2conn(struct lustre_handle *conn, + struct obd_ioctl_data *data) +{ + memset(conn, 0, sizeof(*conn)); + conn->cookie = data->ioc_cookie; +} + +static inline void obd_conn2data(struct obd_ioctl_data *data, + struct lustre_handle *conn) +{ + data->ioc_cookie = conn->cookie; +} + +int class_resolve_dev_name(__u32 len, const char *name) +{ + int rc; + int dev; + + if (!len || !name) { + CERROR("No name passed,!\n"); + rc = -EINVAL; + goto out; + } + if (name[len - 1] != 0) { + CERROR("Name not nul terminated!\n"); + rc = -EINVAL; + goto out; + } + + CDEBUG(D_IOCTL, "device name %s\n", name); + dev = class_name2dev(name); + if (dev == -1) { + CDEBUG(D_IOCTL, "No device for name %s!\n", name); + rc = -EINVAL; + goto out; + } + + CDEBUG(D_IOCTL, "device name %s, dev %d\n", name, dev); + rc = dev; + +out: + return rc; +} + +int class_handle_ioctl(unsigned int cmd, unsigned long arg) +{ + char *buf = NULL; + struct obd_ioctl_data *data; + struct libcfs_debug_ioctl_data *debug_data; + struct obd_device *obd = NULL; + int err = 0, len = 0; + + /* only for debugging */ + if (cmd == LIBCFS_IOC_DEBUG_MASK) { + debug_data = (struct libcfs_debug_ioctl_data *)arg; + libcfs_subsystem_debug = debug_data->subs; + libcfs_debug = debug_data->debug; + return 0; + } + + CDEBUG(D_IOCTL, "cmd = %x\n", cmd); + if (obd_ioctl_getdata(&buf, &len, (void *)arg)) { + CERROR("OBD ioctl: data error\n"); + return -EINVAL; + } + data = (struct obd_ioctl_data *)buf; + + switch (cmd) { + case OBD_IOC_PROCESS_CFG: { + struct lustre_cfg *lcfg; + + if (!data->ioc_plen1 || !data->ioc_pbuf1) { + CERROR("No config buffer passed!\n"); + err = -EINVAL; + goto out; + } + OBD_ALLOC(lcfg, data->ioc_plen1); + if (lcfg == NULL) { + err = -ENOMEM; + goto out; + } + err = copy_from_user(lcfg, data->ioc_pbuf1, + data->ioc_plen1); + if (!err) + err = lustre_cfg_sanity_check(lcfg, data->ioc_plen1); + if (!err) + err = class_process_config(lcfg); + + OBD_FREE(lcfg, data->ioc_plen1); + goto out; + } + + case OBD_GET_VERSION: + if (!data->ioc_inlbuf1) { + CERROR("No buffer passed in ioctl\n"); + err = -EINVAL; + goto out; + } + + if (strlen(BUILD_VERSION) + 1 > data->ioc_inllen1) { + CERROR("ioctl buffer too small to hold version\n"); + err = -EINVAL; + goto out; + } + + memcpy(data->ioc_bulk, BUILD_VERSION, + strlen(BUILD_VERSION) + 1); + + err = obd_ioctl_popdata((void *)arg, data, len); + if (err) + err = -EFAULT; + goto out; + + case OBD_IOC_NAME2DEV: { + /* Resolve a device name. This does not change the + * currently selected device. + */ + int dev; + + dev = class_resolve_dev_name(data->ioc_inllen1, + data->ioc_inlbuf1); + data->ioc_dev = dev; + if (dev < 0) { + err = -EINVAL; + goto out; + } + + err = obd_ioctl_popdata((void *)arg, data, sizeof(*data)); + if (err) + err = -EFAULT; + goto out; + } + + case OBD_IOC_UUID2DEV: { + /* Resolve a device uuid. This does not change the + * currently selected device. 
+ */ + int dev; + struct obd_uuid uuid; + + if (!data->ioc_inllen1 || !data->ioc_inlbuf1) { + CERROR("No UUID passed!\n"); + err = -EINVAL; + goto out; + } + if (data->ioc_inlbuf1[data->ioc_inllen1 - 1] != 0) { + CERROR("UUID not NUL terminated!\n"); + err = -EINVAL; + goto out; + } + + CDEBUG(D_IOCTL, "device name %s\n", data->ioc_inlbuf1); + obd_str2uuid(&uuid, data->ioc_inlbuf1); + dev = class_uuid2dev(&uuid); + data->ioc_dev = dev; + if (dev == -1) { + CDEBUG(D_IOCTL, "No device for UUID %s!\n", + data->ioc_inlbuf1); + err = -EINVAL; + goto out; + } + + CDEBUG(D_IOCTL, "device name %s, dev %d\n", data->ioc_inlbuf1, + dev); + err = obd_ioctl_popdata((void *)arg, data, sizeof(*data)); + if (err) + err = -EFAULT; + goto out; + } + + case OBD_IOC_CLOSE_UUID: { + CDEBUG(D_IOCTL, "closing all connections to uuid %s (NOOP)\n", + data->ioc_inlbuf1); + err = 0; + goto out; + } + + case OBD_IOC_GETDEVICE: { + int index = data->ioc_count; + char *status, *str; + + if (!data->ioc_inlbuf1) { + CERROR("No buffer passed in ioctl\n"); + err = -EINVAL; + goto out; + } + if (data->ioc_inllen1 < 128) { + CERROR("ioctl buffer too small to hold version\n"); + err = -EINVAL; + goto out; + } + + obd = class_num2obd(index); + if (!obd) { + err = -ENOENT; + goto out; + } + + if (obd->obd_stopping) + status = "ST"; + else if (obd->obd_set_up) + status = "UP"; + else if (obd->obd_attached) + status = "AT"; + else + status = "--"; + str = (char *)data->ioc_bulk; + snprintf(str, len - sizeof(*data), "%3d %s %s %s %s %d", + (int)index, status, obd->obd_type->typ_name, + obd->obd_name, obd->obd_uuid.uuid, + atomic_read(&obd->obd_refcount)); + err = obd_ioctl_popdata((void *)arg, data, len); + + err = 0; + goto out; + } + + } + + if (data->ioc_dev == OBD_DEV_BY_DEVNAME) { + if (data->ioc_inllen4 <= 0 || data->ioc_inlbuf4 == NULL) { + err = -EINVAL; + goto out; + } + if (strnlen(data->ioc_inlbuf4, MAX_OBD_NAME) >= MAX_OBD_NAME) { + err = -EINVAL; + goto out; + } + obd = class_name2obd(data->ioc_inlbuf4); + } else if (data->ioc_dev < class_devno_max()) { + obd = class_num2obd(data->ioc_dev); + } else { + CERROR("OBD ioctl: No device\n"); + err = -EINVAL; + goto out; + } + + if (obd == NULL) { + CERROR("OBD ioctl : No Device %d\n", data->ioc_dev); + err = -EINVAL; + goto out; + } + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + + if (!obd->obd_set_up || obd->obd_stopping) { + CERROR("OBD ioctl: device not setup %d\n", data->ioc_dev); + err = -EINVAL; + goto out; + } + + switch (cmd) { + case OBD_IOC_NO_TRANSNO: { + if (!obd->obd_attached) { + CERROR("Device %d not attached\n", obd->obd_minor); + err = -ENODEV; + goto out; + } + CDEBUG(D_HA, "%s: disabling committed-transno notification\n", + obd->obd_name); + obd->obd_no_transno = 1; + err = 0; + goto out; + } + + default: { + err = obd_iocontrol(cmd, obd->obd_self_export, len, data, NULL); + if (err) + goto out; + + err = obd_ioctl_popdata((void *)arg, data, len); + if (err) + err = -EFAULT; + goto out; + } + } + + out: + if (buf) + obd_ioctl_freedata(buf, len); + return err; +} /* class_handle_ioctl */ + +#define OBD_INIT_CHECK +int obd_init_checks(void) +{ + __u64 u64val, div64val; + char buf[64]; + int len, ret = 0; + + CDEBUG(D_INFO, "LPU64=%s, LPD64=%s, LPX64=%s\n", "%llu", "%lld", "%#llx"); + + CDEBUG(D_INFO, "OBD_OBJECT_EOF = %#llx\n", (__u64)OBD_OBJECT_EOF); + + u64val = OBD_OBJECT_EOF; + CDEBUG(D_INFO, "u64val OBD_OBJECT_EOF = %#llx\n", u64val); + if (u64val != OBD_OBJECT_EOF) { + CERROR("__u64 %#llx(%d) != 0xffffffffffffffff\n", + u64val, 
(int)sizeof(u64val)); + ret = -EINVAL; + } + len = snprintf(buf, sizeof(buf), "%#llx", u64val); + if (len != 18) { + CWARN("LPX64 wrong length! strlen(%s)=%d != 18\n", buf, len); + ret = -EINVAL; + } + + div64val = OBD_OBJECT_EOF; + CDEBUG(D_INFO, "u64val OBD_OBJECT_EOF = %#llx\n", u64val); + if (u64val != OBD_OBJECT_EOF) { + CERROR("__u64 %#llx(%d) != 0xffffffffffffffff\n", + u64val, (int)sizeof(u64val)); + ret = -EOVERFLOW; + } + if (u64val >> 8 != OBD_OBJECT_EOF >> 8) { + CERROR("__u64 %#llx(%d) != 0xffffffffffffffff\n", + u64val, (int)sizeof(u64val)); + return -EOVERFLOW; + } + if (do_div(div64val, 256) != (u64val & 255)) { + CERROR("do_div(%#llx,256) != %llu\n", u64val, u64val &255); + return -EOVERFLOW; + } + if (u64val >> 8 != div64val) { + CERROR("do_div(%#llx,256) %llu != %llu\n", + u64val, div64val, u64val >> 8); + return -EOVERFLOW; + } + len = snprintf(buf, sizeof(buf), "%#llx", u64val); + if (len != 18) { + CWARN("LPX64 wrong length! strlen(%s)=%d != 18\n", buf, len); + ret = -EINVAL; + } + len = snprintf(buf, sizeof(buf), "%llu", u64val); + if (len != 20) { + CWARN("LPU64 wrong length! strlen(%s)=%d != 20\n", buf, len); + ret = -EINVAL; + } + len = snprintf(buf, sizeof(buf), "%lld", u64val); + if (len != 2) { + CWARN("LPD64 wrong length! strlen(%s)=%d != 2\n", buf, len); + ret = -EINVAL; + } + if ((u64val & ~CFS_PAGE_MASK) >= PAGE_CACHE_SIZE) { + CWARN("mask failed: u64val %llu >= %llu\n", u64val, + (__u64)PAGE_CACHE_SIZE); + ret = -EINVAL; + } + + return ret; +} + +#if defined (CONFIG_PROC_FS) +extern int class_procfs_init(void); +extern int class_procfs_clean(void); +#else +static inline int class_procfs_init(void) +{ return 0; } +static inline int class_procfs_clean(void) +{ return 0; } +#endif + +static int __init init_obdclass(void) +{ + int i, err; + int lustre_register_fs(void); + + for (i = CAPA_SITE_CLIENT; i < CAPA_SITE_MAX; i++) + INIT_LIST_HEAD(&capa_list[i]); + + LCONSOLE_INFO("Lustre: Build Version: "BUILD_VERSION"\n"); + + spin_lock_init(&obd_types_lock); + obd_zombie_impexp_init(); + + if (IS_ENABLED(CONFIG_PROC_FS)) { + obd_memory = lprocfs_alloc_stats(OBD_STATS_NUM, + LPROCFS_STATS_FLAG_NONE | + LPROCFS_STATS_FLAG_IRQ_SAFE); + + if (obd_memory == NULL) { + CERROR("kmalloc of 'obd_memory' failed\n"); + return -ENOMEM; + } + + lprocfs_counter_init(obd_memory, OBD_MEMORY_STAT, + LPROCFS_CNTR_AVGMINMAX, + "memused", "bytes"); + lprocfs_counter_init(obd_memory, OBD_MEMORY_PAGES_STAT, + LPROCFS_CNTR_AVGMINMAX, + "pagesused", "pages"); + } + + err = obd_init_checks(); + if (err == -EOVERFLOW) + return err; + + class_init_uuidlist(); + err = class_handle_init(); + if (err) + return err; + + INIT_LIST_HEAD(&obd_types); + + err = misc_register(&obd_psdev); + if (err) { + CERROR("cannot register %d err %d\n", OBD_DEV_MINOR, err); + return err; + } + + /* This struct is already zeroed for us (static global) */ + for (i = 0; i < class_devno_max(); i++) + obd_devs[i] = NULL; + + /* Default the dirty page cache cap to 1/2 of system memory. + * For clients with less memory, a larger fraction is needed + * for other purposes (mostly for BGL). 
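+ *
+ * With 4 KiB pages (PAGE_CACHE_SHIFT == 12) the threshold below is
+ * 512 << 8 = 131072 pages, i.e. 512 MiB of RAM: hosts at or below
+ * that get a cap of totalram_pages / 4, larger hosts get
+ * totalram_pages / 2.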
*/ + if (totalram_pages <= 512 << (20 - PAGE_CACHE_SHIFT)) + obd_max_dirty_pages = totalram_pages / 4; + else + obd_max_dirty_pages = totalram_pages / 2; + + err = obd_init_caches(); + if (err) + return err; + + obd_sysctl_init(); + + err = class_procfs_init(); + if (err) + return err; + + err = lu_global_init(); + if (err) + return err; + + err = cl_global_init(); + if (err != 0) + return err; + + + err = llog_info_init(); + if (err) + return err; + + err = lustre_register_fs(); + + return err; +} + +void obd_update_maxusage(void) +{ + __u64 max1, max2; + + max1 = obd_pages_sum(); + max2 = obd_memory_sum(); + + spin_lock(&obd_updatemax_lock); + if (max1 > obd_max_pages) + obd_max_pages = max1; + if (max2 > obd_max_alloc) + obd_max_alloc = max2; + spin_unlock(&obd_updatemax_lock); +} +EXPORT_SYMBOL(obd_update_maxusage); + +#if defined (CONFIG_PROC_FS) +__u64 obd_memory_max(void) +{ + __u64 ret; + + spin_lock(&obd_updatemax_lock); + ret = obd_max_alloc; + spin_unlock(&obd_updatemax_lock); + + return ret; +} +EXPORT_SYMBOL(obd_memory_max); + +__u64 obd_pages_max(void) +{ + __u64 ret; + + spin_lock(&obd_updatemax_lock); + ret = obd_max_pages; + spin_unlock(&obd_updatemax_lock); + + return ret; +} +EXPORT_SYMBOL(obd_pages_max); +#endif + +/* liblustre doesn't call cleanup_obdclass, apparently. we carry on in this + * ifdef to the end of the file to cover module and versioning goo.*/ +static void cleanup_obdclass(void) +{ + int i; + int lustre_unregister_fs(void); + __u64 memory_leaked, pages_leaked; + __u64 memory_max, pages_max; + + lustre_unregister_fs(); + + misc_deregister(&obd_psdev); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + if (obd && obd->obd_set_up && + OBT(obd) && OBP(obd, detach)) { + /* XXX should this call generic detach otherwise? */ + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + OBP(obd, detach)(obd); + } + } + llog_info_fini(); + cl_global_fini(); + lu_global_fini(); + + obd_cleanup_caches(); + obd_sysctl_clean(); + + class_procfs_clean(); + + class_handle_cleanup(); + class_exit_uuidlist(); + obd_zombie_impexp_stop(); + + memory_leaked = obd_memory_sum(); + pages_leaked = obd_pages_sum(); + + memory_max = obd_memory_max(); + pages_max = obd_pages_max(); + + lprocfs_free_stats(&obd_memory); + CDEBUG((memory_leaked) ? D_ERROR : D_INFO, + "obd_memory max: %llu, leaked: %llu\n", + memory_max, memory_leaked); + CDEBUG((pages_leaked) ? D_ERROR : D_INFO, + "obd_memory_pages max: %llu, leaked: %llu\n", + pages_max, pages_leaked); +} + +MODULE_AUTHOR("Sun Microsystems, Inc. "); +MODULE_DESCRIPTION("Lustre Class Driver Build Version: " BUILD_VERSION); +MODULE_LICENSE("GPL"); +MODULE_VERSION(LUSTRE_VERSION_STRING); + +module_init(init_obdclass); +module_exit(cleanup_obdclass); diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/debug.c b/kernel/drivers/staging/lustre/lustre/obdclass/debug.c new file mode 100644 index 000000000..9c934e6d2 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/debug.c @@ -0,0 +1,109 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/debug.c + * + * Helper routines for dumping data structs for debugging. + */ + +#define DEBUG_SUBSYSTEM D_OTHER + +#include + +#include "../include/obd_support.h" +#include "../include/lustre_debug.h" +#include "../include/lustre_net.h" + +void dump_lniobuf(struct niobuf_local *nb) +{ + CDEBUG(D_RPCTRACE, + "niobuf_local: file_offset=%lld, len=%d, page=%p, rc=%d\n", + nb->lnb_file_offset, nb->len, nb->page, nb->rc); + CDEBUG(D_RPCTRACE, "nb->page: index = %ld\n", + nb->page ? page_index(nb->page) : -1); +} +EXPORT_SYMBOL(dump_lniobuf); + +#define LPDS sizeof(__u64) +int block_debug_setup(void *addr, int len, __u64 off, __u64 id) +{ + LASSERT(addr); + + put_unaligned_le64(off, addr); + put_unaligned_le64(id, addr+LPDS); + addr += len - LPDS - LPDS; + put_unaligned_le64(off, addr); + put_unaligned_le64(id, addr+LPDS); + + return 0; +} +EXPORT_SYMBOL(block_debug_setup); + +int block_debug_check(char *who, void *addr, int end, __u64 off, __u64 id) +{ + __u64 ne_off; + int err = 0; + + LASSERT(addr); + + ne_off = le64_to_cpu (off); + id = le64_to_cpu (id); + if (memcmp(addr, (char *)&ne_off, LPDS)) { + CDEBUG(D_ERROR, "%s: id %#llx offset %llu off: %#llx != %#llx\n", + who, id, off, *(__u64 *)addr, ne_off); + err = -EINVAL; + } + if (memcmp(addr + LPDS, (char *)&id, LPDS)) { + CDEBUG(D_ERROR, "%s: id %#llx offset %llu id: %#llx != %#llx\n", + who, id, off, *(__u64 *)(addr + LPDS), id); + err = -EINVAL; + } + + addr += end - LPDS - LPDS; + if (memcmp(addr, (char *)&ne_off, LPDS)) { + CDEBUG(D_ERROR, "%s: id %#llx offset %llu end off: %#llx != %#llx\n", + who, id, off, *(__u64 *)addr, ne_off); + err = -EINVAL; + } + if (memcmp(addr + LPDS, (char *)&id, LPDS)) { + CDEBUG(D_ERROR, "%s: id %#llx offset %llu end id: %#llx != %#llx\n", + who, id, off, *(__u64 *)(addr + LPDS), id); + err = -EINVAL; + } + + return err; +} +EXPORT_SYMBOL(block_debug_check); +#undef LPDS diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/dt_object.c b/kernel/drivers/staging/lustre/lustre/obdclass/dt_object.c new file mode 100644 index 000000000..b1eee0a6d --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/dt_object.c @@ -0,0 +1,1059 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/dt_object.c + * + * Dt Object. + * Generic functions from dt_object.h + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include "../include/obd.h" +#include "../include/dt_object.h" +#include +/* fid_be_to_cpu() */ +#include "../include/lustre_fid.h" + +#include "../include/lustre_quota.h" + +/* context key constructor/destructor: dt_global_key_init, dt_global_key_fini */ +LU_KEY_INIT(dt_global, struct dt_thread_info); +LU_KEY_FINI(dt_global, struct dt_thread_info); + +struct lu_context_key dt_key = { + .lct_tags = LCT_MD_THREAD | LCT_DT_THREAD | LCT_MG_THREAD | LCT_LOCAL, + .lct_init = dt_global_key_init, + .lct_fini = dt_global_key_fini +}; +EXPORT_SYMBOL(dt_key); + +/* no lock is necessary to protect the list, because call-backs + * are added during system startup. Please refer to "struct dt_device". + */ +void dt_txn_callback_add(struct dt_device *dev, struct dt_txn_callback *cb) +{ + list_add(&cb->dtc_linkage, &dev->dd_txn_callbacks); +} +EXPORT_SYMBOL(dt_txn_callback_add); + +void dt_txn_callback_del(struct dt_device *dev, struct dt_txn_callback *cb) +{ + list_del_init(&cb->dtc_linkage); +} +EXPORT_SYMBOL(dt_txn_callback_del); + +int dt_txn_hook_start(const struct lu_env *env, + struct dt_device *dev, struct thandle *th) +{ + int rc = 0; + struct dt_txn_callback *cb; + + if (th->th_local) + return 0; + + list_for_each_entry(cb, &dev->dd_txn_callbacks, dtc_linkage) { + if (cb->dtc_txn_start == NULL || + !(cb->dtc_tag & env->le_ctx.lc_tags)) + continue; + rc = cb->dtc_txn_start(env, th, cb->dtc_cookie); + if (rc < 0) + break; + } + return rc; +} +EXPORT_SYMBOL(dt_txn_hook_start); + +int dt_txn_hook_stop(const struct lu_env *env, struct thandle *txn) +{ + struct dt_device *dev = txn->th_dev; + struct dt_txn_callback *cb; + int rc = 0; + + if (txn->th_local) + return 0; + + list_for_each_entry(cb, &dev->dd_txn_callbacks, dtc_linkage) { + if (cb->dtc_txn_stop == NULL || + !(cb->dtc_tag & env->le_ctx.lc_tags)) + continue; + rc = cb->dtc_txn_stop(env, txn, cb->dtc_cookie); + if (rc < 0) + break; + } + return rc; +} +EXPORT_SYMBOL(dt_txn_hook_stop); + +void dt_txn_hook_commit(struct thandle *txn) +{ + struct dt_txn_callback *cb; + + if (txn->th_local) + return; + + list_for_each_entry(cb, &txn->th_dev->dd_txn_callbacks, + dtc_linkage) { + if (cb->dtc_txn_commit) + cb->dtc_txn_commit(txn, cb->dtc_cookie); + } +} +EXPORT_SYMBOL(dt_txn_hook_commit); + +int dt_device_init(struct dt_device *dev, struct lu_device_type *t) +{ + + INIT_LIST_HEAD(&dev->dd_txn_callbacks); + return lu_device_init(&dev->dd_lu_dev, t); +} +EXPORT_SYMBOL(dt_device_init); + +void dt_device_fini(struct dt_device *dev) +{ + lu_device_fini(&dev->dd_lu_dev); +} 
+EXPORT_SYMBOL(dt_device_fini); + +int dt_object_init(struct dt_object *obj, + struct lu_object_header *h, struct lu_device *d) + +{ + return lu_object_init(&obj->do_lu, h, d); +} +EXPORT_SYMBOL(dt_object_init); + +void dt_object_fini(struct dt_object *obj) +{ + lu_object_fini(&obj->do_lu); +} +EXPORT_SYMBOL(dt_object_fini); + +int dt_try_as_dir(const struct lu_env *env, struct dt_object *obj) +{ + if (obj->do_index_ops == NULL) + obj->do_ops->do_index_try(env, obj, &dt_directory_features); + return obj->do_index_ops != NULL; +} +EXPORT_SYMBOL(dt_try_as_dir); + +enum dt_format_type dt_mode_to_dft(__u32 mode) +{ + enum dt_format_type result; + + switch (mode & S_IFMT) { + case S_IFDIR: + result = DFT_DIR; + break; + case S_IFREG: + result = DFT_REGULAR; + break; + case S_IFLNK: + result = DFT_SYM; + break; + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + result = DFT_NODE; + break; + default: + LBUG(); + break; + } + return result; +} +EXPORT_SYMBOL(dt_mode_to_dft); + +/** + * lookup fid for object named \a name in directory \a dir. + */ + +int dt_lookup_dir(const struct lu_env *env, struct dt_object *dir, + const char *name, struct lu_fid *fid) +{ + if (dt_try_as_dir(env, dir)) + return dt_lookup(env, dir, (struct dt_rec *)fid, + (const struct dt_key *)name, BYPASS_CAPA); + return -ENOTDIR; +} +EXPORT_SYMBOL(dt_lookup_dir); + +/* this differs from dt_locate by top_dev as parameter + * but not one from lu_site */ +struct dt_object *dt_locate_at(const struct lu_env *env, + struct dt_device *dev, const struct lu_fid *fid, + struct lu_device *top_dev) +{ + struct lu_object *lo, *n; + + lo = lu_object_find_at(env, top_dev, fid, NULL); + if (IS_ERR(lo)) + return (void *)lo; + + LASSERT(lo != NULL); + + list_for_each_entry(n, &lo->lo_header->loh_layers, lo_linkage) { + if (n->lo_dev == &dev->dd_lu_dev) + return container_of0(n, struct dt_object, do_lu); + } + return ERR_PTR(-ENOENT); +} +EXPORT_SYMBOL(dt_locate_at); + +/** + * find a object named \a entry in given \a dfh->dfh_o directory. + */ +static int dt_find_entry(const struct lu_env *env, const char *entry, void *data) +{ + struct dt_find_hint *dfh = data; + struct dt_device *dt = dfh->dfh_dt; + struct lu_fid *fid = dfh->dfh_fid; + struct dt_object *obj = dfh->dfh_o; + int result; + + result = dt_lookup_dir(env, obj, entry, fid); + lu_object_put(env, &obj->do_lu); + if (result == 0) { + obj = dt_locate(env, dt, fid); + if (IS_ERR(obj)) + result = PTR_ERR(obj); + } + dfh->dfh_o = obj; + return result; +} + +/** + * Abstract function which parses path name. This function feeds + * path component to \a entry_func. 
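+ *
+ * For example, parsing the (hypothetical) path "a/b/c" calls
+ * entry_func(env, "a", data), entry_func(env, "b", data) and
+ * entry_func(env, "c", data) in order; empty components produced by
+ * doubled or trailing '/' are skipped, and parsing stops at the first
+ * non-zero return from \a entry_func.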
+ */ +int dt_path_parser(const struct lu_env *env, + char *path, dt_entry_func_t entry_func, + void *data) +{ + char *e; + int rc = 0; + + while (1) { + e = strsep(&path, "/"); + if (e == NULL) + break; + + if (e[0] == 0) { + if (!path || path[0] == '\0') + break; + continue; + } + rc = entry_func(env, e, data); + if (rc) + break; + } + + return rc; +} + +struct dt_object * +dt_store_resolve(const struct lu_env *env, struct dt_device *dt, + const char *path, struct lu_fid *fid) +{ + struct dt_thread_info *info = dt_info(env); + struct dt_find_hint *dfh = &info->dti_dfh; + struct dt_object *obj; + char *local = info->dti_buf; + int result; + + + dfh->dfh_dt = dt; + dfh->dfh_fid = fid; + + strncpy(local, path, DT_MAX_PATH); + local[DT_MAX_PATH - 1] = '\0'; + + result = dt->dd_ops->dt_root_get(env, dt, fid); + if (result == 0) { + obj = dt_locate(env, dt, fid); + if (!IS_ERR(obj)) { + dfh->dfh_o = obj; + result = dt_path_parser(env, local, dt_find_entry, dfh); + if (result != 0) + obj = ERR_PTR(result); + else + obj = dfh->dfh_o; + } + } else { + obj = ERR_PTR(result); + } + return obj; +} +EXPORT_SYMBOL(dt_store_resolve); + +static struct dt_object *dt_reg_open(const struct lu_env *env, + struct dt_device *dt, + struct dt_object *p, + const char *name, + struct lu_fid *fid) +{ + struct dt_object *o; + int result; + + result = dt_lookup_dir(env, p, name, fid); + if (result == 0){ + o = dt_locate(env, dt, fid); + } else + o = ERR_PTR(result); + + return o; +} + +/** + * Open dt object named \a filename from \a dirname directory. + * \param dt dt device + * \param fid on success, object fid is stored in *fid + */ +struct dt_object *dt_store_open(const struct lu_env *env, + struct dt_device *dt, + const char *dirname, + const char *filename, + struct lu_fid *fid) +{ + struct dt_object *file; + struct dt_object *dir; + + dir = dt_store_resolve(env, dt, dirname, fid); + if (!IS_ERR(dir)) { + file = dt_reg_open(env, dt, dir, + filename, fid); + lu_object_put(env, &dir->do_lu); + } else { + file = dir; + } + return file; +} +EXPORT_SYMBOL(dt_store_open); + +struct dt_object *dt_find_or_create(const struct lu_env *env, + struct dt_device *dt, + const struct lu_fid *fid, + struct dt_object_format *dof, + struct lu_attr *at) +{ + struct dt_object *dto; + struct thandle *th; + int rc; + + dto = dt_locate(env, dt, fid); + if (IS_ERR(dto)) + return dto; + + LASSERT(dto != NULL); + if (dt_object_exists(dto)) + return dto; + + th = dt_trans_create(env, dt); + if (IS_ERR(th)) { + rc = PTR_ERR(th); + goto out; + } + + rc = dt_declare_create(env, dto, at, NULL, dof, th); + if (rc) + goto trans_stop; + + rc = dt_trans_start_local(env, dt, th); + if (rc) + goto trans_stop; + + dt_write_lock(env, dto, 0); + if (dt_object_exists(dto)) { + rc = 0; + goto unlock; + } + + CDEBUG(D_OTHER, "create new object "DFID"\n", PFID(fid)); + + rc = dt_create(env, dto, at, NULL, dof, th); + if (rc) + goto unlock; + LASSERT(dt_object_exists(dto)); +unlock: + dt_write_unlock(env, dto); +trans_stop: + dt_trans_stop(env, dt, th); +out: + if (rc) { + lu_object_put(env, &dto->do_lu); + return ERR_PTR(rc); + } + return dto; +} +EXPORT_SYMBOL(dt_find_or_create); + +/* dt class init function. */ +int dt_global_init(void) +{ + LU_CONTEXT_KEY_INIT(&dt_key); + return lu_context_key_register(&dt_key); +} + +void dt_global_fini(void) +{ + lu_context_key_degister(&dt_key); +} + +/** + * Generic read helper. May return an error for partial reads. 
+ * + * \param env lustre environment + * \param dt object to be read + * \param buf lu_buf to be filled, with buffer pointer and length + * \param pos position to start reading, updated as data is read + * + * \retval real size of data read + * \retval -ve errno on failure + */ +int dt_read(const struct lu_env *env, struct dt_object *dt, + struct lu_buf *buf, loff_t *pos) +{ + LASSERTF(dt != NULL, "dt is NULL when we want to read record\n"); + return dt->do_body_ops->dbo_read(env, dt, buf, pos, BYPASS_CAPA); +} +EXPORT_SYMBOL(dt_read); + +/** + * Read structures of fixed size from storage. Unlike dt_read(), using + * dt_record_read() will return an error for partial reads. + * + * \param env lustre environment + * \param dt object to be read + * \param buf lu_buf to be filled, with buffer pointer and length + * \param pos position to start reading, updated as data is read + * + * \retval 0 on successfully reading full buffer + * \retval -EFAULT on short read + * \retval -ve errno on failure + */ +int dt_record_read(const struct lu_env *env, struct dt_object *dt, + struct lu_buf *buf, loff_t *pos) +{ + int rc; + + LASSERTF(dt != NULL, "dt is NULL when we want to read record\n"); + + rc = dt->do_body_ops->dbo_read(env, dt, buf, pos, BYPASS_CAPA); + + if (rc == buf->lb_len) + rc = 0; + else if (rc >= 0) + rc = -EFAULT; + return rc; +} +EXPORT_SYMBOL(dt_record_read); + +int dt_record_write(const struct lu_env *env, struct dt_object *dt, + const struct lu_buf *buf, loff_t *pos, struct thandle *th) +{ + int rc; + + LASSERTF(dt != NULL, "dt is NULL when we want to write record\n"); + LASSERT(th != NULL); + LASSERT(dt->do_body_ops); + LASSERT(dt->do_body_ops->dbo_write); + rc = dt->do_body_ops->dbo_write(env, dt, buf, pos, th, BYPASS_CAPA, 1); + if (rc == buf->lb_len) + rc = 0; + else if (rc >= 0) + rc = -EFAULT; + return rc; +} +EXPORT_SYMBOL(dt_record_write); + +int dt_declare_version_set(const struct lu_env *env, struct dt_object *o, + struct thandle *th) +{ + struct lu_buf vbuf; + char *xname = XATTR_NAME_VERSION; + + LASSERT(o); + vbuf.lb_buf = NULL; + vbuf.lb_len = sizeof(dt_obj_version_t); + return dt_declare_xattr_set(env, o, &vbuf, xname, 0, th); + +} +EXPORT_SYMBOL(dt_declare_version_set); + +void dt_version_set(const struct lu_env *env, struct dt_object *o, + dt_obj_version_t version, struct thandle *th) +{ + struct lu_buf vbuf; + char *xname = XATTR_NAME_VERSION; + int rc; + + LASSERT(o); + vbuf.lb_buf = &version; + vbuf.lb_len = sizeof(version); + + rc = dt_xattr_set(env, o, &vbuf, xname, 0, th, BYPASS_CAPA); + if (rc < 0) + CDEBUG(D_INODE, "Can't set version, rc %d\n", rc); + return; +} +EXPORT_SYMBOL(dt_version_set); + +dt_obj_version_t dt_version_get(const struct lu_env *env, struct dt_object *o) +{ + struct lu_buf vbuf; + char *xname = XATTR_NAME_VERSION; + dt_obj_version_t version; + int rc; + + LASSERT(o); + vbuf.lb_buf = &version; + vbuf.lb_len = sizeof(version); + rc = dt_xattr_get(env, o, &vbuf, xname, BYPASS_CAPA); + if (rc != sizeof(version)) { + CDEBUG(D_INODE, "Can't get version, rc %d\n", rc); + version = 0; + } + return version; +} +EXPORT_SYMBOL(dt_version_get); + +/* list of all supported index types */ + +/* directories */ +const struct dt_index_features dt_directory_features; +EXPORT_SYMBOL(dt_directory_features); + +/* scrub iterator */ +const struct dt_index_features dt_otable_features; +EXPORT_SYMBOL(dt_otable_features); + +/* lfsck */ +const struct dt_index_features dt_lfsck_features = { + .dif_flags = DT_IND_UPDATE, + .dif_keysize_min = sizeof(struct lu_fid), 
+ .dif_keysize_max = sizeof(struct lu_fid), + .dif_recsize_min = sizeof(__u8), + .dif_recsize_max = sizeof(__u8), + .dif_ptrsize = 4 +}; +EXPORT_SYMBOL(dt_lfsck_features); + +/* accounting indexes */ +const struct dt_index_features dt_acct_features = { + .dif_flags = DT_IND_UPDATE, + .dif_keysize_min = sizeof(__u64), /* 64-bit uid/gid */ + .dif_keysize_max = sizeof(__u64), /* 64-bit uid/gid */ + .dif_recsize_min = sizeof(struct lquota_acct_rec), /* 16 bytes */ + .dif_recsize_max = sizeof(struct lquota_acct_rec), /* 16 bytes */ + .dif_ptrsize = 4 +}; +EXPORT_SYMBOL(dt_acct_features); + +/* global quota files */ +const struct dt_index_features dt_quota_glb_features = { + .dif_flags = DT_IND_UPDATE, + /* a different key would have to be used for per-directory quota */ + .dif_keysize_min = sizeof(__u64), /* 64-bit uid/gid */ + .dif_keysize_max = sizeof(__u64), /* 64-bit uid/gid */ + .dif_recsize_min = sizeof(struct lquota_glb_rec), /* 32 bytes */ + .dif_recsize_max = sizeof(struct lquota_glb_rec), /* 32 bytes */ + .dif_ptrsize = 4 +}; +EXPORT_SYMBOL(dt_quota_glb_features); + +/* slave quota files */ +const struct dt_index_features dt_quota_slv_features = { + .dif_flags = DT_IND_UPDATE, + /* a different key would have to be used for per-directory quota */ + .dif_keysize_min = sizeof(__u64), /* 64-bit uid/gid */ + .dif_keysize_max = sizeof(__u64), /* 64-bit uid/gid */ + .dif_recsize_min = sizeof(struct lquota_slv_rec), /* 8 bytes */ + .dif_recsize_max = sizeof(struct lquota_slv_rec), /* 8 bytes */ + .dif_ptrsize = 4 +}; +EXPORT_SYMBOL(dt_quota_slv_features); + +/* helper function returning what dt_index_features structure should be used + * based on the FID sequence. This is used by OBD_IDX_READ RPC */ +static inline const struct dt_index_features *dt_index_feat_select(__u64 seq, + __u32 mode) +{ + if (seq == FID_SEQ_QUOTA_GLB) { + /* global quota index */ + if (!S_ISREG(mode)) + /* global quota index should be a regular file */ + return ERR_PTR(-ENOENT); + return &dt_quota_glb_features; + } else if (seq == FID_SEQ_QUOTA) { + /* quota slave index */ + if (!S_ISREG(mode)) + /* slave index should be a regular file */ + return ERR_PTR(-ENOENT); + return &dt_quota_slv_features; + } else if (seq >= FID_SEQ_NORMAL) { + /* object is part of the namespace, verify that it is a + * directory */ + if (!S_ISDIR(mode)) + /* sorry, we can only deal with directory */ + return ERR_PTR(-ENOTDIR); + return &dt_directory_features; + } + + return ERR_PTR(-EOPNOTSUPP); +} + +/* + * Fill a lu_idxpage with key/record pairs read for transfer via OBD_IDX_READ + * RPC + * + * \param env - is the environment passed by the caller + * \param lp - is a pointer to the lu_page to fill + * \param nob - is the maximum number of bytes that should be copied + * \param iops - is the index operation vector associated with the index object + * \param it - is a pointer to the current iterator + * \param attr - is the index attribute to pass to iops->rec() + * \param arg - is a pointer to the idx_info structure + */ +static int dt_index_page_build(const struct lu_env *env, union lu_page *lp, + int nob, const struct dt_it_ops *iops, + struct dt_it *it, __u32 attr, void *arg) +{ + struct idx_info *ii = (struct idx_info *)arg; + struct lu_idxpage *lip = &lp->lp_idx; + char *entry; + int rc, size; + + /* no support for variable key & record size for now */ + LASSERT((ii->ii_flags & II_FL_VARKEY) == 0); + LASSERT((ii->ii_flags & II_FL_VARREC) == 0); + + /* initialize the header of the new container */ + memset(lip, 0, LIP_HDR_SIZE); + 
lip->lip_magic = LIP_MAGIC; + nob -= LIP_HDR_SIZE; + + /* compute size needed to store a key/record pair */ + size = ii->ii_recsize + ii->ii_keysize; + if ((ii->ii_flags & II_FL_NOHASH) == 0) + /* add hash if the client wants it */ + size += sizeof(__u64); + + entry = lip->lip_entries; + do { + char *tmp_entry = entry; + struct dt_key *key; + __u64 hash; + + /* fetch 64-bit hash value */ + hash = iops->store(env, it); + ii->ii_hash_end = hash; + + if (OBD_FAIL_CHECK(OBD_FAIL_OBD_IDX_READ_BREAK)) { + if (lip->lip_nr != 0) { + rc = 0; + goto out; + } + } + + if (nob < size) { + if (lip->lip_nr == 0) + rc = -EINVAL; + else + rc = 0; + goto out; + } + + if ((ii->ii_flags & II_FL_NOHASH) == 0) { + /* client wants to the 64-bit hash value associated with + * each record */ + memcpy(tmp_entry, &hash, sizeof(hash)); + tmp_entry += sizeof(hash); + } + + /* then the key value */ + LASSERT(iops->key_size(env, it) == ii->ii_keysize); + key = iops->key(env, it); + memcpy(tmp_entry, key, ii->ii_keysize); + tmp_entry += ii->ii_keysize; + + /* and finally the record */ + rc = iops->rec(env, it, (struct dt_rec *)tmp_entry, attr); + if (rc != -ESTALE) { + if (rc != 0) + goto out; + + /* hash/key/record successfully copied! */ + lip->lip_nr++; + if (unlikely(lip->lip_nr == 1 && ii->ii_count == 0)) + ii->ii_hash_start = hash; + entry = tmp_entry + ii->ii_recsize; + nob -= size; + } + + /* move on to the next record */ + do { + rc = iops->next(env, it); + } while (rc == -ESTALE); + + } while (rc == 0); + + goto out; +out: + if (rc >= 0 && lip->lip_nr > 0) + /* one more container */ + ii->ii_count++; + if (rc > 0) + /* no more entries */ + ii->ii_hash_end = II_END_OFF; + return rc; +} + +/* + * Walk index and fill lu_page containers with key/record pairs + * + * \param env - is the environment passed by the caller + * \param obj - is the index object to parse + * \param rdpg - is the lu_rdpg descriptor associated with the transfer + * \param filler - is the callback function responsible for filling a lu_page + * with key/record pairs in the format wanted by the caller + * \param arg - is an opaq argument passed to the filler function + * + * \retval sum (in bytes) of all filled lu_pages + * \retval -ve errno on failure + */ +int dt_index_walk(const struct lu_env *env, struct dt_object *obj, + const struct lu_rdpg *rdpg, dt_index_page_build_t filler, + void *arg) +{ + struct dt_it *it; + const struct dt_it_ops *iops; + unsigned int pageidx, nob, nlupgs = 0; + int rc; + + LASSERT(rdpg->rp_pages != NULL); + LASSERT(obj->do_index_ops != NULL); + + nob = rdpg->rp_count; + if (nob <= 0) + return -EFAULT; + + /* Iterate through index and fill containers from @rdpg */ + iops = &obj->do_index_ops->dio_it; + LASSERT(iops != NULL); + it = iops->init(env, obj, rdpg->rp_attrs, BYPASS_CAPA); + if (IS_ERR(it)) + return PTR_ERR(it); + + rc = iops->load(env, it, rdpg->rp_hash); + if (rc == 0) { + /* + * Iterator didn't find record with exactly the key requested. + * + * It is currently either + * + * - positioned above record with key less than + * requested---skip it. + * - or not positioned at all (is in IAM_IT_SKEWED + * state)---position it on the next item. + */ + rc = iops->next(env, it); + } else if (rc > 0) { + rc = 0; + } + + /* Fill containers one after the other. There might be multiple + * containers per physical page. + * + * At this point and across for-loop: + * rc == 0 -> ok, proceed. + * rc > 0 -> end of index. + * rc < 0 -> error. 
*/ + for (pageidx = 0; rc == 0 && nob > 0; pageidx++) { + union lu_page *lp; + int i; + + LASSERT(pageidx < rdpg->rp_npages); + lp = kmap(rdpg->rp_pages[pageidx]); + + /* fill lu pages */ + for (i = 0; i < LU_PAGE_COUNT; i++, lp++, nob -= LU_PAGE_SIZE) { + rc = filler(env, lp, min_t(int, nob, LU_PAGE_SIZE), + iops, it, rdpg->rp_attrs, arg); + if (rc < 0) + break; + /* one more lu_page */ + nlupgs++; + if (rc > 0) + /* end of index */ + break; + } + kunmap(rdpg->rp_pages[i]); + } + + iops->put(env, it); + iops->fini(env, it); + + if (rc >= 0) + rc = min_t(unsigned int, nlupgs * LU_PAGE_SIZE, rdpg->rp_count); + + return rc; +} +EXPORT_SYMBOL(dt_index_walk); + +/** + * Walk key/record pairs of an index and copy them into 4KB containers to be + * transferred over the network. This is the common handler for OBD_IDX_READ + * RPC processing. + * + * \param env - is the environment passed by the caller + * \param dev - is the dt_device storing the index + * \param ii - is the idx_info structure packed by the client in the + * OBD_IDX_READ request + * \param rdpg - is the lu_rdpg descriptor + * + * \retval on success, return sum (in bytes) of all filled containers + * \retval appropriate error otherwise. + */ +int dt_index_read(const struct lu_env *env, struct dt_device *dev, + struct idx_info *ii, const struct lu_rdpg *rdpg) +{ + const struct dt_index_features *feat; + struct dt_object *obj; + int rc; + + /* rp_count shouldn't be null and should be a multiple of the container + * size */ + if (rdpg->rp_count <= 0 && (rdpg->rp_count & (LU_PAGE_SIZE - 1)) != 0) + return -EFAULT; + + if (fid_seq(&ii->ii_fid) >= FID_SEQ_NORMAL) + /* we don't support directory transfer via OBD_IDX_READ for the + * time being */ + return -EOPNOTSUPP; + + if (!fid_is_quota(&ii->ii_fid)) + /* block access to all local files except quota files */ + return -EPERM; + + /* lookup index object subject to the transfer */ + obj = dt_locate(env, dev, &ii->ii_fid); + if (IS_ERR(obj)) + return PTR_ERR(obj); + if (dt_object_exists(obj) == 0) { + rc = -ENOENT; + goto out; + } + + /* fetch index features associated with index object */ + feat = dt_index_feat_select(fid_seq(&ii->ii_fid), + lu_object_attr(&obj->do_lu)); + if (IS_ERR(feat)) { + rc = PTR_ERR(feat); + goto out; + } + + /* load index feature if not done already */ + if (obj->do_index_ops == NULL) { + rc = obj->do_ops->do_index_try(env, obj, feat); + if (rc) + goto out; + } + + /* fill ii_flags with supported index features */ + ii->ii_flags &= II_FL_NOHASH; + + ii->ii_keysize = feat->dif_keysize_max; + if ((feat->dif_flags & DT_IND_VARKEY) != 0) { + /* key size is variable */ + ii->ii_flags |= II_FL_VARKEY; + /* we don't support variable key size for the time being */ + rc = -EOPNOTSUPP; + goto out; + } + + ii->ii_recsize = feat->dif_recsize_max; + if ((feat->dif_flags & DT_IND_VARREC) != 0) { + /* record size is variable */ + ii->ii_flags |= II_FL_VARREC; + /* we don't support variable record size for the time being */ + rc = -EOPNOTSUPP; + goto out; + } + + if ((feat->dif_flags & DT_IND_NONUNQ) != 0) + /* key isn't necessarily unique */ + ii->ii_flags |= II_FL_NONUNQ; + + dt_read_lock(env, obj, 0); + /* fetch object version before walking the index */ + ii->ii_version = dt_version_get(env, obj); + + /* walk the index and fill lu_idxpages with key/record pairs */ + rc = dt_index_walk(env, obj, rdpg, dt_index_page_build ,ii); + dt_read_unlock(env, obj); + + if (rc == 0) { + /* index is empty */ + LASSERT(ii->ii_count == 0); + ii->ii_hash_end = II_END_OFF; + } + + goto out; 
+out: + lu_object_put(env, &obj->do_lu); + return rc; +} +EXPORT_SYMBOL(dt_index_read); + +#if defined (CONFIG_PROC_FS) + +int lprocfs_dt_rd_blksize(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct dt_device *dt = data; + struct obd_statfs osfs; + int rc = dt_statfs(NULL, dt, &osfs); + + if (rc == 0) { + *eof = 1; + rc = snprintf(page, count, "%u\n", + (unsigned) osfs.os_bsize); + } + + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_rd_blksize); + +int lprocfs_dt_rd_kbytestotal(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct dt_device *dt = data; + struct obd_statfs osfs; + int rc = dt_statfs(NULL, dt, &osfs); + + if (rc == 0) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_blocks; + + while (blk_size >>= 1) + result <<= 1; + + *eof = 1; + rc = snprintf(page, count, "%llu\n", result); + } + + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_rd_kbytestotal); + +int lprocfs_dt_rd_kbytesfree(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct dt_device *dt = data; + struct obd_statfs osfs; + int rc = dt_statfs(NULL, dt, &osfs); + + if (rc == 0) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_bfree; + + while (blk_size >>= 1) + result <<= 1; + + *eof = 1; + rc = snprintf(page, count, "%llu\n", result); + } + + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_rd_kbytesfree); + +int lprocfs_dt_rd_kbytesavail(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct dt_device *dt = data; + struct obd_statfs osfs; + int rc = dt_statfs(NULL, dt, &osfs); + + if (rc == 0) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_bavail; + + while (blk_size >>= 1) + result <<= 1; + + *eof = 1; + rc = snprintf(page, count, "%llu\n", result); + } + + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_rd_kbytesavail); + +int lprocfs_dt_rd_filestotal(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct dt_device *dt = data; + struct obd_statfs osfs; + int rc = dt_statfs(NULL, dt, &osfs); + + if (rc == 0) { + *eof = 1; + rc = snprintf(page, count, "%llu\n", osfs.os_files); + } + + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_rd_filestotal); + +int lprocfs_dt_rd_filesfree(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct dt_device *dt = data; + struct obd_statfs osfs; + int rc = dt_statfs(NULL, dt, &osfs); + + if (rc == 0) { + *eof = 1; + rc = snprintf(page, count, "%llu\n", osfs.os_ffree); + } + + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_rd_filesfree); + +#endif /* CONFIG_PROC_FS */ diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/genops.c b/kernel/drivers/staging/lustre/lustre/obdclass/genops.c new file mode 100644 index 000000000..66b56784f --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/genops.c @@ -0,0 +1,1833 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/genops.c + * + * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + */ + +#define DEBUG_SUBSYSTEM S_CLASS +#include "../include/obd_class.h" +#include "../include/lprocfs_status.h" + +spinlock_t obd_types_lock; + +struct kmem_cache *obd_device_cachep; +struct kmem_cache *obdo_cachep; +EXPORT_SYMBOL(obdo_cachep); +static struct kmem_cache *import_cachep; + +static struct list_head obd_zombie_imports; +static struct list_head obd_zombie_exports; +static spinlock_t obd_zombie_impexp_lock; +static void obd_zombie_impexp_notify(void); +static void obd_zombie_export_add(struct obd_export *exp); +static void obd_zombie_import_add(struct obd_import *imp); +static void print_export_data(struct obd_export *exp, + const char *status, int locks); + +int (*ptlrpc_put_connection_superhack)(struct ptlrpc_connection *c); +EXPORT_SYMBOL(ptlrpc_put_connection_superhack); + +/* + * support functions: we could use inter-module communication, but this + * is more portable to other OS's + */ +static struct obd_device *obd_device_alloc(void) +{ + struct obd_device *obd; + + OBD_SLAB_ALLOC_PTR_GFP(obd, obd_device_cachep, GFP_NOFS); + if (obd != NULL) { + obd->obd_magic = OBD_DEVICE_MAGIC; + } + return obd; +} + +static void obd_device_free(struct obd_device *obd) +{ + LASSERT(obd != NULL); + LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, "obd %p obd_magic %08x != %08x\n", + obd, obd->obd_magic, OBD_DEVICE_MAGIC); + if (obd->obd_namespace != NULL) { + CERROR("obd %p: namespace %p was not properly cleaned up (obd_force=%d)!\n", + obd, obd->obd_namespace, obd->obd_force); + LBUG(); + } + lu_ref_fini(&obd->obd_reference); + OBD_SLAB_FREE_PTR(obd, obd_device_cachep); +} + +struct obd_type *class_search_type(const char *name) +{ + struct list_head *tmp; + struct obd_type *type; + + spin_lock(&obd_types_lock); + list_for_each(tmp, &obd_types) { + type = list_entry(tmp, struct obd_type, typ_chain); + if (strcmp(type->typ_name, name) == 0) { + spin_unlock(&obd_types_lock); + return type; + } + } + spin_unlock(&obd_types_lock); + return NULL; +} +EXPORT_SYMBOL(class_search_type); + +struct obd_type *class_get_type(const char *name) +{ + struct obd_type *type = class_search_type(name); + + if (!type) { + const char *modname = name; + + if (strcmp(modname, "obdfilter") == 0) + modname = "ofd"; + + if (strcmp(modname, LUSTRE_LWP_NAME) == 0) + modname = LUSTRE_OSP_NAME; + + if (!strncmp(modname, LUSTRE_MDS_NAME, strlen(LUSTRE_MDS_NAME))) + modname = LUSTRE_MDT_NAME; + + if (!request_module("%s", modname)) { + CDEBUG(D_INFO, "Loaded module '%s'\n", modname); + type = class_search_type(name); + } else { + LCONSOLE_ERROR_MSG(0x158, "Can't load module '%s'\n", + modname); + } + } + if (type) { + spin_lock(&type->obd_type_lock); + type->typ_refcnt++; + 
try_module_get(type->typ_dt_ops->o_owner); + spin_unlock(&type->obd_type_lock); + } + return type; +} +EXPORT_SYMBOL(class_get_type); + +void class_put_type(struct obd_type *type) +{ + LASSERT(type); + spin_lock(&type->obd_type_lock); + type->typ_refcnt--; + module_put(type->typ_dt_ops->o_owner); + spin_unlock(&type->obd_type_lock); +} +EXPORT_SYMBOL(class_put_type); + +#define CLASS_MAX_NAME 1024 + +int class_register_type(struct obd_ops *dt_ops, struct md_ops *md_ops, + struct lprocfs_vars *vars, const char *name, + struct lu_device_type *ldt) +{ + struct obd_type *type; + int rc = 0; + + /* sanity check */ + LASSERT(strnlen(name, CLASS_MAX_NAME) < CLASS_MAX_NAME); + + if (class_search_type(name)) { + CDEBUG(D_IOCTL, "Type %s already registered\n", name); + return -EEXIST; + } + + rc = -ENOMEM; + OBD_ALLOC(type, sizeof(*type)); + if (type == NULL) + return rc; + + OBD_ALLOC_PTR(type->typ_dt_ops); + OBD_ALLOC_PTR(type->typ_md_ops); + OBD_ALLOC(type->typ_name, strlen(name) + 1); + + if (type->typ_dt_ops == NULL || + type->typ_md_ops == NULL || + type->typ_name == NULL) + goto failed; + + *(type->typ_dt_ops) = *dt_ops; + /* md_ops is optional */ + if (md_ops) + *(type->typ_md_ops) = *md_ops; + strcpy(type->typ_name, name); + spin_lock_init(&type->obd_type_lock); + + type->typ_procroot = lprocfs_register(type->typ_name, proc_lustre_root, + vars, type); + if (IS_ERR(type->typ_procroot)) { + rc = PTR_ERR(type->typ_procroot); + type->typ_procroot = NULL; + goto failed; + } + + if (ldt != NULL) { + type->typ_lu = ldt; + rc = lu_device_type_init(ldt); + if (rc != 0) + goto failed; + } + + spin_lock(&obd_types_lock); + list_add(&type->typ_chain, &obd_types); + spin_unlock(&obd_types_lock); + + return 0; + + failed: + if (type->typ_name != NULL) + OBD_FREE(type->typ_name, strlen(name) + 1); + if (type->typ_md_ops != NULL) + OBD_FREE_PTR(type->typ_md_ops); + if (type->typ_dt_ops != NULL) + OBD_FREE_PTR(type->typ_dt_ops); + OBD_FREE(type, sizeof(*type)); + return rc; +} +EXPORT_SYMBOL(class_register_type); + +int class_unregister_type(const char *name) +{ + struct obd_type *type = class_search_type(name); + + if (!type) { + CERROR("unknown obd type\n"); + return -EINVAL; + } + + if (type->typ_refcnt) { + CERROR("type %s has refcount (%d)\n", name, type->typ_refcnt); + /* This is a bad situation, let's make the best of it */ + /* Remove ops, but leave the name for debugging */ + OBD_FREE_PTR(type->typ_dt_ops); + OBD_FREE_PTR(type->typ_md_ops); + return -EBUSY; + } + + if (type->typ_procroot) { + lprocfs_remove(&type->typ_procroot); + } + + if (type->typ_lu) + lu_device_type_fini(type->typ_lu); + + spin_lock(&obd_types_lock); + list_del(&type->typ_chain); + spin_unlock(&obd_types_lock); + OBD_FREE(type->typ_name, strlen(name) + 1); + if (type->typ_dt_ops != NULL) + OBD_FREE_PTR(type->typ_dt_ops); + if (type->typ_md_ops != NULL) + OBD_FREE_PTR(type->typ_md_ops); + OBD_FREE(type, sizeof(*type)); + return 0; +} /* class_unregister_type */ +EXPORT_SYMBOL(class_unregister_type); + +/** + * Create a new obd device. + * + * Find an empty slot in ::obd_devs[], create a new obd device in it. + * + * \param[in] type_name obd device type string. + * \param[in] name obd device name. + * + * \retval NULL if create fails, otherwise return the obd device + * pointer created. 
+ */ +struct obd_device *class_newdev(const char *type_name, const char *name) +{ + struct obd_device *result = NULL; + struct obd_device *newdev; + struct obd_type *type = NULL; + int i; + int new_obd_minor = 0; + + if (strlen(name) >= MAX_OBD_NAME) { + CERROR("name/uuid must be < %u bytes long\n", MAX_OBD_NAME); + return ERR_PTR(-EINVAL); + } + + type = class_get_type(type_name); + if (type == NULL){ + CERROR("OBD: unknown type: %s\n", type_name); + return ERR_PTR(-ENODEV); + } + + newdev = obd_device_alloc(); + if (newdev == NULL) { + result = ERR_PTR(-ENOMEM); + goto out_type; + } + + LASSERT(newdev->obd_magic == OBD_DEVICE_MAGIC); + + write_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + + if (obd && (strcmp(name, obd->obd_name) == 0)) { + CERROR("Device %s already exists at %d, won't add\n", + name, i); + if (result) { + LASSERTF(result->obd_magic == OBD_DEVICE_MAGIC, + "%p obd_magic %08x != %08x\n", result, + result->obd_magic, OBD_DEVICE_MAGIC); + LASSERTF(result->obd_minor == new_obd_minor, + "%p obd_minor %d != %d\n", result, + result->obd_minor, new_obd_minor); + + obd_devs[result->obd_minor] = NULL; + result->obd_name[0] = '\0'; + } + result = ERR_PTR(-EEXIST); + break; + } + if (!result && !obd) { + result = newdev; + result->obd_minor = i; + new_obd_minor = i; + result->obd_type = type; + strncpy(result->obd_name, name, + sizeof(result->obd_name) - 1); + obd_devs[i] = result; + } + } + write_unlock(&obd_dev_lock); + + if (result == NULL && i >= class_devno_max()) { + CERROR("all %u OBD devices used, increase MAX_OBD_DEVICES\n", + class_devno_max()); + result = ERR_PTR(-EOVERFLOW); + goto out; + } + + if (IS_ERR(result)) + goto out; + + CDEBUG(D_IOCTL, "Adding new device %s (%p)\n", + result->obd_name, result); + + return result; +out: + obd_device_free(newdev); +out_type: + class_put_type(type); + return result; +} + +void class_release_dev(struct obd_device *obd) +{ + struct obd_type *obd_type = obd->obd_type; + + LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, "%p obd_magic %08x != %08x\n", + obd, obd->obd_magic, OBD_DEVICE_MAGIC); + LASSERTF(obd == obd_devs[obd->obd_minor], "obd %p != obd_devs[%d] %p\n", + obd, obd->obd_minor, obd_devs[obd->obd_minor]); + LASSERT(obd_type != NULL); + + CDEBUG(D_INFO, "Release obd device %s at %d obd_type name =%s\n", + obd->obd_name, obd->obd_minor, obd->obd_type->typ_name); + + write_lock(&obd_dev_lock); + obd_devs[obd->obd_minor] = NULL; + write_unlock(&obd_dev_lock); + obd_device_free(obd); + + class_put_type(obd_type); +} + +int class_name2dev(const char *name) +{ + int i; + + if (!name) + return -1; + + read_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + + if (obd && strcmp(name, obd->obd_name) == 0) { + /* Make sure we finished attaching before we give + out any references */ + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + if (obd->obd_attached) { + read_unlock(&obd_dev_lock); + return i; + } + break; + } + } + read_unlock(&obd_dev_lock); + + return -1; +} +EXPORT_SYMBOL(class_name2dev); + +struct obd_device *class_name2obd(const char *name) +{ + int dev = class_name2dev(name); + + if (dev < 0 || dev > class_devno_max()) + return NULL; + return class_num2obd(dev); +} +EXPORT_SYMBOL(class_name2obd); + +int class_uuid2dev(struct obd_uuid *uuid) +{ + int i; + + read_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + + if (obd && obd_uuid_equals(uuid, 
				&obd->obd_uuid)) {
+			LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+			read_unlock(&obd_dev_lock);
+			return i;
+		}
+	}
+	read_unlock(&obd_dev_lock);
+
+	return -1;
+}
+EXPORT_SYMBOL(class_uuid2dev);
+
+struct obd_device *class_uuid2obd(struct obd_uuid *uuid)
+{
+	int dev = class_uuid2dev(uuid);
+	if (dev < 0)
+		return NULL;
+	return class_num2obd(dev);
+}
+EXPORT_SYMBOL(class_uuid2obd);
+
+/**
+ * Get obd device from ::obd_devs[]
+ *
+ * \param num [in] array index
+ *
+ * \retval NULL if ::obd_devs[\a num] does not contain an obd device,
+ *         otherwise return the obd device there.
+ */
+struct obd_device *class_num2obd(int num)
+{
+	struct obd_device *obd = NULL;
+
+	if (num < class_devno_max()) {
+		obd = obd_devs[num];
+		if (obd == NULL)
+			return NULL;
+
+		LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC,
+			 "%p obd_magic %08x != %08x\n",
+			 obd, obd->obd_magic, OBD_DEVICE_MAGIC);
+		LASSERTF(obd->obd_minor == num,
+			 "%p obd_minor %0d != %0d\n",
+			 obd, obd->obd_minor, num);
+	}
+
+	return obd;
+}
+EXPORT_SYMBOL(class_num2obd);
+
+/**
+ * Get the obd device count. Devices in any state are counted.
+ * \retval obd device count
+ */
+int get_devices_count(void)
+{
+	int index, max_index = class_devno_max(), dev_count = 0;
+
+	read_lock(&obd_dev_lock);
+	for (index = 0; index <= max_index; index++) {
+		struct obd_device *obd = class_num2obd(index);
+		if (obd != NULL)
+			dev_count++;
+	}
+	read_unlock(&obd_dev_lock);
+
+	return dev_count;
+}
+EXPORT_SYMBOL(get_devices_count);
+
+void class_obd_list(void)
+{
+	char *status;
+	int i;
+
+	read_lock(&obd_dev_lock);
+	for (i = 0; i < class_devno_max(); i++) {
+		struct obd_device *obd = class_num2obd(i);
+
+		if (obd == NULL)
+			continue;
+		if (obd->obd_stopping)
+			status = "ST";
+		else if (obd->obd_set_up)
+			status = "UP";
+		else if (obd->obd_attached)
+			status = "AT";
+		else
+			status = "--";
+		LCONSOLE(D_CONFIG, "%3d %s %s %s %s %d\n",
+			 i, status, obd->obd_type->typ_name,
+			 obd->obd_name, obd->obd_uuid.uuid,
+			 atomic_read(&obd->obd_refcount));
+	}
+	read_unlock(&obd_dev_lock);
+	return;
+}
+
+/* Search for a client OBD connected to tgt_uuid. If grp_uuid is
+   specified, then only the client with that uuid is returned,
+   otherwise any client connected to the tgt is returned. */
+struct obd_device *class_find_client_obd(struct obd_uuid *tgt_uuid,
+					 const char *typ_name,
+					 struct obd_uuid *grp_uuid)
+{
+	int i;
+
+	read_lock(&obd_dev_lock);
+	for (i = 0; i < class_devno_max(); i++) {
+		struct obd_device *obd = class_num2obd(i);
+
+		if (obd == NULL)
+			continue;
+		if ((strncmp(obd->obd_type->typ_name, typ_name,
+			     strlen(typ_name)) == 0)) {
+			if (obd_uuid_equals(tgt_uuid,
+					    &obd->u.cli.cl_target_uuid) &&
+			    ((grp_uuid)? obd_uuid_equals(grp_uuid,
+							 &obd->obd_uuid) : 1)) {
+				read_unlock(&obd_dev_lock);
+				return obd;
+			}
+		}
+	}
+	read_unlock(&obd_dev_lock);
+
+	return NULL;
+}
+EXPORT_SYMBOL(class_find_client_obd);
+
+/* Iterate the obd_device list looking for devices that have grp_uuid.
+   Start searching at *next, and if a device is found, the next index
+   to look at is saved in *next. If next is NULL, then the first
+   matching device will always be returned.
*/ +struct obd_device *class_devices_in_group(struct obd_uuid *grp_uuid, int *next) +{ + int i; + + if (next == NULL) + i = 0; + else if (*next >= 0 && *next < class_devno_max()) + i = *next; + else + return NULL; + + read_lock(&obd_dev_lock); + for (; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + + if (obd == NULL) + continue; + if (obd_uuid_equals(grp_uuid, &obd->obd_uuid)) { + if (next != NULL) + *next = i+1; + read_unlock(&obd_dev_lock); + return obd; + } + } + read_unlock(&obd_dev_lock); + + return NULL; +} +EXPORT_SYMBOL(class_devices_in_group); + +/** + * to notify sptlrpc log for \a fsname has changed, let every relevant OBD + * adjust sptlrpc settings accordingly. + */ +int class_notify_sptlrpc_conf(const char *fsname, int namelen) +{ + struct obd_device *obd; + const char *type; + int i, rc = 0, rc2; + + LASSERT(namelen > 0); + + read_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + obd = class_num2obd(i); + + if (obd == NULL || obd->obd_set_up == 0 || obd->obd_stopping) + continue; + + /* only notify mdc, osc, mdt, ost */ + type = obd->obd_type->typ_name; + if (strcmp(type, LUSTRE_MDC_NAME) != 0 && + strcmp(type, LUSTRE_OSC_NAME) != 0 && + strcmp(type, LUSTRE_MDT_NAME) != 0 && + strcmp(type, LUSTRE_OST_NAME) != 0) + continue; + + if (strncmp(obd->obd_name, fsname, namelen)) + continue; + + class_incref(obd, __func__, obd); + read_unlock(&obd_dev_lock); + rc2 = obd_set_info_async(NULL, obd->obd_self_export, + sizeof(KEY_SPTLRPC_CONF), + KEY_SPTLRPC_CONF, 0, NULL, NULL); + rc = rc ? rc : rc2; + class_decref(obd, __func__, obd); + read_lock(&obd_dev_lock); + } + read_unlock(&obd_dev_lock); + return rc; +} +EXPORT_SYMBOL(class_notify_sptlrpc_conf); + +void obd_cleanup_caches(void) +{ + if (obd_device_cachep) { + kmem_cache_destroy(obd_device_cachep); + obd_device_cachep = NULL; + } + if (obdo_cachep) { + kmem_cache_destroy(obdo_cachep); + obdo_cachep = NULL; + } + if (import_cachep) { + kmem_cache_destroy(import_cachep); + import_cachep = NULL; + } + if (capa_cachep) { + kmem_cache_destroy(capa_cachep); + capa_cachep = NULL; + } +} + +int obd_init_caches(void) +{ + LASSERT(obd_device_cachep == NULL); + obd_device_cachep = kmem_cache_create("ll_obd_dev_cache", + sizeof(struct obd_device), + 0, 0, NULL); + if (!obd_device_cachep) + goto out; + + LASSERT(obdo_cachep == NULL); + obdo_cachep = kmem_cache_create("ll_obdo_cache", sizeof(struct obdo), + 0, 0, NULL); + if (!obdo_cachep) + goto out; + + LASSERT(import_cachep == NULL); + import_cachep = kmem_cache_create("ll_import_cache", + sizeof(struct obd_import), + 0, 0, NULL); + if (!import_cachep) + goto out; + + LASSERT(capa_cachep == NULL); + capa_cachep = kmem_cache_create("capa_cache", + sizeof(struct obd_capa), 0, 0, NULL); + if (!capa_cachep) + goto out; + + return 0; + out: + obd_cleanup_caches(); + return -ENOMEM; + +} + +/* map connection to client */ +struct obd_export *class_conn2export(struct lustre_handle *conn) +{ + struct obd_export *export; + + if (!conn) { + CDEBUG(D_CACHE, "looking for null handle\n"); + return NULL; + } + + if (conn->cookie == -1) { /* this means assign a new connection */ + CDEBUG(D_CACHE, "want a new connection\n"); + return NULL; + } + + CDEBUG(D_INFO, "looking for export cookie %#llx\n", conn->cookie); + export = class_handle2object(conn->cookie); + return export; +} +EXPORT_SYMBOL(class_conn2export); + +struct obd_device *class_exp2obd(struct obd_export *exp) +{ + if (exp) + return exp->exp_obd; + return NULL; +} +EXPORT_SYMBOL(class_exp2obd); + 
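For context, the handle-to-device translation that these helpers support can be sketched as follows. This is an illustrative fragment only (the wrapper name example_conn2obd() is not part of the patch); it simply chains class_conn2export(), class_exp2obd() and class_export_put(), which is essentially what class_conn2obd() below does:

/* Illustrative sketch only: resolve a connection handle to its obd_device.
 * class_conn2export() returns the export with a reference held, so the
 * caller must drop it with class_export_put() after reading the device. */
static struct obd_device *example_conn2obd(struct lustre_handle *conn)
{
	struct obd_export *exp = class_conn2export(conn);
	struct obd_device *obd = NULL;

	if (exp) {
		obd = class_exp2obd(exp);	/* returns exp->exp_obd */
		class_export_put(exp);		/* drop the lookup reference */
	}
	return obd;
}

The real class_conn2obd() and class_conn2cliimp() that follow apply the same put-after-use discipline.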
+struct obd_device *class_conn2obd(struct lustre_handle *conn) +{ + struct obd_export *export; + export = class_conn2export(conn); + if (export) { + struct obd_device *obd = export->exp_obd; + class_export_put(export); + return obd; + } + return NULL; +} +EXPORT_SYMBOL(class_conn2obd); + +struct obd_import *class_exp2cliimp(struct obd_export *exp) +{ + struct obd_device *obd = exp->exp_obd; + if (obd == NULL) + return NULL; + return obd->u.cli.cl_import; +} +EXPORT_SYMBOL(class_exp2cliimp); + +struct obd_import *class_conn2cliimp(struct lustre_handle *conn) +{ + struct obd_device *obd = class_conn2obd(conn); + if (obd == NULL) + return NULL; + return obd->u.cli.cl_import; +} +EXPORT_SYMBOL(class_conn2cliimp); + +/* Export management functions */ +static void class_export_destroy(struct obd_export *exp) +{ + struct obd_device *obd = exp->exp_obd; + + LASSERT_ATOMIC_ZERO(&exp->exp_refcount); + LASSERT(obd != NULL); + + CDEBUG(D_IOCTL, "destroying export %p/%s for %s\n", exp, + exp->exp_client_uuid.uuid, obd->obd_name); + + /* "Local" exports (lctl, LOV->{mdc,osc}) have no connection. */ + if (exp->exp_connection) + ptlrpc_put_connection_superhack(exp->exp_connection); + + LASSERT(list_empty(&exp->exp_outstanding_replies)); + LASSERT(list_empty(&exp->exp_uncommitted_replies)); + LASSERT(list_empty(&exp->exp_req_replay_queue)); + LASSERT(list_empty(&exp->exp_hp_rpcs)); + obd_destroy_export(exp); + class_decref(obd, "export", exp); + + OBD_FREE_RCU(exp, sizeof(*exp), &exp->exp_handle); +} + +static void export_handle_addref(void *export) +{ + class_export_get(export); +} + +static struct portals_handle_ops export_handle_ops = { + .hop_addref = export_handle_addref, + .hop_free = NULL, +}; + +struct obd_export *class_export_get(struct obd_export *exp) +{ + atomic_inc(&exp->exp_refcount); + CDEBUG(D_INFO, "GETting export %p : new refcount %d\n", exp, + atomic_read(&exp->exp_refcount)); + return exp; +} +EXPORT_SYMBOL(class_export_get); + +void class_export_put(struct obd_export *exp) +{ + LASSERT(exp != NULL); + LASSERT_ATOMIC_GT_LT(&exp->exp_refcount, 0, LI_POISON); + CDEBUG(D_INFO, "PUTting export %p : new refcount %d\n", exp, + atomic_read(&exp->exp_refcount) - 1); + + if (atomic_dec_and_test(&exp->exp_refcount)) { + LASSERT(!list_empty(&exp->exp_obd_chain)); + CDEBUG(D_IOCTL, "final put %p/%s\n", + exp, exp->exp_client_uuid.uuid); + + /* release nid stat refererence */ + lprocfs_exp_cleanup(exp); + + obd_zombie_export_add(exp); + } +} +EXPORT_SYMBOL(class_export_put); + +/* Creates a new export, adds it to the hash table, and returns a + * pointer to it. The refcount is 2: one for the hash reference, and + * one for the pointer returned by this function. 
*/ +struct obd_export *class_new_export(struct obd_device *obd, + struct obd_uuid *cluuid) +{ + struct obd_export *export; + struct cfs_hash *hash = NULL; + int rc = 0; + + OBD_ALLOC_PTR(export); + if (!export) + return ERR_PTR(-ENOMEM); + + export->exp_conn_cnt = 0; + export->exp_lock_hash = NULL; + export->exp_flock_hash = NULL; + atomic_set(&export->exp_refcount, 2); + atomic_set(&export->exp_rpc_count, 0); + atomic_set(&export->exp_cb_count, 0); + atomic_set(&export->exp_locks_count, 0); +#if LUSTRE_TRACKS_LOCK_EXP_REFS + INIT_LIST_HEAD(&export->exp_locks_list); + spin_lock_init(&export->exp_locks_list_guard); +#endif + atomic_set(&export->exp_replay_count, 0); + export->exp_obd = obd; + INIT_LIST_HEAD(&export->exp_outstanding_replies); + spin_lock_init(&export->exp_uncommitted_replies_lock); + INIT_LIST_HEAD(&export->exp_uncommitted_replies); + INIT_LIST_HEAD(&export->exp_req_replay_queue); + INIT_LIST_HEAD(&export->exp_handle.h_link); + INIT_LIST_HEAD(&export->exp_hp_rpcs); + class_handle_hash(&export->exp_handle, &export_handle_ops); + export->exp_last_request_time = get_seconds(); + spin_lock_init(&export->exp_lock); + spin_lock_init(&export->exp_rpc_lock); + INIT_HLIST_NODE(&export->exp_uuid_hash); + INIT_HLIST_NODE(&export->exp_nid_hash); + spin_lock_init(&export->exp_bl_list_lock); + INIT_LIST_HEAD(&export->exp_bl_list); + + export->exp_sp_peer = LUSTRE_SP_ANY; + export->exp_flvr.sf_rpc = SPTLRPC_FLVR_INVALID; + export->exp_client_uuid = *cluuid; + obd_init_export(export); + + spin_lock(&obd->obd_dev_lock); + /* shouldn't happen, but might race */ + if (obd->obd_stopping) { + rc = -ENODEV; + goto exit_unlock; + } + + hash = cfs_hash_getref(obd->obd_uuid_hash); + if (hash == NULL) { + rc = -ENODEV; + goto exit_unlock; + } + spin_unlock(&obd->obd_dev_lock); + + if (!obd_uuid_equals(cluuid, &obd->obd_uuid)) { + rc = cfs_hash_add_unique(hash, cluuid, &export->exp_uuid_hash); + if (rc != 0) { + LCONSOLE_WARN("%s: denying duplicate export for %s, %d\n", + obd->obd_name, cluuid->uuid, rc); + rc = -EALREADY; + goto exit_err; + } + } + + spin_lock(&obd->obd_dev_lock); + if (obd->obd_stopping) { + cfs_hash_del(hash, cluuid, &export->exp_uuid_hash); + rc = -ENODEV; + goto exit_unlock; + } + + class_incref(obd, "export", export); + list_add(&export->exp_obd_chain, &export->exp_obd->obd_exports); + list_add_tail(&export->exp_obd_chain_timed, + &export->exp_obd->obd_exports_timed); + export->exp_obd->obd_num_exports++; + spin_unlock(&obd->obd_dev_lock); + cfs_hash_putref(hash); + return export; + +exit_unlock: + spin_unlock(&obd->obd_dev_lock); +exit_err: + if (hash) + cfs_hash_putref(hash); + class_handle_unhash(&export->exp_handle); + LASSERT(hlist_unhashed(&export->exp_uuid_hash)); + obd_destroy_export(export); + OBD_FREE_PTR(export); + return ERR_PTR(rc); +} +EXPORT_SYMBOL(class_new_export); + +void class_unlink_export(struct obd_export *exp) +{ + class_handle_unhash(&exp->exp_handle); + + spin_lock(&exp->exp_obd->obd_dev_lock); + /* delete an uuid-export hashitem from hashtables */ + if (!hlist_unhashed(&exp->exp_uuid_hash)) + cfs_hash_del(exp->exp_obd->obd_uuid_hash, + &exp->exp_client_uuid, + &exp->exp_uuid_hash); + + list_move(&exp->exp_obd_chain, &exp->exp_obd->obd_unlinked_exports); + list_del_init(&exp->exp_obd_chain_timed); + exp->exp_obd->obd_num_exports--; + spin_unlock(&exp->exp_obd->obd_dev_lock); + class_export_put(exp); +} +EXPORT_SYMBOL(class_unlink_export); + +/* Import management functions */ +static void class_import_destroy(struct obd_import *imp) +{ + CDEBUG(D_IOCTL, 
"destroying import %p for %s\n", imp, + imp->imp_obd->obd_name); + + LASSERT_ATOMIC_ZERO(&imp->imp_refcount); + + ptlrpc_put_connection_superhack(imp->imp_connection); + + while (!list_empty(&imp->imp_conn_list)) { + struct obd_import_conn *imp_conn; + + imp_conn = list_entry(imp->imp_conn_list.next, + struct obd_import_conn, oic_item); + list_del_init(&imp_conn->oic_item); + ptlrpc_put_connection_superhack(imp_conn->oic_conn); + OBD_FREE(imp_conn, sizeof(*imp_conn)); + } + + LASSERT(imp->imp_sec == NULL); + class_decref(imp->imp_obd, "import", imp); + OBD_FREE_RCU(imp, sizeof(*imp), &imp->imp_handle); +} + +static void import_handle_addref(void *import) +{ + class_import_get(import); +} + +static struct portals_handle_ops import_handle_ops = { + .hop_addref = import_handle_addref, + .hop_free = NULL, +}; + +struct obd_import *class_import_get(struct obd_import *import) +{ + atomic_inc(&import->imp_refcount); + CDEBUG(D_INFO, "import %p refcount=%d obd=%s\n", import, + atomic_read(&import->imp_refcount), + import->imp_obd->obd_name); + return import; +} +EXPORT_SYMBOL(class_import_get); + +void class_import_put(struct obd_import *imp) +{ + LASSERT(list_empty(&imp->imp_zombie_chain)); + LASSERT_ATOMIC_GT_LT(&imp->imp_refcount, 0, LI_POISON); + + CDEBUG(D_INFO, "import %p refcount=%d obd=%s\n", imp, + atomic_read(&imp->imp_refcount) - 1, + imp->imp_obd->obd_name); + + if (atomic_dec_and_test(&imp->imp_refcount)) { + CDEBUG(D_INFO, "final put import %p\n", imp); + obd_zombie_import_add(imp); + } + + /* catch possible import put race */ + LASSERT_ATOMIC_GE_LT(&imp->imp_refcount, 0, LI_POISON); +} +EXPORT_SYMBOL(class_import_put); + +static void init_imp_at(struct imp_at *at) { + int i; + at_init(&at->iat_net_latency, 0, 0); + for (i = 0; i < IMP_AT_MAX_PORTALS; i++) { + /* max service estimates are tracked on the server side, so + don't use the AT history here, just use the last reported + val. (But keep hist for proc histogram, worst_ever) */ + at_init(&at->iat_service_estimate[i], INITIAL_CONNECT_TIMEOUT, + AT_FLG_NOHIST); + } +} + +struct obd_import *class_new_import(struct obd_device *obd) +{ + struct obd_import *imp; + + OBD_ALLOC(imp, sizeof(*imp)); + if (imp == NULL) + return NULL; + + INIT_LIST_HEAD(&imp->imp_pinger_chain); + INIT_LIST_HEAD(&imp->imp_zombie_chain); + INIT_LIST_HEAD(&imp->imp_replay_list); + INIT_LIST_HEAD(&imp->imp_sending_list); + INIT_LIST_HEAD(&imp->imp_delayed_list); + INIT_LIST_HEAD(&imp->imp_committed_list); + imp->imp_replay_cursor = &imp->imp_committed_list; + spin_lock_init(&imp->imp_lock); + imp->imp_last_success_conn = 0; + imp->imp_state = LUSTRE_IMP_NEW; + imp->imp_obd = class_incref(obd, "import", imp); + mutex_init(&imp->imp_sec_mutex); + init_waitqueue_head(&imp->imp_recovery_waitq); + + atomic_set(&imp->imp_refcount, 2); + atomic_set(&imp->imp_unregistering, 0); + atomic_set(&imp->imp_inflight, 0); + atomic_set(&imp->imp_replay_inflight, 0); + atomic_set(&imp->imp_inval_count, 0); + INIT_LIST_HEAD(&imp->imp_conn_list); + INIT_LIST_HEAD(&imp->imp_handle.h_link); + class_handle_hash(&imp->imp_handle, &import_handle_ops); + init_imp_at(&imp->imp_at); + + /* the default magic is V2, will be used in connect RPC, and + * then adjusted according to the flags in request/reply. 
*/ + imp->imp_msg_magic = LUSTRE_MSG_MAGIC_V2; + + return imp; +} +EXPORT_SYMBOL(class_new_import); + +void class_destroy_import(struct obd_import *import) +{ + LASSERT(import != NULL); + LASSERT(import != LP_POISON); + + class_handle_unhash(&import->imp_handle); + + spin_lock(&import->imp_lock); + import->imp_generation++; + spin_unlock(&import->imp_lock); + class_import_put(import); +} +EXPORT_SYMBOL(class_destroy_import); + +#if LUSTRE_TRACKS_LOCK_EXP_REFS + +void __class_export_add_lock_ref(struct obd_export *exp, struct ldlm_lock *lock) +{ + spin_lock(&exp->exp_locks_list_guard); + + LASSERT(lock->l_exp_refs_nr >= 0); + + if (lock->l_exp_refs_target != NULL && + lock->l_exp_refs_target != exp) { + LCONSOLE_WARN("setting export %p for lock %p which already has export %p\n", + exp, lock, lock->l_exp_refs_target); + } + if ((lock->l_exp_refs_nr ++) == 0) { + list_add(&lock->l_exp_refs_link, &exp->exp_locks_list); + lock->l_exp_refs_target = exp; + } + CDEBUG(D_INFO, "lock = %p, export = %p, refs = %u\n", + lock, exp, lock->l_exp_refs_nr); + spin_unlock(&exp->exp_locks_list_guard); +} +EXPORT_SYMBOL(__class_export_add_lock_ref); + +void __class_export_del_lock_ref(struct obd_export *exp, struct ldlm_lock *lock) +{ + spin_lock(&exp->exp_locks_list_guard); + LASSERT(lock->l_exp_refs_nr > 0); + if (lock->l_exp_refs_target != exp) { + LCONSOLE_WARN("lock %p, mismatching export pointers: %p, %p\n", + lock, lock->l_exp_refs_target, exp); + } + if (-- lock->l_exp_refs_nr == 0) { + list_del_init(&lock->l_exp_refs_link); + lock->l_exp_refs_target = NULL; + } + CDEBUG(D_INFO, "lock = %p, export = %p, refs = %u\n", + lock, exp, lock->l_exp_refs_nr); + spin_unlock(&exp->exp_locks_list_guard); +} +EXPORT_SYMBOL(__class_export_del_lock_ref); +#endif + +/* A connection defines an export context in which preallocation can + be managed. This releases the export pointer reference, and returns + the export handle, so the export refcount is 1 when this function + returns. 
 */
+int class_connect(struct lustre_handle *conn, struct obd_device *obd,
+		  struct obd_uuid *cluuid)
+{
+	struct obd_export *export;
+	LASSERT(conn != NULL);
+	LASSERT(obd != NULL);
+	LASSERT(cluuid != NULL);
+
+	export = class_new_export(obd, cluuid);
+	if (IS_ERR(export))
+		return PTR_ERR(export);
+
+	conn->cookie = export->exp_handle.h_cookie;
+	class_export_put(export);
+
+	CDEBUG(D_IOCTL, "connect: client %s, cookie %#llx\n",
+	       cluuid->uuid, conn->cookie);
+	return 0;
+}
+EXPORT_SYMBOL(class_connect);
+
+/* If the export is involved in recovery then clean up related things. */
+static void class_export_recovery_cleanup(struct obd_export *exp)
+{
+	struct obd_device *obd = exp->exp_obd;
+
+	spin_lock(&obd->obd_recovery_task_lock);
+	if (exp->exp_delayed)
+		obd->obd_delayed_clients--;
+	if (obd->obd_recovering) {
+		if (exp->exp_in_recovery) {
+			spin_lock(&exp->exp_lock);
+			exp->exp_in_recovery = 0;
+			spin_unlock(&exp->exp_lock);
+			LASSERT_ATOMIC_POS(&obd->obd_connected_clients);
+			atomic_dec(&obd->obd_connected_clients);
+		}
+
+		/* If called during recovery, update the obd_stale_clients
+		 * counter; lightweight exports are not counted. */
+		if (exp->exp_failed &&
+		    (exp_connect_flags(exp) & OBD_CONNECT_LIGHTWEIGHT) == 0)
+			exp->exp_obd->obd_stale_clients++;
+	}
+	spin_unlock(&obd->obd_recovery_task_lock);
+
+	spin_lock(&exp->exp_lock);
+	/** Cleanup req replay fields */
+	if (exp->exp_req_replay_needed) {
+		exp->exp_req_replay_needed = 0;
+
+		LASSERT(atomic_read(&obd->obd_req_replay_clients));
+		atomic_dec(&obd->obd_req_replay_clients);
+	}
+
+	/** Cleanup lock replay data */
+	if (exp->exp_lock_replay_needed) {
+		exp->exp_lock_replay_needed = 0;
+
+		LASSERT(atomic_read(&obd->obd_lock_replay_clients));
+		atomic_dec(&obd->obd_lock_replay_clients);
+	}
+	spin_unlock(&exp->exp_lock);
+}
+
+/* This function removes 1-3 references from the export:
+ * 1 - for the export pointer passed in,
+ * and, if the disconnect is really needed,
+ * 2 - for the reference removed from the hash table,
+ * 3 - in class_unlink_export().
+ * The export pointer passed to this function can be destroyed. */
+int class_disconnect(struct obd_export *export)
+{
+	int already_disconnected;
+
+	if (export == NULL) {
+		CWARN("attempting to free NULL export %p\n", export);
+		return -EINVAL;
+	}
+
+	spin_lock(&export->exp_lock);
+	already_disconnected = export->exp_disconnected;
+	export->exp_disconnected = 1;
+	spin_unlock(&export->exp_lock);
+
+	/* class_cleanup(), abort_recovery(), and class_fail_export()
+	 * all end up in here, and if any of them race we shouldn't
+	 * call extra class_export_puts().
*/ + if (already_disconnected) { + LASSERT(hlist_unhashed(&export->exp_nid_hash)); + goto no_disconn; + } + + CDEBUG(D_IOCTL, "disconnect: cookie %#llx\n", + export->exp_handle.h_cookie); + + if (!hlist_unhashed(&export->exp_nid_hash)) + cfs_hash_del(export->exp_obd->obd_nid_hash, + &export->exp_connection->c_peer.nid, + &export->exp_nid_hash); + + class_export_recovery_cleanup(export); + class_unlink_export(export); +no_disconn: + class_export_put(export); + return 0; +} +EXPORT_SYMBOL(class_disconnect); + +/* Return non-zero for a fully connected export */ +int class_connected_export(struct obd_export *exp) +{ + if (exp) { + int connected; + spin_lock(&exp->exp_lock); + connected = exp->exp_conn_cnt > 0; + spin_unlock(&exp->exp_lock); + return connected; + } + return 0; +} +EXPORT_SYMBOL(class_connected_export); + +static void class_disconnect_export_list(struct list_head *list, + enum obd_option flags) +{ + int rc; + struct obd_export *exp; + + /* It's possible that an export may disconnect itself, but + * nothing else will be added to this list. */ + while (!list_empty(list)) { + exp = list_entry(list->next, struct obd_export, + exp_obd_chain); + /* need for safe call CDEBUG after obd_disconnect */ + class_export_get(exp); + + spin_lock(&exp->exp_lock); + exp->exp_flags = flags; + spin_unlock(&exp->exp_lock); + + if (obd_uuid_equals(&exp->exp_client_uuid, + &exp->exp_obd->obd_uuid)) { + CDEBUG(D_HA, + "exp %p export uuid == obd uuid, don't discon\n", + exp); + /* Need to delete this now so we don't end up pointing + * to work_list later when this export is cleaned up. */ + list_del_init(&exp->exp_obd_chain); + class_export_put(exp); + continue; + } + + class_export_get(exp); + CDEBUG(D_HA, "%s: disconnecting export at %s (%p), last request at " CFS_TIME_T "\n", + exp->exp_obd->obd_name, obd_export_nid2str(exp), + exp, exp->exp_last_request_time); + /* release one export reference anyway */ + rc = obd_disconnect(exp); + + CDEBUG(D_HA, "disconnected export at %s (%p): rc %d\n", + obd_export_nid2str(exp), exp, rc); + class_export_put(exp); + } +} + +void class_disconnect_exports(struct obd_device *obd) +{ + struct list_head work_list; + + /* Move all of the exports from obd_exports to a work list, en masse. */ + INIT_LIST_HEAD(&work_list); + spin_lock(&obd->obd_dev_lock); + list_splice_init(&obd->obd_exports, &work_list); + list_splice_init(&obd->obd_delayed_exports, &work_list); + spin_unlock(&obd->obd_dev_lock); + + if (!list_empty(&work_list)) { + CDEBUG(D_HA, "OBD device %d (%p) has exports, disconnecting them\n", + obd->obd_minor, obd); + class_disconnect_export_list(&work_list, + exp_flags_from_obd(obd)); + } else + CDEBUG(D_HA, "OBD device %d (%p) has no exports\n", + obd->obd_minor, obd); +} +EXPORT_SYMBOL(class_disconnect_exports); + +/* Remove exports that have not completed recovery. + */ +void class_disconnect_stale_exports(struct obd_device *obd, + int (*test_export)(struct obd_export *)) +{ + struct list_head work_list; + struct obd_export *exp, *n; + int evicted = 0; + + INIT_LIST_HEAD(&work_list); + spin_lock(&obd->obd_dev_lock); + list_for_each_entry_safe(exp, n, &obd->obd_exports, + exp_obd_chain) { + /* don't count self-export as client */ + if (obd_uuid_equals(&exp->exp_client_uuid, + &exp->exp_obd->obd_uuid)) + continue; + + /* don't evict clients which have no slot in last_rcvd + * (e.g. 
lightweight connection) */ + if (exp->exp_target_data.ted_lr_idx == -1) + continue; + + spin_lock(&exp->exp_lock); + if (exp->exp_failed || test_export(exp)) { + spin_unlock(&exp->exp_lock); + continue; + } + exp->exp_failed = 1; + spin_unlock(&exp->exp_lock); + + list_move(&exp->exp_obd_chain, &work_list); + evicted++; + CDEBUG(D_HA, "%s: disconnect stale client %s@%s\n", + obd->obd_name, exp->exp_client_uuid.uuid, + exp->exp_connection == NULL ? "" : + libcfs_nid2str(exp->exp_connection->c_peer.nid)); + print_export_data(exp, "EVICTING", 0); + } + spin_unlock(&obd->obd_dev_lock); + + if (evicted) + LCONSOLE_WARN("%s: disconnecting %d stale clients\n", + obd->obd_name, evicted); + + class_disconnect_export_list(&work_list, exp_flags_from_obd(obd) | + OBD_OPT_ABORT_RECOV); +} +EXPORT_SYMBOL(class_disconnect_stale_exports); + +void class_fail_export(struct obd_export *exp) +{ + int rc, already_failed; + + spin_lock(&exp->exp_lock); + already_failed = exp->exp_failed; + exp->exp_failed = 1; + spin_unlock(&exp->exp_lock); + + if (already_failed) { + CDEBUG(D_HA, "disconnecting dead export %p/%s; skipping\n", + exp, exp->exp_client_uuid.uuid); + return; + } + + CDEBUG(D_HA, "disconnecting export %p/%s\n", + exp, exp->exp_client_uuid.uuid); + + if (obd_dump_on_timeout) + libcfs_debug_dumplog(); + + /* need for safe call CDEBUG after obd_disconnect */ + class_export_get(exp); + + /* Most callers into obd_disconnect are removing their own reference + * (request, for example) in addition to the one from the hash table. + * We don't have such a reference here, so make one. */ + class_export_get(exp); + rc = obd_disconnect(exp); + if (rc) + CERROR("disconnecting export %p failed: %d\n", exp, rc); + else + CDEBUG(D_HA, "disconnected export %p/%s\n", + exp, exp->exp_client_uuid.uuid); + class_export_put(exp); +} +EXPORT_SYMBOL(class_fail_export); + +char *obd_export_nid2str(struct obd_export *exp) +{ + if (exp->exp_connection != NULL) + return libcfs_nid2str(exp->exp_connection->c_peer.nid); + + return "(no nid)"; +} +EXPORT_SYMBOL(obd_export_nid2str); + +int obd_export_evict_by_nid(struct obd_device *obd, const char *nid) +{ + struct cfs_hash *nid_hash; + struct obd_export *doomed_exp = NULL; + int exports_evicted = 0; + + lnet_nid_t nid_key = libcfs_str2nid((char *)nid); + + spin_lock(&obd->obd_dev_lock); + /* umount has run already, so evict thread should leave + * its task to umount thread now */ + if (obd->obd_stopping) { + spin_unlock(&obd->obd_dev_lock); + return exports_evicted; + } + nid_hash = obd->obd_nid_hash; + cfs_hash_getref(nid_hash); + spin_unlock(&obd->obd_dev_lock); + + do { + doomed_exp = cfs_hash_lookup(nid_hash, &nid_key); + if (doomed_exp == NULL) + break; + + LASSERTF(doomed_exp->exp_connection->c_peer.nid == nid_key, + "nid %s found, wanted nid %s, requested nid %s\n", + obd_export_nid2str(doomed_exp), + libcfs_nid2str(nid_key), nid); + LASSERTF(doomed_exp != obd->obd_self_export, + "self-export is hashed by NID?\n"); + exports_evicted++; + LCONSOLE_WARN("%s: evicting %s (at %s) by administrative request\n", + obd->obd_name, + obd_uuid2str(&doomed_exp->exp_client_uuid), + obd_export_nid2str(doomed_exp)); + class_fail_export(doomed_exp); + class_export_put(doomed_exp); + } while (1); + + cfs_hash_putref(nid_hash); + + if (!exports_evicted) + CDEBUG(D_HA, + "%s: can't disconnect NID '%s': no exports found\n", + obd->obd_name, nid); + return exports_evicted; +} +EXPORT_SYMBOL(obd_export_evict_by_nid); + +int obd_export_evict_by_uuid(struct obd_device *obd, const char *uuid) +{ + 
struct cfs_hash *uuid_hash; + struct obd_export *doomed_exp = NULL; + struct obd_uuid doomed_uuid; + int exports_evicted = 0; + + spin_lock(&obd->obd_dev_lock); + if (obd->obd_stopping) { + spin_unlock(&obd->obd_dev_lock); + return exports_evicted; + } + uuid_hash = obd->obd_uuid_hash; + cfs_hash_getref(uuid_hash); + spin_unlock(&obd->obd_dev_lock); + + obd_str2uuid(&doomed_uuid, uuid); + if (obd_uuid_equals(&doomed_uuid, &obd->obd_uuid)) { + CERROR("%s: can't evict myself\n", obd->obd_name); + cfs_hash_putref(uuid_hash); + return exports_evicted; + } + + doomed_exp = cfs_hash_lookup(uuid_hash, &doomed_uuid); + + if (doomed_exp == NULL) { + CERROR("%s: can't disconnect %s: no exports found\n", + obd->obd_name, uuid); + } else { + CWARN("%s: evicting %s at administrative request\n", + obd->obd_name, doomed_exp->exp_client_uuid.uuid); + class_fail_export(doomed_exp); + class_export_put(doomed_exp); + exports_evicted++; + } + cfs_hash_putref(uuid_hash); + + return exports_evicted; +} +EXPORT_SYMBOL(obd_export_evict_by_uuid); + +#if LUSTRE_TRACKS_LOCK_EXP_REFS +void (*class_export_dump_hook)(struct obd_export*) = NULL; +EXPORT_SYMBOL(class_export_dump_hook); +#endif + +static void print_export_data(struct obd_export *exp, const char *status, + int locks) +{ + struct ptlrpc_reply_state *rs; + struct ptlrpc_reply_state *first_reply = NULL; + int nreplies = 0; + + spin_lock(&exp->exp_lock); + list_for_each_entry(rs, &exp->exp_outstanding_replies, + rs_exp_list) { + if (nreplies == 0) + first_reply = rs; + nreplies++; + } + spin_unlock(&exp->exp_lock); + + CDEBUG(D_HA, "%s: %s %p %s %s %d (%d %d %d) %d %d %d %d: %p %s %llu\n", + exp->exp_obd->obd_name, status, exp, exp->exp_client_uuid.uuid, + obd_export_nid2str(exp), atomic_read(&exp->exp_refcount), + atomic_read(&exp->exp_rpc_count), + atomic_read(&exp->exp_cb_count), + atomic_read(&exp->exp_locks_count), + exp->exp_disconnected, exp->exp_delayed, exp->exp_failed, + nreplies, first_reply, nreplies > 3 ? "..." : "", + exp->exp_last_committed); +#if LUSTRE_TRACKS_LOCK_EXP_REFS + if (locks && class_export_dump_hook != NULL) + class_export_dump_hook(exp); +#endif +} + +void dump_exports(struct obd_device *obd, int locks) +{ + struct obd_export *exp; + + spin_lock(&obd->obd_dev_lock); + list_for_each_entry(exp, &obd->obd_exports, exp_obd_chain) + print_export_data(exp, "ACTIVE", locks); + list_for_each_entry(exp, &obd->obd_unlinked_exports, exp_obd_chain) + print_export_data(exp, "UNLINKED", locks); + list_for_each_entry(exp, &obd->obd_delayed_exports, exp_obd_chain) + print_export_data(exp, "DELAYED", locks); + spin_unlock(&obd->obd_dev_lock); + spin_lock(&obd_zombie_impexp_lock); + list_for_each_entry(exp, &obd_zombie_exports, exp_obd_chain) + print_export_data(exp, "ZOMBIE", locks); + spin_unlock(&obd_zombie_impexp_lock); +} +EXPORT_SYMBOL(dump_exports); + +void obd_exports_barrier(struct obd_device *obd) +{ + int waited = 2; + LASSERT(list_empty(&obd->obd_exports)); + spin_lock(&obd->obd_dev_lock); + while (!list_empty(&obd->obd_unlinked_exports)) { + spin_unlock(&obd->obd_dev_lock); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(waited)); + if (waited > 5 && IS_PO2(waited)) { + LCONSOLE_WARN("%s is waiting for obd_unlinked_exports more than %d seconds. The obd refcount = %d. 
Is it stuck?\n", + obd->obd_name, waited, + atomic_read(&obd->obd_refcount)); + dump_exports(obd, 1); + } + waited *= 2; + spin_lock(&obd->obd_dev_lock); + } + spin_unlock(&obd->obd_dev_lock); +} +EXPORT_SYMBOL(obd_exports_barrier); + +/* Total amount of zombies to be destroyed */ +static int zombies_count; + +/** + * kill zombie imports and exports + */ +void obd_zombie_impexp_cull(void) +{ + struct obd_import *import; + struct obd_export *export; + + do { + spin_lock(&obd_zombie_impexp_lock); + + import = NULL; + if (!list_empty(&obd_zombie_imports)) { + import = list_entry(obd_zombie_imports.next, + struct obd_import, + imp_zombie_chain); + list_del_init(&import->imp_zombie_chain); + } + + export = NULL; + if (!list_empty(&obd_zombie_exports)) { + export = list_entry(obd_zombie_exports.next, + struct obd_export, + exp_obd_chain); + list_del_init(&export->exp_obd_chain); + } + + spin_unlock(&obd_zombie_impexp_lock); + + if (import != NULL) { + class_import_destroy(import); + spin_lock(&obd_zombie_impexp_lock); + zombies_count--; + spin_unlock(&obd_zombie_impexp_lock); + } + + if (export != NULL) { + class_export_destroy(export); + spin_lock(&obd_zombie_impexp_lock); + zombies_count--; + spin_unlock(&obd_zombie_impexp_lock); + } + + cond_resched(); + } while (import != NULL || export != NULL); +} + +static struct completion obd_zombie_start; +static struct completion obd_zombie_stop; +static unsigned long obd_zombie_flags; +static wait_queue_head_t obd_zombie_waitq; +static pid_t obd_zombie_pid; + +enum { + OBD_ZOMBIE_STOP = 0x0001, +}; + +/** + * check for work for kill zombie import/export thread. + */ +static int obd_zombie_impexp_check(void *arg) +{ + int rc; + + spin_lock(&obd_zombie_impexp_lock); + rc = (zombies_count == 0) && + !test_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags); + spin_unlock(&obd_zombie_impexp_lock); + + return rc; +} + +/** + * Add export to the obd_zombie thread and notify it. + */ +static void obd_zombie_export_add(struct obd_export *exp) { + spin_lock(&exp->exp_obd->obd_dev_lock); + LASSERT(!list_empty(&exp->exp_obd_chain)); + list_del_init(&exp->exp_obd_chain); + spin_unlock(&exp->exp_obd->obd_dev_lock); + spin_lock(&obd_zombie_impexp_lock); + zombies_count++; + list_add(&exp->exp_obd_chain, &obd_zombie_exports); + spin_unlock(&obd_zombie_impexp_lock); + + obd_zombie_impexp_notify(); +} + +/** + * Add import to the obd_zombie thread and notify it. + */ +static void obd_zombie_import_add(struct obd_import *imp) { + LASSERT(imp->imp_sec == NULL); + LASSERT(imp->imp_rq_pool == NULL); + spin_lock(&obd_zombie_impexp_lock); + LASSERT(list_empty(&imp->imp_zombie_chain)); + zombies_count++; + list_add(&imp->imp_zombie_chain, &obd_zombie_imports); + spin_unlock(&obd_zombie_impexp_lock); + + obd_zombie_impexp_notify(); +} + +/** + * notify import/export destroy thread about new zombie. + */ +static void obd_zombie_impexp_notify(void) +{ + /* + * Make sure obd_zombie_impexp_thread get this notification. 
+ * It is possible this signal only get by obd_zombie_barrier, and + * barrier gulps this notification and sleeps away and hangs ensues + */ + wake_up_all(&obd_zombie_waitq); +} + +/** + * check whether obd_zombie is idle + */ +static int obd_zombie_is_idle(void) +{ + int rc; + + LASSERT(!test_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags)); + spin_lock(&obd_zombie_impexp_lock); + rc = (zombies_count == 0); + spin_unlock(&obd_zombie_impexp_lock); + return rc; +} + +/** + * wait when obd_zombie import/export queues become empty + */ +void obd_zombie_barrier(void) +{ + struct l_wait_info lwi = { 0 }; + + if (obd_zombie_pid == current_pid()) + /* don't wait for myself */ + return; + l_wait_event(obd_zombie_waitq, obd_zombie_is_idle(), &lwi); +} +EXPORT_SYMBOL(obd_zombie_barrier); + + +/** + * destroy zombie export/import thread. + */ +static int obd_zombie_impexp_thread(void *unused) +{ + unshare_fs_struct(); + complete(&obd_zombie_start); + + obd_zombie_pid = current_pid(); + + while (!test_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags)) { + struct l_wait_info lwi = { 0 }; + + l_wait_event(obd_zombie_waitq, + !obd_zombie_impexp_check(NULL), &lwi); + obd_zombie_impexp_cull(); + + /* + * Notify obd_zombie_barrier callers that queues + * may be empty. + */ + wake_up(&obd_zombie_waitq); + } + + complete(&obd_zombie_stop); + + return 0; +} + + +/** + * start destroy zombie import/export thread + */ +int obd_zombie_impexp_init(void) +{ + struct task_struct *task; + + INIT_LIST_HEAD(&obd_zombie_imports); + INIT_LIST_HEAD(&obd_zombie_exports); + spin_lock_init(&obd_zombie_impexp_lock); + init_completion(&obd_zombie_start); + init_completion(&obd_zombie_stop); + init_waitqueue_head(&obd_zombie_waitq); + obd_zombie_pid = 0; + + task = kthread_run(obd_zombie_impexp_thread, NULL, "obd_zombid"); + if (IS_ERR(task)) + return PTR_ERR(task); + + wait_for_completion(&obd_zombie_start); + return 0; +} +/** + * stop destroy zombie import/export thread + */ +void obd_zombie_impexp_stop(void) +{ + set_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags); + obd_zombie_impexp_notify(); + wait_for_completion(&obd_zombie_stop); +} + +/***** Kernel-userspace comm helpers *******/ + +/* Get length of entire message, including header */ +int kuc_len(int payload_len) +{ + return sizeof(struct kuc_hdr) + payload_len; +} +EXPORT_SYMBOL(kuc_len); + +/* Get a pointer to kuc header, given a ptr to the payload + * @param p Pointer to payload area + * @returns Pointer to kuc header + */ +struct kuc_hdr *kuc_ptr(void *p) +{ + struct kuc_hdr *lh = ((struct kuc_hdr *)p) - 1; + LASSERT(lh->kuc_magic == KUC_MAGIC); + return lh; +} +EXPORT_SYMBOL(kuc_ptr); + +/* Test if payload is part of kuc message + * @param p Pointer to payload area + * @returns boolean + */ +int kuc_ispayload(void *p) +{ + struct kuc_hdr *kh = ((struct kuc_hdr *)p) - 1; + + if (kh->kuc_magic == KUC_MAGIC) + return 1; + else + return 0; +} +EXPORT_SYMBOL(kuc_ispayload); + +/* Alloc space for a message, and fill in header + * @return Pointer to payload area + */ +void *kuc_alloc(int payload_len, int transport, int type) +{ + struct kuc_hdr *lh; + int len = kuc_len(payload_len); + + OBD_ALLOC(lh, len); + if (lh == NULL) + return ERR_PTR(-ENOMEM); + + lh->kuc_magic = KUC_MAGIC; + lh->kuc_transport = transport; + lh->kuc_msgtype = type; + lh->kuc_msglen = len; + + return (void *)(lh + 1); +} +EXPORT_SYMBOL(kuc_alloc); + +/* Takes pointer to payload area */ +inline void kuc_free(void *p, int payload_len) +{ + struct kuc_hdr *lh = kuc_ptr(p); + OBD_FREE(lh, kuc_len(payload_len)); +} 
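[Editor's note] The kuc_* helpers above implement a "header immediately precedes payload" convention: allocation returns a pointer to the payload, and kuc_ptr() steps back over the header to recover it. Below is a userspace-style sketch of the same pointer arithmetic; the struct layout and MAGIC value are illustrative assumptions, not the real kuc_hdr/KUC_MAGIC.

/* Editor's sketch of the header-before-payload convention used by the kuc_*
 * helpers above; struct layout and MAGIC are illustrative only. */
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#define MAGIC 0x191C			/* stand-in for KUC_MAGIC */

struct hdr {
	uint16_t magic;
	uint16_t transport;
	uint16_t msgtype;
	uint16_t msglen;		/* header + payload */
};

static void *msg_alloc(size_t payload_len, int transport, int type)
{
	struct hdr *h = calloc(1, sizeof(*h) + payload_len);

	if (!h)
		return NULL;
	h->magic = MAGIC;
	h->transport = transport;
	h->msgtype = type;
	h->msglen = sizeof(*h) + payload_len;
	return h + 1;			/* caller sees only the payload */
}

static struct hdr *msg_hdr(void *payload)
{
	struct hdr *h = (struct hdr *)payload - 1;	/* cf. kuc_ptr() */

	assert(h->magic == MAGIC);
	return h;
}

int main(void)
{
	char *payload = msg_alloc(16, 1, 2);

	strcpy(payload, "hello");
	free(msg_hdr(payload));		/* cf. kuc_free() */
	return 0;
}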
+EXPORT_SYMBOL(kuc_free); diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/linux/linux-module.c b/kernel/drivers/staging/lustre/lustre/obdclass/linux/linux-module.c new file mode 100644 index 000000000..06944b863 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/linux/linux-module.c @@ -0,0 +1,449 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/linux/linux-module.c + * + * Object Devices Class Driver + * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../../../include/linux/libcfs/libcfs.h" +#include "../../../include/linux/lnet/lnetctl.h" +#include "../../include/obd_support.h" +#include "../../include/obd_class.h" +#include "../../include/lprocfs_status.h" +#include "../../include/lustre_ver.h" +#include "../../include/lustre/lustre_build_version.h" + +int proc_version; + +/* buffer MUST be at least the size of obd_ioctl_hdr */ +int obd_ioctl_getdata(char **buf, int *len, void *arg) +{ + struct obd_ioctl_hdr hdr; + struct obd_ioctl_data *data; + int err; + int offset = 0; + + if (copy_from_user(&hdr, (void *)arg, sizeof(hdr))) + return -EFAULT; + + if (hdr.ioc_version != OBD_IOCTL_VERSION) { + CERROR("Version mismatch kernel (%x) vs application (%x)\n", + OBD_IOCTL_VERSION, hdr.ioc_version); + return -EINVAL; + } + + if (hdr.ioc_len > OBD_MAX_IOCTL_BUFFER) { + CERROR("User buffer len %d exceeds %d max buffer\n", + hdr.ioc_len, OBD_MAX_IOCTL_BUFFER); + return -EINVAL; + } + + if (hdr.ioc_len < sizeof(struct obd_ioctl_data)) { + CERROR("User buffer too small for ioctl (%d)\n", hdr.ioc_len); + return -EINVAL; + } + + /* When there are lots of processes calling vmalloc on multi-core + * system, the high lock contention will hurt performance badly, + * obdfilter-survey is an example, which relies on ioctl. So we'd + * better avoid vmalloc on ioctl path. 
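[Editor's note] obd_ioctl_getdata() below first copies and validates a fixed-size header, then copies the full variable-length buffer and carves the inline sub-buffers out of the bulk area at rounded offsets. The sketch that follows shows only the carving step; round8() stands in for cfs_size_round(), and the 8-byte granularity is an assumption for illustration.

/* Editor's sketch: carving variable-length inline buffers out of one bulk
 * allocation at rounded offsets, as obd_ioctl_getdata() does below. */
#include <stddef.h>

static size_t round8(size_t len)
{
	return (len + 7) & ~(size_t)7;
}

struct req {
	size_t len1, len2, len3;
	char *buf1, *buf2, *buf3;
	char bulk[];			/* inline data follows the header */
};

static void carve(struct req *r)
{
	size_t off = 0;

	if (r->len1) {
		r->buf1 = &r->bulk[0];
		off += round8(r->len1);
	}
	if (r->len2) {
		r->buf2 = &r->bulk[off];
		off += round8(r->len2);
	}
	if (r->len3)
		r->buf3 = &r->bulk[off];
}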
LU-66 */ + OBD_ALLOC_LARGE(*buf, hdr.ioc_len); + if (*buf == NULL) { + CERROR("Cannot allocate control buffer of len %d\n", + hdr.ioc_len); + return -EINVAL; + } + *len = hdr.ioc_len; + data = (struct obd_ioctl_data *)*buf; + + if (copy_from_user(*buf, (void *)arg, hdr.ioc_len)) { + err = -EFAULT; + goto free_buf; + } + if (hdr.ioc_len != data->ioc_len) { + err = -EINVAL; + goto free_buf; + } + + if (obd_ioctl_is_invalid(data)) { + CERROR("ioctl not correctly formatted\n"); + err = -EINVAL; + goto free_buf; + } + + if (data->ioc_inllen1) { + data->ioc_inlbuf1 = &data->ioc_bulk[0]; + offset += cfs_size_round(data->ioc_inllen1); + } + + if (data->ioc_inllen2) { + data->ioc_inlbuf2 = &data->ioc_bulk[0] + offset; + offset += cfs_size_round(data->ioc_inllen2); + } + + if (data->ioc_inllen3) { + data->ioc_inlbuf3 = &data->ioc_bulk[0] + offset; + offset += cfs_size_round(data->ioc_inllen3); + } + + if (data->ioc_inllen4) { + data->ioc_inlbuf4 = &data->ioc_bulk[0] + offset; + } + + return 0; + +free_buf: + OBD_FREE_LARGE(*buf, hdr.ioc_len); + return err; +} +EXPORT_SYMBOL(obd_ioctl_getdata); + +int obd_ioctl_popdata(void *arg, void *data, int len) +{ + int err; + + err = copy_to_user(arg, data, len); + if (err) + err = -EFAULT; + return err; +} +EXPORT_SYMBOL(obd_ioctl_popdata); + +/* opening /dev/obd */ +static int obd_class_open(struct inode *inode, struct file *file) +{ + try_module_get(THIS_MODULE); + return 0; +} + +/* closing /dev/obd */ +static int obd_class_release(struct inode *inode, struct file *file) +{ + module_put(THIS_MODULE); + return 0; +} + +/* to control /dev/obd */ +static long obd_class_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + int err = 0; + + /* Allow non-root access for OBD_IOC_PING_TARGET - used by lfs check */ + if (!capable(CFS_CAP_SYS_ADMIN) && (cmd != OBD_IOC_PING_TARGET)) + return err = -EACCES; + if ((cmd & 0xffffff00) == ((int)'T') << 8) /* ignore all tty ioctls */ + return err = -ENOTTY; + + err = class_handle_ioctl(cmd, (unsigned long)arg); + + return err; +} + +/* declare character device */ +static struct file_operations obd_psdev_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = obd_class_ioctl, /* unlocked_ioctl */ + .open = obd_class_open, /* open */ + .release = obd_class_release, /* release */ +}; + +/* modules setup */ +struct miscdevice obd_psdev = { + .minor = OBD_DEV_MINOR, + .name = OBD_DEV_NAME, + .fops = &obd_psdev_fops, +}; + + +#if defined (CONFIG_PROC_FS) +static int obd_proc_version_seq_show(struct seq_file *m, void *v) +{ + seq_printf(m, "lustre: %s\nkernel: %s\nbuild: %s\n", + LUSTRE_VERSION_STRING, "patchless_client", BUILD_VERSION); + return 0; +} +LPROC_SEQ_FOPS_RO(obd_proc_version); + +int obd_proc_pinger_seq_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%s\n", "on"); + return 0; +} +LPROC_SEQ_FOPS_RO(obd_proc_pinger); + +static int obd_proc_health_seq_show(struct seq_file *m, void *v) +{ + bool healthy = true; + int i; + + if (libcfs_catastrophe) + seq_printf(m, "LBUG\n"); + + read_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd; + + obd = class_num2obd(i); + if (obd == NULL || !obd->obd_attached || !obd->obd_set_up) + continue; + + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + if (obd->obd_stopping) + continue; + + class_incref(obd, __func__, current); + read_unlock(&obd_dev_lock); + + if (obd_health_check(NULL, obd)) { + seq_printf(m, "device %s reported unhealthy\n", + obd->obd_name); + healthy = false; + } + class_decref(obd, __func__, current); + 
read_lock(&obd_dev_lock); + } + read_unlock(&obd_dev_lock); + + if (healthy) + seq_puts(m, "healthy\n"); + else + seq_puts(m, "NOT HEALTHY\n"); + + return 0; +} +LPROC_SEQ_FOPS_RO(obd_proc_health); + +static int obd_proc_jobid_var_seq_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%s\n", obd_jobid_var); + return 0; +} + +static ssize_t obd_proc_jobid_var_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + if (!count || count > JOBSTATS_JOBID_VAR_MAX_LEN) + return -EINVAL; + + memset(obd_jobid_var, 0, JOBSTATS_JOBID_VAR_MAX_LEN + 1); + + /* This might leave the var invalid on error, which is probably fine.*/ + if (copy_from_user(obd_jobid_var, buffer, count)) + return -EFAULT; + + /* Trim the trailing '\n' if any */ + if (obd_jobid_var[count - 1] == '\n') + obd_jobid_var[count - 1] = 0; + + return count; +} +LPROC_SEQ_FOPS(obd_proc_jobid_var); + +static int obd_proc_jobid_name_seq_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%s\n", obd_jobid_var); + return 0; +} + +static ssize_t obd_proc_jobid_name_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + if (!count || count > JOBSTATS_JOBID_SIZE) + return -EINVAL; + + if (copy_from_user(obd_jobid_node, buffer, count)) + return -EFAULT; + + obd_jobid_node[count] = 0; + + /* Trim the trailing '\n' if any */ + if (obd_jobid_node[count - 1] == '\n') + obd_jobid_node[count - 1] = 0; + + return count; +} +LPROC_SEQ_FOPS(obd_proc_jobid_name); + +/* Root for /proc/fs/lustre */ +struct proc_dir_entry *proc_lustre_root = NULL; +EXPORT_SYMBOL(proc_lustre_root); + +struct lprocfs_vars lprocfs_base[] = { + { "version", &obd_proc_version_fops }, + { "pinger", &obd_proc_pinger_fops }, + { "health_check", &obd_proc_health_fops }, + { "jobid_var", &obd_proc_jobid_var_fops }, + { .name = "jobid_name", + .fops = &obd_proc_jobid_name_fops}, + { NULL } +}; + +static void *obd_device_list_seq_start(struct seq_file *p, loff_t *pos) +{ + if (*pos >= class_devno_max()) + return NULL; + + return pos; +} + +static void obd_device_list_seq_stop(struct seq_file *p, void *v) +{ +} + +static void *obd_device_list_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + ++*pos; + if (*pos >= class_devno_max()) + return NULL; + + return pos; +} + +static int obd_device_list_seq_show(struct seq_file *p, void *v) +{ + loff_t index = *(loff_t *)v; + struct obd_device *obd = class_num2obd((int)index); + char *status; + + if (obd == NULL) + return 0; + + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + if (obd->obd_stopping) + status = "ST"; + else if (obd->obd_inactive) + status = "IN"; + else if (obd->obd_set_up) + status = "UP"; + else if (obd->obd_attached) + status = "AT"; + else + status = "--"; + + seq_printf(p, "%3d %s %s %s %s %d\n", + (int)index, status, obd->obd_type->typ_name, + obd->obd_name, obd->obd_uuid.uuid, + atomic_read(&obd->obd_refcount)); + return 0; +} + +struct seq_operations obd_device_list_sops = { + .start = obd_device_list_seq_start, + .stop = obd_device_list_seq_stop, + .next = obd_device_list_seq_next, + .show = obd_device_list_seq_show, +}; + +static int obd_device_list_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc = seq_open(file, &obd_device_list_sops); + + if (rc) + return rc; + + seq = file->private_data; + seq->private = PDE_DATA(inode); + + return 0; +} + +struct file_operations obd_device_list_fops = { + .owner = THIS_MODULE, + .open = obd_device_list_open, + .read = seq_read, + .llseek = seq_lseek, + .release = 
seq_release, +}; + +int class_procfs_init(void) +{ + int rc = 0; + + proc_lustre_root = lprocfs_register("fs/lustre", NULL, + lprocfs_base, NULL); + if (IS_ERR(proc_lustre_root)) { + rc = PTR_ERR(proc_lustre_root); + proc_lustre_root = NULL; + goto out; + } + + rc = lprocfs_seq_create(proc_lustre_root, "devices", 0444, + &obd_device_list_fops, NULL); +out: + if (rc) + CERROR("error adding /proc/fs/lustre/devices file\n"); + return 0; +} + +int class_procfs_clean(void) +{ + if (proc_lustre_root) { + lprocfs_remove(&proc_lustre_root); + } + return 0; +} +#endif /* CONFIG_PROC_FS */ diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/linux/linux-obdo.c b/kernel/drivers/staging/lustre/lustre/obdclass/linux/linux-obdo.c new file mode 100644 index 000000000..62ed706b1 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/linux/linux-obdo.c @@ -0,0 +1,222 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
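[Editor's note] class_procfs_init() above registers /proc/fs/lustre and a "devices" seq_file whose rows follow the format printed by obd_device_list_seq_show() ("index status type name uuid refcount"). A hedged userspace sketch of a reader for that file follows; the path assumes the default proc mount and the field meanings are taken from the seq_printf() above.

/* Editor's sketch: parsing /proc/fs/lustre/devices as produced by
 * obd_device_list_seq_show(). Purely illustrative. */
#include <stdio.h>

int main(void)
{
	char status[8], type[64], name[128], uuid[64];
	int index, refcount;
	FILE *f = fopen("/proc/fs/lustre/devices", "r");

	if (!f)
		return 1;
	while (fscanf(f, "%d %7s %63s %127s %63s %d",
		      &index, status, type, name, uuid, &refcount) == 6)
		printf("device %d (%s) is %s, %d refs\n",
		       index, name, status, refcount);
	fclose(f);
	return 0;
}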
+ * + * lustre/obdclass/linux/linux-obdo.c + * + * Object Devices Class Driver + * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include "../../include/obd_class.h" +#include "../../include/lustre/lustre_idl.h" + +#include +#include /* for PAGE_CACHE_SIZE */ + +/*FIXME: Just copy from obdo_from_inode*/ +void obdo_from_la(struct obdo *dst, struct lu_attr *la, __u64 valid) +{ + u32 newvalid = 0; + + if (valid & LA_ATIME) { + dst->o_atime = la->la_atime; + newvalid |= OBD_MD_FLATIME; + } + if (valid & LA_MTIME) { + dst->o_mtime = la->la_mtime; + newvalid |= OBD_MD_FLMTIME; + } + if (valid & LA_CTIME) { + dst->o_ctime = la->la_ctime; + newvalid |= OBD_MD_FLCTIME; + } + if (valid & LA_SIZE) { + dst->o_size = la->la_size; + newvalid |= OBD_MD_FLSIZE; + } + if (valid & LA_BLOCKS) { /* allocation of space (x512 bytes) */ + dst->o_blocks = la->la_blocks; + newvalid |= OBD_MD_FLBLOCKS; + } + if (valid & LA_TYPE) { + dst->o_mode = (dst->o_mode & S_IALLUGO) | + (la->la_mode & S_IFMT); + newvalid |= OBD_MD_FLTYPE; + } + if (valid & LA_MODE) { + dst->o_mode = (dst->o_mode & S_IFMT) | + (la->la_mode & S_IALLUGO); + newvalid |= OBD_MD_FLMODE; + } + if (valid & LA_UID) { + dst->o_uid = la->la_uid; + newvalid |= OBD_MD_FLUID; + } + if (valid & LA_GID) { + dst->o_gid = la->la_gid; + newvalid |= OBD_MD_FLGID; + } + dst->o_valid |= newvalid; +} +EXPORT_SYMBOL(obdo_from_la); + +/*FIXME: Just copy from obdo_from_inode*/ +void la_from_obdo(struct lu_attr *dst, struct obdo *obdo, u32 valid) +{ + __u64 newvalid = 0; + + valid &= obdo->o_valid; + + if (valid & OBD_MD_FLATIME) { + dst->la_atime = obdo->o_atime; + newvalid |= LA_ATIME; + } + if (valid & OBD_MD_FLMTIME) { + dst->la_mtime = obdo->o_mtime; + newvalid |= LA_MTIME; + } + if (valid & OBD_MD_FLCTIME) { + dst->la_ctime = obdo->o_ctime; + newvalid |= LA_CTIME; + } + if (valid & OBD_MD_FLSIZE) { + dst->la_size = obdo->o_size; + newvalid |= LA_SIZE; + } + if (valid & OBD_MD_FLBLOCKS) { + dst->la_blocks = obdo->o_blocks; + newvalid |= LA_BLOCKS; + } + if (valid & OBD_MD_FLTYPE) { + dst->la_mode = (dst->la_mode & S_IALLUGO) | + (obdo->o_mode & S_IFMT); + newvalid |= LA_TYPE; + } + if (valid & OBD_MD_FLMODE) { + dst->la_mode = (dst->la_mode & S_IFMT) | + (obdo->o_mode & S_IALLUGO); + newvalid |= LA_MODE; + } + if (valid & OBD_MD_FLUID) { + dst->la_uid = obdo->o_uid; + newvalid |= LA_UID; + } + if (valid & OBD_MD_FLGID) { + dst->la_gid = obdo->o_gid; + newvalid |= LA_GID; + } + dst->la_valid = newvalid; +} +EXPORT_SYMBOL(la_from_obdo); + +void obdo_refresh_inode(struct inode *dst, struct obdo *src, u32 valid) +{ + valid &= src->o_valid; + + if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME)) + CDEBUG(D_INODE, + "valid %#llx, cur time %lu/%lu, new %llu/%llu\n", + src->o_valid, LTIME_S(dst->i_mtime), + LTIME_S(dst->i_ctime), src->o_mtime, src->o_ctime); + + if (valid & OBD_MD_FLATIME && src->o_atime > LTIME_S(dst->i_atime)) + LTIME_S(dst->i_atime) = src->o_atime; + if (valid & OBD_MD_FLMTIME && src->o_mtime > LTIME_S(dst->i_mtime)) + LTIME_S(dst->i_mtime) = src->o_mtime; + if (valid & OBD_MD_FLCTIME && src->o_ctime > LTIME_S(dst->i_ctime)) + LTIME_S(dst->i_ctime) = src->o_ctime; + if (valid & OBD_MD_FLSIZE) + i_size_write(dst, src->o_size); + /* optimum IO size */ + if (valid & OBD_MD_FLBLKSZ && src->o_blksize > (1 << dst->i_blkbits)) + dst->i_blkbits = ffs(src->o_blksize) - 1; + + if (dst->i_blkbits < PAGE_CACHE_SHIFT) + dst->i_blkbits = 
PAGE_CACHE_SHIFT; + + /* allocation of space */ + if (valid & OBD_MD_FLBLOCKS && src->o_blocks > dst->i_blocks) + /* + * XXX shouldn't overflow be checked here like in + * obdo_to_inode(). + */ + dst->i_blocks = src->o_blocks; +} +EXPORT_SYMBOL(obdo_refresh_inode); + +void obdo_to_inode(struct inode *dst, struct obdo *src, u32 valid) +{ + valid &= src->o_valid; + + LASSERTF(!(valid & (OBD_MD_FLTYPE | OBD_MD_FLGENER | OBD_MD_FLFID | + OBD_MD_FLID | OBD_MD_FLGROUP)), + "object "DOSTID", valid %x\n", POSTID(&src->o_oi), valid); + + if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME)) + CDEBUG(D_INODE, + "valid %#llx, cur time %lu/%lu, new %llu/%llu\n", + src->o_valid, LTIME_S(dst->i_mtime), + LTIME_S(dst->i_ctime), src->o_mtime, src->o_ctime); + + if (valid & OBD_MD_FLATIME) + LTIME_S(dst->i_atime) = src->o_atime; + if (valid & OBD_MD_FLMTIME) + LTIME_S(dst->i_mtime) = src->o_mtime; + if (valid & OBD_MD_FLCTIME && src->o_ctime > LTIME_S(dst->i_ctime)) + LTIME_S(dst->i_ctime) = src->o_ctime; + if (valid & OBD_MD_FLSIZE) + i_size_write(dst, src->o_size); + if (valid & OBD_MD_FLBLOCKS) { /* allocation of space */ + dst->i_blocks = src->o_blocks; + if (dst->i_blocks < src->o_blocks) /* overflow */ + dst->i_blocks = -1; + + } + if (valid & OBD_MD_FLBLKSZ) + dst->i_blkbits = ffs(src->o_blksize)-1; + if (valid & OBD_MD_FLMODE) + dst->i_mode = (dst->i_mode & S_IFMT) | (src->o_mode & ~S_IFMT); + if (valid & OBD_MD_FLUID) + dst->i_uid = make_kuid(&init_user_ns, src->o_uid); + if (valid & OBD_MD_FLGID) + dst->i_gid = make_kgid(&init_user_ns, src->o_gid); + if (valid & OBD_MD_FLFLAGS) + dst->i_flags = src->o_flags; +} +EXPORT_SYMBOL(obdo_to_inode); diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c b/kernel/drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c new file mode 100644 index 000000000..4b62d2576 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c @@ -0,0 +1,405 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
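[Editor's note] The obdo translation helpers above (obdo_from_la, la_from_obdo, obdo_refresh_inode, obdo_to_inode) all share one shape: a "valid" bitmask gates which fields are copied, and the destination records which fields it actually received. A minimal sketch of that pattern follows; the attr structure and flag values are hypothetical.

/* Editor's sketch of the bitmask-gated attribute copy used by the obdo
 * helpers above; struct attr and the HAS_* flags are hypothetical. */
#include <stdint.h>

#define HAS_SIZE  0x1
#define HAS_MTIME 0x2

struct attr {
	uint64_t size;
	uint64_t mtime;
	uint32_t valid;			/* which fields are meaningful */
};

static void attr_copy(struct attr *dst, const struct attr *src, uint32_t want)
{
	uint32_t valid = want & src->valid;	/* only fields the source has */
	uint32_t newvalid = 0;

	if (valid & HAS_SIZE) {
		dst->size = src->size;
		newvalid |= HAS_SIZE;
	}
	if (valid & HAS_MTIME) {
		dst->mtime = src->mtime;
		newvalid |= HAS_MTIME;
	}
	dst->valid |= newvalid;
}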
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_CLASS + +#include "../../include/obd_support.h" +#include "../../include/lprocfs_status.h" + +#ifdef CONFIG_SYSCTL +static struct ctl_table_header *obd_table_header; +#endif + + +#define OBD_SYSCTL 300 + +enum { + OBD_TIMEOUT = 3, /* RPC timeout before recovery/intr */ + OBD_DUMP_ON_TIMEOUT, /* dump kernel debug log upon eviction */ + OBD_MEMUSED, /* bytes currently OBD_ALLOCated */ + OBD_PAGESUSED, /* pages currently OBD_PAGE_ALLOCated */ + OBD_MAXMEMUSED, /* maximum bytes OBD_ALLOCated concurrently */ + OBD_MAXPAGESUSED, /* maximum pages OBD_PAGE_ALLOCated concurrently */ + OBD_SYNCFILTER, /* XXX temporary, as we play with sync osts.. */ + OBD_LDLM_TIMEOUT, /* LDLM timeout for ASTs before client eviction */ + OBD_DUMP_ON_EVICTION, /* dump kernel debug log upon eviction */ + OBD_DEBUG_PEER_ON_TIMEOUT, /* dump peer debug when RPC times out */ + OBD_ALLOC_FAIL_RATE, /* memory allocation random failure rate */ + OBD_MAX_DIRTY_PAGES, /* maximum dirty pages */ + OBD_AT_MIN, /* Adaptive timeouts params */ + OBD_AT_MAX, + OBD_AT_EXTRA, + OBD_AT_EARLY_MARGIN, + OBD_AT_HISTORY, +}; + + +#ifdef CONFIG_SYSCTL +static int proc_set_timeout(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int rc; + + rc = proc_dointvec(table, write, buffer, lenp, ppos); + if (ldlm_timeout >= obd_timeout) + ldlm_timeout = max(obd_timeout / 3, 1U); + return rc; +} + +static int proc_memory_alloc(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + char buf[22]; + int len; + + if (!*lenp || (*ppos && !write)) { + *lenp = 0; + return 0; + } + if (write) + return -EINVAL; + + len = snprintf(buf, sizeof(buf), "%llu\n", obd_memory_sum()); + if (len > *lenp) + len = *lenp; + buf[len] = '\0'; + if (copy_to_user(buffer, buf, len)) + return -EFAULT; + *lenp = len; + *ppos += *lenp; + return 0; +} + +static int proc_pages_alloc(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + char buf[22]; + int len; + + if (!*lenp || (*ppos && !write)) { + *lenp = 0; + return 0; + } + if (write) + return -EINVAL; + + len = snprintf(buf, sizeof(buf), "%llu\n", obd_pages_sum()); + if (len > *lenp) + len = *lenp; + buf[len] = '\0'; + if (copy_to_user(buffer, buf, len)) + return -EFAULT; + *lenp = len; + *ppos += *lenp; + return 0; +} + +static int proc_mem_max(struct ctl_table *table, int write, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + char buf[22]; + int len; + + if (!*lenp || (*ppos && !write)) { + *lenp = 0; + return 0; + } + if (write) + return -EINVAL; + + len = snprintf(buf, sizeof(buf), "%llu\n", obd_memory_max()); + if (len > *lenp) + len = *lenp; + buf[len] = '\0'; + if (copy_to_user(buffer, buf, len)) + return -EFAULT; + *lenp = len; + *ppos += *lenp; + return 0; +} + +static int proc_pages_max(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + char buf[22]; + int len; + + if (!*lenp || (*ppos && !write)) { + *lenp = 0; + return 0; + } + if (write) + return -EINVAL; + + len = snprintf(buf, sizeof(buf), "%llu\n", obd_pages_max()); + if (len > *lenp) + len = *lenp; + buf[len] = '\0'; + if (copy_to_user(buffer, buf, len)) + return -EFAULT; + *lenp = len; + *ppos += *lenp; + return 0; +} + +static int proc_max_dirty_pages_in_mb(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int 
rc = 0; + + if (!table->data || !table->maxlen || !*lenp || (*ppos && !write)) { + *lenp = 0; + return 0; + } + if (write) { + rc = lprocfs_write_frac_helper(buffer, *lenp, + (unsigned int *)table->data, + 1 << (20 - PAGE_CACHE_SHIFT)); + /* Don't allow them to let dirty pages exceed 90% of system + * memory and set a hard minimum of 4MB. */ + if (obd_max_dirty_pages > ((totalram_pages / 10) * 9)) { + CERROR("Refusing to set max dirty pages to %u, which is more than 90%% of available RAM; setting to %lu\n", + obd_max_dirty_pages, + ((totalram_pages / 10) * 9)); + obd_max_dirty_pages = (totalram_pages / 10) * 9; + } else if (obd_max_dirty_pages < 4 << (20 - PAGE_CACHE_SHIFT)) { + obd_max_dirty_pages = 4 << (20 - PAGE_CACHE_SHIFT); + } + } else { + char buf[21]; + int len; + + len = lprocfs_read_frac_helper(buf, sizeof(buf), + *(unsigned int *)table->data, + 1 << (20 - PAGE_CACHE_SHIFT)); + if (len > *lenp) + len = *lenp; + buf[len] = '\0'; + if (copy_to_user(buffer, buf, len)) + return -EFAULT; + *lenp = len; + } + *ppos += *lenp; + return rc; +} + +static int proc_alloc_fail_rate(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int rc = 0; + + if (!table->data || !table->maxlen || !*lenp || (*ppos && !write)) { + *lenp = 0; + return 0; + } + if (write) { + rc = lprocfs_write_frac_helper(buffer, *lenp, + (unsigned int *)table->data, + OBD_ALLOC_FAIL_MULT); + } else { + char buf[21]; + int len; + + len = lprocfs_read_frac_helper(buf, 21, + *(unsigned int *)table->data, + OBD_ALLOC_FAIL_MULT); + if (len > *lenp) + len = *lenp; + buf[len] = '\0'; + if (copy_to_user(buffer, buf, len)) + return -EFAULT; + *lenp = len; + } + *ppos += *lenp; + return rc; +} + +static struct ctl_table obd_table[] = { + { + .procname = "timeout", + .data = &obd_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_set_timeout + }, + { + .procname = "debug_peer_on_timeout", + .data = &obd_debug_peer_on_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .procname = "dump_on_timeout", + .data = &obd_dump_on_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .procname = "dump_on_eviction", + .data = &obd_dump_on_eviction, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .procname = "memused", + .data = NULL, + .maxlen = 0, + .mode = 0444, + .proc_handler = &proc_memory_alloc + }, + { + .procname = "pagesused", + .data = NULL, + .maxlen = 0, + .mode = 0444, + .proc_handler = &proc_pages_alloc + }, + { + .procname = "memused_max", + .data = NULL, + .maxlen = 0, + .mode = 0444, + .proc_handler = &proc_mem_max + }, + { + .procname = "pagesused_max", + .data = NULL, + .maxlen = 0, + .mode = 0444, + .proc_handler = &proc_pages_max + }, + { + .procname = "ldlm_timeout", + .data = &ldlm_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_set_timeout + }, + { + .procname = "alloc_fail_rate", + .data = &obd_alloc_fail_rate, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_alloc_fail_rate + }, + { + .procname = "max_dirty_mb", + .data = &obd_max_dirty_pages, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_max_dirty_pages_in_mb + }, + { + .procname = "at_min", + .data = &at_min, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .procname = "at_max", + .data = &at_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + 
.procname = "at_extra", + .data = &at_extra, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .procname = "at_early_margin", + .data = &at_early_margin, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .procname = "at_history", + .data = &at_history, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + {} +}; + +static struct ctl_table parent_table[] = { + { + .procname = "lustre", + .data = NULL, + .maxlen = 0, + .mode = 0555, + .child = obd_table + }, + {} +}; +#endif + +void obd_sysctl_init(void) +{ +#ifdef CONFIG_SYSCTL + if (!obd_table_header) + obd_table_header = register_sysctl_table(parent_table); +#endif +} + +void obd_sysctl_clean(void) +{ +#ifdef CONFIG_SYSCTL + if (obd_table_header) + unregister_sysctl_table(obd_table_header); + obd_table_header = NULL; +#endif +} diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/llog.c b/kernel/drivers/staging/lustre/lustre/obdclass/llog.c new file mode 100644 index 000000000..114be4a78 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/llog.c @@ -0,0 +1,1007 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/llog.c + * + * OST<->MDS recovery logging infrastructure. + * Invariants in implementation: + * - we do not share logs among different OST<->MDS connections, so that + * if an OST or MDS fails it need only look at log(s) relevant to itself + * + * Author: Andreas Dilger + * Author: Alex Zhuravlev + * Author: Mikhail Pershin + */ + +#define DEBUG_SUBSYSTEM S_LOG + + +#include "../include/obd_class.h" +#include "../include/lustre_log.h" +#include "llog_internal.h" + +/* + * Allocate a new log or catalog handle + * Used inside llog_open(). + */ +static struct llog_handle *llog_alloc_handle(void) +{ + struct llog_handle *loghandle; + + OBD_ALLOC_PTR(loghandle); + if (loghandle == NULL) + return NULL; + + init_rwsem(&loghandle->lgh_lock); + spin_lock_init(&loghandle->lgh_hdr_lock); + INIT_LIST_HEAD(&loghandle->u.phd.phd_entry); + atomic_set(&loghandle->lgh_refcount, 1); + + return loghandle; +} + +/* + * Free llog handle and header data if exists. 
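[Editor's note] The llog handle allocated by llog_alloc_handle() above is reference counted: llog_handle_get()/llog_handle_put() below adjust lgh_refcount, and the handle is freed only when the last reference is dropped. A generic sketch of that get/put idiom follows; struct foo and its fields are hypothetical.

/* Editor's sketch of the get/put lifetime used for llog handles below. */
#include <linux/atomic.h>
#include <linux/slab.h>

struct foo {
	atomic_t refcount;
};

static struct foo *foo_alloc(void)
{
	struct foo *f = kzalloc(sizeof(*f), GFP_NOFS);

	if (f)
		atomic_set(&f->refcount, 1);	/* caller owns one reference */
	return f;
}

static void foo_get(struct foo *f)
{
	atomic_inc(&f->refcount);
}

static void foo_put(struct foo *f)
{
	if (atomic_dec_and_test(&f->refcount))
		kfree(f);			/* last reference frees it */
}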
Used in llog_close() only + */ +static void llog_free_handle(struct llog_handle *loghandle) +{ + LASSERT(loghandle != NULL); + + /* failed llog_init_handle */ + if (!loghandle->lgh_hdr) + goto out; + + if (loghandle->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN) + LASSERT(list_empty(&loghandle->u.phd.phd_entry)); + else if (loghandle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) + LASSERT(list_empty(&loghandle->u.chd.chd_head)); + LASSERT(sizeof(*(loghandle->lgh_hdr)) == LLOG_CHUNK_SIZE); + OBD_FREE(loghandle->lgh_hdr, LLOG_CHUNK_SIZE); +out: + OBD_FREE_PTR(loghandle); +} + +void llog_handle_get(struct llog_handle *loghandle) +{ + atomic_inc(&loghandle->lgh_refcount); +} + +void llog_handle_put(struct llog_handle *loghandle) +{ + LASSERT(atomic_read(&loghandle->lgh_refcount) > 0); + if (atomic_dec_and_test(&loghandle->lgh_refcount)) + llog_free_handle(loghandle); +} + +/* returns negative on error; 0 if success; 1 if success & log destroyed */ +int llog_cancel_rec(const struct lu_env *env, struct llog_handle *loghandle, + int index) +{ + struct llog_log_hdr *llh = loghandle->lgh_hdr; + int rc = 0; + + CDEBUG(D_RPCTRACE, "Canceling %d in log "DOSTID"\n", + index, POSTID(&loghandle->lgh_id.lgl_oi)); + + if (index == 0) { + CERROR("Can't cancel index 0 which is header\n"); + return -EINVAL; + } + + spin_lock(&loghandle->lgh_hdr_lock); + if (!ext2_clear_bit(index, llh->llh_bitmap)) { + spin_unlock(&loghandle->lgh_hdr_lock); + CDEBUG(D_RPCTRACE, "Catalog index %u already clear?\n", index); + return -ENOENT; + } + + llh->llh_count--; + + if ((llh->llh_flags & LLOG_F_ZAP_WHEN_EMPTY) && + (llh->llh_count == 1) && + (loghandle->lgh_last_idx == (LLOG_BITMAP_BYTES * 8) - 1)) { + spin_unlock(&loghandle->lgh_hdr_lock); + rc = llog_destroy(env, loghandle); + if (rc < 0) { + CERROR("%s: can't destroy empty llog #"DOSTID + "#%08x: rc = %d\n", + loghandle->lgh_ctxt->loc_obd->obd_name, + POSTID(&loghandle->lgh_id.lgl_oi), + loghandle->lgh_id.lgl_ogen, rc); + goto out_err; + } + return 1; + } + spin_unlock(&loghandle->lgh_hdr_lock); + + rc = llog_write(env, loghandle, &llh->llh_hdr, NULL, 0, NULL, 0); + if (rc < 0) { + CERROR("%s: fail to write header for llog #"DOSTID + "#%08x: rc = %d\n", + loghandle->lgh_ctxt->loc_obd->obd_name, + POSTID(&loghandle->lgh_id.lgl_oi), + loghandle->lgh_id.lgl_ogen, rc); + goto out_err; + } + return 0; +out_err: + spin_lock(&loghandle->lgh_hdr_lock); + ext2_set_bit(index, llh->llh_bitmap); + llh->llh_count++; + spin_unlock(&loghandle->lgh_hdr_lock); + return rc; +} +EXPORT_SYMBOL(llog_cancel_rec); + +static int llog_read_header(const struct lu_env *env, + struct llog_handle *handle, + struct obd_uuid *uuid) +{ + struct llog_operations *lop; + int rc; + + rc = llog_handle2ops(handle, &lop); + if (rc) + return rc; + + if (lop->lop_read_header == NULL) + return -EOPNOTSUPP; + + rc = lop->lop_read_header(env, handle); + if (rc == LLOG_EEMPTY) { + struct llog_log_hdr *llh = handle->lgh_hdr; + + handle->lgh_last_idx = 0; /* header is record with index 0 */ + llh->llh_count = 1; /* for the header record */ + llh->llh_hdr.lrh_type = LLOG_HDR_MAGIC; + llh->llh_hdr.lrh_len = llh->llh_tail.lrt_len = LLOG_CHUNK_SIZE; + llh->llh_hdr.lrh_index = llh->llh_tail.lrt_index = 0; + llh->llh_timestamp = get_seconds(); + if (uuid) + memcpy(&llh->llh_tgtuuid, uuid, + sizeof(llh->llh_tgtuuid)); + llh->llh_bitmap_offset = offsetof(typeof(*llh), llh_bitmap); + ext2_set_bit(0, llh->llh_bitmap); + rc = 0; + } + return rc; +} + +int llog_init_handle(const struct lu_env *env, struct llog_handle *handle, + int flags, struct 
obd_uuid *uuid) +{ + struct llog_log_hdr *llh; + int rc; + + LASSERT(handle->lgh_hdr == NULL); + + OBD_ALLOC_PTR(llh); + if (llh == NULL) + return -ENOMEM; + handle->lgh_hdr = llh; + /* first assign flags to use llog_client_ops */ + llh->llh_flags = flags; + rc = llog_read_header(env, handle, uuid); + if (rc == 0) { + if (unlikely((llh->llh_flags & LLOG_F_IS_PLAIN && + flags & LLOG_F_IS_CAT) || + (llh->llh_flags & LLOG_F_IS_CAT && + flags & LLOG_F_IS_PLAIN))) { + CERROR("%s: llog type is %s but initializing %s\n", + handle->lgh_ctxt->loc_obd->obd_name, + llh->llh_flags & LLOG_F_IS_CAT ? + "catalog" : "plain", + flags & LLOG_F_IS_CAT ? "catalog" : "plain"); + rc = -EINVAL; + goto out; + } else if (llh->llh_flags & + (LLOG_F_IS_PLAIN | LLOG_F_IS_CAT)) { + /* + * it is possible to open llog without specifying llog + * type so it is taken from llh_flags + */ + flags = llh->llh_flags; + } else { + /* for some reason the llh_flags has no type set */ + CERROR("llog type is not specified!\n"); + rc = -EINVAL; + goto out; + } + if (unlikely(uuid && + !obd_uuid_equals(uuid, &llh->llh_tgtuuid))) { + CERROR("%s: llog uuid mismatch: %s/%s\n", + handle->lgh_ctxt->loc_obd->obd_name, + (char *)uuid->uuid, + (char *)llh->llh_tgtuuid.uuid); + rc = -EEXIST; + goto out; + } + } + if (flags & LLOG_F_IS_CAT) { + LASSERT(list_empty(&handle->u.chd.chd_head)); + INIT_LIST_HEAD(&handle->u.chd.chd_head); + llh->llh_size = sizeof(struct llog_logid_rec); + } else if (!(flags & LLOG_F_IS_PLAIN)) { + CERROR("%s: unknown flags: %#x (expected %#x or %#x)\n", + handle->lgh_ctxt->loc_obd->obd_name, + flags, LLOG_F_IS_CAT, LLOG_F_IS_PLAIN); + rc = -EINVAL; + } +out: + if (rc) { + OBD_FREE_PTR(llh); + handle->lgh_hdr = NULL; + } + return rc; +} +EXPORT_SYMBOL(llog_init_handle); + +static int llog_process_thread(void *arg) +{ + struct llog_process_info *lpi = arg; + struct llog_handle *loghandle = lpi->lpi_loghandle; + struct llog_log_hdr *llh = loghandle->lgh_hdr; + struct llog_process_cat_data *cd = lpi->lpi_catdata; + char *buf; + __u64 cur_offset = LLOG_CHUNK_SIZE; + __u64 last_offset; + int rc = 0, index = 1, last_index; + int saved_index = 0; + int last_called_index = 0; + + LASSERT(llh); + + OBD_ALLOC(buf, LLOG_CHUNK_SIZE); + if (!buf) { + lpi->lpi_rc = -ENOMEM; + return 0; + } + + if (cd != NULL) { + last_called_index = cd->lpcd_first_idx; + index = cd->lpcd_first_idx + 1; + } + if (cd != NULL && cd->lpcd_last_idx) + last_index = cd->lpcd_last_idx; + else + last_index = LLOG_BITMAP_BYTES * 8 - 1; + + while (rc == 0) { + struct llog_rec_hdr *rec; + + /* skip records not set in bitmap */ + while (index <= last_index && + !ext2_test_bit(index, llh->llh_bitmap)) + ++index; + + LASSERT(index <= last_index + 1); + if (index == last_index + 1) + break; +repeat: + CDEBUG(D_OTHER, "index: %d last_index %d\n", + index, last_index); + + /* get the buf with our target record; avoid old garbage */ + memset(buf, 0, LLOG_CHUNK_SIZE); + last_offset = cur_offset; + rc = llog_next_block(lpi->lpi_env, loghandle, &saved_index, + index, &cur_offset, buf, LLOG_CHUNK_SIZE); + if (rc) + goto out; + + /* NB: when rec->lrh_len is accessed it is already swabbed + * since it is used at the "end" of the loop and the rec + * swabbing is done at the beginning of the loop. 
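[Editor's note] llog_process_thread() below walks the log one LLOG_CHUNK_SIZE buffer at a time, skipping indices whose bit is clear in the header bitmap and handing every remaining record to a callback. A compact userspace sketch of that bitmap-gated walk over length-prefixed records follows; the record layout is simplified and the callback type is hypothetical.

/* Editor's sketch of the bitmap-gated record walk in llog_process_thread(). */
#include <stdint.h>
#include <stddef.h>

struct rec {
	uint32_t len;			/* total record length, header included */
	uint32_t index;			/* slot in the header bitmap */
};

static int bit_set(const uint8_t *bitmap, uint32_t idx)
{
	return bitmap[idx / 8] & (1 << (idx % 8));
}

typedef int (*rec_cb)(const struct rec *r, void *arg);

static int walk(const char *buf, size_t buflen, const uint8_t *bitmap,
		rec_cb cb, void *arg)
{
	const char *p = buf;

	while (p + sizeof(struct rec) <= buf + buflen) {
		const struct rec *r = (const struct rec *)p;
		int rc;

		if (r->len == 0 || p + r->len > buf + buflen)
			break;			/* corrupt or end of chunk */
		if (bit_set(bitmap, r->index)) {	/* cancelled records skipped */
			rc = cb(r, arg);
			if (rc)
				return rc;
		}
		p += r->len;			/* records are length-prefixed */
	}
	return 0;
}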
*/ + for (rec = (struct llog_rec_hdr *)buf; + (char *)rec < buf + LLOG_CHUNK_SIZE; + rec = (struct llog_rec_hdr *)((char *)rec + rec->lrh_len)){ + + CDEBUG(D_OTHER, "processing rec 0x%p type %#x\n", + rec, rec->lrh_type); + + if (LLOG_REC_HDR_NEEDS_SWABBING(rec)) + lustre_swab_llog_rec(rec); + + CDEBUG(D_OTHER, "after swabbing, type=%#x idx=%d\n", + rec->lrh_type, rec->lrh_index); + + if (rec->lrh_index == 0) { + /* probably another rec just got added? */ + rc = 0; + if (index <= loghandle->lgh_last_idx) + goto repeat; + goto out; /* no more records */ + } + if (rec->lrh_len == 0 || + rec->lrh_len > LLOG_CHUNK_SIZE) { + CWARN("invalid length %d in llog record for index %d/%d\n", + rec->lrh_len, + rec->lrh_index, index); + rc = -EINVAL; + goto out; + } + + if (rec->lrh_index < index) { + CDEBUG(D_OTHER, "skipping lrh_index %d\n", + rec->lrh_index); + continue; + } + + CDEBUG(D_OTHER, + "lrh_index: %d lrh_len: %d (%d remains)\n", + rec->lrh_index, rec->lrh_len, + (int)(buf + LLOG_CHUNK_SIZE - (char *)rec)); + + loghandle->lgh_cur_idx = rec->lrh_index; + loghandle->lgh_cur_offset = (char *)rec - (char *)buf + + last_offset; + + /* if set, process the callback on this record */ + if (ext2_test_bit(index, llh->llh_bitmap)) { + rc = lpi->lpi_cb(lpi->lpi_env, loghandle, rec, + lpi->lpi_cbdata); + last_called_index = index; + if (rc == LLOG_PROC_BREAK) { + goto out; + } else if (rc == LLOG_DEL_RECORD) { + llog_cancel_rec(lpi->lpi_env, + loghandle, + rec->lrh_index); + rc = 0; + } + if (rc) + goto out; + } else { + CDEBUG(D_OTHER, "Skipped index %d\n", index); + } + + /* next record, still in buffer? */ + ++index; + if (index > last_index) { + rc = 0; + goto out; + } + } + } + +out: + if (cd != NULL) + cd->lpcd_last_idx = last_called_index; + + OBD_FREE(buf, LLOG_CHUNK_SIZE); + lpi->lpi_rc = rc; + return 0; +} + +static int llog_process_thread_daemonize(void *arg) +{ + struct llog_process_info *lpi = arg; + struct lu_env env; + int rc; + + unshare_fs_struct(); + + /* client env has no keys, tags is just 0 */ + rc = lu_env_init(&env, LCT_LOCAL | LCT_MG_THREAD); + if (rc) + goto out; + lpi->lpi_env = &env; + + rc = llog_process_thread(arg); + + lu_env_fini(&env); +out: + complete(&lpi->lpi_completion); + return rc; +} + +int llog_process_or_fork(const struct lu_env *env, + struct llog_handle *loghandle, + llog_cb_t cb, void *data, void *catdata, bool fork) +{ + struct llog_process_info *lpi; + int rc; + + OBD_ALLOC_PTR(lpi); + if (lpi == NULL) { + CERROR("cannot alloc pointer\n"); + return -ENOMEM; + } + lpi->lpi_loghandle = loghandle; + lpi->lpi_cb = cb; + lpi->lpi_cbdata = data; + lpi->lpi_catdata = catdata; + + if (fork) { + /* The new thread can't use parent env, + * init the new one in llog_process_thread_daemonize. 
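[Editor's note] When forking is requested, llog_process_or_fork() hands the processing to a kthread and blocks on a completion so the work runs with a freshly initialized lu_env instead of the caller's. The general kernel pattern is sketched below with hypothetical names (struct job, job_thread, run_job).

/* Editor's sketch: run a worker inline or in a kthread and wait on a
 * completion, as llog_process_or_fork() does. All names are hypothetical. */
#include <linux/completion.h>
#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/types.h>

struct job {
	struct completion done;
	int rc;
};

static int do_work(struct job *j)
{
	j->rc = 0;			/* the real work would happen here */
	return j->rc;
}

static int job_thread(void *arg)
{
	struct job *j = arg;

	do_work(j);
	complete(&j->done);		/* wake the waiter in run_job() */
	return 0;
}

static int run_job(struct job *j, bool fork)
{
	if (fork) {
		struct task_struct *t;

		init_completion(&j->done);
		t = kthread_run(job_thread, j, "job_thread");
		if (IS_ERR(t))
			return PTR_ERR(t);
		wait_for_completion(&j->done);
	} else {
		do_work(j);		/* caller's context is good enough */
	}
	return j->rc;
}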
*/ + lpi->lpi_env = NULL; + init_completion(&lpi->lpi_completion); + rc = PTR_ERR(kthread_run(llog_process_thread_daemonize, lpi, + "llog_process_thread")); + if (IS_ERR_VALUE(rc)) { + CERROR("%s: cannot start thread: rc = %d\n", + loghandle->lgh_ctxt->loc_obd->obd_name, rc); + OBD_FREE_PTR(lpi); + return rc; + } + wait_for_completion(&lpi->lpi_completion); + } else { + lpi->lpi_env = env; + llog_process_thread(lpi); + } + rc = lpi->lpi_rc; + OBD_FREE_PTR(lpi); + return rc; +} +EXPORT_SYMBOL(llog_process_or_fork); + +int llog_process(const struct lu_env *env, struct llog_handle *loghandle, + llog_cb_t cb, void *data, void *catdata) +{ + return llog_process_or_fork(env, loghandle, cb, data, catdata, true); +} +EXPORT_SYMBOL(llog_process); + +int llog_reverse_process(const struct lu_env *env, + struct llog_handle *loghandle, llog_cb_t cb, + void *data, void *catdata) +{ + struct llog_log_hdr *llh = loghandle->lgh_hdr; + struct llog_process_cat_data *cd = catdata; + void *buf; + int rc = 0, first_index = 1, index, idx; + + OBD_ALLOC(buf, LLOG_CHUNK_SIZE); + if (!buf) + return -ENOMEM; + + if (cd != NULL) + first_index = cd->lpcd_first_idx + 1; + if (cd != NULL && cd->lpcd_last_idx) + index = cd->lpcd_last_idx; + else + index = LLOG_BITMAP_BYTES * 8 - 1; + + while (rc == 0) { + struct llog_rec_hdr *rec; + struct llog_rec_tail *tail; + + /* skip records not set in bitmap */ + while (index >= first_index && + !ext2_test_bit(index, llh->llh_bitmap)) + --index; + + LASSERT(index >= first_index - 1); + if (index == first_index - 1) + break; + + /* get the buf with our target record; avoid old garbage */ + memset(buf, 0, LLOG_CHUNK_SIZE); + rc = llog_prev_block(env, loghandle, index, buf, + LLOG_CHUNK_SIZE); + if (rc) + goto out; + + rec = buf; + idx = rec->lrh_index; + CDEBUG(D_RPCTRACE, "index %u : idx %u\n", index, idx); + while (idx < index) { + rec = (void *)rec + rec->lrh_len; + if (LLOG_REC_HDR_NEEDS_SWABBING(rec)) + lustre_swab_llog_rec(rec); + idx ++; + } + LASSERT(idx == index); + tail = (void *)rec + rec->lrh_len - sizeof(*tail); + + /* process records in buffer, starting where we found one */ + while ((void *)tail > buf) { + if (tail->lrt_index == 0) { + /* no more records */ + rc = 0; + goto out; + } + + /* if set, process the callback on this record */ + if (ext2_test_bit(index, llh->llh_bitmap)) { + rec = (void *)tail - tail->lrt_len + + sizeof(*tail); + + rc = cb(env, loghandle, rec, data); + if (rc == LLOG_PROC_BREAK) { + goto out; + } else if (rc == LLOG_DEL_RECORD) { + llog_cancel_rec(env, loghandle, + tail->lrt_index); + rc = 0; + } + if (rc) + goto out; + } + + /* previous record, still in buffer? 
*/ + --index; + if (index < first_index) { + rc = 0; + goto out; + } + tail = (void *)tail - tail->lrt_len; + } + } + +out: + if (buf) + OBD_FREE(buf, LLOG_CHUNK_SIZE); + return rc; +} +EXPORT_SYMBOL(llog_reverse_process); + +/** + * new llog API + * + * API functions: + * llog_open - open llog, may not exist + * llog_exist - check if llog exists + * llog_close - close opened llog, pair for open, frees llog_handle + * llog_declare_create - declare llog creation + * llog_create - create new llog on disk, need transaction handle + * llog_declare_write_rec - declaration of llog write + * llog_write_rec - write llog record on disk, need transaction handle + * llog_declare_add - declare llog catalog record addition + * llog_add - add llog record in catalog, need transaction handle + */ +int llog_exist(struct llog_handle *loghandle) +{ + struct llog_operations *lop; + int rc; + + rc = llog_handle2ops(loghandle, &lop); + if (rc) + return rc; + if (lop->lop_exist == NULL) + return -EOPNOTSUPP; + + rc = lop->lop_exist(loghandle); + return rc; +} +EXPORT_SYMBOL(llog_exist); + +int llog_declare_create(const struct lu_env *env, + struct llog_handle *loghandle, struct thandle *th) +{ + struct llog_operations *lop; + int raised, rc; + + rc = llog_handle2ops(loghandle, &lop); + if (rc) + return rc; + if (lop->lop_declare_create == NULL) + return -EOPNOTSUPP; + + raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); + if (!raised) + cfs_cap_raise(CFS_CAP_SYS_RESOURCE); + rc = lop->lop_declare_create(env, loghandle, th); + if (!raised) + cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + return rc; +} +EXPORT_SYMBOL(llog_declare_create); + +int llog_create(const struct lu_env *env, struct llog_handle *handle, + struct thandle *th) +{ + struct llog_operations *lop; + int raised, rc; + + rc = llog_handle2ops(handle, &lop); + if (rc) + return rc; + if (lop->lop_create == NULL) + return -EOPNOTSUPP; + + raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); + if (!raised) + cfs_cap_raise(CFS_CAP_SYS_RESOURCE); + rc = lop->lop_create(env, handle, th); + if (!raised) + cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + return rc; +} +EXPORT_SYMBOL(llog_create); + +int llog_declare_write_rec(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr *rec, int idx, + struct thandle *th) +{ + struct llog_operations *lop; + int raised, rc; + + rc = llog_handle2ops(handle, &lop); + if (rc) + return rc; + LASSERT(lop); + if (lop->lop_declare_write_rec == NULL) + return -EOPNOTSUPP; + + raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); + if (!raised) + cfs_cap_raise(CFS_CAP_SYS_RESOURCE); + rc = lop->lop_declare_write_rec(env, handle, rec, idx, th); + if (!raised) + cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + return rc; +} +EXPORT_SYMBOL(llog_declare_write_rec); + +int llog_write_rec(const struct lu_env *env, struct llog_handle *handle, + struct llog_rec_hdr *rec, struct llog_cookie *logcookies, + int numcookies, void *buf, int idx, struct thandle *th) +{ + struct llog_operations *lop; + int raised, rc, buflen; + + rc = llog_handle2ops(handle, &lop); + if (rc) + return rc; + + LASSERT(lop); + if (lop->lop_write_rec == NULL) + return -EOPNOTSUPP; + + if (buf) + buflen = rec->lrh_len + sizeof(struct llog_rec_hdr) + + sizeof(struct llog_rec_tail); + else + buflen = rec->lrh_len; + LASSERT(cfs_size_round(buflen) == buflen); + + raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); + if (!raised) + cfs_cap_raise(CFS_CAP_SYS_RESOURCE); + rc = lop->lop_write_rec(env, handle, rec, logcookies, numcookies, + buf, idx, th); + if (!raised) + 
cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + return rc; +} +EXPORT_SYMBOL(llog_write_rec); + +int llog_add(const struct lu_env *env, struct llog_handle *lgh, + struct llog_rec_hdr *rec, struct llog_cookie *logcookies, + void *buf, struct thandle *th) +{ + int raised, rc; + + if (lgh->lgh_logops->lop_add == NULL) + return -EOPNOTSUPP; + + raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); + if (!raised) + cfs_cap_raise(CFS_CAP_SYS_RESOURCE); + rc = lgh->lgh_logops->lop_add(env, lgh, rec, logcookies, buf, th); + if (!raised) + cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + return rc; +} +EXPORT_SYMBOL(llog_add); + +int llog_declare_add(const struct lu_env *env, struct llog_handle *lgh, + struct llog_rec_hdr *rec, struct thandle *th) +{ + int raised, rc; + + if (lgh->lgh_logops->lop_declare_add == NULL) + return -EOPNOTSUPP; + + raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); + if (!raised) + cfs_cap_raise(CFS_CAP_SYS_RESOURCE); + rc = lgh->lgh_logops->lop_declare_add(env, lgh, rec, th); + if (!raised) + cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + return rc; +} +EXPORT_SYMBOL(llog_declare_add); + +/** + * Helper function to open llog or create it if doesn't exist. + * It hides all transaction handling from caller. + */ +int llog_open_create(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_handle **res, struct llog_logid *logid, + char *name) +{ + struct dt_device *d; + struct thandle *th; + int rc; + + rc = llog_open(env, ctxt, res, logid, name, LLOG_OPEN_NEW); + if (rc) + return rc; + + if (llog_exist(*res)) + return 0; + + LASSERT((*res)->lgh_obj != NULL); + + d = lu2dt_dev((*res)->lgh_obj->do_lu.lo_dev); + + th = dt_trans_create(env, d); + if (IS_ERR(th)) { + rc = PTR_ERR(th); + goto out; + } + + rc = llog_declare_create(env, *res, th); + if (rc == 0) { + rc = dt_trans_start_local(env, d, th); + if (rc == 0) + rc = llog_create(env, *res, th); + } + dt_trans_stop(env, d, th); +out: + if (rc) + llog_close(env, *res); + return rc; +} +EXPORT_SYMBOL(llog_open_create); + +/** + * Helper function to delete existent llog. + */ +int llog_erase(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_logid *logid, char *name) +{ + struct llog_handle *handle; + int rc = 0, rc2; + + /* nothing to erase */ + if (name == NULL && logid == NULL) + return 0; + + rc = llog_open(env, ctxt, &handle, logid, name, LLOG_OPEN_EXISTS); + if (rc < 0) + return rc; + + rc = llog_init_handle(env, handle, LLOG_F_IS_PLAIN, NULL); + if (rc == 0) + rc = llog_destroy(env, handle); + + rc2 = llog_close(env, handle); + if (rc == 0) + rc = rc2; + return rc; +} +EXPORT_SYMBOL(llog_erase); + +/* + * Helper function for write record in llog. + * It hides all transaction handling from caller. + * Valid only with local llog. 
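[Editor's note] For orientation, this is roughly how a local caller would drive the helpers in this file: open or create a named log, initialize it as a plain log, append a record, and close it. The signatures match the definitions in this patch (declared in lustre_log.h); env, ctxt and the record are assumed to be set up elsewhere, and error handling is pared down.

/* Editor's usage sketch for the llog helpers in this file. */
static int example_append(const struct lu_env *env, struct llog_ctxt *ctxt,
			  struct llog_rec_hdr *rec)
{
	struct llog_handle *lgh;
	int rc;

	rc = llog_open_create(env, ctxt, &lgh, NULL, "example_log");
	if (rc)
		return rc;

	rc = llog_init_handle(env, lgh, LLOG_F_IS_PLAIN, NULL);
	if (rc)
		goto out_close;

	/* idx == -1 appends; llog_write() wraps the transaction for us */
	rc = llog_write(env, lgh, rec, NULL, 0, NULL, -1);

out_close:
	llog_close(env, lgh);
	return rc;
}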
+ */ +int llog_write(const struct lu_env *env, struct llog_handle *loghandle, + struct llog_rec_hdr *rec, struct llog_cookie *reccookie, + int cookiecount, void *buf, int idx) +{ + struct dt_device *dt; + struct thandle *th; + int rc; + + LASSERT(loghandle); + LASSERT(loghandle->lgh_ctxt); + LASSERT(loghandle->lgh_obj != NULL); + + dt = lu2dt_dev(loghandle->lgh_obj->do_lu.lo_dev); + + th = dt_trans_create(env, dt); + if (IS_ERR(th)) + return PTR_ERR(th); + + rc = llog_declare_write_rec(env, loghandle, rec, idx, th); + if (rc) + goto out_trans; + + rc = dt_trans_start_local(env, dt, th); + if (rc) + goto out_trans; + + down_write(&loghandle->lgh_lock); + rc = llog_write_rec(env, loghandle, rec, reccookie, + cookiecount, buf, idx, th); + up_write(&loghandle->lgh_lock); +out_trans: + dt_trans_stop(env, dt, th); + return rc; +} +EXPORT_SYMBOL(llog_write); + +int llog_open(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_handle **lgh, struct llog_logid *logid, + char *name, enum llog_open_param open_param) +{ + int raised; + int rc; + + LASSERT(ctxt); + LASSERT(ctxt->loc_logops); + + if (ctxt->loc_logops->lop_open == NULL) { + *lgh = NULL; + return -EOPNOTSUPP; + } + + *lgh = llog_alloc_handle(); + if (*lgh == NULL) + return -ENOMEM; + (*lgh)->lgh_ctxt = ctxt; + (*lgh)->lgh_logops = ctxt->loc_logops; + + raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); + if (!raised) + cfs_cap_raise(CFS_CAP_SYS_RESOURCE); + rc = ctxt->loc_logops->lop_open(env, *lgh, logid, name, open_param); + if (!raised) + cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + if (rc) { + llog_free_handle(*lgh); + *lgh = NULL; + } + return rc; +} +EXPORT_SYMBOL(llog_open); + +int llog_close(const struct lu_env *env, struct llog_handle *loghandle) +{ + struct llog_operations *lop; + int rc; + + rc = llog_handle2ops(loghandle, &lop); + if (rc) + goto out; + if (lop->lop_close == NULL) { + rc = -EOPNOTSUPP; + goto out; + } + rc = lop->lop_close(env, loghandle); +out: + llog_handle_put(loghandle); + return rc; +} +EXPORT_SYMBOL(llog_close); + +int llog_is_empty(const struct lu_env *env, struct llog_ctxt *ctxt, + char *name) +{ + struct llog_handle *llh; + int rc = 0; + + rc = llog_open(env, ctxt, &llh, NULL, name, LLOG_OPEN_EXISTS); + if (rc < 0) { + if (likely(rc == -ENOENT)) + rc = 0; + goto out; + } + + rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL); + if (rc) + goto out_close; + rc = llog_get_size(llh); + +out_close: + llog_close(env, llh); +out: + /* header is record 1 */ + return rc <= 1; +} +EXPORT_SYMBOL(llog_is_empty); + +int llog_copy_handler(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_handle *copy_llh = data; + + /* Append all records */ + return llog_write(env, copy_llh, rec, NULL, 0, NULL, -1); +} +EXPORT_SYMBOL(llog_copy_handler); + +/* backup plain llog */ +int llog_backup(const struct lu_env *env, struct obd_device *obd, + struct llog_ctxt *ctxt, struct llog_ctxt *bctxt, + char *name, char *backup) +{ + struct llog_handle *llh, *bllh; + int rc; + + + + /* open original log */ + rc = llog_open(env, ctxt, &llh, NULL, name, LLOG_OPEN_EXISTS); + if (rc < 0) { + /* the -ENOENT case is also reported to the caller + * but silently so it should handle that if needed. 
+ */ + if (rc != -ENOENT) + CERROR("%s: failed to open log %s: rc = %d\n", + obd->obd_name, name, rc); + return rc; + } + + rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL); + if (rc) + goto out_close; + + /* Make sure there's no old backup log */ + rc = llog_erase(env, bctxt, NULL, backup); + if (rc < 0 && rc != -ENOENT) + goto out_close; + + /* open backup log */ + rc = llog_open_create(env, bctxt, &bllh, NULL, backup); + if (rc) { + CERROR("%s: failed to open backup logfile %s: rc = %d\n", + obd->obd_name, backup, rc); + goto out_close; + } + + /* check that backup llog is not the same object as original one */ + if (llh->lgh_obj == bllh->lgh_obj) { + CERROR("%s: backup llog %s to itself (%s), objects %p/%p\n", + obd->obd_name, name, backup, llh->lgh_obj, + bllh->lgh_obj); + rc = -EEXIST; + goto out_backup; + } + + rc = llog_init_handle(env, bllh, LLOG_F_IS_PLAIN, NULL); + if (rc) + goto out_backup; + + /* Copy log record by record */ + rc = llog_process_or_fork(env, llh, llog_copy_handler, (void *)bllh, + NULL, false); + if (rc) + CERROR("%s: failed to backup log %s: rc = %d\n", + obd->obd_name, name, rc); +out_backup: + llog_close(env, bllh); +out_close: + llog_close(env, llh); + return rc; +} +EXPORT_SYMBOL(llog_backup); diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/llog_cat.c b/kernel/drivers/staging/lustre/lustre/obdclass/llog_cat.c new file mode 100644 index 000000000..c8f6ab006 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/llog_cat.c @@ -0,0 +1,815 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/llog_cat.c + * + * OST<->MDS recovery logging infrastructure. + * + * Invariants in implementation: + * - we do not share logs among different OST<->MDS connections, so that + * if an OST or MDS fails it need only look at log(s) relevant to itself + * + * Author: Andreas Dilger + * Author: Alexey Zhuravlev + * Author: Mikhail Pershin + */ + +#define DEBUG_SUBSYSTEM S_LOG + + +#include "../include/obd_class.h" + +#include "llog_internal.h" + +/* Create a new log handle and add it to the open list. + * This log handle will be closed when all of the records in it are removed. + * + * Assumes caller has already pushed us into the kernel context and is locking. 
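[Editor's note] llog_cat_new_log() below hands out catalog slots with a simple wrap-around counter and treats the catalog as full when the producer index catches up with llh_cat_idx, which is why only bitmap_size - 2 slots are usable. A toy sketch of that ring-style slot allocator follows; struct ring is hypothetical and mirrors the function's own comments.

/* Editor's sketch of the wrap-around slot allocation in llog_cat_new_log(). */
#include <stdint.h>

struct ring {
	uint32_t last_idx;	/* cf. lgh_last_idx, last slot handed out */
	uint32_t cat_idx;	/* cf. llh_cat_idx, first still-used slot */
	uint32_t size;		/* cf. bitmap_size */
};

/* Returns the new slot index, or -1 when the catalog is full. */
static int ring_alloc(struct ring *r)
{
	uint32_t index = (r->last_idx + 1) % r->size;

	if (index == r->cat_idx)
		return -1;		/* no free catalog slots */
	if (index == 0)			/* slot 0 is the header, skip it */
		index = 1;
	r->last_idx = index;
	return (int)index;
}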
+ */ +static int llog_cat_new_log(const struct lu_env *env, + struct llog_handle *cathandle, + struct llog_handle *loghandle, + struct thandle *th) +{ + + struct llog_log_hdr *llh; + struct llog_logid_rec rec = { { 0 }, }; + int rc, index, bitmap_size; + + llh = cathandle->lgh_hdr; + bitmap_size = LLOG_BITMAP_SIZE(llh); + + index = (cathandle->lgh_last_idx + 1) % bitmap_size; + + /* maximum number of available slots in catlog is bitmap_size - 2 */ + if (llh->llh_cat_idx == index) { + CERROR("no free catalog slots for log...\n"); + return -ENOSPC; + } + + if (OBD_FAIL_CHECK(OBD_FAIL_MDS_LLOG_CREATE_FAILED)) + return -ENOSPC; + + rc = llog_create(env, loghandle, th); + /* if llog is already created, no need to initialize it */ + if (rc == -EEXIST) { + return 0; + } else if (rc != 0) { + CERROR("%s: can't create new plain llog in catalog: rc = %d\n", + loghandle->lgh_ctxt->loc_obd->obd_name, rc); + return rc; + } + + rc = llog_init_handle(env, loghandle, + LLOG_F_IS_PLAIN | LLOG_F_ZAP_WHEN_EMPTY, + &cathandle->lgh_hdr->llh_tgtuuid); + if (rc) + goto out_destroy; + + if (index == 0) + index = 1; + + spin_lock(&loghandle->lgh_hdr_lock); + llh->llh_count++; + if (ext2_set_bit(index, llh->llh_bitmap)) { + CERROR("argh, index %u already set in log bitmap?\n", + index); + spin_unlock(&loghandle->lgh_hdr_lock); + LBUG(); /* should never happen */ + } + spin_unlock(&loghandle->lgh_hdr_lock); + + cathandle->lgh_last_idx = index; + llh->llh_tail.lrt_index = index; + + CDEBUG(D_RPCTRACE, + "new recovery log "DOSTID":%x for index %u of catalog" + DOSTID"\n", POSTID(&loghandle->lgh_id.lgl_oi), + loghandle->lgh_id.lgl_ogen, index, + POSTID(&cathandle->lgh_id.lgl_oi)); + /* build the record for this log in the catalog */ + rec.lid_hdr.lrh_len = sizeof(rec); + rec.lid_hdr.lrh_index = index; + rec.lid_hdr.lrh_type = LLOG_LOGID_MAGIC; + rec.lid_id = loghandle->lgh_id; + rec.lid_tail.lrt_len = sizeof(rec); + rec.lid_tail.lrt_index = index; + + /* update the catalog: header and record */ + rc = llog_write_rec(env, cathandle, &rec.lid_hdr, + &loghandle->u.phd.phd_cookie, 1, NULL, index, th); + if (rc < 0) + goto out_destroy; + + loghandle->lgh_hdr->llh_cat_idx = index; + return 0; +out_destroy: + llog_destroy(env, loghandle); + return rc; +} + +/* Open an existent log handle and add it to the open list. + * This log handle will be closed when all of the records in it are removed. + * + * Assumes caller has already pushed us into the kernel context and is locking. + * We return a lock on the handle to ensure nobody yanks it from us. 
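+ * On a hit in the catalog's chd_head cache the existing handle is reused;
+ * otherwise the plain llog is opened, initialized and linked onto chd_head
+ * before being returned.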
+ * + * This takes extra reference on llog_handle via llog_handle_get() and require + * this reference to be put by caller using llog_handle_put() + */ +int llog_cat_id2handle(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_handle **res, struct llog_logid *logid) +{ + struct llog_handle *loghandle; + int rc = 0; + + if (cathandle == NULL) + return -EBADF; + + down_write(&cathandle->lgh_lock); + list_for_each_entry(loghandle, &cathandle->u.chd.chd_head, + u.phd.phd_entry) { + struct llog_logid *cgl = &loghandle->lgh_id; + + if (ostid_id(&cgl->lgl_oi) == ostid_id(&logid->lgl_oi) && + ostid_seq(&cgl->lgl_oi) == ostid_seq(&logid->lgl_oi)) { + if (cgl->lgl_ogen != logid->lgl_ogen) { + CERROR("%s: log "DOSTID" generation %x != %x\n", + loghandle->lgh_ctxt->loc_obd->obd_name, + POSTID(&logid->lgl_oi), cgl->lgl_ogen, + logid->lgl_ogen); + continue; + } + loghandle->u.phd.phd_cat_handle = cathandle; + up_write(&cathandle->lgh_lock); + rc = 0; + goto out; + } + } + up_write(&cathandle->lgh_lock); + + rc = llog_open(env, cathandle->lgh_ctxt, &loghandle, logid, NULL, + LLOG_OPEN_EXISTS); + if (rc < 0) { + CERROR("%s: error opening log id "DOSTID":%x: rc = %d\n", + cathandle->lgh_ctxt->loc_obd->obd_name, + POSTID(&logid->lgl_oi), logid->lgl_ogen, rc); + return rc; + } + + rc = llog_init_handle(env, loghandle, LLOG_F_IS_PLAIN, NULL); + if (rc < 0) { + llog_close(env, loghandle); + loghandle = NULL; + return rc; + } + + down_write(&cathandle->lgh_lock); + list_add(&loghandle->u.phd.phd_entry, &cathandle->u.chd.chd_head); + up_write(&cathandle->lgh_lock); + + loghandle->u.phd.phd_cat_handle = cathandle; + loghandle->u.phd.phd_cookie.lgc_lgl = cathandle->lgh_id; + loghandle->u.phd.phd_cookie.lgc_index = + loghandle->lgh_hdr->llh_cat_idx; +out: + llog_handle_get(loghandle); + *res = loghandle; + return 0; +} + +int llog_cat_close(const struct lu_env *env, struct llog_handle *cathandle) +{ + struct llog_handle *loghandle, *n; + int rc; + + list_for_each_entry_safe(loghandle, n, &cathandle->u.chd.chd_head, + u.phd.phd_entry) { + struct llog_log_hdr *llh = loghandle->lgh_hdr; + int index; + + /* unlink open-not-created llogs */ + list_del_init(&loghandle->u.phd.phd_entry); + llh = loghandle->lgh_hdr; + if (loghandle->lgh_obj != NULL && llh != NULL && + (llh->llh_flags & LLOG_F_ZAP_WHEN_EMPTY) && + (llh->llh_count == 1)) { + rc = llog_destroy(env, loghandle); + if (rc) + CERROR("%s: failure destroying log during cleanup: rc = %d\n", + loghandle->lgh_ctxt->loc_obd->obd_name, + rc); + + index = loghandle->u.phd.phd_cookie.lgc_index; + llog_cat_cleanup(env, cathandle, NULL, index); + } + llog_close(env, loghandle); + } + /* if handle was stored in ctxt, remove it too */ + if (cathandle->lgh_ctxt->loc_handle == cathandle) + cathandle->lgh_ctxt->loc_handle = NULL; + rc = llog_close(env, cathandle); + return rc; +} +EXPORT_SYMBOL(llog_cat_close); + +/** + * lockdep markers for nested struct llog_handle::lgh_lock locking. + */ +enum { + LLOGH_CAT, + LLOGH_LOG +}; + +/** Return the currently active log handle. If the current log handle doesn't + * have enough space left for the current record, start a new one. + * + * If reclen is 0, we only want to know what the currently active log is, + * otherwise we get a lock on this log so nobody can steal our space. + * + * Assumes caller has already pushed us into the kernel context and is locking. 
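+ *
+ * The implementation below makes two passes: the fast path takes the
+ * catalog lock shared and write-locks the current plain log, returning it
+ * if it still has free slots; otherwise the catalog lock is retaken
+ * exclusive, the state is rechecked, and chd_next_log is promoted to
+ * chd_current_log.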
+ * + * NOTE: loghandle is write-locked upon successful return + */ +static struct llog_handle *llog_cat_current_log(struct llog_handle *cathandle, + struct thandle *th) +{ + struct llog_handle *loghandle = NULL; + + down_read_nested(&cathandle->lgh_lock, LLOGH_CAT); + loghandle = cathandle->u.chd.chd_current_log; + if (loghandle) { + struct llog_log_hdr *llh; + + down_write_nested(&loghandle->lgh_lock, LLOGH_LOG); + llh = loghandle->lgh_hdr; + if (llh == NULL || + loghandle->lgh_last_idx < LLOG_BITMAP_SIZE(llh) - 1) { + up_read(&cathandle->lgh_lock); + return loghandle; + } else { + up_write(&loghandle->lgh_lock); + } + } + up_read(&cathandle->lgh_lock); + + /* time to use next log */ + + /* first, we have to make sure the state hasn't changed */ + down_write_nested(&cathandle->lgh_lock, LLOGH_CAT); + loghandle = cathandle->u.chd.chd_current_log; + if (loghandle) { + struct llog_log_hdr *llh; + + down_write_nested(&loghandle->lgh_lock, LLOGH_LOG); + llh = loghandle->lgh_hdr; + LASSERT(llh); + if (loghandle->lgh_last_idx < LLOG_BITMAP_SIZE(llh) - 1) { + up_write(&cathandle->lgh_lock); + return loghandle; + } else { + up_write(&loghandle->lgh_lock); + } + } + + CDEBUG(D_INODE, "use next log\n"); + + loghandle = cathandle->u.chd.chd_next_log; + cathandle->u.chd.chd_current_log = loghandle; + cathandle->u.chd.chd_next_log = NULL; + down_write_nested(&loghandle->lgh_lock, LLOGH_LOG); + up_write(&cathandle->lgh_lock); + LASSERT(loghandle); + return loghandle; +} + +/* Add a single record to the recovery log(s) using a catalog + * Returns as llog_write_record + * + * Assumes caller has already pushed us into the kernel context. + */ +int llog_cat_add_rec(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_rec_hdr *rec, struct llog_cookie *reccookie, + void *buf, struct thandle *th) +{ + struct llog_handle *loghandle; + int rc; + + LASSERT(rec->lrh_len <= LLOG_CHUNK_SIZE); + loghandle = llog_cat_current_log(cathandle, th); + LASSERT(!IS_ERR(loghandle)); + + /* loghandle is already locked by llog_cat_current_log() for us */ + if (!llog_exist(loghandle)) { + rc = llog_cat_new_log(env, cathandle, loghandle, th); + if (rc < 0) { + up_write(&loghandle->lgh_lock); + return rc; + } + } + /* now let's try to add the record */ + rc = llog_write_rec(env, loghandle, rec, reccookie, 1, buf, -1, th); + if (rc < 0) + CDEBUG_LIMIT(rc == -ENOSPC ? 
D_HA : D_ERROR, + "llog_write_rec %d: lh=%p\n", rc, loghandle); + up_write(&loghandle->lgh_lock); + if (rc == -ENOSPC) { + /* try to use next log */ + loghandle = llog_cat_current_log(cathandle, th); + LASSERT(!IS_ERR(loghandle)); + /* new llog can be created concurrently */ + if (!llog_exist(loghandle)) { + rc = llog_cat_new_log(env, cathandle, loghandle, th); + if (rc < 0) { + up_write(&loghandle->lgh_lock); + return rc; + } + } + /* now let's try to add the record */ + rc = llog_write_rec(env, loghandle, rec, reccookie, 1, buf, + -1, th); + if (rc < 0) + CERROR("llog_write_rec %d: lh=%p\n", rc, loghandle); + up_write(&loghandle->lgh_lock); + } + + return rc; +} +EXPORT_SYMBOL(llog_cat_add_rec); + +int llog_cat_declare_add_rec(const struct lu_env *env, + struct llog_handle *cathandle, + struct llog_rec_hdr *rec, struct thandle *th) +{ + struct llog_handle *loghandle, *next; + int rc = 0; + + if (cathandle->u.chd.chd_current_log == NULL) { + /* declare new plain llog */ + down_write(&cathandle->lgh_lock); + if (cathandle->u.chd.chd_current_log == NULL) { + rc = llog_open(env, cathandle->lgh_ctxt, &loghandle, + NULL, NULL, LLOG_OPEN_NEW); + if (rc == 0) { + cathandle->u.chd.chd_current_log = loghandle; + list_add_tail(&loghandle->u.phd.phd_entry, + &cathandle->u.chd.chd_head); + } + } + up_write(&cathandle->lgh_lock); + } else if (cathandle->u.chd.chd_next_log == NULL) { + /* declare next plain llog */ + down_write(&cathandle->lgh_lock); + if (cathandle->u.chd.chd_next_log == NULL) { + rc = llog_open(env, cathandle->lgh_ctxt, &loghandle, + NULL, NULL, LLOG_OPEN_NEW); + if (rc == 0) { + cathandle->u.chd.chd_next_log = loghandle; + list_add_tail(&loghandle->u.phd.phd_entry, + &cathandle->u.chd.chd_head); + } + } + up_write(&cathandle->lgh_lock); + } + if (rc) + goto out; + + if (!llog_exist(cathandle->u.chd.chd_current_log)) { + rc = llog_declare_create(env, cathandle->u.chd.chd_current_log, + th); + if (rc) + goto out; + llog_declare_write_rec(env, cathandle, NULL, -1, th); + } + /* declare records in the llogs */ + rc = llog_declare_write_rec(env, cathandle->u.chd.chd_current_log, + rec, -1, th); + if (rc) + goto out; + + next = cathandle->u.chd.chd_next_log; + if (next) { + if (!llog_exist(next)) { + rc = llog_declare_create(env, next, th); + llog_declare_write_rec(env, cathandle, NULL, -1, th); + } + llog_declare_write_rec(env, next, rec, -1, th); + } +out: + return rc; +} +EXPORT_SYMBOL(llog_cat_declare_add_rec); + +int llog_cat_add(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_rec_hdr *rec, struct llog_cookie *reccookie, + void *buf) +{ + struct llog_ctxt *ctxt; + struct dt_device *dt; + struct thandle *th = NULL; + int rc; + + ctxt = cathandle->lgh_ctxt; + LASSERT(ctxt); + LASSERT(ctxt->loc_exp); + + if (cathandle->lgh_obj != NULL) { + dt = ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt; + LASSERT(dt); + + th = dt_trans_create(env, dt); + if (IS_ERR(th)) + return PTR_ERR(th); + + rc = llog_cat_declare_add_rec(env, cathandle, rec, th); + if (rc) + goto out_trans; + + rc = dt_trans_start_local(env, dt, th); + if (rc) + goto out_trans; + rc = llog_cat_add_rec(env, cathandle, rec, reccookie, buf, th); +out_trans: + dt_trans_stop(env, dt, th); + } else { /* lvfs compat code */ + LASSERT(cathandle->lgh_file != NULL); + rc = llog_cat_declare_add_rec(env, cathandle, rec, th); + if (rc == 0) + rc = llog_cat_add_rec(env, cathandle, rec, reccookie, + buf, th); + } + return rc; +} +EXPORT_SYMBOL(llog_cat_add); + +/* For each cookie in the cookie array, we clear the log in-use 
bit and either: + * - the log is empty, so mark it free in the catalog header and delete it + * - the log is not empty, just write out the log header + * + * The cookies may be in different log files, so we need to get new logs + * each time. + * + * Assumes caller has already pushed us into the kernel context. + */ +int llog_cat_cancel_records(const struct lu_env *env, + struct llog_handle *cathandle, int count, + struct llog_cookie *cookies) +{ + int i, index, rc = 0, failed = 0; + + for (i = 0; i < count; i++, cookies++) { + struct llog_handle *loghandle; + struct llog_logid *lgl = &cookies->lgc_lgl; + int lrc; + + rc = llog_cat_id2handle(env, cathandle, &loghandle, lgl); + if (rc) { + CERROR("%s: cannot find handle for llog "DOSTID": %d\n", + cathandle->lgh_ctxt->loc_obd->obd_name, + POSTID(&lgl->lgl_oi), rc); + failed++; + continue; + } + + lrc = llog_cancel_rec(env, loghandle, cookies->lgc_index); + if (lrc == 1) { /* log has been destroyed */ + index = loghandle->u.phd.phd_cookie.lgc_index; + rc = llog_cat_cleanup(env, cathandle, loghandle, + index); + } else if (lrc == -ENOENT) { + if (rc == 0) /* ENOENT shouldn't rewrite any error */ + rc = lrc; + } else if (lrc < 0) { + failed++; + rc = lrc; + } + llog_handle_put(loghandle); + } + if (rc) + CERROR("%s: fail to cancel %d of %d llog-records: rc = %d\n", + cathandle->lgh_ctxt->loc_obd->obd_name, failed, count, + rc); + + return rc; +} +EXPORT_SYMBOL(llog_cat_cancel_records); + +static int llog_cat_process_cb(const struct lu_env *env, + struct llog_handle *cat_llh, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_process_data *d = data; + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + struct llog_handle *llh; + int rc; + + if (rec->lrh_type != LLOG_LOGID_MAGIC) { + CERROR("invalid record in catalog\n"); + return -EINVAL; + } + CDEBUG(D_HA, "processing log "DOSTID":%x at index %u of catalog " + DOSTID"\n", POSTID(&lir->lid_id.lgl_oi), lir->lid_id.lgl_ogen, + rec->lrh_index, POSTID(&cat_llh->lgh_id.lgl_oi)); + + rc = llog_cat_id2handle(env, cat_llh, &llh, &lir->lid_id); + if (rc) { + CERROR("%s: cannot find handle for llog "DOSTID": %d\n", + cat_llh->lgh_ctxt->loc_obd->obd_name, + POSTID(&lir->lid_id.lgl_oi), rc); + return rc; + } + + if (rec->lrh_index < d->lpd_startcat) + /* Skip processing of the logs until startcat */ + rc = 0; + else if (d->lpd_startidx > 0) { + struct llog_process_cat_data cd; + + cd.lpcd_first_idx = d->lpd_startidx; + cd.lpcd_last_idx = 0; + rc = llog_process_or_fork(env, llh, d->lpd_cb, d->lpd_data, + &cd, false); + /* Continue processing the next log from idx 0 */ + d->lpd_startidx = 0; + } else { + rc = llog_process_or_fork(env, llh, d->lpd_cb, d->lpd_data, + NULL, false); + } + + llog_handle_put(llh); + + return rc; +} + +int llog_cat_process_or_fork(const struct lu_env *env, + struct llog_handle *cat_llh, + llog_cb_t cb, void *data, int startcat, + int startidx, bool fork) +{ + struct llog_process_data d; + struct llog_log_hdr *llh = cat_llh->lgh_hdr; + int rc; + + LASSERT(llh->llh_flags & LLOG_F_IS_CAT); + d.lpd_data = data; + d.lpd_cb = cb; + d.lpd_startcat = startcat; + d.lpd_startidx = startidx; + + if (llh->llh_cat_idx > cat_llh->lgh_last_idx) { + struct llog_process_cat_data cd; + + CWARN("catlog "DOSTID" crosses index zero\n", + POSTID(&cat_llh->lgh_id.lgl_oi)); + + cd.lpcd_first_idx = llh->llh_cat_idx; + cd.lpcd_last_idx = 0; + rc = llog_process_or_fork(env, cat_llh, llog_cat_process_cb, + &d, &cd, fork); + if (rc != 0) + return rc; + + cd.lpcd_first_idx = 0; + cd.lpcd_last_idx 
= cat_llh->lgh_last_idx; + rc = llog_process_or_fork(env, cat_llh, llog_cat_process_cb, + &d, &cd, fork); + } else { + rc = llog_process_or_fork(env, cat_llh, llog_cat_process_cb, + &d, NULL, fork); + } + + return rc; +} +EXPORT_SYMBOL(llog_cat_process_or_fork); + +int llog_cat_process(const struct lu_env *env, struct llog_handle *cat_llh, + llog_cb_t cb, void *data, int startcat, int startidx) +{ + return llog_cat_process_or_fork(env, cat_llh, cb, data, startcat, + startidx, false); +} +EXPORT_SYMBOL(llog_cat_process); + +static int llog_cat_reverse_process_cb(const struct lu_env *env, + struct llog_handle *cat_llh, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_process_data *d = data; + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + struct llog_handle *llh; + int rc; + + if (le32_to_cpu(rec->lrh_type) != LLOG_LOGID_MAGIC) { + CERROR("invalid record in catalog\n"); + return -EINVAL; + } + CDEBUG(D_HA, "processing log "DOSTID":%x at index %u of catalog " + DOSTID"\n", POSTID(&lir->lid_id.lgl_oi), lir->lid_id.lgl_ogen, + le32_to_cpu(rec->lrh_index), POSTID(&cat_llh->lgh_id.lgl_oi)); + + rc = llog_cat_id2handle(env, cat_llh, &llh, &lir->lid_id); + if (rc) { + CERROR("%s: cannot find handle for llog "DOSTID": %d\n", + cat_llh->lgh_ctxt->loc_obd->obd_name, + POSTID(&lir->lid_id.lgl_oi), rc); + return rc; + } + + rc = llog_reverse_process(env, llh, d->lpd_cb, d->lpd_data, NULL); + llog_handle_put(llh); + return rc; +} + +int llog_cat_reverse_process(const struct lu_env *env, + struct llog_handle *cat_llh, + llog_cb_t cb, void *data) +{ + struct llog_process_data d; + struct llog_process_cat_data cd; + struct llog_log_hdr *llh = cat_llh->lgh_hdr; + int rc; + + LASSERT(llh->llh_flags & LLOG_F_IS_CAT); + d.lpd_data = data; + d.lpd_cb = cb; + + if (llh->llh_cat_idx > cat_llh->lgh_last_idx) { + CWARN("catalog "DOSTID" crosses index zero\n", + POSTID(&cat_llh->lgh_id.lgl_oi)); + + cd.lpcd_first_idx = 0; + cd.lpcd_last_idx = cat_llh->lgh_last_idx; + rc = llog_reverse_process(env, cat_llh, + llog_cat_reverse_process_cb, + &d, &cd); + if (rc != 0) + return rc; + + cd.lpcd_first_idx = le32_to_cpu(llh->llh_cat_idx); + cd.lpcd_last_idx = 0; + rc = llog_reverse_process(env, cat_llh, + llog_cat_reverse_process_cb, + &d, &cd); + } else { + rc = llog_reverse_process(env, cat_llh, + llog_cat_reverse_process_cb, + &d, NULL); + } + + return rc; +} +EXPORT_SYMBOL(llog_cat_reverse_process); + +static int llog_cat_set_first_idx(struct llog_handle *cathandle, int index) +{ + struct llog_log_hdr *llh = cathandle->lgh_hdr; + int i, bitmap_size, idx; + + bitmap_size = LLOG_BITMAP_SIZE(llh); + if (llh->llh_cat_idx == (index - 1)) { + idx = llh->llh_cat_idx + 1; + llh->llh_cat_idx = idx; + if (idx == cathandle->lgh_last_idx) + goto out; + for (i = (index + 1) % bitmap_size; + i != cathandle->lgh_last_idx; + i = (i + 1) % bitmap_size) { + if (!ext2_test_bit(i, llh->llh_bitmap)) { + idx = llh->llh_cat_idx + 1; + llh->llh_cat_idx = idx; + } else if (i == 0) { + llh->llh_cat_idx = 0; + } else { + break; + } + } +out: + CDEBUG(D_RPCTRACE, "set catlog "DOSTID" first idx %u\n", + POSTID(&cathandle->lgh_id.lgl_oi), llh->llh_cat_idx); + } + + return 0; +} + +/* Cleanup deleted plain llog traces from catalog */ +int llog_cat_cleanup(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_handle *loghandle, int index) +{ + int rc; + + LASSERT(index); + if (loghandle != NULL) { + /* remove destroyed llog from catalog list and + * chd_current_log variable */ + down_write(&cathandle->lgh_lock); + 
if (cathandle->u.chd.chd_current_log == loghandle) + cathandle->u.chd.chd_current_log = NULL; + list_del_init(&loghandle->u.phd.phd_entry); + up_write(&cathandle->lgh_lock); + LASSERT(index == loghandle->u.phd.phd_cookie.lgc_index); + /* llog was opened and keep in a list, close it now */ + llog_close(env, loghandle); + } + /* remove plain llog entry from catalog by index */ + llog_cat_set_first_idx(cathandle, index); + rc = llog_cancel_rec(env, cathandle, index); + if (rc == 0) + CDEBUG(D_HA, "cancel plain log at index %u of catalog " DOSTID "\n", + index, POSTID(&cathandle->lgh_id.lgl_oi)); + return rc; +} + +static int cat_cancel_cb(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + struct llog_handle *loghandle; + struct llog_log_hdr *llh; + int rc; + + if (rec->lrh_type != LLOG_LOGID_MAGIC) { + CERROR("invalid record in catalog\n"); + return -EINVAL; + } + + CDEBUG(D_HA, "processing log "DOSTID":%x at index %u of catalog " + DOSTID"\n", POSTID(&lir->lid_id.lgl_oi), lir->lid_id.lgl_ogen, + rec->lrh_index, POSTID(&cathandle->lgh_id.lgl_oi)); + + rc = llog_cat_id2handle(env, cathandle, &loghandle, &lir->lid_id); + if (rc) { + CERROR("%s: cannot find handle for llog "DOSTID": %d\n", + cathandle->lgh_ctxt->loc_obd->obd_name, + POSTID(&lir->lid_id.lgl_oi), rc); + if (rc == -ENOENT || rc == -ESTALE) { + /* remove index from catalog */ + llog_cat_cleanup(env, cathandle, NULL, rec->lrh_index); + } + return rc; + } + + llh = loghandle->lgh_hdr; + if ((llh->llh_flags & LLOG_F_ZAP_WHEN_EMPTY) && + (llh->llh_count == 1)) { + rc = llog_destroy(env, loghandle); + if (rc) + CERROR("%s: fail to destroy empty log: rc = %d\n", + loghandle->lgh_ctxt->loc_obd->obd_name, rc); + + llog_cat_cleanup(env, cathandle, loghandle, + loghandle->u.phd.phd_cookie.lgc_index); + } + llog_handle_put(loghandle); + + return rc; +} + +/* helper to initialize catalog llog and process it to cancel */ +int llog_cat_init_and_process(const struct lu_env *env, + struct llog_handle *llh) +{ + int rc; + + rc = llog_init_handle(env, llh, LLOG_F_IS_CAT, NULL); + if (rc) + return rc; + + rc = llog_process_or_fork(env, llh, cat_cancel_cb, NULL, NULL, false); + if (rc) + CERROR("%s: llog_process() with cat_cancel_cb failed: rc = %d\n", + llh->lgh_ctxt->loc_obd->obd_name, rc); + return 0; +} +EXPORT_SYMBOL(llog_cat_init_and_process); diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/llog_internal.h b/kernel/drivers/staging/lustre/lustre/obdclass/llog_internal.h new file mode 100644 index 000000000..5332131a2 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/llog_internal.h @@ -0,0 +1,98 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LLOG_INTERNAL_H__ +#define __LLOG_INTERNAL_H__ + +#include "../include/lustre_log.h" + +struct llog_process_info { + struct llog_handle *lpi_loghandle; + llog_cb_t lpi_cb; + void *lpi_cbdata; + void *lpi_catdata; + int lpi_rc; + struct completion lpi_completion; + const struct lu_env *lpi_env; + +}; + +struct llog_thread_info { + struct lu_attr lgi_attr; + struct lu_fid lgi_fid; + struct dt_object_format lgi_dof; + struct lu_buf lgi_buf; + loff_t lgi_off; + struct llog_rec_hdr lgi_lrh; + struct llog_rec_tail lgi_tail; +}; + +extern struct lu_context_key llog_thread_key; + +static inline struct llog_thread_info *llog_info(const struct lu_env *env) +{ + struct llog_thread_info *lgi; + + lgi = lu_context_key_get(&env->le_ctx, &llog_thread_key); + LASSERT(lgi); + return lgi; +} + +static inline void +lustre_build_llog_lvfs_oid(struct llog_logid *logid, __u64 ino, __u32 gen) +{ + ostid_set_seq_llog(&logid->lgl_oi); + ostid_set_id(&logid->lgl_oi, ino); + logid->lgl_ogen = gen; +} + +int llog_info_init(void); +void llog_info_fini(void); + +void llog_handle_get(struct llog_handle *loghandle); +void llog_handle_put(struct llog_handle *loghandle); +int llog_cat_id2handle(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_handle **res, struct llog_logid *logid); +int class_config_dump_handler(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr *rec, void *data); +int class_config_parse_rec(struct llog_rec_hdr *rec, char *buf, int size); +int llog_process_or_fork(const struct lu_env *env, + struct llog_handle *loghandle, + llog_cb_t cb, void *data, void *catdata, bool fork); +int llog_cat_cleanup(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_handle *loghandle, int index); +#endif diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/llog_obd.c b/kernel/drivers/staging/lustre/lustre/obdclass/llog_obd.c new file mode 100644 index 000000000..978d886a1 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/llog_obd.c @@ -0,0 +1,262 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LOG + + +#include "../include/obd_class.h" +#include "../include/lustre_log.h" +#include "llog_internal.h" + +/* helper functions for calling the llog obd methods */ +static struct llog_ctxt *llog_new_ctxt(struct obd_device *obd) +{ + struct llog_ctxt *ctxt; + + OBD_ALLOC_PTR(ctxt); + if (!ctxt) + return NULL; + + ctxt->loc_obd = obd; + atomic_set(&ctxt->loc_refcount, 1); + + return ctxt; +} + +static void llog_ctxt_destroy(struct llog_ctxt *ctxt) +{ + if (ctxt->loc_exp) { + class_export_put(ctxt->loc_exp); + ctxt->loc_exp = NULL; + } + if (ctxt->loc_imp) { + class_import_put(ctxt->loc_imp); + ctxt->loc_imp = NULL; + } + OBD_FREE_PTR(ctxt); +} + +int __llog_ctxt_put(const struct lu_env *env, struct llog_ctxt *ctxt) +{ + struct obd_llog_group *olg = ctxt->loc_olg; + struct obd_device *obd; + int rc = 0; + + spin_lock(&olg->olg_lock); + if (!atomic_dec_and_test(&ctxt->loc_refcount)) { + spin_unlock(&olg->olg_lock); + return rc; + } + olg->olg_ctxts[ctxt->loc_idx] = NULL; + spin_unlock(&olg->olg_lock); + + obd = ctxt->loc_obd; + spin_lock(&obd->obd_dev_lock); + /* sync with llog ctxt user thread */ + spin_unlock(&obd->obd_dev_lock); + + /* obd->obd_starting is needed for the case of cleanup + * in error case while obd is starting up. */ + LASSERTF(obd->obd_starting == 1 || + obd->obd_stopping == 1 || obd->obd_set_up == 0, + "wrong obd state: %d/%d/%d\n", !!obd->obd_starting, + !!obd->obd_stopping, !!obd->obd_set_up); + + /* cleanup the llog ctxt here */ + if (CTXTP(ctxt, cleanup)) + rc = CTXTP(ctxt, cleanup)(env, ctxt); + + llog_ctxt_destroy(ctxt); + wake_up(&olg->olg_waitq); + return rc; +} +EXPORT_SYMBOL(__llog_ctxt_put); + +int llog_cleanup(const struct lu_env *env, struct llog_ctxt *ctxt) +{ + struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); + struct obd_llog_group *olg; + int rc, idx; + + LASSERT(ctxt != NULL); + LASSERT(ctxt != LP_POISON); + + olg = ctxt->loc_olg; + LASSERT(olg != NULL); + LASSERT(olg != LP_POISON); + + idx = ctxt->loc_idx; + + /* + * Banlance the ctxt get when calling llog_cleanup() + */ + LASSERT(atomic_read(&ctxt->loc_refcount) < LI_POISON); + LASSERT(atomic_read(&ctxt->loc_refcount) > 1); + llog_ctxt_put(ctxt); + + /* + * Try to free the ctxt. 
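+ * This drops what should be the last (setup-time) reference; once the
+ * count reaches zero, __llog_ctxt_put() also runs the backend's
+ * lop_cleanup() method, frees the context and wakes olg_waitq.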
+ */ + rc = __llog_ctxt_put(env, ctxt); + if (rc) + CERROR("Error %d while cleaning up ctxt %p\n", + rc, ctxt); + + l_wait_event(olg->olg_waitq, + llog_group_ctxt_null(olg, idx), &lwi); + + return rc; +} +EXPORT_SYMBOL(llog_cleanup); + +int llog_setup(const struct lu_env *env, struct obd_device *obd, + struct obd_llog_group *olg, int index, + struct obd_device *disk_obd, struct llog_operations *op) +{ + struct llog_ctxt *ctxt; + int rc = 0; + + if (index < 0 || index >= LLOG_MAX_CTXTS) + return -EINVAL; + + LASSERT(olg != NULL); + + ctxt = llog_new_ctxt(obd); + if (!ctxt) + return -ENOMEM; + + ctxt->loc_obd = obd; + ctxt->loc_olg = olg; + ctxt->loc_idx = index; + ctxt->loc_logops = op; + mutex_init(&ctxt->loc_mutex); + ctxt->loc_exp = class_export_get(disk_obd->obd_self_export); + ctxt->loc_flags = LLOG_CTXT_FLAG_UNINITIALIZED; + + rc = llog_group_set_ctxt(olg, ctxt, index); + if (rc) { + llog_ctxt_destroy(ctxt); + if (rc == -EEXIST) { + ctxt = llog_group_get_ctxt(olg, index); + if (ctxt) { + /* + * mds_lov_update_desc() might call here multiple + * times. So if the llog is already set up then + * don't to do it again. + */ + CDEBUG(D_CONFIG, "obd %s ctxt %d already set up\n", + obd->obd_name, index); + LASSERT(ctxt->loc_olg == olg); + LASSERT(ctxt->loc_obd == obd); + LASSERT(ctxt->loc_exp == disk_obd->obd_self_export); + LASSERT(ctxt->loc_logops == op); + llog_ctxt_put(ctxt); + } + rc = 0; + } + return rc; + } + + if (op->lop_setup) { + if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LLOG_SETUP)) + rc = -EOPNOTSUPP; + else + rc = op->lop_setup(env, obd, olg, index, disk_obd); + } + + if (rc) { + CERROR("%s: ctxt %d lop_setup=%p failed: rc = %d\n", + obd->obd_name, index, op->lop_setup, rc); + llog_group_clear_ctxt(olg, index); + llog_ctxt_destroy(ctxt); + } else { + CDEBUG(D_CONFIG, "obd %s ctxt %d is initialized\n", + obd->obd_name, index); + ctxt->loc_flags &= ~LLOG_CTXT_FLAG_UNINITIALIZED; + } + + return rc; +} +EXPORT_SYMBOL(llog_setup); + +int llog_sync(struct llog_ctxt *ctxt, struct obd_export *exp, int flags) +{ + int rc = 0; + + if (!ctxt) + return 0; + + if (CTXTP(ctxt, sync)) + rc = CTXTP(ctxt, sync)(ctxt, exp, flags); + + return rc; +} +EXPORT_SYMBOL(llog_sync); + +int llog_cancel(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_cookie *cookies, int flags) +{ + int rc; + + if (!ctxt) { + CERROR("No ctxt\n"); + return -ENODEV; + } + + CTXT_CHECK_OP(ctxt, cancel, -EOPNOTSUPP); + rc = CTXTP(ctxt, cancel)(env, ctxt, cookies, flags); + return rc; +} +EXPORT_SYMBOL(llog_cancel); + +/* context key constructor/destructor: llog_key_init, llog_key_fini */ +LU_KEY_INIT_FINI(llog, struct llog_thread_info); +/* context key: llog_thread_key */ +LU_CONTEXT_KEY_DEFINE(llog, LCT_MD_THREAD | LCT_MG_THREAD | LCT_LOCAL); +LU_KEY_INIT_GENERIC(llog); +EXPORT_SYMBOL(llog_thread_key); + +int llog_info_init(void) +{ + llog_key_init_generic(&llog_thread_key, NULL); + lu_context_key_register(&llog_thread_key); + return 0; +} + +void llog_info_fini(void) +{ + lu_context_key_degister(&llog_thread_key); +} diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/llog_swab.c b/kernel/drivers/staging/lustre/lustre/obdclass/llog_swab.c new file mode 100644 index 000000000..a2d5aa105 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/llog_swab.c @@ -0,0 +1,415 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/llog_swab.c + * + * Swabbing of llog datatypes (from disk or over the wire). + * + * Author: jacob berkman + */ + +#define DEBUG_SUBSYSTEM S_LOG + + +#include "../include/lustre_log.h" + +static void print_llogd_body(struct llogd_body *d) +{ + CDEBUG(D_OTHER, "llogd body: %p\n", d); + CDEBUG(D_OTHER, "\tlgd_logid.lgl_oi: "DOSTID"\n", + POSTID(&d->lgd_logid.lgl_oi)); + CDEBUG(D_OTHER, "\tlgd_logid.lgl_ogen: %#x\n", d->lgd_logid.lgl_ogen); + CDEBUG(D_OTHER, "\tlgd_ctxt_idx: %#x\n", d->lgd_ctxt_idx); + CDEBUG(D_OTHER, "\tlgd_llh_flags: %#x\n", d->lgd_llh_flags); + CDEBUG(D_OTHER, "\tlgd_index: %#x\n", d->lgd_index); + CDEBUG(D_OTHER, "\tlgd_saved_index: %#x\n", d->lgd_saved_index); + CDEBUG(D_OTHER, "\tlgd_len: %#x\n", d->lgd_len); + CDEBUG(D_OTHER, "\tlgd_cur_offset: %#llx\n", d->lgd_cur_offset); +} + +void lustre_swab_lu_fid(struct lu_fid *fid) +{ + __swab64s(&fid->f_seq); + __swab32s(&fid->f_oid); + __swab32s(&fid->f_ver); +} +EXPORT_SYMBOL(lustre_swab_lu_fid); + +void lustre_swab_ost_id(struct ost_id *oid) +{ + if (fid_seq_is_mdt0(oid->oi.oi_seq)) { + __swab64s(&oid->oi.oi_id); + __swab64s(&oid->oi.oi_seq); + } else { + lustre_swab_lu_fid(&oid->oi_fid); + } +} +EXPORT_SYMBOL(lustre_swab_ost_id); + +void lustre_swab_llog_id(struct llog_logid *log_id) +{ + __swab64s(&log_id->lgl_oi.oi.oi_id); + __swab64s(&log_id->lgl_oi.oi.oi_seq); + __swab32s(&log_id->lgl_ogen); +} +EXPORT_SYMBOL(lustre_swab_llog_id); + +void lustre_swab_llogd_body(struct llogd_body *d) +{ + print_llogd_body(d); + lustre_swab_llog_id(&d->lgd_logid); + __swab32s(&d->lgd_ctxt_idx); + __swab32s(&d->lgd_llh_flags); + __swab32s(&d->lgd_index); + __swab32s(&d->lgd_saved_index); + __swab32s(&d->lgd_len); + __swab64s(&d->lgd_cur_offset); + print_llogd_body(d); +} +EXPORT_SYMBOL(lustre_swab_llogd_body); + +void lustre_swab_llogd_conn_body(struct llogd_conn_body *d) +{ + __swab64s(&d->lgdc_gen.mnt_cnt); + __swab64s(&d->lgdc_gen.conn_cnt); + lustre_swab_llog_id(&d->lgdc_logid); + __swab32s(&d->lgdc_ctxt_idx); +} +EXPORT_SYMBOL(lustre_swab_llogd_conn_body); + +void lustre_swab_ll_fid(struct ll_fid *fid) +{ + __swab64s(&fid->id); + __swab32s(&fid->generation); + __swab32s(&fid->f_type); +} +EXPORT_SYMBOL(lustre_swab_ll_fid); + +void lustre_swab_lu_seq_range(struct lu_seq_range *range) +{ + 
__swab64s(&range->lsr_start); + __swab64s(&range->lsr_end); + __swab32s(&range->lsr_index); + __swab32s(&range->lsr_flags); +} +EXPORT_SYMBOL(lustre_swab_lu_seq_range); + +void lustre_swab_llog_rec(struct llog_rec_hdr *rec) +{ + struct llog_rec_tail *tail = NULL; + + __swab32s(&rec->lrh_len); + __swab32s(&rec->lrh_index); + __swab32s(&rec->lrh_type); + __swab32s(&rec->lrh_id); + + switch (rec->lrh_type) { + case OST_SZ_REC: + { + struct llog_size_change_rec *lsc = + (struct llog_size_change_rec *)rec; + + lustre_swab_ll_fid(&lsc->lsc_fid); + __swab32s(&lsc->lsc_ioepoch); + tail = &lsc->lsc_tail; + break; + } + case MDS_UNLINK_REC: + { + struct llog_unlink_rec *lur = (struct llog_unlink_rec *)rec; + + __swab64s(&lur->lur_oid); + __swab32s(&lur->lur_oseq); + __swab32s(&lur->lur_count); + tail = &lur->lur_tail; + break; + } + case MDS_UNLINK64_REC: + { + struct llog_unlink64_rec *lur = + (struct llog_unlink64_rec *)rec; + + lustre_swab_lu_fid(&lur->lur_fid); + __swab32s(&lur->lur_count); + tail = &lur->lur_tail; + break; + } + case CHANGELOG_REC: + { + struct llog_changelog_rec *cr = + (struct llog_changelog_rec *)rec; + + __swab16s(&cr->cr.cr_namelen); + __swab16s(&cr->cr.cr_flags); + __swab32s(&cr->cr.cr_type); + __swab64s(&cr->cr.cr_index); + __swab64s(&cr->cr.cr_prev); + __swab64s(&cr->cr.cr_time); + lustre_swab_lu_fid(&cr->cr.cr_tfid); + lustre_swab_lu_fid(&cr->cr.cr_pfid); + if (CHANGELOG_REC_EXTENDED(&cr->cr)) { + struct llog_changelog_ext_rec *ext = + (struct llog_changelog_ext_rec *)rec; + + lustre_swab_lu_fid(&ext->cr.cr_sfid); + lustre_swab_lu_fid(&ext->cr.cr_spfid); + tail = &ext->cr_tail; + } else { + tail = &cr->cr_tail; + } + tail = (struct llog_rec_tail *)((char *)tail + + cr->cr.cr_namelen); + break; + } + case CHANGELOG_USER_REC: + { + struct llog_changelog_user_rec *cur = + (struct llog_changelog_user_rec *)rec; + + __swab32s(&cur->cur_id); + __swab64s(&cur->cur_endrec); + tail = &cur->cur_tail; + break; + } + + case HSM_AGENT_REC: { + struct llog_agent_req_rec *arr = + (struct llog_agent_req_rec *)rec; + + __swab32s(&arr->arr_hai.hai_len); + __swab32s(&arr->arr_hai.hai_action); + lustre_swab_lu_fid(&arr->arr_hai.hai_fid); + lustre_swab_lu_fid(&arr->arr_hai.hai_dfid); + __swab64s(&arr->arr_hai.hai_cookie); + __swab64s(&arr->arr_hai.hai_extent.offset); + __swab64s(&arr->arr_hai.hai_extent.length); + __swab64s(&arr->arr_hai.hai_gid); + /* no swabing for opaque data */ + /* hai_data[0]; */ + break; + } + + case MDS_SETATTR64_REC: + { + struct llog_setattr64_rec *lsr = + (struct llog_setattr64_rec *)rec; + + lustre_swab_ost_id(&lsr->lsr_oi); + __swab32s(&lsr->lsr_uid); + __swab32s(&lsr->lsr_uid_h); + __swab32s(&lsr->lsr_gid); + __swab32s(&lsr->lsr_gid_h); + tail = &lsr->lsr_tail; + break; + } + case OBD_CFG_REC: + /* these are swabbed as they are consumed */ + break; + case LLOG_HDR_MAGIC: + { + struct llog_log_hdr *llh = (struct llog_log_hdr *)rec; + + __swab64s(&llh->llh_timestamp); + __swab32s(&llh->llh_count); + __swab32s(&llh->llh_bitmap_offset); + __swab32s(&llh->llh_flags); + __swab32s(&llh->llh_size); + __swab32s(&llh->llh_cat_idx); + tail = &llh->llh_tail; + break; + } + case LLOG_LOGID_MAGIC: + { + struct llog_logid_rec *lid = (struct llog_logid_rec *)rec; + + lustre_swab_llog_id(&lid->lid_id); + tail = &lid->lid_tail; + break; + } + case LLOG_GEN_REC: + { + struct llog_gen_rec *lgr = (struct llog_gen_rec *)rec; + + __swab64s(&lgr->lgr_gen.mnt_cnt); + __swab64s(&lgr->lgr_gen.conn_cnt); + tail = &lgr->lgr_tail; + break; + } + case LLOG_PAD_MAGIC: + break; + default: + 
CERROR("Unknown llog rec type %#x swabbing rec %p\n", + rec->lrh_type, rec); + } + + if (tail) { + __swab32s(&tail->lrt_len); + __swab32s(&tail->lrt_index); + } +} +EXPORT_SYMBOL(lustre_swab_llog_rec); + +static void print_llog_hdr(struct llog_log_hdr *h) +{ + CDEBUG(D_OTHER, "llog header: %p\n", h); + CDEBUG(D_OTHER, "\tllh_hdr.lrh_index: %#x\n", h->llh_hdr.lrh_index); + CDEBUG(D_OTHER, "\tllh_hdr.lrh_len: %#x\n", h->llh_hdr.lrh_len); + CDEBUG(D_OTHER, "\tllh_hdr.lrh_type: %#x\n", h->llh_hdr.lrh_type); + CDEBUG(D_OTHER, "\tllh_timestamp: %#llx\n", h->llh_timestamp); + CDEBUG(D_OTHER, "\tllh_count: %#x\n", h->llh_count); + CDEBUG(D_OTHER, "\tllh_bitmap_offset: %#x\n", h->llh_bitmap_offset); + CDEBUG(D_OTHER, "\tllh_flags: %#x\n", h->llh_flags); + CDEBUG(D_OTHER, "\tllh_size: %#x\n", h->llh_size); + CDEBUG(D_OTHER, "\tllh_cat_idx: %#x\n", h->llh_cat_idx); + CDEBUG(D_OTHER, "\tllh_tail.lrt_index: %#x\n", h->llh_tail.lrt_index); + CDEBUG(D_OTHER, "\tllh_tail.lrt_len: %#x\n", h->llh_tail.lrt_len); +} + +void lustre_swab_llog_hdr(struct llog_log_hdr *h) +{ + print_llog_hdr(h); + + lustre_swab_llog_rec(&h->llh_hdr); + + print_llog_hdr(h); +} +EXPORT_SYMBOL(lustre_swab_llog_hdr); + +static void print_lustre_cfg(struct lustre_cfg *lcfg) +{ + int i; + + if (!(libcfs_debug & D_OTHER)) /* don't loop on nothing */ + return; + CDEBUG(D_OTHER, "lustre_cfg: %p\n", lcfg); + CDEBUG(D_OTHER, "\tlcfg->lcfg_version: %#x\n", lcfg->lcfg_version); + + CDEBUG(D_OTHER, "\tlcfg->lcfg_command: %#x\n", lcfg->lcfg_command); + CDEBUG(D_OTHER, "\tlcfg->lcfg_num: %#x\n", lcfg->lcfg_num); + CDEBUG(D_OTHER, "\tlcfg->lcfg_flags: %#x\n", lcfg->lcfg_flags); + CDEBUG(D_OTHER, "\tlcfg->lcfg_nid: %s\n", libcfs_nid2str(lcfg->lcfg_nid)); + + CDEBUG(D_OTHER, "\tlcfg->lcfg_bufcount: %d\n", lcfg->lcfg_bufcount); + if (lcfg->lcfg_bufcount < LUSTRE_CFG_MAX_BUFCOUNT) + for (i = 0; i < lcfg->lcfg_bufcount; i++) + CDEBUG(D_OTHER, "\tlcfg->lcfg_buflens[%d]: %d\n", + i, lcfg->lcfg_buflens[i]); +} + +void lustre_swab_lustre_cfg(struct lustre_cfg *lcfg) +{ + int i; + + __swab32s(&lcfg->lcfg_version); + + if (lcfg->lcfg_version != LUSTRE_CFG_VERSION) { + CERROR("not swabbing lustre_cfg version %#x (expecting %#x)\n", + lcfg->lcfg_version, LUSTRE_CFG_VERSION); + return; + } + + __swab32s(&lcfg->lcfg_command); + __swab32s(&lcfg->lcfg_num); + __swab32s(&lcfg->lcfg_flags); + __swab64s(&lcfg->lcfg_nid); + __swab32s(&lcfg->lcfg_bufcount); + for (i = 0; i < lcfg->lcfg_bufcount && i < LUSTRE_CFG_MAX_BUFCOUNT; i++) + __swab32s(&lcfg->lcfg_buflens[i]); + + print_lustre_cfg(lcfg); + return; +} +EXPORT_SYMBOL(lustre_swab_lustre_cfg); + +/* used only for compatibility with old on-disk cfg_marker data */ +struct cfg_marker32 { + __u32 cm_step; + __u32 cm_flags; + __u32 cm_vers; + __u32 padding; + __u32 cm_createtime; + __u32 cm_canceltime; + char cm_tgtname[MTI_NAME_MAXLEN]; + char cm_comment[MTI_NAME_MAXLEN]; +}; + +#define MTI_NAMELEN32 (MTI_NAME_MAXLEN - \ + (sizeof(struct cfg_marker) - sizeof(struct cfg_marker32))) + +void lustre_swab_cfg_marker(struct cfg_marker *marker, int swab, int size) +{ + struct cfg_marker32 *cm32 = (struct cfg_marker32 *)marker; + + if (swab) { + __swab32s(&marker->cm_step); + __swab32s(&marker->cm_flags); + __swab32s(&marker->cm_vers); + } + if (size == sizeof(*cm32)) { + __u32 createtime, canceltime; + /* There was a problem with the original declaration of + * cfg_marker on 32-bit systems because it used time_t as + * a wire protocol structure, and didn't verify this in + * wirecheck. 
We now have to convert the offsets of the + * later fields in order to work on 32- and 64-bit systems. + * + * Fortunately, the cm_comment field has no functional use + * so can be sacrificed when converting the timestamp size. + * + * Overwrite fields from the end first, so they are not + * clobbered, and use memmove() instead of memcpy() because + * the source and target buffers overlap. bug 16771 */ + createtime = cm32->cm_createtime; + canceltime = cm32->cm_canceltime; + memmove(marker->cm_comment, cm32->cm_comment, MTI_NAMELEN32); + marker->cm_comment[MTI_NAMELEN32 - 1] = '\0'; + memmove(marker->cm_tgtname, cm32->cm_tgtname, + sizeof(marker->cm_tgtname)); + if (swab) { + __swab32s(&createtime); + __swab32s(&canceltime); + } + marker->cm_createtime = createtime; + marker->cm_canceltime = canceltime; + CDEBUG(D_CONFIG, "Find old cfg_marker(Srv32b,Clt64b) for target %s, converting\n", + marker->cm_tgtname); + } else if (swab) { + __swab64s(&marker->cm_createtime); + __swab64s(&marker->cm_canceltime); + } + + return; +} +EXPORT_SYMBOL(lustre_swab_cfg_marker); diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/lprocfs_counters.c b/kernel/drivers/staging/lustre/lustre/obdclass/lprocfs_counters.c new file mode 100644 index 000000000..c49dfe541 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/lprocfs_counters.c @@ -0,0 +1,139 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/lprocfs_counters.c + * + * Lustre lprocfs counter routines + * + * Author: Andreas Dilger + */ + +#include +#include "../include/lprocfs_status.h" +#include "../include/obd_support.h" + +struct lprocfs_stats *obd_memory = NULL; +EXPORT_SYMBOL(obd_memory); + +void lprocfs_counter_add(struct lprocfs_stats *stats, int idx, long amount) +{ + struct lprocfs_counter *percpu_cntr; + struct lprocfs_counter_header *header; + int smp_id; + unsigned long flags = 0; + + if (stats == NULL) + return; + + LASSERTF(0 <= idx && idx < stats->ls_num, + "idx %d, ls_num %hu\n", idx, stats->ls_num); + + /* With per-client stats, statistics are allocated only for + * single CPU area, so the smp_id should be 0 always. 
*/ + smp_id = lprocfs_stats_lock(stats, LPROCFS_GET_SMP_ID, &flags); + if (smp_id < 0) + return; + + header = &stats->ls_cnt_header[idx]; + percpu_cntr = lprocfs_stats_counter_get(stats, smp_id, idx); + percpu_cntr->lc_count++; + + if (header->lc_config & LPROCFS_CNTR_AVGMINMAX) { + /* + * lprocfs_counter_add() can be called in interrupt context, + * as memory allocation could trigger memory shrinker call + * ldlm_pool_shrink(), which calls lprocfs_counter_add(). + * LU-1727. + * + * Only obd_memory uses LPROCFS_STATS_FLAG_IRQ_SAFE + * flag, because it needs accurate counting lest memory leak + * check reports error. + */ + if (in_interrupt() && + (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) + percpu_cntr->lc_sum_irq += amount; + else + percpu_cntr->lc_sum += amount; + + if (header->lc_config & LPROCFS_CNTR_STDDEV) + percpu_cntr->lc_sumsquare += (__s64)amount * amount; + if (amount < percpu_cntr->lc_min) + percpu_cntr->lc_min = amount; + if (amount > percpu_cntr->lc_max) + percpu_cntr->lc_max = amount; + } + lprocfs_stats_unlock(stats, LPROCFS_GET_SMP_ID, &flags); +} +EXPORT_SYMBOL(lprocfs_counter_add); + +void lprocfs_counter_sub(struct lprocfs_stats *stats, int idx, long amount) +{ + struct lprocfs_counter *percpu_cntr; + struct lprocfs_counter_header *header; + int smp_id; + unsigned long flags = 0; + + if (stats == NULL) + return; + + LASSERTF(0 <= idx && idx < stats->ls_num, + "idx %d, ls_num %hu\n", idx, stats->ls_num); + + /* With per-client stats, statistics are allocated only for + * single CPU area, so the smp_id should be 0 always. */ + smp_id = lprocfs_stats_lock(stats, LPROCFS_GET_SMP_ID, &flags); + if (smp_id < 0) + return; + + header = &stats->ls_cnt_header[idx]; + percpu_cntr = lprocfs_stats_counter_get(stats, smp_id, idx); + if (header->lc_config & LPROCFS_CNTR_AVGMINMAX) { + /* + * Sometimes we use RCU callbacks to free memory which calls + * lprocfs_counter_sub(), and RCU callbacks may execute in + * softirq context - right now that's the only case we're in + * softirq context here, use separate counter for that. + * bz20650. + * + * Only obd_memory uses LPROCFS_STATS_FLAG_IRQ_SAFE + * flag, because it needs accurate counting lest memory leak + * check reports error. + */ + if (in_interrupt() && + (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) + percpu_cntr->lc_sum_irq -= amount; + else + percpu_cntr->lc_sum -= amount; + } + lprocfs_stats_unlock(stats, LPROCFS_GET_SMP_ID, &flags); +} +EXPORT_SYMBOL(lprocfs_counter_sub); diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/lprocfs_status.c b/kernel/drivers/staging/lustre/lustre/obdclass/lprocfs_status.c new file mode 100644 index 000000000..c171c6c6c --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/lprocfs_status.c @@ -0,0 +1,2059 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/lprocfs_status.c + * + * Author: Hariharan Thantry + */ + +#define DEBUG_SUBSYSTEM S_CLASS + + +#include "../include/obd_class.h" +#include "../include/lprocfs_status.h" +#include "../include/lustre/lustre_idl.h" +#include +#include + +static const char * const obd_connect_names[] = { + "read_only", + "lov_index", + "unused", + "write_grant", + "server_lock", + "version", + "request_portal", + "acl", + "xattr", + "create_on_write", + "truncate_lock", + "initial_transno", + "inode_bit_locks", + "join_file(obsolete)", + "getattr_by_fid", + "no_oh_for_devices", + "remote_client", + "remote_client_by_force", + "max_byte_per_rpc", + "64bit_qdata", + "mds_capability", + "oss_capability", + "early_lock_cancel", + "som", + "adaptive_timeouts", + "lru_resize", + "mds_mds_connection", + "real_conn", + "change_qunit_size", + "alt_checksum_algorithm", + "fid_is_enabled", + "version_recovery", + "pools", + "grant_shrink", + "skip_orphan", + "large_ea", + "full20", + "layout_lock", + "64bithash", + "object_max_bytes", + "imp_recov", + "jobstats", + "umask", + "einprogress", + "grant_param", + "flock_owner", + "lvb_type", + "nanoseconds_times", + "lightweight_conn", + "short_io", + "pingless", + "flock_deadlock", + "disp_stripe", + "unknown", + NULL +}; + +int obd_connect_flags2str(char *page, int count, __u64 flags, char *sep) +{ + __u64 mask = 1; + int i, ret = 0; + + for (i = 0; obd_connect_names[i] != NULL; i++, mask <<= 1) { + if (flags & mask) + ret += snprintf(page + ret, count - ret, "%s%s", + ret ? sep : "", obd_connect_names[i]); + } + if (flags & ~(mask - 1)) + ret += snprintf(page + ret, count - ret, + "%sunknown flags %#llx", + ret ? sep : "", flags & ~(mask - 1)); + return ret; +} +EXPORT_SYMBOL(obd_connect_flags2str); + +int lprocfs_read_frac_helper(char *buffer, unsigned long count, long val, + int mult) +{ + long decimal_val, frac_val; + int prtn; + + if (count < 10) + return -EINVAL; + + decimal_val = val / mult; + prtn = snprintf(buffer, count, "%ld", decimal_val); + frac_val = val % mult; + + if (prtn < (count - 4) && frac_val > 0) { + long temp_frac; + int i, temp_mult = 1, frac_bits = 0; + + temp_frac = frac_val * 10; + buffer[prtn++] = '.'; + while (frac_bits < 2 && (temp_frac / mult) < 1) { + /* only reserved 2 bits fraction */ + buffer[prtn++] = '0'; + temp_frac *= 10; + frac_bits++; + } + /* + * Need to think these cases : + * 1. #echo x.00 > /proc/xxx output result : x + * 2. #echo x.0x > /proc/xxx output result : x.0x + * 3. #echo x.x0 > /proc/xxx output result : x.x + * 4. #echo x.xx > /proc/xxx output result : x.xx + * Only reserved 2 bits fraction. 
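+ * For example, with mult = 1000: val = 1234 reads back as "1.23",
+ * val = 50 as "0.05" and val = 2000 as plain "2".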
+ */ + for (i = 0; i < (5 - prtn); i++) + temp_mult *= 10; + + frac_bits = min((int)count - prtn, 3 - frac_bits); + prtn += snprintf(buffer + prtn, frac_bits, "%ld", + frac_val * temp_mult / mult); + + prtn--; + while (buffer[prtn] < '1' || buffer[prtn] > '9') { + prtn--; + if (buffer[prtn] == '.') { + prtn--; + break; + } + } + prtn++; + } + buffer[prtn++] = '\n'; + return prtn; +} +EXPORT_SYMBOL(lprocfs_read_frac_helper); + +int lprocfs_write_frac_helper(const char __user *buffer, unsigned long count, + int *val, int mult) +{ + char kernbuf[20], *end, *pbuf; + + if (count > (sizeof(kernbuf) - 1)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + + kernbuf[count] = '\0'; + pbuf = kernbuf; + if (*pbuf == '-') { + mult = -mult; + pbuf++; + } + + *val = (int)simple_strtoul(pbuf, &end, 10) * mult; + if (pbuf == end) + return -EINVAL; + + if (end != NULL && *end == '.') { + int temp_val, pow = 1; + int i; + + pbuf = end + 1; + if (strlen(pbuf) > 5) + pbuf[5] = '\0'; /*only allow 5bits fractional*/ + + temp_val = (int)simple_strtoul(pbuf, &end, 10) * mult; + + if (pbuf < end) { + for (i = 0; i < (end - pbuf); i++) + pow *= 10; + + *val += temp_val / pow; + } + } + return 0; +} +EXPORT_SYMBOL(lprocfs_write_frac_helper); + +#if defined (CONFIG_PROC_FS) + +static int lprocfs_no_percpu_stats; +module_param(lprocfs_no_percpu_stats, int, 0644); +MODULE_PARM_DESC(lprocfs_no_percpu_stats, "Do not alloc percpu data for lprocfs stats"); + +#define MAX_STRING_SIZE 128 + +int lprocfs_single_release(struct inode *inode, struct file *file) +{ + return single_release(inode, file); +} +EXPORT_SYMBOL(lprocfs_single_release); + +int lprocfs_seq_release(struct inode *inode, struct file *file) +{ + return seq_release(inode, file); +} +EXPORT_SYMBOL(lprocfs_seq_release); + +/* lprocfs API calls */ + +struct proc_dir_entry *lprocfs_add_simple(struct proc_dir_entry *root, + char *name, void *data, + struct file_operations *fops) +{ + struct proc_dir_entry *proc; + umode_t mode = 0; + + if (root == NULL || name == NULL || fops == NULL) + return ERR_PTR(-EINVAL); + + if (fops->read) + mode = 0444; + if (fops->write) + mode |= 0200; + proc = proc_create_data(name, mode, root, fops, data); + if (!proc) { + CERROR("LprocFS: No memory to create /proc entry %s", name); + return ERR_PTR(-ENOMEM); + } + return proc; +} +EXPORT_SYMBOL(lprocfs_add_simple); + +struct proc_dir_entry *lprocfs_add_symlink(const char *name, + struct proc_dir_entry *parent, const char *format, ...) +{ + struct proc_dir_entry *entry; + char *dest; + va_list ap; + + if (parent == NULL || format == NULL) + return NULL; + + OBD_ALLOC_WAIT(dest, MAX_STRING_SIZE + 1); + if (dest == NULL) + return NULL; + + va_start(ap, format); + vsnprintf(dest, MAX_STRING_SIZE, format, ap); + va_end(ap); + + entry = proc_symlink(name, parent, dest); + if (entry == NULL) + CERROR("LprocFS: Could not create symbolic link from %s to %s", + name, dest); + + OBD_FREE(dest, MAX_STRING_SIZE + 1); + return entry; +} +EXPORT_SYMBOL(lprocfs_add_symlink); + +static struct file_operations lprocfs_generic_fops = { }; + +/** + * Add /proc entries. + * + * \param root [in] The parent proc entry on which new entry will be added. + * \param list [in] Array of proc entries to be added. + * \param data [in] The argument to be passed when entries read/write routines + * are called through /proc file. 
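+ * Entries whose lprocfs_vars::data is non-NULL use that value
+ * instead, and each entry's mode comes from proc_mode when set,
+ * otherwise it is derived from the fops read/write methods.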
+ * + * \retval 0 on success + * < 0 on error + */ +int lprocfs_add_vars(struct proc_dir_entry *root, struct lprocfs_vars *list, + void *data) +{ + if (root == NULL || list == NULL) + return -EINVAL; + + while (list->name != NULL) { + struct proc_dir_entry *proc; + umode_t mode = 0; + + if (list->proc_mode != 0000) { + mode = list->proc_mode; + } else if (list->fops) { + if (list->fops->read) + mode = 0444; + if (list->fops->write) + mode |= 0200; + } + proc = proc_create_data(list->name, mode, root, + list->fops ?: &lprocfs_generic_fops, + list->data ?: data); + if (proc == NULL) + return -ENOMEM; + list++; + } + return 0; +} +EXPORT_SYMBOL(lprocfs_add_vars); + +void lprocfs_remove(struct proc_dir_entry **rooth) +{ + proc_remove(*rooth); + *rooth = NULL; +} +EXPORT_SYMBOL(lprocfs_remove); + +void lprocfs_remove_proc_entry(const char *name, struct proc_dir_entry *parent) +{ + LASSERT(parent != NULL); + remove_proc_entry(name, parent); +} +EXPORT_SYMBOL(lprocfs_remove_proc_entry); + +struct proc_dir_entry *lprocfs_register(const char *name, + struct proc_dir_entry *parent, + struct lprocfs_vars *list, void *data) +{ + struct proc_dir_entry *entry; + + entry = proc_mkdir(name, parent); + if (entry == NULL) { + entry = ERR_PTR(-ENOMEM); + goto out; + } + + if (list != NULL) { + int rc = lprocfs_add_vars(entry, list, data); + if (rc != 0) { + lprocfs_remove(&entry); + entry = ERR_PTR(rc); + } + } +out: + return entry; +} +EXPORT_SYMBOL(lprocfs_register); + +/* Generic callbacks */ +int lprocfs_rd_uint(struct seq_file *m, void *data) +{ + seq_printf(m, "%u\n", *(unsigned int *)data); + return 0; +} +EXPORT_SYMBOL(lprocfs_rd_uint); + +int lprocfs_wr_uint(struct file *file, const char __user *buffer, + unsigned long count, void *data) +{ + unsigned *p = data; + char dummy[MAX_STRING_SIZE + 1], *end; + unsigned long tmp; + + dummy[MAX_STRING_SIZE] = '\0'; + if (copy_from_user(dummy, buffer, MAX_STRING_SIZE)) + return -EFAULT; + + tmp = simple_strtoul(dummy, &end, 0); + if (dummy == end) + return -EINVAL; + + *p = (unsigned int)tmp; + return count; +} +EXPORT_SYMBOL(lprocfs_wr_uint); + +int lprocfs_rd_u64(struct seq_file *m, void *data) +{ + seq_printf(m, "%llu\n", *(__u64 *)data); + return 0; +} +EXPORT_SYMBOL(lprocfs_rd_u64); + +int lprocfs_rd_atomic(struct seq_file *m, void *data) +{ + atomic_t *atom = data; + LASSERT(atom != NULL); + seq_printf(m, "%d\n", atomic_read(atom)); + return 0; +} +EXPORT_SYMBOL(lprocfs_rd_atomic); + +int lprocfs_wr_atomic(struct file *file, const char __user *buffer, + unsigned long count, void *data) +{ + atomic_t *atm = data; + int val = 0; + int rc; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc < 0) + return rc; + + if (val <= 0) + return -ERANGE; + + atomic_set(atm, val); + return count; +} +EXPORT_SYMBOL(lprocfs_wr_atomic); + +int lprocfs_rd_uuid(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + + LASSERT(obd != NULL); + seq_printf(m, "%s\n", obd->obd_uuid.uuid); + return 0; +} +EXPORT_SYMBOL(lprocfs_rd_uuid); + +int lprocfs_rd_name(struct seq_file *m, void *data) +{ + struct obd_device *dev = data; + + LASSERT(dev != NULL); + seq_printf(m, "%s\n", dev->obd_name); + return 0; +} +EXPORT_SYMBOL(lprocfs_rd_name); + +int lprocfs_rd_blksize(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + struct obd_statfs osfs; + int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) + seq_printf(m, "%u\n", osfs.os_bsize); + + return rc; +} 
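+
+/*
+ * Illustrative sketch: one plausible way to wire a read helper such as
+ * lprocfs_rd_blksize() above into /proc through lprocfs_register() /
+ * lprocfs_add_vars() and the seq_file interface.  The example_* names are
+ * hypothetical, and the block is kept under #if 0 so it only documents the
+ * call pattern; it relies solely on helpers defined in this file plus the
+ * standard seq_file/procfs API.
+ */
+#if 0
+static int example_blksize_seq_show(struct seq_file *m, void *unused)
+{
+	/* m->private carries the data pointer given to proc_create_data() */
+	return lprocfs_rd_blksize(m, m->private);
+}
+
+static int example_blksize_seq_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, example_blksize_seq_show, PDE_DATA(inode));
+}
+
+static struct file_operations example_blksize_fops = {
+	.owner   = THIS_MODULE,
+	.open    = example_blksize_seq_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = lprocfs_single_release,
+};
+
+static struct lprocfs_vars example_obd_vars[] = {
+	{ .name = "blocksize", .fops = &example_blksize_fops },
+	{ .name = NULL }
+};
+
+/* hypothetical caller, e.g. an obd type's setup path */
+static int example_register_procfs(struct obd_device *obd)
+{
+	struct proc_dir_entry *dir;
+
+	/* a real caller would pass its parent /proc directory, not NULL */
+	dir = lprocfs_register(obd->obd_name, NULL, example_obd_vars, obd);
+	return IS_ERR(dir) ? PTR_ERR(dir) : 0;
+}
+#endif
+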
+EXPORT_SYMBOL(lprocfs_rd_blksize); + +int lprocfs_rd_kbytestotal(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + struct obd_statfs osfs; + int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_blocks; + + while (blk_size >>= 1) + result <<= 1; + + seq_printf(m, "%llu\n", result); + } + + return rc; +} +EXPORT_SYMBOL(lprocfs_rd_kbytestotal); + +int lprocfs_rd_kbytesfree(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + struct obd_statfs osfs; + int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_bfree; + + while (blk_size >>= 1) + result <<= 1; + + seq_printf(m, "%llu\n", result); + } + + return rc; +} +EXPORT_SYMBOL(lprocfs_rd_kbytesfree); + +int lprocfs_rd_kbytesavail(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + struct obd_statfs osfs; + int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_bavail; + + while (blk_size >>= 1) + result <<= 1; + + seq_printf(m, "%llu\n", result); + } + + return rc; +} +EXPORT_SYMBOL(lprocfs_rd_kbytesavail); + +int lprocfs_rd_filestotal(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + struct obd_statfs osfs; + int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) + seq_printf(m, "%llu\n", osfs.os_files); + + return rc; +} +EXPORT_SYMBOL(lprocfs_rd_filestotal); + +int lprocfs_rd_filesfree(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + struct obd_statfs osfs; + int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) + seq_printf(m, "%llu\n", osfs.os_ffree); + + return rc; +} +EXPORT_SYMBOL(lprocfs_rd_filesfree); + +int lprocfs_rd_server_uuid(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + struct obd_import *imp; + char *imp_state_name = NULL; + + LASSERT(obd != NULL); + LPROCFS_CLIMP_CHECK(obd); + imp = obd->u.cli.cl_import; + imp_state_name = ptlrpc_import_state_name(imp->imp_state); + seq_printf(m, "%s\t%s%s\n", + obd2cli_tgt(obd), imp_state_name, + imp->imp_deactive ? 
"\tDEACTIVATED" : ""); + + LPROCFS_CLIMP_EXIT(obd); + + return 0; +} +EXPORT_SYMBOL(lprocfs_rd_server_uuid); + +int lprocfs_rd_conn_uuid(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + struct ptlrpc_connection *conn; + + LASSERT(obd != NULL); + + LPROCFS_CLIMP_CHECK(obd); + conn = obd->u.cli.cl_import->imp_connection; + if (conn && obd->u.cli.cl_import) + seq_printf(m, "%s\n", conn->c_remote_uuid.uuid); + else + seq_puts(m, "\n"); + + LPROCFS_CLIMP_EXIT(obd); + + return 0; +} +EXPORT_SYMBOL(lprocfs_rd_conn_uuid); + +/** add up per-cpu counters */ +void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx, + struct lprocfs_counter *cnt) +{ + unsigned int num_entry; + struct lprocfs_counter *percpu_cntr; + int i; + unsigned long flags = 0; + + memset(cnt, 0, sizeof(*cnt)); + + if (stats == NULL) { + /* set count to 1 to avoid divide-by-zero errs in callers */ + cnt->lc_count = 1; + return; + } + + cnt->lc_min = LC_MIN_INIT; + + num_entry = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags); + + for (i = 0; i < num_entry; i++) { + if (stats->ls_percpu[i] == NULL) + continue; + percpu_cntr = lprocfs_stats_counter_get(stats, i, idx); + + cnt->lc_count += percpu_cntr->lc_count; + cnt->lc_sum += percpu_cntr->lc_sum; + if (percpu_cntr->lc_min < cnt->lc_min) + cnt->lc_min = percpu_cntr->lc_min; + if (percpu_cntr->lc_max > cnt->lc_max) + cnt->lc_max = percpu_cntr->lc_max; + cnt->lc_sumsquare += percpu_cntr->lc_sumsquare; + } + + lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags); +} +EXPORT_SYMBOL(lprocfs_stats_collect); + +/** + * Append a space separated list of current set flags to str. + */ +#define flag2str(flag, first) \ + do { \ + if (imp->imp_##flag) \ + seq_printf(m, "%s" #flag, first ? "" : ", "); \ + } while (0) +static int obd_import_flags2str(struct obd_import *imp, struct seq_file *m) +{ + bool first = true; + + if (imp->imp_obd->obd_no_recov) { + seq_printf(m, "no_recov"); + first = false; + } + + flag2str(invalid, first); + first = false; + flag2str(deactive, first); + flag2str(replayable, first); + flag2str(pingable, first); + return 0; +} +#undef flags2str + +static void obd_connect_seq_flags2str(struct seq_file *m, __u64 flags, char *sep) +{ + __u64 mask = 1; + int i; + bool first = true; + + for (i = 0; obd_connect_names[i] != NULL; i++, mask <<= 1) { + if (flags & mask) { + seq_printf(m, "%s%s", + first ? sep : "", obd_connect_names[i]); + first = false; + } + } + if (flags & ~(mask - 1)) + seq_printf(m, "%sunknown flags %#llx", + first ? sep : "", flags & ~(mask - 1)); +} + +int lprocfs_rd_import(struct seq_file *m, void *data) +{ + struct lprocfs_counter ret; + struct lprocfs_counter_header *header; + struct obd_device *obd = (struct obd_device *)data; + struct obd_import *imp; + struct obd_import_conn *conn; + int j; + int k; + int rw = 0; + + LASSERT(obd != NULL); + LPROCFS_CLIMP_CHECK(obd); + imp = obd->u.cli.cl_import; + + seq_printf(m, + "import:\n" + " name: %s\n" + " target: %s\n" + " state: %s\n" + " instance: %u\n" + " connect_flags: [", + obd->obd_name, + obd2cli_tgt(obd), + ptlrpc_import_state_name(imp->imp_state), + imp->imp_connect_data.ocd_instance); + obd_connect_seq_flags2str(m, imp->imp_connect_data.ocd_connect_flags, ", "); + seq_printf(m, + "]\n" + " import_flags: ["); + obd_import_flags2str(imp, m); + + seq_printf(m, + "]\n" + " connection:\n" + " failover_nids: ["); + spin_lock(&imp->imp_lock); + j = 0; + list_for_each_entry(conn, &imp->imp_conn_list, oic_item) { + seq_printf(m, "%s%s", j ? 
", " : "", + libcfs_nid2str(conn->oic_conn->c_peer.nid)); + j++; + } + seq_printf(m, + "]\n" + " current_connection: %s\n" + " connection_attempts: %u\n" + " generation: %u\n" + " in-progress_invalidations: %u\n", + imp->imp_connection == NULL ? "" : + libcfs_nid2str(imp->imp_connection->c_peer.nid), + imp->imp_conn_cnt, + imp->imp_generation, + atomic_read(&imp->imp_inval_count)); + spin_unlock(&imp->imp_lock); + + if (obd->obd_svc_stats == NULL) + goto out_climp; + + header = &obd->obd_svc_stats->ls_cnt_header[PTLRPC_REQWAIT_CNTR]; + lprocfs_stats_collect(obd->obd_svc_stats, PTLRPC_REQWAIT_CNTR, &ret); + if (ret.lc_count != 0) { + /* first argument to do_div MUST be __u64 */ + __u64 sum = ret.lc_sum; + do_div(sum, ret.lc_count); + ret.lc_sum = sum; + } else + ret.lc_sum = 0; + seq_printf(m, + " rpcs:\n" + " inflight: %u\n" + " unregistering: %u\n" + " timeouts: %u\n" + " avg_waittime: %llu %s\n", + atomic_read(&imp->imp_inflight), + atomic_read(&imp->imp_unregistering), + atomic_read(&imp->imp_timeouts), + ret.lc_sum, header->lc_units); + + k = 0; + for (j = 0; j < IMP_AT_MAX_PORTALS; j++) { + if (imp->imp_at.iat_portal[j] == 0) + break; + k = max_t(unsigned int, k, + at_get(&imp->imp_at.iat_service_estimate[j])); + } + seq_printf(m, + " service_estimates:\n" + " services: %u sec\n" + " network: %u sec\n", + k, + at_get(&imp->imp_at.iat_net_latency)); + + seq_printf(m, + " transactions:\n" + " last_replay: %llu\n" + " peer_committed: %llu\n" + " last_checked: %llu\n", + imp->imp_last_replay_transno, + imp->imp_peer_committed_transno, + imp->imp_last_transno_checked); + + /* avg data rates */ + for (rw = 0; rw <= 1; rw++) { + lprocfs_stats_collect(obd->obd_svc_stats, + PTLRPC_LAST_CNTR + BRW_READ_BYTES + rw, + &ret); + if (ret.lc_sum > 0 && ret.lc_count > 0) { + /* first argument to do_div MUST be __u64 */ + __u64 sum = ret.lc_sum; + do_div(sum, ret.lc_count); + ret.lc_sum = sum; + seq_printf(m, + " %s_data_averages:\n" + " bytes_per_rpc: %llu\n", + rw ? 
"write" : "read", + ret.lc_sum); + } + k = (int)ret.lc_sum; + j = opcode_offset(OST_READ + rw) + EXTRA_MAX_OPCODES; + header = &obd->obd_svc_stats->ls_cnt_header[j]; + lprocfs_stats_collect(obd->obd_svc_stats, j, &ret); + if (ret.lc_sum > 0 && ret.lc_count != 0) { + /* first argument to do_div MUST be __u64 */ + __u64 sum = ret.lc_sum; + do_div(sum, ret.lc_count); + ret.lc_sum = sum; + seq_printf(m, + " %s_per_rpc: %llu\n", + header->lc_units, ret.lc_sum); + j = (int)ret.lc_sum; + if (j > 0) + seq_printf(m, + " MB_per_sec: %u.%.02u\n", + k / j, (100 * k / j) % 100); + } + } + +out_climp: + LPROCFS_CLIMP_EXIT(obd); + return 0; +} +EXPORT_SYMBOL(lprocfs_rd_import); + +int lprocfs_rd_state(struct seq_file *m, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + struct obd_import *imp; + int j, k; + + LASSERT(obd != NULL); + LPROCFS_CLIMP_CHECK(obd); + imp = obd->u.cli.cl_import; + + seq_printf(m, "current_state: %s\n", + ptlrpc_import_state_name(imp->imp_state)); + seq_printf(m, "state_history:\n"); + k = imp->imp_state_hist_idx; + for (j = 0; j < IMP_STATE_HIST_LEN; j++) { + struct import_state_hist *ish = + &imp->imp_state_hist[(k + j) % IMP_STATE_HIST_LEN]; + if (ish->ish_state == 0) + continue; + seq_printf(m, " - ["CFS_TIME_T", %s]\n", + ish->ish_time, + ptlrpc_import_state_name(ish->ish_state)); + } + + LPROCFS_CLIMP_EXIT(obd); + return 0; +} +EXPORT_SYMBOL(lprocfs_rd_state); + +int lprocfs_at_hist_helper(struct seq_file *m, struct adaptive_timeout *at) +{ + int i; + for (i = 0; i < AT_BINS; i++) + seq_printf(m, "%3u ", at->at_hist[i]); + seq_printf(m, "\n"); + return 0; +} +EXPORT_SYMBOL(lprocfs_at_hist_helper); + +/* See also ptlrpc_lprocfs_rd_timeouts */ +int lprocfs_rd_timeouts(struct seq_file *m, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + struct obd_import *imp; + unsigned int cur, worst; + time_t now, worstt; + struct dhms ts; + int i; + + LASSERT(obd != NULL); + LPROCFS_CLIMP_CHECK(obd); + imp = obd->u.cli.cl_import; + + now = get_seconds(); + + /* Some network health info for kicks */ + s2dhms(&ts, now - imp->imp_last_reply_time); + seq_printf(m, "%-10s : %ld, "DHMS_FMT" ago\n", + "last reply", imp->imp_last_reply_time, DHMS_VARS(&ts)); + + cur = at_get(&imp->imp_at.iat_net_latency); + worst = imp->imp_at.iat_net_latency.at_worst_ever; + worstt = imp->imp_at.iat_net_latency.at_worst_time; + s2dhms(&ts, now - worstt); + seq_printf(m, "%-10s : cur %3u worst %3u (at %ld, "DHMS_FMT" ago) ", + "network", cur, worst, worstt, DHMS_VARS(&ts)); + lprocfs_at_hist_helper(m, &imp->imp_at.iat_net_latency); + + for (i = 0; i < IMP_AT_MAX_PORTALS; i++) { + if (imp->imp_at.iat_portal[i] == 0) + break; + cur = at_get(&imp->imp_at.iat_service_estimate[i]); + worst = imp->imp_at.iat_service_estimate[i].at_worst_ever; + worstt = imp->imp_at.iat_service_estimate[i].at_worst_time; + s2dhms(&ts, now - worstt); + seq_printf(m, "portal %-2d : cur %3u worst %3u (at %ld, " + DHMS_FMT" ago) ", imp->imp_at.iat_portal[i], + cur, worst, worstt, DHMS_VARS(&ts)); + lprocfs_at_hist_helper(m, &imp->imp_at.iat_service_estimate[i]); + } + + LPROCFS_CLIMP_EXIT(obd); + return 0; +} +EXPORT_SYMBOL(lprocfs_rd_timeouts); + +int lprocfs_rd_connect_flags(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + __u64 flags; + + LPROCFS_CLIMP_CHECK(obd); + flags = obd->u.cli.cl_import->imp_connect_data.ocd_connect_flags; + seq_printf(m, "flags=%#llx\n", flags); + obd_connect_seq_flags2str(m, flags, "\n"); + seq_printf(m, "\n"); + LPROCFS_CLIMP_EXIT(obd); + return 0; 
+} +EXPORT_SYMBOL(lprocfs_rd_connect_flags); + +int lprocfs_rd_num_exports(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + + LASSERT(obd != NULL); + seq_printf(m, "%u\n", obd->obd_num_exports); + return 0; +} +EXPORT_SYMBOL(lprocfs_rd_num_exports); + +int lprocfs_rd_numrefs(struct seq_file *m, void *data) +{ + struct obd_type *class = (struct obd_type *) data; + + LASSERT(class != NULL); + seq_printf(m, "%d\n", class->typ_refcnt); + return 0; +} +EXPORT_SYMBOL(lprocfs_rd_numrefs); + +int lprocfs_obd_setup(struct obd_device *obd, struct lprocfs_vars *list) +{ + int rc = 0; + + LASSERT(obd != NULL); + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + LASSERT(obd->obd_type->typ_procroot != NULL); + + obd->obd_proc_entry = lprocfs_register(obd->obd_name, + obd->obd_type->typ_procroot, + list, obd); + if (IS_ERR(obd->obd_proc_entry)) { + rc = PTR_ERR(obd->obd_proc_entry); + CERROR("error %d setting up lprocfs for %s\n", + rc, obd->obd_name); + obd->obd_proc_entry = NULL; + } + return rc; +} +EXPORT_SYMBOL(lprocfs_obd_setup); + +int lprocfs_obd_cleanup(struct obd_device *obd) +{ + if (!obd) + return -EINVAL; + if (obd->obd_proc_exports_entry) { + /* Should be no exports left */ + lprocfs_remove(&obd->obd_proc_exports_entry); + obd->obd_proc_exports_entry = NULL; + } + if (obd->obd_proc_entry) { + lprocfs_remove(&obd->obd_proc_entry); + obd->obd_proc_entry = NULL; + } + return 0; +} +EXPORT_SYMBOL(lprocfs_obd_cleanup); + +static void lprocfs_free_client_stats(struct nid_stat *client_stat) +{ + CDEBUG(D_CONFIG, "stat %p - data %p/%p\n", client_stat, + client_stat->nid_proc, client_stat->nid_stats); + + LASSERTF(atomic_read(&client_stat->nid_exp_ref_count) == 0, + "nid %s:count %d\n", libcfs_nid2str(client_stat->nid), + atomic_read(&client_stat->nid_exp_ref_count)); + + if (client_stat->nid_proc) + lprocfs_remove(&client_stat->nid_proc); + + if (client_stat->nid_stats) + lprocfs_free_stats(&client_stat->nid_stats); + + if (client_stat->nid_ldlm_stats) + lprocfs_free_stats(&client_stat->nid_ldlm_stats); + + OBD_FREE_PTR(client_stat); + return; + +} + +void lprocfs_free_per_client_stats(struct obd_device *obd) +{ + struct cfs_hash *hash = obd->obd_nid_stats_hash; + struct nid_stat *stat; + + /* we need extra list - because hash_exit called to early */ + /* not need locking because all clients is died */ + while (!list_empty(&obd->obd_nid_stats)) { + stat = list_entry(obd->obd_nid_stats.next, + struct nid_stat, nid_list); + list_del_init(&stat->nid_list); + cfs_hash_del(hash, &stat->nid, &stat->nid_hash); + lprocfs_free_client_stats(stat); + } +} +EXPORT_SYMBOL(lprocfs_free_per_client_stats); + +int lprocfs_stats_alloc_one(struct lprocfs_stats *stats, unsigned int cpuid) +{ + struct lprocfs_counter *cntr; + unsigned int percpusize; + int rc = -ENOMEM; + unsigned long flags = 0; + int i; + + LASSERT(stats->ls_percpu[cpuid] == NULL); + LASSERT((stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) == 0); + + percpusize = lprocfs_stats_counter_size(stats); + LIBCFS_ALLOC_ATOMIC(stats->ls_percpu[cpuid], percpusize); + if (stats->ls_percpu[cpuid] != NULL) { + rc = 0; + if (unlikely(stats->ls_biggest_alloc_num <= cpuid)) { + if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) + spin_lock_irqsave(&stats->ls_lock, flags); + else + spin_lock(&stats->ls_lock); + if (stats->ls_biggest_alloc_num <= cpuid) + stats->ls_biggest_alloc_num = cpuid + 1; + if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) + spin_unlock_irqrestore(&stats->ls_lock, flags); + else + spin_unlock(&stats->ls_lock); + } + /* 
initialize the ls_percpu[cpuid] non-zero counter */ + for (i = 0; i < stats->ls_num; ++i) { + cntr = lprocfs_stats_counter_get(stats, cpuid, i); + cntr->lc_min = LC_MIN_INIT; + } + } + return rc; +} +EXPORT_SYMBOL(lprocfs_stats_alloc_one); + +struct lprocfs_stats *lprocfs_alloc_stats(unsigned int num, + enum lprocfs_stats_flags flags) +{ + struct lprocfs_stats *stats; + unsigned int num_entry; + unsigned int percpusize = 0; + int i; + + if (num == 0) + return NULL; + + if (lprocfs_no_percpu_stats != 0) + flags |= LPROCFS_STATS_FLAG_NOPERCPU; + + if (flags & LPROCFS_STATS_FLAG_NOPERCPU) + num_entry = 1; + else + num_entry = num_possible_cpus(); + + /* alloc percpu pointers for all possible cpu slots */ + LIBCFS_ALLOC(stats, offsetof(typeof(*stats), ls_percpu[num_entry])); + if (stats == NULL) + return NULL; + + stats->ls_num = num; + stats->ls_flags = flags; + spin_lock_init(&stats->ls_lock); + + /* alloc num of counter headers */ + LIBCFS_ALLOC(stats->ls_cnt_header, + stats->ls_num * sizeof(struct lprocfs_counter_header)); + if (stats->ls_cnt_header == NULL) + goto fail; + + if ((flags & LPROCFS_STATS_FLAG_NOPERCPU) != 0) { + /* contains only one set counters */ + percpusize = lprocfs_stats_counter_size(stats); + LIBCFS_ALLOC_ATOMIC(stats->ls_percpu[0], percpusize); + if (stats->ls_percpu[0] == NULL) + goto fail; + stats->ls_biggest_alloc_num = 1; + } else if ((flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) { + /* alloc all percpu data, currently only obd_memory use this */ + for (i = 0; i < num_entry; ++i) + if (lprocfs_stats_alloc_one(stats, i) < 0) + goto fail; + } + + return stats; + +fail: + lprocfs_free_stats(&stats); + return NULL; +} +EXPORT_SYMBOL(lprocfs_alloc_stats); + +void lprocfs_free_stats(struct lprocfs_stats **statsh) +{ + struct lprocfs_stats *stats = *statsh; + unsigned int num_entry; + unsigned int percpusize; + unsigned int i; + + if (stats == NULL || stats->ls_num == 0) + return; + *statsh = NULL; + + if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) + num_entry = 1; + else + num_entry = num_possible_cpus(); + + percpusize = lprocfs_stats_counter_size(stats); + for (i = 0; i < num_entry; i++) + if (stats->ls_percpu[i] != NULL) + LIBCFS_FREE(stats->ls_percpu[i], percpusize); + if (stats->ls_cnt_header != NULL) + LIBCFS_FREE(stats->ls_cnt_header, stats->ls_num * + sizeof(struct lprocfs_counter_header)); + LIBCFS_FREE(stats, offsetof(typeof(*stats), ls_percpu[num_entry])); +} +EXPORT_SYMBOL(lprocfs_free_stats); + +void lprocfs_clear_stats(struct lprocfs_stats *stats) +{ + struct lprocfs_counter *percpu_cntr; + int i; + int j; + unsigned int num_entry; + unsigned long flags = 0; + + num_entry = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags); + + for (i = 0; i < num_entry; i++) { + if (stats->ls_percpu[i] == NULL) + continue; + for (j = 0; j < stats->ls_num; j++) { + percpu_cntr = lprocfs_stats_counter_get(stats, i, j); + percpu_cntr->lc_count = 0; + percpu_cntr->lc_min = LC_MIN_INIT; + percpu_cntr->lc_max = 0; + percpu_cntr->lc_sumsquare = 0; + percpu_cntr->lc_sum = 0; + if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) + percpu_cntr->lc_sum_irq = 0; + } + } + + lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags); +} +EXPORT_SYMBOL(lprocfs_clear_stats); + +static ssize_t lprocfs_stats_seq_write(struct file *file, + const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct lprocfs_stats *stats = seq->private; + + lprocfs_clear_stats(stats); + + return len; +} + +static void *lprocfs_stats_seq_start(struct 
seq_file *p, loff_t *pos) +{ + struct lprocfs_stats *stats = p->private; + + return (*pos < stats->ls_num) ? pos : NULL; +} + +static void lprocfs_stats_seq_stop(struct seq_file *p, void *v) +{ +} + +static void *lprocfs_stats_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + (*pos)++; + return lprocfs_stats_seq_start(p, pos); +} + +/* seq file export of one lprocfs counter */ +static int lprocfs_stats_seq_show(struct seq_file *p, void *v) +{ + struct lprocfs_stats *stats = p->private; + struct lprocfs_counter_header *hdr; + struct lprocfs_counter ctr; + int idx = *(loff_t *)v; + + if (idx == 0) { + struct timeval now; + do_gettimeofday(&now); + seq_printf(p, "%-25s %lu.%lu secs.usecs\n", + "snapshot_time", + now.tv_sec, (unsigned long)now.tv_usec); + } + + hdr = &stats->ls_cnt_header[idx]; + lprocfs_stats_collect(stats, idx, &ctr); + + if (ctr.lc_count != 0) { + seq_printf(p, "%-25s %lld samples [%s]", + hdr->lc_name, ctr.lc_count, hdr->lc_units); + + if ((hdr->lc_config & LPROCFS_CNTR_AVGMINMAX) && + (ctr.lc_count > 0)) { + seq_printf(p, " %lld %lld %lld", + ctr.lc_min, ctr.lc_max, ctr.lc_sum); + if (hdr->lc_config & LPROCFS_CNTR_STDDEV) + seq_printf(p, " %lld", ctr.lc_sumsquare); + } + seq_putc(p, '\n'); + } + + return 0; +} + +static const struct seq_operations lprocfs_stats_seq_sops = { + .start = lprocfs_stats_seq_start, + .stop = lprocfs_stats_seq_stop, + .next = lprocfs_stats_seq_next, + .show = lprocfs_stats_seq_show, +}; + +static int lprocfs_stats_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc; + + rc = seq_open(file, &lprocfs_stats_seq_sops); + if (rc) + return rc; + seq = file->private_data; + seq->private = PDE_DATA(inode); + return 0; +} + +struct file_operations lprocfs_stats_seq_fops = { + .owner = THIS_MODULE, + .open = lprocfs_stats_seq_open, + .read = seq_read, + .write = lprocfs_stats_seq_write, + .llseek = seq_lseek, + .release = lprocfs_seq_release, +}; + +int lprocfs_register_stats(struct proc_dir_entry *root, const char *name, + struct lprocfs_stats *stats) +{ + struct proc_dir_entry *entry; + LASSERT(root != NULL); + + entry = proc_create_data(name, 0644, root, + &lprocfs_stats_seq_fops, stats); + if (entry == NULL) + return -ENOMEM; + + return 0; +} +EXPORT_SYMBOL(lprocfs_register_stats); + +void lprocfs_counter_init(struct lprocfs_stats *stats, int index, + unsigned conf, const char *name, const char *units) +{ + struct lprocfs_counter_header *header; + struct lprocfs_counter *percpu_cntr; + unsigned long flags = 0; + unsigned int i; + unsigned int num_cpu; + + LASSERT(stats != NULL); + + header = &stats->ls_cnt_header[index]; + LASSERTF(header != NULL, "Failed to allocate stats header:[%d]%s/%s\n", + index, name, units); + + header->lc_config = conf; + header->lc_name = name; + header->lc_units = units; + + num_cpu = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags); + for (i = 0; i < num_cpu; ++i) { + if (stats->ls_percpu[i] == NULL) + continue; + percpu_cntr = lprocfs_stats_counter_get(stats, i, index); + percpu_cntr->lc_count = 0; + percpu_cntr->lc_min = LC_MIN_INIT; + percpu_cntr->lc_max = 0; + percpu_cntr->lc_sumsquare = 0; + percpu_cntr->lc_sum = 0; + if ((stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) + percpu_cntr->lc_sum_irq = 0; + } + lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags); +} +EXPORT_SYMBOL(lprocfs_counter_init); + +#define LPROCFS_OBD_OP_INIT(base, stats, op) \ +do { \ + unsigned int coffset = base + OBD_COUNTER_OFFSET(op); \ + LASSERT(coffset < stats->ls_num); \ + 
lprocfs_counter_init(stats, coffset, 0, #op, "reqs"); \ +} while (0) + +void lprocfs_init_ops_stats(int num_private_stats, struct lprocfs_stats *stats) +{ + LPROCFS_OBD_OP_INIT(num_private_stats, stats, iocontrol); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, get_info); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, set_info_async); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, attach); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, detach); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, setup); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, precleanup); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, cleanup); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, process_config); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, postrecov); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, add_conn); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, del_conn); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, connect); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, reconnect); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, disconnect); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, fid_init); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, fid_fini); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, fid_alloc); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, statfs); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, statfs_async); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, packmd); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, unpackmd); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, preallocate); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, create); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, destroy); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, setattr); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, setattr_async); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, getattr); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, getattr_async); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, adjust_kms); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, preprw); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, commitrw); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, find_cbdata); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, init_export); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, destroy_export); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, import_event); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, notify); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, health_check); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, get_uuid); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, quotacheck); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, quotactl); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_new); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_rem); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_add); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_del); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, getref); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, putref); +} +EXPORT_SYMBOL(lprocfs_init_ops_stats); + +int lprocfs_alloc_obd_stats(struct obd_device *obd, unsigned num_private_stats) +{ + struct lprocfs_stats *stats; + unsigned int num_stats; + int rc, i; + + LASSERT(obd->obd_stats == NULL); + LASSERT(obd->obd_proc_entry != NULL); + LASSERT(obd->obd_cntr_base == 0); + + num_stats = ((int)sizeof(*obd->obd_type->typ_dt_ops) / sizeof(void *)) + + num_private_stats - 1 /* o_owner */; + stats = lprocfs_alloc_stats(num_stats, 0); + if (stats == NULL) + return -ENOMEM; + + lprocfs_init_ops_stats(num_private_stats, stats); + + for (i = 
num_private_stats; i < num_stats; i++) { + /* If this LBUGs, it is likely that an obd + * operation was added to struct obd_ops in + * , and that the corresponding line item + * LPROCFS_OBD_OP_INIT(.., .., opname) + * is missing from the list above. */ + LASSERTF(stats->ls_cnt_header[i].lc_name != NULL, + "Missing obd_stat initializer obd_op operation at offset %d.\n", + i - num_private_stats); + } + rc = lprocfs_register_stats(obd->obd_proc_entry, "stats", stats); + if (rc < 0) { + lprocfs_free_stats(&stats); + } else { + obd->obd_stats = stats; + obd->obd_cntr_base = num_private_stats; + } + return rc; +} +EXPORT_SYMBOL(lprocfs_alloc_obd_stats); + +void lprocfs_free_obd_stats(struct obd_device *obd) +{ + if (obd->obd_stats) + lprocfs_free_stats(&obd->obd_stats); +} +EXPORT_SYMBOL(lprocfs_free_obd_stats); + +#define LPROCFS_MD_OP_INIT(base, stats, op) \ +do { \ + unsigned int coffset = base + MD_COUNTER_OFFSET(op); \ + LASSERT(coffset < stats->ls_num); \ + lprocfs_counter_init(stats, coffset, 0, #op, "reqs"); \ +} while (0) + +void lprocfs_init_mps_stats(int num_private_stats, struct lprocfs_stats *stats) +{ + LPROCFS_MD_OP_INIT(num_private_stats, stats, getstatus); + LPROCFS_MD_OP_INIT(num_private_stats, stats, null_inode); + LPROCFS_MD_OP_INIT(num_private_stats, stats, find_cbdata); + LPROCFS_MD_OP_INIT(num_private_stats, stats, close); + LPROCFS_MD_OP_INIT(num_private_stats, stats, create); + LPROCFS_MD_OP_INIT(num_private_stats, stats, done_writing); + LPROCFS_MD_OP_INIT(num_private_stats, stats, enqueue); + LPROCFS_MD_OP_INIT(num_private_stats, stats, getattr); + LPROCFS_MD_OP_INIT(num_private_stats, stats, getattr_name); + LPROCFS_MD_OP_INIT(num_private_stats, stats, intent_lock); + LPROCFS_MD_OP_INIT(num_private_stats, stats, link); + LPROCFS_MD_OP_INIT(num_private_stats, stats, rename); + LPROCFS_MD_OP_INIT(num_private_stats, stats, is_subdir); + LPROCFS_MD_OP_INIT(num_private_stats, stats, setattr); + LPROCFS_MD_OP_INIT(num_private_stats, stats, sync); + LPROCFS_MD_OP_INIT(num_private_stats, stats, readpage); + LPROCFS_MD_OP_INIT(num_private_stats, stats, unlink); + LPROCFS_MD_OP_INIT(num_private_stats, stats, setxattr); + LPROCFS_MD_OP_INIT(num_private_stats, stats, getxattr); + LPROCFS_MD_OP_INIT(num_private_stats, stats, init_ea_size); + LPROCFS_MD_OP_INIT(num_private_stats, stats, get_lustre_md); + LPROCFS_MD_OP_INIT(num_private_stats, stats, free_lustre_md); + LPROCFS_MD_OP_INIT(num_private_stats, stats, set_open_replay_data); + LPROCFS_MD_OP_INIT(num_private_stats, stats, clear_open_replay_data); + LPROCFS_MD_OP_INIT(num_private_stats, stats, set_lock_data); + LPROCFS_MD_OP_INIT(num_private_stats, stats, lock_match); + LPROCFS_MD_OP_INIT(num_private_stats, stats, cancel_unused); + LPROCFS_MD_OP_INIT(num_private_stats, stats, renew_capa); + LPROCFS_MD_OP_INIT(num_private_stats, stats, unpack_capa); + LPROCFS_MD_OP_INIT(num_private_stats, stats, get_remote_perm); + LPROCFS_MD_OP_INIT(num_private_stats, stats, intent_getattr_async); + LPROCFS_MD_OP_INIT(num_private_stats, stats, revalidate_lock); +} +EXPORT_SYMBOL(lprocfs_init_mps_stats); + +int lprocfs_alloc_md_stats(struct obd_device *obd, + unsigned num_private_stats) +{ + struct lprocfs_stats *stats; + unsigned int num_stats; + int rc, i; + + LASSERT(obd->md_stats == NULL); + LASSERT(obd->obd_proc_entry != NULL); + LASSERT(obd->md_cntr_base == 0); + + num_stats = 1 + MD_COUNTER_OFFSET(revalidate_lock) + + num_private_stats; + stats = lprocfs_alloc_stats(num_stats, 0); + if (stats == NULL) + return -ENOMEM; + + 
lprocfs_init_mps_stats(num_private_stats, stats); + + for (i = num_private_stats; i < num_stats; i++) { + if (stats->ls_cnt_header[i].lc_name == NULL) { + CERROR("Missing md_stat initializer md_op operation at offset %d. Aborting.\n", + i - num_private_stats); + LBUG(); + } + } + rc = lprocfs_register_stats(obd->obd_proc_entry, "md_stats", stats); + if (rc < 0) { + lprocfs_free_stats(&stats); + } else { + obd->md_stats = stats; + obd->md_cntr_base = num_private_stats; + } + return rc; +} +EXPORT_SYMBOL(lprocfs_alloc_md_stats); + +void lprocfs_free_md_stats(struct obd_device *obd) +{ + struct lprocfs_stats *stats = obd->md_stats; + + if (stats != NULL) { + obd->md_stats = NULL; + obd->md_cntr_base = 0; + lprocfs_free_stats(&stats); + } +} +EXPORT_SYMBOL(lprocfs_free_md_stats); + +void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats) +{ + lprocfs_counter_init(ldlm_stats, + LDLM_ENQUEUE - LDLM_FIRST_OPC, + 0, "ldlm_enqueue", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_CONVERT - LDLM_FIRST_OPC, + 0, "ldlm_convert", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_CANCEL - LDLM_FIRST_OPC, + 0, "ldlm_cancel", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_BL_CALLBACK - LDLM_FIRST_OPC, + 0, "ldlm_bl_callback", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_CP_CALLBACK - LDLM_FIRST_OPC, + 0, "ldlm_cp_callback", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_GL_CALLBACK - LDLM_FIRST_OPC, + 0, "ldlm_gl_callback", "reqs"); +} +EXPORT_SYMBOL(lprocfs_init_ldlm_stats); + +int lprocfs_exp_print_uuid(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *data) + +{ + struct obd_export *exp = cfs_hash_object(hs, hnode); + struct seq_file *m = (struct seq_file *)data; + + if (exp->exp_nid_stats) + seq_printf(m, "%s\n", obd_uuid2str(&exp->exp_client_uuid)); + + return 0; +} + +static int +lproc_exp_uuid_seq_show(struct seq_file *m, void *unused) +{ + struct nid_stat *stats = (struct nid_stat *)m->private; + struct obd_device *obd = stats->nid_obd; + + cfs_hash_for_each_key(obd->obd_nid_hash, &stats->nid, + lprocfs_exp_print_uuid, m); + return 0; +} + +LPROC_SEQ_FOPS_RO(lproc_exp_uuid); + +struct exp_hash_cb_data { + struct seq_file *m; + bool first; +}; + +int lprocfs_exp_print_hash(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *cb_data) + +{ + struct exp_hash_cb_data *data = (struct exp_hash_cb_data *)cb_data; + struct obd_export *exp = cfs_hash_object(hs, hnode); + + if (exp->exp_lock_hash != NULL) { + if (data->first) { + cfs_hash_debug_header(data->m); + data->first = false; + } + cfs_hash_debug_str(hs, data->m); + } + + return 0; +} + +static int +lproc_exp_hash_seq_show(struct seq_file *m, void *unused) +{ + struct nid_stat *stats = (struct nid_stat *)m->private; + struct obd_device *obd = stats->nid_obd; + struct exp_hash_cb_data cb_data = { + .m = m, + .first = true + }; + + cfs_hash_for_each_key(obd->obd_nid_hash, &stats->nid, + lprocfs_exp_print_hash, &cb_data); + return 0; +} + +LPROC_SEQ_FOPS_RO(lproc_exp_hash); + +int lprocfs_nid_stats_clear_read(struct seq_file *m, void *data) +{ + seq_printf(m, "%s\n", + "Write into this file to clear all nid stats and stale nid entries"); + return 0; +} +EXPORT_SYMBOL(lprocfs_nid_stats_clear_read); + +static int lprocfs_nid_stats_clear_write_cb(void *obj, void *data) +{ + struct nid_stat *stat = obj; + + CDEBUG(D_INFO, "refcnt %d\n", atomic_read(&stat->nid_exp_ref_count)); + if (atomic_read(&stat->nid_exp_ref_count) == 1) { + /* object has only hash references. 
*/ + spin_lock(&stat->nid_obd->obd_nid_lock); + list_move(&stat->nid_list, data); + spin_unlock(&stat->nid_obd->obd_nid_lock); + return 1; + } + /* we has reference to object - only clear data*/ + if (stat->nid_stats) + lprocfs_clear_stats(stat->nid_stats); + + return 0; +} + +int lprocfs_nid_stats_clear_write(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + struct nid_stat *client_stat; + LIST_HEAD(free_list); + + cfs_hash_cond_del(obd->obd_nid_stats_hash, + lprocfs_nid_stats_clear_write_cb, &free_list); + + while (!list_empty(&free_list)) { + client_stat = list_entry(free_list.next, struct nid_stat, + nid_list); + list_del_init(&client_stat->nid_list); + lprocfs_free_client_stats(client_stat); + } + + return count; +} +EXPORT_SYMBOL(lprocfs_nid_stats_clear_write); + +int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t *nid, int *newnid) +{ + struct nid_stat *new_stat, *old_stat; + struct obd_device *obd = NULL; + struct proc_dir_entry *entry; + char *buffer = NULL; + int rc = 0; + + *newnid = 0; + + if (!exp || !exp->exp_obd || !exp->exp_obd->obd_proc_exports_entry || + !exp->exp_obd->obd_nid_stats_hash) + return -EINVAL; + + /* not test against zero because eric say: + * You may only test nid against another nid, or LNET_NID_ANY. + * Anything else is nonsense.*/ + if (!nid || *nid == LNET_NID_ANY) + return 0; + + obd = exp->exp_obd; + + CDEBUG(D_CONFIG, "using hash %p\n", obd->obd_nid_stats_hash); + + OBD_ALLOC_PTR(new_stat); + if (new_stat == NULL) + return -ENOMEM; + + new_stat->nid = *nid; + new_stat->nid_obd = exp->exp_obd; + /* we need set default refcount to 1 to balance obd_disconnect */ + atomic_set(&new_stat->nid_exp_ref_count, 1); + + old_stat = cfs_hash_findadd_unique(obd->obd_nid_stats_hash, + nid, &new_stat->nid_hash); + CDEBUG(D_INFO, "Found stats %p for nid %s - ref %d\n", + old_stat, libcfs_nid2str(*nid), + atomic_read(&new_stat->nid_exp_ref_count)); + + /* We need to release old stats because lprocfs_exp_cleanup() hasn't + * been and will never be called. 
*/ + if (exp->exp_nid_stats) { + nidstat_putref(exp->exp_nid_stats); + exp->exp_nid_stats = NULL; + } + + /* Return -EALREADY here so that we know that the /proc + * entry already has been created */ + if (old_stat != new_stat) { + exp->exp_nid_stats = old_stat; + rc = -EALREADY; + goto destroy_new; + } + /* not found - create */ + OBD_ALLOC(buffer, LNET_NIDSTR_SIZE); + if (buffer == NULL) { + rc = -ENOMEM; + goto destroy_new; + } + + memcpy(buffer, libcfs_nid2str(*nid), LNET_NIDSTR_SIZE); + new_stat->nid_proc = lprocfs_register(buffer, + obd->obd_proc_exports_entry, + NULL, NULL); + OBD_FREE(buffer, LNET_NIDSTR_SIZE); + + if (IS_ERR(new_stat->nid_proc)) { + CERROR("Error making export directory for nid %s\n", + libcfs_nid2str(*nid)); + rc = PTR_ERR(new_stat->nid_proc); + new_stat->nid_proc = NULL; + goto destroy_new_ns; + } + + entry = lprocfs_add_simple(new_stat->nid_proc, "uuid", + new_stat, &lproc_exp_uuid_fops); + if (IS_ERR(entry)) { + CWARN("Error adding the NID stats file\n"); + rc = PTR_ERR(entry); + goto destroy_new_ns; + } + + entry = lprocfs_add_simple(new_stat->nid_proc, "hash", + new_stat, &lproc_exp_hash_fops); + if (IS_ERR(entry)) { + CWARN("Error adding the hash file\n"); + rc = PTR_ERR(entry); + goto destroy_new_ns; + } + + exp->exp_nid_stats = new_stat; + *newnid = 1; + /* protect competitive add to list, not need locking on destroy */ + spin_lock(&obd->obd_nid_lock); + list_add(&new_stat->nid_list, &obd->obd_nid_stats); + spin_unlock(&obd->obd_nid_lock); + + return rc; + +destroy_new_ns: + if (new_stat->nid_proc != NULL) + lprocfs_remove(&new_stat->nid_proc); + cfs_hash_del(obd->obd_nid_stats_hash, nid, &new_stat->nid_hash); + +destroy_new: + nidstat_putref(new_stat); + OBD_FREE_PTR(new_stat); + return rc; +} +EXPORT_SYMBOL(lprocfs_exp_setup); + +int lprocfs_exp_cleanup(struct obd_export *exp) +{ + struct nid_stat *stat = exp->exp_nid_stats; + + if (!stat || !exp->exp_obd) + return 0; + + nidstat_putref(exp->exp_nid_stats); + exp->exp_nid_stats = NULL; + + return 0; +} +EXPORT_SYMBOL(lprocfs_exp_cleanup); + +__s64 lprocfs_read_helper(struct lprocfs_counter *lc, + struct lprocfs_counter_header *header, + enum lprocfs_stats_flags flags, + enum lprocfs_fields_flags field) +{ + __s64 ret = 0; + + if (lc == NULL || header == NULL) + return 0; + + switch (field) { + case LPROCFS_FIELDS_FLAGS_CONFIG: + ret = header->lc_config; + break; + case LPROCFS_FIELDS_FLAGS_SUM: + ret = lc->lc_sum; + if ((flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) + ret += lc->lc_sum_irq; + break; + case LPROCFS_FIELDS_FLAGS_MIN: + ret = lc->lc_min; + break; + case LPROCFS_FIELDS_FLAGS_MAX: + ret = lc->lc_max; + break; + case LPROCFS_FIELDS_FLAGS_AVG: + ret = (lc->lc_max - lc->lc_min) / 2; + break; + case LPROCFS_FIELDS_FLAGS_SUMSQUARE: + ret = lc->lc_sumsquare; + break; + case LPROCFS_FIELDS_FLAGS_COUNT: + ret = lc->lc_count; + break; + default: + break; + } + + return 0; +} +EXPORT_SYMBOL(lprocfs_read_helper); + +int lprocfs_write_helper(const char __user *buffer, unsigned long count, + int *val) +{ + return lprocfs_write_frac_helper(buffer, count, val, 1); +} +EXPORT_SYMBOL(lprocfs_write_helper); + +int lprocfs_seq_read_frac_helper(struct seq_file *m, long val, int mult) +{ + long decimal_val, frac_val; + + decimal_val = val / mult; + seq_printf(m, "%ld", decimal_val); + frac_val = val % mult; + + if (frac_val > 0) { + frac_val *= 100; + frac_val /= mult; + } + if (frac_val > 0) { + /* Three cases: x0, xx, 0x */ + if ((frac_val % 10) != 0) + seq_printf(m, ".%ld", frac_val); + else + seq_printf(m, 
".%ld", frac_val / 10); + } + + seq_printf(m, "\n"); + return 0; +} +EXPORT_SYMBOL(lprocfs_seq_read_frac_helper); + +int lprocfs_write_u64_helper(const char __user *buffer, unsigned long count, + __u64 *val) +{ + return lprocfs_write_frac_u64_helper(buffer, count, val, 1); +} +EXPORT_SYMBOL(lprocfs_write_u64_helper); + +int lprocfs_write_frac_u64_helper(const char *buffer, unsigned long count, + __u64 *val, int mult) +{ + char kernbuf[22], *end, *pbuf; + __u64 whole, frac = 0, units; + unsigned frac_d = 1; + int sign = 1; + + if (count > (sizeof(kernbuf) - 1)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + + kernbuf[count] = '\0'; + pbuf = kernbuf; + if (*pbuf == '-') { + sign = -1; + pbuf++; + } + + whole = simple_strtoull(pbuf, &end, 10); + if (pbuf == end) + return -EINVAL; + + if (*end == '.') { + int i; + pbuf = end + 1; + + /* need to limit frac_d to a __u32 */ + if (strlen(pbuf) > 10) + pbuf[10] = '\0'; + + frac = simple_strtoull(pbuf, &end, 10); + /* count decimal places */ + for (i = 0; i < (end - pbuf); i++) + frac_d *= 10; + } + + units = 1; + switch (tolower(*end)) { + case 'p': + units <<= 10; + case 't': + units <<= 10; + case 'g': + units <<= 10; + case 'm': + units <<= 10; + case 'k': + units <<= 10; + } + /* Specified units override the multiplier */ + if (units > 1) + mult = units; + + frac *= mult; + do_div(frac, frac_d); + *val = sign * (whole * mult + frac); + return 0; +} +EXPORT_SYMBOL(lprocfs_write_frac_u64_helper); + +static char *lprocfs_strnstr(const char *s1, const char *s2, size_t len) +{ + size_t l2; + + l2 = strlen(s2); + if (!l2) + return (char *)s1; + while (len >= l2) { + len--; + if (!memcmp(s1, s2, l2)) + return (char *)s1; + s1++; + } + return NULL; +} + +/** + * Find the string \a name in the input \a buffer, and return a pointer to the + * value immediately following \a name, reducing \a count appropriately. + * If \a name is not found the original \a buffer is returned. + */ +char *lprocfs_find_named_value(const char *buffer, const char *name, + size_t *count) +{ + char *val; + size_t buflen = *count; + + /* there is no strnstr() in rhel5 and ubuntu kernels */ + val = lprocfs_strnstr(buffer, name, buflen); + if (val == NULL) + return (char *)buffer; + + val += strlen(name); /* skip prefix */ + while (val < buffer + buflen && isspace(*val)) /* skip separator */ + val++; + + *count = 0; + while (val < buffer + buflen && isalnum(*val)) { + ++*count; + ++val; + } + + return val - *count; +} +EXPORT_SYMBOL(lprocfs_find_named_value); + +int lprocfs_seq_create(struct proc_dir_entry *parent, + const char *name, + umode_t mode, + const struct file_operations *seq_fops, + void *data) +{ + struct proc_dir_entry *entry; + + /* Disallow secretly (un)writable entries. 
*/ + LASSERT((seq_fops->write == NULL) == ((mode & 0222) == 0)); + entry = proc_create_data(name, mode, parent, seq_fops, data); + + if (entry == NULL) + return -ENOMEM; + + return 0; +} +EXPORT_SYMBOL(lprocfs_seq_create); + +int lprocfs_obd_seq_create(struct obd_device *dev, + const char *name, + umode_t mode, + const struct file_operations *seq_fops, + void *data) +{ + return lprocfs_seq_create(dev->obd_proc_entry, name, + mode, seq_fops, data); +} +EXPORT_SYMBOL(lprocfs_obd_seq_create); + +void lprocfs_oh_tally(struct obd_histogram *oh, unsigned int value) +{ + if (value >= OBD_HIST_MAX) + value = OBD_HIST_MAX - 1; + + spin_lock(&oh->oh_lock); + oh->oh_buckets[value]++; + spin_unlock(&oh->oh_lock); +} +EXPORT_SYMBOL(lprocfs_oh_tally); + +void lprocfs_oh_tally_log2(struct obd_histogram *oh, unsigned int value) +{ + unsigned int val; + + for (val = 0; ((1 << val) < value) && (val <= OBD_HIST_MAX); val++) + ; + + lprocfs_oh_tally(oh, val); +} +EXPORT_SYMBOL(lprocfs_oh_tally_log2); + +unsigned long lprocfs_oh_sum(struct obd_histogram *oh) +{ + unsigned long ret = 0; + int i; + + for (i = 0; i < OBD_HIST_MAX; i++) + ret += oh->oh_buckets[i]; + return ret; +} +EXPORT_SYMBOL(lprocfs_oh_sum); + +void lprocfs_oh_clear(struct obd_histogram *oh) +{ + spin_lock(&oh->oh_lock); + memset(oh->oh_buckets, 0, sizeof(oh->oh_buckets)); + spin_unlock(&oh->oh_lock); +} +EXPORT_SYMBOL(lprocfs_oh_clear); + +int lprocfs_obd_rd_max_pages_per_rpc(struct seq_file *m, void *data) +{ + struct obd_device *dev = data; + struct client_obd *cli = &dev->u.cli; + + client_obd_list_lock(&cli->cl_loi_list_lock); + seq_printf(m, "%d\n", cli->cl_max_pages_per_rpc); + client_obd_list_unlock(&cli->cl_loi_list_lock); + + return 0; +} +EXPORT_SYMBOL(lprocfs_obd_rd_max_pages_per_rpc); + +#endif diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/lu_object.c b/kernel/drivers/staging/lustre/lustre/obdclass/lu_object.c new file mode 100644 index 000000000..20c077995 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/lu_object.c @@ -0,0 +1,2192 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/lu_object.c + * + * Lustre Object. 
+ * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include "../../include/linux/libcfs/libcfs.h" + +# include + +/* hash_long() */ +#include "../../include/linux/libcfs/libcfs_hash.h" +#include "../include/obd_class.h" +#include "../include/obd_support.h" +#include "../include/lustre_disk.h" +#include "../include/lustre_fid.h" +#include "../include/lu_object.h" +#include "../include/lu_ref.h" +#include + +static void lu_object_free(const struct lu_env *env, struct lu_object *o); + +/** + * Decrease reference counter on object. If last reference is freed, return + * object to the cache, unless lu_object_is_dying(o) holds. In the latter + * case, free object immediately. + */ +void lu_object_put(const struct lu_env *env, struct lu_object *o) +{ + struct lu_site_bkt_data *bkt; + struct lu_object_header *top; + struct lu_site *site; + struct lu_object *orig; + struct cfs_hash_bd bd; + const struct lu_fid *fid; + + top = o->lo_header; + site = o->lo_dev->ld_site; + orig = o; + + /* + * till we have full fids-on-OST implemented anonymous objects + * are possible in OSP. such an object isn't listed in the site + * so we should not remove it from the site. + */ + fid = lu_object_fid(o); + if (fid_is_zero(fid)) { + LASSERT(top->loh_hash.next == NULL + && top->loh_hash.pprev == NULL); + LASSERT(list_empty(&top->loh_lru)); + if (!atomic_dec_and_test(&top->loh_ref)) + return; + list_for_each_entry_reverse(o, &top->loh_layers, lo_linkage) { + if (o->lo_ops->loo_object_release != NULL) + o->lo_ops->loo_object_release(env, o); + } + lu_object_free(env, orig); + return; + } + + cfs_hash_bd_get(site->ls_obj_hash, &top->loh_fid, &bd); + bkt = cfs_hash_bd_extra_get(site->ls_obj_hash, &bd); + + if (!cfs_hash_bd_dec_and_lock(site->ls_obj_hash, &bd, &top->loh_ref)) { + if (lu_object_is_dying(top)) { + + /* + * somebody may be waiting for this, currently only + * used for cl_object, see cl_object_put_last(). + */ + wake_up_all(&bkt->lsb_marche_funebre); + } + return; + } + + LASSERT(bkt->lsb_busy > 0); + bkt->lsb_busy--; + /* + * When last reference is released, iterate over object + * layers, and notify them that object is no longer busy. + */ + list_for_each_entry_reverse(o, &top->loh_layers, lo_linkage) { + if (o->lo_ops->loo_object_release != NULL) + o->lo_ops->loo_object_release(env, o); + } + + if (!lu_object_is_dying(top)) { + LASSERT(list_empty(&top->loh_lru)); + list_add_tail(&top->loh_lru, &bkt->lsb_lru); + cfs_hash_bd_unlock(site->ls_obj_hash, &bd, 1); + return; + } + + /* + * If object is dying (will not be cached), removed it + * from hash table and LRU. + * + * This is done with hash table and LRU lists locked. As the only + * way to acquire first reference to previously unreferenced + * object is through hash-table lookup (lu_object_find()), + * or LRU scanning (lu_site_purge()), that are done under hash-table + * and LRU lock, no race with concurrent object lookup is possible + * and we can safely destroy object below. + */ + if (!test_and_set_bit(LU_OBJECT_UNHASHED, &top->loh_flags)) + cfs_hash_bd_del_locked(site->ls_obj_hash, &bd, &top->loh_hash); + cfs_hash_bd_unlock(site->ls_obj_hash, &bd, 1); + /* + * Object was already removed from hash and lru above, can + * kill it. + */ + lu_object_free(env, orig); +} +EXPORT_SYMBOL(lu_object_put); + +/** + * Put object and don't keep in cache. 
This is temporary solution for + * multi-site objects when its layering is not constant. + */ +void lu_object_put_nocache(const struct lu_env *env, struct lu_object *o) +{ + set_bit(LU_OBJECT_HEARD_BANSHEE, &o->lo_header->loh_flags); + return lu_object_put(env, o); +} +EXPORT_SYMBOL(lu_object_put_nocache); + +/** + * Kill the object and take it out of LRU cache. + * Currently used by client code for layout change. + */ +void lu_object_unhash(const struct lu_env *env, struct lu_object *o) +{ + struct lu_object_header *top; + + top = o->lo_header; + set_bit(LU_OBJECT_HEARD_BANSHEE, &top->loh_flags); + if (!test_and_set_bit(LU_OBJECT_UNHASHED, &top->loh_flags)) { + struct cfs_hash *obj_hash = o->lo_dev->ld_site->ls_obj_hash; + struct cfs_hash_bd bd; + + cfs_hash_bd_get_and_lock(obj_hash, &top->loh_fid, &bd, 1); + list_del_init(&top->loh_lru); + cfs_hash_bd_del_locked(obj_hash, &bd, &top->loh_hash); + cfs_hash_bd_unlock(obj_hash, &bd, 1); + } +} +EXPORT_SYMBOL(lu_object_unhash); + +/** + * Allocate new object. + * + * This follows object creation protocol, described in the comment within + * struct lu_device_operations definition. + */ +static struct lu_object *lu_object_alloc(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f, + const struct lu_object_conf *conf) +{ + struct lu_object *scan; + struct lu_object *top; + struct list_head *layers; + unsigned int init_mask = 0; + unsigned int init_flag; + int clean; + int result; + + /* + * Create top-level object slice. This will also create + * lu_object_header. + */ + top = dev->ld_ops->ldo_object_alloc(env, NULL, dev); + if (top == NULL) + return ERR_PTR(-ENOMEM); + if (IS_ERR(top)) + return top; + /* + * This is the only place where object fid is assigned. It's constant + * after this point. + */ + top->lo_header->loh_fid = *f; + layers = &top->lo_header->loh_layers; + + do { + /* + * Call ->loo_object_init() repeatedly, until no more new + * object slices are created. + */ + clean = 1; + init_flag = 1; + list_for_each_entry(scan, layers, lo_linkage) { + if (init_mask & init_flag) + goto next; + clean = 0; + scan->lo_header = top->lo_header; + result = scan->lo_ops->loo_object_init(env, scan, conf); + if (result != 0) { + lu_object_free(env, top); + return ERR_PTR(result); + } + init_mask |= init_flag; +next: + init_flag <<= 1; + } + } while (!clean); + + list_for_each_entry_reverse(scan, layers, lo_linkage) { + if (scan->lo_ops->loo_object_start != NULL) { + result = scan->lo_ops->loo_object_start(env, scan); + if (result != 0) { + lu_object_free(env, top); + return ERR_PTR(result); + } + } + } + + lprocfs_counter_incr(dev->ld_site->ls_stats, LU_SS_CREATED); + return top; +} + +/** + * Free an object. + */ +static void lu_object_free(const struct lu_env *env, struct lu_object *o) +{ + struct lu_site_bkt_data *bkt; + struct lu_site *site; + struct lu_object *scan; + struct list_head *layers; + struct list_head splice; + + site = o->lo_dev->ld_site; + layers = &o->lo_header->loh_layers; + bkt = lu_site_bkt_from_fid(site, &o->lo_header->loh_fid); + /* + * First call ->loo_object_delete() method to release all resources. + */ + list_for_each_entry_reverse(scan, layers, lo_linkage) { + if (scan->lo_ops->loo_object_delete != NULL) + scan->lo_ops->loo_object_delete(env, scan); + } + + /* + * Then, splice object layers into stand-alone list, and call + * ->loo_object_free() on all layers to free memory. Splice is + * necessary, because lu_object_header is freed together with the + * top-level slice. 
+ */ + INIT_LIST_HEAD(&splice); + list_splice_init(layers, &splice); + while (!list_empty(&splice)) { + /* + * Free layers in bottom-to-top order, so that object header + * lives as long as possible and ->loo_object_free() methods + * can look at its contents. + */ + o = container_of0(splice.prev, struct lu_object, lo_linkage); + list_del_init(&o->lo_linkage); + LASSERT(o->lo_ops->loo_object_free != NULL); + o->lo_ops->loo_object_free(env, o); + } + + if (waitqueue_active(&bkt->lsb_marche_funebre)) + wake_up_all(&bkt->lsb_marche_funebre); +} + +/** + * Free \a nr objects from the cold end of the site LRU list. + */ +int lu_site_purge(const struct lu_env *env, struct lu_site *s, int nr) +{ + struct lu_object_header *h; + struct lu_object_header *temp; + struct lu_site_bkt_data *bkt; + struct cfs_hash_bd bd; + struct cfs_hash_bd bd2; + struct list_head dispose; + int did_sth; + int start; + int count; + int bnr; + int i; + + if (OBD_FAIL_CHECK(OBD_FAIL_OBD_NO_LRU)) + return 0; + + INIT_LIST_HEAD(&dispose); + /* + * Under LRU list lock, scan LRU list and move unreferenced objects to + * the dispose list, removing them from LRU and hash table. + */ + start = s->ls_purge_start; + bnr = (nr == ~0) ? -1 : nr / CFS_HASH_NBKT(s->ls_obj_hash) + 1; + again: + did_sth = 0; + cfs_hash_for_each_bucket(s->ls_obj_hash, &bd, i) { + if (i < start) + continue; + count = bnr; + cfs_hash_bd_lock(s->ls_obj_hash, &bd, 1); + bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, &bd); + + list_for_each_entry_safe(h, temp, &bkt->lsb_lru, loh_lru) { + LASSERT(atomic_read(&h->loh_ref) == 0); + + cfs_hash_bd_get(s->ls_obj_hash, &h->loh_fid, &bd2); + LASSERT(bd.bd_bucket == bd2.bd_bucket); + + cfs_hash_bd_del_locked(s->ls_obj_hash, + &bd2, &h->loh_hash); + list_move(&h->loh_lru, &dispose); + if (did_sth == 0) + did_sth = 1; + + if (nr != ~0 && --nr == 0) + break; + + if (count > 0 && --count == 0) + break; + + } + cfs_hash_bd_unlock(s->ls_obj_hash, &bd, 1); + cond_resched(); + /* + * Free everything on the dispose list. This is safe against + * races due to the reasons described in lu_object_put(). + */ + while (!list_empty(&dispose)) { + h = container_of0(dispose.next, + struct lu_object_header, loh_lru); + list_del_init(&h->loh_lru); + lu_object_free(env, lu_object_top(h)); + lprocfs_counter_incr(s->ls_stats, LU_SS_LRU_PURGED); + } + + if (nr == 0) + break; + } + + if (nr != 0 && did_sth && start != 0) { + start = 0; /* restart from the first bucket */ + goto again; + } + /* race on s->ls_purge_start, but nobody cares */ + s->ls_purge_start = i % CFS_HASH_NBKT(s->ls_obj_hash); + + return nr; +} +EXPORT_SYMBOL(lu_site_purge); + +/* + * Object printing. + * + * Code below has to jump through certain loops to output object description + * into libcfs_debug_msg-based log. The problem is that lu_object_print() + * composes object description from strings that are parts of _lines_ of + * output (i.e., strings that are not terminated by newline). This doesn't fit + * very well into libcfs_debug_msg() interface that assumes that each message + * supplied to it is a self-contained output line. + * + * To work around this, strings are collected in a temporary buffer + * (implemented as a value of lu_cdebug_key key), until terminating newline + * character is detected. + * + */ + +enum { + /** + * Maximal line size. + * + * XXX overflow is not handled correctly. + */ + LU_CDEBUG_LINE = 512 +}; + +struct lu_cdebug_data { + /** + * Temporary buffer. 
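+ * Holds the partial output line built up by lu_cdebug_printer() until a
+ * terminating newline arrives; the completed line is then handed to
+ * libcfs_debug_msg() and the buffer is reset.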
+ */ + char lck_area[LU_CDEBUG_LINE]; +}; + +/* context key constructor/destructor: lu_global_key_init, lu_global_key_fini */ +LU_KEY_INIT_FINI(lu_global, struct lu_cdebug_data); + +/** + * Key, holding temporary buffer. This key is registered very early by + * lu_global_init(). + */ +struct lu_context_key lu_global_key = { + .lct_tags = LCT_MD_THREAD | LCT_DT_THREAD | + LCT_MG_THREAD | LCT_CL_THREAD | LCT_LOCAL, + .lct_init = lu_global_key_init, + .lct_fini = lu_global_key_fini +}; + +/** + * Printer function emitting messages through libcfs_debug_msg(). + */ +int lu_cdebug_printer(const struct lu_env *env, + void *cookie, const char *format, ...) +{ + struct libcfs_debug_msg_data *msgdata = cookie; + struct lu_cdebug_data *key; + int used; + int complete; + va_list args; + + va_start(args, format); + + key = lu_context_key_get(&env->le_ctx, &lu_global_key); + LASSERT(key != NULL); + + used = strlen(key->lck_area); + complete = format[strlen(format) - 1] == '\n'; + /* + * Append new chunk to the buffer. + */ + vsnprintf(key->lck_area + used, + ARRAY_SIZE(key->lck_area) - used, format, args); + if (complete) { + if (cfs_cdebug_show(msgdata->msg_mask, msgdata->msg_subsys)) + libcfs_debug_msg(msgdata, "%s", key->lck_area); + key->lck_area[0] = 0; + } + va_end(args); + return 0; +} +EXPORT_SYMBOL(lu_cdebug_printer); + +/** + * Print object header. + */ +void lu_object_header_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, + const struct lu_object_header *hdr) +{ + (*printer)(env, cookie, "header@%p[%#lx, %d, "DFID"%s%s%s]", + hdr, hdr->loh_flags, atomic_read(&hdr->loh_ref), + PFID(&hdr->loh_fid), + hlist_unhashed(&hdr->loh_hash) ? "" : " hash", + list_empty((struct list_head *)&hdr->loh_lru) ? \ + "" : " lru", + hdr->loh_attr & LOHA_EXISTS ? " exist":""); +} +EXPORT_SYMBOL(lu_object_header_print); + +/** + * Print human readable representation of the \a o to the \a printer. + */ +void lu_object_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct lu_object *o) +{ + static const char ruler[] = "........................................"; + struct lu_object_header *top; + int depth = 4; + + top = o->lo_header; + lu_object_header_print(env, cookie, printer, top); + (*printer)(env, cookie, "{\n"); + + list_for_each_entry(o, &top->loh_layers, lo_linkage) { + /* + * print `.' \a depth times followed by type name and address + */ + (*printer)(env, cookie, "%*.*s%s@%p", depth, depth, ruler, + o->lo_dev->ld_type->ldt_name, o); + + if (o->lo_ops->loo_object_print != NULL) + (*o->lo_ops->loo_object_print)(env, cookie, printer, o); + + (*printer)(env, cookie, "\n"); + } + + (*printer)(env, cookie, "} header@%p\n", top); +} +EXPORT_SYMBOL(lu_object_print); + +/** + * Check object consistency. 
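+ * Returns 1 when every layer that implements ->loo_object_invariant()
+ * reports the object as consistent, and 0 as soon as any layer disagrees.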
+ */ +int lu_object_invariant(const struct lu_object *o) +{ + struct lu_object_header *top; + + top = o->lo_header; + list_for_each_entry(o, &top->loh_layers, lo_linkage) { + if (o->lo_ops->loo_object_invariant != NULL && + !o->lo_ops->loo_object_invariant(o)) + return 0; + } + return 1; +} +EXPORT_SYMBOL(lu_object_invariant); + +static struct lu_object *htable_lookup(struct lu_site *s, + struct cfs_hash_bd *bd, + const struct lu_fid *f, + wait_queue_t *waiter, + __u64 *version) +{ + struct lu_site_bkt_data *bkt; + struct lu_object_header *h; + struct hlist_node *hnode; + __u64 ver = cfs_hash_bd_version_get(bd); + + if (*version == ver) + return ERR_PTR(-ENOENT); + + *version = ver; + bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, bd); + /* cfs_hash_bd_peek_locked is a somehow "internal" function + * of cfs_hash, it doesn't add refcount on object. */ + hnode = cfs_hash_bd_peek_locked(s->ls_obj_hash, bd, (void *)f); + if (hnode == NULL) { + lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_MISS); + return ERR_PTR(-ENOENT); + } + + h = container_of0(hnode, struct lu_object_header, loh_hash); + if (likely(!lu_object_is_dying(h))) { + cfs_hash_get(s->ls_obj_hash, hnode); + lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_HIT); + list_del_init(&h->loh_lru); + return lu_object_top(h); + } + + /* + * Lookup found an object being destroyed this object cannot be + * returned (to assure that references to dying objects are eventually + * drained), and moreover, lookup has to wait until object is freed. + */ + + init_waitqueue_entry(waiter, current); + add_wait_queue(&bkt->lsb_marche_funebre, waiter); + set_current_state(TASK_UNINTERRUPTIBLE); + lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_DEATH_RACE); + return ERR_PTR(-EAGAIN); +} + +/** + * Search cache for an object with the fid \a f. If such object is found, + * return it. Otherwise, create new object, insert it into cache and return + * it. In any case, additional reference is acquired on the returned object. + */ +struct lu_object *lu_object_find(const struct lu_env *env, + struct lu_device *dev, const struct lu_fid *f, + const struct lu_object_conf *conf) +{ + return lu_object_find_at(env, dev->ld_site->ls_top_dev, f, conf); +} +EXPORT_SYMBOL(lu_object_find); + +static struct lu_object *lu_object_new(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f, + const struct lu_object_conf *conf) +{ + struct lu_object *o; + struct cfs_hash *hs; + struct cfs_hash_bd bd; + struct lu_site_bkt_data *bkt; + + o = lu_object_alloc(env, dev, f, conf); + if (unlikely(IS_ERR(o))) + return o; + + hs = dev->ld_site->ls_obj_hash; + cfs_hash_bd_get_and_lock(hs, (void *)f, &bd, 1); + bkt = cfs_hash_bd_extra_get(hs, &bd); + cfs_hash_bd_add_locked(hs, &bd, &o->lo_header->loh_hash); + bkt->lsb_busy++; + cfs_hash_bd_unlock(hs, &bd, 1); + return o; +} + +/** + * Core logic of lu_object_find*() functions. 
+ */ +static struct lu_object *lu_object_find_try(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f, + const struct lu_object_conf *conf, + wait_queue_t *waiter) +{ + struct lu_object *o; + struct lu_object *shadow; + struct lu_site *s; + struct cfs_hash *hs; + struct cfs_hash_bd bd; + __u64 version = 0; + + /* + * This uses standard index maintenance protocol: + * + * - search index under lock, and return object if found; + * - otherwise, unlock index, allocate new object; + * - lock index and search again; + * - if nothing is found (usual case), insert newly created + * object into index; + * - otherwise (race: other thread inserted object), free + * object just allocated. + * - unlock index; + * - return object. + * + * For "LOC_F_NEW" case, we are sure the object is new established. + * It is unnecessary to perform lookup-alloc-lookup-insert, instead, + * just alloc and insert directly. + * + * If dying object is found during index search, add @waiter to the + * site wait-queue and return ERR_PTR(-EAGAIN). + */ + if (conf != NULL && conf->loc_flags & LOC_F_NEW) + return lu_object_new(env, dev, f, conf); + + s = dev->ld_site; + hs = s->ls_obj_hash; + cfs_hash_bd_get_and_lock(hs, (void *)f, &bd, 1); + o = htable_lookup(s, &bd, f, waiter, &version); + cfs_hash_bd_unlock(hs, &bd, 1); + if (!IS_ERR(o) || PTR_ERR(o) != -ENOENT) + return o; + + /* + * Allocate new object. This may result in rather complicated + * operations, including fld queries, inode loading, etc. + */ + o = lu_object_alloc(env, dev, f, conf); + if (unlikely(IS_ERR(o))) + return o; + + LASSERT(lu_fid_eq(lu_object_fid(o), f)); + + cfs_hash_bd_lock(hs, &bd, 1); + + shadow = htable_lookup(s, &bd, f, waiter, &version); + if (likely(IS_ERR(shadow) && PTR_ERR(shadow) == -ENOENT)) { + struct lu_site_bkt_data *bkt; + + bkt = cfs_hash_bd_extra_get(hs, &bd); + cfs_hash_bd_add_locked(hs, &bd, &o->lo_header->loh_hash); + bkt->lsb_busy++; + cfs_hash_bd_unlock(hs, &bd, 1); + return o; + } + + lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_RACE); + cfs_hash_bd_unlock(hs, &bd, 1); + lu_object_free(env, o); + return shadow; +} + +/** + * Much like lu_object_find(), but top level device of object is specifically + * \a dev rather than top level device of the site. This interface allows + * objects of different "stacking" to be created within the same site. + */ +struct lu_object *lu_object_find_at(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f, + const struct lu_object_conf *conf) +{ + struct lu_site_bkt_data *bkt; + struct lu_object *obj; + wait_queue_t wait; + + while (1) { + obj = lu_object_find_try(env, dev, f, conf, &wait); + if (obj != ERR_PTR(-EAGAIN)) + return obj; + /* + * lu_object_find_try() already added waiter into the + * wait queue. + */ + schedule(); + bkt = lu_site_bkt_from_fid(dev->ld_site, (void *)f); + remove_wait_queue(&bkt->lsb_marche_funebre, &wait); + } +} +EXPORT_SYMBOL(lu_object_find_at); + +/** + * Find object with given fid, and return its slice belonging to given device. 
+ */ +struct lu_object *lu_object_find_slice(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f, + const struct lu_object_conf *conf) +{ + struct lu_object *top; + struct lu_object *obj; + + top = lu_object_find(env, dev, f, conf); + if (!IS_ERR(top)) { + obj = lu_object_locate(top->lo_header, dev->ld_type); + if (obj == NULL) + lu_object_put(env, top); + } else + obj = top; + return obj; +} +EXPORT_SYMBOL(lu_object_find_slice); + +/** + * Global list of all device types. + */ +static LIST_HEAD(lu_device_types); + +int lu_device_type_init(struct lu_device_type *ldt) +{ + int result = 0; + + INIT_LIST_HEAD(&ldt->ldt_linkage); + if (ldt->ldt_ops->ldto_init) + result = ldt->ldt_ops->ldto_init(ldt); + if (result == 0) + list_add(&ldt->ldt_linkage, &lu_device_types); + return result; +} +EXPORT_SYMBOL(lu_device_type_init); + +void lu_device_type_fini(struct lu_device_type *ldt) +{ + list_del_init(&ldt->ldt_linkage); + if (ldt->ldt_ops->ldto_fini) + ldt->ldt_ops->ldto_fini(ldt); +} +EXPORT_SYMBOL(lu_device_type_fini); + +void lu_types_stop(void) +{ + struct lu_device_type *ldt; + + list_for_each_entry(ldt, &lu_device_types, ldt_linkage) { + if (ldt->ldt_device_nr == 0 && ldt->ldt_ops->ldto_stop) + ldt->ldt_ops->ldto_stop(ldt); + } +} +EXPORT_SYMBOL(lu_types_stop); + +/** + * Global list of all sites on this node + */ +static LIST_HEAD(lu_sites); +static DEFINE_MUTEX(lu_sites_guard); + +/** + * Global environment used by site shrinker. + */ +static struct lu_env lu_shrink_env; + +struct lu_site_print_arg { + struct lu_env *lsp_env; + void *lsp_cookie; + lu_printer_t lsp_printer; +}; + +static int +lu_site_obj_print(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *data) +{ + struct lu_site_print_arg *arg = (struct lu_site_print_arg *)data; + struct lu_object_header *h; + + h = hlist_entry(hnode, struct lu_object_header, loh_hash); + if (!list_empty(&h->loh_layers)) { + const struct lu_object *o; + + o = lu_object_top(h); + lu_object_print(arg->lsp_env, arg->lsp_cookie, + arg->lsp_printer, o); + } else { + lu_object_header_print(arg->lsp_env, arg->lsp_cookie, + arg->lsp_printer, h); + } + return 0; +} + +/** + * Print all objects in \a s. + */ +void lu_site_print(const struct lu_env *env, struct lu_site *s, void *cookie, + lu_printer_t printer) +{ + struct lu_site_print_arg arg = { + .lsp_env = (struct lu_env *)env, + .lsp_cookie = cookie, + .lsp_printer = printer, + }; + + cfs_hash_for_each(s->ls_obj_hash, lu_site_obj_print, &arg); +} +EXPORT_SYMBOL(lu_site_print); + +enum { + LU_CACHE_PERCENT_MAX = 50, + LU_CACHE_PERCENT_DEFAULT = 20 +}; + +static unsigned int lu_cache_percent = LU_CACHE_PERCENT_DEFAULT; +module_param(lu_cache_percent, int, 0644); +MODULE_PARM_DESC(lu_cache_percent, "Percentage of memory to be used as lu_object cache"); + +/** + * Return desired hash table order. + */ +static int lu_htable_order(void) +{ + unsigned long cache_size; + int bits; + + /* + * Calculate hash table size, assuming that we want reasonable + * performance when 20% of total memory is occupied by cache of + * lu_objects. + * + * Size of lu_object is (arbitrary) taken as 1K (together with inode). + */ + cache_size = totalram_pages; + +#if BITS_PER_LONG == 32 + /* limit hashtable size for lowmem systems to low RAM */ + if (cache_size > 1 << (30 - PAGE_CACHE_SHIFT)) + cache_size = 1 << (30 - PAGE_CACHE_SHIFT) * 3 / 4; +#endif + + /* clear off unreasonable cache setting. 
*/ + if (lu_cache_percent == 0 || lu_cache_percent > LU_CACHE_PERCENT_MAX) { + CWARN("obdclass: invalid lu_cache_percent: %u, it must be in the range of (0, %u]. Will use default value: %u.\n", + lu_cache_percent, LU_CACHE_PERCENT_MAX, + LU_CACHE_PERCENT_DEFAULT); + + lu_cache_percent = LU_CACHE_PERCENT_DEFAULT; + } + cache_size = cache_size / 100 * lu_cache_percent * + (PAGE_CACHE_SIZE / 1024); + + for (bits = 1; (1 << bits) < cache_size; ++bits) { + ; + } + return bits; +} + +static unsigned lu_obj_hop_hash(struct cfs_hash *hs, + const void *key, unsigned mask) +{ + struct lu_fid *fid = (struct lu_fid *)key; + __u32 hash; + + hash = fid_flatten32(fid); + hash += (hash >> 4) + (hash << 12); /* mixing oid and seq */ + hash = hash_long(hash, hs->hs_bkt_bits); + + /* give me another random factor */ + hash -= hash_long((unsigned long)hs, fid_oid(fid) % 11 + 3); + + hash <<= hs->hs_cur_bits - hs->hs_bkt_bits; + hash |= (fid_seq(fid) + fid_oid(fid)) & (CFS_HASH_NBKT(hs) - 1); + + return hash & mask; +} + +static void *lu_obj_hop_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct lu_object_header, loh_hash); +} + +static void *lu_obj_hop_key(struct hlist_node *hnode) +{ + struct lu_object_header *h; + + h = hlist_entry(hnode, struct lu_object_header, loh_hash); + return &h->loh_fid; +} + +static int lu_obj_hop_keycmp(const void *key, struct hlist_node *hnode) +{ + struct lu_object_header *h; + + h = hlist_entry(hnode, struct lu_object_header, loh_hash); + return lu_fid_eq(&h->loh_fid, (struct lu_fid *)key); +} + +static void lu_obj_hop_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct lu_object_header *h; + + h = hlist_entry(hnode, struct lu_object_header, loh_hash); + if (atomic_add_return(1, &h->loh_ref) == 1) { + struct lu_site_bkt_data *bkt; + struct cfs_hash_bd bd; + + cfs_hash_bd_get(hs, &h->loh_fid, &bd); + bkt = cfs_hash_bd_extra_get(hs, &bd); + bkt->lsb_busy++; + } +} + +static void lu_obj_hop_put_locked(struct cfs_hash *hs, struct hlist_node *hnode) +{ + LBUG(); /* we should never called it */ +} + +cfs_hash_ops_t lu_site_hash_ops = { + .hs_hash = lu_obj_hop_hash, + .hs_key = lu_obj_hop_key, + .hs_keycmp = lu_obj_hop_keycmp, + .hs_object = lu_obj_hop_object, + .hs_get = lu_obj_hop_get, + .hs_put_locked = lu_obj_hop_put_locked, +}; + +void lu_dev_add_linkage(struct lu_site *s, struct lu_device *d) +{ + spin_lock(&s->ls_ld_lock); + if (list_empty(&d->ld_linkage)) + list_add(&d->ld_linkage, &s->ls_ld_linkage); + spin_unlock(&s->ls_ld_lock); +} +EXPORT_SYMBOL(lu_dev_add_linkage); + +void lu_dev_del_linkage(struct lu_site *s, struct lu_device *d) +{ + spin_lock(&s->ls_ld_lock); + list_del_init(&d->ld_linkage); + spin_unlock(&s->ls_ld_lock); +} +EXPORT_SYMBOL(lu_dev_del_linkage); + +/** + * Initialize site \a s, with \a d as the top level device. 
+ */ +#define LU_SITE_BITS_MIN 12 +#define LU_SITE_BITS_MAX 24 +/** + * total 256 buckets, we don't want too many buckets because: + * - consume too much memory + * - avoid unbalanced LRU list + */ +#define LU_SITE_BKT_BITS 8 + +int lu_site_init(struct lu_site *s, struct lu_device *top) +{ + struct lu_site_bkt_data *bkt; + struct cfs_hash_bd bd; + char name[16]; + int bits; + int i; + + memset(s, 0, sizeof(*s)); + bits = lu_htable_order(); + snprintf(name, 16, "lu_site_%s", top->ld_type->ldt_name); + for (bits = min(max(LU_SITE_BITS_MIN, bits), LU_SITE_BITS_MAX); + bits >= LU_SITE_BITS_MIN; bits--) { + s->ls_obj_hash = cfs_hash_create(name, bits, bits, + bits - LU_SITE_BKT_BITS, + sizeof(*bkt), 0, 0, + &lu_site_hash_ops, + CFS_HASH_SPIN_BKTLOCK | + CFS_HASH_NO_ITEMREF | + CFS_HASH_DEPTH | + CFS_HASH_ASSERT_EMPTY); + if (s->ls_obj_hash != NULL) + break; + } + + if (s->ls_obj_hash == NULL) { + CERROR("failed to create lu_site hash with bits: %d\n", bits); + return -ENOMEM; + } + + cfs_hash_for_each_bucket(s->ls_obj_hash, &bd, i) { + bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, &bd); + INIT_LIST_HEAD(&bkt->lsb_lru); + init_waitqueue_head(&bkt->lsb_marche_funebre); + } + + s->ls_stats = lprocfs_alloc_stats(LU_SS_LAST_STAT, 0); + if (s->ls_stats == NULL) { + cfs_hash_putref(s->ls_obj_hash); + s->ls_obj_hash = NULL; + return -ENOMEM; + } + + lprocfs_counter_init(s->ls_stats, LU_SS_CREATED, + 0, "created", "created"); + lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_HIT, + 0, "cache_hit", "cache_hit"); + lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_MISS, + 0, "cache_miss", "cache_miss"); + lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_RACE, + 0, "cache_race", "cache_race"); + lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_DEATH_RACE, + 0, "cache_death_race", "cache_death_race"); + lprocfs_counter_init(s->ls_stats, LU_SS_LRU_PURGED, + 0, "lru_purged", "lru_purged"); + + INIT_LIST_HEAD(&s->ls_linkage); + s->ls_top_dev = top; + top->ld_site = s; + lu_device_get(top); + lu_ref_add(&top->ld_reference, "site-top", s); + + INIT_LIST_HEAD(&s->ls_ld_linkage); + spin_lock_init(&s->ls_ld_lock); + + lu_dev_add_linkage(s, top); + + return 0; +} +EXPORT_SYMBOL(lu_site_init); + +/** + * Finalize \a s and release its resources. + */ +void lu_site_fini(struct lu_site *s) +{ + mutex_lock(&lu_sites_guard); + list_del_init(&s->ls_linkage); + mutex_unlock(&lu_sites_guard); + + if (s->ls_obj_hash != NULL) { + cfs_hash_putref(s->ls_obj_hash); + s->ls_obj_hash = NULL; + } + + if (s->ls_top_dev != NULL) { + s->ls_top_dev->ld_site = NULL; + lu_ref_del(&s->ls_top_dev->ld_reference, "site-top", s); + lu_device_put(s->ls_top_dev); + s->ls_top_dev = NULL; + } + + if (s->ls_stats != NULL) + lprocfs_free_stats(&s->ls_stats); +} +EXPORT_SYMBOL(lu_site_fini); + +/** + * Called when initialization of stack for this site is completed. + */ +int lu_site_init_finish(struct lu_site *s) +{ + int result; + mutex_lock(&lu_sites_guard); + result = lu_context_refill(&lu_shrink_env.le_ctx); + if (result == 0) + list_add(&s->ls_linkage, &lu_sites); + mutex_unlock(&lu_sites_guard); + return result; +} +EXPORT_SYMBOL(lu_site_init_finish); + +/** + * Acquire additional reference on device \a d + */ +void lu_device_get(struct lu_device *d) +{ + atomic_inc(&d->ld_ref); +} +EXPORT_SYMBOL(lu_device_get); + +/** + * Release reference on device \a d. + */ +void lu_device_put(struct lu_device *d) +{ + LASSERT(atomic_read(&d->ld_ref) > 0); + atomic_dec(&d->ld_ref); +} +EXPORT_SYMBOL(lu_device_put); + +/** + * Initialize device \a d of type \a t. 
+ */ +int lu_device_init(struct lu_device *d, struct lu_device_type *t) +{ + if (t->ldt_device_nr++ == 0 && t->ldt_ops->ldto_start != NULL) + t->ldt_ops->ldto_start(t); + memset(d, 0, sizeof(*d)); + atomic_set(&d->ld_ref, 0); + d->ld_type = t; + lu_ref_init(&d->ld_reference); + INIT_LIST_HEAD(&d->ld_linkage); + return 0; +} +EXPORT_SYMBOL(lu_device_init); + +/** + * Finalize device \a d. + */ +void lu_device_fini(struct lu_device *d) +{ + struct lu_device_type *t; + + t = d->ld_type; + if (d->ld_obd != NULL) { + d->ld_obd->obd_lu_dev = NULL; + d->ld_obd = NULL; + } + + lu_ref_fini(&d->ld_reference); + LASSERTF(atomic_read(&d->ld_ref) == 0, + "Refcount is %u\n", atomic_read(&d->ld_ref)); + LASSERT(t->ldt_device_nr > 0); + if (--t->ldt_device_nr == 0 && t->ldt_ops->ldto_stop != NULL) + t->ldt_ops->ldto_stop(t); +} +EXPORT_SYMBOL(lu_device_fini); + +/** + * Initialize object \a o that is part of compound object \a h and was created + * by device \a d. + */ +int lu_object_init(struct lu_object *o, struct lu_object_header *h, + struct lu_device *d) +{ + memset(o, 0, sizeof(*o)); + o->lo_header = h; + o->lo_dev = d; + lu_device_get(d); + lu_ref_add_at(&d->ld_reference, &o->lo_dev_ref, "lu_object", o); + INIT_LIST_HEAD(&o->lo_linkage); + + return 0; +} +EXPORT_SYMBOL(lu_object_init); + +/** + * Finalize object and release its resources. + */ +void lu_object_fini(struct lu_object *o) +{ + struct lu_device *dev = o->lo_dev; + + LASSERT(list_empty(&o->lo_linkage)); + + if (dev != NULL) { + lu_ref_del_at(&dev->ld_reference, &o->lo_dev_ref, + "lu_object", o); + lu_device_put(dev); + o->lo_dev = NULL; + } +} +EXPORT_SYMBOL(lu_object_fini); + +/** + * Add object \a o as first layer of compound object \a h + * + * This is typically called by the ->ldo_object_alloc() method of top-level + * device. + */ +void lu_object_add_top(struct lu_object_header *h, struct lu_object *o) +{ + list_move(&o->lo_linkage, &h->loh_layers); +} +EXPORT_SYMBOL(lu_object_add_top); + +/** + * Add object \a o as a layer of compound object, going after \a before. + * + * This is typically called by the ->ldo_object_alloc() method of \a + * before->lo_dev. + */ +void lu_object_add(struct lu_object *before, struct lu_object *o) +{ + list_move(&o->lo_linkage, &before->lo_linkage); +} +EXPORT_SYMBOL(lu_object_add); + +/** + * Initialize compound object. + */ +int lu_object_header_init(struct lu_object_header *h) +{ + memset(h, 0, sizeof(*h)); + atomic_set(&h->loh_ref, 1); + INIT_HLIST_NODE(&h->loh_hash); + INIT_LIST_HEAD(&h->loh_lru); + INIT_LIST_HEAD(&h->loh_layers); + lu_ref_init(&h->loh_reference); + return 0; +} +EXPORT_SYMBOL(lu_object_header_init); + +/** + * Finalize compound object. + */ +void lu_object_header_fini(struct lu_object_header *h) +{ + LASSERT(list_empty(&h->loh_layers)); + LASSERT(list_empty(&h->loh_lru)); + LASSERT(hlist_unhashed(&h->loh_hash)); + lu_ref_fini(&h->loh_reference); +} +EXPORT_SYMBOL(lu_object_header_fini); + +/** + * Given a compound object, find its slice, corresponding to the device type + * \a dtype. + */ +struct lu_object *lu_object_locate(struct lu_object_header *h, + const struct lu_device_type *dtype) +{ + struct lu_object *o; + + list_for_each_entry(o, &h->loh_layers, lo_linkage) { + if (o->lo_dev->ld_type == dtype) + return o; + } + return NULL; +} +EXPORT_SYMBOL(lu_object_locate); + + + +/** + * Finalize and free devices in the device stack. 
+ * + * Finalize device stack by purging object cache, and calling + * lu_device_type_operations::ldto_device_fini() and + * lu_device_type_operations::ldto_device_free() on all devices in the stack. + */ +void lu_stack_fini(const struct lu_env *env, struct lu_device *top) +{ + struct lu_site *site = top->ld_site; + struct lu_device *scan; + struct lu_device *next; + + lu_site_purge(env, site, ~0); + for (scan = top; scan != NULL; scan = next) { + next = scan->ld_type->ldt_ops->ldto_device_fini(env, scan); + lu_ref_del(&scan->ld_reference, "lu-stack", &lu_site_init); + lu_device_put(scan); + } + + /* purge again. */ + lu_site_purge(env, site, ~0); + + for (scan = top; scan != NULL; scan = next) { + const struct lu_device_type *ldt = scan->ld_type; + struct obd_type *type; + + next = ldt->ldt_ops->ldto_device_free(env, scan); + type = ldt->ldt_obd_type; + if (type != NULL) { + type->typ_refcnt--; + class_put_type(type); + } + } +} +EXPORT_SYMBOL(lu_stack_fini); + +enum { + /** + * Maximal number of tld slots. + */ + LU_CONTEXT_KEY_NR = 40 +}; + +static struct lu_context_key *lu_keys[LU_CONTEXT_KEY_NR] = { NULL, }; + +static DEFINE_SPINLOCK(lu_keys_guard); + +/** + * Global counter incremented whenever key is registered, unregistered, + * revived or quiesced. This is used to void unnecessary calls to + * lu_context_refill(). No locking is provided, as initialization and shutdown + * are supposed to be externally serialized. + */ +static unsigned key_set_version; + +/** + * Register new key. + */ +int lu_context_key_register(struct lu_context_key *key) +{ + int result; + int i; + + LASSERT(key->lct_init != NULL); + LASSERT(key->lct_fini != NULL); + LASSERT(key->lct_tags != 0); + + result = -ENFILE; + spin_lock(&lu_keys_guard); + for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) { + if (lu_keys[i] == NULL) { + key->lct_index = i; + atomic_set(&key->lct_used, 1); + lu_keys[i] = key; + lu_ref_init(&key->lct_reference); + result = 0; + ++key_set_version; + break; + } + } + spin_unlock(&lu_keys_guard); + return result; +} +EXPORT_SYMBOL(lu_context_key_register); + +static void key_fini(struct lu_context *ctx, int index) +{ + if (ctx->lc_value != NULL && ctx->lc_value[index] != NULL) { + struct lu_context_key *key; + + key = lu_keys[index]; + LASSERT(key != NULL); + LASSERT(key->lct_fini != NULL); + LASSERT(atomic_read(&key->lct_used) > 1); + + key->lct_fini(ctx, key, ctx->lc_value[index]); + lu_ref_del(&key->lct_reference, "ctx", ctx); + atomic_dec(&key->lct_used); + + if ((ctx->lc_tags & LCT_NOREF) == 0) { +#ifdef CONFIG_MODULE_UNLOAD + LINVRNT(module_refcount(key->lct_owner) > 0); +#endif + module_put(key->lct_owner); + } + ctx->lc_value[index] = NULL; + } +} + +/** + * Deregister key. + */ +void lu_context_key_degister(struct lu_context_key *key) +{ + LASSERT(atomic_read(&key->lct_used) >= 1); + LINVRNT(0 <= key->lct_index && key->lct_index < ARRAY_SIZE(lu_keys)); + + lu_context_key_quiesce(key); + + ++key_set_version; + spin_lock(&lu_keys_guard); + key_fini(&lu_shrink_env.le_ctx, key->lct_index); + if (lu_keys[key->lct_index]) { + lu_keys[key->lct_index] = NULL; + lu_ref_fini(&key->lct_reference); + } + spin_unlock(&lu_keys_guard); + + LASSERTF(atomic_read(&key->lct_used) == 1, + "key has instances: %d\n", + atomic_read(&key->lct_used)); +} +EXPORT_SYMBOL(lu_context_key_degister); + +/** + * Register a number of keys. This has to be called after all keys have been + * initialized by a call to LU_CONTEXT_KEY_INIT(). + */ +int lu_context_key_register_many(struct lu_context_key *k, ...) 
+{ + struct lu_context_key *key = k; + va_list args; + int result; + + va_start(args, k); + do { + result = lu_context_key_register(key); + if (result) + break; + key = va_arg(args, struct lu_context_key *); + } while (key != NULL); + va_end(args); + + if (result != 0) { + va_start(args, k); + while (k != key) { + lu_context_key_degister(k); + k = va_arg(args, struct lu_context_key *); + } + va_end(args); + } + + return result; +} +EXPORT_SYMBOL(lu_context_key_register_many); + +/** + * De-register a number of keys. This is a dual to + * lu_context_key_register_many(). + */ +void lu_context_key_degister_many(struct lu_context_key *k, ...) +{ + va_list args; + + va_start(args, k); + do { + lu_context_key_degister(k); + k = va_arg(args, struct lu_context_key*); + } while (k != NULL); + va_end(args); +} +EXPORT_SYMBOL(lu_context_key_degister_many); + +/** + * Revive a number of keys. + */ +void lu_context_key_revive_many(struct lu_context_key *k, ...) +{ + va_list args; + + va_start(args, k); + do { + lu_context_key_revive(k); + k = va_arg(args, struct lu_context_key*); + } while (k != NULL); + va_end(args); +} +EXPORT_SYMBOL(lu_context_key_revive_many); + +/** + * Quiescent a number of keys. + */ +void lu_context_key_quiesce_many(struct lu_context_key *k, ...) +{ + va_list args; + + va_start(args, k); + do { + lu_context_key_quiesce(k); + k = va_arg(args, struct lu_context_key*); + } while (k != NULL); + va_end(args); +} +EXPORT_SYMBOL(lu_context_key_quiesce_many); + +/** + * Return value associated with key \a key in context \a ctx. + */ +void *lu_context_key_get(const struct lu_context *ctx, + const struct lu_context_key *key) +{ + LINVRNT(ctx->lc_state == LCS_ENTERED); + LINVRNT(0 <= key->lct_index && key->lct_index < ARRAY_SIZE(lu_keys)); + LASSERT(lu_keys[key->lct_index] == key); + return ctx->lc_value[key->lct_index]; +} +EXPORT_SYMBOL(lu_context_key_get); + +/** + * List of remembered contexts. XXX document me. + */ +static LIST_HEAD(lu_context_remembered); + +/** + * Destroy \a key in all remembered contexts. This is used to destroy key + * values in "shared" contexts (like service threads), when a module owning + * the key is about to be unloaded. + */ +void lu_context_key_quiesce(struct lu_context_key *key) +{ + struct lu_context *ctx; + + if (!(key->lct_tags & LCT_QUIESCENT)) { + /* + * XXX layering violation. + */ + key->lct_tags |= LCT_QUIESCENT; + /* + * XXX memory barrier has to go here. + */ + spin_lock(&lu_keys_guard); + list_for_each_entry(ctx, &lu_context_remembered, + lc_remember) + key_fini(ctx, key->lct_index); + spin_unlock(&lu_keys_guard); + ++key_set_version; + } +} +EXPORT_SYMBOL(lu_context_key_quiesce); + +void lu_context_key_revive(struct lu_context_key *key) +{ + key->lct_tags &= ~LCT_QUIESCENT; + ++key_set_version; +} +EXPORT_SYMBOL(lu_context_key_revive); + +static void keys_fini(struct lu_context *ctx) +{ + int i; + + if (ctx->lc_value == NULL) + return; + + for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) + key_fini(ctx, i); + + OBD_FREE(ctx->lc_value, ARRAY_SIZE(lu_keys) * sizeof(ctx->lc_value[0])); + ctx->lc_value = NULL; +} + +static int keys_fill(struct lu_context *ctx) +{ + int i; + + LINVRNT(ctx->lc_value != NULL); + for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) { + struct lu_context_key *key; + + key = lu_keys[i]; + if (ctx->lc_value[i] == NULL && key != NULL && + (key->lct_tags & ctx->lc_tags) && + /* + * Don't create values for a LCT_QUIESCENT key, as this + * will pin module owning a key. 
+ */ + !(key->lct_tags & LCT_QUIESCENT)) { + void *value; + + LINVRNT(key->lct_init != NULL); + LINVRNT(key->lct_index == i); + + value = key->lct_init(ctx, key); + if (unlikely(IS_ERR(value))) + return PTR_ERR(value); + + if (!(ctx->lc_tags & LCT_NOREF)) + try_module_get(key->lct_owner); + lu_ref_add_atomic(&key->lct_reference, "ctx", ctx); + atomic_inc(&key->lct_used); + /* + * This is the only place in the code, where an + * element of ctx->lc_value[] array is set to non-NULL + * value. + */ + ctx->lc_value[i] = value; + if (key->lct_exit != NULL) + ctx->lc_tags |= LCT_HAS_EXIT; + } + ctx->lc_version = key_set_version; + } + return 0; +} + +static int keys_init(struct lu_context *ctx) +{ + OBD_ALLOC(ctx->lc_value, + ARRAY_SIZE(lu_keys) * sizeof(ctx->lc_value[0])); + if (likely(ctx->lc_value != NULL)) + return keys_fill(ctx); + + return -ENOMEM; +} + +/** + * Initialize context data-structure. Create values for all keys. + */ +int lu_context_init(struct lu_context *ctx, __u32 tags) +{ + int rc; + + memset(ctx, 0, sizeof(*ctx)); + ctx->lc_state = LCS_INITIALIZED; + ctx->lc_tags = tags; + if (tags & LCT_REMEMBER) { + spin_lock(&lu_keys_guard); + list_add(&ctx->lc_remember, &lu_context_remembered); + spin_unlock(&lu_keys_guard); + } else { + INIT_LIST_HEAD(&ctx->lc_remember); + } + + rc = keys_init(ctx); + if (rc != 0) + lu_context_fini(ctx); + + return rc; +} +EXPORT_SYMBOL(lu_context_init); + +/** + * Finalize context data-structure. Destroy key values. + */ +void lu_context_fini(struct lu_context *ctx) +{ + LINVRNT(ctx->lc_state == LCS_INITIALIZED || ctx->lc_state == LCS_LEFT); + ctx->lc_state = LCS_FINALIZED; + + if ((ctx->lc_tags & LCT_REMEMBER) == 0) { + LASSERT(list_empty(&ctx->lc_remember)); + keys_fini(ctx); + + } else { /* could race with key degister */ + spin_lock(&lu_keys_guard); + keys_fini(ctx); + list_del_init(&ctx->lc_remember); + spin_unlock(&lu_keys_guard); + } +} +EXPORT_SYMBOL(lu_context_fini); + +/** + * Called before entering context. + */ +void lu_context_enter(struct lu_context *ctx) +{ + LINVRNT(ctx->lc_state == LCS_INITIALIZED || ctx->lc_state == LCS_LEFT); + ctx->lc_state = LCS_ENTERED; +} +EXPORT_SYMBOL(lu_context_enter); + +/** + * Called after exiting from \a ctx + */ +void lu_context_exit(struct lu_context *ctx) +{ + int i; + + LINVRNT(ctx->lc_state == LCS_ENTERED); + ctx->lc_state = LCS_LEFT; + if (ctx->lc_tags & LCT_HAS_EXIT && ctx->lc_value != NULL) { + for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) { + if (ctx->lc_value[i] != NULL) { + struct lu_context_key *key; + + key = lu_keys[i]; + LASSERT(key != NULL); + if (key->lct_exit != NULL) + key->lct_exit(ctx, + key, ctx->lc_value[i]); + } + } + } +} +EXPORT_SYMBOL(lu_context_exit); + +/** + * Allocate for context all missing keys that were registered after context + * creation. key_set_version is only changed in rare cases when modules + * are loaded and removed. + */ +int lu_context_refill(struct lu_context *ctx) +{ + return likely(ctx->lc_version == key_set_version) ? 0 : keys_fill(ctx); +} +EXPORT_SYMBOL(lu_context_refill); + +/** + * lu_ctx_tags/lu_ses_tags will be updated if there are new types of + * obd being added. Currently, this is only used on client side, specifically + * for echo device client, for other stack (like ptlrpc threads), context are + * predefined when the lu_device type are registered, during the module probe + * phase. 
+ */ +__u32 lu_context_tags_default = 0; +__u32 lu_session_tags_default = 0; + +void lu_context_tags_update(__u32 tags) +{ + spin_lock(&lu_keys_guard); + lu_context_tags_default |= tags; + key_set_version++; + spin_unlock(&lu_keys_guard); +} +EXPORT_SYMBOL(lu_context_tags_update); + +void lu_context_tags_clear(__u32 tags) +{ + spin_lock(&lu_keys_guard); + lu_context_tags_default &= ~tags; + key_set_version++; + spin_unlock(&lu_keys_guard); +} +EXPORT_SYMBOL(lu_context_tags_clear); + +void lu_session_tags_update(__u32 tags) +{ + spin_lock(&lu_keys_guard); + lu_session_tags_default |= tags; + key_set_version++; + spin_unlock(&lu_keys_guard); +} +EXPORT_SYMBOL(lu_session_tags_update); + +void lu_session_tags_clear(__u32 tags) +{ + spin_lock(&lu_keys_guard); + lu_session_tags_default &= ~tags; + key_set_version++; + spin_unlock(&lu_keys_guard); +} +EXPORT_SYMBOL(lu_session_tags_clear); + +int lu_env_init(struct lu_env *env, __u32 tags) +{ + int result; + + env->le_ses = NULL; + result = lu_context_init(&env->le_ctx, tags); + if (likely(result == 0)) + lu_context_enter(&env->le_ctx); + return result; +} +EXPORT_SYMBOL(lu_env_init); + +void lu_env_fini(struct lu_env *env) +{ + lu_context_exit(&env->le_ctx); + lu_context_fini(&env->le_ctx); + env->le_ses = NULL; +} +EXPORT_SYMBOL(lu_env_fini); + +int lu_env_refill(struct lu_env *env) +{ + int result; + + result = lu_context_refill(&env->le_ctx); + if (result == 0 && env->le_ses != NULL) + result = lu_context_refill(env->le_ses); + return result; +} +EXPORT_SYMBOL(lu_env_refill); + +/** + * Currently, this API will only be used by echo client. + * Because echo client and normal lustre client will share + * same cl_env cache. So echo client needs to refresh + * the env context after it get one from the cache, especially + * when normal client and echo client co-exist in the same client. + */ +int lu_env_refill_by_tags(struct lu_env *env, __u32 ctags, + __u32 stags) +{ + if ((env->le_ctx.lc_tags & ctags) != ctags) { + env->le_ctx.lc_version = 0; + env->le_ctx.lc_tags |= ctags; + } + + if (env->le_ses && (env->le_ses->lc_tags & stags) != stags) { + env->le_ses->lc_version = 0; + env->le_ses->lc_tags |= stags; + } + + return lu_env_refill(env); +} +EXPORT_SYMBOL(lu_env_refill_by_tags); + + +typedef struct lu_site_stats{ + unsigned lss_populated; + unsigned lss_max_search; + unsigned lss_total; + unsigned lss_busy; +} lu_site_stats_t; + +static void lu_site_stats_get(struct cfs_hash *hs, + lu_site_stats_t *stats, int populated) +{ + struct cfs_hash_bd bd; + int i; + + cfs_hash_for_each_bucket(hs, &bd, i) { + struct lu_site_bkt_data *bkt = cfs_hash_bd_extra_get(hs, &bd); + struct hlist_head *hhead; + + cfs_hash_bd_lock(hs, &bd, 1); + stats->lss_busy += bkt->lsb_busy; + stats->lss_total += cfs_hash_bd_count_get(&bd); + stats->lss_max_search = max((int)stats->lss_max_search, + cfs_hash_bd_depmax_get(&bd)); + if (!populated) { + cfs_hash_bd_unlock(hs, &bd, 1); + continue; + } + + cfs_hash_bd_for_each_hlist(hs, &bd, hhead) { + if (!hlist_empty(hhead)) + stats->lss_populated++; + } + cfs_hash_bd_unlock(hs, &bd, 1); + } +} + + +/* + * There exists a potential lock inversion deadlock scenario when using + * Lustre on top of ZFS. This occurs between one of ZFS's + * buf_hash_table.ht_lock's, and Lustre's lu_sites_guard lock. Essentially, + * thread A will take the lu_sites_guard lock and sleep on the ht_lock, + * while thread B will take the ht_lock and sleep on the lu_sites_guard + * lock. 
Obviously neither thread will wake and drop their respective hold + * on their lock. + * + * To prevent this from happening we must ensure the lu_sites_guard lock is + * not taken while down this code path. ZFS reliably does not set the + * __GFP_FS bit in its code paths, so this can be used to determine if it + * is safe to take the lu_sites_guard lock. + * + * Ideally we should accurately return the remaining number of cached + * objects without taking the lu_sites_guard lock, but this is not + * possible in the current implementation. + */ +static unsigned long lu_cache_shrink_count(struct shrinker *sk, + struct shrink_control *sc) +{ + lu_site_stats_t stats; + struct lu_site *s; + struct lu_site *tmp; + unsigned long cached = 0; + + if (!(sc->gfp_mask & __GFP_FS)) + return 0; + + mutex_lock(&lu_sites_guard); + list_for_each_entry_safe(s, tmp, &lu_sites, ls_linkage) { + memset(&stats, 0, sizeof(stats)); + lu_site_stats_get(s->ls_obj_hash, &stats, 0); + cached += stats.lss_total - stats.lss_busy; + } + mutex_unlock(&lu_sites_guard); + + cached = (cached / 100) * sysctl_vfs_cache_pressure; + CDEBUG(D_INODE, "%ld objects cached\n", cached); + return cached; +} + +static unsigned long lu_cache_shrink_scan(struct shrinker *sk, + struct shrink_control *sc) +{ + struct lu_site *s; + struct lu_site *tmp; + unsigned long remain = sc->nr_to_scan, freed = 0; + LIST_HEAD(splice); + + if (!(sc->gfp_mask & __GFP_FS)) + /* We must not take the lu_sites_guard lock when + * __GFP_FS is *not* set because of the deadlock + * possibility detailed above. Additionally, + * since we cannot determine the number of + * objects in the cache without taking this + * lock, we're in a particularly tough spot. As + * a result, we'll just lie and say our cache is + * empty. This _should_ be ok, as we can't + * reclaim objects when __GFP_FS is *not* set + * anyways. + */ + return SHRINK_STOP; + + mutex_lock(&lu_sites_guard); + list_for_each_entry_safe(s, tmp, &lu_sites, ls_linkage) { + freed = lu_site_purge(&lu_shrink_env, s, remain); + remain -= freed; + /* + * Move just shrunk site to the tail of site list to + * assure shrinking fairness. + */ + list_move_tail(&s->ls_linkage, &splice); + } + list_splice(&splice, lu_sites.prev); + mutex_unlock(&lu_sites_guard); + + return sc->nr_to_scan - remain; +} + +/* + * Debugging stuff. + */ + +/** + * Environment to be used in debugger, contains all tags. + */ +struct lu_env lu_debugging_env; + +/** + * Debugging printer function using printk(). + */ +int lu_printk_printer(const struct lu_env *env, + void *unused, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vprintk(format, args); + va_end(args); + return 0; +} + +static struct shrinker lu_site_shrinker = { + .count_objects = lu_cache_shrink_count, + .scan_objects = lu_cache_shrink_scan, + .seeks = DEFAULT_SEEKS, +}; + +/** + * Initialization of global lu_* data. + */ +int lu_global_init(void) +{ + int result; + + CDEBUG(D_INFO, "Lustre LU module (%p).\n", &lu_keys); + + result = lu_ref_global_init(); + if (result != 0) + return result; + + LU_CONTEXT_KEY_INIT(&lu_global_key); + result = lu_context_key_register(&lu_global_key); + if (result != 0) + return result; + + /* + * At this level, we don't know what tags are needed, so allocate them + * conservatively. This should not be too bad, because this + * environment is global. 
+ */ + mutex_lock(&lu_sites_guard); + result = lu_env_init(&lu_shrink_env, LCT_SHRINKER); + mutex_unlock(&lu_sites_guard); + if (result != 0) + return result; + + /* + * seeks estimation: 3 seeks to read a record from oi, one to read + * inode, one for ea. Unfortunately setting this high value results in + * lu_object/inode cache consuming all the memory. + */ + register_shrinker(&lu_site_shrinker); + + return result; +} + +/** + * Dual to lu_global_init(). + */ +void lu_global_fini(void) +{ + unregister_shrinker(&lu_site_shrinker); + lu_context_key_degister(&lu_global_key); + + /* + * Tear shrinker environment down _after_ de-registering + * lu_global_key, because the latter has a value in the former. + */ + mutex_lock(&lu_sites_guard); + lu_env_fini(&lu_shrink_env); + mutex_unlock(&lu_sites_guard); + + lu_ref_global_fini(); +} + +static __u32 ls_stats_read(struct lprocfs_stats *stats, int idx) +{ +#if defined (CONFIG_PROC_FS) + struct lprocfs_counter ret; + + lprocfs_stats_collect(stats, idx, &ret); + return (__u32)ret.lc_count; +#else + return 0; +#endif +} + +/** + * Output site statistical counters into a buffer. Suitable for + * lprocfs_rd_*()-style functions. + */ +int lu_site_stats_print(const struct lu_site *s, struct seq_file *m) +{ + lu_site_stats_t stats; + + memset(&stats, 0, sizeof(stats)); + lu_site_stats_get(s->ls_obj_hash, &stats, 1); + + seq_printf(m, "%d/%d %d/%d %d %d %d %d %d %d %d\n", + stats.lss_busy, + stats.lss_total, + stats.lss_populated, + CFS_HASH_NHLIST(s->ls_obj_hash), + stats.lss_max_search, + ls_stats_read(s->ls_stats, LU_SS_CREATED), + ls_stats_read(s->ls_stats, LU_SS_CACHE_HIT), + ls_stats_read(s->ls_stats, LU_SS_CACHE_MISS), + ls_stats_read(s->ls_stats, LU_SS_CACHE_RACE), + ls_stats_read(s->ls_stats, LU_SS_CACHE_DEATH_RACE), + ls_stats_read(s->ls_stats, LU_SS_LRU_PURGED)); + return 0; +} +EXPORT_SYMBOL(lu_site_stats_print); + +/** + * Helper function to initialize a number of kmem slab caches at once. + */ +int lu_kmem_init(struct lu_kmem_descr *caches) +{ + int result; + struct lu_kmem_descr *iter = caches; + + for (result = 0; iter->ckd_cache != NULL; ++iter) { + *iter->ckd_cache = kmem_cache_create(iter->ckd_name, + iter->ckd_size, + 0, 0, NULL); + if (*iter->ckd_cache == NULL) { + result = -ENOMEM; + /* free all previously allocated caches */ + lu_kmem_fini(caches); + break; + } + } + return result; +} +EXPORT_SYMBOL(lu_kmem_init); + +/** + * Helper function to finalize a number of kmem slab cached at once. Dual to + * lu_kmem_init(). 
+ */ +void lu_kmem_fini(struct lu_kmem_descr *caches) +{ + for (; caches->ckd_cache != NULL; ++caches) { + if (*caches->ckd_cache != NULL) { + kmem_cache_destroy(*caches->ckd_cache); + *caches->ckd_cache = NULL; + } + } +} +EXPORT_SYMBOL(lu_kmem_fini); + +/** + * Temporary solution to be able to assign fid in ->do_create() + * till we have fully-functional OST fids + */ +void lu_object_assign_fid(const struct lu_env *env, struct lu_object *o, + const struct lu_fid *fid) +{ + struct lu_site *s = o->lo_dev->ld_site; + struct lu_fid *old = &o->lo_header->loh_fid; + struct lu_site_bkt_data *bkt; + struct lu_object *shadow; + wait_queue_t waiter; + struct cfs_hash *hs; + struct cfs_hash_bd bd; + __u64 version = 0; + + LASSERT(fid_is_zero(old)); + + hs = s->ls_obj_hash; + cfs_hash_bd_get_and_lock(hs, (void *)fid, &bd, 1); + shadow = htable_lookup(s, &bd, fid, &waiter, &version); + /* supposed to be unique */ + LASSERT(IS_ERR(shadow) && PTR_ERR(shadow) == -ENOENT); + *old = *fid; + bkt = cfs_hash_bd_extra_get(hs, &bd); + cfs_hash_bd_add_locked(hs, &bd, &o->lo_header->loh_hash); + bkt->lsb_busy++; + cfs_hash_bd_unlock(hs, &bd, 1); +} +EXPORT_SYMBOL(lu_object_assign_fid); + +/** + * allocates object with 0 (non-assigned) fid + * XXX: temporary solution to be able to assign fid in ->do_create() + * till we have fully-functional OST fids + */ +struct lu_object *lu_object_anon(const struct lu_env *env, + struct lu_device *dev, + const struct lu_object_conf *conf) +{ + struct lu_fid fid; + struct lu_object *o; + + fid_zero(&fid); + o = lu_object_alloc(env, dev, &fid, conf); + + return o; +} +EXPORT_SYMBOL(lu_object_anon); + +struct lu_buf LU_BUF_NULL = { + .lb_buf = NULL, + .lb_len = 0 +}; +EXPORT_SYMBOL(LU_BUF_NULL); + +void lu_buf_free(struct lu_buf *buf) +{ + LASSERT(buf); + if (buf->lb_buf) { + LASSERT(buf->lb_len > 0); + OBD_FREE_LARGE(buf->lb_buf, buf->lb_len); + buf->lb_buf = NULL; + buf->lb_len = 0; + } +} +EXPORT_SYMBOL(lu_buf_free); + +void lu_buf_alloc(struct lu_buf *buf, int size) +{ + LASSERT(buf); + LASSERT(buf->lb_buf == NULL); + LASSERT(buf->lb_len == 0); + OBD_ALLOC_LARGE(buf->lb_buf, size); + if (likely(buf->lb_buf)) + buf->lb_len = size; +} +EXPORT_SYMBOL(lu_buf_alloc); + +void lu_buf_realloc(struct lu_buf *buf, int size) +{ + lu_buf_free(buf); + lu_buf_alloc(buf, size); +} +EXPORT_SYMBOL(lu_buf_realloc); + +struct lu_buf *lu_buf_check_and_alloc(struct lu_buf *buf, int len) +{ + if (buf->lb_buf == NULL && buf->lb_len == 0) + lu_buf_alloc(buf, len); + + if ((len > buf->lb_len) && (buf->lb_buf != NULL)) + lu_buf_realloc(buf, len); + + return buf; +} +EXPORT_SYMBOL(lu_buf_check_and_alloc); + +/** + * Increase the size of the \a buf. 
+ * preserves old data in buffer + * old buffer remains unchanged on error + * \retval 0 or -ENOMEM + */ +int lu_buf_check_and_grow(struct lu_buf *buf, int len) +{ + char *ptr; + + if (len <= buf->lb_len) + return 0; + + OBD_ALLOC_LARGE(ptr, len); + if (ptr == NULL) + return -ENOMEM; + + /* Free the old buf */ + if (buf->lb_buf != NULL) { + memcpy(ptr, buf->lb_buf, buf->lb_len); + OBD_FREE_LARGE(buf->lb_buf, buf->lb_len); + } + + buf->lb_buf = ptr; + buf->lb_len = len; + return 0; +} +EXPORT_SYMBOL(lu_buf_check_and_grow); diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/lu_ref.c b/kernel/drivers/staging/lustre/lustre/obdclass/lu_ref.c new file mode 100644 index 000000000..993697b66 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/lu_ref.c @@ -0,0 +1,50 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/lu_ref.c + * + * Lustre reference. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include "../../include/linux/libcfs/libcfs.h" + +#include "../include/obd.h" +#include "../include/obd_class.h" +#include "../include/obd_support.h" +#include "../include/lu_ref.h" diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/lustre_handles.c b/kernel/drivers/staging/lustre/lustre/obdclass/lustre_handles.c new file mode 100644 index 000000000..f720e3183 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/lustre_handles.c @@ -0,0 +1,257 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/lustre_handles.c + * + * Author: Phil Schwan + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include "../include/obd_support.h" +#include "../include/lustre_handles.h" +#include "../include/lustre_lib.h" + + +static __u64 handle_base; +#define HANDLE_INCR 7 +static spinlock_t handle_base_lock; + +static struct handle_bucket { + spinlock_t lock; + struct list_head head; +} *handle_hash; + +#define HANDLE_HASH_SIZE (1 << 16) +#define HANDLE_HASH_MASK (HANDLE_HASH_SIZE - 1) + +/* + * Generate a unique 64bit cookie (hash) for a handle and insert it into + * global (per-node) hash-table. + */ +void class_handle_hash(struct portals_handle *h, + struct portals_handle_ops *ops) +{ + struct handle_bucket *bucket; + + LASSERT(h != NULL); + LASSERT(list_empty(&h->h_link)); + + /* + * This is fast, but simplistic cookie generation algorithm, it will + * need a re-do at some point in the future for security. + */ + spin_lock(&handle_base_lock); + handle_base += HANDLE_INCR; + + if (unlikely(handle_base == 0)) { + /* + * Cookie of zero is "dangerous", because in many places it's + * assumed that 0 means "unassigned" handle, not bound to any + * object. 
+ */ + CWARN("The universe has been exhausted: cookie wrap-around.\n"); + handle_base += HANDLE_INCR; + } + h->h_cookie = handle_base; + spin_unlock(&handle_base_lock); + + h->h_ops = ops; + spin_lock_init(&h->h_lock); + + bucket = &handle_hash[h->h_cookie & HANDLE_HASH_MASK]; + spin_lock(&bucket->lock); + list_add_rcu(&h->h_link, &bucket->head); + h->h_in = 1; + spin_unlock(&bucket->lock); + + CDEBUG(D_INFO, "added object %p with handle %#llx to hash\n", + h, h->h_cookie); +} +EXPORT_SYMBOL(class_handle_hash); + +static void class_handle_unhash_nolock(struct portals_handle *h) +{ + if (list_empty(&h->h_link)) { + CERROR("removing an already-removed handle (%#llx)\n", + h->h_cookie); + return; + } + + CDEBUG(D_INFO, "removing object %p with handle %#llx from hash\n", + h, h->h_cookie); + + spin_lock(&h->h_lock); + if (h->h_in == 0) { + spin_unlock(&h->h_lock); + return; + } + h->h_in = 0; + spin_unlock(&h->h_lock); + list_del_rcu(&h->h_link); +} + +void class_handle_unhash(struct portals_handle *h) +{ + struct handle_bucket *bucket; + bucket = handle_hash + (h->h_cookie & HANDLE_HASH_MASK); + + spin_lock(&bucket->lock); + class_handle_unhash_nolock(h); + spin_unlock(&bucket->lock); +} +EXPORT_SYMBOL(class_handle_unhash); + +void class_handle_hash_back(struct portals_handle *h) +{ + struct handle_bucket *bucket; + + bucket = handle_hash + (h->h_cookie & HANDLE_HASH_MASK); + + spin_lock(&bucket->lock); + list_add_rcu(&h->h_link, &bucket->head); + h->h_in = 1; + spin_unlock(&bucket->lock); +} +EXPORT_SYMBOL(class_handle_hash_back); + +void *class_handle2object(__u64 cookie) +{ + struct handle_bucket *bucket; + struct portals_handle *h; + void *retval = NULL; + + LASSERT(handle_hash != NULL); + + /* Be careful when you want to change this code. See the + * rcu_read_lock() definition on top this file. 
- jxiong */ + bucket = handle_hash + (cookie & HANDLE_HASH_MASK); + + rcu_read_lock(); + list_for_each_entry_rcu(h, &bucket->head, h_link) { + if (h->h_cookie != cookie) + continue; + + spin_lock(&h->h_lock); + if (likely(h->h_in != 0)) { + h->h_ops->hop_addref(h); + retval = h; + } + spin_unlock(&h->h_lock); + break; + } + rcu_read_unlock(); + + return retval; +} +EXPORT_SYMBOL(class_handle2object); + +void class_handle_free_cb(struct rcu_head *rcu) +{ + struct portals_handle *h = RCU2HANDLE(rcu); + void *ptr = (void *)(unsigned long)h->h_cookie; + + if (h->h_ops->hop_free != NULL) + h->h_ops->hop_free(ptr, h->h_size); + else + OBD_FREE(ptr, h->h_size); +} +EXPORT_SYMBOL(class_handle_free_cb); + +int class_handle_init(void) +{ + struct handle_bucket *bucket; + struct timeval tv; + int seed[2]; + + LASSERT(handle_hash == NULL); + + OBD_ALLOC_LARGE(handle_hash, sizeof(*bucket) * HANDLE_HASH_SIZE); + if (handle_hash == NULL) + return -ENOMEM; + + spin_lock_init(&handle_base_lock); + for (bucket = handle_hash + HANDLE_HASH_SIZE - 1; bucket >= handle_hash; + bucket--) { + INIT_LIST_HEAD(&bucket->head); + spin_lock_init(&bucket->lock); + } + + /** bug 21430: add randomness to the initial base */ + cfs_get_random_bytes(seed, sizeof(seed)); + do_gettimeofday(&tv); + cfs_srand(tv.tv_sec ^ seed[0], tv.tv_usec ^ seed[1]); + + cfs_get_random_bytes(&handle_base, sizeof(handle_base)); + LASSERT(handle_base != 0ULL); + + return 0; +} + +static int cleanup_all_handles(void) +{ + int rc; + int i; + + for (rc = i = 0; i < HANDLE_HASH_SIZE; i++) { + struct portals_handle *h; + + spin_lock(&handle_hash[i].lock); + list_for_each_entry_rcu(h, &(handle_hash[i].head), h_link) { + CERROR("force clean handle %#llx addr %p ops %p\n", + h->h_cookie, h, h->h_ops); + + class_handle_unhash_nolock(h); + rc++; + } + spin_unlock(&handle_hash[i].lock); + } + + return rc; +} + +void class_handle_cleanup(void) +{ + int count; + LASSERT(handle_hash != NULL); + + count = cleanup_all_handles(); + + OBD_FREE_LARGE(handle_hash, sizeof(*handle_hash) * HANDLE_HASH_SIZE); + handle_hash = NULL; + + if (count != 0) + CERROR("handle_count at cleanup: %d\n", count); +} diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/lustre_peer.c b/kernel/drivers/staging/lustre/lustre/obdclass/lustre_peer.c new file mode 100644 index 000000000..64b2f35e2 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/lustre_peer.c @@ -0,0 +1,217 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. 
+ * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include "../include/obd.h" +#include "../include/obd_support.h" +#include "../include/obd_class.h" +#include "../include/lustre_lib.h" +#include "../include/lustre_ha.h" +#include "../include/lustre_net.h" +#include "../include/lprocfs_status.h" + +#define NIDS_MAX 32 + +struct uuid_nid_data { + struct list_head un_list; + struct obd_uuid un_uuid; + int un_nid_count; + lnet_nid_t un_nids[NIDS_MAX]; +}; + +/* FIXME: This should probably become more elegant than a global linked list */ +static struct list_head g_uuid_list; +static spinlock_t g_uuid_lock; + +void class_init_uuidlist(void) +{ + INIT_LIST_HEAD(&g_uuid_list); + spin_lock_init(&g_uuid_lock); +} + +void class_exit_uuidlist(void) +{ + /* delete all */ + class_del_uuid(NULL); +} + +int lustre_uuid_to_peer(const char *uuid, lnet_nid_t *peer_nid, int index) +{ + struct uuid_nid_data *data; + struct obd_uuid tmp; + int rc = -ENOENT; + + obd_str2uuid(&tmp, uuid); + spin_lock(&g_uuid_lock); + list_for_each_entry(data, &g_uuid_list, un_list) { + if (obd_uuid_equals(&data->un_uuid, &tmp)) { + if (index >= data->un_nid_count) + break; + + rc = 0; + *peer_nid = data->un_nids[index]; + break; + } + } + spin_unlock(&g_uuid_lock); + return rc; +} +EXPORT_SYMBOL(lustre_uuid_to_peer); + +/* Add a nid to a niduuid. Multiple nids can be added to a single uuid; + LNET will choose the best one. */ +int class_add_uuid(const char *uuid, __u64 nid) +{ + struct uuid_nid_data *data, *entry; + int found = 0; + + LASSERT(nid != 0); /* valid newconfig NID is never zero */ + + if (strlen(uuid) > UUID_MAX - 1) + return -EOVERFLOW; + + OBD_ALLOC_PTR(data); + if (data == NULL) + return -ENOMEM; + + obd_str2uuid(&data->un_uuid, uuid); + data->un_nids[0] = nid; + data->un_nid_count = 1; + + spin_lock(&g_uuid_lock); + list_for_each_entry(entry, &g_uuid_list, un_list) { + if (obd_uuid_equals(&entry->un_uuid, &data->un_uuid)) { + int i; + + found = 1; + for (i = 0; i < entry->un_nid_count; i++) + if (nid == entry->un_nids[i]) + break; + + if (i == entry->un_nid_count) { + LASSERT(entry->un_nid_count < NIDS_MAX); + entry->un_nids[entry->un_nid_count++] = nid; + } + break; + } + } + if (!found) + list_add(&data->un_list, &g_uuid_list); + spin_unlock(&g_uuid_lock); + + if (found) { + CDEBUG(D_INFO, "found uuid %s %s cnt=%d\n", uuid, + libcfs_nid2str(nid), entry->un_nid_count); + OBD_FREE(data, sizeof(*data)); + } else { + CDEBUG(D_INFO, "add uuid %s %s\n", uuid, libcfs_nid2str(nid)); + } + return 0; +} +EXPORT_SYMBOL(class_add_uuid); + +/* Delete the nids for one uuid if specified, otherwise delete all */ +int class_del_uuid(const char *uuid) +{ + LIST_HEAD(deathrow); + struct uuid_nid_data *data; + + spin_lock(&g_uuid_lock); + if (uuid != NULL) { + struct obd_uuid tmp; + + obd_str2uuid(&tmp, uuid); + list_for_each_entry(data, &g_uuid_list, un_list) { + if (obd_uuid_equals(&data->un_uuid, &tmp)) { + list_move(&data->un_list, &deathrow); + break; + } + } + } else + list_splice_init(&g_uuid_list, &deathrow); + spin_unlock(&g_uuid_lock); + + if (uuid != NULL && list_empty(&deathrow)) { + CDEBUG(D_INFO, "Try to delete a non-existent uuid %s\n", uuid); + return -EINVAL; + } + + while (!list_empty(&deathrow)) { + data = list_entry(deathrow.next, struct uuid_nid_data, + un_list); + list_del(&data->un_list); + + CDEBUG(D_INFO, "del 
uuid %s %s/%d\n", + obd_uuid2str(&data->un_uuid), + libcfs_nid2str(data->un_nids[0]), + data->un_nid_count); + + OBD_FREE(data, sizeof(*data)); + } + + return 0; +} + +/* check if @nid exists in nid list of @uuid */ +int class_check_uuid(struct obd_uuid *uuid, __u64 nid) +{ + struct uuid_nid_data *entry; + int found = 0; + + CDEBUG(D_INFO, "check if uuid %s has %s.\n", + obd_uuid2str(uuid), libcfs_nid2str(nid)); + + spin_lock(&g_uuid_lock); + list_for_each_entry(entry, &g_uuid_list, un_list) { + int i; + + if (!obd_uuid_equals(&entry->un_uuid, uuid)) + continue; + + /* found the uuid, check if it has @nid */ + for (i = 0; i < entry->un_nid_count; i++) { + if (entry->un_nids[i] == nid) { + found = 1; + break; + } + } + break; + } + spin_unlock(&g_uuid_lock); + return found; +} +EXPORT_SYMBOL(class_check_uuid); diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/obd_config.c b/kernel/drivers/staging/lustre/lustre/obdclass/obd_config.c new file mode 100644 index 000000000..6ce9adc2f --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/obd_config.c @@ -0,0 +1,1953 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/obd_config.c + * + * Config API + */ + +#define DEBUG_SUBSYSTEM S_CLASS +#include "../include/obd_class.h" +#include +#include "../include/lustre_log.h" +#include "../include/lprocfs_status.h" +#include "../include/lustre_param.h" + +#include "llog_internal.h" + +static cfs_hash_ops_t uuid_hash_ops; +static cfs_hash_ops_t nid_hash_ops; +static cfs_hash_ops_t nid_stat_hash_ops; + +/*********** string parsing utils *********/ + +/* returns 0 if we find this key in the buffer, else 1 */ +int class_find_param(char *buf, char *key, char **valp) +{ + char *ptr; + + if (!buf) + return 1; + + ptr = strstr(buf, key); + if (ptr == NULL) + return 1; + + if (valp) + *valp = ptr + strlen(key); + + return 0; +} +EXPORT_SYMBOL(class_find_param); + +/** + * Check whether the proc parameter \a param is an old parameter or not from + * the array \a ptr which contains the mapping from old parameters to new ones. + * If it's an old one, then return the pointer to the cfg_interop_param struc- + * ture which contains both the old and new parameters. 
+ * + * \param param proc parameter + * \param ptr an array which contains the mapping from + * old parameters to new ones + * + * \retval valid-pointer pointer to the cfg_interop_param structure + * which contains the old and new parameters + * \retval NULL \a param or \a ptr is NULL, + * or \a param is not an old parameter + */ +struct cfg_interop_param *class_find_old_param(const char *param, + struct cfg_interop_param *ptr) +{ + char *value = NULL; + int name_len = 0; + + if (param == NULL || ptr == NULL) + return NULL; + + value = strchr(param, '='); + if (value == NULL) + name_len = strlen(param); + else + name_len = value - param; + + while (ptr->old_param != NULL) { + if (strncmp(param, ptr->old_param, name_len) == 0 && + name_len == strlen(ptr->old_param)) + return ptr; + ptr++; + } + + return NULL; +} +EXPORT_SYMBOL(class_find_old_param); + +/** + * Finds a parameter in \a params and copies it to \a copy. + * + * Leading spaces are skipped. Next space or end of string is the + * parameter terminator with the exception that spaces inside single or double + * quotes get included into a parameter. The parameter is copied into \a copy + * which has to be allocated big enough by a caller, quotes are stripped in + * the copy and the copy is terminated by 0. + * + * On return \a params is set to next parameter or to NULL if last + * parameter is returned. + * + * \retval 0 if parameter is returned in \a copy + * \retval 1 otherwise + * \retval -EINVAL if unbalanced quota is found + */ +int class_get_next_param(char **params, char *copy) +{ + char *q1, *q2, *str; + int len; + + str = *params; + while (*str == ' ') + str++; + + if (*str == '\0') { + *params = NULL; + return 1; + } + + while (1) { + q1 = strpbrk(str, " '\""); + if (q1 == NULL) { + len = strlen(str); + memcpy(copy, str, len); + copy[len] = '\0'; + *params = NULL; + return 0; + } + len = q1 - str; + if (*q1 == ' ') { + memcpy(copy, str, len); + copy[len] = '\0'; + *params = str + len; + return 0; + } + + memcpy(copy, str, len); + copy += len; + + /* search for the matching closing quote */ + str = q1 + 1; + q2 = strchr(str, *q1); + if (q2 == NULL) { + CERROR("Unbalanced quota in parameters: \"%s\"\n", + *params); + return -EINVAL; + } + len = q2 - str; + memcpy(copy, str, len); + copy += len; + str = q2 + 1; + } + return 1; +} +EXPORT_SYMBOL(class_get_next_param); + +/* returns 0 if this is the first key in the buffer, else 1. + valp points to first char after key. 
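+   e.g. class_match_param("timeout=40", "timeout=", &val) returns 0 with
+   val left pointing at "40", while a buffer that does not begin with the
+   key returns 1.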
*/ +int class_match_param(char *buf, char *key, char **valp) +{ + if (!buf) + return 1; + + if (memcmp(buf, key, strlen(key)) != 0) + return 1; + + if (valp) + *valp = buf + strlen(key); + + return 0; +} +EXPORT_SYMBOL(class_match_param); + +static int parse_nid(char *buf, void *value, int quiet) +{ + lnet_nid_t *nid = (lnet_nid_t *)value; + + *nid = libcfs_str2nid(buf); + if (*nid != LNET_NID_ANY) + return 0; + + if (!quiet) + LCONSOLE_ERROR_MSG(0x159, "Can't parse NID '%s'\n", buf); + return -EINVAL; +} + +static int parse_net(char *buf, void *value) +{ + __u32 *net = (__u32 *)value; + + *net = libcfs_str2net(buf); + CDEBUG(D_INFO, "Net %s\n", libcfs_net2str(*net)); + return 0; +} + +enum { + CLASS_PARSE_NID = 1, + CLASS_PARSE_NET, +}; + +/* 0 is good nid, + 1 not found + < 0 error + endh is set to next separator */ +static int class_parse_value(char *buf, int opc, void *value, char **endh, + int quiet) +{ + char *endp; + char tmp; + int rc = 0; + + if (!buf) + return 1; + while (*buf == ',' || *buf == ':') + buf++; + if (*buf == ' ' || *buf == '/' || *buf == '\0') + return 1; + + /* nid separators or end of nids */ + endp = strpbrk(buf, ",: /"); + if (endp == NULL) + endp = buf + strlen(buf); + + tmp = *endp; + *endp = '\0'; + switch (opc) { + default: + LBUG(); + case CLASS_PARSE_NID: + rc = parse_nid(buf, value, quiet); + break; + case CLASS_PARSE_NET: + rc = parse_net(buf, value); + break; + } + *endp = tmp; + if (rc != 0) + return rc; + if (endh) + *endh = endp; + return 0; +} + +int class_parse_nid(char *buf, lnet_nid_t *nid, char **endh) +{ + return class_parse_value(buf, CLASS_PARSE_NID, (void *)nid, endh, 0); +} +EXPORT_SYMBOL(class_parse_nid); + +int class_parse_nid_quiet(char *buf, lnet_nid_t *nid, char **endh) +{ + return class_parse_value(buf, CLASS_PARSE_NID, (void *)nid, endh, 1); +} +EXPORT_SYMBOL(class_parse_nid_quiet); + +int class_parse_net(char *buf, __u32 *net, char **endh) +{ + return class_parse_value(buf, CLASS_PARSE_NET, (void *)net, endh, 0); +} +EXPORT_SYMBOL(class_parse_net); + +/* 1 param contains key and match + * 0 param contains key and not match + * -1 param does not contain key + */ +int class_match_nid(char *buf, char *key, lnet_nid_t nid) +{ + lnet_nid_t tmp; + int rc = -1; + + while (class_find_param(buf, key, &buf) == 0) { + /* please restrict to the nids pertaining to + * the specified nids */ + while (class_parse_nid(buf, &tmp, &buf) == 0) { + if (tmp == nid) + return 1; + } + rc = 0; + } + return rc; +} +EXPORT_SYMBOL(class_match_nid); + +int class_match_net(char *buf, char *key, __u32 net) +{ + __u32 tmp; + int rc = -1; + + while (class_find_param(buf, key, &buf) == 0) { + /* please restrict to the nids pertaining to + * the specified networks */ + while (class_parse_net(buf, &tmp, &buf) == 0) { + if (tmp == net) + return 1; + } + rc = 0; + } + return rc; +} +EXPORT_SYMBOL(class_match_net); + +/********************** class fns **********************/ + +/** + * Create a new obd device and set the type, name and uuid. If successful, + * the new device can be accessed by either name or uuid. 
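+ *
+ * The lustre_cfg is expected to carry the device name in buffer 0, the
+ * obd type name in buffer 1 and the uuid in buffer 2, e.g. as built by
+ * do_lcfg(obdname, 0, LCFG_ATTACH, type, uuid, NULL, NULL) in
+ * lustre_start_simple() (obd_mount.c).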
+ */ +int class_attach(struct lustre_cfg *lcfg) +{ + struct obd_device *obd = NULL; + char *typename, *name, *uuid; + int rc, len; + + if (!LUSTRE_CFG_BUFLEN(lcfg, 1)) { + CERROR("No type passed!\n"); + return -EINVAL; + } + typename = lustre_cfg_string(lcfg, 1); + + if (!LUSTRE_CFG_BUFLEN(lcfg, 0)) { + CERROR("No name passed!\n"); + return -EINVAL; + } + name = lustre_cfg_string(lcfg, 0); + + if (!LUSTRE_CFG_BUFLEN(lcfg, 2)) { + CERROR("No UUID passed!\n"); + return -EINVAL; + } + uuid = lustre_cfg_string(lcfg, 2); + + CDEBUG(D_IOCTL, "attach type %s name: %s uuid: %s\n", + MKSTR(typename), MKSTR(name), MKSTR(uuid)); + + obd = class_newdev(typename, name); + if (IS_ERR(obd)) { + /* Already exists or out of obds */ + rc = PTR_ERR(obd); + obd = NULL; + CERROR("Cannot create device %s of type %s : %d\n", + name, typename, rc); + goto out; + } + LASSERTF(obd != NULL, "Cannot get obd device %s of type %s\n", + name, typename); + LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, + "obd %p obd_magic %08X != %08X\n", + obd, obd->obd_magic, OBD_DEVICE_MAGIC); + LASSERTF(strncmp(obd->obd_name, name, strlen(name)) == 0, + "%p obd_name %s != %s\n", obd, obd->obd_name, name); + + rwlock_init(&obd->obd_pool_lock); + obd->obd_pool_limit = 0; + obd->obd_pool_slv = 0; + + INIT_LIST_HEAD(&obd->obd_exports); + INIT_LIST_HEAD(&obd->obd_unlinked_exports); + INIT_LIST_HEAD(&obd->obd_delayed_exports); + INIT_LIST_HEAD(&obd->obd_exports_timed); + INIT_LIST_HEAD(&obd->obd_nid_stats); + spin_lock_init(&obd->obd_nid_lock); + spin_lock_init(&obd->obd_dev_lock); + mutex_init(&obd->obd_dev_mutex); + spin_lock_init(&obd->obd_osfs_lock); + /* obd->obd_osfs_age must be set to a value in the distant + * past to guarantee a fresh statfs is fetched on mount. */ + obd->obd_osfs_age = cfs_time_shift_64(-1000); + + /* XXX belongs in setup not attach */ + init_rwsem(&obd->obd_observer_link_sem); + /* recovery data */ + cfs_init_timer(&obd->obd_recovery_timer); + spin_lock_init(&obd->obd_recovery_task_lock); + init_waitqueue_head(&obd->obd_next_transno_waitq); + init_waitqueue_head(&obd->obd_evict_inprogress_waitq); + INIT_LIST_HEAD(&obd->obd_req_replay_queue); + INIT_LIST_HEAD(&obd->obd_lock_replay_queue); + INIT_LIST_HEAD(&obd->obd_final_req_queue); + INIT_LIST_HEAD(&obd->obd_evict_list); + + llog_group_init(&obd->obd_olg, FID_SEQ_LLOG); + + obd->obd_conn_inprogress = 0; + + len = strlen(uuid); + if (len >= sizeof(obd->obd_uuid)) { + CERROR("uuid must be < %d bytes long\n", + (int)sizeof(obd->obd_uuid)); + rc = -EINVAL; + goto out; + } + memcpy(obd->obd_uuid.uuid, uuid, len); + + /* do the attach */ + if (OBP(obd, attach)) { + rc = OBP(obd, attach)(obd, sizeof(*lcfg), lcfg); + if (rc) { + rc = -EINVAL; + goto out; + } + } + + /* Detach drops this */ + spin_lock(&obd->obd_dev_lock); + atomic_set(&obd->obd_refcount, 1); + spin_unlock(&obd->obd_dev_lock); + lu_ref_init(&obd->obd_reference); + lu_ref_add(&obd->obd_reference, "attach", obd); + + obd->obd_attached = 1; + CDEBUG(D_IOCTL, "OBD: dev %d attached type %s with refcount %d\n", + obd->obd_minor, typename, atomic_read(&obd->obd_refcount)); + return 0; + out: + if (obd != NULL) { + class_release_dev(obd); + } + return rc; +} +EXPORT_SYMBOL(class_attach); + +/** Create hashes, self-export, and call type-specific setup. + * Setup is effectively the "start this obd" call. 
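+ *
+ * The device must already be attached. On success obd_set_up is set and
+ * an extra "setup" reference is taken (dropped again by class_cleanup());
+ * on failure the uuid/nid/nid-stats hashes and the self-export are torn
+ * down before returning.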
+ */ +int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + int err = 0; + struct obd_export *exp; + + LASSERT(obd != NULL); + LASSERTF(obd == class_num2obd(obd->obd_minor), + "obd %p != obd_devs[%d] %p\n", + obd, obd->obd_minor, class_num2obd(obd->obd_minor)); + LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, + "obd %p obd_magic %08x != %08x\n", + obd, obd->obd_magic, OBD_DEVICE_MAGIC); + + /* have we attached a type to this device? */ + if (!obd->obd_attached) { + CERROR("Device %d not attached\n", obd->obd_minor); + return -ENODEV; + } + + if (obd->obd_set_up) { + CERROR("Device %d already setup (type %s)\n", + obd->obd_minor, obd->obd_type->typ_name); + return -EEXIST; + } + + /* is someone else setting us up right now? (attach inits spinlock) */ + spin_lock(&obd->obd_dev_lock); + if (obd->obd_starting) { + spin_unlock(&obd->obd_dev_lock); + CERROR("Device %d setup in progress (type %s)\n", + obd->obd_minor, obd->obd_type->typ_name); + return -EEXIST; + } + /* just leave this on forever. I can't use obd_set_up here because + other fns check that status, and we're not actually set up yet. */ + obd->obd_starting = 1; + obd->obd_uuid_hash = NULL; + obd->obd_nid_hash = NULL; + obd->obd_nid_stats_hash = NULL; + spin_unlock(&obd->obd_dev_lock); + + /* create an uuid-export lustre hash */ + obd->obd_uuid_hash = cfs_hash_create("UUID_HASH", + HASH_UUID_CUR_BITS, + HASH_UUID_MAX_BITS, + HASH_UUID_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &uuid_hash_ops, CFS_HASH_DEFAULT); + if (!obd->obd_uuid_hash) { + err = -ENOMEM; + goto err_hash; + } + + /* create a nid-export lustre hash */ + obd->obd_nid_hash = cfs_hash_create("NID_HASH", + HASH_NID_CUR_BITS, + HASH_NID_MAX_BITS, + HASH_NID_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &nid_hash_ops, CFS_HASH_DEFAULT); + if (!obd->obd_nid_hash) { + err = -ENOMEM; + goto err_hash; + } + + /* create a nid-stats lustre hash */ + obd->obd_nid_stats_hash = cfs_hash_create("NID_STATS", + HASH_NID_STATS_CUR_BITS, + HASH_NID_STATS_MAX_BITS, + HASH_NID_STATS_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &nid_stat_hash_ops, CFS_HASH_DEFAULT); + if (!obd->obd_nid_stats_hash) { + err = -ENOMEM; + goto err_hash; + } + + exp = class_new_export(obd, &obd->obd_uuid); + if (IS_ERR(exp)) { + err = PTR_ERR(exp); + goto err_hash; + } + + obd->obd_self_export = exp; + list_del_init(&exp->exp_obd_chain_timed); + class_export_put(exp); + + err = obd_setup(obd, lcfg); + if (err) + goto err_exp; + + obd->obd_set_up = 1; + + spin_lock(&obd->obd_dev_lock); + /* cleanup drops this */ + class_incref(obd, "setup", obd); + spin_unlock(&obd->obd_dev_lock); + + CDEBUG(D_IOCTL, "finished setup of obd %s (uuid %s)\n", + obd->obd_name, obd->obd_uuid.uuid); + + return 0; +err_exp: + if (obd->obd_self_export) { + class_unlink_export(obd->obd_self_export); + obd->obd_self_export = NULL; + } +err_hash: + if (obd->obd_uuid_hash) { + cfs_hash_putref(obd->obd_uuid_hash); + obd->obd_uuid_hash = NULL; + } + if (obd->obd_nid_hash) { + cfs_hash_putref(obd->obd_nid_hash); + obd->obd_nid_hash = NULL; + } + if (obd->obd_nid_stats_hash) { + cfs_hash_putref(obd->obd_nid_stats_hash); + obd->obd_nid_stats_hash = NULL; + } + obd->obd_starting = 0; + CERROR("setup %s failed (%d)\n", obd->obd_name, err); + return err; +} +EXPORT_SYMBOL(class_setup); + +/** We have finished using this obd and are ready to destroy it. + * There can be no more references to this obd. 
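+ *
+ * Returns -EBUSY while the device is still set up, so class_cleanup()
+ * must have run first; otherwise obd_attached is cleared and the "attach"
+ * reference taken in class_attach() is dropped.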
+ */ +int class_detach(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + if (obd->obd_set_up) { + CERROR("OBD device %d still set up\n", obd->obd_minor); + return -EBUSY; + } + + spin_lock(&obd->obd_dev_lock); + if (!obd->obd_attached) { + spin_unlock(&obd->obd_dev_lock); + CERROR("OBD device %d not attached\n", obd->obd_minor); + return -ENODEV; + } + obd->obd_attached = 0; + spin_unlock(&obd->obd_dev_lock); + + CDEBUG(D_IOCTL, "detach on obd %s (uuid %s)\n", + obd->obd_name, obd->obd_uuid.uuid); + + class_decref(obd, "attach", obd); + return 0; +} +EXPORT_SYMBOL(class_detach); + +/** Start shutting down the obd. There may be in-progress ops when + * this is called. We tell them to start shutting down with a call + * to class_disconnect_exports(). + */ +int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + int err = 0; + char *flag; + + OBD_RACE(OBD_FAIL_LDLM_RECOV_CLIENTS); + + if (!obd->obd_set_up) { + CERROR("Device %d not setup\n", obd->obd_minor); + return -ENODEV; + } + + spin_lock(&obd->obd_dev_lock); + if (obd->obd_stopping) { + spin_unlock(&obd->obd_dev_lock); + CERROR("OBD %d already stopping\n", obd->obd_minor); + return -ENODEV; + } + /* Leave this on forever */ + obd->obd_stopping = 1; + + /* wait for already-arrived-connections to finish. */ + while (obd->obd_conn_inprogress > 0) { + spin_unlock(&obd->obd_dev_lock); + + cond_resched(); + + spin_lock(&obd->obd_dev_lock); + } + spin_unlock(&obd->obd_dev_lock); + + if (lcfg->lcfg_bufcount >= 2 && LUSTRE_CFG_BUFLEN(lcfg, 1) > 0) { + for (flag = lustre_cfg_string(lcfg, 1); *flag != 0; flag++) + switch (*flag) { + case 'F': + obd->obd_force = 1; + break; + case 'A': + LCONSOLE_WARN("Failing over %s\n", + obd->obd_name); + obd->obd_fail = 1; + obd->obd_no_transno = 1; + obd->obd_no_recov = 1; + if (OBP(obd, iocontrol)) { + obd_iocontrol(OBD_IOC_SYNC, + obd->obd_self_export, + 0, NULL, NULL); + } + break; + default: + CERROR("Unrecognised flag '%c'\n", *flag); + } + } + + LASSERT(obd->obd_self_export); + + /* The three references that should be remaining are the + * obd_self_export and the attach and setup references. */ + if (atomic_read(&obd->obd_refcount) > 3) { + /* refcount - 3 might be the number of real exports + (excluding self export). But class_incref is called + by other things as well, so don't count on it. */ + CDEBUG(D_IOCTL, "%s: forcing exports to disconnect: %d\n", + obd->obd_name, atomic_read(&obd->obd_refcount) - 3); + dump_exports(obd, 0); + class_disconnect_exports(obd); + } + + /* Precleanup, we must make sure all exports get destroyed. 
*/ + err = obd_precleanup(obd, OBD_CLEANUP_EXPORTS); + if (err) + CERROR("Precleanup %s returned %d\n", + obd->obd_name, err); + + /* destroy an uuid-export hash body */ + if (obd->obd_uuid_hash) { + cfs_hash_putref(obd->obd_uuid_hash); + obd->obd_uuid_hash = NULL; + } + + /* destroy a nid-export hash body */ + if (obd->obd_nid_hash) { + cfs_hash_putref(obd->obd_nid_hash); + obd->obd_nid_hash = NULL; + } + + /* destroy a nid-stats hash body */ + if (obd->obd_nid_stats_hash) { + cfs_hash_putref(obd->obd_nid_stats_hash); + obd->obd_nid_stats_hash = NULL; + } + + class_decref(obd, "setup", obd); + obd->obd_set_up = 0; + + return 0; +} +EXPORT_SYMBOL(class_cleanup); + +struct obd_device *class_incref(struct obd_device *obd, + const char *scope, const void *source) +{ + lu_ref_add_atomic(&obd->obd_reference, scope, source); + atomic_inc(&obd->obd_refcount); + CDEBUG(D_INFO, "incref %s (%p) now %d\n", obd->obd_name, obd, + atomic_read(&obd->obd_refcount)); + + return obd; +} +EXPORT_SYMBOL(class_incref); + +void class_decref(struct obd_device *obd, const char *scope, const void *source) +{ + int err; + int refs; + + spin_lock(&obd->obd_dev_lock); + atomic_dec(&obd->obd_refcount); + refs = atomic_read(&obd->obd_refcount); + spin_unlock(&obd->obd_dev_lock); + lu_ref_del(&obd->obd_reference, scope, source); + + CDEBUG(D_INFO, "Decref %s (%p) now %d\n", obd->obd_name, obd, refs); + + if ((refs == 1) && obd->obd_stopping) { + /* All exports have been destroyed; there should + be no more in-progress ops by this point.*/ + + spin_lock(&obd->obd_self_export->exp_lock); + obd->obd_self_export->exp_flags |= exp_flags_from_obd(obd); + spin_unlock(&obd->obd_self_export->exp_lock); + + /* note that we'll recurse into class_decref again */ + class_unlink_export(obd->obd_self_export); + return; + } + + if (refs == 0) { + CDEBUG(D_CONFIG, "finishing cleanup of obd %s (%s)\n", + obd->obd_name, obd->obd_uuid.uuid); + LASSERT(!obd->obd_attached); + if (obd->obd_stopping) { + /* If we're not stopping, we were never set up */ + err = obd_cleanup(obd); + if (err) + CERROR("Cleanup %s returned %d\n", + obd->obd_name, err); + } + if (OBP(obd, detach)) { + err = OBP(obd, detach)(obd); + if (err) + CERROR("Detach returned %d\n", err); + } + class_release_dev(obd); + } +} +EXPORT_SYMBOL(class_decref); + +/** Add a failover nid location. + * Client obd types contact server obd types using this nid list. + */ +int class_add_conn(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct obd_import *imp; + struct obd_uuid uuid; + int rc; + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1 || + LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(struct obd_uuid)) { + CERROR("invalid conn_uuid\n"); + return -EINVAL; + } + if (strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_OSP_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_LWP_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_MGC_NAME)) { + CERROR("can't add connection on non-client dev\n"); + return -EINVAL; + } + + imp = obd->u.cli.cl_import; + if (!imp) { + CERROR("try to add conn on immature client dev\n"); + return -EINVAL; + } + + obd_str2uuid(&uuid, lustre_cfg_string(lcfg, 1)); + rc = obd_add_conn(imp, &uuid, lcfg->lcfg_num); + + return rc; +} +EXPORT_SYMBOL(class_add_conn); + +/** Remove a failover nid location. 
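+ * Unlike class_add_conn(), only mdc and osc client devices are accepted.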
+ */ +int class_del_conn(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct obd_import *imp; + struct obd_uuid uuid; + int rc; + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1 || + LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(struct obd_uuid)) { + CERROR("invalid conn_uuid\n"); + return -EINVAL; + } + if (strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME)) { + CERROR("can't del connection on non-client dev\n"); + return -EINVAL; + } + + imp = obd->u.cli.cl_import; + if (!imp) { + CERROR("try to del conn on immature client dev\n"); + return -EINVAL; + } + + obd_str2uuid(&uuid, lustre_cfg_string(lcfg, 1)); + rc = obd_del_conn(imp, &uuid); + + return rc; +} + +LIST_HEAD(lustre_profile_list); + +struct lustre_profile *class_get_profile(const char *prof) +{ + struct lustre_profile *lprof; + + list_for_each_entry(lprof, &lustre_profile_list, lp_list) { + if (!strcmp(lprof->lp_profile, prof)) { + return lprof; + } + } + return NULL; +} +EXPORT_SYMBOL(class_get_profile); + +/** Create a named "profile". + * This defines the mdc and osc names to use for a client. + * This also is used to define the lov to be used by a mdt. + */ +int class_add_profile(int proflen, char *prof, int osclen, char *osc, + int mdclen, char *mdc) +{ + struct lustre_profile *lprof; + int err = 0; + + CDEBUG(D_CONFIG, "Add profile %s\n", prof); + + OBD_ALLOC(lprof, sizeof(*lprof)); + if (lprof == NULL) + return -ENOMEM; + INIT_LIST_HEAD(&lprof->lp_list); + + LASSERT(proflen == (strlen(prof) + 1)); + OBD_ALLOC(lprof->lp_profile, proflen); + if (lprof->lp_profile == NULL) { + err = -ENOMEM; + goto out; + } + memcpy(lprof->lp_profile, prof, proflen); + + LASSERT(osclen == (strlen(osc) + 1)); + OBD_ALLOC(lprof->lp_dt, osclen); + if (lprof->lp_dt == NULL) { + err = -ENOMEM; + goto out; + } + memcpy(lprof->lp_dt, osc, osclen); + + if (mdclen > 0) { + LASSERT(mdclen == (strlen(mdc) + 1)); + OBD_ALLOC(lprof->lp_md, mdclen); + if (lprof->lp_md == NULL) { + err = -ENOMEM; + goto out; + } + memcpy(lprof->lp_md, mdc, mdclen); + } + + list_add(&lprof->lp_list, &lustre_profile_list); + return err; + +out: + if (lprof->lp_md) + OBD_FREE(lprof->lp_md, mdclen); + if (lprof->lp_dt) + OBD_FREE(lprof->lp_dt, osclen); + if (lprof->lp_profile) + OBD_FREE(lprof->lp_profile, proflen); + OBD_FREE(lprof, sizeof(*lprof)); + return err; +} + +void class_del_profile(const char *prof) +{ + struct lustre_profile *lprof; + + CDEBUG(D_CONFIG, "Del profile %s\n", prof); + + lprof = class_get_profile(prof); + if (lprof) { + list_del(&lprof->lp_list); + OBD_FREE(lprof->lp_profile, strlen(lprof->lp_profile) + 1); + OBD_FREE(lprof->lp_dt, strlen(lprof->lp_dt) + 1); + if (lprof->lp_md) + OBD_FREE(lprof->lp_md, strlen(lprof->lp_md) + 1); + OBD_FREE(lprof, sizeof(*lprof)); + } +} +EXPORT_SYMBOL(class_del_profile); + +/* COMPAT_146 */ +void class_del_profiles(void) +{ + struct lustre_profile *lprof, *n; + + list_for_each_entry_safe(lprof, n, &lustre_profile_list, lp_list) { + list_del(&lprof->lp_list); + OBD_FREE(lprof->lp_profile, strlen(lprof->lp_profile) + 1); + OBD_FREE(lprof->lp_dt, strlen(lprof->lp_dt) + 1); + if (lprof->lp_md) + OBD_FREE(lprof->lp_md, strlen(lprof->lp_md) + 1); + OBD_FREE(lprof, sizeof(*lprof)); + } +} +EXPORT_SYMBOL(class_del_profiles); + +static int class_set_global(char *ptr, int val, struct lustre_cfg *lcfg) +{ + if (class_match_param(ptr, PARAM_AT_MIN, NULL) == 0) + at_min = val; + else if (class_match_param(ptr, PARAM_AT_MAX, NULL) == 0) + at_max = val; + else if (class_match_param(ptr, 
PARAM_AT_EXTRA, NULL) == 0) + at_extra = val; + else if (class_match_param(ptr, PARAM_AT_EARLY_MARGIN, NULL) == 0) + at_early_margin = val; + else if (class_match_param(ptr, PARAM_AT_HISTORY, NULL) == 0) + at_history = val; + else if (class_match_param(ptr, PARAM_JOBID_VAR, NULL) == 0) + strlcpy(obd_jobid_var, lustre_cfg_string(lcfg, 2), + JOBSTATS_JOBID_VAR_MAX_LEN + 1); + else + return -EINVAL; + + CDEBUG(D_IOCTL, "global %s = %d\n", ptr, val); + return 0; +} + + +/* We can't call ll_process_config or lquota_process_config directly because + * it lives in a module that must be loaded after this one. */ +static int (*client_process_config)(struct lustre_cfg *lcfg) = NULL; +static int (*quota_process_config)(struct lustre_cfg *lcfg) = NULL; + +void lustre_register_client_process_config(int (*cpc)(struct lustre_cfg *lcfg)) +{ + client_process_config = cpc; +} +EXPORT_SYMBOL(lustre_register_client_process_config); + +/** + * Rename the proc parameter in \a cfg with a new name \a new_name. + * + * \param cfg config structure which contains the proc parameter + * \param new_name new name of the proc parameter + * + * \retval valid-pointer pointer to the newly-allocated config structure + * which contains the renamed proc parameter + * \retval ERR_PTR(-EINVAL) if \a cfg or \a new_name is NULL, or \a cfg does + * not contain a proc parameter + * \retval ERR_PTR(-ENOMEM) if memory allocation failure occurs + */ +struct lustre_cfg *lustre_cfg_rename(struct lustre_cfg *cfg, + const char *new_name) +{ + struct lustre_cfg_bufs *bufs = NULL; + struct lustre_cfg *new_cfg = NULL; + char *param = NULL; + char *new_param = NULL; + char *value = NULL; + int name_len = 0; + int new_len = 0; + + if (cfg == NULL || new_name == NULL) + return ERR_PTR(-EINVAL); + + param = lustre_cfg_string(cfg, 1); + if (param == NULL) + return ERR_PTR(-EINVAL); + + value = strchr(param, '='); + if (value == NULL) + name_len = strlen(param); + else + name_len = value - param; + + new_len = LUSTRE_CFG_BUFLEN(cfg, 1) + strlen(new_name) - name_len; + + OBD_ALLOC(new_param, new_len); + if (new_param == NULL) + return ERR_PTR(-ENOMEM); + + strcpy(new_param, new_name); + if (value != NULL) + strcat(new_param, value); + + OBD_ALLOC_PTR(bufs); + if (bufs == NULL) { + OBD_FREE(new_param, new_len); + return ERR_PTR(-ENOMEM); + } + + lustre_cfg_bufs_reset(bufs, NULL); + lustre_cfg_bufs_init(bufs, cfg); + lustre_cfg_bufs_set_string(bufs, 1, new_param); + + new_cfg = lustre_cfg_new(cfg->lcfg_command, bufs); + + OBD_FREE(new_param, new_len); + OBD_FREE_PTR(bufs); + if (new_cfg == NULL) + return ERR_PTR(-ENOMEM); + + new_cfg->lcfg_num = cfg->lcfg_num; + new_cfg->lcfg_flags = cfg->lcfg_flags; + new_cfg->lcfg_nid = cfg->lcfg_nid; + new_cfg->lcfg_nal = cfg->lcfg_nal; + + return new_cfg; +} +EXPORT_SYMBOL(lustre_cfg_rename); + +static int process_param2_config(struct lustre_cfg *lcfg) +{ + char *param = lustre_cfg_string(lcfg, 1); + char *upcall = lustre_cfg_string(lcfg, 2); + char *argv[] = { + [0] = "/usr/sbin/lctl", + [1] = "set_param", + [2] = param, + [3] = NULL + }; + struct timeval start; + struct timeval end; + int rc; + + + /* Add upcall processing here. 
Now only lctl is supported */ + if (strcmp(upcall, LCTL_UPCALL) != 0) { + CERROR("Unsupported upcall %s\n", upcall); + return -EINVAL; + } + + do_gettimeofday(&start); + rc = call_usermodehelper(argv[0], argv, NULL, 1); + do_gettimeofday(&end); + + if (rc < 0) { + CERROR( + "lctl: error invoking upcall %s %s %s: rc = %d; time %ldus\n", + argv[0], argv[1], argv[2], rc, + cfs_timeval_sub(&end, &start, NULL)); + } else { + CDEBUG(D_HA, "lctl: invoked upcall %s %s %s, time %ldus\n", + argv[0], argv[1], argv[2], + cfs_timeval_sub(&end, &start, NULL)); + rc = 0; + } + + return rc; +} + +void lustre_register_quota_process_config(int (*qpc)(struct lustre_cfg *lcfg)) +{ + quota_process_config = qpc; +} +EXPORT_SYMBOL(lustre_register_quota_process_config); + +/** Process configuration commands given in lustre_cfg form. + * These may come from direct calls (e.g. class_manual_cleanup) + * or processing the config llog, or ioctl from lctl. + */ +int class_process_config(struct lustre_cfg *lcfg) +{ + struct obd_device *obd; + int err; + + LASSERT(lcfg && !IS_ERR(lcfg)); + CDEBUG(D_IOCTL, "processing cmd: %x\n", lcfg->lcfg_command); + + /* Commands that don't need a device */ + switch (lcfg->lcfg_command) { + case LCFG_ATTACH: { + err = class_attach(lcfg); + goto out; + } + case LCFG_ADD_UUID: { + CDEBUG(D_IOCTL, "adding mapping from uuid %s to nid %#llx (%s)\n", + lustre_cfg_string(lcfg, 1), lcfg->lcfg_nid, + libcfs_nid2str(lcfg->lcfg_nid)); + + err = class_add_uuid(lustre_cfg_string(lcfg, 1), lcfg->lcfg_nid); + goto out; + } + case LCFG_DEL_UUID: { + CDEBUG(D_IOCTL, "removing mappings for uuid %s\n", + (lcfg->lcfg_bufcount < 2 || LUSTRE_CFG_BUFLEN(lcfg, 1) == 0) + ? "" : lustre_cfg_string(lcfg, 1)); + + err = class_del_uuid(lustre_cfg_string(lcfg, 1)); + goto out; + } + case LCFG_MOUNTOPT: { + CDEBUG(D_IOCTL, "mountopt: profile %s osc %s mdc %s\n", + lustre_cfg_string(lcfg, 1), + lustre_cfg_string(lcfg, 2), + lustre_cfg_string(lcfg, 3)); + /* set these mount options somewhere, so ll_fill_super + * can find them. 
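+ * A profile maps the client profile name (buffer 1) to its osc (data)
+ * device name (buffer 2) and optional mdc (metadata) device name
+ * (buffer 3), which class_get_profile() can later look up by name.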
*/ + err = class_add_profile(LUSTRE_CFG_BUFLEN(lcfg, 1), + lustre_cfg_string(lcfg, 1), + LUSTRE_CFG_BUFLEN(lcfg, 2), + lustre_cfg_string(lcfg, 2), + LUSTRE_CFG_BUFLEN(lcfg, 3), + lustre_cfg_string(lcfg, 3)); + goto out; + } + case LCFG_DEL_MOUNTOPT: { + CDEBUG(D_IOCTL, "mountopt: profile %s\n", + lustre_cfg_string(lcfg, 1)); + class_del_profile(lustre_cfg_string(lcfg, 1)); + err = 0; + goto out; + } + case LCFG_SET_TIMEOUT: { + CDEBUG(D_IOCTL, "changing lustre timeout from %d to %d\n", + obd_timeout, lcfg->lcfg_num); + obd_timeout = max(lcfg->lcfg_num, 1U); + obd_timeout_set = 1; + err = 0; + goto out; + } + case LCFG_SET_LDLM_TIMEOUT: { + CDEBUG(D_IOCTL, "changing lustre ldlm_timeout from %d to %d\n", + ldlm_timeout, lcfg->lcfg_num); + ldlm_timeout = max(lcfg->lcfg_num, 1U); + if (ldlm_timeout >= obd_timeout) + ldlm_timeout = max(obd_timeout / 3, 1U); + ldlm_timeout_set = 1; + err = 0; + goto out; + } + case LCFG_SET_UPCALL: { + LCONSOLE_ERROR_MSG(0x15a, "recovery upcall is deprecated\n"); + /* COMPAT_146 Don't fail on old configs */ + err = 0; + goto out; + } + case LCFG_MARKER: { + struct cfg_marker *marker; + marker = lustre_cfg_buf(lcfg, 1); + CDEBUG(D_IOCTL, "marker %d (%#x) %.16s %s\n", marker->cm_step, + marker->cm_flags, marker->cm_tgtname, marker->cm_comment); + err = 0; + goto out; + } + case LCFG_PARAM: { + char *tmp; + /* llite has no obd */ + if ((class_match_param(lustre_cfg_string(lcfg, 1), + PARAM_LLITE, NULL) == 0) && + client_process_config) { + err = (*client_process_config)(lcfg); + goto out; + } else if ((class_match_param(lustre_cfg_string(lcfg, 1), + PARAM_SYS, &tmp) == 0)) { + /* Global param settings */ + err = class_set_global(tmp, lcfg->lcfg_num, lcfg); + /* + * Client or server should not fail to mount if + * it hits an unknown configuration parameter. 
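+ * An error from class_set_global() is therefore reported with a warning
+ * below and then discarded.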
+ */ + if (err != 0) + CWARN("Ignoring unknown param %s\n", tmp); + + err = 0; + goto out; + } else if ((class_match_param(lustre_cfg_string(lcfg, 1), + PARAM_QUOTA, &tmp) == 0) && + quota_process_config) { + err = (*quota_process_config)(lcfg); + goto out; + } + + break; + } + case LCFG_SET_PARAM: { + err = process_param2_config(lcfg); + goto out; + } + } + /* Commands that require a device */ + obd = class_name2obd(lustre_cfg_string(lcfg, 0)); + if (obd == NULL) { + if (!LUSTRE_CFG_BUFLEN(lcfg, 0)) + CERROR("this lcfg command requires a device name\n"); + else + CERROR("no device for: %s\n", + lustre_cfg_string(lcfg, 0)); + + err = -EINVAL; + goto out; + } + + switch (lcfg->lcfg_command) { + case LCFG_SETUP: { + err = class_setup(obd, lcfg); + goto out; + } + case LCFG_DETACH: { + err = class_detach(obd, lcfg); + err = 0; + goto out; + } + case LCFG_CLEANUP: { + err = class_cleanup(obd, lcfg); + err = 0; + goto out; + } + case LCFG_ADD_CONN: { + err = class_add_conn(obd, lcfg); + err = 0; + goto out; + } + case LCFG_DEL_CONN: { + err = class_del_conn(obd, lcfg); + err = 0; + goto out; + } + case LCFG_POOL_NEW: { + err = obd_pool_new(obd, lustre_cfg_string(lcfg, 2)); + err = 0; + goto out; + } + case LCFG_POOL_ADD: { + err = obd_pool_add(obd, lustre_cfg_string(lcfg, 2), + lustre_cfg_string(lcfg, 3)); + err = 0; + goto out; + } + case LCFG_POOL_REM: { + err = obd_pool_rem(obd, lustre_cfg_string(lcfg, 2), + lustre_cfg_string(lcfg, 3)); + err = 0; + goto out; + } + case LCFG_POOL_DEL: { + err = obd_pool_del(obd, lustre_cfg_string(lcfg, 2)); + err = 0; + goto out; + } + default: { + err = obd_process_config(obd, sizeof(*lcfg), lcfg); + goto out; + + } + } +out: + if ((err < 0) && !(lcfg->lcfg_command & LCFG_REQUIRED)) { + CWARN("Ignoring error %d on optional command %#x\n", err, + lcfg->lcfg_command); + err = 0; + } + return err; +} +EXPORT_SYMBOL(class_process_config); + +int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars, + struct lustre_cfg *lcfg, void *data) +{ + struct lprocfs_vars *var; + struct file fakefile; + struct seq_file fake_seqfile; + char *key, *sval; + int i, keylen, vallen; + int matched = 0, j = 0; + int rc = 0; + int skip = 0; + + if (lcfg->lcfg_command != LCFG_PARAM) { + CERROR("Unknown command: %d\n", lcfg->lcfg_command); + return -EINVAL; + } + + /* fake a seq file so that var->fops->write can work... */ + fakefile.private_data = &fake_seqfile; + fake_seqfile.private = data; + /* e.g. 
tunefs.lustre --param mdt.group_upcall=foo /r/tmp/lustre-mdt + or lctl conf_param lustre-MDT0000.mdt.group_upcall=bar + or lctl conf_param lustre-OST0000.osc.max_dirty_mb=36 */ + for (i = 1; i < lcfg->lcfg_bufcount; i++) { + key = lustre_cfg_buf(lcfg, i); + /* Strip off prefix */ + class_match_param(key, prefix, &key); + sval = strchr(key, '='); + if (!sval || (*(sval + 1) == 0)) { + CERROR("Can't parse param %s (missing '=')\n", key); + /* rc = -EINVAL; continue parsing other params */ + continue; + } + keylen = sval - key; + sval++; + vallen = strlen(sval); + matched = 0; + j = 0; + /* Search proc entries */ + while (lvars[j].name) { + var = &lvars[j]; + if (class_match_param(key, (char *)var->name, NULL) == 0 + && keylen == strlen(var->name)) { + matched++; + rc = -EROFS; + if (var->fops && var->fops->write) { + mm_segment_t oldfs; + oldfs = get_fs(); + set_fs(KERNEL_DS); + rc = (var->fops->write)(&fakefile, sval, + vallen, NULL); + set_fs(oldfs); + } + break; + } + j++; + } + if (!matched) { + /* If the prefix doesn't match, return error so we + can pass it down the stack */ + if (strnchr(key, keylen, '.')) + return -ENOSYS; + CERROR("%s: unknown param %s\n", + (char *)lustre_cfg_string(lcfg, 0), key); + /* rc = -EINVAL; continue parsing other params */ + skip++; + } else if (rc < 0) { + CERROR("writing proc entry %s err %d\n", + var->name, rc); + rc = 0; + } else { + CDEBUG(D_CONFIG, "%s.%.*s: Set parameter %.*s=%s\n", + lustre_cfg_string(lcfg, 0), + (int)strlen(prefix) - 1, prefix, + (int)(sval - key - 1), key, sval); + } + } + + if (rc > 0) + rc = 0; + if (!rc && skip) + rc = skip; + return rc; +} +EXPORT_SYMBOL(class_process_proc_param); + +extern int lustre_check_exclusion(struct super_block *sb, char *svname); + +/** Parse a configuration llog, doing various manipulations on them + * for various reasons, (modifications for compatibility, skip obsolete + * records, change uuids, etc), then class_process_config() resulting + * net records. 
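+ *
+ * LCFG_MARKER records drive the skip/exclude state kept in
+ * clli->cfg_flags (CM_START/CM_SKIP/CM_EXCLUDE/CM_END), and for client
+ * instances the device name in buffer 0 is rewritten to
+ * "<name>-<cfg_instance>" so that each mount gets its own obd names.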
+ */ +int class_config_llog_handler(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr *rec, void *data) +{ + struct config_llog_instance *clli = data; + int cfg_len = rec->lrh_len; + char *cfg_buf = (char *) (rec + 1); + int rc = 0; + + //class_config_dump_handler(handle, rec, data); + + switch (rec->lrh_type) { + case OBD_CFG_REC: { + struct lustre_cfg *lcfg, *lcfg_new; + struct lustre_cfg_bufs bufs; + char *inst_name = NULL; + int inst_len = 0; + int inst = 0, swab = 0; + + lcfg = (struct lustre_cfg *)cfg_buf; + if (lcfg->lcfg_version == __swab32(LUSTRE_CFG_VERSION)) { + lustre_swab_lustre_cfg(lcfg); + swab = 1; + } + + rc = lustre_cfg_sanity_check(cfg_buf, cfg_len); + if (rc) + goto out; + + /* Figure out config state info */ + if (lcfg->lcfg_command == LCFG_MARKER) { + struct cfg_marker *marker = lustre_cfg_buf(lcfg, 1); + lustre_swab_cfg_marker(marker, swab, + LUSTRE_CFG_BUFLEN(lcfg, 1)); + CDEBUG(D_CONFIG, "Marker, inst_flg=%#x mark_flg=%#x\n", + clli->cfg_flags, marker->cm_flags); + if (marker->cm_flags & CM_START) { + /* all previous flags off */ + clli->cfg_flags = CFG_F_MARKER; + if (marker->cm_flags & CM_SKIP) { + clli->cfg_flags |= CFG_F_SKIP; + CDEBUG(D_CONFIG, "SKIP #%d\n", + marker->cm_step); + } else if ((marker->cm_flags & CM_EXCLUDE) || + (clli->cfg_sb && + lustre_check_exclusion(clli->cfg_sb, + marker->cm_tgtname))) { + clli->cfg_flags |= CFG_F_EXCLUDE; + CDEBUG(D_CONFIG, "EXCLUDE %d\n", + marker->cm_step); + } + } else if (marker->cm_flags & CM_END) { + clli->cfg_flags = 0; + } + } + /* A config command without a start marker before it is + illegal (post 146) */ + if (!(clli->cfg_flags & CFG_F_COMPAT146) && + !(clli->cfg_flags & CFG_F_MARKER) && + (lcfg->lcfg_command != LCFG_MARKER)) { + CWARN("Config not inside markers, ignoring! (inst: %p, uuid: %s, flags: %#x)\n", + clli->cfg_instance, + clli->cfg_uuid.uuid, clli->cfg_flags); + clli->cfg_flags |= CFG_F_SKIP; + } + if (clli->cfg_flags & CFG_F_SKIP) { + CDEBUG(D_CONFIG, "skipping %#x\n", + clli->cfg_flags); + rc = 0; + /* No processing! */ + break; + } + + /* + * For interoperability between 1.8 and 2.0, + * rename "mds" obd device type to "mdt". 
+ */ + { + char *typename = lustre_cfg_string(lcfg, 1); + char *index = lustre_cfg_string(lcfg, 2); + + if ((lcfg->lcfg_command == LCFG_ATTACH && typename && + strcmp(typename, "mds") == 0)) { + CWARN("For 1.8 interoperability, rename obd type from mds to mdt\n"); + typename[2] = 't'; + } + if ((lcfg->lcfg_command == LCFG_SETUP && index && + strcmp(index, "type") == 0)) { + CDEBUG(D_INFO, "For 1.8 interoperability, set this index to '0'\n"); + index[0] = '0'; + index[1] = 0; + } + } + + + if (clli->cfg_flags & CFG_F_EXCLUDE) { + CDEBUG(D_CONFIG, "cmd: %x marked EXCLUDED\n", + lcfg->lcfg_command); + if (lcfg->lcfg_command == LCFG_LOV_ADD_OBD) + /* Add inactive instead */ + lcfg->lcfg_command = LCFG_LOV_ADD_INA; + } + + lustre_cfg_bufs_init(&bufs, lcfg); + + if (clli && clli->cfg_instance && + LUSTRE_CFG_BUFLEN(lcfg, 0) > 0){ + inst = 1; + inst_len = LUSTRE_CFG_BUFLEN(lcfg, 0) + + sizeof(clli->cfg_instance) * 2 + 4; + OBD_ALLOC(inst_name, inst_len); + if (inst_name == NULL) { + rc = -ENOMEM; + goto out; + } + sprintf(inst_name, "%s-%p", + lustre_cfg_string(lcfg, 0), + clli->cfg_instance); + lustre_cfg_bufs_set_string(&bufs, 0, inst_name); + CDEBUG(D_CONFIG, "cmd %x, instance name: %s\n", + lcfg->lcfg_command, inst_name); + } + + /* we override the llog's uuid for clients, to insure they + are unique */ + if (clli && clli->cfg_instance != NULL && + lcfg->lcfg_command == LCFG_ATTACH) { + lustre_cfg_bufs_set_string(&bufs, 2, + clli->cfg_uuid.uuid); + } + /* + * sptlrpc config record, we expect 2 data segments: + * [0]: fs_name/target_name, + * [1]: rule string + * moving them to index [1] and [2], and insert MGC's + * obdname at index [0]. + */ + if (clli && clli->cfg_instance == NULL && + lcfg->lcfg_command == LCFG_SPTLRPC_CONF) { + lustre_cfg_bufs_set(&bufs, 2, bufs.lcfg_buf[1], + bufs.lcfg_buflen[1]); + lustre_cfg_bufs_set(&bufs, 1, bufs.lcfg_buf[0], + bufs.lcfg_buflen[0]); + lustre_cfg_bufs_set_string(&bufs, 0, + clli->cfg_obdname); + } + + lcfg_new = lustre_cfg_new(lcfg->lcfg_command, &bufs); + + lcfg_new->lcfg_num = lcfg->lcfg_num; + lcfg_new->lcfg_flags = lcfg->lcfg_flags; + + /* XXX Hack to try to remain binary compatible with + * pre-newconfig logs */ + if (lcfg->lcfg_nal != 0 && /* pre-newconfig log? 
*/ + (lcfg->lcfg_nid >> 32) == 0) { + __u32 addr = (__u32)(lcfg->lcfg_nid & 0xffffffff); + + lcfg_new->lcfg_nid = + LNET_MKNID(LNET_MKNET(lcfg->lcfg_nal, 0), addr); + CWARN("Converted pre-newconfig NAL %d NID %x to %s\n", + lcfg->lcfg_nal, addr, + libcfs_nid2str(lcfg_new->lcfg_nid)); + } else { + lcfg_new->lcfg_nid = lcfg->lcfg_nid; + } + + lcfg_new->lcfg_nal = 0; /* illegal value for obsolete field */ + + rc = class_process_config(lcfg_new); + lustre_cfg_free(lcfg_new); + + if (inst) + OBD_FREE(inst_name, inst_len); + break; + } + default: + CERROR("Unknown llog record type %#x encountered\n", + rec->lrh_type); + break; + } +out: + if (rc) { + CERROR("%s: cfg command failed: rc = %d\n", + handle->lgh_ctxt->loc_obd->obd_name, rc); + class_config_dump_handler(NULL, handle, rec, data); + } + return rc; +} +EXPORT_SYMBOL(class_config_llog_handler); + +int class_config_parse_llog(const struct lu_env *env, struct llog_ctxt *ctxt, + char *name, struct config_llog_instance *cfg) +{ + struct llog_process_cat_data cd = {0, 0}; + struct llog_handle *llh; + llog_cb_t callback; + int rc; + + CDEBUG(D_INFO, "looking up llog %s\n", name); + rc = llog_open(env, ctxt, &llh, NULL, name, LLOG_OPEN_EXISTS); + if (rc) + return rc; + + rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL); + if (rc) + goto parse_out; + + /* continue processing from where we last stopped to end-of-log */ + if (cfg) { + cd.lpcd_first_idx = cfg->cfg_last_idx; + callback = cfg->cfg_callback; + LASSERT(callback != NULL); + } else { + callback = class_config_llog_handler; + } + + cd.lpcd_last_idx = 0; + + rc = llog_process(env, llh, callback, cfg, &cd); + + CDEBUG(D_CONFIG, "Processed log %s gen %d-%d (rc=%d)\n", name, + cd.lpcd_first_idx + 1, cd.lpcd_last_idx, rc); + if (cfg) + cfg->cfg_last_idx = cd.lpcd_last_idx; + +parse_out: + llog_close(env, llh); + return rc; +} +EXPORT_SYMBOL(class_config_parse_llog); + +/** + * parse config record and output dump in supplied buffer. 
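+ * The dump is a sequence of "key=value" tokens followed by either the
+ * marker contents or the config buffers; the return value is the number
+ * of bytes written into \a buf.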
+ * This is separated from class_config_dump_handler() to use + * for ioctl needs as well + */ +int class_config_parse_rec(struct llog_rec_hdr *rec, char *buf, int size) +{ + struct lustre_cfg *lcfg = (struct lustre_cfg *)(rec + 1); + char *ptr = buf; + char *end = buf + size; + int rc = 0; + + LASSERT(rec->lrh_type == OBD_CFG_REC); + rc = lustre_cfg_sanity_check(lcfg, rec->lrh_len); + if (rc < 0) + return rc; + + ptr += snprintf(ptr, end-ptr, "cmd=%05x ", lcfg->lcfg_command); + if (lcfg->lcfg_flags) + ptr += snprintf(ptr, end-ptr, "flags=%#08x ", + lcfg->lcfg_flags); + + if (lcfg->lcfg_num) + ptr += snprintf(ptr, end-ptr, "num=%#08x ", lcfg->lcfg_num); + + if (lcfg->lcfg_nid) + ptr += snprintf(ptr, end-ptr, "nid=%s(%#llx)\n ", + libcfs_nid2str(lcfg->lcfg_nid), + lcfg->lcfg_nid); + + if (lcfg->lcfg_command == LCFG_MARKER) { + struct cfg_marker *marker = lustre_cfg_buf(lcfg, 1); + + ptr += snprintf(ptr, end-ptr, "marker=%d(%#x)%s '%s'", + marker->cm_step, marker->cm_flags, + marker->cm_tgtname, marker->cm_comment); + } else { + int i; + + for (i = 0; i < lcfg->lcfg_bufcount; i++) { + ptr += snprintf(ptr, end-ptr, "%d:%s ", i, + lustre_cfg_string(lcfg, i)); + } + } + /* return consumed bytes */ + rc = ptr - buf; + return rc; +} + +int class_config_dump_handler(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr *rec, void *data) +{ + char *outstr; + int rc = 0; + + OBD_ALLOC(outstr, 256); + if (outstr == NULL) + return -ENOMEM; + + if (rec->lrh_type == OBD_CFG_REC) { + class_config_parse_rec(rec, outstr, 256); + LCONSOLE(D_WARNING, " %s\n", outstr); + } else { + LCONSOLE(D_WARNING, "unhandled lrh_type: %#x\n", rec->lrh_type); + rc = -EINVAL; + } + + OBD_FREE(outstr, 256); + return rc; +} + +int class_config_dump_llog(const struct lu_env *env, struct llog_ctxt *ctxt, + char *name, struct config_llog_instance *cfg) +{ + struct llog_handle *llh; + int rc; + + LCONSOLE_INFO("Dumping config log %s\n", name); + + rc = llog_open(env, ctxt, &llh, NULL, name, LLOG_OPEN_EXISTS); + if (rc) + return rc; + + rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL); + if (rc) + goto parse_out; + + rc = llog_process(env, llh, class_config_dump_handler, cfg, NULL); +parse_out: + llog_close(env, llh); + + LCONSOLE_INFO("End config log %s\n", name); + return rc; +} +EXPORT_SYMBOL(class_config_dump_llog); + +/** Call class_cleanup and class_detach. + * "Manual" only in the sense that we're faking lcfg commands. 
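+ * The flags buffer passed with LCFG_CLEANUP is built from obd_force ("F")
+ * and obd_fail ("A"), matching what class_cleanup() parses above.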
+ */ +int class_manual_cleanup(struct obd_device *obd) +{ + char flags[3] = ""; + struct lustre_cfg *lcfg; + struct lustre_cfg_bufs bufs; + int rc; + + if (!obd) { + CERROR("empty cleanup\n"); + return -EALREADY; + } + + if (obd->obd_force) + strcat(flags, "F"); + if (obd->obd_fail) + strcat(flags, "A"); + + CDEBUG(D_CONFIG, "Manual cleanup of %s (flags='%s')\n", + obd->obd_name, flags); + + lustre_cfg_bufs_reset(&bufs, obd->obd_name); + lustre_cfg_bufs_set_string(&bufs, 1, flags); + lcfg = lustre_cfg_new(LCFG_CLEANUP, &bufs); + if (!lcfg) + return -ENOMEM; + + rc = class_process_config(lcfg); + if (rc) { + CERROR("cleanup failed %d: %s\n", rc, obd->obd_name); + goto out; + } + + /* the lcfg is almost the same for both ops */ + lcfg->lcfg_command = LCFG_DETACH; + rc = class_process_config(lcfg); + if (rc) + CERROR("detach failed %d: %s\n", rc, obd->obd_name); +out: + lustre_cfg_free(lcfg); + return rc; +} +EXPORT_SYMBOL(class_manual_cleanup); + +/* + * uuid<->export lustre hash operations + */ + +static unsigned +uuid_hash(struct cfs_hash *hs, const void *key, unsigned mask) +{ + return cfs_hash_djb2_hash(((struct obd_uuid *)key)->uuid, + sizeof(((struct obd_uuid *)key)->uuid), mask); +} + +static void * +uuid_key(struct hlist_node *hnode) +{ + struct obd_export *exp; + + exp = hlist_entry(hnode, struct obd_export, exp_uuid_hash); + + return &exp->exp_client_uuid; +} + +/* + * NOTE: It is impossible to find an export that is in failed + * state with this function + */ +static int +uuid_keycmp(const void *key, struct hlist_node *hnode) +{ + struct obd_export *exp; + + LASSERT(key); + exp = hlist_entry(hnode, struct obd_export, exp_uuid_hash); + + return obd_uuid_equals(key, &exp->exp_client_uuid) && + !exp->exp_failed; +} + +static void * +uuid_export_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct obd_export, exp_uuid_hash); +} + +static void +uuid_export_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct obd_export *exp; + + exp = hlist_entry(hnode, struct obd_export, exp_uuid_hash); + class_export_get(exp); +} + +static void +uuid_export_put_locked(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct obd_export *exp; + + exp = hlist_entry(hnode, struct obd_export, exp_uuid_hash); + class_export_put(exp); +} + +static cfs_hash_ops_t uuid_hash_ops = { + .hs_hash = uuid_hash, + .hs_key = uuid_key, + .hs_keycmp = uuid_keycmp, + .hs_object = uuid_export_object, + .hs_get = uuid_export_get, + .hs_put_locked = uuid_export_put_locked, +}; + + +/* + * nid<->export hash operations + */ + +static unsigned +nid_hash(struct cfs_hash *hs, const void *key, unsigned mask) +{ + return cfs_hash_djb2_hash(key, sizeof(lnet_nid_t), mask); +} + +static void * +nid_key(struct hlist_node *hnode) +{ + struct obd_export *exp; + + exp = hlist_entry(hnode, struct obd_export, exp_nid_hash); + + return &exp->exp_connection->c_peer.nid; +} + +/* + * NOTE: It is impossible to find an export that is in failed + * state with this function + */ +static int +nid_kepcmp(const void *key, struct hlist_node *hnode) +{ + struct obd_export *exp; + + LASSERT(key); + exp = hlist_entry(hnode, struct obd_export, exp_nid_hash); + + return exp->exp_connection->c_peer.nid == *(lnet_nid_t *)key && + !exp->exp_failed; +} + +static void * +nid_export_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct obd_export, exp_nid_hash); +} + +static void +nid_export_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct obd_export *exp; + + exp = hlist_entry(hnode, struct 
obd_export, exp_nid_hash); + class_export_get(exp); +} + +static void +nid_export_put_locked(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct obd_export *exp; + + exp = hlist_entry(hnode, struct obd_export, exp_nid_hash); + class_export_put(exp); +} + +static cfs_hash_ops_t nid_hash_ops = { + .hs_hash = nid_hash, + .hs_key = nid_key, + .hs_keycmp = nid_kepcmp, + .hs_object = nid_export_object, + .hs_get = nid_export_get, + .hs_put_locked = nid_export_put_locked, +}; + + +/* + * nid<->nidstats hash operations + */ + +static void * +nidstats_key(struct hlist_node *hnode) +{ + struct nid_stat *ns; + + ns = hlist_entry(hnode, struct nid_stat, nid_hash); + + return &ns->nid; +} + +static int +nidstats_keycmp(const void *key, struct hlist_node *hnode) +{ + return *(lnet_nid_t *)nidstats_key(hnode) == *(lnet_nid_t *)key; +} + +static void * +nidstats_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct nid_stat, nid_hash); +} + +static void +nidstats_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nid_stat *ns; + + ns = hlist_entry(hnode, struct nid_stat, nid_hash); + nidstat_getref(ns); +} + +static void +nidstats_put_locked(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nid_stat *ns; + + ns = hlist_entry(hnode, struct nid_stat, nid_hash); + nidstat_putref(ns); +} + +static cfs_hash_ops_t nid_stat_hash_ops = { + .hs_hash = nid_hash, + .hs_key = nidstats_key, + .hs_keycmp = nidstats_keycmp, + .hs_object = nidstats_object, + .hs_get = nidstats_get, + .hs_put_locked = nidstats_put_locked, +}; diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/obd_mount.c b/kernel/drivers/staging/lustre/lustre/obdclass/obd_mount.c new file mode 100644 index 000000000..3437b2ecf --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/obd_mount.c @@ -0,0 +1,1319 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/obdclass/obd_mount.c + * + * Client mount routines + * + * Author: Nathan Rutman + */ + + +#define DEBUG_SUBSYSTEM S_CLASS +#define D_MOUNT (D_SUPER|D_CONFIG/*|D_WARNING */) +#define PRINT_CMD CDEBUG + +#include "../include/obd.h" +#include "../include/linux/lustre_compat25.h" +#include "../include/obd_class.h" +#include "../include/lustre/lustre_user.h" +#include "../include/lustre_log.h" +#include "../include/lustre_disk.h" +#include "../include/lustre_param.h" + +static int (*client_fill_super)(struct super_block *sb, + struct vfsmount *mnt); + +static void (*kill_super_cb)(struct super_block *sb); + +/**************** config llog ********************/ + +/** Get a config log from the MGS and process it. + * This func is called for both clients and servers. + * Continue to process new statements appended to the logs + * (whenever the config lock is revoked) until lustre_end_log + * is called. + * @param sb The superblock is used by the MGC to write to the local copy of + * the config log + * @param logname The name of the llog to replicate from the MGS + * @param cfg Since the same mgc may be used to follow multiple config logs + * (e.g. ost1, ost2, client), the config_llog_instance keeps the state for + * this log, and is added to the mgc's list of logs to follow. + */ +int lustre_process_log(struct super_block *sb, char *logname, + struct config_llog_instance *cfg) +{ + struct lustre_cfg *lcfg; + struct lustre_cfg_bufs *bufs; + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *mgc = lsi->lsi_mgc; + int rc; + + LASSERT(mgc); + LASSERT(cfg); + + OBD_ALLOC_PTR(bufs); + if (bufs == NULL) + return -ENOMEM; + + /* mgc_process_config */ + lustre_cfg_bufs_reset(bufs, mgc->obd_name); + lustre_cfg_bufs_set_string(bufs, 1, logname); + lustre_cfg_bufs_set(bufs, 2, cfg, sizeof(*cfg)); + lustre_cfg_bufs_set(bufs, 3, &sb, sizeof(sb)); + lcfg = lustre_cfg_new(LCFG_LOG_START, bufs); + rc = obd_process_config(mgc, sizeof(*lcfg), lcfg); + lustre_cfg_free(lcfg); + + OBD_FREE_PTR(bufs); + + if (rc == -EINVAL) + LCONSOLE_ERROR_MSG(0x15b, "%s: The configuration from log '%s' failed from the MGS (%d). Make sure this client and the MGS are running compatible versions of Lustre.\n", + mgc->obd_name, logname, rc); + + if (rc) + LCONSOLE_ERROR_MSG(0x15c, "%s: The configuration from log '%s' failed (%d). This may be the result of communication errors between this node and the MGS, a bad configuration, or other errors. See the syslog for more information.\n", + mgc->obd_name, logname, + rc); + + /* class_obd_list(); */ + return rc; +} +EXPORT_SYMBOL(lustre_process_log); + +/* Stop watching this config log for updates */ +int lustre_end_log(struct super_block *sb, char *logname, + struct config_llog_instance *cfg) +{ + struct lustre_cfg *lcfg; + struct lustre_cfg_bufs bufs; + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *mgc = lsi->lsi_mgc; + int rc; + + if (!mgc) + return -ENOENT; + + /* mgc_process_config */ + lustre_cfg_bufs_reset(&bufs, mgc->obd_name); + lustre_cfg_bufs_set_string(&bufs, 1, logname); + if (cfg) + lustre_cfg_bufs_set(&bufs, 2, cfg, sizeof(*cfg)); + lcfg = lustre_cfg_new(LCFG_LOG_END, &bufs); + rc = obd_process_config(mgc, sizeof(*lcfg), lcfg); + lustre_cfg_free(lcfg); + return rc; +} +EXPORT_SYMBOL(lustre_end_log); + +/**************** obd start *******************/ + +/** lustre_cfg_bufs are a holdover from 1.4; we can still set these up from + * lctl (and do for echo cli/srv. 
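+ * do_lcfg() packs cfgname into buffer 0 and s1..s4 into buffers 1..4,
+ * stamps the nid and hands the record to class_process_config();
+ * lustre_start_simple() uses it to issue LCFG_ATTACH followed by
+ * LCFG_SETUP, detaching again if setup fails.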
+ */ +int do_lcfg(char *cfgname, lnet_nid_t nid, int cmd, + char *s1, char *s2, char *s3, char *s4) +{ + struct lustre_cfg_bufs bufs; + struct lustre_cfg * lcfg = NULL; + int rc; + + CDEBUG(D_TRACE, "lcfg %s %#x %s %s %s %s\n", cfgname, + cmd, s1, s2, s3, s4); + + lustre_cfg_bufs_reset(&bufs, cfgname); + if (s1) + lustre_cfg_bufs_set_string(&bufs, 1, s1); + if (s2) + lustre_cfg_bufs_set_string(&bufs, 2, s2); + if (s3) + lustre_cfg_bufs_set_string(&bufs, 3, s3); + if (s4) + lustre_cfg_bufs_set_string(&bufs, 4, s4); + + lcfg = lustre_cfg_new(cmd, &bufs); + lcfg->lcfg_nid = nid; + rc = class_process_config(lcfg); + lustre_cfg_free(lcfg); + return rc; +} +EXPORT_SYMBOL(do_lcfg); + +/** Call class_attach and class_setup. These methods in turn call + * obd type-specific methods. + */ +int lustre_start_simple(char *obdname, char *type, char *uuid, + char *s1, char *s2, char *s3, char *s4) +{ + int rc; + CDEBUG(D_MOUNT, "Starting obd %s (typ=%s)\n", obdname, type); + + rc = do_lcfg(obdname, 0, LCFG_ATTACH, type, uuid, NULL, NULL); + if (rc) { + CERROR("%s attach error %d\n", obdname, rc); + return rc; + } + rc = do_lcfg(obdname, 0, LCFG_SETUP, s1, s2, s3, s4); + if (rc) { + CERROR("%s setup error %d\n", obdname, rc); + do_lcfg(obdname, 0, LCFG_DETACH, NULL, NULL, NULL, NULL); + } + return rc; +} + +DEFINE_MUTEX(mgc_start_lock); + +/** Set up a mgc obd to process startup logs + * + * \param sb [in] super block of the mgc obd + * + * \retval 0 success, otherwise error code + */ +int lustre_start_mgc(struct super_block *sb) +{ + struct obd_connect_data *data = NULL; + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *obd; + struct obd_export *exp; + struct obd_uuid *uuid; + class_uuid_t uuidc; + lnet_nid_t nid; + char *mgcname = NULL, *niduuid = NULL, *mgssec = NULL; + char *ptr; + int rc = 0, i = 0, j, len; + + LASSERT(lsi->lsi_lmd); + + /* Find the first non-lo MGS nid for our MGC name */ + if (IS_SERVER(lsi)) { + /* mount -o mgsnode=nid */ + ptr = lsi->lsi_lmd->lmd_mgs; + if (lsi->lsi_lmd->lmd_mgs && + (class_parse_nid(lsi->lsi_lmd->lmd_mgs, &nid, &ptr) == 0)) { + i++; + } else if (IS_MGS(lsi)) { + lnet_process_id_t id; + while ((rc = LNetGetId(i++, &id)) != -ENOENT) { + if (LNET_NETTYP(LNET_NIDNET(id.nid)) == LOLND) + continue; + nid = id.nid; + i++; + break; + } + } + } else { /* client */ + /* Use nids from mount line: uml1,1@elan:uml2,2@elan:/lustre */ + ptr = lsi->lsi_lmd->lmd_dev; + if (class_parse_nid(ptr, &nid, &ptr) == 0) + i++; + } + if (i == 0) { + CERROR("No valid MGS nids found.\n"); + return -EINVAL; + } + + mutex_lock(&mgc_start_lock); + + len = strlen(LUSTRE_MGC_OBDNAME) + strlen(libcfs_nid2str(nid)) + 1; + OBD_ALLOC(mgcname, len); + OBD_ALLOC(niduuid, len + 2); + if (!mgcname || !niduuid) { + rc = -ENOMEM; + goto out_free; + } + sprintf(mgcname, "%s%s", LUSTRE_MGC_OBDNAME, libcfs_nid2str(nid)); + + mgssec = lsi->lsi_lmd->lmd_mgssec ? 
lsi->lsi_lmd->lmd_mgssec : ""; + + OBD_ALLOC_PTR(data); + if (data == NULL) { + rc = -ENOMEM; + goto out_free; + } + + obd = class_name2obd(mgcname); + if (obd && !obd->obd_stopping) { + int recov_bk; + + rc = obd_set_info_async(NULL, obd->obd_self_export, + strlen(KEY_MGSSEC), KEY_MGSSEC, + strlen(mgssec), mgssec, NULL); + if (rc) + goto out_free; + + /* Re-using an existing MGC */ + atomic_inc(&obd->u.cli.cl_mgc_refcount); + + /* IR compatibility check, only for clients */ + if (lmd_is_client(lsi->lsi_lmd)) { + int has_ir; + int vallen = sizeof(*data); + __u32 *flags = &lsi->lsi_lmd->lmd_flags; + + rc = obd_get_info(NULL, obd->obd_self_export, + strlen(KEY_CONN_DATA), KEY_CONN_DATA, + &vallen, data, NULL); + LASSERT(rc == 0); + has_ir = OCD_HAS_FLAG(data, IMP_RECOV); + if (has_ir ^ !(*flags & LMD_FLG_NOIR)) { + /* LMD_FLG_NOIR is for test purpose only */ + LCONSOLE_WARN( + "Trying to mount a client with IR setting not compatible with current mgc. Force to use current mgc setting that is IR %s.\n", + has_ir ? "enabled" : "disabled"); + if (has_ir) + *flags &= ~LMD_FLG_NOIR; + else + *flags |= LMD_FLG_NOIR; + } + } + + recov_bk = 0; + /* If we are restarting the MGS, don't try to keep the MGC's + old connection, or registration will fail. */ + if (IS_MGS(lsi)) { + CDEBUG(D_MOUNT, "New MGS with live MGC\n"); + recov_bk = 1; + } + + /* Try all connections, but only once (again). + We don't want to block another target from starting + (using its local copy of the log), but we do want to connect + if at all possible. */ + recov_bk++; + CDEBUG(D_MOUNT, "%s: Set MGC reconnect %d\n", mgcname, + recov_bk); + rc = obd_set_info_async(NULL, obd->obd_self_export, + sizeof(KEY_INIT_RECOV_BACKUP), + KEY_INIT_RECOV_BACKUP, + sizeof(recov_bk), &recov_bk, NULL); + rc = 0; + goto out; + } + + CDEBUG(D_MOUNT, "Start MGC '%s'\n", mgcname); + + /* Add the primary nids for the MGS */ + i = 0; + sprintf(niduuid, "%s_%x", mgcname, i); + if (IS_SERVER(lsi)) { + ptr = lsi->lsi_lmd->lmd_mgs; + if (IS_MGS(lsi)) { + /* Use local nids (including LO) */ + lnet_process_id_t id; + while ((rc = LNetGetId(i++, &id)) != -ENOENT) { + rc = do_lcfg(mgcname, id.nid, + LCFG_ADD_UUID, niduuid, + NULL, NULL, NULL); + } + } else { + /* Use mgsnode= nids */ + /* mount -o mgsnode=nid */ + if (lsi->lsi_lmd->lmd_mgs) { + ptr = lsi->lsi_lmd->lmd_mgs; + } else if (class_find_param(ptr, PARAM_MGSNODE, + &ptr) != 0) { + CERROR("No MGS nids given.\n"); + rc = -EINVAL; + goto out_free; + } + while (class_parse_nid(ptr, &nid, &ptr) == 0) { + rc = do_lcfg(mgcname, nid, + LCFG_ADD_UUID, niduuid, + NULL, NULL, NULL); + i++; + } + } + } else { /* client */ + /* Use nids from mount line: uml1,1@elan:uml2,2@elan:/lustre */ + ptr = lsi->lsi_lmd->lmd_dev; + while (class_parse_nid(ptr, &nid, &ptr) == 0) { + rc = do_lcfg(mgcname, nid, + LCFG_ADD_UUID, niduuid, NULL, NULL, NULL); + i++; + /* Stop at the first failover nid */ + if (*ptr == ':') + break; + } + } + if (i == 0) { + CERROR("No valid MGS nids found.\n"); + rc = -EINVAL; + goto out_free; + } + lsi->lsi_lmd->lmd_mgs_failnodes = 1; + + /* Random uuid for MGC allows easier reconnects */ + OBD_ALLOC_PTR(uuid); + if (!uuid) { + rc = -ENOMEM; + goto out_free; + } + + ll_generate_random_uuid(uuidc); + class_uuid_unparse(uuidc, uuid); + + /* Start the MGC */ + rc = lustre_start_simple(mgcname, LUSTRE_MGC_NAME, + (char *)uuid->uuid, LUSTRE_MGS_OBDNAME, + niduuid, NULL, NULL); + OBD_FREE_PTR(uuid); + if (rc) + goto out_free; + + /* Add any failover MGS nids */ + i = 1; + while (ptr && ((*ptr == ':' || + 
class_find_param(ptr, PARAM_MGSNODE, &ptr) == 0))) { + /* New failover node */ + sprintf(niduuid, "%s_%x", mgcname, i); + j = 0; + while (class_parse_nid_quiet(ptr, &nid, &ptr) == 0) { + j++; + rc = do_lcfg(mgcname, nid, + LCFG_ADD_UUID, niduuid, NULL, NULL, NULL); + if (*ptr == ':') + break; + } + if (j > 0) { + rc = do_lcfg(mgcname, 0, LCFG_ADD_CONN, + niduuid, NULL, NULL, NULL); + i++; + } else { + /* at ":/fsname" */ + break; + } + } + lsi->lsi_lmd->lmd_mgs_failnodes = i; + + obd = class_name2obd(mgcname); + if (!obd) { + CERROR("Can't find mgcobd %s\n", mgcname); + rc = -ENOTCONN; + goto out_free; + } + + rc = obd_set_info_async(NULL, obd->obd_self_export, + strlen(KEY_MGSSEC), KEY_MGSSEC, + strlen(mgssec), mgssec, NULL); + if (rc) + goto out_free; + + /* Keep a refcount of servers/clients who started with "mount", + so we know when we can get rid of the mgc. */ + atomic_set(&obd->u.cli.cl_mgc_refcount, 1); + + /* We connect to the MGS at setup, and don't disconnect until cleanup */ + data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_AT | + OBD_CONNECT_FULL20 | OBD_CONNECT_IMP_RECOV | + OBD_CONNECT_LVB_TYPE; + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 2, 50, 0) + data->ocd_connect_flags |= OBD_CONNECT_MNE_SWAB; +#else +#warning "LU-1644: Remove old OBD_CONNECT_MNE_SWAB fixup and imp_need_mne_swab" +#endif + + if (lmd_is_client(lsi->lsi_lmd) && + lsi->lsi_lmd->lmd_flags & LMD_FLG_NOIR) + data->ocd_connect_flags &= ~OBD_CONNECT_IMP_RECOV; + data->ocd_version = LUSTRE_VERSION_CODE; + rc = obd_connect(NULL, &exp, obd, &(obd->obd_uuid), data, NULL); + if (rc) { + CERROR("connect failed %d\n", rc); + goto out; + } + + obd->u.cli.cl_mgc_mgsexp = exp; + +out: + /* Keep the mgc info in the sb. Note that many lsi's can point + to the same mgc.*/ + lsi->lsi_mgc = obd; +out_free: + mutex_unlock(&mgc_start_lock); + + if (data) + OBD_FREE_PTR(data); + if (mgcname) + OBD_FREE(mgcname, len); + if (niduuid) + OBD_FREE(niduuid, len + 2); + return rc; +} + +static int lustre_stop_mgc(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *obd; + char *niduuid = NULL, *ptr = NULL; + int i, rc = 0, len = 0; + + if (!lsi) + return -ENOENT; + obd = lsi->lsi_mgc; + if (!obd) + return -ENOENT; + lsi->lsi_mgc = NULL; + + mutex_lock(&mgc_start_lock); + LASSERT(atomic_read(&obd->u.cli.cl_mgc_refcount) > 0); + if (!atomic_dec_and_test(&obd->u.cli.cl_mgc_refcount)) { + /* This is not fatal, every client that stops + will call in here. */ + CDEBUG(D_MOUNT, "mgc still has %d references.\n", + atomic_read(&obd->u.cli.cl_mgc_refcount)); + rc = -EBUSY; + goto out; + } + + /* The MGC has no recoverable data in any case. 
+ * force shutdown set in umount_begin */ + obd->obd_no_recov = 1; + + if (obd->u.cli.cl_mgc_mgsexp) { + /* An error is not fatal, if we are unable to send the + disconnect mgs ping evictor cleans up the export */ + rc = obd_disconnect(obd->u.cli.cl_mgc_mgsexp); + if (rc) + CDEBUG(D_MOUNT, "disconnect failed %d\n", rc); + } + + /* Save the obdname for cleaning the nid uuids, which are + obdname_XX */ + len = strlen(obd->obd_name) + 6; + OBD_ALLOC(niduuid, len); + if (niduuid) { + strcpy(niduuid, obd->obd_name); + ptr = niduuid + strlen(niduuid); + } + + rc = class_manual_cleanup(obd); + if (rc) + goto out; + + /* Clean the nid uuids */ + if (!niduuid) { + rc = -ENOMEM; + goto out; + } + + for (i = 0; i < lsi->lsi_lmd->lmd_mgs_failnodes; i++) { + sprintf(ptr, "_%x", i); + rc = do_lcfg(LUSTRE_MGC_OBDNAME, 0, LCFG_DEL_UUID, + niduuid, NULL, NULL, NULL); + if (rc) + CERROR("del MDC UUID %s failed: rc = %d\n", + niduuid, rc); + } +out: + if (niduuid) + OBD_FREE(niduuid, len); + + /* class_import_put will get rid of the additional connections */ + mutex_unlock(&mgc_start_lock); + return rc; +} + +/***************** lustre superblock **************/ + +struct lustre_sb_info *lustre_init_lsi(struct super_block *sb) +{ + struct lustre_sb_info *lsi; + + OBD_ALLOC_PTR(lsi); + if (!lsi) + return NULL; + OBD_ALLOC_PTR(lsi->lsi_lmd); + if (!lsi->lsi_lmd) { + OBD_FREE_PTR(lsi); + return NULL; + } + + lsi->lsi_lmd->lmd_exclude_count = 0; + lsi->lsi_lmd->lmd_recovery_time_soft = 0; + lsi->lsi_lmd->lmd_recovery_time_hard = 0; + s2lsi_nocast(sb) = lsi; + /* we take 1 extra ref for our setup */ + atomic_set(&lsi->lsi_mounts, 1); + + /* Default umount style */ + lsi->lsi_flags = LSI_UMOUNT_FAILOVER; + + return lsi; +} + +static int lustre_free_lsi(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + + LASSERT(lsi != NULL); + CDEBUG(D_MOUNT, "Freeing lsi %p\n", lsi); + + /* someone didn't call server_put_mount. */ + LASSERT(atomic_read(&lsi->lsi_mounts) == 0); + + if (lsi->lsi_lmd != NULL) { + if (lsi->lsi_lmd->lmd_dev != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_dev, + strlen(lsi->lsi_lmd->lmd_dev) + 1); + if (lsi->lsi_lmd->lmd_profile != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_profile, + strlen(lsi->lsi_lmd->lmd_profile) + 1); + if (lsi->lsi_lmd->lmd_mgssec != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_mgssec, + strlen(lsi->lsi_lmd->lmd_mgssec) + 1); + if (lsi->lsi_lmd->lmd_opts != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_opts, + strlen(lsi->lsi_lmd->lmd_opts) + 1); + if (lsi->lsi_lmd->lmd_exclude_count) + OBD_FREE(lsi->lsi_lmd->lmd_exclude, + sizeof(lsi->lsi_lmd->lmd_exclude[0]) * + lsi->lsi_lmd->lmd_exclude_count); + if (lsi->lsi_lmd->lmd_mgs != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_mgs, + strlen(lsi->lsi_lmd->lmd_mgs) + 1); + if (lsi->lsi_lmd->lmd_osd_type != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_osd_type, + strlen(lsi->lsi_lmd->lmd_osd_type) + 1); + if (lsi->lsi_lmd->lmd_params != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_params, 4096); + + OBD_FREE(lsi->lsi_lmd, sizeof(*lsi->lsi_lmd)); + } + + LASSERT(lsi->lsi_llsbi == NULL); + OBD_FREE(lsi, sizeof(*lsi)); + s2lsi_nocast(sb) = NULL; + + return 0; +} + +/* The lsi has one reference for every server that is using the disk - + e.g. 
MDT, MGS, and potentially MGC */ +int lustre_put_lsi(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + + LASSERT(lsi != NULL); + + CDEBUG(D_MOUNT, "put %p %d\n", sb, atomic_read(&lsi->lsi_mounts)); + if (atomic_dec_and_test(&lsi->lsi_mounts)) { + if (IS_SERVER(lsi) && lsi->lsi_osd_exp) { + lu_device_put(&lsi->lsi_dt_dev->dd_lu_dev); + lsi->lsi_osd_exp->exp_obd->obd_lvfs_ctxt.dt = NULL; + lsi->lsi_dt_dev = NULL; + obd_disconnect(lsi->lsi_osd_exp); + /* wait till OSD is gone */ + obd_zombie_barrier(); + } + lustre_free_lsi(sb); + return 1; + } + return 0; +} + +/*** SERVER NAME *** + * + * FSNAME is between 1 and 8 characters (inclusive). + * Excluded characters are '/' and ':' + * SEPARATOR is either ':' or '-' + * TYPE: "OST", "MDT", etc. + * INDEX: Hex representation of the index + */ + +/** Get the fsname ("lustre") from the server name ("lustre-OST003F"). + * @param [in] svname server name including type and index + * @param [out] fsname Buffer to copy filesystem name prefix into. + * Must have at least 'strlen(fsname) + 1' chars. + * @param [out] endptr if endptr isn't NULL it is set to end of fsname + * rc < 0 on error + */ +int server_name2fsname(const char *svname, char *fsname, const char **endptr) +{ + const char *dash; + + dash = svname + strnlen(svname, 8); /* max fsname length is 8 */ + for (; dash > svname && *dash != '-' && *dash != ':'; dash--) + ; + if (dash == svname) + return -EINVAL; + + if (fsname != NULL) { + strncpy(fsname, svname, dash - svname); + fsname[dash - svname] = '\0'; + } + + if (endptr != NULL) + *endptr = dash; + + return 0; +} +EXPORT_SYMBOL(server_name2fsname); + +/** + * Get service name (svname) from string + * rc < 0 on error + * if endptr isn't NULL it is set to end of fsname * + */ +int server_name2svname(const char *label, char *svname, const char **endptr, + size_t svsize) +{ + int rc; + const char *dash; + + /* We use server_name2fsname() just for parsing */ + rc = server_name2fsname(label, NULL, &dash); + if (rc != 0) + return rc; + + if (endptr != NULL) + *endptr = dash; + + if (strlcpy(svname, dash + 1, svsize) >= svsize) + return -E2BIG; + + return 0; +} +EXPORT_SYMBOL(server_name2svname); + + +/* Get the index from the obd name. 
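+   A few illustrative inputs (following the parsing below):
+     "lustre-OST003F"     -> rc = LDD_F_SV_TYPE_OST, *idx = 0x3f
+     "lustre-MDT0000"     -> rc = LDD_F_SV_TYPE_MDT, *idx = 0x0
+     "lustre-MDTall"      -> rc = LDD_F_SV_TYPE_MDT | LDD_F_SV_ALL
+     "lustre-MDT0000-mdc" -> as the second case, with *endptr advanced
+                             past the trailing "-mdc"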
+ rc = server type, or + rc < 0 on error + if endptr isn't NULL it is set to end of name */ +int server_name2index(const char *svname, __u32 *idx, const char **endptr) +{ + unsigned long index; + int rc; + const char *dash; + + /* We use server_name2fsname() just for parsing */ + rc = server_name2fsname(svname, NULL, &dash); + if (rc != 0) + return rc; + + dash++; + + if (strncmp(dash, "MDT", 3) == 0) + rc = LDD_F_SV_TYPE_MDT; + else if (strncmp(dash, "OST", 3) == 0) + rc = LDD_F_SV_TYPE_OST; + else + return -EINVAL; + + dash += 3; + + if (strncmp(dash, "all", 3) == 0) { + if (endptr != NULL) + *endptr = dash + 3; + return rc | LDD_F_SV_ALL; + } + + index = simple_strtoul(dash, (char **)endptr, 16); + if (idx != NULL) + *idx = index; + + /* Account for -mdc after index that is possible when specifying mdt */ + if (endptr != NULL && strncmp(LUSTRE_MDC_NAME, *endptr + 1, + sizeof(LUSTRE_MDC_NAME)-1) == 0) + *endptr += sizeof(LUSTRE_MDC_NAME); + + return rc; +} +EXPORT_SYMBOL(server_name2index); + +/*************** mount common between server and client ***************/ + +/* Common umount */ +int lustre_common_put_super(struct super_block *sb) +{ + int rc; + + CDEBUG(D_MOUNT, "dropping sb %p\n", sb); + + /* Drop a ref to the MGC */ + rc = lustre_stop_mgc(sb); + if (rc && (rc != -ENOENT)) { + if (rc != -EBUSY) { + CERROR("Can't stop MGC: %d\n", rc); + return rc; + } + /* BUSY just means that there's some other obd that + needs the mgc. Let him clean it up. */ + CDEBUG(D_MOUNT, "MGC still in use\n"); + } + /* Drop a ref to the mounted disk */ + lustre_put_lsi(sb); + lu_types_stop(); + return rc; +} +EXPORT_SYMBOL(lustre_common_put_super); + +static void lmd_print(struct lustre_mount_data *lmd) +{ + int i; + + PRINT_CMD(D_MOUNT, " mount data:\n"); + if (lmd_is_client(lmd)) + PRINT_CMD(D_MOUNT, "profile: %s\n", lmd->lmd_profile); + PRINT_CMD(D_MOUNT, "device: %s\n", lmd->lmd_dev); + PRINT_CMD(D_MOUNT, "flags: %x\n", lmd->lmd_flags); + + if (lmd->lmd_opts) + PRINT_CMD(D_MOUNT, "options: %s\n", lmd->lmd_opts); + + if (lmd->lmd_recovery_time_soft) + PRINT_CMD(D_MOUNT, "recovery time soft: %d\n", + lmd->lmd_recovery_time_soft); + + if (lmd->lmd_recovery_time_hard) + PRINT_CMD(D_MOUNT, "recovery time hard: %d\n", + lmd->lmd_recovery_time_hard); + + for (i = 0; i < lmd->lmd_exclude_count; i++) { + PRINT_CMD(D_MOUNT, "exclude %d: OST%04x\n", i, + lmd->lmd_exclude[i]); + } +} + +/* Is this server on the exclusion list */ +int lustre_check_exclusion(struct super_block *sb, char *svname) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct lustre_mount_data *lmd = lsi->lsi_lmd; + __u32 index; + int i, rc; + + rc = server_name2index(svname, &index, NULL); + if (rc != LDD_F_SV_TYPE_OST) + /* Only exclude OSTs */ + return 0; + + CDEBUG(D_MOUNT, "Check exclusion %s (%d) in %d of %s\n", svname, + index, lmd->lmd_exclude_count, lmd->lmd_dev); + + for (i = 0; i < lmd->lmd_exclude_count; i++) { + if (index == lmd->lmd_exclude[i]) { + CWARN("Excluding %s (on exclusion list)\n", svname); + return 1; + } + } + return 0; +} + +/* mount -v -o exclude=lustre-OST0001:lustre-OST0002 -t lustre ... */ +static int lmd_make_exclusion(struct lustre_mount_data *lmd, const char *ptr) +{ + const char *s1 = ptr, *s2; + __u32 index, *exclude_list; + int rc = 0, devmax; + + /* The shortest an ost name can be is 8 chars: -OST0000. + We don't actually know the fsname at this time, so in fact + a user could specify any fsname. 
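+     For illustration: "exclude=lustre-OST0001:lustre-OST0002" (given as the
+     last option on the line) enters here pointing at the '=', so
+     devmax = strlen(ptr) / 8 + 1 = 4, and the parse below ends up with
+     lmd_exclude_count = 2 and lmd_exclude = { 0x0001, 0x0002 }.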
*/ + devmax = strlen(ptr) / 8 + 1; + + /* temp storage until we figure out how many we have */ + OBD_ALLOC(exclude_list, sizeof(index) * devmax); + if (!exclude_list) + return -ENOMEM; + + /* we enter this fn pointing at the '=' */ + while (*s1 && *s1 != ' ' && *s1 != ',') { + s1++; + rc = server_name2index(s1, &index, &s2); + if (rc < 0) { + CERROR("Can't parse server name '%s': rc = %d\n", + s1, rc); + break; + } + if (rc == LDD_F_SV_TYPE_OST) + exclude_list[lmd->lmd_exclude_count++] = index; + else + CDEBUG(D_MOUNT, "ignoring exclude %.*s: type = %#x\n", + (uint)(s2-s1), s1, rc); + s1 = s2; + /* now we are pointing at ':' (next exclude) + or ',' (end of excludes) */ + if (lmd->lmd_exclude_count >= devmax) + break; + } + if (rc >= 0) /* non-err */ + rc = 0; + + if (lmd->lmd_exclude_count) { + /* permanent, freed in lustre_free_lsi */ + OBD_ALLOC(lmd->lmd_exclude, sizeof(index) * + lmd->lmd_exclude_count); + if (lmd->lmd_exclude) { + memcpy(lmd->lmd_exclude, exclude_list, + sizeof(index) * lmd->lmd_exclude_count); + } else { + rc = -ENOMEM; + lmd->lmd_exclude_count = 0; + } + } + OBD_FREE(exclude_list, sizeof(index) * devmax); + return rc; +} + +static int lmd_parse_mgssec(struct lustre_mount_data *lmd, char *ptr) +{ + char *tail; + int length; + + if (lmd->lmd_mgssec != NULL) { + OBD_FREE(lmd->lmd_mgssec, strlen(lmd->lmd_mgssec) + 1); + lmd->lmd_mgssec = NULL; + } + + tail = strchr(ptr, ','); + if (tail == NULL) + length = strlen(ptr); + else + length = tail - ptr; + + OBD_ALLOC(lmd->lmd_mgssec, length + 1); + if (lmd->lmd_mgssec == NULL) + return -ENOMEM; + + memcpy(lmd->lmd_mgssec, ptr, length); + lmd->lmd_mgssec[length] = '\0'; + return 0; +} + +static int lmd_parse_string(char **handle, char *ptr) +{ + char *tail; + int length; + + if ((handle == NULL) || (ptr == NULL)) + return -EINVAL; + + if (*handle != NULL) { + OBD_FREE(*handle, strlen(*handle) + 1); + *handle = NULL; + } + + tail = strchr(ptr, ','); + if (tail == NULL) + length = strlen(ptr); + else + length = tail - ptr; + + OBD_ALLOC(*handle, length + 1); + if (*handle == NULL) + return -ENOMEM; + + memcpy(*handle, ptr, length); + (*handle)[length] = '\0'; + + return 0; +} + +/* Collect multiple values for mgsnid specifiers */ +static int lmd_parse_mgs(struct lustre_mount_data *lmd, char **ptr) +{ + lnet_nid_t nid; + char *tail = *ptr; + char *mgsnid; + int length; + int oldlen = 0; + + /* Find end of nidlist */ + while (class_parse_nid_quiet(tail, &nid, &tail) == 0) {} + length = tail - *ptr; + if (length == 0) { + LCONSOLE_ERROR_MSG(0x159, "Can't parse NID '%s'\n", *ptr); + return -EINVAL; + } + + if (lmd->lmd_mgs != NULL) + oldlen = strlen(lmd->lmd_mgs) + 1; + + OBD_ALLOC(mgsnid, oldlen + length + 1); + if (mgsnid == NULL) + return -ENOMEM; + + if (lmd->lmd_mgs != NULL) { + /* Multiple mgsnid= are taken to mean failover locations */ + memcpy(mgsnid, lmd->lmd_mgs, oldlen); + mgsnid[oldlen - 1] = ':'; + OBD_FREE(lmd->lmd_mgs, oldlen); + } + memcpy(mgsnid + oldlen, *ptr, length); + mgsnid[oldlen + length] = '\0'; + lmd->lmd_mgs = mgsnid; + *ptr = tail; + + return 0; +} + +/** Parse mount line options + * e.g. 
mount -v -t lustre -o abort_recov uml1:uml2:/lustre-client /mnt/lustre + * dev is passed as device=uml1:/lustre by mount.lustre + */ +static int lmd_parse(char *options, struct lustre_mount_data *lmd) +{ + char *s1, *s2, *devname = NULL; + struct lustre_mount_data *raw = (struct lustre_mount_data *)options; + int rc = 0; + + LASSERT(lmd); + if (!options) { + LCONSOLE_ERROR_MSG(0x162, "Missing mount data: check that /sbin/mount.lustre is installed.\n"); + return -EINVAL; + } + + /* Options should be a string - try to detect old lmd data */ + if ((raw->lmd_magic & 0xffffff00) == (LMD_MAGIC & 0xffffff00)) { + LCONSOLE_ERROR_MSG(0x163, "You're using an old version of /sbin/mount.lustre. Please install version %s\n", + LUSTRE_VERSION_STRING); + return -EINVAL; + } + lmd->lmd_magic = LMD_MAGIC; + + OBD_ALLOC(lmd->lmd_params, 4096); + if (lmd->lmd_params == NULL) + return -ENOMEM; + lmd->lmd_params[0] = '\0'; + + /* Set default flags here */ + + s1 = options; + while (*s1) { + int clear = 0; + int time_min = OBD_RECOVERY_TIME_MIN; + + /* Skip whitespace and extra commas */ + while (*s1 == ' ' || *s1 == ',') + s1++; + + /* Client options are parsed in ll_options: eg. flock, + user_xattr, acl */ + + /* Parse non-ldiskfs options here. Rather than modifying + ldiskfs, we just zero these out here */ + if (strncmp(s1, "abort_recov", 11) == 0) { + lmd->lmd_flags |= LMD_FLG_ABORT_RECOV; + clear++; + } else if (strncmp(s1, "recovery_time_soft=", 19) == 0) { + lmd->lmd_recovery_time_soft = max_t(int, + simple_strtoul(s1 + 19, NULL, 10), time_min); + clear++; + } else if (strncmp(s1, "recovery_time_hard=", 19) == 0) { + lmd->lmd_recovery_time_hard = max_t(int, + simple_strtoul(s1 + 19, NULL, 10), time_min); + clear++; + } else if (strncmp(s1, "noir", 4) == 0) { + lmd->lmd_flags |= LMD_FLG_NOIR; /* test purpose only. */ + clear++; + } else if (strncmp(s1, "nosvc", 5) == 0) { + lmd->lmd_flags |= LMD_FLG_NOSVC; + clear++; + } else if (strncmp(s1, "nomgs", 5) == 0) { + lmd->lmd_flags |= LMD_FLG_NOMGS; + clear++; + } else if (strncmp(s1, "noscrub", 7) == 0) { + lmd->lmd_flags |= LMD_FLG_NOSCRUB; + clear++; + } else if (strncmp(s1, PARAM_MGSNODE, + sizeof(PARAM_MGSNODE) - 1) == 0) { + s2 = s1 + sizeof(PARAM_MGSNODE) - 1; + /* Assume the next mount opt is the first + invalid nid we get to. 
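+  (Illustratively, a second mgsnode=<nid-list> later on the same line is
+  appended by lmd_parse_mgs() after a ':' separator, so lmd_mgs becomes
+  "<first list>:<second list>" and the second list is treated as a
+  failover MGS location.)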
*/ + rc = lmd_parse_mgs(lmd, &s2); + if (rc) + goto invalid; + clear++; + } else if (strncmp(s1, "writeconf", 9) == 0) { + lmd->lmd_flags |= LMD_FLG_WRITECONF; + clear++; + } else if (strncmp(s1, "update", 6) == 0) { + lmd->lmd_flags |= LMD_FLG_UPDATE; + clear++; + } else if (strncmp(s1, "virgin", 6) == 0) { + lmd->lmd_flags |= LMD_FLG_VIRGIN; + clear++; + } else if (strncmp(s1, "noprimnode", 10) == 0) { + lmd->lmd_flags |= LMD_FLG_NO_PRIMNODE; + clear++; + } else if (strncmp(s1, "mgssec=", 7) == 0) { + rc = lmd_parse_mgssec(lmd, s1 + 7); + if (rc) + goto invalid; + clear++; + /* ost exclusion list */ + } else if (strncmp(s1, "exclude=", 8) == 0) { + rc = lmd_make_exclusion(lmd, s1 + 7); + if (rc) + goto invalid; + clear++; + } else if (strncmp(s1, "mgs", 3) == 0) { + /* We are an MGS */ + lmd->lmd_flags |= LMD_FLG_MGS; + clear++; + } else if (strncmp(s1, "svname=", 7) == 0) { + rc = lmd_parse_string(&lmd->lmd_profile, s1 + 7); + if (rc) + goto invalid; + clear++; + } else if (strncmp(s1, "param=", 6) == 0) { + int length; + char *tail = strchr(s1 + 6, ','); + if (tail == NULL) + length = strlen(s1); + else + length = tail - s1; + length -= 6; + strncat(lmd->lmd_params, s1 + 6, length); + strcat(lmd->lmd_params, " "); + clear++; + } else if (strncmp(s1, "osd=", 4) == 0) { + rc = lmd_parse_string(&lmd->lmd_osd_type, s1 + 4); + if (rc) + goto invalid; + clear++; + } + /* Linux 2.4 doesn't pass the device, so we stuck it at the + end of the options. */ + else if (strncmp(s1, "device=", 7) == 0) { + devname = s1 + 7; + /* terminate options right before device. device + must be the last one. */ + *s1 = '\0'; + break; + } + + /* Find next opt */ + s2 = strchr(s1, ','); + if (s2 == NULL) { + if (clear) + *s1 = '\0'; + break; + } + s2++; + if (clear) + memmove(s1, s2, strlen(s2) + 1); + else + s1 = s2; + } + + if (!devname) { + LCONSOLE_ERROR_MSG(0x164, "Can't find the device name (need mount option 'device=...')\n"); + goto invalid; + } + + s1 = strstr(devname, ":/"); + if (s1) { + ++s1; + lmd->lmd_flags |= LMD_FLG_CLIENT; + /* Remove leading /s from fsname */ + while (*++s1 == '/') ; + /* Freed in lustre_free_lsi */ + OBD_ALLOC(lmd->lmd_profile, strlen(s1) + 8); + if (!lmd->lmd_profile) + return -ENOMEM; + sprintf(lmd->lmd_profile, "%s-client", s1); + } + + /* Freed in lustre_free_lsi */ + OBD_ALLOC(lmd->lmd_dev, strlen(devname) + 1); + if (!lmd->lmd_dev) + return -ENOMEM; + strcpy(lmd->lmd_dev, devname); + + /* Save mount options */ + s1 = options + strlen(options) - 1; + while (s1 >= options && (*s1 == ',' || *s1 == ' ')) + *s1-- = 0; + if (*options != 0) { + /* Freed in lustre_free_lsi */ + OBD_ALLOC(lmd->lmd_opts, strlen(options) + 1); + if (!lmd->lmd_opts) + return -ENOMEM; + strcpy(lmd->lmd_opts, options); + } + + lmd_print(lmd); + lmd->lmd_magic = LMD_MAGIC; + + return rc; + +invalid: + CERROR("Bad mount options %s\n", options); + return -EINVAL; +} + +struct lustre_mount_data2 { + void *lmd2_data; + struct vfsmount *lmd2_mnt; +}; + +/** This is the entry point for the mount call into Lustre. + * This is called when a server or client is mounted, + * and this is where we start setting things up. + * @param data Mount options (e.g. 
-o flock,abort_recov) + */ +int lustre_fill_super(struct super_block *sb, void *data, int silent) +{ + struct lustre_mount_data *lmd; + struct lustre_mount_data2 *lmd2 = data; + struct lustre_sb_info *lsi; + int rc; + + CDEBUG(D_MOUNT|D_VFSTRACE, "VFS Op: sb %p\n", sb); + + lsi = lustre_init_lsi(sb); + if (!lsi) + return -ENOMEM; + lmd = lsi->lsi_lmd; + + /* + * Disable lockdep during mount, because mount locking patterns are + * `special'. + */ + lockdep_off(); + + /* + * LU-639: the obd cleanup of last mount may not finish yet, wait here. + */ + obd_zombie_barrier(); + + /* Figure out the lmd from the mount options */ + if (lmd_parse((char *)(lmd2->lmd2_data), lmd)) { + lustre_put_lsi(sb); + rc = -EINVAL; + goto out; + } + + if (lmd_is_client(lmd)) { + CDEBUG(D_MOUNT, "Mounting client %s\n", lmd->lmd_profile); + if (client_fill_super == NULL) + request_module("lustre"); + if (client_fill_super == NULL) { + LCONSOLE_ERROR_MSG(0x165, "Nothing registered for client mount! Is the 'lustre' module loaded?\n"); + lustre_put_lsi(sb); + rc = -ENODEV; + } else { + rc = lustre_start_mgc(sb); + if (rc) { + lustre_put_lsi(sb); + goto out; + } + /* Connect and start */ + /* (should always be ll_fill_super) */ + rc = (*client_fill_super)(sb, lmd2->lmd2_mnt); + /* c_f_s will call lustre_common_put_super on failure */ + } + } else { + CERROR("This is client-side-only module, cannot handle server mount.\n"); + rc = -EINVAL; + } + + /* If error happens in fill_super() call, @lsi will be killed there. + * This is why we do not put it here. */ + goto out; +out: + if (rc) { + CERROR("Unable to mount %s (%d)\n", + s2lsi(sb) ? lmd->lmd_dev : "", rc); + } else { + CDEBUG(D_SUPER, "Mount %s complete\n", + lmd->lmd_dev); + } + lockdep_on(); + return rc; +} + + +/* We can't call ll_fill_super by name because it lives in a module that + must be loaded after this one. 
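+   The expected sequence is that the client ("lustre"/llite) module
+   registers its entry point when it loads, roughly:
+
+       lustre_register_client_fill_super(ll_fill_super);
+
+   after which lustre_fill_super() above can reach it through the
+   client_fill_super pointer, loading the module on demand with
+   request_module("lustre") if it is not registered yet.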
*/ +void lustre_register_client_fill_super(int (*cfs)(struct super_block *sb, + struct vfsmount *mnt)) +{ + client_fill_super = cfs; +} +EXPORT_SYMBOL(lustre_register_client_fill_super); + +void lustre_register_kill_super_cb(void (*cfs)(struct super_block *sb)) +{ + kill_super_cb = cfs; +} +EXPORT_SYMBOL(lustre_register_kill_super_cb); + +/***************** FS registration ******************/ +struct dentry *lustre_mount(struct file_system_type *fs_type, int flags, + const char *devname, void *data) +{ + struct lustre_mount_data2 lmd2 = { + .lmd2_data = data, + .lmd2_mnt = NULL + }; + + return mount_nodev(fs_type, flags, &lmd2, lustre_fill_super); +} + +static void lustre_kill_super(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + + if (kill_super_cb && lsi && !IS_SERVER(lsi)) + (*kill_super_cb)(sb); + + kill_anon_super(sb); +} + +/** Register the "lustre" fs type + */ +struct file_system_type lustre_fs_type = { + .owner = THIS_MODULE, + .name = "lustre", + .mount = lustre_mount, + .kill_sb = lustre_kill_super, + .fs_flags = FS_BINARY_MOUNTDATA | FS_REQUIRES_DEV | + FS_HAS_FIEMAP | FS_RENAME_DOES_D_MOVE, +}; +MODULE_ALIAS_FS("lustre"); + +int lustre_register_fs(void) +{ + return register_filesystem(&lustre_fs_type); +} + +int lustre_unregister_fs(void) +{ + return unregister_filesystem(&lustre_fs_type); +} diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/obdo.c b/kernel/drivers/staging/lustre/lustre/obdclass/obdo.c new file mode 100644 index 000000000..307ffe347 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/obdo.c @@ -0,0 +1,362 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/obdclass/obdo.c + * + * Object Devices Class Driver + * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include "../include/obd_class.h" +#include "../include/lustre/lustre_idl.h" + +void obdo_set_parent_fid(struct obdo *dst, const struct lu_fid *parent) +{ + dst->o_parent_oid = fid_oid(parent); + dst->o_parent_seq = fid_seq(parent); + dst->o_parent_ver = fid_ver(parent); + dst->o_valid |= OBD_MD_FLGENER | OBD_MD_FLFID; +} +EXPORT_SYMBOL(obdo_set_parent_fid); + +/* WARNING: the file systems must take care not to tinker with + attributes they don't manage (such as blocks). */ +void obdo_from_inode(struct obdo *dst, struct inode *src, u32 valid) +{ + u32 newvalid = 0; + + if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME)) + CDEBUG(D_INODE, "valid %x, new time %lu/%lu\n", + valid, LTIME_S(src->i_mtime), + LTIME_S(src->i_ctime)); + + if (valid & OBD_MD_FLATIME) { + dst->o_atime = LTIME_S(src->i_atime); + newvalid |= OBD_MD_FLATIME; + } + if (valid & OBD_MD_FLMTIME) { + dst->o_mtime = LTIME_S(src->i_mtime); + newvalid |= OBD_MD_FLMTIME; + } + if (valid & OBD_MD_FLCTIME) { + dst->o_ctime = LTIME_S(src->i_ctime); + newvalid |= OBD_MD_FLCTIME; + } + if (valid & OBD_MD_FLSIZE) { + dst->o_size = i_size_read(src); + newvalid |= OBD_MD_FLSIZE; + } + if (valid & OBD_MD_FLBLOCKS) { /* allocation of space (x512 bytes) */ + dst->o_blocks = src->i_blocks; + newvalid |= OBD_MD_FLBLOCKS; + } + if (valid & OBD_MD_FLBLKSZ) { /* optimal block size */ + dst->o_blksize = 1 << src->i_blkbits; + newvalid |= OBD_MD_FLBLKSZ; + } + if (valid & OBD_MD_FLTYPE) { + dst->o_mode = (dst->o_mode & S_IALLUGO) | + (src->i_mode & S_IFMT); + newvalid |= OBD_MD_FLTYPE; + } + if (valid & OBD_MD_FLMODE) { + dst->o_mode = (dst->o_mode & S_IFMT) | + (src->i_mode & S_IALLUGO); + newvalid |= OBD_MD_FLMODE; + } + if (valid & OBD_MD_FLUID) { + dst->o_uid = from_kuid(&init_user_ns, src->i_uid); + newvalid |= OBD_MD_FLUID; + } + if (valid & OBD_MD_FLGID) { + dst->o_gid = from_kgid(&init_user_ns, src->i_gid); + newvalid |= OBD_MD_FLGID; + } + if (valid & OBD_MD_FLFLAGS) { + dst->o_flags = src->i_flags; + newvalid |= OBD_MD_FLFLAGS; + } + dst->o_valid |= newvalid; +} +EXPORT_SYMBOL(obdo_from_inode); + +void obdo_cpy_md(struct obdo *dst, struct obdo *src, u32 valid) +{ + CDEBUG(D_INODE, "src obdo "DOSTID" valid %#llx, dst obdo "DOSTID"\n", + POSTID(&src->o_oi), src->o_valid, POSTID(&dst->o_oi)); + if (valid & OBD_MD_FLATIME) + dst->o_atime = src->o_atime; + if (valid & OBD_MD_FLMTIME) + dst->o_mtime = src->o_mtime; + if (valid & OBD_MD_FLCTIME) + dst->o_ctime = src->o_ctime; + if (valid & OBD_MD_FLSIZE) + dst->o_size = src->o_size; + if (valid & OBD_MD_FLBLOCKS) /* allocation of space */ + dst->o_blocks = src->o_blocks; + if (valid & OBD_MD_FLBLKSZ) + dst->o_blksize = src->o_blksize; + if (valid & OBD_MD_FLTYPE) + dst->o_mode = (dst->o_mode & ~S_IFMT) | (src->o_mode & S_IFMT); + if (valid & OBD_MD_FLMODE) + dst->o_mode = (dst->o_mode & S_IFMT) | (src->o_mode & ~S_IFMT); + if (valid & OBD_MD_FLUID) + dst->o_uid = src->o_uid; + if (valid & OBD_MD_FLGID) + dst->o_gid = src->o_gid; + if (valid & OBD_MD_FLFLAGS) + dst->o_flags = src->o_flags; + if (valid & OBD_MD_FLFID) { + dst->o_parent_seq = src->o_parent_seq; + dst->o_parent_ver = src->o_parent_ver; + } + if (valid & OBD_MD_FLGENER) + dst->o_parent_oid = src->o_parent_oid; + if (valid & OBD_MD_FLHANDLE) + dst->o_handle = src->o_handle; + if (valid & OBD_MD_FLCOOKIE) + 
dst->o_lcookie = src->o_lcookie; + + dst->o_valid |= valid; +} +EXPORT_SYMBOL(obdo_cpy_md); + +/* returns FALSE if comparison (by flags) is same, TRUE if changed */ +int obdo_cmp_md(struct obdo *dst, struct obdo *src, u32 compare) +{ + int res = 0; + + if (compare & OBD_MD_FLATIME) + res |= dst->o_atime != src->o_atime; + if (compare & OBD_MD_FLMTIME) + res |= dst->o_mtime != src->o_mtime; + if (compare & OBD_MD_FLCTIME) + res |= dst->o_ctime != src->o_ctime; + if (compare & OBD_MD_FLSIZE) + res |= dst->o_size != src->o_size; + if (compare & OBD_MD_FLBLOCKS) /* allocation of space */ + res |= dst->o_blocks != src->o_blocks; + if (compare & OBD_MD_FLBLKSZ) + res |= dst->o_blksize != src->o_blksize; + if (compare & OBD_MD_FLTYPE) + res |= ((dst->o_mode ^ src->o_mode) & S_IFMT) != 0; + if (compare & OBD_MD_FLMODE) + res |= ((dst->o_mode ^ src->o_mode) & ~S_IFMT) != 0; + if (compare & OBD_MD_FLUID) + res |= dst->o_uid != src->o_uid; + if (compare & OBD_MD_FLGID) + res |= dst->o_gid != src->o_gid; + if (compare & OBD_MD_FLFLAGS) + res |= dst->o_flags != src->o_flags; + if (compare & OBD_MD_FLNLINK) + res |= dst->o_nlink != src->o_nlink; + if (compare & OBD_MD_FLFID) { + res |= dst->o_parent_seq != src->o_parent_seq; + res |= dst->o_parent_ver != src->o_parent_ver; + } + if (compare & OBD_MD_FLGENER) + res |= dst->o_parent_oid != src->o_parent_oid; + /* XXX Don't know if these should be included here - wasn't previously + if ( compare & OBD_MD_FLINLINE ) + res |= memcmp(dst->o_inline, src->o_inline); + */ + return res; +} +EXPORT_SYMBOL(obdo_cmp_md); + +void obdo_to_ioobj(struct obdo *oa, struct obd_ioobj *ioobj) +{ + ioobj->ioo_oid = oa->o_oi; + if (unlikely(!(oa->o_valid & OBD_MD_FLGROUP))) + ostid_set_seq_mdt0(&ioobj->ioo_oid); + + /* Since 2.4 this does not contain o_mode in the low 16 bits. 
+ * Instead, it holds (bd_md_max_brw - 1) for multi-bulk BRW RPCs */ + ioobj->ioo_max_brw = 0; +} +EXPORT_SYMBOL(obdo_to_ioobj); + +void obdo_from_iattr(struct obdo *oa, struct iattr *attr, unsigned int ia_valid) +{ + if (ia_valid & ATTR_ATIME) { + oa->o_atime = LTIME_S(attr->ia_atime); + oa->o_valid |= OBD_MD_FLATIME; + } + if (ia_valid & ATTR_MTIME) { + oa->o_mtime = LTIME_S(attr->ia_mtime); + oa->o_valid |= OBD_MD_FLMTIME; + } + if (ia_valid & ATTR_CTIME) { + oa->o_ctime = LTIME_S(attr->ia_ctime); + oa->o_valid |= OBD_MD_FLCTIME; + } + if (ia_valid & ATTR_SIZE) { + oa->o_size = attr->ia_size; + oa->o_valid |= OBD_MD_FLSIZE; + } + if (ia_valid & ATTR_MODE) { + oa->o_mode = attr->ia_mode; + oa->o_valid |= OBD_MD_FLTYPE | OBD_MD_FLMODE; + if (!in_group_p(make_kgid(&init_user_ns, oa->o_gid)) && + !capable(CFS_CAP_FSETID)) + oa->o_mode &= ~S_ISGID; + } + if (ia_valid & ATTR_UID) { + oa->o_uid = from_kuid(&init_user_ns, attr->ia_uid); + oa->o_valid |= OBD_MD_FLUID; + } + if (ia_valid & ATTR_GID) { + oa->o_gid = from_kgid(&init_user_ns, attr->ia_gid); + oa->o_valid |= OBD_MD_FLGID; + } +} +EXPORT_SYMBOL(obdo_from_iattr); + +void iattr_from_obdo(struct iattr *attr, struct obdo *oa, u32 valid) +{ + valid &= oa->o_valid; + + if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME)) + CDEBUG(D_INODE, "valid %#llx, new time %llu/%llu\n", + oa->o_valid, oa->o_mtime, oa->o_ctime); + + attr->ia_valid = 0; + if (valid & OBD_MD_FLATIME) { + LTIME_S(attr->ia_atime) = oa->o_atime; + attr->ia_valid |= ATTR_ATIME; + } + if (valid & OBD_MD_FLMTIME) { + LTIME_S(attr->ia_mtime) = oa->o_mtime; + attr->ia_valid |= ATTR_MTIME; + } + if (valid & OBD_MD_FLCTIME) { + LTIME_S(attr->ia_ctime) = oa->o_ctime; + attr->ia_valid |= ATTR_CTIME; + } + if (valid & OBD_MD_FLSIZE) { + attr->ia_size = oa->o_size; + attr->ia_valid |= ATTR_SIZE; + } +#if 0 /* you shouldn't be able to change a file's type with setattr */ + if (valid & OBD_MD_FLTYPE) { + attr->ia_mode = (attr->ia_mode & ~S_IFMT)|(oa->o_mode & S_IFMT); + attr->ia_valid |= ATTR_MODE; + } +#endif + if (valid & OBD_MD_FLMODE) { + attr->ia_mode = (attr->ia_mode & S_IFMT)|(oa->o_mode & ~S_IFMT); + attr->ia_valid |= ATTR_MODE; + if (!in_group_p(make_kgid(&init_user_ns, oa->o_gid)) && + !capable(CFS_CAP_FSETID)) + attr->ia_mode &= ~S_ISGID; + } + if (valid & OBD_MD_FLUID) { + attr->ia_uid = make_kuid(&init_user_ns, oa->o_uid); + attr->ia_valid |= ATTR_UID; + } + if (valid & OBD_MD_FLGID) { + attr->ia_gid = make_kgid(&init_user_ns, oa->o_gid); + attr->ia_valid |= ATTR_GID; + } +} +EXPORT_SYMBOL(iattr_from_obdo); + +void md_from_obdo(struct md_op_data *op_data, struct obdo *oa, u32 valid) +{ + iattr_from_obdo(&op_data->op_attr, oa, valid); + if (valid & OBD_MD_FLBLOCKS) { + op_data->op_attr_blocks = oa->o_blocks; + op_data->op_attr.ia_valid |= ATTR_BLOCKS; + } + if (valid & OBD_MD_FLFLAGS) { + ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags = + oa->o_flags; + op_data->op_attr.ia_valid |= ATTR_ATTR_FLAG; + } +} +EXPORT_SYMBOL(md_from_obdo); + +void obdo_from_md(struct obdo *oa, struct md_op_data *op_data, + unsigned int valid) +{ + obdo_from_iattr(oa, &op_data->op_attr, valid); + if (valid & ATTR_BLOCKS) { + oa->o_blocks = op_data->op_attr_blocks; + oa->o_valid |= OBD_MD_FLBLOCKS; + } + if (valid & ATTR_ATTR_FLAG) { + oa->o_flags = + ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags; + oa->o_valid |= OBD_MD_FLFLAGS; + } +} +EXPORT_SYMBOL(obdo_from_md); + +void obdo_cpu_to_le(struct obdo *dobdo, struct obdo *sobdo) +{ + dobdo->o_size = cpu_to_le64(sobdo->o_size); + 
dobdo->o_mtime = cpu_to_le64(sobdo->o_mtime); + dobdo->o_atime = cpu_to_le64(sobdo->o_atime); + dobdo->o_ctime = cpu_to_le64(sobdo->o_ctime); + dobdo->o_blocks = cpu_to_le64(sobdo->o_blocks); + dobdo->o_mode = cpu_to_le32(sobdo->o_mode); + dobdo->o_uid = cpu_to_le32(sobdo->o_uid); + dobdo->o_gid = cpu_to_le32(sobdo->o_gid); + dobdo->o_flags = cpu_to_le32(sobdo->o_flags); + dobdo->o_nlink = cpu_to_le32(sobdo->o_nlink); + dobdo->o_blksize = cpu_to_le32(sobdo->o_blksize); + dobdo->o_valid = cpu_to_le64(sobdo->o_valid); +} +EXPORT_SYMBOL(obdo_cpu_to_le); + +void obdo_le_to_cpu(struct obdo *dobdo, struct obdo *sobdo) +{ + dobdo->o_size = le64_to_cpu(sobdo->o_size); + dobdo->o_mtime = le64_to_cpu(sobdo->o_mtime); + dobdo->o_atime = le64_to_cpu(sobdo->o_atime); + dobdo->o_ctime = le64_to_cpu(sobdo->o_ctime); + dobdo->o_blocks = le64_to_cpu(sobdo->o_blocks); + dobdo->o_mode = le32_to_cpu(sobdo->o_mode); + dobdo->o_uid = le32_to_cpu(sobdo->o_uid); + dobdo->o_gid = le32_to_cpu(sobdo->o_gid); + dobdo->o_flags = le32_to_cpu(sobdo->o_flags); + dobdo->o_nlink = le32_to_cpu(sobdo->o_nlink); + dobdo->o_blksize = le32_to_cpu(sobdo->o_blksize); + dobdo->o_valid = le64_to_cpu(sobdo->o_valid); +} +EXPORT_SYMBOL(obdo_le_to_cpu); diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/statfs_pack.c b/kernel/drivers/staging/lustre/lustre/obdclass/statfs_pack.c new file mode 100644 index 000000000..cc785ab3f --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/statfs_pack.c @@ -0,0 +1,75 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/obdclass/statfs_pack.c + * + * (Un)packing of OST/MDS requests + * + * Author: Andreas Dilger + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include "../include/lustre_export.h" +#include "../include/lustre_net.h" +#include "../include/obd_support.h" +#include "../include/obd_class.h" + +void statfs_pack(struct obd_statfs *osfs, struct kstatfs *sfs) +{ + memset(osfs, 0, sizeof(*osfs)); + osfs->os_type = sfs->f_type; + osfs->os_blocks = sfs->f_blocks; + osfs->os_bfree = sfs->f_bfree; + osfs->os_bavail = sfs->f_bavail; + osfs->os_files = sfs->f_files; + osfs->os_ffree = sfs->f_ffree; + osfs->os_bsize = sfs->f_bsize; + osfs->os_namelen = sfs->f_namelen; +} +EXPORT_SYMBOL(statfs_pack); + +void statfs_unpack(struct kstatfs *sfs, struct obd_statfs *osfs) +{ + memset(sfs, 0, sizeof(*sfs)); + sfs->f_type = osfs->os_type; + sfs->f_blocks = osfs->os_blocks; + sfs->f_bfree = osfs->os_bfree; + sfs->f_bavail = osfs->os_bavail; + sfs->f_files = osfs->os_files; + sfs->f_ffree = osfs->os_ffree; + sfs->f_bsize = osfs->os_bsize; + sfs->f_namelen = osfs->os_namelen; +} +EXPORT_SYMBOL(statfs_unpack); diff --git a/kernel/drivers/staging/lustre/lustre/obdclass/uuid.c b/kernel/drivers/staging/lustre/lustre/obdclass/uuid.c new file mode 100644 index 000000000..ff0a01bcf --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdclass/uuid.c @@ -0,0 +1,82 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/obdclass/uuid.c + * + * Public include file for the UUID library + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include "../../include/linux/libcfs/libcfs.h" + +#include "../include/obd_support.h" +#include "../include/obd_class.h" + + +static inline __u32 consume(int nob, __u8 **ptr) +{ + __u32 value; + + LASSERT(nob <= sizeof(value)); + + for (value = 0; nob > 0; --nob) + value = (value << 8) | *((*ptr)++); + return value; +} + +#define CONSUME(val, ptr) (val) = consume(sizeof(val), (ptr)) + +static void uuid_unpack(class_uuid_t in, __u16 *uu, int nr) +{ + __u8 *ptr = in; + + LASSERT(nr * sizeof(*uu) == sizeof(class_uuid_t)); + + while (nr-- > 0) + CONSUME(uu[nr], &ptr); +} + +void class_uuid_unparse(class_uuid_t uu, struct obd_uuid *out) +{ + /* uu as an array of __u16's */ + __u16 uuid[sizeof(class_uuid_t) / sizeof(__u16)]; + + CLASSERT(ARRAY_SIZE(uuid) == 8); + + uuid_unpack(uu, uuid, ARRAY_SIZE(uuid)); + sprintf(out->uuid, "%04x%04x-%04x-%04x-%04x-%04x%04x%04x", + uuid[0], uuid[1], uuid[2], uuid[3], + uuid[4], uuid[5], uuid[6], uuid[7]); +} +EXPORT_SYMBOL(class_uuid_unparse); diff --git a/kernel/drivers/staging/lustre/lustre/obdecho/Makefile b/kernel/drivers/staging/lustre/lustre/obdecho/Makefile new file mode 100644 index 000000000..672028fc7 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdecho/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_LUSTRE_FS) += obdecho.o +obdecho-y := echo_client.o lproc_echo.o diff --git a/kernel/drivers/staging/lustre/lustre/obdecho/echo_client.c b/kernel/drivers/staging/lustre/lustre/obdecho/echo_client.c new file mode 100644 index 000000000..d542e06d6 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdecho/echo_client.c @@ -0,0 +1,2197 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#define DEBUG_SUBSYSTEM S_ECHO +#include "../../include/linux/libcfs/libcfs.h" + +#include "../include/obd.h" +#include "../include/obd_support.h" +#include "../include/obd_class.h" +#include "../include/lustre_debug.h" +#include "../include/lprocfs_status.h" +#include "../include/cl_object.h" +#include "../include/lustre_fid.h" +#include "../include/lustre_acl.h" +#include "../include/lustre_net.h" + +#include "echo_internal.h" + +/** \defgroup echo_client Echo Client + * @{ + */ + +struct echo_device { + struct cl_device ed_cl; + struct echo_client_obd *ed_ec; + + struct cl_site ed_site_myself; + struct cl_site *ed_site; + struct lu_device *ed_next; + int ed_next_islov; +}; + +struct echo_object { + struct cl_object eo_cl; + struct cl_object_header eo_hdr; + + struct echo_device *eo_dev; + struct list_head eo_obj_chain; + struct lov_stripe_md *eo_lsm; + atomic_t eo_npages; + int eo_deleted; +}; + +struct echo_object_conf { + struct cl_object_conf eoc_cl; + struct lov_stripe_md **eoc_md; +}; + +struct echo_page { + struct cl_page_slice ep_cl; + struct mutex ep_lock; + struct page *ep_vmpage; +}; + +struct echo_lock { + struct cl_lock_slice el_cl; + struct list_head el_chain; + struct echo_object *el_object; + __u64 el_cookie; + atomic_t el_refcount; +}; + +static int echo_client_setup(const struct lu_env *env, + struct obd_device *obddev, + struct lustre_cfg *lcfg); +static int echo_client_cleanup(struct obd_device *obddev); + + +/** \defgroup echo_helpers Helper functions + * @{ + */ +static inline struct echo_device *cl2echo_dev(const struct cl_device *dev) +{ + return container_of0(dev, struct echo_device, ed_cl); +} + +static inline struct cl_device *echo_dev2cl(struct echo_device *d) +{ + return &d->ed_cl; +} + +static inline struct echo_device *obd2echo_dev(const struct obd_device *obd) +{ + return cl2echo_dev(lu2cl_dev(obd->obd_lu_dev)); +} + +static inline struct cl_object *echo_obj2cl(struct echo_object *eco) +{ + return &eco->eo_cl; +} + +static inline struct echo_object *cl2echo_obj(const struct cl_object *o) +{ + return container_of(o, struct echo_object, eo_cl); +} + +static inline struct echo_page *cl2echo_page(const struct cl_page_slice *s) +{ + return container_of(s, struct echo_page, ep_cl); +} + +static inline struct echo_lock *cl2echo_lock(const struct cl_lock_slice *s) +{ + return container_of(s, struct echo_lock, el_cl); +} + +static inline struct cl_lock *echo_lock2cl(const struct echo_lock *ecl) +{ + return ecl->el_cl.cls_lock; +} + +static struct lu_context_key echo_thread_key; +static inline struct echo_thread_info *echo_env_info(const struct lu_env *env) +{ + struct echo_thread_info *info; + + info = lu_context_key_get(&env->le_ctx, &echo_thread_key); + LASSERT(info != NULL); + return info; +} + +static inline +struct echo_object_conf *cl2echo_conf(const struct cl_object_conf *c) +{ + return container_of(c, struct echo_object_conf, eoc_cl); +} + +/** @} echo_helpers */ + +static struct echo_object *cl_echo_object_find(struct echo_device *d, + struct lov_stripe_md **lsm); +static int cl_echo_object_put(struct echo_object *eco); +static int cl_echo_enqueue(struct echo_object *eco, u64 start, + u64 end, int mode, __u64 *cookie); +static int cl_echo_cancel(struct echo_device *d, __u64 cookie); +static int cl_echo_object_brw(struct echo_object *eco, int rw, u64 offset, + struct page **pages, int npages, int async); + +static struct echo_thread_info *echo_env_info(const struct lu_env *env); + +struct echo_thread_info { + struct echo_object_conf eti_conf; + 
struct lustre_md eti_md; + + struct cl_2queue eti_queue; + struct cl_io eti_io; + struct cl_lock_descr eti_descr; + struct lu_fid eti_fid; + struct lu_fid eti_fid2; +}; + +/* No session used right now */ +struct echo_session_info { + unsigned long dummy; +}; + +static struct kmem_cache *echo_lock_kmem; +static struct kmem_cache *echo_object_kmem; +static struct kmem_cache *echo_thread_kmem; +static struct kmem_cache *echo_session_kmem; + +static struct lu_kmem_descr echo_caches[] = { + { + .ckd_cache = &echo_lock_kmem, + .ckd_name = "echo_lock_kmem", + .ckd_size = sizeof(struct echo_lock) + }, + { + .ckd_cache = &echo_object_kmem, + .ckd_name = "echo_object_kmem", + .ckd_size = sizeof(struct echo_object) + }, + { + .ckd_cache = &echo_thread_kmem, + .ckd_name = "echo_thread_kmem", + .ckd_size = sizeof(struct echo_thread_info) + }, + { + .ckd_cache = &echo_session_kmem, + .ckd_name = "echo_session_kmem", + .ckd_size = sizeof(struct echo_session_info) + }, + { + .ckd_cache = NULL + } +}; + +/** \defgroup echo_page Page operations + * + * Echo page operations. + * + * @{ + */ +static struct page *echo_page_vmpage(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + return cl2echo_page(slice)->ep_vmpage; +} + +static int echo_page_own(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io, int nonblock) +{ + struct echo_page *ep = cl2echo_page(slice); + + if (!nonblock) + mutex_lock(&ep->ep_lock); + else if (!mutex_trylock(&ep->ep_lock)) + return -EAGAIN; + return 0; +} + +static void echo_page_disown(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io) +{ + struct echo_page *ep = cl2echo_page(slice); + + LASSERT(mutex_is_locked(&ep->ep_lock)); + mutex_unlock(&ep->ep_lock); +} + +static void echo_page_discard(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + cl_page_delete(env, slice->cpl_page); +} + +static int echo_page_is_vmlocked(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + if (mutex_is_locked(&cl2echo_page(slice)->ep_lock)) + return -EBUSY; + return -ENODATA; +} + +static void echo_page_completion(const struct lu_env *env, + const struct cl_page_slice *slice, + int ioret) +{ + LASSERT(slice->cpl_page->cp_sync_io != NULL); +} + +static void echo_page_fini(const struct lu_env *env, + struct cl_page_slice *slice) +{ + struct echo_page *ep = cl2echo_page(slice); + struct echo_object *eco = cl2echo_obj(slice->cpl_obj); + struct page *vmpage = ep->ep_vmpage; + + atomic_dec(&eco->eo_npages); + page_cache_release(vmpage); +} + +static int echo_page_prep(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + return 0; +} + +static int echo_page_print(const struct lu_env *env, + const struct cl_page_slice *slice, + void *cookie, lu_printer_t printer) +{ + struct echo_page *ep = cl2echo_page(slice); + + (*printer)(env, cookie, LUSTRE_ECHO_CLIENT_NAME"-page@%p %d vm@%p\n", + ep, mutex_is_locked(&ep->ep_lock), ep->ep_vmpage); + return 0; +} + +static const struct cl_page_operations echo_page_ops = { + .cpo_own = echo_page_own, + .cpo_disown = echo_page_disown, + .cpo_discard = echo_page_discard, + .cpo_vmpage = echo_page_vmpage, + .cpo_fini = echo_page_fini, + .cpo_print = echo_page_print, + .cpo_is_vmlocked = echo_page_is_vmlocked, + .io = { + [CRT_READ] = { + .cpo_prep = echo_page_prep, + .cpo_completion = echo_page_completion, + }, + [CRT_WRITE] = { + .cpo_prep = echo_page_prep, + .cpo_completion = echo_page_completion, + } + } 
+}; +/** @} echo_page */ + +/** \defgroup echo_lock Locking + * + * echo lock operations + * + * @{ + */ +static void echo_lock_fini(const struct lu_env *env, + struct cl_lock_slice *slice) +{ + struct echo_lock *ecl = cl2echo_lock(slice); + + LASSERT(list_empty(&ecl->el_chain)); + OBD_SLAB_FREE_PTR(ecl, echo_lock_kmem); +} + +static void echo_lock_delete(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct echo_lock *ecl = cl2echo_lock(slice); + + LASSERT(list_empty(&ecl->el_chain)); +} + +static int echo_lock_fits_into(const struct lu_env *env, + const struct cl_lock_slice *slice, + const struct cl_lock_descr *need, + const struct cl_io *unused) +{ + return 1; +} + +static struct cl_lock_operations echo_lock_ops = { + .clo_fini = echo_lock_fini, + .clo_delete = echo_lock_delete, + .clo_fits_into = echo_lock_fits_into +}; + +/** @} echo_lock */ + +/** \defgroup echo_cl_ops cl_object operations + * + * operations for cl_object + * + * @{ + */ +static int echo_page_init(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, struct page *vmpage) +{ + struct echo_page *ep = cl_object_page_slice(obj, page); + struct echo_object *eco = cl2echo_obj(obj); + + ep->ep_vmpage = vmpage; + page_cache_get(vmpage); + mutex_init(&ep->ep_lock); + cl_page_slice_add(page, &ep->ep_cl, obj, &echo_page_ops); + atomic_inc(&eco->eo_npages); + return 0; +} + +static int echo_io_init(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io) +{ + return 0; +} + +static int echo_lock_init(const struct lu_env *env, + struct cl_object *obj, struct cl_lock *lock, + const struct cl_io *unused) +{ + struct echo_lock *el; + + OBD_SLAB_ALLOC_PTR_GFP(el, echo_lock_kmem, GFP_NOFS); + if (el != NULL) { + cl_lock_slice_add(lock, &el->el_cl, obj, &echo_lock_ops); + el->el_object = cl2echo_obj(obj); + INIT_LIST_HEAD(&el->el_chain); + atomic_set(&el->el_refcount, 0); + } + return el == NULL ? -ENOMEM : 0; +} + +static int echo_conf_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf) +{ + return 0; +} + +static const struct cl_object_operations echo_cl_obj_ops = { + .coo_page_init = echo_page_init, + .coo_lock_init = echo_lock_init, + .coo_io_init = echo_io_init, + .coo_conf_set = echo_conf_set +}; +/** @} echo_cl_ops */ + +/** \defgroup echo_lu_ops lu_object operations + * + * operations for echo lu object. + * + * @{ + */ +static int echo_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf) +{ + struct echo_device *ed = cl2echo_dev(lu2cl_dev(obj->lo_dev)); + struct echo_client_obd *ec = ed->ed_ec; + struct echo_object *eco = cl2echo_obj(lu2cl(obj)); + const struct cl_object_conf *cconf; + struct echo_object_conf *econf; + + if (ed->ed_next) { + struct lu_object *below; + struct lu_device *under; + + under = ed->ed_next; + below = under->ld_ops->ldo_object_alloc(env, obj->lo_header, + under); + if (below == NULL) + return -ENOMEM; + lu_object_add(obj, below); + } + + cconf = lu2cl_conf(conf); + econf = cl2echo_conf(cconf); + + LASSERT(econf->eoc_md); + eco->eo_lsm = *econf->eoc_md; + /* clear the lsm pointer so that it won't get freed. 
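+	 * Ownership of the lov_stripe_md passed in through the object
+	 * configuration transfers to this echo_object here; the caller sees
+	 * *eoc_md == NULL afterwards and must not free it, since it is
+	 * released later via echo_free_memmd() in echo_object_free().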
*/ + *econf->eoc_md = NULL; + + eco->eo_dev = ed; + atomic_set(&eco->eo_npages, 0); + cl_object_page_init(lu2cl(obj), sizeof(struct echo_page)); + + spin_lock(&ec->ec_lock); + list_add_tail(&eco->eo_obj_chain, &ec->ec_objects); + spin_unlock(&ec->ec_lock); + + return 0; +} + +/* taken from osc_unpackmd() */ +static int echo_alloc_memmd(struct echo_device *ed, + struct lov_stripe_md **lsmp) +{ + int lsm_size; + + /* If export is lov/osc then use their obd method */ + if (ed->ed_next != NULL) + return obd_alloc_memmd(ed->ed_ec->ec_exp, lsmp); + /* OFD has no unpackmd method, do everything here */ + lsm_size = lov_stripe_md_size(1); + + LASSERT(*lsmp == NULL); + OBD_ALLOC(*lsmp, lsm_size); + if (*lsmp == NULL) + return -ENOMEM; + + OBD_ALLOC((*lsmp)->lsm_oinfo[0], sizeof(struct lov_oinfo)); + if ((*lsmp)->lsm_oinfo[0] == NULL) { + OBD_FREE(*lsmp, lsm_size); + return -ENOMEM; + } + + loi_init((*lsmp)->lsm_oinfo[0]); + (*lsmp)->lsm_maxbytes = LUSTRE_STRIPE_MAXBYTES; + ostid_set_seq_echo(&(*lsmp)->lsm_oi); + + return lsm_size; +} + +static int echo_free_memmd(struct echo_device *ed, struct lov_stripe_md **lsmp) +{ + int lsm_size; + + /* If export is lov/osc then use their obd method */ + if (ed->ed_next != NULL) + return obd_free_memmd(ed->ed_ec->ec_exp, lsmp); + /* OFD has no unpackmd method, do everything here */ + lsm_size = lov_stripe_md_size(1); + + LASSERT(*lsmp != NULL); + OBD_FREE((*lsmp)->lsm_oinfo[0], sizeof(struct lov_oinfo)); + OBD_FREE(*lsmp, lsm_size); + *lsmp = NULL; + return 0; +} + +static void echo_object_free(const struct lu_env *env, struct lu_object *obj) +{ + struct echo_object *eco = cl2echo_obj(lu2cl(obj)); + struct echo_client_obd *ec = eco->eo_dev->ed_ec; + + LASSERT(atomic_read(&eco->eo_npages) == 0); + + spin_lock(&ec->ec_lock); + list_del_init(&eco->eo_obj_chain); + spin_unlock(&ec->ec_lock); + + lu_object_fini(obj); + lu_object_header_fini(obj->lo_header); + + if (eco->eo_lsm) + echo_free_memmd(eco->eo_dev, &eco->eo_lsm); + OBD_SLAB_FREE_PTR(eco, echo_object_kmem); +} + +static int echo_object_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o) +{ + struct echo_object *obj = cl2echo_obj(lu2cl(o)); + + return (*p)(env, cookie, "echoclient-object@%p", obj); +} + +static const struct lu_object_operations echo_lu_obj_ops = { + .loo_object_init = echo_object_init, + .loo_object_delete = NULL, + .loo_object_release = NULL, + .loo_object_free = echo_object_free, + .loo_object_print = echo_object_print, + .loo_object_invariant = NULL +}; +/** @} echo_lu_ops */ + +/** \defgroup echo_lu_dev_ops lu_device operations + * + * Operations for echo lu device. + * + * @{ + */ +static struct lu_object *echo_object_alloc(const struct lu_env *env, + const struct lu_object_header *hdr, + struct lu_device *dev) +{ + struct echo_object *eco; + struct lu_object *obj = NULL; + + /* we're the top dev. 
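+	 * (hdr == NULL here, see the LASSERT below; the echo object provides
+	 * its own cl_object_header and, in echo_object_init(), allocates the
+	 * next device's object and stacks it underneath.)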
*/ + LASSERT(hdr == NULL); + OBD_SLAB_ALLOC_PTR_GFP(eco, echo_object_kmem, GFP_NOFS); + if (eco != NULL) { + struct cl_object_header *hdr = &eco->eo_hdr; + + obj = &echo_obj2cl(eco)->co_lu; + cl_object_header_init(hdr); + lu_object_init(obj, &hdr->coh_lu, dev); + lu_object_add_top(&hdr->coh_lu, obj); + + eco->eo_cl.co_ops = &echo_cl_obj_ops; + obj->lo_ops = &echo_lu_obj_ops; + } + return obj; +} + +static struct lu_device_operations echo_device_lu_ops = { + .ldo_object_alloc = echo_object_alloc, +}; + +/** @} echo_lu_dev_ops */ + +static struct cl_device_operations echo_device_cl_ops = { +}; + +/** \defgroup echo_init Setup and teardown + * + * Init and fini functions for echo client. + * + * @{ + */ +static int echo_site_init(const struct lu_env *env, struct echo_device *ed) +{ + struct cl_site *site = &ed->ed_site_myself; + int rc; + + /* initialize site */ + rc = cl_site_init(site, &ed->ed_cl); + if (rc) { + CERROR("Cannot initialize site for echo client(%d)\n", rc); + return rc; + } + + rc = lu_site_init_finish(&site->cs_lu); + if (rc) + return rc; + + ed->ed_site = site; + return 0; +} + +static void echo_site_fini(const struct lu_env *env, struct echo_device *ed) +{ + if (ed->ed_site) { + cl_site_fini(ed->ed_site); + ed->ed_site = NULL; + } +} + +static void *echo_thread_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct echo_thread_info *info; + + OBD_SLAB_ALLOC_PTR_GFP(info, echo_thread_kmem, GFP_NOFS); + if (info == NULL) + info = ERR_PTR(-ENOMEM); + return info; +} + +static void echo_thread_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct echo_thread_info *info = data; + + OBD_SLAB_FREE_PTR(info, echo_thread_kmem); +} + +static void echo_thread_key_exit(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ +} + +static struct lu_context_key echo_thread_key = { + .lct_tags = LCT_CL_THREAD, + .lct_init = echo_thread_key_init, + .lct_fini = echo_thread_key_fini, + .lct_exit = echo_thread_key_exit +}; + +static void *echo_session_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct echo_session_info *session; + + OBD_SLAB_ALLOC_PTR_GFP(session, echo_session_kmem, GFP_NOFS); + if (session == NULL) + session = ERR_PTR(-ENOMEM); + return session; +} + +static void echo_session_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct echo_session_info *session = data; + + OBD_SLAB_FREE_PTR(session, echo_session_kmem); +} + +static void echo_session_key_exit(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ +} + +static struct lu_context_key echo_session_key = { + .lct_tags = LCT_SESSION, + .lct_init = echo_session_key_init, + .lct_fini = echo_session_key_fini, + .lct_exit = echo_session_key_exit +}; + +LU_TYPE_INIT_FINI(echo, &echo_thread_key, &echo_session_key); + +static struct lu_device *echo_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg) +{ + struct lu_device *next; + struct echo_device *ed; + struct cl_device *cd; + struct obd_device *obd = NULL; /* to keep compiler happy */ + struct obd_device *tgt; + const char *tgt_type_name; + int rc; + int cleanup = 0; + + OBD_ALLOC_PTR(ed); + if (ed == NULL) { + rc = -ENOMEM; + goto out; + } + + cleanup = 1; + cd = &ed->ed_cl; + rc = cl_device_init(cd, t); + if (rc) + goto out; + + cd->cd_lu_dev.ld_ops = &echo_device_lu_ops; + cd->cd_ops = &echo_device_cl_ops; + + cleanup = 2; + obd = 
class_name2obd(lustre_cfg_string(cfg, 0)); + LASSERT(obd != NULL); + LASSERT(env != NULL); + + tgt = class_name2obd(lustre_cfg_string(cfg, 1)); + if (tgt == NULL) { + CERROR("Can not find tgt device %s\n", + lustre_cfg_string(cfg, 1)); + rc = -ENODEV; + goto out; + } + + next = tgt->obd_lu_dev; + if (!strcmp(tgt->obd_type->typ_name, LUSTRE_MDT_NAME)) { + CERROR("echo MDT client must be run on server\n"); + rc = -EOPNOTSUPP; + goto out; + } + + rc = echo_site_init(env, ed); + if (rc) + goto out; + + cleanup = 3; + + rc = echo_client_setup(env, obd, cfg); + if (rc) + goto out; + + ed->ed_ec = &obd->u.echo_client; + cleanup = 4; + + /* if echo client is to be stacked upon ost device, the next is + * NULL since ost is not a clio device so far */ + if (next != NULL && !lu_device_is_cl(next)) + next = NULL; + + tgt_type_name = tgt->obd_type->typ_name; + if (next != NULL) { + LASSERT(next != NULL); + if (next->ld_site != NULL) { + rc = -EBUSY; + goto out; + } + + next->ld_site = &ed->ed_site->cs_lu; + rc = next->ld_type->ldt_ops->ldto_device_init(env, next, + next->ld_type->ldt_name, + NULL); + if (rc) + goto out; + + /* Tricky case, I have to determine the obd type since + * CLIO uses the different parameters to initialize + * objects for lov & osc. */ + if (strcmp(tgt_type_name, LUSTRE_LOV_NAME) == 0) + ed->ed_next_islov = 1; + else + LASSERT(strcmp(tgt_type_name, + LUSTRE_OSC_NAME) == 0); + } else { + LASSERT(strcmp(tgt_type_name, LUSTRE_OST_NAME) == 0); + } + + ed->ed_next = next; + return &cd->cd_lu_dev; +out: + switch (cleanup) { + case 4: { + int rc2; + + rc2 = echo_client_cleanup(obd); + if (rc2) + CERROR("Cleanup obd device %s error(%d)\n", + obd->obd_name, rc2); + } + + case 3: + echo_site_fini(env, ed); + case 2: + cl_device_fini(&ed->ed_cl); + case 1: + OBD_FREE_PTR(ed); + case 0: + default: + break; + } + return ERR_PTR(rc); +} + +static int echo_device_init(const struct lu_env *env, struct lu_device *d, + const char *name, struct lu_device *next) +{ + LBUG(); + return 0; +} + +static struct lu_device *echo_device_fini(const struct lu_env *env, + struct lu_device *d) +{ + struct echo_device *ed = cl2echo_dev(lu2cl_dev(d)); + struct lu_device *next = ed->ed_next; + + while (next) + next = next->ld_type->ldt_ops->ldto_device_fini(env, next); + return NULL; +} + +static void echo_lock_release(const struct lu_env *env, + struct echo_lock *ecl, + int still_used) +{ + struct cl_lock *clk = echo_lock2cl(ecl); + + cl_lock_get(clk); + cl_unuse(env, clk); + cl_lock_release(env, clk, "ec enqueue", ecl->el_object); + if (!still_used) { + cl_lock_mutex_get(env, clk); + cl_lock_cancel(env, clk); + cl_lock_delete(env, clk); + cl_lock_mutex_put(env, clk); + } + cl_lock_put(env, clk); +} + +static struct lu_device *echo_device_free(const struct lu_env *env, + struct lu_device *d) +{ + struct echo_device *ed = cl2echo_dev(lu2cl_dev(d)); + struct echo_client_obd *ec = ed->ed_ec; + struct echo_object *eco; + struct lu_device *next = ed->ed_next; + + CDEBUG(D_INFO, "echo device:%p is going to be freed, next = %p\n", + ed, next); + + lu_site_purge(env, &ed->ed_site->cs_lu, -1); + + /* check if there are objects still alive. + * It shouldn't have any object because lu_site_purge would cleanup + * all of cached objects. Anyway, probably the echo device is being + * parallelly accessed. 
+ */ + spin_lock(&ec->ec_lock); + list_for_each_entry(eco, &ec->ec_objects, eo_obj_chain) + eco->eo_deleted = 1; + spin_unlock(&ec->ec_lock); + + /* purge again */ + lu_site_purge(env, &ed->ed_site->cs_lu, -1); + + CDEBUG(D_INFO, + "Waiting for the reference of echo object to be dropped\n"); + + /* Wait for the last reference to be dropped. */ + spin_lock(&ec->ec_lock); + while (!list_empty(&ec->ec_objects)) { + spin_unlock(&ec->ec_lock); + CERROR("echo_client still has objects at cleanup time, wait for 1 second\n"); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + lu_site_purge(env, &ed->ed_site->cs_lu, -1); + spin_lock(&ec->ec_lock); + } + spin_unlock(&ec->ec_lock); + + LASSERT(list_empty(&ec->ec_locks)); + + CDEBUG(D_INFO, "No object exists, exiting...\n"); + + echo_client_cleanup(d->ld_obd); + + while (next) + next = next->ld_type->ldt_ops->ldto_device_free(env, next); + + LASSERT(ed->ed_site == lu2cl_site(d->ld_site)); + echo_site_fini(env, ed); + cl_device_fini(&ed->ed_cl); + OBD_FREE_PTR(ed); + + return NULL; +} + +static const struct lu_device_type_operations echo_device_type_ops = { + .ldto_init = echo_type_init, + .ldto_fini = echo_type_fini, + + .ldto_start = echo_type_start, + .ldto_stop = echo_type_stop, + + .ldto_device_alloc = echo_device_alloc, + .ldto_device_free = echo_device_free, + .ldto_device_init = echo_device_init, + .ldto_device_fini = echo_device_fini +}; + +static struct lu_device_type echo_device_type = { + .ldt_tags = LU_DEVICE_CL, + .ldt_name = LUSTRE_ECHO_CLIENT_NAME, + .ldt_ops = &echo_device_type_ops, + .ldt_ctx_tags = LCT_CL_THREAD, +}; +/** @} echo_init */ + +/** \defgroup echo_exports Exported operations + * + * exporting functions to echo client + * + * @{ + */ + +/* Interfaces to echo client obd device */ +static struct echo_object *cl_echo_object_find(struct echo_device *d, + struct lov_stripe_md **lsmp) +{ + struct lu_env *env; + struct echo_thread_info *info; + struct echo_object_conf *conf; + struct lov_stripe_md *lsm; + struct echo_object *eco; + struct cl_object *obj; + struct lu_fid *fid; + int refcheck; + int rc; + + LASSERT(lsmp); + lsm = *lsmp; + LASSERT(lsm); + LASSERTF(ostid_id(&lsm->lsm_oi) != 0, DOSTID"\n", POSTID(&lsm->lsm_oi)); + LASSERTF(ostid_seq(&lsm->lsm_oi) == FID_SEQ_ECHO, DOSTID"\n", + POSTID(&lsm->lsm_oi)); + + /* Never return an object if the obd is to be freed. 
*/ + if (echo_dev2cl(d)->cd_lu_dev.ld_obd->obd_stopping) + return ERR_PTR(-ENODEV); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + return (void *)env; + + info = echo_env_info(env); + conf = &info->eti_conf; + if (d->ed_next) { + if (!d->ed_next_islov) { + struct lov_oinfo *oinfo = lsm->lsm_oinfo[0]; + + LASSERT(oinfo != NULL); + oinfo->loi_oi = lsm->lsm_oi; + conf->eoc_cl.u.coc_oinfo = oinfo; + } else { + struct lustre_md *md; + + md = &info->eti_md; + memset(md, 0, sizeof(*md)); + md->lsm = lsm; + conf->eoc_cl.u.coc_md = md; + } + } + conf->eoc_md = lsmp; + + fid = &info->eti_fid; + rc = ostid_to_fid(fid, &lsm->lsm_oi, 0); + if (rc != 0) { + eco = ERR_PTR(rc); + goto out; + } + + /* In the function below, .hs_keycmp resolves to + * lu_obj_hop_keycmp() */ + /* coverity[overrun-buffer-val] */ + obj = cl_object_find(env, echo_dev2cl(d), fid, &conf->eoc_cl); + if (IS_ERR(obj)) { + eco = (void *)obj; + goto out; + } + + eco = cl2echo_obj(obj); + if (eco->eo_deleted) { + cl_object_put(env, obj); + eco = ERR_PTR(-EAGAIN); + } + +out: + cl_env_put(env, &refcheck); + return eco; +} + +static int cl_echo_object_put(struct echo_object *eco) +{ + struct lu_env *env; + struct cl_object *obj = echo_obj2cl(eco); + int refcheck; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + return PTR_ERR(env); + + /* an external function to kill an object? */ + if (eco->eo_deleted) { + struct lu_object_header *loh = obj->co_lu.lo_header; + + LASSERT(&eco->eo_hdr == luh2coh(loh)); + set_bit(LU_OBJECT_HEARD_BANSHEE, &loh->loh_flags); + } + + cl_object_put(env, obj); + cl_env_put(env, &refcheck); + return 0; +} + +static int cl_echo_enqueue0(struct lu_env *env, struct echo_object *eco, + u64 start, u64 end, int mode, + __u64 *cookie, __u32 enqflags) +{ + struct cl_io *io; + struct cl_lock *lck; + struct cl_object *obj; + struct cl_lock_descr *descr; + struct echo_thread_info *info; + int rc = -ENOMEM; + + info = echo_env_info(env); + io = &info->eti_io; + descr = &info->eti_descr; + obj = echo_obj2cl(eco); + + descr->cld_obj = obj; + descr->cld_start = cl_index(obj, start); + descr->cld_end = cl_index(obj, end); + descr->cld_mode = mode == LCK_PW ? 
CLM_WRITE : CLM_READ; + descr->cld_enq_flags = enqflags; + io->ci_obj = obj; + + lck = cl_lock_request(env, io, descr, "ec enqueue", eco); + if (lck) { + struct echo_client_obd *ec = eco->eo_dev->ed_ec; + struct echo_lock *el; + + rc = cl_wait(env, lck); + if (rc == 0) { + el = cl2echo_lock(cl_lock_at(lck, &echo_device_type)); + spin_lock(&ec->ec_lock); + if (list_empty(&el->el_chain)) { + list_add(&el->el_chain, &ec->ec_locks); + el->el_cookie = ++ec->ec_unique; + } + atomic_inc(&el->el_refcount); + *cookie = el->el_cookie; + spin_unlock(&ec->ec_lock); + } else { + cl_lock_release(env, lck, "ec enqueue", current); + } + } + return rc; +} + +static int cl_echo_enqueue(struct echo_object *eco, u64 start, u64 end, + int mode, __u64 *cookie) +{ + struct echo_thread_info *info; + struct lu_env *env; + struct cl_io *io; + int refcheck; + int result; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + return PTR_ERR(env); + + info = echo_env_info(env); + io = &info->eti_io; + + io->ci_ignore_layout = 1; + result = cl_io_init(env, io, CIT_MISC, echo_obj2cl(eco)); + if (result < 0) + goto out; + LASSERT(result == 0); + + result = cl_echo_enqueue0(env, eco, start, end, mode, cookie, 0); + cl_io_fini(env, io); + +out: + cl_env_put(env, &refcheck); + return result; +} + +static int cl_echo_cancel0(struct lu_env *env, struct echo_device *ed, + __u64 cookie) +{ + struct echo_client_obd *ec = ed->ed_ec; + struct echo_lock *ecl = NULL; + struct list_head *el; + int found = 0, still_used = 0; + + LASSERT(ec != NULL); + spin_lock(&ec->ec_lock); + list_for_each(el, &ec->ec_locks) { + ecl = list_entry(el, struct echo_lock, el_chain); + CDEBUG(D_INFO, "ecl: %p, cookie: %#llx\n", ecl, ecl->el_cookie); + found = (ecl->el_cookie == cookie); + if (found) { + if (atomic_dec_and_test(&ecl->el_refcount)) + list_del_init(&ecl->el_chain); + else + still_used = 1; + break; + } + } + spin_unlock(&ec->ec_lock); + + if (!found) + return -ENOENT; + + echo_lock_release(env, ecl, still_used); + return 0; +} + +static int cl_echo_cancel(struct echo_device *ed, __u64 cookie) +{ + struct lu_env *env; + int refcheck; + int rc; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + return PTR_ERR(env); + + rc = cl_echo_cancel0(env, ed, cookie); + + cl_env_put(env, &refcheck); + return rc; +} + +static int cl_echo_async_brw(const struct lu_env *env, struct cl_io *io, + enum cl_req_type unused, struct cl_2queue *queue) +{ + struct cl_page *clp; + struct cl_page *temp; + int result = 0; + + cl_page_list_for_each_safe(clp, temp, &queue->c2_qin) { + int rc; + + rc = cl_page_cache_add(env, io, clp, CRT_WRITE); + if (rc == 0) + continue; + result = result ?: rc; + } + return result; +} + +static int cl_echo_object_brw(struct echo_object *eco, int rw, u64 offset, + struct page **pages, int npages, int async) +{ + struct lu_env *env; + struct echo_thread_info *info; + struct cl_object *obj = echo_obj2cl(eco); + struct echo_device *ed = eco->eo_dev; + struct cl_2queue *queue; + struct cl_io *io; + struct cl_page *clp; + struct lustre_handle lh = { 0 }; + int page_size = cl_page_size(obj); + int refcheck; + int rc; + int i; + + LASSERT((offset & ~CFS_PAGE_MASK) == 0); + LASSERT(ed->ed_next != NULL); + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + return PTR_ERR(env); + + info = echo_env_info(env); + io = &info->eti_io; + queue = &info->eti_queue; + + cl_2queue_init(queue); + + io->ci_ignore_layout = 1; + rc = cl_io_init(env, io, CIT_MISC, obj); + if (rc < 0) + goto out; + LASSERT(rc == 0); + + + rc = cl_echo_enqueue0(env, eco, offset, 
+ offset + npages * PAGE_CACHE_SIZE - 1, + rw == READ ? LCK_PR : LCK_PW, &lh.cookie, + CEF_NEVER); + if (rc < 0) + goto error_lock; + + for (i = 0; i < npages; i++) { + LASSERT(pages[i]); + clp = cl_page_find(env, obj, cl_index(obj, offset), + pages[i], CPT_TRANSIENT); + if (IS_ERR(clp)) { + rc = PTR_ERR(clp); + break; + } + LASSERT(clp->cp_type == CPT_TRANSIENT); + + rc = cl_page_own(env, io, clp); + if (rc) { + LASSERT(clp->cp_state == CPS_FREEING); + cl_page_put(env, clp); + break; + } + + cl_2queue_add(queue, clp); + + /* drop the reference count for cl_page_find, so that the page + * will be freed in cl_2queue_fini. */ + cl_page_put(env, clp); + cl_page_clip(env, clp, 0, page_size); + + offset += page_size; + } + + if (rc == 0) { + enum cl_req_type typ = rw == READ ? CRT_READ : CRT_WRITE; + + async = async && (typ == CRT_WRITE); + if (async) + rc = cl_echo_async_brw(env, io, typ, queue); + else + rc = cl_io_submit_sync(env, io, typ, queue, 0); + CDEBUG(D_INFO, "echo_client %s write returns %d\n", + async ? "async" : "sync", rc); + } + + cl_echo_cancel0(env, ed, lh.cookie); +error_lock: + cl_2queue_discard(env, io, queue); + cl_2queue_disown(env, io, queue); + cl_2queue_fini(env, queue); + cl_io_fini(env, io); +out: + cl_env_put(env, &refcheck); + return rc; +} +/** @} echo_exports */ + + +static u64 last_object_id; + +static int +echo_copyout_lsm(struct lov_stripe_md *lsm, void *_ulsm, int ulsm_nob) +{ + struct lov_stripe_md *ulsm = _ulsm; + int nob, i; + + nob = offsetof(struct lov_stripe_md, lsm_oinfo[lsm->lsm_stripe_count]); + if (nob > ulsm_nob) + return -EINVAL; + + if (copy_to_user(ulsm, lsm, sizeof(*ulsm))) + return -EFAULT; + + for (i = 0; i < lsm->lsm_stripe_count; i++) { + if (copy_to_user(ulsm->lsm_oinfo[i], lsm->lsm_oinfo[i], + sizeof(lsm->lsm_oinfo[0]))) + return -EFAULT; + } + return 0; +} + +static int +echo_copyin_lsm(struct echo_device *ed, struct lov_stripe_md *lsm, + void *ulsm, int ulsm_nob) +{ + struct echo_client_obd *ec = ed->ed_ec; + int i; + + if (ulsm_nob < sizeof(*lsm)) + return -EINVAL; + + if (copy_from_user(lsm, ulsm, sizeof(*lsm))) + return -EFAULT; + + if (lsm->lsm_stripe_count > ec->ec_nstripes || + lsm->lsm_magic != LOV_MAGIC || + (lsm->lsm_stripe_size & (~CFS_PAGE_MASK)) != 0 || + ((__u64)lsm->lsm_stripe_size * lsm->lsm_stripe_count > ~0UL)) + return -EINVAL; + + + for (i = 0; i < lsm->lsm_stripe_count; i++) { + if (copy_from_user(lsm->lsm_oinfo[i], + ((struct lov_stripe_md *)ulsm)-> \ + lsm_oinfo[i], + sizeof(lsm->lsm_oinfo[0]))) + return -EFAULT; + } + return 0; +} + +static int echo_create_object(const struct lu_env *env, struct echo_device *ed, + int on_target, struct obdo *oa, void *ulsm, + int ulsm_nob, struct obd_trans_info *oti) +{ + struct echo_object *eco; + struct echo_client_obd *ec = ed->ed_ec; + struct lov_stripe_md *lsm = NULL; + int rc; + int created = 0; + + if ((oa->o_valid & OBD_MD_FLID) == 0 && /* no obj id */ + (on_target || /* set_stripe */ + ec->ec_nstripes != 0)) { /* LOV */ + CERROR("No valid oid\n"); + return -EINVAL; + } + + rc = echo_alloc_memmd(ed, &lsm); + if (rc < 0) { + CERROR("Cannot allocate md: rc = %d\n", rc); + goto failed; + } + + if (ulsm != NULL) { + int i, idx; + + rc = echo_copyin_lsm(ed, lsm, ulsm, ulsm_nob); + if (rc != 0) + goto failed; + + if (lsm->lsm_stripe_count == 0) + lsm->lsm_stripe_count = ec->ec_nstripes; + + if (lsm->lsm_stripe_size == 0) + lsm->lsm_stripe_size = PAGE_CACHE_SIZE; + + idx = cfs_rand(); + + /* setup stripes: indices + default ids if required */ + for (i = 0; i < 
lsm->lsm_stripe_count; i++) { + if (ostid_id(&lsm->lsm_oinfo[i]->loi_oi) == 0) + lsm->lsm_oinfo[i]->loi_oi = lsm->lsm_oi; + + lsm->lsm_oinfo[i]->loi_ost_idx = + (idx + i) % ec->ec_nstripes; + } + } + + /* setup object ID here for !on_target and LOV hint */ + if (oa->o_valid & OBD_MD_FLID) { + LASSERT(oa->o_valid & OBD_MD_FLGROUP); + lsm->lsm_oi = oa->o_oi; + } + + if (ostid_id(&lsm->lsm_oi) == 0) + ostid_set_id(&lsm->lsm_oi, ++last_object_id); + + rc = 0; + if (on_target) { + /* Only echo objects are allowed to be created */ + LASSERT((oa->o_valid & OBD_MD_FLGROUP) && + (ostid_seq(&oa->o_oi) == FID_SEQ_ECHO)); + rc = obd_create(env, ec->ec_exp, oa, &lsm, oti); + if (rc != 0) { + CERROR("Cannot create objects: rc = %d\n", rc); + goto failed; + } + created = 1; + } + + /* See what object ID we were given */ + oa->o_oi = lsm->lsm_oi; + oa->o_valid |= OBD_MD_FLID; + + eco = cl_echo_object_find(ed, &lsm); + if (IS_ERR(eco)) { + rc = PTR_ERR(eco); + goto failed; + } + cl_echo_object_put(eco); + + CDEBUG(D_INFO, "oa oid "DOSTID"\n", POSTID(&oa->o_oi)); + + failed: + if (created && rc) + obd_destroy(env, ec->ec_exp, oa, lsm, oti, NULL, NULL); + if (lsm) + echo_free_memmd(ed, &lsm); + if (rc) + CERROR("create object failed with: rc = %d\n", rc); + return rc; +} + +static int echo_get_object(struct echo_object **ecop, struct echo_device *ed, + struct obdo *oa) +{ + struct lov_stripe_md *lsm = NULL; + struct echo_object *eco; + int rc; + + if ((oa->o_valid & OBD_MD_FLID) == 0 || ostid_id(&oa->o_oi) == 0) { + /* disallow use of object id 0 */ + CERROR("No valid oid\n"); + return -EINVAL; + } + + rc = echo_alloc_memmd(ed, &lsm); + if (rc < 0) + return rc; + + lsm->lsm_oi = oa->o_oi; + if (!(oa->o_valid & OBD_MD_FLGROUP)) + ostid_set_seq_echo(&lsm->lsm_oi); + + rc = 0; + eco = cl_echo_object_find(ed, &lsm); + if (!IS_ERR(eco)) + *ecop = eco; + else + rc = PTR_ERR(eco); + if (lsm) + echo_free_memmd(ed, &lsm); + return rc; +} + +static void echo_put_object(struct echo_object *eco) +{ + if (cl_echo_object_put(eco)) + CERROR("echo client: drop an object failed"); +} + +static void +echo_get_stripe_off_id(struct lov_stripe_md *lsm, u64 *offp, u64 *idp) +{ + unsigned long stripe_count; + unsigned long stripe_size; + unsigned long width; + unsigned long woffset; + int stripe_index; + u64 offset; + + if (lsm->lsm_stripe_count <= 1) + return; + + offset = *offp; + stripe_size = lsm->lsm_stripe_size; + stripe_count = lsm->lsm_stripe_count; + + /* width = # bytes in all stripes */ + width = stripe_size * stripe_count; + + /* woffset = offset within a width; offset = whole number of widths */ + woffset = do_div(offset, width); + + stripe_index = woffset / stripe_size; + + *idp = ostid_id(&lsm->lsm_oinfo[stripe_index]->loi_oi); + *offp = offset * stripe_size + woffset % stripe_size; +} + +static void +echo_client_page_debug_setup(struct lov_stripe_md *lsm, + struct page *page, int rw, u64 id, + u64 offset, u64 count) +{ + char *addr; + u64 stripe_off; + u64 stripe_id; + int delta; + + /* no partial pages on the client */ + LASSERT(count == PAGE_CACHE_SIZE); + + addr = kmap(page); + + for (delta = 0; delta < PAGE_CACHE_SIZE; delta += OBD_ECHO_BLOCK_SIZE) { + if (rw == OBD_BRW_WRITE) { + stripe_off = offset + delta; + stripe_id = id; + echo_get_stripe_off_id(lsm, &stripe_off, &stripe_id); + } else { + stripe_off = 0xdeadbeef00c0ffeeULL; + stripe_id = 0xdeadbeef00c0ffeeULL; + } + block_debug_setup(addr + delta, OBD_ECHO_BLOCK_SIZE, + stripe_off, stripe_id); + } + + kunmap(page); +} + +static int 
echo_client_page_debug_check(struct lov_stripe_md *lsm, + struct page *page, u64 id, + u64 offset, u64 count) +{ + u64 stripe_off; + u64 stripe_id; + char *addr; + int delta; + int rc; + int rc2; + + /* no partial pages on the client */ + LASSERT(count == PAGE_CACHE_SIZE); + + addr = kmap(page); + + for (rc = delta = 0; delta < PAGE_CACHE_SIZE; delta += OBD_ECHO_BLOCK_SIZE) { + stripe_off = offset + delta; + stripe_id = id; + echo_get_stripe_off_id(lsm, &stripe_off, &stripe_id); + + rc2 = block_debug_check("test_brw", + addr + delta, OBD_ECHO_BLOCK_SIZE, + stripe_off, stripe_id); + if (rc2 != 0) { + CERROR("Error in echo object %#llx\n", id); + rc = rc2; + } + } + + kunmap(page); + return rc; +} + +static int echo_client_kbrw(struct echo_device *ed, int rw, struct obdo *oa, + struct echo_object *eco, u64 offset, + u64 count, int async, + struct obd_trans_info *oti) +{ + struct lov_stripe_md *lsm = eco->eo_lsm; + u32 npages; + struct brw_page *pga; + struct brw_page *pgp; + struct page **pages; + u64 off; + int i; + int rc; + int verify; + gfp_t gfp_mask; + int brw_flags = 0; + + verify = (ostid_id(&oa->o_oi) != ECHO_PERSISTENT_OBJID && + (oa->o_valid & OBD_MD_FLFLAGS) != 0 && + (oa->o_flags & OBD_FL_DEBUG_CHECK) != 0); + + gfp_mask = ((ostid_id(&oa->o_oi) & 2) == 0) ? GFP_IOFS : GFP_HIGHUSER; + + LASSERT(rw == OBD_BRW_WRITE || rw == OBD_BRW_READ); + LASSERT(lsm != NULL); + LASSERT(ostid_id(&lsm->lsm_oi) == ostid_id(&oa->o_oi)); + + if (count <= 0 || + (count & (~CFS_PAGE_MASK)) != 0) + return -EINVAL; + + /* XXX think again with misaligned I/O */ + npages = count >> PAGE_CACHE_SHIFT; + + if (rw == OBD_BRW_WRITE) + brw_flags = OBD_BRW_ASYNC; + + OBD_ALLOC(pga, npages * sizeof(*pga)); + if (pga == NULL) + return -ENOMEM; + + OBD_ALLOC(pages, npages * sizeof(*pages)); + if (pages == NULL) { + OBD_FREE(pga, npages * sizeof(*pga)); + return -ENOMEM; + } + + for (i = 0, pgp = pga, off = offset; + i < npages; + i++, pgp++, off += PAGE_CACHE_SIZE) { + + LASSERT(pgp->pg == NULL); /* for cleanup */ + + rc = -ENOMEM; + OBD_PAGE_ALLOC(pgp->pg, gfp_mask); + if (pgp->pg == NULL) + goto out; + + pages[i] = pgp->pg; + pgp->count = PAGE_CACHE_SIZE; + pgp->off = off; + pgp->flag = brw_flags; + + if (verify) + echo_client_page_debug_setup(lsm, pgp->pg, rw, + ostid_id(&oa->o_oi), off, + pgp->count); + } + + /* brw mode can only be used at client */ + LASSERT(ed->ed_next != NULL); + rc = cl_echo_object_brw(eco, rw, offset, pages, npages, async); + + out: + if (rc != 0 || rw != OBD_BRW_READ) + verify = 0; + + for (i = 0, pgp = pga; i < npages; i++, pgp++) { + if (pgp->pg == NULL) + continue; + + if (verify) { + int vrc; + + vrc = echo_client_page_debug_check(lsm, pgp->pg, + ostid_id(&oa->o_oi), + pgp->off, pgp->count); + if (vrc != 0 && rc == 0) + rc = vrc; + } + OBD_PAGE_FREE(pgp->pg); + } + OBD_FREE(pga, npages * sizeof(*pga)); + OBD_FREE(pages, npages * sizeof(*pages)); + return rc; +} + +static int echo_client_prep_commit(const struct lu_env *env, + struct obd_export *exp, int rw, + struct obdo *oa, struct echo_object *eco, + u64 offset, u64 count, + u64 batch, struct obd_trans_info *oti, + int async) +{ + struct lov_stripe_md *lsm = eco->eo_lsm; + struct obd_ioobj ioo; + struct niobuf_local *lnb; + struct niobuf_remote *rnb; + u64 off; + u64 npages, tot_pages; + int i, ret = 0, brw_flags = 0; + + if (count <= 0 || (count & (~CFS_PAGE_MASK)) != 0 || + (lsm != NULL && ostid_id(&lsm->lsm_oi) != ostid_id(&oa->o_oi))) + return -EINVAL; + + npages = batch >> PAGE_CACHE_SHIFT; + tot_pages = count >> 
PAGE_CACHE_SHIFT; + + OBD_ALLOC(lnb, npages * sizeof(struct niobuf_local)); + OBD_ALLOC(rnb, npages * sizeof(struct niobuf_remote)); + + if (lnb == NULL || rnb == NULL) { + ret = -ENOMEM; + goto out; + } + + if (rw == OBD_BRW_WRITE && async) + brw_flags |= OBD_BRW_ASYNC; + + obdo_to_ioobj(oa, &ioo); + + off = offset; + + for (; tot_pages; tot_pages -= npages) { + int lpages; + + if (tot_pages < npages) + npages = tot_pages; + + for (i = 0; i < npages; i++, off += PAGE_CACHE_SIZE) { + rnb[i].offset = off; + rnb[i].len = PAGE_CACHE_SIZE; + rnb[i].flags = brw_flags; + } + + ioo.ioo_bufcnt = npages; + oti->oti_transno = 0; + + lpages = npages; + ret = obd_preprw(env, rw, exp, oa, 1, &ioo, rnb, &lpages, + lnb, oti, NULL); + if (ret != 0) + goto out; + LASSERT(lpages == npages); + + for (i = 0; i < lpages; i++) { + struct page *page = lnb[i].page; + + /* read past eof? */ + if (page == NULL && lnb[i].rc == 0) + continue; + + if (async) + lnb[i].flags |= OBD_BRW_ASYNC; + + if (ostid_id(&oa->o_oi) == ECHO_PERSISTENT_OBJID || + (oa->o_valid & OBD_MD_FLFLAGS) == 0 || + (oa->o_flags & OBD_FL_DEBUG_CHECK) == 0) + continue; + + if (rw == OBD_BRW_WRITE) + echo_client_page_debug_setup(lsm, page, rw, + ostid_id(&oa->o_oi), + rnb[i].offset, + rnb[i].len); + else + echo_client_page_debug_check(lsm, page, + ostid_id(&oa->o_oi), + rnb[i].offset, + rnb[i].len); + } + + ret = obd_commitrw(env, rw, exp, oa, 1, &ioo, + rnb, npages, lnb, oti, ret); + if (ret != 0) + goto out; + + /* Reset oti otherwise it would confuse ldiskfs. */ + memset(oti, 0, sizeof(*oti)); + + /* Reuse env context. */ + lu_context_exit((struct lu_context *)&env->le_ctx); + lu_context_enter((struct lu_context *)&env->le_ctx); + } + +out: + if (lnb) + OBD_FREE(lnb, npages * sizeof(struct niobuf_local)); + if (rnb) + OBD_FREE(rnb, npages * sizeof(struct niobuf_remote)); + return ret; +} + +static int echo_client_brw_ioctl(const struct lu_env *env, int rw, + struct obd_export *exp, + struct obd_ioctl_data *data, + struct obd_trans_info *dummy_oti) +{ + struct obd_device *obd = class_exp2obd(exp); + struct echo_device *ed = obd2echo_dev(obd); + struct echo_client_obd *ec = ed->ed_ec; + struct obdo *oa = &data->ioc_obdo1; + struct echo_object *eco; + int rc; + int async = 1; + long test_mode; + + LASSERT(oa->o_valid & OBD_MD_FLGROUP); + + rc = echo_get_object(&eco, ed, oa); + if (rc) + return rc; + + oa->o_valid &= ~OBD_MD_FLHANDLE; + + /* OFD/obdfilter works only via prep/commit */ + test_mode = (long)data->ioc_pbuf1; + if (test_mode == 1) + async = 0; + + if (ed->ed_next == NULL && test_mode != 3) { + test_mode = 3; + data->ioc_plen1 = data->ioc_count; + } + + /* Truncate batch size to maximum */ + if (data->ioc_plen1 > PTLRPC_MAX_BRW_SIZE) + data->ioc_plen1 = PTLRPC_MAX_BRW_SIZE; + + switch (test_mode) { + case 1: + /* fall through */ + case 2: + rc = echo_client_kbrw(ed, rw, oa, + eco, data->ioc_offset, + data->ioc_count, async, dummy_oti); + break; + case 3: + rc = echo_client_prep_commit(env, ec->ec_exp, rw, oa, + eco, data->ioc_offset, + data->ioc_count, data->ioc_plen1, + dummy_oti, async); + break; + default: + rc = -EINVAL; + } + echo_put_object(eco); + return rc; +} + +static int +echo_client_enqueue(struct obd_export *exp, struct obdo *oa, + int mode, u64 offset, u64 nob) +{ + struct echo_device *ed = obd2echo_dev(exp->exp_obd); + struct lustre_handle *ulh = &oa->o_handle; + struct echo_object *eco; + u64 end; + int rc; + + if (ed->ed_next == NULL) + return -EOPNOTSUPP; + + if (!(mode == LCK_PR || mode == LCK_PW)) + return -EINVAL; + + 
if ((offset & (~CFS_PAGE_MASK)) != 0 || + (nob & (~CFS_PAGE_MASK)) != 0) + return -EINVAL; + + rc = echo_get_object(&eco, ed, oa); + if (rc != 0) + return rc; + + end = (nob == 0) ? ((u64) -1) : (offset + nob - 1); + rc = cl_echo_enqueue(eco, offset, end, mode, &ulh->cookie); + if (rc == 0) { + oa->o_valid |= OBD_MD_FLHANDLE; + CDEBUG(D_INFO, "Cookie is %#llx\n", ulh->cookie); + } + echo_put_object(eco); + return rc; +} + +static int +echo_client_cancel(struct obd_export *exp, struct obdo *oa) +{ + struct echo_device *ed = obd2echo_dev(exp->exp_obd); + __u64 cookie = oa->o_handle.cookie; + + if ((oa->o_valid & OBD_MD_FLHANDLE) == 0) + return -EINVAL; + + CDEBUG(D_INFO, "Cookie is %#llx\n", cookie); + return cl_echo_cancel(ed, cookie); +} + +static int +echo_client_iocontrol(unsigned int cmd, struct obd_export *exp, int len, + void *karg, void *uarg) +{ + struct obd_device *obd = exp->exp_obd; + struct echo_device *ed = obd2echo_dev(obd); + struct echo_client_obd *ec = ed->ed_ec; + struct echo_object *eco; + struct obd_ioctl_data *data = karg; + struct obd_trans_info dummy_oti; + struct lu_env *env; + struct oti_req_ack_lock *ack_lock; + struct obdo *oa; + struct lu_fid fid; + int rw = OBD_BRW_READ; + int rc = 0; + int i; + + memset(&dummy_oti, 0, sizeof(dummy_oti)); + + oa = &data->ioc_obdo1; + if (!(oa->o_valid & OBD_MD_FLGROUP)) { + oa->o_valid |= OBD_MD_FLGROUP; + ostid_set_seq_echo(&oa->o_oi); + } + + /* This FID is unpacked just for validation at this point */ + rc = ostid_to_fid(&fid, &oa->o_oi, 0); + if (rc < 0) + return rc; + + OBD_ALLOC_PTR(env); + if (env == NULL) + return -ENOMEM; + + rc = lu_env_init(env, LCT_DT_THREAD); + if (rc) { + rc = -ENOMEM; + goto out; + } + + switch (cmd) { + case OBD_IOC_CREATE: /* may create echo object */ + if (!capable(CFS_CAP_SYS_ADMIN)) { + rc = -EPERM; + goto out; + } + + rc = echo_create_object(env, ed, 1, oa, data->ioc_pbuf1, + data->ioc_plen1, &dummy_oti); + goto out; + + case OBD_IOC_DESTROY: + if (!capable(CFS_CAP_SYS_ADMIN)) { + rc = -EPERM; + goto out; + } + + rc = echo_get_object(&eco, ed, oa); + if (rc == 0) { + rc = obd_destroy(env, ec->ec_exp, oa, eco->eo_lsm, + &dummy_oti, NULL, NULL); + if (rc == 0) + eco->eo_deleted = 1; + echo_put_object(eco); + } + goto out; + + case OBD_IOC_GETATTR: + rc = echo_get_object(&eco, ed, oa); + if (rc == 0) { + struct obd_info oinfo = { { { 0 } } }; + + oinfo.oi_md = eco->eo_lsm; + oinfo.oi_oa = oa; + rc = obd_getattr(env, ec->ec_exp, &oinfo); + echo_put_object(eco); + } + goto out; + + case OBD_IOC_SETATTR: + if (!capable(CFS_CAP_SYS_ADMIN)) { + rc = -EPERM; + goto out; + } + + rc = echo_get_object(&eco, ed, oa); + if (rc == 0) { + struct obd_info oinfo = { { { 0 } } }; + + oinfo.oi_oa = oa; + oinfo.oi_md = eco->eo_lsm; + + rc = obd_setattr(env, ec->ec_exp, &oinfo, NULL); + echo_put_object(eco); + } + goto out; + + case OBD_IOC_BRW_WRITE: + if (!capable(CFS_CAP_SYS_ADMIN)) { + rc = -EPERM; + goto out; + } + + rw = OBD_BRW_WRITE; + /* fall through */ + case OBD_IOC_BRW_READ: + rc = echo_client_brw_ioctl(env, rw, exp, data, &dummy_oti); + goto out; + + case ECHO_IOC_GET_STRIPE: + rc = echo_get_object(&eco, ed, oa); + if (rc == 0) { + rc = echo_copyout_lsm(eco->eo_lsm, data->ioc_pbuf1, + data->ioc_plen1); + echo_put_object(eco); + } + goto out; + + case ECHO_IOC_SET_STRIPE: + if (!capable(CFS_CAP_SYS_ADMIN)) { + rc = -EPERM; + goto out; + } + + if (data->ioc_pbuf1 == NULL) { /* unset */ + rc = echo_get_object(&eco, ed, oa); + if (rc == 0) { + eco->eo_deleted = 1; + echo_put_object(eco); + } + } else { + 
rc = echo_create_object(env, ed, 0, oa, + data->ioc_pbuf1, + data->ioc_plen1, &dummy_oti); + } + goto out; + + case ECHO_IOC_ENQUEUE: + if (!capable(CFS_CAP_SYS_ADMIN)) { + rc = -EPERM; + goto out; + } + + rc = echo_client_enqueue(exp, oa, + data->ioc_conn1, /* lock mode */ + data->ioc_offset, + data->ioc_count);/*extent*/ + goto out; + + case ECHO_IOC_CANCEL: + rc = echo_client_cancel(exp, oa); + goto out; + + default: + CERROR("echo_ioctl(): unrecognised ioctl %#x\n", cmd); + rc = -ENOTTY; + goto out; + } + +out: + lu_env_fini(env); + OBD_FREE_PTR(env); + + /* XXX this should be in a helper also called by target_send_reply */ + for (ack_lock = dummy_oti.oti_ack_locks, i = 0; i < 4; + i++, ack_lock++) { + if (!ack_lock->mode) + break; + ldlm_lock_decref(&ack_lock->lock, ack_lock->mode); + } + + return rc; +} + +static int echo_client_setup(const struct lu_env *env, + struct obd_device *obddev, struct lustre_cfg *lcfg) +{ + struct echo_client_obd *ec = &obddev->u.echo_client; + struct obd_device *tgt; + struct obd_uuid echo_uuid = { "ECHO_UUID" }; + struct obd_connect_data *ocd = NULL; + int rc; + + if (lcfg->lcfg_bufcount < 2 || LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) { + CERROR("requires a TARGET OBD name\n"); + return -EINVAL; + } + + tgt = class_name2obd(lustre_cfg_string(lcfg, 1)); + if (!tgt || !tgt->obd_attached || !tgt->obd_set_up) { + CERROR("device not attached or not set up (%s)\n", + lustre_cfg_string(lcfg, 1)); + return -EINVAL; + } + + spin_lock_init(&ec->ec_lock); + INIT_LIST_HEAD(&ec->ec_objects); + INIT_LIST_HEAD(&ec->ec_locks); + ec->ec_unique = 0; + ec->ec_nstripes = 0; + + OBD_ALLOC(ocd, sizeof(*ocd)); + if (ocd == NULL) { + CERROR("Can't alloc ocd connecting to %s\n", + lustre_cfg_string(lcfg, 1)); + return -ENOMEM; + } + + ocd->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_REQPORTAL | + OBD_CONNECT_BRW_SIZE | + OBD_CONNECT_GRANT | OBD_CONNECT_FULL20 | + OBD_CONNECT_64BITHASH | OBD_CONNECT_LVB_TYPE | + OBD_CONNECT_FID; + ocd->ocd_brw_size = DT_MAX_BRW_SIZE; + ocd->ocd_version = LUSTRE_VERSION_CODE; + ocd->ocd_group = FID_SEQ_ECHO; + + rc = obd_connect(env, &ec->ec_exp, tgt, &echo_uuid, ocd, NULL); + if (rc == 0) { + /* Turn off pinger because it connects to tgt obd directly. 
*/ + spin_lock(&tgt->obd_dev_lock); + list_del_init(&ec->ec_exp->exp_obd_chain_timed); + spin_unlock(&tgt->obd_dev_lock); + } + + OBD_FREE(ocd, sizeof(*ocd)); + + if (rc != 0) { + CERROR("fail to connect to device %s\n", + lustre_cfg_string(lcfg, 1)); + return rc; + } + + return rc; +} + +static int echo_client_cleanup(struct obd_device *obddev) +{ + struct echo_client_obd *ec = &obddev->u.echo_client; + int rc; + + if (!list_empty(&obddev->obd_exports)) { + CERROR("still has clients!\n"); + return -EBUSY; + } + + LASSERT(atomic_read(&ec->ec_exp->exp_refcount) > 0); + rc = obd_disconnect(ec->ec_exp); + if (rc != 0) + CERROR("fail to disconnect device: %d\n", rc); + + return rc; +} + +static int echo_client_connect(const struct lu_env *env, + struct obd_export **exp, + struct obd_device *src, struct obd_uuid *cluuid, + struct obd_connect_data *data, void *localdata) +{ + int rc; + struct lustre_handle conn = { 0 }; + + rc = class_connect(&conn, src, cluuid); + if (rc == 0) { + *exp = class_conn2export(&conn); + } + + return rc; +} + +static int echo_client_disconnect(struct obd_export *exp) +{ + int rc; + + if (exp == NULL) { + rc = -EINVAL; + goto out; + } + + rc = class_disconnect(exp); + goto out; + out: + return rc; +} + +static struct obd_ops echo_client_obd_ops = { + .o_owner = THIS_MODULE, + .o_iocontrol = echo_client_iocontrol, + .o_connect = echo_client_connect, + .o_disconnect = echo_client_disconnect +}; + +int echo_client_init(void) +{ + struct lprocfs_static_vars lvars = { NULL }; + int rc; + + lprocfs_echo_init_vars(&lvars); + + rc = lu_kmem_init(echo_caches); + if (rc == 0) { + rc = class_register_type(&echo_client_obd_ops, NULL, + lvars.module_vars, + LUSTRE_ECHO_CLIENT_NAME, + &echo_device_type); + if (rc) + lu_kmem_fini(echo_caches); + } + return rc; +} + +void echo_client_exit(void) +{ + class_unregister_type(LUSTRE_ECHO_CLIENT_NAME); + lu_kmem_fini(echo_caches); +} + +static int __init obdecho_init(void) +{ + struct lprocfs_static_vars lvars; + + LCONSOLE_INFO("Echo OBD driver; http://www.lustre.org/\n"); + + LASSERT(PAGE_CACHE_SIZE % OBD_ECHO_BLOCK_SIZE == 0); + + lprocfs_echo_init_vars(&lvars); + + + return echo_client_init(); +} + +static void /*__exit*/ obdecho_exit(void) +{ + echo_client_exit(); + +} + +MODULE_AUTHOR("Sun Microsystems, Inc. "); +MODULE_DESCRIPTION("Lustre Testing Echo OBD driver"); +MODULE_LICENSE("GPL"); +MODULE_VERSION(LUSTRE_VERSION_STRING); + +module_init(obdecho_init); +module_exit(obdecho_exit); + +/** @} echo_client */ diff --git a/kernel/drivers/staging/lustre/lustre/obdecho/echo_internal.h b/kernel/drivers/staging/lustre/lustre/obdecho/echo_internal.h new file mode 100644 index 000000000..8e9dbc235 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdecho/echo_internal.h @@ -0,0 +1,47 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Whamcloud, Inc. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdecho/echo_internal.h + */ + +#ifndef _ECHO_INTERNAL_H +#define _ECHO_INTERNAL_H + +/* The persistent object (i.e. actually stores stuff!) */ +#define ECHO_PERSISTENT_OBJID 1ULL +#define ECHO_PERSISTENT_SIZE ((__u64)(1<<20)) + +/* block size to use for data verification */ +#define OBD_ECHO_BLOCK_SIZE (4<<10) + + +#endif diff --git a/kernel/drivers/staging/lustre/lustre/obdecho/lproc_echo.c b/kernel/drivers/staging/lustre/lustre/obdecho/lproc_echo.c new file mode 100644 index 000000000..0beb97db7 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/obdecho/lproc_echo.c @@ -0,0 +1,57 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ +#define DEBUG_SUBSYSTEM S_ECHO + +#include "../include/lprocfs_status.h" +#include "../include/obd_class.h" + +#if defined(CONFIG_PROC_FS) +LPROC_SEQ_FOPS_RO_TYPE(echo, uuid); +static struct lprocfs_vars lprocfs_echo_obd_vars[] = { + { "uuid", &echo_uuid_fops, NULL, 0 }, + { NULL } +}; + +LPROC_SEQ_FOPS_RO_TYPE(echo, numrefs); +static struct lprocfs_vars lprocfs_echo_module_vars[] = { + { "num_refs", &echo_numrefs_fops, NULL, 0 }, + { NULL } +}; + +void lprocfs_echo_init_vars(struct lprocfs_static_vars *lvars) +{ + lvars->module_vars = lprocfs_echo_module_vars; + lvars->obd_vars = lprocfs_echo_obd_vars; +} +#endif /* CONFIG_PROC_FS */ diff --git a/kernel/drivers/staging/lustre/lustre/osc/Makefile b/kernel/drivers/staging/lustre/lustre/osc/Makefile new file mode 100644 index 000000000..54927fba4 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/osc/Makefile @@ -0,0 +1,4 @@ +obj-$(CONFIG_LUSTRE_FS) += osc.o +osc-y := osc_request.o osc_dev.o osc_object.o \ + osc_page.o osc_lock.o osc_io.o osc_quota.o osc_cache.o +osc-$(CONFIG_PROC_FS) += lproc_osc.o diff --git a/kernel/drivers/staging/lustre/lustre/osc/lproc_osc.c b/kernel/drivers/staging/lustre/lustre/osc/lproc_osc.c new file mode 100644 index 000000000..15a662098 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/osc/lproc_osc.c @@ -0,0 +1,751 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include "../include/obd_cksum.h" +#include "../include/obd_class.h" +#include "../include/lprocfs_status.h" +#include +#include "osc_internal.h" + +static int osc_active_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = m->private; + + LPROCFS_CLIMP_CHECK(dev); + seq_printf(m, "%d\n", !dev->u.cli.cl_import->imp_deactive); + LPROCFS_CLIMP_EXIT(dev); + + return 0; +} + +static ssize_t osc_active_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *dev = ((struct seq_file *)file->private_data)->private; + int val, rc; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + if (val < 0 || val > 1) + return -ERANGE; + + /* opposite senses */ + if (dev->u.cli.cl_import->imp_deactive == val) + rc = ptlrpc_set_import_active(dev->u.cli.cl_import, val); + else + CDEBUG(D_CONFIG, "activate %d: ignoring repeat request\n", val); + + return count; +} +LPROC_SEQ_FOPS(osc_active); + +static int osc_max_rpcs_in_flight_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = m->private; + struct client_obd *cli = &dev->u.cli; + + client_obd_list_lock(&cli->cl_loi_list_lock); + seq_printf(m, "%u\n", cli->cl_max_rpcs_in_flight); + client_obd_list_unlock(&cli->cl_loi_list_lock); + + return 0; +} + +static ssize_t osc_max_rpcs_in_flight_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *dev = ((struct seq_file *)file->private_data)->private; + struct client_obd *cli = &dev->u.cli; + struct ptlrpc_request_pool *pool = cli->cl_import->imp_rq_pool; + int val, rc; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + if (val < 1 || val > OSC_MAX_RIF_MAX) + return -ERANGE; + + LPROCFS_CLIMP_CHECK(dev); + if (pool && val > cli->cl_max_rpcs_in_flight) + pool->prp_populate(pool, val-cli->cl_max_rpcs_in_flight); + + client_obd_list_lock(&cli->cl_loi_list_lock); + cli->cl_max_rpcs_in_flight = val; + client_obd_list_unlock(&cli->cl_loi_list_lock); + + LPROCFS_CLIMP_EXIT(dev); + return count; +} +LPROC_SEQ_FOPS(osc_max_rpcs_in_flight); + +static int osc_max_dirty_mb_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = m->private; + struct client_obd *cli = &dev->u.cli; + long val; + int mult; + + client_obd_list_lock(&cli->cl_loi_list_lock); + val = cli->cl_dirty_max; + client_obd_list_unlock(&cli->cl_loi_list_lock); + + mult = 1 << 20; + return lprocfs_seq_read_frac_helper(m, val, mult); +} + +static ssize_t osc_max_dirty_mb_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *dev = ((struct seq_file *)file->private_data)->private; + struct client_obd *cli = &dev->u.cli; + int pages_number, mult, rc; + + mult = 1 << (20 - PAGE_CACHE_SHIFT); + rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult); + if (rc) + return rc; + + if (pages_number <= 0 || + pages_number > OSC_MAX_DIRTY_MB_MAX << (20 - PAGE_CACHE_SHIFT) || + pages_number > totalram_pages / 4) /* 1/4 of RAM */ + return -ERANGE; + + client_obd_list_lock(&cli->cl_loi_list_lock); + cli->cl_dirty_max = (u32)(pages_number << PAGE_CACHE_SHIFT); + osc_wake_cache_waiters(cli); + client_obd_list_unlock(&cli->cl_loi_list_lock); + + return count; +} +LPROC_SEQ_FOPS(osc_max_dirty_mb); + +static int osc_cached_mb_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = m->private; + struct client_obd *cli = &dev->u.cli; + int shift = 20 - 
PAGE_CACHE_SHIFT; + + seq_printf(m, + "used_mb: %d\n" + "busy_cnt: %d\n", + (atomic_read(&cli->cl_lru_in_list) + + atomic_read(&cli->cl_lru_busy)) >> shift, + atomic_read(&cli->cl_lru_busy)); + + return 0; +} + +/* shrink the number of caching pages to a specific number */ +static ssize_t osc_cached_mb_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *dev = ((struct seq_file *)file->private_data)->private; + struct client_obd *cli = &dev->u.cli; + int pages_number, mult, rc; + char kernbuf[128]; + + if (count >= sizeof(kernbuf)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + kernbuf[count] = 0; + + mult = 1 << (20 - PAGE_CACHE_SHIFT); + buffer += lprocfs_find_named_value(kernbuf, "used_mb:", &count) - + kernbuf; + rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult); + if (rc) + return rc; + + if (pages_number < 0) + return -ERANGE; + + rc = atomic_read(&cli->cl_lru_in_list) - pages_number; + if (rc > 0) + (void)osc_lru_shrink(cli, rc); + + return count; +} +LPROC_SEQ_FOPS(osc_cached_mb); + +static int osc_cur_dirty_bytes_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = m->private; + struct client_obd *cli = &dev->u.cli; + + client_obd_list_lock(&cli->cl_loi_list_lock); + seq_printf(m, "%lu\n", cli->cl_dirty); + client_obd_list_unlock(&cli->cl_loi_list_lock); + + return 0; +} +LPROC_SEQ_FOPS_RO(osc_cur_dirty_bytes); + +static int osc_cur_grant_bytes_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = m->private; + struct client_obd *cli = &dev->u.cli; + + client_obd_list_lock(&cli->cl_loi_list_lock); + seq_printf(m, "%lu\n", cli->cl_avail_grant); + client_obd_list_unlock(&cli->cl_loi_list_lock); + + return 0; +} + +static ssize_t osc_cur_grant_bytes_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *obd = ((struct seq_file *)file->private_data)->private; + struct client_obd *cli = &obd->u.cli; + int rc; + __u64 val; + + if (obd == NULL) + return 0; + + rc = lprocfs_write_u64_helper(buffer, count, &val); + if (rc) + return rc; + + /* this is only for shrinking grant */ + client_obd_list_lock(&cli->cl_loi_list_lock); + if (val >= cli->cl_avail_grant) { + client_obd_list_unlock(&cli->cl_loi_list_lock); + return 0; + } + client_obd_list_unlock(&cli->cl_loi_list_lock); + + LPROCFS_CLIMP_CHECK(obd); + if (cli->cl_import->imp_state == LUSTRE_IMP_FULL) + rc = osc_shrink_grant_to_target(cli, val); + LPROCFS_CLIMP_EXIT(obd); + if (rc) + return rc; + return count; +} +LPROC_SEQ_FOPS(osc_cur_grant_bytes); + +static int osc_cur_lost_grant_bytes_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = m->private; + struct client_obd *cli = &dev->u.cli; + + client_obd_list_lock(&cli->cl_loi_list_lock); + seq_printf(m, "%lu\n", cli->cl_lost_grant); + client_obd_list_unlock(&cli->cl_loi_list_lock); + + return 0; +} +LPROC_SEQ_FOPS_RO(osc_cur_lost_grant_bytes); + +static int osc_grant_shrink_interval_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *obd = m->private; + + if (obd == NULL) + return 0; + seq_printf(m, "%d\n", obd->u.cli.cl_grant_shrink_interval); + return 0; +} + +static ssize_t osc_grant_shrink_interval_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *obd = ((struct seq_file *)file->private_data)->private; + int val, rc; + + if (obd == NULL) + return 0; + + rc = lprocfs_write_helper(buffer, count, &val); 
+ if (rc) + return rc; + + if (val <= 0) + return -ERANGE; + + obd->u.cli.cl_grant_shrink_interval = val; + + return count; +} +LPROC_SEQ_FOPS(osc_grant_shrink_interval); + +static int osc_checksum_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *obd = m->private; + + if (obd == NULL) + return 0; + + seq_printf(m, "%d\n", obd->u.cli.cl_checksum ? 1 : 0); + return 0; +} + +static ssize_t osc_checksum_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *obd = ((struct seq_file *)file->private_data)->private; + int val, rc; + + if (obd == NULL) + return 0; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + obd->u.cli.cl_checksum = (val ? 1 : 0); + + return count; +} +LPROC_SEQ_FOPS(osc_checksum); + +static int osc_checksum_type_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *obd = m->private; + int i; + DECLARE_CKSUM_NAME; + + if (obd == NULL) + return 0; + + for (i = 0; i < ARRAY_SIZE(cksum_name); i++) { + if (((1 << i) & obd->u.cli.cl_supp_cksum_types) == 0) + continue; + if (obd->u.cli.cl_cksum_type == (1 << i)) + seq_printf(m, "[%s] ", cksum_name[i]); + else + seq_printf(m, "%s ", cksum_name[i]); + } + seq_putc(m, '\n'); + return 0; +} + +static ssize_t osc_checksum_type_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *obd = ((struct seq_file *)file->private_data)->private; + int i; + DECLARE_CKSUM_NAME; + char kernbuf[10]; + + if (obd == NULL) + return 0; + + if (count > sizeof(kernbuf) - 1) + return -EINVAL; + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + if (count > 0 && kernbuf[count - 1] == '\n') + kernbuf[count - 1] = '\0'; + else + kernbuf[count] = '\0'; + + for (i = 0; i < ARRAY_SIZE(cksum_name); i++) { + if (((1 << i) & obd->u.cli.cl_supp_cksum_types) == 0) + continue; + if (!strcmp(kernbuf, cksum_name[i])) { + obd->u.cli.cl_cksum_type = 1 << i; + return count; + } + } + return -EINVAL; +} +LPROC_SEQ_FOPS(osc_checksum_type); + +static int osc_resend_count_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *obd = m->private; + + seq_printf(m, "%u\n", atomic_read(&obd->u.cli.cl_resends)); + return 0; +} + +static ssize_t osc_resend_count_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *obd = ((struct seq_file *)file->private_data)->private; + int val, rc; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + if (val < 0) + return -EINVAL; + + atomic_set(&obd->u.cli.cl_resends, val); + + return count; +} +LPROC_SEQ_FOPS(osc_resend_count); + +static int osc_contention_seconds_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *obd = m->private; + struct osc_device *od = obd2osc_dev(obd); + + seq_printf(m, "%u\n", od->od_contention_time); + return 0; +} + +static ssize_t osc_contention_seconds_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *obd = ((struct seq_file *)file->private_data)->private; + struct osc_device *od = obd2osc_dev(obd); + + return lprocfs_write_helper(buffer, count, &od->od_contention_time) ?: + count; +} +LPROC_SEQ_FOPS(osc_contention_seconds); + +static int osc_lockless_truncate_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *obd = m->private; + struct osc_device *od = obd2osc_dev(obd); + + seq_printf(m, "%u\n", od->od_lockless_truncate); + return 0; +} + +static ssize_t 
osc_lockless_truncate_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *obd = ((struct seq_file *)file->private_data)->private; + struct osc_device *od = obd2osc_dev(obd); + + return lprocfs_write_helper(buffer, count, &od->od_lockless_truncate) ?: + count; +} +LPROC_SEQ_FOPS(osc_lockless_truncate); + +static int osc_destroys_in_flight_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *obd = m->private; + + seq_printf(m, "%u\n", atomic_read(&obd->u.cli.cl_destroy_in_flight)); + return 0; +} +LPROC_SEQ_FOPS_RO(osc_destroys_in_flight); + +static int osc_obd_max_pages_per_rpc_seq_show(struct seq_file *m, void *v) +{ + return lprocfs_obd_rd_max_pages_per_rpc(m, m->private); +} + +static ssize_t osc_obd_max_pages_per_rpc_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *dev = ((struct seq_file *)file->private_data)->private; + struct client_obd *cli = &dev->u.cli; + struct obd_connect_data *ocd = &cli->cl_import->imp_connect_data; + int chunk_mask, rc; + __u64 val; + + rc = lprocfs_write_u64_helper(buffer, count, &val); + if (rc) + return rc; + + /* if the max_pages is specified in bytes, convert to pages */ + if (val >= ONE_MB_BRW_SIZE) + val >>= PAGE_CACHE_SHIFT; + + LPROCFS_CLIMP_CHECK(dev); + + chunk_mask = ~((1 << (cli->cl_chunkbits - PAGE_CACHE_SHIFT)) - 1); + /* max_pages_per_rpc must be chunk aligned */ + val = (val + ~chunk_mask) & chunk_mask; + if (val == 0 || val > ocd->ocd_brw_size >> PAGE_CACHE_SHIFT) { + LPROCFS_CLIMP_EXIT(dev); + return -ERANGE; + } + client_obd_list_lock(&cli->cl_loi_list_lock); + cli->cl_max_pages_per_rpc = val; + client_obd_list_unlock(&cli->cl_loi_list_lock); + + LPROCFS_CLIMP_EXIT(dev); + return count; +} +LPROC_SEQ_FOPS(osc_obd_max_pages_per_rpc); + +LPROC_SEQ_FOPS_RO_TYPE(osc, uuid); +LPROC_SEQ_FOPS_RO_TYPE(osc, connect_flags); +LPROC_SEQ_FOPS_RO_TYPE(osc, blksize); +LPROC_SEQ_FOPS_RO_TYPE(osc, kbytestotal); +LPROC_SEQ_FOPS_RO_TYPE(osc, kbytesfree); +LPROC_SEQ_FOPS_RO_TYPE(osc, kbytesavail); +LPROC_SEQ_FOPS_RO_TYPE(osc, filestotal); +LPROC_SEQ_FOPS_RO_TYPE(osc, filesfree); +LPROC_SEQ_FOPS_RO_TYPE(osc, server_uuid); +LPROC_SEQ_FOPS_RO_TYPE(osc, conn_uuid); +LPROC_SEQ_FOPS_RO_TYPE(osc, timeouts); +LPROC_SEQ_FOPS_RO_TYPE(osc, state); + +LPROC_SEQ_FOPS_WR_ONLY(osc, ping); + +LPROC_SEQ_FOPS_RW_TYPE(osc, import); +LPROC_SEQ_FOPS_RW_TYPE(osc, pinger_recov); + +static struct lprocfs_vars lprocfs_osc_obd_vars[] = { + { "uuid", &osc_uuid_fops, NULL, 0 }, + { "ping", &osc_ping_fops, NULL, 0222 }, + { "connect_flags", &osc_connect_flags_fops, NULL, 0 }, + { "blocksize", &osc_blksize_fops, NULL, 0 }, + { "kbytestotal", &osc_kbytestotal_fops, NULL, 0 }, + { "kbytesfree", &osc_kbytesfree_fops, NULL, 0 }, + { "kbytesavail", &osc_kbytesavail_fops, NULL, 0 }, + { "filestotal", &osc_filestotal_fops, NULL, 0 }, + { "filesfree", &osc_filesfree_fops, NULL, 0 }, + /*{ "filegroups", lprocfs_rd_filegroups, NULL, 0 },*/ + { "ost_server_uuid", &osc_server_uuid_fops, NULL, 0 }, + { "ost_conn_uuid", &osc_conn_uuid_fops, NULL, 0 }, + { "active", &osc_active_fops, NULL }, + { "max_pages_per_rpc", &osc_obd_max_pages_per_rpc_fops, NULL }, + { "max_rpcs_in_flight", &osc_max_rpcs_in_flight_fops, NULL }, + { "destroys_in_flight", &osc_destroys_in_flight_fops, NULL, 0 }, + { "max_dirty_mb", &osc_max_dirty_mb_fops, NULL }, + { "osc_cached_mb", &osc_cached_mb_fops, NULL }, + { "cur_dirty_bytes", &osc_cur_dirty_bytes_fops, NULL, 0 }, + { "cur_grant_bytes", 
&osc_cur_grant_bytes_fops, NULL }, + { "cur_lost_grant_bytes", &osc_cur_lost_grant_bytes_fops, NULL, 0}, + { "grant_shrink_interval", &osc_grant_shrink_interval_fops, NULL }, + { "checksums", &osc_checksum_fops, NULL }, + { "checksum_type", &osc_checksum_type_fops, NULL }, + { "resend_count", &osc_resend_count_fops, NULL}, + { "timeouts", &osc_timeouts_fops, NULL, 0 }, + { "contention_seconds", &osc_contention_seconds_fops, NULL }, + { "lockless_truncate", &osc_lockless_truncate_fops, NULL }, + { "import", &osc_import_fops, NULL }, + { "state", &osc_state_fops, NULL, 0 }, + { "pinger_recov", &osc_pinger_recov_fops, NULL }, + { NULL } +}; + +LPROC_SEQ_FOPS_RO_TYPE(osc, numrefs); +static struct lprocfs_vars lprocfs_osc_module_vars[] = { + { "num_refs", &osc_numrefs_fops, NULL, 0 }, + { NULL } +}; + +#define pct(a, b) (b ? a * 100 / b : 0) + +static int osc_rpc_stats_seq_show(struct seq_file *seq, void *v) +{ + struct timeval now; + struct obd_device *dev = seq->private; + struct client_obd *cli = &dev->u.cli; + unsigned long read_tot = 0, write_tot = 0, read_cum, write_cum; + int i; + + do_gettimeofday(&now); + + client_obd_list_lock(&cli->cl_loi_list_lock); + + seq_printf(seq, "snapshot_time: %lu.%lu (secs.usecs)\n", + now.tv_sec, (unsigned long)now.tv_usec); + seq_printf(seq, "read RPCs in flight: %d\n", + cli->cl_r_in_flight); + seq_printf(seq, "write RPCs in flight: %d\n", + cli->cl_w_in_flight); + seq_printf(seq, "pending write pages: %d\n", + atomic_read(&cli->cl_pending_w_pages)); + seq_printf(seq, "pending read pages: %d\n", + atomic_read(&cli->cl_pending_r_pages)); + + seq_puts(seq, "\n\t\t\tread\t\t\twrite\n"); + seq_puts(seq, "pages per rpc rpcs % cum % |"); + seq_puts(seq, " rpcs % cum %\n"); + + read_tot = lprocfs_oh_sum(&cli->cl_read_page_hist); + write_tot = lprocfs_oh_sum(&cli->cl_write_page_hist); + + read_cum = 0; + write_cum = 0; + for (i = 0; i < OBD_HIST_MAX; i++) { + unsigned long r = cli->cl_read_page_hist.oh_buckets[i]; + unsigned long w = cli->cl_write_page_hist.oh_buckets[i]; + read_cum += r; + write_cum += w; + seq_printf(seq, "%d:\t\t%10lu %3lu %3lu | %10lu %3lu %3lu\n", + 1 << i, r, pct(r, read_tot), + pct(read_cum, read_tot), w, + pct(w, write_tot), + pct(write_cum, write_tot)); + if (read_cum == read_tot && write_cum == write_tot) + break; + } + + seq_puts(seq, "\n\t\t\tread\t\t\twrite\n"); + seq_puts(seq, "rpcs in flight rpcs % cum % |"); + seq_puts(seq, " rpcs % cum %\n"); + + read_tot = lprocfs_oh_sum(&cli->cl_read_rpc_hist); + write_tot = lprocfs_oh_sum(&cli->cl_write_rpc_hist); + + read_cum = 0; + write_cum = 0; + for (i = 0; i < OBD_HIST_MAX; i++) { + unsigned long r = cli->cl_read_rpc_hist.oh_buckets[i]; + unsigned long w = cli->cl_write_rpc_hist.oh_buckets[i]; + read_cum += r; + write_cum += w; + seq_printf(seq, "%d:\t\t%10lu %3lu %3lu | %10lu %3lu %3lu\n", + i, r, pct(r, read_tot), + pct(read_cum, read_tot), w, + pct(w, write_tot), + pct(write_cum, write_tot)); + if (read_cum == read_tot && write_cum == write_tot) + break; + } + + seq_puts(seq, "\n\t\t\tread\t\t\twrite\n"); + seq_puts(seq, "offset rpcs % cum % |"); + seq_puts(seq, " rpcs % cum %\n"); + + read_tot = lprocfs_oh_sum(&cli->cl_read_offset_hist); + write_tot = lprocfs_oh_sum(&cli->cl_write_offset_hist); + + read_cum = 0; + write_cum = 0; + for (i = 0; i < OBD_HIST_MAX; i++) { + unsigned long r = cli->cl_read_offset_hist.oh_buckets[i]; + unsigned long w = cli->cl_write_offset_hist.oh_buckets[i]; + read_cum += r; + write_cum += w; + seq_printf(seq, "%d:\t\t%10lu %3lu %3lu | %10lu %3lu %3lu\n", 
+ (i == 0) ? 0 : 1 << (i - 1), + r, pct(r, read_tot), pct(read_cum, read_tot), + w, pct(w, write_tot), pct(write_cum, write_tot)); + if (read_cum == read_tot && write_cum == write_tot) + break; + } + + client_obd_list_unlock(&cli->cl_loi_list_lock); + + return 0; +} +#undef pct + +static ssize_t osc_rpc_stats_seq_write(struct file *file, + const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct obd_device *dev = seq->private; + struct client_obd *cli = &dev->u.cli; + + lprocfs_oh_clear(&cli->cl_read_rpc_hist); + lprocfs_oh_clear(&cli->cl_write_rpc_hist); + lprocfs_oh_clear(&cli->cl_read_page_hist); + lprocfs_oh_clear(&cli->cl_write_page_hist); + lprocfs_oh_clear(&cli->cl_read_offset_hist); + lprocfs_oh_clear(&cli->cl_write_offset_hist); + + return len; +} + +LPROC_SEQ_FOPS(osc_rpc_stats); + +static int osc_stats_seq_show(struct seq_file *seq, void *v) +{ + struct timeval now; + struct obd_device *dev = seq->private; + struct osc_stats *stats = &obd2osc_dev(dev)->od_stats; + + do_gettimeofday(&now); + + seq_printf(seq, "snapshot_time: %lu.%lu (secs.usecs)\n", + now.tv_sec, (unsigned long)now.tv_usec); + seq_printf(seq, "lockless_write_bytes\t\t%llu\n", + stats->os_lockless_writes); + seq_printf(seq, "lockless_read_bytes\t\t%llu\n", + stats->os_lockless_reads); + seq_printf(seq, "lockless_truncate\t\t%llu\n", + stats->os_lockless_truncates); + return 0; +} + +static ssize_t osc_stats_seq_write(struct file *file, + const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct obd_device *dev = seq->private; + struct osc_stats *stats = &obd2osc_dev(dev)->od_stats; + + memset(stats, 0, sizeof(*stats)); + return len; +} + +LPROC_SEQ_FOPS(osc_stats); + +int lproc_osc_attach_seqstat(struct obd_device *dev) +{ + int rc; + + rc = lprocfs_seq_create(dev->obd_proc_entry, "osc_stats", 0644, + &osc_stats_fops, dev); + if (rc == 0) + rc = lprocfs_obd_seq_create(dev, "rpc_stats", 0644, + &osc_rpc_stats_fops, dev); + + return rc; +} + +void lprocfs_osc_init_vars(struct lprocfs_static_vars *lvars) +{ + lvars->module_vars = lprocfs_osc_module_vars; + lvars->obd_vars = lprocfs_osc_obd_vars; +} diff --git a/kernel/drivers/staging/lustre/lustre/osc/osc_cache.c b/kernel/drivers/staging/lustre/lustre/osc/osc_cache.c new file mode 100644 index 000000000..d44b3d4ff --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/osc/osc_cache.c @@ -0,0 +1,2944 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. 
+ * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + * + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * osc cache management. + * + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_OSC + +#include "osc_cl_internal.h" +#include "osc_internal.h" + +static int extent_debug; /* set it to be true for more debug */ + +static void osc_update_pending(struct osc_object *obj, int cmd, int delta); +static int osc_extent_wait(const struct lu_env *env, struct osc_extent *ext, + int state); +static void osc_ap_completion(const struct lu_env *env, struct client_obd *cli, + struct osc_async_page *oap, int sent, int rc); +static int osc_make_ready(const struct lu_env *env, struct osc_async_page *oap, + int cmd); +static int osc_refresh_count(const struct lu_env *env, + struct osc_async_page *oap, int cmd); +static int osc_io_unplug_async(const struct lu_env *env, + struct client_obd *cli, struct osc_object *osc); +static void osc_free_grant(struct client_obd *cli, unsigned int nr_pages, + unsigned int lost_grant); + +static void osc_extent_tree_dump0(int level, struct osc_object *obj, + const char *func, int line); +#define osc_extent_tree_dump(lvl, obj) \ + osc_extent_tree_dump0(lvl, obj, __func__, __LINE__) + +/** \addtogroup osc + * @{ + */ + +/* ------------------ osc extent ------------------ */ +static inline char *ext_flags(struct osc_extent *ext, char *flags) +{ + char *buf = flags; + *buf++ = ext->oe_rw ? 'r' : 'w'; + if (ext->oe_intree) + *buf++ = 'i'; + if (ext->oe_srvlock) + *buf++ = 's'; + if (ext->oe_hp) + *buf++ = 'h'; + if (ext->oe_urgent) + *buf++ = 'u'; + if (ext->oe_memalloc) + *buf++ = 'm'; + if (ext->oe_trunc_pending) + *buf++ = 't'; + if (ext->oe_fsync_wait) + *buf++ = 'Y'; + *buf = 0; + return flags; +} + +static inline char list_empty_marker(struct list_head *list) +{ + return list_empty(list) ? '-' : '+'; +} + +#define EXTSTR "[%lu -> %lu/%lu]" +#define EXTPARA(ext) (ext)->oe_start, (ext)->oe_end, (ext)->oe_max_end +static const char *oes_strings[] = { + "inv", "active", "cache", "locking", "lockdone", "rpc", "trunc", NULL }; + +#define OSC_EXTENT_DUMP(lvl, extent, fmt, ...) do { \ + struct osc_extent *__ext = (extent); \ + char __buf[16]; \ + \ + CDEBUG(lvl, \ + "extent %p@{" EXTSTR ", " \ + "[%d|%d|%c|%s|%s|%p], [%d|%d|%c|%c|%p|%u|%p]} " fmt, \ + /* ----- extent part 0 ----- */ \ + __ext, EXTPARA(__ext), \ + /* ----- part 1 ----- */ \ + atomic_read(&__ext->oe_refc), \ + atomic_read(&__ext->oe_users), \ + list_empty_marker(&__ext->oe_link), \ + oes_strings[__ext->oe_state], ext_flags(__ext, __buf), \ + __ext->oe_obj, \ + /* ----- part 2 ----- */ \ + __ext->oe_grants, __ext->oe_nr_pages, \ + list_empty_marker(&__ext->oe_pages), \ + waitqueue_active(&__ext->oe_waitq) ? '+' : '-', \ + __ext->oe_osclock, __ext->oe_mppr, __ext->oe_owner, \ + /* ----- part 4 ----- */ \ + ## __VA_ARGS__); \ +} while (0) + +#undef EASSERTF +#define EASSERTF(expr, ext, fmt, args...) 
do { \ + if (!(expr)) { \ + OSC_EXTENT_DUMP(D_ERROR, (ext), fmt, ##args); \ + osc_extent_tree_dump(D_ERROR, (ext)->oe_obj); \ + LASSERT(expr); \ + } \ +} while (0) + +#undef EASSERT +#define EASSERT(expr, ext) EASSERTF(expr, ext, "\n") + +static inline struct osc_extent *rb_extent(struct rb_node *n) +{ + if (n == NULL) + return NULL; + + return container_of(n, struct osc_extent, oe_node); +} + +static inline struct osc_extent *next_extent(struct osc_extent *ext) +{ + if (ext == NULL) + return NULL; + + LASSERT(ext->oe_intree); + return rb_extent(rb_next(&ext->oe_node)); +} + +static inline struct osc_extent *prev_extent(struct osc_extent *ext) +{ + if (ext == NULL) + return NULL; + + LASSERT(ext->oe_intree); + return rb_extent(rb_prev(&ext->oe_node)); +} + +static inline struct osc_extent *first_extent(struct osc_object *obj) +{ + return rb_extent(rb_first(&obj->oo_root)); +} + +/* object must be locked by caller. */ +static int osc_extent_sanity_check0(struct osc_extent *ext, + const char *func, const int line) +{ + struct osc_object *obj = ext->oe_obj; + struct osc_async_page *oap; + int page_count; + int rc = 0; + + if (!osc_object_is_locked(obj)) { + rc = 9; + goto out; + } + + if (ext->oe_state >= OES_STATE_MAX) { + rc = 10; + goto out; + } + + if (atomic_read(&ext->oe_refc) <= 0) { + rc = 20; + goto out; + } + + if (atomic_read(&ext->oe_refc) < atomic_read(&ext->oe_users)) { + rc = 30; + goto out; + } + + switch (ext->oe_state) { + case OES_INV: + if (ext->oe_nr_pages > 0 || !list_empty(&ext->oe_pages)) + rc = 35; + else + rc = 0; + goto out; + case OES_ACTIVE: + if (atomic_read(&ext->oe_users) == 0) { + rc = 40; + goto out; + } + if (ext->oe_hp) { + rc = 50; + goto out; + } + if (ext->oe_fsync_wait && !ext->oe_urgent) { + rc = 55; + goto out; + } + break; + case OES_CACHE: + if (ext->oe_grants == 0) { + rc = 60; + goto out; + } + if (ext->oe_fsync_wait && !ext->oe_urgent && !ext->oe_hp) { + rc = 65; + goto out; + } + default: + if (atomic_read(&ext->oe_users) > 0) { + rc = 70; + goto out; + } + } + + if (ext->oe_max_end < ext->oe_end || ext->oe_end < ext->oe_start) { + rc = 80; + goto out; + } + + if (ext->oe_osclock == NULL && ext->oe_grants > 0) { + rc = 90; + goto out; + } + + if (ext->oe_osclock) { + struct cl_lock_descr *descr; + descr = &ext->oe_osclock->cll_descr; + if (!(descr->cld_start <= ext->oe_start && + descr->cld_end >= ext->oe_max_end)) { + rc = 100; + goto out; + } + } + + if (ext->oe_nr_pages > ext->oe_mppr) { + rc = 105; + goto out; + } + + /* Do not verify page list if extent is in RPC. This is because an + * in-RPC extent is supposed to be exclusively accessible w/o lock. */ + if (ext->oe_state > OES_CACHE) { + rc = 0; + goto out; + } + + if (!extent_debug) { + rc = 0; + goto out; + } + + page_count = 0; + list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) { + pgoff_t index = oap2cl_page(oap)->cp_index; + ++page_count; + if (index > ext->oe_end || index < ext->oe_start) { + rc = 110; + goto out; + } + } + if (page_count != ext->oe_nr_pages) { + rc = 120; + goto out; + } + +out: + if (rc != 0) + OSC_EXTENT_DUMP(D_ERROR, ext, + "%s:%d sanity check %p failed with rc = %d\n", + func, line, ext, rc); + return rc; +} + +#define sanity_check_nolock(ext) \ + osc_extent_sanity_check0(ext, __func__, __LINE__) + +#define sanity_check(ext) ({ \ + int __res; \ + osc_object_lock((ext)->oe_obj); \ + __res = sanity_check_nolock(ext); \ + osc_object_unlock((ext)->oe_obj); \ + __res; \ +}) + + +/** + * sanity check - to make sure there is no overlapped extent in the tree. 
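
The overlap test used just below in osc_extent_is_overlapped() is the standard inclusive-interval check: two page ranges overlap unless one ends before the other starts. A minimal standalone C sketch of the same predicate (the struct name and page numbers here are illustrative, not the Lustre types):

#include <stdbool.h>
#include <stdio.h>

/*
 * Illustrative userspace version of the overlap test: two inclusive
 * page ranges overlap unless one ends before the other starts.
 */
struct ext { unsigned long start, end; };        /* page indices, inclusive */

static bool extents_overlap(const struct ext *a, const struct ext *b)
{
        return a->end >= b->start && a->start <= b->end;
}

int main(void)
{
        struct ext a = { 0, 15 }, b = { 10, 31 }, c = { 32, 47 };

        printf("%d\n", extents_overlap(&a, &b));  /* 1: share pages 10..15   */
        printf("%d\n", extents_overlap(&a, &c));  /* 0: disjoint             */
        printf("%d\n", extents_overlap(&b, &c));  /* 0: adjacent, no overlap */
        return 0;
}
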
+ */ +static int osc_extent_is_overlapped(struct osc_object *obj, + struct osc_extent *ext) +{ + struct osc_extent *tmp; + + LASSERT(osc_object_is_locked(obj)); + + if (!extent_debug) + return 0; + + for (tmp = first_extent(obj); tmp != NULL; tmp = next_extent(tmp)) { + if (tmp == ext) + continue; + if (tmp->oe_end >= ext->oe_start && + tmp->oe_start <= ext->oe_end) + return 1; + } + return 0; +} + +static void osc_extent_state_set(struct osc_extent *ext, int state) +{ + LASSERT(osc_object_is_locked(ext->oe_obj)); + LASSERT(state >= OES_INV && state < OES_STATE_MAX); + + /* Never try to sanity check a state changing extent :-) */ + /* LASSERT(sanity_check_nolock(ext) == 0); */ + + /* TODO: validate the state machine */ + ext->oe_state = state; + wake_up_all(&ext->oe_waitq); +} + +static struct osc_extent *osc_extent_alloc(struct osc_object *obj) +{ + struct osc_extent *ext; + + OBD_SLAB_ALLOC_PTR_GFP(ext, osc_extent_kmem, GFP_IOFS); + if (ext == NULL) + return NULL; + + RB_CLEAR_NODE(&ext->oe_node); + ext->oe_obj = obj; + atomic_set(&ext->oe_refc, 1); + atomic_set(&ext->oe_users, 0); + INIT_LIST_HEAD(&ext->oe_link); + ext->oe_state = OES_INV; + INIT_LIST_HEAD(&ext->oe_pages); + init_waitqueue_head(&ext->oe_waitq); + ext->oe_osclock = NULL; + + return ext; +} + +static void osc_extent_free(struct osc_extent *ext) +{ + OBD_SLAB_FREE_PTR(ext, osc_extent_kmem); +} + +static struct osc_extent *osc_extent_get(struct osc_extent *ext) +{ + LASSERT(atomic_read(&ext->oe_refc) >= 0); + atomic_inc(&ext->oe_refc); + return ext; +} + +static void osc_extent_put(const struct lu_env *env, struct osc_extent *ext) +{ + LASSERT(atomic_read(&ext->oe_refc) > 0); + if (atomic_dec_and_test(&ext->oe_refc)) { + LASSERT(list_empty(&ext->oe_link)); + LASSERT(atomic_read(&ext->oe_users) == 0); + LASSERT(ext->oe_state == OES_INV); + LASSERT(!ext->oe_intree); + + if (ext->oe_osclock) { + cl_lock_put(env, ext->oe_osclock); + ext->oe_osclock = NULL; + } + osc_extent_free(ext); + } +} + +/** + * osc_extent_put_trust() is a special version of osc_extent_put() when + * it's known that the caller is not the last user. This is to address the + * problem of lacking of lu_env ;-). + */ +static void osc_extent_put_trust(struct osc_extent *ext) +{ + LASSERT(atomic_read(&ext->oe_refc) > 1); + LASSERT(osc_object_is_locked(ext->oe_obj)); + atomic_dec(&ext->oe_refc); +} + +/** + * Return the extent which includes pgoff @index, or return the greatest + * previous extent in the tree. + */ +static struct osc_extent *osc_extent_search(struct osc_object *obj, + pgoff_t index) +{ + struct rb_node *n = obj->oo_root.rb_node; + struct osc_extent *tmp, *p = NULL; + + LASSERT(osc_object_is_locked(obj)); + while (n != NULL) { + tmp = rb_extent(n); + if (index < tmp->oe_start) { + n = n->rb_left; + } else if (index > tmp->oe_end) { + p = rb_extent(n); + n = n->rb_right; + } else { + return tmp; + } + } + return p; +} + +/* + * Return the extent covering @index, otherwise return NULL. + * caller must have held object lock. + */ +static struct osc_extent *osc_extent_lookup(struct osc_object *obj, + pgoff_t index) +{ + struct osc_extent *ext; + + ext = osc_extent_search(obj, index); + if (ext != NULL && ext->oe_start <= index && index <= ext->oe_end) + return osc_extent_get(ext); + return NULL; +} + +/* caller must have held object lock. 
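
osc_extent_search() above returns either the extent covering the given index or its closest predecessor in the tree. The following userspace sketch shows the same decision structure over a sorted, non-overlapping array instead of an rbtree; names and values are illustrative assumptions only:

#include <stdio.h>

struct range { unsigned long start, end; };      /* stand-in for an osc_extent */

/*
 * Sketch of the osc_extent_search() contract: return the range covering
 * @index, or else the closest preceding one.  The real code walks an
 * rbtree; a sorted array gives the same decision structure.
 */
static const struct range *range_search(const struct range *r, int nr,
                                        unsigned long index)
{
        const struct range *prev = NULL;
        int lo = 0, hi = nr - 1;

        while (lo <= hi) {
                int mid = lo + (hi - lo) / 2;

                if (index < r[mid].start) {
                        hi = mid - 1;            /* go "left"               */
                } else if (index > r[mid].end) {
                        prev = &r[mid];          /* best predecessor so far */
                        lo = mid + 1;            /* go "right"              */
                } else {
                        return &r[mid];          /* covering range found    */
                }
        }
        return prev;
}

int main(void)
{
        const struct range tree[] = { { 0, 15 }, { 32, 47 }, { 64, 79 } };
        const struct range *hit = range_search(tree, 3, 50);

        if (hit)        /* 50 is covered by nothing; expect [32 -> 47] */
                printf("[%lu -> %lu]\n", hit->start, hit->end);
        return 0;
}
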
*/ +static void osc_extent_insert(struct osc_object *obj, struct osc_extent *ext) +{ + struct rb_node **n = &obj->oo_root.rb_node; + struct rb_node *parent = NULL; + struct osc_extent *tmp; + + LASSERT(ext->oe_intree == 0); + LASSERT(ext->oe_obj == obj); + LASSERT(osc_object_is_locked(obj)); + while (*n != NULL) { + tmp = rb_extent(*n); + parent = *n; + + if (ext->oe_end < tmp->oe_start) + n = &(*n)->rb_left; + else if (ext->oe_start > tmp->oe_end) + n = &(*n)->rb_right; + else + EASSERTF(0, tmp, EXTSTR, EXTPARA(ext)); + } + rb_link_node(&ext->oe_node, parent, n); + rb_insert_color(&ext->oe_node, &obj->oo_root); + osc_extent_get(ext); + ext->oe_intree = 1; +} + +/* caller must have held object lock. */ +static void osc_extent_erase(struct osc_extent *ext) +{ + struct osc_object *obj = ext->oe_obj; + LASSERT(osc_object_is_locked(obj)); + if (ext->oe_intree) { + rb_erase(&ext->oe_node, &obj->oo_root); + ext->oe_intree = 0; + /* rbtree held a refcount */ + osc_extent_put_trust(ext); + } +} + +static struct osc_extent *osc_extent_hold(struct osc_extent *ext) +{ + struct osc_object *obj = ext->oe_obj; + + LASSERT(osc_object_is_locked(obj)); + LASSERT(ext->oe_state == OES_ACTIVE || ext->oe_state == OES_CACHE); + if (ext->oe_state == OES_CACHE) { + osc_extent_state_set(ext, OES_ACTIVE); + osc_update_pending(obj, OBD_BRW_WRITE, -ext->oe_nr_pages); + } + atomic_inc(&ext->oe_users); + list_del_init(&ext->oe_link); + return osc_extent_get(ext); +} + +static void __osc_extent_remove(struct osc_extent *ext) +{ + LASSERT(osc_object_is_locked(ext->oe_obj)); + LASSERT(list_empty(&ext->oe_pages)); + osc_extent_erase(ext); + list_del_init(&ext->oe_link); + osc_extent_state_set(ext, OES_INV); + OSC_EXTENT_DUMP(D_CACHE, ext, "destroyed.\n"); +} + +static void osc_extent_remove(struct osc_extent *ext) +{ + struct osc_object *obj = ext->oe_obj; + + osc_object_lock(obj); + __osc_extent_remove(ext); + osc_object_unlock(obj); +} + +/** + * This function is used to merge extents to get better performance. It checks + * if @cur and @victim are contiguous at chunk level. 
+ */ +static int osc_extent_merge(const struct lu_env *env, struct osc_extent *cur, + struct osc_extent *victim) +{ + struct osc_object *obj = cur->oe_obj; + pgoff_t chunk_start; + pgoff_t chunk_end; + int ppc_bits; + + LASSERT(cur->oe_state == OES_CACHE); + LASSERT(osc_object_is_locked(obj)); + if (victim == NULL) + return -EINVAL; + + if (victim->oe_state != OES_CACHE || victim->oe_fsync_wait) + return -EBUSY; + + if (cur->oe_max_end != victim->oe_max_end) + return -ERANGE; + + LASSERT(cur->oe_osclock == victim->oe_osclock); + ppc_bits = osc_cli(obj)->cl_chunkbits - PAGE_CACHE_SHIFT; + chunk_start = cur->oe_start >> ppc_bits; + chunk_end = cur->oe_end >> ppc_bits; + if (chunk_start != (victim->oe_end >> ppc_bits) + 1 && + chunk_end + 1 != victim->oe_start >> ppc_bits) + return -ERANGE; + + OSC_EXTENT_DUMP(D_CACHE, victim, "will be merged by %p.\n", cur); + + cur->oe_start = min(cur->oe_start, victim->oe_start); + cur->oe_end = max(cur->oe_end, victim->oe_end); + cur->oe_grants += victim->oe_grants; + cur->oe_nr_pages += victim->oe_nr_pages; + /* only the following bits are needed to merge */ + cur->oe_urgent |= victim->oe_urgent; + cur->oe_memalloc |= victim->oe_memalloc; + list_splice_init(&victim->oe_pages, &cur->oe_pages); + list_del_init(&victim->oe_link); + victim->oe_nr_pages = 0; + + osc_extent_get(victim); + __osc_extent_remove(victim); + osc_extent_put(env, victim); + + OSC_EXTENT_DUMP(D_CACHE, cur, "after merging %p.\n", victim); + return 0; +} + +/** + * Drop user count of osc_extent, and unplug IO asynchronously. + */ +void osc_extent_release(const struct lu_env *env, struct osc_extent *ext) +{ + struct osc_object *obj = ext->oe_obj; + + LASSERT(atomic_read(&ext->oe_users) > 0); + LASSERT(sanity_check(ext) == 0); + LASSERT(ext->oe_grants > 0); + + if (atomic_dec_and_lock(&ext->oe_users, &obj->oo_lock)) { + LASSERT(ext->oe_state == OES_ACTIVE); + if (ext->oe_trunc_pending) { + /* a truncate process is waiting for this extent. + * This may happen due to a race, check + * osc_cache_truncate_start(). */ + osc_extent_state_set(ext, OES_TRUNC); + ext->oe_trunc_pending = 0; + } else { + osc_extent_state_set(ext, OES_CACHE); + osc_update_pending(obj, OBD_BRW_WRITE, + ext->oe_nr_pages); + + /* try to merge the previous and next extent. */ + osc_extent_merge(env, ext, prev_extent(ext)); + osc_extent_merge(env, ext, next_extent(ext)); + + if (ext->oe_urgent) + list_move_tail(&ext->oe_link, + &obj->oo_urgent_exts); + } + osc_object_unlock(obj); + + osc_io_unplug_async(env, osc_cli(obj), obj); + } + osc_extent_put(env, ext); +} + +static inline int overlapped(struct osc_extent *ex1, struct osc_extent *ex2) +{ + return !(ex1->oe_end < ex2->oe_start || ex2->oe_end < ex1->oe_start); +} + +/** + * Find or create an extent which includes @index, core function to manage + * extent tree. 
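
osc_extent_merge() above only coalesces extents whose chunk ranges touch. A small standalone sketch of that adjacency test, assuming 4 KiB pages and 64 KiB chunks (example values, not taken from a live configuration):

#include <stdbool.h>
#include <stdio.h>

/*
 * Illustrative sketch of the adjacency test in osc_extent_merge():
 * two extents may merge only if their *chunk* ranges touch.
 */
#define PAGE_SHIFT_EX   12      /* 4 KiB pages (assumption)   */
#define CHUNK_BITS_EX   16      /* 64 KiB chunks (assumption) */
#define PPC_BITS_EX     (CHUNK_BITS_EX - PAGE_SHIFT_EX) /* pages-per-chunk bits */

struct ext { unsigned long start, end; };       /* page indices, inclusive */

static bool chunk_adjacent(const struct ext *cur, const struct ext *victim)
{
        unsigned long cur_cs = cur->start >> PPC_BITS_EX;
        unsigned long cur_ce = cur->end >> PPC_BITS_EX;
        unsigned long vic_cs = victim->start >> PPC_BITS_EX;
        unsigned long vic_ce = victim->end >> PPC_BITS_EX;

        /* cur directly follows victim, or victim directly follows cur */
        return cur_cs == vic_ce + 1 || cur_ce + 1 == vic_cs;
}

int main(void)
{
        struct ext cur = { 16, 31 };    /* chunk 1 (pages 16..31)  */
        struct ext prev = { 0, 15 };    /* chunk 0                 */
        struct ext far = { 48, 63 };    /* chunk 3: gap at chunk 2 */

        printf("prev mergeable: %d\n", chunk_adjacent(&cur, &prev));    /* 1 */
        printf("far  mergeable: %d\n", chunk_adjacent(&cur, &far));     /* 0 */
        return 0;
}
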
+ */ +struct osc_extent *osc_extent_find(const struct lu_env *env, + struct osc_object *obj, pgoff_t index, + int *grants) + +{ + struct client_obd *cli = osc_cli(obj); + struct cl_lock *lock; + struct osc_extent *cur; + struct osc_extent *ext; + struct osc_extent *conflict = NULL; + struct osc_extent *found = NULL; + pgoff_t chunk; + pgoff_t max_end; + int max_pages; /* max_pages_per_rpc */ + int chunksize; + int ppc_bits; /* pages per chunk bits */ + int chunk_mask; + int rc; + + cur = osc_extent_alloc(obj); + if (cur == NULL) + return ERR_PTR(-ENOMEM); + + lock = cl_lock_at_pgoff(env, osc2cl(obj), index, NULL, 1, 0); + LASSERT(lock != NULL); + LASSERT(lock->cll_descr.cld_mode >= CLM_WRITE); + + LASSERT(cli->cl_chunkbits >= PAGE_CACHE_SHIFT); + ppc_bits = cli->cl_chunkbits - PAGE_CACHE_SHIFT; + chunk_mask = ~((1 << ppc_bits) - 1); + chunksize = 1 << cli->cl_chunkbits; + chunk = index >> ppc_bits; + + /* align end to rpc edge, rpc size may not be a power 2 integer. */ + max_pages = cli->cl_max_pages_per_rpc; + LASSERT((max_pages & ~chunk_mask) == 0); + max_end = index - (index % max_pages) + max_pages - 1; + max_end = min_t(pgoff_t, max_end, lock->cll_descr.cld_end); + + /* initialize new extent by parameters so far */ + cur->oe_max_end = max_end; + cur->oe_start = index & chunk_mask; + cur->oe_end = ((index + ~chunk_mask + 1) & chunk_mask) - 1; + if (cur->oe_start < lock->cll_descr.cld_start) + cur->oe_start = lock->cll_descr.cld_start; + if (cur->oe_end > max_end) + cur->oe_end = max_end; + cur->oe_osclock = lock; + cur->oe_grants = 0; + cur->oe_mppr = max_pages; + + /* grants has been allocated by caller */ + LASSERTF(*grants >= chunksize + cli->cl_extent_tax, + "%u/%u/%u.\n", *grants, chunksize, cli->cl_extent_tax); + LASSERTF((max_end - cur->oe_start) < max_pages, EXTSTR, EXTPARA(cur)); + +restart: + osc_object_lock(obj); + ext = osc_extent_search(obj, cur->oe_start); + if (ext == NULL) + ext = first_extent(obj); + while (ext != NULL) { + loff_t ext_chk_start = ext->oe_start >> ppc_bits; + loff_t ext_chk_end = ext->oe_end >> ppc_bits; + + LASSERT(sanity_check_nolock(ext) == 0); + if (chunk > ext_chk_end + 1) + break; + + /* if covering by different locks, no chance to match */ + if (lock != ext->oe_osclock) { + EASSERTF(!overlapped(ext, cur), ext, + EXTSTR, EXTPARA(cur)); + + ext = next_extent(ext); + continue; + } + + /* discontiguous chunks? */ + if (chunk + 1 < ext_chk_start) { + ext = next_extent(ext); + continue; + } + + /* ok, from now on, ext and cur have these attrs: + * 1. covered by the same lock + * 2. contiguous at chunk level or overlapping. */ + + if (overlapped(ext, cur)) { + /* cur is the minimum unit, so overlapping means + * full contain. */ + EASSERTF((ext->oe_start <= cur->oe_start && + ext->oe_end >= cur->oe_end), + ext, EXTSTR, EXTPARA(cur)); + + if (ext->oe_state > OES_CACHE || ext->oe_fsync_wait) { + /* for simplicity, we wait for this extent to + * finish before going forward. */ + conflict = osc_extent_get(ext); + break; + } + + found = osc_extent_hold(ext); + break; + } + + /* non-overlapped extent */ + if (ext->oe_state != OES_CACHE || ext->oe_fsync_wait) { + /* we can't do anything for a non OES_CACHE extent, or + * if there is someone waiting for this extent to be + * flushed, try next one. */ + ext = next_extent(ext); + continue; + } + + /* check if they belong to the same rpc slot before trying to + * merge. the extents are not overlapped and contiguous at + * chunk level to get here. 
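
The initialization a little earlier in osc_extent_find() sizes a brand-new extent to exactly the chunk containing @index and caps it at the end of its RPC slot. A worked example of that arithmetic with assumed shift values (16 pages per chunk, 256 pages per RPC):

#include <stdio.h>

int main(void)
{
        unsigned int ppc_bits = 4;              /* 16 pages per chunk (assumed) */
        unsigned long chunk_mask = ~((1UL << ppc_bits) - 1);
        unsigned long max_pages = 256;          /* max_pages_per_rpc (assumed)  */
        unsigned long index = 300;              /* page being cached            */

        unsigned long start = index & chunk_mask;
        unsigned long end = ((index + ~chunk_mask + 1) & chunk_mask) - 1;
        unsigned long max_end = index - (index % max_pages) + max_pages - 1;

        /* expect start=288, end=303 (one chunk), max_end=511 (end of RPC slot) */
        printf("start=%lu end=%lu max_end=%lu\n", start, end, max_end);
        return 0;
}
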
*/ + if (ext->oe_max_end != max_end) { + /* if they don't belong to the same RPC slot or + * max_pages_per_rpc has ever changed, do not merge. */ + ext = next_extent(ext); + continue; + } + + /* it's required that an extent must be contiguous at chunk + * level so that we know the whole extent is covered by grant + * (the pages in the extent are NOT required to be contiguous). + * Otherwise, it will be too much difficult to know which + * chunks have grants allocated. */ + + /* try to do front merge - extend ext's start */ + if (chunk + 1 == ext_chk_start) { + /* ext must be chunk size aligned */ + EASSERT((ext->oe_start & ~chunk_mask) == 0, ext); + + /* pull ext's start back to cover cur */ + ext->oe_start = cur->oe_start; + ext->oe_grants += chunksize; + *grants -= chunksize; + + found = osc_extent_hold(ext); + } else if (chunk == ext_chk_end + 1) { + /* rear merge */ + ext->oe_end = cur->oe_end; + ext->oe_grants += chunksize; + *grants -= chunksize; + + /* try to merge with the next one because we just fill + * in a gap */ + if (osc_extent_merge(env, ext, next_extent(ext)) == 0) + /* we can save extent tax from next extent */ + *grants += cli->cl_extent_tax; + + found = osc_extent_hold(ext); + } + if (found != NULL) + break; + + ext = next_extent(ext); + } + + osc_extent_tree_dump(D_CACHE, obj); + if (found != NULL) { + LASSERT(conflict == NULL); + if (!IS_ERR(found)) { + LASSERT(found->oe_osclock == cur->oe_osclock); + OSC_EXTENT_DUMP(D_CACHE, found, + "found caching ext for %lu.\n", index); + } + } else if (conflict == NULL) { + /* create a new extent */ + EASSERT(osc_extent_is_overlapped(obj, cur) == 0, cur); + cur->oe_grants = chunksize + cli->cl_extent_tax; + *grants -= cur->oe_grants; + LASSERT(*grants >= 0); + + cur->oe_state = OES_CACHE; + found = osc_extent_hold(cur); + osc_extent_insert(obj, cur); + OSC_EXTENT_DUMP(D_CACHE, cur, "add into tree %lu/%lu.\n", + index, lock->cll_descr.cld_end); + } + osc_object_unlock(obj); + + if (conflict != NULL) { + LASSERT(found == NULL); + + /* waiting for IO to finish. Please notice that it's impossible + * to be an OES_TRUNC extent. */ + rc = osc_extent_wait(env, conflict, OES_INV); + osc_extent_put(env, conflict); + conflict = NULL; + if (rc < 0) { + found = ERR_PTR(rc); + goto out; + } + + goto restart; + } + +out: + osc_extent_put(env, cur); + LASSERT(*grants >= 0); + return found; +} + +/** + * Called when IO is finished to an extent. + */ +int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext, + int sent, int rc) +{ + struct client_obd *cli = osc_cli(ext->oe_obj); + struct osc_async_page *oap; + struct osc_async_page *tmp; + int nr_pages = ext->oe_nr_pages; + int lost_grant = 0; + int blocksize = cli->cl_import->imp_obd->obd_osfs.os_bsize ? 
: 4096; + __u64 last_off = 0; + int last_count = -1; + + OSC_EXTENT_DUMP(D_CACHE, ext, "extent finished.\n"); + + ext->oe_rc = rc ?: ext->oe_nr_pages; + EASSERT(ergo(rc == 0, ext->oe_state == OES_RPC), ext); + list_for_each_entry_safe(oap, tmp, &ext->oe_pages, + oap_pending_item) { + list_del_init(&oap->oap_rpc_item); + list_del_init(&oap->oap_pending_item); + if (last_off <= oap->oap_obj_off) { + last_off = oap->oap_obj_off; + last_count = oap->oap_count; + } + + --ext->oe_nr_pages; + osc_ap_completion(env, cli, oap, sent, rc); + } + EASSERT(ext->oe_nr_pages == 0, ext); + + if (!sent) { + lost_grant = ext->oe_grants; + } else if (blocksize < PAGE_CACHE_SIZE && + last_count != PAGE_CACHE_SIZE) { + /* For short writes we shouldn't count parts of pages that + * span a whole chunk on the OST side, or our accounting goes + * wrong. Should match the code in filter_grant_check. */ + int offset = oap->oap_page_off & ~CFS_PAGE_MASK; + int count = oap->oap_count + (offset & (blocksize - 1)); + int end = (offset + oap->oap_count) & (blocksize - 1); + if (end) + count += blocksize - end; + + lost_grant = PAGE_CACHE_SIZE - count; + } + if (ext->oe_grants > 0) + osc_free_grant(cli, nr_pages, lost_grant); + + osc_extent_remove(ext); + /* put the refcount for RPC */ + osc_extent_put(env, ext); + return 0; +} + +static int extent_wait_cb(struct osc_extent *ext, int state) +{ + int ret; + + osc_object_lock(ext->oe_obj); + ret = ext->oe_state == state; + osc_object_unlock(ext->oe_obj); + + return ret; +} + +/** + * Wait for the extent's state to become @state. + */ +static int osc_extent_wait(const struct lu_env *env, struct osc_extent *ext, + int state) +{ + struct osc_object *obj = ext->oe_obj; + struct l_wait_info lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(600), NULL, + LWI_ON_SIGNAL_NOOP, NULL); + int rc = 0; + + osc_object_lock(obj); + LASSERT(sanity_check_nolock(ext) == 0); + /* `Kick' this extent only if the caller is waiting for it to be + * written out. */ + if (state == OES_INV && !ext->oe_urgent && !ext->oe_hp && + !ext->oe_trunc_pending) { + if (ext->oe_state == OES_ACTIVE) { + ext->oe_urgent = 1; + } else if (ext->oe_state == OES_CACHE) { + ext->oe_urgent = 1; + osc_extent_hold(ext); + rc = 1; + } + } + osc_object_unlock(obj); + if (rc == 1) + osc_extent_release(env, ext); + + /* wait for the extent until its state becomes @state */ + rc = l_wait_event(ext->oe_waitq, extent_wait_cb(ext, state), &lwi); + if (rc == -ETIMEDOUT) { + OSC_EXTENT_DUMP(D_ERROR, ext, + "%s: wait ext to %d timedout, recovery in progress?\n", + osc_export(obj)->exp_obd->obd_name, state); + + lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); + rc = l_wait_event(ext->oe_waitq, extent_wait_cb(ext, state), + &lwi); + } + if (rc == 0 && ext->oe_rc < 0) + rc = ext->oe_rc; + return rc; +} + +/** + * Discard pages with index greater than @size. If @ext is overlapped with + * @size, then partial truncate happens. + */ +static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index, + bool partial) +{ + struct cl_env_nest nest; + struct lu_env *env; + struct cl_io *io; + struct osc_object *obj = ext->oe_obj; + struct client_obd *cli = osc_cli(obj); + struct osc_async_page *oap; + struct osc_async_page *tmp; + int pages_in_chunk = 0; + int ppc_bits = cli->cl_chunkbits - PAGE_CACHE_SHIFT; + __u64 trunc_chunk = trunc_index >> ppc_bits; + int grants = 0; + int nr_pages = 0; + int rc = 0; + + LASSERT(sanity_check(ext) == 0); + EASSERT(ext->oe_state == OES_TRUNC, ext); + EASSERT(!ext->oe_urgent, ext); + + /* Request new lu_env. 
+ * We can't use that env from osc_cache_truncate_start() because + * it's from lov_io_sub and not fully initialized. */ + env = cl_env_nested_get(&nest); + io = &osc_env_info(env)->oti_io; + io->ci_obj = cl_object_top(osc2cl(obj)); + rc = cl_io_init(env, io, CIT_MISC, io->ci_obj); + if (rc < 0) + goto out; + + /* discard all pages with index greater then trunc_index */ + list_for_each_entry_safe(oap, tmp, &ext->oe_pages, + oap_pending_item) { + struct cl_page *sub = oap2cl_page(oap); + struct cl_page *page = cl_page_top(sub); + + LASSERT(list_empty(&oap->oap_rpc_item)); + + /* only discard the pages with their index greater than + * trunc_index, and ... */ + if (sub->cp_index < trunc_index || + (sub->cp_index == trunc_index && partial)) { + /* accounting how many pages remaining in the chunk + * so that we can calculate grants correctly. */ + if (sub->cp_index >> ppc_bits == trunc_chunk) + ++pages_in_chunk; + continue; + } + + list_del_init(&oap->oap_pending_item); + + cl_page_get(page); + lu_ref_add(&page->cp_reference, "truncate", current); + + if (cl_page_own(env, io, page) == 0) { + cl_page_unmap(env, io, page); + cl_page_discard(env, io, page); + cl_page_disown(env, io, page); + } else { + LASSERT(page->cp_state == CPS_FREEING); + LASSERT(0); + } + + lu_ref_del(&page->cp_reference, "truncate", current); + cl_page_put(env, page); + + --ext->oe_nr_pages; + ++nr_pages; + } + EASSERTF(ergo(ext->oe_start >= trunc_index + !!partial, + ext->oe_nr_pages == 0), + ext, "trunc_index %lu, partial %d\n", trunc_index, partial); + + osc_object_lock(obj); + if (ext->oe_nr_pages == 0) { + LASSERT(pages_in_chunk == 0); + grants = ext->oe_grants; + ext->oe_grants = 0; + } else { /* calculate how many grants we can free */ + int chunks = (ext->oe_end >> ppc_bits) - trunc_chunk; + pgoff_t last_index; + + + /* if there is no pages in this chunk, we can also free grants + * for the last chunk */ + if (pages_in_chunk == 0) { + /* if this is the 1st chunk and no pages in this chunk, + * ext->oe_nr_pages must be zero, so we should be in + * the other if-clause. */ + LASSERT(trunc_chunk > 0); + --trunc_chunk; + ++chunks; + } + + /* this is what we can free from this extent */ + grants = chunks << cli->cl_chunkbits; + ext->oe_grants -= grants; + last_index = ((trunc_chunk + 1) << ppc_bits) - 1; + ext->oe_end = min(last_index, ext->oe_max_end); + LASSERT(ext->oe_end >= ext->oe_start); + LASSERT(ext->oe_grants > 0); + } + osc_object_unlock(obj); + + if (grants > 0 || nr_pages > 0) + osc_free_grant(cli, nr_pages, grants); + +out: + cl_io_fini(env, io); + cl_env_nested_put(&nest, env); + return rc; +} + +/** + * This function is used to make the extent prepared for transfer. + * A race with flushing page - ll_writepage() has to be handled cautiously. + */ +static int osc_extent_make_ready(const struct lu_env *env, + struct osc_extent *ext) +{ + struct osc_async_page *oap; + struct osc_async_page *last = NULL; + struct osc_object *obj = ext->oe_obj; + int page_count = 0; + int rc; + + /* we're going to grab page lock, so object lock must not be taken. */ + LASSERT(sanity_check(ext) == 0); + /* in locking state, any process should not touch this extent. 
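
Returning to the grant arithmetic at the end of osc_extent_truncate() above: whole chunks beyond the truncation point give their grant back, and the chunk holding the truncation point is released too once no cached page remains in it. A worked example, again assuming 4 KiB pages and 64 KiB chunks:

#include <stdio.h>

int main(void)
{
        unsigned int ppc_bits = 4;              /* 16 pages/chunk (assumed)  */
        unsigned int chunkbits = 16;            /* 64 KiB chunk (assumed)    */
        unsigned long ext_end = 63;             /* extent covers chunks 0..3 */
        unsigned long trunc_index = 20;         /* truncate inside chunk 1   */
        unsigned long trunc_chunk = trunc_index >> ppc_bits;
        int pages_in_chunk = 0;                 /* nothing left in chunk 1   */

        long chunks = (long)(ext_end >> ppc_bits) - (long)trunc_chunk;
        if (pages_in_chunk == 0) {              /* chunk 1 is now empty too  */
                --trunc_chunk;
                ++chunks;
        }

        unsigned long grants = (unsigned long)chunks << chunkbits;
        unsigned long new_end = ((trunc_chunk + 1) << ppc_bits) - 1;

        /* expect 3 chunks freed (192 KiB of grant), extent now ends at page 15 */
        printf("chunks=%ld grants=%lu new_end=%lu\n", chunks, grants, new_end);
        return 0;
}
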
*/ + EASSERT(ext->oe_state == OES_LOCKING, ext); + EASSERT(ext->oe_owner != NULL, ext); + + OSC_EXTENT_DUMP(D_CACHE, ext, "make ready\n"); + + list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) { + ++page_count; + if (last == NULL || last->oap_obj_off < oap->oap_obj_off) + last = oap; + + /* checking ASYNC_READY is race safe */ + if ((oap->oap_async_flags & ASYNC_READY) != 0) + continue; + + rc = osc_make_ready(env, oap, OBD_BRW_WRITE); + switch (rc) { + case 0: + spin_lock(&oap->oap_lock); + oap->oap_async_flags |= ASYNC_READY; + spin_unlock(&oap->oap_lock); + break; + case -EALREADY: + LASSERT((oap->oap_async_flags & ASYNC_READY) != 0); + break; + default: + LASSERTF(0, "unknown return code: %d\n", rc); + } + } + + LASSERT(page_count == ext->oe_nr_pages); + LASSERT(last != NULL); + /* the last page is the only one we need to refresh its count by + * the size of file. */ + if (!(last->oap_async_flags & ASYNC_COUNT_STABLE)) { + last->oap_count = osc_refresh_count(env, last, OBD_BRW_WRITE); + LASSERT(last->oap_count > 0); + LASSERT(last->oap_page_off + last->oap_count <= PAGE_CACHE_SIZE); + last->oap_async_flags |= ASYNC_COUNT_STABLE; + } + + /* for the rest of pages, we don't need to call osf_refresh_count() + * because it's known they are not the last page */ + list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) { + if (!(oap->oap_async_flags & ASYNC_COUNT_STABLE)) { + oap->oap_count = PAGE_CACHE_SIZE - oap->oap_page_off; + oap->oap_async_flags |= ASYNC_COUNT_STABLE; + } + } + + osc_object_lock(obj); + osc_extent_state_set(ext, OES_RPC); + osc_object_unlock(obj); + /* get a refcount for RPC. */ + osc_extent_get(ext); + + return 0; +} + +/** + * Quick and simple version of osc_extent_find(). This function is frequently + * called to expand the extent for the same IO. To expand the extent, the + * page index must be in the same or next chunk of ext->oe_end. 
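
A compact sketch of that same-or-next-chunk rule as osc_extent_expand() below applies it; the chunk size is assumed and the return codes are simplified stand-ins for 0 / expanded / -ERANGE:

#include <stdio.h>

#define PPC_BITS_EX 4   /* 16 pages per chunk (assumption) */

/* returns: 0 already covered, 1 expanded by one chunk, -1 too far away */
static int try_expand(unsigned long *ext_end, unsigned long max_end,
                      unsigned long index)
{
        unsigned long chunk = index >> PPC_BITS_EX;
        unsigned long end_chunk = *ext_end >> PPC_BITS_EX;
        unsigned long new_end;

        if (chunk > end_chunk + 1)
                return -1;              /* not the same or next chunk */
        if (end_chunk >= chunk)
                return 0;               /* index already covered      */

        /* next chunk: grow the extent to the end of that chunk */
        new_end = ((chunk + 1) << PPC_BITS_EX) - 1;
        *ext_end = new_end < max_end ? new_end : max_end;
        return 1;
}

int main(void)
{
        unsigned long end = 31, max_end = 255;

        printf("%d\n", try_expand(&end, max_end, 20));  /* 0: same chunk        */
        printf("%d\n", try_expand(&end, max_end, 35));  /* 1: next chunk, grows */
        printf("end=%lu\n", end);                       /* now 47               */
        printf("%d\n", try_expand(&end, max_end, 100)); /* -1: chunk 6, too far */
        return 0;
}
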
+ */ +static int osc_extent_expand(struct osc_extent *ext, pgoff_t index, int *grants) +{ + struct osc_object *obj = ext->oe_obj; + struct client_obd *cli = osc_cli(obj); + struct osc_extent *next; + int ppc_bits = cli->cl_chunkbits - PAGE_CACHE_SHIFT; + pgoff_t chunk = index >> ppc_bits; + pgoff_t end_chunk; + pgoff_t end_index; + int chunksize = 1 << cli->cl_chunkbits; + int rc = 0; + + LASSERT(ext->oe_max_end >= index && ext->oe_start <= index); + osc_object_lock(obj); + LASSERT(sanity_check_nolock(ext) == 0); + end_chunk = ext->oe_end >> ppc_bits; + if (chunk > end_chunk + 1) { + rc = -ERANGE; + goto out; + } + + if (end_chunk >= chunk) { + rc = 0; + goto out; + } + + LASSERT(end_chunk + 1 == chunk); + /* try to expand this extent to cover @index */ + end_index = min(ext->oe_max_end, ((chunk + 1) << ppc_bits) - 1); + + next = next_extent(ext); + if (next != NULL && next->oe_start <= end_index) { + /* complex mode - overlapped with the next extent, + * this case will be handled by osc_extent_find() */ + rc = -EAGAIN; + goto out; + } + + ext->oe_end = end_index; + ext->oe_grants += chunksize; + *grants -= chunksize; + LASSERT(*grants >= 0); + EASSERTF(osc_extent_is_overlapped(obj, ext) == 0, ext, + "overlapped after expanding for %lu.\n", index); + +out: + osc_object_unlock(obj); + return rc; +} + +static void osc_extent_tree_dump0(int level, struct osc_object *obj, + const char *func, int line) +{ + struct osc_extent *ext; + int cnt; + + CDEBUG(level, "Dump object %p extents at %s:%d, mppr: %u.\n", + obj, func, line, osc_cli(obj)->cl_max_pages_per_rpc); + + /* osc_object_lock(obj); */ + cnt = 1; + for (ext = first_extent(obj); ext != NULL; ext = next_extent(ext)) + OSC_EXTENT_DUMP(level, ext, "in tree %d.\n", cnt++); + + cnt = 1; + list_for_each_entry(ext, &obj->oo_hp_exts, oe_link) + OSC_EXTENT_DUMP(level, ext, "hp %d.\n", cnt++); + + cnt = 1; + list_for_each_entry(ext, &obj->oo_urgent_exts, oe_link) + OSC_EXTENT_DUMP(level, ext, "urgent %d.\n", cnt++); + + cnt = 1; + list_for_each_entry(ext, &obj->oo_reading_exts, oe_link) + OSC_EXTENT_DUMP(level, ext, "reading %d.\n", cnt++); + /* osc_object_unlock(obj); */ +} + +/* ------------------ osc extent end ------------------ */ + +static inline int osc_is_ready(struct osc_object *osc) +{ + return !list_empty(&osc->oo_ready_item) || + !list_empty(&osc->oo_hp_ready_item); +} + +#define OSC_IO_DEBUG(OSC, STR, args...) 
\ + CDEBUG(D_CACHE, "obj %p ready %d|%c|%c wr %d|%c|%c rd %d|%c " STR, \ + (OSC), osc_is_ready(OSC), \ + list_empty_marker(&(OSC)->oo_hp_ready_item), \ + list_empty_marker(&(OSC)->oo_ready_item), \ + atomic_read(&(OSC)->oo_nr_writes), \ + list_empty_marker(&(OSC)->oo_hp_exts), \ + list_empty_marker(&(OSC)->oo_urgent_exts), \ + atomic_read(&(OSC)->oo_nr_reads), \ + list_empty_marker(&(OSC)->oo_reading_exts), \ + ##args) + +static int osc_make_ready(const struct lu_env *env, struct osc_async_page *oap, + int cmd) +{ + struct osc_page *opg = oap2osc_page(oap); + struct cl_page *page = cl_page_top(oap2cl_page(oap)); + int result; + + LASSERT(cmd == OBD_BRW_WRITE); /* no cached reads */ + + result = cl_page_make_ready(env, page, CRT_WRITE); + if (result == 0) + opg->ops_submit_time = cfs_time_current(); + return result; +} + +static int osc_refresh_count(const struct lu_env *env, + struct osc_async_page *oap, int cmd) +{ + struct osc_page *opg = oap2osc_page(oap); + struct cl_page *page = oap2cl_page(oap); + struct cl_object *obj; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + + int result; + loff_t kms; + + /* readpage queues with _COUNT_STABLE, shouldn't get here. */ + LASSERT(!(cmd & OBD_BRW_READ)); + LASSERT(opg != NULL); + obj = opg->ops_cl.cpl_obj; + + cl_object_attr_lock(obj); + result = cl_object_attr_get(env, obj, attr); + cl_object_attr_unlock(obj); + if (result < 0) + return result; + kms = attr->cat_kms; + if (cl_offset(obj, page->cp_index) >= kms) + /* catch race with truncate */ + return 0; + else if (cl_offset(obj, page->cp_index + 1) > kms) + /* catch sub-page write at end of file */ + return kms % PAGE_CACHE_SIZE; + else + return PAGE_CACHE_SIZE; +} + +static int osc_completion(const struct lu_env *env, struct osc_async_page *oap, + int cmd, int rc) +{ + struct osc_page *opg = oap2osc_page(oap); + struct cl_page *page = cl_page_top(oap2cl_page(oap)); + struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj); + enum cl_req_type crt; + int srvlock; + + cmd &= ~OBD_BRW_NOQUOTA; + LASSERT(equi(page->cp_state == CPS_PAGEIN, cmd == OBD_BRW_READ)); + LASSERT(equi(page->cp_state == CPS_PAGEOUT, cmd == OBD_BRW_WRITE)); + LASSERT(opg->ops_transfer_pinned); + + /* + * page->cp_req can be NULL if io submission failed before + * cl_req was allocated. + */ + if (page->cp_req != NULL) + cl_req_page_done(env, page); + LASSERT(page->cp_req == NULL); + + crt = cmd == OBD_BRW_READ ? CRT_READ : CRT_WRITE; + /* Clear opg->ops_transfer_pinned before VM lock is released. */ + opg->ops_transfer_pinned = 0; + + spin_lock(&obj->oo_seatbelt); + LASSERT(opg->ops_submitter != NULL); + LASSERT(!list_empty(&opg->ops_inflight)); + list_del_init(&opg->ops_inflight); + opg->ops_submitter = NULL; + spin_unlock(&obj->oo_seatbelt); + + opg->ops_submit_time = 0; + srvlock = oap->oap_brw_flags & OBD_BRW_SRVLOCK; + + /* statistic */ + if (rc == 0 && srvlock) { + struct lu_device *ld = opg->ops_cl.cpl_obj->co_lu.lo_dev; + struct osc_stats *stats = &lu2osc_dev(ld)->od_stats; + int bytes = oap->oap_count; + + if (crt == CRT_READ) + stats->os_lockless_reads += bytes; + else + stats->os_lockless_writes += bytes; + } + + /* + * This has to be the last operation with the page, as locks are + * released in cl_page_completion() and nothing except for the + * reference counter protects page from concurrent reclaim. + */ + lu_ref_del(&page->cp_reference, "transfer", page); + + cl_page_completion(env, page, crt, rc); + + return 0; +} + +#define OSC_DUMP_GRANT(cli, fmt, args...) 
do { \ + struct client_obd *__tmp = (cli); \ + CDEBUG(D_CACHE, "%s: { dirty: %ld/%ld dirty_pages: %d/%d " \ + "dropped: %ld avail: %ld, reserved: %ld, flight: %d } " fmt, \ + __tmp->cl_import->imp_obd->obd_name, \ + __tmp->cl_dirty, __tmp->cl_dirty_max, \ + atomic_read(&obd_dirty_pages), obd_max_dirty_pages, \ + __tmp->cl_lost_grant, __tmp->cl_avail_grant, \ + __tmp->cl_reserved_grant, __tmp->cl_w_in_flight, ##args); \ +} while (0) + +/* caller must hold loi_list_lock */ +static void osc_consume_write_grant(struct client_obd *cli, + struct brw_page *pga) +{ + assert_spin_locked(&cli->cl_loi_list_lock.lock); + LASSERT(!(pga->flag & OBD_BRW_FROM_GRANT)); + atomic_inc(&obd_dirty_pages); + cli->cl_dirty += PAGE_CACHE_SIZE; + pga->flag |= OBD_BRW_FROM_GRANT; + CDEBUG(D_CACHE, "using %lu grant credits for brw %p page %p\n", + PAGE_CACHE_SIZE, pga, pga->pg); + osc_update_next_shrink(cli); +} + +/* the companion to osc_consume_write_grant, called when a brw has completed. + * must be called with the loi lock held. */ +static void osc_release_write_grant(struct client_obd *cli, + struct brw_page *pga) +{ + assert_spin_locked(&cli->cl_loi_list_lock.lock); + if (!(pga->flag & OBD_BRW_FROM_GRANT)) { + return; + } + + pga->flag &= ~OBD_BRW_FROM_GRANT; + atomic_dec(&obd_dirty_pages); + cli->cl_dirty -= PAGE_CACHE_SIZE; + if (pga->flag & OBD_BRW_NOCACHE) { + pga->flag &= ~OBD_BRW_NOCACHE; + atomic_dec(&obd_dirty_transit_pages); + cli->cl_dirty_transit -= PAGE_CACHE_SIZE; + } +} + +/** + * To avoid sleeping with object lock held, it's good for us allocate enough + * grants before entering into critical section. + * + * client_obd_list_lock held by caller + */ +static int osc_reserve_grant(struct client_obd *cli, unsigned int bytes) +{ + int rc = -EDQUOT; + + if (cli->cl_avail_grant >= bytes) { + cli->cl_avail_grant -= bytes; + cli->cl_reserved_grant += bytes; + rc = 0; + } + return rc; +} + +static void __osc_unreserve_grant(struct client_obd *cli, + unsigned int reserved, unsigned int unused) +{ + /* it's quite normal for us to get more grant than reserved. + * Thinking about a case that two extents merged by adding a new + * chunk, we can save one extent tax. If extent tax is greater than + * one chunk, we can save more grant by adding a new chunk */ + cli->cl_reserved_grant -= reserved; + if (unused > reserved) { + cli->cl_avail_grant += reserved; + cli->cl_lost_grant += unused - reserved; + } else { + cli->cl_avail_grant += unused; + } +} + +void osc_unreserve_grant(struct client_obd *cli, + unsigned int reserved, unsigned int unused) +{ + client_obd_list_lock(&cli->cl_loi_list_lock); + __osc_unreserve_grant(cli, reserved, unused); + if (unused > 0) + osc_wake_cache_waiters(cli); + client_obd_list_unlock(&cli->cl_loi_list_lock); +} + +/** + * Free grant after IO is finished or canceled. + * + * @lost_grant is used to remember how many grants we have allocated but not + * used, we should return these grants to OST. There're two cases where grants + * can be lost: + * 1. truncate; + * 2. blocksize at OST is less than PAGE_CACHE_SIZE and a partial page was + * written. In this case OST may use less chunks to serve this partial + * write. OSTs don't actually know the page size on the client side. so + * clients have to calculate lost grant by the blocksize on the OST. + * See filter_grant_check() for details. 
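
A worked example of case 2 above, the short tail write against an OST whose block size is smaller than the client page size. Both sizes below are assumptions chosen for the illustration:

#include <stdio.h>

/*
 * The client reserved a full page of grant, but the OST only consumes
 * whole blocks for a short write, so the unused remainder is reported
 * back as lost grant.
 */
int main(void)
{
        unsigned int page_size = 4096;          /* client page size (assumed) */
        unsigned int blocksize = 1024;          /* OST block size (assumed)   */
        unsigned int page_off = 0;              /* write starts at page start */
        unsigned int count = 1500;              /* bytes actually written     */

        /* round the written byte range out to whole OST blocks */
        unsigned int used = count + (page_off & (blocksize - 1));
        unsigned int end = (page_off + count) & (blocksize - 1);
        if (end)
                used += blocksize - end;

        unsigned int lost = page_size - used;   /* grant to hand back */

        /* 1500 bytes span two 1 KiB blocks -> 2048 used, 2048 returned */
        printf("used=%u lost=%u\n", used, lost);
        return 0;
}
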
+ */ +static void osc_free_grant(struct client_obd *cli, unsigned int nr_pages, + unsigned int lost_grant) +{ + int grant = (1 << cli->cl_chunkbits) + cli->cl_extent_tax; + + client_obd_list_lock(&cli->cl_loi_list_lock); + atomic_sub(nr_pages, &obd_dirty_pages); + cli->cl_dirty -= nr_pages << PAGE_CACHE_SHIFT; + cli->cl_lost_grant += lost_grant; + if (cli->cl_avail_grant < grant && cli->cl_lost_grant >= grant) { + /* borrow some grant from truncate to avoid the case that + * truncate uses up all avail grant */ + cli->cl_lost_grant -= grant; + cli->cl_avail_grant += grant; + } + osc_wake_cache_waiters(cli); + client_obd_list_unlock(&cli->cl_loi_list_lock); + CDEBUG(D_CACHE, "lost %u grant: %lu avail: %lu dirty: %lu\n", + lost_grant, cli->cl_lost_grant, + cli->cl_avail_grant, cli->cl_dirty); +} + +/** + * The companion to osc_enter_cache(), called when @oap is no longer part of + * the dirty accounting due to error. + */ +static void osc_exit_cache(struct client_obd *cli, struct osc_async_page *oap) +{ + client_obd_list_lock(&cli->cl_loi_list_lock); + osc_release_write_grant(cli, &oap->oap_brw_page); + client_obd_list_unlock(&cli->cl_loi_list_lock); +} + +/** + * Non-blocking version of osc_enter_cache() that consumes grant only when it + * is available. + */ +static int osc_enter_cache_try(struct client_obd *cli, + struct osc_async_page *oap, + int bytes, int transient) +{ + int rc; + + OSC_DUMP_GRANT(cli, "need:%d.\n", bytes); + + rc = osc_reserve_grant(cli, bytes); + if (rc < 0) + return 0; + + if (cli->cl_dirty + PAGE_CACHE_SIZE <= cli->cl_dirty_max && + atomic_read(&obd_dirty_pages) + 1 <= obd_max_dirty_pages) { + osc_consume_write_grant(cli, &oap->oap_brw_page); + if (transient) { + cli->cl_dirty_transit += PAGE_CACHE_SIZE; + atomic_inc(&obd_dirty_transit_pages); + oap->oap_brw_flags |= OBD_BRW_NOCACHE; + } + rc = 1; + } else { + __osc_unreserve_grant(cli, bytes, bytes); + rc = 0; + } + return rc; +} + +static int ocw_granted(struct client_obd *cli, struct osc_cache_waiter *ocw) +{ + int rc; + client_obd_list_lock(&cli->cl_loi_list_lock); + rc = list_empty(&ocw->ocw_entry); + client_obd_list_unlock(&cli->cl_loi_list_lock); + return rc; +} + +/** + * The main entry to reserve dirty page accounting. Usually the grant reserved + * in this function will be freed in bulk in osc_free_grant() unless it fails + * to add osc cache, in that case, it will be freed in osc_exit_cache(). + * + * The process will be put into sleep if it's already run out of grant. + */ +static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli, + struct osc_async_page *oap, int bytes) +{ + struct osc_object *osc = oap->oap_obj; + struct lov_oinfo *loi = osc->oo_oinfo; + struct osc_cache_waiter ocw; + struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); + int rc = -EDQUOT; + + OSC_DUMP_GRANT(cli, "need:%d.\n", bytes); + + client_obd_list_lock(&cli->cl_loi_list_lock); + + /* force the caller to try sync io. this can jump the list + * of queued writes and create a discontiguous rpc stream */ + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_NO_GRANT) || + cli->cl_dirty_max < PAGE_CACHE_SIZE || + cli->cl_ar.ar_force_sync || loi->loi_ar.ar_force_sync) { + rc = -EDQUOT; + goto out; + } + + /* Hopefully normal case - cache space and write credits available */ + if (osc_enter_cache_try(cli, oap, bytes, 0)) { + rc = 0; + goto out; + } + + /* We can get here for two reasons: too many dirty pages in cache, or + * run out of grants. In both cases we should write dirty pages out. 
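
The reserve/unreserve pair above (osc_reserve_grant() and __osc_unreserve_grant()) keeps three counters in balance: grant is moved from available to reserved before the object lock is taken, and whatever was not consumed is returned afterwards. A minimal standalone sketch of that bookkeeping, with simplified field names and an assumed starting grant:

#include <stdio.h>

struct grant {
        long avail;     /* cf. cl_avail_grant    */
        long reserved;  /* cf. cl_reserved_grant */
        long lost;      /* cf. cl_lost_grant     */
};

static int reserve(struct grant *g, long bytes)
{
        if (g->avail < bytes)
                return -1;              /* -EDQUOT in the real code */
        g->avail -= bytes;
        g->reserved += bytes;
        return 0;
}

static void unreserve(struct grant *g, long reserved, long unused)
{
        g->reserved -= reserved;
        if (unused > reserved) {        /* e.g. a merge saved one extent tax */
                g->avail += reserved;
                g->lost += unused - reserved;
        } else {
                g->avail += unused;
        }
}

int main(void)
{
        struct grant g = { 1 << 20, 0, 0 };     /* 1 MiB available (assumed) */

        if (reserve(&g, 70000) == 0)            /* roughly chunk + extent tax */
                unreserve(&g, 70000, 4096);     /* only 4 KiB went unused     */
        printf("avail=%ld reserved=%ld lost=%ld\n", g.avail, g.reserved, g.lost);
        return 0;
}
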
+ * Adding a cache waiter will trigger urgent write-out no matter what + * RPC size will be. + * The exiting condition is no avail grants and no dirty pages caching, + * that really means there is no space on the OST. */ + init_waitqueue_head(&ocw.ocw_waitq); + ocw.ocw_oap = oap; + ocw.ocw_grant = bytes; + while (cli->cl_dirty > 0 || cli->cl_w_in_flight > 0) { + list_add_tail(&ocw.ocw_entry, &cli->cl_cache_waiters); + ocw.ocw_rc = 0; + client_obd_list_unlock(&cli->cl_loi_list_lock); + + osc_io_unplug_async(env, cli, NULL); + + CDEBUG(D_CACHE, "%s: sleeping for cache space @ %p for %p\n", + cli->cl_import->imp_obd->obd_name, &ocw, oap); + + rc = l_wait_event(ocw.ocw_waitq, ocw_granted(cli, &ocw), &lwi); + + client_obd_list_lock(&cli->cl_loi_list_lock); + + /* l_wait_event is interrupted by signal */ + if (rc < 0) { + list_del_init(&ocw.ocw_entry); + goto out; + } + + LASSERT(list_empty(&ocw.ocw_entry)); + rc = ocw.ocw_rc; + + if (rc != -EDQUOT) + goto out; + if (osc_enter_cache_try(cli, oap, bytes, 0)) { + rc = 0; + goto out; + } + } +out: + client_obd_list_unlock(&cli->cl_loi_list_lock); + OSC_DUMP_GRANT(cli, "returned %d.\n", rc); + return rc; +} + +/* caller must hold loi_list_lock */ +void osc_wake_cache_waiters(struct client_obd *cli) +{ + struct list_head *l, *tmp; + struct osc_cache_waiter *ocw; + + list_for_each_safe(l, tmp, &cli->cl_cache_waiters) { + ocw = list_entry(l, struct osc_cache_waiter, ocw_entry); + list_del_init(&ocw->ocw_entry); + + ocw->ocw_rc = -EDQUOT; + /* we can't dirty more */ + if ((cli->cl_dirty + PAGE_CACHE_SIZE > cli->cl_dirty_max) || + (atomic_read(&obd_dirty_pages) + 1 > + obd_max_dirty_pages)) { + CDEBUG(D_CACHE, "no dirty room: dirty: %ld osc max %ld, sys max %d\n", + cli->cl_dirty, + cli->cl_dirty_max, obd_max_dirty_pages); + goto wakeup; + } + + ocw->ocw_rc = 0; + if (!osc_enter_cache_try(cli, ocw->ocw_oap, ocw->ocw_grant, 0)) + ocw->ocw_rc = -EDQUOT; + +wakeup: + CDEBUG(D_CACHE, "wake up %p for oap %p, avail grant %ld, %d\n", + ocw, ocw->ocw_oap, cli->cl_avail_grant, ocw->ocw_rc); + + wake_up(&ocw->ocw_waitq); + } +} + +static int osc_max_rpc_in_flight(struct client_obd *cli, struct osc_object *osc) +{ + int hprpc = !!list_empty(&osc->oo_hp_exts); + return rpcs_in_flight(cli) >= cli->cl_max_rpcs_in_flight + hprpc; +} + +/* This maintains the lists of pending pages to read/write for a given object + * (lop). This is used by osc_check_rpcs->osc_next_obj() and osc_list_maint() + * to quickly find objects that are ready to send an RPC. */ +static int osc_makes_rpc(struct client_obd *cli, struct osc_object *osc, + int cmd) +{ + int invalid_import = 0; + + /* if we have an invalid import we want to drain the queued pages + * by forcing them through rpcs that immediately fail and complete + * the pages. recovery relies on this to empty the queued pages + * before canceling the locks and evicting down the llite pages */ + if ((cli->cl_import == NULL || cli->cl_import->imp_invalid)) + invalid_import = 1; + + if (cmd & OBD_BRW_WRITE) { + if (atomic_read(&osc->oo_nr_writes) == 0) + return 0; + if (invalid_import) { + CDEBUG(D_CACHE, "invalid import forcing RPC\n"); + return 1; + } + if (!list_empty(&osc->oo_hp_exts)) { + CDEBUG(D_CACHE, "high prio request forcing RPC\n"); + return 1; + } + if (!list_empty(&osc->oo_urgent_exts)) { + CDEBUG(D_CACHE, "urgent request forcing RPC\n"); + return 1; + } + /* trigger a write rpc stream as long as there are dirtiers + * waiting for space. 
as they're waiting, they're not going to + * create more pages to coalesce with what's waiting.. */ + if (!list_empty(&cli->cl_cache_waiters)) { + CDEBUG(D_CACHE, "cache waiters forcing RPC\n"); + return 1; + } + if (atomic_read(&osc->oo_nr_writes) >= + cli->cl_max_pages_per_rpc) + return 1; + } else { + if (atomic_read(&osc->oo_nr_reads) == 0) + return 0; + if (invalid_import) { + CDEBUG(D_CACHE, "invalid import forcing RPC\n"); + return 1; + } + /* all read are urgent. */ + if (!list_empty(&osc->oo_reading_exts)) + return 1; + } + + return 0; +} + +static void osc_update_pending(struct osc_object *obj, int cmd, int delta) +{ + struct client_obd *cli = osc_cli(obj); + if (cmd & OBD_BRW_WRITE) { + atomic_add(delta, &obj->oo_nr_writes); + atomic_add(delta, &cli->cl_pending_w_pages); + LASSERT(atomic_read(&obj->oo_nr_writes) >= 0); + } else { + atomic_add(delta, &obj->oo_nr_reads); + atomic_add(delta, &cli->cl_pending_r_pages); + LASSERT(atomic_read(&obj->oo_nr_reads) >= 0); + } + OSC_IO_DEBUG(obj, "update pending cmd %d delta %d.\n", cmd, delta); +} + +static int osc_makes_hprpc(struct osc_object *obj) +{ + return !list_empty(&obj->oo_hp_exts); +} + +static void on_list(struct list_head *item, struct list_head *list, int should_be_on) +{ + if (list_empty(item) && should_be_on) + list_add_tail(item, list); + else if (!list_empty(item) && !should_be_on) + list_del_init(item); +} + +/* maintain the osc's cli list membership invariants so that osc_send_oap_rpc + * can find pages to build into rpcs quickly */ +static int __osc_list_maint(struct client_obd *cli, struct osc_object *osc) +{ + if (osc_makes_hprpc(osc)) { + /* HP rpc */ + on_list(&osc->oo_ready_item, &cli->cl_loi_ready_list, 0); + on_list(&osc->oo_hp_ready_item, &cli->cl_loi_hp_ready_list, 1); + } else { + on_list(&osc->oo_hp_ready_item, &cli->cl_loi_hp_ready_list, 0); + on_list(&osc->oo_ready_item, &cli->cl_loi_ready_list, + osc_makes_rpc(cli, osc, OBD_BRW_WRITE) || + osc_makes_rpc(cli, osc, OBD_BRW_READ)); + } + + on_list(&osc->oo_write_item, &cli->cl_loi_write_list, + atomic_read(&osc->oo_nr_writes) > 0); + + on_list(&osc->oo_read_item, &cli->cl_loi_read_list, + atomic_read(&osc->oo_nr_reads) > 0); + + return osc_is_ready(osc); +} + +static int osc_list_maint(struct client_obd *cli, struct osc_object *osc) +{ + int is_ready; + + client_obd_list_lock(&cli->cl_loi_list_lock); + is_ready = __osc_list_maint(cli, osc); + client_obd_list_unlock(&cli->cl_loi_list_lock); + + return is_ready; +} + +/* this is trying to propagate async writeback errors back up to the + * application. As an async write fails we record the error code for later if + * the app does an fsync. As long as errors persist we force future rpcs to be + * sync so that the app can get a sync error and break the cycle of queueing + * pages for which writeback will fail. 
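
The policy just described can be condensed into a few lines; the sketch below mirrors it with simplified types, and the xid counter is a stand-in for illustration, not the ptlrpc API:

#include <stdio.h>

/*
 * Remember the first asynchronous error, force synchronous I/O, and only
 * relax once a write whose transfer id is past the failure point succeeds.
 */
struct async_rc {
        int rc;                         /* first recorded error        */
        int force_sync;                 /* future writes must be sync  */
        unsigned long long min_xid;     /* first xid that may clear it */
};

static unsigned long long next_xid = 100;       /* stand-in xid source */

static void process_ar(struct async_rc *ar, unsigned long long xid, int rc)
{
        if (rc) {
                if (!ar->rc)
                        ar->rc = rc;            /* keep the first failure */
                ar->force_sync = 1;
                ar->min_xid = ++next_xid;       /* sample the next xid    */
                return;
        }
        if (ar->force_sync && xid >= ar->min_xid)
                ar->force_sync = 0;             /* a newer write made it  */
}

int main(void)
{
        struct async_rc ar = { 0, 0, 0 };

        process_ar(&ar, 100, -5);       /* async write failed              */
        process_ar(&ar, 100, 0);        /* old xid succeeds: still forced  */
        printf("force_sync=%d rc=%d\n", ar.force_sync, ar.rc);  /* 1, -5   */
        process_ar(&ar, 101, 0);        /* xid past the failure: relax     */
        printf("force_sync=%d\n", ar.force_sync);               /* 0       */
        return 0;
}
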
*/ +static void osc_process_ar(struct osc_async_rc *ar, __u64 xid, + int rc) +{ + if (rc) { + if (!ar->ar_rc) + ar->ar_rc = rc; + + ar->ar_force_sync = 1; + ar->ar_min_xid = ptlrpc_sample_next_xid(); + return; + + } + + if (ar->ar_force_sync && (xid >= ar->ar_min_xid)) + ar->ar_force_sync = 0; +} + + +/* this must be called holding the loi list lock to give coverage to exit_cache, + * async_flag maintenance, and oap_request */ +static void osc_ap_completion(const struct lu_env *env, struct client_obd *cli, + struct osc_async_page *oap, int sent, int rc) +{ + struct osc_object *osc = oap->oap_obj; + struct lov_oinfo *loi = osc->oo_oinfo; + __u64 xid = 0; + + if (oap->oap_request != NULL) { + xid = ptlrpc_req_xid(oap->oap_request); + ptlrpc_req_finished(oap->oap_request); + oap->oap_request = NULL; + } + + /* As the transfer for this page is being done, clear the flags */ + spin_lock(&oap->oap_lock); + oap->oap_async_flags = 0; + spin_unlock(&oap->oap_lock); + oap->oap_interrupted = 0; + + if (oap->oap_cmd & OBD_BRW_WRITE && xid > 0) { + client_obd_list_lock(&cli->cl_loi_list_lock); + osc_process_ar(&cli->cl_ar, xid, rc); + osc_process_ar(&loi->loi_ar, xid, rc); + client_obd_list_unlock(&cli->cl_loi_list_lock); + } + + rc = osc_completion(env, oap, oap->oap_cmd, rc); + if (rc) + CERROR("completion on oap %p obj %p returns %d.\n", + oap, osc, rc); +} + +/** + * Try to add extent to one RPC. We need to think about the following things: + * - # of pages must not be over max_pages_per_rpc + * - extent must be compatible with previous ones + */ +static int try_to_add_extent_for_io(struct client_obd *cli, + struct osc_extent *ext, struct list_head *rpclist, + int *pc, unsigned int *max_pages) +{ + struct osc_extent *tmp; + struct osc_async_page *oap = list_first_entry(&ext->oe_pages, + struct osc_async_page, + oap_pending_item); + + EASSERT((ext->oe_state == OES_CACHE || ext->oe_state == OES_LOCK_DONE), + ext); + + *max_pages = max(ext->oe_mppr, *max_pages); + if (*pc + ext->oe_nr_pages > *max_pages) + return 0; + + list_for_each_entry(tmp, rpclist, oe_link) { + struct osc_async_page *oap2; + + oap2 = list_first_entry(&tmp->oe_pages, struct osc_async_page, + oap_pending_item); + EASSERT(tmp->oe_owner == current, tmp); +#if 0 + if (overlapped(tmp, ext)) { + OSC_EXTENT_DUMP(D_ERROR, tmp, "overlapped %p.\n", ext); + EASSERT(0, ext); + } +#endif + if (oap2cl_page(oap)->cp_type != oap2cl_page(oap2)->cp_type) { + CDEBUG(D_CACHE, "Do not permit different type of IO" + " for a same RPC\n"); + return 0; + } + + if (tmp->oe_srvlock != ext->oe_srvlock || + !tmp->oe_grants != !ext->oe_grants) + return 0; + + /* remove break for strict check */ + break; + } + + *pc += ext->oe_nr_pages; + list_move_tail(&ext->oe_link, rpclist); + ext->oe_owner = current; + return 1; +} + +/** + * In order to prevent multiple ptlrpcd from breaking contiguous extents, + * get_write_extent() takes all appropriate extents in atomic. + * + * The following policy is used to collect extents for IO: + * 1. Add as many HP extents as possible; + * 2. Add the first urgent extent in urgent extent list and take it out of + * urgent list; + * 3. Add subsequent extents of this urgent extent; + * 4. If urgent list is not empty, goto 2; + * 5. Traverse the extent tree from the 1st extent; + * 6. Above steps exit if there is no space in this RPC. 
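
The budget that cuts those steps short is enforced by try_to_add_extent_for_io(); the condensed sketch below shows just the page counting, with the HP/urgent/tree ordering and the compatibility checks (same lock, same srvlock, same grant class) omitted and the sizes assumed:

#include <stdio.h>

int main(void)
{
        unsigned int extent_pages[] = { 64, 128, 32, 96 };      /* example sizes */
        unsigned int max_pages = 256;   /* max_pages_per_rpc (assumed)           */
        unsigned int page_count = 0;

        for (unsigned int i = 0; i < 4; i++) {
                if (page_count + extent_pages[i] > max_pages)
                        break;                          /* RPC is full */
                page_count += extent_pages[i];
                printf("added extent %u (%u pages), total %u\n",
                       i, extent_pages[i], page_count);
        }
        /* expect extents 0..2 (224 pages); extent 3 would overflow the RPC */
        printf("pages in this RPC: %u\n", page_count);
        return 0;
}
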
+ */ +static int get_write_extents(struct osc_object *obj, struct list_head *rpclist) +{ + struct client_obd *cli = osc_cli(obj); + struct osc_extent *ext; + int page_count = 0; + unsigned int max_pages = cli->cl_max_pages_per_rpc; + + LASSERT(osc_object_is_locked(obj)); + while (!list_empty(&obj->oo_hp_exts)) { + ext = list_entry(obj->oo_hp_exts.next, struct osc_extent, + oe_link); + LASSERT(ext->oe_state == OES_CACHE); + if (!try_to_add_extent_for_io(cli, ext, rpclist, &page_count, + &max_pages)) + return page_count; + EASSERT(ext->oe_nr_pages <= max_pages, ext); + } + if (page_count == max_pages) + return page_count; + + while (!list_empty(&obj->oo_urgent_exts)) { + ext = list_entry(obj->oo_urgent_exts.next, + struct osc_extent, oe_link); + if (!try_to_add_extent_for_io(cli, ext, rpclist, &page_count, + &max_pages)) + return page_count; + + if (!ext->oe_intree) + continue; + + while ((ext = next_extent(ext)) != NULL) { + if ((ext->oe_state != OES_CACHE) || + (!list_empty(&ext->oe_link) && + ext->oe_owner != NULL)) + continue; + + if (!try_to_add_extent_for_io(cli, ext, rpclist, + &page_count, &max_pages)) + return page_count; + } + } + if (page_count == max_pages) + return page_count; + + ext = first_extent(obj); + while (ext != NULL) { + if ((ext->oe_state != OES_CACHE) || + /* this extent may be already in current rpclist */ + (!list_empty(&ext->oe_link) && ext->oe_owner != NULL)) { + ext = next_extent(ext); + continue; + } + + if (!try_to_add_extent_for_io(cli, ext, rpclist, &page_count, + &max_pages)) + return page_count; + + ext = next_extent(ext); + } + return page_count; +} + +static int +osc_send_write_rpc(const struct lu_env *env, struct client_obd *cli, + struct osc_object *osc, pdl_policy_t pol) +{ + LIST_HEAD(rpclist); + struct osc_extent *ext; + struct osc_extent *tmp; + struct osc_extent *first = NULL; + u32 page_count = 0; + int srvlock = 0; + int rc = 0; + + LASSERT(osc_object_is_locked(osc)); + + page_count = get_write_extents(osc, &rpclist); + LASSERT(equi(page_count == 0, list_empty(&rpclist))); + + if (list_empty(&rpclist)) + return 0; + + osc_update_pending(osc, OBD_BRW_WRITE, -page_count); + + list_for_each_entry(ext, &rpclist, oe_link) { + LASSERT(ext->oe_state == OES_CACHE || + ext->oe_state == OES_LOCK_DONE); + if (ext->oe_state == OES_CACHE) + osc_extent_state_set(ext, OES_LOCKING); + else + osc_extent_state_set(ext, OES_RPC); + } + + /* we're going to grab page lock, so release object lock because + * lock order is page lock -> object lock. */ + osc_object_unlock(osc); + + list_for_each_entry_safe(ext, tmp, &rpclist, oe_link) { + if (ext->oe_state == OES_LOCKING) { + rc = osc_extent_make_ready(env, ext); + if (unlikely(rc < 0)) { + list_del_init(&ext->oe_link); + osc_extent_finish(env, ext, 0, rc); + continue; + } + } + if (first == NULL) { + first = ext; + srvlock = ext->oe_srvlock; + } else { + LASSERT(srvlock == ext->oe_srvlock); + } + } + + if (!list_empty(&rpclist)) { + LASSERT(page_count > 0); + rc = osc_build_rpc(env, cli, &rpclist, OBD_BRW_WRITE, pol); + LASSERT(list_empty(&rpclist)); + } + + osc_object_lock(osc); + return rc; +} + +/** + * prepare pages for ASYNC io and put pages in send queue. + * + * \param cmd OBD_BRW_* macroses + * \param lop pending pages + * + * \return zero if no page added to send queue. + * \return 1 if pages successfully added to send queue. + * \return negative on errors. 
+ */ +static int +osc_send_read_rpc(const struct lu_env *env, struct client_obd *cli, + struct osc_object *osc, pdl_policy_t pol) +{ + struct osc_extent *ext; + struct osc_extent *next; + LIST_HEAD(rpclist); + int page_count = 0; + unsigned int max_pages = cli->cl_max_pages_per_rpc; + int rc = 0; + + LASSERT(osc_object_is_locked(osc)); + list_for_each_entry_safe(ext, next, + &osc->oo_reading_exts, oe_link) { + EASSERT(ext->oe_state == OES_LOCK_DONE, ext); + if (!try_to_add_extent_for_io(cli, ext, &rpclist, &page_count, + &max_pages)) + break; + osc_extent_state_set(ext, OES_RPC); + EASSERT(ext->oe_nr_pages <= max_pages, ext); + } + LASSERT(page_count <= max_pages); + + osc_update_pending(osc, OBD_BRW_READ, -page_count); + + if (!list_empty(&rpclist)) { + osc_object_unlock(osc); + + LASSERT(page_count > 0); + rc = osc_build_rpc(env, cli, &rpclist, OBD_BRW_READ, pol); + LASSERT(list_empty(&rpclist)); + + osc_object_lock(osc); + } + return rc; +} + +#define list_to_obj(list, item) ({ \ + struct list_head *__tmp = (list)->next; \ + list_del_init(__tmp); \ + list_entry(__tmp, struct osc_object, oo_##item); \ +}) + +/* This is called by osc_check_rpcs() to find which objects have pages that + * we could be sending. These lists are maintained by osc_makes_rpc(). */ +static struct osc_object *osc_next_obj(struct client_obd *cli) +{ + /* First return objects that have blocked locks so that they + * will be flushed quickly and other clients can get the lock, + * then objects which have pages ready to be stuffed into RPCs */ + if (!list_empty(&cli->cl_loi_hp_ready_list)) + return list_to_obj(&cli->cl_loi_hp_ready_list, hp_ready_item); + if (!list_empty(&cli->cl_loi_ready_list)) + return list_to_obj(&cli->cl_loi_ready_list, ready_item); + + /* then if we have cache waiters, return all objects with queued + * writes. This is especially important when many small files + * have filled up the cache and not been fired into rpcs because + * they don't pass the nr_pending/object threshold */ + if (!list_empty(&cli->cl_cache_waiters) && + !list_empty(&cli->cl_loi_write_list)) + return list_to_obj(&cli->cl_loi_write_list, write_item); + + /* then return all queued objects when we have an invalid import + * so that they get flushed */ + if (cli->cl_import == NULL || cli->cl_import->imp_invalid) { + if (!list_empty(&cli->cl_loi_write_list)) + return list_to_obj(&cli->cl_loi_write_list, write_item); + if (!list_empty(&cli->cl_loi_read_list)) + return list_to_obj(&cli->cl_loi_read_list, read_item); + } + return NULL; +} + +/* called with the loi list lock held */ +static void osc_check_rpcs(const struct lu_env *env, struct client_obd *cli, + pdl_policy_t pol) +{ + struct osc_object *osc; + int rc = 0; + + while ((osc = osc_next_obj(cli)) != NULL) { + struct cl_object *obj = osc2cl(osc); + struct lu_ref_link link; + + OSC_IO_DEBUG(osc, "%lu in flight\n", rpcs_in_flight(cli)); + + if (osc_max_rpc_in_flight(cli, osc)) { + __osc_list_maint(cli, osc); + break; + } + + cl_object_get(obj); + client_obd_list_unlock(&cli->cl_loi_list_lock); + lu_object_ref_add_at(&obj->co_lu, &link, "check", + current); + + /* attempt some read/write balancing by alternating between + * reads and writes in an object. The makes_rpc checks here + * would be redundant if we were getting read/write work items + * instead of objects. 
we don't want send_oap_rpc to drain a + * partial read pending queue when we're given this object to + * do io on writes while there are cache waiters */ + osc_object_lock(osc); + if (osc_makes_rpc(cli, osc, OBD_BRW_WRITE)) { + rc = osc_send_write_rpc(env, cli, osc, pol); + if (rc < 0) { + CERROR("Write request failed with %d\n", rc); + + /* osc_send_write_rpc failed, mostly because of + * memory pressure. + * + * It can't break here, because if: + * - a page was submitted by osc_io_submit, so + * page locked; + * - no request in flight + * - no subsequent request + * The system will be in live-lock state, + * because there is no chance to call + * osc_io_unplug() and osc_check_rpcs() any + * more. pdflush can't help in this case, + * because it might be blocked at grabbing + * the page lock as we mentioned. + * + * Anyway, continue to drain pages. */ + /* break; */ + } + } + if (osc_makes_rpc(cli, osc, OBD_BRW_READ)) { + rc = osc_send_read_rpc(env, cli, osc, pol); + if (rc < 0) + CERROR("Read request failed with %d\n", rc); + } + osc_object_unlock(osc); + + osc_list_maint(cli, osc); + lu_object_ref_del_at(&obj->co_lu, &link, "check", + current); + cl_object_put(env, obj); + + client_obd_list_lock(&cli->cl_loi_list_lock); + } +} + +static int osc_io_unplug0(const struct lu_env *env, struct client_obd *cli, + struct osc_object *osc, pdl_policy_t pol, int async) +{ + int rc = 0; + + if (osc != NULL && osc_list_maint(cli, osc) == 0) + return 0; + + if (!async) { + /* disable osc_lru_shrink() temporarily to avoid + * potential stack overrun problem. LU-2859 */ + atomic_inc(&cli->cl_lru_shrinkers); + client_obd_list_lock(&cli->cl_loi_list_lock); + osc_check_rpcs(env, cli, pol); + client_obd_list_unlock(&cli->cl_loi_list_lock); + atomic_dec(&cli->cl_lru_shrinkers); + } else { + CDEBUG(D_CACHE, "Queue writeback work for client %p.\n", cli); + LASSERT(cli->cl_writeback_work != NULL); + rc = ptlrpcd_queue_work(cli->cl_writeback_work); + } + return rc; +} + +static int osc_io_unplug_async(const struct lu_env *env, + struct client_obd *cli, struct osc_object *osc) +{ + /* XXX: policy is no use actually. 
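+ * In this async path osc_io_unplug0() only queues cli->cl_writeback_work
+ * via ptlrpcd_queue_work() and the PDL_POLICY_ROUND value passed below is
+ * never consulted; the actual unplug happens later when that work item runs.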
*/ + return osc_io_unplug0(env, cli, osc, PDL_POLICY_ROUND, 1); +} + +void osc_io_unplug(const struct lu_env *env, struct client_obd *cli, + struct osc_object *osc, pdl_policy_t pol) +{ + (void)osc_io_unplug0(env, cli, osc, pol, 0); +} + +int osc_prep_async_page(struct osc_object *osc, struct osc_page *ops, + struct page *page, loff_t offset) +{ + struct obd_export *exp = osc_export(osc); + struct osc_async_page *oap = &ops->ops_oap; + + if (!page) + return cfs_size_round(sizeof(*oap)); + + oap->oap_magic = OAP_MAGIC; + oap->oap_cli = &exp->exp_obd->u.cli; + oap->oap_obj = osc; + + oap->oap_page = page; + oap->oap_obj_off = offset; + LASSERT(!(offset & ~CFS_PAGE_MASK)); + + if (!client_is_remote(exp) && capable(CFS_CAP_SYS_RESOURCE)) + oap->oap_brw_flags = OBD_BRW_NOQUOTA; + + INIT_LIST_HEAD(&oap->oap_pending_item); + INIT_LIST_HEAD(&oap->oap_rpc_item); + + spin_lock_init(&oap->oap_lock); + CDEBUG(D_INFO, "oap %p page %p obj off %llu\n", + oap, page, oap->oap_obj_off); + return 0; +} + +int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, + struct osc_page *ops) +{ + struct osc_io *oio = osc_env_io(env); + struct osc_extent *ext = NULL; + struct osc_async_page *oap = &ops->ops_oap; + struct client_obd *cli = oap->oap_cli; + struct osc_object *osc = oap->oap_obj; + pgoff_t index; + int grants = 0; + int brw_flags = OBD_BRW_ASYNC; + int cmd = OBD_BRW_WRITE; + int need_release = 0; + int rc = 0; + + if (oap->oap_magic != OAP_MAGIC) + return -EINVAL; + + if (cli->cl_import == NULL || cli->cl_import->imp_invalid) + return -EIO; + + if (!list_empty(&oap->oap_pending_item) || + !list_empty(&oap->oap_rpc_item)) + return -EBUSY; + + /* Set the OBD_BRW_SRVLOCK before the page is queued. */ + brw_flags |= ops->ops_srvlock ? OBD_BRW_SRVLOCK : 0; + if (!client_is_remote(osc_export(osc)) && + capable(CFS_CAP_SYS_RESOURCE)) { + brw_flags |= OBD_BRW_NOQUOTA; + cmd |= OBD_BRW_NOQUOTA; + } + + /* check if the file's owner/group is over quota */ + if (!(cmd & OBD_BRW_NOQUOTA)) { + struct cl_object *obj; + struct cl_attr *attr; + unsigned int qid[MAXQUOTAS]; + + obj = cl_object_top(&osc->oo_cl); + attr = &osc_env_info(env)->oti_attr; + + cl_object_attr_lock(obj); + rc = cl_object_attr_get(env, obj, attr); + cl_object_attr_unlock(obj); + + qid[USRQUOTA] = attr->cat_uid; + qid[GRPQUOTA] = attr->cat_gid; + if (rc == 0 && osc_quota_chkdq(cli, qid) == NO_QUOTA) + rc = -EDQUOT; + if (rc) + return rc; + } + + oap->oap_cmd = cmd; + oap->oap_page_off = ops->ops_from; + oap->oap_count = ops->ops_to - ops->ops_from; + oap->oap_async_flags = 0; + oap->oap_brw_flags = brw_flags; + + OSC_IO_DEBUG(osc, "oap %p page %p added for cmd %d\n", + oap, oap->oap_page, oap->oap_cmd & OBD_BRW_RWMASK); + + index = oap2cl_page(oap)->cp_index; + + /* Add this page into extent by the following steps: + * 1. if there exists an active extent for this IO, mostly this page + * can be added to the active extent and sometimes we need to + * expand extent to accommodate this page; + * 2. otherwise, a new extent will be allocated. 
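+ *
+ * In either case the page has to be backed by grant: roughly one chunk
+ * (1 << cl_chunkbits bytes) plus cl_extent_tax is reserved when the page
+ * lies beyond the extent's current end, while a page already covered by
+ * the extent needs no extra grant (see the grants/tmp handling below).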
*/ + + ext = oio->oi_active; + if (ext != NULL && ext->oe_start <= index && ext->oe_max_end >= index) { + /* one chunk plus extent overhead must be enough to write this + * page */ + grants = (1 << cli->cl_chunkbits) + cli->cl_extent_tax; + if (ext->oe_end >= index) + grants = 0; + + /* it doesn't need any grant to dirty this page */ + client_obd_list_lock(&cli->cl_loi_list_lock); + rc = osc_enter_cache_try(cli, oap, grants, 0); + client_obd_list_unlock(&cli->cl_loi_list_lock); + if (rc == 0) { /* try failed */ + grants = 0; + need_release = 1; + } else if (ext->oe_end < index) { + int tmp = grants; + /* try to expand this extent */ + rc = osc_extent_expand(ext, index, &tmp); + if (rc < 0) { + need_release = 1; + /* don't free reserved grant */ + } else { + OSC_EXTENT_DUMP(D_CACHE, ext, + "expanded for %lu.\n", index); + osc_unreserve_grant(cli, grants, tmp); + grants = 0; + } + } + rc = 0; + } else if (ext != NULL) { + /* index is located outside of active extent */ + need_release = 1; + } + if (need_release) { + osc_extent_release(env, ext); + oio->oi_active = NULL; + ext = NULL; + } + + if (ext == NULL) { + int tmp = (1 << cli->cl_chunkbits) + cli->cl_extent_tax; + + /* try to find new extent to cover this page */ + LASSERT(oio->oi_active == NULL); + /* we may have allocated grant for this page if we failed + * to expand the previous active extent. */ + LASSERT(ergo(grants > 0, grants >= tmp)); + + rc = 0; + if (grants == 0) { + /* we haven't allocated grant for this page. */ + rc = osc_enter_cache(env, cli, oap, tmp); + if (rc == 0) + grants = tmp; + } + + tmp = grants; + if (rc == 0) { + ext = osc_extent_find(env, osc, index, &tmp); + if (IS_ERR(ext)) { + LASSERT(tmp == grants); + osc_exit_cache(cli, oap); + rc = PTR_ERR(ext); + ext = NULL; + } else { + oio->oi_active = ext; + } + } + if (grants > 0) + osc_unreserve_grant(cli, grants, tmp); + } + + LASSERT(ergo(rc == 0, ext != NULL)); + if (ext != NULL) { + EASSERTF(ext->oe_end >= index && ext->oe_start <= index, + ext, "index = %lu.\n", index); + LASSERT((oap->oap_brw_flags & OBD_BRW_FROM_GRANT) != 0); + + osc_object_lock(osc); + if (ext->oe_nr_pages == 0) + ext->oe_srvlock = ops->ops_srvlock; + else + LASSERT(ext->oe_srvlock == ops->ops_srvlock); + ++ext->oe_nr_pages; + list_add_tail(&oap->oap_pending_item, &ext->oe_pages); + osc_object_unlock(osc); + } + return rc; +} + +int osc_teardown_async_page(const struct lu_env *env, + struct osc_object *obj, struct osc_page *ops) +{ + struct osc_async_page *oap = &ops->ops_oap; + struct osc_extent *ext = NULL; + int rc = 0; + + LASSERT(oap->oap_magic == OAP_MAGIC); + + CDEBUG(D_INFO, "teardown oap %p page %p at index %lu.\n", + oap, ops, oap2cl_page(oap)->cp_index); + + osc_object_lock(obj); + if (!list_empty(&oap->oap_rpc_item)) { + CDEBUG(D_CACHE, "oap %p is not in cache.\n", oap); + rc = -EBUSY; + } else if (!list_empty(&oap->oap_pending_item)) { + ext = osc_extent_lookup(obj, oap2cl_page(oap)->cp_index); + /* only truncated pages are allowed to be taken out. + * See osc_extent_truncate() and osc_cache_truncate_start() + * for details. */ + if (ext != NULL && ext->oe_state != OES_TRUNC) { + OSC_EXTENT_DUMP(D_ERROR, ext, "trunc at %lu.\n", + oap2cl_page(oap)->cp_index); + rc = -EBUSY; + } + } + osc_object_unlock(obj); + if (ext != NULL) + osc_extent_put(env, ext); + return rc; +} + +/** + * This is called when a page is picked up by kernel to write out. + * + * We should find out the corresponding extent and add the whole extent + * into urgent list. 
The extent may be being truncated or used, handle it + * carefully. + */ +int osc_flush_async_page(const struct lu_env *env, struct cl_io *io, + struct osc_page *ops) +{ + struct osc_extent *ext = NULL; + struct osc_object *obj = cl2osc(ops->ops_cl.cpl_obj); + struct cl_page *cp = ops->ops_cl.cpl_page; + pgoff_t index = cp->cp_index; + struct osc_async_page *oap = &ops->ops_oap; + bool unplug = false; + int rc = 0; + + osc_object_lock(obj); + ext = osc_extent_lookup(obj, index); + if (ext == NULL) { + osc_extent_tree_dump(D_ERROR, obj); + LASSERTF(0, "page index %lu is NOT covered.\n", index); + } + + switch (ext->oe_state) { + case OES_RPC: + case OES_LOCK_DONE: + CL_PAGE_DEBUG(D_ERROR, env, cl_page_top(cp), + "flush an in-rpc page?\n"); + LASSERT(0); + break; + case OES_LOCKING: + /* If we know this extent is being written out, we should abort + * so that the writer can make this page ready. Otherwise, there + * exists a deadlock problem because other process can wait for + * page writeback bit holding page lock; and meanwhile in + * vvp_page_make_ready(), we need to grab page lock before + * really sending the RPC. */ + case OES_TRUNC: + /* race with truncate, page will be redirtied */ + case OES_ACTIVE: + /* The extent is active so we need to abort and let the caller + * re-dirty the page. If we continued on here, and we were the + * one making the extent active, we could deadlock waiting for + * the page writeback to clear but it won't because the extent + * is active and won't be written out. */ + rc = -EAGAIN; + goto out; + default: + break; + } + + rc = cl_page_prep(env, io, cl_page_top(cp), CRT_WRITE); + if (rc) + goto out; + + spin_lock(&oap->oap_lock); + oap->oap_async_flags |= ASYNC_READY|ASYNC_URGENT; + spin_unlock(&oap->oap_lock); + + if (memory_pressure_get()) + ext->oe_memalloc = 1; + + ext->oe_urgent = 1; + if (ext->oe_state == OES_CACHE) { + OSC_EXTENT_DUMP(D_CACHE, ext, + "flush page %p make it urgent.\n", oap); + if (list_empty(&ext->oe_link)) + list_add_tail(&ext->oe_link, &obj->oo_urgent_exts); + unplug = true; + } + rc = 0; + +out: + osc_object_unlock(obj); + osc_extent_put(env, ext); + if (unplug) + osc_io_unplug_async(env, osc_cli(obj), obj); + return rc; +} + +/** + * this is called when a sync waiter receives an interruption. Its job is to + * get the caller woken as soon as possible. If its page hasn't been put in an + * rpc yet it can dequeue immediately. Otherwise it has to mark the rpc as + * desiring interruption which will forcefully complete the rpc once the rpc + * has timed out. + */ +int osc_cancel_async_page(const struct lu_env *env, struct osc_page *ops) +{ + struct osc_async_page *oap = &ops->ops_oap; + struct osc_object *obj = oap->oap_obj; + struct client_obd *cli = osc_cli(obj); + struct osc_extent *ext; + struct osc_extent *found = NULL; + struct list_head *plist; + pgoff_t index = oap2cl_page(oap)->cp_index; + int rc = -EBUSY; + int cmd; + + LASSERT(!oap->oap_interrupted); + oap->oap_interrupted = 1; + + /* Find out the caching extent */ + osc_object_lock(obj); + if (oap->oap_cmd & OBD_BRW_WRITE) { + plist = &obj->oo_urgent_exts; + cmd = OBD_BRW_WRITE; + } else { + plist = &obj->oo_reading_exts; + cmd = OBD_BRW_READ; + } + list_for_each_entry(ext, plist, oe_link) { + if (ext->oe_start <= index && ext->oe_end >= index) { + LASSERT(ext->oe_state == OES_LOCK_DONE); + /* For OES_LOCK_DONE state extent, it has already held + * a refcount for RPC. 
*/ + found = osc_extent_get(ext); + break; + } + } + if (found != NULL) { + list_del_init(&found->oe_link); + osc_update_pending(obj, cmd, -found->oe_nr_pages); + osc_object_unlock(obj); + + osc_extent_finish(env, found, 0, -EINTR); + osc_extent_put(env, found); + rc = 0; + } else { + osc_object_unlock(obj); + /* ok, it's been put in an rpc. only one oap gets a request + * reference */ + if (oap->oap_request != NULL) { + ptlrpc_mark_interrupted(oap->oap_request); + ptlrpcd_wake(oap->oap_request); + ptlrpc_req_finished(oap->oap_request); + oap->oap_request = NULL; + } + } + + osc_list_maint(cli, obj); + return rc; +} + +int osc_queue_sync_pages(const struct lu_env *env, struct osc_object *obj, + struct list_head *list, int cmd, int brw_flags) +{ + struct client_obd *cli = osc_cli(obj); + struct osc_extent *ext; + struct osc_async_page *oap, *tmp; + int page_count = 0; + int mppr = cli->cl_max_pages_per_rpc; + pgoff_t start = CL_PAGE_EOF; + pgoff_t end = 0; + + list_for_each_entry(oap, list, oap_pending_item) { + struct cl_page *cp = oap2cl_page(oap); + if (cp->cp_index > end) + end = cp->cp_index; + if (cp->cp_index < start) + start = cp->cp_index; + ++page_count; + mppr <<= (page_count > mppr); + } + + ext = osc_extent_alloc(obj); + if (ext == NULL) { + list_for_each_entry_safe(oap, tmp, list, oap_pending_item) { + list_del_init(&oap->oap_pending_item); + osc_ap_completion(env, cli, oap, 0, -ENOMEM); + } + return -ENOMEM; + } + + ext->oe_rw = !!(cmd & OBD_BRW_READ); + ext->oe_urgent = 1; + ext->oe_start = start; + ext->oe_end = ext->oe_max_end = end; + ext->oe_obj = obj; + ext->oe_srvlock = !!(brw_flags & OBD_BRW_SRVLOCK); + ext->oe_nr_pages = page_count; + ext->oe_mppr = mppr; + list_splice_init(list, &ext->oe_pages); + + osc_object_lock(obj); + /* Reuse the initial refcount for RPC, don't drop it */ + osc_extent_state_set(ext, OES_LOCK_DONE); + if (cmd & OBD_BRW_WRITE) { + list_add_tail(&ext->oe_link, &obj->oo_urgent_exts); + osc_update_pending(obj, OBD_BRW_WRITE, page_count); + } else { + list_add_tail(&ext->oe_link, &obj->oo_reading_exts); + osc_update_pending(obj, OBD_BRW_READ, page_count); + } + osc_object_unlock(obj); + + osc_io_unplug_async(env, cli, obj); + return 0; +} + +/** + * Called by osc_io_setattr_start() to freeze and destroy covering extents. + */ +int osc_cache_truncate_start(const struct lu_env *env, struct osc_io *oio, + struct osc_object *obj, __u64 size) +{ + struct client_obd *cli = osc_cli(obj); + struct osc_extent *ext; + struct osc_extent *waiting = NULL; + pgoff_t index; + LIST_HEAD(list); + int result = 0; + bool partial; + + /* pages with index greater or equal to index will be truncated. */ + index = cl_index(osc2cl(obj), size); + partial = size > cl_offset(osc2cl(obj), index); + +again: + osc_object_lock(obj); + ext = osc_extent_search(obj, index); + if (ext == NULL) + ext = first_extent(obj); + else if (ext->oe_end < index) + ext = next_extent(ext); + while (ext != NULL) { + EASSERT(ext->oe_state != OES_TRUNC, ext); + + if (ext->oe_state > OES_CACHE || ext->oe_urgent) { + /* if ext is in urgent state, it means there must exist + * a page already having been flushed by write_page(). + * We have to wait for this extent because we can't + * truncate that page. 
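+ * (The wait itself happens with the object lock dropped: the extent is
+ * stashed in 'waiting', osc_extent_wait() is called on it further down,
+ * and the scan then restarts via 'goto again'.)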
*/ + LASSERT(!ext->oe_hp); + OSC_EXTENT_DUMP(D_CACHE, ext, + "waiting for busy extent\n"); + waiting = osc_extent_get(ext); + break; + } + + OSC_EXTENT_DUMP(D_CACHE, ext, "try to trunc:%llu.\n", size); + + osc_extent_get(ext); + if (ext->oe_state == OES_ACTIVE) { + /* though we grab inode mutex for write path, but we + * release it before releasing extent(in osc_io_end()), + * so there is a race window that an extent is still + * in OES_ACTIVE when truncate starts. */ + LASSERT(!ext->oe_trunc_pending); + ext->oe_trunc_pending = 1; + } else { + EASSERT(ext->oe_state == OES_CACHE, ext); + osc_extent_state_set(ext, OES_TRUNC); + osc_update_pending(obj, OBD_BRW_WRITE, + -ext->oe_nr_pages); + } + EASSERT(list_empty(&ext->oe_link), ext); + list_add_tail(&ext->oe_link, &list); + + ext = next_extent(ext); + } + osc_object_unlock(obj); + + osc_list_maint(cli, obj); + + while (!list_empty(&list)) { + int rc; + + ext = list_entry(list.next, struct osc_extent, oe_link); + list_del_init(&ext->oe_link); + + /* extent may be in OES_ACTIVE state because inode mutex + * is released before osc_io_end() in file write case */ + if (ext->oe_state != OES_TRUNC) + osc_extent_wait(env, ext, OES_TRUNC); + + rc = osc_extent_truncate(ext, index, partial); + if (rc < 0) { + if (result == 0) + result = rc; + + OSC_EXTENT_DUMP(D_ERROR, ext, + "truncate error %d\n", rc); + } else if (ext->oe_nr_pages == 0) { + osc_extent_remove(ext); + } else { + /* this must be an overlapped extent which means only + * part of pages in this extent have been truncated. + */ + EASSERTF(ext->oe_start <= index, ext, + "trunc index = %lu/%d.\n", index, partial); + /* fix index to skip this partially truncated extent */ + index = ext->oe_end + 1; + partial = false; + + /* we need to hold this extent in OES_TRUNC state so + * that no writeback will happen. This is to avoid + * BUG 17397. */ + LASSERT(oio->oi_trunc == NULL); + oio->oi_trunc = osc_extent_get(ext); + OSC_EXTENT_DUMP(D_CACHE, ext, + "trunc at %llu\n", size); + } + osc_extent_put(env, ext); + } + if (waiting != NULL) { + int rc; + + /* ignore the result of osc_extent_wait the write initiator + * should take care of it. */ + rc = osc_extent_wait(env, waiting, OES_INV); + if (rc < 0) + OSC_EXTENT_DUMP(D_CACHE, waiting, "error: %d.\n", rc); + + osc_extent_put(env, waiting); + waiting = NULL; + goto again; + } + return result; +} + +/** + * Called after osc_io_setattr_end to add oio->oi_trunc back to cache. + */ +void osc_cache_truncate_end(const struct lu_env *env, struct osc_io *oio, + struct osc_object *obj) +{ + struct osc_extent *ext = oio->oi_trunc; + + oio->oi_trunc = NULL; + if (ext != NULL) { + bool unplug = false; + + EASSERT(ext->oe_nr_pages > 0, ext); + EASSERT(ext->oe_state == OES_TRUNC, ext); + EASSERT(!ext->oe_urgent, ext); + + OSC_EXTENT_DUMP(D_CACHE, ext, "trunc -> cache.\n"); + osc_object_lock(obj); + osc_extent_state_set(ext, OES_CACHE); + if (ext->oe_fsync_wait && !ext->oe_urgent) { + ext->oe_urgent = 1; + list_move_tail(&ext->oe_link, &obj->oo_urgent_exts); + unplug = true; + } + osc_update_pending(obj, OBD_BRW_WRITE, ext->oe_nr_pages); + osc_object_unlock(obj); + osc_extent_put(env, ext); + + if (unplug) + osc_io_unplug_async(env, osc_cli(obj), obj); + } +} + +/** + * Wait for extents in a specific range to be written out. + * The caller must have called osc_cache_writeback_range() to issue IO + * otherwise it will take a long time for this function to finish. 
+ * + * Caller must hold inode_mutex , or cancel exclusive dlm lock so that + * nobody else can dirty this range of file while we're waiting for + * extents to be written. + */ +int osc_cache_wait_range(const struct lu_env *env, struct osc_object *obj, + pgoff_t start, pgoff_t end) +{ + struct osc_extent *ext; + pgoff_t index = start; + int result = 0; + +again: + osc_object_lock(obj); + ext = osc_extent_search(obj, index); + if (ext == NULL) + ext = first_extent(obj); + else if (ext->oe_end < index) + ext = next_extent(ext); + while (ext != NULL) { + int rc; + + if (ext->oe_start > end) + break; + + if (!ext->oe_fsync_wait) { + ext = next_extent(ext); + continue; + } + + EASSERT(ergo(ext->oe_state == OES_CACHE, + ext->oe_hp || ext->oe_urgent), ext); + EASSERT(ergo(ext->oe_state == OES_ACTIVE, + !ext->oe_hp && ext->oe_urgent), ext); + + index = ext->oe_end + 1; + osc_extent_get(ext); + osc_object_unlock(obj); + + rc = osc_extent_wait(env, ext, OES_INV); + if (result == 0) + result = rc; + osc_extent_put(env, ext); + goto again; + } + osc_object_unlock(obj); + + OSC_IO_DEBUG(obj, "sync file range.\n"); + return result; +} + +/** + * Called to write out a range of osc object. + * + * @hp : should be set this is caused by lock cancel; + * @discard: is set if dirty pages should be dropped - file will be deleted or + * truncated, this implies there is no partially discarding extents. + * + * Return how many pages will be issued, or error code if error occurred. + */ +int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj, + pgoff_t start, pgoff_t end, int hp, int discard) +{ + struct osc_extent *ext; + LIST_HEAD(discard_list); + bool unplug = false; + int result = 0; + + osc_object_lock(obj); + ext = osc_extent_search(obj, start); + if (ext == NULL) + ext = first_extent(obj); + else if (ext->oe_end < start) + ext = next_extent(ext); + while (ext != NULL) { + if (ext->oe_start > end) + break; + + ext->oe_fsync_wait = 1; + switch (ext->oe_state) { + case OES_CACHE: + result += ext->oe_nr_pages; + if (!discard) { + struct list_head *list = NULL; + if (hp) { + EASSERT(!ext->oe_hp, ext); + ext->oe_hp = 1; + list = &obj->oo_hp_exts; + } else if (!ext->oe_urgent) { + ext->oe_urgent = 1; + list = &obj->oo_urgent_exts; + } + if (list != NULL) + list_move_tail(&ext->oe_link, list); + unplug = true; + } else { + /* the only discarder is lock cancelling, so + * [start, end] must contain this extent */ + EASSERT(ext->oe_start >= start && + ext->oe_max_end <= end, ext); + osc_extent_state_set(ext, OES_LOCKING); + ext->oe_owner = current; + list_move_tail(&ext->oe_link, + &discard_list); + osc_update_pending(obj, OBD_BRW_WRITE, + -ext->oe_nr_pages); + } + break; + case OES_ACTIVE: + /* It's pretty bad to wait for ACTIVE extents, because + * we don't know how long we will wait for it to be + * flushed since it may be blocked at awaiting more + * grants. We do this for the correctness of fsync. */ + LASSERT(hp == 0 && discard == 0); + ext->oe_urgent = 1; + break; + case OES_TRUNC: + /* this extent is being truncated, can't do anything + * for it now. it will be set to urgent after truncate + * is finished in osc_cache_truncate_end(). 
*/ + default: + break; + } + ext = next_extent(ext); + } + osc_object_unlock(obj); + + LASSERT(ergo(!discard, list_empty(&discard_list))); + if (!list_empty(&discard_list)) { + struct osc_extent *tmp; + int rc; + + osc_list_maint(osc_cli(obj), obj); + list_for_each_entry_safe(ext, tmp, &discard_list, oe_link) { + list_del_init(&ext->oe_link); + EASSERT(ext->oe_state == OES_LOCKING, ext); + + /* Discard caching pages. We don't actually write this + * extent out but we complete it as if we did. */ + rc = osc_extent_make_ready(env, ext); + if (unlikely(rc < 0)) { + OSC_EXTENT_DUMP(D_ERROR, ext, + "make_ready returned %d\n", rc); + if (result >= 0) + result = rc; + } + + /* finish the extent as if the pages were sent */ + osc_extent_finish(env, ext, 0, 0); + } + } + + if (unplug) + osc_io_unplug(env, osc_cli(obj), obj, PDL_POLICY_ROUND); + + if (hp || discard) { + int rc; + rc = osc_cache_wait_range(env, obj, start, end); + if (result >= 0 && rc < 0) + result = rc; + } + + OSC_IO_DEBUG(obj, "cache page out.\n"); + return result; +} + +/** @} osc */ diff --git a/kernel/drivers/staging/lustre/lustre/osc/osc_cl_internal.h b/kernel/drivers/staging/lustre/lustre/osc/osc_cl_internal.h new file mode 100644 index 000000000..365b2787b --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/osc/osc_cl_internal.h @@ -0,0 +1,685 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Internal interfaces of OSC layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#ifndef OSC_CL_INTERNAL_H +#define OSC_CL_INTERNAL_H + +#include "../../include/linux/libcfs/libcfs.h" + +#include "../include/obd.h" +/* osc_build_res_name() */ +#include "../include/cl_object.h" +#include "../include/lclient.h" +#include "osc_internal.h" + +/** \defgroup osc osc + * @{ + */ + +struct osc_extent; + +/** + * State maintained by osc layer for each IO context. + */ +struct osc_io { + /** super class */ + struct cl_io_slice oi_cl; + /** true if this io is lockless. 
*/ + int oi_lockless; + /** active extents, we know how many bytes is going to be written, + * so having an active extent will prevent it from being fragmented */ + struct osc_extent *oi_active; + /** partially truncated extent, we need to hold this extent to prevent + * page writeback from happening. */ + struct osc_extent *oi_trunc; + + struct obd_info oi_info; + struct obdo oi_oa; + struct osc_async_cbargs { + bool opc_rpc_sent; + int opc_rc; + struct completion opc_sync; + } oi_cbarg; +}; + +/** + * State of transfer for osc. + */ +struct osc_req { + struct cl_req_slice or_cl; +}; + +/** + * State maintained by osc layer for the duration of a system call. + */ +struct osc_session { + struct osc_io os_io; +}; + +#define OTI_PVEC_SIZE 64 +struct osc_thread_info { + struct ldlm_res_id oti_resname; + ldlm_policy_data_t oti_policy; + struct cl_lock_descr oti_descr; + struct cl_attr oti_attr; + struct lustre_handle oti_handle; + struct cl_page_list oti_plist; + struct cl_io oti_io; + struct cl_page *oti_pvec[OTI_PVEC_SIZE]; +}; + +struct osc_object { + struct cl_object oo_cl; + struct lov_oinfo *oo_oinfo; + /** + * True if locking against this stripe got -EUSERS. + */ + int oo_contended; + unsigned long oo_contention_time; + /** + * List of pages in transfer. + */ + struct list_head oo_inflight[CRT_NR]; + /** + * Lock, protecting ccc_object::cob_inflight, because a seat-belt is + * locked during take-off and landing. + */ + spinlock_t oo_seatbelt; + + /** + * used by the osc to keep track of what objects to build into rpcs. + * Protected by client_obd->cli_loi_list_lock. + */ + struct list_head oo_ready_item; + struct list_head oo_hp_ready_item; + struct list_head oo_write_item; + struct list_head oo_read_item; + + /** + * extent is a red black tree to manage (async) dirty pages. + */ + struct rb_root oo_root; + /** + * Manage write(dirty) extents. + */ + struct list_head oo_hp_exts; /* list of hp extents */ + struct list_head oo_urgent_exts; /* list of writeback extents */ + struct list_head oo_rpc_exts; + + struct list_head oo_reading_exts; + + atomic_t oo_nr_reads; + atomic_t oo_nr_writes; + + /** Protect extent tree. Will be used to protect + * oo_{read|write}_pages soon. */ + spinlock_t oo_lock; +}; + +static inline void osc_object_lock(struct osc_object *obj) +{ + spin_lock(&obj->oo_lock); +} + +static inline int osc_object_trylock(struct osc_object *obj) +{ + return spin_trylock(&obj->oo_lock); +} + +static inline void osc_object_unlock(struct osc_object *obj) +{ + spin_unlock(&obj->oo_lock); +} + +static inline int osc_object_is_locked(struct osc_object *obj) +{ +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) + return spin_is_locked(&obj->oo_lock); +#else + /* + * It is not perfect to return true all the time. + * But since this function is only used for assertion + * and checking, it seems OK. + */ + return 1; +#endif +} + +/* + * Lock "micro-states" for osc layer. + */ +enum osc_lock_state { + OLS_NEW, + OLS_ENQUEUED, + OLS_UPCALL_RECEIVED, + OLS_GRANTED, + OLS_RELEASED, + OLS_BLOCKED, + OLS_CANCELLED +}; + +/** + * osc-private state of cl_lock. + * + * Interaction with DLM. + * + * CLIO enqueues all DLM locks through ptlrpcd (that is, in "async" mode). + * + * Once receive upcall is invoked, osc_lock remembers a handle of DLM lock in + * osc_lock::ols_handle and a pointer to that lock in osc_lock::ols_lock. + * + * This pointer is protected through a reference, acquired by + * osc_lock_upcall0(). 
Also, an additional reference is acquired by + * ldlm_lock_addref() call protecting the lock from cancellation, until + * osc_lock_unuse() releases it. + * + * Below is a description of how lock references are acquired and released + * inside of DLM. + * + * - When new lock is created and enqueued to the server (ldlm_cli_enqueue()) + * - ldlm_lock_create() + * - ldlm_lock_new(): initializes a lock with 2 references. One for + * the caller (released when reply from the server is received, or on + * error), and another for the hash table. + * - ldlm_lock_addref_internal(): protects the lock from cancellation. + * + * - When reply is received from the server (osc_enqueue_interpret()) + * - ldlm_cli_enqueue_fini() + * - LDLM_LOCK_PUT(): releases caller reference acquired by + * ldlm_lock_new(). + * - if (rc != 0) + * ldlm_lock_decref(): error case: matches ldlm_cli_enqueue(). + * - ldlm_lock_decref(): for async locks, matches ldlm_cli_enqueue(). + * + * - When lock is being cancelled (ldlm_lock_cancel()) + * - ldlm_lock_destroy() + * - LDLM_LOCK_PUT(): releases hash-table reference acquired by + * ldlm_lock_new(). + * + * osc_lock is detached from ldlm_lock by osc_lock_detach() that is called + * either when lock is cancelled (osc_lock_blocking()), or when locks is + * deleted without cancellation (e.g., from cl_locks_prune()). In the latter + * case ldlm lock remains in memory, and can be re-attached to osc_lock in the + * future. + */ +struct osc_lock { + struct cl_lock_slice ols_cl; + /** underlying DLM lock */ + struct ldlm_lock *ols_lock; + /** lock value block */ + struct ost_lvb ols_lvb; + /** DLM flags with which osc_lock::ols_lock was enqueued */ + __u64 ols_flags; + /** osc_lock::ols_lock handle */ + struct lustre_handle ols_handle; + struct ldlm_enqueue_info ols_einfo; + enum osc_lock_state ols_state; + + /** + * How many pages are using this lock for io, currently only used by + * read-ahead. If non-zero, the underlying dlm lock won't be cancelled + * during recovery to avoid deadlock. see bz16774. + * + * \see osc_page::ops_lock + * \see osc_page_addref_lock(), osc_page_putref_lock() + */ + atomic_t ols_pageref; + + /** + * true, if ldlm_lock_addref() was called against + * osc_lock::ols_lock. This is used for sanity checking. + * + * \see osc_lock::ols_has_ref + */ + unsigned ols_hold :1, + /** + * this is much like osc_lock::ols_hold, except that this bit is + * cleared _after_ reference in released in osc_lock_unuse(). This + * fine distinction is needed because: + * + * - if ldlm lock still has a reference, osc_ast_data_get() needs + * to return associated cl_lock (so that a flag is needed that is + * cleared after ldlm_lock_decref() returned), and + * + * - ldlm_lock_decref() can invoke blocking ast (for a + * LDLM_FL_CBPENDING lock), and osc_lock functions like + * osc_lock_cancel() called from there need to know whether to + * release lock reference (so that a flag is needed that is + * cleared before ldlm_lock_decref() is called). + */ + ols_has_ref:1, + /** + * inherit the lockless attribute from top level cl_io. + * If true, osc_lock_enqueue is able to tolerate the -EUSERS error. + */ + ols_locklessable:1, + /** + * set by osc_lock_use() to wait until blocking AST enters into + * osc_ldlm_blocking_ast0(), so that cl_lock mutex can be used for + * further synchronization. + */ + ols_ast_wait:1, + /** + * If the data of this lock has been flushed to server side. + */ + ols_flush:1, + /** + * if set, the osc_lock is a glimpse lock. 
For glimpse locks, we treat + * the EVAVAIL error as tolerable, this will make upper logic happy + * to wait all glimpse locks to each OSTs to be completed. + * Glimpse lock converts to normal lock if the server lock is + * granted. + * Glimpse lock should be destroyed immediately after use. + */ + ols_glimpse:1, + /** + * For async glimpse lock. + */ + ols_agl:1; + /** + * IO that owns this lock. This field is used for a dead-lock + * avoidance by osc_lock_enqueue_wait(). + * + * XXX: unfortunately, the owner of a osc_lock is not unique, + * the lock may have multiple users, if the lock is granted and + * then matched. + */ + struct osc_io *ols_owner; +}; + + +/** + * Page state private for osc layer. + */ +struct osc_page { + struct cl_page_slice ops_cl; + /** + * Page queues used by osc to detect when RPC can be formed. + */ + struct osc_async_page ops_oap; + /** + * An offset within page from which next transfer starts. This is used + * by cl_page_clip() to submit partial page transfers. + */ + int ops_from; + /** + * An offset within page at which next transfer ends. + * + * \see osc_page::ops_from. + */ + int ops_to; + /** + * Boolean, true iff page is under transfer. Used for sanity checking. + */ + unsigned ops_transfer_pinned:1, + /** + * True for a `temporary page' created by read-ahead code, probably + * outside of any DLM lock. + */ + ops_temp:1, + /** + * in LRU? + */ + ops_in_lru:1, + /** + * Set if the page must be transferred with OBD_BRW_SRVLOCK. + */ + ops_srvlock:1; + union { + /** + * lru page list. ops_inflight and ops_lru are exclusive so + * that they can share the same data. + */ + struct list_head ops_lru; + /** + * Linkage into a per-osc_object list of pages in flight. For + * debugging. + */ + struct list_head ops_inflight; + }; + /** + * Thread that submitted this page for transfer. For debugging. + */ + struct task_struct *ops_submitter; + /** + * Submit time - the time when the page is starting RPC. For debugging. + */ + unsigned long ops_submit_time; + + /** + * A lock of which we hold a reference covers this page. Only used by + * read-ahead: for a readahead page, we hold it's covering lock to + * prevent it from being canceled during recovery. + * + * \see osc_lock::ols_pageref + * \see osc_page_addref_lock(), osc_page_putref_lock(). 
+ */ + struct cl_lock *ops_lock; +}; + +extern struct kmem_cache *osc_lock_kmem; +extern struct kmem_cache *osc_object_kmem; +extern struct kmem_cache *osc_thread_kmem; +extern struct kmem_cache *osc_session_kmem; +extern struct kmem_cache *osc_req_kmem; +extern struct kmem_cache *osc_extent_kmem; + +extern struct lu_device_type osc_device_type; +extern struct lu_context_key osc_key; +extern struct lu_context_key osc_session_key; + +#define OSC_FLAGS (ASYNC_URGENT|ASYNC_READY) + +int osc_lock_init(const struct lu_env *env, + struct cl_object *obj, struct cl_lock *lock, + const struct cl_io *io); +int osc_io_init (const struct lu_env *env, + struct cl_object *obj, struct cl_io *io); +int osc_req_init (const struct lu_env *env, struct cl_device *dev, + struct cl_req *req); +struct lu_object *osc_object_alloc(const struct lu_env *env, + const struct lu_object_header *hdr, + struct lu_device *dev); +int osc_page_init(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, struct page *vmpage); + +void osc_index2policy (ldlm_policy_data_t *policy, const struct cl_object *obj, + pgoff_t start, pgoff_t end); +int osc_lvb_print (const struct lu_env *env, void *cookie, + lu_printer_t p, const struct ost_lvb *lvb); + +void osc_page_submit(const struct lu_env *env, struct osc_page *opg, + enum cl_req_type crt, int brw_flags); +int osc_cancel_async_page(const struct lu_env *env, struct osc_page *ops); +int osc_set_async_flags(struct osc_object *obj, struct osc_page *opg, + u32 async_flags); +int osc_prep_async_page(struct osc_object *osc, struct osc_page *ops, + struct page *page, loff_t offset); +int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, + struct osc_page *ops); +int osc_teardown_async_page(const struct lu_env *env, struct osc_object *obj, + struct osc_page *ops); +int osc_flush_async_page(const struct lu_env *env, struct cl_io *io, + struct osc_page *ops); +int osc_queue_sync_pages(const struct lu_env *env, struct osc_object *obj, + struct list_head *list, int cmd, int brw_flags); +int osc_cache_truncate_start(const struct lu_env *env, struct osc_io *oio, + struct osc_object *obj, __u64 size); +void osc_cache_truncate_end(const struct lu_env *env, struct osc_io *oio, + struct osc_object *obj); +int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj, + pgoff_t start, pgoff_t end, int hp, int discard); +int osc_cache_wait_range(const struct lu_env *env, struct osc_object *obj, + pgoff_t start, pgoff_t end); +void osc_io_unplug(const struct lu_env *env, struct client_obd *cli, + struct osc_object *osc, pdl_policy_t pol); + +void osc_object_set_contended (struct osc_object *obj); +void osc_object_clear_contended(struct osc_object *obj); +int osc_object_is_contended (struct osc_object *obj); + +int osc_lock_is_lockless (const struct osc_lock *olck); + +/***************************************************************************** + * + * Accessors. 
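+ *
+ * Roughly: cl2osc()/osc2cl() convert between the cl_object and osc_object
+ * layers, oap2osc()/oap2cl_page() map an osc_async_page back to its
+ * osc_page and cl_page, and osc_env_info()/osc_env_io() fetch per-thread
+ * and per-session state from the lu_env.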
+ * + */ + +static inline struct osc_thread_info *osc_env_info(const struct lu_env *env) +{ + struct osc_thread_info *info; + + info = lu_context_key_get(&env->le_ctx, &osc_key); + LASSERT(info != NULL); + return info; +} + +static inline struct osc_session *osc_env_session(const struct lu_env *env) +{ + struct osc_session *ses; + + ses = lu_context_key_get(env->le_ses, &osc_session_key); + LASSERT(ses != NULL); + return ses; +} + +static inline struct osc_io *osc_env_io(const struct lu_env *env) +{ + return &osc_env_session(env)->os_io; +} + +static inline int osc_is_object(const struct lu_object *obj) +{ + return obj->lo_dev->ld_type == &osc_device_type; +} + +static inline struct osc_device *lu2osc_dev(const struct lu_device *d) +{ + LINVRNT(d->ld_type == &osc_device_type); + return container_of0(d, struct osc_device, od_cl.cd_lu_dev); +} + +static inline struct obd_export *osc_export(const struct osc_object *obj) +{ + return lu2osc_dev(obj->oo_cl.co_lu.lo_dev)->od_exp; +} + +static inline struct client_obd *osc_cli(const struct osc_object *obj) +{ + return &osc_export(obj)->exp_obd->u.cli; +} + +static inline struct osc_object *cl2osc(const struct cl_object *obj) +{ + LINVRNT(osc_is_object(&obj->co_lu)); + return container_of0(obj, struct osc_object, oo_cl); +} + +static inline struct cl_object *osc2cl(const struct osc_object *obj) +{ + return (struct cl_object *)&obj->oo_cl; +} + +static inline ldlm_mode_t osc_cl_lock2ldlm(enum cl_lock_mode mode) +{ + LASSERT(mode == CLM_READ || mode == CLM_WRITE || mode == CLM_GROUP); + if (mode == CLM_READ) + return LCK_PR; + else if (mode == CLM_WRITE) + return LCK_PW; + else + return LCK_GROUP; +} + +static inline enum cl_lock_mode osc_ldlm2cl_lock(ldlm_mode_t mode) +{ + LASSERT(mode == LCK_PR || mode == LCK_PW || mode == LCK_GROUP); + if (mode == LCK_PR) + return CLM_READ; + else if (mode == LCK_PW) + return CLM_WRITE; + else + return CLM_GROUP; +} + +static inline struct osc_page *cl2osc_page(const struct cl_page_slice *slice) +{ + LINVRNT(osc_is_object(&slice->cpl_obj->co_lu)); + return container_of0(slice, struct osc_page, ops_cl); +} + +static inline struct osc_page *oap2osc(struct osc_async_page *oap) +{ + return container_of0(oap, struct osc_page, ops_oap); +} + +static inline struct cl_page *oap2cl_page(struct osc_async_page *oap) +{ + return oap2osc(oap)->ops_cl.cpl_page; +} + +static inline struct osc_page *oap2osc_page(struct osc_async_page *oap) +{ + return (struct osc_page *)container_of(oap, struct osc_page, ops_oap); +} + +static inline struct osc_lock *cl2osc_lock(const struct cl_lock_slice *slice) +{ + LINVRNT(osc_is_object(&slice->cls_obj->co_lu)); + return container_of0(slice, struct osc_lock, ols_cl); +} + +static inline struct osc_lock *osc_lock_at(const struct cl_lock *lock) +{ + return cl2osc_lock(cl_lock_at(lock, &osc_device_type)); +} + +static inline int osc_io_srvlock(struct osc_io *oio) +{ + return (oio->oi_lockless && !oio->oi_cl.cis_io->ci_no_srvlock); +} + +enum osc_extent_state { + OES_INV = 0, /** extent is just initialized or destroyed */ + OES_ACTIVE = 1, /** process is using this extent */ + OES_CACHE = 2, /** extent is ready for IO */ + OES_LOCKING = 3, /** locking page to prepare IO */ + OES_LOCK_DONE = 4, /** locking finished, ready to send */ + OES_RPC = 5, /** in RPC */ + OES_TRUNC = 6, /** being truncated */ + OES_STATE_MAX +}; + +/** + * osc_extent data to manage dirty pages. + * osc_extent has the following attributes: + * 1. all pages in the same must be in one RPC in write back; + * 2. 
# of pages must be less than max_pages_per_rpc - implied by 1; + * 3. must be covered by only 1 osc_lock; + * 4. exclusive. It's impossible to have overlapped osc_extent. + * + * The lifetime of an extent is from when the 1st page is dirtied to when + * all pages inside it are written out. + * + * LOCKING ORDER + * ============= + * page lock -> client_obd_list_lock -> object lock(osc_object::oo_lock) + */ +struct osc_extent { + /** red-black tree node */ + struct rb_node oe_node; + /** osc_object of this extent */ + struct osc_object *oe_obj; + /** refcount, removed from red-black tree if reaches zero. */ + atomic_t oe_refc; + /** busy if non-zero */ + atomic_t oe_users; + /** link list of osc_object's oo_{hp|urgent|locking}_exts. */ + struct list_head oe_link; + /** state of this extent */ + unsigned int oe_state; + /** flags for this extent. */ + unsigned int oe_intree:1, + /** 0 is write, 1 is read */ + oe_rw:1, + oe_srvlock:1, + oe_memalloc:1, + /** an ACTIVE extent is going to be truncated, so when this extent + * is released, it will turn into TRUNC state instead of CACHE. */ + oe_trunc_pending:1, + /** this extent should be written asap and someone may wait for the + * write to finish. This bit is usually set along with urgent if + * the extent was CACHE state. + * fsync_wait extent can't be merged because new extent region may + * exceed fsync range. */ + oe_fsync_wait:1, + /** covering lock is being canceled */ + oe_hp:1, + /** this extent should be written back asap. set if one of pages is + * called by page WB daemon, or sync write or reading requests. */ + oe_urgent:1; + /** how many grants allocated for this extent. + * Grant allocated for this extent. There is no grant allocated + * for reading extents and sync write extents. */ + unsigned int oe_grants; + /** # of dirty pages in this extent */ + unsigned int oe_nr_pages; + /** list of pending oap pages. Pages in this list are NOT sorted. */ + struct list_head oe_pages; + /** Since an extent has to be written out in atomic, this is used to + * remember the next page need to be locked to write this extent out. + * Not used right now. + */ + struct osc_page *oe_next_page; + /** start and end index of this extent, include start and end + * themselves. Page offset here is the page index of osc_pages. + * oe_start is used as keyword for red-black tree. */ + pgoff_t oe_start; + pgoff_t oe_end; + /** maximum ending index of this extent, this is limited by + * max_pages_per_rpc, lock extent and chunk size. */ + pgoff_t oe_max_end; + /** waitqueue - for those who want to be notified if this extent's + * state has changed. */ + wait_queue_head_t oe_waitq; + /** lock covering this extent */ + struct cl_lock *oe_osclock; + /** terminator of this extent. Must be true if this extent is in IO. */ + struct task_struct *oe_owner; + /** return value of writeback. If somebody is waiting for this extent, + * this value can be known by outside world. 
*/ + int oe_rc; + /** max pages per rpc when this extent was created */ + unsigned int oe_mppr; +}; + +int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext, + int sent, int rc); +void osc_extent_release(const struct lu_env *env, struct osc_extent *ext); + +/** @} osc */ + +#endif /* OSC_CL_INTERNAL_H */ diff --git a/kernel/drivers/staging/lustre/lustre/osc/osc_dev.c b/kernel/drivers/staging/lustre/lustre/osc/osc_dev.c new file mode 100644 index 000000000..4935fc7c0 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/osc/osc_dev.c @@ -0,0 +1,262 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_device, cl_req for OSC layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_OSC + +/* class_name2obd() */ +#include "../include/obd_class.h" + +#include "osc_cl_internal.h" + +/** \addtogroup osc + * @{ + */ + +struct kmem_cache *osc_lock_kmem; +struct kmem_cache *osc_object_kmem; +struct kmem_cache *osc_thread_kmem; +struct kmem_cache *osc_session_kmem; +struct kmem_cache *osc_req_kmem; +struct kmem_cache *osc_extent_kmem; +struct kmem_cache *osc_quota_kmem; + +struct lu_kmem_descr osc_caches[] = { + { + .ckd_cache = &osc_lock_kmem, + .ckd_name = "osc_lock_kmem", + .ckd_size = sizeof(struct osc_lock) + }, + { + .ckd_cache = &osc_object_kmem, + .ckd_name = "osc_object_kmem", + .ckd_size = sizeof(struct osc_object) + }, + { + .ckd_cache = &osc_thread_kmem, + .ckd_name = "osc_thread_kmem", + .ckd_size = sizeof(struct osc_thread_info) + }, + { + .ckd_cache = &osc_session_kmem, + .ckd_name = "osc_session_kmem", + .ckd_size = sizeof(struct osc_session) + }, + { + .ckd_cache = &osc_req_kmem, + .ckd_name = "osc_req_kmem", + .ckd_size = sizeof(struct osc_req) + }, + { + .ckd_cache = &osc_extent_kmem, + .ckd_name = "osc_extent_kmem", + .ckd_size = sizeof(struct osc_extent) + }, + { + .ckd_cache = &osc_quota_kmem, + .ckd_name = "osc_quota_kmem", + .ckd_size = sizeof(struct osc_quota_info) + }, + { + .ckd_cache = NULL + } +}; + +struct lock_class_key osc_ast_guard_class; + +/***************************************************************************** + * + * Type conversions. 
+ * + */ + +static struct lu_device *osc2lu_dev(struct osc_device *osc) +{ + return &osc->od_cl.cd_lu_dev; +} + +/***************************************************************************** + * + * Osc device and device type functions. + * + */ + +static void *osc_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct osc_thread_info *info; + + OBD_SLAB_ALLOC_PTR_GFP(info, osc_thread_kmem, GFP_NOFS); + if (info == NULL) + info = ERR_PTR(-ENOMEM); + return info; +} + +static void osc_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct osc_thread_info *info = data; + + OBD_SLAB_FREE_PTR(info, osc_thread_kmem); +} + +struct lu_context_key osc_key = { + .lct_tags = LCT_CL_THREAD, + .lct_init = osc_key_init, + .lct_fini = osc_key_fini +}; + +static void *osc_session_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct osc_session *info; + + OBD_SLAB_ALLOC_PTR_GFP(info, osc_session_kmem, GFP_NOFS); + if (info == NULL) + info = ERR_PTR(-ENOMEM); + return info; +} + +static void osc_session_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct osc_session *info = data; + + OBD_SLAB_FREE_PTR(info, osc_session_kmem); +} + +struct lu_context_key osc_session_key = { + .lct_tags = LCT_SESSION, + .lct_init = osc_session_init, + .lct_fini = osc_session_fini +}; + +/* type constructor/destructor: osc_type_{init,fini,start,stop}(). */ +LU_TYPE_INIT_FINI(osc, &osc_key, &osc_session_key); + +static int osc_cl_process_config(const struct lu_env *env, + struct lu_device *d, struct lustre_cfg *cfg) +{ + return osc_process_config_base(d->ld_obd, cfg); +} + +static const struct lu_device_operations osc_lu_ops = { + .ldo_object_alloc = osc_object_alloc, + .ldo_process_config = osc_cl_process_config, + .ldo_recovery_complete = NULL +}; + +static const struct cl_device_operations osc_cl_ops = { + .cdo_req_init = osc_req_init +}; + +static int osc_device_init(const struct lu_env *env, struct lu_device *d, + const char *name, struct lu_device *next) +{ + return 0; +} + +static struct lu_device *osc_device_fini(const struct lu_env *env, + struct lu_device *d) +{ + return NULL; +} + +static struct lu_device *osc_device_free(const struct lu_env *env, + struct lu_device *d) +{ + struct osc_device *od = lu2osc_dev(d); + + cl_device_fini(lu2cl_dev(d)); + OBD_FREE_PTR(od); + return NULL; +} + +static struct lu_device *osc_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg) +{ + struct lu_device *d; + struct osc_device *od; + struct obd_device *obd; + int rc; + + OBD_ALLOC_PTR(od); + if (od == NULL) + return ERR_PTR(-ENOMEM); + + cl_device_init(&od->od_cl, t); + d = osc2lu_dev(od); + d->ld_ops = &osc_lu_ops; + od->od_cl.cd_ops = &osc_cl_ops; + + /* Setup OSC OBD */ + obd = class_name2obd(lustre_cfg_string(cfg, 0)); + LASSERT(obd != NULL); + rc = osc_setup(obd, cfg); + if (rc) { + osc_device_free(env, d); + return ERR_PTR(rc); + } + od->od_exp = obd->obd_self_export; + return d; +} + +static const struct lu_device_type_operations osc_device_type_ops = { + .ldto_init = osc_type_init, + .ldto_fini = osc_type_fini, + + .ldto_start = osc_type_start, + .ldto_stop = osc_type_stop, + + .ldto_device_alloc = osc_device_alloc, + .ldto_device_free = osc_device_free, + + .ldto_device_init = osc_device_init, + .ldto_device_fini = osc_device_fini +}; + +struct lu_device_type osc_device_type = { + .ldt_tags = LU_DEVICE_CL, + .ldt_name = LUSTRE_OSC_NAME, + .ldt_ops = 
&osc_device_type_ops, + .ldt_ctx_tags = LCT_CL_THREAD +}; + +/** @} osc */ diff --git a/kernel/drivers/staging/lustre/lustre/osc/osc_internal.h b/kernel/drivers/staging/lustre/lustre/osc/osc_internal.h new file mode 100644 index 000000000..af96c7bc7 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/osc/osc_internal.h @@ -0,0 +1,203 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef OSC_INTERNAL_H +#define OSC_INTERNAL_H + +#define OAP_MAGIC 8675309 + +struct lu_env; + +enum async_flags { + ASYNC_READY = 0x1, /* ap_make_ready will not be called before this + page is added to an rpc */ + ASYNC_URGENT = 0x2, /* page must be put into an RPC before return */ + ASYNC_COUNT_STABLE = 0x4, /* ap_refresh_count will not be called + to give the caller a chance to update + or cancel the size of the io */ + ASYNC_HP = 0x10, +}; + +struct osc_async_page { + int oap_magic; + unsigned short oap_cmd; + unsigned short oap_interrupted:1; + + struct list_head oap_pending_item; + struct list_head oap_rpc_item; + + u64 oap_obj_off; + unsigned oap_page_off; + enum async_flags oap_async_flags; + + struct brw_page oap_brw_page; + + struct ptlrpc_request *oap_request; + struct client_obd *oap_cli; + struct osc_object *oap_obj; + + struct ldlm_lock *oap_ldlm_lock; + spinlock_t oap_lock; +}; + +#define oap_page oap_brw_page.pg +#define oap_count oap_brw_page.count +#define oap_brw_flags oap_brw_page.flag + +struct osc_cache_waiter { + struct list_head ocw_entry; + wait_queue_head_t ocw_waitq; + struct osc_async_page *ocw_oap; + int ocw_grant; + int ocw_rc; +}; + +int osc_create(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa, struct lov_stripe_md **ea, + struct obd_trans_info *oti); +int osc_real_create(struct obd_export *exp, struct obdo *oa, + struct lov_stripe_md **ea, struct obd_trans_info *oti); +void osc_wake_cache_waiters(struct client_obd *cli); +int osc_shrink_grant_to_target(struct client_obd *cli, __u64 target_bytes); +void osc_update_next_shrink(struct client_obd *cli); + +/* + * cl integration. 
+ */ +#include "../include/cl_object.h" + +extern struct ptlrpc_request_set *PTLRPCD_SET; + +int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id, + __u64 *flags, ldlm_policy_data_t *policy, + struct ost_lvb *lvb, int kms_valid, + obd_enqueue_update_f upcall, + void *cookie, struct ldlm_enqueue_info *einfo, + struct lustre_handle *lockh, + struct ptlrpc_request_set *rqset, int async, int agl); +int osc_cancel_base(struct lustre_handle *lockh, __u32 mode); + +int osc_match_base(struct obd_export *exp, struct ldlm_res_id *res_id, + __u32 type, ldlm_policy_data_t *policy, __u32 mode, + __u64 *flags, void *data, struct lustre_handle *lockh, + int unref); + +int osc_setattr_async_base(struct obd_export *exp, struct obd_info *oinfo, + struct obd_trans_info *oti, + obd_enqueue_update_f upcall, void *cookie, + struct ptlrpc_request_set *rqset); +int osc_punch_base(struct obd_export *exp, struct obd_info *oinfo, + obd_enqueue_update_f upcall, void *cookie, + struct ptlrpc_request_set *rqset); +int osc_sync_base(struct obd_export *exp, struct obd_info *oinfo, + obd_enqueue_update_f upcall, void *cookie, + struct ptlrpc_request_set *rqset); + +int osc_process_config_base(struct obd_device *obd, struct lustre_cfg *cfg); +int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, + struct list_head *ext_list, int cmd, pdl_policy_t p); +int osc_lru_shrink(struct client_obd *cli, int target); + +extern spinlock_t osc_ast_guard; + +int osc_cleanup(struct obd_device *obd); +int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg); + +#if defined (CONFIG_PROC_FS) +int lproc_osc_attach_seqstat(struct obd_device *dev); +void lprocfs_osc_init_vars(struct lprocfs_static_vars *lvars); +#else +static inline int lproc_osc_attach_seqstat(struct obd_device *dev) {return 0;} +static inline void lprocfs_osc_init_vars(struct lprocfs_static_vars *lvars) +{ + memset(lvars, 0, sizeof(*lvars)); +} +#endif + +extern struct lu_device_type osc_device_type; + +static inline int osc_recoverable_error(int rc) +{ + return (rc == -EIO || rc == -EROFS || rc == -ENOMEM || + rc == -EAGAIN || rc == -EINPROGRESS); +} + +static inline unsigned long rpcs_in_flight(struct client_obd *cli) +{ + return cli->cl_r_in_flight + cli->cl_w_in_flight; +} + +struct osc_device { + struct cl_device od_cl; + struct obd_export *od_exp; + + /* Write stats is actually protected by client_obd's lock. 
*/ + struct osc_stats { + uint64_t os_lockless_writes; /* by bytes */ + uint64_t os_lockless_reads; /* by bytes */ + uint64_t os_lockless_truncates; /* by times */ + } od_stats; + + /* configuration item(s) */ + int od_contention_time; + int od_lockless_truncate; +}; + +static inline struct osc_device *obd2osc_dev(const struct obd_device *d) +{ + return container_of0(d->obd_lu_dev, struct osc_device, od_cl.cd_lu_dev); +} + +int osc_dlm_lock_pageref(struct ldlm_lock *dlm); + +extern struct kmem_cache *osc_quota_kmem; +struct osc_quota_info { + /** linkage for quota hash table */ + struct hlist_node oqi_hash; + u32 oqi_id; +}; +int osc_quota_setup(struct obd_device *obd); +int osc_quota_cleanup(struct obd_device *obd); +int osc_quota_setdq(struct client_obd *cli, const unsigned int qid[], + u32 valid, u32 flags); +int osc_quota_chkdq(struct client_obd *cli, const unsigned int qid[]); +int osc_quotactl(struct obd_device *unused, struct obd_export *exp, + struct obd_quotactl *oqctl); +int osc_quotacheck(struct obd_device *unused, struct obd_export *exp, + struct obd_quotactl *oqctl); +int osc_quota_poll_check(struct obd_export *exp, struct if_quotacheck *qchk); + +#endif /* OSC_INTERNAL_H */ diff --git a/kernel/drivers/staging/lustre/lustre/osc/osc_io.c b/kernel/drivers/staging/lustre/lustre/osc/osc_io.c new file mode 100644 index 000000000..3c7300b06 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/osc/osc_io.c @@ -0,0 +1,819 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_io for OSC layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_OSC + +#include "osc_cl_internal.h" + +/** \addtogroup osc + * @{ + */ + +/***************************************************************************** + * + * Type conversions. 
+ * + */ + +static struct osc_req *cl2osc_req(const struct cl_req_slice *slice) +{ + LINVRNT(slice->crs_dev->cd_lu_dev.ld_type == &osc_device_type); + return container_of0(slice, struct osc_req, or_cl); +} + +static struct osc_io *cl2osc_io(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct osc_io *oio = container_of0(slice, struct osc_io, oi_cl); + + LINVRNT(oio == osc_env_io(env)); + return oio; +} + +static struct osc_page *osc_cl_page_osc(struct cl_page *page) +{ + const struct cl_page_slice *slice; + + slice = cl_page_at(page, &osc_device_type); + LASSERT(slice != NULL); + + return cl2osc_page(slice); +} + + +/***************************************************************************** + * + * io operations. + * + */ + +static void osc_io_fini(const struct lu_env *env, const struct cl_io_slice *io) +{ +} + +/** + * An implementation of cl_io_operations::cio_io_submit() method for osc + * layer. Iterates over pages in the in-queue, prepares each for io by calling + * cl_page_prep() and then either submits them through osc_io_submit_page() + * or, if page is already submitted, changes osc flags through + * osc_set_async_flags(). + */ +static int osc_io_submit(const struct lu_env *env, + const struct cl_io_slice *ios, + enum cl_req_type crt, struct cl_2queue *queue) +{ + struct cl_page *page; + struct cl_page *tmp; + struct client_obd *cli = NULL; + struct osc_object *osc = NULL; /* to keep gcc happy */ + struct osc_page *opg; + struct cl_io *io; + LIST_HEAD(list); + + struct cl_page_list *qin = &queue->c2_qin; + struct cl_page_list *qout = &queue->c2_qout; + int queued = 0; + int result = 0; + int cmd; + int brw_flags; + int max_pages; + + LASSERT(qin->pl_nr > 0); + + CDEBUG(D_CACHE, "%d %d\n", qin->pl_nr, crt); + + osc = cl2osc(ios->cis_obj); + cli = osc_cli(osc); + max_pages = cli->cl_max_pages_per_rpc; + + cmd = crt == CRT_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ; + brw_flags = osc_io_srvlock(cl2osc_io(env, ios)) ? OBD_BRW_SRVLOCK : 0; + + /* + * NOTE: here @page is a top-level page. This is done to avoid + * creation of sub-page-list. + */ + cl_page_list_for_each_safe(page, tmp, qin) { + struct osc_async_page *oap; + + /* Top level IO. */ + io = page->cp_owner; + LASSERT(io != NULL); + + opg = osc_cl_page_osc(page); + oap = &opg->ops_oap; + LASSERT(osc == oap->oap_obj); + + if (!list_empty(&oap->oap_pending_item) || + !list_empty(&oap->oap_rpc_item)) { + CDEBUG(D_CACHE, "Busy oap %p page %p for submit.\n", + oap, opg); + result = -EBUSY; + break; + } + + result = cl_page_prep(env, io, page, crt); + if (result != 0) { + LASSERT(result < 0); + if (result != -EALREADY) + break; + /* + * Handle -EALREADY error: for read case, the page is + * already in UPTODATE state; for write, the page + * is not dirty. + */ + result = 0; + continue; + } + + cl_page_list_move(qout, qin, page); + oap->oap_async_flags = ASYNC_URGENT|ASYNC_READY; + oap->oap_async_flags |= ASYNC_COUNT_STABLE; + + osc_page_submit(env, opg, crt, brw_flags); + list_add_tail(&oap->oap_pending_item, &list); + if (++queued == max_pages) { + queued = 0; + result = osc_queue_sync_pages(env, osc, &list, cmd, + brw_flags); + if (result < 0) + break; + } + } + + if (queued > 0) + result = osc_queue_sync_pages(env, osc, &list, cmd, brw_flags); + + CDEBUG(D_INFO, "%d/%d %d\n", qin->pl_nr, qout->pl_nr, result); + return qout->pl_nr > 0 ? 
0 : result; +} + +static void osc_page_touch_at(const struct lu_env *env, + struct cl_object *obj, pgoff_t idx, unsigned to) +{ + struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + int valid; + __u64 kms; + + /* offset within stripe */ + kms = cl_offset(obj, idx) + to; + + cl_object_attr_lock(obj); + /* + * XXX old code used + * + * ll_inode_size_lock(inode, 0); lov_stripe_lock(lsm); + * + * here + */ + CDEBUG(D_INODE, "stripe KMS %sincreasing %llu->%llu %llu\n", + kms > loi->loi_kms ? "" : "not ", loi->loi_kms, kms, + loi->loi_lvb.lvb_size); + + valid = 0; + if (kms > loi->loi_kms) { + attr->cat_kms = kms; + valid |= CAT_KMS; + } + if (kms > loi->loi_lvb.lvb_size) { + attr->cat_size = kms; + valid |= CAT_SIZE; + } + cl_object_attr_set(env, obj, attr, valid); + cl_object_attr_unlock(obj); +} + +/** + * This is called when a page is accessed within file in a way that creates + * new page, if one were missing (i.e., if there were a hole at that place in + * the file, or accessed page is beyond the current file size). Examples: + * ->commit_write() and ->nopage() methods. + * + * Expand stripe KMS if necessary. + */ +static void osc_page_touch(const struct lu_env *env, + struct osc_page *opage, unsigned to) +{ + struct cl_page *page = opage->ops_cl.cpl_page; + struct cl_object *obj = opage->ops_cl.cpl_obj; + + osc_page_touch_at(env, obj, page->cp_index, to); +} + +/** + * Implements cl_io_operations::cio_prepare_write() method for osc layer. + * + * \retval -EIO transfer initiated against this osc will most likely fail + * \retval 0 transfer initiated against this osc will most likely succeed. + * + * The reason for this check is to immediately return an error to the caller + * in the case of a deactivated import. Note, that import can be deactivated + * later, while pages, dirtied by this IO, are still in the cache, but this is + * irrelevant, because that would still return an error to the application (if + * it does fsync), but many applications don't do fsync because of performance + * issues, and we wanted to return an -EIO at write time to notify the + * application. + */ +static int osc_io_prepare_write(const struct lu_env *env, + const struct cl_io_slice *ios, + const struct cl_page_slice *slice, + unsigned from, unsigned to) +{ + struct osc_device *dev = lu2osc_dev(slice->cpl_obj->co_lu.lo_dev); + struct obd_import *imp = class_exp2cliimp(dev->od_exp); + struct osc_io *oio = cl2osc_io(env, ios); + int result = 0; + + /* + * This implements OBD_BRW_CHECK logic from old client. + */ + + if (imp == NULL || imp->imp_invalid) + result = -EIO; + if (result == 0 && oio->oi_lockless) + /* this page contains `invalid' data, but who cares? + * nobody can access the invalid data. + * in osc_io_commit_write(), we're going to write exact + * [from, to) bytes of this page to OST. -jay */ + cl_page_export(env, slice->cpl_page, 1); + + return result; +} + +static int osc_io_commit_write(const struct lu_env *env, + const struct cl_io_slice *ios, + const struct cl_page_slice *slice, + unsigned from, unsigned to) +{ + struct osc_io *oio = cl2osc_io(env, ios); + struct osc_page *opg = cl2osc_page(slice); + struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj); + struct osc_async_page *oap = &opg->ops_oap; + + LASSERT(to > 0); + /* + * XXX instead of calling osc_page_touch() here and in + * osc_io_fault_start() it might be more logical to introduce + * cl_page_touch() method, that generic cl_io_commit_write() and page + * fault code calls. 
+ */ + osc_page_touch(env, cl2osc_page(slice), to); + if (!client_is_remote(osc_export(obj)) && + capable(CFS_CAP_SYS_RESOURCE)) + oap->oap_brw_flags |= OBD_BRW_NOQUOTA; + + if (oio->oi_lockless) + /* see osc_io_prepare_write() for lockless io handling. */ + cl_page_clip(env, slice->cpl_page, from, to); + + return 0; +} + +static int osc_io_fault_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io; + struct cl_fault_io *fio; + + io = ios->cis_io; + fio = &io->u.ci_fault; + CDEBUG(D_INFO, "%lu %d %d\n", + fio->ft_index, fio->ft_writable, fio->ft_nob); + /* + * If mapping is writeable, adjust kms to cover this page, + * but do not extend kms beyond actual file size. + * See bug 10919. + */ + if (fio->ft_writable) + osc_page_touch_at(env, ios->cis_obj, + fio->ft_index, fio->ft_nob); + return 0; +} + +static int osc_async_upcall(void *a, int rc) +{ + struct osc_async_cbargs *args = a; + + args->opc_rc = rc; + complete(&args->opc_sync); + return 0; +} + +/** + * Checks that there are no pages being written in the extent being truncated. + */ +static int trunc_check_cb(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, void *cbdata) +{ + const struct cl_page_slice *slice; + struct osc_page *ops; + struct osc_async_page *oap; + __u64 start = *(__u64 *)cbdata; + + slice = cl_page_at(page, &osc_device_type); + LASSERT(slice != NULL); + ops = cl2osc_page(slice); + oap = &ops->ops_oap; + + if (oap->oap_cmd & OBD_BRW_WRITE && + !list_empty(&oap->oap_pending_item)) + CL_PAGE_DEBUG(D_ERROR, env, page, "exists %llu/%s.\n", + start, current->comm); + + { + struct page *vmpage = cl_page_vmpage(env, page); + + if (PageLocked(vmpage)) + CDEBUG(D_CACHE, "page %p index %lu locked for %d.\n", + ops, page->cp_index, + (oap->oap_cmd & OBD_BRW_RWMASK)); + } + + return CLP_GANG_OKAY; +} + +static void osc_trunc_check(const struct lu_env *env, struct cl_io *io, + struct osc_io *oio, __u64 size) +{ + struct cl_object *clob; + int partial; + pgoff_t start; + + clob = oio->oi_cl.cis_obj; + start = cl_index(clob, size); + partial = cl_offset(clob, start) < size; + + /* + * Complain if there are pages in the truncated region. 
+ */ + cl_page_gang_lookup(env, clob, io, start + partial, CL_PAGE_EOF, + trunc_check_cb, (void *)&size); +} + +static int osc_io_setattr_start(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_io *io = slice->cis_io; + struct osc_io *oio = cl2osc_io(env, slice); + struct cl_object *obj = slice->cis_obj; + struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + struct obdo *oa = &oio->oi_oa; + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + __u64 size = io->u.ci_setattr.sa_attr.lvb_size; + unsigned int ia_valid = io->u.ci_setattr.sa_valid; + int result = 0; + struct obd_info oinfo = { { { 0 } } }; + + /* truncate cache dirty pages first */ + if (cl_io_is_trunc(io)) + result = osc_cache_truncate_start(env, oio, cl2osc(obj), size); + + if (result == 0 && oio->oi_lockless == 0) { + cl_object_attr_lock(obj); + result = cl_object_attr_get(env, obj, attr); + if (result == 0) { + struct ost_lvb *lvb = &io->u.ci_setattr.sa_attr; + unsigned int cl_valid = 0; + + if (ia_valid & ATTR_SIZE) { + attr->cat_size = attr->cat_kms = size; + cl_valid = CAT_SIZE | CAT_KMS; + } + if (ia_valid & ATTR_MTIME_SET) { + attr->cat_mtime = lvb->lvb_mtime; + cl_valid |= CAT_MTIME; + } + if (ia_valid & ATTR_ATIME_SET) { + attr->cat_atime = lvb->lvb_atime; + cl_valid |= CAT_ATIME; + } + if (ia_valid & ATTR_CTIME_SET) { + attr->cat_ctime = lvb->lvb_ctime; + cl_valid |= CAT_CTIME; + } + result = cl_object_attr_set(env, obj, attr, cl_valid); + } + cl_object_attr_unlock(obj); + } + memset(oa, 0, sizeof(*oa)); + if (result == 0) { + oa->o_oi = loi->loi_oi; + oa->o_mtime = attr->cat_mtime; + oa->o_atime = attr->cat_atime; + oa->o_ctime = attr->cat_ctime; + oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP | OBD_MD_FLATIME | + OBD_MD_FLCTIME | OBD_MD_FLMTIME; + if (ia_valid & ATTR_SIZE) { + oa->o_size = size; + oa->o_blocks = OBD_OBJECT_EOF; + oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS; + + if (oio->oi_lockless) { + oa->o_flags = OBD_FL_SRVLOCK; + oa->o_valid |= OBD_MD_FLFLAGS; + } + } else { + LASSERT(oio->oi_lockless == 0); + } + + oinfo.oi_oa = oa; + oinfo.oi_capa = io->u.ci_setattr.sa_capa; + init_completion(&cbargs->opc_sync); + + if (ia_valid & ATTR_SIZE) + result = osc_punch_base(osc_export(cl2osc(obj)), + &oinfo, osc_async_upcall, + cbargs, PTLRPCD_SET); + else + result = osc_setattr_async_base(osc_export(cl2osc(obj)), + &oinfo, NULL, + osc_async_upcall, + cbargs, PTLRPCD_SET); + cbargs->opc_rpc_sent = result == 0; + } + return result; +} + +static void osc_io_setattr_end(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_io *io = slice->cis_io; + struct osc_io *oio = cl2osc_io(env, slice); + struct cl_object *obj = slice->cis_obj; + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + int result = 0; + + if (cbargs->opc_rpc_sent) { + wait_for_completion(&cbargs->opc_sync); + result = io->ci_result = cbargs->opc_rc; + } + if (result == 0) { + if (oio->oi_lockless) { + /* lockless truncate */ + struct osc_device *osd = lu2osc_dev(obj->co_lu.lo_dev); + + LASSERT(cl_io_is_trunc(io)); + /* XXX: Need a lock. 
*/ + osd->od_stats.os_lockless_truncates++; + } + } + + if (cl_io_is_trunc(io)) { + __u64 size = io->u.ci_setattr.sa_attr.lvb_size; + + osc_trunc_check(env, io, oio, size); + if (oio->oi_trunc != NULL) { + osc_cache_truncate_end(env, oio, cl2osc(obj)); + oio->oi_trunc = NULL; + } + } +} + +static int osc_io_read_start(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_object *obj = slice->cis_obj; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + int rc = 0; + + if (!slice->cis_io->ci_noatime) { + cl_object_attr_lock(obj); + attr->cat_atime = LTIME_S(CURRENT_TIME); + rc = cl_object_attr_set(env, obj, attr, CAT_ATIME); + cl_object_attr_unlock(obj); + } + return rc; +} + +static int osc_io_write_start(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_object *obj = slice->cis_obj; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + int rc = 0; + + OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_DELAY_SETTIME, 1); + cl_object_attr_lock(obj); + attr->cat_mtime = attr->cat_ctime = LTIME_S(CURRENT_TIME); + rc = cl_object_attr_set(env, obj, attr, CAT_MTIME | CAT_CTIME); + cl_object_attr_unlock(obj); + + return rc; +} + +static int osc_fsync_ost(const struct lu_env *env, struct osc_object *obj, + struct cl_fsync_io *fio) +{ + struct osc_io *oio = osc_env_io(env); + struct obdo *oa = &oio->oi_oa; + struct obd_info *oinfo = &oio->oi_info; + struct lov_oinfo *loi = obj->oo_oinfo; + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + int rc = 0; + + memset(oa, 0, sizeof(*oa)); + oa->o_oi = loi->loi_oi; + oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP; + + /* reload size abd blocks for start and end of sync range */ + oa->o_size = fio->fi_start; + oa->o_blocks = fio->fi_end; + oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS; + + obdo_set_parent_fid(oa, fio->fi_fid); + + memset(oinfo, 0, sizeof(*oinfo)); + oinfo->oi_oa = oa; + oinfo->oi_capa = fio->fi_capa; + init_completion(&cbargs->opc_sync); + + rc = osc_sync_base(osc_export(obj), oinfo, osc_async_upcall, cbargs, + PTLRPCD_SET); + return rc; +} + +static int osc_io_fsync_start(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_io *io = slice->cis_io; + struct cl_fsync_io *fio = &io->u.ci_fsync; + struct cl_object *obj = slice->cis_obj; + struct osc_object *osc = cl2osc(obj); + pgoff_t start = cl_index(obj, fio->fi_start); + pgoff_t end = cl_index(obj, fio->fi_end); + int result = 0; + + if (fio->fi_end == OBD_OBJECT_EOF) + end = CL_PAGE_EOF; + + result = osc_cache_writeback_range(env, osc, start, end, 0, + fio->fi_mode == CL_FSYNC_DISCARD); + if (result > 0) { + fio->fi_nr_written += result; + result = 0; + } + if (fio->fi_mode == CL_FSYNC_ALL) { + int rc; + + /* we have to wait for writeback to finish before we can + * send OST_SYNC RPC. This is bad because it causes extents + * to be written osc by osc. However, we usually start + * writeback before CL_FSYNC_ALL so this won't have any real + * problem. 
*/ + rc = osc_cache_wait_range(env, osc, start, end); + if (result == 0) + result = rc; + rc = osc_fsync_ost(env, osc, fio); + if (result == 0) + result = rc; + } + + return result; +} + +static void osc_io_fsync_end(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_fsync_io *fio = &slice->cis_io->u.ci_fsync; + struct cl_object *obj = slice->cis_obj; + pgoff_t start = cl_index(obj, fio->fi_start); + pgoff_t end = cl_index(obj, fio->fi_end); + int result = 0; + + if (fio->fi_mode == CL_FSYNC_LOCAL) { + result = osc_cache_wait_range(env, cl2osc(obj), start, end); + } else if (fio->fi_mode == CL_FSYNC_ALL) { + struct osc_io *oio = cl2osc_io(env, slice); + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + + wait_for_completion(&cbargs->opc_sync); + if (result == 0) + result = cbargs->opc_rc; + } + slice->cis_io->ci_result = result; +} + +static void osc_io_end(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct osc_io *oio = cl2osc_io(env, slice); + + if (oio->oi_active) { + osc_extent_release(env, oio->oi_active); + oio->oi_active = NULL; + } +} + +static const struct cl_io_operations osc_io_ops = { + .op = { + [CIT_READ] = { + .cio_start = osc_io_read_start, + .cio_fini = osc_io_fini + }, + [CIT_WRITE] = { + .cio_start = osc_io_write_start, + .cio_end = osc_io_end, + .cio_fini = osc_io_fini + }, + [CIT_SETATTR] = { + .cio_start = osc_io_setattr_start, + .cio_end = osc_io_setattr_end + }, + [CIT_FAULT] = { + .cio_start = osc_io_fault_start, + .cio_end = osc_io_end, + .cio_fini = osc_io_fini + }, + [CIT_FSYNC] = { + .cio_start = osc_io_fsync_start, + .cio_end = osc_io_fsync_end, + .cio_fini = osc_io_fini + }, + [CIT_MISC] = { + .cio_fini = osc_io_fini + } + }, + .req_op = { + [CRT_READ] = { + .cio_submit = osc_io_submit + }, + [CRT_WRITE] = { + .cio_submit = osc_io_submit + } + }, + .cio_prepare_write = osc_io_prepare_write, + .cio_commit_write = osc_io_commit_write +}; + +/***************************************************************************** + * + * Transfer operations. + * + */ + +static int osc_req_prep(const struct lu_env *env, + const struct cl_req_slice *slice) +{ + return 0; +} + +static void osc_req_completion(const struct lu_env *env, + const struct cl_req_slice *slice, int ioret) +{ + struct osc_req *or; + + or = cl2osc_req(slice); + OBD_SLAB_FREE_PTR(or, osc_req_kmem); +} + +/** + * Implementation of struct cl_req_operations::cro_attr_set() for osc + * layer. osc is responsible for struct obdo::o_id and struct obdo::o_seq + * fields. 
+ */ +static void osc_req_attr_set(const struct lu_env *env, + const struct cl_req_slice *slice, + const struct cl_object *obj, + struct cl_req_attr *attr, u64 flags) +{ + struct lov_oinfo *oinfo; + struct cl_req *clerq; + struct cl_page *apage; /* _some_ page in @clerq */ + struct cl_lock *lock; /* _some_ lock protecting @apage */ + struct osc_lock *olck; + struct osc_page *opg; + struct obdo *oa; + struct ost_lvb *lvb; + + oinfo = cl2osc(obj)->oo_oinfo; + lvb = &oinfo->loi_lvb; + oa = attr->cra_oa; + + if ((flags & OBD_MD_FLMTIME) != 0) { + oa->o_mtime = lvb->lvb_mtime; + oa->o_valid |= OBD_MD_FLMTIME; + } + if ((flags & OBD_MD_FLATIME) != 0) { + oa->o_atime = lvb->lvb_atime; + oa->o_valid |= OBD_MD_FLATIME; + } + if ((flags & OBD_MD_FLCTIME) != 0) { + oa->o_ctime = lvb->lvb_ctime; + oa->o_valid |= OBD_MD_FLCTIME; + } + if (flags & OBD_MD_FLGROUP) { + ostid_set_seq(&oa->o_oi, ostid_seq(&oinfo->loi_oi)); + oa->o_valid |= OBD_MD_FLGROUP; + } + if (flags & OBD_MD_FLID) { + ostid_set_id(&oa->o_oi, ostid_id(&oinfo->loi_oi)); + oa->o_valid |= OBD_MD_FLID; + } + if (flags & OBD_MD_FLHANDLE) { + clerq = slice->crs_req; + LASSERT(!list_empty(&clerq->crq_pages)); + apage = container_of(clerq->crq_pages.next, + struct cl_page, cp_flight); + opg = osc_cl_page_osc(apage); + apage = opg->ops_cl.cpl_page; /* now apage is a sub-page */ + lock = cl_lock_at_page(env, apage->cp_obj, apage, NULL, 1, 1); + if (lock == NULL) { + struct cl_object_header *head; + struct cl_lock *scan; + + head = cl_object_header(apage->cp_obj); + list_for_each_entry(scan, &head->coh_locks, + cll_linkage) + CL_LOCK_DEBUG(D_ERROR, env, scan, + "no cover page!\n"); + CL_PAGE_DEBUG(D_ERROR, env, apage, + "dump uncover page!\n"); + dump_stack(); + LBUG(); + } + + olck = osc_lock_at(lock); + LASSERT(olck != NULL); + LASSERT(ergo(opg->ops_srvlock, olck->ols_lock == NULL)); + /* check for lockless io. */ + if (olck->ols_lock != NULL) { + oa->o_handle = olck->ols_lock->l_remote_handle; + oa->o_valid |= OBD_MD_FLHANDLE; + } + cl_lock_put(env, lock); + } +} + +static const struct cl_req_operations osc_req_ops = { + .cro_prep = osc_req_prep, + .cro_attr_set = osc_req_attr_set, + .cro_completion = osc_req_completion +}; + + +int osc_io_init(const struct lu_env *env, + struct cl_object *obj, struct cl_io *io) +{ + struct osc_io *oio = osc_env_io(env); + + CL_IO_SLICE_CLEAN(oio, oi_cl); + cl_io_slice_add(io, &oio->oi_cl, obj, &osc_io_ops); + return 0; +} + +int osc_req_init(const struct lu_env *env, struct cl_device *dev, + struct cl_req *req) +{ + struct osc_req *or; + int result; + + OBD_SLAB_ALLOC_PTR_GFP(or, osc_req_kmem, GFP_NOFS); + if (or != NULL) { + cl_req_slice_add(req, &or->or_cl, dev, &osc_req_ops); + result = 0; + } else + result = -ENOMEM; + return result; +} + +/** @} osc */ diff --git a/kernel/drivers/staging/lustre/lustre/osc/osc_lock.c b/kernel/drivers/staging/lustre/lustre/osc/osc_lock.c new file mode 100644 index 000000000..350ad4955 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/osc/osc_lock.c @@ -0,0 +1,1613 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_lock for OSC layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_OSC + +#include "../../include/linux/libcfs/libcfs.h" +/* fid_build_reg_res_name() */ +#include "../include/lustre_fid.h" + +#include "osc_cl_internal.h" + +/** \addtogroup osc + * @{ + */ + +#define _PAGEREF_MAGIC (-10000000) + +/***************************************************************************** + * + * Type conversions. + * + */ + +static const struct cl_lock_operations osc_lock_ops; +static const struct cl_lock_operations osc_lock_lockless_ops; +static void osc_lock_to_lockless(const struct lu_env *env, + struct osc_lock *ols, int force); +static int osc_lock_has_pages(struct osc_lock *olck); + +int osc_lock_is_lockless(const struct osc_lock *olck) +{ + return (olck->ols_cl.cls_ops == &osc_lock_lockless_ops); +} + +/** + * Returns a weak pointer to the ldlm lock identified by a handle. Returned + * pointer cannot be dereferenced, as lock is not protected from concurrent + * reclaim. This function is a helper for osc_lock_invariant(). + */ +static struct ldlm_lock *osc_handle_ptr(struct lustre_handle *handle) +{ + struct ldlm_lock *lock; + + lock = ldlm_handle2lock(handle); + if (lock != NULL) + LDLM_LOCK_PUT(lock); + return lock; +} + +/** + * Invariant that has to be true all of the time. + */ +static int osc_lock_invariant(struct osc_lock *ols) +{ + struct ldlm_lock *lock = osc_handle_ptr(&ols->ols_handle); + struct ldlm_lock *olock = ols->ols_lock; + int handle_used = lustre_handle_is_used(&ols->ols_handle); + + if (ergo(osc_lock_is_lockless(ols), + ols->ols_locklessable && ols->ols_lock == NULL)) + return 1; + + /* + * If all the following "ergo"s are true, return 1, otherwise 0 + */ + if (!ergo(olock != NULL, handle_used)) + return 0; + + if (!ergo(olock != NULL, + olock->l_handle.h_cookie == ols->ols_handle.cookie)) + return 0; + + if (!ergo(handle_used, + ergo(lock != NULL && olock != NULL, lock == olock) && + ergo(lock == NULL, olock == NULL))) + return 0; + /* + * Check that ->ols_handle and ->ols_lock are consistent, but + * take into account that they are set at the different time. + */ + if (!ergo(ols->ols_state == OLS_CANCELLED, + olock == NULL && !handle_used)) + return 0; + /* + * DLM lock is destroyed only after we have seen cancellation + * ast. + */ + if (!ergo(olock != NULL && ols->ols_state < OLS_CANCELLED, + ((olock->l_flags & LDLM_FL_DESTROYED) == 0))) + return 0; + + if (!ergo(ols->ols_state == OLS_GRANTED, + olock != NULL && + olock->l_req_mode == olock->l_granted_mode && + ols->ols_hold)) + return 0; + return 1; +} + +/***************************************************************************** + * + * Lock operations. 
+ * + */ + +/** + * Breaks a link between osc_lock and dlm_lock. + */ +static void osc_lock_detach(const struct lu_env *env, struct osc_lock *olck) +{ + struct ldlm_lock *dlmlock; + + spin_lock(&osc_ast_guard); + dlmlock = olck->ols_lock; + if (dlmlock == NULL) { + spin_unlock(&osc_ast_guard); + return; + } + + olck->ols_lock = NULL; + /* wb(); --- for all who checks (ols->ols_lock != NULL) before + * call to osc_lock_detach() */ + dlmlock->l_ast_data = NULL; + olck->ols_handle.cookie = 0ULL; + spin_unlock(&osc_ast_guard); + + lock_res_and_lock(dlmlock); + if (dlmlock->l_granted_mode == dlmlock->l_req_mode) { + struct cl_object *obj = olck->ols_cl.cls_obj; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + __u64 old_kms; + + cl_object_attr_lock(obj); + /* Must get the value under the lock to avoid possible races. */ + old_kms = cl2osc(obj)->oo_oinfo->loi_kms; + /* Update the kms. Need to loop all granted locks. + * Not a problem for the client */ + attr->cat_kms = ldlm_extent_shift_kms(dlmlock, old_kms); + + cl_object_attr_set(env, obj, attr, CAT_KMS); + cl_object_attr_unlock(obj); + } + unlock_res_and_lock(dlmlock); + + /* release a reference taken in osc_lock_upcall0(). */ + LASSERT(olck->ols_has_ref); + lu_ref_del(&dlmlock->l_reference, "osc_lock", olck); + LDLM_LOCK_RELEASE(dlmlock); + olck->ols_has_ref = 0; +} + +static int osc_lock_unhold(struct osc_lock *ols) +{ + int result = 0; + + if (ols->ols_hold) { + ols->ols_hold = 0; + result = osc_cancel_base(&ols->ols_handle, + ols->ols_einfo.ei_mode); + } + return result; +} + +static int osc_lock_unuse(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct osc_lock *ols = cl2osc_lock(slice); + + LINVRNT(osc_lock_invariant(ols)); + + switch (ols->ols_state) { + case OLS_NEW: + LASSERT(!ols->ols_hold); + LASSERT(ols->ols_agl); + return 0; + case OLS_UPCALL_RECEIVED: + osc_lock_unhold(ols); + case OLS_ENQUEUED: + LASSERT(!ols->ols_hold); + osc_lock_detach(env, ols); + ols->ols_state = OLS_NEW; + return 0; + case OLS_GRANTED: + LASSERT(!ols->ols_glimpse); + LASSERT(ols->ols_hold); + /* + * Move lock into OLS_RELEASED state before calling + * osc_cancel_base() so that possible synchronous cancellation + * (that always happens e.g., for liblustre) sees that lock is + * released. + */ + ols->ols_state = OLS_RELEASED; + return osc_lock_unhold(ols); + default: + CERROR("Impossible state: %d\n", ols->ols_state); + LBUG(); + } +} + +static void osc_lock_fini(const struct lu_env *env, + struct cl_lock_slice *slice) +{ + struct osc_lock *ols = cl2osc_lock(slice); + + LINVRNT(osc_lock_invariant(ols)); + /* + * ->ols_hold can still be true at this point if, for example, a + * thread that requested a lock was killed (and released a reference + * to the lock), before reply from a server was received. In this case + * lock is destroyed immediately after upcall. 
+ */ + osc_lock_unhold(ols); + LASSERT(ols->ols_lock == NULL); + LASSERT(atomic_read(&ols->ols_pageref) == 0 || + atomic_read(&ols->ols_pageref) == _PAGEREF_MAGIC); + + OBD_SLAB_FREE_PTR(ols, osc_lock_kmem); +} + +static void osc_lock_build_policy(const struct lu_env *env, + const struct cl_lock *lock, + ldlm_policy_data_t *policy) +{ + const struct cl_lock_descr *d = &lock->cll_descr; + + osc_index2policy(policy, d->cld_obj, d->cld_start, d->cld_end); + policy->l_extent.gid = d->cld_gid; +} + +static __u64 osc_enq2ldlm_flags(__u32 enqflags) +{ + __u64 result = 0; + + LASSERT((enqflags & ~CEF_MASK) == 0); + + if (enqflags & CEF_NONBLOCK) + result |= LDLM_FL_BLOCK_NOWAIT; + if (enqflags & CEF_ASYNC) + result |= LDLM_FL_HAS_INTENT; + if (enqflags & CEF_DISCARD_DATA) + result |= LDLM_FL_AST_DISCARD_DATA; + return result; +} + +/** + * Global spin-lock protecting consistency of ldlm_lock::l_ast_data + * pointers. Initialized in osc_init(). + */ +spinlock_t osc_ast_guard; + +static struct osc_lock *osc_ast_data_get(struct ldlm_lock *dlm_lock) +{ + struct osc_lock *olck; + + lock_res_and_lock(dlm_lock); + spin_lock(&osc_ast_guard); + olck = dlm_lock->l_ast_data; + if (olck != NULL) { + struct cl_lock *lock = olck->ols_cl.cls_lock; + /* + * If osc_lock holds a reference on ldlm lock, return it even + * when cl_lock is in CLS_FREEING state. This way + * + * osc_ast_data_get(dlmlock) == NULL + * + * guarantees that all osc references on dlmlock were + * released. osc_dlm_blocking_ast0() relies on that. + */ + if (lock->cll_state < CLS_FREEING || olck->ols_has_ref) { + cl_lock_get_trust(lock); + lu_ref_add_atomic(&lock->cll_reference, + "ast", current); + } else + olck = NULL; + } + spin_unlock(&osc_ast_guard); + unlock_res_and_lock(dlm_lock); + return olck; +} + +static void osc_ast_data_put(const struct lu_env *env, struct osc_lock *olck) +{ + struct cl_lock *lock; + + lock = olck->ols_cl.cls_lock; + lu_ref_del(&lock->cll_reference, "ast", current); + cl_lock_put(env, lock); +} + +/** + * Updates object attributes from a lock value block (lvb) received together + * with the DLM lock reply from the server. Copy of osc_update_enqueue() + * logic. + * + * This can be optimized to not update attributes when lock is a result of a + * local match. + * + * Called under lock and resource spin-locks. + */ +static void osc_lock_lvb_update(const struct lu_env *env, struct osc_lock *olck, + int rc) +{ + struct ost_lvb *lvb; + struct cl_object *obj; + struct lov_oinfo *oinfo; + struct cl_attr *attr; + unsigned valid; + + if (!(olck->ols_flags & LDLM_FL_LVB_READY)) + return; + + lvb = &olck->ols_lvb; + obj = olck->ols_cl.cls_obj; + oinfo = cl2osc(obj)->oo_oinfo; + attr = &osc_env_info(env)->oti_attr; + valid = CAT_BLOCKS | CAT_ATIME | CAT_CTIME | CAT_MTIME | CAT_SIZE; + cl_lvb2attr(attr, lvb); + + cl_object_attr_lock(obj); + if (rc == 0) { + struct ldlm_lock *dlmlock; + __u64 size; + + dlmlock = olck->ols_lock; + LASSERT(dlmlock != NULL); + + /* re-grab LVB from a dlm lock under DLM spin-locks. */ + *lvb = *(struct ost_lvb *)dlmlock->l_lvb_data; + size = lvb->lvb_size; + /* Extend KMS up to the end of this lock and no further + * A lock on [x,y] means a KMS of up to y + 1 bytes! 
*/ + if (size > dlmlock->l_policy_data.l_extent.end) + size = dlmlock->l_policy_data.l_extent.end + 1; + if (size >= oinfo->loi_kms) { + LDLM_DEBUG(dlmlock, "lock acquired, setting rss=%llu, kms=%llu", + lvb->lvb_size, size); + valid |= CAT_KMS; + attr->cat_kms = size; + } else { + LDLM_DEBUG(dlmlock, "lock acquired, setting rss=%llu; leaving kms=%llu, end=%llu", + lvb->lvb_size, oinfo->loi_kms, + dlmlock->l_policy_data.l_extent.end); + } + ldlm_lock_allow_match_locked(dlmlock); + } else if (rc == -ENAVAIL && olck->ols_glimpse) { + CDEBUG(D_INODE, "glimpsed, setting rss=%llu; leaving kms=%llu\n", + lvb->lvb_size, oinfo->loi_kms); + } else + valid = 0; + + if (valid != 0) + cl_object_attr_set(env, obj, attr, valid); + + cl_object_attr_unlock(obj); +} + +/** + * Called when a lock is granted, from an upcall (when server returned a + * granted lock), or from completion AST, when server returned a blocked lock. + * + * Called under lock and resource spin-locks, that are released temporarily + * here. + */ +static void osc_lock_granted(const struct lu_env *env, struct osc_lock *olck, + struct ldlm_lock *dlmlock, int rc) +{ + struct ldlm_extent *ext; + struct cl_lock *lock; + struct cl_lock_descr *descr; + + LASSERT(dlmlock->l_granted_mode == dlmlock->l_req_mode); + + if (olck->ols_state < OLS_GRANTED) { + lock = olck->ols_cl.cls_lock; + ext = &dlmlock->l_policy_data.l_extent; + descr = &osc_env_info(env)->oti_descr; + descr->cld_obj = lock->cll_descr.cld_obj; + + /* XXX check that ->l_granted_mode is valid. */ + descr->cld_mode = osc_ldlm2cl_lock(dlmlock->l_granted_mode); + descr->cld_start = cl_index(descr->cld_obj, ext->start); + descr->cld_end = cl_index(descr->cld_obj, ext->end); + descr->cld_gid = ext->gid; + /* + * tell upper layers the extent of the lock that was actually + * granted + */ + olck->ols_state = OLS_GRANTED; + osc_lock_lvb_update(env, olck, rc); + + /* release DLM spin-locks to allow cl_lock_{modify,signal}() + * to take a semaphore on a parent lock. This is safe, because + * spin-locks are needed to protect consistency of + * dlmlock->l_*_mode and LVB, and we have finished processing + * them. */ + unlock_res_and_lock(dlmlock); + cl_lock_modify(env, lock, descr); + cl_lock_signal(env, lock); + LINVRNT(osc_lock_invariant(olck)); + lock_res_and_lock(dlmlock); + } +} + +static void osc_lock_upcall0(const struct lu_env *env, struct osc_lock *olck) + +{ + struct ldlm_lock *dlmlock; + + dlmlock = ldlm_handle2lock_long(&olck->ols_handle, 0); + LASSERT(dlmlock != NULL); + + lock_res_and_lock(dlmlock); + spin_lock(&osc_ast_guard); + LASSERT(dlmlock->l_ast_data == olck); + LASSERT(olck->ols_lock == NULL); + olck->ols_lock = dlmlock; + spin_unlock(&osc_ast_guard); + + /* + * Lock might be not yet granted. In this case, completion ast + * (osc_ldlm_completion_ast()) comes later and finishes lock + * granting. + */ + if (dlmlock->l_granted_mode == dlmlock->l_req_mode) + osc_lock_granted(env, olck, dlmlock, 0); + unlock_res_and_lock(dlmlock); + + /* + * osc_enqueue_interpret() decrefs asynchronous locks, counter + * this. + */ + ldlm_lock_addref(&olck->ols_handle, olck->ols_einfo.ei_mode); + olck->ols_hold = 1; + + /* lock reference taken by ldlm_handle2lock_long() is owned by + * osc_lock and released in osc_lock_detach() */ + lu_ref_add(&dlmlock->l_reference, "osc_lock", olck); + olck->ols_has_ref = 1; +} + +/** + * Lock upcall function that is executed either when a reply to ENQUEUE rpc is + * received from a server, or after osc_enqueue_base() matched a local DLM + * lock. 
+ */ +static int osc_lock_upcall(void *cookie, int errcode) +{ + struct osc_lock *olck = cookie; + struct cl_lock_slice *slice = &olck->ols_cl; + struct cl_lock *lock = slice->cls_lock; + struct lu_env *env; + struct cl_env_nest nest; + + env = cl_env_nested_get(&nest); + if (!IS_ERR(env)) { + int rc; + + cl_lock_mutex_get(env, lock); + + LASSERT(lock->cll_state >= CLS_QUEUING); + if (olck->ols_state == OLS_ENQUEUED) { + olck->ols_state = OLS_UPCALL_RECEIVED; + rc = ldlm_error2errno(errcode); + } else if (olck->ols_state == OLS_CANCELLED) { + rc = -EIO; + } else { + CERROR("Impossible state: %d\n", olck->ols_state); + LBUG(); + } + if (rc) { + struct ldlm_lock *dlmlock; + + dlmlock = ldlm_handle2lock(&olck->ols_handle); + if (dlmlock != NULL) { + lock_res_and_lock(dlmlock); + spin_lock(&osc_ast_guard); + LASSERT(olck->ols_lock == NULL); + dlmlock->l_ast_data = NULL; + olck->ols_handle.cookie = 0ULL; + spin_unlock(&osc_ast_guard); + ldlm_lock_fail_match_locked(dlmlock); + unlock_res_and_lock(dlmlock); + LDLM_LOCK_PUT(dlmlock); + } + } else { + if (olck->ols_glimpse) + olck->ols_glimpse = 0; + osc_lock_upcall0(env, olck); + } + + /* Error handling, some errors are tolerable. */ + if (olck->ols_locklessable && rc == -EUSERS) { + /* This is a tolerable error, turn this lock into + * lockless lock. + */ + osc_object_set_contended(cl2osc(slice->cls_obj)); + LASSERT(slice->cls_ops == &osc_lock_ops); + + /* Change this lock to ldlmlock-less lock. */ + osc_lock_to_lockless(env, olck, 1); + olck->ols_state = OLS_GRANTED; + rc = 0; + } else if (olck->ols_glimpse && rc == -ENAVAIL) { + osc_lock_lvb_update(env, olck, rc); + cl_lock_delete(env, lock); + /* Hide the error. */ + rc = 0; + } + + if (rc == 0) { + /* For AGL case, the RPC sponsor may exits the cl_lock + * processing without wait() called before related OSC + * lock upcall(). So update the lock status according + * to the enqueue result inside AGL upcall(). */ + if (olck->ols_agl) { + lock->cll_flags |= CLF_FROM_UPCALL; + cl_wait_try(env, lock); + lock->cll_flags &= ~CLF_FROM_UPCALL; + if (!olck->ols_glimpse) + olck->ols_agl = 0; + } + cl_lock_signal(env, lock); + /* del user for lock upcall cookie */ + cl_unuse_try(env, lock); + } else { + /* del user for lock upcall cookie */ + cl_lock_user_del(env, lock); + cl_lock_error(env, lock, rc); + } + + /* release cookie reference, acquired by osc_lock_enqueue() */ + cl_lock_hold_release(env, lock, "upcall", lock); + cl_lock_mutex_put(env, lock); + + lu_ref_del(&lock->cll_reference, "upcall", lock); + /* This maybe the last reference, so must be called after + * cl_lock_mutex_put(). */ + cl_lock_put(env, lock); + + cl_env_nested_put(&nest, env); + } else { + /* should never happen, similar to osc_ldlm_blocking_ast(). */ + LBUG(); + } + return errcode; +} + +/** + * Core of osc_dlm_blocking_ast() logic. + */ +static void osc_lock_blocking(const struct lu_env *env, + struct ldlm_lock *dlmlock, + struct osc_lock *olck, int blocking) +{ + struct cl_lock *lock = olck->ols_cl.cls_lock; + + LASSERT(olck->ols_lock == dlmlock); + CLASSERT(OLS_BLOCKED < OLS_CANCELLED); + LASSERT(!osc_lock_is_lockless(olck)); + + /* + * Lock might be still addref-ed here, if e.g., blocking ast + * is sent for a failed lock. + */ + osc_lock_unhold(olck); + + if (blocking && olck->ols_state < OLS_BLOCKED) + /* + * Move osc_lock into OLS_BLOCKED before canceling the lock, + * because it recursively re-enters osc_lock_blocking(), with + * the state set to OLS_CANCELLED. 
+ */ + olck->ols_state = OLS_BLOCKED; + /* + * cancel and destroy lock at least once no matter how blocking ast is + * entered (see comment above osc_ldlm_blocking_ast() for use + * cases). cl_lock_cancel() and cl_lock_delete() are idempotent. + */ + cl_lock_cancel(env, lock); + cl_lock_delete(env, lock); +} + +/** + * Helper for osc_dlm_blocking_ast() handling discrepancies between cl_lock + * and ldlm_lock caches. + */ +static int osc_dlm_blocking_ast0(const struct lu_env *env, + struct ldlm_lock *dlmlock, + void *data, int flag) +{ + struct osc_lock *olck; + struct cl_lock *lock; + int result; + int cancel; + + LASSERT(flag == LDLM_CB_BLOCKING || flag == LDLM_CB_CANCELING); + + cancel = 0; + olck = osc_ast_data_get(dlmlock); + if (olck != NULL) { + lock = olck->ols_cl.cls_lock; + cl_lock_mutex_get(env, lock); + LINVRNT(osc_lock_invariant(olck)); + if (olck->ols_ast_wait) { + /* wake up osc_lock_use() */ + cl_lock_signal(env, lock); + olck->ols_ast_wait = 0; + } + /* + * Lock might have been canceled while this thread was + * sleeping for lock mutex, but olck is pinned in memory. + */ + if (olck == dlmlock->l_ast_data) { + /* + * NOTE: DLM sends blocking AST's for failed locks + * (that are still in pre-OLS_GRANTED state) + * too, and they have to be canceled otherwise + * DLM lock is never destroyed and stuck in + * the memory. + * + * Alternatively, ldlm_cli_cancel() can be + * called here directly for osc_locks with + * ols_state < OLS_GRANTED to maintain an + * invariant that ->clo_cancel() is only called + * for locks that were granted. + */ + LASSERT(data == olck); + osc_lock_blocking(env, dlmlock, + olck, flag == LDLM_CB_BLOCKING); + } else + cancel = 1; + cl_lock_mutex_put(env, lock); + osc_ast_data_put(env, olck); + } else + /* + * DLM lock exists, but there is no cl_lock attached to it. + * This is a `normal' race. cl_object and its cl_lock's can be + * removed by memory pressure, together with all pages. + */ + cancel = (flag == LDLM_CB_BLOCKING); + + if (cancel) { + struct lustre_handle *lockh; + + lockh = &osc_env_info(env)->oti_handle; + ldlm_lock2handle(dlmlock, lockh); + result = ldlm_cli_cancel(lockh, LCF_ASYNC); + } else + result = 0; + return result; +} + +/** + * Blocking ast invoked by ldlm when dlm lock is either blocking progress of + * some other lock, or is canceled. This function is installed as a + * ldlm_lock::l_blocking_ast() for client extent locks. + * + * Control flow is tricky, because ldlm uses the same call-back + * (ldlm_lock::l_blocking_ast()) for both blocking and cancellation ast's. + * + * \param dlmlock lock for which ast occurred. + * + * \param new description of a conflicting lock in case of blocking ast. + * + * \param data value of dlmlock->l_ast_data + * + * \param flag LDLM_CB_BLOCKING or LDLM_CB_CANCELING. Used to distinguish + * cancellation and blocking ast's. + * + * Possible use cases: + * + * - ldlm calls dlmlock->l_blocking_ast(..., LDLM_CB_CANCELING) to cancel + * lock due to lock lru pressure, or explicit user request to purge + * locks. + * + * - ldlm calls dlmlock->l_blocking_ast(..., LDLM_CB_BLOCKING) to notify + * us that dlmlock conflicts with another lock that some client is + * enqueing. Lock is canceled. + * + * - cl_lock_cancel() is called. osc_lock_cancel() calls + * ldlm_cli_cancel() that calls + * + * dlmlock->l_blocking_ast(..., LDLM_CB_CANCELING) + * + * recursively entering osc_ldlm_blocking_ast(). 
+ * + * - client cancels lock voluntary (e.g., as a part of early cancellation): + * + * cl_lock_cancel()-> + * osc_lock_cancel()-> + * ldlm_cli_cancel()-> + * dlmlock->l_blocking_ast(..., LDLM_CB_CANCELING) + * + */ +static int osc_ldlm_blocking_ast(struct ldlm_lock *dlmlock, + struct ldlm_lock_desc *new, void *data, + int flag) +{ + struct lu_env *env; + struct cl_env_nest nest; + int result; + + /* + * This can be called in the context of outer IO, e.g., + * + * cl_enqueue()->... + * ->osc_enqueue_base()->... + * ->ldlm_prep_elc_req()->... + * ->ldlm_cancel_callback()->... + * ->osc_ldlm_blocking_ast() + * + * new environment has to be created to not corrupt outer context. + */ + env = cl_env_nested_get(&nest); + if (!IS_ERR(env)) { + result = osc_dlm_blocking_ast0(env, dlmlock, data, flag); + cl_env_nested_put(&nest, env); + } else { + result = PTR_ERR(env); + /* + * XXX This should never happen, as cl_lock is + * stuck. Pre-allocated environment a la vvp_inode_fini_env + * should be used. + */ + LBUG(); + } + if (result != 0) { + if (result == -ENODATA) + result = 0; + else + CERROR("BAST failed: %d\n", result); + } + return result; +} + +static int osc_ldlm_completion_ast(struct ldlm_lock *dlmlock, + __u64 flags, void *data) +{ + struct cl_env_nest nest; + struct lu_env *env; + struct osc_lock *olck; + struct cl_lock *lock; + int result; + int dlmrc; + + /* first, do dlm part of the work */ + dlmrc = ldlm_completion_ast_async(dlmlock, flags, data); + /* then, notify cl_lock */ + env = cl_env_nested_get(&nest); + if (!IS_ERR(env)) { + olck = osc_ast_data_get(dlmlock); + if (olck != NULL) { + lock = olck->ols_cl.cls_lock; + cl_lock_mutex_get(env, lock); + /* + * ldlm_handle_cp_callback() copied LVB from request + * to lock->l_lvb_data, store it in osc_lock. + */ + LASSERT(dlmlock->l_lvb_data != NULL); + lock_res_and_lock(dlmlock); + olck->ols_lvb = *(struct ost_lvb *)dlmlock->l_lvb_data; + if (olck->ols_lock == NULL) { + /* + * upcall (osc_lock_upcall()) hasn't yet been + * called. Do nothing now, upcall will bind + * olck to dlmlock and signal the waiters. + * + * This maintains an invariant that osc_lock + * and ldlm_lock are always bound when + * osc_lock is in OLS_GRANTED state. + */ + } else if (dlmlock->l_granted_mode == + dlmlock->l_req_mode) { + osc_lock_granted(env, olck, dlmlock, dlmrc); + } + unlock_res_and_lock(dlmlock); + + if (dlmrc != 0) { + CL_LOCK_DEBUG(D_ERROR, env, lock, + "dlmlock returned %d\n", dlmrc); + cl_lock_error(env, lock, dlmrc); + } + cl_lock_mutex_put(env, lock); + osc_ast_data_put(env, olck); + result = 0; + } else + result = -ELDLM_NO_LOCK_DATA; + cl_env_nested_put(&nest, env); + } else + result = PTR_ERR(env); + return dlmrc ?: result; +} + +static int osc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data) +{ + struct ptlrpc_request *req = data; + struct osc_lock *olck; + struct cl_lock *lock; + struct cl_object *obj; + struct cl_env_nest nest; + struct lu_env *env; + struct ost_lvb *lvb; + struct req_capsule *cap; + int result; + + LASSERT(lustre_msg_get_opc(req->rq_reqmsg) == LDLM_GL_CALLBACK); + + env = cl_env_nested_get(&nest); + if (!IS_ERR(env)) { + /* osc_ast_data_get() has to go after environment is + * allocated, because osc_ast_data() acquires a + * reference to a lock, and it can only be released in + * environment. + */ + olck = osc_ast_data_get(dlmlock); + if (olck != NULL) { + lock = olck->ols_cl.cls_lock; + /* Do not grab the mutex of cl_lock for glimpse. + * See LU-1274 for details. 
+ * BTW, it's okay for cl_lock to be cancelled during + * this period because server can handle this race. + * See ldlm_server_glimpse_ast() for details. + * cl_lock_mutex_get(env, lock); */ + cap = &req->rq_pill; + req_capsule_extend(cap, &RQF_LDLM_GL_CALLBACK); + req_capsule_set_size(cap, &RMF_DLM_LVB, RCL_SERVER, + sizeof(*lvb)); + result = req_capsule_server_pack(cap); + if (result == 0) { + lvb = req_capsule_server_get(cap, &RMF_DLM_LVB); + obj = lock->cll_descr.cld_obj; + result = cl_object_glimpse(env, obj, lvb); + } + if (!exp_connect_lvb_type(req->rq_export)) + req_capsule_shrink(&req->rq_pill, + &RMF_DLM_LVB, + sizeof(struct ost_lvb_v1), + RCL_SERVER); + osc_ast_data_put(env, olck); + } else { + /* + * These errors are normal races, so we don't want to + * fill the console with messages by calling + * ptlrpc_error() + */ + lustre_pack_reply(req, 1, NULL, NULL); + result = -ELDLM_NO_LOCK_DATA; + } + cl_env_nested_put(&nest, env); + } else + result = PTR_ERR(env); + req->rq_status = result; + return result; +} + +static unsigned long osc_lock_weigh(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + /* + * don't need to grab coh_page_guard since we don't care the exact # + * of pages.. + */ + return cl_object_header(slice->cls_obj)->coh_pages; +} + +static void osc_lock_build_einfo(const struct lu_env *env, + const struct cl_lock *clock, + struct osc_lock *lock, + struct ldlm_enqueue_info *einfo) +{ + enum cl_lock_mode mode; + + mode = clock->cll_descr.cld_mode; + if (mode == CLM_PHANTOM) + /* + * For now, enqueue all glimpse locks in read mode. In the + * future, client might choose to enqueue LCK_PW lock for + * glimpse on a file opened for write. + */ + mode = CLM_READ; + + einfo->ei_type = LDLM_EXTENT; + einfo->ei_mode = osc_cl_lock2ldlm(mode); + einfo->ei_cb_bl = osc_ldlm_blocking_ast; + einfo->ei_cb_cp = osc_ldlm_completion_ast; + einfo->ei_cb_gl = osc_ldlm_glimpse_ast; + einfo->ei_cbdata = lock; /* value to be put into ->l_ast_data */ +} + +/** + * Determine if the lock should be converted into a lockless lock. + * + * Steps to check: + * - if the lock has an explicit requirement for a non-lockless lock; + * - if the io lock request type ci_lockreq; + * - send the enqueue rpc to ost to make the further decision; + * - special treat to truncate lockless lock + * + * Additional policy can be implemented here, e.g., never do lockless-io + * for large extents. 
+ */ +static void osc_lock_to_lockless(const struct lu_env *env, + struct osc_lock *ols, int force) +{ + struct cl_lock_slice *slice = &ols->ols_cl; + + LASSERT(ols->ols_state == OLS_NEW || + ols->ols_state == OLS_UPCALL_RECEIVED); + + if (force) { + ols->ols_locklessable = 1; + slice->cls_ops = &osc_lock_lockless_ops; + } else { + struct osc_io *oio = osc_env_io(env); + struct cl_io *io = oio->oi_cl.cis_io; + struct cl_object *obj = slice->cls_obj; + struct osc_object *oob = cl2osc(obj); + const struct osc_device *osd = lu2osc_dev(obj->co_lu.lo_dev); + struct obd_connect_data *ocd; + + LASSERT(io->ci_lockreq == CILR_MANDATORY || + io->ci_lockreq == CILR_MAYBE || + io->ci_lockreq == CILR_NEVER); + + ocd = &class_exp2cliimp(osc_export(oob))->imp_connect_data; + ols->ols_locklessable = (io->ci_type != CIT_SETATTR) && + (io->ci_lockreq == CILR_MAYBE) && + (ocd->ocd_connect_flags & OBD_CONNECT_SRVLOCK); + if (io->ci_lockreq == CILR_NEVER || + /* lockless IO */ + (ols->ols_locklessable && osc_object_is_contended(oob)) || + /* lockless truncate */ + (cl_io_is_trunc(io) && + (ocd->ocd_connect_flags & OBD_CONNECT_TRUNCLOCK) && + osd->od_lockless_truncate)) { + ols->ols_locklessable = 1; + slice->cls_ops = &osc_lock_lockless_ops; + } + } + LASSERT(ergo(ols->ols_glimpse, !osc_lock_is_lockless(ols))); +} + +static int osc_lock_compatible(const struct osc_lock *qing, + const struct osc_lock *qed) +{ + enum cl_lock_mode qing_mode; + enum cl_lock_mode qed_mode; + + qing_mode = qing->ols_cl.cls_lock->cll_descr.cld_mode; + if (qed->ols_glimpse && + (qed->ols_state >= OLS_UPCALL_RECEIVED || qing_mode == CLM_READ)) + return 1; + + qed_mode = qed->ols_cl.cls_lock->cll_descr.cld_mode; + return ((qing_mode == CLM_READ) && (qed_mode == CLM_READ)); +} + +/** + * Cancel all conflicting locks and wait for them to be destroyed. + * + * This function is used for two purposes: + * + * - early cancel all conflicting locks before starting IO, and + * + * - guarantee that pages added to the page cache by lockless IO are never + * covered by locks other than lockless IO lock, and, hence, are not + * visible to other threads. + */ +static int osc_lock_enqueue_wait(const struct lu_env *env, + const struct osc_lock *olck) +{ + struct cl_lock *lock = olck->ols_cl.cls_lock; + struct cl_lock_descr *descr = &lock->cll_descr; + struct cl_object_header *hdr = cl_object_header(descr->cld_obj); + struct cl_lock *scan; + struct cl_lock *conflict = NULL; + int lockless = osc_lock_is_lockless(olck); + int rc = 0; + + LASSERT(cl_lock_is_mutexed(lock)); + + /* make it enqueue anyway for glimpse lock, because we actually + * don't need to cancel any conflicting locks. */ + if (olck->ols_glimpse) + return 0; + + spin_lock(&hdr->coh_lock_guard); + list_for_each_entry(scan, &hdr->coh_locks, cll_linkage) { + struct cl_lock_descr *cld = &scan->cll_descr; + const struct osc_lock *scan_ols; + + if (scan == lock) + break; + + if (scan->cll_state < CLS_QUEUING || + scan->cll_state == CLS_FREEING || + cld->cld_start > descr->cld_end || + cld->cld_end < descr->cld_start) + continue; + + /* overlapped and living locks. */ + + /* We're not supposed to give up group lock. 
*/ + if (scan->cll_descr.cld_mode == CLM_GROUP) { + LASSERT(descr->cld_mode != CLM_GROUP || + descr->cld_gid != scan->cll_descr.cld_gid); + continue; + } + + scan_ols = osc_lock_at(scan); + + /* We need to cancel the compatible locks if we're enqueuing + * a lockless lock, for example: + * imagine that client has PR lock on [0, 1000], and thread T0 + * is doing lockless IO in [500, 1500] region. Concurrent + * thread T1 can see lockless data in [500, 1000], which is + * wrong, because these data are possibly stale. */ + if (!lockless && osc_lock_compatible(olck, scan_ols)) + continue; + + cl_lock_get_trust(scan); + conflict = scan; + break; + } + spin_unlock(&hdr->coh_lock_guard); + + if (conflict) { + if (lock->cll_descr.cld_mode == CLM_GROUP) { + /* we want a group lock but a previous lock request + * conflicts, we do not wait but return 0 so the + * request is send to the server + */ + CDEBUG(D_DLMTRACE, "group lock %p is conflicted with %p, no wait, send to server\n", + lock, conflict); + cl_lock_put(env, conflict); + rc = 0; + } else { + CDEBUG(D_DLMTRACE, "lock %p is conflicted with %p, will wait\n", + lock, conflict); + LASSERT(lock->cll_conflict == NULL); + lu_ref_add(&conflict->cll_reference, "cancel-wait", + lock); + lock->cll_conflict = conflict; + rc = CLO_WAIT; + } + } + return rc; +} + +/** + * Implementation of cl_lock_operations::clo_enqueue() method for osc + * layer. This initiates ldlm enqueue: + * + * - cancels conflicting locks early (osc_lock_enqueue_wait()); + * + * - calls osc_enqueue_base() to do actual enqueue. + * + * osc_enqueue_base() is supplied with an upcall function that is executed + * when lock is received either after a local cached ldlm lock is matched, or + * when a reply from the server is received. + * + * This function does not wait for the network communication to complete. + */ +static int osc_lock_enqueue(const struct lu_env *env, + const struct cl_lock_slice *slice, + struct cl_io *unused, __u32 enqflags) +{ + struct osc_lock *ols = cl2osc_lock(slice); + struct cl_lock *lock = ols->ols_cl.cls_lock; + int result; + + LASSERT(cl_lock_is_mutexed(lock)); + LASSERTF(ols->ols_state == OLS_NEW, + "Impossible state: %d\n", ols->ols_state); + + LASSERTF(ergo(ols->ols_glimpse, lock->cll_descr.cld_mode <= CLM_READ), + "lock = %p, ols = %p\n", lock, ols); + + result = osc_lock_enqueue_wait(env, ols); + if (result == 0) { + if (!osc_lock_is_lockless(ols)) { + struct osc_object *obj = cl2osc(slice->cls_obj); + struct osc_thread_info *info = osc_env_info(env); + struct ldlm_res_id *resname = &info->oti_resname; + ldlm_policy_data_t *policy = &info->oti_policy; + struct ldlm_enqueue_info *einfo = &ols->ols_einfo; + + /* lock will be passed as upcall cookie, + * hold ref to prevent to be released. */ + cl_lock_hold_add(env, lock, "upcall", lock); + /* a user for lock also */ + cl_lock_user_add(env, lock); + ols->ols_state = OLS_ENQUEUED; + + /* + * XXX: this is possible blocking point as + * ldlm_lock_match(LDLM_FL_LVB_READY) waits for + * LDLM_CP_CALLBACK. 
+ */ + ostid_build_res_name(&obj->oo_oinfo->loi_oi, resname); + osc_lock_build_policy(env, lock, policy); + result = osc_enqueue_base(osc_export(obj), resname, + &ols->ols_flags, policy, + &ols->ols_lvb, + obj->oo_oinfo->loi_kms_valid, + osc_lock_upcall, + ols, einfo, &ols->ols_handle, + PTLRPCD_SET, 1, ols->ols_agl); + if (result != 0) { + cl_lock_user_del(env, lock); + cl_lock_unhold(env, lock, "upcall", lock); + if (unlikely(result == -ECANCELED)) { + ols->ols_state = OLS_NEW; + result = 0; + } + } + } else { + ols->ols_state = OLS_GRANTED; + ols->ols_owner = osc_env_io(env); + } + } + LASSERT(ergo(ols->ols_glimpse, !osc_lock_is_lockless(ols))); + return result; +} + +static int osc_lock_wait(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct osc_lock *olck = cl2osc_lock(slice); + struct cl_lock *lock = olck->ols_cl.cls_lock; + + LINVRNT(osc_lock_invariant(olck)); + + if (olck->ols_glimpse && olck->ols_state >= OLS_UPCALL_RECEIVED) { + if (olck->ols_flags & LDLM_FL_LVB_READY) { + return 0; + } else if (olck->ols_agl) { + if (lock->cll_flags & CLF_FROM_UPCALL) + /* It is from enqueue RPC reply upcall for + * updating state. Do not re-enqueue. */ + return -ENAVAIL; + else + olck->ols_state = OLS_NEW; + } else { + LASSERT(lock->cll_error); + return lock->cll_error; + } + } + + if (olck->ols_state == OLS_NEW) { + int rc; + + LASSERT(olck->ols_agl); + olck->ols_agl = 0; + olck->ols_flags &= ~LDLM_FL_BLOCK_NOWAIT; + rc = osc_lock_enqueue(env, slice, NULL, CEF_ASYNC | CEF_MUST); + if (rc != 0) + return rc; + else + return CLO_REENQUEUED; + } + + LASSERT(equi(olck->ols_state >= OLS_UPCALL_RECEIVED && + lock->cll_error == 0, olck->ols_lock != NULL)); + + return lock->cll_error ?: olck->ols_state >= OLS_GRANTED ? 0 : CLO_WAIT; +} + +/** + * An implementation of cl_lock_operations::clo_use() method that pins cached + * lock. + */ +static int osc_lock_use(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct osc_lock *olck = cl2osc_lock(slice); + int rc; + + LASSERT(!olck->ols_hold); + + /* + * Atomically check for LDLM_FL_CBPENDING and addref a lock if this + * flag is not set. This protects us from a concurrent blocking ast. + */ + rc = ldlm_lock_addref_try(&olck->ols_handle, olck->ols_einfo.ei_mode); + if (rc == 0) { + olck->ols_hold = 1; + olck->ols_state = OLS_GRANTED; + } else { + struct cl_lock *lock; + + /* + * Lock is being cancelled somewhere within + * ldlm_handle_bl_callback(): LDLM_FL_CBPENDING is already + * set, but osc_ldlm_blocking_ast() hasn't yet acquired + * cl_lock mutex. + */ + lock = slice->cls_lock; + LASSERT(lock->cll_state == CLS_INTRANSIT); + LASSERT(lock->cll_users > 0); + /* set a flag for osc_dlm_blocking_ast0() to signal the + * lock.*/ + olck->ols_ast_wait = 1; + rc = CLO_WAIT; + } + return rc; +} + +static int osc_lock_flush(struct osc_lock *ols, int discard) +{ + struct cl_lock *lock = ols->ols_cl.cls_lock; + struct cl_env_nest nest; + struct lu_env *env; + int result = 0; + + env = cl_env_nested_get(&nest); + if (!IS_ERR(env)) { + struct osc_object *obj = cl2osc(ols->ols_cl.cls_obj); + struct cl_lock_descr *descr = &lock->cll_descr; + int rc = 0; + + if (descr->cld_mode >= CLM_WRITE) { + result = osc_cache_writeback_range(env, obj, + descr->cld_start, descr->cld_end, + 1, discard); + LDLM_DEBUG(ols->ols_lock, + "lock %p: %d pages were %s.\n", lock, result, + discard ? 
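
osc_lock_use() above pins a cached lock only if ldlm_lock_addref_try() succeeds, i.e. only if no cancel is already pending on it, so it cannot race with a blocking AST that is about to tear the lock down. Below is a small pthread-based model of that "take a reference unless a cancel is pending" step; the names are hypothetical and a mutex stands in for the ldlm resource lock.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct demo_lock {
    pthread_mutex_t guard;
    bool cancel_pending;    /* models LDLM_FL_CBPENDING */
    int refs;               /* models l_readers + l_writers */
};

/* Returns true and takes a reference only if no cancel is pending,
 * so a concurrent blocking AST cannot race with the new user. */
static bool lock_addref_try(struct demo_lock *lk)
{
    bool ok;

    pthread_mutex_lock(&lk->guard);
    ok = !lk->cancel_pending;
    if (ok)
        lk->refs++;
    pthread_mutex_unlock(&lk->guard);
    return ok;
}

int main(void)
{
    struct demo_lock lk = { PTHREAD_MUTEX_INITIALIZER, false, 0 };
    bool ok;

    ok = lock_addref_try(&lk);
    printf("first try: %s (refs=%d)\n", ok ? "pinned" : "wait", lk.refs);

    lk.cancel_pending = true;   /* a blocking AST marked the lock */
    ok = lock_addref_try(&lk);
    printf("after cancel pending: %s (refs=%d)\n",
           ok ? "pinned" : "wait", lk.refs);
    return 0;
}
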
"discarded" : "written"); + if (result > 0) + result = 0; + } + + rc = cl_lock_discard_pages(env, lock); + if (result == 0 && rc < 0) + result = rc; + + cl_env_nested_put(&nest, env); + } else + result = PTR_ERR(env); + if (result == 0) { + ols->ols_flush = 1; + LINVRNT(!osc_lock_has_pages(ols)); + } + return result; +} + +/** + * Implements cl_lock_operations::clo_cancel() method for osc layer. This is + * called (as part of cl_lock_cancel()) when lock is canceled either voluntary + * (LRU pressure, early cancellation, umount, etc.) or due to the conflict + * with some other lock some where in the cluster. This function does the + * following: + * + * - invalidates all pages protected by this lock (after sending dirty + * ones to the server, as necessary); + * + * - decref's underlying ldlm lock; + * + * - cancels ldlm lock (ldlm_cli_cancel()). + */ +static void osc_lock_cancel(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct cl_lock *lock = slice->cls_lock; + struct osc_lock *olck = cl2osc_lock(slice); + struct ldlm_lock *dlmlock = olck->ols_lock; + int result = 0; + int discard; + + LASSERT(cl_lock_is_mutexed(lock)); + LINVRNT(osc_lock_invariant(olck)); + + if (dlmlock != NULL) { + int do_cancel; + + discard = !!(dlmlock->l_flags & LDLM_FL_DISCARD_DATA); + if (olck->ols_state >= OLS_GRANTED) + result = osc_lock_flush(olck, discard); + osc_lock_unhold(olck); + + lock_res_and_lock(dlmlock); + /* Now that we're the only user of dlm read/write reference, + * mostly the ->l_readers + ->l_writers should be zero. + * However, there is a corner case. + * See bug 18829 for details.*/ + do_cancel = (dlmlock->l_readers == 0 && + dlmlock->l_writers == 0); + dlmlock->l_flags |= LDLM_FL_CBPENDING; + unlock_res_and_lock(dlmlock); + if (do_cancel) + result = ldlm_cli_cancel(&olck->ols_handle, LCF_ASYNC); + if (result < 0) + CL_LOCK_DEBUG(D_ERROR, env, lock, + "lock %p cancel failure with error(%d)\n", + lock, result); + } + olck->ols_state = OLS_CANCELLED; + olck->ols_flags &= ~LDLM_FL_LVB_READY; + osc_lock_detach(env, olck); +} + +static int osc_lock_has_pages(struct osc_lock *olck) +{ + return 0; +} + +static void osc_lock_delete(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct osc_lock *olck; + + olck = cl2osc_lock(slice); + if (olck->ols_glimpse) { + LASSERT(!olck->ols_hold); + LASSERT(!olck->ols_lock); + return; + } + + LINVRNT(osc_lock_invariant(olck)); + LINVRNT(!osc_lock_has_pages(olck)); + + osc_lock_unhold(olck); + osc_lock_detach(env, olck); +} + +/** + * Implements cl_lock_operations::clo_state() method for osc layer. + * + * Maintains osc_lock::ols_owner field. + * + * This assumes that lock always enters CLS_HELD (from some other state) in + * the same IO context as one that requested the lock. This should not be a + * problem, because context is by definition shared by all activity pertaining + * to the same high-level IO. + */ +static void osc_lock_state(const struct lu_env *env, + const struct cl_lock_slice *slice, + enum cl_lock_state state) +{ + struct osc_lock *lock = cl2osc_lock(slice); + + /* + * XXX multiple io contexts can use the lock at the same time. 
+ */ + LINVRNT(osc_lock_invariant(lock)); + if (state == CLS_HELD && slice->cls_lock->cll_state != CLS_HELD) { + struct osc_io *oio = osc_env_io(env); + + LASSERT(lock->ols_owner == NULL); + lock->ols_owner = oio; + } else if (state != CLS_HELD) + lock->ols_owner = NULL; +} + +static int osc_lock_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct cl_lock_slice *slice) +{ + struct osc_lock *lock = cl2osc_lock(slice); + + /* + * XXX print ldlm lock and einfo properly. + */ + (*p)(env, cookie, "%p %#16llx %#llx %d %p ", + lock->ols_lock, lock->ols_flags, lock->ols_handle.cookie, + lock->ols_state, lock->ols_owner); + osc_lvb_print(env, cookie, p, &lock->ols_lvb); + return 0; +} + +static int osc_lock_fits_into(const struct lu_env *env, + const struct cl_lock_slice *slice, + const struct cl_lock_descr *need, + const struct cl_io *io) +{ + struct osc_lock *ols = cl2osc_lock(slice); + + if (need->cld_enq_flags & CEF_NEVER) + return 0; + + if (ols->ols_state >= OLS_CANCELLED) + return 0; + + if (need->cld_mode == CLM_PHANTOM) { + if (ols->ols_agl) + return !(ols->ols_state > OLS_RELEASED); + + /* + * Note: the QUEUED lock can't be matched here, otherwise + * it might cause the deadlocks. + * In read_process, + * P1: enqueued read lock, create sublock1 + * P2: enqueued write lock, create sublock2(conflicted + * with sublock1). + * P1: Grant read lock. + * P1: enqueued glimpse lock(with holding sublock1_read), + * matched with sublock2, waiting sublock2 to be granted. + * But sublock2 can not be granted, because P1 + * will not release sublock1. Bang! + */ + if (ols->ols_state < OLS_GRANTED || + ols->ols_state > OLS_RELEASED) + return 0; + } else if (need->cld_enq_flags & CEF_MUST) { + /* + * If the lock hasn't ever enqueued, it can't be matched + * because enqueue process brings in many information + * which can be used to determine things such as lockless, + * CEF_MUST, etc. 
+ */ + if (ols->ols_state < OLS_UPCALL_RECEIVED && + ols->ols_locklessable) + return 0; + } + return 1; +} + +static const struct cl_lock_operations osc_lock_ops = { + .clo_fini = osc_lock_fini, + .clo_enqueue = osc_lock_enqueue, + .clo_wait = osc_lock_wait, + .clo_unuse = osc_lock_unuse, + .clo_use = osc_lock_use, + .clo_delete = osc_lock_delete, + .clo_state = osc_lock_state, + .clo_cancel = osc_lock_cancel, + .clo_weigh = osc_lock_weigh, + .clo_print = osc_lock_print, + .clo_fits_into = osc_lock_fits_into, +}; + +static int osc_lock_lockless_unuse(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct osc_lock *ols = cl2osc_lock(slice); + struct cl_lock *lock = slice->cls_lock; + + LASSERT(ols->ols_state == OLS_GRANTED); + LINVRNT(osc_lock_invariant(ols)); + + cl_lock_cancel(env, lock); + cl_lock_delete(env, lock); + return 0; +} + +static void osc_lock_lockless_cancel(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct osc_lock *ols = cl2osc_lock(slice); + int result; + + result = osc_lock_flush(ols, 0); + if (result) + CERROR("Pages for lockless lock %p were not purged(%d)\n", + ols, result); + ols->ols_state = OLS_CANCELLED; +} + +static int osc_lock_lockless_wait(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct osc_lock *olck = cl2osc_lock(slice); + struct cl_lock *lock = olck->ols_cl.cls_lock; + + LINVRNT(osc_lock_invariant(olck)); + LASSERT(olck->ols_state >= OLS_UPCALL_RECEIVED); + + return lock->cll_error; +} + +static void osc_lock_lockless_state(const struct lu_env *env, + const struct cl_lock_slice *slice, + enum cl_lock_state state) +{ + struct osc_lock *lock = cl2osc_lock(slice); + + LINVRNT(osc_lock_invariant(lock)); + if (state == CLS_HELD) { + struct osc_io *oio = osc_env_io(env); + + LASSERT(ergo(lock->ols_owner, lock->ols_owner == oio)); + lock->ols_owner = oio; + + /* set the io to be lockless if this lock is for io's + * host object */ + if (cl_object_same(oio->oi_cl.cis_obj, slice->cls_obj)) + oio->oi_lockless = 1; + } +} + +static int osc_lock_lockless_fits_into(const struct lu_env *env, + const struct cl_lock_slice *slice, + const struct cl_lock_descr *need, + const struct cl_io *io) +{ + struct osc_lock *lock = cl2osc_lock(slice); + + if (!(need->cld_enq_flags & CEF_NEVER)) + return 0; + + /* lockless lock should only be used by its owning io. 
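
osc_lock_fits_into() above encodes when a cached osc lock may satisfy a new request: never for CEF_NEVER, only fully granted locks for glimpse (CLM_PHANTOM) requests to avoid the enqueue deadlock described in the comment, and a not-yet-enqueued locklessable lock cannot promise CEF_MUST. A condensed sketch of those rules as a predicate; the enum values and flag names are simplified stand-ins, and the state list omits intermediate states the driver has.

#include <stdbool.h>
#include <stdio.h>

enum state { NEW, ENQUEUED, UPCALL_RECEIVED, GRANTED, RELEASED, CANCELLED };

#define F_NEVER 0x1     /* caller never wants a DLM lock (like CEF_NEVER) */
#define F_MUST  0x2     /* caller insists on a DLM lock (like CEF_MUST) */

struct candidate {
    enum state state;
    bool agl;           /* asynchronous glimpse lock */
    bool locklessable;
};

static bool fits(const struct candidate *c, unsigned int need_flags,
                 bool need_phantom)
{
    if (need_flags & F_NEVER)
        return false;           /* lockless IO never reuses cached locks */
    if (c->state >= CANCELLED)
        return false;
    if (need_phantom) {
        if (c->agl)
            return c->state <= RELEASED;
        /* Only granted (not merely queued) locks match a glimpse,
         * otherwise the enqueue ordering can deadlock. */
        return c->state >= GRANTED && c->state <= RELEASED;
    }
    if ((need_flags & F_MUST) &&
        c->state < UPCALL_RECEIVED && c->locklessable)
        return false;           /* may still turn lockless later */
    return true;
}

int main(void)
{
    struct candidate c = { .state = ENQUEUED, .agl = false };

    printf("phantom match on a queued lock: %d\n", fits(&c, 0, true));
    return 0;
}
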
b22147 */ + return (lock->ols_owner == osc_env_io(env)); +} + +static const struct cl_lock_operations osc_lock_lockless_ops = { + .clo_fini = osc_lock_fini, + .clo_enqueue = osc_lock_enqueue, + .clo_wait = osc_lock_lockless_wait, + .clo_unuse = osc_lock_lockless_unuse, + .clo_state = osc_lock_lockless_state, + .clo_fits_into = osc_lock_lockless_fits_into, + .clo_cancel = osc_lock_lockless_cancel, + .clo_print = osc_lock_print +}; + +int osc_lock_init(const struct lu_env *env, + struct cl_object *obj, struct cl_lock *lock, + const struct cl_io *unused) +{ + struct osc_lock *clk; + int result; + + OBD_SLAB_ALLOC_PTR_GFP(clk, osc_lock_kmem, GFP_NOFS); + if (clk != NULL) { + __u32 enqflags = lock->cll_descr.cld_enq_flags; + + osc_lock_build_einfo(env, lock, clk, &clk->ols_einfo); + atomic_set(&clk->ols_pageref, 0); + clk->ols_state = OLS_NEW; + + clk->ols_flags = osc_enq2ldlm_flags(enqflags); + clk->ols_agl = !!(enqflags & CEF_AGL); + if (clk->ols_agl) + clk->ols_flags |= LDLM_FL_BLOCK_NOWAIT; + if (clk->ols_flags & LDLM_FL_HAS_INTENT) + clk->ols_glimpse = 1; + + cl_lock_slice_add(lock, &clk->ols_cl, obj, &osc_lock_ops); + + if (!(enqflags & CEF_MUST)) + /* try to convert this lock to a lockless lock */ + osc_lock_to_lockless(env, clk, (enqflags & CEF_NEVER)); + if (clk->ols_locklessable && !(enqflags & CEF_DISCARD_DATA)) + clk->ols_flags |= LDLM_FL_DENY_ON_CONTENTION; + + LDLM_DEBUG_NOLOCK("lock %p, osc lock %p, flags %llx\n", + lock, clk, clk->ols_flags); + + result = 0; + } else + result = -ENOMEM; + return result; +} + +int osc_dlm_lock_pageref(struct ldlm_lock *dlm) +{ + struct osc_lock *olock; + int rc = 0; + + spin_lock(&osc_ast_guard); + olock = dlm->l_ast_data; + /* + * there's a very rare race with osc_page_addref_lock(), but that + * doesn't matter because in the worst case we don't cancel a lock + * which we actually can, that's no harm. + */ + if (olock != NULL && + atomic_add_return(_PAGEREF_MAGIC, + &olock->ols_pageref) != _PAGEREF_MAGIC) { + atomic_sub(_PAGEREF_MAGIC, &olock->ols_pageref); + rc = 1; + } + spin_unlock(&osc_ast_guard); + return rc; +} + +/** @} osc */ diff --git a/kernel/drivers/staging/lustre/lustre/osc/osc_object.c b/kernel/drivers/staging/lustre/lustre/osc/osc_object.c new file mode 100644 index 000000000..92c202f70 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/osc/osc_object.c @@ -0,0 +1,271 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. 
+ * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_object for OSC layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_OSC + +#include "osc_cl_internal.h" + +/** \addtogroup osc + * @{ + */ + +/***************************************************************************** + * + * Type conversions. + * + */ + +static struct lu_object *osc2lu(struct osc_object *osc) +{ + return &osc->oo_cl.co_lu; +} + +static struct osc_object *lu2osc(const struct lu_object *obj) +{ + LINVRNT(osc_is_object(obj)); + return container_of0(obj, struct osc_object, oo_cl.co_lu); +} + +/***************************************************************************** + * + * Object operations. + * + */ + +static int osc_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf) +{ + struct osc_object *osc = lu2osc(obj); + const struct cl_object_conf *cconf = lu2cl_conf(conf); + int i; + + osc->oo_oinfo = cconf->u.coc_oinfo; + spin_lock_init(&osc->oo_seatbelt); + for (i = 0; i < CRT_NR; ++i) + INIT_LIST_HEAD(&osc->oo_inflight[i]); + + INIT_LIST_HEAD(&osc->oo_ready_item); + INIT_LIST_HEAD(&osc->oo_hp_ready_item); + INIT_LIST_HEAD(&osc->oo_write_item); + INIT_LIST_HEAD(&osc->oo_read_item); + + osc->oo_root.rb_node = NULL; + INIT_LIST_HEAD(&osc->oo_hp_exts); + INIT_LIST_HEAD(&osc->oo_urgent_exts); + INIT_LIST_HEAD(&osc->oo_rpc_exts); + INIT_LIST_HEAD(&osc->oo_reading_exts); + atomic_set(&osc->oo_nr_reads, 0); + atomic_set(&osc->oo_nr_writes, 0); + spin_lock_init(&osc->oo_lock); + + cl_object_page_init(lu2cl(obj), sizeof(struct osc_page)); + + return 0; +} + +static void osc_object_free(const struct lu_env *env, struct lu_object *obj) +{ + struct osc_object *osc = lu2osc(obj); + int i; + + for (i = 0; i < CRT_NR; ++i) + LASSERT(list_empty(&osc->oo_inflight[i])); + + LASSERT(list_empty(&osc->oo_ready_item)); + LASSERT(list_empty(&osc->oo_hp_ready_item)); + LASSERT(list_empty(&osc->oo_write_item)); + LASSERT(list_empty(&osc->oo_read_item)); + + LASSERT(osc->oo_root.rb_node == NULL); + LASSERT(list_empty(&osc->oo_hp_exts)); + LASSERT(list_empty(&osc->oo_urgent_exts)); + LASSERT(list_empty(&osc->oo_rpc_exts)); + LASSERT(list_empty(&osc->oo_reading_exts)); + LASSERT(atomic_read(&osc->oo_nr_reads) == 0); + LASSERT(atomic_read(&osc->oo_nr_writes) == 0); + + lu_object_fini(obj); + OBD_SLAB_FREE_PTR(osc, osc_object_kmem); +} + +int osc_lvb_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct ost_lvb *lvb) +{ + return (*p)(env, cookie, "size: %llu mtime: %llu atime: %llu ctime: %llu blocks: %llu", + lvb->lvb_size, lvb->lvb_mtime, lvb->lvb_atime, + lvb->lvb_ctime, lvb->lvb_blocks); +} + +static int osc_object_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *obj) +{ + struct osc_object *osc = lu2osc(obj); + struct lov_oinfo *oinfo = osc->oo_oinfo; + struct osc_async_rc *ar = &oinfo->loi_ar; + + (*p)(env, cookie, "id: " DOSTID " idx: %d gen: %d kms_valid: %u kms %llu rc: %d force_sync: %d min_xid: %llu ", + POSTID(&oinfo->loi_oi), oinfo->loi_ost_idx, + oinfo->loi_ost_gen, oinfo->loi_kms_valid, oinfo->loi_kms, + ar->ar_rc, ar->ar_force_sync, ar->ar_min_xid); + osc_lvb_print(env, cookie, p, &oinfo->loi_lvb); + return 0; +} + + +static int osc_attr_get(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr) +{ + struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo; 
+ + cl_lvb2attr(attr, &oinfo->loi_lvb); + attr->cat_kms = oinfo->loi_kms_valid ? oinfo->loi_kms : 0; + return 0; +} + +int osc_attr_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid) +{ + struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo; + struct ost_lvb *lvb = &oinfo->loi_lvb; + + if (valid & CAT_SIZE) + lvb->lvb_size = attr->cat_size; + if (valid & CAT_MTIME) + lvb->lvb_mtime = attr->cat_mtime; + if (valid & CAT_ATIME) + lvb->lvb_atime = attr->cat_atime; + if (valid & CAT_CTIME) + lvb->lvb_ctime = attr->cat_ctime; + if (valid & CAT_BLOCKS) + lvb->lvb_blocks = attr->cat_blocks; + if (valid & CAT_KMS) { + CDEBUG(D_CACHE, "set kms from %llu to %llu\n", + oinfo->loi_kms, (__u64)attr->cat_kms); + loi_kms_set(oinfo, attr->cat_kms); + } + return 0; +} + +static int osc_object_glimpse(const struct lu_env *env, + const struct cl_object *obj, struct ost_lvb *lvb) +{ + struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo; + + lvb->lvb_size = oinfo->loi_kms; + lvb->lvb_blocks = oinfo->loi_lvb.lvb_blocks; + return 0; +} + + +void osc_object_set_contended(struct osc_object *obj) +{ + obj->oo_contention_time = cfs_time_current(); + /* mb(); */ + obj->oo_contended = 1; +} + +void osc_object_clear_contended(struct osc_object *obj) +{ + obj->oo_contended = 0; +} + +int osc_object_is_contended(struct osc_object *obj) +{ + struct osc_device *dev = lu2osc_dev(obj->oo_cl.co_lu.lo_dev); + int osc_contention_time = dev->od_contention_time; + unsigned long cur_time = cfs_time_current(); + unsigned long retry_time; + + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_OBJECT_CONTENTION)) + return 1; + + if (!obj->oo_contended) + return 0; + + /* + * I like copy-paste. the code is copied from + * ll_file_is_contended. + */ + retry_time = cfs_time_add(obj->oo_contention_time, + cfs_time_seconds(osc_contention_time)); + if (cfs_time_after(cur_time, retry_time)) { + osc_object_clear_contended(obj); + return 0; + } + return 1; +} + +static const struct cl_object_operations osc_ops = { + .coo_page_init = osc_page_init, + .coo_lock_init = osc_lock_init, + .coo_io_init = osc_io_init, + .coo_attr_get = osc_attr_get, + .coo_attr_set = osc_attr_set, + .coo_glimpse = osc_object_glimpse +}; + +static const struct lu_object_operations osc_lu_obj_ops = { + .loo_object_init = osc_object_init, + .loo_object_delete = NULL, + .loo_object_release = NULL, + .loo_object_free = osc_object_free, + .loo_object_print = osc_object_print, + .loo_object_invariant = NULL +}; + +struct lu_object *osc_object_alloc(const struct lu_env *env, + const struct lu_object_header *unused, + struct lu_device *dev) +{ + struct osc_object *osc; + struct lu_object *obj; + + OBD_SLAB_ALLOC_PTR_GFP(osc, osc_object_kmem, GFP_NOFS); + if (osc != NULL) { + obj = osc2lu(osc); + lu_object_init(obj, NULL, dev); + osc->oo_cl.co_ops = &osc_ops; + obj->lo_ops = &osc_lu_obj_ops; + } else + obj = NULL; + return obj; +} + +/** @} osc */ diff --git a/kernel/drivers/staging/lustre/lustre/osc/osc_page.c b/kernel/drivers/staging/lustre/lustre/osc/osc_page.c new file mode 100644 index 000000000..76ba58b09 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/osc/osc_page.c @@ -0,0 +1,916 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
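
osc_object_set_contended() and osc_object_is_contended() above implement a simple time-based back-off: an object is marked contended when contention is observed and stops being treated as contended once the device's contention interval has elapsed. A userspace sketch of the same timeout check using time(2); the field names below are stand-ins for oo_contended/oo_contention_time.

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

struct demo_obj {
    bool contended;             /* models oo_contended */
    time_t contention_time;     /* models oo_contention_time */
};

/* Mark the object contended "now". */
static void obj_set_contended(struct demo_obj *o)
{
    o->contention_time = time(NULL);
    o->contended = true;
}

/* Contention expires contention_seconds after it was last observed,
 * mirroring the cfs_time_add()/cfs_time_after() check above. */
static bool obj_is_contended(struct demo_obj *o, int contention_seconds)
{
    if (!o->contended)
        return false;
    if (time(NULL) > o->contention_time + contention_seconds) {
        o->contended = false;   /* clear, like osc_object_clear_contended() */
        return false;
    }
    return true;
}

int main(void)
{
    struct demo_obj o = { false, 0 };

    obj_set_contended(&o);
    printf("contended right after marking: %d\n", obj_is_contended(&o, 2));
    return 0;
}
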
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_page for OSC layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_OSC + +#include "osc_cl_internal.h" + +static void osc_lru_del(struct client_obd *cli, struct osc_page *opg, bool del); +static void osc_lru_add(struct client_obd *cli, struct osc_page *opg); +static int osc_lru_reserve(const struct lu_env *env, struct osc_object *obj, + struct osc_page *opg); + +/** \addtogroup osc + * @{ + */ + +/* + * Comment out osc_page_protected because it may sleep inside the + * the client_obd_list_lock. + * client_obd_list_lock -> osc_ap_completion -> osc_completion -> + * -> osc_page_protected -> osc_page_is_dlocked -> osc_match_base + * -> ldlm_lock_match -> sptlrpc_import_check_ctx -> sleep. + */ +#if 0 +static int osc_page_is_dlocked(const struct lu_env *env, + const struct osc_page *opg, + enum cl_lock_mode mode, int pending, int unref) +{ + struct cl_page *page; + struct osc_object *obj; + struct osc_thread_info *info; + struct ldlm_res_id *resname; + struct lustre_handle *lockh; + ldlm_policy_data_t *policy; + ldlm_mode_t dlmmode; + __u64 flags; + + might_sleep(); + + info = osc_env_info(env); + resname = &info->oti_resname; + policy = &info->oti_policy; + lockh = &info->oti_handle; + page = opg->ops_cl.cpl_page; + obj = cl2osc(opg->ops_cl.cpl_obj); + + flags = LDLM_FL_TEST_LOCK | LDLM_FL_BLOCK_GRANTED; + if (pending) + flags |= LDLM_FL_CBPENDING; + + dlmmode = osc_cl_lock2ldlm(mode) | LCK_PW; + osc_lock_build_res(env, obj, resname); + osc_index2policy(policy, page->cp_obj, page->cp_index, page->cp_index); + return osc_match_base(osc_export(obj), resname, LDLM_EXTENT, policy, + dlmmode, &flags, NULL, lockh, unref); +} + +/** + * Checks an invariant that a page in the cache is covered by a lock, as + * needed. + */ +static int osc_page_protected(const struct lu_env *env, + const struct osc_page *opg, + enum cl_lock_mode mode, int unref) +{ + struct cl_object_header *hdr; + struct cl_lock *scan; + struct cl_page *page; + struct cl_lock_descr *descr; + int result; + + LINVRNT(!opg->ops_temp); + + page = opg->ops_cl.cpl_page; + if (page->cp_owner != NULL && + cl_io_top(page->cp_owner)->ci_lockreq == CILR_NEVER) + /* + * If IO is done without locks (liblustre, or lloop), lock is + * not required. + */ + result = 1; + else + /* otherwise check for a DLM lock */ + result = osc_page_is_dlocked(env, opg, mode, 1, unref); + if (result == 0) { + /* maybe this page is a part of a lockless io? 
*/ + hdr = cl_object_header(opg->ops_cl.cpl_obj); + descr = &osc_env_info(env)->oti_descr; + descr->cld_mode = mode; + descr->cld_start = page->cp_index; + descr->cld_end = page->cp_index; + spin_lock(&hdr->coh_lock_guard); + list_for_each_entry(scan, &hdr->coh_locks, cll_linkage) { + /* + * Lock-less sub-lock has to be either in HELD state + * (when io is actively going on), or in CACHED state, + * when top-lock is being unlocked: + * cl_io_unlock()->cl_unuse()->...->lov_lock_unuse(). + */ + if ((scan->cll_state == CLS_HELD || + scan->cll_state == CLS_CACHED) && + cl_lock_ext_match(&scan->cll_descr, descr)) { + struct osc_lock *olck; + + olck = osc_lock_at(scan); + result = osc_lock_is_lockless(olck); + break; + } + } + spin_unlock(&hdr->coh_lock_guard); + } + return result; +} +#else +static int osc_page_protected(const struct lu_env *env, + const struct osc_page *opg, + enum cl_lock_mode mode, int unref) +{ + return 1; +} +#endif + +/***************************************************************************** + * + * Page operations. + * + */ +static void osc_page_fini(const struct lu_env *env, + struct cl_page_slice *slice) +{ + struct osc_page *opg = cl2osc_page(slice); + CDEBUG(D_TRACE, "%p\n", opg); + LASSERT(opg->ops_lock == NULL); +} + +static void osc_page_transfer_get(struct osc_page *opg, const char *label) +{ + struct cl_page *page = cl_page_top(opg->ops_cl.cpl_page); + + LASSERT(!opg->ops_transfer_pinned); + cl_page_get(page); + lu_ref_add_atomic(&page->cp_reference, label, page); + opg->ops_transfer_pinned = 1; +} + +static void osc_page_transfer_put(const struct lu_env *env, + struct osc_page *opg) +{ + struct cl_page *page = cl_page_top(opg->ops_cl.cpl_page); + + if (opg->ops_transfer_pinned) { + lu_ref_del(&page->cp_reference, "transfer", page); + opg->ops_transfer_pinned = 0; + cl_page_put(env, page); + } +} + +/** + * This is called once for every page when it is submitted for a transfer + * either opportunistic (osc_page_cache_add()), or immediate + * (osc_page_submit()). + */ +static void osc_page_transfer_add(const struct lu_env *env, + struct osc_page *opg, enum cl_req_type crt) +{ + struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj); + + /* ops_lru and ops_inflight share the same field, so take it from LRU + * first and then use it as inflight. */ + osc_lru_del(osc_cli(obj), opg, false); + + spin_lock(&obj->oo_seatbelt); + list_add(&opg->ops_inflight, &obj->oo_inflight[crt]); + opg->ops_submitter = current; + spin_unlock(&obj->oo_seatbelt); +} + +static int osc_page_cache_add(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io) +{ + struct osc_io *oio = osc_env_io(env); + struct osc_page *opg = cl2osc_page(slice); + int result; + + LINVRNT(osc_page_protected(env, opg, CLM_WRITE, 0)); + + osc_page_transfer_get(opg, "transfer\0cache"); + result = osc_queue_async_io(env, io, opg); + if (result != 0) + osc_page_transfer_put(env, opg); + else + osc_page_transfer_add(env, opg, CRT_WRITE); + + /* for sync write, kernel will wait for this page to be flushed before + * osc_io_end() is called, so release it earlier. + * for mkwrite(), it's known there is no further pages. 
*/ + if (cl_io_is_sync_write(io) || cl_io_is_mkwrite(io)) { + if (oio->oi_active != NULL) { + osc_extent_release(env, oio->oi_active); + oio->oi_active = NULL; + } + } + + return result; +} + +void osc_index2policy(ldlm_policy_data_t *policy, const struct cl_object *obj, + pgoff_t start, pgoff_t end) +{ + memset(policy, 0, sizeof(*policy)); + policy->l_extent.start = cl_offset(obj, start); + policy->l_extent.end = cl_offset(obj, end + 1) - 1; +} + +static int osc_page_addref_lock(const struct lu_env *env, + struct osc_page *opg, + struct cl_lock *lock) +{ + struct osc_lock *olock; + int rc; + + LASSERT(opg->ops_lock == NULL); + + olock = osc_lock_at(lock); + if (atomic_inc_return(&olock->ols_pageref) <= 0) { + atomic_dec(&olock->ols_pageref); + rc = -ENODATA; + } else { + cl_lock_get(lock); + opg->ops_lock = lock; + rc = 0; + } + return rc; +} + +static void osc_page_putref_lock(const struct lu_env *env, + struct osc_page *opg) +{ + struct cl_lock *lock = opg->ops_lock; + struct osc_lock *olock; + + LASSERT(lock != NULL); + olock = osc_lock_at(lock); + + atomic_dec(&olock->ols_pageref); + opg->ops_lock = NULL; + + cl_lock_put(env, lock); +} + +static int osc_page_is_under_lock(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + struct cl_lock *lock; + int result = -ENODATA; + + lock = cl_lock_at_page(env, slice->cpl_obj, slice->cpl_page, + NULL, 1, 0); + if (lock != NULL) { + if (osc_page_addref_lock(env, cl2osc_page(slice), lock) == 0) + result = -EBUSY; + cl_lock_put(env, lock); + } + return result; +} + +static void osc_page_disown(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io) +{ + struct osc_page *opg = cl2osc_page(slice); + + if (unlikely(opg->ops_lock)) + osc_page_putref_lock(env, opg); +} + +static void osc_page_completion_read(const struct lu_env *env, + const struct cl_page_slice *slice, + int ioret) +{ + struct osc_page *opg = cl2osc_page(slice); + struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj); + + if (likely(opg->ops_lock)) + osc_page_putref_lock(env, opg); + osc_lru_add(osc_cli(obj), opg); +} + +static void osc_page_completion_write(const struct lu_env *env, + const struct cl_page_slice *slice, + int ioret) +{ + struct osc_page *opg = cl2osc_page(slice); + struct osc_object *obj = cl2osc(slice->cpl_obj); + + osc_lru_add(osc_cli(obj), opg); +} + +static int osc_page_fail(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + /* + * Cached read? + */ + LBUG(); + return 0; +} + + +static const char *osc_list(struct list_head *head) +{ + return list_empty(head) ? 
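
osc_page_addref_lock()/osc_page_putref_lock() above keep a per-lock count of pages that still reference the lock, and a non-positive result from the increment means the reference can no longer be taken. The cancel side (osc_dlm_lock_pageref(), earlier in this file) biases the same counter with _PAGEREF_MAGIC to detect outstanding page references. Below is a minimal stdatomic model of that protocol; the sentinel value is an assumption for illustration, the real constant is defined in osc_cl_internal.h outside this hunk.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define PAGEREF_BIAS (-10000000)    /* hypothetical large negative sentinel */

/* A page takes a reference only while the counter is "live"
 * (non-negative, i.e. the bias has not been applied yet). */
static bool page_addref(atomic_int *pageref)
{
    if (atomic_fetch_add(pageref, 1) + 1 <= 0) {
        atomic_fetch_sub(pageref, 1);   /* undo, lock is being torn down */
        return false;
    }
    return true;
}

static void page_putref(atomic_int *pageref)
{
    atomic_fetch_sub(pageref, 1);
}

/* Cancel side: add the bias; if the result is not exactly the bias,
 * some pages still hold references, so undo and report that. */
static bool lock_has_page_users(atomic_int *pageref)
{
    if (atomic_fetch_add(pageref, PAGEREF_BIAS) + PAGEREF_BIAS !=
        PAGEREF_BIAS) {
        atomic_fetch_sub(pageref, PAGEREF_BIAS);
        return true;
    }
    return false;
}

int main(void)
{
    atomic_int pageref = 0;

    page_addref(&pageref);
    printf("page users at cancel time: %d\n",
           lock_has_page_users(&pageref));
    page_putref(&pageref);
    return 0;
}
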
"-" : "+"; +} + +static inline unsigned long osc_submit_duration(struct osc_page *opg) +{ + if (opg->ops_submit_time == 0) + return 0; + + return (cfs_time_current() - opg->ops_submit_time); +} + +static int osc_page_print(const struct lu_env *env, + const struct cl_page_slice *slice, + void *cookie, lu_printer_t printer) +{ + struct osc_page *opg = cl2osc_page(slice); + struct osc_async_page *oap = &opg->ops_oap; + struct osc_object *obj = cl2osc(slice->cpl_obj); + struct client_obd *cli = &osc_export(obj)->exp_obd->u.cli; + + return (*printer)(env, cookie, LUSTRE_OSC_NAME "-page@%p: 1< %#x %d %u %s %s > 2< %llu %u %u %#x %#x | %p %p %p > 3< %s %p %d %lu %d > 4< %d %d %d %lu %s | %s %s %s %s > 5< %s %s %s %s | %d %s | %d %s %s>\n", + opg, + /* 1 */ + oap->oap_magic, oap->oap_cmd, + oap->oap_interrupted, + osc_list(&oap->oap_pending_item), + osc_list(&oap->oap_rpc_item), + /* 2 */ + oap->oap_obj_off, oap->oap_page_off, oap->oap_count, + oap->oap_async_flags, oap->oap_brw_flags, + oap->oap_request, oap->oap_cli, obj, + /* 3 */ + osc_list(&opg->ops_inflight), + opg->ops_submitter, opg->ops_transfer_pinned, + osc_submit_duration(opg), opg->ops_srvlock, + /* 4 */ + cli->cl_r_in_flight, cli->cl_w_in_flight, + cli->cl_max_rpcs_in_flight, + cli->cl_avail_grant, + osc_list(&cli->cl_cache_waiters), + osc_list(&cli->cl_loi_ready_list), + osc_list(&cli->cl_loi_hp_ready_list), + osc_list(&cli->cl_loi_write_list), + osc_list(&cli->cl_loi_read_list), + /* 5 */ + osc_list(&obj->oo_ready_item), + osc_list(&obj->oo_hp_ready_item), + osc_list(&obj->oo_write_item), + osc_list(&obj->oo_read_item), + atomic_read(&obj->oo_nr_reads), + osc_list(&obj->oo_reading_exts), + atomic_read(&obj->oo_nr_writes), + osc_list(&obj->oo_hp_exts), + osc_list(&obj->oo_urgent_exts)); +} + +static void osc_page_delete(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + struct osc_page *opg = cl2osc_page(slice); + struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj); + int rc; + + LINVRNT(opg->ops_temp || osc_page_protected(env, opg, CLM_READ, 1)); + + CDEBUG(D_TRACE, "%p\n", opg); + osc_page_transfer_put(env, opg); + rc = osc_teardown_async_page(env, obj, opg); + if (rc) { + CL_PAGE_DEBUG(D_ERROR, env, cl_page_top(slice->cpl_page), + "Trying to teardown failed: %d\n", rc); + LASSERT(0); + } + + spin_lock(&obj->oo_seatbelt); + if (opg->ops_submitter != NULL) { + LASSERT(!list_empty(&opg->ops_inflight)); + list_del_init(&opg->ops_inflight); + opg->ops_submitter = NULL; + } + spin_unlock(&obj->oo_seatbelt); + + osc_lru_del(osc_cli(obj), opg, true); +} + +void osc_page_clip(const struct lu_env *env, const struct cl_page_slice *slice, + int from, int to) +{ + struct osc_page *opg = cl2osc_page(slice); + struct osc_async_page *oap = &opg->ops_oap; + + LINVRNT(osc_page_protected(env, opg, CLM_READ, 0)); + + opg->ops_from = from; + opg->ops_to = to; + spin_lock(&oap->oap_lock); + oap->oap_async_flags |= ASYNC_COUNT_STABLE; + spin_unlock(&oap->oap_lock); +} + +static int osc_page_cancel(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + struct osc_page *opg = cl2osc_page(slice); + int rc = 0; + + LINVRNT(osc_page_protected(env, opg, CLM_READ, 0)); + + /* Check if the transferring against this page + * is completed, or not even queued. */ + if (opg->ops_transfer_pinned) + /* FIXME: may not be interrupted.. 
*/ + rc = osc_cancel_async_page(env, opg); + LASSERT(ergo(rc == 0, opg->ops_transfer_pinned == 0)); + return rc; +} + +static int osc_page_flush(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io) +{ + struct osc_page *opg = cl2osc_page(slice); + int rc = 0; + + rc = osc_flush_async_page(env, io, opg); + return rc; +} + +static const struct cl_page_operations osc_page_ops = { + .cpo_fini = osc_page_fini, + .cpo_print = osc_page_print, + .cpo_delete = osc_page_delete, + .cpo_is_under_lock = osc_page_is_under_lock, + .cpo_disown = osc_page_disown, + .io = { + [CRT_READ] = { + .cpo_cache_add = osc_page_fail, + .cpo_completion = osc_page_completion_read + }, + [CRT_WRITE] = { + .cpo_cache_add = osc_page_cache_add, + .cpo_completion = osc_page_completion_write + } + }, + .cpo_clip = osc_page_clip, + .cpo_cancel = osc_page_cancel, + .cpo_flush = osc_page_flush +}; + +int osc_page_init(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, struct page *vmpage) +{ + struct osc_object *osc = cl2osc(obj); + struct osc_page *opg = cl_object_page_slice(obj, page); + int result; + + opg->ops_from = 0; + opg->ops_to = PAGE_CACHE_SIZE; + + result = osc_prep_async_page(osc, opg, vmpage, + cl_offset(obj, page->cp_index)); + if (result == 0) { + struct osc_io *oio = osc_env_io(env); + opg->ops_srvlock = osc_io_srvlock(oio); + cl_page_slice_add(page, &opg->ops_cl, obj, + &osc_page_ops); + } + /* + * Cannot assert osc_page_protected() here as read-ahead + * creates temporary pages outside of a lock. + */ + /* ops_inflight and ops_lru are the same field, but it doesn't + * hurt to initialize it twice :-) */ + INIT_LIST_HEAD(&opg->ops_inflight); + INIT_LIST_HEAD(&opg->ops_lru); + + /* reserve an LRU space for this page */ + if (page->cp_type == CPT_CACHEABLE && result == 0) + result = osc_lru_reserve(env, osc, opg); + + return result; +} + +/** + * Helper function called by osc_io_submit() for every page in an immediate + * transfer (i.e., transferred synchronously). + */ +void osc_page_submit(const struct lu_env *env, struct osc_page *opg, + enum cl_req_type crt, int brw_flags) +{ + struct osc_async_page *oap = &opg->ops_oap; + struct osc_object *obj = oap->oap_obj; + + LINVRNT(osc_page_protected(env, opg, + crt == CRT_WRITE ? CLM_WRITE : CLM_READ, 1)); + + LASSERTF(oap->oap_magic == OAP_MAGIC, "Bad oap magic: oap %p, magic 0x%x\n", + oap, oap->oap_magic); + LASSERT(oap->oap_async_flags & ASYNC_READY); + LASSERT(oap->oap_async_flags & ASYNC_COUNT_STABLE); + + oap->oap_cmd = crt == CRT_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ; + oap->oap_page_off = opg->ops_from; + oap->oap_count = opg->ops_to - opg->ops_from; + oap->oap_brw_flags = OBD_BRW_SYNC | brw_flags; + + if (!client_is_remote(osc_export(obj)) && + capable(CFS_CAP_SYS_RESOURCE)) { + oap->oap_brw_flags |= OBD_BRW_NOQUOTA; + oap->oap_cmd |= OBD_BRW_NOQUOTA; + } + + opg->ops_submit_time = cfs_time_current(); + osc_page_transfer_get(opg, "transfer\0imm"); + osc_page_transfer_add(env, opg, crt); +} + +/* --------------- LRU page management ------------------ */ + +/* OSC is a natural place to manage LRU pages as applications are specialized + * to write OSC by OSC. Ideally, if one OSC is used more frequently it should + * occupy more LRU slots. 
On the other hand, we should avoid using up all LRU + * slots (client_obd::cl_lru_left) otherwise process has to be put into sleep + * for free LRU slots - this will be very bad so the algorithm requires each + * OSC to free slots voluntarily to maintain a reasonable number of free slots + * at any time. + */ + +static DECLARE_WAIT_QUEUE_HEAD(osc_lru_waitq); +static atomic_t osc_lru_waiters = ATOMIC_INIT(0); +/* LRU pages are freed in batch mode. OSC should at least free this + * number of pages to avoid running out of LRU budget, and.. */ +static const int lru_shrink_min = 2 << (20 - PAGE_CACHE_SHIFT); /* 2M */ +/* free this number at most otherwise it will take too long time to finish. */ +static const int lru_shrink_max = 32 << (20 - PAGE_CACHE_SHIFT); /* 32M */ + +/* Check if we can free LRU slots from this OSC. If there exists LRU waiters, + * we should free slots aggressively. In this way, slots are freed in a steady + * step to maintain fairness among OSCs. + * + * Return how many LRU pages should be freed. */ +static int osc_cache_too_much(struct client_obd *cli) +{ + struct cl_client_cache *cache = cli->cl_cache; + int pages = atomic_read(&cli->cl_lru_in_list) >> 1; + + if (atomic_read(&osc_lru_waiters) > 0 && + atomic_read(cli->cl_lru_left) < lru_shrink_max) + /* drop lru pages aggressively */ + return min(pages, lru_shrink_max); + + /* if it's going to run out LRU slots, we should free some, but not + * too much to maintain fairness among OSCs. */ + if (atomic_read(cli->cl_lru_left) < cache->ccc_lru_max >> 4) { + unsigned long tmp; + + tmp = cache->ccc_lru_max / atomic_read(&cache->ccc_users); + if (pages > tmp) + return min(pages, lru_shrink_max); + + return pages > lru_shrink_min ? lru_shrink_min : 0; + } + + return 0; +} + +/* Return how many pages are not discarded in @pvec. */ +static int discard_pagevec(const struct lu_env *env, struct cl_io *io, + struct cl_page **pvec, int max_index) +{ + int count; + int i; + + for (count = 0, i = 0; i < max_index; i++) { + struct cl_page *page = pvec[i]; + if (cl_page_own_try(env, io, page) == 0) { + /* free LRU page only if nobody is using it. + * This check is necessary to avoid freeing the pages + * having already been removed from LRU and pinned + * for IO. */ + if (!cl_page_in_use(page)) { + cl_page_unmap(env, io, page); + cl_page_discard(env, io, page); + ++count; + } + cl_page_disown(env, io, page); + } + cl_page_put(env, page); + pvec[i] = NULL; + } + return max_index - count; +} + +/** + * Drop @target of pages from LRU at most. 
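
osc_cache_too_much() above decides how many LRU pages an OSC should give back: shrink aggressively (up to lru_shrink_max) when other threads are waiting for slots, shrink toward a fair share when the global budget runs low, and otherwise free nothing or a small lru_shrink_min batch. A pure-arithmetic sketch of that policy follows; the constants assume 4 KiB pages, matching the 2M/32M comments above.

#include <stdio.h>

#define LRU_SHRINK_MIN (2 << (20 - 12))     /* 2 MiB worth of 4 KiB pages */
#define LRU_SHRINK_MAX (32 << (20 - 12))    /* 32 MiB worth of 4 KiB pages */

static int min_int(int a, int b) { return a < b ? a : b; }

/* Returns how many LRU pages this client should free, mirroring the
 * three branches of osc_cache_too_much(). */
static int cache_too_much(int lru_in_list, int lru_left, int lru_max,
                          int users, int waiters)
{
    int pages = lru_in_list >> 1;   /* consider at most half our pages */

    if (waiters > 0 && lru_left < LRU_SHRINK_MAX)
        return min_int(pages, LRU_SHRINK_MAX); /* others are starving */

    if (lru_left < lru_max >> 4) {  /* global budget below 1/16th */
        int fair_share = lru_max / users;

        if (pages > fair_share)
            return min_int(pages, LRU_SHRINK_MAX);
        return pages > LRU_SHRINK_MIN ? LRU_SHRINK_MIN : 0;
    }
    return 0;
}

int main(void)
{
    /* 100k cached pages, 1k slots left out of 1M, 8 OSCs, no waiters. */
    printf("pages to free: %d\n",
           cache_too_much(100000, 1000, 1 << 20, 8, 0));
    return 0;
}
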
+ */ +int osc_lru_shrink(struct client_obd *cli, int target) +{ + struct cl_env_nest nest; + struct lu_env *env; + struct cl_io *io; + struct cl_object *clobj = NULL; + struct cl_page **pvec; + struct osc_page *opg; + int maxscan = 0; + int count = 0; + int index = 0; + int rc = 0; + + LASSERT(atomic_read(&cli->cl_lru_in_list) >= 0); + if (atomic_read(&cli->cl_lru_in_list) == 0 || target <= 0) + return 0; + + env = cl_env_nested_get(&nest); + if (IS_ERR(env)) + return PTR_ERR(env); + + pvec = osc_env_info(env)->oti_pvec; + io = &osc_env_info(env)->oti_io; + + client_obd_list_lock(&cli->cl_lru_list_lock); + atomic_inc(&cli->cl_lru_shrinkers); + maxscan = min(target << 1, atomic_read(&cli->cl_lru_in_list)); + while (!list_empty(&cli->cl_lru_list)) { + struct cl_page *page; + + if (--maxscan < 0) + break; + + opg = list_entry(cli->cl_lru_list.next, struct osc_page, + ops_lru); + page = cl_page_top(opg->ops_cl.cpl_page); + if (cl_page_in_use_noref(page)) { + list_move_tail(&opg->ops_lru, &cli->cl_lru_list); + continue; + } + + LASSERT(page->cp_obj != NULL); + if (clobj != page->cp_obj) { + struct cl_object *tmp = page->cp_obj; + + cl_object_get(tmp); + client_obd_list_unlock(&cli->cl_lru_list_lock); + + if (clobj != NULL) { + count -= discard_pagevec(env, io, pvec, index); + index = 0; + + cl_io_fini(env, io); + cl_object_put(env, clobj); + clobj = NULL; + } + + clobj = tmp; + io->ci_obj = clobj; + io->ci_ignore_layout = 1; + rc = cl_io_init(env, io, CIT_MISC, clobj); + + client_obd_list_lock(&cli->cl_lru_list_lock); + + if (rc != 0) + break; + + ++maxscan; + continue; + } + + /* move this page to the end of list as it will be discarded + * soon. The page will be finally removed from LRU list in + * osc_page_delete(). */ + list_move_tail(&opg->ops_lru, &cli->cl_lru_list); + + /* it's okay to grab a refcount here w/o holding lock because + * it has to grab cl_lru_list_lock to delete the page. */ + cl_page_get(page); + pvec[index++] = page; + if (++count >= target) + break; + + if (unlikely(index == OTI_PVEC_SIZE)) { + client_obd_list_unlock(&cli->cl_lru_list_lock); + count -= discard_pagevec(env, io, pvec, index); + index = 0; + + client_obd_list_lock(&cli->cl_lru_list_lock); + } + } + client_obd_list_unlock(&cli->cl_lru_list_lock); + + if (clobj != NULL) { + count -= discard_pagevec(env, io, pvec, index); + + cl_io_fini(env, io); + cl_object_put(env, clobj); + } + cl_env_nested_put(&nest, env); + + atomic_dec(&cli->cl_lru_shrinkers); + return count > 0 ? count : rc; +} + +static void osc_lru_add(struct client_obd *cli, struct osc_page *opg) +{ + bool wakeup = false; + + if (!opg->ops_in_lru) + return; + + atomic_dec(&cli->cl_lru_busy); + client_obd_list_lock(&cli->cl_lru_list_lock); + if (list_empty(&opg->ops_lru)) { + list_move_tail(&opg->ops_lru, &cli->cl_lru_list); + atomic_inc_return(&cli->cl_lru_in_list); + wakeup = atomic_read(&osc_lru_waiters) > 0; + } + client_obd_list_unlock(&cli->cl_lru_list_lock); + + if (wakeup) { + osc_lru_shrink(cli, osc_cache_too_much(cli)); + wake_up_all(&osc_lru_waitq); + } +} + +/* delete page from LRUlist. The page can be deleted from LRUlist for two + * reasons: redirtied or deleted from page cache. 
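
osc_lru_shrink() above collects victim pages into a fixed-size vector (oti_pvec) while holding the list lock and flushes the batch with discard_pagevec() whenever the vector fills or the object changes, so the lock is never held across the expensive discard work. A generic sketch of that "collect under the lock, process outside it" batching pattern; the types and sizes here are illustrative only.

#include <pthread.h>
#include <stdio.h>

#define PVEC_SIZE 16    /* stands in for OTI_PVEC_SIZE */

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

/* Expensive per-item work done outside the lock (discarding pages in
 * the driver); here it just clears and counts the batch. */
static int process_batch(int *pvec, int count)
{
    int i, done = 0;

    for (i = 0; i < count; i++, done++)
        pvec[i] = 0;
    return done;
}

static int shrink(const int *items, int nr_items, int target)
{
    int pvec[PVEC_SIZE];
    int index = 0, freed = 0, i;

    pthread_mutex_lock(&list_lock);
    for (i = 0; i < nr_items && freed + index < target; i++) {
        pvec[index++] = items[i];       /* cheap: grab a reference */
        if (index == PVEC_SIZE) {
            pthread_mutex_unlock(&list_lock);
            freed += process_batch(pvec, index);    /* slow, lock dropped */
            index = 0;
            pthread_mutex_lock(&list_lock);
        }
    }
    pthread_mutex_unlock(&list_lock);
    freed += process_batch(pvec, index);            /* final partial batch */
    return freed;
}

int main(void)
{
    int items[100] = { 0 };

    printf("freed %d of target 40\n", shrink(items, 100, 40));
    return 0;
}
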
*/ +static void osc_lru_del(struct client_obd *cli, struct osc_page *opg, bool del) +{ + if (opg->ops_in_lru) { + client_obd_list_lock(&cli->cl_lru_list_lock); + if (!list_empty(&opg->ops_lru)) { + LASSERT(atomic_read(&cli->cl_lru_in_list) > 0); + list_del_init(&opg->ops_lru); + atomic_dec(&cli->cl_lru_in_list); + if (!del) + atomic_inc(&cli->cl_lru_busy); + } else if (del) { + LASSERT(atomic_read(&cli->cl_lru_busy) > 0); + atomic_dec(&cli->cl_lru_busy); + } + client_obd_list_unlock(&cli->cl_lru_list_lock); + if (del) { + atomic_inc(cli->cl_lru_left); + /* this is a great place to release more LRU pages if + * this osc occupies too many LRU pages and kernel is + * stealing one of them. + * cl_lru_shrinkers is to avoid recursive call in case + * we're already in the context of osc_lru_shrink(). */ + if (atomic_read(&cli->cl_lru_shrinkers) == 0 && + !memory_pressure_get()) + osc_lru_shrink(cli, osc_cache_too_much(cli)); + wake_up(&osc_lru_waitq); + } + } else { + LASSERT(list_empty(&opg->ops_lru)); + } +} + +static inline int max_to_shrink(struct client_obd *cli) +{ + return min(atomic_read(&cli->cl_lru_in_list) >> 1, lru_shrink_max); +} + +static int osc_lru_reclaim(struct client_obd *cli) +{ + struct cl_client_cache *cache = cli->cl_cache; + int max_scans; + int rc; + + LASSERT(cache != NULL); + LASSERT(!list_empty(&cache->ccc_lru)); + + rc = osc_lru_shrink(cli, lru_shrink_min); + if (rc != 0) { + CDEBUG(D_CACHE, "%s: Free %d pages from own LRU: %p.\n", + cli->cl_import->imp_obd->obd_name, rc, cli); + return rc; + } + + CDEBUG(D_CACHE, "%s: cli %p no free slots, pages: %d, busy: %d.\n", + cli->cl_import->imp_obd->obd_name, cli, + atomic_read(&cli->cl_lru_in_list), + atomic_read(&cli->cl_lru_busy)); + + /* Reclaim LRU slots from other client_obd as it can't free enough + * from its own. This should rarely happen. */ + spin_lock(&cache->ccc_lru_lock); + cache->ccc_lru_shrinkers++; + list_move_tail(&cli->cl_lru_osc, &cache->ccc_lru); + + max_scans = atomic_read(&cache->ccc_users); + while (--max_scans > 0 && !list_empty(&cache->ccc_lru)) { + cli = list_entry(cache->ccc_lru.next, struct client_obd, + cl_lru_osc); + + CDEBUG(D_CACHE, "%s: cli %p LRU pages: %d, busy: %d.\n", + cli->cl_import->imp_obd->obd_name, cli, + atomic_read(&cli->cl_lru_in_list), + atomic_read(&cli->cl_lru_busy)); + + list_move_tail(&cli->cl_lru_osc, &cache->ccc_lru); + if (atomic_read(&cli->cl_lru_in_list) > 0) { + spin_unlock(&cache->ccc_lru_lock); + + rc = osc_lru_shrink(cli, max_to_shrink(cli)); + spin_lock(&cache->ccc_lru_lock); + if (rc != 0) + break; + } + } + spin_unlock(&cache->ccc_lru_lock); + + CDEBUG(D_CACHE, "%s: cli %p freed %d pages.\n", + cli->cl_import->imp_obd->obd_name, cli, rc); + return rc; +} + +static int osc_lru_reserve(const struct lu_env *env, struct osc_object *obj, + struct osc_page *opg) +{ + struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); + struct client_obd *cli = osc_cli(obj); + int rc = 0; + + if (cli->cl_cache == NULL) /* shall not be in LRU */ + return 0; + + LASSERT(atomic_read(cli->cl_lru_left) >= 0); + while (!atomic_add_unless(cli->cl_lru_left, -1, 0)) { + int gen; + + /* run out of LRU spaces, try to drop some by itself */ + rc = osc_lru_reclaim(cli); + if (rc < 0) + break; + if (rc > 0) + continue; + + cond_resched(); + + /* slowest case, all of caching pages are busy, notifying + * other OSCs that we're lack of LRU slots. 
*/ + atomic_inc(&osc_lru_waiters); + + gen = atomic_read(&cli->cl_lru_in_list); + rc = l_wait_event(osc_lru_waitq, + atomic_read(cli->cl_lru_left) > 0 || + (atomic_read(&cli->cl_lru_in_list) > 0 && + gen != atomic_read(&cli->cl_lru_in_list)), + &lwi); + + atomic_dec(&osc_lru_waiters); + if (rc < 0) + break; + } + + if (rc >= 0) { + atomic_inc(&cli->cl_lru_busy); + opg->ops_in_lru = 1; + rc = 0; + } + + return rc; +} + +/** @} osc */ diff --git a/kernel/drivers/staging/lustre/lustre/osc/osc_quota.c b/kernel/drivers/staging/lustre/lustre/osc/osc_quota.c new file mode 100644 index 000000000..6690f149a --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/osc/osc_quota.c @@ -0,0 +1,327 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2012, Intel Corporation. + * + * Code originally extracted from quota directory + */ + +#include "../include/obd_class.h" +#include "osc_internal.h" + +static inline struct osc_quota_info *osc_oqi_alloc(u32 id) +{ + struct osc_quota_info *oqi; + + OBD_SLAB_ALLOC_PTR(oqi, osc_quota_kmem); + if (oqi != NULL) + oqi->oqi_id = id; + + return oqi; +} + +int osc_quota_chkdq(struct client_obd *cli, const unsigned int qid[]) +{ + int type; + + for (type = 0; type < MAXQUOTAS; type++) { + struct osc_quota_info *oqi; + + oqi = cfs_hash_lookup(cli->cl_quota_hash[type], &qid[type]); + if (oqi) { + /* do not try to access oqi here, it could have been + * freed by osc_quota_setdq() */ + + /* the slot is busy, the user is about to run out of + * quota space on this OST */ + CDEBUG(D_QUOTA, "chkdq found noquota for %s %d\n", + type == USRQUOTA ? "user" : "grout", qid[type]); + return NO_QUOTA; + } + } + + return QUOTA_OK; +} + +#define MD_QUOTA_FLAG(type) ((type == USRQUOTA) ? OBD_MD_FLUSRQUOTA \ + : OBD_MD_FLGRPQUOTA) +#define FL_QUOTA_FLAG(type) ((type == USRQUOTA) ? 
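
osc_lru_reserve(), shown above, takes one slot from the shared cl_lru_left budget with an "add unless zero" operation and, when the budget is exhausted, alternates between reclaiming slots itself and sleeping until a slot frees up or the LRU generation changes. A compact stdatomic model of that reservation step; the reclaim and wait parts are reduced to stubs and all names are illustrative.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int lru_left = 4;     /* shared budget of free LRU slots */

/* Take one slot only if the budget is still positive, mirroring
 * atomic_add_unless(cli->cl_lru_left, -1, 0). */
static bool take_slot(void)
{
    int cur = atomic_load(&lru_left);

    while (cur > 0) {
        if (atomic_compare_exchange_weak(&lru_left, &cur, cur - 1))
            return true;
    }
    return false;
}

/* Stand-in for osc_lru_reclaim(): pretend we freed one slot. */
static int reclaim(void)
{
    atomic_fetch_add(&lru_left, 1);
    return 1;
}

static int reserve(void)
{
    while (!take_slot()) {
        if (reclaim() > 0)
            continue;   /* freed something, retry immediately */
        /* In the driver this is where l_wait_event() sleeps until
         * lru_left > 0 or the LRU generation changes. */
        return -1;
    }
    return 0;
}

int main(void)
{
    int i;

    for (i = 0; i < 6; i++)
        printf("reserve %d -> %d (left %d)\n", i, reserve(),
               atomic_load(&lru_left));
    return 0;
}
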
OBD_FL_NO_USRQUOTA \ + : OBD_FL_NO_GRPQUOTA) + +int osc_quota_setdq(struct client_obd *cli, const unsigned int qid[], + u32 valid, u32 flags) +{ + int type; + int rc = 0; + + if ((valid & (OBD_MD_FLUSRQUOTA | OBD_MD_FLGRPQUOTA)) == 0) + return 0; + + for (type = 0; type < MAXQUOTAS; type++) { + struct osc_quota_info *oqi; + + if ((valid & MD_QUOTA_FLAG(type)) == 0) + continue; + + /* lookup the ID in the per-type hash table */ + oqi = cfs_hash_lookup(cli->cl_quota_hash[type], &qid[type]); + if ((flags & FL_QUOTA_FLAG(type)) != 0) { + /* This ID is getting close to its quota limit, let's + * switch to sync I/O */ + if (oqi != NULL) + continue; + + oqi = osc_oqi_alloc(qid[type]); + if (oqi == NULL) { + rc = -ENOMEM; + break; + } + + rc = cfs_hash_add_unique(cli->cl_quota_hash[type], + &qid[type], &oqi->oqi_hash); + /* race with others? */ + if (rc == -EALREADY) { + rc = 0; + OBD_SLAB_FREE_PTR(oqi, osc_quota_kmem); + } + + CDEBUG(D_QUOTA, "%s: setdq to insert for %s %d (%d)\n", + cli->cl_import->imp_obd->obd_name, + type == USRQUOTA ? "user" : "group", + qid[type], rc); + } else { + /* This ID is now off the hook, let's remove it from + * the hash table */ + if (oqi == NULL) + continue; + + oqi = cfs_hash_del_key(cli->cl_quota_hash[type], + &qid[type]); + if (oqi) + OBD_SLAB_FREE_PTR(oqi, osc_quota_kmem); + + CDEBUG(D_QUOTA, "%s: setdq to remove for %s %d (%p)\n", + cli->cl_import->imp_obd->obd_name, + type == USRQUOTA ? "user" : "group", + qid[type], oqi); + } + } + + return rc; +} + +/* + * Hash operations for uid/gid <-> osc_quota_info + */ +static unsigned +oqi_hashfn(struct cfs_hash *hs, const void *key, unsigned mask) +{ + return cfs_hash_u32_hash(*((__u32 *)key), mask); +} + +static int +oqi_keycmp(const void *key, struct hlist_node *hnode) +{ + struct osc_quota_info *oqi; + u32 uid; + + LASSERT(key != NULL); + uid = *((u32 *)key); + oqi = hlist_entry(hnode, struct osc_quota_info, oqi_hash); + + return uid == oqi->oqi_id; +} + +static void * +oqi_key(struct hlist_node *hnode) +{ + struct osc_quota_info *oqi; + oqi = hlist_entry(hnode, struct osc_quota_info, oqi_hash); + return &oqi->oqi_id; +} + +static void * +oqi_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct osc_quota_info, oqi_hash); +} + +static void +oqi_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ +} + +static void +oqi_put_locked(struct cfs_hash *hs, struct hlist_node *hnode) +{ +} + +static void +oqi_exit(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct osc_quota_info *oqi; + + oqi = hlist_entry(hnode, struct osc_quota_info, oqi_hash); + + OBD_SLAB_FREE_PTR(oqi, osc_quota_kmem); +} + +#define HASH_QUOTA_BKT_BITS 5 +#define HASH_QUOTA_CUR_BITS 5 +#define HASH_QUOTA_MAX_BITS 15 + +static cfs_hash_ops_t quota_hash_ops = { + .hs_hash = oqi_hashfn, + .hs_keycmp = oqi_keycmp, + .hs_key = oqi_key, + .hs_object = oqi_object, + .hs_get = oqi_get, + .hs_put_locked = oqi_put_locked, + .hs_exit = oqi_exit, +}; + +int osc_quota_setup(struct obd_device *obd) +{ + struct client_obd *cli = &obd->u.cli; + int i, type; + + for (type = 0; type < MAXQUOTAS; type++) { + cli->cl_quota_hash[type] = cfs_hash_create("QUOTA_HASH", + HASH_QUOTA_CUR_BITS, + HASH_QUOTA_MAX_BITS, + HASH_QUOTA_BKT_BITS, + 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + "a_hash_ops, + CFS_HASH_DEFAULT); + if (cli->cl_quota_hash[type] == NULL) + break; + } + + if (type == MAXQUOTAS) + return 0; + + for (i = 0; i < type; i++) + cfs_hash_putref(cli->cl_quota_hash[i]); + + return -ENOMEM; +} + +int osc_quota_cleanup(struct obd_device 
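
osc_quota_chkdq()/osc_quota_setdq() above maintain, per quota type (user and group), a hash of IDs that are close to their limit on some OST: chkdq refuses cached writes for those IDs so the client falls back to sync I/O, and setdq inserts or removes IDs as the server sets or clears the no-quota flags. A minimal sketch of the same bookkeeping, with a plain fixed-size array standing in for cfs_hash.

#include <stdbool.h>
#include <stdio.h>

enum { Q_USR, Q_GRP, Q_TYPES };
#define SET_SIZE 64

/* One "near quota limit" ID set per quota type; 0 marks a free slot,
 * so real IDs are assumed non-zero in this toy model. */
static unsigned int noquota[Q_TYPES][SET_SIZE];

static bool set_contains(int type, unsigned int id)
{
    int i;

    for (i = 0; i < SET_SIZE; i++)
        if (noquota[type][i] == id)
            return true;
    return false;
}

/* chkdq: any flagged ID means "no quota", so use sync I/O. */
static bool chkdq(const unsigned int qid[Q_TYPES])
{
    int type;

    for (type = 0; type < Q_TYPES; type++)
        if (set_contains(type, qid[type]))
            return false;   /* NO_QUOTA */
    return true;            /* QUOTA_OK */
}

/* setdq: flag or clear an ID as the server reports it near/below limit. */
static void setdq(int type, unsigned int id, bool near_limit)
{
    int i;

    for (i = 0; i < SET_SIZE; i++) {
        if (near_limit && noquota[type][i] == 0) {
            if (!set_contains(type, id))
                noquota[type][i] = id;
            return;
        }
        if (!near_limit && noquota[type][i] == id) {
            noquota[type][i] = 0;
            return;
        }
    }
}

int main(void)
{
    unsigned int qid[Q_TYPES] = { 1000, 1000 };

    setdq(Q_USR, 1000, true);
    printf("cached write allowed: %d\n", chkdq(qid));
    setdq(Q_USR, 1000, false);
    printf("cached write allowed: %d\n", chkdq(qid));
    return 0;
}
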
*obd) +{ + struct client_obd *cli = &obd->u.cli; + int type; + + for (type = 0; type < MAXQUOTAS; type++) + cfs_hash_putref(cli->cl_quota_hash[type]); + + return 0; +} + +int osc_quotactl(struct obd_device *unused, struct obd_export *exp, + struct obd_quotactl *oqctl) +{ + struct ptlrpc_request *req; + struct obd_quotactl *oqc; + int rc; + + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), + &RQF_OST_QUOTACTL, LUSTRE_OST_VERSION, + OST_QUOTACTL); + if (req == NULL) + return -ENOMEM; + + oqc = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL); + *oqc = *oqctl; + + ptlrpc_request_set_replen(req); + ptlrpc_at_set_req_timeout(req); + req->rq_no_resend = 1; + + rc = ptlrpc_queue_wait(req); + if (rc) + CERROR("ptlrpc_queue_wait failed, rc: %d\n", rc); + + if (req->rq_repmsg) { + oqc = req_capsule_server_get(&req->rq_pill, &RMF_OBD_QUOTACTL); + if (oqc) { + *oqctl = *oqc; + } else if (!rc) { + CERROR("Can't unpack obd_quotactl\n"); + rc = -EPROTO; + } + } else if (!rc) { + CERROR("Can't unpack obd_quotactl\n"); + rc = -EPROTO; + } + ptlrpc_req_finished(req); + + return rc; +} + +int osc_quotacheck(struct obd_device *unused, struct obd_export *exp, + struct obd_quotactl *oqctl) +{ + struct client_obd *cli = &exp->exp_obd->u.cli; + struct ptlrpc_request *req; + struct obd_quotactl *body; + int rc; + + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), + &RQF_OST_QUOTACHECK, LUSTRE_OST_VERSION, + OST_QUOTACHECK); + if (req == NULL) + return -ENOMEM; + + body = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL); + *body = *oqctl; + + ptlrpc_request_set_replen(req); + + /* the next poll will find -ENODATA, that means quotacheck is + * going on */ + cli->cl_qchk_stat = -ENODATA; + rc = ptlrpc_queue_wait(req); + if (rc) + cli->cl_qchk_stat = rc; + ptlrpc_req_finished(req); + return rc; +} + +int osc_quota_poll_check(struct obd_export *exp, struct if_quotacheck *qchk) +{ + struct client_obd *cli = &exp->exp_obd->u.cli; + int rc; + + qchk->obd_uuid = cli->cl_target_uuid; + memcpy(qchk->obd_type, LUSTRE_OST_NAME, strlen(LUSTRE_OST_NAME)); + + rc = cli->cl_qchk_stat; + /* the client is not the previous one */ + if (rc == CL_NOT_QUOTACHECKED) + rc = -EINTR; + return rc; +} diff --git a/kernel/drivers/staging/lustre/lustre/osc/osc_request.c b/kernel/drivers/staging/lustre/lustre/osc/osc_request.c new file mode 100644 index 000000000..d7a9b650d --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/osc/osc_request.c @@ -0,0 +1,3379 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. 
+ * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_OSC + +#include "../../include/linux/libcfs/libcfs.h" + + +#include "../include/lustre_dlm.h" +#include "../include/lustre_net.h" +#include "../include/lustre/lustre_user.h" +#include "../include/obd_cksum.h" + +#include "../include/lustre_ha.h" +#include "../include/lprocfs_status.h" +#include "../include/lustre_debug.h" +#include "../include/lustre_param.h" +#include "../include/lustre_fid.h" +#include "../include/obd_class.h" +#include "osc_internal.h" +#include "osc_cl_internal.h" + +struct osc_brw_async_args { + struct obdo *aa_oa; + int aa_requested_nob; + int aa_nio_count; + u32 aa_page_count; + int aa_resends; + struct brw_page **aa_ppga; + struct client_obd *aa_cli; + struct list_head aa_oaps; + struct list_head aa_exts; + struct obd_capa *aa_ocapa; + struct cl_req *aa_clerq; +}; + +struct osc_async_args { + struct obd_info *aa_oi; +}; + +struct osc_setattr_args { + struct obdo *sa_oa; + obd_enqueue_update_f sa_upcall; + void *sa_cookie; +}; + +struct osc_fsync_args { + struct obd_info *fa_oi; + obd_enqueue_update_f fa_upcall; + void *fa_cookie; +}; + +struct osc_enqueue_args { + struct obd_export *oa_exp; + __u64 *oa_flags; + obd_enqueue_update_f oa_upcall; + void *oa_cookie; + struct ost_lvb *oa_lvb; + struct lustre_handle *oa_lockh; + struct ldlm_enqueue_info *oa_ei; + unsigned int oa_agl:1; +}; + +static void osc_release_ppga(struct brw_page **ppga, u32 count); +static int brw_interpret(const struct lu_env *env, + struct ptlrpc_request *req, void *data, int rc); +int osc_cleanup(struct obd_device *obd); + +/* Pack OSC object metadata for disk storage (LE byte order). */ +static int osc_packmd(struct obd_export *exp, struct lov_mds_md **lmmp, + struct lov_stripe_md *lsm) +{ + int lmm_size; + + lmm_size = sizeof(**lmmp); + if (lmmp == NULL) + return lmm_size; + + if (*lmmp != NULL && lsm == NULL) { + OBD_FREE(*lmmp, lmm_size); + *lmmp = NULL; + return 0; + } else if (unlikely(lsm != NULL && ostid_id(&lsm->lsm_oi) == 0)) { + return -EBADF; + } + + if (*lmmp == NULL) { + OBD_ALLOC(*lmmp, lmm_size); + if (*lmmp == NULL) + return -ENOMEM; + } + + if (lsm) + ostid_cpu_to_le(&lsm->lsm_oi, &(*lmmp)->lmm_oi); + + return lmm_size; +} + +/* Unpack OSC object metadata from disk storage (LE byte order). */ +static int osc_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp, + struct lov_mds_md *lmm, int lmm_bytes) +{ + int lsm_size; + struct obd_import *imp = class_exp2cliimp(exp); + + if (lmm != NULL) { + if (lmm_bytes < sizeof(*lmm)) { + CERROR("%s: lov_mds_md too small: %d, need %d\n", + exp->exp_obd->obd_name, lmm_bytes, + (int)sizeof(*lmm)); + return -EINVAL; + } + /* XXX LOV_MAGIC etc check? 
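
osc_packmd()/osc_unpackmd() above convert the object id between CPU and little-endian byte order (ostid_cpu_to_le()/ostid_le_to_cpu()) so the same lov_mds_md bytes mean the same thing on disk and on the wire regardless of host endianness. A standalone sketch of that conversion with explicit byte shuffling; the struct below is a stand-in, not the real lov_mds_md layout.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical 64-bit id, stored little-endian on the wire. */
struct wire_id {
    uint8_t bytes[8];
};

static void id_cpu_to_le(uint64_t id, struct wire_id *w)
{
    int i;

    for (i = 0; i < 8; i++)
        w->bytes[i] = (uint8_t)(id >> (8 * i));   /* LSB first */
}

static uint64_t id_le_to_cpu(const struct wire_id *w)
{
    uint64_t id = 0;
    int i;

    for (i = 0; i < 8; i++)
        id |= (uint64_t)w->bytes[i] << (8 * i);
    return id;
}

int main(void)
{
    struct wire_id w;
    uint64_t id = 0x0123456789abcdefULL;

    id_cpu_to_le(id, &w);
    printf("round trip ok: %d\n", id_le_to_cpu(&w) == id);
    return 0;
}
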
*/ + + if (unlikely(ostid_id(&lmm->lmm_oi) == 0)) { + CERROR("%s: zero lmm_object_id: rc = %d\n", + exp->exp_obd->obd_name, -EINVAL); + return -EINVAL; + } + } + + lsm_size = lov_stripe_md_size(1); + if (lsmp == NULL) + return lsm_size; + + if (*lsmp != NULL && lmm == NULL) { + OBD_FREE((*lsmp)->lsm_oinfo[0], sizeof(struct lov_oinfo)); + OBD_FREE(*lsmp, lsm_size); + *lsmp = NULL; + return 0; + } + + if (*lsmp == NULL) { + OBD_ALLOC(*lsmp, lsm_size); + if (unlikely(*lsmp == NULL)) + return -ENOMEM; + OBD_ALLOC((*lsmp)->lsm_oinfo[0], sizeof(struct lov_oinfo)); + if (unlikely((*lsmp)->lsm_oinfo[0] == NULL)) { + OBD_FREE(*lsmp, lsm_size); + return -ENOMEM; + } + loi_init((*lsmp)->lsm_oinfo[0]); + } else if (unlikely(ostid_id(&(*lsmp)->lsm_oi) == 0)) { + return -EBADF; + } + + if (lmm != NULL) + /* XXX zero *lsmp? */ + ostid_le_to_cpu(&lmm->lmm_oi, &(*lsmp)->lsm_oi); + + if (imp != NULL && + (imp->imp_connect_data.ocd_connect_flags & OBD_CONNECT_MAXBYTES)) + (*lsmp)->lsm_maxbytes = imp->imp_connect_data.ocd_maxbytes; + else + (*lsmp)->lsm_maxbytes = LUSTRE_STRIPE_MAXBYTES; + + return lsm_size; +} + +static inline void osc_pack_capa(struct ptlrpc_request *req, + struct ost_body *body, void *capa) +{ + struct obd_capa *oc = (struct obd_capa *)capa; + struct lustre_capa *c; + + if (!capa) + return; + + c = req_capsule_client_get(&req->rq_pill, &RMF_CAPA1); + LASSERT(c); + capa_cpy(c, oc); + body->oa.o_valid |= OBD_MD_FLOSSCAPA; + DEBUG_CAPA(D_SEC, c, "pack"); +} + +static inline void osc_pack_req_body(struct ptlrpc_request *req, + struct obd_info *oinfo) +{ + struct ost_body *body; + + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, + oinfo->oi_oa); + osc_pack_capa(req, body, oinfo->oi_capa); +} + +static inline void osc_set_capa_size(struct ptlrpc_request *req, + const struct req_msg_field *field, + struct obd_capa *oc) +{ + if (oc == NULL) + req_capsule_set_size(&req->rq_pill, field, RCL_CLIENT, 0); + else + /* it is already calculated as sizeof struct obd_capa */ + ; +} + +static int osc_getattr_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + struct osc_async_args *aa, int rc) +{ + struct ost_body *body; + + if (rc != 0) + goto out; + + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (body) { + CDEBUG(D_INODE, "mode: %o\n", body->oa.o_mode); + lustre_get_wire_obdo(&req->rq_import->imp_connect_data, + aa->aa_oi->oi_oa, &body->oa); + + /* This should really be sent by the OST */ + aa->aa_oi->oi_oa->o_blksize = DT_MAX_BRW_SIZE; + aa->aa_oi->oi_oa->o_valid |= OBD_MD_FLBLKSZ; + } else { + CDEBUG(D_INFO, "can't unpack ost_body\n"); + rc = -EPROTO; + aa->aa_oi->oi_oa->o_valid = 0; + } +out: + rc = aa->aa_oi->oi_cb_up(aa->aa_oi, rc); + return rc; +} + +static int osc_getattr_async(struct obd_export *exp, struct obd_info *oinfo, + struct ptlrpc_request_set *set) +{ + struct ptlrpc_request *req; + struct osc_async_args *aa; + int rc; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_GETATTR); + if (req == NULL) + return -ENOMEM; + + osc_set_capa_size(req, &RMF_CAPA1, oinfo->oi_capa); + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GETATTR); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + + osc_pack_req_body(req, oinfo); + + ptlrpc_request_set_replen(req); + req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_getattr_interpret; + + CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + aa->aa_oi = 
oinfo; + + ptlrpc_set_add_req(set, req); + return 0; +} + +static int osc_getattr(const struct lu_env *env, struct obd_export *exp, + struct obd_info *oinfo) +{ + struct ptlrpc_request *req; + struct ost_body *body; + int rc; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_GETATTR); + if (req == NULL) + return -ENOMEM; + + osc_set_capa_size(req, &RMF_CAPA1, oinfo->oi_capa); + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GETATTR); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + + osc_pack_req_body(req, oinfo); + + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) + goto out; + + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) { + rc = -EPROTO; + goto out; + } + + CDEBUG(D_INODE, "mode: %o\n", body->oa.o_mode); + lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oinfo->oi_oa, + &body->oa); + + oinfo->oi_oa->o_blksize = cli_brw_size(exp->exp_obd); + oinfo->oi_oa->o_valid |= OBD_MD_FLBLKSZ; + + out: + ptlrpc_req_finished(req); + return rc; +} + +static int osc_setattr(const struct lu_env *env, struct obd_export *exp, + struct obd_info *oinfo, struct obd_trans_info *oti) +{ + struct ptlrpc_request *req; + struct ost_body *body; + int rc; + + LASSERT(oinfo->oi_oa->o_valid & OBD_MD_FLGROUP); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SETATTR); + if (req == NULL) + return -ENOMEM; + + osc_set_capa_size(req, &RMF_CAPA1, oinfo->oi_capa); + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SETATTR); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + + osc_pack_req_body(req, oinfo); + + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) + goto out; + + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) { + rc = -EPROTO; + goto out; + } + + lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oinfo->oi_oa, + &body->oa); + +out: + ptlrpc_req_finished(req); + return rc; +} + +static int osc_setattr_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + struct osc_setattr_args *sa, int rc) +{ + struct ost_body *body; + + if (rc != 0) + goto out; + + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) { + rc = -EPROTO; + goto out; + } + + lustre_get_wire_obdo(&req->rq_import->imp_connect_data, sa->sa_oa, + &body->oa); +out: + rc = sa->sa_upcall(sa->sa_cookie, rc); + return rc; +} + +int osc_setattr_async_base(struct obd_export *exp, struct obd_info *oinfo, + struct obd_trans_info *oti, + obd_enqueue_update_f upcall, void *cookie, + struct ptlrpc_request_set *rqset) +{ + struct ptlrpc_request *req; + struct osc_setattr_args *sa; + int rc; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SETATTR); + if (req == NULL) + return -ENOMEM; + + osc_set_capa_size(req, &RMF_CAPA1, oinfo->oi_capa); + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SETATTR); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + + if (oti && oinfo->oi_oa->o_valid & OBD_MD_FLCOOKIE) + oinfo->oi_oa->o_lcookie = *oti->oti_logcookies; + + osc_pack_req_body(req, oinfo); + + ptlrpc_request_set_replen(req); + + /* do mds to ost setattr asynchronously */ + if (!rqset) { + /* Do not wait for response. 
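+ * Editorial note (not part of the upstream code): with rqset == NULL the
+ * request is queued straight to a ptlrpcd thread with no reply interpreter,
+ * so the reply is ignored and the caller's upcall never runs; passing
+ * PTLRPCD_SET also routes the request through ptlrpcd, but keeps the
+ * osc_setattr_interpret() callback and the upcall installed below.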
*/ + ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1); + } else { + req->rq_interpret_reply = + (ptlrpc_interpterer_t)osc_setattr_interpret; + + CLASSERT (sizeof(*sa) <= sizeof(req->rq_async_args)); + sa = ptlrpc_req_async_args(req); + sa->sa_oa = oinfo->oi_oa; + sa->sa_upcall = upcall; + sa->sa_cookie = cookie; + + if (rqset == PTLRPCD_SET) + ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1); + else + ptlrpc_set_add_req(rqset, req); + } + + return 0; +} + +static int osc_setattr_async(struct obd_export *exp, struct obd_info *oinfo, + struct obd_trans_info *oti, + struct ptlrpc_request_set *rqset) +{ + return osc_setattr_async_base(exp, oinfo, oti, + oinfo->oi_cb_up, oinfo, rqset); +} + +int osc_real_create(struct obd_export *exp, struct obdo *oa, + struct lov_stripe_md **ea, struct obd_trans_info *oti) +{ + struct ptlrpc_request *req; + struct ost_body *body; + struct lov_stripe_md *lsm; + int rc; + + LASSERT(oa); + LASSERT(ea); + + lsm = *ea; + if (!lsm) { + rc = obd_alloc_memmd(exp, &lsm); + if (rc < 0) + return rc; + } + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_CREATE); + if (req == NULL) { + rc = -ENOMEM; + goto out; + } + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_CREATE); + if (rc) { + ptlrpc_request_free(req); + goto out; + } + + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); + + ptlrpc_request_set_replen(req); + + if ((oa->o_valid & OBD_MD_FLFLAGS) && + oa->o_flags == OBD_FL_DELORPHAN) { + DEBUG_REQ(D_HA, req, + "delorphan from OST integration"); + /* Don't resend the delorphan req */ + req->rq_no_resend = req->rq_no_delay = 1; + } + + rc = ptlrpc_queue_wait(req); + if (rc) + goto out_req; + + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) { + rc = -EPROTO; + goto out_req; + } + + CDEBUG(D_INFO, "oa flags %x\n", oa->o_flags); + lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oa, &body->oa); + + oa->o_blksize = cli_brw_size(exp->exp_obd); + oa->o_valid |= OBD_MD_FLBLKSZ; + + /* XXX LOV STACKING: the lsm that is passed to us from LOV does not + * have valid lsm_oinfo data structs, so don't go touching that. + * This needs to be fixed in a big way. 
+ */ + lsm->lsm_oi = oa->o_oi; + *ea = lsm; + + if (oti != NULL) { + oti->oti_transno = lustre_msg_get_transno(req->rq_repmsg); + + if (oa->o_valid & OBD_MD_FLCOOKIE) { + if (!oti->oti_logcookies) + oti_alloc_cookies(oti, 1); + *oti->oti_logcookies = oa->o_lcookie; + } + } + + CDEBUG(D_HA, "transno: %lld\n", + lustre_msg_get_transno(req->rq_repmsg)); +out_req: + ptlrpc_req_finished(req); +out: + if (rc && !*ea) + obd_free_memmd(exp, &lsm); + return rc; +} + +int osc_punch_base(struct obd_export *exp, struct obd_info *oinfo, + obd_enqueue_update_f upcall, void *cookie, + struct ptlrpc_request_set *rqset) +{ + struct ptlrpc_request *req; + struct osc_setattr_args *sa; + struct ost_body *body; + int rc; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_PUNCH); + if (req == NULL) + return -ENOMEM; + + osc_set_capa_size(req, &RMF_CAPA1, oinfo->oi_capa); + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_PUNCH); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */ + ptlrpc_at_set_req_timeout(req); + + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, + oinfo->oi_oa); + osc_pack_capa(req, body, oinfo->oi_capa); + + ptlrpc_request_set_replen(req); + + req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_setattr_interpret; + CLASSERT (sizeof(*sa) <= sizeof(req->rq_async_args)); + sa = ptlrpc_req_async_args(req); + sa->sa_oa = oinfo->oi_oa; + sa->sa_upcall = upcall; + sa->sa_cookie = cookie; + if (rqset == PTLRPCD_SET) + ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1); + else + ptlrpc_set_add_req(rqset, req); + + return 0; +} + +static int osc_sync_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + void *arg, int rc) +{ + struct osc_fsync_args *fa = arg; + struct ost_body *body; + + if (rc) + goto out; + + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) { + CERROR ("can't unpack ost_body\n"); + rc = -EPROTO; + goto out; + } + + *fa->fa_oi->oi_oa = body->oa; +out: + rc = fa->fa_upcall(fa->fa_cookie, rc); + return rc; +} + +int osc_sync_base(struct obd_export *exp, struct obd_info *oinfo, + obd_enqueue_update_f upcall, void *cookie, + struct ptlrpc_request_set *rqset) +{ + struct ptlrpc_request *req; + struct ost_body *body; + struct osc_fsync_args *fa; + int rc; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SYNC); + if (req == NULL) + return -ENOMEM; + + osc_set_capa_size(req, &RMF_CAPA1, oinfo->oi_capa); + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SYNC); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + + /* overload the size and blocks fields in the oa with start/end */ + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, + oinfo->oi_oa); + osc_pack_capa(req, body, oinfo->oi_capa); + + ptlrpc_request_set_replen(req); + req->rq_interpret_reply = osc_sync_interpret; + + CLASSERT(sizeof(*fa) <= sizeof(req->rq_async_args)); + fa = ptlrpc_req_async_args(req); + fa->fa_oi = oinfo; + fa->fa_upcall = upcall; + fa->fa_cookie = cookie; + + if (rqset == PTLRPCD_SET) + ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1); + else + ptlrpc_set_add_req(rqset, req); + + return 0; +} + +/* Find and cancel locally locks matched by @mode in the resource found by + * @objid. Found locks are added into @cancel list. Returns the amount of + * locks added to @cancels list. 
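+ * Editorial note (not part of the upstream code): the @cancels list built
+ * here is handed by the caller to ldlm_prep_elc_req() (see osc_destroy()
+ * below), so the lock cancellations piggyback on the destroy RPC instead of
+ * requiring separate LDLM cancel RPCs.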
*/ +static int osc_resource_get_unused(struct obd_export *exp, struct obdo *oa, + struct list_head *cancels, + ldlm_mode_t mode, __u64 lock_flags) +{ + struct ldlm_namespace *ns = exp->exp_obd->obd_namespace; + struct ldlm_res_id res_id; + struct ldlm_resource *res; + int count; + + /* Return, i.e. cancel nothing, only if ELC is supported (flag in + * export) but disabled through procfs (flag in NS). + * + * This distinguishes from a case when ELC is not supported originally, + * when we still want to cancel locks in advance and just cancel them + * locally, without sending any RPC. */ + if (exp_connect_cancelset(exp) && !ns_connect_cancelset(ns)) + return 0; + + ostid_build_res_name(&oa->o_oi, &res_id); + res = ldlm_resource_get(ns, NULL, &res_id, 0, 0); + if (res == NULL) + return 0; + + LDLM_RESOURCE_ADDREF(res); + count = ldlm_cancel_resource_local(res, cancels, NULL, mode, + lock_flags, 0, NULL); + LDLM_RESOURCE_DELREF(res); + ldlm_resource_putref(res); + return count; +} + +static int osc_destroy_interpret(const struct lu_env *env, + struct ptlrpc_request *req, void *data, + int rc) +{ + struct client_obd *cli = &req->rq_import->imp_obd->u.cli; + + atomic_dec(&cli->cl_destroy_in_flight); + wake_up(&cli->cl_destroy_waitq); + return 0; +} + +static int osc_can_send_destroy(struct client_obd *cli) +{ + if (atomic_inc_return(&cli->cl_destroy_in_flight) <= + cli->cl_max_rpcs_in_flight) { + /* The destroy request can be sent */ + return 1; + } + if (atomic_dec_return(&cli->cl_destroy_in_flight) < + cli->cl_max_rpcs_in_flight) { + /* + * The counter has been modified between the two atomic + * operations. + */ + wake_up(&cli->cl_destroy_waitq); + } + return 0; +} + +int osc_create(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa, struct lov_stripe_md **ea, + struct obd_trans_info *oti) +{ + int rc = 0; + + LASSERT(oa); + LASSERT(ea); + LASSERT(oa->o_valid & OBD_MD_FLGROUP); + + if ((oa->o_valid & OBD_MD_FLFLAGS) && + oa->o_flags == OBD_FL_RECREATE_OBJS) { + return osc_real_create(exp, oa, ea, oti); + } + + if (!fid_seq_is_mdt(ostid_seq(&oa->o_oi))) + return osc_real_create(exp, oa, ea, oti); + + /* we should not get here anymore */ + LBUG(); + + return rc; +} + +/* Destroy requests can be async always on the client, and we don't even really + * care about the return code since the client cannot do anything at all about + * a destroy failure. + * When the MDS is unlinking a filename, it saves the file objects into a + * recovery llog, and these object records are cancelled when the OST reports + * they were destroyed and sync'd to disk (i.e. transaction committed). + * If the client dies, or the OST is down when the object should be destroyed, + * the records are not cancelled, and when the OST reconnects to the MDS next, + * it will retrieve the llog unlink logs and then sends the log cancellation + * cookies to the MDS after committing destroy transactions. 
*/ +static int osc_destroy(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa, struct lov_stripe_md *ea, + struct obd_trans_info *oti, struct obd_export *md_export, + void *capa) +{ + struct client_obd *cli = &exp->exp_obd->u.cli; + struct ptlrpc_request *req; + struct ost_body *body; + LIST_HEAD(cancels); + int rc, count; + + if (!oa) { + CDEBUG(D_INFO, "oa NULL\n"); + return -EINVAL; + } + + count = osc_resource_get_unused(exp, oa, &cancels, LCK_PW, + LDLM_FL_DISCARD_DATA); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_DESTROY); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + return -ENOMEM; + } + + osc_set_capa_size(req, &RMF_CAPA1, (struct obd_capa *)capa); + rc = ldlm_prep_elc_req(exp, req, LUSTRE_OST_VERSION, OST_DESTROY, + 0, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + + req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */ + ptlrpc_at_set_req_timeout(req); + + if (oti != NULL && oa->o_valid & OBD_MD_FLCOOKIE) + oa->o_lcookie = *oti->oti_logcookies; + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); + + osc_pack_capa(req, body, (struct obd_capa *)capa); + ptlrpc_request_set_replen(req); + + /* If osc_destroy is for destroying the unlink orphan, + * sent from MDT to OST, which should not be blocked here, + * because the process might be triggered by ptlrpcd, and + * it is not good to block ptlrpcd thread (b=16006)*/ + if (!(oa->o_flags & OBD_FL_DELORPHAN)) { + req->rq_interpret_reply = osc_destroy_interpret; + if (!osc_can_send_destroy(cli)) { + struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, + NULL); + + /* + * Wait until the number of on-going destroy RPCs drops + * under max_rpc_in_flight + */ + l_wait_event_exclusive(cli->cl_destroy_waitq, + osc_can_send_destroy(cli), &lwi); + } + } + + /* Do not wait for response */ + ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1); + return 0; +} + +static void osc_announce_cached(struct client_obd *cli, struct obdo *oa, + long writing_bytes) +{ + u32 bits = OBD_MD_FLBLOCKS|OBD_MD_FLGRANT; + + LASSERT(!(oa->o_valid & bits)); + + oa->o_valid |= bits; + client_obd_list_lock(&cli->cl_loi_list_lock); + oa->o_dirty = cli->cl_dirty; + if (unlikely(cli->cl_dirty - cli->cl_dirty_transit > + cli->cl_dirty_max)) { + CERROR("dirty %lu - %lu > dirty_max %lu\n", + cli->cl_dirty, cli->cl_dirty_transit, cli->cl_dirty_max); + oa->o_undirty = 0; + } else if (unlikely(atomic_read(&obd_dirty_pages) - + atomic_read(&obd_dirty_transit_pages) > + (long)(obd_max_dirty_pages + 1))) { + /* The atomic_read() allowing the atomic_inc() are + * not covered by a lock thus they may safely race and trip + * this CERROR() unless we add in a small fudge factor (+1). 
*/ + CERROR("dirty %d - %d > system dirty_max %d\n", + atomic_read(&obd_dirty_pages), + atomic_read(&obd_dirty_transit_pages), + obd_max_dirty_pages); + oa->o_undirty = 0; + } else if (unlikely(cli->cl_dirty_max - cli->cl_dirty > 0x7fffffff)) { + CERROR("dirty %lu - dirty_max %lu too big???\n", + cli->cl_dirty, cli->cl_dirty_max); + oa->o_undirty = 0; + } else { + long max_in_flight = (cli->cl_max_pages_per_rpc << + PAGE_CACHE_SHIFT)* + (cli->cl_max_rpcs_in_flight + 1); + oa->o_undirty = max(cli->cl_dirty_max, max_in_flight); + } + oa->o_grant = cli->cl_avail_grant + cli->cl_reserved_grant; + oa->o_dropped = cli->cl_lost_grant; + cli->cl_lost_grant = 0; + client_obd_list_unlock(&cli->cl_loi_list_lock); + CDEBUG(D_CACHE, "dirty: %llu undirty: %u dropped %u grant: %llu\n", + oa->o_dirty, oa->o_undirty, oa->o_dropped, oa->o_grant); + +} + +void osc_update_next_shrink(struct client_obd *cli) +{ + cli->cl_next_shrink_grant = + cfs_time_shift(cli->cl_grant_shrink_interval); + CDEBUG(D_CACHE, "next time %ld to shrink grant \n", + cli->cl_next_shrink_grant); +} + +static void __osc_update_grant(struct client_obd *cli, u64 grant) +{ + client_obd_list_lock(&cli->cl_loi_list_lock); + cli->cl_avail_grant += grant; + client_obd_list_unlock(&cli->cl_loi_list_lock); +} + +static void osc_update_grant(struct client_obd *cli, struct ost_body *body) +{ + if (body->oa.o_valid & OBD_MD_FLGRANT) { + CDEBUG(D_CACHE, "got %llu extra grant\n", body->oa.o_grant); + __osc_update_grant(cli, body->oa.o_grant); + } +} + +static int osc_set_info_async(const struct lu_env *env, struct obd_export *exp, + u32 keylen, void *key, u32 vallen, + void *val, struct ptlrpc_request_set *set); + +static int osc_shrink_grant_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + void *aa, int rc) +{ + struct client_obd *cli = &req->rq_import->imp_obd->u.cli; + struct obdo *oa = ((struct osc_brw_async_args *)aa)->aa_oa; + struct ost_body *body; + + if (rc != 0) { + __osc_update_grant(cli, oa->o_grant); + goto out; + } + + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + osc_update_grant(cli, body); +out: + OBDO_FREE(oa); + return rc; +} + +static void osc_shrink_grant_local(struct client_obd *cli, struct obdo *oa) +{ + client_obd_list_lock(&cli->cl_loi_list_lock); + oa->o_grant = cli->cl_avail_grant / 4; + cli->cl_avail_grant -= oa->o_grant; + client_obd_list_unlock(&cli->cl_loi_list_lock); + if (!(oa->o_valid & OBD_MD_FLFLAGS)) { + oa->o_valid |= OBD_MD_FLFLAGS; + oa->o_flags = 0; + } + oa->o_flags |= OBD_FL_SHRINK_GRANT; + osc_update_next_shrink(cli); +} + +/* Shrink the current grant, either from some large amount to enough for a + * full set of in-flight RPCs, or if we have already shrunk to that limit + * then to enough for a single RPC. This avoids keeping more grant than + * needed, and avoids shrinking the grant piecemeal. 
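+ * Editorial note, illustrative arithmetic only (hypothetical but typical
+ * values, not part of the upstream code): with 4 KiB pages
+ * (PAGE_CACHE_SHIFT == 12), cl_max_pages_per_rpc == 256 and
+ * cl_max_rpcs_in_flight == 8, osc_shrink_grant() below first aims for
+ *   (8 + 1) * (256 << 12) = 9 MiB
+ * of grant; once cl_avail_grant is already at or below that, the target
+ * collapses to a single RPC's worth, 256 << 12 = 1 MiB.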
*/ +static int osc_shrink_grant(struct client_obd *cli) +{ + __u64 target_bytes = (cli->cl_max_rpcs_in_flight + 1) * + (cli->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT); + + client_obd_list_lock(&cli->cl_loi_list_lock); + if (cli->cl_avail_grant <= target_bytes) + target_bytes = cli->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT; + client_obd_list_unlock(&cli->cl_loi_list_lock); + + return osc_shrink_grant_to_target(cli, target_bytes); +} + +int osc_shrink_grant_to_target(struct client_obd *cli, __u64 target_bytes) +{ + int rc = 0; + struct ost_body *body; + + client_obd_list_lock(&cli->cl_loi_list_lock); + /* Don't shrink if we are already above or below the desired limit + * We don't want to shrink below a single RPC, as that will negatively + * impact block allocation and long-term performance. */ + if (target_bytes < cli->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT) + target_bytes = cli->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT; + + if (target_bytes >= cli->cl_avail_grant) { + client_obd_list_unlock(&cli->cl_loi_list_lock); + return 0; + } + client_obd_list_unlock(&cli->cl_loi_list_lock); + + OBD_ALLOC_PTR(body); + if (!body) + return -ENOMEM; + + osc_announce_cached(cli, &body->oa, 0); + + client_obd_list_lock(&cli->cl_loi_list_lock); + body->oa.o_grant = cli->cl_avail_grant - target_bytes; + cli->cl_avail_grant = target_bytes; + client_obd_list_unlock(&cli->cl_loi_list_lock); + if (!(body->oa.o_valid & OBD_MD_FLFLAGS)) { + body->oa.o_valid |= OBD_MD_FLFLAGS; + body->oa.o_flags = 0; + } + body->oa.o_flags |= OBD_FL_SHRINK_GRANT; + osc_update_next_shrink(cli); + + rc = osc_set_info_async(NULL, cli->cl_import->imp_obd->obd_self_export, + sizeof(KEY_GRANT_SHRINK), KEY_GRANT_SHRINK, + sizeof(*body), body, NULL); + if (rc != 0) + __osc_update_grant(cli, body->oa.o_grant); + OBD_FREE_PTR(body); + return rc; +} + +static int osc_should_shrink_grant(struct client_obd *client) +{ + unsigned long time = cfs_time_current(); + unsigned long next_shrink = client->cl_next_shrink_grant; + + if ((client->cl_import->imp_connect_data.ocd_connect_flags & + OBD_CONNECT_GRANT_SHRINK) == 0) + return 0; + + if (cfs_time_aftereq(time, next_shrink - 5 * CFS_TICK)) { + /* Get the current RPC size directly, instead of going via: + * cli_brw_size(obd->u.cli.cl_import->imp_obd->obd_self_export) + * Keep comment here so that it can be found by searching. 
*/ + int brw_size = client->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT; + + if (client->cl_import->imp_state == LUSTRE_IMP_FULL && + client->cl_avail_grant > brw_size) + return 1; + else + osc_update_next_shrink(client); + } + return 0; +} + +static int osc_grant_shrink_grant_cb(struct timeout_item *item, void *data) +{ + struct client_obd *client; + + list_for_each_entry(client, &item->ti_obd_list, + cl_grant_shrink_list) { + if (osc_should_shrink_grant(client)) + osc_shrink_grant(client); + } + return 0; +} + +static int osc_add_shrink_grant(struct client_obd *client) +{ + int rc; + + rc = ptlrpc_add_timeout_client(client->cl_grant_shrink_interval, + TIMEOUT_GRANT, + osc_grant_shrink_grant_cb, NULL, + &client->cl_grant_shrink_list); + if (rc) { + CERROR("add grant client %s error %d\n", + client->cl_import->imp_obd->obd_name, rc); + return rc; + } + CDEBUG(D_CACHE, "add grant client %s \n", + client->cl_import->imp_obd->obd_name); + osc_update_next_shrink(client); + return 0; +} + +static int osc_del_shrink_grant(struct client_obd *client) +{ + return ptlrpc_del_timeout_client(&client->cl_grant_shrink_list, + TIMEOUT_GRANT); +} + +static void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd) +{ + /* + * ocd_grant is the total grant amount we're expect to hold: if we've + * been evicted, it's the new avail_grant amount, cl_dirty will drop + * to 0 as inflight RPCs fail out; otherwise, it's avail_grant + dirty. + * + * race is tolerable here: if we're evicted, but imp_state already + * left EVICTED state, then cl_dirty must be 0 already. + */ + client_obd_list_lock(&cli->cl_loi_list_lock); + if (cli->cl_import->imp_state == LUSTRE_IMP_EVICTED) + cli->cl_avail_grant = ocd->ocd_grant; + else + cli->cl_avail_grant = ocd->ocd_grant - cli->cl_dirty; + + if (cli->cl_avail_grant < 0) { + CWARN("%s: available grant < 0: avail/ocd/dirty %ld/%u/%ld\n", + cli->cl_import->imp_obd->obd_name, cli->cl_avail_grant, + ocd->ocd_grant, cli->cl_dirty); + /* workaround for servers which do not have the patch from + * LU-2679 */ + cli->cl_avail_grant = ocd->ocd_grant; + } + + /* determine the appropriate chunk size used by osc_extent. */ + cli->cl_chunkbits = max_t(int, PAGE_CACHE_SHIFT, ocd->ocd_blocksize); + client_obd_list_unlock(&cli->cl_loi_list_lock); + + CDEBUG(D_CACHE, "%s, setting cl_avail_grant: %ld cl_lost_grant: %ld chunk bits: %d\n", + cli->cl_import->imp_obd->obd_name, + cli->cl_avail_grant, cli->cl_lost_grant, cli->cl_chunkbits); + + if (ocd->ocd_connect_flags & OBD_CONNECT_GRANT_SHRINK && + list_empty(&cli->cl_grant_shrink_list)) + osc_add_shrink_grant(cli); +} + +/* We assume that the reason this OSC got a short read is because it read + * beyond the end of a stripe file; i.e. lustre is reading a sparse file + * via the LOV, and it _knows_ it's reading inside the file, it's just that + * this stripe never got written at or beyond this stripe offset yet. 
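+ * Editorial note, worked example only (not part of the upstream code): for a
+ * three-page read of 4 KiB pages where the OST returns just 5000 bytes,
+ * handle_short_read() below leaves page 0 untouched, zeroes bytes 904..4095
+ * of page 1 (only 5000 - 4096 = 904 bytes of it were read), and zeroes
+ * page 2 completely.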
*/ +static void handle_short_read(int nob_read, u32 page_count, + struct brw_page **pga) +{ + char *ptr; + int i = 0; + + /* skip bytes read OK */ + while (nob_read > 0) { + LASSERT (page_count > 0); + + if (pga[i]->count > nob_read) { + /* EOF inside this page */ + ptr = kmap(pga[i]->pg) + + (pga[i]->off & ~CFS_PAGE_MASK); + memset(ptr + nob_read, 0, pga[i]->count - nob_read); + kunmap(pga[i]->pg); + page_count--; + i++; + break; + } + + nob_read -= pga[i]->count; + page_count--; + i++; + } + + /* zero remaining pages */ + while (page_count-- > 0) { + ptr = kmap(pga[i]->pg) + (pga[i]->off & ~CFS_PAGE_MASK); + memset(ptr, 0, pga[i]->count); + kunmap(pga[i]->pg); + i++; + } +} + +static int check_write_rcs(struct ptlrpc_request *req, + int requested_nob, int niocount, + u32 page_count, struct brw_page **pga) +{ + int i; + __u32 *remote_rcs; + + remote_rcs = req_capsule_server_sized_get(&req->rq_pill, &RMF_RCS, + sizeof(*remote_rcs) * + niocount); + if (remote_rcs == NULL) { + CDEBUG(D_INFO, "Missing/short RC vector on BRW_WRITE reply\n"); + return -EPROTO; + } + + /* return error if any niobuf was in error */ + for (i = 0; i < niocount; i++) { + if ((int)remote_rcs[i] < 0) + return remote_rcs[i]; + + if (remote_rcs[i] != 0) { + CDEBUG(D_INFO, "rc[%d] invalid (%d) req %p\n", + i, remote_rcs[i], req); + return -EPROTO; + } + } + + if (req->rq_bulk->bd_nob_transferred != requested_nob) { + CERROR("Unexpected # bytes transferred: %d (requested %d)\n", + req->rq_bulk->bd_nob_transferred, requested_nob); + return -EPROTO; + } + + return 0; +} + +static inline int can_merge_pages(struct brw_page *p1, struct brw_page *p2) +{ + if (p1->flag != p2->flag) { + unsigned mask = ~(OBD_BRW_FROM_GRANT | OBD_BRW_NOCACHE | + OBD_BRW_SYNC | OBD_BRW_ASYNC|OBD_BRW_NOQUOTA); + + /* warn if we try to combine flags that we don't know to be + * safe to combine */ + if (unlikely((p1->flag & mask) != (p2->flag & mask))) { + CWARN("Saw flags 0x%x and 0x%x in the same brw, please report this at http://bugs.whamcloud.com/\n", + p1->flag, p2->flag); + } + return 0; + } + + return (p1->off + p1->count == p2->off); +} + +static u32 osc_checksum_bulk(int nob, u32 pg_count, + struct brw_page **pga, int opc, + cksum_type_t cksum_type) +{ + __u32 cksum; + int i = 0; + struct cfs_crypto_hash_desc *hdesc; + unsigned int bufsize; + int err; + unsigned char cfs_alg = cksum_obd2cfs(cksum_type); + + LASSERT(pg_count > 0); + + hdesc = cfs_crypto_hash_init(cfs_alg, NULL, 0); + if (IS_ERR(hdesc)) { + CERROR("Unable to initialize checksum hash %s\n", + cfs_crypto_hash_name(cfs_alg)); + return PTR_ERR(hdesc); + } + + while (nob > 0 && pg_count > 0) { + int count = pga[i]->count > nob ? 
nob : pga[i]->count; + + /* corrupt the data before we compute the checksum, to + * simulate an OST->client data error */ + if (i == 0 && opc == OST_READ && + OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_RECEIVE)) { + unsigned char *ptr = kmap(pga[i]->pg); + int off = pga[i]->off & ~CFS_PAGE_MASK; + memcpy(ptr + off, "bad1", min(4, nob)); + kunmap(pga[i]->pg); + } + cfs_crypto_hash_update_page(hdesc, pga[i]->pg, + pga[i]->off & ~CFS_PAGE_MASK, + count); + CDEBUG(D_PAGE, + "page %p map %p index %lu flags %lx count %u priv %0lx: off %d\n", + pga[i]->pg, pga[i]->pg->mapping, pga[i]->pg->index, + (long)pga[i]->pg->flags, page_count(pga[i]->pg), + page_private(pga[i]->pg), + (int)(pga[i]->off & ~CFS_PAGE_MASK)); + + nob -= pga[i]->count; + pg_count--; + i++; + } + + bufsize = 4; + err = cfs_crypto_hash_final(hdesc, (unsigned char *)&cksum, &bufsize); + + if (err) + cfs_crypto_hash_final(hdesc, NULL, NULL); + + /* For sending we only compute the wrong checksum instead + * of corrupting the data so it is still correct on a redo */ + if (opc == OST_WRITE && OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND)) + cksum++; + + return cksum; +} + +static int osc_brw_prep_request(int cmd, struct client_obd *cli, + struct obdo *oa, + struct lov_stripe_md *lsm, u32 page_count, + struct brw_page **pga, + struct ptlrpc_request **reqp, + struct obd_capa *ocapa, int reserve, + int resend) +{ + struct ptlrpc_request *req; + struct ptlrpc_bulk_desc *desc; + struct ost_body *body; + struct obd_ioobj *ioobj; + struct niobuf_remote *niobuf; + int niocount, i, requested_nob, opc, rc; + struct osc_brw_async_args *aa; + struct req_capsule *pill; + struct brw_page *pg_prev; + + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_PREP_REQ)) + return -ENOMEM; /* Recoverable */ + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_PREP_REQ2)) + return -EINVAL; /* Fatal */ + + if ((cmd & OBD_BRW_WRITE) != 0) { + opc = OST_WRITE; + req = ptlrpc_request_alloc_pool(cli->cl_import, + cli->cl_import->imp_rq_pool, + &RQF_OST_BRW_WRITE); + } else { + opc = OST_READ; + req = ptlrpc_request_alloc(cli->cl_import, &RQF_OST_BRW_READ); + } + if (req == NULL) + return -ENOMEM; + + for (niocount = i = 1; i < page_count; i++) { + if (!can_merge_pages(pga[i - 1], pga[i])) + niocount++; + } + + pill = &req->rq_pill; + req_capsule_set_size(pill, &RMF_OBD_IOOBJ, RCL_CLIENT, + sizeof(*ioobj)); + req_capsule_set_size(pill, &RMF_NIOBUF_REMOTE, RCL_CLIENT, + niocount * sizeof(*niobuf)); + osc_set_capa_size(req, &RMF_CAPA1, ocapa); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, opc); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */ + ptlrpc_at_set_req_timeout(req); + /* ask ptlrpc not to resend on EINPROGRESS since BRWs have their own + * retry logic */ + req->rq_no_retry_einprogress = 1; + + desc = ptlrpc_prep_bulk_imp(req, page_count, + cli->cl_import->imp_connect_data.ocd_brw_size >> LNET_MTU_BITS, + opc == OST_WRITE ? 
BULK_GET_SOURCE : BULK_PUT_SINK, + OST_BULK_PORTAL); + + if (desc == NULL) { + rc = -ENOMEM; + goto out; + } + /* NB request now owns desc and will free it when it gets freed */ + + body = req_capsule_client_get(pill, &RMF_OST_BODY); + ioobj = req_capsule_client_get(pill, &RMF_OBD_IOOBJ); + niobuf = req_capsule_client_get(pill, &RMF_NIOBUF_REMOTE); + LASSERT(body != NULL && ioobj != NULL && niobuf != NULL); + + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); + + obdo_to_ioobj(oa, ioobj); + ioobj->ioo_bufcnt = niocount; + /* The high bits of ioo_max_brw tells server _maximum_ number of bulks + * that might be send for this request. The actual number is decided + * when the RPC is finally sent in ptlrpc_register_bulk(). It sends + * "max - 1" for old client compatibility sending "0", and also so the + * the actual maximum is a power-of-two number, not one less. LU-1431 */ + ioobj_max_brw_set(ioobj, desc->bd_md_max_brw); + osc_pack_capa(req, body, ocapa); + LASSERT(page_count > 0); + pg_prev = pga[0]; + for (requested_nob = i = 0; i < page_count; i++, niobuf++) { + struct brw_page *pg = pga[i]; + int poff = pg->off & ~CFS_PAGE_MASK; + + LASSERT(pg->count > 0); + /* make sure there is no gap in the middle of page array */ + LASSERTF(page_count == 1 || + (ergo(i == 0, poff + pg->count == PAGE_CACHE_SIZE) && + ergo(i > 0 && i < page_count - 1, + poff == 0 && pg->count == PAGE_CACHE_SIZE) && + ergo(i == page_count - 1, poff == 0)), + "i: %d/%d pg: %p off: %llu, count: %u\n", + i, page_count, pg, pg->off, pg->count); + LASSERTF(i == 0 || pg->off > pg_prev->off, + "i %d p_c %u pg %p [pri %lu ind %lu] off %llu prev_pg %p [pri %lu ind %lu] off %llu\n", + i, page_count, + pg->pg, page_private(pg->pg), pg->pg->index, pg->off, + pg_prev->pg, page_private(pg_prev->pg), + pg_prev->pg->index, pg_prev->off); + LASSERT((pga[0]->flag & OBD_BRW_SRVLOCK) == + (pg->flag & OBD_BRW_SRVLOCK)); + + ptlrpc_prep_bulk_page_pin(desc, pg->pg, poff, pg->count); + requested_nob += pg->count; + + if (i > 0 && can_merge_pages(pg_prev, pg)) { + niobuf--; + niobuf->len += pg->count; + } else { + niobuf->offset = pg->off; + niobuf->len = pg->count; + niobuf->flags = pg->flag; + } + pg_prev = pg; + } + + LASSERTF((void *)(niobuf - niocount) == + req_capsule_client_get(&req->rq_pill, &RMF_NIOBUF_REMOTE), + "want %p - real %p\n", req_capsule_client_get(&req->rq_pill, + &RMF_NIOBUF_REMOTE), (void *)(niobuf - niocount)); + + osc_announce_cached(cli, &body->oa, opc == OST_WRITE ? 
requested_nob:0); + if (resend) { + if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) { + body->oa.o_valid |= OBD_MD_FLFLAGS; + body->oa.o_flags = 0; + } + body->oa.o_flags |= OBD_FL_RECOV_RESEND; + } + + if (osc_should_shrink_grant(cli)) + osc_shrink_grant_local(cli, &body->oa); + + /* size[REQ_REC_OFF] still sizeof (*body) */ + if (opc == OST_WRITE) { + if (cli->cl_checksum && + !sptlrpc_flavor_has_bulk(&req->rq_flvr)) { + /* store cl_cksum_type in a local variable since + * it can be changed via lprocfs */ + cksum_type_t cksum_type = cli->cl_cksum_type; + + if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) { + oa->o_flags &= OBD_FL_LOCAL_MASK; + body->oa.o_flags = 0; + } + body->oa.o_flags |= cksum_type_pack(cksum_type); + body->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS; + body->oa.o_cksum = osc_checksum_bulk(requested_nob, + page_count, pga, + OST_WRITE, + cksum_type); + CDEBUG(D_PAGE, "checksum at write origin: %x\n", + body->oa.o_cksum); + /* save this in 'oa', too, for later checking */ + oa->o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS; + oa->o_flags |= cksum_type_pack(cksum_type); + } else { + /* clear out the checksum flag, in case this is a + * resend but cl_checksum is no longer set. b=11238 */ + oa->o_valid &= ~OBD_MD_FLCKSUM; + } + oa->o_cksum = body->oa.o_cksum; + /* 1 RC per niobuf */ + req_capsule_set_size(pill, &RMF_RCS, RCL_SERVER, + sizeof(__u32) * niocount); + } else { + if (cli->cl_checksum && + !sptlrpc_flavor_has_bulk(&req->rq_flvr)) { + if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) + body->oa.o_flags = 0; + body->oa.o_flags |= cksum_type_pack(cli->cl_cksum_type); + body->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS; + } + } + ptlrpc_request_set_replen(req); + + CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + aa->aa_oa = oa; + aa->aa_requested_nob = requested_nob; + aa->aa_nio_count = niocount; + aa->aa_page_count = page_count; + aa->aa_resends = 0; + aa->aa_ppga = pga; + aa->aa_cli = cli; + INIT_LIST_HEAD(&aa->aa_oaps); + if (ocapa && reserve) + aa->aa_ocapa = capa_get(ocapa); + + *reqp = req; + return 0; + + out: + ptlrpc_req_finished(req); + return rc; +} + +static int check_write_checksum(struct obdo *oa, const lnet_process_id_t *peer, + __u32 client_cksum, __u32 server_cksum, int nob, + u32 page_count, struct brw_page **pga, + cksum_type_t client_cksum_type) +{ + __u32 new_cksum; + char *msg; + cksum_type_t cksum_type; + + if (server_cksum == client_cksum) { + CDEBUG(D_PAGE, "checksum %x confirmed\n", client_cksum); + return 0; + } + + cksum_type = cksum_type_unpack(oa->o_valid & OBD_MD_FLFLAGS ? + oa->o_flags : 0); + new_cksum = osc_checksum_bulk(nob, page_count, pga, OST_WRITE, + cksum_type); + + if (cksum_type != client_cksum_type) + msg = "the server did not use the checksum type specified in the original request - likely a protocol problem" + ; + else if (new_cksum == server_cksum) + msg = "changed on the client after we checksummed it - likely false positive due to mmap IO (bug 11742)" + ; + else if (new_cksum == client_cksum) + msg = "changed in transit before arrival at OST"; + else + msg = "changed in transit AND doesn't match the original - likely false positive due to mmap IO (bug 11742)" + ; + + LCONSOLE_ERROR_MSG(0x132, "BAD WRITE CHECKSUM: %s: from %s inode "DFID + " object "DOSTID" extent [%llu-%llu]\n", + msg, libcfs_nid2str(peer->nid), + oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : (__u64)0, + oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0, + oa->o_valid & OBD_MD_FLFID ? 
oa->o_parent_ver : 0, + POSTID(&oa->o_oi), pga[0]->off, + pga[page_count-1]->off + pga[page_count-1]->count - 1); + CERROR("original client csum %x (type %x), server csum %x (type %x), client csum now %x\n", + client_cksum, client_cksum_type, + server_cksum, cksum_type, new_cksum); + return 1; +} + +/* Note rc enters this function as number of bytes transferred */ +static int osc_brw_fini_request(struct ptlrpc_request *req, int rc) +{ + struct osc_brw_async_args *aa = (void *)&req->rq_async_args; + const lnet_process_id_t *peer = + &req->rq_import->imp_connection->c_peer; + struct client_obd *cli = aa->aa_cli; + struct ost_body *body; + __u32 client_cksum = 0; + + if (rc < 0 && rc != -EDQUOT) { + DEBUG_REQ(D_INFO, req, "Failed request with rc = %d\n", rc); + return rc; + } + + LASSERTF(req->rq_repmsg != NULL, "rc = %d\n", rc); + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) { + DEBUG_REQ(D_INFO, req, "Can't unpack body\n"); + return -EPROTO; + } + + /* set/clear over quota flag for a uid/gid */ + if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE && + body->oa.o_valid & (OBD_MD_FLUSRQUOTA | OBD_MD_FLGRPQUOTA)) { + unsigned int qid[MAXQUOTAS] = { body->oa.o_uid, body->oa.o_gid }; + + CDEBUG(D_QUOTA, "setdq for [%u %u] with valid %#llx, flags %x\n", + body->oa.o_uid, body->oa.o_gid, body->oa.o_valid, + body->oa.o_flags); + osc_quota_setdq(cli, qid, body->oa.o_valid, body->oa.o_flags); + } + + osc_update_grant(cli, body); + + if (rc < 0) + return rc; + + if (aa->aa_oa->o_valid & OBD_MD_FLCKSUM) + client_cksum = aa->aa_oa->o_cksum; /* save for later */ + + if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE) { + if (rc > 0) { + CERROR("Unexpected +ve rc %d\n", rc); + return -EPROTO; + } + LASSERT(req->rq_bulk->bd_nob == aa->aa_requested_nob); + + if (sptlrpc_cli_unwrap_bulk_write(req, req->rq_bulk)) + return -EAGAIN; + + if ((aa->aa_oa->o_valid & OBD_MD_FLCKSUM) && client_cksum && + check_write_checksum(&body->oa, peer, client_cksum, + body->oa.o_cksum, aa->aa_requested_nob, + aa->aa_page_count, aa->aa_ppga, + cksum_type_unpack(aa->aa_oa->o_flags))) + return -EAGAIN; + + rc = check_write_rcs(req, aa->aa_requested_nob, + aa->aa_nio_count, + aa->aa_page_count, aa->aa_ppga); + goto out; + } + + /* The rest of this function executes only for OST_READs */ + + /* if unwrap_bulk failed, return -EAGAIN to retry */ + rc = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, rc); + if (rc < 0) { + rc = -EAGAIN; + goto out; + } + + if (rc > aa->aa_requested_nob) { + CERROR("Unexpected rc %d (%d requested)\n", rc, + aa->aa_requested_nob); + return -EPROTO; + } + + if (rc != req->rq_bulk->bd_nob_transferred) { + CERROR ("Unexpected rc %d (%d transferred)\n", + rc, req->rq_bulk->bd_nob_transferred); + return -EPROTO; + } + + if (rc < aa->aa_requested_nob) + handle_short_read(rc, aa->aa_page_count, aa->aa_ppga); + + if (body->oa.o_valid & OBD_MD_FLCKSUM) { + static int cksum_counter; + __u32 server_cksum = body->oa.o_cksum; + char *via; + char *router; + cksum_type_t cksum_type; + + cksum_type = cksum_type_unpack(body->oa.o_valid &OBD_MD_FLFLAGS? 
+ body->oa.o_flags : 0); + client_cksum = osc_checksum_bulk(rc, aa->aa_page_count, + aa->aa_ppga, OST_READ, + cksum_type); + + if (peer->nid == req->rq_bulk->bd_sender) { + via = router = ""; + } else { + via = " via "; + router = libcfs_nid2str(req->rq_bulk->bd_sender); + } + + if (server_cksum != client_cksum) { + LCONSOLE_ERROR_MSG(0x133, "%s: BAD READ CHECKSUM: from %s%s%s inode " DFID " object " DOSTID " extent [%llu-%llu]\n", + req->rq_import->imp_obd->obd_name, + libcfs_nid2str(peer->nid), + via, router, + body->oa.o_valid & OBD_MD_FLFID ? + body->oa.o_parent_seq : (__u64)0, + body->oa.o_valid & OBD_MD_FLFID ? + body->oa.o_parent_oid : 0, + body->oa.o_valid & OBD_MD_FLFID ? + body->oa.o_parent_ver : 0, + POSTID(&body->oa.o_oi), + aa->aa_ppga[0]->off, + aa->aa_ppga[aa->aa_page_count-1]->off + + aa->aa_ppga[aa->aa_page_count-1]->count - + 1); + CERROR("client %x, server %x, cksum_type %x\n", + client_cksum, server_cksum, cksum_type); + cksum_counter = 0; + aa->aa_oa->o_cksum = client_cksum; + rc = -EAGAIN; + } else { + cksum_counter++; + CDEBUG(D_PAGE, "checksum %x confirmed\n", client_cksum); + rc = 0; + } + } else if (unlikely(client_cksum)) { + static int cksum_missed; + + cksum_missed++; + if ((cksum_missed & (-cksum_missed)) == cksum_missed) + CERROR("Checksum %u requested from %s but not sent\n", + cksum_missed, libcfs_nid2str(peer->nid)); + } else { + rc = 0; + } +out: + if (rc >= 0) + lustre_get_wire_obdo(&req->rq_import->imp_connect_data, + aa->aa_oa, &body->oa); + + return rc; +} + +static int osc_brw_redo_request(struct ptlrpc_request *request, + struct osc_brw_async_args *aa, int rc) +{ + struct ptlrpc_request *new_req; + struct osc_brw_async_args *new_aa; + struct osc_async_page *oap; + + DEBUG_REQ(rc == -EINPROGRESS ? D_RPCTRACE : D_ERROR, request, + "redo for recoverable error %d", rc); + + rc = osc_brw_prep_request(lustre_msg_get_opc(request->rq_reqmsg) == + OST_WRITE ? OBD_BRW_WRITE :OBD_BRW_READ, + aa->aa_cli, aa->aa_oa, + NULL /* lsm unused by osc currently */, + aa->aa_page_count, aa->aa_ppga, + &new_req, aa->aa_ocapa, 0, 1); + if (rc) + return rc; + + list_for_each_entry(oap, &aa->aa_oaps, oap_rpc_item) { + if (oap->oap_request != NULL) { + LASSERTF(request == oap->oap_request, + "request %p != oap_request %p\n", + request, oap->oap_request); + if (oap->oap_interrupted) { + ptlrpc_req_finished(new_req); + return -EINTR; + } + } + } + /* New request takes over pga and oaps from old request. + * Note that copying a list_head doesn't work, need to move it... 
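+ * (Editorial note: a plain struct assignment of a non-empty list_head would
+ * leave the first and last entries' prev/next pointers aimed at the old
+ * head, so the copy would not actually be linked into the list; hence the
+ * INIT_LIST_HEAD() + list_splice_init() pairs below.)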
*/ + aa->aa_resends++; + new_req->rq_interpret_reply = request->rq_interpret_reply; + new_req->rq_async_args = request->rq_async_args; + /* cap resend delay to the current request timeout, this is similar to + * what ptlrpc does (see after_reply()) */ + if (aa->aa_resends > new_req->rq_timeout) + new_req->rq_sent = get_seconds() + new_req->rq_timeout; + else + new_req->rq_sent = get_seconds() + aa->aa_resends; + new_req->rq_generation_set = 1; + new_req->rq_import_generation = request->rq_import_generation; + + new_aa = ptlrpc_req_async_args(new_req); + + INIT_LIST_HEAD(&new_aa->aa_oaps); + list_splice_init(&aa->aa_oaps, &new_aa->aa_oaps); + INIT_LIST_HEAD(&new_aa->aa_exts); + list_splice_init(&aa->aa_exts, &new_aa->aa_exts); + new_aa->aa_resends = aa->aa_resends; + + list_for_each_entry(oap, &new_aa->aa_oaps, oap_rpc_item) { + if (oap->oap_request) { + ptlrpc_req_finished(oap->oap_request); + oap->oap_request = ptlrpc_request_addref(new_req); + } + } + + new_aa->aa_ocapa = aa->aa_ocapa; + aa->aa_ocapa = NULL; + + /* XXX: This code will run into problem if we're going to support + * to add a series of BRW RPCs into a self-defined ptlrpc_request_set + * and wait for all of them to be finished. We should inherit request + * set from old request. */ + ptlrpcd_add_req(new_req, PDL_POLICY_SAME, -1); + + DEBUG_REQ(D_INFO, new_req, "new request"); + return 0; +} + +/* + * ugh, we want disk allocation on the target to happen in offset order. we'll + * follow sedgewicks advice and stick to the dead simple shellsort -- it'll do + * fine for our small page arrays and doesn't require allocation. its an + * insertion sort that swaps elements that are strides apart, shrinking the + * stride down until its '1' and the array is sorted. + */ +static void sort_brw_pages(struct brw_page **array, int num) +{ + int stride, i, j; + struct brw_page *tmp; + + if (num == 1) + return; + for (stride = 1; stride < num ; stride = (stride * 3) + 1) + ; + + do { + stride /= 3; + for (i = stride ; i < num ; i++) { + tmp = array[i]; + j = i; + while (j >= stride && array[j - stride]->off > tmp->off) { + array[j] = array[j - stride]; + j -= stride; + } + array[j] = tmp; + } + } while (stride > 1); +} + +static void osc_release_ppga(struct brw_page **ppga, u32 count) +{ + LASSERT(ppga != NULL); + OBD_FREE(ppga, sizeof(*ppga) * count); +} + +static int brw_interpret(const struct lu_env *env, + struct ptlrpc_request *req, void *data, int rc) +{ + struct osc_brw_async_args *aa = data; + struct osc_extent *ext; + struct osc_extent *tmp; + struct cl_object *obj = NULL; + struct client_obd *cli = aa->aa_cli; + + rc = osc_brw_fini_request(req, rc); + CDEBUG(D_INODE, "request %p aa %p rc %d\n", req, aa, rc); + /* When server return -EINPROGRESS, client should always retry + * regardless of the number of times the bulk was resent already. 
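+ * Editorial note (not part of the upstream code): an -EAGAIN, e.g. from a
+ * checksum mismatch, is by contrast only retried while
+ * client_should_resend() allows it, and a request that crossed an eviction
+ * (import generation change) is not redone at all; any remaining -EAGAIN or
+ * -EINPROGRESS is then converted to -EIO before the extents are finished.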
*/ + if (osc_recoverable_error(rc)) { + if (req->rq_import_generation != + req->rq_import->imp_generation) { + CDEBUG(D_HA, "%s: resend cross eviction for object: " DOSTID ", rc = %d.\n", + req->rq_import->imp_obd->obd_name, + POSTID(&aa->aa_oa->o_oi), rc); + } else if (rc == -EINPROGRESS || + client_should_resend(aa->aa_resends, aa->aa_cli)) { + rc = osc_brw_redo_request(req, aa, rc); + } else { + CERROR("%s: too many resent retries for object: %llu:%llu, rc = %d.\n", + req->rq_import->imp_obd->obd_name, + POSTID(&aa->aa_oa->o_oi), rc); + } + + if (rc == 0) + return 0; + else if (rc == -EAGAIN || rc == -EINPROGRESS) + rc = -EIO; + } + + if (aa->aa_ocapa) { + capa_put(aa->aa_ocapa); + aa->aa_ocapa = NULL; + } + + list_for_each_entry_safe(ext, tmp, &aa->aa_exts, oe_link) { + if (obj == NULL && rc == 0) { + obj = osc2cl(ext->oe_obj); + cl_object_get(obj); + } + + list_del_init(&ext->oe_link); + osc_extent_finish(env, ext, 1, rc); + } + LASSERT(list_empty(&aa->aa_exts)); + LASSERT(list_empty(&aa->aa_oaps)); + + if (obj != NULL) { + struct obdo *oa = aa->aa_oa; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + unsigned long valid = 0; + + LASSERT(rc == 0); + if (oa->o_valid & OBD_MD_FLBLOCKS) { + attr->cat_blocks = oa->o_blocks; + valid |= CAT_BLOCKS; + } + if (oa->o_valid & OBD_MD_FLMTIME) { + attr->cat_mtime = oa->o_mtime; + valid |= CAT_MTIME; + } + if (oa->o_valid & OBD_MD_FLATIME) { + attr->cat_atime = oa->o_atime; + valid |= CAT_ATIME; + } + if (oa->o_valid & OBD_MD_FLCTIME) { + attr->cat_ctime = oa->o_ctime; + valid |= CAT_CTIME; + } + if (valid != 0) { + cl_object_attr_lock(obj); + cl_object_attr_set(env, obj, attr, valid); + cl_object_attr_unlock(obj); + } + cl_object_put(env, obj); + } + OBDO_FREE(aa->aa_oa); + + cl_req_completion(env, aa->aa_clerq, rc < 0 ? rc : + req->rq_bulk->bd_nob_transferred); + osc_release_ppga(aa->aa_ppga, aa->aa_page_count); + ptlrpc_lprocfs_brw(req, req->rq_bulk->bd_nob_transferred); + + client_obd_list_lock(&cli->cl_loi_list_lock); + /* We need to decrement before osc_ap_completion->osc_wake_cache_waiters + * is called so we know whether to go to sync BRWs or wait for more + * RPCs to complete */ + if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE) + cli->cl_w_in_flight--; + else + cli->cl_r_in_flight--; + osc_wake_cache_waiters(cli); + client_obd_list_unlock(&cli->cl_loi_list_lock); + + osc_io_unplug(env, cli, NULL, PDL_POLICY_SAME); + return rc; +} + +/** + * Build an RPC by the list of extent @ext_list. The caller must ensure + * that the total pages in this list are NOT over max pages per RPC. + * Extents in the list must be in OES_RPC state. + */ +int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, + struct list_head *ext_list, int cmd, pdl_policy_t pol) +{ + struct ptlrpc_request *req = NULL; + struct osc_extent *ext; + struct brw_page **pga = NULL; + struct osc_brw_async_args *aa = NULL; + struct obdo *oa = NULL; + struct osc_async_page *oap; + struct osc_async_page *tmp; + struct cl_req *clerq = NULL; + enum cl_req_type crt = (cmd & OBD_BRW_WRITE) ? 
CRT_WRITE : + CRT_READ; + struct ldlm_lock *lock = NULL; + struct cl_req_attr *crattr = NULL; + u64 starting_offset = OBD_OBJECT_EOF; + u64 ending_offset = 0; + int mpflag = 0; + int mem_tight = 0; + int page_count = 0; + int i; + int rc; + struct ost_body *body; + LIST_HEAD(rpc_list); + + LASSERT(!list_empty(ext_list)); + + /* add pages into rpc_list to build BRW rpc */ + list_for_each_entry(ext, ext_list, oe_link) { + LASSERT(ext->oe_state == OES_RPC); + mem_tight |= ext->oe_memalloc; + list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) { + ++page_count; + list_add_tail(&oap->oap_rpc_item, &rpc_list); + if (starting_offset > oap->oap_obj_off) + starting_offset = oap->oap_obj_off; + else + LASSERT(oap->oap_page_off == 0); + if (ending_offset < oap->oap_obj_off + oap->oap_count) + ending_offset = oap->oap_obj_off + + oap->oap_count; + else + LASSERT(oap->oap_page_off + oap->oap_count == + PAGE_CACHE_SIZE); + } + } + + if (mem_tight) + mpflag = cfs_memory_pressure_get_and_set(); + + OBD_ALLOC(crattr, sizeof(*crattr)); + if (crattr == NULL) { + rc = -ENOMEM; + goto out; + } + + OBD_ALLOC(pga, sizeof(*pga) * page_count); + if (pga == NULL) { + rc = -ENOMEM; + goto out; + } + + OBDO_ALLOC(oa); + if (oa == NULL) { + rc = -ENOMEM; + goto out; + } + + i = 0; + list_for_each_entry(oap, &rpc_list, oap_rpc_item) { + struct cl_page *page = oap2cl_page(oap); + if (clerq == NULL) { + clerq = cl_req_alloc(env, page, crt, + 1 /* only 1-object rpcs for now */); + if (IS_ERR(clerq)) { + rc = PTR_ERR(clerq); + goto out; + } + lock = oap->oap_ldlm_lock; + } + if (mem_tight) + oap->oap_brw_flags |= OBD_BRW_MEMALLOC; + pga[i] = &oap->oap_brw_page; + pga[i]->off = oap->oap_obj_off + oap->oap_page_off; + CDEBUG(0, "put page %p index %lu oap %p flg %x to pga\n", + pga[i]->pg, page_index(oap->oap_page), oap, + pga[i]->flag); + i++; + cl_req_page_add(env, clerq, page); + } + + /* always get the data for the obdo for the rpc */ + LASSERT(clerq != NULL); + crattr->cra_oa = oa; + cl_req_attr_set(env, clerq, crattr, ~0ULL); + if (lock) { + oa->o_handle = lock->l_remote_handle; + oa->o_valid |= OBD_MD_FLHANDLE; + } + + rc = cl_req_prep(env, clerq); + if (rc != 0) { + CERROR("cl_req_prep failed: %d\n", rc); + goto out; + } + + sort_brw_pages(pga, page_count); + rc = osc_brw_prep_request(cmd, cli, oa, NULL, page_count, + pga, &req, crattr->cra_capa, 1, 0); + if (rc != 0) { + CERROR("prep_req failed: %d\n", rc); + goto out; + } + + req->rq_interpret_reply = brw_interpret; + + if (mem_tight != 0) + req->rq_memalloc = 1; + + /* Need to update the timestamps after the request is built in case + * we race with setattr (locally or in queue at OST). If OST gets + * later setattr before earlier BRW (as determined by the request xid), + * the OST will not use BRW timestamps. Sadly, there is no obvious + * way to do this in a single call. 
bug 10150 */ + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + crattr->cra_oa = &body->oa; + cl_req_attr_set(env, clerq, crattr, + OBD_MD_FLMTIME|OBD_MD_FLCTIME|OBD_MD_FLATIME); + + lustre_msg_set_jobid(req->rq_reqmsg, crattr->cra_jobid); + + CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + INIT_LIST_HEAD(&aa->aa_oaps); + list_splice_init(&rpc_list, &aa->aa_oaps); + INIT_LIST_HEAD(&aa->aa_exts); + list_splice_init(ext_list, &aa->aa_exts); + aa->aa_clerq = clerq; + + /* queued sync pages can be torn down while the pages + * were between the pending list and the rpc */ + tmp = NULL; + list_for_each_entry(oap, &aa->aa_oaps, oap_rpc_item) { + /* only one oap gets a request reference */ + if (tmp == NULL) + tmp = oap; + if (oap->oap_interrupted && !req->rq_intr) { + CDEBUG(D_INODE, "oap %p in req %p interrupted\n", + oap, req); + ptlrpc_mark_interrupted(req); + } + } + if (tmp != NULL) + tmp->oap_request = ptlrpc_request_addref(req); + + client_obd_list_lock(&cli->cl_loi_list_lock); + starting_offset >>= PAGE_CACHE_SHIFT; + if (cmd == OBD_BRW_READ) { + cli->cl_r_in_flight++; + lprocfs_oh_tally_log2(&cli->cl_read_page_hist, page_count); + lprocfs_oh_tally(&cli->cl_read_rpc_hist, cli->cl_r_in_flight); + lprocfs_oh_tally_log2(&cli->cl_read_offset_hist, + starting_offset + 1); + } else { + cli->cl_w_in_flight++; + lprocfs_oh_tally_log2(&cli->cl_write_page_hist, page_count); + lprocfs_oh_tally(&cli->cl_write_rpc_hist, cli->cl_w_in_flight); + lprocfs_oh_tally_log2(&cli->cl_write_offset_hist, + starting_offset + 1); + } + client_obd_list_unlock(&cli->cl_loi_list_lock); + + DEBUG_REQ(D_INODE, req, "%d pages, aa %p. now %dr/%dw in flight", + page_count, aa, cli->cl_r_in_flight, + cli->cl_w_in_flight); + + /* XXX: Maybe the caller can check the RPC bulk descriptor to + * see which CPU/NUMA node the majority of pages were allocated + * on, and try to assign the async RPC to the CPU core + * (PDL_POLICY_PREFERRED) to reduce cross-CPU memory traffic. + * + * But on the other hand, we expect that multiple ptlrpcd + * threads and the initial write sponsor can run in parallel, + * especially when data checksum is enabled, which is CPU-bound + * operation and single ptlrpcd thread cannot process in time. + * So more ptlrpcd threads sharing BRW load + * (with PDL_POLICY_ROUND) seems better. 
+ */ + ptlrpcd_add_req(req, pol, -1); + rc = 0; + +out: + if (mem_tight != 0) + cfs_memory_pressure_restore(mpflag); + + if (crattr != NULL) { + capa_put(crattr->cra_capa); + OBD_FREE(crattr, sizeof(*crattr)); + } + + if (rc != 0) { + LASSERT(req == NULL); + + if (oa) + OBDO_FREE(oa); + if (pga) + OBD_FREE(pga, sizeof(*pga) * page_count); + /* this should happen rarely and is pretty bad, it makes the + * pending list not follow the dirty order */ + while (!list_empty(ext_list)) { + ext = list_entry(ext_list->next, struct osc_extent, + oe_link); + list_del_init(&ext->oe_link); + osc_extent_finish(env, ext, 0, rc); + } + if (clerq && !IS_ERR(clerq)) + cl_req_completion(env, clerq, rc); + } + return rc; +} + +static int osc_set_lock_data_with_check(struct ldlm_lock *lock, + struct ldlm_enqueue_info *einfo) +{ + void *data = einfo->ei_cbdata; + int set = 0; + + LASSERT(lock != NULL); + LASSERT(lock->l_blocking_ast == einfo->ei_cb_bl); + LASSERT(lock->l_resource->lr_type == einfo->ei_type); + LASSERT(lock->l_completion_ast == einfo->ei_cb_cp); + LASSERT(lock->l_glimpse_ast == einfo->ei_cb_gl); + + lock_res_and_lock(lock); + spin_lock(&osc_ast_guard); + + if (lock->l_ast_data == NULL) + lock->l_ast_data = data; + if (lock->l_ast_data == data) + set = 1; + + spin_unlock(&osc_ast_guard); + unlock_res_and_lock(lock); + + return set; +} + +static int osc_set_data_with_check(struct lustre_handle *lockh, + struct ldlm_enqueue_info *einfo) +{ + struct ldlm_lock *lock = ldlm_handle2lock(lockh); + int set = 0; + + if (lock != NULL) { + set = osc_set_lock_data_with_check(lock, einfo); + LDLM_LOCK_PUT(lock); + } else + CERROR("lockh %p, data %p - client evicted?\n", + lockh, einfo->ei_cbdata); + return set; +} + +/* find any ldlm lock of the inode in osc + * return 0 not find + * 1 find one + * < 0 error */ +static int osc_find_cbdata(struct obd_export *exp, struct lov_stripe_md *lsm, + ldlm_iterator_t replace, void *data) +{ + struct ldlm_res_id res_id; + struct obd_device *obd = class_exp2obd(exp); + int rc = 0; + + ostid_build_res_name(&lsm->lsm_oi, &res_id); + rc = ldlm_resource_iterate(obd->obd_namespace, &res_id, replace, data); + if (rc == LDLM_ITER_STOP) + return 1; + if (rc == LDLM_ITER_CONTINUE) + return 0; + return rc; +} + +static int osc_enqueue_fini(struct ptlrpc_request *req, struct ost_lvb *lvb, + obd_enqueue_update_f upcall, void *cookie, + __u64 *flags, int agl, int rc) +{ + int intent = *flags & LDLM_FL_HAS_INTENT; + + if (intent) { + /* The request was created before ldlm_cli_enqueue call. */ + if (rc == ELDLM_LOCK_ABORTED) { + struct ldlm_reply *rep; + rep = req_capsule_server_get(&req->rq_pill, + &RMF_DLM_REP); + + LASSERT(rep != NULL); + rep->lock_policy_res1 = + ptlrpc_status_ntoh(rep->lock_policy_res1); + if (rep->lock_policy_res1) + rc = rep->lock_policy_res1; + } + } + + if ((intent != 0 && rc == ELDLM_LOCK_ABORTED && agl == 0) || + (rc == 0)) { + *flags |= LDLM_FL_LVB_READY; + CDEBUG(D_INODE, "got kms %llu blocks %llu mtime %llu\n", + lvb->lvb_size, lvb->lvb_blocks, lvb->lvb_mtime); + } + + /* Call the update callback. */ + rc = (*upcall)(cookie, rc); + return rc; +} + +static int osc_enqueue_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + struct osc_enqueue_args *aa, int rc) +{ + struct ldlm_lock *lock; + struct lustre_handle handle; + __u32 mode; + struct ost_lvb *lvb; + __u32 lvb_len; + __u64 *flags = aa->oa_flags; + + /* Make a local copy of a lock handle and a mode, because aa->oa_* + * might be freed anytime after lock upcall has been called. 
*/ + lustre_handle_copy(&handle, aa->oa_lockh); + mode = aa->oa_ei->ei_mode; + + /* ldlm_cli_enqueue is holding a reference on the lock, so it must + * be valid. */ + lock = ldlm_handle2lock(&handle); + + /* Take an additional reference so that a blocking AST that + * ldlm_cli_enqueue_fini() might post for a failed lock, is guaranteed + * to arrive after an upcall has been executed by + * osc_enqueue_fini(). */ + ldlm_lock_addref(&handle, mode); + + /* Let CP AST to grant the lock first. */ + OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_ENQ_RACE, 1); + + if (aa->oa_agl && rc == ELDLM_LOCK_ABORTED) { + lvb = NULL; + lvb_len = 0; + } else { + lvb = aa->oa_lvb; + lvb_len = sizeof(*aa->oa_lvb); + } + + /* Complete obtaining the lock procedure. */ + rc = ldlm_cli_enqueue_fini(aa->oa_exp, req, aa->oa_ei->ei_type, 1, + mode, flags, lvb, lvb_len, &handle, rc); + /* Complete osc stuff. */ + rc = osc_enqueue_fini(req, aa->oa_lvb, aa->oa_upcall, aa->oa_cookie, + flags, aa->oa_agl, rc); + + OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_CANCEL_RACE, 10); + + /* Release the lock for async request. */ + if (lustre_handle_is_used(&handle) && rc == ELDLM_OK) + /* + * Releases a reference taken by ldlm_cli_enqueue(), if it is + * not already released by + * ldlm_cli_enqueue_fini()->failed_lock_cleanup() + */ + ldlm_lock_decref(&handle, mode); + + LASSERTF(lock != NULL, "lockh %p, req %p, aa %p - client evicted?\n", + aa->oa_lockh, req, aa); + ldlm_lock_decref(&handle, mode); + LDLM_LOCK_PUT(lock); + return rc; +} + +struct ptlrpc_request_set *PTLRPCD_SET = (void *)1; + +/* When enqueuing asynchronously, locks are not ordered, we can obtain a lock + * from the 2nd OSC before a lock from the 1st one. This does not deadlock with + * other synchronous requests, however keeping some locks and trying to obtain + * others may take a considerable amount of time in a case of ost failure; and + * when other sync requests do not get released lock from a client, the client + * is excluded from the cluster -- such scenarious make the life difficult, so + * release locks just after they are obtained. */ +int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id, + __u64 *flags, ldlm_policy_data_t *policy, + struct ost_lvb *lvb, int kms_valid, + obd_enqueue_update_f upcall, void *cookie, + struct ldlm_enqueue_info *einfo, + struct lustre_handle *lockh, + struct ptlrpc_request_set *rqset, int async, int agl) +{ + struct obd_device *obd = exp->exp_obd; + struct ptlrpc_request *req = NULL; + int intent = *flags & LDLM_FL_HAS_INTENT; + __u64 match_lvb = (agl != 0 ? 0 : LDLM_FL_LVB_READY); + ldlm_mode_t mode; + int rc; + + /* Filesystem lock extents are extended to page boundaries so that + * dealing with the page cache is a little smoother. */ + policy->l_extent.start -= policy->l_extent.start & ~CFS_PAGE_MASK; + policy->l_extent.end |= ~CFS_PAGE_MASK; + + /* + * kms is not valid when either object is completely fresh (so that no + * locks are cached), or object was evicted. In the latter case cached + * lock cannot be used, because it would prime inode state with + * potentially stale LVB. + */ + if (!kms_valid) + goto no_match; + + /* Next, search for already existing extent locks that will cover us */ + /* If we're trying to read, we also search for an existing PW lock. The + * VFS and page cache already protect us locally, so lots of readers/ + * writers can share a single PW lock. + * + * There are problems with conversion deadlocks, so instead of + * converting a read lock to a write lock, we'll just enqueue a new + * one. 
+ * + * At some point we should cancel the read lock instead of making them + * send us a blocking callback, but there are problems with canceling + * locks out from other users right now, too. */ + mode = einfo->ei_mode; + if (einfo->ei_mode == LCK_PR) + mode |= LCK_PW; + mode = ldlm_lock_match(obd->obd_namespace, *flags | match_lvb, res_id, + einfo->ei_type, policy, mode, lockh, 0); + if (mode) { + struct ldlm_lock *matched = ldlm_handle2lock(lockh); + + if ((agl != 0) && !(matched->l_flags & LDLM_FL_LVB_READY)) { + /* For AGL, if enqueue RPC is sent but the lock is not + * granted, then skip to process this strpe. + * Return -ECANCELED to tell the caller. */ + ldlm_lock_decref(lockh, mode); + LDLM_LOCK_PUT(matched); + return -ECANCELED; + } else if (osc_set_lock_data_with_check(matched, einfo)) { + *flags |= LDLM_FL_LVB_READY; + /* addref the lock only if not async requests and PW + * lock is matched whereas we asked for PR. */ + if (!rqset && einfo->ei_mode != mode) + ldlm_lock_addref(lockh, LCK_PR); + if (intent) { + /* I would like to be able to ASSERT here that + * rss <= kms, but I can't, for reasons which + * are explained in lov_enqueue() */ + } + + /* We already have a lock, and it's referenced. + * + * At this point, the cl_lock::cll_state is CLS_QUEUING, + * AGL upcall may change it to CLS_HELD directly. */ + (*upcall)(cookie, ELDLM_OK); + + if (einfo->ei_mode != mode) + ldlm_lock_decref(lockh, LCK_PW); + else if (rqset) + /* For async requests, decref the lock. */ + ldlm_lock_decref(lockh, einfo->ei_mode); + LDLM_LOCK_PUT(matched); + return ELDLM_OK; + } else { + ldlm_lock_decref(lockh, mode); + LDLM_LOCK_PUT(matched); + } + } + + no_match: + if (intent) { + LIST_HEAD(cancels); + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_LDLM_ENQUEUE_LVB); + if (req == NULL) + return -ENOMEM; + + rc = ldlm_prep_enqueue_req(exp, req, &cancels, 0); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, + sizeof(*lvb)); + ptlrpc_request_set_replen(req); + } + + /* users of osc_enqueue() can pass this flag for ldlm_lock_match() */ + *flags &= ~LDLM_FL_BLOCK_GRANTED; + + rc = ldlm_cli_enqueue(exp, &req, einfo, res_id, policy, flags, lvb, + sizeof(*lvb), LVB_T_OST, lockh, async); + if (rqset) { + if (!rc) { + struct osc_enqueue_args *aa; + CLASSERT (sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + aa->oa_ei = einfo; + aa->oa_exp = exp; + aa->oa_flags = flags; + aa->oa_upcall = upcall; + aa->oa_cookie = cookie; + aa->oa_lvb = lvb; + aa->oa_lockh = lockh; + aa->oa_agl = !!agl; + + req->rq_interpret_reply = + (ptlrpc_interpterer_t)osc_enqueue_interpret; + if (rqset == PTLRPCD_SET) + ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1); + else + ptlrpc_set_add_req(rqset, req); + } else if (intent) { + ptlrpc_req_finished(req); + } + return rc; + } + + rc = osc_enqueue_fini(req, lvb, upcall, cookie, flags, agl, rc); + if (intent) + ptlrpc_req_finished(req); + + return rc; +} + +int osc_match_base(struct obd_export *exp, struct ldlm_res_id *res_id, + __u32 type, ldlm_policy_data_t *policy, __u32 mode, + __u64 *flags, void *data, struct lustre_handle *lockh, + int unref) +{ + struct obd_device *obd = exp->exp_obd; + __u64 lflags = *flags; + ldlm_mode_t rc; + + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_MATCH)) + return -EIO; + + /* Filesystem lock extents are extended to page boundaries so that + * dealing with the page cache is a little smoother */ + policy->l_extent.start -= policy->l_extent.start & 
~CFS_PAGE_MASK; + policy->l_extent.end |= ~CFS_PAGE_MASK; + + /* Next, search for already existing extent locks that will cover us */ + /* If we're trying to read, we also search for an existing PW lock. The + * VFS and page cache already protect us locally, so lots of readers/ + * writers can share a single PW lock. */ + rc = mode; + if (mode == LCK_PR) + rc |= LCK_PW; + rc = ldlm_lock_match(obd->obd_namespace, lflags, + res_id, type, policy, rc, lockh, unref); + if (rc) { + if (data != NULL) { + if (!osc_set_data_with_check(lockh, data)) { + if (!(lflags & LDLM_FL_TEST_LOCK)) + ldlm_lock_decref(lockh, rc); + return 0; + } + } + if (!(lflags & LDLM_FL_TEST_LOCK) && mode != rc) { + ldlm_lock_addref(lockh, LCK_PR); + ldlm_lock_decref(lockh, LCK_PW); + } + return rc; + } + return rc; +} + +int osc_cancel_base(struct lustre_handle *lockh, __u32 mode) +{ + if (unlikely(mode == LCK_GROUP)) + ldlm_lock_decref_and_cancel(lockh, mode); + else + ldlm_lock_decref(lockh, mode); + + return 0; +} + +static int osc_statfs_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + struct osc_async_args *aa, int rc) +{ + struct obd_statfs *msfs; + + if (rc == -EBADR) + /* The request has in fact never been sent + * due to issues at a higher level (LOV). + * Exit immediately since the caller is + * aware of the problem and takes care + * of the clean up */ + return rc; + + if ((rc == -ENOTCONN || rc == -EAGAIN) && + (aa->aa_oi->oi_flags & OBD_STATFS_NODELAY)) { + rc = 0; + goto out; + } + + if (rc != 0) + goto out; + + msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS); + if (msfs == NULL) { + rc = -EPROTO; + goto out; + } + + *aa->aa_oi->oi_osfs = *msfs; +out: + rc = aa->aa_oi->oi_cb_up(aa->aa_oi, rc); + return rc; +} + +static int osc_statfs_async(struct obd_export *exp, + struct obd_info *oinfo, __u64 max_age, + struct ptlrpc_request_set *rqset) +{ + struct obd_device *obd = class_exp2obd(exp); + struct ptlrpc_request *req; + struct osc_async_args *aa; + int rc; + + /* We could possibly pass max_age in the request (as an absolute + * timestamp or a "seconds.usec ago") so the target can avoid doing + * extra calls into the filesystem if that isn't necessary (e.g. + * during mount that would help a bit). Having relative timestamps + * is not so great if request processing is slow, while absolute + * timestamps are not ideal because they need time synchronization. 
*/ + req = ptlrpc_request_alloc(obd->u.cli.cl_import, &RQF_OST_STATFS); + if (req == NULL) + return -ENOMEM; + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_STATFS); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + ptlrpc_request_set_replen(req); + req->rq_request_portal = OST_CREATE_PORTAL; + ptlrpc_at_set_req_timeout(req); + + if (oinfo->oi_flags & OBD_STATFS_NODELAY) { + /* procfs requests not want stat in wait for avoid deadlock */ + req->rq_no_resend = 1; + req->rq_no_delay = 1; + } + + req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_statfs_interpret; + CLASSERT (sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + aa->aa_oi = oinfo; + + ptlrpc_set_add_req(rqset, req); + return 0; +} + +static int osc_statfs(const struct lu_env *env, struct obd_export *exp, + struct obd_statfs *osfs, __u64 max_age, __u32 flags) +{ + struct obd_device *obd = class_exp2obd(exp); + struct obd_statfs *msfs; + struct ptlrpc_request *req; + struct obd_import *imp = NULL; + int rc; + + /*Since the request might also come from lprocfs, so we need + *sync this with client_disconnect_export Bug15684*/ + down_read(&obd->u.cli.cl_sem); + if (obd->u.cli.cl_import) + imp = class_import_get(obd->u.cli.cl_import); + up_read(&obd->u.cli.cl_sem); + if (!imp) + return -ENODEV; + + /* We could possibly pass max_age in the request (as an absolute + * timestamp or a "seconds.usec ago") so the target can avoid doing + * extra calls into the filesystem if that isn't necessary (e.g. + * during mount that would help a bit). Having relative timestamps + * is not so great if request processing is slow, while absolute + * timestamps are not ideal because they need time synchronization. */ + req = ptlrpc_request_alloc(imp, &RQF_OST_STATFS); + + class_import_put(imp); + + if (req == NULL) + return -ENOMEM; + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_STATFS); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + ptlrpc_request_set_replen(req); + req->rq_request_portal = OST_CREATE_PORTAL; + ptlrpc_at_set_req_timeout(req); + + if (flags & OBD_STATFS_NODELAY) { + /* procfs requests not want stat in wait for avoid deadlock */ + req->rq_no_resend = 1; + req->rq_no_delay = 1; + } + + rc = ptlrpc_queue_wait(req); + if (rc) + goto out; + + msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS); + if (msfs == NULL) { + rc = -EPROTO; + goto out; + } + + *osfs = *msfs; + + out: + ptlrpc_req_finished(req); + return rc; +} + +/* Retrieve object striping information. + * + * @lmmu is a pointer to an in-core struct with lmm_ost_count indicating + * the maximum number of OST indices which will fit in the user buffer. + * lmm_magic must be LOV_MAGIC (we only use 1 slot here). 
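+ * The OSC layer always describes a single object, so the copied-out
+ * lmm reports exactly one stripe.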
+ */ +static int osc_getstripe(struct lov_stripe_md *lsm, struct lov_user_md *lump) +{ + /* we use lov_user_md_v3 because it is larger than lov_user_md_v1 */ + struct lov_user_md_v3 lum, *lumk; + struct lov_user_ost_data_v1 *lmm_objects; + int rc = 0, lum_size; + + if (!lsm) + return -ENODATA; + + /* we only need the header part from user space to get lmm_magic and + * lmm_stripe_count, (the header part is common to v1 and v3) */ + lum_size = sizeof(struct lov_user_md_v1); + if (copy_from_user(&lum, lump, lum_size)) + return -EFAULT; + + if ((lum.lmm_magic != LOV_USER_MAGIC_V1) && + (lum.lmm_magic != LOV_USER_MAGIC_V3)) + return -EINVAL; + + /* lov_user_md_vX and lov_mds_md_vX must have the same size */ + LASSERT(sizeof(struct lov_user_md_v1) == sizeof(struct lov_mds_md_v1)); + LASSERT(sizeof(struct lov_user_md_v3) == sizeof(struct lov_mds_md_v3)); + LASSERT(sizeof(lum.lmm_objects[0]) == sizeof(lumk->lmm_objects[0])); + + /* we can use lov_mds_md_size() to compute lum_size + * because lov_user_md_vX and lov_mds_md_vX have the same size */ + if (lum.lmm_stripe_count > 0) { + lum_size = lov_mds_md_size(lum.lmm_stripe_count, lum.lmm_magic); + OBD_ALLOC(lumk, lum_size); + if (!lumk) + return -ENOMEM; + + if (lum.lmm_magic == LOV_USER_MAGIC_V1) + lmm_objects = + &(((struct lov_user_md_v1 *)lumk)->lmm_objects[0]); + else + lmm_objects = &(lumk->lmm_objects[0]); + lmm_objects->l_ost_oi = lsm->lsm_oi; + } else { + lum_size = lov_mds_md_size(0, lum.lmm_magic); + lumk = &lum; + } + + lumk->lmm_oi = lsm->lsm_oi; + lumk->lmm_stripe_count = 1; + + if (copy_to_user(lump, lumk, lum_size)) + rc = -EFAULT; + + if (lumk != &lum) + OBD_FREE(lumk, lum_size); + + return rc; +} + + +static int osc_iocontrol(unsigned int cmd, struct obd_export *exp, int len, + void *karg, void *uarg) +{ + struct obd_device *obd = exp->exp_obd; + struct obd_ioctl_data *data = karg; + int err = 0; + + if (!try_module_get(THIS_MODULE)) { + CERROR("Can't get module. 
Is it alive?"); + return -EINVAL; + } + switch (cmd) { + case OBD_IOC_LOV_GET_CONFIG: { + char *buf; + struct lov_desc *desc; + struct obd_uuid uuid; + + buf = NULL; + len = 0; + if (obd_ioctl_getdata(&buf, &len, (void *)uarg)) { + err = -EINVAL; + goto out; + } + + data = (struct obd_ioctl_data *)buf; + + if (sizeof(*desc) > data->ioc_inllen1) { + obd_ioctl_freedata(buf, len); + err = -EINVAL; + goto out; + } + + if (data->ioc_inllen2 < sizeof(uuid)) { + obd_ioctl_freedata(buf, len); + err = -EINVAL; + goto out; + } + + desc = (struct lov_desc *)data->ioc_inlbuf1; + desc->ld_tgt_count = 1; + desc->ld_active_tgt_count = 1; + desc->ld_default_stripe_count = 1; + desc->ld_default_stripe_size = 0; + desc->ld_default_stripe_offset = 0; + desc->ld_pattern = 0; + memcpy(&desc->ld_uuid, &obd->obd_uuid, sizeof(uuid)); + + memcpy(data->ioc_inlbuf2, &obd->obd_uuid, sizeof(uuid)); + + err = copy_to_user((void *)uarg, buf, len); + if (err) + err = -EFAULT; + obd_ioctl_freedata(buf, len); + goto out; + } + case LL_IOC_LOV_SETSTRIPE: + err = obd_alloc_memmd(exp, karg); + if (err > 0) + err = 0; + goto out; + case LL_IOC_LOV_GETSTRIPE: + err = osc_getstripe(karg, uarg); + goto out; + case OBD_IOC_CLIENT_RECOVER: + err = ptlrpc_recover_import(obd->u.cli.cl_import, + data->ioc_inlbuf1, 0); + if (err > 0) + err = 0; + goto out; + case IOC_OSC_SET_ACTIVE: + err = ptlrpc_set_import_active(obd->u.cli.cl_import, + data->ioc_offset); + goto out; + case OBD_IOC_POLL_QUOTACHECK: + err = osc_quota_poll_check(exp, (struct if_quotacheck *)karg); + goto out; + case OBD_IOC_PING_TARGET: + err = ptlrpc_obd_ping(obd); + goto out; + default: + CDEBUG(D_INODE, "unrecognised ioctl %#x by %s\n", + cmd, current_comm()); + err = -ENOTTY; + goto out; + } +out: + module_put(THIS_MODULE); + return err; +} + +static int osc_get_info(const struct lu_env *env, struct obd_export *exp, + u32 keylen, void *key, __u32 *vallen, void *val, + struct lov_stripe_md *lsm) +{ + if (!vallen || !val) + return -EFAULT; + + if (KEY_IS(KEY_LOCK_TO_STRIPE)) { + __u32 *stripe = val; + *vallen = sizeof(*stripe); + *stripe = 0; + return 0; + } else if (KEY_IS(KEY_LAST_ID)) { + struct ptlrpc_request *req; + u64 *reply; + char *tmp; + int rc; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_OST_GET_INFO_LAST_ID); + if (req == NULL) + return -ENOMEM; + + req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_KEY, + RCL_CLIENT, keylen); + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GET_INFO); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + + tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY); + memcpy(tmp, key, keylen); + + req->rq_no_delay = req->rq_no_resend = 1; + ptlrpc_request_set_replen(req); + rc = ptlrpc_queue_wait(req); + if (rc) + goto out; + + reply = req_capsule_server_get(&req->rq_pill, &RMF_OBD_ID); + if (reply == NULL) { + rc = -EPROTO; + goto out; + } + + *((u64 *)val) = *reply; + out: + ptlrpc_req_finished(req); + return rc; + } else if (KEY_IS(KEY_FIEMAP)) { + struct ll_fiemap_info_key *fm_key = + (struct ll_fiemap_info_key *)key; + struct ldlm_res_id res_id; + ldlm_policy_data_t policy; + struct lustre_handle lockh; + ldlm_mode_t mode = 0; + struct ptlrpc_request *req; + struct ll_user_fiemap *reply; + char *tmp; + int rc; + + if (!(fm_key->fiemap.fm_flags & FIEMAP_FLAG_SYNC)) + goto skip_locking; + + policy.l_extent.start = fm_key->fiemap.fm_start & + CFS_PAGE_MASK; + + if (OBD_OBJECT_EOF - fm_key->fiemap.fm_length <= + fm_key->fiemap.fm_start + PAGE_CACHE_SIZE - 1) + policy.l_extent.end = 
OBD_OBJECT_EOF; + else + policy.l_extent.end = (fm_key->fiemap.fm_start + + fm_key->fiemap.fm_length + + PAGE_CACHE_SIZE - 1) & CFS_PAGE_MASK; + + ostid_build_res_name(&fm_key->oa.o_oi, &res_id); + mode = ldlm_lock_match(exp->exp_obd->obd_namespace, + LDLM_FL_BLOCK_GRANTED | + LDLM_FL_LVB_READY, + &res_id, LDLM_EXTENT, &policy, + LCK_PR | LCK_PW, &lockh, 0); + if (mode) { /* lock is cached on client */ + if (mode != LCK_PR) { + ldlm_lock_addref(&lockh, LCK_PR); + ldlm_lock_decref(&lockh, LCK_PW); + } + } else { /* no cached lock, needs acquire lock on server side */ + fm_key->oa.o_valid |= OBD_MD_FLFLAGS; + fm_key->oa.o_flags |= OBD_FL_SRVLOCK; + } + +skip_locking: + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_OST_GET_INFO_FIEMAP); + if (req == NULL) { + rc = -ENOMEM; + goto drop_lock; + } + + req_capsule_set_size(&req->rq_pill, &RMF_FIEMAP_KEY, + RCL_CLIENT, keylen); + req_capsule_set_size(&req->rq_pill, &RMF_FIEMAP_VAL, + RCL_CLIENT, *vallen); + req_capsule_set_size(&req->rq_pill, &RMF_FIEMAP_VAL, + RCL_SERVER, *vallen); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GET_INFO); + if (rc) { + ptlrpc_request_free(req); + goto drop_lock; + } + + tmp = req_capsule_client_get(&req->rq_pill, &RMF_FIEMAP_KEY); + memcpy(tmp, key, keylen); + tmp = req_capsule_client_get(&req->rq_pill, &RMF_FIEMAP_VAL); + memcpy(tmp, val, *vallen); + + ptlrpc_request_set_replen(req); + rc = ptlrpc_queue_wait(req); + if (rc) + goto fini_req; + + reply = req_capsule_server_get(&req->rq_pill, &RMF_FIEMAP_VAL); + if (reply == NULL) { + rc = -EPROTO; + goto fini_req; + } + + memcpy(val, reply, *vallen); +fini_req: + ptlrpc_req_finished(req); +drop_lock: + if (mode) + ldlm_lock_decref(&lockh, LCK_PR); + return rc; + } + + return -EINVAL; +} + +static int osc_set_info_async(const struct lu_env *env, struct obd_export *exp, + u32 keylen, void *key, u32 vallen, + void *val, struct ptlrpc_request_set *set) +{ + struct ptlrpc_request *req; + struct obd_device *obd = exp->exp_obd; + struct obd_import *imp = class_exp2cliimp(exp); + char *tmp; + int rc; + + OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_SHUTDOWN, 10); + + if (KEY_IS(KEY_CHECKSUM)) { + if (vallen != sizeof(int)) + return -EINVAL; + exp->exp_obd->u.cli.cl_checksum = (*(int *)val) ? 1 : 0; + return 0; + } + + if (KEY_IS(KEY_SPTLRPC_CONF)) { + sptlrpc_conf_client_adapt(obd); + return 0; + } + + if (KEY_IS(KEY_FLUSH_CTX)) { + sptlrpc_import_flush_my_ctx(imp); + return 0; + } + + if (KEY_IS(KEY_CACHE_SET)) { + struct client_obd *cli = &obd->u.cli; + + LASSERT(cli->cl_cache == NULL); /* only once */ + cli->cl_cache = (struct cl_client_cache *)val; + atomic_inc(&cli->cl_cache->ccc_users); + cli->cl_lru_left = &cli->cl_cache->ccc_lru_left; + + /* add this osc into entity list */ + LASSERT(list_empty(&cli->cl_lru_osc)); + spin_lock(&cli->cl_cache->ccc_lru_lock); + list_add(&cli->cl_lru_osc, &cli->cl_cache->ccc_lru); + spin_unlock(&cli->cl_cache->ccc_lru_lock); + + return 0; + } + + if (KEY_IS(KEY_CACHE_LRU_SHRINK)) { + struct client_obd *cli = &obd->u.cli; + int nr = atomic_read(&cli->cl_lru_in_list) >> 1; + int target = *(int *)val; + + nr = osc_lru_shrink(cli, min(nr, target)); + *(int *)val -= nr; + return 0; + } + + if (!set && !KEY_IS(KEY_GRANT_SHRINK)) + return -EINVAL; + + /* We pass all other commands directly to OST. Since nobody calls osc + methods directly and everybody is supposed to go through LOV, we + assume lov checked invalid values for us. + The only recognised values so far are evict_by_nid and mds_conn. 
+ Even if something bad goes through, we'd get a -EINVAL from OST + anyway. */ + + req = ptlrpc_request_alloc(imp, KEY_IS(KEY_GRANT_SHRINK) ? + &RQF_OST_SET_GRANT_INFO : + &RQF_OBD_SET_INFO); + if (req == NULL) + return -ENOMEM; + + req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_KEY, + RCL_CLIENT, keylen); + if (!KEY_IS(KEY_GRANT_SHRINK)) + req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_VAL, + RCL_CLIENT, vallen); + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SET_INFO); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + + tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY); + memcpy(tmp, key, keylen); + tmp = req_capsule_client_get(&req->rq_pill, KEY_IS(KEY_GRANT_SHRINK) ? + &RMF_OST_BODY : + &RMF_SETINFO_VAL); + memcpy(tmp, val, vallen); + + if (KEY_IS(KEY_GRANT_SHRINK)) { + struct osc_brw_async_args *aa; + struct obdo *oa; + + CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + OBDO_ALLOC(oa); + if (!oa) { + ptlrpc_req_finished(req); + return -ENOMEM; + } + *oa = ((struct ost_body *)val)->oa; + aa->aa_oa = oa; + req->rq_interpret_reply = osc_shrink_grant_interpret; + } + + ptlrpc_request_set_replen(req); + if (!KEY_IS(KEY_GRANT_SHRINK)) { + LASSERT(set != NULL); + ptlrpc_set_add_req(set, req); + ptlrpc_check_set(NULL, set); + } else + ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1); + + return 0; +} + +static int osc_reconnect(const struct lu_env *env, + struct obd_export *exp, struct obd_device *obd, + struct obd_uuid *cluuid, + struct obd_connect_data *data, + void *localdata) +{ + struct client_obd *cli = &obd->u.cli; + + if (data != NULL && (data->ocd_connect_flags & OBD_CONNECT_GRANT)) { + long lost_grant; + + client_obd_list_lock(&cli->cl_loi_list_lock); + data->ocd_grant = (cli->cl_avail_grant + cli->cl_dirty) ?: + 2 * cli_brw_size(obd); + lost_grant = cli->cl_lost_grant; + cli->cl_lost_grant = 0; + client_obd_list_unlock(&cli->cl_loi_list_lock); + + CDEBUG(D_RPCTRACE, "ocd_connect_flags: %#llx ocd_version: %d ocd_grant: %d, lost: %ld.\n", + data->ocd_connect_flags, + data->ocd_version, data->ocd_grant, lost_grant); + } + + return 0; +} + +static int osc_disconnect(struct obd_export *exp) +{ + struct obd_device *obd = class_exp2obd(exp); + int rc; + + rc = client_disconnect_export(exp); + /** + * Initially we put del_shrink_grant before disconnect_export, but it + * causes the following problem if setup (connect) and cleanup + * (disconnect) are tangled together. + * connect p1 disconnect p2 + * ptlrpc_connect_import + * ............... class_manual_cleanup + * osc_disconnect + * del_shrink_grant + * ptlrpc_connect_interrupt + * init_grant_shrink + * add this client to shrink list + * cleanup_osc + * Bang! pinger trigger the shrink. + * So the osc should be disconnected from the shrink list, after we + * are sure the import has been destroyed. 
BUG18662 + */ + if (obd->u.cli.cl_import == NULL) + osc_del_shrink_grant(&obd->u.cli); + return rc; +} + +static int osc_import_event(struct obd_device *obd, + struct obd_import *imp, + enum obd_import_event event) +{ + struct client_obd *cli; + int rc = 0; + + LASSERT(imp->imp_obd == obd); + + switch (event) { + case IMP_EVENT_DISCON: { + cli = &obd->u.cli; + client_obd_list_lock(&cli->cl_loi_list_lock); + cli->cl_avail_grant = 0; + cli->cl_lost_grant = 0; + client_obd_list_unlock(&cli->cl_loi_list_lock); + break; + } + case IMP_EVENT_INACTIVE: { + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_INACTIVE, NULL); + break; + } + case IMP_EVENT_INVALIDATE: { + struct ldlm_namespace *ns = obd->obd_namespace; + struct lu_env *env; + int refcheck; + + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + /* Reset grants */ + cli = &obd->u.cli; + /* all pages go to failing rpcs due to the invalid + * import */ + osc_io_unplug(env, cli, NULL, PDL_POLICY_ROUND); + + ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY); + cl_env_put(env, &refcheck); + } else + rc = PTR_ERR(env); + break; + } + case IMP_EVENT_ACTIVE: { + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVE, NULL); + break; + } + case IMP_EVENT_OCD: { + struct obd_connect_data *ocd = &imp->imp_connect_data; + + if (ocd->ocd_connect_flags & OBD_CONNECT_GRANT) + osc_init_grant(&obd->u.cli, ocd); + + /* See bug 7198 */ + if (ocd->ocd_connect_flags & OBD_CONNECT_REQPORTAL) + imp->imp_client->cli_request_portal =OST_REQUEST_PORTAL; + + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_OCD, NULL); + break; + } + case IMP_EVENT_DEACTIVATE: { + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_DEACTIVATE, NULL); + break; + } + case IMP_EVENT_ACTIVATE: { + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVATE, NULL); + break; + } + default: + CERROR("Unknown import event %d\n", event); + LBUG(); + } + return rc; +} + +/** + * Determine whether the lock can be canceled before replaying the lock + * during recovery, see bug16774 for detailed information. + * + * \retval zero the lock can't be canceled + * \retval other ok to cancel + */ +static int osc_cancel_for_recovery(struct ldlm_lock *lock) +{ + check_res_locked(lock->l_resource); + + /* + * Cancel all unused extent lock in granted mode LCK_PR or LCK_CR. + * + * XXX as a future improvement, we can also cancel unused write lock + * if it doesn't have dirty data and active mmaps. 
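+ * (A lock counts as "unused" here when osc_dlm_lock_pageref() reports
+ * no pages still referencing it.)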
+ */ + if (lock->l_resource->lr_type == LDLM_EXTENT && + (lock->l_granted_mode == LCK_PR || + lock->l_granted_mode == LCK_CR) && + (osc_dlm_lock_pageref(lock) == 0)) + return 1; + + return 0; +} + +static int brw_queue_work(const struct lu_env *env, void *data) +{ + struct client_obd *cli = data; + + CDEBUG(D_CACHE, "Run writeback work for client obd %p.\n", cli); + + osc_io_unplug(env, cli, NULL, PDL_POLICY_SAME); + return 0; +} + +int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct lprocfs_static_vars lvars = { NULL }; + struct client_obd *cli = &obd->u.cli; + void *handler; + int rc; + + rc = ptlrpcd_addref(); + if (rc) + return rc; + + rc = client_obd_setup(obd, lcfg); + if (rc) + goto out_ptlrpcd; + + handler = ptlrpcd_alloc_work(cli->cl_import, brw_queue_work, cli); + if (IS_ERR(handler)) { + rc = PTR_ERR(handler); + goto out_client_setup; + } + cli->cl_writeback_work = handler; + + rc = osc_quota_setup(obd); + if (rc) + goto out_ptlrpcd_work; + + cli->cl_grant_shrink_interval = GRANT_SHRINK_INTERVAL; + lprocfs_osc_init_vars(&lvars); + if (lprocfs_obd_setup(obd, lvars.obd_vars) == 0) { + lproc_osc_attach_seqstat(obd); + sptlrpc_lprocfs_cliobd_attach(obd); + ptlrpc_lprocfs_register_obd(obd); + } + + /* We need to allocate a few requests more, because + * brw_interpret tries to create new requests before freeing + * previous ones, Ideally we want to have 2x max_rpcs_in_flight + * reserved, but I'm afraid that might be too much wasted RAM + * in fact, so 2 is just my guess and still should work. */ + cli->cl_import->imp_rq_pool = + ptlrpc_init_rq_pool(cli->cl_max_rpcs_in_flight + 2, + OST_MAXREQSIZE, + ptlrpc_add_rqs_to_pool); + + INIT_LIST_HEAD(&cli->cl_grant_shrink_list); + ns_register_cancel(obd->obd_namespace, osc_cancel_for_recovery); + return rc; + +out_ptlrpcd_work: + ptlrpcd_destroy_work(handler); +out_client_setup: + client_obd_cleanup(obd); +out_ptlrpcd: + ptlrpcd_decref(); + return rc; +} + +static int osc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage) +{ + switch (stage) { + case OBD_CLEANUP_EARLY: { + struct obd_import *imp; + imp = obd->u.cli.cl_import; + CDEBUG(D_HA, "Deactivating import %s\n", obd->obd_name); + /* ptlrpc_abort_inflight to stop an mds_lov_synchronize */ + ptlrpc_deactivate_import(imp); + spin_lock(&imp->imp_lock); + imp->imp_pingable = 0; + spin_unlock(&imp->imp_lock); + break; + } + case OBD_CLEANUP_EXPORTS: { + struct client_obd *cli = &obd->u.cli; + /* LU-464 + * for echo client, export may be on zombie list, wait for + * zombie thread to cull it, because cli.cl_import will be + * cleared in client_disconnect_export(): + * class_export_destroy() -> obd_cleanup() -> + * echo_device_free() -> echo_client_cleanup() -> + * obd_disconnect() -> osc_disconnect() -> + * client_disconnect_export() + */ + obd_zombie_barrier(); + if (cli->cl_writeback_work) { + ptlrpcd_destroy_work(cli->cl_writeback_work); + cli->cl_writeback_work = NULL; + } + obd_cleanup_client_import(obd); + ptlrpc_lprocfs_unregister_obd(obd); + lprocfs_obd_cleanup(obd); + break; + } + } + return 0; +} + +int osc_cleanup(struct obd_device *obd) +{ + struct client_obd *cli = &obd->u.cli; + int rc; + + /* lru cleanup */ + if (cli->cl_cache != NULL) { + LASSERT(atomic_read(&cli->cl_cache->ccc_users) > 0); + spin_lock(&cli->cl_cache->ccc_lru_lock); + list_del_init(&cli->cl_lru_osc); + spin_unlock(&cli->cl_cache->ccc_lru_lock); + cli->cl_lru_left = NULL; + atomic_dec(&cli->cl_cache->ccc_users); + cli->cl_cache = NULL; + } + + /* free memory of osc quota 
cache */ + osc_quota_cleanup(obd); + + rc = client_obd_cleanup(obd); + + ptlrpcd_decref(); + return rc; +} + +int osc_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct lprocfs_static_vars lvars = { NULL }; + int rc = 0; + + lprocfs_osc_init_vars(&lvars); + + switch (lcfg->lcfg_command) { + default: + rc = class_process_proc_param(PARAM_OSC, lvars.obd_vars, + lcfg, obd); + if (rc > 0) + rc = 0; + break; + } + + return rc; +} + +static int osc_process_config(struct obd_device *obd, u32 len, void *buf) +{ + return osc_process_config_base(obd, buf); +} + +struct obd_ops osc_obd_ops = { + .o_owner = THIS_MODULE, + .o_setup = osc_setup, + .o_precleanup = osc_precleanup, + .o_cleanup = osc_cleanup, + .o_add_conn = client_import_add_conn, + .o_del_conn = client_import_del_conn, + .o_connect = client_connect_import, + .o_reconnect = osc_reconnect, + .o_disconnect = osc_disconnect, + .o_statfs = osc_statfs, + .o_statfs_async = osc_statfs_async, + .o_packmd = osc_packmd, + .o_unpackmd = osc_unpackmd, + .o_create = osc_create, + .o_destroy = osc_destroy, + .o_getattr = osc_getattr, + .o_getattr_async = osc_getattr_async, + .o_setattr = osc_setattr, + .o_setattr_async = osc_setattr_async, + .o_find_cbdata = osc_find_cbdata, + .o_iocontrol = osc_iocontrol, + .o_get_info = osc_get_info, + .o_set_info_async = osc_set_info_async, + .o_import_event = osc_import_event, + .o_process_config = osc_process_config, + .o_quotactl = osc_quotactl, + .o_quotacheck = osc_quotacheck, +}; + +extern struct lu_kmem_descr osc_caches[]; +extern spinlock_t osc_ast_guard; +extern struct lock_class_key osc_ast_guard_class; + +static int __init osc_init(void) +{ + struct lprocfs_static_vars lvars = { NULL }; + int rc; + + /* print an address of _any_ initialized kernel symbol from this + * module, to allow debugging with gdb that doesn't support data + * symbols from modules.*/ + CDEBUG(D_INFO, "Lustre OSC module (%p).\n", &osc_caches); + + rc = lu_kmem_init(osc_caches); + if (rc) + return rc; + + lprocfs_osc_init_vars(&lvars); + + rc = class_register_type(&osc_obd_ops, NULL, lvars.module_vars, + LUSTRE_OSC_NAME, &osc_device_type); + if (rc) { + lu_kmem_fini(osc_caches); + return rc; + } + + spin_lock_init(&osc_ast_guard); + lockdep_set_class(&osc_ast_guard, &osc_ast_guard_class); + + return rc; +} + +static void /*__exit*/ osc_exit(void) +{ + class_unregister_type(LUSTRE_OSC_NAME); + lu_kmem_fini(osc_caches); +} + +MODULE_AUTHOR("Sun Microsystems, Inc. 
"); +MODULE_DESCRIPTION("Lustre Object Storage Client (OSC)"); +MODULE_LICENSE("GPL"); +MODULE_VERSION(LUSTRE_VERSION_STRING); + +module_init(osc_init); +module_exit(osc_exit); diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/Makefile b/kernel/drivers/staging/lustre/lustre/ptlrpc/Makefile new file mode 100644 index 000000000..fb50cd4c6 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/Makefile @@ -0,0 +1,20 @@ +obj-$(CONFIG_LUSTRE_FS) += ptlrpc.o +LDLM := ../../lustre/ldlm/ + +ldlm_objs := $(LDLM)l_lock.o $(LDLM)ldlm_lock.o +ldlm_objs += $(LDLM)ldlm_resource.o $(LDLM)ldlm_lib.o +ldlm_objs += $(LDLM)ldlm_plain.o $(LDLM)ldlm_extent.o +ldlm_objs += $(LDLM)ldlm_request.o $(LDLM)ldlm_lockd.o +ldlm_objs += $(LDLM)ldlm_flock.o $(LDLM)ldlm_inodebits.o +ldlm_objs += $(LDLM)ldlm_pool.o +ldlm_objs += $(LDLM)interval_tree.o +ptlrpc_objs := client.o recover.o connection.o niobuf.o pack_generic.o +ptlrpc_objs += events.o ptlrpc_module.o service.o pinger.o +ptlrpc_objs += llog_net.o llog_client.o import.o ptlrpcd.o +ptlrpc_objs += pers.o lproc_ptlrpc.o wiretest.o layout.o +ptlrpc_objs += sec.o sec_bulk.o sec_gc.o sec_config.o +ptlrpc_objs += sec_null.o sec_plain.o nrs.o nrs_fifo.o + +ptlrpc-y := $(ldlm_objs) $(ptlrpc_objs) +ptlrpc-$(CONFIG_PROC_FS) += sec_lproc.o +ptlrpc-$(CONFIG_LUSTRE_TRANSLATE_ERRNOS) += errno.o diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/client.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/client.c new file mode 100644 index 000000000..0357f1d45 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/client.c @@ -0,0 +1,3149 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +/** Implementation of client-side PortalRPC interfaces */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include "../include/obd_support.h" +#include "../include/obd_class.h" +#include "../include/lustre_lib.h" +#include "../include/lustre_ha.h" +#include "../include/lustre_import.h" +#include "../include/lustre_req_layout.h" + +#include "ptlrpc_internal.h" + +static int ptlrpc_send_new_req(struct ptlrpc_request *req); +static int ptlrpcd_check_work(struct ptlrpc_request *req); + +/** + * Initialize passed in client structure \a cl. 
+ */ +void ptlrpc_init_client(int req_portal, int rep_portal, char *name, + struct ptlrpc_client *cl) +{ + cl->cli_request_portal = req_portal; + cl->cli_reply_portal = rep_portal; + cl->cli_name = name; +} +EXPORT_SYMBOL(ptlrpc_init_client); + +/** + * Return PortalRPC connection for remote uud \a uuid + */ +struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid) +{ + struct ptlrpc_connection *c; + lnet_nid_t self; + lnet_process_id_t peer; + int err; + + /* ptlrpc_uuid_to_peer() initializes its 2nd parameter + * before accessing its values. */ + /* coverity[uninit_use_in_call] */ + err = ptlrpc_uuid_to_peer(uuid, &peer, &self); + if (err != 0) { + CNETERR("cannot find peer %s!\n", uuid->uuid); + return NULL; + } + + c = ptlrpc_connection_get(peer, self, uuid); + if (c) { + memcpy(c->c_remote_uuid.uuid, + uuid->uuid, sizeof(c->c_remote_uuid.uuid)); + } + + CDEBUG(D_INFO, "%s -> %p\n", uuid->uuid, c); + + return c; +} +EXPORT_SYMBOL(ptlrpc_uuid_to_connection); + +/** + * Allocate and initialize new bulk descriptor on the sender. + * Returns pointer to the descriptor or NULL on error. + */ +struct ptlrpc_bulk_desc *ptlrpc_new_bulk(unsigned npages, unsigned max_brw, + unsigned type, unsigned portal) +{ + struct ptlrpc_bulk_desc *desc; + int i; + + OBD_ALLOC(desc, offsetof(struct ptlrpc_bulk_desc, bd_iov[npages])); + if (!desc) + return NULL; + + spin_lock_init(&desc->bd_lock); + init_waitqueue_head(&desc->bd_waitq); + desc->bd_max_iov = npages; + desc->bd_iov_count = 0; + desc->bd_portal = portal; + desc->bd_type = type; + desc->bd_md_count = 0; + LASSERT(max_brw > 0); + desc->bd_md_max_brw = min(max_brw, PTLRPC_BULK_OPS_COUNT); + /* PTLRPC_BULK_OPS_COUNT is the compile-time transfer limit for this + * node. Negotiated ocd_brw_size will always be <= this number. */ + for (i = 0; i < PTLRPC_BULK_OPS_COUNT; i++) + LNetInvalidateHandle(&desc->bd_mds[i]); + + return desc; +} + +/** + * Prepare bulk descriptor for specified outgoing request \a req that + * can fit \a npages * pages. \a type is bulk type. \a portal is where + * the bulk to be sent. Used on client-side. + * Returns pointer to newly allocated initialized bulk descriptor or NULL on + * error. + */ +struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp(struct ptlrpc_request *req, + unsigned npages, unsigned max_brw, + unsigned type, unsigned portal) +{ + struct obd_import *imp = req->rq_import; + struct ptlrpc_bulk_desc *desc; + + LASSERT(type == BULK_PUT_SINK || type == BULK_GET_SOURCE); + desc = ptlrpc_new_bulk(npages, max_brw, type, portal); + if (desc == NULL) + return NULL; + + desc->bd_import_generation = req->rq_import_generation; + desc->bd_import = class_import_get(imp); + desc->bd_req = req; + + desc->bd_cbid.cbid_fn = client_bulk_callback; + desc->bd_cbid.cbid_arg = desc; + + /* This makes req own desc, and free it when she frees herself */ + req->rq_bulk = desc; + + return desc; +} +EXPORT_SYMBOL(ptlrpc_prep_bulk_imp); + +/** + * Add a page \a page to the bulk descriptor \a desc. 
+ * Data to transfer in the page starts at offset \a pageoffset and + * amount of data to transfer from the page is \a len + */ +void __ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc, + struct page *page, int pageoffset, int len, int pin) +{ + LASSERT(desc->bd_iov_count < desc->bd_max_iov); + LASSERT(page != NULL); + LASSERT(pageoffset >= 0); + LASSERT(len > 0); + LASSERT(pageoffset + len <= PAGE_CACHE_SIZE); + + desc->bd_nob += len; + + if (pin) + page_cache_get(page); + + ptlrpc_add_bulk_page(desc, page, pageoffset, len); +} +EXPORT_SYMBOL(__ptlrpc_prep_bulk_page); + +/** + * Uninitialize and free bulk descriptor \a desc. + * Works on bulk descriptors both from server and client side. + */ +void __ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc, int unpin) +{ + int i; + + LASSERT(desc != NULL); + LASSERT(desc->bd_iov_count != LI_POISON); /* not freed already */ + LASSERT(desc->bd_md_count == 0); /* network hands off */ + LASSERT((desc->bd_export != NULL) ^ (desc->bd_import != NULL)); + + sptlrpc_enc_pool_put_pages(desc); + + if (desc->bd_export) + class_export_put(desc->bd_export); + else + class_import_put(desc->bd_import); + + if (unpin) { + for (i = 0; i < desc->bd_iov_count; i++) + page_cache_release(desc->bd_iov[i].kiov_page); + } + + OBD_FREE(desc, offsetof(struct ptlrpc_bulk_desc, + bd_iov[desc->bd_max_iov])); +} +EXPORT_SYMBOL(__ptlrpc_free_bulk); + +/** + * Set server timelimit for this req, i.e. how long are we willing to wait + * for reply before timing out this request. + */ +void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req) +{ + __u32 serv_est; + int idx; + struct imp_at *at; + + LASSERT(req->rq_import); + + if (AT_OFF) { + /* non-AT settings */ + /** + * \a imp_server_timeout means this is reverse import and + * we send (currently only) ASTs to the client and cannot afford + * to wait too long for the reply, otherwise the other client + * (because of which we are sending this request) would + * timeout waiting for us + */ + req->rq_timeout = req->rq_import->imp_server_timeout ? + obd_timeout / 2 : obd_timeout; + } else { + at = &req->rq_import->imp_at; + idx = import_at_get_index(req->rq_import, + req->rq_request_portal); + serv_est = at_get(&at->iat_service_estimate[idx]); + req->rq_timeout = at_est2timeout(serv_est); + } + /* We could get even fancier here, using history to predict increased + loading... */ + + /* Let the server know what this RPC timeout is by putting it in the + reqmsg*/ + lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout); +} +EXPORT_SYMBOL(ptlrpc_at_set_req_timeout); + +/* Adjust max service estimate based on server value */ +static void ptlrpc_at_adj_service(struct ptlrpc_request *req, + unsigned int serv_est) +{ + int idx; + unsigned int oldse; + struct imp_at *at; + + LASSERT(req->rq_import); + at = &req->rq_import->imp_at; + + idx = import_at_get_index(req->rq_import, req->rq_request_portal); + /* max service estimates are tracked on the server side, + so just keep minimal history here */ + oldse = at_measured(&at->iat_service_estimate[idx], serv_est); + if (oldse != 0) + CDEBUG(D_ADAPTTO, "The RPC service estimate for %s ptl %d has changed from %d to %d\n", + req->rq_import->imp_obd->obd_name, req->rq_request_portal, + oldse, at_get(&at->iat_service_estimate[idx])); +} + +/* Expected network latency per remote node (secs) */ +int ptlrpc_at_get_net_latency(struct ptlrpc_request *req) +{ + return AT_OFF ? 
0 : at_get(&req->rq_import->imp_at.iat_net_latency); +} + +/* Adjust expected network latency */ +static void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req, + unsigned int service_time) +{ + unsigned int nl, oldnl; + struct imp_at *at; + time_t now = get_seconds(); + + LASSERT(req->rq_import); + + if (service_time > now - req->rq_sent + 3) { + /* bz16408, however, this can also happen if early reply + * is lost and client RPC is expired and resent, early reply + * or reply of original RPC can still be fit in reply buffer + * of resent RPC, now client is measuring time from the + * resent time, but server sent back service time of original + * RPC. + */ + CDEBUG((lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) ? + D_ADAPTTO : D_WARNING, + "Reported service time %u > total measured time " + CFS_DURATION_T"\n", service_time, + cfs_time_sub(now, req->rq_sent)); + return; + } + + /* Network latency is total time less server processing time */ + nl = max_t(int, now - req->rq_sent - + service_time, 0) + 1; /* st rounding */ + at = &req->rq_import->imp_at; + + oldnl = at_measured(&at->iat_net_latency, nl); + if (oldnl != 0) + CDEBUG(D_ADAPTTO, "The network latency for %s (nid %s) has changed from %d to %d\n", + req->rq_import->imp_obd->obd_name, + obd_uuid2str( + &req->rq_import->imp_connection->c_remote_uuid), + oldnl, at_get(&at->iat_net_latency)); +} + +static int unpack_reply(struct ptlrpc_request *req) +{ + int rc; + + if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL) { + rc = ptlrpc_unpack_rep_msg(req, req->rq_replen); + if (rc) { + DEBUG_REQ(D_ERROR, req, "unpack_rep failed: %d", rc); + return -EPROTO; + } + } + + rc = lustre_unpack_rep_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF); + if (rc) { + DEBUG_REQ(D_ERROR, req, "unpack ptlrpc body failed: %d", rc); + return -EPROTO; + } + return 0; +} + +/** + * Handle an early reply message, called with the rq_lock held. + * If anything goes wrong just ignore it - same as if it never happened + */ +static int ptlrpc_at_recv_early_reply(struct ptlrpc_request *req) +{ + struct ptlrpc_request *early_req; + time_t olddl; + int rc; + + req->rq_early = 0; + spin_unlock(&req->rq_lock); + + rc = sptlrpc_cli_unwrap_early_reply(req, &early_req); + if (rc) { + spin_lock(&req->rq_lock); + return rc; + } + + rc = unpack_reply(early_req); + if (rc == 0) { + /* Expecting to increase the service time estimate here */ + ptlrpc_at_adj_service(req, + lustre_msg_get_timeout(early_req->rq_repmsg)); + ptlrpc_at_adj_net_latency(req, + lustre_msg_get_service_time(early_req->rq_repmsg)); + } + + sptlrpc_cli_finish_early_reply(early_req); + + if (rc != 0) { + spin_lock(&req->rq_lock); + return rc; + } + + /* Adjust the local timeout for this req */ + ptlrpc_at_set_req_timeout(req); + + spin_lock(&req->rq_lock); + olddl = req->rq_deadline; + /* server assumes it now has rq_timeout from when it sent the + * early reply, so client should give it at least that long. */ + req->rq_deadline = get_seconds() + req->rq_timeout + + ptlrpc_at_get_net_latency(req); + + DEBUG_REQ(D_ADAPTTO, req, + "Early reply #%d, new deadline in " CFS_DURATION_T "s (" CFS_DURATION_T "s)", + req->rq_early_count, + cfs_time_sub(req->rq_deadline, get_seconds()), + cfs_time_sub(req->rq_deadline, olddl)); + + return rc; +} + +struct kmem_cache *request_cache; + +int ptlrpc_request_cache_init(void) +{ + request_cache = kmem_cache_create("ptlrpc_cache", + sizeof(struct ptlrpc_request), + 0, SLAB_HWCACHE_ALIGN, NULL); + return request_cache == NULL ? 
-ENOMEM : 0; +} + +void ptlrpc_request_cache_fini(void) +{ + kmem_cache_destroy(request_cache); +} + +struct ptlrpc_request *ptlrpc_request_cache_alloc(gfp_t flags) +{ + struct ptlrpc_request *req; + + OBD_SLAB_ALLOC_PTR_GFP(req, request_cache, flags); + return req; +} + +void ptlrpc_request_cache_free(struct ptlrpc_request *req) +{ + OBD_SLAB_FREE_PTR(req, request_cache); +} + +/** + * Wind down request pool \a pool. + * Frees all requests from the pool too + */ +void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool) +{ + struct list_head *l, *tmp; + struct ptlrpc_request *req; + + LASSERT(pool != NULL); + + spin_lock(&pool->prp_lock); + list_for_each_safe(l, tmp, &pool->prp_req_list) { + req = list_entry(l, struct ptlrpc_request, rq_list); + list_del(&req->rq_list); + LASSERT(req->rq_reqbuf); + LASSERT(req->rq_reqbuf_len == pool->prp_rq_size); + OBD_FREE_LARGE(req->rq_reqbuf, pool->prp_rq_size); + ptlrpc_request_cache_free(req); + } + spin_unlock(&pool->prp_lock); + OBD_FREE(pool, sizeof(*pool)); +} +EXPORT_SYMBOL(ptlrpc_free_rq_pool); + +/** + * Allocates, initializes and adds \a num_rq requests to the pool \a pool + */ +void ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq) +{ + int i; + int size = 1; + + while (size < pool->prp_rq_size) + size <<= 1; + + LASSERTF(list_empty(&pool->prp_req_list) || + size == pool->prp_rq_size, + "Trying to change pool size with nonempty pool from %d to %d bytes\n", + pool->prp_rq_size, size); + + spin_lock(&pool->prp_lock); + pool->prp_rq_size = size; + for (i = 0; i < num_rq; i++) { + struct ptlrpc_request *req; + struct lustre_msg *msg; + + spin_unlock(&pool->prp_lock); + req = ptlrpc_request_cache_alloc(GFP_NOFS); + if (!req) + return; + OBD_ALLOC_LARGE(msg, size); + if (!msg) { + ptlrpc_request_cache_free(req); + return; + } + req->rq_reqbuf = msg; + req->rq_reqbuf_len = size; + req->rq_pool = pool; + spin_lock(&pool->prp_lock); + list_add_tail(&req->rq_list, &pool->prp_req_list); + } + spin_unlock(&pool->prp_lock); +} +EXPORT_SYMBOL(ptlrpc_add_rqs_to_pool); + +/** + * Create and initialize new request pool with given attributes: + * \a num_rq - initial number of requests to create for the pool + * \a msgsize - maximum message size possible for requests in thid pool + * \a populate_pool - function to be called when more requests need to be added + * to the pool + * Returns pointer to newly created pool or NULL on error. 
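+ * NULL is also returned when \a populate_pool fails to add even a single
+ * request, in which case the partially initialized pool is freed.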
+ */ +struct ptlrpc_request_pool * +ptlrpc_init_rq_pool(int num_rq, int msgsize, + void (*populate_pool)(struct ptlrpc_request_pool *, int)) +{ + struct ptlrpc_request_pool *pool; + + OBD_ALLOC(pool, sizeof(struct ptlrpc_request_pool)); + if (!pool) + return NULL; + + /* Request next power of two for the allocation, because internally + kernel would do exactly this */ + + spin_lock_init(&pool->prp_lock); + INIT_LIST_HEAD(&pool->prp_req_list); + pool->prp_rq_size = msgsize + SPTLRPC_MAX_PAYLOAD; + pool->prp_populate = populate_pool; + + populate_pool(pool, num_rq); + + if (list_empty(&pool->prp_req_list)) { + /* have not allocated a single request for the pool */ + OBD_FREE(pool, sizeof(struct ptlrpc_request_pool)); + pool = NULL; + } + return pool; +} +EXPORT_SYMBOL(ptlrpc_init_rq_pool); + +/** + * Fetches one request from pool \a pool + */ +static struct ptlrpc_request * +ptlrpc_prep_req_from_pool(struct ptlrpc_request_pool *pool) +{ + struct ptlrpc_request *request; + struct lustre_msg *reqbuf; + + if (!pool) + return NULL; + + spin_lock(&pool->prp_lock); + + /* See if we have anything in a pool, and bail out if nothing, + * in writeout path, where this matters, this is safe to do, because + * nothing is lost in this case, and when some in-flight requests + * complete, this code will be called again. */ + if (unlikely(list_empty(&pool->prp_req_list))) { + spin_unlock(&pool->prp_lock); + return NULL; + } + + request = list_entry(pool->prp_req_list.next, struct ptlrpc_request, + rq_list); + list_del_init(&request->rq_list); + spin_unlock(&pool->prp_lock); + + LASSERT(request->rq_reqbuf); + LASSERT(request->rq_pool); + + reqbuf = request->rq_reqbuf; + memset(request, 0, sizeof(*request)); + request->rq_reqbuf = reqbuf; + request->rq_reqbuf_len = pool->prp_rq_size; + request->rq_pool = pool; + + return request; +} + +/** + * Returns freed \a request to pool. 
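+ * The caller must already have removed \a request from any list, and the
+ * request must not be waiting for a reply.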
+ */ +static void __ptlrpc_free_req_to_pool(struct ptlrpc_request *request) +{ + struct ptlrpc_request_pool *pool = request->rq_pool; + + spin_lock(&pool->prp_lock); + LASSERT(list_empty(&request->rq_list)); + LASSERT(!request->rq_receiving_reply); + list_add_tail(&request->rq_list, &pool->prp_req_list); + spin_unlock(&pool->prp_lock); +} + +static int __ptlrpc_request_bufs_pack(struct ptlrpc_request *request, + __u32 version, int opcode, + int count, __u32 *lengths, char **bufs, + struct ptlrpc_cli_ctx *ctx) +{ + struct obd_import *imp = request->rq_import; + int rc; + + if (unlikely(ctx)) + request->rq_cli_ctx = sptlrpc_cli_ctx_get(ctx); + else { + rc = sptlrpc_req_get_ctx(request); + if (rc) + goto out_free; + } + + sptlrpc_req_set_flavor(request, opcode); + + rc = lustre_pack_request(request, imp->imp_msg_magic, count, + lengths, bufs); + if (rc) { + LASSERT(!request->rq_pool); + goto out_ctx; + } + + lustre_msg_add_version(request->rq_reqmsg, version); + request->rq_send_state = LUSTRE_IMP_FULL; + request->rq_type = PTL_RPC_MSG_REQUEST; + request->rq_export = NULL; + + request->rq_req_cbid.cbid_fn = request_out_callback; + request->rq_req_cbid.cbid_arg = request; + + request->rq_reply_cbid.cbid_fn = reply_in_callback; + request->rq_reply_cbid.cbid_arg = request; + + request->rq_reply_deadline = 0; + request->rq_phase = RQ_PHASE_NEW; + request->rq_next_phase = RQ_PHASE_UNDEFINED; + + request->rq_request_portal = imp->imp_client->cli_request_portal; + request->rq_reply_portal = imp->imp_client->cli_reply_portal; + + ptlrpc_at_set_req_timeout(request); + + spin_lock_init(&request->rq_lock); + INIT_LIST_HEAD(&request->rq_list); + INIT_LIST_HEAD(&request->rq_timed_list); + INIT_LIST_HEAD(&request->rq_replay_list); + INIT_LIST_HEAD(&request->rq_ctx_chain); + INIT_LIST_HEAD(&request->rq_set_chain); + INIT_LIST_HEAD(&request->rq_history_list); + INIT_LIST_HEAD(&request->rq_exp_list); + init_waitqueue_head(&request->rq_reply_waitq); + init_waitqueue_head(&request->rq_set_waitq); + request->rq_xid = ptlrpc_next_xid(); + atomic_set(&request->rq_refcount, 1); + + lustre_msg_set_opc(request->rq_reqmsg, opcode); + + return 0; +out_ctx: + sptlrpc_cli_ctx_put(request->rq_cli_ctx, 1); +out_free: + class_import_put(imp); + return rc; +} + +int ptlrpc_request_bufs_pack(struct ptlrpc_request *request, + __u32 version, int opcode, char **bufs, + struct ptlrpc_cli_ctx *ctx) +{ + int count; + + count = req_capsule_filled_sizes(&request->rq_pill, RCL_CLIENT); + return __ptlrpc_request_bufs_pack(request, version, opcode, count, + request->rq_pill.rc_area[RCL_CLIENT], + bufs, ctx); +} +EXPORT_SYMBOL(ptlrpc_request_bufs_pack); + +/** + * Pack request buffers for network transfer, performing necessary encryption + * steps if necessary. + */ +int ptlrpc_request_pack(struct ptlrpc_request *request, + __u32 version, int opcode) +{ + int rc; + rc = ptlrpc_request_bufs_pack(request, version, opcode, NULL, NULL); + if (rc) + return rc; + + /* For some old 1.8 clients (< 1.8.7), they will LASSERT the size of + * ptlrpc_body sent from server equal to local ptlrpc_body size, so we + * have to send old ptlrpc_body to keep interoperability with these + * clients. + * + * Only three kinds of server->client RPCs so far: + * - LDLM_BL_CALLBACK + * - LDLM_CP_CALLBACK + * - LDLM_GL_CALLBACK + * + * XXX This should be removed whenever we drop the interoperability with + * the these old clients. 
+ */ + if (opcode == LDLM_BL_CALLBACK || opcode == LDLM_CP_CALLBACK || + opcode == LDLM_GL_CALLBACK) + req_capsule_shrink(&request->rq_pill, &RMF_PTLRPC_BODY, + sizeof(struct ptlrpc_body_v2), RCL_CLIENT); + + return rc; +} +EXPORT_SYMBOL(ptlrpc_request_pack); + +/** + * Helper function to allocate new request on import \a imp + * and possibly using existing request from pool \a pool if provided. + * Returns allocated request structure with import field filled or + * NULL on error. + */ +static inline +struct ptlrpc_request *__ptlrpc_request_alloc(struct obd_import *imp, + struct ptlrpc_request_pool *pool) +{ + struct ptlrpc_request *request = NULL; + + if (pool) + request = ptlrpc_prep_req_from_pool(pool); + + if (!request) + request = ptlrpc_request_cache_alloc(GFP_NOFS); + + if (request) { + LASSERTF((unsigned long)imp > 0x1000, "%p", imp); + LASSERT(imp != LP_POISON); + LASSERTF((unsigned long)imp->imp_client > 0x1000, "%p", + imp->imp_client); + LASSERT(imp->imp_client != LP_POISON); + + request->rq_import = class_import_get(imp); + } else { + CERROR("request allocation out of memory\n"); + } + + return request; +} + +/** + * Helper function for creating a request. + * Calls __ptlrpc_request_alloc to allocate new request structure and inits + * buffer structures according to capsule template \a format. + * Returns allocated request structure pointer or NULL on error. + */ +static struct ptlrpc_request * +ptlrpc_request_alloc_internal(struct obd_import *imp, + struct ptlrpc_request_pool *pool, + const struct req_format *format) +{ + struct ptlrpc_request *request; + + request = __ptlrpc_request_alloc(imp, pool); + if (request == NULL) + return NULL; + + req_capsule_init(&request->rq_pill, request, RCL_CLIENT); + req_capsule_set(&request->rq_pill, format); + return request; +} + +/** + * Allocate new request structure for import \a imp and initialize its + * buffer structure according to capsule template \a format. + */ +struct ptlrpc_request *ptlrpc_request_alloc(struct obd_import *imp, + const struct req_format *format) +{ + return ptlrpc_request_alloc_internal(imp, NULL, format); +} +EXPORT_SYMBOL(ptlrpc_request_alloc); + +/** + * Allocate new request structure for import \a imp from pool \a pool and + * initialize its buffer structure according to capsule template \a format. + */ +struct ptlrpc_request *ptlrpc_request_alloc_pool(struct obd_import *imp, + struct ptlrpc_request_pool *pool, + const struct req_format *format) +{ + return ptlrpc_request_alloc_internal(imp, pool, format); +} +EXPORT_SYMBOL(ptlrpc_request_alloc_pool); + +/** + * For requests not from pool, free memory of the request structure. + * For requests obtained from a pool earlier, return request back to pool. + */ +void ptlrpc_request_free(struct ptlrpc_request *request) +{ + if (request->rq_pool) + __ptlrpc_free_req_to_pool(request); + else + ptlrpc_request_cache_free(request); +} +EXPORT_SYMBOL(ptlrpc_request_free); + +/** + * Allocate new request for operation \a opcode and immediately pack it for + * network transfer. + * Only used for simple requests like OBD_PING where the only important + * part of the request is operation itself. + * Returns allocated request or NULL on error. 
+ */ +struct ptlrpc_request *ptlrpc_request_alloc_pack(struct obd_import *imp, + const struct req_format *format, + __u32 version, int opcode) +{ + struct ptlrpc_request *req = ptlrpc_request_alloc(imp, format); + int rc; + + if (req) { + rc = ptlrpc_request_pack(req, version, opcode); + if (rc) { + ptlrpc_request_free(req); + req = NULL; + } + } + return req; +} +EXPORT_SYMBOL(ptlrpc_request_alloc_pack); + +/** + * Prepare request (fetched from pool \a pool if not NULL) on import \a imp + * for operation \a opcode. Request would contain \a count buffers. + * Sizes of buffers are described in array \a lengths and buffers themselves + * are provided by a pointer \a bufs. + * Returns prepared request structure pointer or NULL on error. + */ +struct ptlrpc_request * +ptlrpc_prep_req_pool(struct obd_import *imp, + __u32 version, int opcode, + int count, __u32 *lengths, char **bufs, + struct ptlrpc_request_pool *pool) +{ + struct ptlrpc_request *request; + int rc; + + request = __ptlrpc_request_alloc(imp, pool); + if (!request) + return NULL; + + rc = __ptlrpc_request_bufs_pack(request, version, opcode, count, + lengths, bufs, NULL); + if (rc) { + ptlrpc_request_free(request); + request = NULL; + } + return request; +} +EXPORT_SYMBOL(ptlrpc_prep_req_pool); + +/** + * Same as ptlrpc_prep_req_pool, but without pool + */ +struct ptlrpc_request * +ptlrpc_prep_req(struct obd_import *imp, __u32 version, int opcode, int count, + __u32 *lengths, char **bufs) +{ + return ptlrpc_prep_req_pool(imp, version, opcode, count, lengths, bufs, + NULL); +} +EXPORT_SYMBOL(ptlrpc_prep_req); + +/** + * Allocate and initialize new request set structure. + * Returns a pointer to the newly allocated set structure or NULL on error. + */ +struct ptlrpc_request_set *ptlrpc_prep_set(void) +{ + struct ptlrpc_request_set *set; + + OBD_ALLOC(set, sizeof(*set)); + if (!set) + return NULL; + atomic_set(&set->set_refcount, 1); + INIT_LIST_HEAD(&set->set_requests); + init_waitqueue_head(&set->set_waitq); + atomic_set(&set->set_new_count, 0); + atomic_set(&set->set_remaining, 0); + spin_lock_init(&set->set_new_req_lock); + INIT_LIST_HEAD(&set->set_new_requests); + INIT_LIST_HEAD(&set->set_cblist); + set->set_max_inflight = UINT_MAX; + set->set_producer = NULL; + set->set_producer_arg = NULL; + set->set_rc = 0; + + return set; +} +EXPORT_SYMBOL(ptlrpc_prep_set); + +/** + * Allocate and initialize new request set structure with flow control + * extension. This extension allows to control the number of requests in-flight + * for the whole set. A callback function to generate requests must be provided + * and the request set will keep the number of requests sent over the wire to + * @max_inflight. + * Returns a pointer to the newly allocated set structure or NULL on error. + */ +struct ptlrpc_request_set *ptlrpc_prep_fcset(int max, set_producer_func func, + void *arg) + +{ + struct ptlrpc_request_set *set; + + set = ptlrpc_prep_set(); + if (!set) + return NULL; + + set->set_max_inflight = max; + set->set_producer = func; + set->set_producer_arg = arg; + + return set; +} +EXPORT_SYMBOL(ptlrpc_prep_fcset); + +/** + * Wind down and free request set structure previously allocated with + * ptlrpc_prep_set. + * Ensures that all requests on the set have completed and removes + * all requests from the request list in a set. + * If any unsent request happen to be on the list, pretends that they got + * an error in flight and calls their completion handler. 
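/*
 * Minimal usage sketch of the common "allocate, pack, send synchronously,
 * drop the reference" sequence built from the helpers above; the format,
 * version and opcode are left as parameters rather than naming any
 * particular RPC.
 */
static int example_sync_rpc(struct obd_import *imp,
			    const struct req_format *fmt,
			    __u32 version, int opcode)
{
	struct ptlrpc_request *req;
	int rc;

	req = ptlrpc_request_alloc_pack(imp, fmt, version, opcode);
	if (req == NULL)
		return -ENOMEM;

	/* send and wait for completion (ptlrpc_queue_wait() is defined
	 * further down in this file) */
	rc = ptlrpc_queue_wait(req);

	/* drop our reference; the request is freed when it reaches zero */
	ptlrpc_req_finished(req);
	return rc;
}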
+ */ +void ptlrpc_set_destroy(struct ptlrpc_request_set *set) +{ + struct list_head *tmp; + struct list_head *next; + int expected_phase; + int n = 0; + + /* Requests on the set should either all be completed, or all be new */ + expected_phase = (atomic_read(&set->set_remaining) == 0) ? + RQ_PHASE_COMPLETE : RQ_PHASE_NEW; + list_for_each(tmp, &set->set_requests) { + struct ptlrpc_request *req = + list_entry(tmp, struct ptlrpc_request, + rq_set_chain); + + LASSERT(req->rq_phase == expected_phase); + n++; + } + + LASSERTF(atomic_read(&set->set_remaining) == 0 || + atomic_read(&set->set_remaining) == n, "%d / %d\n", + atomic_read(&set->set_remaining), n); + + list_for_each_safe(tmp, next, &set->set_requests) { + struct ptlrpc_request *req = + list_entry(tmp, struct ptlrpc_request, + rq_set_chain); + list_del_init(&req->rq_set_chain); + + LASSERT(req->rq_phase == expected_phase); + + if (req->rq_phase == RQ_PHASE_NEW) { + ptlrpc_req_interpret(NULL, req, -EBADR); + atomic_dec(&set->set_remaining); + } + + spin_lock(&req->rq_lock); + req->rq_set = NULL; + req->rq_invalid_rqset = 0; + spin_unlock(&req->rq_lock); + + ptlrpc_req_finished(req); + } + + LASSERT(atomic_read(&set->set_remaining) == 0); + + ptlrpc_reqset_put(set); +} +EXPORT_SYMBOL(ptlrpc_set_destroy); + +/** + * Add a callback function \a fn to the set. + * This function would be called when all requests on this set are completed. + * The function will be passed \a data argument. + */ +int ptlrpc_set_add_cb(struct ptlrpc_request_set *set, + set_interpreter_func fn, void *data) +{ + struct ptlrpc_set_cbdata *cbdata; + + OBD_ALLOC_PTR(cbdata); + if (cbdata == NULL) + return -ENOMEM; + + cbdata->psc_interpret = fn; + cbdata->psc_data = data; + list_add_tail(&cbdata->psc_item, &set->set_cblist); + + return 0; +} +EXPORT_SYMBOL(ptlrpc_set_add_cb); + +/** + * Add a new request to the general purpose request set. + * Assumes request reference from the caller. + */ +void ptlrpc_set_add_req(struct ptlrpc_request_set *set, + struct ptlrpc_request *req) +{ + LASSERT(list_empty(&req->rq_set_chain)); + + /* The set takes over the caller's request reference */ + list_add_tail(&req->rq_set_chain, &set->set_requests); + req->rq_set = set; + atomic_inc(&set->set_remaining); + req->rq_queued_time = cfs_time_current(); + + if (req->rq_reqmsg != NULL) + lustre_msg_set_jobid(req->rq_reqmsg, NULL); + + if (set->set_producer != NULL) + /* If the request set has a producer callback, the RPC must be + * sent straight away */ + ptlrpc_send_new_req(req); +} +EXPORT_SYMBOL(ptlrpc_set_add_req); + +/** + * Add a request to a request with dedicated server thread + * and wake the thread to make any necessary processing. + * Currently only used for ptlrpcd. + */ +void ptlrpc_set_add_new_req(struct ptlrpcd_ctl *pc, + struct ptlrpc_request *req) +{ + struct ptlrpc_request_set *set = pc->pc_set; + int count, i; + + LASSERT(req->rq_set == NULL); + LASSERT(test_bit(LIOD_STOP, &pc->pc_flags) == 0); + + spin_lock(&set->set_new_req_lock); + /* + * The set takes over the caller's request reference. + */ + req->rq_set = set; + req->rq_queued_time = cfs_time_current(); + list_add_tail(&req->rq_set_chain, &set->set_new_requests); + count = atomic_inc_return(&set->set_new_count); + spin_unlock(&set->set_new_req_lock); + + /* Only need to call wakeup once for the first entry. */ + if (count == 1) { + wake_up(&set->set_waitq); + + /* XXX: It maybe unnecessary to wakeup all the partners. But to + * guarantee the async RPC can be processed ASAP, we have + * no other better choice. 
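/*
 * Minimal usage sketch for batching already-prepared requests in one set.
 * ptlrpc_set_add_req() takes over the caller's reference, so the requests
 * handed to the set are released by ptlrpc_set_destroy() below, not by an
 * extra ptlrpc_req_finished() in the caller.
 */
static int example_send_batch(struct ptlrpc_request **reqs, int nr)
{
	struct ptlrpc_request_set *set;
	int i, rc;

	set = ptlrpc_prep_set();
	if (set == NULL)
		return -ENOMEM;

	for (i = 0; i < nr; i++)
		ptlrpc_set_add_req(set, reqs[i]);

	/* sends everything and waits until each request has completed,
	 * timed out or failed */
	rc = ptlrpc_set_wait(set);

	ptlrpc_set_destroy(set);
	return rc;
}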
It maybe fixed in future. */ + for (i = 0; i < pc->pc_npartners; i++) + wake_up(&pc->pc_partners[i]->pc_set->set_waitq); + } +} +EXPORT_SYMBOL(ptlrpc_set_add_new_req); + +/** + * Based on the current state of the import, determine if the request + * can be sent, is an error, or should be delayed. + * + * Returns true if this request should be delayed. If false, and + * *status is set, then the request can not be sent and *status is the + * error code. If false and status is 0, then request can be sent. + * + * The imp->imp_lock must be held. + */ +static int ptlrpc_import_delay_req(struct obd_import *imp, + struct ptlrpc_request *req, int *status) +{ + int delay = 0; + + LASSERT(status != NULL); + *status = 0; + + if (req->rq_ctx_init || req->rq_ctx_fini) { + /* always allow ctx init/fini rpc go through */ + } else if (imp->imp_state == LUSTRE_IMP_NEW) { + DEBUG_REQ(D_ERROR, req, "Uninitialized import."); + *status = -EIO; + } else if (imp->imp_state == LUSTRE_IMP_CLOSED) { + /* pings may safely race with umount */ + DEBUG_REQ(lustre_msg_get_opc(req->rq_reqmsg) == OBD_PING ? + D_HA : D_ERROR, req, "IMP_CLOSED "); + *status = -EIO; + } else if (ptlrpc_send_limit_expired(req)) { + /* probably doesn't need to be a D_ERROR after initial testing */ + DEBUG_REQ(D_ERROR, req, "send limit expired "); + *status = -EIO; + } else if (req->rq_send_state == LUSTRE_IMP_CONNECTING && + imp->imp_state == LUSTRE_IMP_CONNECTING) { + /* allow CONNECT even if import is invalid */ + if (atomic_read(&imp->imp_inval_count) != 0) { + DEBUG_REQ(D_ERROR, req, "invalidate in flight"); + *status = -EIO; + } + } else if (imp->imp_invalid || imp->imp_obd->obd_no_recov) { + if (!imp->imp_deactive) + DEBUG_REQ(D_NET, req, "IMP_INVALID"); + *status = -ESHUTDOWN; /* bz 12940 */ + } else if (req->rq_import_generation != imp->imp_generation) { + DEBUG_REQ(D_ERROR, req, "req wrong generation:"); + *status = -EIO; + } else if (req->rq_send_state != imp->imp_state) { + /* invalidate in progress - any requests should be drop */ + if (atomic_read(&imp->imp_inval_count) != 0) { + DEBUG_REQ(D_ERROR, req, "invalidate in flight"); + *status = -EIO; + } else if (imp->imp_dlm_fake || req->rq_no_delay) { + *status = -EWOULDBLOCK; + } else if (req->rq_allow_replay && + (imp->imp_state == LUSTRE_IMP_REPLAY || + imp->imp_state == LUSTRE_IMP_REPLAY_LOCKS || + imp->imp_state == LUSTRE_IMP_REPLAY_WAIT || + imp->imp_state == LUSTRE_IMP_RECOVER)) { + DEBUG_REQ(D_HA, req, "allow during recovery.\n"); + } else { + delay = 1; + } + } + + return delay; +} + +/** + * Decide if the error message regarding provided request \a req + * should be printed to the console or not. + * Makes it's decision on request status and other properties. + * Returns 1 to print error on the system console or 0 if not. + */ +static int ptlrpc_console_allow(struct ptlrpc_request *req) +{ + __u32 opc; + int err; + + LASSERT(req->rq_reqmsg != NULL); + opc = lustre_msg_get_opc(req->rq_reqmsg); + + /* Suppress particular reconnect errors which are to be expected. No + * errors are suppressed for the initial connection on an import */ + if ((lustre_handle_is_used(&req->rq_import->imp_remote_handle)) && + (opc == OST_CONNECT || opc == MDS_CONNECT || opc == MGS_CONNECT)) { + + /* Suppress timed out reconnect requests */ + if (req->rq_timedout) + return 0; + + /* Suppress unavailable/again reconnect requests */ + err = lustre_msg_get_status(req->rq_repmsg); + if (err == -ENODEV || err == -EAGAIN) + return 0; + } + + return 1; +} + +/** + * Check request processing status. 
+ * Returns the status. + */ +static int ptlrpc_check_status(struct ptlrpc_request *req) +{ + int err; + + err = lustre_msg_get_status(req->rq_repmsg); + if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR) { + struct obd_import *imp = req->rq_import; + __u32 opc = lustre_msg_get_opc(req->rq_reqmsg); + if (ptlrpc_console_allow(req)) + LCONSOLE_ERROR_MSG(0x011, "%s: Communicating with %s, operation %s failed with %d.\n", + imp->imp_obd->obd_name, + libcfs_nid2str( + imp->imp_connection->c_peer.nid), + ll_opcode2str(opc), err); + return err < 0 ? err : -EINVAL; + } + + if (err < 0) { + DEBUG_REQ(D_INFO, req, "status is %d", err); + } else if (err > 0) { + /* XXX: translate this error from net to host */ + DEBUG_REQ(D_INFO, req, "status is %d", err); + } + + return err; +} + +/** + * save pre-versions of objects into request for replay. + * Versions are obtained from server reply. + * used for VBR. + */ +static void ptlrpc_save_versions(struct ptlrpc_request *req) +{ + struct lustre_msg *repmsg = req->rq_repmsg; + struct lustre_msg *reqmsg = req->rq_reqmsg; + __u64 *versions = lustre_msg_get_versions(repmsg); + + if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) + return; + + LASSERT(versions); + lustre_msg_set_versions(reqmsg, versions); + CDEBUG(D_INFO, "Client save versions [%#llx/%#llx]\n", + versions[0], versions[1]); +} + +/** + * Callback function called when client receives RPC reply for \a req. + * Returns 0 on success or error code. + * The return value would be assigned to req->rq_status by the caller + * as request processing status. + * This function also decides if the request needs to be saved for later replay. + */ +static int after_reply(struct ptlrpc_request *req) +{ + struct obd_import *imp = req->rq_import; + struct obd_device *obd = req->rq_import->imp_obd; + int rc; + struct timeval work_start; + long timediff; + + LASSERT(obd != NULL); + /* repbuf must be unlinked */ + LASSERT(!req->rq_receiving_reply && !req->rq_reply_unlink); + + if (req->rq_reply_truncate) { + if (ptlrpc_no_resend(req)) { + DEBUG_REQ(D_ERROR, req, "reply buffer overflow, expected: %d, actual size: %d", + req->rq_nob_received, req->rq_repbuf_len); + return -EOVERFLOW; + } + + sptlrpc_cli_free_repbuf(req); + /* Pass the required reply buffer size (include + * space for early reply). + * NB: no need to roundup because alloc_repbuf + * will roundup it */ + req->rq_replen = req->rq_nob_received; + req->rq_nob_received = 0; + spin_lock(&req->rq_lock); + req->rq_resend = 1; + spin_unlock(&req->rq_lock); + return 0; + } + + /* + * NB Until this point, the whole of the incoming message, + * including buflens, status etc is in the sender's byte order. + */ + rc = sptlrpc_cli_unwrap_reply(req); + if (rc) { + DEBUG_REQ(D_ERROR, req, "unwrap reply failed (%d):", rc); + return rc; + } + + /* + * Security layer unwrap might ask resend this request. 
+ */ + if (req->rq_resend) + return 0; + + rc = unpack_reply(req); + if (rc) + return rc; + + /* retry indefinitely on EINPROGRESS */ + if (lustre_msg_get_status(req->rq_repmsg) == -EINPROGRESS && + ptlrpc_no_resend(req) == 0 && !req->rq_no_retry_einprogress) { + time_t now = get_seconds(); + + DEBUG_REQ(D_RPCTRACE, req, "Resending request on EINPROGRESS"); + spin_lock(&req->rq_lock); + req->rq_resend = 1; + spin_unlock(&req->rq_lock); + req->rq_nr_resend++; + + /* allocate new xid to avoid reply reconstruction */ + if (!req->rq_bulk) { + /* new xid is already allocated for bulk in + * ptlrpc_check_set() */ + req->rq_xid = ptlrpc_next_xid(); + DEBUG_REQ(D_RPCTRACE, req, "Allocating new xid for resend on EINPROGRESS"); + } + + /* Readjust the timeout for current conditions */ + ptlrpc_at_set_req_timeout(req); + /* delay resend to give a chance to the server to get ready. + * The delay is increased by 1s on every resend and is capped to + * the current request timeout (i.e. obd_timeout if AT is off, + * or AT service time x 125% + 5s, see at_est2timeout) */ + if (req->rq_nr_resend > req->rq_timeout) + req->rq_sent = now + req->rq_timeout; + else + req->rq_sent = now + req->rq_nr_resend; + + return 0; + } + + do_gettimeofday(&work_start); + timediff = cfs_timeval_sub(&work_start, &req->rq_arrival_time, NULL); + if (obd->obd_svc_stats != NULL) { + lprocfs_counter_add(obd->obd_svc_stats, PTLRPC_REQWAIT_CNTR, + timediff); + ptlrpc_lprocfs_rpc_sent(req, timediff); + } + + if (lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_REPLY && + lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_ERR) { + DEBUG_REQ(D_ERROR, req, "invalid packet received (type=%u)", + lustre_msg_get_type(req->rq_repmsg)); + return -EPROTO; + } + + if (lustre_msg_get_opc(req->rq_reqmsg) != OBD_PING) + CFS_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_PAUSE_REP, cfs_fail_val); + ptlrpc_at_adj_service(req, lustre_msg_get_timeout(req->rq_repmsg)); + ptlrpc_at_adj_net_latency(req, + lustre_msg_get_service_time(req->rq_repmsg)); + + rc = ptlrpc_check_status(req); + imp->imp_connect_error = rc; + + if (rc) { + /* + * Either we've been evicted, or the server has failed for + * some reason. Try to reconnect, and if that fails, punt to + * the upcall. + */ + if (ll_rpc_recoverable_error(rc)) { + if (req->rq_send_state != LUSTRE_IMP_FULL || + imp->imp_obd->obd_no_recov || imp->imp_dlm_fake) { + return rc; + } + ptlrpc_request_handle_notconn(req); + return rc; + } + } else { + /* + * Let's look if server sent slv. Do it only for RPC with + * rc == 0. + */ + ldlm_cli_update_pool(req); + } + + /* + * Store transno in reqmsg for replay. + */ + if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)) { + req->rq_transno = lustre_msg_get_transno(req->rq_repmsg); + lustre_msg_set_transno(req->rq_reqmsg, req->rq_transno); + } + + if (imp->imp_replayable) { + spin_lock(&imp->imp_lock); + /* + * No point in adding already-committed requests to the replay + * list, we will just remove them immediately. 
b=9829 + */ + if (req->rq_transno != 0 && + (req->rq_transno > + lustre_msg_get_last_committed(req->rq_repmsg) || + req->rq_replay)) { + /** version recovery */ + ptlrpc_save_versions(req); + ptlrpc_retain_replayable_request(req, imp); + } else if (req->rq_commit_cb != NULL && + list_empty(&req->rq_replay_list)) { + /* NB: don't call rq_commit_cb if it's already on + * rq_replay_list, ptlrpc_free_committed() will call + * it later, see LU-3618 for details */ + spin_unlock(&imp->imp_lock); + req->rq_commit_cb(req); + spin_lock(&imp->imp_lock); + } + + /* + * Replay-enabled imports return commit-status information. + */ + if (lustre_msg_get_last_committed(req->rq_repmsg)) { + imp->imp_peer_committed_transno = + lustre_msg_get_last_committed(req->rq_repmsg); + } + + ptlrpc_free_committed(imp); + + if (!list_empty(&imp->imp_replay_list)) { + struct ptlrpc_request *last; + + last = list_entry(imp->imp_replay_list.prev, + struct ptlrpc_request, + rq_replay_list); + /* + * Requests with rq_replay stay on the list even if no + * commit is expected. + */ + if (last->rq_transno > imp->imp_peer_committed_transno) + ptlrpc_pinger_commit_expected(imp); + } + + spin_unlock(&imp->imp_lock); + } + + return rc; +} + +/** + * Helper function to send request \a req over the network for the first time + * Also adjusts request phase. + * Returns 0 on success or error code. + */ +static int ptlrpc_send_new_req(struct ptlrpc_request *req) +{ + struct obd_import *imp = req->rq_import; + int rc; + + LASSERT(req->rq_phase == RQ_PHASE_NEW); + if (req->rq_sent && (req->rq_sent > get_seconds()) && + (!req->rq_generation_set || + req->rq_import_generation == imp->imp_generation)) + return 0; + + ptlrpc_rqphase_move(req, RQ_PHASE_RPC); + + spin_lock(&imp->imp_lock); + + if (!req->rq_generation_set) + req->rq_import_generation = imp->imp_generation; + + if (ptlrpc_import_delay_req(imp, req, &rc)) { + spin_lock(&req->rq_lock); + req->rq_waiting = 1; + spin_unlock(&req->rq_lock); + + DEBUG_REQ(D_HA, req, "req from PID %d waiting for recovery: (%s != %s)", + lustre_msg_get_status(req->rq_reqmsg), + ptlrpc_import_state_name(req->rq_send_state), + ptlrpc_import_state_name(imp->imp_state)); + LASSERT(list_empty(&req->rq_list)); + list_add_tail(&req->rq_list, &imp->imp_delayed_list); + atomic_inc(&req->rq_import->imp_inflight); + spin_unlock(&imp->imp_lock); + return 0; + } + + if (rc != 0) { + spin_unlock(&imp->imp_lock); + req->rq_status = rc; + ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); + return rc; + } + + LASSERT(list_empty(&req->rq_list)); + list_add_tail(&req->rq_list, &imp->imp_sending_list); + atomic_inc(&req->rq_import->imp_inflight); + spin_unlock(&imp->imp_lock); + + lustre_msg_set_status(req->rq_reqmsg, current_pid()); + + rc = sptlrpc_req_refresh_ctx(req, -1); + if (rc) { + if (req->rq_err) { + req->rq_status = rc; + return 1; + } + spin_lock(&req->rq_lock); + req->rq_wait_ctx = 1; + spin_unlock(&req->rq_lock); + return 0; + } + + CDEBUG(D_RPCTRACE, "Sending RPC pname:cluuid:pid:xid:nid:opc %s:%s:%d:%llu:%s:%d\n", + current_comm(), + imp->imp_obd->obd_uuid.uuid, + lustre_msg_get_status(req->rq_reqmsg), req->rq_xid, + libcfs_nid2str(imp->imp_connection->c_peer.nid), + lustre_msg_get_opc(req->rq_reqmsg)); + + rc = ptl_send_rpc(req, 0); + if (rc) { + DEBUG_REQ(D_HA, req, "send failed (%d); expect timeout", rc); + spin_lock(&req->rq_lock); + req->rq_net_err = 1; + spin_unlock(&req->rq_lock); + return rc; + } + return 0; +} + +static inline int ptlrpc_set_producer(struct ptlrpc_request_set *set) +{ + int remaining, 
rc; + + LASSERT(set->set_producer != NULL); + + remaining = atomic_read(&set->set_remaining); + + /* populate the ->set_requests list with requests until we + * reach the maximum number of RPCs in flight for this set */ + while (atomic_read(&set->set_remaining) < set->set_max_inflight) { + rc = set->set_producer(set, set->set_producer_arg); + if (rc == -ENOENT) { + /* no more RPC to produce */ + set->set_producer = NULL; + set->set_producer_arg = NULL; + return 0; + } + } + + return (atomic_read(&set->set_remaining) - remaining); +} + +/** + * this sends any unsent RPCs in \a set and returns 1 if all are sent + * and no more replies are expected. + * (it is possible to get less replies than requests sent e.g. due to timed out + * requests or requests that we had trouble to send out) + * + * NOTE: This function contains a potential schedule point (cond_resched()). + */ +int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set) +{ + struct list_head *tmp, *next; + struct list_head comp_reqs; + int force_timer_recalc = 0; + + if (atomic_read(&set->set_remaining) == 0) + return 1; + + INIT_LIST_HEAD(&comp_reqs); + list_for_each_safe(tmp, next, &set->set_requests) { + struct ptlrpc_request *req = + list_entry(tmp, struct ptlrpc_request, + rq_set_chain); + struct obd_import *imp = req->rq_import; + int unregistered = 0; + int rc = 0; + + /* This schedule point is mainly for the ptlrpcd caller of this + * function. Most ptlrpc sets are not long-lived and unbounded + * in length, but at the least the set used by the ptlrpcd is. + * Since the processing time is unbounded, we need to insert an + * explicit schedule point to make the thread well-behaved. + */ + cond_resched(); + + if (req->rq_phase == RQ_PHASE_NEW && + ptlrpc_send_new_req(req)) { + force_timer_recalc = 1; + } + + /* delayed send - skip */ + if (req->rq_phase == RQ_PHASE_NEW && req->rq_sent) + continue; + + /* delayed resend - skip */ + if (req->rq_phase == RQ_PHASE_RPC && req->rq_resend && + req->rq_sent > get_seconds()) + continue; + + if (!(req->rq_phase == RQ_PHASE_RPC || + req->rq_phase == RQ_PHASE_BULK || + req->rq_phase == RQ_PHASE_INTERPRET || + req->rq_phase == RQ_PHASE_UNREGISTERING || + req->rq_phase == RQ_PHASE_COMPLETE)) { + DEBUG_REQ(D_ERROR, req, "bad phase %x", req->rq_phase); + LBUG(); + } + + if (req->rq_phase == RQ_PHASE_UNREGISTERING) { + LASSERT(req->rq_next_phase != req->rq_phase); + LASSERT(req->rq_next_phase != RQ_PHASE_UNDEFINED); + + /* + * Skip processing until reply is unlinked. We + * can't return to pool before that and we can't + * call interpret before that. We need to make + * sure that all rdma transfers finished and will + * not corrupt any data. + */ + if (ptlrpc_client_recv_or_unlink(req) || + ptlrpc_client_bulk_active(req)) + continue; + + /* + * Turn fail_loc off to prevent it from looping + * forever. + */ + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) { + OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK, + OBD_FAIL_ONCE); + } + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK)) { + OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK, + OBD_FAIL_ONCE); + } + + /* + * Move to next phase if reply was successfully + * unlinked. + */ + ptlrpc_rqphase_move(req, req->rq_next_phase); + } + + if (req->rq_phase == RQ_PHASE_COMPLETE) { + list_move_tail(&req->rq_set_chain, &comp_reqs); + continue; + } + + if (req->rq_phase == RQ_PHASE_INTERPRET) + goto interpret; + + /* + * Note that this also will start async reply unlink. 
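/*
 * Minimal sketch of the producer contract consumed by ptlrpc_set_producer()
 * above: the callback registered through ptlrpc_prep_fcset() adds one new
 * request to the set per call (ptlrpc_set_add_req() sends it straight away
 * for producer sets) and returns -ENOENT once nothing is left to send.
 * example_build_next_req() is a hypothetical helper; the callback signature
 * is inferred from the call site above.
 */
static struct ptlrpc_request *example_build_next_req(void *arg);

static int example_producer(struct ptlrpc_request_set *set, void *arg)
{
	struct ptlrpc_request *req;

	req = example_build_next_req(arg);
	if (req == NULL)
		return -ENOENT;		/* no more RPCs to produce */

	ptlrpc_set_add_req(set, req);
	return 0;
}

/*
 * A caller keeps at most, say, 8 requests in flight with:
 *	set = ptlrpc_prep_fcset(8, example_producer, arg);
 *	rc = ptlrpc_set_wait(set);
 *	ptlrpc_set_destroy(set);
 */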
+ */ + if (req->rq_net_err && !req->rq_timedout) { + ptlrpc_expire_one_request(req, 1); + + /* + * Check if we still need to wait for unlink. + */ + if (ptlrpc_client_recv_or_unlink(req) || + ptlrpc_client_bulk_active(req)) + continue; + /* If there is no need to resend, fail it now. */ + if (req->rq_no_resend) { + if (req->rq_status == 0) + req->rq_status = -EIO; + ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); + goto interpret; + } else { + continue; + } + } + + if (req->rq_err) { + spin_lock(&req->rq_lock); + req->rq_replied = 0; + spin_unlock(&req->rq_lock); + if (req->rq_status == 0) + req->rq_status = -EIO; + ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); + goto interpret; + } + + /* ptlrpc_set_wait->l_wait_event sets lwi_allow_intr + * so it sets rq_intr regardless of individual rpc + * timeouts. The synchronous IO waiting path sets + * rq_intr irrespective of whether ptlrpcd + * has seen a timeout. Our policy is to only interpret + * interrupted rpcs after they have timed out, so we + * need to enforce that here. + */ + + if (req->rq_intr && (req->rq_timedout || req->rq_waiting || + req->rq_wait_ctx)) { + req->rq_status = -EINTR; + ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); + goto interpret; + } + + if (req->rq_phase == RQ_PHASE_RPC) { + if (req->rq_timedout || req->rq_resend || + req->rq_waiting || req->rq_wait_ctx) { + int status; + + if (!ptlrpc_unregister_reply(req, 1)) + continue; + + spin_lock(&imp->imp_lock); + if (ptlrpc_import_delay_req(imp, req, + &status)) { + /* put on delay list - only if we wait + * recovery finished - before send */ + list_del_init(&req->rq_list); + list_add_tail(&req->rq_list, + &imp-> + imp_delayed_list); + spin_unlock(&imp->imp_lock); + continue; + } + + if (status != 0) { + req->rq_status = status; + ptlrpc_rqphase_move(req, + RQ_PHASE_INTERPRET); + spin_unlock(&imp->imp_lock); + goto interpret; + } + if (ptlrpc_no_resend(req) && + !req->rq_wait_ctx) { + req->rq_status = -ENOTCONN; + ptlrpc_rqphase_move(req, + RQ_PHASE_INTERPRET); + spin_unlock(&imp->imp_lock); + goto interpret; + } + + list_del_init(&req->rq_list); + list_add_tail(&req->rq_list, + &imp->imp_sending_list); + + spin_unlock(&imp->imp_lock); + + spin_lock(&req->rq_lock); + req->rq_waiting = 0; + spin_unlock(&req->rq_lock); + + if (req->rq_timedout || req->rq_resend) { + /* This is re-sending anyways, + * let's mark req as resend. */ + spin_lock(&req->rq_lock); + req->rq_resend = 1; + spin_unlock(&req->rq_lock); + if (req->rq_bulk) { + __u64 old_xid; + + if (!ptlrpc_unregister_bulk(req, 1)) + continue; + + /* ensure previous bulk fails */ + old_xid = req->rq_xid; + req->rq_xid = ptlrpc_next_xid(); + CDEBUG(D_HA, "resend bulk old x%llu new x%llu\n", + old_xid, req->rq_xid); + } + } + /* + * rq_wait_ctx is only touched by ptlrpcd, + * so no lock is needed here. 
+ */ + status = sptlrpc_req_refresh_ctx(req, -1); + if (status) { + if (req->rq_err) { + req->rq_status = status; + spin_lock(&req->rq_lock); + req->rq_wait_ctx = 0; + spin_unlock(&req->rq_lock); + force_timer_recalc = 1; + } else { + spin_lock(&req->rq_lock); + req->rq_wait_ctx = 1; + spin_unlock(&req->rq_lock); + } + + continue; + } else { + spin_lock(&req->rq_lock); + req->rq_wait_ctx = 0; + spin_unlock(&req->rq_lock); + } + + rc = ptl_send_rpc(req, 0); + if (rc) { + DEBUG_REQ(D_HA, req, + "send failed: rc = %d", rc); + force_timer_recalc = 1; + spin_lock(&req->rq_lock); + req->rq_net_err = 1; + spin_unlock(&req->rq_lock); + continue; + } + /* need to reset the timeout */ + force_timer_recalc = 1; + } + + spin_lock(&req->rq_lock); + + if (ptlrpc_client_early(req)) { + ptlrpc_at_recv_early_reply(req); + spin_unlock(&req->rq_lock); + continue; + } + + /* Still waiting for a reply? */ + if (ptlrpc_client_recv(req)) { + spin_unlock(&req->rq_lock); + continue; + } + + /* Did we actually receive a reply? */ + if (!ptlrpc_client_replied(req)) { + spin_unlock(&req->rq_lock); + continue; + } + + spin_unlock(&req->rq_lock); + + /* unlink from net because we are going to + * swab in-place of reply buffer */ + unregistered = ptlrpc_unregister_reply(req, 1); + if (!unregistered) + continue; + + req->rq_status = after_reply(req); + if (req->rq_resend) + continue; + + /* If there is no bulk associated with this request, + * then we're done and should let the interpreter + * process the reply. Similarly if the RPC returned + * an error, and therefore the bulk will never arrive. + */ + if (req->rq_bulk == NULL || req->rq_status < 0) { + ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); + goto interpret; + } + + ptlrpc_rqphase_move(req, RQ_PHASE_BULK); + } + + LASSERT(req->rq_phase == RQ_PHASE_BULK); + if (ptlrpc_client_bulk_active(req)) + continue; + + if (req->rq_bulk->bd_failure) { + /* The RPC reply arrived OK, but the bulk screwed + * up! Dead weird since the server told us the RPC + * was good after getting the REPLY for her GET or + * the ACK for her PUT. */ + DEBUG_REQ(D_ERROR, req, "bulk transfer failed"); + req->rq_status = -EIO; + } + + ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); + +interpret: + LASSERT(req->rq_phase == RQ_PHASE_INTERPRET); + + /* This moves to "unregistering" phase we need to wait for + * reply unlink. */ + if (!unregistered && !ptlrpc_unregister_reply(req, 1)) { + /* start async bulk unlink too */ + ptlrpc_unregister_bulk(req, 1); + continue; + } + + if (!ptlrpc_unregister_bulk(req, 1)) + continue; + + /* When calling interpret receiving already should be + * finished. */ + LASSERT(!req->rq_receiving_reply); + + ptlrpc_req_interpret(env, req, req->rq_status); + + if (ptlrpcd_check_work(req)) { + atomic_dec(&set->set_remaining); + continue; + } + ptlrpc_rqphase_move(req, RQ_PHASE_COMPLETE); + + CDEBUG(req->rq_reqmsg != NULL ? D_RPCTRACE : 0, + "Completed RPC pname:cluuid:pid:xid:nid:opc %s:%s:%d:%llu:%s:%d\n", + current_comm(), imp->imp_obd->obd_uuid.uuid, + lustre_msg_get_status(req->rq_reqmsg), req->rq_xid, + libcfs_nid2str(imp->imp_connection->c_peer.nid), + lustre_msg_get_opc(req->rq_reqmsg)); + + spin_lock(&imp->imp_lock); + /* Request already may be not on sending or delaying list. This + * may happen in the case of marking it erroneous for the case + * ptlrpc_import_delay_req(req, status) find it impossible to + * allow sending this rpc and returns *status != 0. 
*/ + if (!list_empty(&req->rq_list)) { + list_del_init(&req->rq_list); + atomic_dec(&imp->imp_inflight); + } + spin_unlock(&imp->imp_lock); + + atomic_dec(&set->set_remaining); + wake_up_all(&imp->imp_recovery_waitq); + + if (set->set_producer) { + /* produce a new request if possible */ + if (ptlrpc_set_producer(set) > 0) + force_timer_recalc = 1; + + /* free the request that has just been completed + * in order not to pollute set->set_requests */ + list_del_init(&req->rq_set_chain); + spin_lock(&req->rq_lock); + req->rq_set = NULL; + req->rq_invalid_rqset = 0; + spin_unlock(&req->rq_lock); + + /* record rq_status to compute the final status later */ + if (req->rq_status != 0) + set->set_rc = req->rq_status; + ptlrpc_req_finished(req); + } else { + list_move_tail(&req->rq_set_chain, &comp_reqs); + } + } + + /* move completed request at the head of list so it's easier for + * caller to find them */ + list_splice(&comp_reqs, &set->set_requests); + + /* If we hit an error, we want to recover promptly. */ + return atomic_read(&set->set_remaining) == 0 || force_timer_recalc; +} +EXPORT_SYMBOL(ptlrpc_check_set); + +/** + * Time out request \a req. is \a async_unlink is set, that means do not wait + * until LNet actually confirms network buffer unlinking. + * Return 1 if we should give up further retrying attempts or 0 otherwise. + */ +int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink) +{ + struct obd_import *imp = req->rq_import; + int rc = 0; + + spin_lock(&req->rq_lock); + req->rq_timedout = 1; + spin_unlock(&req->rq_lock); + + DEBUG_REQ(D_WARNING, req, "Request sent has %s: [sent "CFS_DURATION_T + "/real "CFS_DURATION_T"]", + req->rq_net_err ? "failed due to network error" : + ((req->rq_real_sent == 0 || + time_before((unsigned long)req->rq_real_sent, (unsigned long)req->rq_sent) || + cfs_time_aftereq(req->rq_real_sent, req->rq_deadline)) ? + "timed out for sent delay" : "timed out for slow reply"), + req->rq_sent, req->rq_real_sent); + + if (imp != NULL && obd_debug_peer_on_timeout) + LNetCtl(IOC_LIBCFS_DEBUG_PEER, &imp->imp_connection->c_peer); + + ptlrpc_unregister_reply(req, async_unlink); + ptlrpc_unregister_bulk(req, async_unlink); + + if (obd_dump_on_timeout) + libcfs_debug_dumplog(); + + if (imp == NULL) { + DEBUG_REQ(D_HA, req, "NULL import: already cleaned up?"); + return 1; + } + + atomic_inc(&imp->imp_timeouts); + + /* The DLM server doesn't want recovery run on its imports. */ + if (imp->imp_dlm_fake) + return 1; + + /* If this request is for recovery or other primordial tasks, + * then error it out here. */ + if (req->rq_ctx_init || req->rq_ctx_fini || + req->rq_send_state != LUSTRE_IMP_FULL || + imp->imp_obd->obd_no_recov) { + DEBUG_REQ(D_RPCTRACE, req, "err -110, sent_state=%s (now=%s)", + ptlrpc_import_state_name(req->rq_send_state), + ptlrpc_import_state_name(imp->imp_state)); + spin_lock(&req->rq_lock); + req->rq_status = -ETIMEDOUT; + req->rq_err = 1; + spin_unlock(&req->rq_lock); + return 1; + } + + /* if a request can't be resent we can't wait for an answer after + the timeout */ + if (ptlrpc_no_resend(req)) { + DEBUG_REQ(D_RPCTRACE, req, "TIMEOUT-NORESEND:"); + rc = 1; + } + + ptlrpc_fail_import(imp, lustre_msg_get_conn_cnt(req->rq_reqmsg)); + + return rc; +} + +/** + * Time out all uncompleted requests in request set pointed by \a data + * Callback used when waiting on sets with l_wait_event. + * Always returns 1. 
+ */ +int ptlrpc_expired_set(void *data) +{ + struct ptlrpc_request_set *set = data; + struct list_head *tmp; + time_t now = get_seconds(); + + LASSERT(set != NULL); + + /* + * A timeout expired. See which reqs it applies to... + */ + list_for_each(tmp, &set->set_requests) { + struct ptlrpc_request *req = + list_entry(tmp, struct ptlrpc_request, + rq_set_chain); + + /* don't expire request waiting for context */ + if (req->rq_wait_ctx) + continue; + + /* Request in-flight? */ + if (!((req->rq_phase == RQ_PHASE_RPC && + !req->rq_waiting && !req->rq_resend) || + (req->rq_phase == RQ_PHASE_BULK))) + continue; + + if (req->rq_timedout || /* already dealt with */ + req->rq_deadline > now) /* not expired */ + continue; + + /* Deal with this guy. Do it asynchronously to not block + * ptlrpcd thread. */ + ptlrpc_expire_one_request(req, 1); + } + + /* + * When waiting for a whole set, we always break out of the + * sleep so we can recalculate the timeout, or enable interrupts + * if everyone's timed out. + */ + return 1; +} +EXPORT_SYMBOL(ptlrpc_expired_set); + +/** + * Sets rq_intr flag in \a req under spinlock. + */ +void ptlrpc_mark_interrupted(struct ptlrpc_request *req) +{ + spin_lock(&req->rq_lock); + req->rq_intr = 1; + spin_unlock(&req->rq_lock); +} +EXPORT_SYMBOL(ptlrpc_mark_interrupted); + +/** + * Interrupts (sets interrupted flag) all uncompleted requests in + * a set \a data. Callback for l_wait_event for interruptible waits. + */ +void ptlrpc_interrupted_set(void *data) +{ + struct ptlrpc_request_set *set = data; + struct list_head *tmp; + + LASSERT(set != NULL); + CDEBUG(D_RPCTRACE, "INTERRUPTED SET %p\n", set); + + list_for_each(tmp, &set->set_requests) { + struct ptlrpc_request *req = + list_entry(tmp, struct ptlrpc_request, + rq_set_chain); + + if (req->rq_phase != RQ_PHASE_RPC && + req->rq_phase != RQ_PHASE_UNREGISTERING) + continue; + + ptlrpc_mark_interrupted(req); + } +} +EXPORT_SYMBOL(ptlrpc_interrupted_set); + +/** + * Get the smallest timeout in the set; this does NOT set a timeout. + */ +int ptlrpc_set_next_timeout(struct ptlrpc_request_set *set) +{ + struct list_head *tmp; + time_t now = get_seconds(); + int timeout = 0; + struct ptlrpc_request *req; + int deadline; + + list_for_each(tmp, &set->set_requests) { + req = list_entry(tmp, struct ptlrpc_request, rq_set_chain); + + /* + * Request in-flight? + */ + if (!(((req->rq_phase == RQ_PHASE_RPC) && !req->rq_waiting) || + (req->rq_phase == RQ_PHASE_BULK) || + (req->rq_phase == RQ_PHASE_NEW))) + continue; + + /* + * Already timed out. + */ + if (req->rq_timedout) + continue; + + /* + * Waiting for ctx. + */ + if (req->rq_wait_ctx) + continue; + + if (req->rq_phase == RQ_PHASE_NEW) + deadline = req->rq_sent; + else if (req->rq_phase == RQ_PHASE_RPC && req->rq_resend) + deadline = req->rq_sent; + else + deadline = req->rq_sent + req->rq_timeout; + + if (deadline <= now) /* actually expired already */ + timeout = 1; /* ASAP */ + else if (timeout == 0 || timeout > deadline - now) + timeout = deadline - now; + } + return timeout; +} +EXPORT_SYMBOL(ptlrpc_set_next_timeout); + +/** + * Send all unset request from the set and then wait until all + * requests in the set complete (either get a reply, timeout, get an + * error or otherwise be interrupted). + * Returns 0 on success or error code otherwise. 
+ */ +int ptlrpc_set_wait(struct ptlrpc_request_set *set) +{ + struct list_head *tmp; + struct ptlrpc_request *req; + struct l_wait_info lwi; + int rc, timeout; + + if (set->set_producer) + (void)ptlrpc_set_producer(set); + else + list_for_each(tmp, &set->set_requests) { + req = list_entry(tmp, struct ptlrpc_request, + rq_set_chain); + if (req->rq_phase == RQ_PHASE_NEW) + (void)ptlrpc_send_new_req(req); + } + + if (list_empty(&set->set_requests)) + return 0; + + do { + timeout = ptlrpc_set_next_timeout(set); + + /* wait until all complete, interrupted, or an in-flight + * req times out */ + CDEBUG(D_RPCTRACE, "set %p going to sleep for %d seconds\n", + set, timeout); + + if (timeout == 0 && !cfs_signal_pending()) + /* + * No requests are in-flight (ether timed out + * or delayed), so we can allow interrupts. + * We still want to block for a limited time, + * so we allow interrupts during the timeout. + */ + lwi = LWI_TIMEOUT_INTR_ALL(cfs_time_seconds(1), + ptlrpc_expired_set, + ptlrpc_interrupted_set, set); + else + /* + * At least one request is in flight, so no + * interrupts are allowed. Wait until all + * complete, or an in-flight req times out. + */ + lwi = LWI_TIMEOUT(cfs_time_seconds(timeout ? timeout : 1), + ptlrpc_expired_set, set); + + rc = l_wait_event(set->set_waitq, ptlrpc_check_set(NULL, set), &lwi); + + /* LU-769 - if we ignored the signal because it was already + * pending when we started, we need to handle it now or we risk + * it being ignored forever */ + if (rc == -ETIMEDOUT && !lwi.lwi_allow_intr && + cfs_signal_pending()) { + sigset_t blocked_sigs = + cfs_block_sigsinv(LUSTRE_FATAL_SIGS); + + /* In fact we only interrupt for the "fatal" signals + * like SIGINT or SIGKILL. We still ignore less + * important signals since ptlrpc set is not easily + * reentrant from userspace again */ + if (cfs_signal_pending()) + ptlrpc_interrupted_set(set); + cfs_restore_sigs(blocked_sigs); + } + + LASSERT(rc == 0 || rc == -EINTR || rc == -ETIMEDOUT); + + /* -EINTR => all requests have been flagged rq_intr so next + * check completes. + * -ETIMEDOUT => someone timed out. When all reqs have + * timed out, signals are enabled allowing completion with + * EINTR. + * I don't really care if we go once more round the loop in + * the error cases -eeb. */ + if (rc == 0 && atomic_read(&set->set_remaining) == 0) { + list_for_each(tmp, &set->set_requests) { + req = list_entry(tmp, struct ptlrpc_request, + rq_set_chain); + spin_lock(&req->rq_lock); + req->rq_invalid_rqset = 1; + spin_unlock(&req->rq_lock); + } + } + } while (rc != 0 || atomic_read(&set->set_remaining) != 0); + + LASSERT(atomic_read(&set->set_remaining) == 0); + + rc = set->set_rc; /* rq_status of already freed requests if any */ + list_for_each(tmp, &set->set_requests) { + req = list_entry(tmp, struct ptlrpc_request, rq_set_chain); + + LASSERT(req->rq_phase == RQ_PHASE_COMPLETE); + if (req->rq_status != 0) + rc = req->rq_status; + } + + if (set->set_interpret != NULL) { + int (*interpreter)(struct ptlrpc_request_set *set, void *, int) = + set->set_interpret; + rc = interpreter(set, set->set_arg, rc); + } else { + struct ptlrpc_set_cbdata *cbdata, *n; + int err; + + list_for_each_entry_safe(cbdata, n, + &set->set_cblist, psc_item) { + list_del_init(&cbdata->psc_item); + err = cbdata->psc_interpret(set, cbdata->psc_data, rc); + if (err && !rc) + rc = err; + OBD_FREE_PTR(cbdata); + } + } + + return rc; +} +EXPORT_SYMBOL(ptlrpc_set_wait); + +/** + * Helper function for request freeing. 
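/*
 * Minimal sketch of a per-set completion callback registered with
 * ptlrpc_set_add_cb(); ptlrpc_set_wait() above invokes it after all
 * requests in the set have completed, passing the accumulated status.
 */
static int example_set_done(struct ptlrpc_request_set *set, void *data,
			    int rc)
{
	CDEBUG(D_RPCTRACE, "set %p finished with rc = %d\n", set, rc);
	return rc;
}

/*
 * Registered before waiting on the set, e.g.
 *	rc = ptlrpc_set_add_cb(set, example_set_done, my_data);
 */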
+ * Called when request count reached zero and request needs to be freed. + * Removes request from all sorts of sending/replay lists it might be on, + * frees network buffers if any are present. + * If \a locked is set, that means caller is already holding import imp_lock + * and so we no longer need to reobtain it (for certain lists manipulations) + */ +static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked) +{ + if (request == NULL) + return; + LASSERTF(!request->rq_receiving_reply, "req %p\n", request); + LASSERTF(request->rq_rqbd == NULL, "req %p\n", request);/* client-side */ + LASSERTF(list_empty(&request->rq_list), "req %p\n", request); + LASSERTF(list_empty(&request->rq_set_chain), "req %p\n", request); + LASSERTF(list_empty(&request->rq_exp_list), "req %p\n", request); + LASSERTF(!request->rq_replay, "req %p\n", request); + + req_capsule_fini(&request->rq_pill); + + /* We must take it off the imp_replay_list first. Otherwise, we'll set + * request->rq_reqmsg to NULL while osc_close is dereferencing it. */ + if (request->rq_import != NULL) { + if (!locked) + spin_lock(&request->rq_import->imp_lock); + list_del_init(&request->rq_replay_list); + if (!locked) + spin_unlock(&request->rq_import->imp_lock); + } + LASSERTF(list_empty(&request->rq_replay_list), "req %p\n", request); + + if (atomic_read(&request->rq_refcount) != 0) { + DEBUG_REQ(D_ERROR, request, + "freeing request with nonzero refcount"); + LBUG(); + } + + if (request->rq_repbuf != NULL) + sptlrpc_cli_free_repbuf(request); + if (request->rq_export != NULL) { + class_export_put(request->rq_export); + request->rq_export = NULL; + } + if (request->rq_import != NULL) { + class_import_put(request->rq_import); + request->rq_import = NULL; + } + if (request->rq_bulk != NULL) + ptlrpc_free_bulk_pin(request->rq_bulk); + + if (request->rq_reqbuf != NULL || request->rq_clrbuf != NULL) + sptlrpc_cli_free_reqbuf(request); + + if (request->rq_cli_ctx) + sptlrpc_req_put_ctx(request, !locked); + + if (request->rq_pool) + __ptlrpc_free_req_to_pool(request); + else + ptlrpc_request_cache_free(request); +} + +static int __ptlrpc_req_finished(struct ptlrpc_request *request, int locked); +/** + * Drop one request reference. Must be called with import imp_lock held. + * When reference count drops to zero, request is freed. + */ +void ptlrpc_req_finished_with_imp_lock(struct ptlrpc_request *request) +{ + assert_spin_locked(&request->rq_import->imp_lock); + (void)__ptlrpc_req_finished(request, 1); +} +EXPORT_SYMBOL(ptlrpc_req_finished_with_imp_lock); + +/** + * Helper function + * Drops one reference count for request \a request. + * \a locked set indicates that caller holds import imp_lock. + * Frees the request when reference count reaches zero. + */ +static int __ptlrpc_req_finished(struct ptlrpc_request *request, int locked) +{ + if (request == NULL) + return 1; + + if (request == LP_POISON || + request->rq_reqmsg == LP_POISON) { + CERROR("dereferencing freed request (bug 575)\n"); + LBUG(); + return 1; + } + + DEBUG_REQ(D_INFO, request, "refcount now %u", + atomic_read(&request->rq_refcount) - 1); + + if (atomic_dec_and_test(&request->rq_refcount)) { + __ptlrpc_free_req(request, locked); + return 1; + } + + return 0; +} + +/** + * Drops one reference count for a request. 
+ */ +void ptlrpc_req_finished(struct ptlrpc_request *request) +{ + __ptlrpc_req_finished(request, 0); +} +EXPORT_SYMBOL(ptlrpc_req_finished); + +/** + * Returns xid of a \a request + */ +__u64 ptlrpc_req_xid(struct ptlrpc_request *request) +{ + return request->rq_xid; +} +EXPORT_SYMBOL(ptlrpc_req_xid); + +/** + * Disengage the client's reply buffer from the network + * NB does _NOT_ unregister any client-side bulk. + * IDEMPOTENT, but _not_ safe against concurrent callers. + * The request owner (i.e. the thread doing the I/O) must call... + * Returns 0 on success or 1 if unregistering cannot be made. + */ +int ptlrpc_unregister_reply(struct ptlrpc_request *request, int async) +{ + int rc; + wait_queue_head_t *wq; + struct l_wait_info lwi; + + /* + * Might sleep. + */ + LASSERT(!in_interrupt()); + + /* + * Let's setup deadline for reply unlink. + */ + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) && + async && request->rq_reply_deadline == 0) + request->rq_reply_deadline = get_seconds()+LONG_UNLINK; + + /* + * Nothing left to do. + */ + if (!ptlrpc_client_recv_or_unlink(request)) + return 1; + + LNetMDUnlink(request->rq_reply_md_h); + + /* + * Let's check it once again. + */ + if (!ptlrpc_client_recv_or_unlink(request)) + return 1; + + /* + * Move to "Unregistering" phase as reply was not unlinked yet. + */ + ptlrpc_rqphase_move(request, RQ_PHASE_UNREGISTERING); + + /* + * Do not wait for unlink to finish. + */ + if (async) + return 0; + + /* + * We have to l_wait_event() whatever the result, to give liblustre + * a chance to run reply_in_callback(), and to make sure we've + * unlinked before returning a req to the pool. + */ + if (request->rq_set != NULL) + wq = &request->rq_set->set_waitq; + else + wq = &request->rq_reply_waitq; + + for (;;) { + /* Network access will complete in finite time but the HUGE + * timeout lets us CWARN for visibility of sluggish NALs */ + lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(LONG_UNLINK), + cfs_time_seconds(1), NULL, NULL); + rc = l_wait_event(*wq, !ptlrpc_client_recv_or_unlink(request), + &lwi); + if (rc == 0) { + ptlrpc_rqphase_move(request, request->rq_next_phase); + return 1; + } + + LASSERT(rc == -ETIMEDOUT); + DEBUG_REQ(D_WARNING, request, + "Unexpectedly long timeout rvcng=%d unlnk=%d/%d", + request->rq_receiving_reply, + request->rq_req_unlink, request->rq_reply_unlink); + } + return 0; +} +EXPORT_SYMBOL(ptlrpc_unregister_reply); + +static void ptlrpc_free_request(struct ptlrpc_request *req) +{ + spin_lock(&req->rq_lock); + req->rq_replay = 0; + spin_unlock(&req->rq_lock); + + if (req->rq_commit_cb != NULL) + req->rq_commit_cb(req); + list_del_init(&req->rq_replay_list); + + __ptlrpc_req_finished(req, 1); +} + +/** + * the request is committed and dropped from the replay list of its import + */ +void ptlrpc_request_committed(struct ptlrpc_request *req, int force) +{ + struct obd_import *imp = req->rq_import; + + spin_lock(&imp->imp_lock); + if (list_empty(&req->rq_replay_list)) { + spin_unlock(&imp->imp_lock); + return; + } + + if (force || req->rq_transno <= imp->imp_peer_committed_transno) + ptlrpc_free_request(req); + + spin_unlock(&imp->imp_lock); +} +EXPORT_SYMBOL(ptlrpc_request_committed); + +/** + * Iterates through replay_list on import and prunes + * all requests have transno smaller than last_committed for the + * import and don't have rq_replay set. + * Since requests are sorted in transno order, stops when meeting first + * transno bigger than last_committed. 
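/*
 * Minimal sketch of a commit callback as invoked by ptlrpc_free_request()
 * above: it runs once the request is dropped from the import replay list,
 * i.e. when its transno is known to be committed on the server or when the
 * caller forces it.  Assigning rq_commit_cb before the request is sent is
 * an assumption made for this example.
 */
static void example_commit_cb(struct ptlrpc_request *req)
{
	CDEBUG(D_RPCTRACE, "request x%llu committed, transno %llu\n",
	       req->rq_xid, req->rq_transno);
}

/*
 * Typical wiring:
 *	req->rq_commit_cb = example_commit_cb;
 * before sending, optionally followed later by
 *	ptlrpc_request_committed(req, 1);
 * to run the callback without waiting for last_committed to catch up.
 */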
+ * caller must hold imp->imp_lock + */ +void ptlrpc_free_committed(struct obd_import *imp) +{ + struct ptlrpc_request *req, *saved; + struct ptlrpc_request *last_req = NULL; /* temporary fire escape */ + bool skip_committed_list = true; + + LASSERT(imp != NULL); + assert_spin_locked(&imp->imp_lock); + + if (imp->imp_peer_committed_transno == imp->imp_last_transno_checked && + imp->imp_generation == imp->imp_last_generation_checked) { + CDEBUG(D_INFO, "%s: skip recheck: last_committed %llu\n", + imp->imp_obd->obd_name, imp->imp_peer_committed_transno); + return; + } + CDEBUG(D_RPCTRACE, "%s: committing for last_committed %llu gen %d\n", + imp->imp_obd->obd_name, imp->imp_peer_committed_transno, + imp->imp_generation); + + if (imp->imp_generation != imp->imp_last_generation_checked) + skip_committed_list = false; + + imp->imp_last_transno_checked = imp->imp_peer_committed_transno; + imp->imp_last_generation_checked = imp->imp_generation; + + list_for_each_entry_safe(req, saved, &imp->imp_replay_list, + rq_replay_list) { + /* XXX ok to remove when 1357 resolved - rread 05/29/03 */ + LASSERT(req != last_req); + last_req = req; + + if (req->rq_transno == 0) { + DEBUG_REQ(D_EMERG, req, "zero transno during replay"); + LBUG(); + } + if (req->rq_import_generation < imp->imp_generation) { + DEBUG_REQ(D_RPCTRACE, req, "free request with old gen"); + goto free_req; + } + + /* not yet committed */ + if (req->rq_transno > imp->imp_peer_committed_transno) { + DEBUG_REQ(D_RPCTRACE, req, "stopping search"); + break; + } + + if (req->rq_replay) { + DEBUG_REQ(D_RPCTRACE, req, "keeping (FL_REPLAY)"); + list_move_tail(&req->rq_replay_list, + &imp->imp_committed_list); + continue; + } + + DEBUG_REQ(D_INFO, req, "commit (last_committed %llu)", + imp->imp_peer_committed_transno); +free_req: + ptlrpc_free_request(req); + } + if (skip_committed_list) + return; + + list_for_each_entry_safe(req, saved, &imp->imp_committed_list, + rq_replay_list) { + LASSERT(req->rq_transno != 0); + if (req->rq_import_generation < imp->imp_generation) { + DEBUG_REQ(D_RPCTRACE, req, "free stale open request"); + ptlrpc_free_request(req); + } + } +} + +void ptlrpc_cleanup_client(struct obd_import *imp) +{ +} +EXPORT_SYMBOL(ptlrpc_cleanup_client); + +/** + * Schedule previously sent request for resend. + * For bulk requests we assign new xid (to avoid problems with + * lost replies and therefore several transfers landing into same buffer + * from different sending attempts). + */ +void ptlrpc_resend_req(struct ptlrpc_request *req) +{ + DEBUG_REQ(D_HA, req, "going to resend"); + spin_lock(&req->rq_lock); + + /* Request got reply but linked to the import list still. + Let ptlrpc_check_set() to process it. 
*/ + if (ptlrpc_client_replied(req)) { + spin_unlock(&req->rq_lock); + DEBUG_REQ(D_HA, req, "it has reply, so skip it"); + return; + } + + lustre_msg_set_handle(req->rq_reqmsg, &(struct lustre_handle){ 0 }); + req->rq_status = -EAGAIN; + + req->rq_resend = 1; + req->rq_net_err = 0; + req->rq_timedout = 0; + if (req->rq_bulk) { + __u64 old_xid = req->rq_xid; + + /* ensure previous bulk fails */ + req->rq_xid = ptlrpc_next_xid(); + CDEBUG(D_HA, "resend bulk old x%llu new x%llu\n", + old_xid, req->rq_xid); + } + ptlrpc_client_wake_req(req); + spin_unlock(&req->rq_lock); +} +EXPORT_SYMBOL(ptlrpc_resend_req); + +/* XXX: this function and rq_status are currently unused */ +void ptlrpc_restart_req(struct ptlrpc_request *req) +{ + DEBUG_REQ(D_HA, req, "restarting (possibly-)completed request"); + req->rq_status = -ERESTARTSYS; + + spin_lock(&req->rq_lock); + req->rq_restart = 1; + req->rq_timedout = 0; + ptlrpc_client_wake_req(req); + spin_unlock(&req->rq_lock); +} +EXPORT_SYMBOL(ptlrpc_restart_req); + +/** + * Grab additional reference on a request \a req + */ +struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req) +{ + atomic_inc(&req->rq_refcount); + return req; +} +EXPORT_SYMBOL(ptlrpc_request_addref); + +/** + * Add a request to import replay_list. + * Must be called under imp_lock + */ +void ptlrpc_retain_replayable_request(struct ptlrpc_request *req, + struct obd_import *imp) +{ + struct list_head *tmp; + + assert_spin_locked(&imp->imp_lock); + + if (req->rq_transno == 0) { + DEBUG_REQ(D_EMERG, req, "saving request with zero transno"); + LBUG(); + } + + /* clear this for new requests that were resent as well + as resent replayed requests. */ + lustre_msg_clear_flags(req->rq_reqmsg, MSG_RESENT); + + /* don't re-add requests that have been replayed */ + if (!list_empty(&req->rq_replay_list)) + return; + + lustre_msg_add_flags(req->rq_reqmsg, MSG_REPLAY); + + LASSERT(imp->imp_replayable); + /* Balanced in ptlrpc_free_committed, usually. */ + ptlrpc_request_addref(req); + list_for_each_prev(tmp, &imp->imp_replay_list) { + struct ptlrpc_request *iter = + list_entry(tmp, struct ptlrpc_request, + rq_replay_list); + + /* We may have duplicate transnos if we create and then + * open a file, or for closes retained if to match creating + * opens, so use req->rq_xid as a secondary key. + * (See bugs 684, 685, and 428.) + * XXX no longer needed, but all opens need transnos! + */ + if (iter->rq_transno > req->rq_transno) + continue; + + if (iter->rq_transno == req->rq_transno) { + LASSERT(iter->rq_xid != req->rq_xid); + if (iter->rq_xid > req->rq_xid) + continue; + } + + list_add(&req->rq_replay_list, &iter->rq_replay_list); + return; + } + + list_add(&req->rq_replay_list, &imp->imp_replay_list); +} +EXPORT_SYMBOL(ptlrpc_retain_replayable_request); + +/** + * Send request and wait until it completes. + * Returns request processing status. 
+ */ +int ptlrpc_queue_wait(struct ptlrpc_request *req) +{ + struct ptlrpc_request_set *set; + int rc; + + LASSERT(req->rq_set == NULL); + LASSERT(!req->rq_receiving_reply); + + set = ptlrpc_prep_set(); + if (set == NULL) { + CERROR("Unable to allocate ptlrpc set."); + return -ENOMEM; + } + + /* for distributed debugging */ + lustre_msg_set_status(req->rq_reqmsg, current_pid()); + + /* add a ref for the set (see comment in ptlrpc_set_add_req) */ + ptlrpc_request_addref(req); + ptlrpc_set_add_req(set, req); + rc = ptlrpc_set_wait(set); + ptlrpc_set_destroy(set); + + return rc; +} +EXPORT_SYMBOL(ptlrpc_queue_wait); + +struct ptlrpc_replay_async_args { + int praa_old_state; + int praa_old_status; +}; + +/** + * Callback used for replayed requests reply processing. + * In case of successful reply calls registered request replay callback. + * In case of error restart replay process. + */ +static int ptlrpc_replay_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + void *data, int rc) +{ + struct ptlrpc_replay_async_args *aa = data; + struct obd_import *imp = req->rq_import; + + atomic_dec(&imp->imp_replay_inflight); + + if (!ptlrpc_client_replied(req)) { + CERROR("request replay timed out, restarting recovery\n"); + rc = -ETIMEDOUT; + goto out; + } + + if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR && + (lustre_msg_get_status(req->rq_repmsg) == -ENOTCONN || + lustre_msg_get_status(req->rq_repmsg) == -ENODEV)) { + rc = lustre_msg_get_status(req->rq_repmsg); + goto out; + } + + /** VBR: check version failure */ + if (lustre_msg_get_status(req->rq_repmsg) == -EOVERFLOW) { + /** replay was failed due to version mismatch */ + DEBUG_REQ(D_WARNING, req, "Version mismatch during replay\n"); + spin_lock(&imp->imp_lock); + imp->imp_vbr_failed = 1; + imp->imp_no_lock_replay = 1; + spin_unlock(&imp->imp_lock); + lustre_msg_set_status(req->rq_repmsg, aa->praa_old_status); + } else { + /** The transno had better not change over replay. */ + LASSERTF(lustre_msg_get_transno(req->rq_reqmsg) == + lustre_msg_get_transno(req->rq_repmsg) || + lustre_msg_get_transno(req->rq_repmsg) == 0, + "%#llx/%#llx\n", + lustre_msg_get_transno(req->rq_reqmsg), + lustre_msg_get_transno(req->rq_repmsg)); + } + + spin_lock(&imp->imp_lock); + /** if replays by version then gap occur on server, no trust to locks */ + if (lustre_msg_get_flags(req->rq_repmsg) & MSG_VERSION_REPLAY) + imp->imp_no_lock_replay = 1; + imp->imp_last_replay_transno = lustre_msg_get_transno(req->rq_reqmsg); + spin_unlock(&imp->imp_lock); + LASSERT(imp->imp_last_replay_transno); + + /* transaction number shouldn't be bigger than the latest replayed */ + if (req->rq_transno > lustre_msg_get_transno(req->rq_reqmsg)) { + DEBUG_REQ(D_ERROR, req, + "Reported transno %llu is bigger than the replayed one: %llu", + req->rq_transno, + lustre_msg_get_transno(req->rq_reqmsg)); + rc = -EINVAL; + goto out; + } + + DEBUG_REQ(D_HA, req, "got rep"); + + /* let the callback do fixups, possibly including in the request */ + if (req->rq_replay_cb) + req->rq_replay_cb(req); + + if (ptlrpc_client_replied(req) && + lustre_msg_get_status(req->rq_repmsg) != aa->praa_old_status) { + DEBUG_REQ(D_ERROR, req, "status %d, old was %d", + lustre_msg_get_status(req->rq_repmsg), + aa->praa_old_status); + } else { + /* Put it back for re-replay. 
*/ + lustre_msg_set_status(req->rq_repmsg, aa->praa_old_status); + } + + /* + * Errors while replay can set transno to 0, but + * imp_last_replay_transno shouldn't be set to 0 anyway + */ + if (req->rq_transno == 0) + CERROR("Transno is 0 during replay!\n"); + + /* continue with recovery */ + rc = ptlrpc_import_recovery_state_machine(imp); + out: + req->rq_send_state = aa->praa_old_state; + + if (rc != 0) + /* this replay failed, so restart recovery */ + ptlrpc_connect_import(imp); + + return rc; +} + +/** + * Prepares and queues request for replay. + * Adds it to ptlrpcd queue for actual sending. + * Returns 0 on success. + */ +int ptlrpc_replay_req(struct ptlrpc_request *req) +{ + struct ptlrpc_replay_async_args *aa; + + LASSERT(req->rq_import->imp_state == LUSTRE_IMP_REPLAY); + + LASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + memset(aa, 0, sizeof(*aa)); + + /* Prepare request to be resent with ptlrpcd */ + aa->praa_old_state = req->rq_send_state; + req->rq_send_state = LUSTRE_IMP_REPLAY; + req->rq_phase = RQ_PHASE_NEW; + req->rq_next_phase = RQ_PHASE_UNDEFINED; + if (req->rq_repmsg) + aa->praa_old_status = lustre_msg_get_status(req->rq_repmsg); + req->rq_status = 0; + req->rq_interpret_reply = ptlrpc_replay_interpret; + /* Readjust the timeout for current conditions */ + ptlrpc_at_set_req_timeout(req); + + /* Tell server the net_latency, so the server can calculate how long + * it should wait for next replay */ + lustre_msg_set_service_time(req->rq_reqmsg, + ptlrpc_at_get_net_latency(req)); + DEBUG_REQ(D_HA, req, "REPLAY"); + + atomic_inc(&req->rq_import->imp_replay_inflight); + ptlrpc_request_addref(req); /* ptlrpcd needs a ref */ + + ptlrpcd_add_req(req, PDL_POLICY_LOCAL, -1); + return 0; +} +EXPORT_SYMBOL(ptlrpc_replay_req); + +/** + * Aborts all in-flight request on import \a imp sending and delayed lists + */ +void ptlrpc_abort_inflight(struct obd_import *imp) +{ + struct list_head *tmp, *n; + + /* Make sure that no new requests get processed for this import. + * ptlrpc_{queue,set}_wait must (and does) hold imp_lock while testing + * this flag and then putting requests on sending_list or delayed_list. + */ + spin_lock(&imp->imp_lock); + + /* XXX locking? Maybe we should remove each request with the list + * locked? Also, how do we know if the requests on the list are + * being freed at this time? + */ + list_for_each_safe(tmp, n, &imp->imp_sending_list) { + struct ptlrpc_request *req = + list_entry(tmp, struct ptlrpc_request, rq_list); + + DEBUG_REQ(D_RPCTRACE, req, "inflight"); + + spin_lock(&req->rq_lock); + if (req->rq_import_generation < imp->imp_generation) { + req->rq_err = 1; + req->rq_status = -EIO; + ptlrpc_client_wake_req(req); + } + spin_unlock(&req->rq_lock); + } + + list_for_each_safe(tmp, n, &imp->imp_delayed_list) { + struct ptlrpc_request *req = + list_entry(tmp, struct ptlrpc_request, rq_list); + + DEBUG_REQ(D_RPCTRACE, req, "aborting waiting req"); + + spin_lock(&req->rq_lock); + if (req->rq_import_generation < imp->imp_generation) { + req->rq_err = 1; + req->rq_status = -EIO; + ptlrpc_client_wake_req(req); + } + spin_unlock(&req->rq_lock); + } + + /* Last chance to free reqs left on the replay list, but we + * will still leak reqs that haven't committed. 
*/ + if (imp->imp_replayable) + ptlrpc_free_committed(imp); + + spin_unlock(&imp->imp_lock); +} +EXPORT_SYMBOL(ptlrpc_abort_inflight); + +/** + * Abort all uncompleted requests in request set \a set + */ +void ptlrpc_abort_set(struct ptlrpc_request_set *set) +{ + struct list_head *tmp, *pos; + + LASSERT(set != NULL); + + list_for_each_safe(pos, tmp, &set->set_requests) { + struct ptlrpc_request *req = + list_entry(pos, struct ptlrpc_request, + rq_set_chain); + + spin_lock(&req->rq_lock); + if (req->rq_phase != RQ_PHASE_RPC) { + spin_unlock(&req->rq_lock); + continue; + } + + req->rq_err = 1; + req->rq_status = -EINTR; + ptlrpc_client_wake_req(req); + spin_unlock(&req->rq_lock); + } +} + +static __u64 ptlrpc_last_xid; +static spinlock_t ptlrpc_last_xid_lock; + +/** + * Initialize the XID for the node. This is common among all requests on + * this node, and only requires the property that it is monotonically + * increasing. It does not need to be sequential. Since this is also used + * as the RDMA match bits, it is important that a single client NOT have + * the same match bits for two different in-flight requests, hence we do + * NOT want to have an XID per target or similar. + * + * To avoid an unlikely collision between match bits after a client reboot + * (which would deliver old data into the wrong RDMA buffer) initialize + * the XID based on the current time, assuming a maximum RPC rate of 1M RPC/s. + * If the time is clearly incorrect, we instead use a 62-bit random number. + * In the worst case the random number will overflow 1M RPCs per second in + * 9133 years, or permutations thereof. + */ +#define YEAR_2004 (1ULL << 30) +void ptlrpc_init_xid(void) +{ + time_t now = get_seconds(); + + spin_lock_init(&ptlrpc_last_xid_lock); + if (now < YEAR_2004) { + cfs_get_random_bytes(&ptlrpc_last_xid, sizeof(ptlrpc_last_xid)); + ptlrpc_last_xid >>= 2; + ptlrpc_last_xid |= (1ULL << 61); + } else { + ptlrpc_last_xid = (__u64)now << 20; + } + + /* Always need to be aligned to a power-of-two for multi-bulk BRW */ + CLASSERT((PTLRPC_BULK_OPS_COUNT & (PTLRPC_BULK_OPS_COUNT - 1)) == 0); + ptlrpc_last_xid &= PTLRPC_BULK_OPS_MASK; +} + +/** + * Increase xid and returns resulting new value to the caller. + * + * Multi-bulk BRW RPCs consume multiple XIDs for each bulk transfer, starting + * at the returned xid, up to xid + PTLRPC_BULK_OPS_COUNT - 1. The BRW RPC + * itself uses the last bulk xid needed, so the server can determine the + * the number of bulk transfers from the RPC XID and a bitmask. The starting + * xid must align to a power-of-two value. + * + * This is assumed to be true due to the initial ptlrpc_last_xid + * value also being initialized to a power-of-two value. LU-1431 + */ +__u64 ptlrpc_next_xid(void) +{ + __u64 next; + + spin_lock(&ptlrpc_last_xid_lock); + next = ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT; + ptlrpc_last_xid = next; + spin_unlock(&ptlrpc_last_xid_lock); + + return next; +} +EXPORT_SYMBOL(ptlrpc_next_xid); + +/** + * Get a glimpse at what next xid value might have been. + * Returns possible next xid. 
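+ *
+ * A small worked example of the xid scheme set up by ptlrpc_init_xid()
+ * and ptlrpc_next_xid() above (the constant is illustrative only:
+ * assume PTLRPC_BULK_OPS_COUNT == 16 and PTLRPC_BULK_OPS_MASK ==
+ * ~(PTLRPC_BULK_OPS_COUNT - 1)):
+ *
+ *    seed = ((__u64)get_seconds() << 20) & ~15ULL;  // aligned start
+ *    xid1 = seed + 16;    // first ptlrpc_next_xid()
+ *    xid2 = xid1 + 16;    // second ptlrpc_next_xid()
+ *
+ * Every returned xid stays 16-aligned, so one multi-bulk BRW RPC can
+ * use xid1 .. xid1 + 15 for its bulk transfers and the low four bits
+ * alone identify the individual transfer.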
+ */ +__u64 ptlrpc_sample_next_xid(void) +{ +#if BITS_PER_LONG == 32 + /* need to avoid possible word tearing on 32-bit systems */ + __u64 next; + + spin_lock(&ptlrpc_last_xid_lock); + next = ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT; + spin_unlock(&ptlrpc_last_xid_lock); + + return next; +#else + /* No need to lock, since returned value is racy anyways */ + return ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT; +#endif +} +EXPORT_SYMBOL(ptlrpc_sample_next_xid); + +/** + * Functions for operating ptlrpc workers. + * + * A ptlrpc work is a function which will be running inside ptlrpc context. + * The callback shouldn't sleep otherwise it will block that ptlrpcd thread. + * + * 1. after a work is created, it can be used many times, that is: + * handler = ptlrpcd_alloc_work(); + * ptlrpcd_queue_work(); + * + * queue it again when necessary: + * ptlrpcd_queue_work(); + * ptlrpcd_destroy_work(); + * 2. ptlrpcd_queue_work() can be called by multiple processes meanwhile, but + * it will only be queued once in any time. Also as its name implies, it may + * have delay before it really runs by ptlrpcd thread. + */ +struct ptlrpc_work_async_args { + int (*cb)(const struct lu_env *, void *); + void *cbdata; +}; + +static void ptlrpcd_add_work_req(struct ptlrpc_request *req) +{ + /* re-initialize the req */ + req->rq_timeout = obd_timeout; + req->rq_sent = get_seconds(); + req->rq_deadline = req->rq_sent + req->rq_timeout; + req->rq_reply_deadline = req->rq_deadline; + req->rq_phase = RQ_PHASE_INTERPRET; + req->rq_next_phase = RQ_PHASE_COMPLETE; + req->rq_xid = ptlrpc_next_xid(); + req->rq_import_generation = req->rq_import->imp_generation; + + ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1); +} + +static int work_interpreter(const struct lu_env *env, + struct ptlrpc_request *req, void *data, int rc) +{ + struct ptlrpc_work_async_args *arg = data; + + LASSERT(ptlrpcd_check_work(req)); + LASSERT(arg->cb != NULL); + + rc = arg->cb(env, arg->cbdata); + + list_del_init(&req->rq_set_chain); + req->rq_set = NULL; + + if (atomic_dec_return(&req->rq_refcount) > 1) { + atomic_set(&req->rq_refcount, 2); + ptlrpcd_add_work_req(req); + } + return rc; +} + +static int worker_format; + +static int ptlrpcd_check_work(struct ptlrpc_request *req) +{ + return req->rq_pill.rc_fmt == (void *)&worker_format; +} + +/** + * Create a work for ptlrpc. + */ +void *ptlrpcd_alloc_work(struct obd_import *imp, + int (*cb)(const struct lu_env *, void *), void *cbdata) +{ + struct ptlrpc_request *req = NULL; + struct ptlrpc_work_async_args *args; + + might_sleep(); + + if (cb == NULL) + return ERR_PTR(-EINVAL); + + /* copy some code from deprecated fakereq. 
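+ *
+ * For reference, a minimal usage sketch of the work interface built by
+ * this function (flush_cb and cbdata are made-up names, purely for
+ * illustration):
+ *
+ *    static int flush_cb(const struct lu_env *env, void *data)
+ *    {
+ *        // runs in ptlrpcd context, so it must not sleep
+ *        return 0;
+ *    }
+ *
+ *    handler = ptlrpcd_alloc_work(imp, flush_cb, cbdata);
+ *    if (!IS_ERR(handler))
+ *        ptlrpcd_queue_work(handler);  // may be called repeatedly
+ *    ...
+ *    ptlrpcd_destroy_work(handler);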
*/ + req = ptlrpc_request_cache_alloc(GFP_NOFS); + if (req == NULL) { + CERROR("ptlrpc: run out of memory!\n"); + return ERR_PTR(-ENOMEM); + } + + req->rq_send_state = LUSTRE_IMP_FULL; + req->rq_type = PTL_RPC_MSG_REQUEST; + req->rq_import = class_import_get(imp); + req->rq_export = NULL; + req->rq_interpret_reply = work_interpreter; + /* don't want reply */ + req->rq_receiving_reply = 0; + req->rq_req_unlink = req->rq_reply_unlink = 0; + req->rq_no_delay = req->rq_no_resend = 1; + req->rq_pill.rc_fmt = (void *)&worker_format; + + spin_lock_init(&req->rq_lock); + INIT_LIST_HEAD(&req->rq_list); + INIT_LIST_HEAD(&req->rq_replay_list); + INIT_LIST_HEAD(&req->rq_set_chain); + INIT_LIST_HEAD(&req->rq_history_list); + INIT_LIST_HEAD(&req->rq_exp_list); + init_waitqueue_head(&req->rq_reply_waitq); + init_waitqueue_head(&req->rq_set_waitq); + atomic_set(&req->rq_refcount, 1); + + CLASSERT(sizeof(*args) <= sizeof(req->rq_async_args)); + args = ptlrpc_req_async_args(req); + args->cb = cb; + args->cbdata = cbdata; + + return req; +} +EXPORT_SYMBOL(ptlrpcd_alloc_work); + +void ptlrpcd_destroy_work(void *handler) +{ + struct ptlrpc_request *req = handler; + + if (req) + ptlrpc_req_finished(req); +} +EXPORT_SYMBOL(ptlrpcd_destroy_work); + +int ptlrpcd_queue_work(void *handler) +{ + struct ptlrpc_request *req = handler; + + /* + * Check if the req is already being queued. + * + * Here comes a trick: it lacks a way of checking if a req is being + * processed reliably in ptlrpc. Here I have to use refcount of req + * for this purpose. This is okay because the caller should use this + * req as opaque data. - Jinshan + */ + LASSERT(atomic_read(&req->rq_refcount) > 0); + if (atomic_inc_return(&req->rq_refcount) == 2) + ptlrpcd_add_work_req(req); + return 0; +} +EXPORT_SYMBOL(ptlrpcd_queue_work); diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/connection.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/connection.c new file mode 100644 index 000000000..7e27397ce --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/connection.c @@ -0,0 +1,241 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#define DEBUG_SUBSYSTEM S_RPC +#include "../include/obd_support.h" +#include "../include/obd_class.h" +#include "../include/lustre_net.h" + +#include "ptlrpc_internal.h" + +static struct cfs_hash *conn_hash; +static cfs_hash_ops_t conn_hash_ops; + +struct ptlrpc_connection * +ptlrpc_connection_get(lnet_process_id_t peer, lnet_nid_t self, + struct obd_uuid *uuid) +{ + struct ptlrpc_connection *conn, *conn2; + + conn = cfs_hash_lookup(conn_hash, &peer); + if (conn) + goto out; + + OBD_ALLOC_PTR(conn); + if (!conn) + return NULL; + + conn->c_peer = peer; + conn->c_self = self; + INIT_HLIST_NODE(&conn->c_hash); + atomic_set(&conn->c_refcount, 1); + if (uuid) + obd_str2uuid(&conn->c_remote_uuid, uuid->uuid); + + /* + * Add the newly created conn to the hash, on key collision we + * lost a racing addition and must destroy our newly allocated + * connection. The object which exists in the has will be + * returned and may be compared against out object. + */ + /* In the function below, .hs_keycmp resolves to + * conn_keycmp() */ + /* coverity[overrun-buffer-val] */ + conn2 = cfs_hash_findadd_unique(conn_hash, &peer, &conn->c_hash); + if (conn != conn2) { + OBD_FREE_PTR(conn); + conn = conn2; + } +out: + CDEBUG(D_INFO, "conn=%p refcount %d to %s\n", + conn, atomic_read(&conn->c_refcount), + libcfs_nid2str(conn->c_peer.nid)); + return conn; +} +EXPORT_SYMBOL(ptlrpc_connection_get); + +int ptlrpc_connection_put(struct ptlrpc_connection *conn) +{ + int rc = 0; + + if (!conn) + return rc; + + LASSERT(atomic_read(&conn->c_refcount) > 1); + + /* + * We do not remove connection from hashtable and + * do not free it even if last caller released ref, + * as we want to have it cached for the case it is + * needed again. + * + * Deallocating it and later creating new connection + * again would be wastful. This way we also avoid + * expensive locking to protect things from get/put + * race when found cached connection is freed by + * ptlrpc_connection_put(). + * + * It will be freed later in module unload time, + * when ptlrpc_connection_fini()->lh_exit->conn_exit() + * path is called. 
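+ *
+ * A sketch of the resulting refcount convention (only restating the
+ * effect of the code here, not adding behaviour):
+ *
+ *    conn = ptlrpc_connection_get(peer, self, uuid);
+ *    // c_refcount >= 2: one ref held by conn_hash, one by the caller
+ *    ...
+ *    rc = ptlrpc_connection_put(conn);
+ *    // drops only the caller's ref; rc == 1 exactly when the cache
+ *    // now holds the sole remaining reference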
+ */ + if (atomic_dec_return(&conn->c_refcount) == 1) + rc = 1; + + CDEBUG(D_INFO, "PUT conn=%p refcount %d to %s\n", + conn, atomic_read(&conn->c_refcount), + libcfs_nid2str(conn->c_peer.nid)); + + return rc; +} +EXPORT_SYMBOL(ptlrpc_connection_put); + +struct ptlrpc_connection * +ptlrpc_connection_addref(struct ptlrpc_connection *conn) +{ + atomic_inc(&conn->c_refcount); + CDEBUG(D_INFO, "conn=%p refcount %d to %s\n", + conn, atomic_read(&conn->c_refcount), + libcfs_nid2str(conn->c_peer.nid)); + + return conn; +} +EXPORT_SYMBOL(ptlrpc_connection_addref); + +int ptlrpc_connection_init(void) +{ + conn_hash = cfs_hash_create("CONN_HASH", + HASH_CONN_CUR_BITS, + HASH_CONN_MAX_BITS, + HASH_CONN_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &conn_hash_ops, CFS_HASH_DEFAULT); + if (!conn_hash) + return -ENOMEM; + + return 0; +} +EXPORT_SYMBOL(ptlrpc_connection_init); + +void ptlrpc_connection_fini(void) +{ + cfs_hash_putref(conn_hash); +} +EXPORT_SYMBOL(ptlrpc_connection_fini); + +/* + * Hash operations for net_peer<->connection + */ +static unsigned +conn_hashfn(struct cfs_hash *hs, const void *key, unsigned mask) +{ + return cfs_hash_djb2_hash(key, sizeof(lnet_process_id_t), mask); +} + +static int +conn_keycmp(const void *key, struct hlist_node *hnode) +{ + struct ptlrpc_connection *conn; + const lnet_process_id_t *conn_key; + + LASSERT(key != NULL); + conn_key = (lnet_process_id_t *)key; + conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash); + + return conn_key->nid == conn->c_peer.nid && + conn_key->pid == conn->c_peer.pid; +} + +static void * +conn_key(struct hlist_node *hnode) +{ + struct ptlrpc_connection *conn; + + conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash); + return &conn->c_peer; +} + +static void * +conn_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct ptlrpc_connection, c_hash); +} + +static void +conn_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct ptlrpc_connection *conn; + + conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash); + atomic_inc(&conn->c_refcount); +} + +static void +conn_put_locked(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct ptlrpc_connection *conn; + + conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash); + atomic_dec(&conn->c_refcount); +} + +static void +conn_exit(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct ptlrpc_connection *conn; + + conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash); + /* + * Nothing should be left. Connection user put it and + * connection also was deleted from table by this time + * so we should have 0 refs. + */ + LASSERTF(atomic_read(&conn->c_refcount) == 0, + "Busy connection with %d refs\n", + atomic_read(&conn->c_refcount)); + OBD_FREE_PTR(conn); +} + +static cfs_hash_ops_t conn_hash_ops = { + .hs_hash = conn_hashfn, + .hs_keycmp = conn_keycmp, + .hs_key = conn_key, + .hs_object = conn_object, + .hs_get = conn_get, + .hs_put_locked = conn_put_locked, + .hs_exit = conn_exit, +}; diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/errno.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/errno.c new file mode 100644 index 000000000..73f8374f1 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/errno.c @@ -0,0 +1,380 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.txt + * + * GPL HEADER END + */ +/* + * Copyright (C) 2011 FUJITSU LIMITED. All rights reserved. + * + * Copyright (c) 2013, Intel Corporation. + */ + +#include "../../include/linux/libcfs/libcfs.h" +#include "../include/lustre/lustre_errno.h" + +/* + * The two translation tables below must define a one-to-one mapping between + * host and network errnos. + * + * EWOULDBLOCK is equal to EAGAIN on all architectures except for parisc, which + * appears irrelevant. Thus, existing references to EWOULDBLOCK are fine. + * + * EDEADLOCK is equal to EDEADLK on x86 but not on sparc, at least. A sparc + * host has no context-free way to determine if a LUSTRE_EDEADLK represents an + * EDEADLK or an EDEADLOCK. Therefore, all existing references to EDEADLOCK + * that need to be transferred on wire have been replaced with EDEADLK. + */ +static int lustre_errno_hton_mapping[] = { + [EPERM] = LUSTRE_EPERM, + [ENOENT] = LUSTRE_ENOENT, + [ESRCH] = LUSTRE_ESRCH, + [EINTR] = LUSTRE_EINTR, + [EIO] = LUSTRE_EIO, + [ENXIO] = LUSTRE_ENXIO, + [E2BIG] = LUSTRE_E2BIG, + [ENOEXEC] = LUSTRE_ENOEXEC, + [EBADF] = LUSTRE_EBADF, + [ECHILD] = LUSTRE_ECHILD, + [EAGAIN] = LUSTRE_EAGAIN, + [ENOMEM] = LUSTRE_ENOMEM, + [EACCES] = LUSTRE_EACCES, + [EFAULT] = LUSTRE_EFAULT, + [ENOTBLK] = LUSTRE_ENOTBLK, + [EBUSY] = LUSTRE_EBUSY, + [EEXIST] = LUSTRE_EEXIST, + [EXDEV] = LUSTRE_EXDEV, + [ENODEV] = LUSTRE_ENODEV, + [ENOTDIR] = LUSTRE_ENOTDIR, + [EISDIR] = LUSTRE_EISDIR, + [EINVAL] = LUSTRE_EINVAL, + [ENFILE] = LUSTRE_ENFILE, + [EMFILE] = LUSTRE_EMFILE, + [ENOTTY] = LUSTRE_ENOTTY, + [ETXTBSY] = LUSTRE_ETXTBSY, + [EFBIG] = LUSTRE_EFBIG, + [ENOSPC] = LUSTRE_ENOSPC, + [ESPIPE] = LUSTRE_ESPIPE, + [EROFS] = LUSTRE_EROFS, + [EMLINK] = LUSTRE_EMLINK, + [EPIPE] = LUSTRE_EPIPE, + [EDOM] = LUSTRE_EDOM, + [ERANGE] = LUSTRE_ERANGE, + [EDEADLK] = LUSTRE_EDEADLK, + [ENAMETOOLONG] = LUSTRE_ENAMETOOLONG, + [ENOLCK] = LUSTRE_ENOLCK, + [ENOSYS] = LUSTRE_ENOSYS, + [ENOTEMPTY] = LUSTRE_ENOTEMPTY, + [ELOOP] = LUSTRE_ELOOP, + [ENOMSG] = LUSTRE_ENOMSG, + [EIDRM] = LUSTRE_EIDRM, + [ECHRNG] = LUSTRE_ECHRNG, + [EL2NSYNC] = LUSTRE_EL2NSYNC, + [EL3HLT] = LUSTRE_EL3HLT, + [EL3RST] = LUSTRE_EL3RST, + [ELNRNG] = LUSTRE_ELNRNG, + [EUNATCH] = LUSTRE_EUNATCH, + [ENOCSI] = LUSTRE_ENOCSI, + [EL2HLT] = LUSTRE_EL2HLT, + [EBADE] = LUSTRE_EBADE, + [EBADR] = LUSTRE_EBADR, + [EXFULL] = LUSTRE_EXFULL, + [ENOANO] = LUSTRE_ENOANO, + [EBADRQC] = LUSTRE_EBADRQC, + [EBADSLT] = LUSTRE_EBADSLT, + [EBFONT] = LUSTRE_EBFONT, + [ENOSTR] = LUSTRE_ENOSTR, + [ENODATA] = LUSTRE_ENODATA, + [ETIME] = LUSTRE_ETIME, + [ENOSR] = LUSTRE_ENOSR, + [ENONET] = LUSTRE_ENONET, + [ENOPKG] = LUSTRE_ENOPKG, + [EREMOTE] = LUSTRE_EREMOTE, + [ENOLINK] = LUSTRE_ENOLINK, + [EADV] = LUSTRE_EADV, + [ESRMNT] = LUSTRE_ESRMNT, + [ECOMM] = LUSTRE_ECOMM, + [EPROTO] = LUSTRE_EPROTO, + [EMULTIHOP] = LUSTRE_EMULTIHOP, + [EDOTDOT] = 
LUSTRE_EDOTDOT, + [EBADMSG] = LUSTRE_EBADMSG, + [EOVERFLOW] = LUSTRE_EOVERFLOW, + [ENOTUNIQ] = LUSTRE_ENOTUNIQ, + [EBADFD] = LUSTRE_EBADFD, + [EREMCHG] = LUSTRE_EREMCHG, + [ELIBACC] = LUSTRE_ELIBACC, + [ELIBBAD] = LUSTRE_ELIBBAD, + [ELIBSCN] = LUSTRE_ELIBSCN, + [ELIBMAX] = LUSTRE_ELIBMAX, + [ELIBEXEC] = LUSTRE_ELIBEXEC, + [EILSEQ] = LUSTRE_EILSEQ, + [ERESTART] = LUSTRE_ERESTART, + [ESTRPIPE] = LUSTRE_ESTRPIPE, + [EUSERS] = LUSTRE_EUSERS, + [ENOTSOCK] = LUSTRE_ENOTSOCK, + [EDESTADDRREQ] = LUSTRE_EDESTADDRREQ, + [EMSGSIZE] = LUSTRE_EMSGSIZE, + [EPROTOTYPE] = LUSTRE_EPROTOTYPE, + [ENOPROTOOPT] = LUSTRE_ENOPROTOOPT, + [EPROTONOSUPPORT] = LUSTRE_EPROTONOSUPPORT, + [ESOCKTNOSUPPORT] = LUSTRE_ESOCKTNOSUPPORT, + [EOPNOTSUPP] = LUSTRE_EOPNOTSUPP, + [EPFNOSUPPORT] = LUSTRE_EPFNOSUPPORT, + [EAFNOSUPPORT] = LUSTRE_EAFNOSUPPORT, + [EADDRINUSE] = LUSTRE_EADDRINUSE, + [EADDRNOTAVAIL] = LUSTRE_EADDRNOTAVAIL, + [ENETDOWN] = LUSTRE_ENETDOWN, + [ENETUNREACH] = LUSTRE_ENETUNREACH, + [ENETRESET] = LUSTRE_ENETRESET, + [ECONNABORTED] = LUSTRE_ECONNABORTED, + [ECONNRESET] = LUSTRE_ECONNRESET, + [ENOBUFS] = LUSTRE_ENOBUFS, + [EISCONN] = LUSTRE_EISCONN, + [ENOTCONN] = LUSTRE_ENOTCONN, + [ESHUTDOWN] = LUSTRE_ESHUTDOWN, + [ETOOMANYREFS] = LUSTRE_ETOOMANYREFS, + [ETIMEDOUT] = LUSTRE_ETIMEDOUT, + [ECONNREFUSED] = LUSTRE_ECONNREFUSED, + [EHOSTDOWN] = LUSTRE_EHOSTDOWN, + [EHOSTUNREACH] = LUSTRE_EHOSTUNREACH, + [EALREADY] = LUSTRE_EALREADY, + [EINPROGRESS] = LUSTRE_EINPROGRESS, + [ESTALE] = LUSTRE_ESTALE, + [EUCLEAN] = LUSTRE_EUCLEAN, + [ENOTNAM] = LUSTRE_ENOTNAM, + [ENAVAIL] = LUSTRE_ENAVAIL, + [EISNAM] = LUSTRE_EISNAM, + [EREMOTEIO] = LUSTRE_EREMOTEIO, + [EDQUOT] = LUSTRE_EDQUOT, + [ENOMEDIUM] = LUSTRE_ENOMEDIUM, + [EMEDIUMTYPE] = LUSTRE_EMEDIUMTYPE, + [ECANCELED] = LUSTRE_ECANCELED, + [ENOKEY] = LUSTRE_ENOKEY, + [EKEYEXPIRED] = LUSTRE_EKEYEXPIRED, + [EKEYREVOKED] = LUSTRE_EKEYREVOKED, + [EKEYREJECTED] = LUSTRE_EKEYREJECTED, + [EOWNERDEAD] = LUSTRE_EOWNERDEAD, + [ENOTRECOVERABLE] = LUSTRE_ENOTRECOVERABLE, + [ERESTARTSYS] = LUSTRE_ERESTARTSYS, + [ERESTARTNOINTR] = LUSTRE_ERESTARTNOINTR, + [ERESTARTNOHAND] = LUSTRE_ERESTARTNOHAND, + [ENOIOCTLCMD] = LUSTRE_ENOIOCTLCMD, + [ERESTART_RESTARTBLOCK] = LUSTRE_ERESTART_RESTARTBLOCK, + [EBADHANDLE] = LUSTRE_EBADHANDLE, + [ENOTSYNC] = LUSTRE_ENOTSYNC, + [EBADCOOKIE] = LUSTRE_EBADCOOKIE, + [ENOTSUPP] = LUSTRE_ENOTSUPP, + [ETOOSMALL] = LUSTRE_ETOOSMALL, + [ESERVERFAULT] = LUSTRE_ESERVERFAULT, + [EBADTYPE] = LUSTRE_EBADTYPE, + [EJUKEBOX] = LUSTRE_EJUKEBOX, + [EIOCBQUEUED] = LUSTRE_EIOCBQUEUED, +}; + +static int lustre_errno_ntoh_mapping[] = { + [LUSTRE_EPERM] = EPERM, + [LUSTRE_ENOENT] = ENOENT, + [LUSTRE_ESRCH] = ESRCH, + [LUSTRE_EINTR] = EINTR, + [LUSTRE_EIO] = EIO, + [LUSTRE_ENXIO] = ENXIO, + [LUSTRE_E2BIG] = E2BIG, + [LUSTRE_ENOEXEC] = ENOEXEC, + [LUSTRE_EBADF] = EBADF, + [LUSTRE_ECHILD] = ECHILD, + [LUSTRE_EAGAIN] = EAGAIN, + [LUSTRE_ENOMEM] = ENOMEM, + [LUSTRE_EACCES] = EACCES, + [LUSTRE_EFAULT] = EFAULT, + [LUSTRE_ENOTBLK] = ENOTBLK, + [LUSTRE_EBUSY] = EBUSY, + [LUSTRE_EEXIST] = EEXIST, + [LUSTRE_EXDEV] = EXDEV, + [LUSTRE_ENODEV] = ENODEV, + [LUSTRE_ENOTDIR] = ENOTDIR, + [LUSTRE_EISDIR] = EISDIR, + [LUSTRE_EINVAL] = EINVAL, + [LUSTRE_ENFILE] = ENFILE, + [LUSTRE_EMFILE] = EMFILE, + [LUSTRE_ENOTTY] = ENOTTY, + [LUSTRE_ETXTBSY] = ETXTBSY, + [LUSTRE_EFBIG] = EFBIG, + [LUSTRE_ENOSPC] = ENOSPC, + [LUSTRE_ESPIPE] = ESPIPE, + [LUSTRE_EROFS] = EROFS, + [LUSTRE_EMLINK] = EMLINK, + [LUSTRE_EPIPE] = EPIPE, + [LUSTRE_EDOM] = EDOM, + [LUSTRE_ERANGE] = ERANGE, + [LUSTRE_EDEADLK] = EDEADLK, + 
[LUSTRE_ENAMETOOLONG] = ENAMETOOLONG, + [LUSTRE_ENOLCK] = ENOLCK, + [LUSTRE_ENOSYS] = ENOSYS, + [LUSTRE_ENOTEMPTY] = ENOTEMPTY, + [LUSTRE_ELOOP] = ELOOP, + [LUSTRE_ENOMSG] = ENOMSG, + [LUSTRE_EIDRM] = EIDRM, + [LUSTRE_ECHRNG] = ECHRNG, + [LUSTRE_EL2NSYNC] = EL2NSYNC, + [LUSTRE_EL3HLT] = EL3HLT, + [LUSTRE_EL3RST] = EL3RST, + [LUSTRE_ELNRNG] = ELNRNG, + [LUSTRE_EUNATCH] = EUNATCH, + [LUSTRE_ENOCSI] = ENOCSI, + [LUSTRE_EL2HLT] = EL2HLT, + [LUSTRE_EBADE] = EBADE, + [LUSTRE_EBADR] = EBADR, + [LUSTRE_EXFULL] = EXFULL, + [LUSTRE_ENOANO] = ENOANO, + [LUSTRE_EBADRQC] = EBADRQC, + [LUSTRE_EBADSLT] = EBADSLT, + [LUSTRE_EBFONT] = EBFONT, + [LUSTRE_ENOSTR] = ENOSTR, + [LUSTRE_ENODATA] = ENODATA, + [LUSTRE_ETIME] = ETIME, + [LUSTRE_ENOSR] = ENOSR, + [LUSTRE_ENONET] = ENONET, + [LUSTRE_ENOPKG] = ENOPKG, + [LUSTRE_EREMOTE] = EREMOTE, + [LUSTRE_ENOLINK] = ENOLINK, + [LUSTRE_EADV] = EADV, + [LUSTRE_ESRMNT] = ESRMNT, + [LUSTRE_ECOMM] = ECOMM, + [LUSTRE_EPROTO] = EPROTO, + [LUSTRE_EMULTIHOP] = EMULTIHOP, + [LUSTRE_EDOTDOT] = EDOTDOT, + [LUSTRE_EBADMSG] = EBADMSG, + [LUSTRE_EOVERFLOW] = EOVERFLOW, + [LUSTRE_ENOTUNIQ] = ENOTUNIQ, + [LUSTRE_EBADFD] = EBADFD, + [LUSTRE_EREMCHG] = EREMCHG, + [LUSTRE_ELIBACC] = ELIBACC, + [LUSTRE_ELIBBAD] = ELIBBAD, + [LUSTRE_ELIBSCN] = ELIBSCN, + [LUSTRE_ELIBMAX] = ELIBMAX, + [LUSTRE_ELIBEXEC] = ELIBEXEC, + [LUSTRE_EILSEQ] = EILSEQ, + [LUSTRE_ERESTART] = ERESTART, + [LUSTRE_ESTRPIPE] = ESTRPIPE, + [LUSTRE_EUSERS] = EUSERS, + [LUSTRE_ENOTSOCK] = ENOTSOCK, + [LUSTRE_EDESTADDRREQ] = EDESTADDRREQ, + [LUSTRE_EMSGSIZE] = EMSGSIZE, + [LUSTRE_EPROTOTYPE] = EPROTOTYPE, + [LUSTRE_ENOPROTOOPT] = ENOPROTOOPT, + [LUSTRE_EPROTONOSUPPORT] = EPROTONOSUPPORT, + [LUSTRE_ESOCKTNOSUPPORT] = ESOCKTNOSUPPORT, + [LUSTRE_EOPNOTSUPP] = EOPNOTSUPP, + [LUSTRE_EPFNOSUPPORT] = EPFNOSUPPORT, + [LUSTRE_EAFNOSUPPORT] = EAFNOSUPPORT, + [LUSTRE_EADDRINUSE] = EADDRINUSE, + [LUSTRE_EADDRNOTAVAIL] = EADDRNOTAVAIL, + [LUSTRE_ENETDOWN] = ENETDOWN, + [LUSTRE_ENETUNREACH] = ENETUNREACH, + [LUSTRE_ENETRESET] = ENETRESET, + [LUSTRE_ECONNABORTED] = ECONNABORTED, + [LUSTRE_ECONNRESET] = ECONNRESET, + [LUSTRE_ENOBUFS] = ENOBUFS, + [LUSTRE_EISCONN] = EISCONN, + [LUSTRE_ENOTCONN] = ENOTCONN, + [LUSTRE_ESHUTDOWN] = ESHUTDOWN, + [LUSTRE_ETOOMANYREFS] = ETOOMANYREFS, + [LUSTRE_ETIMEDOUT] = ETIMEDOUT, + [LUSTRE_ECONNREFUSED] = ECONNREFUSED, + [LUSTRE_EHOSTDOWN] = EHOSTDOWN, + [LUSTRE_EHOSTUNREACH] = EHOSTUNREACH, + [LUSTRE_EALREADY] = EALREADY, + [LUSTRE_EINPROGRESS] = EINPROGRESS, + [LUSTRE_ESTALE] = ESTALE, + [LUSTRE_EUCLEAN] = EUCLEAN, + [LUSTRE_ENOTNAM] = ENOTNAM, + [LUSTRE_ENAVAIL] = ENAVAIL, + [LUSTRE_EISNAM] = EISNAM, + [LUSTRE_EREMOTEIO] = EREMOTEIO, + [LUSTRE_EDQUOT] = EDQUOT, + [LUSTRE_ENOMEDIUM] = ENOMEDIUM, + [LUSTRE_EMEDIUMTYPE] = EMEDIUMTYPE, + [LUSTRE_ECANCELED] = ECANCELED, + [LUSTRE_ENOKEY] = ENOKEY, + [LUSTRE_EKEYEXPIRED] = EKEYEXPIRED, + [LUSTRE_EKEYREVOKED] = EKEYREVOKED, + [LUSTRE_EKEYREJECTED] = EKEYREJECTED, + [LUSTRE_EOWNERDEAD] = EOWNERDEAD, + [LUSTRE_ENOTRECOVERABLE] = ENOTRECOVERABLE, + [LUSTRE_ERESTARTSYS] = ERESTARTSYS, + [LUSTRE_ERESTARTNOINTR] = ERESTARTNOINTR, + [LUSTRE_ERESTARTNOHAND] = ERESTARTNOHAND, + [LUSTRE_ENOIOCTLCMD] = ENOIOCTLCMD, + [LUSTRE_ERESTART_RESTARTBLOCK] = ERESTART_RESTARTBLOCK, + [LUSTRE_EBADHANDLE] = EBADHANDLE, + [LUSTRE_ENOTSYNC] = ENOTSYNC, + [LUSTRE_EBADCOOKIE] = EBADCOOKIE, + [LUSTRE_ENOTSUPP] = ENOTSUPP, + [LUSTRE_ETOOSMALL] = ETOOSMALL, + [LUSTRE_ESERVERFAULT] = ESERVERFAULT, + [LUSTRE_EBADTYPE] = EBADTYPE, + [LUSTRE_EJUKEBOX] = EJUKEBOX, + [LUSTRE_EIOCBQUEUED] = 
EIOCBQUEUED, +}; + +unsigned int lustre_errno_hton(unsigned int h) +{ + unsigned int n; + + if (h == 0) { + n = 0; + } else if (h < ARRAY_SIZE(lustre_errno_hton_mapping)) { + n = lustre_errno_hton_mapping[h]; + if (n == 0) + goto generic; + } else { +generic: + /* + * A generic errno is better than the unknown one that could + * mean anything to a different host. + */ + n = LUSTRE_EIO; + } + + return n; +} +EXPORT_SYMBOL(lustre_errno_hton); + +unsigned int lustre_errno_ntoh(unsigned int n) +{ + unsigned int h; + + if (n == 0) { + h = 0; + } else if (n < ARRAY_SIZE(lustre_errno_ntoh_mapping)) { + h = lustre_errno_ntoh_mapping[n]; + if (h == 0) + goto generic; + } else { +generic: + /* + * Similar to the situation in lustre_errno_hton(), an unknown + * network errno could coincide with anything. Hence, it is + * better to return a generic errno. + */ + h = EIO; + } + + return h; +} +EXPORT_SYMBOL(lustre_errno_ntoh); diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/events.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/events.c new file mode 100644 index 000000000..7f8644e01 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/events.c @@ -0,0 +1,585 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include "../../include/linux/libcfs/libcfs.h" +# ifdef __mips64__ +# include +# endif + +#include "../include/obd_class.h" +#include "../include/lustre_net.h" +#include "../include/lustre_sec.h" +#include "ptlrpc_internal.h" + +lnet_handle_eq_t ptlrpc_eq_h; + +/* + * Client's outgoing request callback + */ +void request_out_callback(lnet_event_t *ev) +{ + struct ptlrpc_cb_id *cbid = ev->md.user_ptr; + struct ptlrpc_request *req = cbid->cbid_arg; + + LASSERT(ev->type == LNET_EVENT_SEND || + ev->type == LNET_EVENT_UNLINK); + LASSERT(ev->unlinked); + + DEBUG_REQ(D_NET, req, "type %d, status %d", ev->type, ev->status); + + sptlrpc_request_out_callback(req); + spin_lock(&req->rq_lock); + req->rq_real_sent = get_seconds(); + if (ev->unlinked) + req->rq_req_unlink = 0; + + if (ev->type == LNET_EVENT_UNLINK || ev->status != 0) { + + /* Failed send: make it seem like the reply timed out, just + * like failing sends in client.c does currently... 
*/ + + req->rq_net_err = 1; + ptlrpc_client_wake_req(req); + } + spin_unlock(&req->rq_lock); + + ptlrpc_req_finished(req); +} + +/* + * Client's incoming reply callback + */ +void reply_in_callback(lnet_event_t *ev) +{ + struct ptlrpc_cb_id *cbid = ev->md.user_ptr; + struct ptlrpc_request *req = cbid->cbid_arg; + + DEBUG_REQ(D_NET, req, "type %d, status %d", ev->type, ev->status); + + LASSERT(ev->type == LNET_EVENT_PUT || ev->type == LNET_EVENT_UNLINK); + LASSERT(ev->md.start == req->rq_repbuf); + LASSERT(ev->offset + ev->mlength <= req->rq_repbuf_len); + /* We've set LNET_MD_MANAGE_REMOTE for all outgoing requests + for adaptive timeouts' early reply. */ + LASSERT((ev->md.options & LNET_MD_MANAGE_REMOTE) != 0); + + spin_lock(&req->rq_lock); + + req->rq_receiving_reply = 0; + req->rq_early = 0; + if (ev->unlinked) + req->rq_reply_unlink = 0; + + if (ev->status) + goto out_wake; + + if (ev->type == LNET_EVENT_UNLINK) { + LASSERT(ev->unlinked); + DEBUG_REQ(D_NET, req, "unlink"); + goto out_wake; + } + + if (ev->mlength < ev->rlength) { + CDEBUG(D_RPCTRACE, "truncate req %p rpc %d - %d+%d\n", req, + req->rq_replen, ev->rlength, ev->offset); + req->rq_reply_truncate = 1; + req->rq_replied = 1; + req->rq_status = -EOVERFLOW; + req->rq_nob_received = ev->rlength + ev->offset; + goto out_wake; + } + + if ((ev->offset == 0) && + ((lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT))) { + /* Early reply */ + DEBUG_REQ(D_ADAPTTO, req, + "Early reply received: mlen=%u offset=%d replen=%d replied=%d unlinked=%d", + ev->mlength, ev->offset, + req->rq_replen, req->rq_replied, ev->unlinked); + + req->rq_early_count++; /* number received, client side */ + + if (req->rq_replied) /* already got the real reply */ + goto out_wake; + + req->rq_early = 1; + req->rq_reply_off = ev->offset; + req->rq_nob_received = ev->mlength; + /* And we're still receiving */ + req->rq_receiving_reply = 1; + } else { + /* Real reply */ + req->rq_rep_swab_mask = 0; + req->rq_replied = 1; + /* Got reply, no resend required */ + req->rq_resend = 0; + req->rq_reply_off = ev->offset; + req->rq_nob_received = ev->mlength; + /* LNetMDUnlink can't be called under the LNET_LOCK, + so we must unlink in ptlrpc_unregister_reply */ + DEBUG_REQ(D_INFO, req, + "reply in flags=%x mlen=%u offset=%d replen=%d", + lustre_msg_get_flags(req->rq_reqmsg), + ev->mlength, ev->offset, req->rq_replen); + } + + req->rq_import->imp_last_reply_time = get_seconds(); + +out_wake: + /* NB don't unlock till after wakeup; req can disappear under us + * since we don't have our own ref */ + ptlrpc_client_wake_req(req); + spin_unlock(&req->rq_lock); +} + +/* + * Client's bulk has been written/read + */ +void client_bulk_callback(lnet_event_t *ev) +{ + struct ptlrpc_cb_id *cbid = ev->md.user_ptr; + struct ptlrpc_bulk_desc *desc = cbid->cbid_arg; + struct ptlrpc_request *req; + + LASSERT((desc->bd_type == BULK_PUT_SINK && + ev->type == LNET_EVENT_PUT) || + (desc->bd_type == BULK_GET_SOURCE && + ev->type == LNET_EVENT_GET) || + ev->type == LNET_EVENT_UNLINK); + LASSERT(ev->unlinked); + + if (CFS_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_CLIENT_BULK_CB, CFS_FAIL_ONCE)) + ev->status = -EIO; + + if (CFS_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2, + CFS_FAIL_ONCE)) + ev->status = -EIO; + + CDEBUG((ev->status == 0) ? 
D_NET : D_ERROR, + "event type %d, status %d, desc %p\n", + ev->type, ev->status, desc); + + spin_lock(&desc->bd_lock); + req = desc->bd_req; + LASSERT(desc->bd_md_count > 0); + desc->bd_md_count--; + + if (ev->type != LNET_EVENT_UNLINK && ev->status == 0) { + desc->bd_nob_transferred += ev->mlength; + desc->bd_sender = ev->sender; + } else { + /* start reconnect and resend if network error hit */ + spin_lock(&req->rq_lock); + req->rq_net_err = 1; + spin_unlock(&req->rq_lock); + } + + if (ev->status != 0) + desc->bd_failure = 1; + + /* NB don't unlock till after wakeup; desc can disappear under us + * otherwise */ + if (desc->bd_md_count == 0) + ptlrpc_client_wake_req(desc->bd_req); + + spin_unlock(&desc->bd_lock); +} + +/* + * We will have percpt request history list for ptlrpc service in upcoming + * patches because we don't want to be serialized by current per-service + * history operations. So we require history ID can (somehow) show arriving + * order w/o grabbing global lock, and user can sort them in userspace. + * + * This is how we generate history ID for ptlrpc_request: + * ---------------------------------------------------- + * | 32 bits | 16 bits | (16 - X)bits | X bits | + * ---------------------------------------------------- + * | seconds | usec / 16 | sequence | CPT id | + * ---------------------------------------------------- + * + * it might not be precise but should be good enough. + */ + +#define REQS_CPT_BITS(svcpt) ((svcpt)->scp_service->srv_cpt_bits) + +#define REQS_SEC_SHIFT 32 +#define REQS_USEC_SHIFT 16 +#define REQS_SEQ_SHIFT(svcpt) REQS_CPT_BITS(svcpt) + +static void ptlrpc_req_add_history(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req) +{ + __u64 sec = req->rq_arrival_time.tv_sec; + __u32 usec = req->rq_arrival_time.tv_usec >> 4; /* usec / 16 */ + __u64 new_seq; + + /* set sequence ID for request and add it to history list, + * it must be called with hold svcpt::scp_lock */ + + new_seq = (sec << REQS_SEC_SHIFT) | + (usec << REQS_USEC_SHIFT) | + (svcpt->scp_cpt < 0 ? 0 : svcpt->scp_cpt); + + if (new_seq > svcpt->scp_hist_seq) { + /* This handles the initial case of scp_hist_seq == 0 or + * we just jumped into a new time window */ + svcpt->scp_hist_seq = new_seq; + } else { + LASSERT(REQS_SEQ_SHIFT(svcpt) < REQS_USEC_SHIFT); + /* NB: increase sequence number in current usec bucket, + * however, it's possible that we used up all bits for + * sequence and jumped into the next usec bucket (future time), + * then we hope there will be less RPCs per bucket at some + * point, and sequence will catch up again */ + svcpt->scp_hist_seq += (1U << REQS_SEQ_SHIFT(svcpt)); + new_seq = svcpt->scp_hist_seq; + } + + req->rq_history_seq = new_seq; + + list_add_tail(&req->rq_history_list, &svcpt->scp_hist_reqs); +} + +/* + * Server's incoming request callback + */ +void request_in_callback(lnet_event_t *ev) +{ + struct ptlrpc_cb_id *cbid = ev->md.user_ptr; + struct ptlrpc_request_buffer_desc *rqbd = cbid->cbid_arg; + struct ptlrpc_service_part *svcpt = rqbd->rqbd_svcpt; + struct ptlrpc_service *service = svcpt->scp_service; + struct ptlrpc_request *req; + + LASSERT(ev->type == LNET_EVENT_PUT || + ev->type == LNET_EVENT_UNLINK); + LASSERT((char *)ev->md.start >= rqbd->rqbd_buffer); + LASSERT((char *)ev->md.start + ev->offset + ev->mlength <= + rqbd->rqbd_buffer + service->srv_buf_size); + + CDEBUG((ev->status == 0) ? 
D_NET : D_ERROR, + "event type %d, status %d, service %s\n", + ev->type, ev->status, service->srv_name); + + if (ev->unlinked) { + /* If this is the last request message to fit in the + * request buffer we can use the request object embedded in + * rqbd. Note that if we failed to allocate a request, + * we'd have to re-post the rqbd, which we can't do in this + * context. */ + req = &rqbd->rqbd_req; + memset(req, 0, sizeof(*req)); + } else { + LASSERT(ev->type == LNET_EVENT_PUT); + if (ev->status != 0) { + /* We moaned above already... */ + return; + } + req = ptlrpc_request_cache_alloc(GFP_ATOMIC); + if (req == NULL) { + CERROR("Can't allocate incoming request descriptor: Dropping %s RPC from %s\n", + service->srv_name, + libcfs_id2str(ev->initiator)); + return; + } + } + + /* NB we ABSOLUTELY RELY on req being zeroed, so pointers are NULL, + * flags are reset and scalars are zero. We only set the message + * size to non-zero if this was a successful receive. */ + req->rq_xid = ev->match_bits; + req->rq_reqbuf = ev->md.start + ev->offset; + if (ev->type == LNET_EVENT_PUT && ev->status == 0) + req->rq_reqdata_len = ev->mlength; + do_gettimeofday(&req->rq_arrival_time); + req->rq_peer = ev->initiator; + req->rq_self = ev->target.nid; + req->rq_rqbd = rqbd; + req->rq_phase = RQ_PHASE_NEW; + spin_lock_init(&req->rq_lock); + INIT_LIST_HEAD(&req->rq_timed_list); + INIT_LIST_HEAD(&req->rq_exp_list); + atomic_set(&req->rq_refcount, 1); + if (ev->type == LNET_EVENT_PUT) + CDEBUG(D_INFO, "incoming req@%p x%llu msgsize %u\n", + req, req->rq_xid, ev->mlength); + + CDEBUG(D_RPCTRACE, "peer: %s\n", libcfs_id2str(req->rq_peer)); + + spin_lock(&svcpt->scp_lock); + + ptlrpc_req_add_history(svcpt, req); + + if (ev->unlinked) { + svcpt->scp_nrqbds_posted--; + CDEBUG(D_INFO, "Buffer complete: %d buffers still posted\n", + svcpt->scp_nrqbds_posted); + + /* Normally, don't complain about 0 buffers posted; LNET won't + * drop incoming reqs since we set the portal lazy */ + if (test_req_buffer_pressure && + ev->type != LNET_EVENT_UNLINK && + svcpt->scp_nrqbds_posted == 0) + CWARN("All %s request buffers busy\n", + service->srv_name); + + /* req takes over the network's ref on rqbd */ + } else { + /* req takes a ref on rqbd */ + rqbd->rqbd_refcount++; + } + + list_add_tail(&req->rq_list, &svcpt->scp_req_incoming); + svcpt->scp_nreqs_incoming++; + + /* NB everything can disappear under us once the request + * has been queued and we unlock, so do the wake now... */ + wake_up(&svcpt->scp_waitq); + + spin_unlock(&svcpt->scp_lock); +} + +/* + * Server's outgoing reply callback + */ +void reply_out_callback(lnet_event_t *ev) +{ + struct ptlrpc_cb_id *cbid = ev->md.user_ptr; + struct ptlrpc_reply_state *rs = cbid->cbid_arg; + struct ptlrpc_service_part *svcpt = rs->rs_svcpt; + + LASSERT(ev->type == LNET_EVENT_SEND || + ev->type == LNET_EVENT_ACK || + ev->type == LNET_EVENT_UNLINK); + + if (!rs->rs_difficult) { + /* 'Easy' replies have no further processing so I drop the + * net's ref on 'rs' */ + LASSERT(ev->unlinked); + ptlrpc_rs_decref(rs); + return; + } + + LASSERT(rs->rs_on_net); + + if (ev->unlinked) { + /* Last network callback. 
The net's ref on 'rs' stays put + * until ptlrpc_handle_rs() is done with it */ + spin_lock(&svcpt->scp_rep_lock); + spin_lock(&rs->rs_lock); + + rs->rs_on_net = 0; + if (!rs->rs_no_ack || + rs->rs_transno <= + rs->rs_export->exp_obd->obd_last_committed) + ptlrpc_schedule_difficult_reply(rs); + + spin_unlock(&rs->rs_lock); + spin_unlock(&svcpt->scp_rep_lock); + } +} + + +static void ptlrpc_master_callback(lnet_event_t *ev) +{ + struct ptlrpc_cb_id *cbid = ev->md.user_ptr; + void (*callback)(lnet_event_t *ev) = cbid->cbid_fn; + + /* Honestly, it's best to find out early. */ + LASSERT(cbid->cbid_arg != LP_POISON); + LASSERT(callback == request_out_callback || + callback == reply_in_callback || + callback == client_bulk_callback || + callback == request_in_callback || + callback == reply_out_callback); + + callback(ev); +} + +int ptlrpc_uuid_to_peer(struct obd_uuid *uuid, + lnet_process_id_t *peer, lnet_nid_t *self) +{ + int best_dist = 0; + __u32 best_order = 0; + int count = 0; + int rc = -ENOENT; + int portals_compatibility; + int dist; + __u32 order; + lnet_nid_t dst_nid; + lnet_nid_t src_nid; + + portals_compatibility = LNetCtl(IOC_LIBCFS_PORTALS_COMPATIBILITY, NULL); + + peer->pid = LUSTRE_SRV_LNET_PID; + + /* Choose the matching UUID that's closest */ + while (lustre_uuid_to_peer(uuid->uuid, &dst_nid, count++) == 0) { + dist = LNetDist(dst_nid, &src_nid, &order); + if (dist < 0) + continue; + + if (dist == 0) { /* local! use loopback LND */ + peer->nid = *self = LNET_MKNID(LNET_MKNET(LOLND, 0), 0); + rc = 0; + break; + } + + if (rc < 0 || + dist < best_dist || + (dist == best_dist && order < best_order)) { + best_dist = dist; + best_order = order; + + if (portals_compatibility > 1) { + /* Strong portals compatibility: Zero the nid's + * NET, so if I'm reading new config logs, or + * getting configured by (new) lconf I can + * still talk to old servers. */ + dst_nid = LNET_MKNID(0, LNET_NIDADDR(dst_nid)); + src_nid = LNET_MKNID(0, LNET_NIDADDR(src_nid)); + } + peer->nid = dst_nid; + *self = src_nid; + rc = 0; + } + } + + CDEBUG(D_NET, "%s->%s\n", uuid->uuid, libcfs_id2str(*peer)); + return rc; +} + +void ptlrpc_ni_fini(void) +{ + wait_queue_head_t waitq; + struct l_wait_info lwi; + int rc; + int retries; + + /* Wait for the event queue to become idle since there may still be + * messages in flight with pending events (i.e. the fire-and-forget + * messages == client requests and "non-difficult" server + * replies */ + + for (retries = 0;; retries++) { + rc = LNetEQFree(ptlrpc_eq_h); + switch (rc) { + default: + LBUG(); + + case 0: + LNetNIFini(); + return; + + case -EBUSY: + if (retries != 0) + CWARN("Event queue still busy\n"); + + /* Wait for a bit */ + init_waitqueue_head(&waitq); + lwi = LWI_TIMEOUT(cfs_time_seconds(2), NULL, NULL); + l_wait_event(waitq, 0, &lwi); + break; + } + } + /* notreached */ +} + +lnet_pid_t ptl_get_pid(void) +{ + lnet_pid_t pid; + + pid = LUSTRE_SRV_LNET_PID; + return pid; +} + +int ptlrpc_ni_init(void) +{ + int rc; + lnet_pid_t pid; + + pid = ptl_get_pid(); + CDEBUG(D_NET, "My pid is: %x\n", pid); + + /* We're not passing any limits yet... */ + rc = LNetNIInit(pid); + if (rc < 0) { + CDEBUG(D_NET, "Can't init network interface: %d\n", rc); + return -ENOENT; + } + + /* CAVEAT EMPTOR: how we process portals events is _radically_ + * different depending on... 
*/ + /* kernel LNet calls our master callback when there are new event, + * because we are guaranteed to get every event via callback, + * so we just set EQ size to 0 to avoid overhead of serializing + * enqueue/dequeue operations in LNet. */ + rc = LNetEQAlloc(0, ptlrpc_master_callback, &ptlrpc_eq_h); + if (rc == 0) + return 0; + + CERROR("Failed to allocate event queue: %d\n", rc); + LNetNIFini(); + + return -ENOMEM; +} + + +int ptlrpc_init_portals(void) +{ + int rc = ptlrpc_ni_init(); + + if (rc != 0) { + CERROR("network initialisation failed\n"); + return -EIO; + } + rc = ptlrpcd_addref(); + if (rc == 0) + return 0; + + CERROR("rpcd initialisation failed\n"); + ptlrpc_ni_fini(); + return rc; +} + +void ptlrpc_exit_portals(void) +{ + ptlrpcd_decref(); + ptlrpc_ni_fini(); +} diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/import.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/import.c new file mode 100644 index 000000000..d5fc689c0 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/import.c @@ -0,0 +1,1642 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/import.c + * + * Author: Mike Shaver + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include "../include/obd_support.h" +#include "../include/lustre_ha.h" +#include "../include/lustre_net.h" +#include "../include/lustre_import.h" +#include "../include/lustre_export.h" +#include "../include/obd.h" +#include "../include/obd_cksum.h" +#include "../include/obd_class.h" + +#include "ptlrpc_internal.h" + +struct ptlrpc_connect_async_args { + __u64 pcaa_peer_committed; + int pcaa_initial_connect; +}; + +/** + * Updates import \a imp current state to provided \a state value + * Helper function. Must be called under imp_lock. 
+ */ +static void __import_set_state(struct obd_import *imp, + enum lustre_imp_state state) +{ + switch (state) { + case LUSTRE_IMP_CLOSED: + case LUSTRE_IMP_NEW: + case LUSTRE_IMP_DISCON: + case LUSTRE_IMP_CONNECTING: + break; + case LUSTRE_IMP_REPLAY_WAIT: + imp->imp_replay_state = LUSTRE_IMP_REPLAY_LOCKS; + break; + default: + imp->imp_replay_state = LUSTRE_IMP_REPLAY; + } + + imp->imp_state = state; + imp->imp_state_hist[imp->imp_state_hist_idx].ish_state = state; + imp->imp_state_hist[imp->imp_state_hist_idx].ish_time = + get_seconds(); + imp->imp_state_hist_idx = (imp->imp_state_hist_idx + 1) % + IMP_STATE_HIST_LEN; +} + +/* A CLOSED import should remain so. */ +#define IMPORT_SET_STATE_NOLOCK(imp, state) \ +do { \ + if (imp->imp_state != LUSTRE_IMP_CLOSED) { \ + CDEBUG(D_HA, "%p %s: changing import state from %s to %s\n", \ + imp, obd2cli_tgt(imp->imp_obd), \ + ptlrpc_import_state_name(imp->imp_state), \ + ptlrpc_import_state_name(state)); \ + __import_set_state(imp, state); \ + } \ +} while (0) + +#define IMPORT_SET_STATE(imp, state) \ +do { \ + spin_lock(&imp->imp_lock); \ + IMPORT_SET_STATE_NOLOCK(imp, state); \ + spin_unlock(&imp->imp_lock); \ +} while (0) + + +static int ptlrpc_connect_interpret(const struct lu_env *env, + struct ptlrpc_request *request, + void *data, int rc); +int ptlrpc_import_recovery_state_machine(struct obd_import *imp); + +/* Only this function is allowed to change the import state when it is + * CLOSED. I would rather refcount the import and free it after + * disconnection like we do with exports. To do that, the client_obd + * will need to save the peer info somewhere other than in the import, + * though. */ +int ptlrpc_init_import(struct obd_import *imp) +{ + spin_lock(&imp->imp_lock); + + imp->imp_generation++; + imp->imp_state = LUSTRE_IMP_NEW; + + spin_unlock(&imp->imp_lock); + + return 0; +} +EXPORT_SYMBOL(ptlrpc_init_import); + +#define UUID_STR "_UUID" +void deuuidify(char *uuid, const char *prefix, char **uuid_start, int *uuid_len) +{ + *uuid_start = !prefix || strncmp(uuid, prefix, strlen(prefix)) + ? uuid : uuid + strlen(prefix); + + *uuid_len = strlen(*uuid_start); + + if (*uuid_len < strlen(UUID_STR)) + return; + + if (!strncmp(*uuid_start + *uuid_len - strlen(UUID_STR), + UUID_STR, strlen(UUID_STR))) + *uuid_len -= strlen(UUID_STR); +} +EXPORT_SYMBOL(deuuidify); + +/** + * Returns true if import was FULL, false if import was already not + * connected. + * @imp - import to be disconnected + * @conn_cnt - connection count (epoch) of the request that timed out + * and caused the disconnection. In some cases, multiple + * inflight requests can fail to a single target (e.g. OST + * bulk requests) and if one has already caused a reconnection + * (increasing the import->conn_cnt) the older failure should + * not also cause a reconnection. If zero it forces a reconnect. 
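+ *
+ * For example: if two bulk RPCs sent while imp_conn_cnt was 5 both time
+ * out, the first ptlrpc_set_import_discon(imp, 5) call finds the import
+ * FULL with a matching conn_cnt, moves it to LUSTRE_IMP_DISCON and
+ * returns 1; by the time the second failure is reported the import is
+ * no longer FULL (and a reconnect may already have increased
+ * imp_conn_cnt), so that call only logs the situation and returns 0.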
+ */ +int ptlrpc_set_import_discon(struct obd_import *imp, __u32 conn_cnt) +{ + int rc = 0; + + spin_lock(&imp->imp_lock); + + if (imp->imp_state == LUSTRE_IMP_FULL && + (conn_cnt == 0 || conn_cnt == imp->imp_conn_cnt)) { + char *target_start; + int target_len; + + deuuidify(obd2cli_tgt(imp->imp_obd), NULL, + &target_start, &target_len); + + if (imp->imp_replayable) { + LCONSOLE_WARN("%s: Connection to %.*s (at %s) was lost; in progress operations using this service will wait for recovery to complete\n", + imp->imp_obd->obd_name, target_len, target_start, + libcfs_nid2str(imp->imp_connection->c_peer.nid)); + } else { + LCONSOLE_ERROR_MSG(0x166, "%s: Connection to %.*s (at %s) was lost; in progress operations using this service will fail\n", + imp->imp_obd->obd_name, + target_len, target_start, + libcfs_nid2str(imp->imp_connection->c_peer.nid)); + } + IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON); + spin_unlock(&imp->imp_lock); + + if (obd_dump_on_timeout) + libcfs_debug_dumplog(); + + obd_import_event(imp->imp_obd, imp, IMP_EVENT_DISCON); + rc = 1; + } else { + spin_unlock(&imp->imp_lock); + CDEBUG(D_HA, "%s: import %p already %s (conn %u, was %u): %s\n", + imp->imp_client->cli_name, imp, + (imp->imp_state == LUSTRE_IMP_FULL && + imp->imp_conn_cnt > conn_cnt) ? + "reconnected" : "not connected", imp->imp_conn_cnt, + conn_cnt, ptlrpc_import_state_name(imp->imp_state)); + } + + return rc; +} + +/* Must be called with imp_lock held! */ +static void ptlrpc_deactivate_and_unlock_import(struct obd_import *imp) +{ + assert_spin_locked(&imp->imp_lock); + + CDEBUG(D_HA, "setting import %s INVALID\n", obd2cli_tgt(imp->imp_obd)); + imp->imp_invalid = 1; + imp->imp_generation++; + spin_unlock(&imp->imp_lock); + + ptlrpc_abort_inflight(imp); + obd_import_event(imp->imp_obd, imp, IMP_EVENT_INACTIVE); +} + +/* + * This acts as a barrier; all existing requests are rejected, and + * no new requests will be accepted until the import is valid again. + */ +void ptlrpc_deactivate_import(struct obd_import *imp) +{ + spin_lock(&imp->imp_lock); + ptlrpc_deactivate_and_unlock_import(imp); +} +EXPORT_SYMBOL(ptlrpc_deactivate_import); + +static unsigned int +ptlrpc_inflight_deadline(struct ptlrpc_request *req, time_t now) +{ + long dl; + + if (!(((req->rq_phase == RQ_PHASE_RPC) && !req->rq_waiting) || + (req->rq_phase == RQ_PHASE_BULK) || + (req->rq_phase == RQ_PHASE_NEW))) + return 0; + + if (req->rq_timedout) + return 0; + + if (req->rq_phase == RQ_PHASE_NEW) + dl = req->rq_sent; + else + dl = req->rq_deadline; + + if (dl <= now) + return 0; + + return dl - now; +} + +static unsigned int ptlrpc_inflight_timeout(struct obd_import *imp) +{ + time_t now = get_seconds(); + struct list_head *tmp, *n; + struct ptlrpc_request *req; + unsigned int timeout = 0; + + spin_lock(&imp->imp_lock); + list_for_each_safe(tmp, n, &imp->imp_sending_list) { + req = list_entry(tmp, struct ptlrpc_request, rq_list); + timeout = max(ptlrpc_inflight_deadline(req, now), timeout); + } + spin_unlock(&imp->imp_lock); + return timeout; +} + +/** + * This function will invalidate the import, if necessary, then block + * for all the RPC completions, and finally notify the obd to + * invalidate its state (ie cancel locks, clear pending requests, + * etc). 
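+ *
+ * As a rough worked example of the wait loop below: if the farthest
+ * deadline among in-flight RPCs is 30 seconds away, the first sleep is
+ * about 30 + 30/3 = 40 seconds (obd_timeout is used instead when the
+ * computed value is zero), and the loop repeats until imp_inflight
+ * drops to zero.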
+ */ +void ptlrpc_invalidate_import(struct obd_import *imp) +{ + struct list_head *tmp, *n; + struct ptlrpc_request *req; + struct l_wait_info lwi; + unsigned int timeout; + int rc; + + atomic_inc(&imp->imp_inval_count); + + if (!imp->imp_invalid || imp->imp_obd->obd_no_recov) + ptlrpc_deactivate_import(imp); + + CFS_FAIL_TIMEOUT(OBD_FAIL_MGS_CONNECT_NET, 3 * cfs_fail_val / 2); + LASSERT(imp->imp_invalid); + + /* Wait forever until inflight == 0. We really can't do it another + * way because in some cases we need to wait for very long reply + * unlink. We can't do anything before that because there is really + * no guarantee that some rdma transfer is not in progress right now. */ + do { + /* Calculate max timeout for waiting on rpcs to error + * out. Use obd_timeout if calculated value is smaller + * than it. */ + if (!OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) { + timeout = ptlrpc_inflight_timeout(imp); + timeout += timeout / 3; + + if (timeout == 0) + timeout = obd_timeout; + } else { + /* decrease the interval to increase race condition */ + timeout = 1; + } + + CDEBUG(D_RPCTRACE, + "Sleeping %d sec for inflight to error out\n", + timeout); + + /* Wait for all requests to error out and call completion + * callbacks. Cap it at obd_timeout -- these should all + * have been locally cancelled by ptlrpc_abort_inflight. */ + lwi = LWI_TIMEOUT_INTERVAL( + cfs_timeout_cap(cfs_time_seconds(timeout)), + (timeout > 1)?cfs_time_seconds(1):cfs_time_seconds(1)/2, + NULL, NULL); + rc = l_wait_event(imp->imp_recovery_waitq, + (atomic_read(&imp->imp_inflight) == 0), + &lwi); + if (rc) { + const char *cli_tgt = obd2cli_tgt(imp->imp_obd); + + CERROR("%s: rc = %d waiting for callback (%d != 0)\n", + cli_tgt, rc, + atomic_read(&imp->imp_inflight)); + + spin_lock(&imp->imp_lock); + if (atomic_read(&imp->imp_inflight) == 0) { + int count = atomic_read(&imp->imp_unregistering); + + /* We know that "unregistering" rpcs only can + * survive in sending or delaying lists (they + * maybe waiting for long reply unlink in + * sluggish nets). Let's check this. If there + * is no inflight and unregistering != 0, this + * is bug. */ + LASSERTF(count == 0, "Some RPCs are still unregistering: %d\n", + count); + + /* Let's save one loop as soon as inflight have + * dropped to zero. No new inflights possible at + * this point. */ + rc = 0; + } else { + list_for_each_safe(tmp, n, + &imp->imp_sending_list) { + req = list_entry(tmp, + struct ptlrpc_request, + rq_list); + DEBUG_REQ(D_ERROR, req, + "still on sending list"); + } + list_for_each_safe(tmp, n, + &imp->imp_delayed_list) { + req = list_entry(tmp, + struct ptlrpc_request, + rq_list); + DEBUG_REQ(D_ERROR, req, + "still on delayed list"); + } + + CERROR("%s: RPCs in \"%s\" phase found (%d). Network is sluggish? Waiting them to error out.\n", + cli_tgt, + ptlrpc_phase2str(RQ_PHASE_UNREGISTERING), + atomic_read(&imp-> + imp_unregistering)); + } + spin_unlock(&imp->imp_lock); + } + } while (rc != 0); + + /* + * Let's additionally check that no new rpcs added to import in + * "invalidate" state. 
+ */ + LASSERT(atomic_read(&imp->imp_inflight) == 0); + obd_import_event(imp->imp_obd, imp, IMP_EVENT_INVALIDATE); + sptlrpc_import_flush_all_ctx(imp); + + atomic_dec(&imp->imp_inval_count); + wake_up_all(&imp->imp_recovery_waitq); +} +EXPORT_SYMBOL(ptlrpc_invalidate_import); + +/* unset imp_invalid */ +void ptlrpc_activate_import(struct obd_import *imp) +{ + struct obd_device *obd = imp->imp_obd; + + spin_lock(&imp->imp_lock); + if (imp->imp_deactive != 0) { + spin_unlock(&imp->imp_lock); + return; + } + + imp->imp_invalid = 0; + spin_unlock(&imp->imp_lock); + obd_import_event(obd, imp, IMP_EVENT_ACTIVE); +} +EXPORT_SYMBOL(ptlrpc_activate_import); + +static void ptlrpc_pinger_force(struct obd_import *imp) +{ + CDEBUG(D_HA, "%s: waking up pinger s:%s\n", obd2cli_tgt(imp->imp_obd), + ptlrpc_import_state_name(imp->imp_state)); + + spin_lock(&imp->imp_lock); + imp->imp_force_verify = 1; + spin_unlock(&imp->imp_lock); + + if (imp->imp_state != LUSTRE_IMP_CONNECTING) + ptlrpc_pinger_wake_up(); +} + +void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt) +{ + LASSERT(!imp->imp_dlm_fake); + + if (ptlrpc_set_import_discon(imp, conn_cnt)) { + if (!imp->imp_replayable) { + CDEBUG(D_HA, "import %s@%s for %s not replayable, auto-deactivating\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid, + imp->imp_obd->obd_name); + ptlrpc_deactivate_import(imp); + } + + ptlrpc_pinger_force(imp); + } +} +EXPORT_SYMBOL(ptlrpc_fail_import); + +int ptlrpc_reconnect_import(struct obd_import *imp) +{ +#ifdef ENABLE_PINGER + struct l_wait_info lwi; + int secs = cfs_time_seconds(obd_timeout); + int rc; + + ptlrpc_pinger_force(imp); + + CDEBUG(D_HA, "%s: recovery started, waiting %u seconds\n", + obd2cli_tgt(imp->imp_obd), secs); + + lwi = LWI_TIMEOUT(secs, NULL, NULL); + rc = l_wait_event(imp->imp_recovery_waitq, + !ptlrpc_import_in_recovery(imp), &lwi); + CDEBUG(D_HA, "%s: recovery finished s:%s\n", obd2cli_tgt(imp->imp_obd), + ptlrpc_import_state_name(imp->imp_state)); + return rc; +#else + ptlrpc_set_import_discon(imp, 0); + /* Force a new connect attempt */ + ptlrpc_invalidate_import(imp); + /* Do a fresh connect next time by zeroing the handle */ + ptlrpc_disconnect_import(imp, 1); + /* Wait for all invalidate calls to finish */ + if (atomic_read(&imp->imp_inval_count) > 0) { + int rc; + struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); + rc = l_wait_event(imp->imp_recovery_waitq, + (atomic_read(&imp->imp_inval_count) == 0), + &lwi); + if (rc) + CERROR("Interrupted, inval=%d\n", + atomic_read(&imp->imp_inval_count)); + } + + /* Allow reconnect attempts */ + imp->imp_obd->obd_no_recov = 0; + /* Remove 'invalid' flag */ + ptlrpc_activate_import(imp); + /* Attempt a new connect */ + ptlrpc_recover_import(imp, NULL, 0); + return 0; +#endif +} +EXPORT_SYMBOL(ptlrpc_reconnect_import); + +/** + * Connection on import \a imp is changed to another one (if more than one is + * present). 
We typically chose connection that we have not tried to connect to + * the longest + */ +static int import_select_connection(struct obd_import *imp) +{ + struct obd_import_conn *imp_conn = NULL, *conn; + struct obd_export *dlmexp; + char *target_start; + int target_len, tried_all = 1; + + spin_lock(&imp->imp_lock); + + if (list_empty(&imp->imp_conn_list)) { + CERROR("%s: no connections available\n", + imp->imp_obd->obd_name); + spin_unlock(&imp->imp_lock); + return -EINVAL; + } + + list_for_each_entry(conn, &imp->imp_conn_list, oic_item) { + CDEBUG(D_HA, "%s: connect to NID %s last attempt %llu\n", + imp->imp_obd->obd_name, + libcfs_nid2str(conn->oic_conn->c_peer.nid), + conn->oic_last_attempt); + + /* If we have not tried this connection since + the last successful attempt, go with this one */ + if ((conn->oic_last_attempt == 0) || + cfs_time_beforeq_64(conn->oic_last_attempt, + imp->imp_last_success_conn)) { + imp_conn = conn; + tried_all = 0; + break; + } + + /* If all of the connections have already been tried + since the last successful connection; just choose the + least recently used */ + if (!imp_conn) + imp_conn = conn; + else if (cfs_time_before_64(conn->oic_last_attempt, + imp_conn->oic_last_attempt)) + imp_conn = conn; + } + + /* if not found, simply choose the current one */ + if (!imp_conn || imp->imp_force_reconnect) { + LASSERT(imp->imp_conn_current); + imp_conn = imp->imp_conn_current; + tried_all = 0; + } + LASSERT(imp_conn->oic_conn); + + /* If we've tried everything, and we're back to the beginning of the + list, increase our timeout and try again. It will be reset when + we do finally connect. (FIXME: really we should wait for all network + state associated with the last connection attempt to drain before + trying to reconnect on it.) 
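The selection policy described here can be distilled into a few lines. The sketch below is a simplified standalone model, not the Lustre code itself: struct conn, select_conn() and the timestamps are hypothetical, and the real code additionally handles forced reconnects and the "current" connection fallback.

#include <stdio.h>

/* Hypothetical, simplified stand-in for obd_import_conn: a name plus the
 * time of the last connect attempt (0 means "never tried"). */
struct conn {
        const char *name;
        unsigned long long last_attempt;
};

/* Prefer a connection not attempted since the last successful connect,
 * otherwise fall back to the least recently attempted entry. */
static const struct conn *select_conn(const struct conn *c, int n,
                                      unsigned long long last_success)
{
        const struct conn *best = NULL;
        int i;

        for (i = 0; i < n; i++) {
                if (c[i].last_attempt == 0 ||
                    c[i].last_attempt <= last_success)
                        return &c[i];   /* untried since last success */
                if (!best || c[i].last_attempt < best->last_attempt)
                        best = &c[i];   /* least recently attempted so far */
        }
        return best;
}

int main(void)
{
        struct conn conns[] = {
                { "nid-a", 150 }, { "nid-b", 120 }, { "nid-c", 200 },
        };

        /* Last success at time 100: every entry has been retried since,
         * so the least recently attempted one ("nid-b") is chosen. */
        printf("selected %s\n", select_conn(conns, 3, 100)->name);
        return 0;
}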
*/ + if (tried_all && (imp->imp_conn_list.next == &imp_conn->oic_item)) { + struct adaptive_timeout *at = &imp->imp_at.iat_net_latency; + if (at_get(at) < CONNECTION_SWITCH_MAX) { + at_measured(at, at_get(at) + CONNECTION_SWITCH_INC); + if (at_get(at) > CONNECTION_SWITCH_MAX) + at_reset(at, CONNECTION_SWITCH_MAX); + } + LASSERT(imp_conn->oic_last_attempt); + CDEBUG(D_HA, "%s: tried all connections, increasing latency to %ds\n", + imp->imp_obd->obd_name, at_get(at)); + } + + imp_conn->oic_last_attempt = cfs_time_current_64(); + + /* switch connection, don't mind if it's same as the current one */ + if (imp->imp_connection) + ptlrpc_connection_put(imp->imp_connection); + imp->imp_connection = ptlrpc_connection_addref(imp_conn->oic_conn); + + dlmexp = class_conn2export(&imp->imp_dlm_handle); + LASSERT(dlmexp != NULL); + if (dlmexp->exp_connection) + ptlrpc_connection_put(dlmexp->exp_connection); + dlmexp->exp_connection = ptlrpc_connection_addref(imp_conn->oic_conn); + class_export_put(dlmexp); + + if (imp->imp_conn_current != imp_conn) { + if (imp->imp_conn_current) { + deuuidify(obd2cli_tgt(imp->imp_obd), NULL, + &target_start, &target_len); + + CDEBUG(D_HA, "%s: Connection changing to %.*s (at %s)\n", + imp->imp_obd->obd_name, + target_len, target_start, + libcfs_nid2str(imp_conn->oic_conn->c_peer.nid)); + } + + imp->imp_conn_current = imp_conn; + } + + CDEBUG(D_HA, "%s: import %p using connection %s/%s\n", + imp->imp_obd->obd_name, imp, imp_conn->oic_uuid.uuid, + libcfs_nid2str(imp_conn->oic_conn->c_peer.nid)); + + spin_unlock(&imp->imp_lock); + + return 0; +} + +/* + * must be called under imp_lock + */ +static int ptlrpc_first_transno(struct obd_import *imp, __u64 *transno) +{ + struct ptlrpc_request *req; + struct list_head *tmp; + + /* The requests in committed_list always have smaller transnos than + * the requests in replay_list */ + if (!list_empty(&imp->imp_committed_list)) { + tmp = imp->imp_committed_list.next; + req = list_entry(tmp, struct ptlrpc_request, rq_replay_list); + *transno = req->rq_transno; + if (req->rq_transno == 0) { + DEBUG_REQ(D_ERROR, req, + "zero transno in committed_list"); + LBUG(); + } + return 1; + } + if (!list_empty(&imp->imp_replay_list)) { + tmp = imp->imp_replay_list.next; + req = list_entry(tmp, struct ptlrpc_request, rq_replay_list); + *transno = req->rq_transno; + if (req->rq_transno == 0) { + DEBUG_REQ(D_ERROR, req, "zero transno in replay_list"); + LBUG(); + } + return 1; + } + return 0; +} + +/** + * Attempt to (re)connect import \a imp. This includes all preparations, + * initializing CONNECT RPC request and passing it to ptlrpcd for + * actual sending. + * Returns 0 on success or error code. 
+ */ +int ptlrpc_connect_import(struct obd_import *imp) +{ + struct obd_device *obd = imp->imp_obd; + int initial_connect = 0; + int set_transno = 0; + __u64 committed_before_reconnect = 0; + struct ptlrpc_request *request; + char *bufs[] = { NULL, + obd2cli_tgt(imp->imp_obd), + obd->obd_uuid.uuid, + (char *)&imp->imp_dlm_handle, + (char *)&imp->imp_connect_data }; + struct ptlrpc_connect_async_args *aa; + int rc; + + spin_lock(&imp->imp_lock); + if (imp->imp_state == LUSTRE_IMP_CLOSED) { + spin_unlock(&imp->imp_lock); + CERROR("can't connect to a closed import\n"); + return -EINVAL; + } else if (imp->imp_state == LUSTRE_IMP_FULL) { + spin_unlock(&imp->imp_lock); + CERROR("already connected\n"); + return 0; + } else if (imp->imp_state == LUSTRE_IMP_CONNECTING) { + spin_unlock(&imp->imp_lock); + CERROR("already connecting\n"); + return -EALREADY; + } + + IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CONNECTING); + + imp->imp_conn_cnt++; + imp->imp_resend_replay = 0; + + if (!lustre_handle_is_used(&imp->imp_remote_handle)) + initial_connect = 1; + else + committed_before_reconnect = imp->imp_peer_committed_transno; + + set_transno = ptlrpc_first_transno(imp, + &imp->imp_connect_data.ocd_transno); + spin_unlock(&imp->imp_lock); + + rc = import_select_connection(imp); + if (rc) + goto out; + + rc = sptlrpc_import_sec_adapt(imp, NULL, NULL); + if (rc) + goto out; + + /* Reset connect flags to the originally requested flags, in case + * the server is updated on-the-fly we will get the new features. */ + imp->imp_connect_data.ocd_connect_flags = imp->imp_connect_flags_orig; + /* Reset ocd_version each time so the server knows the exact versions */ + imp->imp_connect_data.ocd_version = LUSTRE_VERSION_CODE; + imp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT; + imp->imp_msghdr_flags &= ~MSGHDR_CKSUM_INCOMPAT18; + + rc = obd_reconnect(NULL, imp->imp_obd->obd_self_export, obd, + &obd->obd_uuid, &imp->imp_connect_data, NULL); + if (rc) + goto out; + + request = ptlrpc_request_alloc(imp, &RQF_MDS_CONNECT); + if (request == NULL) { + rc = -ENOMEM; + goto out; + } + + rc = ptlrpc_request_bufs_pack(request, LUSTRE_OBD_VERSION, + imp->imp_connect_op, bufs, NULL); + if (rc) { + ptlrpc_request_free(request); + goto out; + } + + /* Report the rpc service time to the server so that it knows how long + * to wait for clients to join recovery */ + lustre_msg_set_service_time(request->rq_reqmsg, + at_timeout2est(request->rq_timeout)); + + /* The amount of time we give the server to process the connect req. + * import_select_connection will increase the net latency on + * repeated reconnect attempts to cover slow networks. 
+ * We override/ignore the server rpc completion estimate here, + * which may be large if this is a reconnect attempt */ + request->rq_timeout = INITIAL_CONNECT_TIMEOUT; + lustre_msg_set_timeout(request->rq_reqmsg, request->rq_timeout); + + lustre_msg_add_op_flags(request->rq_reqmsg, MSG_CONNECT_NEXT_VER); + + request->rq_no_resend = request->rq_no_delay = 1; + request->rq_send_state = LUSTRE_IMP_CONNECTING; + /* Allow a slightly larger reply for future growth compatibility */ + req_capsule_set_size(&request->rq_pill, &RMF_CONNECT_DATA, RCL_SERVER, + sizeof(struct obd_connect_data)+16*sizeof(__u64)); + ptlrpc_request_set_replen(request); + request->rq_interpret_reply = ptlrpc_connect_interpret; + + CLASSERT(sizeof(*aa) <= sizeof(request->rq_async_args)); + aa = ptlrpc_req_async_args(request); + memset(aa, 0, sizeof(*aa)); + + aa->pcaa_peer_committed = committed_before_reconnect; + aa->pcaa_initial_connect = initial_connect; + + if (aa->pcaa_initial_connect) { + spin_lock(&imp->imp_lock); + imp->imp_replayable = 1; + spin_unlock(&imp->imp_lock); + lustre_msg_add_op_flags(request->rq_reqmsg, + MSG_CONNECT_INITIAL); + } + + if (set_transno) + lustre_msg_add_op_flags(request->rq_reqmsg, + MSG_CONNECT_TRANSNO); + + DEBUG_REQ(D_RPCTRACE, request, "(re)connect request (timeout %d)", + request->rq_timeout); + ptlrpcd_add_req(request, PDL_POLICY_ROUND, -1); + rc = 0; +out: + if (rc != 0) { + IMPORT_SET_STATE(imp, LUSTRE_IMP_DISCON); + } + + return rc; +} +EXPORT_SYMBOL(ptlrpc_connect_import); + +static void ptlrpc_maybe_ping_import_soon(struct obd_import *imp) +{ + int force_verify; + + spin_lock(&imp->imp_lock); + force_verify = imp->imp_force_verify != 0; + spin_unlock(&imp->imp_lock); + + if (force_verify) + ptlrpc_pinger_wake_up(); +} + +static int ptlrpc_busy_reconnect(int rc) +{ + return (rc == -EBUSY) || (rc == -EAGAIN); +} + +/** + * interpret_reply callback for connect RPCs. + * Looks into returned status of connect operation and decides + * what to do with the import - i.e enter recovery, promote it to + * full state for normal operations of disconnect it due to an error. 
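The connect path above stashes the context its completion handler will need (the async-args structure checked by the CLASSERT) inside the request before queueing it. A standalone miniature of that stash-and-interpret pattern follows; the structures and the 32-byte scratch size are hypothetical, chosen only to illustrate the idea.

#include <stdio.h>
#include <string.h>

/* Hypothetical request with an opaque, suitably aligned scratch area that
 * plays the role of rq_async_args. */
struct request {
        unsigned long long async_args[4];       /* 32 bytes of scratch */
        int status;
};

struct connect_args {
        unsigned long long peer_committed;
        int initial_connect;
};

/* Completion callback: recover the stashed context from the request. */
static void interpret(struct request *req)
{
        struct connect_args *aa = (struct connect_args *)req->async_args;

        printf("reply rc=%d, initial=%d, committed=%llu\n",
               req->status, aa->initial_connect, aa->peer_committed);
}

int main(void)
{
        struct request req = { .status = 0 };
        struct connect_args *aa = (struct connect_args *)req.async_args;

        memset(aa, 0, sizeof(*aa));
        aa->peer_committed = 42;
        aa->initial_connect = 1;

        /* ... the request would be queued here; later the reply arrives ... */
        interpret(&req);
        return 0;
}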
+ */ +static int ptlrpc_connect_interpret(const struct lu_env *env, + struct ptlrpc_request *request, + void *data, int rc) +{ + struct ptlrpc_connect_async_args *aa = data; + struct obd_import *imp = request->rq_import; + struct client_obd *cli = &imp->imp_obd->u.cli; + struct lustre_handle old_hdl; + __u64 old_connect_flags; + int msg_flags; + struct obd_connect_data *ocd; + struct obd_export *exp; + int ret; + + spin_lock(&imp->imp_lock); + if (imp->imp_state == LUSTRE_IMP_CLOSED) { + imp->imp_connect_tried = 1; + spin_unlock(&imp->imp_lock); + return 0; + } + + if (rc) { + /* if this reconnect to busy export - not need select new target + * for connecting*/ + imp->imp_force_reconnect = ptlrpc_busy_reconnect(rc); + spin_unlock(&imp->imp_lock); + ptlrpc_maybe_ping_import_soon(imp); + goto out; + } + spin_unlock(&imp->imp_lock); + + LASSERT(imp->imp_conn_current); + + msg_flags = lustre_msg_get_op_flags(request->rq_repmsg); + + ret = req_capsule_get_size(&request->rq_pill, &RMF_CONNECT_DATA, + RCL_SERVER); + /* server replied obd_connect_data is always bigger */ + ocd = req_capsule_server_sized_get(&request->rq_pill, + &RMF_CONNECT_DATA, ret); + + if (ocd == NULL) { + CERROR("%s: no connect data from server\n", + imp->imp_obd->obd_name); + rc = -EPROTO; + goto out; + } + + spin_lock(&imp->imp_lock); + + /* All imports are pingable */ + imp->imp_pingable = 1; + imp->imp_force_reconnect = 0; + imp->imp_force_verify = 0; + + imp->imp_connect_data = *ocd; + + CDEBUG(D_HA, "%s: connect to target with instance %u\n", + imp->imp_obd->obd_name, ocd->ocd_instance); + exp = class_conn2export(&imp->imp_dlm_handle); + + spin_unlock(&imp->imp_lock); + + /* check that server granted subset of flags we asked for. */ + if ((ocd->ocd_connect_flags & imp->imp_connect_flags_orig) != + ocd->ocd_connect_flags) { + CERROR("%s: Server didn't granted asked subset of flags: asked=%#llx grranted=%#llx\n", + imp->imp_obd->obd_name, imp->imp_connect_flags_orig, + ocd->ocd_connect_flags); + rc = -EPROTO; + goto out; + } + + if (!exp) { + /* This could happen if export is cleaned during the + connect attempt */ + CERROR("%s: missing export after connect\n", + imp->imp_obd->obd_name); + rc = -ENODEV; + goto out; + } + old_connect_flags = exp_connect_flags(exp); + exp->exp_connect_data = *ocd; + imp->imp_obd->obd_self_export->exp_connect_data = *ocd; + class_export_put(exp); + + obd_import_event(imp->imp_obd, imp, IMP_EVENT_OCD); + + if (aa->pcaa_initial_connect) { + spin_lock(&imp->imp_lock); + if (msg_flags & MSG_CONNECT_REPLAYABLE) { + imp->imp_replayable = 1; + spin_unlock(&imp->imp_lock); + CDEBUG(D_HA, "connected to replayable target: %s\n", + obd2cli_tgt(imp->imp_obd)); + } else { + imp->imp_replayable = 0; + spin_unlock(&imp->imp_lock); + } + + /* if applies, adjust the imp->imp_msg_magic here + * according to reply flags */ + + imp->imp_remote_handle = + *lustre_msg_get_handle(request->rq_repmsg); + + /* Initial connects are allowed for clients with non-random + * uuids when servers are in recovery. Simply signal the + * servers replay is complete and wait in REPLAY_WAIT. */ + if (msg_flags & MSG_CONNECT_RECOVERING) { + CDEBUG(D_HA, "connect to %s during recovery\n", + obd2cli_tgt(imp->imp_obd)); + IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_LOCKS); + } else { + IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL); + ptlrpc_activate_import(imp); + } + + rc = 0; + goto finish; + } + + /* Determine what recovery state to move the import to. 
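The block that follows implements this decision from the reply flags. As a compressed standalone sketch of the idea only: the flag bits and state names below are hypothetical stand-ins, and the real code additionally compares server handles and tracks the saved replay state.

#include <stdio.h>

#define FLG_RECONNECT  0x1      /* stands in for MSG_CONNECT_RECONNECT */
#define FLG_RECOVERING 0x2      /* stands in for MSG_CONNECT_RECOVERING */

static const char *pick_state(unsigned flags, int imp_invalid, int replayable)
{
        if (flags & FLG_RECONNECT) {
                /* Reconnected to a server we already held a handle for. */
                if (imp_invalid)
                        return "EVICTED";
                if (flags & FLG_RECOVERING)
                        return "REPLAY";        /* resend replayable requests */
                return "RECOVER";
        }
        if ((flags & FLG_RECOVERING) && !imp_invalid && replayable)
                return "REPLAY";                /* fresh connect during recovery */
        return "EVICTED";                       /* no recovery possible */
}

int main(void)
{
        printf("%s\n", pick_state(FLG_RECONNECT | FLG_RECOVERING, 0, 1));
        printf("%s\n", pick_state(0, 0, 1));
        return 0;
}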
*/ + if (MSG_CONNECT_RECONNECT & msg_flags) { + memset(&old_hdl, 0, sizeof(old_hdl)); + if (!memcmp(&old_hdl, lustre_msg_get_handle(request->rq_repmsg), + sizeof(old_hdl))) { + LCONSOLE_WARN("Reconnect to %s (at @%s) failed due bad handle %#llx\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid, + imp->imp_dlm_handle.cookie); + rc = -ENOTCONN; + goto out; + } + + if (memcmp(&imp->imp_remote_handle, + lustre_msg_get_handle(request->rq_repmsg), + sizeof(imp->imp_remote_handle))) { + int level = msg_flags & MSG_CONNECT_RECOVERING ? + D_HA : D_WARNING; + + /* Bug 16611/14775: if server handle have changed, + * that means some sort of disconnection happened. + * If the server is not in recovery, that also means it + * already erased all of our state because of previous + * eviction. If it is in recovery - we are safe to + * participate since we can reestablish all of our state + * with server again */ + if ((MSG_CONNECT_RECOVERING & msg_flags)) { + CDEBUG(level, "%s@%s changed server handle from %#llx to %#llx but is still in recovery\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid, + imp->imp_remote_handle.cookie, + lustre_msg_get_handle( + request->rq_repmsg)->cookie); + } else { + LCONSOLE_WARN("Evicted from %s (at %s) after server handle changed from %#llx to %#llx\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection-> \ + c_remote_uuid.uuid, + imp->imp_remote_handle.cookie, + lustre_msg_get_handle( + request->rq_repmsg)->cookie); + } + + + imp->imp_remote_handle = + *lustre_msg_get_handle(request->rq_repmsg); + + if (!(MSG_CONNECT_RECOVERING & msg_flags)) { + IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED); + rc = 0; + goto finish; + } + + } else { + CDEBUG(D_HA, "reconnected to %s@%s after partition\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid); + } + + if (imp->imp_invalid) { + CDEBUG(D_HA, "%s: reconnected but import is invalid; marking evicted\n", + imp->imp_obd->obd_name); + IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED); + } else if (MSG_CONNECT_RECOVERING & msg_flags) { + CDEBUG(D_HA, "%s: reconnected to %s during replay\n", + imp->imp_obd->obd_name, + obd2cli_tgt(imp->imp_obd)); + + spin_lock(&imp->imp_lock); + imp->imp_resend_replay = 1; + spin_unlock(&imp->imp_lock); + + IMPORT_SET_STATE(imp, imp->imp_replay_state); + } else { + IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER); + } + } else if ((MSG_CONNECT_RECOVERING & msg_flags) && !imp->imp_invalid) { + LASSERT(imp->imp_replayable); + imp->imp_remote_handle = + *lustre_msg_get_handle(request->rq_repmsg); + imp->imp_last_replay_transno = 0; + IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY); + } else { + DEBUG_REQ(D_HA, request, "%s: evicting (reconnect/recover flags not set: %x)", + imp->imp_obd->obd_name, msg_flags); + imp->imp_remote_handle = + *lustre_msg_get_handle(request->rq_repmsg); + IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED); + } + + /* Sanity checks for a reconnected import. */ + if (!(imp->imp_replayable) != !(msg_flags & MSG_CONNECT_REPLAYABLE)) { + CERROR("imp_replayable flag does not match server after reconnect. We should LBUG right here.\n"); + } + + if (lustre_msg_get_last_committed(request->rq_repmsg) > 0 && + lustre_msg_get_last_committed(request->rq_repmsg) < + aa->pcaa_peer_committed) { + CERROR("%s went back in time (transno %lld was previously committed, server now claims %lld)! 
See https://bugzilla.lustre.org/show_bug.cgi?id=9646\n", + obd2cli_tgt(imp->imp_obd), aa->pcaa_peer_committed, + lustre_msg_get_last_committed(request->rq_repmsg)); + } + +finish: + rc = ptlrpc_import_recovery_state_machine(imp); + if (rc != 0) { + if (rc == -ENOTCONN) { + CDEBUG(D_HA, "evicted/aborted by %s@%s during recovery; invalidating and reconnecting\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid); + ptlrpc_connect_import(imp); + imp->imp_connect_tried = 1; + return 0; + } + } else { + + spin_lock(&imp->imp_lock); + list_del(&imp->imp_conn_current->oic_item); + list_add(&imp->imp_conn_current->oic_item, + &imp->imp_conn_list); + imp->imp_last_success_conn = + imp->imp_conn_current->oic_last_attempt; + + spin_unlock(&imp->imp_lock); + + if ((imp->imp_connect_flags_orig & OBD_CONNECT_IBITS) && + !(ocd->ocd_connect_flags & OBD_CONNECT_IBITS)) { + LCONSOLE_WARN("%s: MDS %s does not support ibits lock, either very old or invalid: requested %llx, replied %llx\n", + imp->imp_obd->obd_name, + imp->imp_connection->c_remote_uuid.uuid, + imp->imp_connect_flags_orig, + ocd->ocd_connect_flags); + rc = -EPROTO; + goto out; + } + + if ((ocd->ocd_connect_flags & OBD_CONNECT_VERSION) && + (ocd->ocd_version > LUSTRE_VERSION_CODE + + LUSTRE_VERSION_OFFSET_WARN || + ocd->ocd_version < LUSTRE_VERSION_CODE - + LUSTRE_VERSION_OFFSET_WARN)) { + /* Sigh, some compilers do not like #ifdef in the middle + of macro arguments */ + const char *older = "older. Consider upgrading server or downgrading client" + ; + const char *newer = "newer than client version. Consider upgrading client" + ; + + LCONSOLE_WARN("Server %s version (%d.%d.%d.%d) is much %s (%s)\n", + obd2cli_tgt(imp->imp_obd), + OBD_OCD_VERSION_MAJOR(ocd->ocd_version), + OBD_OCD_VERSION_MINOR(ocd->ocd_version), + OBD_OCD_VERSION_PATCH(ocd->ocd_version), + OBD_OCD_VERSION_FIX(ocd->ocd_version), + ocd->ocd_version > LUSTRE_VERSION_CODE ? + newer : older, LUSTRE_VERSION_STRING); + } + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 2, 50, 0) + /* Check if server has LU-1252 fix applied to not always swab + * the IR MNE entries. Do this only once per connection. This + * fixup is version-limited, because we don't want to carry the + * OBD_CONNECT_MNE_SWAB flag around forever, just so long as we + * need interop with unpatched 2.2 servers. For newer servers, + * the client will do MNE swabbing only as needed. LU-1644 */ + if (unlikely((ocd->ocd_connect_flags & OBD_CONNECT_VERSION) && + !(ocd->ocd_connect_flags & OBD_CONNECT_MNE_SWAB) && + OBD_OCD_VERSION_MAJOR(ocd->ocd_version) == 2 && + OBD_OCD_VERSION_MINOR(ocd->ocd_version) == 2 && + OBD_OCD_VERSION_PATCH(ocd->ocd_version) < 55 && + strcmp(imp->imp_obd->obd_type->typ_name, + LUSTRE_MGC_NAME) == 0)) + imp->imp_need_mne_swab = 1; + else /* clear if server was upgraded since last connect */ + imp->imp_need_mne_swab = 0; +#else +#warning "LU-1644: Remove old OBD_CONNECT_MNE_SWAB fixup and imp_need_mne_swab" +#endif + + if (ocd->ocd_connect_flags & OBD_CONNECT_CKSUM) { + /* We sent to the server ocd_cksum_types with bits set + * for algorithms we understand. 
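The checksum negotiation that this comment (continued just below) describes is a plain bitmask intersection. A simplified standalone sketch, with made-up capability bits rather than the real OBD_CKSUM_* values:

#include <stdio.h>

#define CKSUM_CRC32  0x1
#define CKSUM_ADLER  0x2
#define CKSUM_CRC32C 0x4

int main(void)
{
        unsigned client = CKSUM_ADLER | CKSUM_CRC32C;   /* what we understand */
        unsigned server = CKSUM_CRC32 | CKSUM_CRC32C;   /* what the reply offers */
        unsigned common = client & server;

        if (common == 0) {
                /* No overlap: disable checksums and fall back to a safe
                 * default, as the code below does with OBD_CKSUM_ADLER. */
                printf("no common checksum type, falling back to ADLER\n");
                common = CKSUM_ADLER;
        }
        printf("negotiated mask: 0x%x\n", common);
        return 0;
}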
The server masked off + * the checksum types it doesn't support */ + if ((ocd->ocd_cksum_types & + cksum_types_supported_client()) == 0) { + LCONSOLE_WARN("The negotiation of the checksum algorithm to use with server %s failed (%x/%x), disabling checksums\n", + obd2cli_tgt(imp->imp_obd), + ocd->ocd_cksum_types, + cksum_types_supported_client()); + cli->cl_checksum = 0; + cli->cl_supp_cksum_types = OBD_CKSUM_ADLER; + } else { + cli->cl_supp_cksum_types = ocd->ocd_cksum_types; + } + } else { + /* The server does not support OBD_CONNECT_CKSUM. + * Enforce ADLER for backward compatibility*/ + cli->cl_supp_cksum_types = OBD_CKSUM_ADLER; + } + cli->cl_cksum_type = cksum_type_select(cli->cl_supp_cksum_types); + + if (ocd->ocd_connect_flags & OBD_CONNECT_BRW_SIZE) + cli->cl_max_pages_per_rpc = + min(ocd->ocd_brw_size >> PAGE_CACHE_SHIFT, + cli->cl_max_pages_per_rpc); + else if (imp->imp_connect_op == MDS_CONNECT || + imp->imp_connect_op == MGS_CONNECT) + cli->cl_max_pages_per_rpc = 1; + + /* Reset ns_connect_flags only for initial connect. It might be + * changed in while using FS and if we reset it in reconnect + * this leads to losing user settings done before such as + * disable lru_resize, etc. */ + if (old_connect_flags != exp_connect_flags(exp) || + aa->pcaa_initial_connect) { + CDEBUG(D_HA, "%s: Resetting ns_connect_flags to server flags: %#llx\n", + imp->imp_obd->obd_name, ocd->ocd_connect_flags); + imp->imp_obd->obd_namespace->ns_connect_flags = + ocd->ocd_connect_flags; + imp->imp_obd->obd_namespace->ns_orig_connect_flags = + ocd->ocd_connect_flags; + } + + if ((ocd->ocd_connect_flags & OBD_CONNECT_AT) && + (imp->imp_msg_magic == LUSTRE_MSG_MAGIC_V2)) + /* We need a per-message support flag, because + a. we don't know if the incoming connect reply + supports AT or not (in reply_in_callback) + until we unpack it. + b. failovered server means export and flags are gone + (in ptlrpc_send_reply). + Can only be set when we know AT is supported at + both ends */ + imp->imp_msghdr_flags |= MSGHDR_AT_SUPPORT; + else + imp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT; + + if ((ocd->ocd_connect_flags & OBD_CONNECT_FULL20) && + (imp->imp_msg_magic == LUSTRE_MSG_MAGIC_V2)) + imp->imp_msghdr_flags |= MSGHDR_CKSUM_INCOMPAT18; + else + imp->imp_msghdr_flags &= ~MSGHDR_CKSUM_INCOMPAT18; + + LASSERT((cli->cl_max_pages_per_rpc <= PTLRPC_MAX_BRW_PAGES) && + (cli->cl_max_pages_per_rpc > 0)); + } + +out: + imp->imp_connect_tried = 1; + + if (rc != 0) { + IMPORT_SET_STATE(imp, LUSTRE_IMP_DISCON); + if (rc == -EACCES) { + /* + * Give up trying to reconnect + * EACCES means client has no permission for connection + */ + imp->imp_obd->obd_no_recov = 1; + ptlrpc_deactivate_import(imp); + } + + if (rc == -EPROTO) { + struct obd_connect_data *ocd; + + /* reply message might not be ready */ + if (request->rq_repmsg == NULL) + return -EPROTO; + + ocd = req_capsule_server_get(&request->rq_pill, + &RMF_CONNECT_DATA); + if (ocd && + (ocd->ocd_connect_flags & OBD_CONNECT_VERSION) && + (ocd->ocd_version != LUSTRE_VERSION_CODE)) { + /* + * Actually servers are only supposed to refuse + * connection from liblustre clients, so we + * should never see this from VFS context + */ + LCONSOLE_ERROR_MSG(0x16a, "Server %s version (%d.%d.%d.%d) refused connection from this client with an incompatible version (%s). 
Client must be recompiled\n", + obd2cli_tgt(imp->imp_obd), + OBD_OCD_VERSION_MAJOR(ocd->ocd_version), + OBD_OCD_VERSION_MINOR(ocd->ocd_version), + OBD_OCD_VERSION_PATCH(ocd->ocd_version), + OBD_OCD_VERSION_FIX(ocd->ocd_version), + LUSTRE_VERSION_STRING); + ptlrpc_deactivate_import(imp); + IMPORT_SET_STATE(imp, LUSTRE_IMP_CLOSED); + } + return -EPROTO; + } + + ptlrpc_maybe_ping_import_soon(imp); + + CDEBUG(D_HA, "recovery of %s on %s failed (%d)\n", + obd2cli_tgt(imp->imp_obd), + (char *)imp->imp_connection->c_remote_uuid.uuid, rc); + } + + wake_up_all(&imp->imp_recovery_waitq); + return rc; +} + +/** + * interpret callback for "completed replay" RPCs. + * \see signal_completed_replay + */ +static int completed_replay_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + void *data, int rc) +{ + atomic_dec(&req->rq_import->imp_replay_inflight); + if (req->rq_status == 0 && + !req->rq_import->imp_vbr_failed) { + ptlrpc_import_recovery_state_machine(req->rq_import); + } else { + if (req->rq_import->imp_vbr_failed) { + CDEBUG(D_WARNING, + "%s: version recovery fails, reconnecting\n", + req->rq_import->imp_obd->obd_name); + } else { + CDEBUG(D_HA, "%s: LAST_REPLAY message error: %d, reconnecting\n", + req->rq_import->imp_obd->obd_name, + req->rq_status); + } + ptlrpc_connect_import(req->rq_import); + } + + return 0; +} + +/** + * Let server know that we have no requests to replay anymore. + * Achieved by just sending a PING request + */ +static int signal_completed_replay(struct obd_import *imp) +{ + struct ptlrpc_request *req; + + if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_FINISH_REPLAY))) + return 0; + + LASSERT(atomic_read(&imp->imp_replay_inflight) == 0); + atomic_inc(&imp->imp_replay_inflight); + + req = ptlrpc_request_alloc_pack(imp, &RQF_OBD_PING, LUSTRE_OBD_VERSION, + OBD_PING); + if (req == NULL) { + atomic_dec(&imp->imp_replay_inflight); + return -ENOMEM; + } + + ptlrpc_request_set_replen(req); + req->rq_send_state = LUSTRE_IMP_REPLAY_WAIT; + lustre_msg_add_flags(req->rq_reqmsg, + MSG_LOCK_REPLAY_DONE | MSG_REQ_REPLAY_DONE); + if (AT_OFF) + req->rq_timeout *= 3; + req->rq_interpret_reply = completed_replay_interpret; + + ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1); + return 0; +} + +/** + * In kernel code all import invalidation happens in its own + * separate thread, so that whatever application happened to encounter + * a problem could still be killed or otherwise continue + */ +static int ptlrpc_invalidate_import_thread(void *data) +{ + struct obd_import *imp = data; + + unshare_fs_struct(); + + CDEBUG(D_HA, "thread invalidate import %s to %s@%s\n", + imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid); + + ptlrpc_invalidate_import(imp); + + if (obd_dump_on_eviction) { + CERROR("dump the log upon eviction\n"); + libcfs_debug_dumplog(); + } + + IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER); + ptlrpc_import_recovery_state_machine(imp); + + class_import_put(imp); + return 0; +} + +/** + * This is the state machine for client-side recovery on import. + * + * Typically we have two possibly paths. If we came to server and it is not + * in recovery, we just enter IMP_EVICTED state, invalidate our import + * state and reconnect from scratch. + * If we came to server that is in recovery, we enter IMP_REPLAY import state. + * We go through our list of requests to replay and send them to server one by + * one. 
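The state sequence described here (and continued just below) can be summarised as a tiny linear state machine. The enum and next() helper are a hypothetical compression of the LUSTRE_IMP_* states, not the real definitions:

#include <stdio.h>

enum imp_state { REPLAY, REPLAY_LOCKS, REPLAY_WAIT, RECOVER, FULL };

static const char *names[] = {
        "REPLAY", "REPLAY_LOCKS", "REPLAY_WAIT", "RECOVER", "FULL"
};

/* Advance one step once the work of the current state has drained. */
static enum imp_state next(enum imp_state s)
{
        switch (s) {
        case REPLAY:       return REPLAY_LOCKS; /* all requests replayed */
        case REPLAY_LOCKS: return REPLAY_WAIT;  /* locks re-requested */
        case REPLAY_WAIT:  return RECOVER;      /* "replay done" acknowledged */
        case RECOVER:      return FULL;         /* sending list resent */
        default:           return FULL;
        }
}

int main(void)
{
        enum imp_state s = REPLAY;

        while (s != FULL) {
                printf("%s -> ", names[s]);
                s = next(s);
        }
        printf("FULL\n");
        return 0;
}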
+ * After sending all request from the list we change import state to + * IMP_REPLAY_LOCKS and re-request all the locks we believe we have from server + * and also all the locks we don't yet have and wait for server to grant us. + * After that we send a special "replay completed" request and change import + * state to IMP_REPLAY_WAIT. + * Upon receiving reply to that "replay completed" RPC we enter IMP_RECOVER + * state and resend all requests from sending list. + * After that we promote import to FULL state and send all delayed requests + * and import is fully operational after that. + * + */ +int ptlrpc_import_recovery_state_machine(struct obd_import *imp) +{ + int rc = 0; + int inflight; + char *target_start; + int target_len; + + if (imp->imp_state == LUSTRE_IMP_EVICTED) { + deuuidify(obd2cli_tgt(imp->imp_obd), NULL, + &target_start, &target_len); + /* Don't care about MGC eviction */ + if (strcmp(imp->imp_obd->obd_type->typ_name, + LUSTRE_MGC_NAME) != 0) { + LCONSOLE_ERROR_MSG(0x167, "%s: This client was evicted by %.*s; in progress operations using this service will fail.\n", + imp->imp_obd->obd_name, target_len, + target_start); + } + CDEBUG(D_HA, "evicted from %s@%s; invalidating\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid); + /* reset vbr_failed flag upon eviction */ + spin_lock(&imp->imp_lock); + imp->imp_vbr_failed = 0; + spin_unlock(&imp->imp_lock); + + { + struct task_struct *task; + /* bug 17802: XXX client_disconnect_export vs connect request + * race. if client will evicted at this time, we start + * invalidate thread without reference to import and import can + * be freed at same time. */ + class_import_get(imp); + task = kthread_run(ptlrpc_invalidate_import_thread, imp, + "ll_imp_inval"); + if (IS_ERR(task)) { + class_import_put(imp); + CERROR("error starting invalidate thread: %d\n", rc); + rc = PTR_ERR(task); + } else { + rc = 0; + } + return rc; + } + } + + if (imp->imp_state == LUSTRE_IMP_REPLAY) { + CDEBUG(D_HA, "replay requested by %s\n", + obd2cli_tgt(imp->imp_obd)); + rc = ptlrpc_replay_next(imp, &inflight); + if (inflight == 0 && + atomic_read(&imp->imp_replay_inflight) == 0) { + IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_LOCKS); + rc = ldlm_replay_locks(imp); + if (rc) + goto out; + } + rc = 0; + } + + if (imp->imp_state == LUSTRE_IMP_REPLAY_LOCKS) { + if (atomic_read(&imp->imp_replay_inflight) == 0) { + IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_WAIT); + rc = signal_completed_replay(imp); + if (rc) + goto out; + } + + } + + if (imp->imp_state == LUSTRE_IMP_REPLAY_WAIT) { + if (atomic_read(&imp->imp_replay_inflight) == 0) { + IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER); + } + } + + if (imp->imp_state == LUSTRE_IMP_RECOVER) { + CDEBUG(D_HA, "reconnected to %s@%s\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid); + + rc = ptlrpc_resend(imp); + if (rc) + goto out; + IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL); + ptlrpc_activate_import(imp); + + deuuidify(obd2cli_tgt(imp->imp_obd), NULL, + &target_start, &target_len); + LCONSOLE_INFO("%s: Connection restored to %.*s (at %s)\n", + imp->imp_obd->obd_name, + target_len, target_start, + libcfs_nid2str(imp->imp_connection->c_peer.nid)); + } + + if (imp->imp_state == LUSTRE_IMP_FULL) { + wake_up_all(&imp->imp_recovery_waitq); + ptlrpc_wake_delayed(imp); + } + +out: + return rc; +} + +int ptlrpc_disconnect_import(struct obd_import *imp, int noclose) +{ + struct ptlrpc_request *req; + int rq_opc, rc = 0; + + if (imp->imp_obd->obd_force) + goto set_state; + + switch 
(imp->imp_connect_op) { + case OST_CONNECT: + rq_opc = OST_DISCONNECT; + break; + case MDS_CONNECT: + rq_opc = MDS_DISCONNECT; + break; + case MGS_CONNECT: + rq_opc = MGS_DISCONNECT; + break; + default: + rc = -EINVAL; + CERROR("%s: don't know how to disconnect from %s (connect_op %d): rc = %d\n", + imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd), + imp->imp_connect_op, rc); + return rc; + } + + if (ptlrpc_import_in_recovery(imp)) { + struct l_wait_info lwi; + long timeout; + + if (AT_OFF) { + if (imp->imp_server_timeout) + timeout = cfs_time_seconds(obd_timeout / 2); + else + timeout = cfs_time_seconds(obd_timeout); + } else { + int idx = import_at_get_index(imp, + imp->imp_client->cli_request_portal); + timeout = cfs_time_seconds( + at_get(&imp->imp_at.iat_service_estimate[idx])); + } + + lwi = LWI_TIMEOUT_INTR(cfs_timeout_cap(timeout), + back_to_sleep, LWI_ON_SIGNAL_NOOP, NULL); + rc = l_wait_event(imp->imp_recovery_waitq, + !ptlrpc_import_in_recovery(imp), &lwi); + + } + + spin_lock(&imp->imp_lock); + if (imp->imp_state != LUSTRE_IMP_FULL) + goto out; + spin_unlock(&imp->imp_lock); + + req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_DISCONNECT, + LUSTRE_OBD_VERSION, rq_opc); + if (req) { + /* We are disconnecting, do not retry a failed DISCONNECT rpc if + * it fails. We can get through the above with a down server + * if the client doesn't know the server is gone yet. */ + req->rq_no_resend = 1; + + /* We want client umounts to happen quickly, no matter the + server state... */ + req->rq_timeout = min_t(int, req->rq_timeout, + INITIAL_CONNECT_TIMEOUT); + + IMPORT_SET_STATE(imp, LUSTRE_IMP_CONNECTING); + req->rq_send_state = LUSTRE_IMP_CONNECTING; + ptlrpc_request_set_replen(req); + rc = ptlrpc_queue_wait(req); + ptlrpc_req_finished(req); + } + +set_state: + spin_lock(&imp->imp_lock); +out: + if (noclose) + IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON); + else + IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CLOSED); + memset(&imp->imp_remote_handle, 0, sizeof(imp->imp_remote_handle)); + spin_unlock(&imp->imp_lock); + + if (rc == -ETIMEDOUT || rc == -ENOTCONN || rc == -ESHUTDOWN) + rc = 0; + + return rc; +} +EXPORT_SYMBOL(ptlrpc_disconnect_import); + +void ptlrpc_cleanup_imp(struct obd_import *imp) +{ + spin_lock(&imp->imp_lock); + IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CLOSED); + imp->imp_generation++; + spin_unlock(&imp->imp_lock); + ptlrpc_abort_inflight(imp); +} +EXPORT_SYMBOL(ptlrpc_cleanup_imp); + +/* Adaptive Timeout utils */ +extern unsigned int at_min, at_max, at_history; + +/* Bin into timeslices using AT_BINS bins. + This gives us a max of the last binlimit*AT_BINS secs without the storage, + but still smoothing out a return to normalcy from a slow response. + (E.g. remember the maximum latency in each minute of the last 4 minutes.) 
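The binning scheme this comment describes is worth seeing in isolation: keep the per-slice maximum in a small ring of bins, slide the bins as time passes, and report the worst value still inside the window. The sketch below is a standalone simplification (measure(), binwidth and the sample values are hypothetical), not the at_measured() implementation that follows.

#include <stdio.h>

#define BINS 4  /* mirrors AT_BINS: remember the last four slices */

static unsigned measure(unsigned hist[BINS], long *binstart, long now,
                        long binwidth, unsigned val)
{
        unsigned worst = 0;
        int i;

        if (now - *binstart >= binwidth) {
                /* Slide the window: one shift per elapsed slice, older
                 * slices fall off the end. */
                long shift = (now - *binstart) / binwidth;

                for (i = BINS - 1; i >= 0; i--)
                        hist[i] = (i >= shift) ? hist[i - shift] : 0;
                *binstart += shift * binwidth;
        }
        if (val > hist[0])
                hist[0] = val;
        for (i = 0; i < BINS; i++)
                if (hist[i] > worst)
                        worst = hist[i];
        return worst;
}

int main(void)
{
        unsigned hist[BINS] = { 0 };
        long binstart = 0;

        printf("%u\n", measure(hist, &binstart, 10, 60, 5));  /* 5 */
        printf("%u\n", measure(hist, &binstart, 70, 60, 2));  /* still 5 */
        printf("%u\n", measure(hist, &binstart, 260, 60, 1)); /* 5 aged out -> 2 */
        return 0;
}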
*/ +int at_measured(struct adaptive_timeout *at, unsigned int val) +{ + unsigned int old = at->at_current; + time_t now = get_seconds(); + time_t binlimit = max_t(time_t, at_history / AT_BINS, 1); + + LASSERT(at); + CDEBUG(D_OTHER, "add %u to %p time=%lu v=%u (%u %u %u %u)\n", + val, at, now - at->at_binstart, at->at_current, + at->at_hist[0], at->at_hist[1], at->at_hist[2], at->at_hist[3]); + + if (val == 0) + /* 0's don't count, because we never want our timeout to + drop to 0, and because 0 could mean an error */ + return 0; + + spin_lock(&at->at_lock); + + if (unlikely(at->at_binstart == 0)) { + /* Special case to remove default from history */ + at->at_current = val; + at->at_worst_ever = val; + at->at_worst_time = now; + at->at_hist[0] = val; + at->at_binstart = now; + } else if (now - at->at_binstart < binlimit) { + /* in bin 0 */ + at->at_hist[0] = max(val, at->at_hist[0]); + at->at_current = max(val, at->at_current); + } else { + int i, shift; + unsigned int maxv = val; + /* move bins over */ + shift = (now - at->at_binstart) / binlimit; + LASSERT(shift > 0); + for (i = AT_BINS - 1; i >= 0; i--) { + if (i >= shift) { + at->at_hist[i] = at->at_hist[i - shift]; + maxv = max(maxv, at->at_hist[i]); + } else { + at->at_hist[i] = 0; + } + } + at->at_hist[0] = val; + at->at_current = maxv; + at->at_binstart += shift * binlimit; + } + + if (at->at_current > at->at_worst_ever) { + at->at_worst_ever = at->at_current; + at->at_worst_time = now; + } + + if (at->at_flags & AT_FLG_NOHIST) + /* Only keep last reported val; keeping the rest of the history + for proc only */ + at->at_current = val; + + if (at_max > 0) + at->at_current = min(at->at_current, at_max); + at->at_current = max(at->at_current, at_min); + + if (at->at_current != old) + CDEBUG(D_OTHER, "AT %p change: old=%u new=%u delta=%d (val=%u) hist %u %u %u %u\n", + at, + old, at->at_current, at->at_current - old, val, + at->at_hist[0], at->at_hist[1], at->at_hist[2], + at->at_hist[3]); + + /* if we changed, report the old value */ + old = (at->at_current != old) ? old : 0; + + spin_unlock(&at->at_lock); + return old; +} + +/* Find the imp_at index for a given portal; assign if space available */ +int import_at_get_index(struct obd_import *imp, int portal) +{ + struct imp_at *at = &imp->imp_at; + int i; + + for (i = 0; i < IMP_AT_MAX_PORTALS; i++) { + if (at->iat_portal[i] == portal) + return i; + if (at->iat_portal[i] == 0) + /* unused */ + break; + } + + /* Not found in list, add it under a lock */ + spin_lock(&imp->imp_lock); + + /* Check unused under lock */ + for (; i < IMP_AT_MAX_PORTALS; i++) { + if (at->iat_portal[i] == portal) + goto out; + if (at->iat_portal[i] == 0) + /* unused */ + break; + } + + /* Not enough portals? */ + LASSERT(i < IMP_AT_MAX_PORTALS); + + at->iat_portal[i] = portal; +out: + spin_unlock(&imp->imp_lock); + return i; +} diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/layout.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/layout.c new file mode 100644 index 000000000..a42335e26 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/layout.c @@ -0,0 +1,2442 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/layout.c + * + * Lustre Metadata Target (mdt) request handler + * + * Author: Nikita Danilov + */ +/* + * This file contains the "capsule/pill" abstraction layered above PTLRPC. + * + * Every struct ptlrpc_request contains a "pill", which points to a description + * of the format that the request conforms to. + */ + +#if !defined(__REQ_LAYOUT_USER__) + +#define DEBUG_SUBSYSTEM S_RPC + +#include + +/* LUSTRE_VERSION_CODE */ +#include "../include/lustre_ver.h" + +#include "../include/obd_support.h" +/* lustre_swab_mdt_body */ +#include "../include/lustre/lustre_idl.h" +/* obd2cli_tgt() (required by DEBUG_REQ()) */ +#include "../include/obd.h" + +/* __REQ_LAYOUT_USER__ */ +#endif +/* struct ptlrpc_request, lustre_msg* */ +#include "../include/lustre_req_layout.h" +#include "../include/lustre_acl.h" +#include "../include/lustre_debug.h" + +/* + * RQFs (see below) refer to two struct req_msg_field arrays describing the + * client request and server reply, respectively. + */ +/* empty set of fields... for suitable definition of emptiness. 
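To make the "pill"/format idea above concrete, here is a toy model in which a request format is nothing more than a named pair of field lists, one for the client message and one for the server reply, each beginning with the common ptlrpc body descriptor. All names and sizes below are made up for illustration; the real tables follow in this file.

#include <stdio.h>

struct msg_field {
        const char *name;
        int size;       /* made-up byte counts; -1 means variable length */
};

static const struct msg_field PTLRPC_BODY = { "ptlrpc_body", 32 };
static const struct msg_field MDT_BODY    = { "mdt_body",    64 };

static const struct msg_field *mdt_body_only[] = { &PTLRPC_BODY, &MDT_BODY };

struct req_format {
        const char *name;
        const struct msg_field **fields[2];     /* [0]=client, [1]=server */
        int nfields[2];
};

static const struct req_format RQF_EXAMPLE = {
        "example request", { mdt_body_only, mdt_body_only }, { 2, 2 }
};

int main(void)
{
        int side, i;

        for (side = 0; side < 2; side++) {
                printf("%s (%s):\n", RQF_EXAMPLE.name,
                       side ? "server" : "client");
                for (i = 0; i < RQF_EXAMPLE.nfields[side]; i++) {
                        const struct msg_field *f = RQF_EXAMPLE.fields[side][i];

                        if (f->size < 0)
                                printf("  %s (variable)\n", f->name);
                        else
                                printf("  %s (%d bytes)\n", f->name, f->size);
                }
        }
        return 0;
}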
*/ +static const struct req_msg_field *empty[] = { + &RMF_PTLRPC_BODY +}; + +static const struct req_msg_field *mgs_target_info_only[] = { + &RMF_PTLRPC_BODY, + &RMF_MGS_TARGET_INFO +}; + +static const struct req_msg_field *mgs_set_info[] = { + &RMF_PTLRPC_BODY, + &RMF_MGS_SEND_PARAM +}; + +static const struct req_msg_field *mgs_config_read_client[] = { + &RMF_PTLRPC_BODY, + &RMF_MGS_CONFIG_BODY +}; + +static const struct req_msg_field *mgs_config_read_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MGS_CONFIG_RES +}; + +static const struct req_msg_field *log_cancel_client[] = { + &RMF_PTLRPC_BODY, + &RMF_LOGCOOKIES +}; + +static const struct req_msg_field *mdt_body_only[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY +}; + +static const struct req_msg_field *mdt_body_capa[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_CAPA1 +}; + +static const struct req_msg_field *quotactl_only[] = { + &RMF_PTLRPC_BODY, + &RMF_OBD_QUOTACTL +}; + +static const struct req_msg_field *quota_body_only[] = { + &RMF_PTLRPC_BODY, + &RMF_QUOTA_BODY +}; + +static const struct req_msg_field *ldlm_intent_quota_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_QUOTA_BODY +}; + +static const struct req_msg_field *ldlm_intent_quota_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REP, + &RMF_DLM_LVB, + &RMF_QUOTA_BODY +}; + +static const struct req_msg_field *mdt_close_client[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_EPOCH, + &RMF_REC_REINT, + &RMF_CAPA1 +}; + +static const struct req_msg_field *mdt_release_close_client[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_EPOCH, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_CLOSE_DATA +}; + +static const struct req_msg_field *obd_statfs_server[] = { + &RMF_PTLRPC_BODY, + &RMF_OBD_STATFS +}; + +static const struct req_msg_field *seq_query_client[] = { + &RMF_PTLRPC_BODY, + &RMF_SEQ_OPC, + &RMF_SEQ_RANGE +}; + +static const struct req_msg_field *seq_query_server[] = { + &RMF_PTLRPC_BODY, + &RMF_SEQ_RANGE +}; + +static const struct req_msg_field *fld_query_client[] = { + &RMF_PTLRPC_BODY, + &RMF_FLD_OPC, + &RMF_FLD_MDFLD +}; + +static const struct req_msg_field *fld_query_server[] = { + &RMF_PTLRPC_BODY, + &RMF_FLD_MDFLD +}; + +static const struct req_msg_field *mds_getattr_name_client[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_CAPA1, + &RMF_NAME +}; + +static const struct req_msg_field *mds_reint_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT +}; + +static const struct req_msg_field *mds_reint_create_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_NAME +}; + +static const struct req_msg_field *mds_reint_create_slave_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_NAME, + &RMF_EADATA, + &RMF_DLM_REQ +}; + +static const struct req_msg_field *mds_reint_create_rmt_acl_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_NAME, + &RMF_EADATA, + &RMF_DLM_REQ +}; + +static const struct req_msg_field *mds_reint_create_sym_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_NAME, + &RMF_SYMTGT, + &RMF_DLM_REQ +}; + +static const struct req_msg_field *mds_reint_open_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_CAPA2, + &RMF_NAME, + &RMF_EADATA +}; + +static const struct req_msg_field *mds_reint_open_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL, + &RMF_CAPA1, + &RMF_CAPA2 +}; + +static const struct req_msg_field *mds_reint_unlink_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_NAME, + &RMF_DLM_REQ +}; + +static const struct 
req_msg_field *mds_reint_link_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_CAPA2, + &RMF_NAME, + &RMF_DLM_REQ +}; + +static const struct req_msg_field *mds_reint_rename_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_CAPA2, + &RMF_NAME, + &RMF_SYMTGT, + &RMF_DLM_REQ +}; + +static const struct req_msg_field *mds_last_unlink_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_LOGCOOKIES, + &RMF_CAPA1, + &RMF_CAPA2 +}; + +static const struct req_msg_field *mds_reint_setattr_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_MDT_EPOCH, + &RMF_EADATA, + &RMF_LOGCOOKIES, + &RMF_DLM_REQ +}; + +static const struct req_msg_field *mds_reint_setxattr_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_NAME, + &RMF_EADATA, + &RMF_DLM_REQ +}; + +static const struct req_msg_field *mdt_swap_layouts[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_SWAP_LAYOUTS, + &RMF_CAPA1, + &RMF_CAPA2, + &RMF_DLM_REQ +}; + +static const struct req_msg_field *obd_connect_client[] = { + &RMF_PTLRPC_BODY, + &RMF_TGTUUID, + &RMF_CLUUID, + &RMF_CONN, + &RMF_CONNECT_DATA +}; + +static const struct req_msg_field *obd_connect_server[] = { + &RMF_PTLRPC_BODY, + &RMF_CONNECT_DATA +}; + +static const struct req_msg_field *obd_set_info_client[] = { + &RMF_PTLRPC_BODY, + &RMF_SETINFO_KEY, + &RMF_SETINFO_VAL +}; + +static const struct req_msg_field *ost_grant_shrink_client[] = { + &RMF_PTLRPC_BODY, + &RMF_SETINFO_KEY, + &RMF_OST_BODY +}; + +static const struct req_msg_field *mds_getinfo_client[] = { + &RMF_PTLRPC_BODY, + &RMF_GETINFO_KEY, + &RMF_GETINFO_VALLEN +}; + +static const struct req_msg_field *mds_getinfo_server[] = { + &RMF_PTLRPC_BODY, + &RMF_GETINFO_VAL, +}; + +static const struct req_msg_field *ldlm_enqueue_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ +}; + +static const struct req_msg_field *ldlm_enqueue_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REP +}; + +static const struct req_msg_field *ldlm_enqueue_lvb_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REP, + &RMF_DLM_LVB +}; + +static const struct req_msg_field *ldlm_cp_callback_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_DLM_LVB +}; + +static const struct req_msg_field *ldlm_gl_callback_desc_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_DLM_GL_DESC +}; + +static const struct req_msg_field *ldlm_gl_callback_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_LVB +}; + +static const struct req_msg_field *ldlm_intent_basic_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, +}; + +static const struct req_msg_field *ldlm_intent_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_REC_REINT +}; + +static const struct req_msg_field *ldlm_intent_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REP, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL +}; + +static const struct req_msg_field *ldlm_intent_layout_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_LAYOUT_INTENT, + &RMF_EADATA /* for new layout to be set up */ +}; +static const struct req_msg_field *ldlm_intent_open_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REP, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL, + &RMF_CAPA1, + &RMF_CAPA2 +}; + +static const struct req_msg_field *ldlm_intent_getattr_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_MDT_BODY, /* coincides with mds_getattr_name_client[] */ + &RMF_CAPA1, + &RMF_NAME +}; + +static const struct req_msg_field *ldlm_intent_getattr_server[] = { + 
&RMF_PTLRPC_BODY, + &RMF_DLM_REP, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL, + &RMF_CAPA1 +}; + +static const struct req_msg_field *ldlm_intent_create_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_REC_REINT, /* coincides with mds_reint_create_client[] */ + &RMF_CAPA1, + &RMF_NAME, + &RMF_EADATA +}; + +static const struct req_msg_field *ldlm_intent_open_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_REC_REINT, /* coincides with mds_reint_open_client[] */ + &RMF_CAPA1, + &RMF_CAPA2, + &RMF_NAME, + &RMF_EADATA +}; + +static const struct req_msg_field *ldlm_intent_unlink_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_REC_REINT, /* coincides with mds_reint_unlink_client[] */ + &RMF_CAPA1, + &RMF_NAME +}; + +static const struct req_msg_field *ldlm_intent_getxattr_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_MDT_BODY, + &RMF_CAPA1, +}; + +static const struct req_msg_field *ldlm_intent_getxattr_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REP, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL, /* for req_capsule_extend/mdt_intent_policy */ + &RMF_EADATA, + &RMF_EAVALS, + &RMF_EAVALS_LENS +}; + +static const struct req_msg_field *mds_getxattr_client[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_CAPA1, + &RMF_NAME, + &RMF_EADATA +}; + +static const struct req_msg_field *mds_getxattr_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_EADATA +}; + +static const struct req_msg_field *mds_getattr_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL, + &RMF_CAPA1, + &RMF_CAPA2 +}; + +static const struct req_msg_field *mds_setattr_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL, + &RMF_CAPA1, + &RMF_CAPA2 +}; + +static const struct req_msg_field *mds_update_client[] = { + &RMF_PTLRPC_BODY, + &RMF_UPDATE, +}; + +static const struct req_msg_field *mds_update_server[] = { + &RMF_PTLRPC_BODY, + &RMF_UPDATE_REPLY, +}; + +static const struct req_msg_field *llog_origin_handle_create_client[] = { + &RMF_PTLRPC_BODY, + &RMF_LLOGD_BODY, + &RMF_NAME +}; + +static const struct req_msg_field *llogd_body_only[] = { + &RMF_PTLRPC_BODY, + &RMF_LLOGD_BODY +}; + +static const struct req_msg_field *llog_log_hdr_only[] = { + &RMF_PTLRPC_BODY, + &RMF_LLOG_LOG_HDR +}; + +static const struct req_msg_field *llogd_conn_body_only[] = { + &RMF_PTLRPC_BODY, + &RMF_LLOGD_CONN_BODY +}; + +static const struct req_msg_field *llog_origin_handle_next_block_server[] = { + &RMF_PTLRPC_BODY, + &RMF_LLOGD_BODY, + &RMF_EADATA +}; + +static const struct req_msg_field *obd_idx_read_client[] = { + &RMF_PTLRPC_BODY, + &RMF_IDX_INFO +}; + +static const struct req_msg_field *obd_idx_read_server[] = { + &RMF_PTLRPC_BODY, + &RMF_IDX_INFO +}; + +static const struct req_msg_field *ost_body_only[] = { + &RMF_PTLRPC_BODY, + &RMF_OST_BODY +}; + +static const struct req_msg_field *ost_body_capa[] = { + &RMF_PTLRPC_BODY, + &RMF_OST_BODY, + &RMF_CAPA1 +}; + +static const struct req_msg_field *ost_destroy_client[] = { + &RMF_PTLRPC_BODY, + &RMF_OST_BODY, + &RMF_DLM_REQ, + &RMF_CAPA1 +}; + + +static const struct req_msg_field *ost_brw_client[] = { + &RMF_PTLRPC_BODY, + &RMF_OST_BODY, + &RMF_OBD_IOOBJ, + &RMF_NIOBUF_REMOTE, + &RMF_CAPA1 +}; + +static const struct req_msg_field *ost_brw_read_server[] = { + &RMF_PTLRPC_BODY, + &RMF_OST_BODY +}; + +static const struct req_msg_field *ost_brw_write_server[] = { + &RMF_PTLRPC_BODY, + &RMF_OST_BODY, + &RMF_RCS +}; + +static const struct 
req_msg_field *ost_get_info_generic_server[] = { + &RMF_PTLRPC_BODY, + &RMF_GENERIC_DATA, +}; + +static const struct req_msg_field *ost_get_info_generic_client[] = { + &RMF_PTLRPC_BODY, + &RMF_SETINFO_KEY +}; + +static const struct req_msg_field *ost_get_last_id_server[] = { + &RMF_PTLRPC_BODY, + &RMF_OBD_ID +}; + +static const struct req_msg_field *ost_get_last_fid_server[] = { + &RMF_PTLRPC_BODY, + &RMF_FID, +}; + +static const struct req_msg_field *ost_get_fiemap_client[] = { + &RMF_PTLRPC_BODY, + &RMF_FIEMAP_KEY, + &RMF_FIEMAP_VAL +}; + +static const struct req_msg_field *ost_get_fiemap_server[] = { + &RMF_PTLRPC_BODY, + &RMF_FIEMAP_VAL +}; + +static const struct req_msg_field *mdt_hsm_progress[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDS_HSM_PROGRESS, +}; + +static const struct req_msg_field *mdt_hsm_ct_register[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDS_HSM_ARCHIVE, +}; + +static const struct req_msg_field *mdt_hsm_ct_unregister[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, +}; + +static const struct req_msg_field *mdt_hsm_action_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDS_HSM_CURRENT_ACTION, +}; + +static const struct req_msg_field *mdt_hsm_state_get_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_HSM_USER_STATE, +}; + +static const struct req_msg_field *mdt_hsm_state_set[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_CAPA1, + &RMF_HSM_STATE_SET, +}; + +static const struct req_msg_field *mdt_hsm_request[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDS_HSM_REQUEST, + &RMF_MDS_HSM_USER_ITEM, + &RMF_GENERIC_DATA, +}; + +static struct req_format *req_formats[] = { + &RQF_OBD_PING, + &RQF_OBD_SET_INFO, + &RQF_OBD_IDX_READ, + &RQF_SEC_CTX, + &RQF_MGS_TARGET_REG, + &RQF_MGS_SET_INFO, + &RQF_MGS_CONFIG_READ, + &RQF_SEQ_QUERY, + &RQF_FLD_QUERY, + &RQF_MDS_CONNECT, + &RQF_MDS_DISCONNECT, + &RQF_MDS_GET_INFO, + &RQF_MDS_GETSTATUS, + &RQF_MDS_STATFS, + &RQF_MDS_GETATTR, + &RQF_MDS_GETATTR_NAME, + &RQF_MDS_GETXATTR, + &RQF_MDS_SYNC, + &RQF_MDS_CLOSE, + &RQF_MDS_RELEASE_CLOSE, + &RQF_MDS_PIN, + &RQF_MDS_UNPIN, + &RQF_MDS_READPAGE, + &RQF_MDS_WRITEPAGE, + &RQF_MDS_IS_SUBDIR, + &RQF_MDS_DONE_WRITING, + &RQF_MDS_REINT, + &RQF_MDS_REINT_CREATE, + &RQF_MDS_REINT_CREATE_RMT_ACL, + &RQF_MDS_REINT_CREATE_SLAVE, + &RQF_MDS_REINT_CREATE_SYM, + &RQF_MDS_REINT_OPEN, + &RQF_MDS_REINT_UNLINK, + &RQF_MDS_REINT_LINK, + &RQF_MDS_REINT_RENAME, + &RQF_MDS_REINT_SETATTR, + &RQF_MDS_REINT_SETXATTR, + &RQF_MDS_QUOTACHECK, + &RQF_MDS_QUOTACTL, + &RQF_MDS_HSM_PROGRESS, + &RQF_MDS_HSM_CT_REGISTER, + &RQF_MDS_HSM_CT_UNREGISTER, + &RQF_MDS_HSM_STATE_GET, + &RQF_MDS_HSM_STATE_SET, + &RQF_MDS_HSM_ACTION, + &RQF_MDS_HSM_REQUEST, + &RQF_MDS_SWAP_LAYOUTS, + &RQF_UPDATE_OBJ, + &RQF_QC_CALLBACK, + &RQF_OST_CONNECT, + &RQF_OST_DISCONNECT, + &RQF_OST_QUOTACHECK, + &RQF_OST_QUOTACTL, + &RQF_OST_GETATTR, + &RQF_OST_SETATTR, + &RQF_OST_CREATE, + &RQF_OST_PUNCH, + &RQF_OST_SYNC, + &RQF_OST_DESTROY, + &RQF_OST_BRW_READ, + &RQF_OST_BRW_WRITE, + &RQF_OST_STATFS, + &RQF_OST_SET_GRANT_INFO, + &RQF_OST_GET_INFO_GENERIC, + &RQF_OST_GET_INFO_LAST_ID, + &RQF_OST_GET_INFO_LAST_FID, + &RQF_OST_SET_INFO_LAST_FID, + &RQF_OST_GET_INFO_FIEMAP, + &RQF_LDLM_ENQUEUE, + &RQF_LDLM_ENQUEUE_LVB, + &RQF_LDLM_CONVERT, + &RQF_LDLM_CANCEL, + &RQF_LDLM_CALLBACK, + &RQF_LDLM_CP_CALLBACK, + &RQF_LDLM_BL_CALLBACK, + &RQF_LDLM_GL_CALLBACK, + &RQF_LDLM_GL_DESC_CALLBACK, + &RQF_LDLM_INTENT, + &RQF_LDLM_INTENT_BASIC, + &RQF_LDLM_INTENT_LAYOUT, + &RQF_LDLM_INTENT_GETATTR, + &RQF_LDLM_INTENT_OPEN, + 
&RQF_LDLM_INTENT_CREATE, + &RQF_LDLM_INTENT_UNLINK, + &RQF_LDLM_INTENT_GETXATTR, + &RQF_LDLM_INTENT_QUOTA, + &RQF_QUOTA_DQACQ, + &RQF_LOG_CANCEL, + &RQF_LLOG_ORIGIN_HANDLE_CREATE, + &RQF_LLOG_ORIGIN_HANDLE_DESTROY, + &RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK, + &RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK, + &RQF_LLOG_ORIGIN_HANDLE_READ_HEADER, + &RQF_LLOG_ORIGIN_CONNECT, + &RQF_CONNECT, +}; + +struct req_msg_field { + const __u32 rmf_flags; + const char *rmf_name; + /** + * Field length. (-1) means "variable length". If the + * \a RMF_F_STRUCT_ARRAY flag is set the field is also variable-length, + * but the actual size must be a whole multiple of \a rmf_size. + */ + const int rmf_size; + void (*rmf_swabber)(void *); + void (*rmf_dumper)(void *); + int rmf_offset[ARRAY_SIZE(req_formats)][RCL_NR]; +}; + +enum rmf_flags { + /** + * The field is a string, must be NUL-terminated. + */ + RMF_F_STRING = 1 << 0, + /** + * The field's buffer size need not match the declared \a rmf_size. + */ + RMF_F_NO_SIZE_CHECK = 1 << 1, + /** + * The field's buffer size must be a whole multiple of the declared \a + * rmf_size and the \a rmf_swabber function must work on the declared \a + * rmf_size worth of bytes. + */ + RMF_F_STRUCT_ARRAY = 1 << 2 +}; + +struct req_capsule; + +/* + * Request fields. + */ +#define DEFINE_MSGF(name, flags, size, swabber, dumper) { \ + .rmf_name = (name), \ + .rmf_flags = (flags), \ + .rmf_size = (size), \ + .rmf_swabber = (void (*)(void *))(swabber), \ + .rmf_dumper = (void (*)(void *))(dumper) \ +} + +struct req_msg_field RMF_GENERIC_DATA = + DEFINE_MSGF("generic_data", 0, + -1, NULL, NULL); +EXPORT_SYMBOL(RMF_GENERIC_DATA); + +struct req_msg_field RMF_MGS_TARGET_INFO = + DEFINE_MSGF("mgs_target_info", 0, + sizeof(struct mgs_target_info), + lustre_swab_mgs_target_info, NULL); +EXPORT_SYMBOL(RMF_MGS_TARGET_INFO); + +struct req_msg_field RMF_MGS_SEND_PARAM = + DEFINE_MSGF("mgs_send_param", 0, + sizeof(struct mgs_send_param), + NULL, NULL); +EXPORT_SYMBOL(RMF_MGS_SEND_PARAM); + +struct req_msg_field RMF_MGS_CONFIG_BODY = + DEFINE_MSGF("mgs_config_read request", 0, + sizeof(struct mgs_config_body), + lustre_swab_mgs_config_body, NULL); +EXPORT_SYMBOL(RMF_MGS_CONFIG_BODY); + +struct req_msg_field RMF_MGS_CONFIG_RES = + DEFINE_MSGF("mgs_config_read reply ", 0, + sizeof(struct mgs_config_res), + lustre_swab_mgs_config_res, NULL); +EXPORT_SYMBOL(RMF_MGS_CONFIG_RES); + +struct req_msg_field RMF_U32 = + DEFINE_MSGF("generic u32", 0, + sizeof(__u32), lustre_swab_generic_32s, NULL); +EXPORT_SYMBOL(RMF_U32); + +struct req_msg_field RMF_SETINFO_VAL = + DEFINE_MSGF("setinfo_val", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_SETINFO_VAL); + +struct req_msg_field RMF_GETINFO_KEY = + DEFINE_MSGF("getinfo_key", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_GETINFO_KEY); + +struct req_msg_field RMF_GETINFO_VALLEN = + DEFINE_MSGF("getinfo_vallen", 0, + sizeof(__u32), lustre_swab_generic_32s, NULL); +EXPORT_SYMBOL(RMF_GETINFO_VALLEN); + +struct req_msg_field RMF_GETINFO_VAL = + DEFINE_MSGF("getinfo_val", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_GETINFO_VAL); + +struct req_msg_field RMF_SEQ_OPC = + DEFINE_MSGF("seq_query_opc", 0, + sizeof(__u32), lustre_swab_generic_32s, NULL); +EXPORT_SYMBOL(RMF_SEQ_OPC); + +struct req_msg_field RMF_SEQ_RANGE = + DEFINE_MSGF("seq_query_range", 0, + sizeof(struct lu_seq_range), + lustre_swab_lu_seq_range, NULL); +EXPORT_SYMBOL(RMF_SEQ_RANGE); + +struct req_msg_field RMF_FLD_OPC = + DEFINE_MSGF("fld_query_opc", 0, + sizeof(__u32), lustre_swab_generic_32s, NULL); +EXPORT_SYMBOL(RMF_FLD_OPC); + 
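The DEFINE_MSGF() definitions that follow all rely on the same designated-initializer trick: one macro fills a descriptor with a name, a size and an optional byte-swap callback. A self-contained miniature of that pattern, with hypothetical names that are not part of the imported code:

#include <stdio.h>

struct field_desc {
        const char *name;
        int size;               /* -1 means variable length */
        void (*swab)(void *);   /* optional byte-swap hook */
};

#define DEFINE_FIELD(n, s, sw) { .name = (n), .size = (s), .swab = (sw) }

/* Byte-swap a 32-bit value in place, the role rmf_swabber plays above. */
static void swab_u32(void *p)
{
        unsigned int v = *(unsigned int *)p;

        *(unsigned int *)p = ((v & 0x000000ffu) << 24) |
                             ((v & 0x0000ff00u) << 8)  |
                             ((v & 0x00ff0000u) >> 8)  |
                             ((v & 0xff000000u) >> 24);
}

static struct field_desc GENERIC_U32 = DEFINE_FIELD("generic u32", 4, swab_u32);
static struct field_desc NAME_STRING = DEFINE_FIELD("name", -1, NULL);

int main(void)
{
        unsigned int x = 0x11223344u;

        if (GENERIC_U32.swab)
                GENERIC_U32.swab(&x);
        printf("%s swabbed: 0x%08x\n", GENERIC_U32.name, x);
        printf("%s: variable length (%d)\n", NAME_STRING.name,
               NAME_STRING.size);
        return 0;
}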
+struct req_msg_field RMF_FLD_MDFLD = + DEFINE_MSGF("fld_query_mdfld", 0, + sizeof(struct lu_seq_range), + lustre_swab_lu_seq_range, NULL); +EXPORT_SYMBOL(RMF_FLD_MDFLD); + +struct req_msg_field RMF_MDT_BODY = + DEFINE_MSGF("mdt_body", 0, + sizeof(struct mdt_body), lustre_swab_mdt_body, NULL); +EXPORT_SYMBOL(RMF_MDT_BODY); + +struct req_msg_field RMF_OBD_QUOTACTL = + DEFINE_MSGF("obd_quotactl", 0, + sizeof(struct obd_quotactl), + lustre_swab_obd_quotactl, NULL); +EXPORT_SYMBOL(RMF_OBD_QUOTACTL); + +struct req_msg_field RMF_QUOTA_BODY = + DEFINE_MSGF("quota_body", 0, + sizeof(struct quota_body), lustre_swab_quota_body, NULL); +EXPORT_SYMBOL(RMF_QUOTA_BODY); + +struct req_msg_field RMF_MDT_EPOCH = + DEFINE_MSGF("mdt_ioepoch", 0, + sizeof(struct mdt_ioepoch), lustre_swab_mdt_ioepoch, NULL); +EXPORT_SYMBOL(RMF_MDT_EPOCH); + +struct req_msg_field RMF_PTLRPC_BODY = + DEFINE_MSGF("ptlrpc_body", 0, + sizeof(struct ptlrpc_body), lustre_swab_ptlrpc_body, NULL); +EXPORT_SYMBOL(RMF_PTLRPC_BODY); + +struct req_msg_field RMF_CLOSE_DATA = + DEFINE_MSGF("data_version", 0, + sizeof(struct close_data), lustre_swab_close_data, NULL); +EXPORT_SYMBOL(RMF_CLOSE_DATA); + +struct req_msg_field RMF_OBD_STATFS = + DEFINE_MSGF("obd_statfs", 0, + sizeof(struct obd_statfs), lustre_swab_obd_statfs, NULL); +EXPORT_SYMBOL(RMF_OBD_STATFS); + +struct req_msg_field RMF_SETINFO_KEY = + DEFINE_MSGF("setinfo_key", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_SETINFO_KEY); + +struct req_msg_field RMF_NAME = + DEFINE_MSGF("name", RMF_F_STRING, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_NAME); + +struct req_msg_field RMF_SYMTGT = + DEFINE_MSGF("symtgt", RMF_F_STRING, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_SYMTGT); + +struct req_msg_field RMF_TGTUUID = + DEFINE_MSGF("tgtuuid", RMF_F_STRING, sizeof(struct obd_uuid) - 1, NULL, + NULL); +EXPORT_SYMBOL(RMF_TGTUUID); + +struct req_msg_field RMF_CLUUID = + DEFINE_MSGF("cluuid", RMF_F_STRING, sizeof(struct obd_uuid) - 1, NULL, + NULL); +EXPORT_SYMBOL(RMF_CLUUID); + +struct req_msg_field RMF_STRING = + DEFINE_MSGF("string", RMF_F_STRING, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_STRING); + +struct req_msg_field RMF_LLOGD_BODY = + DEFINE_MSGF("llogd_body", 0, + sizeof(struct llogd_body), lustre_swab_llogd_body, NULL); +EXPORT_SYMBOL(RMF_LLOGD_BODY); + +struct req_msg_field RMF_LLOG_LOG_HDR = + DEFINE_MSGF("llog_log_hdr", 0, + sizeof(struct llog_log_hdr), lustre_swab_llog_hdr, NULL); +EXPORT_SYMBOL(RMF_LLOG_LOG_HDR); + +struct req_msg_field RMF_LLOGD_CONN_BODY = + DEFINE_MSGF("llogd_conn_body", 0, + sizeof(struct llogd_conn_body), + lustre_swab_llogd_conn_body, NULL); +EXPORT_SYMBOL(RMF_LLOGD_CONN_BODY); + +/* + * connection handle received in MDS_CONNECT request. + * + * No swabbing needed because struct lustre_handle contains only a 64-bit cookie + * that the client does not interpret at all. 
+ */ +struct req_msg_field RMF_CONN = + DEFINE_MSGF("conn", 0, sizeof(struct lustre_handle), NULL, NULL); +EXPORT_SYMBOL(RMF_CONN); + +struct req_msg_field RMF_CONNECT_DATA = + DEFINE_MSGF("cdata", + RMF_F_NO_SIZE_CHECK /* we allow extra space for interop */, + sizeof(struct obd_connect_data), + lustre_swab_connect, NULL); +EXPORT_SYMBOL(RMF_CONNECT_DATA); + +struct req_msg_field RMF_DLM_REQ = + DEFINE_MSGF("dlm_req", RMF_F_NO_SIZE_CHECK /* ldlm_request_bufsize */, + sizeof(struct ldlm_request), + lustre_swab_ldlm_request, NULL); +EXPORT_SYMBOL(RMF_DLM_REQ); + +struct req_msg_field RMF_DLM_REP = + DEFINE_MSGF("dlm_rep", 0, + sizeof(struct ldlm_reply), lustre_swab_ldlm_reply, NULL); +EXPORT_SYMBOL(RMF_DLM_REP); + +struct req_msg_field RMF_LDLM_INTENT = + DEFINE_MSGF("ldlm_intent", 0, + sizeof(struct ldlm_intent), lustre_swab_ldlm_intent, NULL); +EXPORT_SYMBOL(RMF_LDLM_INTENT); + +struct req_msg_field RMF_DLM_LVB = + DEFINE_MSGF("dlm_lvb", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_DLM_LVB); + +struct req_msg_field RMF_DLM_GL_DESC = + DEFINE_MSGF("dlm_gl_desc", 0, sizeof(union ldlm_gl_desc), + lustre_swab_gl_desc, NULL); +EXPORT_SYMBOL(RMF_DLM_GL_DESC); + +struct req_msg_field RMF_MDT_MD = + DEFINE_MSGF("mdt_md", RMF_F_NO_SIZE_CHECK, MIN_MD_SIZE, NULL, NULL); +EXPORT_SYMBOL(RMF_MDT_MD); + +struct req_msg_field RMF_REC_REINT = + DEFINE_MSGF("rec_reint", 0, sizeof(struct mdt_rec_reint), + lustre_swab_mdt_rec_reint, NULL); +EXPORT_SYMBOL(RMF_REC_REINT); + +/* FIXME: this length should be defined as a macro */ +struct req_msg_field RMF_EADATA = DEFINE_MSGF("eadata", 0, -1, + NULL, NULL); +EXPORT_SYMBOL(RMF_EADATA); + +struct req_msg_field RMF_EAVALS = DEFINE_MSGF("eavals", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_EAVALS); + +struct req_msg_field RMF_ACL = + DEFINE_MSGF("acl", RMF_F_NO_SIZE_CHECK, + LUSTRE_POSIX_ACL_MAX_SIZE, NULL, NULL); +EXPORT_SYMBOL(RMF_ACL); + +/* FIXME: this should be made to use RMF_F_STRUCT_ARRAY */ +struct req_msg_field RMF_LOGCOOKIES = + DEFINE_MSGF("logcookies", RMF_F_NO_SIZE_CHECK /* multiple cookies */, + sizeof(struct llog_cookie), NULL, NULL); +EXPORT_SYMBOL(RMF_LOGCOOKIES); + +struct req_msg_field RMF_CAPA1 = + DEFINE_MSGF("capa", 0, sizeof(struct lustre_capa), + lustre_swab_lustre_capa, NULL); +EXPORT_SYMBOL(RMF_CAPA1); + +struct req_msg_field RMF_CAPA2 = + DEFINE_MSGF("capa", 0, sizeof(struct lustre_capa), + lustre_swab_lustre_capa, NULL); +EXPORT_SYMBOL(RMF_CAPA2); + +struct req_msg_field RMF_LAYOUT_INTENT = + DEFINE_MSGF("layout_intent", 0, + sizeof(struct layout_intent), lustre_swab_layout_intent, + NULL); +EXPORT_SYMBOL(RMF_LAYOUT_INTENT); + +/* + * OST request field. 
+ */ +struct req_msg_field RMF_OST_BODY = + DEFINE_MSGF("ost_body", 0, + sizeof(struct ost_body), lustre_swab_ost_body, dump_ost_body); +EXPORT_SYMBOL(RMF_OST_BODY); + +struct req_msg_field RMF_OBD_IOOBJ = + DEFINE_MSGF("obd_ioobj", RMF_F_STRUCT_ARRAY, + sizeof(struct obd_ioobj), lustre_swab_obd_ioobj, dump_ioo); +EXPORT_SYMBOL(RMF_OBD_IOOBJ); + +struct req_msg_field RMF_NIOBUF_REMOTE = + DEFINE_MSGF("niobuf_remote", RMF_F_STRUCT_ARRAY, + sizeof(struct niobuf_remote), lustre_swab_niobuf_remote, + dump_rniobuf); +EXPORT_SYMBOL(RMF_NIOBUF_REMOTE); + +struct req_msg_field RMF_RCS = + DEFINE_MSGF("niobuf_remote", RMF_F_STRUCT_ARRAY, sizeof(__u32), + lustre_swab_generic_32s, dump_rcs); +EXPORT_SYMBOL(RMF_RCS); + +struct req_msg_field RMF_EAVALS_LENS = + DEFINE_MSGF("eavals_lens", RMF_F_STRUCT_ARRAY, sizeof(__u32), + lustre_swab_generic_32s, NULL); +EXPORT_SYMBOL(RMF_EAVALS_LENS); + +struct req_msg_field RMF_OBD_ID = + DEFINE_MSGF("u64", 0, + sizeof(u64), lustre_swab_ost_last_id, NULL); +EXPORT_SYMBOL(RMF_OBD_ID); + +struct req_msg_field RMF_FID = + DEFINE_MSGF("fid", 0, + sizeof(struct lu_fid), lustre_swab_lu_fid, NULL); +EXPORT_SYMBOL(RMF_FID); + +struct req_msg_field RMF_OST_ID = + DEFINE_MSGF("ost_id", 0, + sizeof(struct ost_id), lustre_swab_ost_id, NULL); +EXPORT_SYMBOL(RMF_OST_ID); + +struct req_msg_field RMF_FIEMAP_KEY = + DEFINE_MSGF("fiemap", 0, sizeof(struct ll_fiemap_info_key), + lustre_swab_fiemap, NULL); +EXPORT_SYMBOL(RMF_FIEMAP_KEY); + +struct req_msg_field RMF_FIEMAP_VAL = + DEFINE_MSGF("fiemap", 0, -1, lustre_swab_fiemap, NULL); +EXPORT_SYMBOL(RMF_FIEMAP_VAL); + +struct req_msg_field RMF_IDX_INFO = + DEFINE_MSGF("idx_info", 0, sizeof(struct idx_info), + lustre_swab_idx_info, NULL); +EXPORT_SYMBOL(RMF_IDX_INFO); +struct req_msg_field RMF_HSM_USER_STATE = + DEFINE_MSGF("hsm_user_state", 0, sizeof(struct hsm_user_state), + lustre_swab_hsm_user_state, NULL); +EXPORT_SYMBOL(RMF_HSM_USER_STATE); + +struct req_msg_field RMF_HSM_STATE_SET = + DEFINE_MSGF("hsm_state_set", 0, sizeof(struct hsm_state_set), + lustre_swab_hsm_state_set, NULL); +EXPORT_SYMBOL(RMF_HSM_STATE_SET); + +struct req_msg_field RMF_MDS_HSM_PROGRESS = + DEFINE_MSGF("hsm_progress", 0, sizeof(struct hsm_progress_kernel), + lustre_swab_hsm_progress_kernel, NULL); +EXPORT_SYMBOL(RMF_MDS_HSM_PROGRESS); + +struct req_msg_field RMF_MDS_HSM_CURRENT_ACTION = + DEFINE_MSGF("hsm_current_action", 0, sizeof(struct hsm_current_action), + lustre_swab_hsm_current_action, NULL); +EXPORT_SYMBOL(RMF_MDS_HSM_CURRENT_ACTION); + +struct req_msg_field RMF_MDS_HSM_USER_ITEM = + DEFINE_MSGF("hsm_user_item", RMF_F_STRUCT_ARRAY, + sizeof(struct hsm_user_item), lustre_swab_hsm_user_item, + NULL); +EXPORT_SYMBOL(RMF_MDS_HSM_USER_ITEM); + +struct req_msg_field RMF_MDS_HSM_ARCHIVE = + DEFINE_MSGF("hsm_archive", 0, + sizeof(__u32), lustre_swab_generic_32s, NULL); +EXPORT_SYMBOL(RMF_MDS_HSM_ARCHIVE); + +struct req_msg_field RMF_MDS_HSM_REQUEST = + DEFINE_MSGF("hsm_request", 0, sizeof(struct hsm_request), + lustre_swab_hsm_request, NULL); +EXPORT_SYMBOL(RMF_MDS_HSM_REQUEST); + +struct req_msg_field RMF_UPDATE = DEFINE_MSGF("update", 0, -1, + lustre_swab_update_buf, NULL); +EXPORT_SYMBOL(RMF_UPDATE); + +struct req_msg_field RMF_UPDATE_REPLY = DEFINE_MSGF("update_reply", 0, -1, + lustre_swab_update_reply_buf, + NULL); +EXPORT_SYMBOL(RMF_UPDATE_REPLY); + +struct req_msg_field RMF_SWAP_LAYOUTS = + DEFINE_MSGF("swap_layouts", 0, sizeof(struct mdc_swap_layouts), + lustre_swab_swap_layouts, NULL); +EXPORT_SYMBOL(RMF_SWAP_LAYOUTS); +/* + * Request formats. 
+ */ + +struct req_format { + const char *rf_name; + int rf_idx; + struct { + int nr; + const struct req_msg_field **d; + } rf_fields[RCL_NR]; +}; + +#define DEFINE_REQ_FMT(name, client, client_nr, server, server_nr) { \ + .rf_name = name, \ + .rf_fields = { \ + [RCL_CLIENT] = { \ + .nr = client_nr, \ + .d = client \ + }, \ + [RCL_SERVER] = { \ + .nr = server_nr, \ + .d = server \ + } \ + } \ +} + +#define DEFINE_REQ_FMT0(name, client, server) \ +DEFINE_REQ_FMT(name, client, ARRAY_SIZE(client), server, ARRAY_SIZE(server)) + +struct req_format RQF_OBD_PING = + DEFINE_REQ_FMT0("OBD_PING", empty, empty); +EXPORT_SYMBOL(RQF_OBD_PING); + +struct req_format RQF_OBD_SET_INFO = + DEFINE_REQ_FMT0("OBD_SET_INFO", obd_set_info_client, empty); +EXPORT_SYMBOL(RQF_OBD_SET_INFO); + +/* Read index file through the network */ +struct req_format RQF_OBD_IDX_READ = + DEFINE_REQ_FMT0("OBD_IDX_READ", + obd_idx_read_client, obd_idx_read_server); +EXPORT_SYMBOL(RQF_OBD_IDX_READ); + +struct req_format RQF_SEC_CTX = + DEFINE_REQ_FMT0("SEC_CTX", empty, empty); +EXPORT_SYMBOL(RQF_SEC_CTX); + +struct req_format RQF_MGS_TARGET_REG = + DEFINE_REQ_FMT0("MGS_TARGET_REG", mgs_target_info_only, + mgs_target_info_only); +EXPORT_SYMBOL(RQF_MGS_TARGET_REG); + +struct req_format RQF_MGS_SET_INFO = + DEFINE_REQ_FMT0("MGS_SET_INFO", mgs_set_info, + mgs_set_info); +EXPORT_SYMBOL(RQF_MGS_SET_INFO); + +struct req_format RQF_MGS_CONFIG_READ = + DEFINE_REQ_FMT0("MGS_CONFIG_READ", mgs_config_read_client, + mgs_config_read_server); +EXPORT_SYMBOL(RQF_MGS_CONFIG_READ); + +struct req_format RQF_SEQ_QUERY = + DEFINE_REQ_FMT0("SEQ_QUERY", seq_query_client, seq_query_server); +EXPORT_SYMBOL(RQF_SEQ_QUERY); + +struct req_format RQF_FLD_QUERY = + DEFINE_REQ_FMT0("FLD_QUERY", fld_query_client, fld_query_server); +EXPORT_SYMBOL(RQF_FLD_QUERY); + +struct req_format RQF_LOG_CANCEL = + DEFINE_REQ_FMT0("OBD_LOG_CANCEL", log_cancel_client, empty); +EXPORT_SYMBOL(RQF_LOG_CANCEL); + +struct req_format RQF_MDS_QUOTACHECK = + DEFINE_REQ_FMT0("MDS_QUOTACHECK", quotactl_only, empty); +EXPORT_SYMBOL(RQF_MDS_QUOTACHECK); + +struct req_format RQF_OST_QUOTACHECK = + DEFINE_REQ_FMT0("OST_QUOTACHECK", quotactl_only, empty); +EXPORT_SYMBOL(RQF_OST_QUOTACHECK); + +struct req_format RQF_MDS_QUOTACTL = + DEFINE_REQ_FMT0("MDS_QUOTACTL", quotactl_only, quotactl_only); +EXPORT_SYMBOL(RQF_MDS_QUOTACTL); + +struct req_format RQF_OST_QUOTACTL = + DEFINE_REQ_FMT0("OST_QUOTACTL", quotactl_only, quotactl_only); +EXPORT_SYMBOL(RQF_OST_QUOTACTL); + +struct req_format RQF_QC_CALLBACK = + DEFINE_REQ_FMT0("QC_CALLBACK", quotactl_only, empty); +EXPORT_SYMBOL(RQF_QC_CALLBACK); + +struct req_format RQF_QUOTA_DQACQ = + DEFINE_REQ_FMT0("QUOTA_DQACQ", quota_body_only, quota_body_only); +EXPORT_SYMBOL(RQF_QUOTA_DQACQ); + +struct req_format RQF_LDLM_INTENT_QUOTA = + DEFINE_REQ_FMT0("LDLM_INTENT_QUOTA", + ldlm_intent_quota_client, + ldlm_intent_quota_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT_QUOTA); + +struct req_format RQF_MDS_GETSTATUS = + DEFINE_REQ_FMT0("MDS_GETSTATUS", mdt_body_only, mdt_body_capa); +EXPORT_SYMBOL(RQF_MDS_GETSTATUS); + +struct req_format RQF_MDS_STATFS = + DEFINE_REQ_FMT0("MDS_STATFS", empty, obd_statfs_server); +EXPORT_SYMBOL(RQF_MDS_STATFS); + +struct req_format RQF_MDS_SYNC = + DEFINE_REQ_FMT0("MDS_SYNC", mdt_body_capa, mdt_body_only); +EXPORT_SYMBOL(RQF_MDS_SYNC); + +struct req_format RQF_MDS_GETATTR = + DEFINE_REQ_FMT0("MDS_GETATTR", mdt_body_capa, mds_getattr_server); +EXPORT_SYMBOL(RQF_MDS_GETATTR); + +struct req_format RQF_MDS_GETXATTR = + 
DEFINE_REQ_FMT0("MDS_GETXATTR", + mds_getxattr_client, mds_getxattr_server); +EXPORT_SYMBOL(RQF_MDS_GETXATTR); + +struct req_format RQF_MDS_GETATTR_NAME = + DEFINE_REQ_FMT0("MDS_GETATTR_NAME", + mds_getattr_name_client, mds_getattr_server); +EXPORT_SYMBOL(RQF_MDS_GETATTR_NAME); + +struct req_format RQF_MDS_REINT = + DEFINE_REQ_FMT0("MDS_REINT", mds_reint_client, mdt_body_only); +EXPORT_SYMBOL(RQF_MDS_REINT); + +struct req_format RQF_MDS_REINT_CREATE = + DEFINE_REQ_FMT0("MDS_REINT_CREATE", + mds_reint_create_client, mdt_body_capa); +EXPORT_SYMBOL(RQF_MDS_REINT_CREATE); + +struct req_format RQF_MDS_REINT_CREATE_RMT_ACL = + DEFINE_REQ_FMT0("MDS_REINT_CREATE_RMT_ACL", + mds_reint_create_rmt_acl_client, mdt_body_capa); +EXPORT_SYMBOL(RQF_MDS_REINT_CREATE_RMT_ACL); + +struct req_format RQF_MDS_REINT_CREATE_SLAVE = + DEFINE_REQ_FMT0("MDS_REINT_CREATE_EA", + mds_reint_create_slave_client, mdt_body_capa); +EXPORT_SYMBOL(RQF_MDS_REINT_CREATE_SLAVE); + +struct req_format RQF_MDS_REINT_CREATE_SYM = + DEFINE_REQ_FMT0("MDS_REINT_CREATE_SYM", + mds_reint_create_sym_client, mdt_body_capa); +EXPORT_SYMBOL(RQF_MDS_REINT_CREATE_SYM); + +struct req_format RQF_MDS_REINT_OPEN = + DEFINE_REQ_FMT0("MDS_REINT_OPEN", + mds_reint_open_client, mds_reint_open_server); +EXPORT_SYMBOL(RQF_MDS_REINT_OPEN); + +struct req_format RQF_MDS_REINT_UNLINK = + DEFINE_REQ_FMT0("MDS_REINT_UNLINK", mds_reint_unlink_client, + mds_last_unlink_server); +EXPORT_SYMBOL(RQF_MDS_REINT_UNLINK); + +struct req_format RQF_MDS_REINT_LINK = + DEFINE_REQ_FMT0("MDS_REINT_LINK", + mds_reint_link_client, mdt_body_only); +EXPORT_SYMBOL(RQF_MDS_REINT_LINK); + +struct req_format RQF_MDS_REINT_RENAME = + DEFINE_REQ_FMT0("MDS_REINT_RENAME", mds_reint_rename_client, + mds_last_unlink_server); +EXPORT_SYMBOL(RQF_MDS_REINT_RENAME); + +struct req_format RQF_MDS_REINT_SETATTR = + DEFINE_REQ_FMT0("MDS_REINT_SETATTR", + mds_reint_setattr_client, mds_setattr_server); +EXPORT_SYMBOL(RQF_MDS_REINT_SETATTR); + +struct req_format RQF_MDS_REINT_SETXATTR = + DEFINE_REQ_FMT0("MDS_REINT_SETXATTR", + mds_reint_setxattr_client, mdt_body_only); +EXPORT_SYMBOL(RQF_MDS_REINT_SETXATTR); + +struct req_format RQF_MDS_CONNECT = + DEFINE_REQ_FMT0("MDS_CONNECT", + obd_connect_client, obd_connect_server); +EXPORT_SYMBOL(RQF_MDS_CONNECT); + +struct req_format RQF_MDS_DISCONNECT = + DEFINE_REQ_FMT0("MDS_DISCONNECT", empty, empty); +EXPORT_SYMBOL(RQF_MDS_DISCONNECT); + +struct req_format RQF_MDS_GET_INFO = + DEFINE_REQ_FMT0("MDS_GET_INFO", mds_getinfo_client, + mds_getinfo_server); +EXPORT_SYMBOL(RQF_MDS_GET_INFO); + +struct req_format RQF_UPDATE_OBJ = + DEFINE_REQ_FMT0("OBJECT_UPDATE_OBJ", mds_update_client, + mds_update_server); +EXPORT_SYMBOL(RQF_UPDATE_OBJ); + +struct req_format RQF_LDLM_ENQUEUE = + DEFINE_REQ_FMT0("LDLM_ENQUEUE", + ldlm_enqueue_client, ldlm_enqueue_lvb_server); +EXPORT_SYMBOL(RQF_LDLM_ENQUEUE); + +struct req_format RQF_LDLM_ENQUEUE_LVB = + DEFINE_REQ_FMT0("LDLM_ENQUEUE_LVB", + ldlm_enqueue_client, ldlm_enqueue_lvb_server); +EXPORT_SYMBOL(RQF_LDLM_ENQUEUE_LVB); + +struct req_format RQF_LDLM_CONVERT = + DEFINE_REQ_FMT0("LDLM_CONVERT", + ldlm_enqueue_client, ldlm_enqueue_server); +EXPORT_SYMBOL(RQF_LDLM_CONVERT); + +struct req_format RQF_LDLM_CANCEL = + DEFINE_REQ_FMT0("LDLM_CANCEL", ldlm_enqueue_client, empty); +EXPORT_SYMBOL(RQF_LDLM_CANCEL); + +struct req_format RQF_LDLM_CALLBACK = + DEFINE_REQ_FMT0("LDLM_CALLBACK", ldlm_enqueue_client, empty); +EXPORT_SYMBOL(RQF_LDLM_CALLBACK); + +struct req_format RQF_LDLM_CP_CALLBACK = + DEFINE_REQ_FMT0("LDLM_CP_CALLBACK", 
ldlm_cp_callback_client, empty); +EXPORT_SYMBOL(RQF_LDLM_CP_CALLBACK); + +struct req_format RQF_LDLM_BL_CALLBACK = + DEFINE_REQ_FMT0("LDLM_BL_CALLBACK", ldlm_enqueue_client, empty); +EXPORT_SYMBOL(RQF_LDLM_BL_CALLBACK); + +struct req_format RQF_LDLM_GL_CALLBACK = + DEFINE_REQ_FMT0("LDLM_GL_CALLBACK", ldlm_enqueue_client, + ldlm_gl_callback_server); +EXPORT_SYMBOL(RQF_LDLM_GL_CALLBACK); + +struct req_format RQF_LDLM_GL_DESC_CALLBACK = + DEFINE_REQ_FMT0("LDLM_GL_CALLBACK", ldlm_gl_callback_desc_client, + ldlm_gl_callback_server); +EXPORT_SYMBOL(RQF_LDLM_GL_DESC_CALLBACK); + +struct req_format RQF_LDLM_INTENT_BASIC = + DEFINE_REQ_FMT0("LDLM_INTENT_BASIC", + ldlm_intent_basic_client, ldlm_enqueue_lvb_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT_BASIC); + +struct req_format RQF_LDLM_INTENT = + DEFINE_REQ_FMT0("LDLM_INTENT", + ldlm_intent_client, ldlm_intent_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT); + +struct req_format RQF_LDLM_INTENT_LAYOUT = + DEFINE_REQ_FMT0("LDLM_INTENT_LAYOUT ", + ldlm_intent_layout_client, ldlm_enqueue_lvb_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT_LAYOUT); + +struct req_format RQF_LDLM_INTENT_GETATTR = + DEFINE_REQ_FMT0("LDLM_INTENT_GETATTR", + ldlm_intent_getattr_client, ldlm_intent_getattr_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT_GETATTR); + +struct req_format RQF_LDLM_INTENT_OPEN = + DEFINE_REQ_FMT0("LDLM_INTENT_OPEN", + ldlm_intent_open_client, ldlm_intent_open_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT_OPEN); + +struct req_format RQF_LDLM_INTENT_CREATE = + DEFINE_REQ_FMT0("LDLM_INTENT_CREATE", + ldlm_intent_create_client, ldlm_intent_getattr_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT_CREATE); + +struct req_format RQF_LDLM_INTENT_UNLINK = + DEFINE_REQ_FMT0("LDLM_INTENT_UNLINK", + ldlm_intent_unlink_client, ldlm_intent_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT_UNLINK); + +struct req_format RQF_LDLM_INTENT_GETXATTR = + DEFINE_REQ_FMT0("LDLM_INTENT_GETXATTR", + ldlm_intent_getxattr_client, + ldlm_intent_getxattr_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT_GETXATTR); + +struct req_format RQF_MDS_CLOSE = + DEFINE_REQ_FMT0("MDS_CLOSE", + mdt_close_client, mds_last_unlink_server); +EXPORT_SYMBOL(RQF_MDS_CLOSE); + +struct req_format RQF_MDS_RELEASE_CLOSE = + DEFINE_REQ_FMT0("MDS_CLOSE", + mdt_release_close_client, mds_last_unlink_server); +EXPORT_SYMBOL(RQF_MDS_RELEASE_CLOSE); + +struct req_format RQF_MDS_PIN = + DEFINE_REQ_FMT0("MDS_PIN", + mdt_body_capa, mdt_body_only); +EXPORT_SYMBOL(RQF_MDS_PIN); + +struct req_format RQF_MDS_UNPIN = + DEFINE_REQ_FMT0("MDS_UNPIN", mdt_body_only, empty); +EXPORT_SYMBOL(RQF_MDS_UNPIN); + +struct req_format RQF_MDS_DONE_WRITING = + DEFINE_REQ_FMT0("MDS_DONE_WRITING", + mdt_close_client, mdt_body_only); +EXPORT_SYMBOL(RQF_MDS_DONE_WRITING); + +struct req_format RQF_MDS_READPAGE = + DEFINE_REQ_FMT0("MDS_READPAGE", + mdt_body_capa, mdt_body_only); +EXPORT_SYMBOL(RQF_MDS_READPAGE); + +struct req_format RQF_MDS_HSM_ACTION = + DEFINE_REQ_FMT0("MDS_HSM_ACTION", mdt_body_capa, mdt_hsm_action_server); +EXPORT_SYMBOL(RQF_MDS_HSM_ACTION); + +struct req_format RQF_MDS_HSM_PROGRESS = + DEFINE_REQ_FMT0("MDS_HSM_PROGRESS", mdt_hsm_progress, empty); +EXPORT_SYMBOL(RQF_MDS_HSM_PROGRESS); + +struct req_format RQF_MDS_HSM_CT_REGISTER = + DEFINE_REQ_FMT0("MDS_HSM_CT_REGISTER", mdt_hsm_ct_register, empty); +EXPORT_SYMBOL(RQF_MDS_HSM_CT_REGISTER); + +struct req_format RQF_MDS_HSM_CT_UNREGISTER = + DEFINE_REQ_FMT0("MDS_HSM_CT_UNREGISTER", mdt_hsm_ct_unregister, empty); +EXPORT_SYMBOL(RQF_MDS_HSM_CT_UNREGISTER); + +struct req_format RQF_MDS_HSM_STATE_GET = + 
DEFINE_REQ_FMT0("MDS_HSM_STATE_GET", + mdt_body_capa, mdt_hsm_state_get_server); +EXPORT_SYMBOL(RQF_MDS_HSM_STATE_GET); + +struct req_format RQF_MDS_HSM_STATE_SET = + DEFINE_REQ_FMT0("MDS_HSM_STATE_SET", mdt_hsm_state_set, empty); +EXPORT_SYMBOL(RQF_MDS_HSM_STATE_SET); + +struct req_format RQF_MDS_HSM_REQUEST = + DEFINE_REQ_FMT0("MDS_HSM_REQUEST", mdt_hsm_request, empty); +EXPORT_SYMBOL(RQF_MDS_HSM_REQUEST); + +struct req_format RQF_MDS_SWAP_LAYOUTS = + DEFINE_REQ_FMT0("MDS_SWAP_LAYOUTS", + mdt_swap_layouts, empty); +EXPORT_SYMBOL(RQF_MDS_SWAP_LAYOUTS); + +/* This is for split */ +struct req_format RQF_MDS_WRITEPAGE = + DEFINE_REQ_FMT0("MDS_WRITEPAGE", + mdt_body_capa, mdt_body_only); +EXPORT_SYMBOL(RQF_MDS_WRITEPAGE); + +struct req_format RQF_MDS_IS_SUBDIR = + DEFINE_REQ_FMT0("MDS_IS_SUBDIR", + mdt_body_only, mdt_body_only); +EXPORT_SYMBOL(RQF_MDS_IS_SUBDIR); + +struct req_format RQF_LLOG_ORIGIN_HANDLE_CREATE = + DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_CREATE", + llog_origin_handle_create_client, llogd_body_only); +EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_CREATE); + +struct req_format RQF_LLOG_ORIGIN_HANDLE_DESTROY = + DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_DESTROY", + llogd_body_only, llogd_body_only); +EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_DESTROY); + +struct req_format RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK = + DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_NEXT_BLOCK", + llogd_body_only, llog_origin_handle_next_block_server); +EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK); + +struct req_format RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK = + DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_PREV_BLOCK", + llogd_body_only, llog_origin_handle_next_block_server); +EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK); + +struct req_format RQF_LLOG_ORIGIN_HANDLE_READ_HEADER = + DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_READ_HEADER", + llogd_body_only, llog_log_hdr_only); +EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_READ_HEADER); + +struct req_format RQF_LLOG_ORIGIN_CONNECT = + DEFINE_REQ_FMT0("LLOG_ORIGIN_CONNECT", llogd_conn_body_only, empty); +EXPORT_SYMBOL(RQF_LLOG_ORIGIN_CONNECT); + +struct req_format RQF_CONNECT = + DEFINE_REQ_FMT0("CONNECT", obd_connect_client, obd_connect_server); +EXPORT_SYMBOL(RQF_CONNECT); + +struct req_format RQF_OST_CONNECT = + DEFINE_REQ_FMT0("OST_CONNECT", + obd_connect_client, obd_connect_server); +EXPORT_SYMBOL(RQF_OST_CONNECT); + +struct req_format RQF_OST_DISCONNECT = + DEFINE_REQ_FMT0("OST_DISCONNECT", empty, empty); +EXPORT_SYMBOL(RQF_OST_DISCONNECT); + +struct req_format RQF_OST_GETATTR = + DEFINE_REQ_FMT0("OST_GETATTR", ost_body_capa, ost_body_only); +EXPORT_SYMBOL(RQF_OST_GETATTR); + +struct req_format RQF_OST_SETATTR = + DEFINE_REQ_FMT0("OST_SETATTR", ost_body_capa, ost_body_only); +EXPORT_SYMBOL(RQF_OST_SETATTR); + +struct req_format RQF_OST_CREATE = + DEFINE_REQ_FMT0("OST_CREATE", ost_body_only, ost_body_only); +EXPORT_SYMBOL(RQF_OST_CREATE); + +struct req_format RQF_OST_PUNCH = + DEFINE_REQ_FMT0("OST_PUNCH", ost_body_capa, ost_body_only); +EXPORT_SYMBOL(RQF_OST_PUNCH); + +struct req_format RQF_OST_SYNC = + DEFINE_REQ_FMT0("OST_SYNC", ost_body_capa, ost_body_only); +EXPORT_SYMBOL(RQF_OST_SYNC); + +struct req_format RQF_OST_DESTROY = + DEFINE_REQ_FMT0("OST_DESTROY", ost_destroy_client, ost_body_only); +EXPORT_SYMBOL(RQF_OST_DESTROY); + +struct req_format RQF_OST_BRW_READ = + DEFINE_REQ_FMT0("OST_BRW_READ", ost_brw_client, ost_brw_read_server); +EXPORT_SYMBOL(RQF_OST_BRW_READ); + +struct req_format RQF_OST_BRW_WRITE = + DEFINE_REQ_FMT0("OST_BRW_WRITE", ost_brw_client, ost_brw_write_server); 
+EXPORT_SYMBOL(RQF_OST_BRW_WRITE); + +struct req_format RQF_OST_STATFS = + DEFINE_REQ_FMT0("OST_STATFS", empty, obd_statfs_server); +EXPORT_SYMBOL(RQF_OST_STATFS); + +struct req_format RQF_OST_SET_GRANT_INFO = + DEFINE_REQ_FMT0("OST_SET_GRANT_INFO", ost_grant_shrink_client, + ost_body_only); +EXPORT_SYMBOL(RQF_OST_SET_GRANT_INFO); + +struct req_format RQF_OST_GET_INFO_GENERIC = + DEFINE_REQ_FMT0("OST_GET_INFO", ost_get_info_generic_client, + ost_get_info_generic_server); +EXPORT_SYMBOL(RQF_OST_GET_INFO_GENERIC); + +struct req_format RQF_OST_GET_INFO_LAST_ID = + DEFINE_REQ_FMT0("OST_GET_INFO_LAST_ID", ost_get_info_generic_client, + ost_get_last_id_server); +EXPORT_SYMBOL(RQF_OST_GET_INFO_LAST_ID); + +struct req_format RQF_OST_GET_INFO_LAST_FID = + DEFINE_REQ_FMT0("OST_GET_INFO_LAST_FID", obd_set_info_client, + ost_get_last_fid_server); +EXPORT_SYMBOL(RQF_OST_GET_INFO_LAST_FID); + +struct req_format RQF_OST_SET_INFO_LAST_FID = + DEFINE_REQ_FMT0("OST_SET_INFO_LAST_FID", obd_set_info_client, + empty); +EXPORT_SYMBOL(RQF_OST_SET_INFO_LAST_FID); + +struct req_format RQF_OST_GET_INFO_FIEMAP = + DEFINE_REQ_FMT0("OST_GET_INFO_FIEMAP", ost_get_fiemap_client, + ost_get_fiemap_server); +EXPORT_SYMBOL(RQF_OST_GET_INFO_FIEMAP); + +#if !defined(__REQ_LAYOUT_USER__) + +/* Convenience macro */ +#define FMT_FIELD(fmt, i, j) (fmt)->rf_fields[(i)].d[(j)] + +/** + * Initializes the capsule abstraction by computing and setting the \a rf_idx + * field of RQFs and the \a rmf_offset field of RMFs. + */ +int req_layout_init(void) +{ + int i; + int j; + int k; + struct req_format *rf = NULL; + + for (i = 0; i < ARRAY_SIZE(req_formats); ++i) { + rf = req_formats[i]; + rf->rf_idx = i; + for (j = 0; j < RCL_NR; ++j) { + LASSERT(rf->rf_fields[j].nr <= REQ_MAX_FIELD_NR); + for (k = 0; k < rf->rf_fields[j].nr; ++k) { + struct req_msg_field *field; + + field = (typeof(field))rf->rf_fields[j].d[k]; + LASSERT(!(field->rmf_flags & RMF_F_STRUCT_ARRAY) + || field->rmf_size > 0); + LASSERT(field->rmf_offset[i][j] == 0); + /* + * k + 1 to detect unused format/field + * combinations. + */ + field->rmf_offset[i][j] = k + 1; + } + } + } + return 0; +} +EXPORT_SYMBOL(req_layout_init); + +void req_layout_fini(void) +{ +} +EXPORT_SYMBOL(req_layout_fini); + +/** + * Initializes the expected sizes of each RMF in a \a pill (\a rc_area) to -1. + * + * Actual/expected field sizes are set elsewhere in functions in this file: + * req_capsule_init(), req_capsule_server_pack(), req_capsule_set_size() and + * req_capsule_msg_size(). The \a rc_area information is used by. + * ptlrpc_request_set_replen(). + */ +void req_capsule_init_area(struct req_capsule *pill) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(pill->rc_area[RCL_CLIENT]); i++) { + pill->rc_area[RCL_CLIENT][i] = -1; + pill->rc_area[RCL_SERVER][i] = -1; + } +} +EXPORT_SYMBOL(req_capsule_init_area); + +/** + * Initialize a pill. + * + * The \a location indicates whether the caller is executing on the client side + * (RCL_CLIENT) or server side (RCL_SERVER).. + */ +void req_capsule_init(struct req_capsule *pill, + struct ptlrpc_request *req, + enum req_location location) +{ + LASSERT(location == RCL_SERVER || location == RCL_CLIENT); + + /* + * Today all capsules are embedded in ptlrpc_request structs, + * but just in case that ever isn't the case, we don't reach + * into req unless req != NULL and pill is the one embedded in + * the req. 
+ * + * The req->rq_pill_init flag makes it safe to initialize a pill + * twice, which might happen in the OST paths as a result of the + * high-priority RPC queue getting peeked at before ost_handle() + * handles an OST RPC. + */ + if (req != NULL && pill == &req->rq_pill && req->rq_pill_init) + return; + + memset(pill, 0, sizeof(*pill)); + pill->rc_req = req; + pill->rc_loc = location; + req_capsule_init_area(pill); + + if (req != NULL && pill == &req->rq_pill) + req->rq_pill_init = 1; +} +EXPORT_SYMBOL(req_capsule_init); + +void req_capsule_fini(struct req_capsule *pill) +{ +} +EXPORT_SYMBOL(req_capsule_fini); + +static int __req_format_is_sane(const struct req_format *fmt) +{ + return + 0 <= fmt->rf_idx && fmt->rf_idx < ARRAY_SIZE(req_formats) && + req_formats[fmt->rf_idx] == fmt; +} + +static struct lustre_msg *__req_msg(const struct req_capsule *pill, + enum req_location loc) +{ + struct ptlrpc_request *req; + + req = pill->rc_req; + return loc == RCL_CLIENT ? req->rq_reqmsg : req->rq_repmsg; +} + +/** + * Set the format (\a fmt) of a \a pill; format changes are not allowed here + * (see req_capsule_extend()). + */ +void req_capsule_set(struct req_capsule *pill, const struct req_format *fmt) +{ + LASSERT(pill->rc_fmt == NULL || pill->rc_fmt == fmt); + LASSERT(__req_format_is_sane(fmt)); + + pill->rc_fmt = fmt; +} +EXPORT_SYMBOL(req_capsule_set); + +/** + * Fills in any parts of the \a rc_area of a \a pill that haven't been filled in + * yet. + + * \a rc_area is an array of REQ_MAX_FIELD_NR elements, used to store sizes of + * variable-sized fields. The field sizes come from the declared \a rmf_size + * field of a \a pill's \a rc_fmt's RMF's. + */ +int req_capsule_filled_sizes(struct req_capsule *pill, + enum req_location loc) +{ + const struct req_format *fmt = pill->rc_fmt; + int i; + + LASSERT(fmt != NULL); + + for (i = 0; i < fmt->rf_fields[loc].nr; ++i) { + if (pill->rc_area[loc][i] == -1) { + pill->rc_area[loc][i] = + fmt->rf_fields[loc].d[i]->rmf_size; + if (pill->rc_area[loc][i] == -1) { + /* + * Skip the following fields. + * + * If this LASSERT() trips then you're missing a + * call to req_capsule_set_size(). + */ + LASSERT(loc != RCL_SERVER); + break; + } + } + } + return i; +} +EXPORT_SYMBOL(req_capsule_filled_sizes); + +/** + * Capsule equivalent of lustre_pack_request() and lustre_pack_reply(). + * + * This function uses the \a pill's \a rc_area as filled in by + * req_capsule_set_size() or req_capsule_filled_sizes() (the latter is called by + * this function). + */ +int req_capsule_server_pack(struct req_capsule *pill) +{ + const struct req_format *fmt; + int count; + int rc; + + LASSERT(pill->rc_loc == RCL_SERVER); + fmt = pill->rc_fmt; + LASSERT(fmt != NULL); + + count = req_capsule_filled_sizes(pill, RCL_SERVER); + rc = lustre_pack_reply(pill->rc_req, count, + pill->rc_area[RCL_SERVER], NULL); + if (rc != 0) { + DEBUG_REQ(D_ERROR, pill->rc_req, + "Cannot pack %d fields in format `%s': ", + count, fmt->rf_name); + } + return rc; +} +EXPORT_SYMBOL(req_capsule_server_pack); + +/** + * Returns the PTLRPC request or reply (\a loc) buffer offset of a \a pill + * corresponding to the given RMF (\a field). 
+ */ +static int __req_capsule_offset(const struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc) +{ + int offset; + + offset = field->rmf_offset[pill->rc_fmt->rf_idx][loc]; + LASSERTF(offset > 0, "%s:%s, off=%d, loc=%d\n", + pill->rc_fmt->rf_name, + field->rmf_name, offset, loc); + offset--; + + LASSERT(0 <= offset && offset < REQ_MAX_FIELD_NR); + return offset; +} + +/** + * Helper for __req_capsule_get(); swabs value / array of values and/or dumps + * them if desired. + */ +static +void +swabber_dumper_helper(struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc, + int offset, + void *value, int len, int dump, void (*swabber)(void *)) +{ + void *p; + int i; + int n; + int do_swab; + int inout = loc == RCL_CLIENT; + + swabber = swabber ?: field->rmf_swabber; + + if (ptlrpc_buf_need_swab(pill->rc_req, inout, offset) && + swabber != NULL && value != NULL) + do_swab = 1; + else + do_swab = 0; + + if (!field->rmf_dumper) + dump = 0; + + if (!(field->rmf_flags & RMF_F_STRUCT_ARRAY)) { + if (dump) { + CDEBUG(D_RPCTRACE, "Dump of %sfield %s follows\n", + do_swab ? "unswabbed " : "", field->rmf_name); + field->rmf_dumper(value); + } + if (!do_swab) + return; + swabber(value); + ptlrpc_buf_set_swabbed(pill->rc_req, inout, offset); + if (dump) { + CDEBUG(D_RPCTRACE, "Dump of swabbed field %s follows\n", + field->rmf_name); + field->rmf_dumper(value); + } + + return; + } + + /* + * We're swabbing an array; swabber() swabs a single array element, so + * swab every element. + */ + LASSERT((len % field->rmf_size) == 0); + for (p = value, i = 0, n = len / field->rmf_size; + i < n; + i++, p += field->rmf_size) { + if (dump) { + CDEBUG(D_RPCTRACE, "Dump of %sarray field %s, element %d follows\n", + do_swab ? "unswabbed " : "", field->rmf_name, i); + field->rmf_dumper(p); + } + if (!do_swab) + continue; + swabber(p); + if (dump) { + CDEBUG(D_RPCTRACE, "Dump of swabbed array field %s, element %d follows\n", + field->rmf_name, i); + field->rmf_dumper(value); + } + } + if (do_swab) + ptlrpc_buf_set_swabbed(pill->rc_req, inout, offset); +} + +/** + * Returns the pointer to a PTLRPC request or reply (\a loc) buffer of a \a pill + * corresponding to the given RMF (\a field). + * + * The buffer will be swabbed using the given \a swabber. If \a swabber == NULL + * then the \a rmf_swabber from the RMF will be used. Soon there will be no + * calls to __req_capsule_get() with a non-NULL \a swabber; \a swabber will then + * be removed. Fields with the \a RMF_F_STRUCT_ARRAY flag set will have each + * element of the array swabbed. + */ +static void *__req_capsule_get(struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc, + void (*swabber)(void *), + int dump) +{ + const struct req_format *fmt; + struct lustre_msg *msg; + void *value; + int len; + int offset; + + void *(*getter)(struct lustre_msg *m, int n, int minlen); + + static const char *rcl_names[RCL_NR] = { + [RCL_CLIENT] = "client", + [RCL_SERVER] = "server" + }; + + LASSERT(pill != NULL); + LASSERT(pill != LP_POISON); + fmt = pill->rc_fmt; + LASSERT(fmt != NULL); + LASSERT(fmt != LP_POISON); + LASSERT(__req_format_is_sane(fmt)); + + offset = __req_capsule_offset(pill, field, loc); + + msg = __req_msg(pill, loc); + LASSERT(msg != NULL); + + getter = (field->rmf_flags & RMF_F_STRING) ? 
+ (typeof(getter))lustre_msg_string : lustre_msg_buf; + + if (field->rmf_flags & RMF_F_STRUCT_ARRAY) { + /* + * We've already asserted that field->rmf_size > 0 in + * req_layout_init(). + */ + len = lustre_msg_buflen(msg, offset); + if ((len % field->rmf_size) != 0) { + CERROR("%s: array field size mismatch %d modulo %d != 0 (%d)\n", + field->rmf_name, len, field->rmf_size, loc); + return NULL; + } + } else if (pill->rc_area[loc][offset] != -1) { + len = pill->rc_area[loc][offset]; + } else { + len = max(field->rmf_size, 0); + } + value = getter(msg, offset, len); + + if (value == NULL) { + DEBUG_REQ(D_ERROR, pill->rc_req, + "Wrong buffer for field `%s' (%d of %d) in format `%s': %d vs. %d (%s)\n", + field->rmf_name, offset, lustre_msg_bufcount(msg), + fmt->rf_name, lustre_msg_buflen(msg, offset), len, + rcl_names[loc]); + } else { + swabber_dumper_helper(pill, field, loc, offset, value, len, + dump, swabber); + } + + return value; +} + +/** + * Dump a request and/or reply + */ +static void __req_capsule_dump(struct req_capsule *pill, enum req_location loc) +{ + const struct req_format *fmt; + const struct req_msg_field *field; + int len; + int i; + + fmt = pill->rc_fmt; + + DEBUG_REQ(D_RPCTRACE, pill->rc_req, "BEGIN REQ CAPSULE DUMP\n"); + for (i = 0; i < fmt->rf_fields[loc].nr; ++i) { + field = FMT_FIELD(fmt, loc, i); + if (field->rmf_dumper == NULL) { + /* + * FIXME Add a default hex dumper for fields that don't + * have a specific dumper + */ + len = req_capsule_get_size(pill, field, loc); + CDEBUG(D_RPCTRACE, "Field %s has no dumper function; field size is %d\n", + field->rmf_name, len); + } else { + /* It's the dumping side-effect that we're interested in */ + (void) __req_capsule_get(pill, field, loc, NULL, 1); + } + } + CDEBUG(D_RPCTRACE, "END REQ CAPSULE DUMP\n"); +} + +/** + * Dump a request. + */ +void req_capsule_client_dump(struct req_capsule *pill) +{ + __req_capsule_dump(pill, RCL_CLIENT); +} +EXPORT_SYMBOL(req_capsule_client_dump); + +/** + * Dump a reply + */ +void req_capsule_server_dump(struct req_capsule *pill) +{ + __req_capsule_dump(pill, RCL_SERVER); +} +EXPORT_SYMBOL(req_capsule_server_dump); + +/** + * Trivial wrapper around __req_capsule_get(), that returns the PTLRPC request + * buffer corresponding to the given RMF (\a field) of a \a pill. + */ +void *req_capsule_client_get(struct req_capsule *pill, + const struct req_msg_field *field) +{ + return __req_capsule_get(pill, field, RCL_CLIENT, NULL, 0); +} +EXPORT_SYMBOL(req_capsule_client_get); + +/** + * Same as req_capsule_client_get(), but with a \a swabber argument. + * + * Currently unused; will be removed when req_capsule_server_swab_get() is + * unused too. + */ +void *req_capsule_client_swab_get(struct req_capsule *pill, + const struct req_msg_field *field, + void *swabber) +{ + return __req_capsule_get(pill, field, RCL_CLIENT, swabber, 0); +} +EXPORT_SYMBOL(req_capsule_client_swab_get); + +/** + * Utility that combines req_capsule_set_size() and req_capsule_client_get(). + * + * First the \a pill's request \a field's size is set (\a rc_area) using + * req_capsule_set_size() with the given \a len. Then the actual buffer is + * returned. 
+ */ +void *req_capsule_client_sized_get(struct req_capsule *pill, + const struct req_msg_field *field, + int len) +{ + req_capsule_set_size(pill, field, RCL_CLIENT, len); + return __req_capsule_get(pill, field, RCL_CLIENT, NULL, 0); +} +EXPORT_SYMBOL(req_capsule_client_sized_get); + +/** + * Trivial wrapper around __req_capsule_get(), that returns the PTLRPC reply + * buffer corresponding to the given RMF (\a field) of a \a pill. + */ +void *req_capsule_server_get(struct req_capsule *pill, + const struct req_msg_field *field) +{ + return __req_capsule_get(pill, field, RCL_SERVER, NULL, 0); +} +EXPORT_SYMBOL(req_capsule_server_get); + +/** + * Same as req_capsule_server_get(), but with a \a swabber argument. + * + * Ideally all swabbing should be done pursuant to RMF definitions, with no + * swabbing done outside this capsule abstraction. + */ +void *req_capsule_server_swab_get(struct req_capsule *pill, + const struct req_msg_field *field, + void *swabber) +{ + return __req_capsule_get(pill, field, RCL_SERVER, swabber, 0); +} +EXPORT_SYMBOL(req_capsule_server_swab_get); + +/** + * Utility that combines req_capsule_set_size() and req_capsule_server_get(). + * + * First the \a pill's reply \a field's size is set (\a rc_area) using + * req_capsule_set_size() with the given \a len. Then the actual buffer is + * returned. + */ +void *req_capsule_server_sized_get(struct req_capsule *pill, + const struct req_msg_field *field, + int len) +{ + req_capsule_set_size(pill, field, RCL_SERVER, len); + return __req_capsule_get(pill, field, RCL_SERVER, NULL, 0); +} +EXPORT_SYMBOL(req_capsule_server_sized_get); + +void *req_capsule_server_sized_swab_get(struct req_capsule *pill, + const struct req_msg_field *field, + int len, void *swabber) +{ + req_capsule_set_size(pill, field, RCL_SERVER, len); + return __req_capsule_get(pill, field, RCL_SERVER, swabber, 0); +} +EXPORT_SYMBOL(req_capsule_server_sized_swab_get); + +/** + * Returns the buffer of a \a pill corresponding to the given \a field from the + * request (if the caller is executing on the server-side) or reply (if the + * caller is executing on the client-side). + * + * This function is convenient for use in code that could be executed on the + * client and server alike. + */ +const void *req_capsule_other_get(struct req_capsule *pill, + const struct req_msg_field *field) +{ + return __req_capsule_get(pill, field, pill->rc_loc ^ 1, NULL, 0); +} +EXPORT_SYMBOL(req_capsule_other_get); + +/** + * Set the size of the PTLRPC request/reply (\a loc) buffer for the given \a + * field of the given \a pill. + * + * This function must be used when constructing variable sized fields of a + * request or reply.
+ */ +void req_capsule_set_size(struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc, int size) +{ + LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT); + + if ((size != field->rmf_size) && + (field->rmf_size != -1) && + !(field->rmf_flags & RMF_F_NO_SIZE_CHECK) && + (size > 0)) { + if ((field->rmf_flags & RMF_F_STRUCT_ARRAY) && + (size % field->rmf_size != 0)) { + CERROR("%s: array field size mismatch %d %% %d != 0 (%d)\n", + field->rmf_name, size, field->rmf_size, loc); + LBUG(); + } else if (!(field->rmf_flags & RMF_F_STRUCT_ARRAY) && + size < field->rmf_size) { + CERROR("%s: field size mismatch %d != %d (%d)\n", + field->rmf_name, size, field->rmf_size, loc); + LBUG(); + } + } + + pill->rc_area[loc][__req_capsule_offset(pill, field, loc)] = size; +} +EXPORT_SYMBOL(req_capsule_set_size); + +/** + * Return the actual PTLRPC buffer length of a request or reply (\a loc) + * for the given \a pill's given \a field. + * + * NB: this function doesn't correspond with req_capsule_set_size(), which + * actually sets the size in pill.rc_area[loc][offset], but this function + * returns the message buflen[offset], maybe we should use another name. + */ +int req_capsule_get_size(const struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc) +{ + LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT); + + return lustre_msg_buflen(__req_msg(pill, loc), + __req_capsule_offset(pill, field, loc)); +} +EXPORT_SYMBOL(req_capsule_get_size); + +/** + * Wrapper around lustre_msg_size() that returns the PTLRPC size needed for the + * given \a pill's request or reply (\a loc) given the field size recorded in + * the \a pill's rc_area. + * + * See also req_capsule_set_size(). + */ +int req_capsule_msg_size(struct req_capsule *pill, enum req_location loc) +{ + return lustre_msg_size(pill->rc_req->rq_import->imp_msg_magic, + pill->rc_fmt->rf_fields[loc].nr, + pill->rc_area[loc]); +} + +/** + * While req_capsule_msg_size() computes the size of a PTLRPC request or reply + * (\a loc) given a \a pill's \a rc_area, this function computes the size of a + * PTLRPC request or reply given only an RQF (\a fmt). + * + * This function should not be used for formats which contain variable size + * fields. + */ +int req_capsule_fmt_size(__u32 magic, const struct req_format *fmt, + enum req_location loc) +{ + int size, i = 0; + + /* + * This function should probably LASSERT() that fmt has no fields with + * RMF_F_STRUCT_ARRAY in rmf_flags, since we can't know here how many + * elements in the array there will ultimately be, but then, we could + * assume that there will be at least one element, and that's just what + * we do. + */ + size = lustre_msg_hdr_size(magic, fmt->rf_fields[loc].nr); + if (size < 0) + return size; + + for (; i < fmt->rf_fields[loc].nr; ++i) + if (fmt->rf_fields[loc].d[i]->rmf_size != -1) + size += cfs_size_round(fmt->rf_fields[loc].d[i]-> + rmf_size); + return size; +} + +/** + * Changes the format of an RPC. + * + * The pill must already have been initialized, which means that it already has + * a request format. The new format \a fmt must be an extension of the pill's + * old format. Specifically: the new format must have as many request and reply + * fields as the old one, and all fields shared by the old and new format must + * be at least as large in the new format. 
+ * + * The new format's fields may be of different "type" than the old format, but + * only for fields that are "opaque" blobs: fields which a) have no + * \a rmf_swabber, b) \a rmf_flags == 0 or RMF_F_NO_SIZE_CHECK, and c) \a + * rmf_size == -1 or \a rmf_flags == RMF_F_NO_SIZE_CHECK. For example, + * OBD_SET_INFO has a key field and an opaque value field that gets interpreted + * according to the key field. When the value, according to the key, contains a + * structure (or array thereof) to be swabbed, the format should be changed to + * one where the value field has \a rmf_size/rmf_flags/rmf_swabber set + * accordingly. + */ +void req_capsule_extend(struct req_capsule *pill, const struct req_format *fmt) +{ + int i; + int j; + + const struct req_format *old; + + LASSERT(pill->rc_fmt != NULL); + LASSERT(__req_format_is_sane(fmt)); + + old = pill->rc_fmt; + /* + * Sanity checking... + */ + for (i = 0; i < RCL_NR; ++i) { + LASSERT(fmt->rf_fields[i].nr >= old->rf_fields[i].nr); + for (j = 0; j < old->rf_fields[i].nr - 1; ++j) { + const struct req_msg_field *ofield = FMT_FIELD(old, i, j); + + /* "opaque" fields can be transmogrified */ + if (ofield->rmf_swabber == NULL && + (ofield->rmf_flags & ~RMF_F_NO_SIZE_CHECK) == 0 && + (ofield->rmf_size == -1 || + ofield->rmf_flags == RMF_F_NO_SIZE_CHECK)) + continue; + LASSERT(FMT_FIELD(fmt, i, j) == FMT_FIELD(old, i, j)); + } + /* + * Last field in old format can be shorter than in new. + */ + LASSERT(FMT_FIELD(fmt, i, j)->rmf_size >= + FMT_FIELD(old, i, j)->rmf_size); + } + + pill->rc_fmt = fmt; +} +EXPORT_SYMBOL(req_capsule_extend); + +/** + * This function returns a non-zero value if the given \a field is present in + * the format (\a rc_fmt) of \a pill's PTLRPC request or reply (\a loc), else it + * returns 0. + */ +int req_capsule_has_field(const struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc) +{ + LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT); + + return field->rmf_offset[pill->rc_fmt->rf_idx][loc]; +} +EXPORT_SYMBOL(req_capsule_has_field); + +/** + * Returns a non-zero value if the given \a field is present in the given \a + * pill's PTLRPC request or reply (\a loc), else it returns 0. + */ +int req_capsule_field_present(const struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc) +{ + int offset; + + LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT); + LASSERT(req_capsule_has_field(pill, field, loc)); + + offset = __req_capsule_offset(pill, field, loc); + return lustre_msg_bufcount(__req_msg(pill, loc)) > offset; +} +EXPORT_SYMBOL(req_capsule_field_present); + +/** + * This function shrinks the size of the _buffer_ of the \a pill's PTLRPC + * request or reply (\a loc). + * + * This is not the opposite of req_capsule_extend().
+ */ +void req_capsule_shrink(struct req_capsule *pill, + const struct req_msg_field *field, + unsigned int newlen, + enum req_location loc) +{ + const struct req_format *fmt; + struct lustre_msg *msg; + int len; + int offset; + + fmt = pill->rc_fmt; + LASSERT(fmt != NULL); + LASSERT(__req_format_is_sane(fmt)); + LASSERT(req_capsule_has_field(pill, field, loc)); + LASSERT(req_capsule_field_present(pill, field, loc)); + + offset = __req_capsule_offset(pill, field, loc); + + msg = __req_msg(pill, loc); + len = lustre_msg_buflen(msg, offset); + LASSERTF(newlen <= len, "%s:%s, oldlen=%d, newlen=%d\n", + fmt->rf_name, field->rmf_name, len, newlen); + + if (loc == RCL_CLIENT) + pill->rc_req->rq_reqlen = lustre_shrink_msg(msg, offset, newlen, + 1); + else + pill->rc_req->rq_replen = lustre_shrink_msg(msg, offset, newlen, + 1); +} +EXPORT_SYMBOL(req_capsule_shrink); + +int req_capsule_server_grow(struct req_capsule *pill, + const struct req_msg_field *field, + unsigned int newlen) +{ + struct ptlrpc_reply_state *rs = pill->rc_req->rq_reply_state, *nrs; + char *from, *to; + int offset, len, rc; + + LASSERT(pill->rc_fmt != NULL); + LASSERT(__req_format_is_sane(pill->rc_fmt)); + LASSERT(req_capsule_has_field(pill, field, RCL_SERVER)); + LASSERT(req_capsule_field_present(pill, field, RCL_SERVER)); + + len = req_capsule_get_size(pill, field, RCL_SERVER); + offset = __req_capsule_offset(pill, field, RCL_SERVER); + if (pill->rc_req->rq_repbuf_len >= + lustre_packed_msg_size(pill->rc_req->rq_repmsg) - len + newlen) + CERROR("Inplace repack might be done\n"); + + pill->rc_req->rq_reply_state = NULL; + req_capsule_set_size(pill, field, RCL_SERVER, newlen); + rc = req_capsule_server_pack(pill); + if (rc) { + /* put old rs back, the caller will decide what to do */ + pill->rc_req->rq_reply_state = rs; + return rc; + } + nrs = pill->rc_req->rq_reply_state; + /* Now we need only buffers, copy first chunk */ + to = lustre_msg_buf(nrs->rs_msg, 0, 0); + from = lustre_msg_buf(rs->rs_msg, 0, 0); + len = (char *)lustre_msg_buf(rs->rs_msg, offset, 0) - from; + memcpy(to, from, len); + /* check if we have tail and copy it too */ + if (rs->rs_msg->lm_bufcount > offset + 1) { + to = lustre_msg_buf(nrs->rs_msg, offset + 1, 0); + from = lustre_msg_buf(rs->rs_msg, offset + 1, 0); + offset = rs->rs_msg->lm_bufcount - 1; + len = (char *)lustre_msg_buf(rs->rs_msg, offset, 0) + + cfs_size_round(rs->rs_msg->lm_buflens[offset]) - from; + memcpy(to, from, len); + } + /* drop old reply if everything is fine */ + if (rs->rs_difficult) { + /* copy rs data */ + int i; + + nrs->rs_difficult = 1; + nrs->rs_no_ack = rs->rs_no_ack; + for (i = 0; i < rs->rs_nlocks; i++) { + nrs->rs_locks[i] = rs->rs_locks[i]; + nrs->rs_modes[i] = rs->rs_modes[i]; + nrs->rs_nlocks++; + } + rs->rs_nlocks = 0; + rs->rs_difficult = 0; + rs->rs_no_ack = 0; + } + ptlrpc_rs_decref(rs); + return 0; +} +EXPORT_SYMBOL(req_capsule_server_grow); +/* __REQ_LAYOUT_USER__ */ +#endif diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/llog_client.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/llog_client.c new file mode 100644 index 000000000..e9baf5bbe --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/llog_client.c @@ -0,0 +1,366 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/llog_client.c + * + * remote api for llog - client side + * + * Author: Andreas Dilger + */ + +#define DEBUG_SUBSYSTEM S_LOG + +#include "../../include/linux/libcfs/libcfs.h" + +#include "../include/obd_class.h" +#include "../include/lustre_log.h" +#include "../include/lustre_net.h" +#include <linux/list.h> + +#define LLOG_CLIENT_ENTRY(ctxt, imp) do { \ + mutex_lock(&ctxt->loc_mutex); \ + if (ctxt->loc_imp) { \ + imp = class_import_get(ctxt->loc_imp); \ + } else { \ + CERROR("ctxt->loc_imp == NULL for context idx %d." \ + "Unable to complete MDS/OSS recovery," \ + "but I'll try again next time. Not fatal.\n", \ + ctxt->loc_idx); \ + imp = NULL; \ + mutex_unlock(&ctxt->loc_mutex); \ + return (-EINVAL); \ + } \ + mutex_unlock(&ctxt->loc_mutex); \ +} while (0) + +#define LLOG_CLIENT_EXIT(ctxt, imp) do { \ + mutex_lock(&ctxt->loc_mutex); \ + if (ctxt->loc_imp != imp) \ + CWARN("loc_imp has changed from %p to %p\n", \ + ctxt->loc_imp, imp); \ + class_import_put(imp); \ + mutex_unlock(&ctxt->loc_mutex); \ +} while (0) + +/* This is a callback from the llog_* functions. + * Assumes caller has already pushed us into the kernel context.
*/ +static int llog_client_open(const struct lu_env *env, + struct llog_handle *lgh, struct llog_logid *logid, + char *name, enum llog_open_param open_param) +{ + struct obd_import *imp; + struct llogd_body *body; + struct llog_ctxt *ctxt = lgh->lgh_ctxt; + struct ptlrpc_request *req = NULL; + int rc; + + LLOG_CLIENT_ENTRY(ctxt, imp); + + /* client cannot create llog */ + LASSERTF(open_param != LLOG_OPEN_NEW, "%#x\n", open_param); + LASSERT(lgh); + + req = ptlrpc_request_alloc(imp, &RQF_LLOG_ORIGIN_HANDLE_CREATE); + if (req == NULL) { + rc = -ENOMEM; + goto out; + } + + if (name) + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + strlen(name) + 1); + + rc = ptlrpc_request_pack(req, LUSTRE_LOG_VERSION, + LLOG_ORIGIN_HANDLE_CREATE); + if (rc) { + ptlrpc_request_free(req); + req = NULL; + goto out; + } + ptlrpc_request_set_replen(req); + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + if (logid) + body->lgd_logid = *logid; + body->lgd_ctxt_idx = ctxt->loc_idx - 1; + + if (name) { + char *tmp; + tmp = req_capsule_client_sized_get(&req->rq_pill, &RMF_NAME, + strlen(name) + 1); + LASSERT(tmp); + strcpy(tmp, name); + } + + rc = ptlrpc_queue_wait(req); + if (rc) + goto out; + + body = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY); + if (body == NULL) { + rc = -EFAULT; + goto out; + } + + lgh->lgh_id = body->lgd_logid; + lgh->lgh_ctxt = ctxt; +out: + LLOG_CLIENT_EXIT(ctxt, imp); + ptlrpc_req_finished(req); + return rc; +} + +static int llog_client_destroy(const struct lu_env *env, + struct llog_handle *loghandle) +{ + struct obd_import *imp; + struct ptlrpc_request *req = NULL; + struct llogd_body *body; + int rc; + + LLOG_CLIENT_ENTRY(loghandle->lgh_ctxt, imp); + req = ptlrpc_request_alloc_pack(imp, &RQF_LLOG_ORIGIN_HANDLE_DESTROY, + LUSTRE_LOG_VERSION, + LLOG_ORIGIN_HANDLE_DESTROY); + if (req == NULL) { + rc = -ENOMEM; + goto err_exit; + } + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + body->lgd_logid = loghandle->lgh_id; + body->lgd_llh_flags = loghandle->lgh_hdr->llh_flags; + + if (!(body->lgd_llh_flags & LLOG_F_IS_PLAIN)) + CERROR("%s: wrong llog flags %x\n", imp->imp_obd->obd_name, + body->lgd_llh_flags); + + ptlrpc_request_set_replen(req); + rc = ptlrpc_queue_wait(req); + + ptlrpc_req_finished(req); +err_exit: + LLOG_CLIENT_EXIT(loghandle->lgh_ctxt, imp); + return rc; +} + + +static int llog_client_next_block(const struct lu_env *env, + struct llog_handle *loghandle, + int *cur_idx, int next_idx, + __u64 *cur_offset, void *buf, int len) +{ + struct obd_import *imp; + struct ptlrpc_request *req = NULL; + struct llogd_body *body; + void *ptr; + int rc; + + LLOG_CLIENT_ENTRY(loghandle->lgh_ctxt, imp); + req = ptlrpc_request_alloc_pack(imp, &RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK, + LUSTRE_LOG_VERSION, + LLOG_ORIGIN_HANDLE_NEXT_BLOCK); + if (req == NULL) { + rc = -ENOMEM; + goto err_exit; + } + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + body->lgd_logid = loghandle->lgh_id; + body->lgd_ctxt_idx = loghandle->lgh_ctxt->loc_idx - 1; + body->lgd_llh_flags = loghandle->lgh_hdr->llh_flags; + body->lgd_index = next_idx; + body->lgd_saved_index = *cur_idx; + body->lgd_len = len; + body->lgd_cur_offset = *cur_offset; + + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER, len); + ptlrpc_request_set_replen(req); + rc = ptlrpc_queue_wait(req); + if (rc) + goto out; + + body = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY); + if (body == NULL) { + rc = -EFAULT; + goto out; + } + + /* The log records are swabbed 
as they are processed */ + ptr = req_capsule_server_get(&req->rq_pill, &RMF_EADATA); + if (ptr == NULL) { + rc = -EFAULT; + goto out; + } + + *cur_idx = body->lgd_saved_index; + *cur_offset = body->lgd_cur_offset; + + memcpy(buf, ptr, len); +out: + ptlrpc_req_finished(req); +err_exit: + LLOG_CLIENT_EXIT(loghandle->lgh_ctxt, imp); + return rc; +} + +static int llog_client_prev_block(const struct lu_env *env, + struct llog_handle *loghandle, + int prev_idx, void *buf, int len) +{ + struct obd_import *imp; + struct ptlrpc_request *req = NULL; + struct llogd_body *body; + void *ptr; + int rc; + + LLOG_CLIENT_ENTRY(loghandle->lgh_ctxt, imp); + req = ptlrpc_request_alloc_pack(imp, &RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK, + LUSTRE_LOG_VERSION, + LLOG_ORIGIN_HANDLE_PREV_BLOCK); + if (req == NULL) { + rc = -ENOMEM; + goto err_exit; + } + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + body->lgd_logid = loghandle->lgh_id; + body->lgd_ctxt_idx = loghandle->lgh_ctxt->loc_idx - 1; + body->lgd_llh_flags = loghandle->lgh_hdr->llh_flags; + body->lgd_index = prev_idx; + body->lgd_len = len; + + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER, len); + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) + goto out; + + body = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY); + if (body == NULL) { + rc = -EFAULT; + goto out; + } + + ptr = req_capsule_server_get(&req->rq_pill, &RMF_EADATA); + if (ptr == NULL) { + rc = -EFAULT; + goto out; + } + + memcpy(buf, ptr, len); +out: + ptlrpc_req_finished(req); +err_exit: + LLOG_CLIENT_EXIT(loghandle->lgh_ctxt, imp); + return rc; +} + +static int llog_client_read_header(const struct lu_env *env, + struct llog_handle *handle) +{ + struct obd_import *imp; + struct ptlrpc_request *req = NULL; + struct llogd_body *body; + struct llog_log_hdr *hdr; + struct llog_rec_hdr *llh_hdr; + int rc; + + LLOG_CLIENT_ENTRY(handle->lgh_ctxt, imp); + req = ptlrpc_request_alloc_pack(imp, &RQF_LLOG_ORIGIN_HANDLE_READ_HEADER, + LUSTRE_LOG_VERSION, + LLOG_ORIGIN_HANDLE_READ_HEADER); + if (req == NULL) { + rc = -ENOMEM; + goto err_exit; + } + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + body->lgd_logid = handle->lgh_id; + body->lgd_ctxt_idx = handle->lgh_ctxt->loc_idx - 1; + body->lgd_llh_flags = handle->lgh_hdr->llh_flags; + + ptlrpc_request_set_replen(req); + rc = ptlrpc_queue_wait(req); + if (rc) + goto out; + + hdr = req_capsule_server_get(&req->rq_pill, &RMF_LLOG_LOG_HDR); + if (hdr == NULL) { + rc = -EFAULT; + goto out; + } + + memcpy(handle->lgh_hdr, hdr, sizeof(*hdr)); + handle->lgh_last_idx = handle->lgh_hdr->llh_tail.lrt_index; + + /* sanity checks */ + llh_hdr = &handle->lgh_hdr->llh_hdr; + if (llh_hdr->lrh_type != LLOG_HDR_MAGIC) { + CERROR("bad log header magic: %#x (expecting %#x)\n", + llh_hdr->lrh_type, LLOG_HDR_MAGIC); + rc = -EIO; + } else if (llh_hdr->lrh_len != LLOG_CHUNK_SIZE) { + CERROR("incorrectly sized log header: %#x (expecting %#x)\n", + llh_hdr->lrh_len, LLOG_CHUNK_SIZE); + CERROR("you may need to re-run lconf --write_conf.\n"); + rc = -EIO; + } +out: + ptlrpc_req_finished(req); +err_exit: + LLOG_CLIENT_EXIT(handle->lgh_ctxt, imp); + return rc; +} + +static int llog_client_close(const struct lu_env *env, + struct llog_handle *handle) +{ + /* this doesn't call LLOG_ORIGIN_HANDLE_CLOSE because + the servers all close the file at the end of every + other LLOG_ RPC. 
*/ + return 0; +} + +struct llog_operations llog_client_ops = { + .lop_next_block = llog_client_next_block, + .lop_prev_block = llog_client_prev_block, + .lop_read_header = llog_client_read_header, + .lop_open = llog_client_open, + .lop_destroy = llog_client_destroy, + .lop_close = llog_client_close, +}; +EXPORT_SYMBOL(llog_client_ops); diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/llog_net.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/llog_net.c new file mode 100644 index 000000000..dac66f5b3 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/llog_net.c @@ -0,0 +1,72 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/llog_net.c + * + * OST<->MDS recovery logging infrastructure. + * + * Invariants in implementation: + * - we do not share logs among different OST<->MDS connections, so that + * if an OST or MDS fails it need only look at log(s) relevant to itself + * + * Author: Andreas Dilger + */ + +#define DEBUG_SUBSYSTEM S_LOG + +#include "../../include/linux/libcfs/libcfs.h" + +#include "../include/obd_class.h" +#include "../include/lustre_log.h" +#include <linux/list.h> + +int llog_initiator_connect(struct llog_ctxt *ctxt) +{ + struct obd_import *new_imp; + + LASSERT(ctxt); + new_imp = ctxt->loc_obd->u.cli.cl_import; + LASSERTF(ctxt->loc_imp == NULL || ctxt->loc_imp == new_imp, + "%p - %p\n", ctxt->loc_imp, new_imp); + mutex_lock(&ctxt->loc_mutex); + if (ctxt->loc_imp != new_imp) { + if (ctxt->loc_imp) + class_import_put(ctxt->loc_imp); + ctxt->loc_imp = class_import_get(new_imp); + } + mutex_unlock(&ctxt->loc_mutex); + return 0; +} +EXPORT_SYMBOL(llog_initiator_connect); diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/lproc_ptlrpc.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/lproc_ptlrpc.c new file mode 100644 index 000000000..9533ab976 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/lproc_ptlrpc.c @@ -0,0 +1,1366 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation.
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +#define DEBUG_SUBSYSTEM S_CLASS + + +#include "../include/obd_support.h" +#include "../include/obd.h" +#include "../include/lprocfs_status.h" +#include "../include/lustre/lustre_idl.h" +#include "../include/lustre_net.h" +#include "../include/obd_class.h" +#include "ptlrpc_internal.h" + + +static struct ll_rpc_opcode { + __u32 opcode; + const char *opname; +} ll_rpc_opcode_table[LUSTRE_MAX_OPCODES] = { + { OST_REPLY, "ost_reply" }, + { OST_GETATTR, "ost_getattr" }, + { OST_SETATTR, "ost_setattr" }, + { OST_READ, "ost_read" }, + { OST_WRITE, "ost_write" }, + { OST_CREATE , "ost_create" }, + { OST_DESTROY, "ost_destroy" }, + { OST_GET_INFO, "ost_get_info" }, + { OST_CONNECT, "ost_connect" }, + { OST_DISCONNECT, "ost_disconnect" }, + { OST_PUNCH, "ost_punch" }, + { OST_OPEN, "ost_open" }, + { OST_CLOSE, "ost_close" }, + { OST_STATFS, "ost_statfs" }, + { 14, NULL }, /* formerly OST_SAN_READ */ + { 15, NULL }, /* formerly OST_SAN_WRITE */ + { OST_SYNC, "ost_sync" }, + { OST_SET_INFO, "ost_set_info" }, + { OST_QUOTACHECK, "ost_quotacheck" }, + { OST_QUOTACTL, "ost_quotactl" }, + { OST_QUOTA_ADJUST_QUNIT, "ost_quota_adjust_qunit" }, + { MDS_GETATTR, "mds_getattr" }, + { MDS_GETATTR_NAME, "mds_getattr_lock" }, + { MDS_CLOSE, "mds_close" }, + { MDS_REINT, "mds_reint" }, + { MDS_READPAGE, "mds_readpage" }, + { MDS_CONNECT, "mds_connect" }, + { MDS_DISCONNECT, "mds_disconnect" }, + { MDS_GETSTATUS, "mds_getstatus" }, + { MDS_STATFS, "mds_statfs" }, + { MDS_PIN, "mds_pin" }, + { MDS_UNPIN, "mds_unpin" }, + { MDS_SYNC, "mds_sync" }, + { MDS_DONE_WRITING, "mds_done_writing" }, + { MDS_SET_INFO, "mds_set_info" }, + { MDS_QUOTACHECK, "mds_quotacheck" }, + { MDS_QUOTACTL, "mds_quotactl" }, + { MDS_GETXATTR, "mds_getxattr" }, + { MDS_SETXATTR, "mds_setxattr" }, + { MDS_WRITEPAGE, "mds_writepage" }, + { MDS_IS_SUBDIR, "mds_is_subdir" }, + { MDS_GET_INFO, "mds_get_info" }, + { MDS_HSM_STATE_GET, "mds_hsm_state_get" }, + { MDS_HSM_STATE_SET, "mds_hsm_state_set" }, + { MDS_HSM_ACTION, "mds_hsm_action" }, + { MDS_HSM_PROGRESS, "mds_hsm_progress" }, + { MDS_HSM_REQUEST, "mds_hsm_request" }, + { MDS_HSM_CT_REGISTER, "mds_hsm_ct_register" }, + { MDS_HSM_CT_UNREGISTER, "mds_hsm_ct_unregister" }, + { MDS_SWAP_LAYOUTS, "mds_swap_layouts" }, + { LDLM_ENQUEUE, "ldlm_enqueue" }, + { LDLM_CONVERT, "ldlm_convert" }, + { LDLM_CANCEL, "ldlm_cancel" }, + { LDLM_BL_CALLBACK, "ldlm_bl_callback" }, + { LDLM_CP_CALLBACK, "ldlm_cp_callback" }, + { LDLM_GL_CALLBACK, "ldlm_gl_callback" }, + { LDLM_SET_INFO, "ldlm_set_info" 
}, + { MGS_CONNECT, "mgs_connect" }, + { MGS_DISCONNECT, "mgs_disconnect" }, + { MGS_EXCEPTION, "mgs_exception" }, + { MGS_TARGET_REG, "mgs_target_reg" }, + { MGS_TARGET_DEL, "mgs_target_del" }, + { MGS_SET_INFO, "mgs_set_info" }, + { MGS_CONFIG_READ, "mgs_config_read" }, + { OBD_PING, "obd_ping" }, + { OBD_LOG_CANCEL, "llog_cancel" }, + { OBD_QC_CALLBACK, "obd_quota_callback" }, + { OBD_IDX_READ, "dt_index_read" }, + { LLOG_ORIGIN_HANDLE_CREATE, "llog_origin_handle_open" }, + { LLOG_ORIGIN_HANDLE_NEXT_BLOCK, "llog_origin_handle_next_block" }, + { LLOG_ORIGIN_HANDLE_READ_HEADER, "llog_origin_handle_read_header" }, + { LLOG_ORIGIN_HANDLE_WRITE_REC, "llog_origin_handle_write_rec" }, + { LLOG_ORIGIN_HANDLE_CLOSE, "llog_origin_handle_close" }, + { LLOG_ORIGIN_CONNECT, "llog_origin_connect" }, + { LLOG_CATINFO, "llog_catinfo" }, + { LLOG_ORIGIN_HANDLE_PREV_BLOCK, "llog_origin_handle_prev_block" }, + { LLOG_ORIGIN_HANDLE_DESTROY, "llog_origin_handle_destroy" }, + { QUOTA_DQACQ, "quota_acquire" }, + { QUOTA_DQREL, "quota_release" }, + { SEQ_QUERY, "seq_query" }, + { SEC_CTX_INIT, "sec_ctx_init" }, + { SEC_CTX_INIT_CONT, "sec_ctx_init_cont" }, + { SEC_CTX_FINI, "sec_ctx_fini" }, + { FLD_QUERY, "fld_query" }, + { UPDATE_OBJ, "update_obj" }, +}; + +static struct ll_eopcode { + __u32 opcode; + const char *opname; +} ll_eopcode_table[EXTRA_LAST_OPC] = { + { LDLM_GLIMPSE_ENQUEUE, "ldlm_glimpse_enqueue" }, + { LDLM_PLAIN_ENQUEUE, "ldlm_plain_enqueue" }, + { LDLM_EXTENT_ENQUEUE, "ldlm_extent_enqueue" }, + { LDLM_FLOCK_ENQUEUE, "ldlm_flock_enqueue" }, + { LDLM_IBITS_ENQUEUE, "ldlm_ibits_enqueue" }, + { MDS_REINT_SETATTR, "mds_reint_setattr" }, + { MDS_REINT_CREATE, "mds_reint_create" }, + { MDS_REINT_LINK, "mds_reint_link" }, + { MDS_REINT_UNLINK, "mds_reint_unlink" }, + { MDS_REINT_RENAME, "mds_reint_rename" }, + { MDS_REINT_OPEN, "mds_reint_open" }, + { MDS_REINT_SETXATTR, "mds_reint_setxattr" }, + { BRW_READ_BYTES, "read_bytes" }, + { BRW_WRITE_BYTES, "write_bytes" }, +}; + +const char *ll_opcode2str(__u32 opcode) +{ + /* When one of the assertions below fail, chances are that: + * 1) A new opcode was added in include/lustre/lustre_idl.h, + * but is missing from the table above. + * or 2) The opcode space was renumbered or rearranged, + * and the opcode_offset() function in + * ptlrpc_internal.h needs to be modified. 
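+ * For illustration only (a hedged example derived from the table
+ * above, not part of the original interface documentation):
+ *
+ *   ll_opcode2str(OST_READ)     returns "ost_read"
+ *   ll_opcode2str(MDS_CONNECT)  returns "mds_connect"
+ *
+ * opcode_offset() (see ptlrpc_internal.h) effectively collapses the
+ * disjoint opcode ranges into a dense 0 .. LUSTRE_MAX_OPCODES-1 index,
+ * which the two assertions below verify against this table.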
+ */ + __u32 offset = opcode_offset(opcode); + LASSERTF(offset < LUSTRE_MAX_OPCODES, + "offset %u >= LUSTRE_MAX_OPCODES %u\n", + offset, LUSTRE_MAX_OPCODES); + LASSERTF(ll_rpc_opcode_table[offset].opcode == opcode, + "ll_rpc_opcode_table[%u].opcode %u != opcode %u\n", + offset, ll_rpc_opcode_table[offset].opcode, opcode); + return ll_rpc_opcode_table[offset].opname; +} + +static const char *ll_eopcode2str(__u32 opcode) +{ + LASSERT(ll_eopcode_table[opcode].opcode == opcode); + return ll_eopcode_table[opcode].opname; +} + +#if defined(CONFIG_PROC_FS) +static void ptlrpc_lprocfs_register(struct proc_dir_entry *root, char *dir, + char *name, + struct proc_dir_entry **procroot_ret, + struct lprocfs_stats **stats_ret) +{ + struct proc_dir_entry *svc_procroot; + struct lprocfs_stats *svc_stats; + int i, rc; + unsigned int svc_counter_config = LPROCFS_CNTR_AVGMINMAX | + LPROCFS_CNTR_STDDEV; + + LASSERT(*procroot_ret == NULL); + LASSERT(*stats_ret == NULL); + + svc_stats = lprocfs_alloc_stats(EXTRA_MAX_OPCODES+LUSTRE_MAX_OPCODES, + 0); + if (svc_stats == NULL) + return; + + if (dir) { + svc_procroot = lprocfs_register(dir, root, NULL, NULL); + if (IS_ERR(svc_procroot)) { + lprocfs_free_stats(&svc_stats); + return; + } + } else { + svc_procroot = root; + } + + lprocfs_counter_init(svc_stats, PTLRPC_REQWAIT_CNTR, + svc_counter_config, "req_waittime", "usec"); + lprocfs_counter_init(svc_stats, PTLRPC_REQQDEPTH_CNTR, + svc_counter_config, "req_qdepth", "reqs"); + lprocfs_counter_init(svc_stats, PTLRPC_REQACTIVE_CNTR, + svc_counter_config, "req_active", "reqs"); + lprocfs_counter_init(svc_stats, PTLRPC_TIMEOUT, + svc_counter_config, "req_timeout", "sec"); + lprocfs_counter_init(svc_stats, PTLRPC_REQBUF_AVAIL_CNTR, + svc_counter_config, "reqbuf_avail", "bufs"); + for (i = 0; i < EXTRA_LAST_OPC; i++) { + char *units; + + switch (i) { + case BRW_WRITE_BYTES: + case BRW_READ_BYTES: + units = "bytes"; + break; + default: + units = "reqs"; + break; + } + lprocfs_counter_init(svc_stats, PTLRPC_LAST_CNTR + i, + svc_counter_config, + ll_eopcode2str(i), units); + } + for (i = 0; i < LUSTRE_MAX_OPCODES; i++) { + __u32 opcode = ll_rpc_opcode_table[i].opcode; + lprocfs_counter_init(svc_stats, + EXTRA_MAX_OPCODES + i, svc_counter_config, + ll_opcode2str(opcode), "usec"); + } + + rc = lprocfs_register_stats(svc_procroot, name, svc_stats); + if (rc < 0) { + if (dir) + lprocfs_remove(&svc_procroot); + lprocfs_free_stats(&svc_stats); + } else { + if (dir) + *procroot_ret = svc_procroot; + *stats_ret = svc_stats; + } +} + +static int +ptlrpc_lprocfs_req_history_len_seq_show(struct seq_file *m, void *v) +{ + struct ptlrpc_service *svc = m->private; + struct ptlrpc_service_part *svcpt; + int total = 0; + int i; + + ptlrpc_service_for_each_part(svcpt, i, svc) + total += svcpt->scp_hist_nrqbds; + + seq_printf(m, "%d\n", total); + return 0; +} +LPROC_SEQ_FOPS_RO(ptlrpc_lprocfs_req_history_len); + +static int +ptlrpc_lprocfs_req_history_max_seq_show(struct seq_file *m, void *n) +{ + struct ptlrpc_service *svc = m->private; + struct ptlrpc_service_part *svcpt; + int total = 0; + int i; + + ptlrpc_service_for_each_part(svcpt, i, svc) + total += svc->srv_hist_nrqbds_cpt_max; + + seq_printf(m, "%d\n", total); + return 0; +} + +static ssize_t +ptlrpc_lprocfs_req_history_max_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct ptlrpc_service *svc = ((struct seq_file *)file->private_data)->private; + int bufpages; + int val; + int rc; + + rc = lprocfs_write_helper(buffer, count, &val); + 
if (rc < 0) + return rc; + + if (val < 0) + return -ERANGE; + + /* This sanity check is more of an insanity check; we can still + * hose a kernel by allowing the request history to grow too + * far. */ + bufpages = (svc->srv_buf_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + if (val > totalram_pages / (2 * bufpages)) + return -ERANGE; + + spin_lock(&svc->srv_lock); + + if (val == 0) + svc->srv_hist_nrqbds_cpt_max = 0; + else + svc->srv_hist_nrqbds_cpt_max = max(1, (val / svc->srv_ncpts)); + + spin_unlock(&svc->srv_lock); + + return count; +} +LPROC_SEQ_FOPS(ptlrpc_lprocfs_req_history_max); + +static int +ptlrpc_lprocfs_threads_min_seq_show(struct seq_file *m, void *n) +{ + struct ptlrpc_service *svc = m->private; + + seq_printf(m, "%d\n", svc->srv_nthrs_cpt_init * svc->srv_ncpts); + return 0; +} + +static ssize_t +ptlrpc_lprocfs_threads_min_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct ptlrpc_service *svc = ((struct seq_file *)file->private_data)->private; + int val; + int rc = lprocfs_write_helper(buffer, count, &val); + + if (rc < 0) + return rc; + + if (val / svc->srv_ncpts < PTLRPC_NTHRS_INIT) + return -ERANGE; + + spin_lock(&svc->srv_lock); + if (val > svc->srv_nthrs_cpt_limit * svc->srv_ncpts) { + spin_unlock(&svc->srv_lock); + return -ERANGE; + } + + svc->srv_nthrs_cpt_init = val / svc->srv_ncpts; + + spin_unlock(&svc->srv_lock); + + return count; +} +LPROC_SEQ_FOPS(ptlrpc_lprocfs_threads_min); + +static int +ptlrpc_lprocfs_threads_started_seq_show(struct seq_file *m, void *n) +{ + struct ptlrpc_service *svc = m->private; + struct ptlrpc_service_part *svcpt; + int total = 0; + int i; + + ptlrpc_service_for_each_part(svcpt, i, svc) + total += svcpt->scp_nthrs_running; + + seq_printf(m, "%d\n", total); + return 0; +} +LPROC_SEQ_FOPS_RO(ptlrpc_lprocfs_threads_started); + +static int +ptlrpc_lprocfs_threads_max_seq_show(struct seq_file *m, void *n) +{ + struct ptlrpc_service *svc = m->private; + + seq_printf(m, "%d\n", svc->srv_nthrs_cpt_limit * svc->srv_ncpts); + return 0; +} + +static ssize_t +ptlrpc_lprocfs_threads_max_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct ptlrpc_service *svc = ((struct seq_file *)file->private_data)->private; + int val; + int rc = lprocfs_write_helper(buffer, count, &val); + + if (rc < 0) + return rc; + + if (val / svc->srv_ncpts < PTLRPC_NTHRS_INIT) + return -ERANGE; + + spin_lock(&svc->srv_lock); + if (val < svc->srv_nthrs_cpt_init * svc->srv_ncpts) { + spin_unlock(&svc->srv_lock); + return -ERANGE; + } + + svc->srv_nthrs_cpt_limit = val / svc->srv_ncpts; + + spin_unlock(&svc->srv_lock); + + return count; +} +LPROC_SEQ_FOPS(ptlrpc_lprocfs_threads_max); + +/** + * \addtogoup nrs + * @{ + */ +extern struct nrs_core nrs_core; + +/** + * Translates \e ptlrpc_nrs_pol_state values to human-readable strings. + * + * \param[in] state The policy state + */ +static const char *nrs_state2str(enum ptlrpc_nrs_pol_state state) +{ + switch (state) { + default: + LBUG(); + case NRS_POL_STATE_INVALID: + return "invalid"; + case NRS_POL_STATE_STOPPED: + return "stopped"; + case NRS_POL_STATE_STOPPING: + return "stopping"; + case NRS_POL_STATE_STARTING: + return "starting"; + case NRS_POL_STATE_STARTED: + return "started"; + } +} + +/** + * Obtains status information for \a policy. + * + * Information is copied in \a info. 
+ * + * \param[in] policy The policy + * \param[out] info Holds returned status information + */ +void nrs_policy_get_info_locked(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_pol_info *info) +{ + LASSERT(policy != NULL); + LASSERT(info != NULL); + assert_spin_locked(&policy->pol_nrs->nrs_lock); + + memcpy(info->pi_name, policy->pol_desc->pd_name, NRS_POL_NAME_MAX); + + info->pi_fallback = !!(policy->pol_flags & PTLRPC_NRS_FL_FALLBACK); + info->pi_state = policy->pol_state; + /** + * XXX: These are accessed without holding + * ptlrpc_service_part::scp_req_lock. + */ + info->pi_req_queued = policy->pol_req_queued; + info->pi_req_started = policy->pol_req_started; +} + +/** + * Reads and prints policy status information for all policies of a PTLRPC + * service. + */ +static int ptlrpc_lprocfs_nrs_seq_show(struct seq_file *m, void *n) +{ + struct ptlrpc_service *svc = m->private; + struct ptlrpc_service_part *svcpt; + struct ptlrpc_nrs *nrs; + struct ptlrpc_nrs_policy *policy; + struct ptlrpc_nrs_pol_info *infos; + struct ptlrpc_nrs_pol_info tmp; + unsigned num_pols; + unsigned pol_idx = 0; + bool hp = false; + int i; + int rc = 0; + + /** + * Serialize NRS core lprocfs operations with policy registration/ + * unregistration. + */ + mutex_lock(&nrs_core.nrs_mutex); + + /** + * Use the first service partition's regular NRS head in order to obtain + * the number of policies registered with NRS heads of this service. All + * service partitions will have the same number of policies. + */ + nrs = nrs_svcpt2nrs(svc->srv_parts[0], false); + + spin_lock(&nrs->nrs_lock); + num_pols = svc->srv_parts[0]->scp_nrs_reg.nrs_num_pols; + spin_unlock(&nrs->nrs_lock); + + OBD_ALLOC(infos, num_pols * sizeof(*infos)); + if (infos == NULL) { + rc = -ENOMEM; + goto out; + } +again: + + ptlrpc_service_for_each_part(svcpt, i, svc) { + nrs = nrs_svcpt2nrs(svcpt, hp); + spin_lock(&nrs->nrs_lock); + + pol_idx = 0; + + list_for_each_entry(policy, &nrs->nrs_policy_list, + pol_list) { + LASSERT(pol_idx < num_pols); + + nrs_policy_get_info_locked(policy, &tmp); + /** + * Copy values when handling the first service + * partition. + */ + if (i == 0) { + memcpy(infos[pol_idx].pi_name, tmp.pi_name, + NRS_POL_NAME_MAX); + memcpy(&infos[pol_idx].pi_state, &tmp.pi_state, + sizeof(tmp.pi_state)); + infos[pol_idx].pi_fallback = tmp.pi_fallback; + /** + * For the rest of the service partitions + * sanity-check the values we get. + */ + } else { + LASSERT(strncmp(infos[pol_idx].pi_name, + tmp.pi_name, + NRS_POL_NAME_MAX) == 0); + /** + * Not asserting ptlrpc_nrs_pol_info::pi_state, + * because it may be different between + * instances of the same policy in different + * service partitions. + */ + LASSERT(infos[pol_idx].pi_fallback == + tmp.pi_fallback); + } + + infos[pol_idx].pi_req_queued += tmp.pi_req_queued; + infos[pol_idx].pi_req_started += tmp.pi_req_started; + + pol_idx++; + } + spin_unlock(&nrs->nrs_lock); + } + + /** + * Policy status information output is in YAML format. + * For example: + * + * regular_requests: + * - name: fifo + * state: started + * fallback: yes + * queued: 0 + * active: 0 + * + * - name: crrn + * state: started + * fallback: no + * queued: 2015 + * active: 384 + * + * high_priority_requests: + * - name: fifo + * state: started + * fallback: yes + * queued: 0 + * active: 2 + * + * - name: crrn + * state: stopped + * fallback: no + * queued: 0 + * active: 0 + */ + seq_printf(m, "%s\n", + !hp ? 
"\nregular_requests:" : "high_priority_requests:"); + + for (pol_idx = 0; pol_idx < num_pols; pol_idx++) { + seq_printf(m, " - name: %s\n" + " state: %s\n" + " fallback: %s\n" + " queued: %-20d\n" + " active: %-20d\n\n", + infos[pol_idx].pi_name, + nrs_state2str(infos[pol_idx].pi_state), + infos[pol_idx].pi_fallback ? "yes" : "no", + (int)infos[pol_idx].pi_req_queued, + (int)infos[pol_idx].pi_req_started); + } + + if (!hp && nrs_svc_has_hp(svc)) { + memset(infos, 0, num_pols * sizeof(*infos)); + + /** + * Redo the processing for the service's HP NRS heads' policies. + */ + hp = true; + goto again; + } + +out: + if (infos) + OBD_FREE(infos, num_pols * sizeof(*infos)); + + mutex_unlock(&nrs_core.nrs_mutex); + + return rc; +} + +/** + * The longest valid command string is the maximum policy name size, plus the + * length of the " reg" substring + */ +#define LPROCFS_NRS_WR_MAX_CMD (NRS_POL_NAME_MAX + sizeof(" reg") - 1) + +/** + * Starts and stops a given policy on a PTLRPC service. + * + * Commands consist of the policy name, followed by an optional [reg|hp] token; + * if the optional token is omitted, the operation is performed on both the + * regular and high-priority (if the service has one) NRS head. + */ +static ssize_t ptlrpc_lprocfs_nrs_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct ptlrpc_service *svc = ((struct seq_file *)file->private_data)->private; + enum ptlrpc_nrs_queue_type queue = PTLRPC_NRS_QUEUE_BOTH; + char *cmd; + char *cmd_copy = NULL; + char *token; + int rc = 0; + + if (count >= LPROCFS_NRS_WR_MAX_CMD) { + rc = -EINVAL; + goto out; + } + + OBD_ALLOC(cmd, LPROCFS_NRS_WR_MAX_CMD); + if (cmd == NULL) { + rc = -ENOMEM; + goto out; + } + /** + * strsep() modifies its argument, so keep a copy + */ + cmd_copy = cmd; + + if (copy_from_user(cmd, buffer, count)) { + rc = -EFAULT; + goto out; + } + + cmd[count] = '\0'; + + token = strsep(&cmd, " "); + + if (strlen(token) > NRS_POL_NAME_MAX - 1) { + rc = -EINVAL; + goto out; + } + + /** + * No [reg|hp] token has been specified + */ + if (cmd == NULL) + goto default_queue; + + /** + * The second token is either NULL, or an optional [reg|hp] string + */ + if (strcmp(cmd, "reg") == 0) + queue = PTLRPC_NRS_QUEUE_REG; + else if (strcmp(cmd, "hp") == 0) + queue = PTLRPC_NRS_QUEUE_HP; + else { + rc = -EINVAL; + goto out; + } + +default_queue: + + if (queue == PTLRPC_NRS_QUEUE_HP && !nrs_svc_has_hp(svc)) { + rc = -ENODEV; + goto out; + } else if (queue == PTLRPC_NRS_QUEUE_BOTH && !nrs_svc_has_hp(svc)) + queue = PTLRPC_NRS_QUEUE_REG; + + /** + * Serialize NRS core lprocfs operations with policy registration/ + * unregistration. + */ + mutex_lock(&nrs_core.nrs_mutex); + + rc = ptlrpc_nrs_policy_control(svc, queue, token, PTLRPC_NRS_CTL_START, + false, NULL); + + mutex_unlock(&nrs_core.nrs_mutex); +out: + if (cmd_copy) + OBD_FREE(cmd_copy, LPROCFS_NRS_WR_MAX_CMD); + + return rc < 0 ? rc : count; +} +LPROC_SEQ_FOPS(ptlrpc_lprocfs_nrs); + +/** @} nrs */ + +struct ptlrpc_srh_iterator { + int srhi_idx; + __u64 srhi_seq; + struct ptlrpc_request *srhi_req; +}; + +static int +ptlrpc_lprocfs_svc_req_history_seek(struct ptlrpc_service_part *svcpt, + struct ptlrpc_srh_iterator *srhi, + __u64 seq) +{ + struct list_head *e; + struct ptlrpc_request *req; + + if (srhi->srhi_req != NULL && + srhi->srhi_seq > svcpt->scp_hist_seq_culled && + srhi->srhi_seq <= seq) { + /* If srhi_req was set previously, hasn't been culled and + * we're searching for a seq on or after it (i.e. 
more + * recent), search from it onwards. + * Since the service history is LRU (i.e. culled reqs will + * be near the head), we shouldn't have to do long + * re-scans */ + LASSERTF(srhi->srhi_seq == srhi->srhi_req->rq_history_seq, + "%s:%d: seek seq %llu, request seq %llu\n", + svcpt->scp_service->srv_name, svcpt->scp_cpt, + srhi->srhi_seq, srhi->srhi_req->rq_history_seq); + LASSERTF(!list_empty(&svcpt->scp_hist_reqs), + "%s:%d: seek offset %llu, request seq %llu, last culled %llu\n", + svcpt->scp_service->srv_name, svcpt->scp_cpt, + seq, srhi->srhi_seq, svcpt->scp_hist_seq_culled); + e = &srhi->srhi_req->rq_history_list; + } else { + /* search from start */ + e = svcpt->scp_hist_reqs.next; + } + + while (e != &svcpt->scp_hist_reqs) { + req = list_entry(e, struct ptlrpc_request, rq_history_list); + + if (req->rq_history_seq >= seq) { + srhi->srhi_seq = req->rq_history_seq; + srhi->srhi_req = req; + return 0; + } + e = e->next; + } + + return -ENOENT; +} + +/* + * ptlrpc history sequence is used as "position" of seq_file, in some case, + * seq_read() will increase "position" to indicate reading the next + * element, however, low bits of history sequence are reserved for CPT id + * (check the details from comments before ptlrpc_req_add_history), which + * means seq_read() might change CPT id of history sequence and never + * finish reading of requests on a CPT. To make it work, we have to shift + * CPT id to high bits and timestamp to low bits, so seq_read() will only + * increase timestamp which can correctly indicate the next position. + */ + +/* convert seq_file pos to cpt */ +#define PTLRPC_REQ_POS2CPT(svc, pos) \ + ((svc)->srv_cpt_bits == 0 ? 0 : \ + (__u64)(pos) >> (64 - (svc)->srv_cpt_bits)) + +/* make up seq_file pos from cpt */ +#define PTLRPC_REQ_CPT2POS(svc, cpt) \ + ((svc)->srv_cpt_bits == 0 ? 0 : \ + (cpt) << (64 - (svc)->srv_cpt_bits)) + +/* convert sequence to position */ +#define PTLRPC_REQ_SEQ2POS(svc, seq) \ + ((svc)->srv_cpt_bits == 0 ? (seq) : \ + ((seq) >> (svc)->srv_cpt_bits) | \ + ((seq) << (64 - (svc)->srv_cpt_bits))) + +/* convert position to sequence */ +#define PTLRPC_REQ_POS2SEQ(svc, pos) \ + ((svc)->srv_cpt_bits == 0 ? 
(pos) : \ + ((__u64)(pos) << (svc)->srv_cpt_bits) | \ + ((__u64)(pos) >> (64 - (svc)->srv_cpt_bits))) + +static void * +ptlrpc_lprocfs_svc_req_history_start(struct seq_file *s, loff_t *pos) +{ + struct ptlrpc_service *svc = s->private; + struct ptlrpc_service_part *svcpt; + struct ptlrpc_srh_iterator *srhi; + unsigned int cpt; + int rc; + int i; + + if (sizeof(loff_t) != sizeof(__u64)) { /* can't support */ + CWARN("Failed to read request history because size of loff_t %d can't match size of u64\n", + (int)sizeof(loff_t)); + return NULL; + } + + OBD_ALLOC(srhi, sizeof(*srhi)); + if (srhi == NULL) + return NULL; + + srhi->srhi_seq = 0; + srhi->srhi_req = NULL; + + cpt = PTLRPC_REQ_POS2CPT(svc, *pos); + + ptlrpc_service_for_each_part(svcpt, i, svc) { + if (i < cpt) /* skip */ + continue; + if (i > cpt) /* make up the lowest position for this CPT */ + *pos = PTLRPC_REQ_CPT2POS(svc, i); + + spin_lock(&svcpt->scp_lock); + rc = ptlrpc_lprocfs_svc_req_history_seek(svcpt, srhi, + PTLRPC_REQ_POS2SEQ(svc, *pos)); + spin_unlock(&svcpt->scp_lock); + if (rc == 0) { + *pos = PTLRPC_REQ_SEQ2POS(svc, srhi->srhi_seq); + srhi->srhi_idx = i; + return srhi; + } + } + + OBD_FREE(srhi, sizeof(*srhi)); + return NULL; +} + +static void +ptlrpc_lprocfs_svc_req_history_stop(struct seq_file *s, void *iter) +{ + struct ptlrpc_srh_iterator *srhi = iter; + + if (srhi != NULL) + OBD_FREE(srhi, sizeof(*srhi)); +} + +static void * +ptlrpc_lprocfs_svc_req_history_next(struct seq_file *s, + void *iter, loff_t *pos) +{ + struct ptlrpc_service *svc = s->private; + struct ptlrpc_srh_iterator *srhi = iter; + struct ptlrpc_service_part *svcpt; + __u64 seq; + int rc; + int i; + + for (i = srhi->srhi_idx; i < svc->srv_ncpts; i++) { + svcpt = svc->srv_parts[i]; + + if (i > srhi->srhi_idx) { /* reset iterator for a new CPT */ + srhi->srhi_req = NULL; + seq = srhi->srhi_seq = 0; + } else { /* the next sequence */ + seq = srhi->srhi_seq + (1 << svc->srv_cpt_bits); + } + + spin_lock(&svcpt->scp_lock); + rc = ptlrpc_lprocfs_svc_req_history_seek(svcpt, srhi, seq); + spin_unlock(&svcpt->scp_lock); + if (rc == 0) { + *pos = PTLRPC_REQ_SEQ2POS(svc, srhi->srhi_seq); + srhi->srhi_idx = i; + return srhi; + } + } + + OBD_FREE(srhi, sizeof(*srhi)); + return NULL; +} + +/* common ost/mdt so_req_printer */ +void target_print_req(void *seq_file, struct ptlrpc_request *req) +{ + /* Called holding srv_lock with irqs disabled. + * Print specific req contents and a newline. + * CAVEAT EMPTOR: check request message length before printing!!! + * You might have received any old crap so you must be just as + * careful here as the service's request parser!!! 
*/ + struct seq_file *sf = seq_file; + + switch (req->rq_phase) { + case RQ_PHASE_NEW: + /* still awaiting a service thread's attention, or rejected + * because the generic request message didn't unpack */ + seq_printf(sf, "\n"); + break; + case RQ_PHASE_INTERPRET: + /* being handled, so basic msg swabbed, and opc is valid + * but racing with mds_handle() */ + case RQ_PHASE_COMPLETE: + /* been handled by mds_handle() reply state possibly still + * volatile */ + seq_printf(sf, "opc %d\n", lustre_msg_get_opc(req->rq_reqmsg)); + break; + default: + DEBUG_REQ(D_ERROR, req, "bad phase %d", req->rq_phase); + } +} +EXPORT_SYMBOL(target_print_req); + +static int ptlrpc_lprocfs_svc_req_history_show(struct seq_file *s, void *iter) +{ + struct ptlrpc_service *svc = s->private; + struct ptlrpc_srh_iterator *srhi = iter; + struct ptlrpc_service_part *svcpt; + struct ptlrpc_request *req; + int rc; + + LASSERT(srhi->srhi_idx < svc->srv_ncpts); + + svcpt = svc->srv_parts[srhi->srhi_idx]; + + spin_lock(&svcpt->scp_lock); + + rc = ptlrpc_lprocfs_svc_req_history_seek(svcpt, srhi, srhi->srhi_seq); + + if (rc == 0) { + req = srhi->srhi_req; + + /* Print common req fields. + * CAVEAT EMPTOR: we're racing with the service handler + * here. The request could contain any old crap, so you + * must be just as careful as the service's request + * parser. Currently I only print stuff here I know is OK + * to look at coz it was set up in request_in_callback()!!! */ + seq_printf(s, "%lld:%s:%s:x%llu:%d:%s:%ld:%lds(%+lds) ", + req->rq_history_seq, libcfs_nid2str(req->rq_self), + libcfs_id2str(req->rq_peer), req->rq_xid, + req->rq_reqlen, ptlrpc_rqphase2str(req), + req->rq_arrival_time.tv_sec, + req->rq_sent - req->rq_arrival_time.tv_sec, + req->rq_sent - req->rq_deadline); + if (svc->srv_ops.so_req_printer == NULL) + seq_printf(s, "\n"); + else + svc->srv_ops.so_req_printer(s, srhi->srhi_req); + } + + spin_unlock(&svcpt->scp_lock); + return rc; +} + +static int +ptlrpc_lprocfs_svc_req_history_open(struct inode *inode, struct file *file) +{ + static struct seq_operations sops = { + .start = ptlrpc_lprocfs_svc_req_history_start, + .stop = ptlrpc_lprocfs_svc_req_history_stop, + .next = ptlrpc_lprocfs_svc_req_history_next, + .show = ptlrpc_lprocfs_svc_req_history_show, + }; + struct seq_file *seqf; + int rc; + + rc = seq_open(file, &sops); + if (rc) + return rc; + + seqf = file->private_data; + seqf->private = PDE_DATA(inode); + return 0; +} + +/* See also lprocfs_rd_timeouts */ +static int ptlrpc_lprocfs_timeouts_seq_show(struct seq_file *m, void *n) +{ + struct ptlrpc_service *svc = m->private; + struct ptlrpc_service_part *svcpt; + struct dhms ts; + time_t worstt; + unsigned int cur; + unsigned int worst; + int i; + + if (AT_OFF) { + seq_printf(m, "adaptive timeouts off, using obd_timeout %u\n", + obd_timeout); + return 0; + } + + ptlrpc_service_for_each_part(svcpt, i, svc) { + cur = at_get(&svcpt->scp_at_estimate); + worst = svcpt->scp_at_estimate.at_worst_ever; + worstt = svcpt->scp_at_estimate.at_worst_time; + s2dhms(&ts, get_seconds() - worstt); + + seq_printf(m, "%10s : cur %3u worst %3u (at %ld, " + DHMS_FMT" ago) ", "service", + cur, worst, worstt, DHMS_VARS(&ts)); + + lprocfs_at_hist_helper(m, &svcpt->scp_at_estimate); + } + + return 0; +} +LPROC_SEQ_FOPS_RO(ptlrpc_lprocfs_timeouts); + +static int ptlrpc_lprocfs_hp_ratio_seq_show(struct seq_file *m, void *v) +{ + struct ptlrpc_service *svc = m->private; + seq_printf(m, "%d", svc->srv_hpreq_ratio); + return 0; +} + +static ssize_t 
ptlrpc_lprocfs_hp_ratio_seq_write(struct file *file, + const char __user *buffer, + size_t count, + loff_t *off) +{ + struct ptlrpc_service *svc = ((struct seq_file *)file->private_data)->private; + int rc; + int val; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc < 0) + return rc; + + if (val < 0) + return -ERANGE; + + spin_lock(&svc->srv_lock); + svc->srv_hpreq_ratio = val; + spin_unlock(&svc->srv_lock); + + return count; +} +LPROC_SEQ_FOPS(ptlrpc_lprocfs_hp_ratio); + +void ptlrpc_lprocfs_register_service(struct proc_dir_entry *entry, + struct ptlrpc_service *svc) +{ + struct lprocfs_vars lproc_vars[] = { + {.name = "high_priority_ratio", + .fops = &ptlrpc_lprocfs_hp_ratio_fops, + .data = svc}, + {.name = "req_buffer_history_len", + .fops = &ptlrpc_lprocfs_req_history_len_fops, + .data = svc}, + {.name = "req_buffer_history_max", + .fops = &ptlrpc_lprocfs_req_history_max_fops, + .data = svc}, + {.name = "threads_min", + .fops = &ptlrpc_lprocfs_threads_min_fops, + .data = svc}, + {.name = "threads_max", + .fops = &ptlrpc_lprocfs_threads_max_fops, + .data = svc}, + {.name = "threads_started", + .fops = &ptlrpc_lprocfs_threads_started_fops, + .data = svc}, + {.name = "timeouts", + .fops = &ptlrpc_lprocfs_timeouts_fops, + .data = svc}, + {.name = "nrs_policies", + .fops = &ptlrpc_lprocfs_nrs_fops, + .data = svc}, + {NULL} + }; + static const struct file_operations req_history_fops = { + .owner = THIS_MODULE, + .open = ptlrpc_lprocfs_svc_req_history_open, + .read = seq_read, + .llseek = seq_lseek, + .release = lprocfs_seq_release, + }; + + int rc; + + ptlrpc_lprocfs_register(entry, svc->srv_name, + "stats", &svc->srv_procroot, + &svc->srv_stats); + + if (svc->srv_procroot == NULL) + return; + + lprocfs_add_vars(svc->srv_procroot, lproc_vars, NULL); + + rc = lprocfs_seq_create(svc->srv_procroot, "req_history", + 0400, &req_history_fops, svc); + if (rc) + CWARN("Error adding the req_history file\n"); +} + +void ptlrpc_lprocfs_register_obd(struct obd_device *obddev) +{ + ptlrpc_lprocfs_register(obddev->obd_proc_entry, NULL, "stats", + &obddev->obd_svc_procroot, + &obddev->obd_svc_stats); +} +EXPORT_SYMBOL(ptlrpc_lprocfs_register_obd); + +void ptlrpc_lprocfs_rpc_sent(struct ptlrpc_request *req, long amount) +{ + struct lprocfs_stats *svc_stats; + __u32 op = lustre_msg_get_opc(req->rq_reqmsg); + int opc = opcode_offset(op); + + svc_stats = req->rq_import->imp_obd->obd_svc_stats; + if (svc_stats == NULL || opc <= 0) + return; + LASSERT(opc < LUSTRE_MAX_OPCODES); + if (!(op == LDLM_ENQUEUE || op == MDS_REINT)) + lprocfs_counter_add(svc_stats, opc + EXTRA_MAX_OPCODES, amount); +} + +void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int bytes) +{ + struct lprocfs_stats *svc_stats; + int idx; + + if (!req->rq_import) + return; + svc_stats = req->rq_import->imp_obd->obd_svc_stats; + if (!svc_stats) + return; + idx = lustre_msg_get_opc(req->rq_reqmsg); + switch (idx) { + case OST_READ: + idx = BRW_READ_BYTES + PTLRPC_LAST_CNTR; + break; + case OST_WRITE: + idx = BRW_WRITE_BYTES + PTLRPC_LAST_CNTR; + break; + default: + LASSERTF(0, "unsupported opcode %u\n", idx); + break; + } + + lprocfs_counter_add(svc_stats, idx, bytes); +} + +EXPORT_SYMBOL(ptlrpc_lprocfs_brw); + +void ptlrpc_lprocfs_unregister_service(struct ptlrpc_service *svc) +{ + if (svc->srv_procroot != NULL) + lprocfs_remove(&svc->srv_procroot); + + if (svc->srv_stats) + lprocfs_free_stats(&svc->srv_stats); +} + +void ptlrpc_lprocfs_unregister_obd(struct obd_device *obd) +{ + if (obd->obd_svc_procroot) + 
lprocfs_remove(&obd->obd_svc_procroot); + + if (obd->obd_svc_stats) + lprocfs_free_stats(&obd->obd_svc_stats); +} +EXPORT_SYMBOL(ptlrpc_lprocfs_unregister_obd); + + +#define BUFLEN (UUID_MAX + 5) + +int lprocfs_wr_evict_client(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *obd = ((struct seq_file *)file->private_data)->private; + char *kbuf; + char *tmpbuf; + + OBD_ALLOC(kbuf, BUFLEN); + if (kbuf == NULL) + return -ENOMEM; + + /* + * OBD_ALLOC() will zero kbuf, but we only copy BUFLEN - 1 + * bytes into kbuf, to ensure that the string is NUL-terminated. + * UUID_MAX should include a trailing NUL already. + */ + if (copy_from_user(kbuf, buffer, + min_t(unsigned long, BUFLEN - 1, count))) { + count = -EFAULT; + goto out; + } + tmpbuf = cfs_firststr(kbuf, min_t(unsigned long, BUFLEN - 1, count)); + /* Kludge code(deadlock situation): the lprocfs lock has been held + * since the client is evicted by writing client's + * uuid/nid to procfs "evict_client" entry. However, + * obd_export_evict_by_uuid() will call lprocfs_remove() to destroy + * the proc entries under the being destroyed export{}, so I have + * to drop the lock at first here. + * - jay, jxiong@clusterfs.com */ + class_incref(obd, __func__, current); + + if (strncmp(tmpbuf, "nid:", 4) == 0) + obd_export_evict_by_nid(obd, tmpbuf + 4); + else if (strncmp(tmpbuf, "uuid:", 5) == 0) + obd_export_evict_by_uuid(obd, tmpbuf + 5); + else + obd_export_evict_by_uuid(obd, tmpbuf); + + class_decref(obd, __func__, current); + +out: + OBD_FREE(kbuf, BUFLEN); + return count; +} +EXPORT_SYMBOL(lprocfs_wr_evict_client); + +#undef BUFLEN + +int lprocfs_wr_ping(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *obd = ((struct seq_file *)file->private_data)->private; + struct ptlrpc_request *req; + int rc; + + LPROCFS_CLIMP_CHECK(obd); + req = ptlrpc_prep_ping(obd->u.cli.cl_import); + LPROCFS_CLIMP_EXIT(obd); + if (req == NULL) + return -ENOMEM; + + req->rq_send_state = LUSTRE_IMP_FULL; + + rc = ptlrpc_queue_wait(req); + + ptlrpc_req_finished(req); + if (rc >= 0) + return count; + return rc; +} +EXPORT_SYMBOL(lprocfs_wr_ping); + +/* Write the connection UUID to this file to attempt to connect to that node. + * The connection UUID is a node's primary NID. For example, + * "echo connection=192.168.0.1@tcp0::instance > .../import". 
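+ *
+ * A hedged sketch of the two accepted forms (paths and values below
+ * are illustrative only):
+ *
+ *   echo "connection=UUID" > .../import
+ *       always calls ptlrpc_recover_import() for that UUID
+ *   echo "connection=UUID::INSTANCE" > .../import
+ *       only reconnects when INSTANCE parses and differs from the
+ *       import's current ocd_instance (see the "::" handling below)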
+ */ +int lprocfs_wr_import(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *obd = ((struct seq_file *)file->private_data)->private; + struct obd_import *imp = obd->u.cli.cl_import; + char *kbuf = NULL; + char *uuid; + char *ptr; + int do_reconn = 1; + const char prefix[] = "connection="; + const int prefix_len = sizeof(prefix) - 1; + + if (count > PAGE_CACHE_SIZE - 1 || count <= prefix_len) + return -EINVAL; + + OBD_ALLOC(kbuf, count + 1); + if (kbuf == NULL) + return -ENOMEM; + + if (copy_from_user(kbuf, buffer, count)) { + count = -EFAULT; + goto out; + } + + kbuf[count] = 0; + + /* only support connection=uuid::instance now */ + if (strncmp(prefix, kbuf, prefix_len) != 0) { + count = -EINVAL; + goto out; + } + + uuid = kbuf + prefix_len; + ptr = strstr(uuid, "::"); + if (ptr) { + __u32 inst; + char *endptr; + + *ptr = 0; + do_reconn = 0; + ptr += strlen("::"); + inst = simple_strtol(ptr, &endptr, 10); + if (*endptr) { + CERROR("config: wrong instance # %s\n", ptr); + } else if (inst != imp->imp_connect_data.ocd_instance) { + CDEBUG(D_INFO, "IR: %s is connecting to an obsoleted target(%u/%u), reconnecting...\n", + imp->imp_obd->obd_name, + imp->imp_connect_data.ocd_instance, inst); + do_reconn = 1; + } else { + CDEBUG(D_INFO, "IR: %s has already been connecting to new target(%u)\n", + imp->imp_obd->obd_name, inst); + } + } + + if (do_reconn) + ptlrpc_recover_import(imp, uuid, 1); + +out: + OBD_FREE(kbuf, count + 1); + return count; +} +EXPORT_SYMBOL(lprocfs_wr_import); + +int lprocfs_rd_pinger_recov(struct seq_file *m, void *n) +{ + struct obd_device *obd = m->private; + struct obd_import *imp = obd->u.cli.cl_import; + + LPROCFS_CLIMP_CHECK(obd); + seq_printf(m, "%d\n", !imp->imp_no_pinger_recover); + LPROCFS_CLIMP_EXIT(obd); + + return 0; +} +EXPORT_SYMBOL(lprocfs_rd_pinger_recov); + +int lprocfs_wr_pinger_recov(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *obd = ((struct seq_file *)file->private_data)->private; + struct client_obd *cli = &obd->u.cli; + struct obd_import *imp = cli->cl_import; + int rc, val; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc < 0) + return rc; + + if (val != 0 && val != 1) + return -ERANGE; + + LPROCFS_CLIMP_CHECK(obd); + spin_lock(&imp->imp_lock); + imp->imp_no_pinger_recover = !val; + spin_unlock(&imp->imp_lock); + LPROCFS_CLIMP_EXIT(obd); + + return count; + +} +EXPORT_SYMBOL(lprocfs_wr_pinger_recov); + +#endif /* CONFIG_PROC_FS */ diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/niobuf.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/niobuf.c new file mode 100644 index 000000000..2fa258558 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/niobuf.c @@ -0,0 +1,731 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_RPC +#include "../include/obd_support.h" +#include "../include/lustre_net.h" +#include "../include/lustre_lib.h" +#include "../include/obd.h" +#include "../include/obd_class.h" +#include "ptlrpc_internal.h" + +/** + * Helper function. Sends \a len bytes from \a base at offset \a offset + * over \a conn connection to portal \a portal. + * Returns 0 on success or error code. + */ +static int ptl_send_buf(lnet_handle_md_t *mdh, void *base, int len, + lnet_ack_req_t ack, struct ptlrpc_cb_id *cbid, + struct ptlrpc_connection *conn, int portal, __u64 xid, + unsigned int offset) +{ + int rc; + lnet_md_t md; + + LASSERT(portal != 0); + LASSERT(conn != NULL); + CDEBUG(D_INFO, "conn=%p id %s\n", conn, libcfs_id2str(conn->c_peer)); + md.start = base; + md.length = len; + md.threshold = (ack == LNET_ACK_REQ) ? 2 : 1; + md.options = PTLRPC_MD_OPTIONS; + md.user_ptr = cbid; + md.eq_handle = ptlrpc_eq_h; + + if (unlikely(ack == LNET_ACK_REQ && + OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_ACK, + OBD_FAIL_ONCE))) { + /* don't ask for the ack to simulate failing client */ + ack = LNET_NOACK_REQ; + } + + rc = LNetMDBind(md, LNET_UNLINK, mdh); + if (unlikely(rc != 0)) { + CERROR("LNetMDBind failed: %d\n", rc); + LASSERT(rc == -ENOMEM); + return -ENOMEM; + } + + CDEBUG(D_NET, "Sending %d bytes to portal %d, xid %lld, offset %u\n", + len, portal, xid, offset); + + rc = LNetPut(conn->c_self, *mdh, ack, + conn->c_peer, portal, xid, offset, 0); + if (unlikely(rc != 0)) { + int rc2; + /* We're going to get an UNLINK event when I unlink below, + * which will complete just like any other failed send, so + * I fall through and return success here! */ + CERROR("LNetPut(%s, %d, %lld) failed: %d\n", + libcfs_id2str(conn->c_peer), portal, xid, rc); + rc2 = LNetMDUnlink(*mdh); + LASSERTF(rc2 == 0, "rc2 = %d\n", rc2); + } + + return 0; +} + +static void mdunlink_iterate_helper(lnet_handle_md_t *bd_mds, int count) +{ + int i; + + for (i = 0; i < count; i++) + LNetMDUnlink(bd_mds[i]); +} + + +/** + * Register bulk at the sender for later transfer. + * Returns 0 on success or error code. 
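+ *
+ * Match-bit layout, sketched with hypothetical numbers: if rq_xid is
+ * 0x1005 and bd_md_max_brw is 8, the first bulk MD is posted at xid
+ * 0x1000 (rq_xid rounded down to a bd_md_max_brw boundary), further
+ * MDs at 0x1001, 0x1002, ..., and rq_xid is finally reset to the xid
+ * of the last MD actually posted (see the comments in the body below).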
+ */ +int ptlrpc_register_bulk(struct ptlrpc_request *req) +{ + struct ptlrpc_bulk_desc *desc = req->rq_bulk; + lnet_process_id_t peer; + int rc = 0; + int rc2; + int posted_md; + int total_md; + __u64 xid; + lnet_handle_me_t me_h; + lnet_md_t md; + + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_BULK_GET_NET)) + return 0; + + /* NB no locking required until desc is on the network */ + LASSERT(desc->bd_nob > 0); + LASSERT(desc->bd_md_count == 0); + LASSERT(desc->bd_md_max_brw <= PTLRPC_BULK_OPS_COUNT); + LASSERT(desc->bd_iov_count <= PTLRPC_MAX_BRW_PAGES); + LASSERT(desc->bd_req != NULL); + LASSERT(desc->bd_type == BULK_PUT_SINK || + desc->bd_type == BULK_GET_SOURCE); + + /* cleanup the state of the bulk for it will be reused */ + if (req->rq_resend || req->rq_send_state == LUSTRE_IMP_REPLAY) + desc->bd_nob_transferred = 0; + else + LASSERT(desc->bd_nob_transferred == 0); + + desc->bd_failure = 0; + + peer = desc->bd_import->imp_connection->c_peer; + + LASSERT(desc->bd_cbid.cbid_fn == client_bulk_callback); + LASSERT(desc->bd_cbid.cbid_arg == desc); + + /* An XID is only used for a single request from the client. + * For retried bulk transfers, a new XID will be allocated in + * in ptlrpc_check_set() if it needs to be resent, so it is not + * using the same RDMA match bits after an error. + * + * For multi-bulk RPCs, rq_xid is the last XID needed for bulks. The + * first bulk XID is power-of-two aligned before rq_xid. LU-1431 */ + xid = req->rq_xid & ~((__u64)desc->bd_md_max_brw - 1); + LASSERTF(!(desc->bd_registered && + req->rq_send_state != LUSTRE_IMP_REPLAY) || + xid != desc->bd_last_xid, + "registered: %d rq_xid: %llu bd_last_xid: %llu\n", + desc->bd_registered, xid, desc->bd_last_xid); + + total_md = (desc->bd_iov_count + LNET_MAX_IOV - 1) / LNET_MAX_IOV; + desc->bd_registered = 1; + desc->bd_last_xid = xid; + desc->bd_md_count = total_md; + md.user_ptr = &desc->bd_cbid; + md.eq_handle = ptlrpc_eq_h; + md.threshold = 1; /* PUT or GET */ + + for (posted_md = 0; posted_md < total_md; posted_md++, xid++) { + md.options = PTLRPC_MD_OPTIONS | + ((desc->bd_type == BULK_GET_SOURCE) ? + LNET_MD_OP_GET : LNET_MD_OP_PUT); + ptlrpc_fill_bulk_md(&md, desc, posted_md); + + rc = LNetMEAttach(desc->bd_portal, peer, xid, 0, + LNET_UNLINK, LNET_INS_AFTER, &me_h); + if (rc != 0) { + CERROR("%s: LNetMEAttach failed x%llu/%d: rc = %d\n", + desc->bd_import->imp_obd->obd_name, xid, + posted_md, rc); + break; + } + + /* About to let the network at it... 
*/ + rc = LNetMDAttach(me_h, md, LNET_UNLINK, + &desc->bd_mds[posted_md]); + if (rc != 0) { + CERROR("%s: LNetMDAttach failed x%llu/%d: rc = %d\n", + desc->bd_import->imp_obd->obd_name, xid, + posted_md, rc); + rc2 = LNetMEUnlink(me_h); + LASSERT(rc2 == 0); + break; + } + } + + if (rc != 0) { + LASSERT(rc == -ENOMEM); + spin_lock(&desc->bd_lock); + desc->bd_md_count -= total_md - posted_md; + spin_unlock(&desc->bd_lock); + LASSERT(desc->bd_md_count >= 0); + mdunlink_iterate_helper(desc->bd_mds, desc->bd_md_max_brw); + req->rq_status = -ENOMEM; + return -ENOMEM; + } + + /* Set rq_xid to matchbits of the final bulk so that server can + * infer the number of bulks that were prepared */ + req->rq_xid = --xid; + LASSERTF(desc->bd_last_xid == (req->rq_xid & PTLRPC_BULK_OPS_MASK), + "bd_last_xid = x%llu, rq_xid = x%llu\n", + desc->bd_last_xid, req->rq_xid); + + spin_lock(&desc->bd_lock); + /* Holler if peer manages to touch buffers before he knows the xid */ + if (desc->bd_md_count != total_md) + CWARN("%s: Peer %s touched %d buffers while I registered\n", + desc->bd_import->imp_obd->obd_name, libcfs_id2str(peer), + total_md - desc->bd_md_count); + spin_unlock(&desc->bd_lock); + + CDEBUG(D_NET, "Setup %u bulk %s buffers: %u pages %u bytes, xid x%#llx-%#llx, portal %u\n", + desc->bd_md_count, + desc->bd_type == BULK_GET_SOURCE ? "get-source" : "put-sink", + desc->bd_iov_count, desc->bd_nob, + desc->bd_last_xid, req->rq_xid, desc->bd_portal); + + return 0; +} +EXPORT_SYMBOL(ptlrpc_register_bulk); + +/** + * Disconnect a bulk desc from the network. Idempotent. Not + * thread-safe (i.e. only interlocks with completion callback). + * Returns 1 on success or 0 if network unregistration failed for whatever + * reason. + */ +int ptlrpc_unregister_bulk(struct ptlrpc_request *req, int async) +{ + struct ptlrpc_bulk_desc *desc = req->rq_bulk; + wait_queue_head_t *wq; + struct l_wait_info lwi; + int rc; + + LASSERT(!in_interrupt()); /* might sleep */ + + /* Let's setup deadline for reply unlink. */ + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK) && + async && req->rq_bulk_deadline == 0) + req->rq_bulk_deadline = get_seconds() + LONG_UNLINK; + + if (ptlrpc_client_bulk_active(req) == 0) /* completed or */ + return 1; /* never registered */ + + LASSERT(desc->bd_req == req); /* bd_req NULL until registered */ + + /* the unlink ensures the callback happens ASAP and is the last + * one. If it fails, it must be because completion just happened, + * but we must still l_wait_event() in this case to give liblustre + * a chance to run client_bulk_callback() */ + mdunlink_iterate_helper(desc->bd_mds, desc->bd_md_max_brw); + + if (ptlrpc_client_bulk_active(req) == 0) /* completed or */ + return 1; /* never registered */ + + /* Move to "Unregistering" phase as bulk was not unlinked yet. */ + ptlrpc_rqphase_move(req, RQ_PHASE_UNREGISTERING); + + /* Do not wait for unlink to finish. 
*/ + if (async) + return 0; + + if (req->rq_set != NULL) + wq = &req->rq_set->set_waitq; + else + wq = &req->rq_reply_waitq; + + for (;;) { + /* Network access will complete in finite time but the HUGE + * timeout lets us CWARN for visibility of sluggish NALs */ + lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(LONG_UNLINK), + cfs_time_seconds(1), NULL, NULL); + rc = l_wait_event(*wq, !ptlrpc_client_bulk_active(req), &lwi); + if (rc == 0) { + ptlrpc_rqphase_move(req, req->rq_next_phase); + return 1; + } + + LASSERT(rc == -ETIMEDOUT); + DEBUG_REQ(D_WARNING, req, "Unexpectedly long timeout: desc %p", + desc); + } + return 0; +} +EXPORT_SYMBOL(ptlrpc_unregister_bulk); + +static void ptlrpc_at_set_reply(struct ptlrpc_request *req, int flags) +{ + struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt; + struct ptlrpc_service *svc = svcpt->scp_service; + int service_time = max_t(int, get_seconds() - + req->rq_arrival_time.tv_sec, 1); + + if (!(flags & PTLRPC_REPLY_EARLY) && + (req->rq_type != PTL_RPC_MSG_ERR) && + (req->rq_reqmsg != NULL) && + !(lustre_msg_get_flags(req->rq_reqmsg) & + (MSG_RESENT | MSG_REPLAY | + MSG_REQ_REPLAY_DONE | MSG_LOCK_REPLAY_DONE))) { + /* early replies, errors and recovery requests don't count + * toward our service time estimate */ + int oldse = at_measured(&svcpt->scp_at_estimate, service_time); + + if (oldse != 0) { + DEBUG_REQ(D_ADAPTTO, req, + "svc %s changed estimate from %d to %d", + svc->srv_name, oldse, + at_get(&svcpt->scp_at_estimate)); + } + } + /* Report actual service time for client latency calc */ + lustre_msg_set_service_time(req->rq_repmsg, service_time); + /* Report service time estimate for future client reqs, but report 0 + * (to be ignored by client) if it's a error reply during recovery. + * (bz15815) */ + if (req->rq_type == PTL_RPC_MSG_ERR && + (req->rq_export == NULL || req->rq_export->exp_obd->obd_recovering)) + lustre_msg_set_timeout(req->rq_repmsg, 0); + else + lustre_msg_set_timeout(req->rq_repmsg, + at_get(&svcpt->scp_at_estimate)); + + if (req->rq_reqmsg && + !(lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)) { + CDEBUG(D_ADAPTTO, "No early reply support: flags=%#x req_flags=%#x magic=%d:%x/%x len=%d\n", + flags, lustre_msg_get_flags(req->rq_reqmsg), + lustre_msg_is_v1(req->rq_reqmsg), + lustre_msg_get_magic(req->rq_reqmsg), + lustre_msg_get_magic(req->rq_repmsg), req->rq_replen); + } +} + +/** + * Send request reply from request \a req reply buffer. + * \a flags defines reply types + * Returns 0 on success or error code + */ +int ptlrpc_send_reply(struct ptlrpc_request *req, int flags) +{ + struct ptlrpc_reply_state *rs = req->rq_reply_state; + struct ptlrpc_connection *conn; + int rc; + + /* We must already have a reply buffer (only ptlrpc_error() may be + * called without one). The reply generated by sptlrpc layer (e.g. + * error notify, etc.) might have NULL rq->reqmsg; Otherwise we must + * have a request buffer which is either the actual (swabbed) incoming + * request, or a saved copy if this is a req saved in + * target_queue_final_reply(). 
+ */ + LASSERT(req->rq_no_reply == 0); + LASSERT(req->rq_reqbuf != NULL); + LASSERT(rs != NULL); + LASSERT((flags & PTLRPC_REPLY_MAYBE_DIFFICULT) || !rs->rs_difficult); + LASSERT(req->rq_repmsg != NULL); + LASSERT(req->rq_repmsg == rs->rs_msg); + LASSERT(rs->rs_cb_id.cbid_fn == reply_out_callback); + LASSERT(rs->rs_cb_id.cbid_arg == rs); + + /* There may be no rq_export during failover */ + + if (unlikely(req->rq_export && req->rq_export->exp_obd && + req->rq_export->exp_obd->obd_fail)) { + /* Failed obd's only send ENODEV */ + req->rq_type = PTL_RPC_MSG_ERR; + req->rq_status = -ENODEV; + CDEBUG(D_HA, "sending ENODEV from failed obd %d\n", + req->rq_export->exp_obd->obd_minor); + } + + /* In order to keep interoperability with the client (< 2.3) which + * doesn't have pb_jobid in ptlrpc_body, We have to shrink the + * ptlrpc_body in reply buffer to ptlrpc_body_v2, otherwise, the + * reply buffer on client will be overflow. + * + * XXX Remove this whenever we drop the interoperability with + * such client. + */ + req->rq_replen = lustre_shrink_msg(req->rq_repmsg, 0, + sizeof(struct ptlrpc_body_v2), 1); + + if (req->rq_type != PTL_RPC_MSG_ERR) + req->rq_type = PTL_RPC_MSG_REPLY; + + lustre_msg_set_type(req->rq_repmsg, req->rq_type); + lustre_msg_set_status(req->rq_repmsg, + ptlrpc_status_hton(req->rq_status)); + lustre_msg_set_opc(req->rq_repmsg, + req->rq_reqmsg ? lustre_msg_get_opc(req->rq_reqmsg) : 0); + + target_pack_pool_reply(req); + + ptlrpc_at_set_reply(req, flags); + + if (req->rq_export == NULL || req->rq_export->exp_connection == NULL) + conn = ptlrpc_connection_get(req->rq_peer, req->rq_self, NULL); + else + conn = ptlrpc_connection_addref(req->rq_export->exp_connection); + + if (unlikely(conn == NULL)) { + CERROR("not replying on NULL connection\n"); /* bug 9635 */ + return -ENOTCONN; + } + ptlrpc_rs_addref(rs); /* +1 ref for the network */ + + rc = sptlrpc_svc_wrap_reply(req); + if (unlikely(rc)) + goto out; + + req->rq_sent = get_seconds(); + + rc = ptl_send_buf(&rs->rs_md_h, rs->rs_repbuf, rs->rs_repdata_len, + (rs->rs_difficult && !rs->rs_no_ack) ? + LNET_ACK_REQ : LNET_NOACK_REQ, + &rs->rs_cb_id, conn, + ptlrpc_req2svc(req)->srv_rep_portal, + req->rq_xid, req->rq_reply_off); +out: + if (unlikely(rc != 0)) + ptlrpc_req_drop_rs(req); + ptlrpc_connection_put(conn); + return rc; +} +EXPORT_SYMBOL(ptlrpc_send_reply); + +int ptlrpc_reply(struct ptlrpc_request *req) +{ + if (req->rq_no_reply) + return 0; + return ptlrpc_send_reply(req, 0); +} +EXPORT_SYMBOL(ptlrpc_reply); + +/** + * For request \a req send an error reply back. Create empty + * reply buffers if necessary. + */ +int ptlrpc_send_error(struct ptlrpc_request *req, int may_be_difficult) +{ + int rc; + + if (req->rq_no_reply) + return 0; + + if (!req->rq_repmsg) { + rc = lustre_pack_reply(req, 1, NULL, NULL); + if (rc) + return rc; + } + + if (req->rq_status != -ENOSPC && req->rq_status != -EACCES && + req->rq_status != -EPERM && req->rq_status != -ENOENT && + req->rq_status != -EINPROGRESS && req->rq_status != -EDQUOT) + req->rq_type = PTL_RPC_MSG_ERR; + + rc = ptlrpc_send_reply(req, may_be_difficult); + return rc; +} +EXPORT_SYMBOL(ptlrpc_send_error); + +int ptlrpc_error(struct ptlrpc_request *req) +{ + return ptlrpc_send_error(req, 0); +} +EXPORT_SYMBOL(ptlrpc_error); + +/** + * Send request \a request. + * if \a noreply is set, don't expect any reply back and don't set up + * reply buffers. + * Returns 0 on success or error code. 
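+ *
+ * Rough order of operations, summarising the body below rather than
+ * specifying it: wrap the request via sptlrpc, register the bulk
+ * descriptor (if any), attach a reply ME/MD unless \a noreply is set,
+ * stamp the arrival time and deadline, then hand the request buffer
+ * to LNetPut() through ptl_send_buf().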
+ */ +int ptl_send_rpc(struct ptlrpc_request *request, int noreply) +{ + int rc; + int rc2; + int mpflag = 0; + struct ptlrpc_connection *connection; + lnet_handle_me_t reply_me_h; + lnet_md_t reply_md; + struct obd_device *obd = request->rq_import->imp_obd; + + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_RPC)) + return 0; + + LASSERT(request->rq_type == PTL_RPC_MSG_REQUEST); + LASSERT(request->rq_wait_ctx == 0); + + /* If this is a re-transmit, we're required to have disengaged + * cleanly from the previous attempt */ + LASSERT(!request->rq_receiving_reply); + LASSERT(!((lustre_msg_get_flags(request->rq_reqmsg) & MSG_REPLAY) && + (request->rq_import->imp_state == LUSTRE_IMP_FULL))); + + if (unlikely(obd != NULL && obd->obd_fail)) { + CDEBUG(D_HA, "muting rpc for failed imp obd %s\n", + obd->obd_name); + /* this prevents us from waiting in ptlrpc_queue_wait */ + spin_lock(&request->rq_lock); + request->rq_err = 1; + spin_unlock(&request->rq_lock); + request->rq_status = -ENODEV; + return -ENODEV; + } + + connection = request->rq_import->imp_connection; + + lustre_msg_set_handle(request->rq_reqmsg, + &request->rq_import->imp_remote_handle); + lustre_msg_set_type(request->rq_reqmsg, PTL_RPC_MSG_REQUEST); + lustre_msg_set_conn_cnt(request->rq_reqmsg, + request->rq_import->imp_conn_cnt); + lustre_msghdr_set_flags(request->rq_reqmsg, + request->rq_import->imp_msghdr_flags); + + if (request->rq_resend) + lustre_msg_add_flags(request->rq_reqmsg, MSG_RESENT); + + if (request->rq_memalloc) + mpflag = cfs_memory_pressure_get_and_set(); + + rc = sptlrpc_cli_wrap_request(request); + if (rc) + goto out; + + /* bulk register should be done after wrap_request() */ + if (request->rq_bulk != NULL) { + rc = ptlrpc_register_bulk(request); + if (rc != 0) + goto out; + } + + if (!noreply) { + LASSERT(request->rq_replen != 0); + if (request->rq_repbuf == NULL) { + LASSERT(request->rq_repdata == NULL); + LASSERT(request->rq_repmsg == NULL); + rc = sptlrpc_cli_alloc_repbuf(request, + request->rq_replen); + if (rc) { + /* this prevents us from looping in + * ptlrpc_queue_wait */ + spin_lock(&request->rq_lock); + request->rq_err = 1; + spin_unlock(&request->rq_lock); + request->rq_status = rc; + goto cleanup_bulk; + } + } else { + request->rq_repdata = NULL; + request->rq_repmsg = NULL; + } + + rc = LNetMEAttach(request->rq_reply_portal,/*XXX FIXME bug 249*/ + connection->c_peer, request->rq_xid, 0, + LNET_UNLINK, LNET_INS_AFTER, &reply_me_h); + if (rc != 0) { + CERROR("LNetMEAttach failed: %d\n", rc); + LASSERT(rc == -ENOMEM); + rc = -ENOMEM; + goto cleanup_bulk; + } + } + + spin_lock(&request->rq_lock); + /* If the MD attach succeeds, there _will_ be a reply_in callback */ + request->rq_receiving_reply = !noreply; + request->rq_req_unlink = 1; + /* We are responsible for unlinking the reply buffer */ + request->rq_reply_unlink = !noreply; + /* Clear any flags that may be present from previous sends. 
*/ + request->rq_replied = 0; + request->rq_err = 0; + request->rq_timedout = 0; + request->rq_net_err = 0; + request->rq_resend = 0; + request->rq_restart = 0; + request->rq_reply_truncate = 0; + spin_unlock(&request->rq_lock); + + if (!noreply) { + reply_md.start = request->rq_repbuf; + reply_md.length = request->rq_repbuf_len; + /* Allow multiple early replies */ + reply_md.threshold = LNET_MD_THRESH_INF; + /* Manage remote for early replies */ + reply_md.options = PTLRPC_MD_OPTIONS | LNET_MD_OP_PUT | + LNET_MD_MANAGE_REMOTE | + LNET_MD_TRUNCATE; /* allow to make EOVERFLOW error */; + reply_md.user_ptr = &request->rq_reply_cbid; + reply_md.eq_handle = ptlrpc_eq_h; + + /* We must see the unlink callback to unset rq_reply_unlink, + so we can't auto-unlink */ + rc = LNetMDAttach(reply_me_h, reply_md, LNET_RETAIN, + &request->rq_reply_md_h); + if (rc != 0) { + CERROR("LNetMDAttach failed: %d\n", rc); + LASSERT(rc == -ENOMEM); + spin_lock(&request->rq_lock); + /* ...but the MD attach didn't succeed... */ + request->rq_receiving_reply = 0; + spin_unlock(&request->rq_lock); + rc = -ENOMEM; + goto cleanup_me; + } + + CDEBUG(D_NET, "Setup reply buffer: %u bytes, xid %llu, portal %u\n", + request->rq_repbuf_len, request->rq_xid, + request->rq_reply_portal); + } + + /* add references on request for request_out_callback */ + ptlrpc_request_addref(request); + if (obd != NULL && obd->obd_svc_stats != NULL) + lprocfs_counter_add(obd->obd_svc_stats, PTLRPC_REQACTIVE_CNTR, + atomic_read(&request->rq_import->imp_inflight)); + + OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_DELAY_SEND, request->rq_timeout + 5); + + do_gettimeofday(&request->rq_arrival_time); + request->rq_sent = get_seconds(); + /* We give the server rq_timeout secs to process the req, and + add the network latency for our local timeout. */ + request->rq_deadline = request->rq_sent + request->rq_timeout + + ptlrpc_at_get_net_latency(request); + + ptlrpc_pinger_sending_on_import(request->rq_import); + + DEBUG_REQ(D_INFO, request, "send flg=%x", + lustre_msg_get_flags(request->rq_reqmsg)); + rc = ptl_send_buf(&request->rq_req_md_h, + request->rq_reqbuf, request->rq_reqdata_len, + LNET_NOACK_REQ, &request->rq_req_cbid, + connection, + request->rq_request_portal, + request->rq_xid, 0); + if (rc == 0) + goto out; + + ptlrpc_req_finished(request); + if (noreply) + goto out; + + cleanup_me: + /* MEUnlink is safe; the PUT didn't even get off the ground, and + * nobody apart from the PUT's target has the right nid+XID to + * access the reply buffer. */ + rc2 = LNetMEUnlink(reply_me_h); + LASSERT(rc2 == 0); + /* UNLINKED callback called synchronously */ + LASSERT(!request->rq_receiving_reply); + + cleanup_bulk: + /* We do sync unlink here as there was no real transfer here so + * the chance to have long unlink to sluggish net is smaller here. */ + ptlrpc_unregister_bulk(request, 0); + out: + if (request->rq_memalloc) + cfs_memory_pressure_restore(mpflag); + return rc; +} +EXPORT_SYMBOL(ptl_send_rpc); + +/** + * Register request buffer descriptor for request receiving. 
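+ *
+ * The buffer is attached on the service's request portal with a
+ * wildcard match entry ({LNET_NID_ANY, LNET_PID_ANY}, ignore bits ~0),
+ * so any incoming request PUT may land in it; see the LNetMEAttach()
+ * call below.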
+ */ +int ptlrpc_register_rqbd(struct ptlrpc_request_buffer_desc *rqbd) +{ + struct ptlrpc_service *service = rqbd->rqbd_svcpt->scp_service; + static lnet_process_id_t match_id = {LNET_NID_ANY, LNET_PID_ANY}; + int rc; + lnet_md_t md; + lnet_handle_me_t me_h; + + CDEBUG(D_NET, "LNetMEAttach: portal %d\n", + service->srv_req_portal); + + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_RQBD)) + return -ENOMEM; + + /* NB: CPT affinity service should use new LNet flag LNET_INS_LOCAL, + * which means buffer can only be attached on local CPT, and LND + * threads can find it by grabbing a local lock */ + rc = LNetMEAttach(service->srv_req_portal, + match_id, 0, ~0, LNET_UNLINK, + rqbd->rqbd_svcpt->scp_cpt >= 0 ? + LNET_INS_LOCAL : LNET_INS_AFTER, &me_h); + if (rc != 0) { + CERROR("LNetMEAttach failed: %d\n", rc); + return -ENOMEM; + } + + LASSERT(rqbd->rqbd_refcount == 0); + rqbd->rqbd_refcount = 1; + + md.start = rqbd->rqbd_buffer; + md.length = service->srv_buf_size; + md.max_size = service->srv_max_req_size; + md.threshold = LNET_MD_THRESH_INF; + md.options = PTLRPC_MD_OPTIONS | LNET_MD_OP_PUT | LNET_MD_MAX_SIZE; + md.user_ptr = &rqbd->rqbd_cbid; + md.eq_handle = ptlrpc_eq_h; + + rc = LNetMDAttach(me_h, md, LNET_UNLINK, &rqbd->rqbd_md_h); + if (rc == 0) + return 0; + + CERROR("LNetMDAttach failed: %d;\n", rc); + LASSERT(rc == -ENOMEM); + rc = LNetMEUnlink(me_h); + LASSERT(rc == 0); + rqbd->rqbd_refcount = 0; + + return -ENOMEM; +} diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/nrs.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/nrs.c new file mode 100644 index 000000000..81ad74732 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/nrs.c @@ -0,0 +1,1754 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2011 Intel Corporation + * + * Copyright 2012 Xyratex Technology Limited + */ +/* + * lustre/ptlrpc/nrs.c + * + * Network Request Scheduler (NRS) + * + * Allows to reorder the handling of RPCs at servers. + * + * Author: Liang Zhen + * Author: Nikitas Angelinas + */ +/** + * \addtogoup nrs + * @{ + */ + +#define DEBUG_SUBSYSTEM S_RPC +#include "../include/obd_support.h" +#include "../include/obd_class.h" +#include "../include/lustre_net.h" +#include "../include/lprocfs_status.h" +#include "../../include/linux/libcfs/libcfs.h" +#include "ptlrpc_internal.h" + +/* XXX: This is just for liblustre. Remove the #if defined directive when the + * "cfs_" prefix is dropped from cfs_list_head. */ +extern struct list_head ptlrpc_all_services; + +/** + * NRS core object. + */ +struct nrs_core nrs_core; + +static int nrs_policy_init(struct ptlrpc_nrs_policy *policy) +{ + return policy->pol_desc->pd_ops->op_policy_init != NULL ? 
+ policy->pol_desc->pd_ops->op_policy_init(policy) : 0; +} + +static void nrs_policy_fini(struct ptlrpc_nrs_policy *policy) +{ + LASSERT(policy->pol_ref == 0); + LASSERT(policy->pol_req_queued == 0); + + if (policy->pol_desc->pd_ops->op_policy_fini != NULL) + policy->pol_desc->pd_ops->op_policy_fini(policy); +} + +static int nrs_policy_ctl_locked(struct ptlrpc_nrs_policy *policy, + enum ptlrpc_nrs_ctl opc, void *arg) +{ + /** + * The policy may be stopped, but the lprocfs files and + * ptlrpc_nrs_policy instances remain present until unregistration time. + * Do not perform the ctl operation if the policy is stopped, as + * policy->pol_private will be NULL in such a case. + */ + if (policy->pol_state == NRS_POL_STATE_STOPPED) + return -ENODEV; + + return policy->pol_desc->pd_ops->op_policy_ctl != NULL ? + policy->pol_desc->pd_ops->op_policy_ctl(policy, opc, arg) : + -ENOSYS; +} + +static void nrs_policy_stop0(struct ptlrpc_nrs_policy *policy) +{ + struct ptlrpc_nrs *nrs = policy->pol_nrs; + + if (policy->pol_desc->pd_ops->op_policy_stop != NULL) { + spin_unlock(&nrs->nrs_lock); + + policy->pol_desc->pd_ops->op_policy_stop(policy); + + spin_lock(&nrs->nrs_lock); + } + + LASSERT(list_empty(&policy->pol_list_queued)); + LASSERT(policy->pol_req_queued == 0 && + policy->pol_req_started == 0); + + policy->pol_private = NULL; + + policy->pol_state = NRS_POL_STATE_STOPPED; + + if (atomic_dec_and_test(&policy->pol_desc->pd_refs)) + module_put(policy->pol_desc->pd_owner); +} + +static int nrs_policy_stop_locked(struct ptlrpc_nrs_policy *policy) +{ + struct ptlrpc_nrs *nrs = policy->pol_nrs; + + if (nrs->nrs_policy_fallback == policy && !nrs->nrs_stopping) + return -EPERM; + + if (policy->pol_state == NRS_POL_STATE_STARTING) + return -EAGAIN; + + /* In progress or already stopped */ + if (policy->pol_state != NRS_POL_STATE_STARTED) + return 0; + + policy->pol_state = NRS_POL_STATE_STOPPING; + + /* Immediately make it invisible */ + if (nrs->nrs_policy_primary == policy) { + nrs->nrs_policy_primary = NULL; + + } else { + LASSERT(nrs->nrs_policy_fallback == policy); + nrs->nrs_policy_fallback = NULL; + } + + /* I have the only refcount */ + if (policy->pol_ref == 1) + nrs_policy_stop0(policy); + + return 0; +} + +/** + * Transitions the \a nrs NRS head's primary policy to + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING and if the policy has no + * pending usage references, to ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED. + * + * \param[in] nrs the NRS head to carry out this operation on + */ +static void nrs_policy_stop_primary(struct ptlrpc_nrs *nrs) +{ + struct ptlrpc_nrs_policy *tmp = nrs->nrs_policy_primary; + + if (tmp == NULL) + return; + + nrs->nrs_policy_primary = NULL; + + LASSERT(tmp->pol_state == NRS_POL_STATE_STARTED); + tmp->pol_state = NRS_POL_STATE_STOPPING; + + if (tmp->pol_ref == 0) + nrs_policy_stop0(tmp); +} + +/** + * Transitions a policy across the ptlrpc_nrs_pol_state range of values, in + * response to an lprocfs command to start a policy. + * + * If a primary policy different to the current one is specified, this function + * will transition the new policy to the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTING and then to + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED, and will then transition + * the old primary policy (if there is one) to + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING, and if there are no outstanding + * references on the policy to ptlrpc_nrs_pol_stae::NRS_POL_STATE_STOPPED. 
+ * + * If the fallback policy is specified, this is taken to indicate an instruction + * to stop the current primary policy, without substituting it with another + * primary policy, so the primary policy (if any) is transitioned to + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING, and if there are no outstanding + * references on the policy to ptlrpc_nrs_pol_stae::NRS_POL_STATE_STOPPED. In + * this case, the fallback policy is only left active in the NRS head. + */ +static int nrs_policy_start_locked(struct ptlrpc_nrs_policy *policy) +{ + struct ptlrpc_nrs *nrs = policy->pol_nrs; + int rc = 0; + + /** + * Don't allow multiple starting which is too complex, and has no real + * benefit. + */ + if (nrs->nrs_policy_starting) + return -EAGAIN; + + LASSERT(policy->pol_state != NRS_POL_STATE_STARTING); + + if (policy->pol_state == NRS_POL_STATE_STOPPING) + return -EAGAIN; + + if (policy->pol_flags & PTLRPC_NRS_FL_FALLBACK) { + /** + * This is for cases in which the user sets the policy to the + * fallback policy (currently fifo for all services); i.e. the + * user is resetting the policy to the default; so we stop the + * primary policy, if any. + */ + if (policy == nrs->nrs_policy_fallback) { + nrs_policy_stop_primary(nrs); + return 0; + } + + /** + * If we reach here, we must be setting up the fallback policy + * at service startup time, and only a single policy with the + * nrs_policy_flags::PTLRPC_NRS_FL_FALLBACK flag set can + * register with NRS core. + */ + LASSERT(nrs->nrs_policy_fallback == NULL); + } else { + /** + * Shouldn't start primary policy if w/o fallback policy. + */ + if (nrs->nrs_policy_fallback == NULL) + return -EPERM; + + if (policy->pol_state == NRS_POL_STATE_STARTED) + return 0; + } + + /** + * Increase the module usage count for policies registering from other + * modules. + */ + if (atomic_inc_return(&policy->pol_desc->pd_refs) == 1 && + !try_module_get(policy->pol_desc->pd_owner)) { + atomic_dec(&policy->pol_desc->pd_refs); + CERROR("NRS: cannot get module for policy %s; is it alive?\n", + policy->pol_desc->pd_name); + return -ENODEV; + } + + /** + * Serialize policy starting across the NRS head + */ + nrs->nrs_policy_starting = 1; + + policy->pol_state = NRS_POL_STATE_STARTING; + + if (policy->pol_desc->pd_ops->op_policy_start) { + spin_unlock(&nrs->nrs_lock); + + rc = policy->pol_desc->pd_ops->op_policy_start(policy); + + spin_lock(&nrs->nrs_lock); + if (rc != 0) { + if (atomic_dec_and_test(&policy->pol_desc->pd_refs)) + module_put(policy->pol_desc->pd_owner); + + policy->pol_state = NRS_POL_STATE_STOPPED; + goto out; + } + } + + policy->pol_state = NRS_POL_STATE_STARTED; + + if (policy->pol_flags & PTLRPC_NRS_FL_FALLBACK) { + /** + * This path is only used at PTLRPC service setup time. + */ + nrs->nrs_policy_fallback = policy; + } else { + /* + * Try to stop the current primary policy if there is one. + */ + nrs_policy_stop_primary(nrs); + + /** + * And set the newly-started policy as the primary one. + */ + nrs->nrs_policy_primary = policy; + } + +out: + nrs->nrs_policy_starting = 0; + + return rc; +} + +/** + * Increases the policy's usage reference count. + */ +static inline void nrs_policy_get_locked(struct ptlrpc_nrs_policy *policy) +{ + policy->pol_ref++; +} + +/** + * Decreases the policy's usage reference count, and stops the policy in case it + * was already stopping and have no more outstanding usage references (which + * indicates it has no more queued or started requests, and can be safely + * stopped). 
+ */ +static void nrs_policy_put_locked(struct ptlrpc_nrs_policy *policy) +{ + LASSERT(policy->pol_ref > 0); + + policy->pol_ref--; + if (unlikely(policy->pol_ref == 0 && + policy->pol_state == NRS_POL_STATE_STOPPING)) + nrs_policy_stop0(policy); +} + +static void nrs_policy_put(struct ptlrpc_nrs_policy *policy) +{ + spin_lock(&policy->pol_nrs->nrs_lock); + nrs_policy_put_locked(policy); + spin_unlock(&policy->pol_nrs->nrs_lock); +} + +/** + * Find and return a policy by name. + */ +static struct ptlrpc_nrs_policy *nrs_policy_find_locked(struct ptlrpc_nrs *nrs, + char *name) +{ + struct ptlrpc_nrs_policy *tmp; + + list_for_each_entry(tmp, &nrs->nrs_policy_list, pol_list) { + if (strncmp(tmp->pol_desc->pd_name, name, + NRS_POL_NAME_MAX) == 0) { + nrs_policy_get_locked(tmp); + return tmp; + } + } + return NULL; +} + +/** + * Release references for the resource hierarchy moving upwards towards the + * policy instance resource. + */ +static void nrs_resource_put(struct ptlrpc_nrs_resource *res) +{ + struct ptlrpc_nrs_policy *policy = res->res_policy; + + if (policy->pol_desc->pd_ops->op_res_put != NULL) { + struct ptlrpc_nrs_resource *parent; + + for (; res != NULL; res = parent) { + parent = res->res_parent; + policy->pol_desc->pd_ops->op_res_put(policy, res); + } + } +} + +/** + * Obtains references for each resource in the resource hierarchy for request + * \a nrq if it is to be handled by \a policy. + * + * \param[in] policy the policy + * \param[in] nrq the request + * \param[in] moving_req denotes whether this is a call to the function by + * ldlm_lock_reorder_req(), in order to move \a nrq to + * the high-priority NRS head; we should not sleep when + * set. + * + * \retval NULL resource hierarchy references not obtained + * \retval valid-pointer the bottom level of the resource hierarchy + * + * \see ptlrpc_nrs_pol_ops::op_res_get() + */ +static +struct ptlrpc_nrs_resource *nrs_resource_get(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq, + bool moving_req) +{ + /** + * Set to NULL to traverse the resource hierarchy from the top. + */ + struct ptlrpc_nrs_resource *res = NULL; + struct ptlrpc_nrs_resource *tmp = NULL; + int rc; + + while (1) { + rc = policy->pol_desc->pd_ops->op_res_get(policy, nrq, res, + &tmp, moving_req); + if (rc < 0) { + if (res != NULL) + nrs_resource_put(res); + return NULL; + } + + LASSERT(tmp != NULL); + tmp->res_parent = res; + tmp->res_policy = policy; + res = tmp; + tmp = NULL; + /** + * Return once we have obtained a reference to the bottom level + * of the resource hierarchy. + */ + if (rc > 0) + return res; + } +} + +/** + * Obtains resources for the resource hierarchies and policy references for + * the fallback and current primary policy (if any), that will later be used + * to handle request \a nrq. + * + * \param[in] nrs the NRS head instance that will be handling request \a nrq. + * \param[in] nrq the request that is being handled. + * \param[out] resp the array where references to the resource hierarchy are + * stored. + * \param[in] moving_req is set when obtaining resources while moving a + * request from a policy on the regular NRS head to a + * policy on the HP NRS head (via + * ldlm_lock_reorder_req()). It signifies that + * allocations to get resources should be atomic; for + * a full explanation, see comment in + * ptlrpc_nrs_pol_ops::op_res_get(). 
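+ *
+ * On return, \a resp[NRS_RES_FALLBACK] always holds a valid resource
+ * reference, while \a resp[NRS_RES_PRIMARY] may be NULL if no primary
+ * policy is started, or if the primary policy declined to serve this
+ * particular request (in which case its policy reference is dropped
+ * again).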
+ */ +static void nrs_resource_get_safe(struct ptlrpc_nrs *nrs, + struct ptlrpc_nrs_request *nrq, + struct ptlrpc_nrs_resource **resp, + bool moving_req) +{ + struct ptlrpc_nrs_policy *primary = NULL; + struct ptlrpc_nrs_policy *fallback = NULL; + + memset(resp, 0, sizeof(resp[0]) * NRS_RES_MAX); + + /** + * Obtain policy references. + */ + spin_lock(&nrs->nrs_lock); + + fallback = nrs->nrs_policy_fallback; + nrs_policy_get_locked(fallback); + + primary = nrs->nrs_policy_primary; + if (primary != NULL) + nrs_policy_get_locked(primary); + + spin_unlock(&nrs->nrs_lock); + + /** + * Obtain resource hierarchy references. + */ + resp[NRS_RES_FALLBACK] = nrs_resource_get(fallback, nrq, moving_req); + LASSERT(resp[NRS_RES_FALLBACK] != NULL); + + if (primary != NULL) { + resp[NRS_RES_PRIMARY] = nrs_resource_get(primary, nrq, + moving_req); + /** + * A primary policy may exist which may not wish to serve a + * particular request for different reasons; release the + * reference on the policy as it will not be used for this + * request. + */ + if (resp[NRS_RES_PRIMARY] == NULL) + nrs_policy_put(primary); + } +} + +/** + * Releases references to resource hierarchies and policies, because they are no + * longer required; used when request handling has been completed, or the + * request is moving to the high priority NRS head. + * + * \param resp the resource hierarchy that is being released + * + * \see ptlrpcnrs_req_hp_move() + * \see ptlrpc_nrs_req_finalize() + */ +static void nrs_resource_put_safe(struct ptlrpc_nrs_resource **resp) +{ + struct ptlrpc_nrs_policy *pols[NRS_RES_MAX]; + struct ptlrpc_nrs *nrs = NULL; + int i; + + for (i = 0; i < NRS_RES_MAX; i++) { + if (resp[i] != NULL) { + pols[i] = resp[i]->res_policy; + nrs_resource_put(resp[i]); + resp[i] = NULL; + } else { + pols[i] = NULL; + } + } + + for (i = 0; i < NRS_RES_MAX; i++) { + if (pols[i] == NULL) + continue; + + if (nrs == NULL) { + nrs = pols[i]->pol_nrs; + spin_lock(&nrs->nrs_lock); + } + nrs_policy_put_locked(pols[i]); + } + + if (nrs != NULL) + spin_unlock(&nrs->nrs_lock); +} + +/** + * Obtains an NRS request from \a policy for handling or examination; the + * request should be removed in the 'handling' case. + * + * Calling into this function implies we already know the policy has a request + * waiting to be handled. + * + * \param[in] policy the policy from which a request + * \param[in] peek when set, signifies that we just want to examine the + * request, and not handle it, so the request is not removed + * from the policy. + * \param[in] force when set, it will force a policy to return a request if it + * has one pending + * + * \retval the NRS request to be handled + */ +static inline +struct ptlrpc_nrs_request *nrs_request_get(struct ptlrpc_nrs_policy *policy, + bool peek, bool force) +{ + struct ptlrpc_nrs_request *nrq; + + LASSERT(policy->pol_req_queued > 0); + + nrq = policy->pol_desc->pd_ops->op_req_get(policy, peek, force); + + LASSERT(ergo(nrq != NULL, nrs_request_policy(nrq) == policy)); + + return nrq; +} + +/** + * Enqueues request \a nrq for later handling, via one one the policies for + * which resources where earlier obtained via nrs_resource_get_safe(). The + * function attempts to enqueue the request first on the primary policy + * (if any), since this is the preferred choice. 
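+ *
+ * Concretely, nr_res_ptrs[NRS_RES_PRIMARY] is tried before
+ * nr_res_ptrs[NRS_RES_FALLBACK]; the fallback entry is only used when no
+ * primary resource was obtained, or when the primary policy's
+ * op_req_enqueue() rejects the request.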
+ * + * \param nrq the request being enqueued + * + * \see nrs_resource_get_safe() + */ +static inline void nrs_request_enqueue(struct ptlrpc_nrs_request *nrq) +{ + struct ptlrpc_nrs_policy *policy; + int rc; + int i; + + /** + * Try in descending order, because the primary policy (if any) is + * the preferred choice. + */ + for (i = NRS_RES_MAX - 1; i >= 0; i--) { + if (nrq->nr_res_ptrs[i] == NULL) + continue; + + nrq->nr_res_idx = i; + policy = nrq->nr_res_ptrs[i]->res_policy; + + rc = policy->pol_desc->pd_ops->op_req_enqueue(policy, nrq); + if (rc == 0) { + policy->pol_nrs->nrs_req_queued++; + policy->pol_req_queued++; + return; + } + } + /** + * Should never get here, as at least the primary policy's + * ptlrpc_nrs_pol_ops::op_req_enqueue() implementation should always + * succeed. + */ + LBUG(); +} + +/** + * Called when a request has been handled + * + * \param[in] nrs the request that has been handled; can be used for + * job/resource control. + * + * \see ptlrpc_nrs_req_stop_nolock() + */ +static inline void nrs_request_stop(struct ptlrpc_nrs_request *nrq) +{ + struct ptlrpc_nrs_policy *policy = nrs_request_policy(nrq); + + if (policy->pol_desc->pd_ops->op_req_stop) + policy->pol_desc->pd_ops->op_req_stop(policy, nrq); + + LASSERT(policy->pol_nrs->nrs_req_started > 0); + LASSERT(policy->pol_req_started > 0); + + policy->pol_nrs->nrs_req_started--; + policy->pol_req_started--; +} + +/** + * Handler for operations that can be carried out on policies. + * + * Handles opcodes that are common to all policy types within NRS core, and + * passes any unknown opcodes to the policy-specific control function. + * + * \param[in] nrs the NRS head this policy belongs to. + * \param[in] name the human-readable policy name; should be the same as + * ptlrpc_nrs_pol_desc::pd_name. + * \param[in] opc the opcode of the operation being carried out. + * \param[in,out] arg can be used to pass information in and out between when + * carrying an operation; usually data that is private to + * the policy at some level, or generic policy status + * information. + * + * \retval -ve error condition + * \retval 0 operation was carried out successfully + */ +static int nrs_policy_ctl(struct ptlrpc_nrs *nrs, char *name, + enum ptlrpc_nrs_ctl opc, void *arg) +{ + struct ptlrpc_nrs_policy *policy; + int rc = 0; + + spin_lock(&nrs->nrs_lock); + + policy = nrs_policy_find_locked(nrs, name); + if (policy == NULL) { + rc = -ENOENT; + goto out; + } + + switch (opc) { + /** + * Unknown opcode, pass it down to the policy-specific control + * function for handling. + */ + default: + rc = nrs_policy_ctl_locked(policy, opc, arg); + break; + + /** + * Start \e policy + */ + case PTLRPC_NRS_CTL_START: + rc = nrs_policy_start_locked(policy); + break; + } +out: + if (policy != NULL) + nrs_policy_put_locked(policy); + + spin_unlock(&nrs->nrs_lock); + + return rc; +} + +/** + * Unregisters a policy by name. + * + * \param[in] nrs the NRS head this policy belongs to. 
+ * \param[in] name the human-readable policy name; should be the same as + * ptlrpc_nrs_pol_desc::pd_name + * + * \retval -ve error + * \retval 0 success + */ +static int nrs_policy_unregister(struct ptlrpc_nrs *nrs, char *name) +{ + struct ptlrpc_nrs_policy *policy = NULL; + + spin_lock(&nrs->nrs_lock); + + policy = nrs_policy_find_locked(nrs, name); + if (policy == NULL) { + spin_unlock(&nrs->nrs_lock); + + CERROR("Can't find NRS policy %s\n", name); + return -ENOENT; + } + + if (policy->pol_ref > 1) { + CERROR("Policy %s is busy with %d references\n", name, + (int)policy->pol_ref); + nrs_policy_put_locked(policy); + + spin_unlock(&nrs->nrs_lock); + return -EBUSY; + } + + LASSERT(policy->pol_req_queued == 0); + LASSERT(policy->pol_req_started == 0); + + if (policy->pol_state != NRS_POL_STATE_STOPPED) { + nrs_policy_stop_locked(policy); + LASSERT(policy->pol_state == NRS_POL_STATE_STOPPED); + } + + list_del(&policy->pol_list); + nrs->nrs_num_pols--; + + nrs_policy_put_locked(policy); + + spin_unlock(&nrs->nrs_lock); + + nrs_policy_fini(policy); + + LASSERT(policy->pol_private == NULL); + OBD_FREE_PTR(policy); + + return 0; +} + +/** + * Register a policy from \policy descriptor \a desc with NRS head \a nrs. + * + * \param[in] nrs the NRS head on which the policy will be registered. + * \param[in] desc the policy descriptor from which the information will be + * obtained to register the policy. + * + * \retval -ve error + * \retval 0 success + */ +static int nrs_policy_register(struct ptlrpc_nrs *nrs, + struct ptlrpc_nrs_pol_desc *desc) +{ + struct ptlrpc_nrs_policy *policy; + struct ptlrpc_nrs_policy *tmp; + struct ptlrpc_service_part *svcpt = nrs->nrs_svcpt; + int rc; + + LASSERT(svcpt != NULL); + LASSERT(desc->pd_ops != NULL); + LASSERT(desc->pd_ops->op_res_get != NULL); + LASSERT(desc->pd_ops->op_req_get != NULL); + LASSERT(desc->pd_ops->op_req_enqueue != NULL); + LASSERT(desc->pd_ops->op_req_dequeue != NULL); + LASSERT(desc->pd_compat != NULL); + + OBD_CPT_ALLOC_GFP(policy, svcpt->scp_service->srv_cptable, + svcpt->scp_cpt, sizeof(*policy), GFP_NOFS); + if (policy == NULL) + return -ENOMEM; + + policy->pol_nrs = nrs; + policy->pol_desc = desc; + policy->pol_state = NRS_POL_STATE_STOPPED; + policy->pol_flags = desc->pd_flags; + + INIT_LIST_HEAD(&policy->pol_list); + INIT_LIST_HEAD(&policy->pol_list_queued); + + rc = nrs_policy_init(policy); + if (rc != 0) { + OBD_FREE_PTR(policy); + return rc; + } + + spin_lock(&nrs->nrs_lock); + + tmp = nrs_policy_find_locked(nrs, policy->pol_desc->pd_name); + if (tmp != NULL) { + CERROR("NRS policy %s has been registered, can't register it for %s\n", + policy->pol_desc->pd_name, + svcpt->scp_service->srv_name); + nrs_policy_put_locked(tmp); + + spin_unlock(&nrs->nrs_lock); + nrs_policy_fini(policy); + OBD_FREE_PTR(policy); + + return -EEXIST; + } + + list_add_tail(&policy->pol_list, &nrs->nrs_policy_list); + nrs->nrs_num_pols++; + + if (policy->pol_flags & PTLRPC_NRS_FL_REG_START) + rc = nrs_policy_start_locked(policy); + + spin_unlock(&nrs->nrs_lock); + + if (rc != 0) + (void) nrs_policy_unregister(nrs, policy->pol_desc->pd_name); + + return rc; +} + +/** + * Enqueue request \a req using one of the policies its resources are referring + * to. + * + * \param[in] req the request to enqueue. 
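+ *
+ * The caller is expected to hold ptlrpc_service_part::scp_req_lock, as is
+ * the case in ptlrpc_nrs_req_add() and ptlrpc_nrs_req_hp_move().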
+ */ +static void ptlrpc_nrs_req_add_nolock(struct ptlrpc_request *req) +{ + struct ptlrpc_nrs_policy *policy; + + LASSERT(req->rq_nrq.nr_initialized); + LASSERT(!req->rq_nrq.nr_enqueued); + + nrs_request_enqueue(&req->rq_nrq); + req->rq_nrq.nr_enqueued = 1; + + policy = nrs_request_policy(&req->rq_nrq); + /** + * Add the policy to the NRS head's list of policies with enqueued + * requests, if it has not been added there. + */ + if (unlikely(list_empty(&policy->pol_list_queued))) + list_add_tail(&policy->pol_list_queued, + &policy->pol_nrs->nrs_policy_queued); +} + +/** + * Enqueue a request on the high priority NRS head. + * + * \param req the request to enqueue. + */ +static void ptlrpc_nrs_hpreq_add_nolock(struct ptlrpc_request *req) +{ + int opc = lustre_msg_get_opc(req->rq_reqmsg); + + spin_lock(&req->rq_lock); + req->rq_hp = 1; + ptlrpc_nrs_req_add_nolock(req); + if (opc != OBD_PING) + DEBUG_REQ(D_NET, req, "high priority req"); + spin_unlock(&req->rq_lock); +} + +/** + * Returns a boolean predicate indicating whether the policy described by + * \a desc is adequate for use with service \a svc. + * + * \param[in] svc the service + * \param[in] desc the policy descriptor + * + * \retval false the policy is not compatible with the service + * \retval true the policy is compatible with the service + */ +static inline bool nrs_policy_compatible(const struct ptlrpc_service *svc, + const struct ptlrpc_nrs_pol_desc *desc) +{ + return desc->pd_compat(svc, desc); +} + +/** + * Registers all compatible policies in nrs_core.nrs_policies, for NRS head + * \a nrs. + * + * \param[in] nrs the NRS head + * + * \retval -ve error + * \retval 0 success + * + * \pre mutex_is_locked(&nrs_core.nrs_mutex) + * + * \see ptlrpc_service_nrs_setup() + */ +static int nrs_register_policies_locked(struct ptlrpc_nrs *nrs) +{ + struct ptlrpc_nrs_pol_desc *desc; + /* for convenience */ + struct ptlrpc_service_part *svcpt = nrs->nrs_svcpt; + struct ptlrpc_service *svc = svcpt->scp_service; + int rc = -EINVAL; + + LASSERT(mutex_is_locked(&nrs_core.nrs_mutex)); + + list_for_each_entry(desc, &nrs_core.nrs_policies, pd_list) { + if (nrs_policy_compatible(svc, desc)) { + rc = nrs_policy_register(nrs, desc); + if (rc != 0) { + CERROR("Failed to register NRS policy %s for partition %d of service %s: %d\n", + desc->pd_name, svcpt->scp_cpt, + svc->srv_name, rc); + /** + * Fail registration if any of the policies' + * registration fails. + */ + break; + } + } + } + + return rc; +} + +/** + * Initializes NRS head \a nrs of service partition \a svcpt, and registers all + * compatible policies in NRS core, with the NRS head. 
+ * + * \param[in] nrs the NRS head + * \param[in] svcpt the PTLRPC service partition to setup + * + * \retval -ve error + * \retval 0 success + * + * \pre mutex_is_locked(&nrs_core.nrs_mutex) + */ +static int nrs_svcpt_setup_locked0(struct ptlrpc_nrs *nrs, + struct ptlrpc_service_part *svcpt) +{ + enum ptlrpc_nrs_queue_type queue; + + LASSERT(mutex_is_locked(&nrs_core.nrs_mutex)); + + if (nrs == &svcpt->scp_nrs_reg) + queue = PTLRPC_NRS_QUEUE_REG; + else if (nrs == svcpt->scp_nrs_hp) + queue = PTLRPC_NRS_QUEUE_HP; + else + LBUG(); + + nrs->nrs_svcpt = svcpt; + nrs->nrs_queue_type = queue; + spin_lock_init(&nrs->nrs_lock); + INIT_LIST_HEAD(&nrs->nrs_policy_list); + INIT_LIST_HEAD(&nrs->nrs_policy_queued); + + return nrs_register_policies_locked(nrs); +} + +/** + * Allocates a regular and optionally a high-priority NRS head (if the service + * handles high-priority RPCs), and then registers all available compatible + * policies on those NRS heads. + * + * \param[in,out] svcpt the PTLRPC service partition to setup + * + * \pre mutex_is_locked(&nrs_core.nrs_mutex) + */ +static int nrs_svcpt_setup_locked(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_nrs *nrs; + int rc; + + LASSERT(mutex_is_locked(&nrs_core.nrs_mutex)); + + /** + * Initialize the regular NRS head. + */ + nrs = nrs_svcpt2nrs(svcpt, false); + rc = nrs_svcpt_setup_locked0(nrs, svcpt); + if (rc < 0) + goto out; + + /** + * Optionally allocate a high-priority NRS head. + */ + if (svcpt->scp_service->srv_ops.so_hpreq_handler == NULL) + goto out; + + OBD_CPT_ALLOC_PTR(svcpt->scp_nrs_hp, + svcpt->scp_service->srv_cptable, + svcpt->scp_cpt); + if (svcpt->scp_nrs_hp == NULL) { + rc = -ENOMEM; + goto out; + } + + nrs = nrs_svcpt2nrs(svcpt, true); + rc = nrs_svcpt_setup_locked0(nrs, svcpt); + +out: + return rc; +} + +/** + * Unregisters all policies on all available NRS heads in a service partition; + * called at PTLRPC service unregistration time. + * + * \param[in] svcpt the PTLRPC service partition + * + * \pre mutex_is_locked(&nrs_core.nrs_mutex) + */ +static void nrs_svcpt_cleanup_locked(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_nrs *nrs; + struct ptlrpc_nrs_policy *policy; + struct ptlrpc_nrs_policy *tmp; + int rc; + bool hp = false; + + LASSERT(mutex_is_locked(&nrs_core.nrs_mutex)); + +again: + nrs = nrs_svcpt2nrs(svcpt, hp); + nrs->nrs_stopping = 1; + + list_for_each_entry_safe(policy, tmp, &nrs->nrs_policy_list, + pol_list) { + rc = nrs_policy_unregister(nrs, policy->pol_desc->pd_name); + LASSERT(rc == 0); + } + + /** + * If the service partition has an HP NRS head, clean that up as well. + */ + if (!hp && nrs_svcpt_has_hp(svcpt)) { + hp = true; + goto again; + } + + if (hp) + OBD_FREE_PTR(nrs); +} + +/** + * Returns the descriptor for a policy as identified by by \a name. + * + * \param[in] name the policy name + * + * \retval the policy descriptor + * \retval NULL + */ +static struct ptlrpc_nrs_pol_desc *nrs_policy_find_desc_locked(const char *name) +{ + struct ptlrpc_nrs_pol_desc *tmp; + + list_for_each_entry(tmp, &nrs_core.nrs_policies, pd_list) { + if (strncmp(tmp->pd_name, name, NRS_POL_NAME_MAX) == 0) + return tmp; + } + return NULL; +} + +/** + * Removes the policy from all supported NRS heads of all partitions of all + * PTLRPC services. 
+ * + * \param[in] desc the policy descriptor to unregister + * + * \retval -ve error + * \retval 0 successfully unregistered policy on all supported NRS heads + * + * \pre mutex_is_locked(&nrs_core.nrs_mutex) + * \pre mutex_is_locked(&ptlrpc_all_services_mutex) + */ +static int nrs_policy_unregister_locked(struct ptlrpc_nrs_pol_desc *desc) +{ + struct ptlrpc_nrs *nrs; + struct ptlrpc_service *svc; + struct ptlrpc_service_part *svcpt; + int i; + int rc = 0; + + LASSERT(mutex_is_locked(&nrs_core.nrs_mutex)); + LASSERT(mutex_is_locked(&ptlrpc_all_services_mutex)); + + list_for_each_entry(svc, &ptlrpc_all_services, srv_list) { + + if (!nrs_policy_compatible(svc, desc) || + unlikely(svc->srv_is_stopping)) + continue; + + ptlrpc_service_for_each_part(svcpt, i, svc) { + bool hp = false; + +again: + nrs = nrs_svcpt2nrs(svcpt, hp); + rc = nrs_policy_unregister(nrs, desc->pd_name); + /** + * Ignore -ENOENT as the policy may not have registered + * successfully on all service partitions. + */ + if (rc == -ENOENT) { + rc = 0; + } else if (rc != 0) { + CERROR("Failed to unregister NRS policy %s for partition %d of service %s: %d\n", + desc->pd_name, svcpt->scp_cpt, + svcpt->scp_service->srv_name, rc); + return rc; + } + + if (!hp && nrs_svc_has_hp(svc)) { + hp = true; + goto again; + } + } + + if (desc->pd_ops->op_lprocfs_fini != NULL) + desc->pd_ops->op_lprocfs_fini(svc); + } + + return rc; +} + +/** + * Registers a new policy with NRS core. + * + * The function will only succeed if policy registration with all compatible + * service partitions (if any) is successful. + * + * N.B. This function should be called either at ptlrpc module initialization + * time when registering a policy that ships with NRS core, or in a + * module's init() function for policies registering from other modules. + * + * \param[in] conf configuration information for the new policy to register + * + * \retval -ve error + * \retval 0 success + */ +int ptlrpc_nrs_policy_register(struct ptlrpc_nrs_pol_conf *conf) +{ + struct ptlrpc_service *svc; + struct ptlrpc_nrs_pol_desc *desc; + int rc = 0; + + LASSERT(conf != NULL); + LASSERT(conf->nc_ops != NULL); + LASSERT(conf->nc_compat != NULL); + LASSERT(ergo(conf->nc_compat == nrs_policy_compat_one, + conf->nc_compat_svc_name != NULL)); + LASSERT(ergo((conf->nc_flags & PTLRPC_NRS_FL_REG_EXTERN) != 0, + conf->nc_owner != NULL)); + + conf->nc_name[NRS_POL_NAME_MAX - 1] = '\0'; + + /** + * External policies are not allowed to start immediately upon + * registration, as there is a relatively higher chance that their + * registration might fail. In such a case, some policy instances may + * already have requests queued wen unregistration needs to happen as + * part o cleanup; since there is currently no way to drain requests + * from a policy unless the service is unregistering, we just disallow + * this. + */ + if ((conf->nc_flags & PTLRPC_NRS_FL_REG_EXTERN) && + (conf->nc_flags & (PTLRPC_NRS_FL_FALLBACK | + PTLRPC_NRS_FL_REG_START))) { + CERROR("NRS: failing to register policy %s. 
Please check policy flags; external policies cannot act as fallback policies, or be started immediately upon registration without interaction with lprocfs\n", + conf->nc_name); + return -EINVAL; + } + + mutex_lock(&nrs_core.nrs_mutex); + + if (nrs_policy_find_desc_locked(conf->nc_name) != NULL) { + CERROR("NRS: failing to register policy %s which has already been registered with NRS core!\n", + conf->nc_name); + rc = -EEXIST; + goto fail; + } + + OBD_ALLOC_PTR(desc); + if (desc == NULL) { + rc = -ENOMEM; + goto fail; + } + + strncpy(desc->pd_name, conf->nc_name, NRS_POL_NAME_MAX); + desc->pd_ops = conf->nc_ops; + desc->pd_compat = conf->nc_compat; + desc->pd_compat_svc_name = conf->nc_compat_svc_name; + if ((conf->nc_flags & PTLRPC_NRS_FL_REG_EXTERN) != 0) + desc->pd_owner = conf->nc_owner; + desc->pd_flags = conf->nc_flags; + atomic_set(&desc->pd_refs, 0); + + /** + * For policies that are held in the same module as NRS (currently + * ptlrpc), do not register the policy with all compatible services, + * as the services will not have started at this point, since we are + * calling from ptlrpc module initialization code. In such cases each + * service will register all compatible policies later, via + * ptlrpc_service_nrs_setup(). + */ + if ((conf->nc_flags & PTLRPC_NRS_FL_REG_EXTERN) == 0) + goto internal; + + /** + * Register the new policy on all compatible services + */ + mutex_lock(&ptlrpc_all_services_mutex); + + list_for_each_entry(svc, &ptlrpc_all_services, srv_list) { + struct ptlrpc_service_part *svcpt; + int i; + int rc2; + + if (!nrs_policy_compatible(svc, desc) || + unlikely(svc->srv_is_stopping)) + continue; + + ptlrpc_service_for_each_part(svcpt, i, svc) { + struct ptlrpc_nrs *nrs; + bool hp = false; +again: + nrs = nrs_svcpt2nrs(svcpt, hp); + rc = nrs_policy_register(nrs, desc); + if (rc != 0) { + CERROR("Failed to register NRS policy %s for partition %d of service %s: %d\n", + desc->pd_name, svcpt->scp_cpt, + svcpt->scp_service->srv_name, rc); + + rc2 = nrs_policy_unregister_locked(desc); + /** + * Should not fail at this point + */ + LASSERT(rc2 == 0); + mutex_unlock(&ptlrpc_all_services_mutex); + OBD_FREE_PTR(desc); + goto fail; + } + + if (!hp && nrs_svc_has_hp(svc)) { + hp = true; + goto again; + } + } + + /** + * No need to take a reference to other modules here, as we + * will be calling from the module's init() function. + */ + if (desc->pd_ops->op_lprocfs_init != NULL) { + rc = desc->pd_ops->op_lprocfs_init(svc); + if (rc != 0) { + rc2 = nrs_policy_unregister_locked(desc); + /** + * Should not fail at this point + */ + LASSERT(rc2 == 0); + mutex_unlock(&ptlrpc_all_services_mutex); + OBD_FREE_PTR(desc); + goto fail; + } + } + } + + mutex_unlock(&ptlrpc_all_services_mutex); +internal: + list_add_tail(&desc->pd_list, &nrs_core.nrs_policies); +fail: + mutex_unlock(&nrs_core.nrs_mutex); + + return rc; +} +EXPORT_SYMBOL(ptlrpc_nrs_policy_register); + +/** + * Unregisters a previously registered policy with NRS core. All instances of + * the policy on all NRS heads of all supported services are removed. + * + * N.B. This function should only be called from a module's exit() function. + * Although it can be used for policies that ship alongside NRS core, the + * function is primarily intended for policies that register externally, + * from other modules. 
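+ *
+ * A minimal sketch of how an external policy module might pair the two
+ * calls; the "foo" policy name, its ops table and the module init/exit
+ * wrappers below are purely illustrative and not part of this file:
+ *
+ *	static struct ptlrpc_nrs_pol_conf nrs_conf_foo = {
+ *		.nc_name	= "foo",
+ *		.nc_ops		= &nrs_foo_ops,
+ *		.nc_compat	= nrs_policy_compat_all,
+ *		.nc_flags	= PTLRPC_NRS_FL_REG_EXTERN,
+ *		.nc_owner	= THIS_MODULE,
+ *	};
+ *
+ *	static int __init nrs_foo_init(void)
+ *	{
+ *		return ptlrpc_nrs_policy_register(&nrs_conf_foo);
+ *	}
+ *
+ *	static void __exit nrs_foo_exit(void)
+ *	{
+ *		ptlrpc_nrs_policy_unregister(&nrs_conf_foo);
+ *	}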
+ * + * \param[in] conf configuration information for the policy to unregister + * + * \retval -ve error + * \retval 0 success + */ +int ptlrpc_nrs_policy_unregister(struct ptlrpc_nrs_pol_conf *conf) +{ + struct ptlrpc_nrs_pol_desc *desc; + int rc; + + LASSERT(conf != NULL); + + if (conf->nc_flags & PTLRPC_NRS_FL_FALLBACK) { + CERROR("Unable to unregister a fallback policy, unless the PTLRPC service is stopping.\n"); + return -EPERM; + } + + conf->nc_name[NRS_POL_NAME_MAX - 1] = '\0'; + + mutex_lock(&nrs_core.nrs_mutex); + + desc = nrs_policy_find_desc_locked(conf->nc_name); + if (desc == NULL) { + CERROR("Failing to unregister NRS policy %s which has not been registered with NRS core!\n", + conf->nc_name); + rc = -ENOENT; + goto not_exist; + } + + mutex_lock(&ptlrpc_all_services_mutex); + + rc = nrs_policy_unregister_locked(desc); + if (rc < 0) { + if (rc == -EBUSY) + CERROR("Please first stop policy %s on all service partitions and then retry to unregister the policy.\n", + conf->nc_name); + goto fail; + } + + CDEBUG(D_INFO, "Unregistering policy %s from NRS core.\n", + conf->nc_name); + + list_del(&desc->pd_list); + OBD_FREE_PTR(desc); + +fail: + mutex_unlock(&ptlrpc_all_services_mutex); + +not_exist: + mutex_unlock(&nrs_core.nrs_mutex); + + return rc; +} +EXPORT_SYMBOL(ptlrpc_nrs_policy_unregister); + +/** + * Setup NRS heads on all service partitions of service \a svc, and register + * all compatible policies on those NRS heads. + * + * To be called from within ptl + * \param[in] svc the service to setup + * + * \retval -ve error, the calling logic should eventually call + * ptlrpc_service_nrs_cleanup() to undo any work performed + * by this function. + * + * \see ptlrpc_register_service() + * \see ptlrpc_service_nrs_cleanup() + */ +int ptlrpc_service_nrs_setup(struct ptlrpc_service *svc) +{ + struct ptlrpc_service_part *svcpt; + const struct ptlrpc_nrs_pol_desc *desc; + int i; + int rc = 0; + + mutex_lock(&nrs_core.nrs_mutex); + + /** + * Initialize NRS heads on all service CPTs. + */ + ptlrpc_service_for_each_part(svcpt, i, svc) { + rc = nrs_svcpt_setup_locked(svcpt); + if (rc != 0) + goto failed; + } + + /** + * Set up lprocfs interfaces for all supported policies for the + * service. + */ + list_for_each_entry(desc, &nrs_core.nrs_policies, pd_list) { + if (!nrs_policy_compatible(svc, desc)) + continue; + + if (desc->pd_ops->op_lprocfs_init != NULL) { + rc = desc->pd_ops->op_lprocfs_init(svc); + if (rc != 0) + goto failed; + } + } + +failed: + + mutex_unlock(&nrs_core.nrs_mutex); + + return rc; +} + +/** + * Unregisters all policies on all service partitions of service \a svc. + * + * \param[in] svc the PTLRPC service to unregister + */ +void ptlrpc_service_nrs_cleanup(struct ptlrpc_service *svc) +{ + struct ptlrpc_service_part *svcpt; + const struct ptlrpc_nrs_pol_desc *desc; + int i; + + mutex_lock(&nrs_core.nrs_mutex); + + /** + * Clean up NRS heads on all service partitions + */ + ptlrpc_service_for_each_part(svcpt, i, svc) + nrs_svcpt_cleanup_locked(svcpt); + + /** + * Clean up lprocfs interfaces for all supported policies for the + * service. + */ + list_for_each_entry(desc, &nrs_core.nrs_policies, pd_list) { + if (!nrs_policy_compatible(svc, desc)) + continue; + + if (desc->pd_ops->op_lprocfs_fini != NULL) + desc->pd_ops->op_lprocfs_fini(svc); + } + + mutex_unlock(&nrs_core.nrs_mutex); +} + +/** + * Obtains NRS head resources for request \a req. 
+ * + * These could be either on the regular or HP NRS head of \a svcpt; resources + * taken on the regular head can later be swapped for HP head resources by + * ldlm_lock_reorder_req(). + * + * \param[in] svcpt the service partition + * \param[in] req the request + * \param[in] hp which NRS head of \a svcpt to use + */ +void ptlrpc_nrs_req_initialize(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req, bool hp) +{ + struct ptlrpc_nrs *nrs = nrs_svcpt2nrs(svcpt, hp); + + memset(&req->rq_nrq, 0, sizeof(req->rq_nrq)); + nrs_resource_get_safe(nrs, &req->rq_nrq, req->rq_nrq.nr_res_ptrs, + false); + + /** + * It is fine to access \e nr_initialized without locking as there is + * no contention at this early stage. + */ + req->rq_nrq.nr_initialized = 1; +} + +/** + * Releases resources for a request; is called after the request has been + * handled. + * + * \param[in] req the request + * + * \see ptlrpc_server_finish_request() + */ +void ptlrpc_nrs_req_finalize(struct ptlrpc_request *req) +{ + if (req->rq_nrq.nr_initialized) { + nrs_resource_put_safe(req->rq_nrq.nr_res_ptrs); + /* no protection on bit nr_initialized because no + * contention at this late stage */ + req->rq_nrq.nr_finalized = 1; + } +} + +void ptlrpc_nrs_req_stop_nolock(struct ptlrpc_request *req) +{ + if (req->rq_nrq.nr_started) + nrs_request_stop(&req->rq_nrq); +} + +/** + * Enqueues request \a req on either the regular or high-priority NRS head + * of service partition \a svcpt. + * + * \param[in] svcpt the service partition + * \param[in] req the request to be enqueued + * \param[in] hp whether to enqueue the request on the regular or + * high-priority NRS head. + */ +void ptlrpc_nrs_req_add(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req, bool hp) +{ + spin_lock(&svcpt->scp_req_lock); + + if (hp) + ptlrpc_nrs_hpreq_add_nolock(req); + else + ptlrpc_nrs_req_add_nolock(req); + + spin_unlock(&svcpt->scp_req_lock); +} + +static void nrs_request_removed(struct ptlrpc_nrs_policy *policy) +{ + LASSERT(policy->pol_nrs->nrs_req_queued > 0); + LASSERT(policy->pol_req_queued > 0); + + policy->pol_nrs->nrs_req_queued--; + policy->pol_req_queued--; + + /** + * If the policy has no more requests queued, remove it from + * ptlrpc_nrs::nrs_policy_queued. + */ + if (unlikely(policy->pol_req_queued == 0)) { + list_del_init(&policy->pol_list_queued); + + /** + * If there are other policies with queued requests, move the + * current policy to the end so that we can round robin over + * all policies and drain the requests. + */ + } else if (policy->pol_req_queued != policy->pol_nrs->nrs_req_queued) { + LASSERT(policy->pol_req_queued < + policy->pol_nrs->nrs_req_queued); + + list_move_tail(&policy->pol_list_queued, + &policy->pol_nrs->nrs_policy_queued); + } +} + +/** + * Obtains a request for handling from an NRS head of service partition + * \a svcpt. + * + * \param[in] svcpt the service partition + * \param[in] hp whether to obtain a request from the regular or + * high-priority NRS head. + * \param[in] peek when set, signifies that we just want to examine the + * request, and not handle it, so the request is not removed + * from the policy. 
+ * \param[in] force when set, it will force a policy to return a request if it + * has one pending + * + * \retval the request to be handled + * \retval NULL the head has no requests to serve + */ +struct ptlrpc_request * +ptlrpc_nrs_req_get_nolock0(struct ptlrpc_service_part *svcpt, bool hp, + bool peek, bool force) +{ + struct ptlrpc_nrs *nrs = nrs_svcpt2nrs(svcpt, hp); + struct ptlrpc_nrs_policy *policy; + struct ptlrpc_nrs_request *nrq; + + /** + * Always try to drain requests from all NRS polices even if they are + * inactive, because the user can change policy status at runtime. + */ + list_for_each_entry(policy, &nrs->nrs_policy_queued, + pol_list_queued) { + nrq = nrs_request_get(policy, peek, force); + if (nrq != NULL) { + if (likely(!peek)) { + nrq->nr_started = 1; + + policy->pol_req_started++; + policy->pol_nrs->nrs_req_started++; + + nrs_request_removed(policy); + } + + return container_of(nrq, struct ptlrpc_request, rq_nrq); + } + } + + return NULL; +} + +/** + * Dequeues request \a req from the policy it has been enqueued on. + * + * \param[in] req the request + */ +void ptlrpc_nrs_req_del_nolock(struct ptlrpc_request *req) +{ + struct ptlrpc_nrs_policy *policy = nrs_request_policy(&req->rq_nrq); + + policy->pol_desc->pd_ops->op_req_dequeue(policy, &req->rq_nrq); + + req->rq_nrq.nr_enqueued = 0; + + nrs_request_removed(policy); +} + +/** + * Returns whether there are any requests currently enqueued on any of the + * policies of service partition's \a svcpt NRS head specified by \a hp. Should + * be called while holding ptlrpc_service_part::scp_req_lock to get a reliable + * result. + * + * \param[in] svcpt the service partition to enquire. + * \param[in] hp whether the regular or high-priority NRS head is to be + * enquired. + * + * \retval false the indicated NRS head has no enqueued requests. + * \retval true the indicated NRS head has some enqueued requests. + */ +bool ptlrpc_nrs_req_pending_nolock(struct ptlrpc_service_part *svcpt, bool hp) +{ + struct ptlrpc_nrs *nrs = nrs_svcpt2nrs(svcpt, hp); + + return nrs->nrs_req_queued > 0; +}; + +/** + * Moves request \a req from the regular to the high-priority NRS head. + * + * \param[in] req the request to move + */ +void ptlrpc_nrs_req_hp_move(struct ptlrpc_request *req) +{ + struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt; + struct ptlrpc_nrs_request *nrq = &req->rq_nrq; + struct ptlrpc_nrs_resource *res1[NRS_RES_MAX]; + struct ptlrpc_nrs_resource *res2[NRS_RES_MAX]; + + /** + * Obtain the high-priority NRS head resources. + */ + nrs_resource_get_safe(nrs_svcpt2nrs(svcpt, true), nrq, res1, true); + + spin_lock(&svcpt->scp_req_lock); + + if (!ptlrpc_nrs_req_can_move(req)) + goto out; + + ptlrpc_nrs_req_del_nolock(req); + + memcpy(res2, nrq->nr_res_ptrs, NRS_RES_MAX * sizeof(res2[0])); + memcpy(nrq->nr_res_ptrs, res1, NRS_RES_MAX * sizeof(res1[0])); + + ptlrpc_nrs_hpreq_add_nolock(req); + + memcpy(res1, res2, NRS_RES_MAX * sizeof(res1[0])); +out: + spin_unlock(&svcpt->scp_req_lock); + + /** + * Release either the regular NRS head resources if we moved the + * request, or the high-priority NRS head resources if we took a + * reference earlier in this function and ptlrpc_nrs_req_can_move() + * returned false. + */ + nrs_resource_put_safe(res1); +} + +/** + * Carries out a control operation \a opc on the policy identified by the + * human-readable \a name, on either all partitions, or only on the first + * partition of service \a svc. + * + * \param[in] svc the service the policy belongs to. 
+ * \param[in] queue whether to carry out the command on the policy which + * belongs to the regular, high-priority, or both NRS + * heads of service partitions of \a svc. + * \param[in] name the policy to act upon, by human-readable name + * \param[in] opc the opcode of the operation to carry out + * \param[in] single when set, the operation will only be carried out on the + * NRS heads of the first service partition of \a svc. + * This is useful for some policies which e.g. share + * identical values on the same parameters of different + * service partitions; when reading these parameters via + * lprocfs, these policies may just want to obtain and + * print out the values from the first service partition. + * Storing these values centrally elsewhere then could be + * another solution for this. + * \param[in,out] arg can be used as a generic in/out buffer between control + * operations and the user environment. + * + *\retval -ve error condition + *\retval 0 operation was carried out successfully + */ +int ptlrpc_nrs_policy_control(const struct ptlrpc_service *svc, + enum ptlrpc_nrs_queue_type queue, char *name, + enum ptlrpc_nrs_ctl opc, bool single, void *arg) +{ + struct ptlrpc_service_part *svcpt; + int i; + int rc = 0; + + LASSERT(opc != PTLRPC_NRS_CTL_INVALID); + + if ((queue & PTLRPC_NRS_QUEUE_BOTH) == 0) + return -EINVAL; + + ptlrpc_service_for_each_part(svcpt, i, svc) { + if ((queue & PTLRPC_NRS_QUEUE_REG) != 0) { + rc = nrs_policy_ctl(nrs_svcpt2nrs(svcpt, false), name, + opc, arg); + if (rc != 0 || (queue == PTLRPC_NRS_QUEUE_REG && + single)) + goto out; + } + + if ((queue & PTLRPC_NRS_QUEUE_HP) != 0) { + /** + * XXX: We could optionally check for + * nrs_svc_has_hp(svc) here, and return an error if it + * is false. Right now we rely on the policies' lprocfs + * handlers that call the present function to make this + * check; if they fail to do so, they might hit the + * assertion inside nrs_svcpt2nrs() below. + */ + rc = nrs_policy_ctl(nrs_svcpt2nrs(svcpt, true), name, + opc, arg); + if (rc != 0 || single) + goto out; + } + } +out: + return rc; +} + + +/* ptlrpc/nrs_fifo.c */ +extern struct ptlrpc_nrs_pol_conf nrs_conf_fifo; + +/** + * Adds all policies that ship with the ptlrpc module, to NRS core's list of + * policies \e nrs_core.nrs_policies. + * + * \retval 0 all policies have been registered successfully + * \retval -ve error + */ +int ptlrpc_nrs_init(void) +{ + int rc; + + mutex_init(&nrs_core.nrs_mutex); + INIT_LIST_HEAD(&nrs_core.nrs_policies); + + rc = ptlrpc_nrs_policy_register(&nrs_conf_fifo); + if (rc != 0) + goto fail; + + + return rc; +fail: + /** + * Since no PTLRPC services have been started at this point, all we need + * to do for cleanup is to free the descriptors. + */ + ptlrpc_nrs_fini(); + + return rc; +} + +/** + * Removes all policy descriptors from nrs_core::nrs_policies, and frees the + * policy descriptors. + * + * Since all PTLRPC services are stopped at this point, there are no more + * instances of any policies, because each service will have stopped its policy + * instances in ptlrpc_service_nrs_cleanup(), so we just need to free the + * descriptors here. 
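+ *
+ * A brief sketch of the intended pairing with ptlrpc_nrs_init(); the
+ * surrounding module setup/teardown code is illustrative only and lives
+ * outside this file:
+ *
+ *	rc = ptlrpc_nrs_init();
+ *	if (rc != 0)
+ *		return rc;
+ *	...
+ *	ptlrpc_nrs_fini();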
+ */ +void ptlrpc_nrs_fini(void) +{ + struct ptlrpc_nrs_pol_desc *desc; + struct ptlrpc_nrs_pol_desc *tmp; + + list_for_each_entry_safe(desc, tmp, &nrs_core.nrs_policies, + pd_list) { + list_del_init(&desc->pd_list); + OBD_FREE_PTR(desc); + } +} + +/** @} nrs */ diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/nrs_fifo.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/nrs_fifo.c new file mode 100644 index 000000000..eb40c01db --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/nrs_fifo.c @@ -0,0 +1,270 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2011 Intel Corporation + * + * Copyright 2012 Xyratex Technology Limited + */ +/* + * lustre/ptlrpc/nrs_fifo.c + * + * Network Request Scheduler (NRS) FIFO policy + * + * Handles RPCs in a FIFO manner, as received from the network. This policy is + * a logical wrapper around previous, non-NRS functionality. It is used as the + * default and fallback policy for all types of RPCs on all PTLRPC service + * partitions, for both regular and high-priority NRS heads. Default here means + * the policy is the one enabled at PTLRPC service partition startup time, and + * fallback means the policy is used to handle RPCs that are not handled + * successfully or are not handled at all by any primary policy that may be + * enabled on a given NRS head. + * + * Author: Liang Zhen + * Author: Nikitas Angelinas + */ +/** + * \addtogoup nrs + * @{ + */ + +#define DEBUG_SUBSYSTEM S_RPC +#include "../include/obd_support.h" +#include "../include/obd_class.h" +#include "../../include/linux/libcfs/libcfs.h" +#include "ptlrpc_internal.h" + +/** + * \name fifo + * + * The FIFO policy is a logical wrapper around previous, non-NRS functionality. + * It schedules RPCs in the same order as they are queued from LNet. + * + * @{ + */ + +#define NRS_POL_NAME_FIFO "fifo" + +/** + * Is called before the policy transitions into + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED; allocates and initializes a + * policy-specific private data structure. + * + * \param[in] policy The policy to start + * + * \retval -ENOMEM OOM error + * \retval 0 success + * + * \see nrs_policy_register() + * \see nrs_policy_ctl() + */ +static int nrs_fifo_start(struct ptlrpc_nrs_policy *policy) +{ + struct nrs_fifo_head *head; + + OBD_CPT_ALLOC_PTR(head, nrs_pol2cptab(policy), nrs_pol2cptid(policy)); + if (head == NULL) + return -ENOMEM; + + INIT_LIST_HEAD(&head->fh_list); + policy->pol_private = head; + return 0; +} + +/** + * Is called before the policy transitions into + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED; deallocates the policy-specific + * private data structure. 
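+ *
+ * By the time this is called the policy has no queued or started requests
+ * left (see nrs_policy_put_locked()), so the list embedded in the private
+ * data being freed here is expected to be empty, as asserted below.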
+ * + * \param[in] policy The policy to stop + * + * \see nrs_policy_stop0() + */ +static void nrs_fifo_stop(struct ptlrpc_nrs_policy *policy) +{ + struct nrs_fifo_head *head = policy->pol_private; + + LASSERT(head != NULL); + LASSERT(list_empty(&head->fh_list)); + + OBD_FREE_PTR(head); +} + +/** + * Is called for obtaining a FIFO policy resource. + * + * \param[in] policy The policy on which the request is being asked for + * \param[in] nrq The request for which resources are being taken + * \param[in] parent Parent resource, unused in this policy + * \param[out] resp Resources references are placed in this array + * \param[in] moving_req Signifies limited caller context; unused in this + * policy + * + * \retval 1 The FIFO policy only has a one-level resource hierarchy, as since + * it implements a simple scheduling algorithm in which request + * priority is determined on the request arrival order, it does not + * need to maintain a set of resources that would otherwise be used + * to calculate a request's priority. + * + * \see nrs_resource_get_safe() + */ +static int nrs_fifo_res_get(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq, + const struct ptlrpc_nrs_resource *parent, + struct ptlrpc_nrs_resource **resp, bool moving_req) +{ + /** + * Just return the resource embedded inside nrs_fifo_head, and end this + * resource hierarchy reference request. + */ + *resp = &((struct nrs_fifo_head *)policy->pol_private)->fh_res; + return 1; +} + +/** + * Called when getting a request from the FIFO policy for handling, or just + * peeking; removes the request from the policy when it is to be handled. + * + * \param[in] policy The policy + * \param[in] peek When set, signifies that we just want to examine the + * request, and not handle it, so the request is not removed + * from the policy. + * \param[in] force Force the policy to return a request; unused in this + * policy + * + * \retval The request to be handled; this is the next request in the FIFO + * queue + * + * \see ptlrpc_nrs_req_get_nolock() + * \see nrs_request_get() + */ +static +struct ptlrpc_nrs_request *nrs_fifo_req_get(struct ptlrpc_nrs_policy *policy, + bool peek, bool force) +{ + struct nrs_fifo_head *head = policy->pol_private; + struct ptlrpc_nrs_request *nrq; + + nrq = unlikely(list_empty(&head->fh_list)) ? NULL : + list_entry(head->fh_list.next, struct ptlrpc_nrs_request, + nr_u.fifo.fr_list); + + if (likely(!peek && nrq != NULL)) { + struct ptlrpc_request *req = container_of(nrq, + struct ptlrpc_request, + rq_nrq); + + list_del_init(&nrq->nr_u.fifo.fr_list); + + CDEBUG(D_RPCTRACE, "NRS start %s request from %s, seq: %llu\n", + policy->pol_desc->pd_name, libcfs_id2str(req->rq_peer), + nrq->nr_u.fifo.fr_sequence); + } + + return nrq; +} + +/** + * Adds request \a nrq to \a policy's list of queued requests + * + * \param[in] policy The policy + * \param[in] nrq The request to add + * + * \retval 0 success; nrs_request_enqueue() assumes this function will always + * succeed + */ +static int nrs_fifo_req_add(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + struct nrs_fifo_head *head; + + head = container_of(nrs_request_resource(nrq), struct nrs_fifo_head, + fh_res); + /** + * Only used for debugging + */ + nrq->nr_u.fifo.fr_sequence = head->fh_sequence++; + list_add_tail(&nrq->nr_u.fifo.fr_list, &head->fh_list); + + return 0; +} + +/** + * Removes request \a nrq from \a policy's list of queued requests. 
+ * + * \param[in] policy The policy + * \param[in] nrq The request to remove + */ +static void nrs_fifo_req_del(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + LASSERT(!list_empty(&nrq->nr_u.fifo.fr_list)); + list_del_init(&nrq->nr_u.fifo.fr_list); +} + +/** + * Prints a debug statement right before the request \a nrq stops being + * handled. + * + * \param[in] policy The policy handling the request + * \param[in] nrq The request being handled + * + * \see ptlrpc_server_finish_request() + * \see ptlrpc_nrs_req_stop_nolock() + */ +static void nrs_fifo_req_stop(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + struct ptlrpc_request *req = container_of(nrq, struct ptlrpc_request, + rq_nrq); + + CDEBUG(D_RPCTRACE, "NRS stop %s request from %s, seq: %llu\n", + policy->pol_desc->pd_name, libcfs_id2str(req->rq_peer), + nrq->nr_u.fifo.fr_sequence); +} + +/** + * FIFO policy operations + */ +static const struct ptlrpc_nrs_pol_ops nrs_fifo_ops = { + .op_policy_start = nrs_fifo_start, + .op_policy_stop = nrs_fifo_stop, + .op_res_get = nrs_fifo_res_get, + .op_req_get = nrs_fifo_req_get, + .op_req_enqueue = nrs_fifo_req_add, + .op_req_dequeue = nrs_fifo_req_del, + .op_req_stop = nrs_fifo_req_stop, +}; + +/** + * FIFO policy configuration + */ +struct ptlrpc_nrs_pol_conf nrs_conf_fifo = { + .nc_name = NRS_POL_NAME_FIFO, + .nc_ops = &nrs_fifo_ops, + .nc_compat = nrs_policy_compat_all, + .nc_flags = PTLRPC_NRS_FL_FALLBACK | + PTLRPC_NRS_FL_REG_START +}; + +/** @} fifo */ + +/** @} nrs */ diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/pack_generic.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/pack_generic.c new file mode 100644 index 000000000..b51af9bf3 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/pack_generic.c @@ -0,0 +1,2536 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/pack_generic.c + * + * (Un)packing of OST requests + * + * Author: Peter J. 
Braam + * Author: Phil Schwan + * Author: Eric Barton + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include "../../include/linux/libcfs/libcfs.h" + +#include "../include/obd_support.h" +#include "../include/obd_class.h" +#include "../include/lustre_net.h" +#include "../include/obd_cksum.h" +#include "../include/lustre/ll_fiemap.h" + +static inline int lustre_msg_hdr_size_v2(int count) +{ + return cfs_size_round(offsetof(struct lustre_msg_v2, + lm_buflens[count])); +} + +int lustre_msg_hdr_size(__u32 magic, int count) +{ + switch (magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_msg_hdr_size_v2(count); + default: + LASSERTF(0, "incorrect message magic: %08x\n", magic); + return -EINVAL; + } +} +EXPORT_SYMBOL(lustre_msg_hdr_size); + +void ptlrpc_buf_set_swabbed(struct ptlrpc_request *req, const int inout, + int index) +{ + if (inout) + lustre_set_req_swabbed(req, index); + else + lustre_set_rep_swabbed(req, index); +} +EXPORT_SYMBOL(ptlrpc_buf_set_swabbed); + +int ptlrpc_buf_need_swab(struct ptlrpc_request *req, const int inout, + int index) +{ + if (inout) + return (ptlrpc_req_need_swab(req) && + !lustre_req_swabbed(req, index)); + else + return (ptlrpc_rep_need_swab(req) && + !lustre_rep_swabbed(req, index)); +} +EXPORT_SYMBOL(ptlrpc_buf_need_swab); + +static inline int lustre_msg_check_version_v2(struct lustre_msg_v2 *msg, + __u32 version) +{ + __u32 ver = lustre_msg_get_version(msg); + return (ver & LUSTRE_VERSION_MASK) != version; +} + +int lustre_msg_check_version(struct lustre_msg *msg, __u32 version) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + CERROR("msg v1 not supported - please upgrade you system\n"); + return -EINVAL; + case LUSTRE_MSG_MAGIC_V2: + return lustre_msg_check_version_v2(msg, version); + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_check_version); + +/* early reply size */ +int lustre_msg_early_size(void) +{ + static int size; + if (!size) { + /* Always reply old ptlrpc_body_v2 to keep interoperability + * with the old client (< 2.3) which doesn't have pb_jobid + * in the ptlrpc_body. + * + * XXX Remove this whenever we drop interoperability with such + * client. + */ + __u32 pblen = sizeof(struct ptlrpc_body_v2); + size = lustre_msg_size(LUSTRE_MSG_MAGIC_V2, 1, &pblen); + } + return size; +} +EXPORT_SYMBOL(lustre_msg_early_size); + +int lustre_msg_size_v2(int count, __u32 *lengths) +{ + int size; + int i; + + size = lustre_msg_hdr_size_v2(count); + for (i = 0; i < count; i++) + size += cfs_size_round(lengths[i]); + + return size; +} +EXPORT_SYMBOL(lustre_msg_size_v2); + +/* This returns the size of the buffer that is required to hold a lustre_msg + * with the given sub-buffer lengths. + * NOTE: this should only be used for NEW requests, and should always be + * in the form of a v2 request. If this is a connection to a v1 + * target then the first buffer will be stripped because the ptlrpc + * data is part of the lustre_msg_v1 header. 
b=14043 */ +int lustre_msg_size(__u32 magic, int count, __u32 *lens) +{ + __u32 size[] = { sizeof(struct ptlrpc_body) }; + + if (!lens) { + LASSERT(count == 1); + lens = size; + } + + LASSERT(count > 0); + LASSERT(lens[MSG_PTLRPC_BODY_OFF] >= sizeof(struct ptlrpc_body_v2)); + + switch (magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_msg_size_v2(count, lens); + default: + LASSERTF(0, "incorrect message magic: %08x\n", magic); + return -EINVAL; + } +} +EXPORT_SYMBOL(lustre_msg_size); + +/* This is used to determine the size of a buffer that was already packed + * and will correctly handle the different message formats. */ +int lustre_packed_msg_size(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens); + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_packed_msg_size); + +void lustre_init_msg_v2(struct lustre_msg_v2 *msg, int count, __u32 *lens, + char **bufs) +{ + char *ptr; + int i; + + msg->lm_bufcount = count; + /* XXX: lm_secflvr uninitialized here */ + msg->lm_magic = LUSTRE_MSG_MAGIC_V2; + + for (i = 0; i < count; i++) + msg->lm_buflens[i] = lens[i]; + + if (bufs == NULL) + return; + + ptr = (char *)msg + lustre_msg_hdr_size_v2(count); + for (i = 0; i < count; i++) { + char *tmp = bufs[i]; + LOGL(tmp, lens[i], ptr); + } +} +EXPORT_SYMBOL(lustre_init_msg_v2); + +static int lustre_pack_request_v2(struct ptlrpc_request *req, + int count, __u32 *lens, char **bufs) +{ + int reqlen, rc; + + reqlen = lustre_msg_size_v2(count, lens); + + rc = sptlrpc_cli_alloc_reqbuf(req, reqlen); + if (rc) + return rc; + + req->rq_reqlen = reqlen; + + lustre_init_msg_v2(req->rq_reqmsg, count, lens, bufs); + lustre_msg_add_version(req->rq_reqmsg, PTLRPC_MSG_VERSION); + return 0; +} + +int lustre_pack_request(struct ptlrpc_request *req, __u32 magic, int count, + __u32 *lens, char **bufs) +{ + __u32 size[] = { sizeof(struct ptlrpc_body) }; + + if (!lens) { + LASSERT(count == 1); + lens = size; + } + + LASSERT(count > 0); + LASSERT(lens[MSG_PTLRPC_BODY_OFF] == sizeof(struct ptlrpc_body)); + + /* only use new format, we don't need to be compatible with 1.4 */ + return lustre_pack_request_v2(req, count, lens, bufs); +} +EXPORT_SYMBOL(lustre_pack_request); + +#if RS_DEBUG +LIST_HEAD(ptlrpc_rs_debug_lru); +spinlock_t ptlrpc_rs_debug_lock; + +#define PTLRPC_RS_DEBUG_LRU_ADD(rs) \ +do { \ + spin_lock(&ptlrpc_rs_debug_lock); \ + list_add_tail(&(rs)->rs_debug_list, &ptlrpc_rs_debug_lru); \ + spin_unlock(&ptlrpc_rs_debug_lock); \ +} while (0) + +#define PTLRPC_RS_DEBUG_LRU_DEL(rs) \ +do { \ + spin_lock(&ptlrpc_rs_debug_lock); \ + list_del(&(rs)->rs_debug_list); \ + spin_unlock(&ptlrpc_rs_debug_lock); \ +} while (0) +#else +# define PTLRPC_RS_DEBUG_LRU_ADD(rs) do {} while (0) +# define PTLRPC_RS_DEBUG_LRU_DEL(rs) do {} while (0) +#endif + +struct ptlrpc_reply_state * +lustre_get_emerg_rs(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_reply_state *rs = NULL; + + spin_lock(&svcpt->scp_rep_lock); + + /* See if we have anything in a pool, and wait if nothing */ + while (list_empty(&svcpt->scp_rep_idle)) { + struct l_wait_info lwi; + int rc; + + spin_unlock(&svcpt->scp_rep_lock); + /* If we cannot get anything for some long time, we better + * bail out instead of waiting infinitely */ + lwi = LWI_TIMEOUT(cfs_time_seconds(10), NULL, NULL); + rc = l_wait_event(svcpt->scp_rep_waitq, + !list_empty(&svcpt->scp_rep_idle), &lwi); + if (rc != 0) + goto out; + 
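+	/*
+	 * Illustrative sketch of how the emergency reply-state pool above is
+	 * meant to be used (an assumption-laden example, not code from this
+	 * file; the error handling shown is hypothetical):
+	 *
+	 *	struct ptlrpc_reply_state *rs = lustre_get_emerg_rs(svcpt);
+	 *
+	 *	if (rs == NULL)
+	 *		goto failed;	// pool stayed empty past the timeout
+	 *	// rs is preallocated, zeroed to srv_max_reply_size and has
+	 *	// rs_prealloc set, so it is usable when allocation could fail
+	 *	lustre_put_emerg_rs(rs);	// returns it to scp_rep_idle
+	 */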
spin_lock(&svcpt->scp_rep_lock); + } + + rs = list_entry(svcpt->scp_rep_idle.next, + struct ptlrpc_reply_state, rs_list); + list_del(&rs->rs_list); + + spin_unlock(&svcpt->scp_rep_lock); + + memset(rs, 0, svcpt->scp_service->srv_max_reply_size); + rs->rs_size = svcpt->scp_service->srv_max_reply_size; + rs->rs_svcpt = svcpt; + rs->rs_prealloc = 1; +out: + return rs; +} + +void lustre_put_emerg_rs(struct ptlrpc_reply_state *rs) +{ + struct ptlrpc_service_part *svcpt = rs->rs_svcpt; + + spin_lock(&svcpt->scp_rep_lock); + list_add(&rs->rs_list, &svcpt->scp_rep_idle); + spin_unlock(&svcpt->scp_rep_lock); + wake_up(&svcpt->scp_rep_waitq); +} + +int lustre_pack_reply_v2(struct ptlrpc_request *req, int count, + __u32 *lens, char **bufs, int flags) +{ + struct ptlrpc_reply_state *rs; + int msg_len, rc; + + LASSERT(req->rq_reply_state == NULL); + + if ((flags & LPRFL_EARLY_REPLY) == 0) { + spin_lock(&req->rq_lock); + req->rq_packed_final = 1; + spin_unlock(&req->rq_lock); + } + + msg_len = lustre_msg_size_v2(count, lens); + rc = sptlrpc_svc_alloc_rs(req, msg_len); + if (rc) + return rc; + + rs = req->rq_reply_state; + atomic_set(&rs->rs_refcount, 1); /* 1 ref for rq_reply_state */ + rs->rs_cb_id.cbid_fn = reply_out_callback; + rs->rs_cb_id.cbid_arg = rs; + rs->rs_svcpt = req->rq_rqbd->rqbd_svcpt; + INIT_LIST_HEAD(&rs->rs_exp_list); + INIT_LIST_HEAD(&rs->rs_obd_list); + INIT_LIST_HEAD(&rs->rs_list); + spin_lock_init(&rs->rs_lock); + + req->rq_replen = msg_len; + req->rq_reply_state = rs; + req->rq_repmsg = rs->rs_msg; + + lustre_init_msg_v2(rs->rs_msg, count, lens, bufs); + lustre_msg_add_version(rs->rs_msg, PTLRPC_MSG_VERSION); + + PTLRPC_RS_DEBUG_LRU_ADD(rs); + + return 0; +} +EXPORT_SYMBOL(lustre_pack_reply_v2); + +int lustre_pack_reply_flags(struct ptlrpc_request *req, int count, __u32 *lens, + char **bufs, int flags) +{ + int rc = 0; + __u32 size[] = { sizeof(struct ptlrpc_body) }; + + if (!lens) { + LASSERT(count == 1); + lens = size; + } + + LASSERT(count > 0); + LASSERT(lens[MSG_PTLRPC_BODY_OFF] == sizeof(struct ptlrpc_body)); + + switch (req->rq_reqmsg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + rc = lustre_pack_reply_v2(req, count, lens, bufs, flags); + break; + default: + LASSERTF(0, "incorrect message magic: %08x\n", + req->rq_reqmsg->lm_magic); + rc = -EINVAL; + } + if (rc != 0) + CERROR("lustre_pack_reply failed: rc=%d size=%d\n", rc, + lustre_msg_size(req->rq_reqmsg->lm_magic, count, lens)); + return rc; +} +EXPORT_SYMBOL(lustre_pack_reply_flags); + +int lustre_pack_reply(struct ptlrpc_request *req, int count, __u32 *lens, + char **bufs) +{ + return lustre_pack_reply_flags(req, count, lens, bufs, 0); +} +EXPORT_SYMBOL(lustre_pack_reply); + +void *lustre_msg_buf_v2(struct lustre_msg_v2 *m, int n, int min_size) +{ + int i, offset, buflen, bufcount; + + LASSERT(m != NULL); + LASSERT(n >= 0); + + bufcount = m->lm_bufcount; + if (unlikely(n >= bufcount)) { + CDEBUG(D_INFO, "msg %p buffer[%d] not present (count %d)\n", + m, n, bufcount); + return NULL; + } + + buflen = m->lm_buflens[n]; + if (unlikely(buflen < min_size)) { + CERROR("msg %p buffer[%d] size %d too small (required %d, opc=%d)\n", + m, n, buflen, min_size, + n == MSG_PTLRPC_BODY_OFF ? 
-1 : lustre_msg_get_opc(m)); + return NULL; + } + + offset = lustre_msg_hdr_size_v2(bufcount); + for (i = 0; i < n; i++) + offset += cfs_size_round(m->lm_buflens[i]); + + return (char *)m + offset; +} + +void *lustre_msg_buf(struct lustre_msg *m, int n, int min_size) +{ + switch (m->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_msg_buf_v2(m, n, min_size); + default: + LASSERTF(0, "incorrect message magic: %08x(msg:%p)\n", m->lm_magic, m); + return NULL; + } +} +EXPORT_SYMBOL(lustre_msg_buf); + +int lustre_shrink_msg_v2(struct lustre_msg_v2 *msg, int segment, + unsigned int newlen, int move_data) +{ + char *tail = NULL, *newpos; + int tail_len = 0, n; + + LASSERT(msg); + LASSERT(msg->lm_bufcount > segment); + LASSERT(msg->lm_buflens[segment] >= newlen); + + if (msg->lm_buflens[segment] == newlen) + goto out; + + if (move_data && msg->lm_bufcount > segment + 1) { + tail = lustre_msg_buf_v2(msg, segment + 1, 0); + for (n = segment + 1; n < msg->lm_bufcount; n++) + tail_len += cfs_size_round(msg->lm_buflens[n]); + } + + msg->lm_buflens[segment] = newlen; + + if (tail && tail_len) { + newpos = lustre_msg_buf_v2(msg, segment + 1, 0); + LASSERT(newpos <= tail); + if (newpos != tail) + memmove(newpos, tail, tail_len); + } +out: + return lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens); +} + +/* + * for @msg, shrink @segment to size @newlen. if @move_data is non-zero, + * we also move data forward from @segment + 1. + * + * if @newlen == 0, we remove the segment completely, but we still keep the + * totally bufcount the same to save possible data moving. this will leave a + * unused segment with size 0 at the tail, but that's ok. + * + * return new msg size after shrinking. + * + * CAUTION: + * + if any buffers higher than @segment has been filled in, must call shrink + * with non-zero @move_data. + * + caller should NOT keep pointers to msg buffers which higher than @segment + * after call shrink. + */ +int lustre_shrink_msg(struct lustre_msg *msg, int segment, + unsigned int newlen, int move_data) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_shrink_msg_v2(msg, segment, newlen, move_data); + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_shrink_msg); + +void lustre_free_reply_state(struct ptlrpc_reply_state *rs) +{ + PTLRPC_RS_DEBUG_LRU_DEL(rs); + + LASSERT(atomic_read(&rs->rs_refcount) == 0); + LASSERT(!rs->rs_difficult || rs->rs_handled); + LASSERT(!rs->rs_on_net); + LASSERT(!rs->rs_scheduled); + LASSERT(rs->rs_export == NULL); + LASSERT(rs->rs_nlocks == 0); + LASSERT(list_empty(&rs->rs_exp_list)); + LASSERT(list_empty(&rs->rs_obd_list)); + + sptlrpc_svc_free_rs(rs); +} +EXPORT_SYMBOL(lustre_free_reply_state); + +static int lustre_unpack_msg_v2(struct lustre_msg_v2 *m, int len) +{ + int swabbed, required_len, i; + + /* Now we know the sender speaks my language. 
*/ + required_len = lustre_msg_hdr_size_v2(0); + if (len < required_len) { + /* can't even look inside the message */ + CERROR("message length %d too small for lustre_msg\n", len); + return -EINVAL; + } + + swabbed = (m->lm_magic == LUSTRE_MSG_MAGIC_V2_SWABBED); + + if (swabbed) { + __swab32s(&m->lm_magic); + __swab32s(&m->lm_bufcount); + __swab32s(&m->lm_secflvr); + __swab32s(&m->lm_repsize); + __swab32s(&m->lm_cksum); + __swab32s(&m->lm_flags); + CLASSERT(offsetof(typeof(*m), lm_padding_2) != 0); + CLASSERT(offsetof(typeof(*m), lm_padding_3) != 0); + } + + required_len = lustre_msg_hdr_size_v2(m->lm_bufcount); + if (len < required_len) { + /* didn't receive all the buffer lengths */ + CERROR("message length %d too small for %d buflens\n", + len, m->lm_bufcount); + return -EINVAL; + } + + for (i = 0; i < m->lm_bufcount; i++) { + if (swabbed) + __swab32s(&m->lm_buflens[i]); + required_len += cfs_size_round(m->lm_buflens[i]); + } + + if (len < required_len) { + CERROR("len: %d, required_len %d\n", len, required_len); + CERROR("bufcount: %d\n", m->lm_bufcount); + for (i = 0; i < m->lm_bufcount; i++) + CERROR("buffer %d length %d\n", i, m->lm_buflens[i]); + return -EINVAL; + } + + return swabbed; +} + +int __lustre_unpack_msg(struct lustre_msg *m, int len) +{ + int required_len, rc; + + /* We can provide a slightly better error log, if we check the + * message magic and version first. In the future, struct + * lustre_msg may grow, and we'd like to log a version mismatch, + * rather than a short message. + * + */ + required_len = offsetof(struct lustre_msg, lm_magic) + + sizeof(m->lm_magic); + if (len < required_len) { + /* can't even look inside the message */ + CERROR("message length %d too small for magic/version check\n", + len); + return -EINVAL; + } + + rc = lustre_unpack_msg_v2(m, len); + + return rc; +} +EXPORT_SYMBOL(__lustre_unpack_msg); + +int ptlrpc_unpack_req_msg(struct ptlrpc_request *req, int len) +{ + int rc; + rc = __lustre_unpack_msg(req->rq_reqmsg, len); + if (rc == 1) { + lustre_set_req_swabbed(req, MSG_PTLRPC_HEADER_OFF); + rc = 0; + } + return rc; +} +EXPORT_SYMBOL(ptlrpc_unpack_req_msg); + +int ptlrpc_unpack_rep_msg(struct ptlrpc_request *req, int len) +{ + int rc; + rc = __lustre_unpack_msg(req->rq_repmsg, len); + if (rc == 1) { + lustre_set_rep_swabbed(req, MSG_PTLRPC_HEADER_OFF); + rc = 0; + } + return rc; +} +EXPORT_SYMBOL(ptlrpc_unpack_rep_msg); + +static inline int lustre_unpack_ptlrpc_body_v2(struct ptlrpc_request *req, + const int inout, int offset) +{ + struct ptlrpc_body *pb; + struct lustre_msg_v2 *m = inout ? 
req->rq_reqmsg : req->rq_repmsg; + + pb = lustre_msg_buf_v2(m, offset, sizeof(struct ptlrpc_body_v2)); + if (!pb) { + CERROR("error unpacking ptlrpc body\n"); + return -EFAULT; + } + if (ptlrpc_buf_need_swab(req, inout, offset)) { + lustre_swab_ptlrpc_body(pb); + ptlrpc_buf_set_swabbed(req, inout, offset); + } + + if ((pb->pb_version & ~LUSTRE_VERSION_MASK) != PTLRPC_MSG_VERSION) { + CERROR("wrong lustre_msg version %08x\n", pb->pb_version); + return -EINVAL; + } + + if (!inout) + pb->pb_status = ptlrpc_status_ntoh(pb->pb_status); + + return 0; +} + +int lustre_unpack_req_ptlrpc_body(struct ptlrpc_request *req, int offset) +{ + switch (req->rq_reqmsg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_unpack_ptlrpc_body_v2(req, 1, offset); + default: + CERROR("bad lustre msg magic: %08x\n", + req->rq_reqmsg->lm_magic); + return -EINVAL; + } +} + +int lustre_unpack_rep_ptlrpc_body(struct ptlrpc_request *req, int offset) +{ + switch (req->rq_repmsg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_unpack_ptlrpc_body_v2(req, 0, offset); + default: + CERROR("bad lustre msg magic: %08x\n", + req->rq_repmsg->lm_magic); + return -EINVAL; + } +} + +static inline int lustre_msg_buflen_v2(struct lustre_msg_v2 *m, int n) +{ + if (n >= m->lm_bufcount) + return 0; + + return m->lm_buflens[n]; +} + +/** + * lustre_msg_buflen - return the length of buffer \a n in message \a m + * \param m lustre_msg (request or reply) to look at + * \param n message index (base 0) + * + * returns zero for non-existent message indices + */ +int lustre_msg_buflen(struct lustre_msg *m, int n) +{ + switch (m->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_msg_buflen_v2(m, n); + default: + CERROR("incorrect message magic: %08x\n", m->lm_magic); + return -EINVAL; + } +} +EXPORT_SYMBOL(lustre_msg_buflen); + +static inline void +lustre_msg_set_buflen_v2(struct lustre_msg_v2 *m, int n, int len) +{ + if (n >= m->lm_bufcount) + LBUG(); + + m->lm_buflens[n] = len; +} + +void lustre_msg_set_buflen(struct lustre_msg *m, int n, int len) +{ + switch (m->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + lustre_msg_set_buflen_v2(m, n, len); + return; + default: + LASSERTF(0, "incorrect message magic: %08x\n", m->lm_magic); + } +} + +EXPORT_SYMBOL(lustre_msg_set_buflen); + +/* NB return the bufcount for lustre_msg_v2 format, so if message is packed + * in V1 format, the result is one bigger. (add struct ptlrpc_body). 
*/ +int lustre_msg_bufcount(struct lustre_msg *m) +{ + switch (m->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return m->lm_bufcount; + default: + CERROR("incorrect message magic: %08x\n", m->lm_magic); + return -EINVAL; + } +} +EXPORT_SYMBOL(lustre_msg_bufcount); + +char *lustre_msg_string(struct lustre_msg *m, int index, int max_len) +{ + /* max_len == 0 means the string should fill the buffer */ + char *str; + int slen, blen; + + switch (m->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + str = lustre_msg_buf_v2(m, index, 0); + blen = lustre_msg_buflen_v2(m, index); + break; + default: + LASSERTF(0, "incorrect message magic: %08x\n", m->lm_magic); + } + + if (str == NULL) { + CERROR("can't unpack string in msg %p buffer[%d]\n", m, index); + return NULL; + } + + slen = strnlen(str, blen); + + if (slen == blen) { /* not NULL terminated */ + CERROR("can't unpack non-NULL terminated string in msg %p buffer[%d] len %d\n", + m, index, blen); + return NULL; + } + + if (max_len == 0) { + if (slen != blen - 1) { + CERROR("can't unpack short string in msg %p buffer[%d] len %d: strlen %d\n", + m, index, blen, slen); + return NULL; + } + } else if (slen > max_len) { + CERROR("can't unpack oversized string in msg %p buffer[%d] len %d strlen %d: max %d expected\n", + m, index, blen, slen, max_len); + return NULL; + } + + return str; +} +EXPORT_SYMBOL(lustre_msg_string); + +/* Wrap up the normal fixed length cases */ +static inline void *__lustre_swab_buf(struct lustre_msg *msg, int index, + int min_size, void *swabber) +{ + void *ptr = NULL; + + LASSERT(msg != NULL); + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + ptr = lustre_msg_buf_v2(msg, index, min_size); + break; + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + } + + if (ptr && swabber) + ((void (*)(void *))swabber)(ptr); + + return ptr; +} + +static inline struct ptlrpc_body *lustre_msg_ptlrpc_body(struct lustre_msg *msg) +{ + return lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, + sizeof(struct ptlrpc_body_v2)); +} + +__u32 lustre_msghdr_get_flags(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + case LUSTRE_MSG_MAGIC_V1_SWABBED: + return 0; + case LUSTRE_MSG_MAGIC_V2: + /* already in host endian */ + return msg->lm_flags; + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_msghdr_get_flags); + +void lustre_msghdr_set_flags(struct lustre_msg *msg, __u32 flags) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + return; + case LUSTRE_MSG_MAGIC_V2: + msg->lm_flags = flags; + return; + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +__u32 lustre_msg_get_flags(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_flags; + } + default: + /* flags might be printed in debug code while message + * uninitialized */ + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_flags); + +void lustre_msg_add_flags(struct lustre_msg *msg, int flags) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_flags |= flags; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_add_flags); + +void lustre_msg_set_flags(struct lustre_msg 
*msg, int flags) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_flags = flags; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_flags); + +void lustre_msg_clear_flags(struct lustre_msg *msg, int flags) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_flags &= ~(MSG_GEN_FLAG_MASK & flags); + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_clear_flags); + +__u32 lustre_msg_get_op_flags(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_op_flags; + } + default: + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_op_flags); + +void lustre_msg_add_op_flags(struct lustre_msg *msg, int flags) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_op_flags |= flags; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_add_op_flags); + +void lustre_msg_set_op_flags(struct lustre_msg *msg, int flags) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_op_flags |= flags; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_op_flags); + +struct lustre_handle *lustre_msg_get_handle(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return NULL; + } + return &pb->pb_handle; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return NULL; + } +} +EXPORT_SYMBOL(lustre_msg_get_handle); + +__u32 lustre_msg_get_type(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return PTL_RPC_MSG_ERR; + } + return pb->pb_type; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return PTL_RPC_MSG_ERR; + } +} +EXPORT_SYMBOL(lustre_msg_get_type); + +__u32 lustre_msg_get_version(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_version; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_version); + +void lustre_msg_add_version(struct lustre_msg *msg, int version) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_version |= version; + return; + } + default: + LASSERTF(0, "incorrect message magic: 
%08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_add_version); + +__u32 lustre_msg_get_opc(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_opc; + } + default: + CERROR("incorrect message magic: %08x(msg:%p)\n", msg->lm_magic, msg); + LBUG(); + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_opc); + +__u64 lustre_msg_get_last_xid(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_last_xid; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_last_xid); + +__u64 lustre_msg_get_last_committed(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_last_committed; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_last_committed); + +__u64 *lustre_msg_get_versions(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + return NULL; + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return NULL; + } + return pb->pb_pre_versions; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return NULL; + } +} +EXPORT_SYMBOL(lustre_msg_get_versions); + +__u64 lustre_msg_get_transno(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_transno; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_transno); + +int lustre_msg_get_status(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return -EINVAL; + } + return pb->pb_status; + } + default: + /* status might be printed in debug code while message + * uninitialized */ + return -EINVAL; + } +} +EXPORT_SYMBOL(lustre_msg_get_status); + +__u64 lustre_msg_get_slv(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return -EINVAL; + } + return pb->pb_slv; + } + default: + CERROR("invalid msg magic %08x\n", msg->lm_magic); + return -EINVAL; + } +} +EXPORT_SYMBOL(lustre_msg_get_slv); + + +void lustre_msg_set_slv(struct lustre_msg *msg, __u64 slv) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return; + } + pb->pb_slv = slv; + return; + } + default: + CERROR("invalid msg magic %x\n", msg->lm_magic); + return; + } +} +EXPORT_SYMBOL(lustre_msg_set_slv); + +__u32 lustre_msg_get_limit(struct lustre_msg *msg) +{ + switch 
(msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return -EINVAL; + } + return pb->pb_limit; + } + default: + CERROR("invalid msg magic %x\n", msg->lm_magic); + return -EINVAL; + } +} +EXPORT_SYMBOL(lustre_msg_get_limit); + + +void lustre_msg_set_limit(struct lustre_msg *msg, __u64 limit) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return; + } + pb->pb_limit = limit; + return; + } + default: + CERROR("invalid msg magic %08x\n", msg->lm_magic); + return; + } +} +EXPORT_SYMBOL(lustre_msg_set_limit); + +__u32 lustre_msg_get_conn_cnt(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_conn_cnt; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_conn_cnt); + +int lustre_msg_is_v1(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + case LUSTRE_MSG_MAGIC_V1_SWABBED: + return 1; + default: + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_is_v1); + +__u32 lustre_msg_get_magic(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return msg->lm_magic; + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_magic); + +__u32 lustre_msg_get_timeout(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + case LUSTRE_MSG_MAGIC_V1_SWABBED: + return 0; + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + + } + return pb->pb_timeout; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} + +__u32 lustre_msg_get_service_time(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + case LUSTRE_MSG_MAGIC_V1_SWABBED: + return 0; + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + + } + return pb->pb_service_time; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} + +char *lustre_msg_get_jobid(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + case LUSTRE_MSG_MAGIC_V1_SWABBED: + return NULL; + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = + lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, + sizeof(struct ptlrpc_body)); + if (!pb) + return NULL; + + return pb->pb_jobid; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return NULL; + } +} +EXPORT_SYMBOL(lustre_msg_get_jobid); + +__u32 lustre_msg_get_cksum(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return msg->lm_cksum; + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} + +__u32 lustre_msg_calc_cksum(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + __u32 crc; + unsigned int hsize = 4; + cfs_crypto_hash_digest(CFS_HASH_ALG_CRC32, (unsigned char *)pb, 
+ lustre_msg_buflen(msg, MSG_PTLRPC_BODY_OFF), + NULL, 0, (unsigned char *)&crc, &hsize); + return crc; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} + +void lustre_msg_set_handle(struct lustre_msg *msg, struct lustre_handle *handle) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_handle = *handle; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_handle); + +void lustre_msg_set_type(struct lustre_msg *msg, __u32 type) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_type = type; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_type); + +void lustre_msg_set_opc(struct lustre_msg *msg, __u32 opc) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_opc = opc; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_opc); + +void lustre_msg_set_last_xid(struct lustre_msg *msg, __u64 last_xid) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_last_xid = last_xid; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_last_xid); + +void lustre_msg_set_last_committed(struct lustre_msg *msg, __u64 last_committed) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_last_committed = last_committed; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_last_committed); + +void lustre_msg_set_versions(struct lustre_msg *msg, __u64 *versions) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + return; + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_pre_versions[0] = versions[0]; + pb->pb_pre_versions[1] = versions[1]; + pb->pb_pre_versions[2] = versions[2]; + pb->pb_pre_versions[3] = versions[3]; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_versions); + +void lustre_msg_set_transno(struct lustre_msg *msg, __u64 transno) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_transno = transno; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_transno); + +void lustre_msg_set_status(struct lustre_msg *msg, __u32 status) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_status = status; + return; + } + default: + 
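+		/*
+		 * A note on the accessor pattern used throughout this file
+		 * (a descriptive sketch, not additional logic): every
+		 * lustre_msg_get_*() / lustre_msg_set_*() helper switches on
+		 * lm_magic and locates the message body via
+		 *
+		 *	pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF,
+		 *			       sizeof(struct ptlrpc_body_v2));
+		 *
+		 * before touching a single pb_* field.  Getters tolerate a
+		 * missing body (CERROR plus a benign default value), whereas
+		 * setters such as this one treat it as a bug (LASSERTF); the
+		 * default branch below rejects an unknown magic the same way.
+		 */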
LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_status); + +void lustre_msg_set_conn_cnt(struct lustre_msg *msg, __u32 conn_cnt) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_conn_cnt = conn_cnt; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_conn_cnt); + +void lustre_msg_set_timeout(struct lustre_msg *msg, __u32 timeout) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + return; + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_timeout = timeout; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +void lustre_msg_set_service_time(struct lustre_msg *msg, __u32 service_time) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + return; + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_service_time = service_time; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +void lustre_msg_set_jobid(struct lustre_msg *msg, char *jobid) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + return; + case LUSTRE_MSG_MAGIC_V2: { + __u32 opc = lustre_msg_get_opc(msg); + struct ptlrpc_body *pb; + + /* Don't set jobid for ldlm ast RPCs, they've been shrunk. + * See the comment in ptlrpc_request_pack(). */ + if (!opc || opc == LDLM_BL_CALLBACK || + opc == LDLM_CP_CALLBACK || opc == LDLM_GL_CALLBACK) + return; + + pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, + sizeof(struct ptlrpc_body)); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + + if (jobid != NULL) + memcpy(pb->pb_jobid, jobid, JOBSTATS_JOBID_SIZE); + else if (pb->pb_jobid[0] == '\0') + lustre_get_jobid(pb->pb_jobid); + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_jobid); + +void lustre_msg_set_cksum(struct lustre_msg *msg, __u32 cksum) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + return; + case LUSTRE_MSG_MAGIC_V2: + msg->lm_cksum = cksum; + return; + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + + +void ptlrpc_request_set_replen(struct ptlrpc_request *req) +{ + int count = req_capsule_filled_sizes(&req->rq_pill, RCL_SERVER); + + req->rq_replen = lustre_msg_size(req->rq_reqmsg->lm_magic, count, + req->rq_pill.rc_area[RCL_SERVER]); + if (req->rq_reqmsg->lm_magic == LUSTRE_MSG_MAGIC_V2) + req->rq_reqmsg->lm_repsize = req->rq_replen; +} +EXPORT_SYMBOL(ptlrpc_request_set_replen); + +void ptlrpc_req_set_repsize(struct ptlrpc_request *req, int count, __u32 *lens) +{ + req->rq_replen = lustre_msg_size(req->rq_reqmsg->lm_magic, count, lens); + if (req->rq_reqmsg->lm_magic == LUSTRE_MSG_MAGIC_V2) + req->rq_reqmsg->lm_repsize = req->rq_replen; +} +EXPORT_SYMBOL(ptlrpc_req_set_repsize); + +/** + * Send a remote set_info_async. + * + * This may go from client to server or server to client. 
+ */ +int do_set_info_async(struct obd_import *imp, + int opcode, int version, + u32 keylen, void *key, + u32 vallen, void *val, + struct ptlrpc_request_set *set) +{ + struct ptlrpc_request *req; + char *tmp; + int rc; + + req = ptlrpc_request_alloc(imp, &RQF_OBD_SET_INFO); + if (req == NULL) + return -ENOMEM; + + req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_KEY, + RCL_CLIENT, keylen); + req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_VAL, + RCL_CLIENT, vallen); + rc = ptlrpc_request_pack(req, version, opcode); + if (rc) { + ptlrpc_request_free(req); + return rc; + } + + tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY); + memcpy(tmp, key, keylen); + tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_VAL); + memcpy(tmp, val, vallen); + + ptlrpc_request_set_replen(req); + + if (set) { + ptlrpc_set_add_req(set, req); + ptlrpc_check_set(NULL, set); + } else { + rc = ptlrpc_queue_wait(req); + ptlrpc_req_finished(req); + } + + return rc; +} +EXPORT_SYMBOL(do_set_info_async); + +/* byte flipping routines for all wire types declared in + * lustre_idl.h implemented here. + */ +void lustre_swab_ptlrpc_body(struct ptlrpc_body *b) +{ + __swab32s(&b->pb_type); + __swab32s(&b->pb_version); + __swab32s(&b->pb_opc); + __swab32s(&b->pb_status); + __swab64s(&b->pb_last_xid); + __swab64s(&b->pb_last_seen); + __swab64s(&b->pb_last_committed); + __swab64s(&b->pb_transno); + __swab32s(&b->pb_flags); + __swab32s(&b->pb_op_flags); + __swab32s(&b->pb_conn_cnt); + __swab32s(&b->pb_timeout); + __swab32s(&b->pb_service_time); + __swab32s(&b->pb_limit); + __swab64s(&b->pb_slv); + __swab64s(&b->pb_pre_versions[0]); + __swab64s(&b->pb_pre_versions[1]); + __swab64s(&b->pb_pre_versions[2]); + __swab64s(&b->pb_pre_versions[3]); + CLASSERT(offsetof(typeof(*b), pb_padding) != 0); + /* While we need to maintain compatibility between + * clients and servers without ptlrpc_body_v2 (< 2.3) + * do not swab any fields beyond pb_jobid, as we are + * using this swab function for both ptlrpc_body + * and ptlrpc_body_v2. */ + CLASSERT(offsetof(typeof(*b), pb_jobid) != 0); +} +EXPORT_SYMBOL(lustre_swab_ptlrpc_body); + +void lustre_swab_connect(struct obd_connect_data *ocd) +{ + __swab64s(&ocd->ocd_connect_flags); + __swab32s(&ocd->ocd_version); + __swab32s(&ocd->ocd_grant); + __swab64s(&ocd->ocd_ibits_known); + __swab32s(&ocd->ocd_index); + __swab32s(&ocd->ocd_brw_size); + /* ocd_blocksize and ocd_inodespace don't need to be swabbed because + * they are 8-byte values */ + __swab16s(&ocd->ocd_grant_extent); + __swab32s(&ocd->ocd_unused); + __swab64s(&ocd->ocd_transno); + __swab32s(&ocd->ocd_group); + __swab32s(&ocd->ocd_cksum_types); + __swab32s(&ocd->ocd_instance); + /* Fields after ocd_cksum_types are only accessible by the receiver + * if the corresponding flag in ocd_connect_flags is set. Accessing + * any field after ocd_maxbytes on the receiver without a valid flag + * may result in out-of-bound memory access and kernel oops. 
*/ + if (ocd->ocd_connect_flags & OBD_CONNECT_MAX_EASIZE) + __swab32s(&ocd->ocd_max_easize); + if (ocd->ocd_connect_flags & OBD_CONNECT_MAXBYTES) + __swab64s(&ocd->ocd_maxbytes); + CLASSERT(offsetof(typeof(*ocd), padding1) != 0); + CLASSERT(offsetof(typeof(*ocd), padding2) != 0); + CLASSERT(offsetof(typeof(*ocd), padding3) != 0); + CLASSERT(offsetof(typeof(*ocd), padding4) != 0); + CLASSERT(offsetof(typeof(*ocd), padding5) != 0); + CLASSERT(offsetof(typeof(*ocd), padding6) != 0); + CLASSERT(offsetof(typeof(*ocd), padding7) != 0); + CLASSERT(offsetof(typeof(*ocd), padding8) != 0); + CLASSERT(offsetof(typeof(*ocd), padding9) != 0); + CLASSERT(offsetof(typeof(*ocd), paddingA) != 0); + CLASSERT(offsetof(typeof(*ocd), paddingB) != 0); + CLASSERT(offsetof(typeof(*ocd), paddingC) != 0); + CLASSERT(offsetof(typeof(*ocd), paddingD) != 0); + CLASSERT(offsetof(typeof(*ocd), paddingE) != 0); + CLASSERT(offsetof(typeof(*ocd), paddingF) != 0); +} + +void lustre_swab_obdo(struct obdo *o) +{ + __swab64s(&o->o_valid); + lustre_swab_ost_id(&o->o_oi); + __swab64s(&o->o_parent_seq); + __swab64s(&o->o_size); + __swab64s(&o->o_mtime); + __swab64s(&o->o_atime); + __swab64s(&o->o_ctime); + __swab64s(&o->o_blocks); + __swab64s(&o->o_grant); + __swab32s(&o->o_blksize); + __swab32s(&o->o_mode); + __swab32s(&o->o_uid); + __swab32s(&o->o_gid); + __swab32s(&o->o_flags); + __swab32s(&o->o_nlink); + __swab32s(&o->o_parent_oid); + __swab32s(&o->o_misc); + __swab64s(&o->o_ioepoch); + __swab32s(&o->o_stripe_idx); + __swab32s(&o->o_parent_ver); + /* o_handle is opaque */ + /* o_lcookie is swabbed elsewhere */ + __swab32s(&o->o_uid_h); + __swab32s(&o->o_gid_h); + __swab64s(&o->o_data_version); + CLASSERT(offsetof(typeof(*o), o_padding_4) != 0); + CLASSERT(offsetof(typeof(*o), o_padding_5) != 0); + CLASSERT(offsetof(typeof(*o), o_padding_6) != 0); + +} +EXPORT_SYMBOL(lustre_swab_obdo); + +void lustre_swab_obd_statfs(struct obd_statfs *os) +{ + __swab64s(&os->os_type); + __swab64s(&os->os_blocks); + __swab64s(&os->os_bfree); + __swab64s(&os->os_bavail); + __swab64s(&os->os_files); + __swab64s(&os->os_ffree); + /* no need to swab os_fsid */ + __swab32s(&os->os_bsize); + __swab32s(&os->os_namelen); + __swab64s(&os->os_maxbytes); + __swab32s(&os->os_state); + CLASSERT(offsetof(typeof(*os), os_fprecreated) != 0); + CLASSERT(offsetof(typeof(*os), os_spare2) != 0); + CLASSERT(offsetof(typeof(*os), os_spare3) != 0); + CLASSERT(offsetof(typeof(*os), os_spare4) != 0); + CLASSERT(offsetof(typeof(*os), os_spare5) != 0); + CLASSERT(offsetof(typeof(*os), os_spare6) != 0); + CLASSERT(offsetof(typeof(*os), os_spare7) != 0); + CLASSERT(offsetof(typeof(*os), os_spare8) != 0); + CLASSERT(offsetof(typeof(*os), os_spare9) != 0); +} +EXPORT_SYMBOL(lustre_swab_obd_statfs); + +void lustre_swab_obd_ioobj(struct obd_ioobj *ioo) +{ + lustre_swab_ost_id(&ioo->ioo_oid); + __swab32s(&ioo->ioo_max_brw); + __swab32s(&ioo->ioo_bufcnt); +} +EXPORT_SYMBOL(lustre_swab_obd_ioobj); + +void lustre_swab_niobuf_remote(struct niobuf_remote *nbr) +{ + __swab64s(&nbr->offset); + __swab32s(&nbr->len); + __swab32s(&nbr->flags); +} +EXPORT_SYMBOL(lustre_swab_niobuf_remote); + +void lustre_swab_ost_body(struct ost_body *b) +{ + lustre_swab_obdo(&b->oa); +} +EXPORT_SYMBOL(lustre_swab_ost_body); + +void lustre_swab_ost_last_id(u64 *id) +{ + __swab64s(id); +} +EXPORT_SYMBOL(lustre_swab_ost_last_id); + +void lustre_swab_generic_32s(__u32 *val) +{ + __swab32s(val); +} +EXPORT_SYMBOL(lustre_swab_generic_32s); + +void lustre_swab_gl_desc(union ldlm_gl_desc *desc) +{ + 
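+	/*
+	 * A sketch of the swabbing convention followed by every
+	 * lustre_swab_*() helper here (descriptive only; gl_example below is
+	 * a hypothetical field, not part of ldlm_gl_desc): when the peer's
+	 * endianness differs (detected via the *_SWABBED magic), each
+	 * multi-byte field is byte-swapped in declaration order with
+	 * __swab16s/__swab32s/__swab64s, opaque byte arrays (handles, uuids)
+	 * are left alone, and CLASSERT(offsetof(...) != 0) pins the padding
+	 * layout at compile time.  Adding a field would add one line, e.g.
+	 *
+	 *	__swab64s(&desc->lquota_desc.gl_example);
+	 */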
lustre_swab_lu_fid(&desc->lquota_desc.gl_id.qid_fid); + __swab64s(&desc->lquota_desc.gl_flags); + __swab64s(&desc->lquota_desc.gl_ver); + __swab64s(&desc->lquota_desc.gl_hardlimit); + __swab64s(&desc->lquota_desc.gl_softlimit); + __swab64s(&desc->lquota_desc.gl_time); + CLASSERT(offsetof(typeof(desc->lquota_desc), gl_pad2) != 0); +} + +void lustre_swab_ost_lvb_v1(struct ost_lvb_v1 *lvb) +{ + __swab64s(&lvb->lvb_size); + __swab64s(&lvb->lvb_mtime); + __swab64s(&lvb->lvb_atime); + __swab64s(&lvb->lvb_ctime); + __swab64s(&lvb->lvb_blocks); +} +EXPORT_SYMBOL(lustre_swab_ost_lvb_v1); + +void lustre_swab_ost_lvb(struct ost_lvb *lvb) +{ + __swab64s(&lvb->lvb_size); + __swab64s(&lvb->lvb_mtime); + __swab64s(&lvb->lvb_atime); + __swab64s(&lvb->lvb_ctime); + __swab64s(&lvb->lvb_blocks); + __swab32s(&lvb->lvb_mtime_ns); + __swab32s(&lvb->lvb_atime_ns); + __swab32s(&lvb->lvb_ctime_ns); + __swab32s(&lvb->lvb_padding); +} +EXPORT_SYMBOL(lustre_swab_ost_lvb); + +void lustre_swab_lquota_lvb(struct lquota_lvb *lvb) +{ + __swab64s(&lvb->lvb_flags); + __swab64s(&lvb->lvb_id_may_rel); + __swab64s(&lvb->lvb_id_rel); + __swab64s(&lvb->lvb_id_qunit); + __swab64s(&lvb->lvb_pad1); +} +EXPORT_SYMBOL(lustre_swab_lquota_lvb); + +void lustre_swab_mdt_body(struct mdt_body *b) +{ + lustre_swab_lu_fid(&b->fid1); + lustre_swab_lu_fid(&b->fid2); + /* handle is opaque */ + __swab64s(&b->valid); + __swab64s(&b->size); + __swab64s(&b->mtime); + __swab64s(&b->atime); + __swab64s(&b->ctime); + __swab64s(&b->blocks); + __swab64s(&b->ioepoch); + __swab64s(&b->t_state); + __swab32s(&b->fsuid); + __swab32s(&b->fsgid); + __swab32s(&b->capability); + __swab32s(&b->mode); + __swab32s(&b->uid); + __swab32s(&b->gid); + __swab32s(&b->flags); + __swab32s(&b->rdev); + __swab32s(&b->nlink); + CLASSERT(offsetof(typeof(*b), unused2) != 0); + __swab32s(&b->suppgid); + __swab32s(&b->eadatasize); + __swab32s(&b->aclsize); + __swab32s(&b->max_mdsize); + __swab32s(&b->max_cookiesize); + __swab32s(&b->uid_h); + __swab32s(&b->gid_h); + CLASSERT(offsetof(typeof(*b), padding_5) != 0); +} +EXPORT_SYMBOL(lustre_swab_mdt_body); + +void lustre_swab_mdt_ioepoch(struct mdt_ioepoch *b) +{ + /* handle is opaque */ + __swab64s(&b->ioepoch); + __swab32s(&b->flags); + CLASSERT(offsetof(typeof(*b), padding) != 0); +} +EXPORT_SYMBOL(lustre_swab_mdt_ioepoch); + +void lustre_swab_mgs_target_info(struct mgs_target_info *mti) +{ + int i; + __swab32s(&mti->mti_lustre_ver); + __swab32s(&mti->mti_stripe_index); + __swab32s(&mti->mti_config_ver); + __swab32s(&mti->mti_flags); + __swab32s(&mti->mti_instance); + __swab32s(&mti->mti_nid_count); + CLASSERT(sizeof(lnet_nid_t) == sizeof(__u64)); + for (i = 0; i < MTI_NIDS_MAX; i++) + __swab64s(&mti->mti_nids[i]); +} +EXPORT_SYMBOL(lustre_swab_mgs_target_info); + +void lustre_swab_mgs_nidtbl_entry(struct mgs_nidtbl_entry *entry) +{ + int i; + + __swab64s(&entry->mne_version); + __swab32s(&entry->mne_instance); + __swab32s(&entry->mne_index); + __swab32s(&entry->mne_length); + + /* mne_nid_(count|type) must be one byte size because we're gonna + * access it w/o swapping. */ + CLASSERT(sizeof(entry->mne_nid_count) == sizeof(__u8)); + CLASSERT(sizeof(entry->mne_nid_type) == sizeof(__u8)); + + /* remove this assertion if ipv6 is supported. 
*/ + LASSERT(entry->mne_nid_type == 0); + for (i = 0; i < entry->mne_nid_count; i++) { + CLASSERT(sizeof(lnet_nid_t) == sizeof(__u64)); + __swab64s(&entry->u.nids[i]); + } +} +EXPORT_SYMBOL(lustre_swab_mgs_nidtbl_entry); + +void lustre_swab_mgs_config_body(struct mgs_config_body *body) +{ + __swab64s(&body->mcb_offset); + __swab32s(&body->mcb_units); + __swab16s(&body->mcb_type); +} +EXPORT_SYMBOL(lustre_swab_mgs_config_body); + +void lustre_swab_mgs_config_res(struct mgs_config_res *body) +{ + __swab64s(&body->mcr_offset); + __swab64s(&body->mcr_size); +} +EXPORT_SYMBOL(lustre_swab_mgs_config_res); + +static void lustre_swab_obd_dqinfo(struct obd_dqinfo *i) +{ + __swab64s(&i->dqi_bgrace); + __swab64s(&i->dqi_igrace); + __swab32s(&i->dqi_flags); + __swab32s(&i->dqi_valid); +} + +static void lustre_swab_obd_dqblk(struct obd_dqblk *b) +{ + __swab64s(&b->dqb_ihardlimit); + __swab64s(&b->dqb_isoftlimit); + __swab64s(&b->dqb_curinodes); + __swab64s(&b->dqb_bhardlimit); + __swab64s(&b->dqb_bsoftlimit); + __swab64s(&b->dqb_curspace); + __swab64s(&b->dqb_btime); + __swab64s(&b->dqb_itime); + __swab32s(&b->dqb_valid); + CLASSERT(offsetof(typeof(*b), dqb_padding) != 0); +} + +void lustre_swab_obd_quotactl(struct obd_quotactl *q) +{ + __swab32s(&q->qc_cmd); + __swab32s(&q->qc_type); + __swab32s(&q->qc_id); + __swab32s(&q->qc_stat); + lustre_swab_obd_dqinfo(&q->qc_dqinfo); + lustre_swab_obd_dqblk(&q->qc_dqblk); +} +EXPORT_SYMBOL(lustre_swab_obd_quotactl); + +void lustre_swab_mdt_remote_perm(struct mdt_remote_perm *p) +{ + __swab32s(&p->rp_uid); + __swab32s(&p->rp_gid); + __swab32s(&p->rp_fsuid); + __swab32s(&p->rp_fsuid_h); + __swab32s(&p->rp_fsgid); + __swab32s(&p->rp_fsgid_h); + __swab32s(&p->rp_access_perm); + __swab32s(&p->rp_padding); +}; +EXPORT_SYMBOL(lustre_swab_mdt_remote_perm); + +void lustre_swab_fid2path(struct getinfo_fid2path *gf) +{ + lustre_swab_lu_fid(&gf->gf_fid); + __swab64s(&gf->gf_recno); + __swab32s(&gf->gf_linkno); + __swab32s(&gf->gf_pathlen); +} +EXPORT_SYMBOL(lustre_swab_fid2path); + +void lustre_swab_fiemap_extent(struct ll_fiemap_extent *fm_extent) +{ + __swab64s(&fm_extent->fe_logical); + __swab64s(&fm_extent->fe_physical); + __swab64s(&fm_extent->fe_length); + __swab32s(&fm_extent->fe_flags); + __swab32s(&fm_extent->fe_device); +} + +void lustre_swab_fiemap(struct ll_user_fiemap *fiemap) +{ + int i; + + __swab64s(&fiemap->fm_start); + __swab64s(&fiemap->fm_length); + __swab32s(&fiemap->fm_flags); + __swab32s(&fiemap->fm_mapped_extents); + __swab32s(&fiemap->fm_extent_count); + __swab32s(&fiemap->fm_reserved); + + for (i = 0; i < fiemap->fm_mapped_extents; i++) + lustre_swab_fiemap_extent(&fiemap->fm_extents[i]); +} +EXPORT_SYMBOL(lustre_swab_fiemap); + +void lustre_swab_idx_info(struct idx_info *ii) +{ + __swab32s(&ii->ii_magic); + __swab32s(&ii->ii_flags); + __swab16s(&ii->ii_count); + __swab32s(&ii->ii_attrs); + lustre_swab_lu_fid(&ii->ii_fid); + __swab64s(&ii->ii_version); + __swab64s(&ii->ii_hash_start); + __swab64s(&ii->ii_hash_end); + __swab16s(&ii->ii_keysize); + __swab16s(&ii->ii_recsize); +} + +void lustre_swab_lip_header(struct lu_idxpage *lip) +{ + /* swab header */ + __swab32s(&lip->lip_magic); + __swab16s(&lip->lip_flags); + __swab16s(&lip->lip_nr); +} +EXPORT_SYMBOL(lustre_swab_lip_header); + +void lustre_swab_mdt_rec_reint (struct mdt_rec_reint *rr) +{ + __swab32s(&rr->rr_opcode); + __swab32s(&rr->rr_cap); + __swab32s(&rr->rr_fsuid); + /* rr_fsuid_h is unused */ + __swab32s(&rr->rr_fsgid); + /* rr_fsgid_h is unused */ + __swab32s(&rr->rr_suppgid1); + /* 
rr_suppgid1_h is unused */ + __swab32s(&rr->rr_suppgid2); + /* rr_suppgid2_h is unused */ + lustre_swab_lu_fid(&rr->rr_fid1); + lustre_swab_lu_fid(&rr->rr_fid2); + __swab64s(&rr->rr_mtime); + __swab64s(&rr->rr_atime); + __swab64s(&rr->rr_ctime); + __swab64s(&rr->rr_size); + __swab64s(&rr->rr_blocks); + __swab32s(&rr->rr_bias); + __swab32s(&rr->rr_mode); + __swab32s(&rr->rr_flags); + __swab32s(&rr->rr_flags_h); + __swab32s(&rr->rr_umask); + + CLASSERT(offsetof(typeof(*rr), rr_padding_4) != 0); +}; +EXPORT_SYMBOL(lustre_swab_mdt_rec_reint); + +void lustre_swab_lov_desc(struct lov_desc *ld) +{ + __swab32s(&ld->ld_tgt_count); + __swab32s(&ld->ld_active_tgt_count); + __swab32s(&ld->ld_default_stripe_count); + __swab32s(&ld->ld_pattern); + __swab64s(&ld->ld_default_stripe_size); + __swab64s(&ld->ld_default_stripe_offset); + __swab32s(&ld->ld_qos_maxage); + /* uuid endian insensitive */ +} +EXPORT_SYMBOL(lustre_swab_lov_desc); + +void lustre_swab_lmv_desc(struct lmv_desc *ld) +{ + __swab32s(&ld->ld_tgt_count); + __swab32s(&ld->ld_active_tgt_count); + __swab32s(&ld->ld_default_stripe_count); + __swab32s(&ld->ld_pattern); + __swab64s(&ld->ld_default_hash_size); + __swab32s(&ld->ld_qos_maxage); + /* uuid endian insensitive */ +} + +void lustre_swab_lmv_stripe_md(struct lmv_stripe_md *mea) +{ + __swab32s(&mea->mea_magic); + __swab32s(&mea->mea_count); + __swab32s(&mea->mea_master); + CLASSERT(offsetof(typeof(*mea), mea_padding) != 0); +} + +void lustre_swab_lmv_user_md(struct lmv_user_md *lum) +{ + int i; + + __swab32s(&lum->lum_magic); + __swab32s(&lum->lum_stripe_count); + __swab32s(&lum->lum_stripe_offset); + __swab32s(&lum->lum_hash_type); + __swab32s(&lum->lum_type); + CLASSERT(offsetof(typeof(*lum), lum_padding1) != 0); + CLASSERT(offsetof(typeof(*lum), lum_padding2) != 0); + CLASSERT(offsetof(typeof(*lum), lum_padding3) != 0); + + for (i = 0; i < lum->lum_stripe_count; i++) { + __swab32s(&lum->lum_objects[i].lum_mds); + lustre_swab_lu_fid(&lum->lum_objects[i].lum_fid); + } + +} +EXPORT_SYMBOL(lustre_swab_lmv_user_md); + +static void print_lum(struct lov_user_md *lum) +{ + CDEBUG(D_OTHER, "lov_user_md %p:\n", lum); + CDEBUG(D_OTHER, "\tlmm_magic: %#x\n", lum->lmm_magic); + CDEBUG(D_OTHER, "\tlmm_pattern: %#x\n", lum->lmm_pattern); + CDEBUG(D_OTHER, "\tlmm_object_id: %llu\n", lmm_oi_id(&lum->lmm_oi)); + CDEBUG(D_OTHER, "\tlmm_object_gr: %llu\n", lmm_oi_seq(&lum->lmm_oi)); + CDEBUG(D_OTHER, "\tlmm_stripe_size: %#x\n", lum->lmm_stripe_size); + CDEBUG(D_OTHER, "\tlmm_stripe_count: %#x\n", lum->lmm_stripe_count); + CDEBUG(D_OTHER, "\tlmm_stripe_offset/lmm_layout_gen: %#x\n", + lum->lmm_stripe_offset); +} + +static void lustre_swab_lmm_oi(struct ost_id *oi) +{ + __swab64s(&oi->oi.oi_id); + __swab64s(&oi->oi.oi_seq); +} + +static void lustre_swab_lov_user_md_common(struct lov_user_md_v1 *lum) +{ + __swab32s(&lum->lmm_magic); + __swab32s(&lum->lmm_pattern); + lustre_swab_lmm_oi(&lum->lmm_oi); + __swab32s(&lum->lmm_stripe_size); + __swab16s(&lum->lmm_stripe_count); + __swab16s(&lum->lmm_stripe_offset); + print_lum(lum); +} + +void lustre_swab_lov_user_md_v1(struct lov_user_md_v1 *lum) +{ + CDEBUG(D_IOCTL, "swabbing lov_user_md v1\n"); + lustre_swab_lov_user_md_common(lum); +} +EXPORT_SYMBOL(lustre_swab_lov_user_md_v1); + +void lustre_swab_lov_user_md_v3(struct lov_user_md_v3 *lum) +{ + CDEBUG(D_IOCTL, "swabbing lov_user_md v3\n"); + lustre_swab_lov_user_md_common((struct lov_user_md_v1 *)lum); + /* lmm_pool_name nothing to do with char */ +} +EXPORT_SYMBOL(lustre_swab_lov_user_md_v3); + +void 
lustre_swab_lov_mds_md(struct lov_mds_md *lmm) +{ + CDEBUG(D_IOCTL, "swabbing lov_mds_md\n"); + __swab32s(&lmm->lmm_magic); + __swab32s(&lmm->lmm_pattern); + lustre_swab_lmm_oi(&lmm->lmm_oi); + __swab32s(&lmm->lmm_stripe_size); + __swab16s(&lmm->lmm_stripe_count); + __swab16s(&lmm->lmm_layout_gen); +} +EXPORT_SYMBOL(lustre_swab_lov_mds_md); + +void lustre_swab_lov_user_md_objects(struct lov_user_ost_data *lod, + int stripe_count) +{ + int i; + + for (i = 0; i < stripe_count; i++) { + lustre_swab_ost_id(&(lod[i].l_ost_oi)); + __swab32s(&(lod[i].l_ost_gen)); + __swab32s(&(lod[i].l_ost_idx)); + } +} +EXPORT_SYMBOL(lustre_swab_lov_user_md_objects); + +void lustre_swab_ldlm_res_id(struct ldlm_res_id *id) +{ + int i; + + for (i = 0; i < RES_NAME_SIZE; i++) + __swab64s(&id->name[i]); +} +EXPORT_SYMBOL(lustre_swab_ldlm_res_id); + +void lustre_swab_ldlm_policy_data(ldlm_wire_policy_data_t *d) +{ + /* the lock data is a union and the first two fields are always an + * extent so it's ok to process an LDLM_EXTENT and LDLM_FLOCK lock + * data the same way. */ + __swab64s(&d->l_extent.start); + __swab64s(&d->l_extent.end); + __swab64s(&d->l_extent.gid); + __swab64s(&d->l_flock.lfw_owner); + __swab32s(&d->l_flock.lfw_pid); +} +EXPORT_SYMBOL(lustre_swab_ldlm_policy_data); + +void lustre_swab_ldlm_intent(struct ldlm_intent *i) +{ + __swab64s(&i->opc); +} +EXPORT_SYMBOL(lustre_swab_ldlm_intent); + +void lustre_swab_ldlm_resource_desc(struct ldlm_resource_desc *r) +{ + __swab32s(&r->lr_type); + CLASSERT(offsetof(typeof(*r), lr_padding) != 0); + lustre_swab_ldlm_res_id(&r->lr_name); +} +EXPORT_SYMBOL(lustre_swab_ldlm_resource_desc); + +void lustre_swab_ldlm_lock_desc(struct ldlm_lock_desc *l) +{ + lustre_swab_ldlm_resource_desc(&l->l_resource); + __swab32s(&l->l_req_mode); + __swab32s(&l->l_granted_mode); + lustre_swab_ldlm_policy_data(&l->l_policy_data); +} +EXPORT_SYMBOL(lustre_swab_ldlm_lock_desc); + +void lustre_swab_ldlm_request(struct ldlm_request *rq) +{ + __swab32s(&rq->lock_flags); + lustre_swab_ldlm_lock_desc(&rq->lock_desc); + __swab32s(&rq->lock_count); + /* lock_handle[] opaque */ +} +EXPORT_SYMBOL(lustre_swab_ldlm_request); + +void lustre_swab_ldlm_reply(struct ldlm_reply *r) +{ + __swab32s(&r->lock_flags); + CLASSERT(offsetof(typeof(*r), lock_padding) != 0); + lustre_swab_ldlm_lock_desc(&r->lock_desc); + /* lock_handle opaque */ + __swab64s(&r->lock_policy_res1); + __swab64s(&r->lock_policy_res2); +} +EXPORT_SYMBOL(lustre_swab_ldlm_reply); + +void lustre_swab_quota_body(struct quota_body *b) +{ + lustre_swab_lu_fid(&b->qb_fid); + lustre_swab_lu_fid((struct lu_fid *)&b->qb_id); + __swab32s(&b->qb_flags); + __swab64s(&b->qb_count); + __swab64s(&b->qb_usage); + __swab64s(&b->qb_slv_ver); +} + +/* Dump functions */ +void dump_ioo(struct obd_ioobj *ioo) +{ + CDEBUG(D_RPCTRACE, + "obd_ioobj: ioo_oid=" DOSTID ", ioo_max_brw=%#x, ioo_bufct=%d\n", + POSTID(&ioo->ioo_oid), ioo->ioo_max_brw, + ioo->ioo_bufcnt); +} +EXPORT_SYMBOL(dump_ioo); + +void dump_rniobuf(struct niobuf_remote *nb) +{ + CDEBUG(D_RPCTRACE, "niobuf_remote: offset=%llu, len=%d, flags=%x\n", + nb->offset, nb->len, nb->flags); +} +EXPORT_SYMBOL(dump_rniobuf); + +void dump_obdo(struct obdo *oa) +{ + __u32 valid = oa->o_valid; + + CDEBUG(D_RPCTRACE, "obdo: o_valid = %08x\n", valid); + if (valid & OBD_MD_FLID) + CDEBUG(D_RPCTRACE, "obdo: id = "DOSTID"\n", POSTID(&oa->o_oi)); + if (valid & OBD_MD_FLFID) + CDEBUG(D_RPCTRACE, "obdo: o_parent_seq = %#llx\n", + oa->o_parent_seq); + if (valid & OBD_MD_FLSIZE) + CDEBUG(D_RPCTRACE, "obdo: o_size = 
%lld\n", oa->o_size); + if (valid & OBD_MD_FLMTIME) + CDEBUG(D_RPCTRACE, "obdo: o_mtime = %lld\n", oa->o_mtime); + if (valid & OBD_MD_FLATIME) + CDEBUG(D_RPCTRACE, "obdo: o_atime = %lld\n", oa->o_atime); + if (valid & OBD_MD_FLCTIME) + CDEBUG(D_RPCTRACE, "obdo: o_ctime = %lld\n", oa->o_ctime); + if (valid & OBD_MD_FLBLOCKS) /* allocation of space */ + CDEBUG(D_RPCTRACE, "obdo: o_blocks = %lld\n", oa->o_blocks); + if (valid & OBD_MD_FLGRANT) + CDEBUG(D_RPCTRACE, "obdo: o_grant = %lld\n", oa->o_grant); + if (valid & OBD_MD_FLBLKSZ) + CDEBUG(D_RPCTRACE, "obdo: o_blksize = %d\n", oa->o_blksize); + if (valid & (OBD_MD_FLTYPE | OBD_MD_FLMODE)) + CDEBUG(D_RPCTRACE, "obdo: o_mode = %o\n", + oa->o_mode & ((valid & OBD_MD_FLTYPE ? S_IFMT : 0) | + (valid & OBD_MD_FLMODE ? ~S_IFMT : 0))); + if (valid & OBD_MD_FLUID) + CDEBUG(D_RPCTRACE, "obdo: o_uid = %u\n", oa->o_uid); + if (valid & OBD_MD_FLUID) + CDEBUG(D_RPCTRACE, "obdo: o_uid_h = %u\n", oa->o_uid_h); + if (valid & OBD_MD_FLGID) + CDEBUG(D_RPCTRACE, "obdo: o_gid = %u\n", oa->o_gid); + if (valid & OBD_MD_FLGID) + CDEBUG(D_RPCTRACE, "obdo: o_gid_h = %u\n", oa->o_gid_h); + if (valid & OBD_MD_FLFLAGS) + CDEBUG(D_RPCTRACE, "obdo: o_flags = %x\n", oa->o_flags); + if (valid & OBD_MD_FLNLINK) + CDEBUG(D_RPCTRACE, "obdo: o_nlink = %u\n", oa->o_nlink); + else if (valid & OBD_MD_FLCKSUM) + CDEBUG(D_RPCTRACE, "obdo: o_checksum (o_nlink) = %u\n", + oa->o_nlink); + if (valid & OBD_MD_FLGENER) + CDEBUG(D_RPCTRACE, "obdo: o_parent_oid = %x\n", + oa->o_parent_oid); + if (valid & OBD_MD_FLEPOCH) + CDEBUG(D_RPCTRACE, "obdo: o_ioepoch = %lld\n", + oa->o_ioepoch); + if (valid & OBD_MD_FLFID) { + CDEBUG(D_RPCTRACE, "obdo: o_stripe_idx = %u\n", + oa->o_stripe_idx); + CDEBUG(D_RPCTRACE, "obdo: o_parent_ver = %x\n", + oa->o_parent_ver); + } + if (valid & OBD_MD_FLHANDLE) + CDEBUG(D_RPCTRACE, "obdo: o_handle = %lld\n", + oa->o_handle.cookie); + if (valid & OBD_MD_FLCOOKIE) + CDEBUG(D_RPCTRACE, "obdo: o_lcookie = (llog_cookie dumping not yet implemented)\n"); +} +EXPORT_SYMBOL(dump_obdo); + +void dump_ost_body(struct ost_body *ob) +{ + dump_obdo(&ob->oa); +} +EXPORT_SYMBOL(dump_ost_body); + +void dump_rcs(__u32 *rc) +{ + CDEBUG(D_RPCTRACE, "rmf_rcs: %d\n", *rc); +} +EXPORT_SYMBOL(dump_rcs); + +static inline int req_ptlrpc_body_swabbed(struct ptlrpc_request *req) +{ + LASSERT(req->rq_reqmsg); + + switch (req->rq_reqmsg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_req_swabbed(req, MSG_PTLRPC_BODY_OFF); + default: + CERROR("bad lustre msg magic: %#08X\n", + req->rq_reqmsg->lm_magic); + } + return 0; +} + +static inline int rep_ptlrpc_body_swabbed(struct ptlrpc_request *req) +{ + LASSERT(req->rq_repmsg); + + switch (req->rq_repmsg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_rep_swabbed(req, MSG_PTLRPC_BODY_OFF); + default: + /* uninitialized yet */ + return 0; + } +} + +void _debug_req(struct ptlrpc_request *req, + struct libcfs_debug_msg_data *msgdata, + const char *fmt, ...) 
+{ + int req_ok = req->rq_reqmsg != NULL; + int rep_ok = req->rq_repmsg != NULL; + lnet_nid_t nid = LNET_NID_ANY; + va_list args; + + if (ptlrpc_req_need_swab(req)) { + req_ok = req_ok && req_ptlrpc_body_swabbed(req); + rep_ok = rep_ok && rep_ptlrpc_body_swabbed(req); + } + + if (req->rq_import && req->rq_import->imp_connection) + nid = req->rq_import->imp_connection->c_peer.nid; + else if (req->rq_export && req->rq_export->exp_connection) + nid = req->rq_export->exp_connection->c_peer.nid; + + va_start(args, fmt); + libcfs_debug_vmsg2(msgdata, fmt, args, + " req@%p x%llu/t%lld(%lld) o%d->%s@%s:%d/%d lens %d/%d e %d to %d dl " CFS_TIME_T " ref %d fl " REQ_FLAGS_FMT "/%x/%x rc %d/%d\n", + req, req->rq_xid, req->rq_transno, + req_ok ? lustre_msg_get_transno(req->rq_reqmsg) : 0, + req_ok ? lustre_msg_get_opc(req->rq_reqmsg) : -1, + req->rq_import ? + req->rq_import->imp_obd->obd_name : + req->rq_export ? + req->rq_export->exp_client_uuid.uuid : + "", + libcfs_nid2str(nid), + req->rq_request_portal, req->rq_reply_portal, + req->rq_reqlen, req->rq_replen, + req->rq_early_count, req->rq_timedout, + req->rq_deadline, + atomic_read(&req->rq_refcount), + DEBUG_REQ_FLAGS(req), + req_ok ? lustre_msg_get_flags(req->rq_reqmsg) : -1, + rep_ok ? lustre_msg_get_flags(req->rq_repmsg) : -1, + req->rq_status, + rep_ok ? lustre_msg_get_status(req->rq_repmsg) : -1); + va_end(args); +} +EXPORT_SYMBOL(_debug_req); + +void lustre_swab_lustre_capa(struct lustre_capa *c) +{ + lustre_swab_lu_fid(&c->lc_fid); + __swab64s(&c->lc_opc); + __swab64s(&c->lc_uid); + __swab64s(&c->lc_gid); + __swab32s(&c->lc_flags); + __swab32s(&c->lc_keyid); + __swab32s(&c->lc_timeout); + __swab32s(&c->lc_expiry); +} +EXPORT_SYMBOL(lustre_swab_lustre_capa); + +void lustre_swab_lustre_capa_key(struct lustre_capa_key *k) +{ + __swab64s(&k->lk_seq); + __swab32s(&k->lk_keyid); + CLASSERT(offsetof(typeof(*k), lk_padding) != 0); +} +EXPORT_SYMBOL(lustre_swab_lustre_capa_key); + +void lustre_swab_hsm_user_state(struct hsm_user_state *state) +{ + __swab32s(&state->hus_states); + __swab32s(&state->hus_archive_id); +} +EXPORT_SYMBOL(lustre_swab_hsm_user_state); + +void lustre_swab_hsm_state_set(struct hsm_state_set *hss) +{ + __swab32s(&hss->hss_valid); + __swab64s(&hss->hss_setmask); + __swab64s(&hss->hss_clearmask); + __swab32s(&hss->hss_archive_id); +} +EXPORT_SYMBOL(lustre_swab_hsm_state_set); + +void lustre_swab_hsm_extent(struct hsm_extent *extent) +{ + __swab64s(&extent->offset); + __swab64s(&extent->length); +} + +void lustre_swab_hsm_current_action(struct hsm_current_action *action) +{ + __swab32s(&action->hca_state); + __swab32s(&action->hca_action); + lustre_swab_hsm_extent(&action->hca_location); +} +EXPORT_SYMBOL(lustre_swab_hsm_current_action); + +void lustre_swab_hsm_user_item(struct hsm_user_item *hui) +{ + lustre_swab_lu_fid(&hui->hui_fid); + lustre_swab_hsm_extent(&hui->hui_extent); +} +EXPORT_SYMBOL(lustre_swab_hsm_user_item); + +void lustre_swab_layout_intent(struct layout_intent *li) +{ + __swab32s(&li->li_opc); + __swab32s(&li->li_flags); + __swab64s(&li->li_start); + __swab64s(&li->li_end); +} +EXPORT_SYMBOL(lustre_swab_layout_intent); + +void lustre_swab_hsm_progress_kernel(struct hsm_progress_kernel *hpk) +{ + lustre_swab_lu_fid(&hpk->hpk_fid); + __swab64s(&hpk->hpk_cookie); + __swab64s(&hpk->hpk_extent.offset); + __swab64s(&hpk->hpk_extent.length); + __swab16s(&hpk->hpk_flags); + __swab16s(&hpk->hpk_errval); +} +EXPORT_SYMBOL(lustre_swab_hsm_progress_kernel); + +void lustre_swab_hsm_request(struct hsm_request *hr) +{ + 
__swab32s(&hr->hr_action); + __swab32s(&hr->hr_archive_id); + __swab64s(&hr->hr_flags); + __swab32s(&hr->hr_itemcount); + __swab32s(&hr->hr_data_len); +} +EXPORT_SYMBOL(lustre_swab_hsm_request); + +void lustre_swab_update_buf(struct update_buf *ub) +{ + __swab32s(&ub->ub_magic); + __swab32s(&ub->ub_count); +} +EXPORT_SYMBOL(lustre_swab_update_buf); + +void lustre_swab_update_reply_buf(struct update_reply *ur) +{ + int i; + + __swab32s(&ur->ur_version); + __swab32s(&ur->ur_count); + for (i = 0; i < ur->ur_count; i++) + __swab32s(&ur->ur_lens[i]); +} +EXPORT_SYMBOL(lustre_swab_update_reply_buf); + +void lustre_swab_swap_layouts(struct mdc_swap_layouts *msl) +{ + __swab64s(&msl->msl_flags); +} +EXPORT_SYMBOL(lustre_swab_swap_layouts); + +void lustre_swab_close_data(struct close_data *cd) +{ + lustre_swab_lu_fid(&cd->cd_fid); + __swab64s(&cd->cd_data_version); +} +EXPORT_SYMBOL(lustre_swab_close_data); diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/pers.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/pers.c new file mode 100644 index 000000000..e1334c24e --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/pers.c @@ -0,0 +1,75 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
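The swab helpers above all follow the same in-place byte-reversal pattern for wire structures: every multi-byte integer field is reversed, while opaque byte arrays (UUIDs, pool names) are left untouched. A minimal standalone sketch of that pattern, with hypothetical names (swab32, struct wire_rec) standing in for the kernel's __swab32s and the Lustre wire structs:

#include <stdio.h>
#include <stdint.h>

/* Illustrative sketch only: in-place byte swapping of a wire structure
 * received from a peer of the opposite endianness. swab32 and
 * struct wire_rec are hypothetical stand-ins, not part of the patch. */
static void swab32(uint32_t *v)
{
	*v = ((*v & 0x000000ffu) << 24) | ((*v & 0x0000ff00u) << 8) |
	     ((*v & 0x00ff0000u) >> 8)  | ((*v & 0xff000000u) >> 24);
}

struct wire_rec {
	uint32_t r_magic;
	uint32_t r_flags;
	char     r_name[16];	/* endian-insensitive: not swabbed */
};

static void swab_wire_rec(struct wire_rec *r)
{
	swab32(&r->r_magic);
	swab32(&r->r_flags);
}

int main(void)
{
	struct wire_rec r = { 0x12345678u, 0x1u, "demo" };

	swab_wire_rec(&r);
	printf("magic %#x flags %#x\n", (unsigned)r.r_magic,
	       (unsigned)r.r_flags);	/* magic 0x78563412 flags 0x1000000 */
	return 0;
}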
+ */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include "../include/obd_support.h" +#include "../include/obd_class.h" +#include "../include/lustre_lib.h" +#include "../include/lustre_ha.h" +#include "../include/lustre_import.h" + +#include "ptlrpc_internal.h" + + +void ptlrpc_fill_bulk_md(lnet_md_t *md, struct ptlrpc_bulk_desc *desc, + int mdidx) +{ + CLASSERT(PTLRPC_MAX_BRW_PAGES < LI_POISON); + + LASSERT(mdidx < desc->bd_md_max_brw); + LASSERT(desc->bd_iov_count <= PTLRPC_MAX_BRW_PAGES); + LASSERT(!(md->options & (LNET_MD_IOVEC | LNET_MD_KIOV | + LNET_MD_PHYS))); + + md->options |= LNET_MD_KIOV; + md->length = max(0, desc->bd_iov_count - mdidx * LNET_MAX_IOV); + md->length = min_t(unsigned int, LNET_MAX_IOV, md->length); + if (desc->bd_enc_iov) + md->start = &desc->bd_enc_iov[mdidx * LNET_MAX_IOV]; + else + md->start = &desc->bd_iov[mdidx * LNET_MAX_IOV]; +} + +void ptlrpc_add_bulk_page(struct ptlrpc_bulk_desc *desc, struct page *page, + int pageoffset, int len) +{ + lnet_kiov_t *kiov = &desc->bd_iov[desc->bd_iov_count]; + + kiov->kiov_page = page; + kiov->kiov_offset = pageoffset; + kiov->kiov_len = len; + + desc->bd_iov_count++; +} diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/pinger.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/pinger.c new file mode 100644 index 000000000..9dbda9332 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/pinger.c @@ -0,0 +1,678 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/pinger.c + * + * Portal-RPC reconnection and replay operations, for use in recovery. 
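ptlrpc_fill_bulk_md above clamps each memory descriptor to at most LNET_MAX_IOV pages of the bulk descriptor: the remaining page count past mdidx * LNET_MAX_IOV, never negative and never above the per-MD limit. A minimal sketch of that chunking arithmetic, assuming a hypothetical CHUNK_PAGES limit in place of LNET_MAX_IOV:

#include <stdio.h>

/* Illustrative sketch only: splitting a page count across fixed-size
 * chunks. CHUNK_PAGES and chunk_len are hypothetical names. */
#define CHUNK_PAGES 256		/* assumed per-chunk page limit */

static int chunk_len(int total_pages, int chunk_idx)
{
	int remaining = total_pages - chunk_idx * CHUNK_PAGES;

	if (remaining < 0)
		remaining = 0;	/* past the end: empty chunk */
	return remaining < CHUNK_PAGES ? remaining : CHUNK_PAGES;
}

int main(void)
{
	/* 600 pages split into chunks of 256 -> 256, 256, 88, 0 */
	for (int i = 0; i < 4; i++)
		printf("chunk %d: %d pages\n", i, chunk_len(600, i));
	return 0;
}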
+ */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include "../include/obd_support.h" +#include "../include/obd_class.h" +#include "ptlrpc_internal.h" + +static int suppress_pings; +module_param(suppress_pings, int, 0644); +MODULE_PARM_DESC(suppress_pings, "Suppress pings"); + +struct mutex pinger_mutex; +static LIST_HEAD(pinger_imports); +static struct list_head timeout_list = LIST_HEAD_INIT(timeout_list); + +int ptlrpc_pinger_suppress_pings(void) +{ + return suppress_pings; +} +EXPORT_SYMBOL(ptlrpc_pinger_suppress_pings); + +struct ptlrpc_request * +ptlrpc_prep_ping(struct obd_import *imp) +{ + struct ptlrpc_request *req; + + req = ptlrpc_request_alloc_pack(imp, &RQF_OBD_PING, + LUSTRE_OBD_VERSION, OBD_PING); + if (req) { + ptlrpc_request_set_replen(req); + req->rq_no_resend = req->rq_no_delay = 1; + } + return req; +} + +int ptlrpc_obd_ping(struct obd_device *obd) +{ + int rc; + struct ptlrpc_request *req; + + req = ptlrpc_prep_ping(obd->u.cli.cl_import); + if (req == NULL) + return -ENOMEM; + + req->rq_send_state = LUSTRE_IMP_FULL; + + rc = ptlrpc_queue_wait(req); + + ptlrpc_req_finished(req); + + return rc; +} +EXPORT_SYMBOL(ptlrpc_obd_ping); + +int ptlrpc_ping(struct obd_import *imp) +{ + struct ptlrpc_request *req; + + req = ptlrpc_prep_ping(imp); + if (req == NULL) { + CERROR("OOM trying to ping %s->%s\n", + imp->imp_obd->obd_uuid.uuid, + obd2cli_tgt(imp->imp_obd)); + return -ENOMEM; + } + + DEBUG_REQ(D_INFO, req, "pinging %s->%s", + imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd)); + ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1); + + return 0; +} + +void ptlrpc_update_next_ping(struct obd_import *imp, int soon) +{ + int time = soon ? PING_INTERVAL_SHORT : PING_INTERVAL; + if (imp->imp_state == LUSTRE_IMP_DISCON) { + int dtime = max_t(int, CONNECTION_SWITCH_MIN, + AT_OFF ? 0 : + at_get(&imp->imp_at.iat_net_latency)); + time = min(time, dtime); + } + imp->imp_next_ping = cfs_time_shift(time); +} + +void ptlrpc_ping_import_soon(struct obd_import *imp) +{ + imp->imp_next_ping = cfs_time_current(); +} + +static inline int imp_is_deactive(struct obd_import *imp) +{ + return (imp->imp_deactive || + OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_IMP_DEACTIVE)); +} + +static inline int ptlrpc_next_reconnect(struct obd_import *imp) +{ + if (imp->imp_server_timeout) + return cfs_time_shift(obd_timeout / 2); + else + return cfs_time_shift(obd_timeout); +} + +long pinger_check_timeout(unsigned long time) +{ + struct timeout_item *item; + unsigned long timeout = PING_INTERVAL; + + /* The timeout list is a increase order sorted list */ + mutex_lock(&pinger_mutex); + list_for_each_entry(item, &timeout_list, ti_chain) { + int ti_timeout = item->ti_timeout; + if (timeout > ti_timeout) + timeout = ti_timeout; + break; + } + mutex_unlock(&pinger_mutex); + + return cfs_time_sub(cfs_time_add(time, cfs_time_seconds(timeout)), + cfs_time_current()); +} + +static bool ir_up; + +void ptlrpc_pinger_ir_up(void) +{ + CDEBUG(D_HA, "IR up\n"); + ir_up = true; +} +EXPORT_SYMBOL(ptlrpc_pinger_ir_up); + +void ptlrpc_pinger_ir_down(void) +{ + CDEBUG(D_HA, "IR down\n"); + ir_up = false; +} +EXPORT_SYMBOL(ptlrpc_pinger_ir_down); + +static void ptlrpc_pinger_process_import(struct obd_import *imp, + unsigned long this_ping) +{ + int level; + int force; + int force_next; + int suppress; + + spin_lock(&imp->imp_lock); + + level = imp->imp_state; + force = imp->imp_force_verify; + force_next = imp->imp_force_next_verify; + /* + * This will be used below only if the import is "FULL". 
+ */ + suppress = ir_up && OCD_HAS_FLAG(&imp->imp_connect_data, PINGLESS); + + imp->imp_force_verify = 0; + + if (cfs_time_aftereq(imp->imp_next_ping - 5 * CFS_TICK, this_ping) && + !force) { + spin_unlock(&imp->imp_lock); + return; + } + + imp->imp_force_next_verify = 0; + + spin_unlock(&imp->imp_lock); + + CDEBUG(level == LUSTRE_IMP_FULL ? D_INFO : D_HA, "%s->%s: level %s/%u force %u force_next %u deactive %u pingable %u suppress %u\n", + imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd), + ptlrpc_import_state_name(level), level, force, force_next, + imp->imp_deactive, imp->imp_pingable, suppress); + + if (level == LUSTRE_IMP_DISCON && !imp_is_deactive(imp)) { + /* wait for a while before trying recovery again */ + imp->imp_next_ping = ptlrpc_next_reconnect(imp); + if (!imp->imp_no_pinger_recover) + ptlrpc_initiate_recovery(imp); + } else if (level != LUSTRE_IMP_FULL || + imp->imp_obd->obd_no_recov || + imp_is_deactive(imp)) { + CDEBUG(D_HA, "%s->%s: not pinging (in recovery or recovery disabled: %s)\n", + imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd), + ptlrpc_import_state_name(level)); + if (force) { + spin_lock(&imp->imp_lock); + imp->imp_force_verify = 1; + spin_unlock(&imp->imp_lock); + } + } else if ((imp->imp_pingable && !suppress) || force_next || force) { + ptlrpc_ping(imp); + } +} + +static int ptlrpc_pinger_main(void *arg) +{ + struct ptlrpc_thread *thread = (struct ptlrpc_thread *)arg; + + /* Record that the thread is running */ + thread_set_flags(thread, SVC_RUNNING); + wake_up(&thread->t_ctl_waitq); + + /* And now, loop forever, pinging as needed. */ + while (1) { + unsigned long this_ping = cfs_time_current(); + struct l_wait_info lwi; + long time_to_next_wake; + struct timeout_item *item; + struct list_head *iter; + + mutex_lock(&pinger_mutex); + list_for_each_entry(item, &timeout_list, ti_chain) { + item->ti_cb(item, item->ti_cb_data); + } + list_for_each(iter, &pinger_imports) { + struct obd_import *imp = + list_entry(iter, struct obd_import, + imp_pinger_chain); + + ptlrpc_pinger_process_import(imp, this_ping); + /* obd_timeout might have changed */ + if (imp->imp_pingable && imp->imp_next_ping && + cfs_time_after(imp->imp_next_ping, + cfs_time_add(this_ping, + cfs_time_seconds(PING_INTERVAL)))) + ptlrpc_update_next_ping(imp, 0); + } + mutex_unlock(&pinger_mutex); + /* update memory usage info */ + obd_update_maxusage(); + + /* Wait until the next ping time, or until we're stopped. */ + time_to_next_wake = pinger_check_timeout(this_ping); + /* The ping sent by ptlrpc_send_rpc may get sent out + say .01 second after this. + ptlrpc_pinger_sending_on_import will then set the + next ping time to next_ping + .01 sec, which means + we will SKIP the next ping at next_ping, and the + ping will get sent 2 timeouts from now! Beware. 
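pinger_check_timeout above derives the sleep time from the start of the current ping cycle and the smallest registered timeout, and the main loop clamps the wait to at least one second so the thread keeps making progress. A minimal sketch of that computation, using hypothetical names and plain time_t instead of the cfs_time_* wrappers:

#include <stdio.h>
#include <time.h>

/* Illustrative sketch only: sleep until the earliest deadline, i.e.
 * (cycle_start + smallest timeout) - now, clamped to a 1s minimum.
 * All names here are hypothetical. */
static long seconds_to_next_wake(time_t cycle_start, long min_timeout,
				 time_t now)
{
	long wait = (long)(cycle_start + min_timeout - now);

	return wait > 1 ? wait : 1;	/* clamp to a 1s minimum */
}

int main(void)
{
	time_t start = time(NULL);

	/* e.g. a 25s ping interval, checked 3s into the cycle -> 22s */
	printf("sleep %lds\n", seconds_to_next_wake(start, 25, start + 3));
	return 0;
}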
*/ + CDEBUG(D_INFO, "next wakeup in "CFS_DURATION_T" (" + CFS_TIME_T")\n", time_to_next_wake, + cfs_time_add(this_ping, + cfs_time_seconds(PING_INTERVAL))); + if (time_to_next_wake > 0) { + lwi = LWI_TIMEOUT(max_t(long, time_to_next_wake, + cfs_time_seconds(1)), + NULL, NULL); + l_wait_event(thread->t_ctl_waitq, + thread_is_stopping(thread) || + thread_is_event(thread), + &lwi); + if (thread_test_and_clear_flags(thread, SVC_STOPPING)) { + break; + } else { + /* woken after adding import to reset timer */ + thread_test_and_clear_flags(thread, SVC_EVENT); + } + } + } + + thread_set_flags(thread, SVC_STOPPED); + wake_up(&thread->t_ctl_waitq); + + CDEBUG(D_NET, "pinger thread exiting, process %d\n", current_pid()); + return 0; +} + +static struct ptlrpc_thread pinger_thread; + +int ptlrpc_start_pinger(void) +{ + struct l_wait_info lwi = { 0 }; + int rc; + + if (!thread_is_init(&pinger_thread) && + !thread_is_stopped(&pinger_thread)) + return -EALREADY; + + init_waitqueue_head(&pinger_thread.t_ctl_waitq); + + strcpy(pinger_thread.t_name, "ll_ping"); + + /* CLONE_VM and CLONE_FILES just avoid a needless copy, because we + * just drop the VM and FILES in cfs_daemonize_ctxt() right away. */ + rc = PTR_ERR(kthread_run(ptlrpc_pinger_main, &pinger_thread, + "%s", pinger_thread.t_name)); + if (IS_ERR_VALUE(rc)) { + CERROR("cannot start thread: %d\n", rc); + return rc; + } + l_wait_event(pinger_thread.t_ctl_waitq, + thread_is_running(&pinger_thread), &lwi); + + if (suppress_pings) + CWARN("Pings will be suppressed at the request of the administrator. The configuration shall meet the additional requirements described in the manual. (Search for the \"suppress_pings\" kernel module parameter.)\n"); + + return 0; +} + +int ptlrpc_pinger_remove_timeouts(void); + +int ptlrpc_stop_pinger(void) +{ + struct l_wait_info lwi = { 0 }; + int rc = 0; + + if (thread_is_init(&pinger_thread) || + thread_is_stopped(&pinger_thread)) + return -EALREADY; + + ptlrpc_pinger_remove_timeouts(); + thread_set_flags(&pinger_thread, SVC_STOPPING); + wake_up(&pinger_thread.t_ctl_waitq); + + l_wait_event(pinger_thread.t_ctl_waitq, + thread_is_stopped(&pinger_thread), &lwi); + + return rc; +} + +void ptlrpc_pinger_sending_on_import(struct obd_import *imp) +{ + ptlrpc_update_next_ping(imp, 0); +} +EXPORT_SYMBOL(ptlrpc_pinger_sending_on_import); + +void ptlrpc_pinger_commit_expected(struct obd_import *imp) +{ + ptlrpc_update_next_ping(imp, 1); + assert_spin_locked(&imp->imp_lock); + /* + * Avoid reading stale imp_connect_data. When not sure if pings are + * expected or not on next connection, we assume they are not and force + * one anyway to guarantee the chance of updating + * imp_peer_committed_transno. 
+ */ + if (imp->imp_state != LUSTRE_IMP_FULL || + OCD_HAS_FLAG(&imp->imp_connect_data, PINGLESS)) + imp->imp_force_next_verify = 1; +} + +int ptlrpc_pinger_add_import(struct obd_import *imp) +{ + if (!list_empty(&imp->imp_pinger_chain)) + return -EALREADY; + + mutex_lock(&pinger_mutex); + CDEBUG(D_HA, "adding pingable import %s->%s\n", + imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd)); + /* if we add to pinger we want recovery on this import */ + imp->imp_obd->obd_no_recov = 0; + ptlrpc_update_next_ping(imp, 0); + /* XXX sort, blah blah */ + list_add_tail(&imp->imp_pinger_chain, &pinger_imports); + class_import_get(imp); + + ptlrpc_pinger_wake_up(); + mutex_unlock(&pinger_mutex); + + return 0; +} +EXPORT_SYMBOL(ptlrpc_pinger_add_import); + +int ptlrpc_pinger_del_import(struct obd_import *imp) +{ + if (list_empty(&imp->imp_pinger_chain)) + return -ENOENT; + + mutex_lock(&pinger_mutex); + list_del_init(&imp->imp_pinger_chain); + CDEBUG(D_HA, "removing pingable import %s->%s\n", + imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd)); + /* if we remove from pinger we don't want recovery on this import */ + imp->imp_obd->obd_no_recov = 1; + class_import_put(imp); + mutex_unlock(&pinger_mutex); + return 0; +} +EXPORT_SYMBOL(ptlrpc_pinger_del_import); + +/** + * Register a timeout callback to the pinger list, and the callback will + * be called when timeout happens. + */ +struct timeout_item *ptlrpc_new_timeout(int time, enum timeout_event event, + timeout_cb_t cb, void *data) +{ + struct timeout_item *ti; + + OBD_ALLOC_PTR(ti); + if (!ti) + return NULL; + + INIT_LIST_HEAD(&ti->ti_obd_list); + INIT_LIST_HEAD(&ti->ti_chain); + ti->ti_timeout = time; + ti->ti_event = event; + ti->ti_cb = cb; + ti->ti_cb_data = data; + + return ti; +} + +/** + * Register timeout event on the pinger thread. + * Note: the timeout list is an sorted list with increased timeout value. + */ +static struct timeout_item* +ptlrpc_pinger_register_timeout(int time, enum timeout_event event, + timeout_cb_t cb, void *data) +{ + struct timeout_item *item, *tmp; + + LASSERT(mutex_is_locked(&pinger_mutex)); + + list_for_each_entry(item, &timeout_list, ti_chain) + if (item->ti_event == event) + goto out; + + item = ptlrpc_new_timeout(time, event, cb, data); + if (item) { + list_for_each_entry_reverse(tmp, &timeout_list, ti_chain) { + if (tmp->ti_timeout < time) { + list_add(&item->ti_chain, &tmp->ti_chain); + goto out; + } + } + list_add(&item->ti_chain, &timeout_list); + } +out: + return item; +} + +/* Add a client_obd to the timeout event list, when timeout(@time) + * happens, the callback(@cb) will be called. 
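ptlrpc_pinger_register_timeout above keeps timeout_list sorted by increasing timeout, so the earliest item is always at the head when pinger_check_timeout scans it. A minimal sketch of that sorted-insert invariant on a plain singly linked list, with hypothetical names rather than the kernel list_head API:

#include <stdio.h>

/* Illustrative sketch only: insert while preserving ascending order. */
struct timeout_node {
	int timeout;
	struct timeout_node *next;
};

static void insert_sorted(struct timeout_node **head, struct timeout_node *n)
{
	struct timeout_node **p = head;

	/* walk until the next node's timeout is not smaller */
	while (*p && (*p)->timeout < n->timeout)
		p = &(*p)->next;
	n->next = *p;
	*p = n;
}

int main(void)
{
	struct timeout_node a = { 30, NULL }, b = { 10, NULL }, c = { 20, NULL };
	struct timeout_node *head = NULL;

	insert_sorted(&head, &a);
	insert_sorted(&head, &b);
	insert_sorted(&head, &c);
	for (struct timeout_node *n = head; n; n = n->next)
		printf("%d ", n->timeout);	/* prints: 10 20 30 */
	printf("\n");
	return 0;
}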
+ */ +int ptlrpc_add_timeout_client(int time, enum timeout_event event, + timeout_cb_t cb, void *data, + struct list_head *obd_list) +{ + struct timeout_item *ti; + + mutex_lock(&pinger_mutex); + ti = ptlrpc_pinger_register_timeout(time, event, cb, data); + if (!ti) { + mutex_unlock(&pinger_mutex); + return -EINVAL; + } + list_add(obd_list, &ti->ti_obd_list); + mutex_unlock(&pinger_mutex); + return 0; +} +EXPORT_SYMBOL(ptlrpc_add_timeout_client); + +int ptlrpc_del_timeout_client(struct list_head *obd_list, + enum timeout_event event) +{ + struct timeout_item *ti = NULL, *item; + + if (list_empty(obd_list)) + return 0; + mutex_lock(&pinger_mutex); + list_del_init(obd_list); + /** + * If there are no obd attached to the timeout event + * list, remove this timeout event from the pinger + */ + list_for_each_entry(item, &timeout_list, ti_chain) { + if (item->ti_event == event) { + ti = item; + break; + } + } + LASSERTF(ti != NULL, "ti is NULL !\n"); + if (list_empty(&ti->ti_obd_list)) { + list_del(&ti->ti_chain); + OBD_FREE_PTR(ti); + } + mutex_unlock(&pinger_mutex); + return 0; +} +EXPORT_SYMBOL(ptlrpc_del_timeout_client); + +int ptlrpc_pinger_remove_timeouts(void) +{ + struct timeout_item *item, *tmp; + + mutex_lock(&pinger_mutex); + list_for_each_entry_safe(item, tmp, &timeout_list, ti_chain) { + LASSERT(list_empty(&item->ti_obd_list)); + list_del(&item->ti_chain); + OBD_FREE_PTR(item); + } + mutex_unlock(&pinger_mutex); + return 0; +} + +void ptlrpc_pinger_wake_up(void) +{ + thread_add_flags(&pinger_thread, SVC_EVENT); + wake_up(&pinger_thread.t_ctl_waitq); +} + +/* Ping evictor thread */ +#define PET_READY 1 +#define PET_TERMINATE 2 + +static int pet_refcount; +static int pet_state; +static wait_queue_head_t pet_waitq; +LIST_HEAD(pet_list); +static DEFINE_SPINLOCK(pet_lock); + +int ping_evictor_wake(struct obd_export *exp) +{ + struct obd_device *obd; + + spin_lock(&pet_lock); + if (pet_state != PET_READY) { + /* eventually the new obd will call here again. */ + spin_unlock(&pet_lock); + return 1; + } + + obd = class_exp2obd(exp); + if (list_empty(&obd->obd_evict_list)) { + class_incref(obd, "evictor", obd); + list_add(&obd->obd_evict_list, &pet_list); + } + spin_unlock(&pet_lock); + + wake_up(&pet_waitq); + return 0; +} + +static int ping_evictor_main(void *arg) +{ + struct obd_device *obd; + struct obd_export *exp; + struct l_wait_info lwi = { 0 }; + time_t expire_time; + + unshare_fs_struct(); + + CDEBUG(D_HA, "Starting Ping Evictor\n"); + pet_state = PET_READY; + while (1) { + l_wait_event(pet_waitq, (!list_empty(&pet_list)) || + (pet_state == PET_TERMINATE), &lwi); + + /* loop until all obd's will be removed */ + if ((pet_state == PET_TERMINATE) && list_empty(&pet_list)) + break; + + /* we only get here if pet_exp != NULL, and the end of this + * loop is the only place which sets it NULL again, so lock + * is not strictly necessary. */ + spin_lock(&pet_lock); + obd = list_entry(pet_list.next, struct obd_device, + obd_evict_list); + spin_unlock(&pet_lock); + + expire_time = get_seconds() - PING_EVICT_TIMEOUT; + + CDEBUG(D_HA, "evicting all exports of obd %s older than %ld\n", + obd->obd_name, expire_time); + + /* Exports can't be deleted out of the list while we hold + * the obd lock (class_unlink_export), which means we can't + * lose the last ref on the export. If they've already been + * removed from the list, we won't find them here. 
*/ + spin_lock(&obd->obd_dev_lock); + while (!list_empty(&obd->obd_exports_timed)) { + exp = list_entry(obd->obd_exports_timed.next, + struct obd_export, + exp_obd_chain_timed); + if (expire_time > exp->exp_last_request_time) { + class_export_get(exp); + spin_unlock(&obd->obd_dev_lock); + LCONSOLE_WARN("%s: haven't heard from client %s (at %s) in %ld seconds. I think it's dead, and I am evicting it. exp %p, cur %ld expire %ld last %ld\n", + obd->obd_name, + obd_uuid2str(&exp->exp_client_uuid), + obd_export_nid2str(exp), + (long)(get_seconds() - + exp->exp_last_request_time), + exp, (long)get_seconds(), + (long)expire_time, + (long)exp->exp_last_request_time); + CDEBUG(D_HA, "Last request was at %ld\n", + exp->exp_last_request_time); + class_fail_export(exp); + class_export_put(exp); + spin_lock(&obd->obd_dev_lock); + } else { + /* List is sorted, so everyone below is ok */ + break; + } + } + spin_unlock(&obd->obd_dev_lock); + + spin_lock(&pet_lock); + list_del_init(&obd->obd_evict_list); + spin_unlock(&pet_lock); + + class_decref(obd, "evictor", obd); + } + CDEBUG(D_HA, "Exiting Ping Evictor\n"); + + return 0; +} + +void ping_evictor_start(void) +{ + struct task_struct *task; + + if (++pet_refcount > 1) + return; + + init_waitqueue_head(&pet_waitq); + + task = kthread_run(ping_evictor_main, NULL, "ll_evictor"); + if (IS_ERR(task)) { + pet_refcount--; + CERROR("Cannot start ping evictor thread: %ld\n", + PTR_ERR(task)); + } +} +EXPORT_SYMBOL(ping_evictor_start); + +void ping_evictor_stop(void) +{ + if (--pet_refcount > 0) + return; + + pet_state = PET_TERMINATE; + wake_up(&pet_waitq); +} +EXPORT_SYMBOL(ping_evictor_stop); diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_internal.h b/kernel/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_internal.h new file mode 100644 index 000000000..a66dc3c6d --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_internal.h @@ -0,0 +1,312 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +/* Intramodule declarations for ptlrpc. 
*/ + +#ifndef PTLRPC_INTERNAL_H +#define PTLRPC_INTERNAL_H + +#include "../ldlm/ldlm_internal.h" + +struct ldlm_namespace; +struct obd_import; +struct ldlm_res_id; +struct ptlrpc_request_set; +extern int test_req_buffer_pressure; +extern struct mutex ptlrpc_all_services_mutex; + +int ptlrpc_start_thread(struct ptlrpc_service_part *svcpt, int wait); +/* ptlrpcd.c */ +int ptlrpcd_start(int index, int max, const char *name, struct ptlrpcd_ctl *pc); + +/* client.c */ +struct ptlrpc_bulk_desc *ptlrpc_new_bulk(unsigned npages, unsigned max_brw, + unsigned type, unsigned portal); +int ptlrpc_request_cache_init(void); +void ptlrpc_request_cache_fini(void); +struct ptlrpc_request *ptlrpc_request_cache_alloc(gfp_t flags); +void ptlrpc_request_cache_free(struct ptlrpc_request *req); +void ptlrpc_init_xid(void); + +/* events.c */ +int ptlrpc_init_portals(void); +void ptlrpc_exit_portals(void); + +void ptlrpc_request_handle_notconn(struct ptlrpc_request *); +void lustre_assert_wire_constants(void); +int ptlrpc_import_in_recovery(struct obd_import *imp); +int ptlrpc_set_import_discon(struct obd_import *imp, __u32 conn_cnt); +void ptlrpc_handle_failed_import(struct obd_import *imp); +int ptlrpc_replay_next(struct obd_import *imp, int *inflight); +void ptlrpc_initiate_recovery(struct obd_import *imp); + +int lustre_unpack_req_ptlrpc_body(struct ptlrpc_request *req, int offset); +int lustre_unpack_rep_ptlrpc_body(struct ptlrpc_request *req, int offset); + +#if defined(CONFIG_PROC_FS) +void ptlrpc_lprocfs_register_service(struct proc_dir_entry *proc_entry, + struct ptlrpc_service *svc); +void ptlrpc_lprocfs_unregister_service(struct ptlrpc_service *svc); +void ptlrpc_lprocfs_rpc_sent(struct ptlrpc_request *req, long amount); +void ptlrpc_lprocfs_do_request_stat(struct ptlrpc_request *req, + long q_usec, long work_usec); +#else +#define ptlrpc_lprocfs_register_service(params...) do {} while (0) +#define ptlrpc_lprocfs_unregister_service(params...) do {} while (0) +#define ptlrpc_lprocfs_rpc_sent(params...) do {} while (0) +#define ptlrpc_lprocfs_do_request_stat(params...) do {} while (0) +#endif /* CONFIG_PROC_FS */ + +/* NRS */ + +/** + * NRS core object. + * + * Holds NRS core fields. + */ +struct nrs_core { + /** + * Protects nrs_core::nrs_policies, serializes external policy + * registration/unregistration, and NRS core lprocfs operations. + */ + struct mutex nrs_mutex; + /* XXX: This is just for liblustre. Remove the #if defined directive + * when the * "cfs_" prefix is dropped from cfs_list_head. */ + /** + * List of all policy descriptors registered with NRS core; protected + * by nrs_core::nrs_mutex. 
+ */ + struct list_head nrs_policies; + +}; + +int ptlrpc_service_nrs_setup(struct ptlrpc_service *svc); +void ptlrpc_service_nrs_cleanup(struct ptlrpc_service *svc); + +void ptlrpc_nrs_req_initialize(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req, bool hp); +void ptlrpc_nrs_req_finalize(struct ptlrpc_request *req); +void ptlrpc_nrs_req_stop_nolock(struct ptlrpc_request *req); +void ptlrpc_nrs_req_add(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req, bool hp); + +struct ptlrpc_request * +ptlrpc_nrs_req_get_nolock0(struct ptlrpc_service_part *svcpt, bool hp, + bool peek, bool force); + +static inline struct ptlrpc_request * +ptlrpc_nrs_req_get_nolock(struct ptlrpc_service_part *svcpt, bool hp, + bool force) +{ + return ptlrpc_nrs_req_get_nolock0(svcpt, hp, false, force); +} + +static inline struct ptlrpc_request * +ptlrpc_nrs_req_peek_nolock(struct ptlrpc_service_part *svcpt, bool hp) +{ + return ptlrpc_nrs_req_get_nolock0(svcpt, hp, true, false); +} + +void ptlrpc_nrs_req_del_nolock(struct ptlrpc_request *req); +bool ptlrpc_nrs_req_pending_nolock(struct ptlrpc_service_part *svcpt, bool hp); + +int ptlrpc_nrs_policy_control(const struct ptlrpc_service *svc, + enum ptlrpc_nrs_queue_type queue, char *name, + enum ptlrpc_nrs_ctl opc, bool single, void *arg); + +int ptlrpc_nrs_init(void); +void ptlrpc_nrs_fini(void); + +static inline bool nrs_svcpt_has_hp(const struct ptlrpc_service_part *svcpt) +{ + return svcpt->scp_nrs_hp != NULL; +} + +static inline bool nrs_svc_has_hp(const struct ptlrpc_service *svc) +{ + /** + * If the first service partition has an HP NRS head, all service + * partitions will. + */ + return nrs_svcpt_has_hp(svc->srv_parts[0]); +} + +static inline +struct ptlrpc_nrs *nrs_svcpt2nrs(struct ptlrpc_service_part *svcpt, bool hp) +{ + LASSERT(ergo(hp, nrs_svcpt_has_hp(svcpt))); + return hp ? svcpt->scp_nrs_hp : &svcpt->scp_nrs_reg; +} + +static inline int nrs_pol2cptid(const struct ptlrpc_nrs_policy *policy) +{ + return policy->pol_nrs->nrs_svcpt->scp_cpt; +} + +static inline +struct ptlrpc_service *nrs_pol2svc(struct ptlrpc_nrs_policy *policy) +{ + return policy->pol_nrs->nrs_svcpt->scp_service; +} + +static inline +struct ptlrpc_service_part *nrs_pol2svcpt(struct ptlrpc_nrs_policy *policy) +{ + return policy->pol_nrs->nrs_svcpt; +} + +static inline +struct cfs_cpt_table *nrs_pol2cptab(struct ptlrpc_nrs_policy *policy) +{ + return nrs_pol2svc(policy)->srv_cptable; +} + +static inline struct ptlrpc_nrs_resource * +nrs_request_resource(struct ptlrpc_nrs_request *nrq) +{ + LASSERT(nrq->nr_initialized); + LASSERT(!nrq->nr_finalized); + + return nrq->nr_res_ptrs[nrq->nr_res_idx]; +} + +static inline +struct ptlrpc_nrs_policy *nrs_request_policy(struct ptlrpc_nrs_request *nrq) +{ + return nrs_request_resource(nrq)->res_policy; +} + +#define NRS_LPROCFS_QUANTUM_NAME_REG "reg_quantum:" +#define NRS_LPROCFS_QUANTUM_NAME_HP "hp_quantum:" + +/** + * the maximum size of nrs_crrn_client::cc_quantum and nrs_orr_data::od_quantum. + */ +#define LPROCFS_NRS_QUANTUM_MAX 65535 + +/** + * Max valid command string is the size of the labels, plus "65535" twice, plus + * a separating space character. 
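The LPROCFS_NRS_WR_QUANTUM_MAX_CMD macro defined just below sizes the command buffer at compile time by taking sizeof of concatenated string literals built from the worst-case quantum values. A minimal standalone sketch of that idiom, with STR()/XSTR() standing in for __stringify() and all other names hypothetical:

#include <stdio.h>

/* Illustrative sketch only: compile-time buffer sizing from the
 * worst-case command text. sizeof on a string literal includes the
 * trailing NUL, so WR_CMD_MAX is the longest command plus one. */
#define XSTR(x)	#x
#define STR(x)	XSTR(x)

#define QUANTUM_MAX	65535
#define NAME_REG	"reg_quantum:"
#define NAME_HP		"hp_quantum:"

#define WR_CMD_MAX \
	sizeof(NAME_REG STR(QUANTUM_MAX) " " NAME_HP STR(QUANTUM_MAX))

int main(void)
{
	char buf[WR_CMD_MAX];

	snprintf(buf, sizeof(buf), "%s%d %s%d",
		 NAME_REG, QUANTUM_MAX, NAME_HP, QUANTUM_MAX);
	printf("max cmd %zu bytes: \"%s\"\n", (size_t)WR_CMD_MAX, buf);
	return 0;
}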
+ */ +#define LPROCFS_NRS_WR_QUANTUM_MAX_CMD \ + sizeof(NRS_LPROCFS_QUANTUM_NAME_REG __stringify(LPROCFS_NRS_QUANTUM_MAX) " " \ + NRS_LPROCFS_QUANTUM_NAME_HP __stringify(LPROCFS_NRS_QUANTUM_MAX)) + +/* recovd_thread.c */ + +int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink); + +/* pers.c */ +void ptlrpc_fill_bulk_md(lnet_md_t *md, struct ptlrpc_bulk_desc *desc, + int mdcnt); +void ptlrpc_add_bulk_page(struct ptlrpc_bulk_desc *desc, struct page *page, + int pageoffset, int len); + +/* pack_generic.c */ +struct ptlrpc_reply_state * +lustre_get_emerg_rs(struct ptlrpc_service_part *svcpt); +void lustre_put_emerg_rs(struct ptlrpc_reply_state *rs); + +/* pinger.c */ +int ptlrpc_start_pinger(void); +int ptlrpc_stop_pinger(void); +void ptlrpc_pinger_sending_on_import(struct obd_import *imp); +void ptlrpc_pinger_commit_expected(struct obd_import *imp); +void ptlrpc_pinger_wake_up(void); +void ptlrpc_ping_import_soon(struct obd_import *imp); +int ping_evictor_wake(struct obd_export *exp); + +/* sec_null.c */ +int sptlrpc_null_init(void); +void sptlrpc_null_fini(void); + +/* sec_plain.c */ +int sptlrpc_plain_init(void); +void sptlrpc_plain_fini(void); + +/* sec_bulk.c */ +int sptlrpc_enc_pool_init(void); +void sptlrpc_enc_pool_fini(void); +int sptlrpc_proc_enc_pool_seq_show(struct seq_file *m, void *v); + +/* sec_lproc.c */ +#if defined(CONFIG_PROC_FS) +int sptlrpc_lproc_init(void); +void sptlrpc_lproc_fini(void); +#else +static inline int sptlrpc_lproc_init(void) +{ return 0; } +static inline void sptlrpc_lproc_fini(void) {} +#endif + +/* sec_gc.c */ +int sptlrpc_gc_init(void); +void sptlrpc_gc_fini(void); + +/* sec_config.c */ +void sptlrpc_conf_choose_flavor(enum lustre_sec_part from, + enum lustre_sec_part to, + struct obd_uuid *target, + lnet_nid_t nid, + struct sptlrpc_flavor *sf); +int sptlrpc_conf_init(void); +void sptlrpc_conf_fini(void); + +/* sec.c */ +int sptlrpc_init(void); +void sptlrpc_fini(void); + +static inline int ll_rpc_recoverable_error(int rc) +{ + return (rc == -ENOTCONN || rc == -ENODEV); +} + +static inline int tgt_mod_init(void) +{ + return 0; +} + +static inline void tgt_mod_exit(void) +{ + return; +} + +static inline void ptlrpc_reqset_put(struct ptlrpc_request_set *set) +{ + if (atomic_dec_and_test(&set->set_refcount)) + OBD_FREE_PTR(set); +} +#endif /* PTLRPC_INTERNAL_H */ diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_module.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_module.c new file mode 100644 index 000000000..5268887ca --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_module.c @@ -0,0 +1,171 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_RPC + + +#include "../include/obd_support.h" +#include "../include/obd_class.h" +#include "../include/lustre_net.h" +#include "../include/lustre_req_layout.h" + +#include "ptlrpc_internal.h" + +extern spinlock_t ptlrpc_last_xid_lock; +#if RS_DEBUG +extern spinlock_t ptlrpc_rs_debug_lock; +#endif +extern struct mutex pinger_mutex; +extern struct mutex ptlrpcd_mutex; + +__init int ptlrpc_init(void) +{ + int rc, cleanup_phase = 0; + + lustre_assert_wire_constants(); +#if RS_DEBUG + spin_lock_init(&ptlrpc_rs_debug_lock); +#endif + mutex_init(&ptlrpc_all_services_mutex); + mutex_init(&pinger_mutex); + mutex_init(&ptlrpcd_mutex); + ptlrpc_init_xid(); + + rc = req_layout_init(); + if (rc) + return rc; + + rc = ptlrpc_hr_init(); + if (rc) + return rc; + + cleanup_phase = 1; + rc = ptlrpc_request_cache_init(); + if (rc) + goto cleanup; + + cleanup_phase = 2; + rc = ptlrpc_init_portals(); + if (rc) + goto cleanup; + + cleanup_phase = 3; + + rc = ptlrpc_connection_init(); + if (rc) + goto cleanup; + + cleanup_phase = 4; + ptlrpc_put_connection_superhack = ptlrpc_connection_put; + + rc = ptlrpc_start_pinger(); + if (rc) + goto cleanup; + + cleanup_phase = 5; + rc = ldlm_init(); + if (rc) + goto cleanup; + + cleanup_phase = 6; + rc = sptlrpc_init(); + if (rc) + goto cleanup; + + cleanup_phase = 7; + rc = ptlrpc_nrs_init(); + if (rc) + goto cleanup; + + cleanup_phase = 8; + rc = tgt_mod_init(); + if (rc) + goto cleanup; + return 0; + +cleanup: + switch (cleanup_phase) { + case 8: + ptlrpc_nrs_fini(); + /* Fall through */ + case 7: + sptlrpc_fini(); + /* Fall through */ + case 6: + ldlm_exit(); + /* Fall through */ + case 5: + ptlrpc_stop_pinger(); + /* Fall through */ + case 4: + ptlrpc_connection_fini(); + /* Fall through */ + case 3: + ptlrpc_exit_portals(); + /* Fall through */ + case 2: + ptlrpc_request_cache_fini(); + /* Fall through */ + case 1: + ptlrpc_hr_fini(); + req_layout_fini(); + /* Fall through */ + default: ; + } + + return rc; +} + +static void __exit ptlrpc_exit(void) +{ + tgt_mod_exit(); + ptlrpc_nrs_fini(); + sptlrpc_fini(); + ldlm_exit(); + ptlrpc_stop_pinger(); + ptlrpc_exit_portals(); + ptlrpc_request_cache_fini(); + ptlrpc_hr_fini(); + ptlrpc_connection_fini(); +} + +MODULE_AUTHOR("Sun Microsystems, Inc. "); +MODULE_DESCRIPTION("Lustre Request Processor and Lock Management"); +MODULE_LICENSE("GPL"); +MODULE_VERSION("1.0.0"); + +module_init(ptlrpc_init); +module_exit(ptlrpc_exit); diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/ptlrpcd.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/ptlrpcd.c new file mode 100644 index 000000000..0c178ec0e --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/ptlrpcd.c @@ -0,0 +1,811 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/ptlrpcd.c + */ + +/** \defgroup ptlrpcd PortalRPC daemon + * + * ptlrpcd is a special thread with its own set where other user might add + * requests when they don't want to wait for their completion. + * PtlRPCD will take care of sending such requests and then processing their + * replies and calling completion callbacks as necessary. + * The callbacks are called directly from ptlrpcd context. + * It is important to never significantly block (esp. on RPCs!) within such + * completion handler or a deadlock might occur where ptlrpcd enters some + * callback that attempts to send another RPC and wait for it to return, + * during which time ptlrpcd is completely blocked, so e.g. if import + * fails, recovery cannot progress because connection requests are also + * sent by ptlrpcd. 
+ * + * @{ + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include "../../include/linux/libcfs/libcfs.h" + +#include "../include/lustre_net.h" +#include "../include/lustre_lib.h" +#include "../include/lustre_ha.h" +#include "../include/obd_class.h" /* for obd_zombie */ +#include "../include/obd_support.h" /* for OBD_FAIL_CHECK */ +#include "../include/cl_object.h" /* cl_env_{get,put}() */ +#include "../include/lprocfs_status.h" + +#include "ptlrpc_internal.h" + +struct ptlrpcd { + int pd_size; + int pd_index; + int pd_nthreads; + struct ptlrpcd_ctl pd_thread_rcv; + struct ptlrpcd_ctl pd_threads[0]; +}; + +static int max_ptlrpcds; +module_param(max_ptlrpcds, int, 0644); +MODULE_PARM_DESC(max_ptlrpcds, "Max ptlrpcd thread count to be started."); + +static int ptlrpcd_bind_policy = PDB_POLICY_PAIR; +module_param(ptlrpcd_bind_policy, int, 0644); +MODULE_PARM_DESC(ptlrpcd_bind_policy, "Ptlrpcd threads binding mode."); +static struct ptlrpcd *ptlrpcds; + +struct mutex ptlrpcd_mutex; +static int ptlrpcd_users; + +void ptlrpcd_wake(struct ptlrpc_request *req) +{ + struct ptlrpc_request_set *rq_set = req->rq_set; + + LASSERT(rq_set != NULL); + + wake_up(&rq_set->set_waitq); +} +EXPORT_SYMBOL(ptlrpcd_wake); + +static struct ptlrpcd_ctl * +ptlrpcd_select_pc(struct ptlrpc_request *req, pdl_policy_t policy, int index) +{ + int idx = 0; + + if (req != NULL && req->rq_send_state != LUSTRE_IMP_FULL) + return &ptlrpcds->pd_thread_rcv; + + switch (policy) { + case PDL_POLICY_SAME: + idx = smp_processor_id() % ptlrpcds->pd_nthreads; + break; + case PDL_POLICY_LOCAL: + /* Before CPU partition patches available, process it the same + * as "PDL_POLICY_ROUND". */ +# ifdef CFS_CPU_MODE_NUMA +# warning "fix this code to use new CPU partition APIs" +# endif + /* Fall through to PDL_POLICY_ROUND until the CPU + * CPU partition patches are available. */ + index = -1; + case PDL_POLICY_PREFERRED: + if (index >= 0 && index < num_online_cpus()) { + idx = index % ptlrpcds->pd_nthreads; + break; + } + /* Fall through to PDL_POLICY_ROUND for bad index. */ + default: + /* Fall through to PDL_POLICY_ROUND for unknown policy. */ + case PDL_POLICY_ROUND: + /* We do not care whether it is strict load balance. */ + idx = ptlrpcds->pd_index + 1; + if (idx == smp_processor_id()) + idx++; + idx %= ptlrpcds->pd_nthreads; + ptlrpcds->pd_index = idx; + break; + } + + return &ptlrpcds->pd_threads[idx]; +} + +/** + * Move all request from an existing request set to the ptlrpcd queue. + * All requests from the set must be in phase RQ_PHASE_NEW. + */ +void ptlrpcd_add_rqset(struct ptlrpc_request_set *set) +{ + struct list_head *tmp, *pos; + struct ptlrpcd_ctl *pc; + struct ptlrpc_request_set *new; + int count, i; + + pc = ptlrpcd_select_pc(NULL, PDL_POLICY_LOCAL, -1); + new = pc->pc_set; + + list_for_each_safe(pos, tmp, &set->set_requests) { + struct ptlrpc_request *req = + list_entry(pos, struct ptlrpc_request, + rq_set_chain); + + LASSERT(req->rq_phase == RQ_PHASE_NEW); + req->rq_set = new; + req->rq_queued_time = cfs_time_current(); + } + + spin_lock(&new->set_new_req_lock); + list_splice_init(&set->set_requests, &new->set_new_requests); + i = atomic_read(&set->set_remaining); + count = atomic_add_return(i, &new->set_new_count); + atomic_set(&set->set_remaining, 0); + spin_unlock(&new->set_new_req_lock); + if (count == i) { + wake_up(&new->set_waitq); + + /* XXX: It maybe unnecessary to wakeup all the partners. But to + * guarantee the async RPC can be processed ASAP, we have + * no other better choice. It maybe fixed in future. 
*/ + for (i = 0; i < pc->pc_npartners; i++) + wake_up(&pc->pc_partners[i]->pc_set->set_waitq); + } +} +EXPORT_SYMBOL(ptlrpcd_add_rqset); + +/** + * Return transferred RPCs count. + */ +static int ptlrpcd_steal_rqset(struct ptlrpc_request_set *des, + struct ptlrpc_request_set *src) +{ + struct list_head *tmp, *pos; + struct ptlrpc_request *req; + int rc = 0; + + spin_lock(&src->set_new_req_lock); + if (likely(!list_empty(&src->set_new_requests))) { + list_for_each_safe(pos, tmp, &src->set_new_requests) { + req = list_entry(pos, struct ptlrpc_request, + rq_set_chain); + req->rq_set = des; + } + list_splice_init(&src->set_new_requests, + &des->set_requests); + rc = atomic_read(&src->set_new_count); + atomic_add(rc, &des->set_remaining); + atomic_set(&src->set_new_count, 0); + } + spin_unlock(&src->set_new_req_lock); + return rc; +} + +/** + * Requests that are added to the ptlrpcd queue are sent via + * ptlrpcd_check->ptlrpc_check_set(). + */ +void ptlrpcd_add_req(struct ptlrpc_request *req, pdl_policy_t policy, int idx) +{ + struct ptlrpcd_ctl *pc; + + if (req->rq_reqmsg) + lustre_msg_set_jobid(req->rq_reqmsg, NULL); + + spin_lock(&req->rq_lock); + if (req->rq_invalid_rqset) { + struct l_wait_info lwi = LWI_TIMEOUT(cfs_time_seconds(5), + back_to_sleep, NULL); + + req->rq_invalid_rqset = 0; + spin_unlock(&req->rq_lock); + l_wait_event(req->rq_set_waitq, (req->rq_set == NULL), &lwi); + } else if (req->rq_set) { + /* If we have a valid "rq_set", just reuse it to avoid double + * linked. */ + LASSERT(req->rq_phase == RQ_PHASE_NEW); + LASSERT(req->rq_send_state == LUSTRE_IMP_REPLAY); + + /* ptlrpc_check_set will decrease the count */ + atomic_inc(&req->rq_set->set_remaining); + spin_unlock(&req->rq_lock); + wake_up(&req->rq_set->set_waitq); + return; + } else { + spin_unlock(&req->rq_lock); + } + + pc = ptlrpcd_select_pc(req, policy, idx); + + DEBUG_REQ(D_INFO, req, "add req [%p] to pc [%s:%d]", + req, pc->pc_name, pc->pc_index); + + ptlrpc_set_add_new_req(pc, req); +} +EXPORT_SYMBOL(ptlrpcd_add_req); + +static inline void ptlrpc_reqset_get(struct ptlrpc_request_set *set) +{ + atomic_inc(&set->set_refcount); +} + +/** + * Check if there is more work to do on ptlrpcd set. + * Returns 1 if yes. + */ +static int ptlrpcd_check(struct lu_env *env, struct ptlrpcd_ctl *pc) +{ + struct list_head *tmp, *pos; + struct ptlrpc_request *req; + struct ptlrpc_request_set *set = pc->pc_set; + int rc = 0; + int rc2; + + if (atomic_read(&set->set_new_count)) { + spin_lock(&set->set_new_req_lock); + if (likely(!list_empty(&set->set_new_requests))) { + list_splice_init(&set->set_new_requests, + &set->set_requests); + atomic_add(atomic_read(&set->set_new_count), + &set->set_remaining); + atomic_set(&set->set_new_count, 0); + /* + * Need to calculate its timeout. + */ + rc = 1; + } + spin_unlock(&set->set_new_req_lock); + } + + /* We should call lu_env_refill() before handling new requests to make + * sure that env key the requests depending on really exists. + */ + rc2 = lu_env_refill(env); + if (rc2 != 0) { + /* + * XXX This is very awkward situation, because + * execution can neither continue (request + * interpreters assume that env is set up), nor repeat + * the loop (as this potentially results in a tight + * loop of -ENOMEM's). + * + * Fortunately, refill only ever does something when + * new modules are loaded, i.e., early during boot up. 
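ptlrpcd_add_rqset above wakes the target set only when atomic_add_return shows the new-request counter held nothing before the splice, so an already-busy thread is not woken needlessly. A minimal sketch of that empty-to-non-empty wakeup idiom using C11 atomics and hypothetical names:

#include <stdio.h>
#include <stdatomic.h>

/* Illustrative sketch only: wake the consumer only when the queue
 * transitions from empty to non-empty. atomic_fetch_add returns the
 * previous count, so the producer can tell whether a worker could
 * already be draining the queue. */
static atomic_int pending = 0;

static void enqueue_batch(int n)
{
	int before = atomic_fetch_add(&pending, n);

	if (before == 0)
		printf("queue was empty: wake the worker\n");
	else
		printf("worker already has %d items: no wakeup needed\n",
		       before);
}

int main(void)
{
	enqueue_batch(3);	/* first batch: wake */
	enqueue_batch(2);	/* queue not empty: skip wakeup */
	return 0;
}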
+ */ + CERROR("Failure to refill session: %d\n", rc2); + return rc; + } + + if (atomic_read(&set->set_remaining)) + rc |= ptlrpc_check_set(env, set); + + /* NB: ptlrpc_check_set has already moved completed request at the + * head of seq::set_requests */ + list_for_each_safe(pos, tmp, &set->set_requests) { + req = list_entry(pos, struct ptlrpc_request, rq_set_chain); + if (req->rq_phase != RQ_PHASE_COMPLETE) + break; + + list_del_init(&req->rq_set_chain); + req->rq_set = NULL; + ptlrpc_req_finished(req); + } + + if (rc == 0) { + /* + * If new requests have been added, make sure to wake up. + */ + rc = atomic_read(&set->set_new_count); + + /* If we have nothing to do, check whether we can take some + * work from our partner threads. */ + if (rc == 0 && pc->pc_npartners > 0) { + struct ptlrpcd_ctl *partner; + struct ptlrpc_request_set *ps; + int first = pc->pc_cursor; + + do { + partner = pc->pc_partners[pc->pc_cursor++]; + if (pc->pc_cursor >= pc->pc_npartners) + pc->pc_cursor = 0; + if (partner == NULL) + continue; + + spin_lock(&partner->pc_lock); + ps = partner->pc_set; + if (ps == NULL) { + spin_unlock(&partner->pc_lock); + continue; + } + + ptlrpc_reqset_get(ps); + spin_unlock(&partner->pc_lock); + + if (atomic_read(&ps->set_new_count)) { + rc = ptlrpcd_steal_rqset(set, ps); + if (rc > 0) + CDEBUG(D_RPCTRACE, "transfer %d async RPCs [%d->%d]\n", + rc, partner->pc_index, + pc->pc_index); + } + ptlrpc_reqset_put(ps); + } while (rc == 0 && pc->pc_cursor != first); + } + } + + return rc; +} + +/** + * Main ptlrpcd thread. + * ptlrpc's code paths like to execute in process context, so we have this + * thread which spins on a set which contains the rpcs and sends them. + * + */ +static int ptlrpcd(void *arg) +{ + struct ptlrpcd_ctl *pc = arg; + struct ptlrpc_request_set *set = pc->pc_set; + struct lu_env env = { .le_ses = NULL }; + int rc, exit = 0; + + unshare_fs_struct(); +#if defined(CONFIG_SMP) + if (test_bit(LIOD_BIND, &pc->pc_flags)) { + int index = pc->pc_index; + + if (index >= 0 && index < num_possible_cpus()) { + while (!cpu_online(index)) { + if (++index >= num_possible_cpus()) + index = 0; + } + set_cpus_allowed_ptr(current, + cpumask_of_node(cpu_to_node(index))); + } + } +#endif + /* + * XXX So far only "client" ptlrpcd uses an environment. In + * the future, ptlrpcd thread (or a thread-set) has to given + * an argument, describing its "scope". + */ + rc = lu_context_init(&env.le_ctx, + LCT_CL_THREAD|LCT_REMEMBER|LCT_NOREF); + complete(&pc->pc_starting); + + if (rc != 0) + return rc; + + /* + * This mainloop strongly resembles ptlrpc_set_wait() except that our + * set never completes. ptlrpcd_check() calls ptlrpc_check_set() when + * there are requests in the set. New requests come in on the set's + * new_req_list and ptlrpcd_check() moves them into the set. + */ + do { + struct l_wait_info lwi; + int timeout; + + timeout = ptlrpc_set_next_timeout(set); + lwi = LWI_TIMEOUT(cfs_time_seconds(timeout ? timeout : 1), + ptlrpc_expired_set, set); + + lu_context_enter(&env.le_ctx); + l_wait_event(set->set_waitq, + ptlrpcd_check(&env, pc), &lwi); + lu_context_exit(&env.le_ctx); + + /* + * Abort inflight rpcs for forced stop case. + */ + if (test_bit(LIOD_STOP, &pc->pc_flags)) { + if (test_bit(LIOD_FORCE, &pc->pc_flags)) + ptlrpc_abort_set(set); + exit++; + } + + /* + * Let's make one more loop to make sure that ptlrpcd_check() + * copied all raced new rpcs into the set so we can kill them. + */ + } while (exit < 2); + + /* + * Wait for inflight requests to drain. 
+ */ + if (!list_empty(&set->set_requests)) + ptlrpc_set_wait(set); + lu_context_fini(&env.le_ctx); + + complete(&pc->pc_finishing); + + return 0; +} + +/* XXX: We want multiple CPU cores to share the async RPC load. So we start many + * ptlrpcd threads. We also want to reduce the ptlrpcd overhead caused by + * data transfer cross-CPU cores. So we bind ptlrpcd thread to specified + * CPU core. But binding all ptlrpcd threads maybe cause response delay + * because of some CPU core(s) busy with other loads. + * + * For example: "ls -l", some async RPCs for statahead are assigned to + * ptlrpcd_0, and ptlrpcd_0 is bound to CPU_0, but CPU_0 may be quite busy + * with other non-ptlrpcd, like "ls -l" itself (we want to the "ls -l" + * thread, statahead thread, and ptlrpcd thread can run in parallel), under + * such case, the statahead async RPCs can not be processed in time, it is + * unexpected. If ptlrpcd_0 can be re-scheduled on other CPU core, it may + * be better. But it breaks former data transfer policy. + * + * So we shouldn't be blind for avoiding the data transfer. We make some + * compromise: divide the ptlrpcd threads pool into two parts. One part is + * for bound mode, each ptlrpcd thread in this part is bound to some CPU + * core. The other part is for free mode, all the ptlrpcd threads in the + * part can be scheduled on any CPU core. We specify some partnership + * between bound mode ptlrpcd thread(s) and free mode ptlrpcd thread(s), + * and the async RPC load within the partners are shared. + * + * It can partly avoid data transfer cross-CPU (if the bound mode ptlrpcd + * thread can be scheduled in time), and try to guarantee the async RPC + * processed ASAP (as long as the free mode ptlrpcd thread can be scheduled + * on any CPU core). + * + * As for how to specify the partnership between bound mode ptlrpcd + * thread(s) and free mode ptlrpcd thread(s), the simplest way is to use + * pair. In future, we can specify some more complex + * partnership based on the patches for CPU partition. But before such + * patches are available, we prefer to use the simplest one. 
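Under PDB_POLICY_PAIR described above, ptlrpcd_bind() marks the odd-indexed thread of each pair as CPU-bound and partners it with the preceding even-indexed, unbound thread, so each bound/free pair shares its async RPC load. A minimal sketch of that pairing rule, with a hypothetical pair_partner() helper:

#include <stdio.h>

/* Illustrative sketch only: pair-style partnership. Odd index -> bound,
 * partnered with index - 1; even index -> free, partnered with index + 1. */
static int pair_partner(int index)
{
	return (index & 0x1) ? index - 1 : index + 1;
}

int main(void)
{
	for (int i = 0; i < 4; i++)
		printf("thread %d: %s, partner %d\n", i,
		       (i & 0x1) ? "bound" : "free", pair_partner(i));
	return 0;
}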
+ */ +# ifdef CFS_CPU_MODE_NUMA +# warning "fix ptlrpcd_bind() to use new CPU partition APIs" +# endif +static int ptlrpcd_bind(int index, int max) +{ + struct ptlrpcd_ctl *pc; + int rc = 0; +#if defined(CONFIG_NUMA) + cpumask_t mask; +#endif + + LASSERT(index <= max - 1); + pc = &ptlrpcds->pd_threads[index]; + switch (ptlrpcd_bind_policy) { + case PDB_POLICY_NONE: + pc->pc_npartners = -1; + break; + case PDB_POLICY_FULL: + pc->pc_npartners = 0; + set_bit(LIOD_BIND, &pc->pc_flags); + break; + case PDB_POLICY_PAIR: + LASSERT(max % 2 == 0); + pc->pc_npartners = 1; + break; + case PDB_POLICY_NEIGHBOR: +#if defined(CONFIG_NUMA) + { + int i; + cpumask_copy(&mask, cpumask_of_node(cpu_to_node(index))); + for (i = max; i < num_online_cpus(); i++) + cpumask_clear_cpu(i, &mask); + pc->pc_npartners = cpumask_weight(&mask) - 1; + set_bit(LIOD_BIND, &pc->pc_flags); + } +#else + LASSERT(max >= 3); + pc->pc_npartners = 2; +#endif + break; + default: + CERROR("unknown ptlrpcd bind policy %d\n", ptlrpcd_bind_policy); + rc = -EINVAL; + } + + if (rc == 0 && pc->pc_npartners > 0) { + OBD_ALLOC(pc->pc_partners, + sizeof(struct ptlrpcd_ctl *) * pc->pc_npartners); + if (pc->pc_partners == NULL) { + pc->pc_npartners = 0; + rc = -ENOMEM; + } else { + switch (ptlrpcd_bind_policy) { + case PDB_POLICY_PAIR: + if (index & 0x1) { + set_bit(LIOD_BIND, &pc->pc_flags); + pc->pc_partners[0] = &ptlrpcds-> + pd_threads[index - 1]; + ptlrpcds->pd_threads[index - 1]. + pc_partners[0] = pc; + } + break; + case PDB_POLICY_NEIGHBOR: +#if defined(CONFIG_NUMA) + { + struct ptlrpcd_ctl *ppc; + int i, pidx; + /* partners are cores in the same NUMA node. + * setup partnership only with ptlrpcd threads + * that are already initialized + */ + for (pidx = 0, i = 0; i < index; i++) { + if (cpumask_test_cpu(i, &mask)) { + ppc = &ptlrpcds->pd_threads[i]; + pc->pc_partners[pidx++] = ppc; + ppc->pc_partners[ppc-> + pc_npartners++] = pc; + } + } + /* adjust number of partners to the number + * of partnership really setup */ + pc->pc_npartners = pidx; + } +#else + if (index & 0x1) + set_bit(LIOD_BIND, &pc->pc_flags); + if (index > 0) { + pc->pc_partners[0] = &ptlrpcds-> + pd_threads[index - 1]; + ptlrpcds->pd_threads[index - 1]. + pc_partners[1] = pc; + if (index == max - 1) { + pc->pc_partners[1] = + &ptlrpcds->pd_threads[0]; + ptlrpcds->pd_threads[0]. + pc_partners[0] = pc; + } + } +#endif + break; + } + } + } + + return rc; +} + + +int ptlrpcd_start(int index, int max, const char *name, struct ptlrpcd_ctl *pc) +{ + int rc; + + /* + * Do not allow start second thread for one pc. + */ + if (test_and_set_bit(LIOD_START, &pc->pc_flags)) { + CWARN("Starting second thread (%s) for same pc %p\n", + name, pc); + return 0; + } + + pc->pc_index = index; + init_completion(&pc->pc_starting); + init_completion(&pc->pc_finishing); + spin_lock_init(&pc->pc_lock); + strlcpy(pc->pc_name, name, sizeof(pc->pc_name)); + pc->pc_set = ptlrpc_prep_set(); + if (pc->pc_set == NULL) { + rc = -ENOMEM; + goto out; + } + + /* + * So far only "client" ptlrpcd uses an environment. In the future, + * ptlrpcd thread (or a thread-set) has to be given an argument, + * describing its "scope". 
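+ * (Note: this per-ctl environment is tagged LCT_CL_THREAD|LCT_REMEMBER
+ * only, whereas the per-thread context initialized inside ptlrpcd()
+ * itself additionally passes LCT_NOREF.)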
+ */ + rc = lu_context_init(&pc->pc_env.le_ctx, LCT_CL_THREAD|LCT_REMEMBER); + if (rc != 0) + goto out_set; + + { + struct task_struct *task; + if (index >= 0) { + rc = ptlrpcd_bind(index, max); + if (rc < 0) + goto out_env; + } + + task = kthread_run(ptlrpcd, pc, "%s", pc->pc_name); + if (IS_ERR(task)) { + rc = PTR_ERR(task); + goto out_env; + } + + wait_for_completion(&pc->pc_starting); + } + return 0; + +out_env: + lu_context_fini(&pc->pc_env.le_ctx); + +out_set: + if (pc->pc_set != NULL) { + struct ptlrpc_request_set *set = pc->pc_set; + + spin_lock(&pc->pc_lock); + pc->pc_set = NULL; + spin_unlock(&pc->pc_lock); + ptlrpc_set_destroy(set); + } + clear_bit(LIOD_BIND, &pc->pc_flags); + +out: + clear_bit(LIOD_START, &pc->pc_flags); + return rc; +} + +void ptlrpcd_stop(struct ptlrpcd_ctl *pc, int force) +{ + if (!test_bit(LIOD_START, &pc->pc_flags)) { + CWARN("Thread for pc %p was not started\n", pc); + return; + } + + set_bit(LIOD_STOP, &pc->pc_flags); + if (force) + set_bit(LIOD_FORCE, &pc->pc_flags); + wake_up(&pc->pc_set->set_waitq); +} + +void ptlrpcd_free(struct ptlrpcd_ctl *pc) +{ + struct ptlrpc_request_set *set = pc->pc_set; + + if (!test_bit(LIOD_START, &pc->pc_flags)) { + CWARN("Thread for pc %p was not started\n", pc); + goto out; + } + + wait_for_completion(&pc->pc_finishing); + lu_context_fini(&pc->pc_env.le_ctx); + + spin_lock(&pc->pc_lock); + pc->pc_set = NULL; + spin_unlock(&pc->pc_lock); + ptlrpc_set_destroy(set); + + clear_bit(LIOD_START, &pc->pc_flags); + clear_bit(LIOD_STOP, &pc->pc_flags); + clear_bit(LIOD_FORCE, &pc->pc_flags); + clear_bit(LIOD_BIND, &pc->pc_flags); + +out: + if (pc->pc_npartners > 0) { + LASSERT(pc->pc_partners != NULL); + + OBD_FREE(pc->pc_partners, + sizeof(struct ptlrpcd_ctl *) * pc->pc_npartners); + pc->pc_partners = NULL; + } + pc->pc_npartners = 0; +} + +static void ptlrpcd_fini(void) +{ + int i; + + if (ptlrpcds != NULL) { + for (i = 0; i < ptlrpcds->pd_nthreads; i++) + ptlrpcd_stop(&ptlrpcds->pd_threads[i], 0); + for (i = 0; i < ptlrpcds->pd_nthreads; i++) + ptlrpcd_free(&ptlrpcds->pd_threads[i]); + ptlrpcd_stop(&ptlrpcds->pd_thread_rcv, 0); + ptlrpcd_free(&ptlrpcds->pd_thread_rcv); + OBD_FREE(ptlrpcds, ptlrpcds->pd_size); + ptlrpcds = NULL; + } +} + +static int ptlrpcd_init(void) +{ + int nthreads = num_online_cpus(); + char name[16]; + int size, i = -1, j, rc = 0; + + if (max_ptlrpcds > 0 && max_ptlrpcds < nthreads) + nthreads = max_ptlrpcds; + if (nthreads < 2) + nthreads = 2; + if (nthreads < 3 && ptlrpcd_bind_policy == PDB_POLICY_NEIGHBOR) + ptlrpcd_bind_policy = PDB_POLICY_PAIR; + else if (nthreads % 2 != 0 && ptlrpcd_bind_policy == PDB_POLICY_PAIR) + nthreads &= ~1; /* make sure it is even */ + + size = offsetof(struct ptlrpcd, pd_threads[nthreads]); + OBD_ALLOC(ptlrpcds, size); + if (ptlrpcds == NULL) { + rc = -ENOMEM; + goto out; + } + + snprintf(name, sizeof(name), "ptlrpcd_rcv"); + set_bit(LIOD_RECOVERY, &ptlrpcds->pd_thread_rcv.pc_flags); + rc = ptlrpcd_start(-1, nthreads, name, &ptlrpcds->pd_thread_rcv); + if (rc < 0) + goto out; + + /* XXX: We start nthreads ptlrpc daemons. Each of them can process any + * non-recovery async RPC to improve overall async RPC efficiency. + * + * But there are some issues with async I/O RPCs and async non-I/O + * RPCs processed in the same set under some cases. The ptlrpcd may + * be blocked by some async I/O RPC(s), then will cause other async + * non-I/O RPC(s) can not be processed in time. 
+ * + * Maybe we should distinguish blocked async RPCs from non-blocked + * async RPCs, and process them in different ptlrpcd sets to avoid + * unnecessary dependency. But how to distribute async RPCs load + * among all the ptlrpc daemons becomes another trouble. */ + for (i = 0; i < nthreads; i++) { + snprintf(name, sizeof(name), "ptlrpcd_%d", i); + rc = ptlrpcd_start(i, nthreads, name, &ptlrpcds->pd_threads[i]); + if (rc < 0) + goto out; + } + + ptlrpcds->pd_size = size; + ptlrpcds->pd_index = 0; + ptlrpcds->pd_nthreads = nthreads; + +out: + if (rc != 0 && ptlrpcds != NULL) { + for (j = 0; j <= i; j++) + ptlrpcd_stop(&ptlrpcds->pd_threads[j], 0); + for (j = 0; j <= i; j++) + ptlrpcd_free(&ptlrpcds->pd_threads[j]); + ptlrpcd_stop(&ptlrpcds->pd_thread_rcv, 0); + ptlrpcd_free(&ptlrpcds->pd_thread_rcv); + OBD_FREE(ptlrpcds, size); + ptlrpcds = NULL; + } + + return 0; +} + +int ptlrpcd_addref(void) +{ + int rc = 0; + + mutex_lock(&ptlrpcd_mutex); + if (++ptlrpcd_users == 1) + rc = ptlrpcd_init(); + mutex_unlock(&ptlrpcd_mutex); + return rc; +} +EXPORT_SYMBOL(ptlrpcd_addref); + +void ptlrpcd_decref(void) +{ + mutex_lock(&ptlrpcd_mutex); + if (--ptlrpcd_users == 0) + ptlrpcd_fini(); + mutex_unlock(&ptlrpcd_mutex); +} +EXPORT_SYMBOL(ptlrpcd_decref); +/** @} ptlrpcd */ diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/recover.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/recover.c new file mode 100644 index 000000000..7b1d72947 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/recover.c @@ -0,0 +1,379 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/recover.c + * + * Author: Mike Shaver + */ + +#define DEBUG_SUBSYSTEM S_RPC +#include "../../include/linux/libcfs/libcfs.h" + +#include "../include/obd_support.h" +#include "../include/lustre_ha.h" +#include "../include/lustre_net.h" +#include "../include/lustre_import.h" +#include "../include/lustre_export.h" +#include "../include/obd.h" +#include "../include/obd_class.h" +#include + +#include "ptlrpc_internal.h" + +/** + * Start recovery on disconnected import. 
+ * This is done by just attempting a connect + */ +void ptlrpc_initiate_recovery(struct obd_import *imp) +{ + CDEBUG(D_HA, "%s: starting recovery\n", obd2cli_tgt(imp->imp_obd)); + ptlrpc_connect_import(imp); +} + +/** + * Identify what request from replay list needs to be replayed next + * (based on what we have already replayed) and send it to server. + */ +int ptlrpc_replay_next(struct obd_import *imp, int *inflight) +{ + int rc = 0; + struct list_head *tmp, *pos; + struct ptlrpc_request *req = NULL; + __u64 last_transno; + + *inflight = 0; + + /* It might have committed some after we last spoke, so make sure we + * get rid of them now. + */ + spin_lock(&imp->imp_lock); + imp->imp_last_transno_checked = 0; + ptlrpc_free_committed(imp); + last_transno = imp->imp_last_replay_transno; + spin_unlock(&imp->imp_lock); + + CDEBUG(D_HA, "import %p from %s committed %llu last %llu\n", + imp, obd2cli_tgt(imp->imp_obd), + imp->imp_peer_committed_transno, last_transno); + + /* Do I need to hold a lock across this iteration? We shouldn't be + * racing with any additions to the list, because we're in recovery + * and are therefore not processing additional requests to add. Calls + * to ptlrpc_free_committed might commit requests, but nothing "newer" + * than the one we're replaying (it can't be committed until it's + * replayed, and we're doing that here). l_f_e_safe protects against + * problems with the current request being committed, in the unlikely + * event of that race. So, in conclusion, I think that it's safe to + * perform this list-walk without the imp_lock held. + * + * But, the {mdc,osc}_replay_open callbacks both iterate + * request lists, and have comments saying they assume the + * imp_lock is being held by ptlrpc_replay, but it's not. it's + * just a little race... + */ + + /* Replay all the committed open requests on committed_list first */ + if (!list_empty(&imp->imp_committed_list)) { + tmp = imp->imp_committed_list.prev; + req = list_entry(tmp, struct ptlrpc_request, + rq_replay_list); + + /* The last request on committed_list hasn't been replayed */ + if (req->rq_transno > last_transno) { + /* Since the imp_committed_list is immutable before + * all of it's requests being replayed, it's safe to + * use a cursor to accelerate the search */ + imp->imp_replay_cursor = imp->imp_replay_cursor->next; + + while (imp->imp_replay_cursor != + &imp->imp_committed_list) { + req = list_entry(imp->imp_replay_cursor, + struct ptlrpc_request, + rq_replay_list); + if (req->rq_transno > last_transno) + break; + + req = NULL; + imp->imp_replay_cursor = + imp->imp_replay_cursor->next; + } + } else { + /* All requests on committed_list have been replayed */ + imp->imp_replay_cursor = &imp->imp_committed_list; + req = NULL; + } + } + + /* All the requests in committed list have been replayed, let's replay + * the imp_replay_list */ + if (req == NULL) { + list_for_each_safe(tmp, pos, &imp->imp_replay_list) { + req = list_entry(tmp, struct ptlrpc_request, + rq_replay_list); + + if (req->rq_transno > last_transno) + break; + req = NULL; + } + } + + /* If need to resend the last sent transno (because a reconnect + * has occurred), then stop on the matching req and send it again. + * If, however, the last sent transno has been committed then we + * continue replay from the next request. 
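+ *
+ * Worked example (illustrative only): with last_transno == 100,
+ * imp_committed_list holding transnos { 90, 95 } and imp_replay_list
+ * holding { 101, 102 }, the committed list contains nothing newer than
+ * 100, so the walk over imp_replay_list above selects the request with
+ * transno 101 and it is handed to ptlrpc_replay_req() below.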
*/ + if (req != NULL && imp->imp_resend_replay) + lustre_msg_add_flags(req->rq_reqmsg, MSG_RESENT); + + spin_lock(&imp->imp_lock); + imp->imp_resend_replay = 0; + spin_unlock(&imp->imp_lock); + + if (req != NULL) { + rc = ptlrpc_replay_req(req); + if (rc) { + CERROR("recovery replay error %d for req %llu\n", + rc, req->rq_xid); + return rc; + } + *inflight = 1; + } + return rc; +} + +/** + * Schedule resending of request on sending_list. This is done after + * we completed replaying of requests and locks. + */ +int ptlrpc_resend(struct obd_import *imp) +{ + struct ptlrpc_request *req, *next; + + /* As long as we're in recovery, nothing should be added to the sending + * list, so we don't need to hold the lock during this iteration and + * resend process. + */ + /* Well... what if lctl recover is called twice at the same time? + */ + spin_lock(&imp->imp_lock); + if (imp->imp_state != LUSTRE_IMP_RECOVER) { + spin_unlock(&imp->imp_lock); + return -1; + } + + list_for_each_entry_safe(req, next, &imp->imp_sending_list, + rq_list) { + LASSERTF((long)req > PAGE_CACHE_SIZE && req != LP_POISON, + "req %p bad\n", req); + LASSERTF(req->rq_type != LI_POISON, "req %p freed\n", req); + if (!ptlrpc_no_resend(req)) + ptlrpc_resend_req(req); + } + spin_unlock(&imp->imp_lock); + + return 0; +} +EXPORT_SYMBOL(ptlrpc_resend); + +/** + * Go through all requests in delayed list and wake their threads + * for resending + */ +void ptlrpc_wake_delayed(struct obd_import *imp) +{ + struct list_head *tmp, *pos; + struct ptlrpc_request *req; + + spin_lock(&imp->imp_lock); + list_for_each_safe(tmp, pos, &imp->imp_delayed_list) { + req = list_entry(tmp, struct ptlrpc_request, rq_list); + + DEBUG_REQ(D_HA, req, "waking (set %p):", req->rq_set); + ptlrpc_client_wake_req(req); + } + spin_unlock(&imp->imp_lock); +} +EXPORT_SYMBOL(ptlrpc_wake_delayed); + +void ptlrpc_request_handle_notconn(struct ptlrpc_request *failed_req) +{ + struct obd_import *imp = failed_req->rq_import; + + CDEBUG(D_HA, "import %s of %s@%s abruptly disconnected: reconnecting\n", + imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid); + + if (ptlrpc_set_import_discon(imp, + lustre_msg_get_conn_cnt(failed_req->rq_reqmsg))) { + if (!imp->imp_replayable) { + CDEBUG(D_HA, "import %s@%s for %s not replayable, auto-deactivating\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid, + imp->imp_obd->obd_name); + ptlrpc_deactivate_import(imp); + } + /* to control recovery via lctl {disable|enable}_recovery */ + if (imp->imp_deactive == 0) + ptlrpc_connect_import(imp); + } + + /* Wait for recovery to complete and resend. If evicted, then + this request will be errored out later.*/ + spin_lock(&failed_req->rq_lock); + if (!failed_req->rq_no_resend) + failed_req->rq_resend = 1; + spin_unlock(&failed_req->rq_lock); +} + +/** + * Administratively active/deactive a client. + * This should only be called by the ioctl interface, currently + * - the lctl deactivate and activate commands + * - echo 0/1 >> /proc/osc/XXX/active + * - client umount -f (ll_umount_begin) + */ +int ptlrpc_set_import_active(struct obd_import *imp, int active) +{ + struct obd_device *obd = imp->imp_obd; + int rc = 0; + + LASSERT(obd); + + /* When deactivating, mark import invalid, and abort in-flight + * requests. 
*/ + if (!active) { + LCONSOLE_WARN("setting import %s INACTIVE by administrator request\n", + obd2cli_tgt(imp->imp_obd)); + + /* set before invalidate to avoid messages about imp_inval + * set without imp_deactive in ptlrpc_import_delay_req */ + spin_lock(&imp->imp_lock); + imp->imp_deactive = 1; + spin_unlock(&imp->imp_lock); + + obd_import_event(imp->imp_obd, imp, IMP_EVENT_DEACTIVATE); + + ptlrpc_invalidate_import(imp); + } + + /* When activating, mark import valid, and attempt recovery */ + if (active) { + CDEBUG(D_HA, "setting import %s VALID\n", + obd2cli_tgt(imp->imp_obd)); + + spin_lock(&imp->imp_lock); + imp->imp_deactive = 0; + spin_unlock(&imp->imp_lock); + obd_import_event(imp->imp_obd, imp, IMP_EVENT_ACTIVATE); + + rc = ptlrpc_recover_import(imp, NULL, 0); + } + + return rc; +} +EXPORT_SYMBOL(ptlrpc_set_import_active); + +/* Attempt to reconnect an import */ +int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid, int async) +{ + int rc = 0; + + spin_lock(&imp->imp_lock); + if (imp->imp_state == LUSTRE_IMP_NEW || imp->imp_deactive || + atomic_read(&imp->imp_inval_count)) + rc = -EINVAL; + spin_unlock(&imp->imp_lock); + if (rc) + goto out; + + /* force import to be disconnected. */ + ptlrpc_set_import_discon(imp, 0); + + if (new_uuid) { + struct obd_uuid uuid; + + /* intruct import to use new uuid */ + obd_str2uuid(&uuid, new_uuid); + rc = import_set_conn_priority(imp, &uuid); + if (rc) + goto out; + } + + /* Check if reconnect is already in progress */ + spin_lock(&imp->imp_lock); + if (imp->imp_state != LUSTRE_IMP_DISCON) { + imp->imp_force_verify = 1; + rc = -EALREADY; + } + spin_unlock(&imp->imp_lock); + if (rc) + goto out; + + rc = ptlrpc_connect_import(imp); + if (rc) + goto out; + + if (!async) { + struct l_wait_info lwi; + int secs = cfs_time_seconds(obd_timeout); + + CDEBUG(D_HA, "%s: recovery started, waiting %u seconds\n", + obd2cli_tgt(imp->imp_obd), secs); + + lwi = LWI_TIMEOUT(secs, NULL, NULL); + rc = l_wait_event(imp->imp_recovery_waitq, + !ptlrpc_import_in_recovery(imp), &lwi); + CDEBUG(D_HA, "%s: recovery finished\n", + obd2cli_tgt(imp->imp_obd)); + } + +out: + return rc; +} +EXPORT_SYMBOL(ptlrpc_recover_import); + +int ptlrpc_import_in_recovery(struct obd_import *imp) +{ + int in_recovery = 1; + + spin_lock(&imp->imp_lock); + if (imp->imp_state == LUSTRE_IMP_FULL || + imp->imp_state == LUSTRE_IMP_CLOSED || + imp->imp_state == LUSTRE_IMP_DISCON || + imp->imp_obd->obd_no_recov) + in_recovery = 0; + spin_unlock(&imp->imp_lock); + + return in_recovery; +} diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/sec.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/sec.c new file mode 100644 index 000000000..21e9dc9d5 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/sec.c @@ -0,0 +1,2459 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/sec.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include "../../include/linux/libcfs/libcfs.h" +#include +#include + +#include "../include/obd.h" +#include "../include/obd_class.h" +#include "../include/obd_support.h" +#include "../include/lustre_net.h" +#include "../include/lustre_import.h" +#include "../include/lustre_dlm.h" +#include "../include/lustre_sec.h" + +#include "ptlrpc_internal.h" + +/*********************************************** + * policy registers * + ***********************************************/ + +static rwlock_t policy_lock; +static struct ptlrpc_sec_policy *policies[SPTLRPC_POLICY_MAX] = { + NULL, +}; + +int sptlrpc_register_policy(struct ptlrpc_sec_policy *policy) +{ + __u16 number = policy->sp_policy; + + LASSERT(policy->sp_name); + LASSERT(policy->sp_cops); + LASSERT(policy->sp_sops); + + if (number >= SPTLRPC_POLICY_MAX) + return -EINVAL; + + write_lock(&policy_lock); + if (unlikely(policies[number])) { + write_unlock(&policy_lock); + return -EALREADY; + } + policies[number] = policy; + write_unlock(&policy_lock); + + CDEBUG(D_SEC, "%s: registered\n", policy->sp_name); + return 0; +} +EXPORT_SYMBOL(sptlrpc_register_policy); + +int sptlrpc_unregister_policy(struct ptlrpc_sec_policy *policy) +{ + __u16 number = policy->sp_policy; + + LASSERT(number < SPTLRPC_POLICY_MAX); + + write_lock(&policy_lock); + if (unlikely(policies[number] == NULL)) { + write_unlock(&policy_lock); + CERROR("%s: already unregistered\n", policy->sp_name); + return -EINVAL; + } + + LASSERT(policies[number] == policy); + policies[number] = NULL; + write_unlock(&policy_lock); + + CDEBUG(D_SEC, "%s: unregistered\n", policy->sp_name); + return 0; +} +EXPORT_SYMBOL(sptlrpc_unregister_policy); + +static +struct ptlrpc_sec_policy *sptlrpc_wireflavor2policy(__u32 flavor) +{ + static DEFINE_MUTEX(load_mutex); + static atomic_t loaded = ATOMIC_INIT(0); + struct ptlrpc_sec_policy *policy; + __u16 number = SPTLRPC_FLVR_POLICY(flavor); + __u16 flag = 0; + + if (number >= SPTLRPC_POLICY_MAX) + return NULL; + + while (1) { + read_lock(&policy_lock); + policy = policies[number]; + if (policy && !try_module_get(policy->sp_owner)) + policy = NULL; + if (policy == NULL) + flag = atomic_read(&loaded); + read_unlock(&policy_lock); + + if (policy != NULL || flag != 0 || + number != SPTLRPC_POLICY_GSS) + break; + + /* try to load gss module, once */ + mutex_lock(&load_mutex); + if (atomic_read(&loaded) == 0) { + if (request_module("ptlrpc_gss") == 0) + CDEBUG(D_SEC, + "module ptlrpc_gss loaded on demand\n"); + else + CERROR("Unable to load module ptlrpc_gss\n"); + + atomic_set(&loaded, 1); + } + mutex_unlock(&load_mutex); + } + + return policy; +} + +__u32 sptlrpc_name2flavor_base(const char *name) +{ + if (!strcmp(name, "null")) + return SPTLRPC_FLVR_NULL; + if (!strcmp(name, "plain")) + 
return SPTLRPC_FLVR_PLAIN; + if (!strcmp(name, "krb5n")) + return SPTLRPC_FLVR_KRB5N; + if (!strcmp(name, "krb5a")) + return SPTLRPC_FLVR_KRB5A; + if (!strcmp(name, "krb5i")) + return SPTLRPC_FLVR_KRB5I; + if (!strcmp(name, "krb5p")) + return SPTLRPC_FLVR_KRB5P; + + return SPTLRPC_FLVR_INVALID; +} +EXPORT_SYMBOL(sptlrpc_name2flavor_base); + +const char *sptlrpc_flavor2name_base(__u32 flvr) +{ + __u32 base = SPTLRPC_FLVR_BASE(flvr); + + if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_NULL)) + return "null"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_PLAIN)) + return "plain"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5N)) + return "krb5n"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5A)) + return "krb5a"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5I)) + return "krb5i"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5P)) + return "krb5p"; + + CERROR("invalid wire flavor 0x%x\n", flvr); + return "invalid"; +} +EXPORT_SYMBOL(sptlrpc_flavor2name_base); + +char *sptlrpc_flavor2name_bulk(struct sptlrpc_flavor *sf, + char *buf, int bufsize) +{ + if (SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_PLAIN) + snprintf(buf, bufsize, "hash:%s", + sptlrpc_get_hash_name(sf->u_bulk.hash.hash_alg)); + else + snprintf(buf, bufsize, "%s", + sptlrpc_flavor2name_base(sf->sf_rpc)); + + buf[bufsize - 1] = '\0'; + return buf; +} +EXPORT_SYMBOL(sptlrpc_flavor2name_bulk); + +char *sptlrpc_flavor2name(struct sptlrpc_flavor *sf, char *buf, int bufsize) +{ + strlcpy(buf, sptlrpc_flavor2name_base(sf->sf_rpc), bufsize); + + /* + * currently we don't support customized bulk specification for + * flavors other than plain + */ + if (SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_PLAIN) { + char bspec[16]; + + bspec[0] = '-'; + sptlrpc_flavor2name_bulk(sf, &bspec[1], sizeof(bspec) - 1); + strlcat(buf, bspec, bufsize); + } + + return buf; +} +EXPORT_SYMBOL(sptlrpc_flavor2name); + +char *sptlrpc_secflags2str(__u32 flags, char *buf, int bufsize) +{ + buf[0] = '\0'; + + if (flags & PTLRPC_SEC_FL_REVERSE) + strlcat(buf, "reverse,", bufsize); + if (flags & PTLRPC_SEC_FL_ROOTONLY) + strlcat(buf, "rootonly,", bufsize); + if (flags & PTLRPC_SEC_FL_UDESC) + strlcat(buf, "udesc,", bufsize); + if (flags & PTLRPC_SEC_FL_BULK) + strlcat(buf, "bulk,", bufsize); + if (buf[0] == '\0') + strlcat(buf, "-,", bufsize); + + return buf; +} +EXPORT_SYMBOL(sptlrpc_secflags2str); + +/************************************************** + * client context APIs * + **************************************************/ + +static +struct ptlrpc_cli_ctx *get_my_ctx(struct ptlrpc_sec *sec) +{ + struct vfs_cred vcred; + int create = 1, remove_dead = 1; + + LASSERT(sec); + LASSERT(sec->ps_policy->sp_cops->lookup_ctx); + + if (sec->ps_flvr.sf_flags & (PTLRPC_SEC_FL_REVERSE | + PTLRPC_SEC_FL_ROOTONLY)) { + vcred.vc_uid = 0; + vcred.vc_gid = 0; + if (sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_REVERSE) { + create = 0; + remove_dead = 0; + } + } else { + vcred.vc_uid = from_kuid(&init_user_ns, current_uid()); + vcred.vc_gid = from_kgid(&init_user_ns, current_gid()); + } + + return sec->ps_policy->sp_cops->lookup_ctx(sec, &vcred, + create, remove_dead); +} + +struct ptlrpc_cli_ctx *sptlrpc_cli_ctx_get(struct ptlrpc_cli_ctx *ctx) +{ + atomic_inc(&ctx->cc_refcount); + return ctx; +} +EXPORT_SYMBOL(sptlrpc_cli_ctx_get); + +void sptlrpc_cli_ctx_put(struct ptlrpc_cli_ctx *ctx, int sync) +{ + struct ptlrpc_sec *sec = ctx->cc_sec; + + LASSERT(sec); + LASSERT_ATOMIC_POS(&ctx->cc_refcount); + + if (!atomic_dec_and_test(&ctx->cc_refcount)) 
+ return; + + sec->ps_policy->sp_cops->release_ctx(sec, ctx, sync); +} +EXPORT_SYMBOL(sptlrpc_cli_ctx_put); + +/** + * Expire the client context immediately. + * + * \pre Caller must hold at least 1 reference on the \a ctx. + */ +void sptlrpc_cli_ctx_expire(struct ptlrpc_cli_ctx *ctx) +{ + LASSERT(ctx->cc_ops->force_die); + ctx->cc_ops->force_die(ctx, 0); +} +EXPORT_SYMBOL(sptlrpc_cli_ctx_expire); + +/** + * To wake up the threads who are waiting for this client context. Called + * after some status change happened on \a ctx. + */ +void sptlrpc_cli_ctx_wakeup(struct ptlrpc_cli_ctx *ctx) +{ + struct ptlrpc_request *req, *next; + + spin_lock(&ctx->cc_lock); + list_for_each_entry_safe(req, next, &ctx->cc_req_list, + rq_ctx_chain) { + list_del_init(&req->rq_ctx_chain); + ptlrpc_client_wake_req(req); + } + spin_unlock(&ctx->cc_lock); +} +EXPORT_SYMBOL(sptlrpc_cli_ctx_wakeup); + +int sptlrpc_cli_ctx_display(struct ptlrpc_cli_ctx *ctx, char *buf, int bufsize) +{ + LASSERT(ctx->cc_ops); + + if (ctx->cc_ops->display == NULL) + return 0; + + return ctx->cc_ops->display(ctx, buf, bufsize); +} + +static int import_sec_check_expire(struct obd_import *imp) +{ + int adapt = 0; + + spin_lock(&imp->imp_lock); + if (imp->imp_sec_expire && + imp->imp_sec_expire < get_seconds()) { + adapt = 1; + imp->imp_sec_expire = 0; + } + spin_unlock(&imp->imp_lock); + + if (!adapt) + return 0; + + CDEBUG(D_SEC, "found delayed sec adapt expired, do it now\n"); + return sptlrpc_import_sec_adapt(imp, NULL, NULL); +} + +static int import_sec_validate_get(struct obd_import *imp, + struct ptlrpc_sec **sec) +{ + int rc; + + if (unlikely(imp->imp_sec_expire)) { + rc = import_sec_check_expire(imp); + if (rc) + return rc; + } + + *sec = sptlrpc_import_sec_ref(imp); + if (*sec == NULL) { + CERROR("import %p (%s) with no sec\n", + imp, ptlrpc_import_state_name(imp->imp_state)); + return -EACCES; + } + + if (unlikely((*sec)->ps_dying)) { + CERROR("attempt to use dying sec %p\n", sec); + sptlrpc_sec_put(*sec); + return -EACCES; + } + + return 0; +} + +/** + * Given a \a req, find or allocate a appropriate context for it. + * \pre req->rq_cli_ctx == NULL. + * + * \retval 0 succeed, and req->rq_cli_ctx is set. + * \retval -ev error number, and req->rq_cli_ctx == NULL. + */ +int sptlrpc_req_get_ctx(struct ptlrpc_request *req) +{ + struct obd_import *imp = req->rq_import; + struct ptlrpc_sec *sec; + int rc; + + LASSERT(!req->rq_cli_ctx); + LASSERT(imp); + + rc = import_sec_validate_get(imp, &sec); + if (rc) + return rc; + + req->rq_cli_ctx = get_my_ctx(sec); + + sptlrpc_sec_put(sec); + + if (!req->rq_cli_ctx) { + CERROR("req %p: fail to get context\n", req); + return -ENOMEM; + } + + return 0; +} + +/** + * Drop the context for \a req. + * \pre req->rq_cli_ctx != NULL. + * \post req->rq_cli_ctx == NULL. + * + * If \a sync == 0, this function should return quickly without sleep; + * otherwise it might trigger and wait for the whole process of sending + * an context-destroying rpc to server. + */ +void sptlrpc_req_put_ctx(struct ptlrpc_request *req, int sync) +{ + LASSERT(req); + LASSERT(req->rq_cli_ctx); + + /* request might be asked to release earlier while still + * in the context waiting list. 
+ */ + if (!list_empty(&req->rq_ctx_chain)) { + spin_lock(&req->rq_cli_ctx->cc_lock); + list_del_init(&req->rq_ctx_chain); + spin_unlock(&req->rq_cli_ctx->cc_lock); + } + + sptlrpc_cli_ctx_put(req->rq_cli_ctx, sync); + req->rq_cli_ctx = NULL; +} + +static +int sptlrpc_req_ctx_switch(struct ptlrpc_request *req, + struct ptlrpc_cli_ctx *oldctx, + struct ptlrpc_cli_ctx *newctx) +{ + struct sptlrpc_flavor old_flvr; + char *reqmsg = NULL; /* to workaround old gcc */ + int reqmsg_size; + int rc = 0; + + LASSERT(req->rq_reqmsg); + LASSERT(req->rq_reqlen); + LASSERT(req->rq_replen); + + CDEBUG(D_SEC, "req %p: switch ctx %p(%u->%s) -> %p(%u->%s), switch sec %p(%s) -> %p(%s)\n", + req, + oldctx, oldctx->cc_vcred.vc_uid, sec2target_str(oldctx->cc_sec), + newctx, newctx->cc_vcred.vc_uid, sec2target_str(newctx->cc_sec), + oldctx->cc_sec, oldctx->cc_sec->ps_policy->sp_name, + newctx->cc_sec, newctx->cc_sec->ps_policy->sp_name); + + /* save flavor */ + old_flvr = req->rq_flvr; + + /* save request message */ + reqmsg_size = req->rq_reqlen; + if (reqmsg_size != 0) { + OBD_ALLOC_LARGE(reqmsg, reqmsg_size); + if (reqmsg == NULL) + return -ENOMEM; + memcpy(reqmsg, req->rq_reqmsg, reqmsg_size); + } + + /* release old req/rep buf */ + req->rq_cli_ctx = oldctx; + sptlrpc_cli_free_reqbuf(req); + sptlrpc_cli_free_repbuf(req); + req->rq_cli_ctx = newctx; + + /* recalculate the flavor */ + sptlrpc_req_set_flavor(req, 0); + + /* alloc new request buffer + * we don't need to alloc reply buffer here, leave it to the + * rest procedure of ptlrpc */ + if (reqmsg_size != 0) { + rc = sptlrpc_cli_alloc_reqbuf(req, reqmsg_size); + if (!rc) { + LASSERT(req->rq_reqmsg); + memcpy(req->rq_reqmsg, reqmsg, reqmsg_size); + } else { + CWARN("failed to alloc reqbuf: %d\n", rc); + req->rq_flvr = old_flvr; + } + + OBD_FREE_LARGE(reqmsg, reqmsg_size); + } + return rc; +} + +/** + * If current context of \a req is dead somehow, e.g. we just switched flavor + * thus marked original contexts dead, we'll find a new context for it. if + * no switch is needed, \a req will end up with the same context. + * + * \note a request must have a context, to keep other parts of code happy. + * In any case of failure during the switching, we must restore the old one. + */ +int sptlrpc_req_replace_dead_ctx(struct ptlrpc_request *req) +{ + struct ptlrpc_cli_ctx *oldctx = req->rq_cli_ctx; + struct ptlrpc_cli_ctx *newctx; + int rc; + + LASSERT(oldctx); + + sptlrpc_cli_ctx_get(oldctx); + sptlrpc_req_put_ctx(req, 0); + + rc = sptlrpc_req_get_ctx(req); + if (unlikely(rc)) { + LASSERT(!req->rq_cli_ctx); + + /* restore old ctx */ + req->rq_cli_ctx = oldctx; + return rc; + } + + newctx = req->rq_cli_ctx; + LASSERT(newctx); + + if (unlikely(newctx == oldctx && + test_bit(PTLRPC_CTX_DEAD_BIT, &oldctx->cc_flags))) { + /* + * still get the old dead ctx, usually means system too busy + */ + CDEBUG(D_SEC, + "ctx (%p, fl %lx) doesn't switch, relax a little bit\n", + newctx, newctx->cc_flags); + + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ); + } else { + /* + * it's possible newctx == oldctx if we're switching + * subflavor with the same sec. 
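+ * (sptlrpc_req_ctx_switch() below saves the packed request message,
+ * frees the old req/rep buffers, recomputes the flavor and
+ * re-allocates the request buffer under the new context, restoring
+ * the old flavor if that allocation fails.)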
+ */ + rc = sptlrpc_req_ctx_switch(req, oldctx, newctx); + if (rc) { + /* restore old ctx */ + sptlrpc_req_put_ctx(req, 0); + req->rq_cli_ctx = oldctx; + return rc; + } + + LASSERT(req->rq_cli_ctx == newctx); + } + + sptlrpc_cli_ctx_put(oldctx, 1); + return 0; +} +EXPORT_SYMBOL(sptlrpc_req_replace_dead_ctx); + +static +int ctx_check_refresh(struct ptlrpc_cli_ctx *ctx) +{ + if (cli_ctx_is_refreshed(ctx)) + return 1; + return 0; +} + +static +int ctx_refresh_timeout(void *data) +{ + struct ptlrpc_request *req = data; + int rc; + + /* conn_cnt is needed in expire_one_request */ + lustre_msg_set_conn_cnt(req->rq_reqmsg, req->rq_import->imp_conn_cnt); + + rc = ptlrpc_expire_one_request(req, 1); + /* if we started recovery, we should mark this ctx dead; otherwise + * in case of lgssd died nobody would retire this ctx, following + * connecting will still find the same ctx thus cause deadlock. + * there's an assumption that expire time of the request should be + * later than the context refresh expire time. + */ + if (rc == 0) + req->rq_cli_ctx->cc_ops->force_die(req->rq_cli_ctx, 0); + return rc; +} + +static +void ctx_refresh_interrupt(void *data) +{ + struct ptlrpc_request *req = data; + + spin_lock(&req->rq_lock); + req->rq_intr = 1; + spin_unlock(&req->rq_lock); +} + +static +void req_off_ctx_list(struct ptlrpc_request *req, struct ptlrpc_cli_ctx *ctx) +{ + spin_lock(&ctx->cc_lock); + if (!list_empty(&req->rq_ctx_chain)) + list_del_init(&req->rq_ctx_chain); + spin_unlock(&ctx->cc_lock); +} + +/** + * To refresh the context of \req, if it's not up-to-date. + * \param timeout + * - < 0: don't wait + * - = 0: wait until success or fatal error occur + * - > 0: timeout value (in seconds) + * + * The status of the context could be subject to be changed by other threads + * at any time. We allow this race, but once we return with 0, the caller will + * suppose it's uptodated and keep using it until the owning rpc is done. + * + * \retval 0 only if the context is uptodated. + * \retval -ev error number. + */ +int sptlrpc_req_refresh_ctx(struct ptlrpc_request *req, long timeout) +{ + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + struct ptlrpc_sec *sec; + struct l_wait_info lwi; + int rc; + + LASSERT(ctx); + + if (req->rq_ctx_init || req->rq_ctx_fini) + return 0; + + /* + * during the process a request's context might change type even + * (e.g. from gss ctx to null ctx), so each loop we need to re-check + * everything + */ +again: + rc = import_sec_validate_get(req->rq_import, &sec); + if (rc) + return rc; + + if (sec->ps_flvr.sf_rpc != req->rq_flvr.sf_rpc) { + CDEBUG(D_SEC, "req %p: flavor has changed %x -> %x\n", + req, req->rq_flvr.sf_rpc, sec->ps_flvr.sf_rpc); + req_off_ctx_list(req, ctx); + sptlrpc_req_replace_dead_ctx(req); + ctx = req->rq_cli_ctx; + } + sptlrpc_sec_put(sec); + + if (cli_ctx_is_eternal(ctx)) + return 0; + + if (unlikely(test_bit(PTLRPC_CTX_NEW_BIT, &ctx->cc_flags))) { + LASSERT(ctx->cc_ops->refresh); + ctx->cc_ops->refresh(ctx); + } + LASSERT(test_bit(PTLRPC_CTX_NEW_BIT, &ctx->cc_flags) == 0); + + LASSERT(ctx->cc_ops->validate); + if (ctx->cc_ops->validate(ctx) == 0) { + req_off_ctx_list(req, ctx); + return 0; + } + + if (unlikely(test_bit(PTLRPC_CTX_ERROR_BIT, &ctx->cc_flags))) { + spin_lock(&req->rq_lock); + req->rq_err = 1; + spin_unlock(&req->rq_lock); + req_off_ctx_list(req, ctx); + return -EPERM; + } + + /* + * There's a subtle issue for resending RPCs, suppose following + * situation: + * 1. the request was sent to server. + * 2. 
recovery was kicked start, after finished the request was + * marked as resent. + * 3. resend the request. + * 4. old reply from server received, we accept and verify the reply. + * this has to be success, otherwise the error will be aware + * by application. + * 5. new reply from server received, dropped by LNet. + * + * Note the xid of old & new request is the same. We can't simply + * change xid for the resent request because the server replies on + * it for reply reconstruction. + * + * Commonly the original context should be uptodate because we + * have a expiry nice time; server will keep its context because + * we at least hold a ref of old context which prevent context + * destroying RPC being sent. So server still can accept the request + * and finish the RPC. But if that's not the case: + * 1. If server side context has been trimmed, a NO_CONTEXT will + * be returned, gss_cli_ctx_verify/unseal will switch to new + * context by force. + * 2. Current context never be refreshed, then we are fine: we + * never really send request with old context before. + */ + if (test_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags) && + unlikely(req->rq_reqmsg) && + lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) { + req_off_ctx_list(req, ctx); + return 0; + } + + if (unlikely(test_bit(PTLRPC_CTX_DEAD_BIT, &ctx->cc_flags))) { + req_off_ctx_list(req, ctx); + /* + * don't switch ctx if import was deactivated + */ + if (req->rq_import->imp_deactive) { + spin_lock(&req->rq_lock); + req->rq_err = 1; + spin_unlock(&req->rq_lock); + return -EINTR; + } + + rc = sptlrpc_req_replace_dead_ctx(req); + if (rc) { + LASSERT(ctx == req->rq_cli_ctx); + CERROR("req %p: failed to replace dead ctx %p: %d\n", + req, ctx, rc); + spin_lock(&req->rq_lock); + req->rq_err = 1; + spin_unlock(&req->rq_lock); + return rc; + } + + ctx = req->rq_cli_ctx; + goto again; + } + + /* + * Now we're sure this context is during upcall, add myself into + * waiting list + */ + spin_lock(&ctx->cc_lock); + if (list_empty(&req->rq_ctx_chain)) + list_add(&req->rq_ctx_chain, &ctx->cc_req_list); + spin_unlock(&ctx->cc_lock); + + if (timeout < 0) + return -EWOULDBLOCK; + + /* Clear any flags that may be present from previous sends */ + LASSERT(req->rq_receiving_reply == 0); + spin_lock(&req->rq_lock); + req->rq_err = 0; + req->rq_timedout = 0; + req->rq_resend = 0; + req->rq_restart = 0; + spin_unlock(&req->rq_lock); + + lwi = LWI_TIMEOUT_INTR(timeout * HZ, ctx_refresh_timeout, + ctx_refresh_interrupt, req); + rc = l_wait_event(req->rq_reply_waitq, ctx_check_refresh(ctx), &lwi); + + /* + * following cases could lead us here: + * - successfully refreshed; + * - interrupted; + * - timedout, and we don't want recover from the failure; + * - timedout, and waked up upon recovery finished; + * - someone else mark this ctx dead by force; + * - someone invalidate the req and call ptlrpc_client_wake_req(), + * e.g. ptlrpc_abort_inflight(); + */ + if (!cli_ctx_is_refreshed(ctx)) { + /* timed out or interrupted */ + req_off_ctx_list(req, ctx); + + LASSERT(rc != 0); + return rc; + } + + goto again; +} + +/** + * Initialize flavor settings for \a req, according to \a opcode. 
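+ * For example (illustrative), an OST_WRITE request gets rq_bulk_write
+ * set here and, if the selected flavor supports bulk protection, also
+ * rq_pack_bulk.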
+ * + * \note this could be called in two situations: + * - new request from ptlrpc_pre_req(), with proper @opcode + * - old request which changed ctx in the middle, with @opcode == 0 + */ +void sptlrpc_req_set_flavor(struct ptlrpc_request *req, int opcode) +{ + struct ptlrpc_sec *sec; + + LASSERT(req->rq_import); + LASSERT(req->rq_cli_ctx); + LASSERT(req->rq_cli_ctx->cc_sec); + LASSERT(req->rq_bulk_read == 0 || req->rq_bulk_write == 0); + + /* special security flags according to opcode */ + switch (opcode) { + case OST_READ: + case MDS_READPAGE: + case MGS_CONFIG_READ: + case OBD_IDX_READ: + req->rq_bulk_read = 1; + break; + case OST_WRITE: + case MDS_WRITEPAGE: + req->rq_bulk_write = 1; + break; + case SEC_CTX_INIT: + req->rq_ctx_init = 1; + break; + case SEC_CTX_FINI: + req->rq_ctx_fini = 1; + break; + case 0: + /* init/fini rpc won't be resend, so can't be here */ + LASSERT(req->rq_ctx_init == 0); + LASSERT(req->rq_ctx_fini == 0); + + /* cleanup flags, which should be recalculated */ + req->rq_pack_udesc = 0; + req->rq_pack_bulk = 0; + break; + } + + sec = req->rq_cli_ctx->cc_sec; + + spin_lock(&sec->ps_lock); + req->rq_flvr = sec->ps_flvr; + spin_unlock(&sec->ps_lock); + + /* force SVC_NULL for context initiation rpc, SVC_INTG for context + * destruction rpc */ + if (unlikely(req->rq_ctx_init)) + flvr_set_svc(&req->rq_flvr.sf_rpc, SPTLRPC_SVC_NULL); + else if (unlikely(req->rq_ctx_fini)) + flvr_set_svc(&req->rq_flvr.sf_rpc, SPTLRPC_SVC_INTG); + + /* user descriptor flag, null security can't do it anyway */ + if ((sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_UDESC) && + (req->rq_flvr.sf_rpc != SPTLRPC_FLVR_NULL)) + req->rq_pack_udesc = 1; + + /* bulk security flag */ + if ((req->rq_bulk_read || req->rq_bulk_write) && + sptlrpc_flavor_has_bulk(&req->rq_flvr)) + req->rq_pack_bulk = 1; +} + +void sptlrpc_request_out_callback(struct ptlrpc_request *req) +{ + if (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc) != SPTLRPC_SVC_PRIV) + return; + + LASSERT(req->rq_clrbuf); + if (req->rq_pool || !req->rq_reqbuf) + return; + + OBD_FREE(req->rq_reqbuf, req->rq_reqbuf_len); + req->rq_reqbuf = NULL; + req->rq_reqbuf_len = 0; +} + +/** + * Given an import \a imp, check whether current user has a valid context + * or not. We may create a new context and try to refresh it, and try + * repeatedly try in case of non-fatal errors. Return 0 means success. + */ +int sptlrpc_import_check_ctx(struct obd_import *imp) +{ + struct ptlrpc_sec *sec; + struct ptlrpc_cli_ctx *ctx; + struct ptlrpc_request *req = NULL; + int rc; + + might_sleep(); + + sec = sptlrpc_import_sec_ref(imp); + ctx = get_my_ctx(sec); + sptlrpc_sec_put(sec); + + if (!ctx) + return -ENOMEM; + + if (cli_ctx_is_eternal(ctx) || + ctx->cc_ops->validate(ctx) == 0) { + sptlrpc_cli_ctx_put(ctx, 1); + return 0; + } + + if (cli_ctx_is_error(ctx)) { + sptlrpc_cli_ctx_put(ctx, 1); + return -EACCES; + } + + req = ptlrpc_request_cache_alloc(GFP_NOFS); + if (!req) + return -ENOMEM; + + spin_lock_init(&req->rq_lock); + atomic_set(&req->rq_refcount, 10000); + INIT_LIST_HEAD(&req->rq_ctx_chain); + init_waitqueue_head(&req->rq_reply_waitq); + init_waitqueue_head(&req->rq_set_waitq); + req->rq_import = imp; + req->rq_flvr = sec->ps_flvr; + req->rq_cli_ctx = ctx; + + rc = sptlrpc_req_refresh_ctx(req, 0); + LASSERT(list_empty(&req->rq_ctx_chain)); + sptlrpc_cli_ctx_put(req->rq_cli_ctx, 1); + ptlrpc_request_cache_free(req); + + return rc; +} + +/** + * Used by ptlrpc client, to perform the pre-defined security transformation + * upon the request message of \a req. 
After this function called, + * req->rq_reqmsg is still accessible as clear text. + */ +int sptlrpc_cli_wrap_request(struct ptlrpc_request *req) +{ + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + int rc = 0; + + LASSERT(ctx); + LASSERT(ctx->cc_sec); + LASSERT(req->rq_reqbuf || req->rq_clrbuf); + + /* we wrap bulk request here because now we can be sure + * the context is uptodate. + */ + if (req->rq_bulk) { + rc = sptlrpc_cli_wrap_bulk(req, req->rq_bulk); + if (rc) + return rc; + } + + switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) { + case SPTLRPC_SVC_NULL: + case SPTLRPC_SVC_AUTH: + case SPTLRPC_SVC_INTG: + LASSERT(ctx->cc_ops->sign); + rc = ctx->cc_ops->sign(ctx, req); + break; + case SPTLRPC_SVC_PRIV: + LASSERT(ctx->cc_ops->seal); + rc = ctx->cc_ops->seal(ctx, req); + break; + default: + LBUG(); + } + + if (rc == 0) { + LASSERT(req->rq_reqdata_len); + LASSERT(req->rq_reqdata_len % 8 == 0); + LASSERT(req->rq_reqdata_len <= req->rq_reqbuf_len); + } + + return rc; +} + +static int do_cli_unwrap_reply(struct ptlrpc_request *req) +{ + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + int rc; + + LASSERT(ctx); + LASSERT(ctx->cc_sec); + LASSERT(req->rq_repbuf); + LASSERT(req->rq_repdata); + LASSERT(req->rq_repmsg == NULL); + + req->rq_rep_swab_mask = 0; + + rc = __lustre_unpack_msg(req->rq_repdata, req->rq_repdata_len); + switch (rc) { + case 1: + lustre_set_rep_swabbed(req, MSG_PTLRPC_HEADER_OFF); + case 0: + break; + default: + CERROR("failed unpack reply: x%llu\n", req->rq_xid); + return -EPROTO; + } + + if (req->rq_repdata_len < sizeof(struct lustre_msg)) { + CERROR("replied data length %d too small\n", + req->rq_repdata_len); + return -EPROTO; + } + + if (SPTLRPC_FLVR_POLICY(req->rq_repdata->lm_secflvr) != + SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc)) { + CERROR("reply policy %u doesn't match request policy %u\n", + SPTLRPC_FLVR_POLICY(req->rq_repdata->lm_secflvr), + SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc)); + return -EPROTO; + } + + switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) { + case SPTLRPC_SVC_NULL: + case SPTLRPC_SVC_AUTH: + case SPTLRPC_SVC_INTG: + LASSERT(ctx->cc_ops->verify); + rc = ctx->cc_ops->verify(ctx, req); + break; + case SPTLRPC_SVC_PRIV: + LASSERT(ctx->cc_ops->unseal); + rc = ctx->cc_ops->unseal(ctx, req); + break; + default: + LBUG(); + } + LASSERT(rc || req->rq_repmsg || req->rq_resend); + + if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL && + !req->rq_ctx_init) + req->rq_rep_swab_mask = 0; + return rc; +} + +/** + * Used by ptlrpc client, to perform security transformation upon the reply + * message of \a req. After return successfully, req->rq_repmsg points to + * the reply message in clear text. + * + * \pre the reply buffer should have been un-posted from LNet, so nothing is + * going to change. 
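+ *
+ * Rough buffer layout handled here (illustrative sketch):
+ *
+ *     rq_repbuf
+ *     |<--- rq_reply_off --->| rq_repdata (rq_nob_received bytes)
+ *
+ * rq_repdata is then passed to do_cli_unwrap_reply() for verification
+ * or unsealing, which finally exposes rq_repmsg in clear text.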
+ */ +int sptlrpc_cli_unwrap_reply(struct ptlrpc_request *req) +{ + LASSERT(req->rq_repbuf); + LASSERT(req->rq_repdata == NULL); + LASSERT(req->rq_repmsg == NULL); + LASSERT(req->rq_reply_off + req->rq_nob_received <= req->rq_repbuf_len); + + if (req->rq_reply_off == 0 && + (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)) { + CERROR("real reply with offset 0\n"); + return -EPROTO; + } + + if (req->rq_reply_off % 8 != 0) { + CERROR("reply at odd offset %u\n", req->rq_reply_off); + return -EPROTO; + } + + req->rq_repdata = (struct lustre_msg *) + (req->rq_repbuf + req->rq_reply_off); + req->rq_repdata_len = req->rq_nob_received; + + return do_cli_unwrap_reply(req); +} + +/** + * Used by ptlrpc client, to perform security transformation upon the early + * reply message of \a req. We expect the rq_reply_off is 0, and + * rq_nob_received is the early reply size. + * + * Because the receive buffer might be still posted, the reply data might be + * changed at any time, no matter we're holding rq_lock or not. For this reason + * we allocate a separate ptlrpc_request and reply buffer for early reply + * processing. + * + * \retval 0 success, \a req_ret is filled with a duplicated ptlrpc_request. + * Later the caller must call sptlrpc_cli_finish_early_reply() on the returned + * \a *req_ret to release it. + * \retval -ev error number, and \a req_ret will not be set. + */ +int sptlrpc_cli_unwrap_early_reply(struct ptlrpc_request *req, + struct ptlrpc_request **req_ret) +{ + struct ptlrpc_request *early_req; + char *early_buf; + int early_bufsz, early_size; + int rc; + + early_req = ptlrpc_request_cache_alloc(GFP_NOFS); + if (early_req == NULL) + return -ENOMEM; + + early_size = req->rq_nob_received; + early_bufsz = size_roundup_power2(early_size); + OBD_ALLOC_LARGE(early_buf, early_bufsz); + if (early_buf == NULL) { + rc = -ENOMEM; + goto err_req; + } + + /* sanity checkings and copy data out, do it inside spinlock */ + spin_lock(&req->rq_lock); + + if (req->rq_replied) { + spin_unlock(&req->rq_lock); + rc = -EALREADY; + goto err_buf; + } + + LASSERT(req->rq_repbuf); + LASSERT(req->rq_repdata == NULL); + LASSERT(req->rq_repmsg == NULL); + + if (req->rq_reply_off != 0) { + CERROR("early reply with offset %u\n", req->rq_reply_off); + spin_unlock(&req->rq_lock); + rc = -EPROTO; + goto err_buf; + } + + if (req->rq_nob_received != early_size) { + /* even another early arrived the size should be the same */ + CERROR("data size has changed from %u to %u\n", + early_size, req->rq_nob_received); + spin_unlock(&req->rq_lock); + rc = -EINVAL; + goto err_buf; + } + + if (req->rq_nob_received < sizeof(struct lustre_msg)) { + CERROR("early reply length %d too small\n", + req->rq_nob_received); + spin_unlock(&req->rq_lock); + rc = -EALREADY; + goto err_buf; + } + + memcpy(early_buf, req->rq_repbuf, early_size); + spin_unlock(&req->rq_lock); + + spin_lock_init(&early_req->rq_lock); + early_req->rq_cli_ctx = sptlrpc_cli_ctx_get(req->rq_cli_ctx); + early_req->rq_flvr = req->rq_flvr; + early_req->rq_repbuf = early_buf; + early_req->rq_repbuf_len = early_bufsz; + early_req->rq_repdata = (struct lustre_msg *) early_buf; + early_req->rq_repdata_len = early_size; + early_req->rq_early = 1; + early_req->rq_reqmsg = req->rq_reqmsg; + + rc = do_cli_unwrap_reply(early_req); + if (rc) { + DEBUG_REQ(D_ADAPTTO, early_req, + "error %d unwrap early reply", rc); + goto err_ctx; + } + + LASSERT(early_req->rq_repmsg); + *req_ret = early_req; + return 0; + +err_ctx: + sptlrpc_cli_ctx_put(early_req->rq_cli_ctx, 1); 
+err_buf: + OBD_FREE_LARGE(early_buf, early_bufsz); +err_req: + ptlrpc_request_cache_free(early_req); + return rc; +} + +/** + * Used by ptlrpc client, to release a processed early reply \a early_req. + * + * \pre \a early_req was obtained from calling sptlrpc_cli_unwrap_early_reply(). + */ +void sptlrpc_cli_finish_early_reply(struct ptlrpc_request *early_req) +{ + LASSERT(early_req->rq_repbuf); + LASSERT(early_req->rq_repdata); + LASSERT(early_req->rq_repmsg); + + sptlrpc_cli_ctx_put(early_req->rq_cli_ctx, 1); + OBD_FREE_LARGE(early_req->rq_repbuf, early_req->rq_repbuf_len); + ptlrpc_request_cache_free(early_req); +} + +/************************************************** + * sec ID * + **************************************************/ + +/* + * "fixed" sec (e.g. null) use sec_id < 0 + */ +static atomic_t sptlrpc_sec_id = ATOMIC_INIT(1); + +int sptlrpc_get_next_secid(void) +{ + return atomic_inc_return(&sptlrpc_sec_id); +} +EXPORT_SYMBOL(sptlrpc_get_next_secid); + +/************************************************** + * client side high-level security APIs * + **************************************************/ + +static int sec_cop_flush_ctx_cache(struct ptlrpc_sec *sec, uid_t uid, + int grace, int force) +{ + struct ptlrpc_sec_policy *policy = sec->ps_policy; + + LASSERT(policy->sp_cops); + LASSERT(policy->sp_cops->flush_ctx_cache); + + return policy->sp_cops->flush_ctx_cache(sec, uid, grace, force); +} + +static void sec_cop_destroy_sec(struct ptlrpc_sec *sec) +{ + struct ptlrpc_sec_policy *policy = sec->ps_policy; + + LASSERT_ATOMIC_ZERO(&sec->ps_refcount); + LASSERT_ATOMIC_ZERO(&sec->ps_nctx); + LASSERT(policy->sp_cops->destroy_sec); + + CDEBUG(D_SEC, "%s@%p: being destroyed\n", sec->ps_policy->sp_name, sec); + + policy->sp_cops->destroy_sec(sec); + sptlrpc_policy_put(policy); +} + +void sptlrpc_sec_destroy(struct ptlrpc_sec *sec) +{ + sec_cop_destroy_sec(sec); +} +EXPORT_SYMBOL(sptlrpc_sec_destroy); + +static void sptlrpc_sec_kill(struct ptlrpc_sec *sec) +{ + LASSERT_ATOMIC_POS(&sec->ps_refcount); + + if (sec->ps_policy->sp_cops->kill_sec) { + sec->ps_policy->sp_cops->kill_sec(sec); + + sec_cop_flush_ctx_cache(sec, -1, 1, 1); + } +} + +struct ptlrpc_sec *sptlrpc_sec_get(struct ptlrpc_sec *sec) +{ + if (sec) + atomic_inc(&sec->ps_refcount); + + return sec; +} +EXPORT_SYMBOL(sptlrpc_sec_get); + +void sptlrpc_sec_put(struct ptlrpc_sec *sec) +{ + if (sec) { + LASSERT_ATOMIC_POS(&sec->ps_refcount); + + if (atomic_dec_and_test(&sec->ps_refcount)) { + sptlrpc_gc_del_sec(sec); + sec_cop_destroy_sec(sec); + } + } +} +EXPORT_SYMBOL(sptlrpc_sec_put); + +/* + * policy module is responsible for taking reference of import + */ +static +struct ptlrpc_sec *sptlrpc_sec_create(struct obd_import *imp, + struct ptlrpc_svc_ctx *svc_ctx, + struct sptlrpc_flavor *sf, + enum lustre_sec_part sp) +{ + struct ptlrpc_sec_policy *policy; + struct ptlrpc_sec *sec; + char str[32]; + + if (svc_ctx) { + LASSERT(imp->imp_dlm_fake == 1); + + CDEBUG(D_SEC, "%s %s: reverse sec using flavor %s\n", + imp->imp_obd->obd_type->typ_name, + imp->imp_obd->obd_name, + sptlrpc_flavor2name(sf, str, sizeof(str))); + + policy = sptlrpc_policy_get(svc_ctx->sc_policy); + sf->sf_flags |= PTLRPC_SEC_FL_REVERSE | PTLRPC_SEC_FL_ROOTONLY; + } else { + LASSERT(imp->imp_dlm_fake == 0); + + CDEBUG(D_SEC, "%s %s: select security flavor %s\n", + imp->imp_obd->obd_type->typ_name, + imp->imp_obd->obd_name, + sptlrpc_flavor2name(sf, str, sizeof(str))); + + policy = sptlrpc_wireflavor2policy(sf->sf_rpc); + if (!policy) { + CERROR("invalid flavor 
0x%x\n", sf->sf_rpc); + return NULL; + } + } + + sec = policy->sp_cops->create_sec(imp, svc_ctx, sf); + if (sec) { + atomic_inc(&sec->ps_refcount); + + sec->ps_part = sp; + + if (sec->ps_gc_interval && policy->sp_cops->gc_ctx) + sptlrpc_gc_add_sec(sec); + } else { + sptlrpc_policy_put(policy); + } + + return sec; +} + +struct ptlrpc_sec *sptlrpc_import_sec_ref(struct obd_import *imp) +{ + struct ptlrpc_sec *sec; + + spin_lock(&imp->imp_lock); + sec = sptlrpc_sec_get(imp->imp_sec); + spin_unlock(&imp->imp_lock); + + return sec; +} +EXPORT_SYMBOL(sptlrpc_import_sec_ref); + +static void sptlrpc_import_sec_install(struct obd_import *imp, + struct ptlrpc_sec *sec) +{ + struct ptlrpc_sec *old_sec; + + LASSERT_ATOMIC_POS(&sec->ps_refcount); + + spin_lock(&imp->imp_lock); + old_sec = imp->imp_sec; + imp->imp_sec = sec; + spin_unlock(&imp->imp_lock); + + if (old_sec) { + sptlrpc_sec_kill(old_sec); + + /* balance the ref taken by this import */ + sptlrpc_sec_put(old_sec); + } +} + +static inline +int flavor_equal(struct sptlrpc_flavor *sf1, struct sptlrpc_flavor *sf2) +{ + return (memcmp(sf1, sf2, sizeof(*sf1)) == 0); +} + +static inline +void flavor_copy(struct sptlrpc_flavor *dst, struct sptlrpc_flavor *src) +{ + *dst = *src; +} + +static void sptlrpc_import_sec_adapt_inplace(struct obd_import *imp, + struct ptlrpc_sec *sec, + struct sptlrpc_flavor *sf) +{ + char str1[32], str2[32]; + + if (sec->ps_flvr.sf_flags != sf->sf_flags) + CDEBUG(D_SEC, "changing sec flags: %s -> %s\n", + sptlrpc_secflags2str(sec->ps_flvr.sf_flags, + str1, sizeof(str1)), + sptlrpc_secflags2str(sf->sf_flags, + str2, sizeof(str2))); + + spin_lock(&sec->ps_lock); + flavor_copy(&sec->ps_flvr, sf); + spin_unlock(&sec->ps_lock); +} + +/** + * To get an appropriate ptlrpc_sec for the \a imp, according to the current + * configuration. Upon called, imp->imp_sec may or may not be NULL. + * + * - regular import: \a svc_ctx should be NULL and \a flvr is ignored; + * - reverse import: \a svc_ctx and \a flvr are obtained from incoming request. + */ +int sptlrpc_import_sec_adapt(struct obd_import *imp, + struct ptlrpc_svc_ctx *svc_ctx, + struct sptlrpc_flavor *flvr) +{ + struct ptlrpc_connection *conn; + struct sptlrpc_flavor sf; + struct ptlrpc_sec *sec, *newsec; + enum lustre_sec_part sp; + char str[24]; + int rc = 0; + + might_sleep(); + + if (imp == NULL) + return 0; + + conn = imp->imp_connection; + + if (svc_ctx == NULL) { + struct client_obd *cliobd = &imp->imp_obd->u.cli; + /* + * normal import, determine flavor from rule set, except + * for mgc the flavor is predetermined. 
+ */ + if (cliobd->cl_sp_me == LUSTRE_SP_MGC) + sf = cliobd->cl_flvr_mgc; + else + sptlrpc_conf_choose_flavor(cliobd->cl_sp_me, + cliobd->cl_sp_to, + &cliobd->cl_target_uuid, + conn->c_self, &sf); + + sp = imp->imp_obd->u.cli.cl_sp_me; + } else { + /* reverse import, determine flavor from incoming request */ + sf = *flvr; + + if (sf.sf_rpc != SPTLRPC_FLVR_NULL) + sf.sf_flags = PTLRPC_SEC_FL_REVERSE | + PTLRPC_SEC_FL_ROOTONLY; + + sp = sptlrpc_target_sec_part(imp->imp_obd); + } + + sec = sptlrpc_import_sec_ref(imp); + if (sec) { + char str2[24]; + + if (flavor_equal(&sf, &sec->ps_flvr)) + goto out; + + CDEBUG(D_SEC, "import %s->%s: changing flavor %s -> %s\n", + imp->imp_obd->obd_name, + obd_uuid2str(&conn->c_remote_uuid), + sptlrpc_flavor2name(&sec->ps_flvr, str, sizeof(str)), + sptlrpc_flavor2name(&sf, str2, sizeof(str2))); + + if (SPTLRPC_FLVR_POLICY(sf.sf_rpc) == + SPTLRPC_FLVR_POLICY(sec->ps_flvr.sf_rpc) && + SPTLRPC_FLVR_MECH(sf.sf_rpc) == + SPTLRPC_FLVR_MECH(sec->ps_flvr.sf_rpc)) { + sptlrpc_import_sec_adapt_inplace(imp, sec, &sf); + goto out; + } + } else if (SPTLRPC_FLVR_BASE(sf.sf_rpc) != + SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_NULL)) { + CDEBUG(D_SEC, "import %s->%s netid %x: select flavor %s\n", + imp->imp_obd->obd_name, + obd_uuid2str(&conn->c_remote_uuid), + LNET_NIDNET(conn->c_self), + sptlrpc_flavor2name(&sf, str, sizeof(str))); + } + + mutex_lock(&imp->imp_sec_mutex); + + newsec = sptlrpc_sec_create(imp, svc_ctx, &sf, sp); + if (newsec) { + sptlrpc_import_sec_install(imp, newsec); + } else { + CERROR("import %s->%s: failed to create new sec\n", + imp->imp_obd->obd_name, + obd_uuid2str(&conn->c_remote_uuid)); + rc = -EPERM; + } + + mutex_unlock(&imp->imp_sec_mutex); +out: + sptlrpc_sec_put(sec); + return rc; +} + +void sptlrpc_import_sec_put(struct obd_import *imp) +{ + if (imp->imp_sec) { + sptlrpc_sec_kill(imp->imp_sec); + + sptlrpc_sec_put(imp->imp_sec); + imp->imp_sec = NULL; + } +} + +static void import_flush_ctx_common(struct obd_import *imp, + uid_t uid, int grace, int force) +{ + struct ptlrpc_sec *sec; + + if (imp == NULL) + return; + + sec = sptlrpc_import_sec_ref(imp); + if (sec == NULL) + return; + + sec_cop_flush_ctx_cache(sec, uid, grace, force); + sptlrpc_sec_put(sec); +} + +void sptlrpc_import_flush_root_ctx(struct obd_import *imp) +{ + /* it's important to use grace mode, see explain in + * sptlrpc_req_refresh_ctx() */ + import_flush_ctx_common(imp, 0, 1, 1); +} + +void sptlrpc_import_flush_my_ctx(struct obd_import *imp) +{ + import_flush_ctx_common(imp, from_kuid(&init_user_ns, current_uid()), + 1, 1); +} +EXPORT_SYMBOL(sptlrpc_import_flush_my_ctx); + +void sptlrpc_import_flush_all_ctx(struct obd_import *imp) +{ + import_flush_ctx_common(imp, -1, 1, 1); +} +EXPORT_SYMBOL(sptlrpc_import_flush_all_ctx); + +/** + * Used by ptlrpc client to allocate request buffer of \a req. Upon return + * successfully, req->rq_reqmsg points to a buffer with size \a msgsize. 
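+ *
+ * Minimal caller sketch (hypothetical, error handling elided):
+ *
+ *     rc = sptlrpc_cli_alloc_reqbuf(req, msgsize);
+ *     if (rc == 0)
+ *             body = lustre_msg_buf(req->rq_reqmsg, 0, 0);
+ *
+ * The backing buffer comes from the policy's alloc_reqbuf hook, so its
+ * physical form (rq_reqbuf vs. rq_clrbuf) is flavor specific.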
+ */ +int sptlrpc_cli_alloc_reqbuf(struct ptlrpc_request *req, int msgsize) +{ + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + struct ptlrpc_sec_policy *policy; + int rc; + + LASSERT(ctx); + LASSERT(ctx->cc_sec); + LASSERT(ctx->cc_sec->ps_policy); + LASSERT(req->rq_reqmsg == NULL); + LASSERT_ATOMIC_POS(&ctx->cc_refcount); + + policy = ctx->cc_sec->ps_policy; + rc = policy->sp_cops->alloc_reqbuf(ctx->cc_sec, req, msgsize); + if (!rc) { + LASSERT(req->rq_reqmsg); + LASSERT(req->rq_reqbuf || req->rq_clrbuf); + + /* zeroing preallocated buffer */ + if (req->rq_pool) + memset(req->rq_reqmsg, 0, msgsize); + } + + return rc; +} + +/** + * Used by ptlrpc client to free request buffer of \a req. After this + * req->rq_reqmsg is set to NULL and should not be accessed anymore. + */ +void sptlrpc_cli_free_reqbuf(struct ptlrpc_request *req) +{ + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + struct ptlrpc_sec_policy *policy; + + LASSERT(ctx); + LASSERT(ctx->cc_sec); + LASSERT(ctx->cc_sec->ps_policy); + LASSERT_ATOMIC_POS(&ctx->cc_refcount); + + if (req->rq_reqbuf == NULL && req->rq_clrbuf == NULL) + return; + + policy = ctx->cc_sec->ps_policy; + policy->sp_cops->free_reqbuf(ctx->cc_sec, req); + req->rq_reqmsg = NULL; +} + +/* + * NOTE caller must guarantee the buffer size is enough for the enlargement + */ +void _sptlrpc_enlarge_msg_inplace(struct lustre_msg *msg, + int segment, int newsize) +{ + void *src, *dst; + int oldsize, oldmsg_size, movesize; + + LASSERT(segment < msg->lm_bufcount); + LASSERT(msg->lm_buflens[segment] <= newsize); + + if (msg->lm_buflens[segment] == newsize) + return; + + /* nothing to do if we are enlarging the last segment */ + if (segment == msg->lm_bufcount - 1) { + msg->lm_buflens[segment] = newsize; + return; + } + + oldsize = msg->lm_buflens[segment]; + + src = lustre_msg_buf(msg, segment + 1, 0); + msg->lm_buflens[segment] = newsize; + dst = lustre_msg_buf(msg, segment + 1, 0); + msg->lm_buflens[segment] = oldsize; + + /* move from segment + 1 to end segment */ + LASSERT(msg->lm_magic == LUSTRE_MSG_MAGIC_V2); + oldmsg_size = lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens); + movesize = oldmsg_size - ((unsigned long) src - (unsigned long) msg); + LASSERT(movesize >= 0); + + if (movesize) + memmove(dst, src, movesize); + + /* note we don't clear the ares where old data live, not secret */ + + /* finally set new segment size */ + msg->lm_buflens[segment] = newsize; +} +EXPORT_SYMBOL(_sptlrpc_enlarge_msg_inplace); + +/** + * Used by ptlrpc client to enlarge the \a segment of request message pointed + * by req->rq_reqmsg to size \a newsize, all previously filled-in data will be + * preserved after the enlargement. this must be called after original request + * buffer being allocated. + * + * \note after this be called, rq_reqmsg and rq_reqlen might have been changed, + * so caller should refresh its local pointers if needed. 
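+ *
+ * Editor's note (sketch, not from the imported source): because the buffer
+ * may be reallocated, cached pointers must be re-read afterwards, e.g.
+ *
+ *     rc = sptlrpc_cli_enlarge_reqbuf(req, segment, newsize);
+ *     if (rc == 0)
+ *             msg = req->rq_reqmsg;    (refresh any cached pointer)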
+ */ +int sptlrpc_cli_enlarge_reqbuf(struct ptlrpc_request *req, + int segment, int newsize) +{ + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + struct ptlrpc_sec_cops *cops; + struct lustre_msg *msg = req->rq_reqmsg; + + LASSERT(ctx); + LASSERT(msg); + LASSERT(msg->lm_bufcount > segment); + LASSERT(msg->lm_buflens[segment] <= newsize); + + if (msg->lm_buflens[segment] == newsize) + return 0; + + cops = ctx->cc_sec->ps_policy->sp_cops; + LASSERT(cops->enlarge_reqbuf); + return cops->enlarge_reqbuf(ctx->cc_sec, req, segment, newsize); +} +EXPORT_SYMBOL(sptlrpc_cli_enlarge_reqbuf); + +/** + * Used by ptlrpc client to allocate reply buffer of \a req. + * + * \note After this, req->rq_repmsg is still not accessible. + */ +int sptlrpc_cli_alloc_repbuf(struct ptlrpc_request *req, int msgsize) +{ + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + struct ptlrpc_sec_policy *policy; + + LASSERT(ctx); + LASSERT(ctx->cc_sec); + LASSERT(ctx->cc_sec->ps_policy); + + if (req->rq_repbuf) + return 0; + + policy = ctx->cc_sec->ps_policy; + return policy->sp_cops->alloc_repbuf(ctx->cc_sec, req, msgsize); +} + +/** + * Used by ptlrpc client to free reply buffer of \a req. After this + * req->rq_repmsg is set to NULL and should not be accessed anymore. + */ +void sptlrpc_cli_free_repbuf(struct ptlrpc_request *req) +{ + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + struct ptlrpc_sec_policy *policy; + + LASSERT(ctx); + LASSERT(ctx->cc_sec); + LASSERT(ctx->cc_sec->ps_policy); + LASSERT_ATOMIC_POS(&ctx->cc_refcount); + + if (req->rq_repbuf == NULL) + return; + LASSERT(req->rq_repbuf_len); + + policy = ctx->cc_sec->ps_policy; + policy->sp_cops->free_repbuf(ctx->cc_sec, req); + req->rq_repmsg = NULL; +} + +int sptlrpc_cli_install_rvs_ctx(struct obd_import *imp, + struct ptlrpc_cli_ctx *ctx) +{ + struct ptlrpc_sec_policy *policy = ctx->cc_sec->ps_policy; + + if (!policy->sp_cops->install_rctx) + return 0; + return policy->sp_cops->install_rctx(imp, ctx->cc_sec, ctx); +} + +int sptlrpc_svc_install_rvs_ctx(struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx) +{ + struct ptlrpc_sec_policy *policy = ctx->sc_policy; + + if (!policy->sp_sops->install_rctx) + return 0; + return policy->sp_sops->install_rctx(imp, ctx); +} + +/**************************************** + * server side security * + ****************************************/ + +static int flavor_allowed(struct sptlrpc_flavor *exp, + struct ptlrpc_request *req) +{ + struct sptlrpc_flavor *flvr = &req->rq_flvr; + + if (exp->sf_rpc == SPTLRPC_FLVR_ANY || exp->sf_rpc == flvr->sf_rpc) + return 1; + + if ((req->rq_ctx_init || req->rq_ctx_fini) && + SPTLRPC_FLVR_POLICY(exp->sf_rpc) == + SPTLRPC_FLVR_POLICY(flvr->sf_rpc) && + SPTLRPC_FLVR_MECH(exp->sf_rpc) == SPTLRPC_FLVR_MECH(flvr->sf_rpc)) + return 1; + + return 0; +} + +#define EXP_FLVR_UPDATE_EXPIRE (OBD_TIMEOUT_DEFAULT + 10) + +/** + * Given an export \a exp, check whether the flavor of incoming \a req + * is allowed by the export \a exp. Main logic is about taking care of + * changing configurations. Return 0 means success. + */ +int sptlrpc_target_export_check(struct obd_export *exp, + struct ptlrpc_request *req) +{ + struct sptlrpc_flavor flavor; + + if (exp == NULL) + return 0; + + /* client side export has no imp_reverse, skip + * FIXME maybe we should check flavor this as well??? 
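+ *
+ * Editor's note (summary, not from the imported source): each export tracks
+ * a current flavor plus two aging ones, exp_flvr_old[0] ("middle") and
+ * exp_flvr_old[1] ("oldest"), each with an expiry stamp.  When a flavor
+ * change is confirmed below the slots rotate roughly as
+ *
+ *     exp_flvr_old[1] = exp_flvr_old[0];
+ *     exp_flvr_old[0] = exp_flvr;      (expires after EXP_FLVR_UPDATE_EXPIRE)
+ *     exp_flvr        = newly accepted flavor;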
*/ + if (exp->exp_imp_reverse == NULL) + return 0; + + /* don't care about ctx fini rpc */ + if (req->rq_ctx_fini) + return 0; + + spin_lock(&exp->exp_lock); + + /* if flavor just changed (exp->exp_flvr_changed != 0), we wait for + * the first req with the new flavor, then treat it as current flavor, + * adapt reverse sec according to it. + * note the first rpc with new flavor might not be with root ctx, in + * which case delay the sec_adapt by leaving exp_flvr_adapt == 1. */ + if (unlikely(exp->exp_flvr_changed) && + flavor_allowed(&exp->exp_flvr_old[1], req)) { + /* make the new flavor as "current", and old ones as + * about-to-expire */ + CDEBUG(D_SEC, "exp %p: just changed: %x->%x\n", exp, + exp->exp_flvr.sf_rpc, exp->exp_flvr_old[1].sf_rpc); + flavor = exp->exp_flvr_old[1]; + exp->exp_flvr_old[1] = exp->exp_flvr_old[0]; + exp->exp_flvr_expire[1] = exp->exp_flvr_expire[0]; + exp->exp_flvr_old[0] = exp->exp_flvr; + exp->exp_flvr_expire[0] = get_seconds() + + EXP_FLVR_UPDATE_EXPIRE; + exp->exp_flvr = flavor; + + /* flavor change finished */ + exp->exp_flvr_changed = 0; + LASSERT(exp->exp_flvr_adapt == 1); + + /* if it's gss, we only interested in root ctx init */ + if (req->rq_auth_gss && + !(req->rq_ctx_init && + (req->rq_auth_usr_root || req->rq_auth_usr_mdt || + req->rq_auth_usr_ost))) { + spin_unlock(&exp->exp_lock); + CDEBUG(D_SEC, "is good but not root(%d:%d:%d:%d:%d)\n", + req->rq_auth_gss, req->rq_ctx_init, + req->rq_auth_usr_root, req->rq_auth_usr_mdt, + req->rq_auth_usr_ost); + return 0; + } + + exp->exp_flvr_adapt = 0; + spin_unlock(&exp->exp_lock); + + return sptlrpc_import_sec_adapt(exp->exp_imp_reverse, + req->rq_svc_ctx, &flavor); + } + + /* if it equals to the current flavor, we accept it, but need to + * dealing with reverse sec/ctx */ + if (likely(flavor_allowed(&exp->exp_flvr, req))) { + /* most cases should return here, we only interested in + * gss root ctx init */ + if (!req->rq_auth_gss || !req->rq_ctx_init || + (!req->rq_auth_usr_root && !req->rq_auth_usr_mdt && + !req->rq_auth_usr_ost)) { + spin_unlock(&exp->exp_lock); + return 0; + } + + /* if flavor just changed, we should not proceed, just leave + * it and current flavor will be discovered and replaced + * shortly, and let _this_ rpc pass through */ + if (exp->exp_flvr_changed) { + LASSERT(exp->exp_flvr_adapt); + spin_unlock(&exp->exp_lock); + return 0; + } + + if (exp->exp_flvr_adapt) { + exp->exp_flvr_adapt = 0; + CDEBUG(D_SEC, "exp %p (%x|%x|%x): do delayed adapt\n", + exp, exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[0].sf_rpc, + exp->exp_flvr_old[1].sf_rpc); + flavor = exp->exp_flvr; + spin_unlock(&exp->exp_lock); + + return sptlrpc_import_sec_adapt(exp->exp_imp_reverse, + req->rq_svc_ctx, + &flavor); + } else { + CDEBUG(D_SEC, "exp %p (%x|%x|%x): is current flavor, install rvs ctx\n", + exp, exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[0].sf_rpc, + exp->exp_flvr_old[1].sf_rpc); + spin_unlock(&exp->exp_lock); + + return sptlrpc_svc_install_rvs_ctx(exp->exp_imp_reverse, + req->rq_svc_ctx); + } + } + + if (exp->exp_flvr_expire[0]) { + if (exp->exp_flvr_expire[0] >= get_seconds()) { + if (flavor_allowed(&exp->exp_flvr_old[0], req)) { + CDEBUG(D_SEC, "exp %p (%x|%x|%x): match the middle one (" CFS_DURATION_T ")\n", exp, + exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[0].sf_rpc, + exp->exp_flvr_old[1].sf_rpc, + exp->exp_flvr_expire[0] - + get_seconds()); + spin_unlock(&exp->exp_lock); + return 0; + } + } else { + CDEBUG(D_SEC, "mark middle expired\n"); + exp->exp_flvr_expire[0] = 0; + } + CDEBUG(D_SEC, "exp %p 
(%x|%x|%x): %x not match middle\n", exp, + exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[0].sf_rpc, exp->exp_flvr_old[1].sf_rpc, + req->rq_flvr.sf_rpc); + } + + /* now it doesn't match the current flavor, the only chance we can + * accept it is match the old flavors which is not expired. */ + if (exp->exp_flvr_changed == 0 && exp->exp_flvr_expire[1]) { + if (exp->exp_flvr_expire[1] >= get_seconds()) { + if (flavor_allowed(&exp->exp_flvr_old[1], req)) { + CDEBUG(D_SEC, "exp %p (%x|%x|%x): match the oldest one (" CFS_DURATION_T ")\n", + exp, + exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[0].sf_rpc, + exp->exp_flvr_old[1].sf_rpc, + exp->exp_flvr_expire[1] - + get_seconds()); + spin_unlock(&exp->exp_lock); + return 0; + } + } else { + CDEBUG(D_SEC, "mark oldest expired\n"); + exp->exp_flvr_expire[1] = 0; + } + CDEBUG(D_SEC, "exp %p (%x|%x|%x): %x not match found\n", + exp, exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[0].sf_rpc, exp->exp_flvr_old[1].sf_rpc, + req->rq_flvr.sf_rpc); + } else { + CDEBUG(D_SEC, "exp %p (%x|%x|%x): skip the last one\n", + exp, exp->exp_flvr.sf_rpc, exp->exp_flvr_old[0].sf_rpc, + exp->exp_flvr_old[1].sf_rpc); + } + + spin_unlock(&exp->exp_lock); + + CWARN("exp %p(%s): req %p (%u|%u|%u|%u|%u|%u) with unauthorized flavor %x, expect %x|%x(%+ld)|%x(%+ld)\n", + exp, exp->exp_obd->obd_name, + req, req->rq_auth_gss, req->rq_ctx_init, req->rq_ctx_fini, + req->rq_auth_usr_root, req->rq_auth_usr_mdt, req->rq_auth_usr_ost, + req->rq_flvr.sf_rpc, + exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[0].sf_rpc, + exp->exp_flvr_expire[0] ? + (unsigned long) (exp->exp_flvr_expire[0] - + get_seconds()) : 0, + exp->exp_flvr_old[1].sf_rpc, + exp->exp_flvr_expire[1] ? + (unsigned long) (exp->exp_flvr_expire[1] - + get_seconds()) : 0); + return -EACCES; +} +EXPORT_SYMBOL(sptlrpc_target_export_check); + +void sptlrpc_target_update_exp_flavor(struct obd_device *obd, + struct sptlrpc_rule_set *rset) +{ + struct obd_export *exp; + struct sptlrpc_flavor new_flvr; + + LASSERT(obd); + + spin_lock(&obd->obd_dev_lock); + + list_for_each_entry(exp, &obd->obd_exports, exp_obd_chain) { + if (exp->exp_connection == NULL) + continue; + + /* note if this export had just been updated flavor + * (exp_flvr_changed == 1), this will override the + * previous one. 
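+ *
+ * Editor's note (not from the imported source): the new flavor chosen below
+ * is only staged in exp_flvr_old[1] with exp_flvr_changed/exp_flvr_adapt set;
+ * sptlrpc_target_export_check() above promotes it to exp_flvr when the first
+ * request that already uses the new flavor is accepted.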
*/ + spin_lock(&exp->exp_lock); + sptlrpc_target_choose_flavor(rset, exp->exp_sp_peer, + exp->exp_connection->c_peer.nid, + &new_flvr); + if (exp->exp_flvr_changed || + !flavor_equal(&new_flvr, &exp->exp_flvr)) { + exp->exp_flvr_old[1] = new_flvr; + exp->exp_flvr_expire[1] = 0; + exp->exp_flvr_changed = 1; + exp->exp_flvr_adapt = 1; + + CDEBUG(D_SEC, "exp %p (%s): updated flavor %x->%x\n", + exp, sptlrpc_part2name(exp->exp_sp_peer), + exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[1].sf_rpc); + } + spin_unlock(&exp->exp_lock); + } + + spin_unlock(&obd->obd_dev_lock); +} +EXPORT_SYMBOL(sptlrpc_target_update_exp_flavor); + +static int sptlrpc_svc_check_from(struct ptlrpc_request *req, int svc_rc) +{ + /* peer's claim is unreliable unless gss is being used */ + if (!req->rq_auth_gss || svc_rc == SECSVC_DROP) + return svc_rc; + + switch (req->rq_sp_from) { + case LUSTRE_SP_CLI: + if (req->rq_auth_usr_mdt || req->rq_auth_usr_ost) { + DEBUG_REQ(D_ERROR, req, "faked source CLI"); + svc_rc = SECSVC_DROP; + } + break; + case LUSTRE_SP_MDT: + if (!req->rq_auth_usr_mdt) { + DEBUG_REQ(D_ERROR, req, "faked source MDT"); + svc_rc = SECSVC_DROP; + } + break; + case LUSTRE_SP_OST: + if (!req->rq_auth_usr_ost) { + DEBUG_REQ(D_ERROR, req, "faked source OST"); + svc_rc = SECSVC_DROP; + } + break; + case LUSTRE_SP_MGS: + case LUSTRE_SP_MGC: + if (!req->rq_auth_usr_root && !req->rq_auth_usr_mdt && + !req->rq_auth_usr_ost) { + DEBUG_REQ(D_ERROR, req, "faked source MGC/MGS"); + svc_rc = SECSVC_DROP; + } + break; + case LUSTRE_SP_ANY: + default: + DEBUG_REQ(D_ERROR, req, "invalid source %u", req->rq_sp_from); + svc_rc = SECSVC_DROP; + } + + return svc_rc; +} + +/** + * Used by ptlrpc server, to perform transformation upon request message of + * incoming \a req. This must be the first thing to do with a incoming + * request in ptlrpc layer. + * + * \retval SECSVC_OK success, and req->rq_reqmsg point to request message in + * clear text, size is req->rq_reqlen; also req->rq_svc_ctx is set. + * \retval SECSVC_COMPLETE success, the request has been fully processed, and + * reply message has been prepared. + * \retval SECSVC_DROP failed, this request should be dropped. + */ +int sptlrpc_svc_unwrap_request(struct ptlrpc_request *req) +{ + struct ptlrpc_sec_policy *policy; + struct lustre_msg *msg = req->rq_reqbuf; + int rc; + + LASSERT(msg); + LASSERT(req->rq_reqmsg == NULL); + LASSERT(req->rq_repmsg == NULL); + LASSERT(req->rq_svc_ctx == NULL); + + req->rq_req_swab_mask = 0; + + rc = __lustre_unpack_msg(msg, req->rq_reqdata_len); + switch (rc) { + case 1: + lustre_set_req_swabbed(req, MSG_PTLRPC_HEADER_OFF); + case 0: + break; + default: + CERROR("error unpacking request from %s x%llu\n", + libcfs_id2str(req->rq_peer), req->rq_xid); + return SECSVC_DROP; + } + + req->rq_flvr.sf_rpc = WIRE_FLVR(msg->lm_secflvr); + req->rq_sp_from = LUSTRE_SP_ANY; + req->rq_auth_uid = -1; + req->rq_auth_mapped_uid = -1; + + policy = sptlrpc_wireflavor2policy(req->rq_flvr.sf_rpc); + if (!policy) { + CERROR("unsupported rpc flavor %x\n", req->rq_flvr.sf_rpc); + return SECSVC_DROP; + } + + LASSERT(policy->sp_sops->accept); + rc = policy->sp_sops->accept(req); + sptlrpc_policy_put(policy); + LASSERT(req->rq_reqmsg || rc != SECSVC_OK); + LASSERT(req->rq_svc_ctx || rc == SECSVC_DROP); + + /* + * if it's not null flavor (which means embedded packing msg), + * reset the swab mask for the coming inner msg unpacking. 
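+ *
+ * Editor's note (hypothetical caller sketch, not from the imported source):
+ * the service code invoking this function branches on the documented return
+ * values, e.g.
+ *
+ *     rc = sptlrpc_svc_unwrap_request(req);
+ *     if (rc == SECSVC_DROP)
+ *             ... discard the request ...
+ *     else if (rc == SECSVC_COMPLETE)
+ *             ... send the already prepared reply ...
+ *     else
+ *             ... process req->rq_reqmsg, now in clear text ...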
+ */ + if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL) + req->rq_req_swab_mask = 0; + + /* sanity check for the request source */ + rc = sptlrpc_svc_check_from(req, rc); + return rc; +} + +/** + * Used by ptlrpc server, to allocate reply buffer for \a req. If succeed, + * req->rq_reply_state is set, and req->rq_reply_state->rs_msg point to + * a buffer of \a msglen size. + */ +int sptlrpc_svc_alloc_rs(struct ptlrpc_request *req, int msglen) +{ + struct ptlrpc_sec_policy *policy; + struct ptlrpc_reply_state *rs; + int rc; + + LASSERT(req->rq_svc_ctx); + LASSERT(req->rq_svc_ctx->sc_policy); + + policy = req->rq_svc_ctx->sc_policy; + LASSERT(policy->sp_sops->alloc_rs); + + rc = policy->sp_sops->alloc_rs(req, msglen); + if (unlikely(rc == -ENOMEM)) { + struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt; + if (svcpt->scp_service->srv_max_reply_size < + msglen + sizeof(struct ptlrpc_reply_state)) { + /* Just return failure if the size is too big */ + CERROR("size of message is too big (%zd), %d allowed", + msglen + sizeof(struct ptlrpc_reply_state), + svcpt->scp_service->srv_max_reply_size); + return -ENOMEM; + } + + /* failed alloc, try emergency pool */ + rs = lustre_get_emerg_rs(svcpt); + if (rs == NULL) + return -ENOMEM; + + req->rq_reply_state = rs; + rc = policy->sp_sops->alloc_rs(req, msglen); + if (rc) { + lustre_put_emerg_rs(rs); + req->rq_reply_state = NULL; + } + } + + LASSERT(rc != 0 || + (req->rq_reply_state && req->rq_reply_state->rs_msg)); + + return rc; +} + +/** + * Used by ptlrpc server, to perform transformation upon reply message. + * + * \post req->rq_reply_off is set to appropriate server-controlled reply offset. + * \post req->rq_repmsg and req->rq_reply_state->rs_msg becomes inaccessible. + */ +int sptlrpc_svc_wrap_reply(struct ptlrpc_request *req) +{ + struct ptlrpc_sec_policy *policy; + int rc; + + LASSERT(req->rq_svc_ctx); + LASSERT(req->rq_svc_ctx->sc_policy); + + policy = req->rq_svc_ctx->sc_policy; + LASSERT(policy->sp_sops->authorize); + + rc = policy->sp_sops->authorize(req); + LASSERT(rc || req->rq_reply_state->rs_repdata_len); + + return rc; +} + +/** + * Used by ptlrpc server, to free reply_state. 
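+ *
+ * Editor's note (lifecycle sketch, not from the imported source): a reply
+ * state obtained with sptlrpc_svc_alloc_rs() above, possibly from the
+ * emergency pool when the policy allocation hits -ENOMEM, is released here;
+ * rs_prealloc remembers the emergency-pool case so the buffer is handed back
+ * via lustre_put_emerg_rs() after the policy's free_rs() hook runs.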
+ */ +void sptlrpc_svc_free_rs(struct ptlrpc_reply_state *rs) +{ + struct ptlrpc_sec_policy *policy; + unsigned int prealloc; + + LASSERT(rs->rs_svc_ctx); + LASSERT(rs->rs_svc_ctx->sc_policy); + + policy = rs->rs_svc_ctx->sc_policy; + LASSERT(policy->sp_sops->free_rs); + + prealloc = rs->rs_prealloc; + policy->sp_sops->free_rs(rs); + + if (prealloc) + lustre_put_emerg_rs(rs); +} + +void sptlrpc_svc_ctx_addref(struct ptlrpc_request *req) +{ + struct ptlrpc_svc_ctx *ctx = req->rq_svc_ctx; + + if (ctx != NULL) + atomic_inc(&ctx->sc_refcount); +} + +void sptlrpc_svc_ctx_decref(struct ptlrpc_request *req) +{ + struct ptlrpc_svc_ctx *ctx = req->rq_svc_ctx; + + if (ctx == NULL) + return; + + LASSERT_ATOMIC_POS(&ctx->sc_refcount); + if (atomic_dec_and_test(&ctx->sc_refcount)) { + if (ctx->sc_policy->sp_sops->free_ctx) + ctx->sc_policy->sp_sops->free_ctx(ctx); + } + req->rq_svc_ctx = NULL; +} + +void sptlrpc_svc_ctx_invalidate(struct ptlrpc_request *req) +{ + struct ptlrpc_svc_ctx *ctx = req->rq_svc_ctx; + + if (ctx == NULL) + return; + + LASSERT_ATOMIC_POS(&ctx->sc_refcount); + if (ctx->sc_policy->sp_sops->invalidate_ctx) + ctx->sc_policy->sp_sops->invalidate_ctx(ctx); +} +EXPORT_SYMBOL(sptlrpc_svc_ctx_invalidate); + +/**************************************** + * bulk security * + ****************************************/ + +/** + * Perform transformation upon bulk data pointed by \a desc. This is called + * before transforming the request message. + */ +int sptlrpc_cli_wrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_cli_ctx *ctx; + + LASSERT(req->rq_bulk_read || req->rq_bulk_write); + + if (!req->rq_pack_bulk) + return 0; + + ctx = req->rq_cli_ctx; + if (ctx->cc_ops->wrap_bulk) + return ctx->cc_ops->wrap_bulk(ctx, req, desc); + return 0; +} +EXPORT_SYMBOL(sptlrpc_cli_wrap_bulk); + +/** + * This is called after unwrap the reply message. + * return nob of actual plain text size received, or error code. + */ +int sptlrpc_cli_unwrap_bulk_read(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc, + int nob) +{ + struct ptlrpc_cli_ctx *ctx; + int rc; + + LASSERT(req->rq_bulk_read && !req->rq_bulk_write); + + if (!req->rq_pack_bulk) + return desc->bd_nob_transferred; + + ctx = req->rq_cli_ctx; + if (ctx->cc_ops->unwrap_bulk) { + rc = ctx->cc_ops->unwrap_bulk(ctx, req, desc); + if (rc < 0) + return rc; + } + return desc->bd_nob_transferred; +} +EXPORT_SYMBOL(sptlrpc_cli_unwrap_bulk_read); + +/** + * This is called after unwrap the reply message. + * return 0 for success or error code. + */ +int sptlrpc_cli_unwrap_bulk_write(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_cli_ctx *ctx; + int rc; + + LASSERT(!req->rq_bulk_read && req->rq_bulk_write); + + if (!req->rq_pack_bulk) + return 0; + + ctx = req->rq_cli_ctx; + if (ctx->cc_ops->unwrap_bulk) { + rc = ctx->cc_ops->unwrap_bulk(ctx, req, desc); + if (rc < 0) + return rc; + } + + /* + * if everything is going right, nob should equals to nob_transferred. + * in case of privacy mode, nob_transferred needs to be adjusted. 
+ */ + if (desc->bd_nob != desc->bd_nob_transferred) { + CERROR("nob %d doesn't match transferred nob %d", + desc->bd_nob, desc->bd_nob_transferred); + return -EPROTO; + } + + return 0; +} +EXPORT_SYMBOL(sptlrpc_cli_unwrap_bulk_write); + + +/**************************************** + * user descriptor helpers * + ****************************************/ + +int sptlrpc_current_user_desc_size(void) +{ + int ngroups; + + ngroups = current_ngroups; + + if (ngroups > LUSTRE_MAX_GROUPS) + ngroups = LUSTRE_MAX_GROUPS; + return sptlrpc_user_desc_size(ngroups); +} +EXPORT_SYMBOL(sptlrpc_current_user_desc_size); + +int sptlrpc_pack_user_desc(struct lustre_msg *msg, int offset) +{ + struct ptlrpc_user_desc *pud; + + pud = lustre_msg_buf(msg, offset, 0); + + pud->pud_uid = from_kuid(&init_user_ns, current_uid()); + pud->pud_gid = from_kgid(&init_user_ns, current_gid()); + pud->pud_fsuid = from_kuid(&init_user_ns, current_fsuid()); + pud->pud_fsgid = from_kgid(&init_user_ns, current_fsgid()); + pud->pud_cap = cfs_curproc_cap_pack(); + pud->pud_ngroups = (msg->lm_buflens[offset] - sizeof(*pud)) / 4; + + task_lock(current); + if (pud->pud_ngroups > current_ngroups) + pud->pud_ngroups = current_ngroups; + memcpy(pud->pud_groups, current_cred()->group_info->blocks[0], + pud->pud_ngroups * sizeof(__u32)); + task_unlock(current); + + return 0; +} +EXPORT_SYMBOL(sptlrpc_pack_user_desc); + +int sptlrpc_unpack_user_desc(struct lustre_msg *msg, int offset, int swabbed) +{ + struct ptlrpc_user_desc *pud; + int i; + + pud = lustre_msg_buf(msg, offset, sizeof(*pud)); + if (!pud) + return -EINVAL; + + if (swabbed) { + __swab32s(&pud->pud_uid); + __swab32s(&pud->pud_gid); + __swab32s(&pud->pud_fsuid); + __swab32s(&pud->pud_fsgid); + __swab32s(&pud->pud_cap); + __swab32s(&pud->pud_ngroups); + } + + if (pud->pud_ngroups > LUSTRE_MAX_GROUPS) { + CERROR("%u groups is too large\n", pud->pud_ngroups); + return -EINVAL; + } + + if (sizeof(*pud) + pud->pud_ngroups * sizeof(__u32) > + msg->lm_buflens[offset]) { + CERROR("%u groups are claimed but bufsize only %u\n", + pud->pud_ngroups, msg->lm_buflens[offset]); + return -EINVAL; + } + + if (swabbed) { + for (i = 0; i < pud->pud_ngroups; i++) + __swab32s(&pud->pud_groups[i]); + } + + return 0; +} +EXPORT_SYMBOL(sptlrpc_unpack_user_desc); + +/**************************************** + * misc helpers * + ****************************************/ + +const char *sec2target_str(struct ptlrpc_sec *sec) +{ + if (!sec || !sec->ps_import || !sec->ps_import->imp_obd) + return "*"; + if (sec_is_reverse(sec)) + return "c"; + return obd_uuid2str(&sec->ps_import->imp_obd->u.cli.cl_target_uuid); +} +EXPORT_SYMBOL(sec2target_str); + +/* + * return true if the bulk data is protected + */ +int sptlrpc_flavor_has_bulk(struct sptlrpc_flavor *flvr) +{ + switch (SPTLRPC_FLVR_BULK_SVC(flvr->sf_rpc)) { + case SPTLRPC_BULK_SVC_INTG: + case SPTLRPC_BULK_SVC_PRIV: + return 1; + default: + return 0; + } +} +EXPORT_SYMBOL(sptlrpc_flavor_has_bulk); + +/**************************************** + * crypto API helper/alloc blkciper * + ****************************************/ + +/**************************************** + * initialize/finalize * + ****************************************/ + +int sptlrpc_init(void) +{ + int rc; + + rwlock_init(&policy_lock); + + rc = sptlrpc_gc_init(); + if (rc) + goto out; + + rc = sptlrpc_conf_init(); + if (rc) + goto out_gc; + + rc = sptlrpc_enc_pool_init(); + if (rc) + goto out_conf; + + rc = sptlrpc_null_init(); + if (rc) + goto out_pool; + + rc = sptlrpc_plain_init(); 
+ if (rc) + goto out_null; + + rc = sptlrpc_lproc_init(); + if (rc) + goto out_plain; + + return 0; + +out_plain: + sptlrpc_plain_fini(); +out_null: + sptlrpc_null_fini(); +out_pool: + sptlrpc_enc_pool_fini(); +out_conf: + sptlrpc_conf_fini(); +out_gc: + sptlrpc_gc_fini(); +out: + return rc; +} + +void sptlrpc_fini(void) +{ + sptlrpc_lproc_fini(); + sptlrpc_plain_fini(); + sptlrpc_null_fini(); + sptlrpc_enc_pool_fini(); + sptlrpc_conf_fini(); + sptlrpc_gc_fini(); +} diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/sec_bulk.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/sec_bulk.c new file mode 100644 index 000000000..c05a8554d --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/sec_bulk.c @@ -0,0 +1,884 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/sec_bulk.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include "../../include/linux/libcfs/libcfs.h" +#include + +#include "../include/obd.h" +#include "../include/obd_cksum.h" +#include "../include/obd_class.h" +#include "../include/obd_support.h" +#include "../include/lustre_net.h" +#include "../include/lustre_import.h" +#include "../include/lustre_dlm.h" +#include "../include/lustre_sec.h" + +#include "ptlrpc_internal.h" + +/**************************************** + * bulk encryption page pools * + ****************************************/ + + +#define POINTERS_PER_PAGE (PAGE_CACHE_SIZE / sizeof(void *)) +#define PAGES_PER_POOL (POINTERS_PER_PAGE) + +#define IDLE_IDX_MAX (100) +#define IDLE_IDX_WEIGHT (3) + +#define CACHE_QUIESCENT_PERIOD (20) + +static struct ptlrpc_enc_page_pool { + /* + * constants + */ + unsigned long epp_max_pages; /* maximum pages can hold, const */ + unsigned int epp_max_pools; /* number of pools, const */ + + /* + * wait queue in case of not enough free pages. + */ + wait_queue_head_t epp_waitq; /* waiting threads */ + unsigned int epp_waitqlen; /* wait queue length */ + unsigned long epp_pages_short; /* # of pages wanted of in-q users */ + unsigned int epp_growing:1; /* during adding pages */ + + /* + * indicating how idle the pools are, from 0 to MAX_IDLE_IDX + * this is counted based on each time when getting pages from + * the pools, not based on time. 
which means in case that system + * is idled for a while but the idle_idx might still be low if no + * activities happened in the pools. + */ + unsigned long epp_idle_idx; + + /* last shrink time due to mem tight */ + long epp_last_shrink; + long epp_last_access; + + /* + * in-pool pages bookkeeping + */ + spinlock_t epp_lock; /* protect following fields */ + unsigned long epp_total_pages; /* total pages in pools */ + unsigned long epp_free_pages; /* current pages available */ + + /* + * statistics + */ + unsigned long epp_st_max_pages; /* # of pages ever reached */ + unsigned int epp_st_grows; /* # of grows */ + unsigned int epp_st_grow_fails; /* # of add pages failures */ + unsigned int epp_st_shrinks; /* # of shrinks */ + unsigned long epp_st_access; /* # of access */ + unsigned long epp_st_missings; /* # of cache missing */ + unsigned long epp_st_lowfree; /* lowest free pages reached */ + unsigned int epp_st_max_wqlen; /* highest waitqueue length */ + unsigned long epp_st_max_wait; /* in jiffies */ + /* + * pointers to pools + */ + struct page ***epp_pools; +} page_pools; + +/* + * /proc/fs/lustre/sptlrpc/encrypt_page_pools + */ +int sptlrpc_proc_enc_pool_seq_show(struct seq_file *m, void *v) +{ + spin_lock(&page_pools.epp_lock); + + seq_printf(m, + "physical pages: %lu\n" + "pages per pool: %lu\n" + "max pages: %lu\n" + "max pools: %u\n" + "total pages: %lu\n" + "total free: %lu\n" + "idle index: %lu/100\n" + "last shrink: %lds\n" + "last access: %lds\n" + "max pages reached: %lu\n" + "grows: %u\n" + "grows failure: %u\n" + "shrinks: %u\n" + "cache access: %lu\n" + "cache missing: %lu\n" + "low free mark: %lu\n" + "max waitqueue depth: %u\n" + "max wait time: " CFS_TIME_T "/%u\n", + totalram_pages, + PAGES_PER_POOL, + page_pools.epp_max_pages, + page_pools.epp_max_pools, + page_pools.epp_total_pages, + page_pools.epp_free_pages, + page_pools.epp_idle_idx, + get_seconds() - page_pools.epp_last_shrink, + get_seconds() - page_pools.epp_last_access, + page_pools.epp_st_max_pages, + page_pools.epp_st_grows, + page_pools.epp_st_grow_fails, + page_pools.epp_st_shrinks, + page_pools.epp_st_access, + page_pools.epp_st_missings, + page_pools.epp_st_lowfree, + page_pools.epp_st_max_wqlen, + page_pools.epp_st_max_wait, + HZ); + + spin_unlock(&page_pools.epp_lock); + + return 0; +} + +static void enc_pools_release_free_pages(long npages) +{ + int p_idx, g_idx; + int p_idx_max1, p_idx_max2; + + LASSERT(npages > 0); + LASSERT(npages <= page_pools.epp_free_pages); + LASSERT(page_pools.epp_free_pages <= page_pools.epp_total_pages); + + /* max pool index before the release */ + p_idx_max2 = (page_pools.epp_total_pages - 1) / PAGES_PER_POOL; + + page_pools.epp_free_pages -= npages; + page_pools.epp_total_pages -= npages; + + /* max pool index after the release */ + p_idx_max1 = page_pools.epp_total_pages == 0 ? 
-1 : + ((page_pools.epp_total_pages - 1) / PAGES_PER_POOL); + + p_idx = page_pools.epp_free_pages / PAGES_PER_POOL; + g_idx = page_pools.epp_free_pages % PAGES_PER_POOL; + LASSERT(page_pools.epp_pools[p_idx]); + + while (npages--) { + LASSERT(page_pools.epp_pools[p_idx]); + LASSERT(page_pools.epp_pools[p_idx][g_idx] != NULL); + + __free_page(page_pools.epp_pools[p_idx][g_idx]); + page_pools.epp_pools[p_idx][g_idx] = NULL; + + if (++g_idx == PAGES_PER_POOL) { + p_idx++; + g_idx = 0; + } + } + + /* free unused pools */ + while (p_idx_max1 < p_idx_max2) { + LASSERT(page_pools.epp_pools[p_idx_max2]); + OBD_FREE(page_pools.epp_pools[p_idx_max2], PAGE_CACHE_SIZE); + page_pools.epp_pools[p_idx_max2] = NULL; + p_idx_max2--; + } +} + +/* + * we try to keep at least PTLRPC_MAX_BRW_PAGES pages in the pool. + */ +static unsigned long enc_pools_shrink_count(struct shrinker *s, + struct shrink_control *sc) +{ + /* + * if no pool access for a long time, we consider it's fully idle. + * a little race here is fine. + */ + if (unlikely(get_seconds() - page_pools.epp_last_access > + CACHE_QUIESCENT_PERIOD)) { + spin_lock(&page_pools.epp_lock); + page_pools.epp_idle_idx = IDLE_IDX_MAX; + spin_unlock(&page_pools.epp_lock); + } + + LASSERT(page_pools.epp_idle_idx <= IDLE_IDX_MAX); + return max((int)page_pools.epp_free_pages - PTLRPC_MAX_BRW_PAGES, 0) * + (IDLE_IDX_MAX - page_pools.epp_idle_idx) / IDLE_IDX_MAX; +} + +/* + * we try to keep at least PTLRPC_MAX_BRW_PAGES pages in the pool. + */ +static unsigned long enc_pools_shrink_scan(struct shrinker *s, + struct shrink_control *sc) +{ + spin_lock(&page_pools.epp_lock); + sc->nr_to_scan = min_t(unsigned long, sc->nr_to_scan, + page_pools.epp_free_pages - PTLRPC_MAX_BRW_PAGES); + if (sc->nr_to_scan > 0) { + enc_pools_release_free_pages(sc->nr_to_scan); + CDEBUG(D_SEC, "released %ld pages, %ld left\n", + (long)sc->nr_to_scan, page_pools.epp_free_pages); + + page_pools.epp_st_shrinks++; + page_pools.epp_last_shrink = get_seconds(); + } + spin_unlock(&page_pools.epp_lock); + + /* + * if no pool access for a long time, we consider it's fully idle. + * a little race here is fine. + */ + if (unlikely(get_seconds() - page_pools.epp_last_access > + CACHE_QUIESCENT_PERIOD)) { + spin_lock(&page_pools.epp_lock); + page_pools.epp_idle_idx = IDLE_IDX_MAX; + spin_unlock(&page_pools.epp_lock); + } + + LASSERT(page_pools.epp_idle_idx <= IDLE_IDX_MAX); + return sc->nr_to_scan; +} + +static inline +int npages_to_npools(unsigned long npages) +{ + return (int) ((npages + PAGES_PER_POOL - 1) / PAGES_PER_POOL); +} + +/* + * return how many pages cleaned up. + */ +static unsigned long enc_pools_cleanup(struct page ***pools, int npools) +{ + unsigned long cleaned = 0; + int i, j; + + for (i = 0; i < npools; i++) { + if (pools[i]) { + for (j = 0; j < PAGES_PER_POOL; j++) { + if (pools[i][j]) { + __free_page(pools[i][j]); + cleaned++; + } + } + OBD_FREE(pools[i], PAGE_CACHE_SIZE); + pools[i] = NULL; + } + } + + return cleaned; +} + +/* + * merge @npools pointed by @pools which contains @npages new pages + * into current pools. + * + * we have options to avoid most memory copy with some tricks. but we choose + * the simplest way to avoid complexity. It's not frequently called. 
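+ *
+ * Editor's note (worked example, not from the imported source): assuming
+ * 4 KiB pages and 8-byte pointers, POINTERS_PER_PAGE and PAGES_PER_POOL are
+ * both 512, so page number k of the pool lives at
+ *
+ *     epp_pools[k / 512][k % 512]
+ *
+ * which matches the p_idx/g_idx arithmetic used throughout this file.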
+ */ +static void enc_pools_insert(struct page ***pools, int npools, int npages) +{ + int freeslot; + int op_idx, np_idx, og_idx, ng_idx; + int cur_npools, end_npools; + + LASSERT(npages > 0); + LASSERT(page_pools.epp_total_pages+npages <= page_pools.epp_max_pages); + LASSERT(npages_to_npools(npages) == npools); + LASSERT(page_pools.epp_growing); + + spin_lock(&page_pools.epp_lock); + + /* + * (1) fill all the free slots of current pools. + */ + /* free slots are those left by rent pages, and the extra ones with + * index >= total_pages, locate at the tail of last pool. */ + freeslot = page_pools.epp_total_pages % PAGES_PER_POOL; + if (freeslot != 0) + freeslot = PAGES_PER_POOL - freeslot; + freeslot += page_pools.epp_total_pages - page_pools.epp_free_pages; + + op_idx = page_pools.epp_free_pages / PAGES_PER_POOL; + og_idx = page_pools.epp_free_pages % PAGES_PER_POOL; + np_idx = npools - 1; + ng_idx = (npages - 1) % PAGES_PER_POOL; + + while (freeslot) { + LASSERT(page_pools.epp_pools[op_idx][og_idx] == NULL); + LASSERT(pools[np_idx][ng_idx] != NULL); + + page_pools.epp_pools[op_idx][og_idx] = pools[np_idx][ng_idx]; + pools[np_idx][ng_idx] = NULL; + + freeslot--; + + if (++og_idx == PAGES_PER_POOL) { + op_idx++; + og_idx = 0; + } + if (--ng_idx < 0) { + if (np_idx == 0) + break; + np_idx--; + ng_idx = PAGES_PER_POOL - 1; + } + } + + /* + * (2) add pools if needed. + */ + cur_npools = (page_pools.epp_total_pages + PAGES_PER_POOL - 1) / + PAGES_PER_POOL; + end_npools = (page_pools.epp_total_pages + npages + PAGES_PER_POOL - 1) + / PAGES_PER_POOL; + LASSERT(end_npools <= page_pools.epp_max_pools); + + np_idx = 0; + while (cur_npools < end_npools) { + LASSERT(page_pools.epp_pools[cur_npools] == NULL); + LASSERT(np_idx < npools); + LASSERT(pools[np_idx] != NULL); + + page_pools.epp_pools[cur_npools++] = pools[np_idx]; + pools[np_idx++] = NULL; + } + + page_pools.epp_total_pages += npages; + page_pools.epp_free_pages += npages; + page_pools.epp_st_lowfree = page_pools.epp_free_pages; + + if (page_pools.epp_total_pages > page_pools.epp_st_max_pages) + page_pools.epp_st_max_pages = page_pools.epp_total_pages; + + CDEBUG(D_SEC, "add %d pages to total %lu\n", npages, + page_pools.epp_total_pages); + + spin_unlock(&page_pools.epp_lock); +} + +static int enc_pools_add_pages(int npages) +{ + static DEFINE_MUTEX(add_pages_mutex); + struct page ***pools; + int npools, alloced = 0; + int i, j, rc = -ENOMEM; + + if (npages < PTLRPC_MAX_BRW_PAGES) + npages = PTLRPC_MAX_BRW_PAGES; + + mutex_lock(&add_pages_mutex); + + if (npages + page_pools.epp_total_pages > page_pools.epp_max_pages) + npages = page_pools.epp_max_pages - page_pools.epp_total_pages; + LASSERT(npages > 0); + + page_pools.epp_st_grows++; + + npools = npages_to_npools(npages); + OBD_ALLOC(pools, npools * sizeof(*pools)); + if (pools == NULL) + goto out; + + for (i = 0; i < npools; i++) { + OBD_ALLOC(pools[i], PAGE_CACHE_SIZE); + if (pools[i] == NULL) + goto out_pools; + + for (j = 0; j < PAGES_PER_POOL && alloced < npages; j++) { + pools[i][j] = alloc_page(GFP_NOFS | + __GFP_HIGHMEM); + if (pools[i][j] == NULL) + goto out_pools; + + alloced++; + } + } + LASSERT(alloced == npages); + + enc_pools_insert(pools, npools, npages); + CDEBUG(D_SEC, "added %d pages into pools\n", npages); + rc = 0; + +out_pools: + enc_pools_cleanup(pools, npools); + OBD_FREE(pools, npools * sizeof(*pools)); +out: + if (rc) { + page_pools.epp_st_grow_fails++; + CERROR("Failed to allocate %d enc pages\n", npages); + } + + mutex_unlock(&add_pages_mutex); + return rc; +} + 
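+/*
+ * Editor's note (illustration, not part of the imported source): the grow
+ * path above rounds any request up to at least PTLRPC_MAX_BRW_PAGES, does
+ * all page allocation under add_pages_mutex only, and lets enc_pools_insert()
+ * take epp_lock just for the final merge.  With PAGES_PER_POOL = 512 (4 KiB
+ * pages, see the note above), npages_to_npools(1000) == 2, i.e. two pointer
+ * pages are enough to track a thousand data pages.
+ */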
+static inline void enc_pools_wakeup(void) +{ + assert_spin_locked(&page_pools.epp_lock); + LASSERT(page_pools.epp_waitqlen >= 0); + + if (unlikely(page_pools.epp_waitqlen)) { + LASSERT(waitqueue_active(&page_pools.epp_waitq)); + wake_up_all(&page_pools.epp_waitq); + } +} + +static int enc_pools_should_grow(int page_needed, long now) +{ + /* don't grow if someone else is growing the pools right now, + * or the pools has reached its full capacity + */ + if (page_pools.epp_growing || + page_pools.epp_total_pages == page_pools.epp_max_pages) + return 0; + + /* if total pages is not enough, we need to grow */ + if (page_pools.epp_total_pages < page_needed) + return 1; + + /* + * we wanted to return 0 here if there was a shrink just happened + * moment ago, but this may cause deadlock if both client and ost + * live on single node. + */ +#if 0 + if (now - page_pools.epp_last_shrink < 2) + return 0; +#endif + + /* + * here we perhaps need consider other factors like wait queue + * length, idle index, etc. ? + */ + + /* grow the pools in any other cases */ + return 1; +} + +/* + * we allocate the requested pages atomically. + */ +int sptlrpc_enc_pool_get_pages(struct ptlrpc_bulk_desc *desc) +{ + wait_queue_t waitlink; + unsigned long this_idle = -1; + unsigned long tick = 0; + long now; + int p_idx, g_idx; + int i; + + LASSERT(desc->bd_iov_count > 0); + LASSERT(desc->bd_iov_count <= page_pools.epp_max_pages); + + /* resent bulk, enc iov might have been allocated previously */ + if (desc->bd_enc_iov != NULL) + return 0; + + OBD_ALLOC(desc->bd_enc_iov, + desc->bd_iov_count * sizeof(*desc->bd_enc_iov)); + if (desc->bd_enc_iov == NULL) + return -ENOMEM; + + spin_lock(&page_pools.epp_lock); + + page_pools.epp_st_access++; +again: + if (unlikely(page_pools.epp_free_pages < desc->bd_iov_count)) { + if (tick == 0) + tick = cfs_time_current(); + + now = get_seconds(); + + page_pools.epp_st_missings++; + page_pools.epp_pages_short += desc->bd_iov_count; + + if (enc_pools_should_grow(desc->bd_iov_count, now)) { + page_pools.epp_growing = 1; + + spin_unlock(&page_pools.epp_lock); + enc_pools_add_pages(page_pools.epp_pages_short / 2); + spin_lock(&page_pools.epp_lock); + + page_pools.epp_growing = 0; + + enc_pools_wakeup(); + } else { + if (++page_pools.epp_waitqlen > + page_pools.epp_st_max_wqlen) + page_pools.epp_st_max_wqlen = + page_pools.epp_waitqlen; + + set_current_state(TASK_UNINTERRUPTIBLE); + init_waitqueue_entry(&waitlink, current); + add_wait_queue(&page_pools.epp_waitq, &waitlink); + + spin_unlock(&page_pools.epp_lock); + schedule(); + remove_wait_queue(&page_pools.epp_waitq, &waitlink); + LASSERT(page_pools.epp_waitqlen > 0); + spin_lock(&page_pools.epp_lock); + page_pools.epp_waitqlen--; + } + + LASSERT(page_pools.epp_pages_short >= desc->bd_iov_count); + page_pools.epp_pages_short -= desc->bd_iov_count; + + this_idle = 0; + goto again; + } + + /* record max wait time */ + if (unlikely(tick != 0)) { + tick = cfs_time_current() - tick; + if (tick > page_pools.epp_st_max_wait) + page_pools.epp_st_max_wait = tick; + } + + /* proceed with rest of allocation */ + page_pools.epp_free_pages -= desc->bd_iov_count; + + p_idx = page_pools.epp_free_pages / PAGES_PER_POOL; + g_idx = page_pools.epp_free_pages % PAGES_PER_POOL; + + for (i = 0; i < desc->bd_iov_count; i++) { + LASSERT(page_pools.epp_pools[p_idx][g_idx] != NULL); + desc->bd_enc_iov[i].kiov_page = + page_pools.epp_pools[p_idx][g_idx]; + page_pools.epp_pools[p_idx][g_idx] = NULL; + + if (++g_idx == PAGES_PER_POOL) { + p_idx++; + g_idx = 0; + } + 
} + + if (page_pools.epp_free_pages < page_pools.epp_st_lowfree) + page_pools.epp_st_lowfree = page_pools.epp_free_pages; + + /* + * new idle index = (old * weight + new) / (weight + 1) + */ + if (this_idle == -1) { + this_idle = page_pools.epp_free_pages * IDLE_IDX_MAX / + page_pools.epp_total_pages; + } + page_pools.epp_idle_idx = (page_pools.epp_idle_idx * IDLE_IDX_WEIGHT + + this_idle) / + (IDLE_IDX_WEIGHT + 1); + + page_pools.epp_last_access = get_seconds(); + + spin_unlock(&page_pools.epp_lock); + return 0; +} +EXPORT_SYMBOL(sptlrpc_enc_pool_get_pages); + +void sptlrpc_enc_pool_put_pages(struct ptlrpc_bulk_desc *desc) +{ + int p_idx, g_idx; + int i; + + if (desc->bd_enc_iov == NULL) + return; + + LASSERT(desc->bd_iov_count > 0); + + spin_lock(&page_pools.epp_lock); + + p_idx = page_pools.epp_free_pages / PAGES_PER_POOL; + g_idx = page_pools.epp_free_pages % PAGES_PER_POOL; + + LASSERT(page_pools.epp_free_pages + desc->bd_iov_count <= + page_pools.epp_total_pages); + LASSERT(page_pools.epp_pools[p_idx]); + + for (i = 0; i < desc->bd_iov_count; i++) { + LASSERT(desc->bd_enc_iov[i].kiov_page != NULL); + LASSERT(g_idx != 0 || page_pools.epp_pools[p_idx]); + LASSERT(page_pools.epp_pools[p_idx][g_idx] == NULL); + + page_pools.epp_pools[p_idx][g_idx] = + desc->bd_enc_iov[i].kiov_page; + + if (++g_idx == PAGES_PER_POOL) { + p_idx++; + g_idx = 0; + } + } + + page_pools.epp_free_pages += desc->bd_iov_count; + + enc_pools_wakeup(); + + spin_unlock(&page_pools.epp_lock); + + OBD_FREE(desc->bd_enc_iov, + desc->bd_iov_count * sizeof(*desc->bd_enc_iov)); + desc->bd_enc_iov = NULL; +} +EXPORT_SYMBOL(sptlrpc_enc_pool_put_pages); + +/* + * we don't do much stuff for add_user/del_user anymore, except adding some + * initial pages in add_user() if current pools are empty, rest would be + * handled by the pools's self-adaption. + */ +int sptlrpc_enc_pool_add_user(void) +{ + int need_grow = 0; + + spin_lock(&page_pools.epp_lock); + if (page_pools.epp_growing == 0 && page_pools.epp_total_pages == 0) { + page_pools.epp_growing = 1; + need_grow = 1; + } + spin_unlock(&page_pools.epp_lock); + + if (need_grow) { + enc_pools_add_pages(PTLRPC_MAX_BRW_PAGES + + PTLRPC_MAX_BRW_PAGES); + + spin_lock(&page_pools.epp_lock); + page_pools.epp_growing = 0; + enc_pools_wakeup(); + spin_unlock(&page_pools.epp_lock); + } + return 0; +} +EXPORT_SYMBOL(sptlrpc_enc_pool_add_user); + +int sptlrpc_enc_pool_del_user(void) +{ + return 0; +} +EXPORT_SYMBOL(sptlrpc_enc_pool_del_user); + +static inline void enc_pools_alloc(void) +{ + LASSERT(page_pools.epp_max_pools); + OBD_ALLOC_LARGE(page_pools.epp_pools, + page_pools.epp_max_pools * + sizeof(*page_pools.epp_pools)); +} + +static inline void enc_pools_free(void) +{ + LASSERT(page_pools.epp_max_pools); + LASSERT(page_pools.epp_pools); + + OBD_FREE_LARGE(page_pools.epp_pools, + page_pools.epp_max_pools * + sizeof(*page_pools.epp_pools)); +} + +static struct shrinker pools_shrinker = { + .count_objects = enc_pools_shrink_count, + .scan_objects = enc_pools_shrink_scan, + .seeks = DEFAULT_SEEKS, +}; + +int sptlrpc_enc_pool_init(void) +{ + /* + * maximum capacity is 1/8 of total physical memory. + * is the 1/8 a good number? 
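+ *
+ * Editor's note (worked example, not from the imported source): on a node
+ * with 16 GiB of RAM and 4 KiB pages, totalram_pages is roughly 4 M, giving
+ * epp_max_pages of about 512 K pages (2 GiB of bulk encryption buffers) and,
+ * with PAGES_PER_POOL = 512, epp_max_pools of about 1024.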
+ */ + page_pools.epp_max_pages = totalram_pages / 8; + page_pools.epp_max_pools = npages_to_npools(page_pools.epp_max_pages); + + init_waitqueue_head(&page_pools.epp_waitq); + page_pools.epp_waitqlen = 0; + page_pools.epp_pages_short = 0; + + page_pools.epp_growing = 0; + + page_pools.epp_idle_idx = 0; + page_pools.epp_last_shrink = get_seconds(); + page_pools.epp_last_access = get_seconds(); + + spin_lock_init(&page_pools.epp_lock); + page_pools.epp_total_pages = 0; + page_pools.epp_free_pages = 0; + + page_pools.epp_st_max_pages = 0; + page_pools.epp_st_grows = 0; + page_pools.epp_st_grow_fails = 0; + page_pools.epp_st_shrinks = 0; + page_pools.epp_st_access = 0; + page_pools.epp_st_missings = 0; + page_pools.epp_st_lowfree = 0; + page_pools.epp_st_max_wqlen = 0; + page_pools.epp_st_max_wait = 0; + + enc_pools_alloc(); + if (page_pools.epp_pools == NULL) + return -ENOMEM; + + register_shrinker(&pools_shrinker); + + return 0; +} + +void sptlrpc_enc_pool_fini(void) +{ + unsigned long cleaned, npools; + + LASSERT(page_pools.epp_pools); + LASSERT(page_pools.epp_total_pages == page_pools.epp_free_pages); + + unregister_shrinker(&pools_shrinker); + + npools = npages_to_npools(page_pools.epp_total_pages); + cleaned = enc_pools_cleanup(page_pools.epp_pools, npools); + LASSERT(cleaned == page_pools.epp_total_pages); + + enc_pools_free(); + + if (page_pools.epp_st_access > 0) { + CDEBUG(D_SEC, + "max pages %lu, grows %u, grow fails %u, shrinks %u, access %lu, missing %lu, max qlen %u, max wait " + CFS_TIME_T"/%d\n", + page_pools.epp_st_max_pages, page_pools.epp_st_grows, + page_pools.epp_st_grow_fails, + page_pools.epp_st_shrinks, page_pools.epp_st_access, + page_pools.epp_st_missings, page_pools.epp_st_max_wqlen, + page_pools.epp_st_max_wait, HZ); + } +} + + +static int cfs_hash_alg_id[] = { + [BULK_HASH_ALG_NULL] = CFS_HASH_ALG_NULL, + [BULK_HASH_ALG_ADLER32] = CFS_HASH_ALG_ADLER32, + [BULK_HASH_ALG_CRC32] = CFS_HASH_ALG_CRC32, + [BULK_HASH_ALG_MD5] = CFS_HASH_ALG_MD5, + [BULK_HASH_ALG_SHA1] = CFS_HASH_ALG_SHA1, + [BULK_HASH_ALG_SHA256] = CFS_HASH_ALG_SHA256, + [BULK_HASH_ALG_SHA384] = CFS_HASH_ALG_SHA384, + [BULK_HASH_ALG_SHA512] = CFS_HASH_ALG_SHA512, +}; +const char *sptlrpc_get_hash_name(__u8 hash_alg) +{ + return cfs_crypto_hash_name(cfs_hash_alg_id[hash_alg]); +} +EXPORT_SYMBOL(sptlrpc_get_hash_name); + +__u8 sptlrpc_get_hash_alg(const char *algname) +{ + return cfs_crypto_hash_alg(algname); +} +EXPORT_SYMBOL(sptlrpc_get_hash_alg); + +int bulk_sec_desc_unpack(struct lustre_msg *msg, int offset, int swabbed) +{ + struct ptlrpc_bulk_sec_desc *bsd; + int size = msg->lm_buflens[offset]; + + bsd = lustre_msg_buf(msg, offset, sizeof(*bsd)); + if (bsd == NULL) { + CERROR("Invalid bulk sec desc: size %d\n", size); + return -EINVAL; + } + + if (swabbed) + __swab32s(&bsd->bsd_nob); + + if (unlikely(bsd->bsd_version != 0)) { + CERROR("Unexpected version %u\n", bsd->bsd_version); + return -EPROTO; + } + + if (unlikely(bsd->bsd_type >= SPTLRPC_BULK_MAX)) { + CERROR("Invalid type %u\n", bsd->bsd_type); + return -EPROTO; + } + + /* FIXME more sanity check here */ + + if (unlikely(bsd->bsd_svc != SPTLRPC_BULK_SVC_NULL && + bsd->bsd_svc != SPTLRPC_BULK_SVC_INTG && + bsd->bsd_svc != SPTLRPC_BULK_SVC_PRIV)) { + CERROR("Invalid svc %u\n", bsd->bsd_svc); + return -EPROTO; + } + + return 0; +} +EXPORT_SYMBOL(bulk_sec_desc_unpack); + +int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg, + void *buf, int buflen) +{ + struct cfs_crypto_hash_desc *hdesc; + int hashsize; + char hashbuf[64]; + 
unsigned int bufsize; + int i, err; + + LASSERT(alg > BULK_HASH_ALG_NULL && alg < BULK_HASH_ALG_MAX); + LASSERT(buflen >= 4); + + hdesc = cfs_crypto_hash_init(cfs_hash_alg_id[alg], NULL, 0); + if (IS_ERR(hdesc)) { + CERROR("Unable to initialize checksum hash %s\n", + cfs_crypto_hash_name(cfs_hash_alg_id[alg])); + return PTR_ERR(hdesc); + } + + hashsize = cfs_crypto_hash_digestsize(cfs_hash_alg_id[alg]); + + for (i = 0; i < desc->bd_iov_count; i++) { + cfs_crypto_hash_update_page(hdesc, desc->bd_iov[i].kiov_page, + desc->bd_iov[i].kiov_offset & ~CFS_PAGE_MASK, + desc->bd_iov[i].kiov_len); + } + if (hashsize > buflen) { + bufsize = sizeof(hashbuf); + err = cfs_crypto_hash_final(hdesc, (unsigned char *)hashbuf, + &bufsize); + memcpy(buf, hashbuf, buflen); + } else { + bufsize = buflen; + err = cfs_crypto_hash_final(hdesc, (unsigned char *)buf, + &bufsize); + } + + if (err) + cfs_crypto_hash_final(hdesc, NULL, NULL); + return err; +} +EXPORT_SYMBOL(sptlrpc_get_bulk_checksum); diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/sec_config.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/sec_config.c new file mode 100644 index 000000000..56ba9e4e5 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/sec_config.c @@ -0,0 +1,901 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include "../../include/linux/libcfs/libcfs.h" +#include +#include + +#include "../include/obd.h" +#include "../include/obd_support.h" +#include "../include/lustre_import.h" +#include "../include/lustre_param.h" +#include "../include/lustre_sec.h" + +#include "ptlrpc_internal.h" + +const char *sptlrpc_part2name(enum lustre_sec_part part) +{ + switch (part) { + case LUSTRE_SP_CLI: + return "cli"; + case LUSTRE_SP_MDT: + return "mdt"; + case LUSTRE_SP_OST: + return "ost"; + case LUSTRE_SP_MGC: + return "mgc"; + case LUSTRE_SP_MGS: + return "mgs"; + case LUSTRE_SP_ANY: + return "any"; + default: + return "err"; + } +} +EXPORT_SYMBOL(sptlrpc_part2name); + +enum lustre_sec_part sptlrpc_target_sec_part(struct obd_device *obd) +{ + const char *type = obd->obd_type->typ_name; + + if (!strcmp(type, LUSTRE_MDT_NAME)) + return LUSTRE_SP_MDT; + if (!strcmp(type, LUSTRE_OST_NAME)) + return LUSTRE_SP_OST; + if (!strcmp(type, LUSTRE_MGS_NAME)) + return LUSTRE_SP_MGS; + + CERROR("unknown target %p(%s)\n", obd, type); + return LUSTRE_SP_ANY; +} +EXPORT_SYMBOL(sptlrpc_target_sec_part); + +/**************************************** + * user supplied flavor string parsing * + ****************************************/ + +/* + * format: [-] + */ +int sptlrpc_parse_flavor(const char *str, struct sptlrpc_flavor *flvr) +{ + char buf[32]; + char *bulk, *alg; + + memset(flvr, 0, sizeof(*flvr)); + + if (str == NULL || str[0] == '\0') { + flvr->sf_rpc = SPTLRPC_FLVR_INVALID; + return 0; + } + + strncpy(buf, str, sizeof(buf)); + buf[sizeof(buf) - 1] = '\0'; + + bulk = strchr(buf, '-'); + if (bulk) + *bulk++ = '\0'; + + flvr->sf_rpc = sptlrpc_name2flavor_base(buf); + if (flvr->sf_rpc == SPTLRPC_FLVR_INVALID) + goto err_out; + + /* + * currently only base flavor "plain" can have bulk specification. 
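+ *
+ * Editor's note (examples, not from the imported source): typical inputs
+ * here are base flavor names such as "null" or "plain", optionally with a
+ * bulk hash suffix handled below, e.g. "plain-hash:sha1" (assuming "sha1"
+ * is the name registered for the SHA1 hash); a bare "plain" falls back to
+ * the adler32 bulk hash.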
+ */ + if (flvr->sf_rpc == SPTLRPC_FLVR_PLAIN) { + flvr->u_bulk.hash.hash_alg = BULK_HASH_ALG_ADLER32; + if (bulk) { + /* + * format: plain-hash: + */ + alg = strchr(bulk, ':'); + if (alg == NULL) + goto err_out; + *alg++ = '\0'; + + if (strcmp(bulk, "hash")) + goto err_out; + + flvr->u_bulk.hash.hash_alg = sptlrpc_get_hash_alg(alg); + if (flvr->u_bulk.hash.hash_alg >= BULK_HASH_ALG_MAX) + goto err_out; + } + + if (flvr->u_bulk.hash.hash_alg == BULK_HASH_ALG_NULL) + flvr_set_bulk_svc(&flvr->sf_rpc, SPTLRPC_BULK_SVC_NULL); + else + flvr_set_bulk_svc(&flvr->sf_rpc, SPTLRPC_BULK_SVC_INTG); + } else { + if (bulk) + goto err_out; + } + + flvr->sf_flags = 0; + return 0; + +err_out: + CERROR("invalid flavor string: %s\n", str); + return -EINVAL; +} +EXPORT_SYMBOL(sptlrpc_parse_flavor); + +/**************************************** + * configure rules * + ****************************************/ + +static void get_default_flavor(struct sptlrpc_flavor *sf) +{ + memset(sf, 0, sizeof(*sf)); + + sf->sf_rpc = SPTLRPC_FLVR_NULL; + sf->sf_flags = 0; +} + +static void sptlrpc_rule_init(struct sptlrpc_rule *rule) +{ + rule->sr_netid = LNET_NIDNET(LNET_NID_ANY); + rule->sr_from = LUSTRE_SP_ANY; + rule->sr_to = LUSTRE_SP_ANY; + rule->sr_padding = 0; + + get_default_flavor(&rule->sr_flvr); +} + +/* + * format: network[.direction]=flavor + */ +int sptlrpc_parse_rule(char *param, struct sptlrpc_rule *rule) +{ + char *flavor, *dir; + int rc; + + sptlrpc_rule_init(rule); + + flavor = strchr(param, '='); + if (flavor == NULL) { + CERROR("invalid param, no '='\n"); + return -EINVAL; + } + *flavor++ = '\0'; + + dir = strchr(param, '.'); + if (dir) + *dir++ = '\0'; + + /* 1.1 network */ + if (strcmp(param, "default")) { + rule->sr_netid = libcfs_str2net(param); + if (rule->sr_netid == LNET_NIDNET(LNET_NID_ANY)) { + CERROR("invalid network name: %s\n", param); + return -EINVAL; + } + } + + /* 1.2 direction */ + if (dir) { + if (!strcmp(dir, "mdt2ost")) { + rule->sr_from = LUSTRE_SP_MDT; + rule->sr_to = LUSTRE_SP_OST; + } else if (!strcmp(dir, "mdt2mdt")) { + rule->sr_from = LUSTRE_SP_MDT; + rule->sr_to = LUSTRE_SP_MDT; + } else if (!strcmp(dir, "cli2ost")) { + rule->sr_from = LUSTRE_SP_CLI; + rule->sr_to = LUSTRE_SP_OST; + } else if (!strcmp(dir, "cli2mdt")) { + rule->sr_from = LUSTRE_SP_CLI; + rule->sr_to = LUSTRE_SP_MDT; + } else { + CERROR("invalid rule dir segment: %s\n", dir); + return -EINVAL; + } + } + + /* 2.1 flavor */ + rc = sptlrpc_parse_flavor(flavor, &rule->sr_flvr); + if (rc) + return -EINVAL; + + return 0; +} +EXPORT_SYMBOL(sptlrpc_parse_rule); + +void sptlrpc_rule_set_free(struct sptlrpc_rule_set *rset) +{ + LASSERT(rset->srs_nslot || + (rset->srs_nrule == 0 && rset->srs_rules == NULL)); + + if (rset->srs_nslot) { + OBD_FREE(rset->srs_rules, + rset->srs_nslot * sizeof(*rset->srs_rules)); + sptlrpc_rule_set_init(rset); + } +} +EXPORT_SYMBOL(sptlrpc_rule_set_free); + +/* + * return 0 if the rule set could accommodate one more rule. 
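+ *
+ * Editor's note (not from the imported source): the slot array simply grows
+ * in steps of eight.  The rules stored here come from sptlrpc_parse_rule()
+ * above, whose inputs look like, e.g.
+ *
+ *     default=plain
+ *     tcp0.cli2mdt=null
+ *
+ * where the network name and the flavors are illustrative placeholders.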
+ */ +int sptlrpc_rule_set_expand(struct sptlrpc_rule_set *rset) +{ + struct sptlrpc_rule *rules; + int nslot; + + might_sleep(); + + if (rset->srs_nrule < rset->srs_nslot) + return 0; + + nslot = rset->srs_nslot + 8; + + /* better use realloc() if available */ + OBD_ALLOC(rules, nslot * sizeof(*rset->srs_rules)); + if (rules == NULL) + return -ENOMEM; + + if (rset->srs_nrule) { + LASSERT(rset->srs_nslot && rset->srs_rules); + memcpy(rules, rset->srs_rules, + rset->srs_nrule * sizeof(*rset->srs_rules)); + + OBD_FREE(rset->srs_rules, + rset->srs_nslot * sizeof(*rset->srs_rules)); + } + + rset->srs_rules = rules; + rset->srs_nslot = nslot; + return 0; +} +EXPORT_SYMBOL(sptlrpc_rule_set_expand); + +static inline int rule_spec_dir(struct sptlrpc_rule *rule) +{ + return (rule->sr_from != LUSTRE_SP_ANY || + rule->sr_to != LUSTRE_SP_ANY); +} +static inline int rule_spec_net(struct sptlrpc_rule *rule) +{ + return (rule->sr_netid != LNET_NIDNET(LNET_NID_ANY)); +} +static inline int rule_match_dir(struct sptlrpc_rule *r1, + struct sptlrpc_rule *r2) +{ + return (r1->sr_from == r2->sr_from && r1->sr_to == r2->sr_to); +} +static inline int rule_match_net(struct sptlrpc_rule *r1, + struct sptlrpc_rule *r2) +{ + return (r1->sr_netid == r2->sr_netid); +} + +/* + * merge @rule into @rset. + * the @rset slots might be expanded. + */ +int sptlrpc_rule_set_merge(struct sptlrpc_rule_set *rset, + struct sptlrpc_rule *rule) +{ + struct sptlrpc_rule *p = rset->srs_rules; + int spec_dir, spec_net; + int rc, n, match = 0; + + might_sleep(); + + spec_net = rule_spec_net(rule); + spec_dir = rule_spec_dir(rule); + + for (n = 0; n < rset->srs_nrule; n++) { + p = &rset->srs_rules[n]; + + /* test network match, if failed: + * - spec rule: skip rules which is also spec rule match, until + * we hit a wild rule, which means no more chance + * - wild rule: skip until reach the one which is also wild + * and matches + */ + if (!rule_match_net(p, rule)) { + if (spec_net) { + if (rule_spec_net(p)) + continue; + else + break; + } else { + continue; + } + } + + /* test dir match, same logic as net matching */ + if (!rule_match_dir(p, rule)) { + if (spec_dir) { + if (rule_spec_dir(p)) + continue; + else + break; + } else { + continue; + } + } + + /* find a match */ + match = 1; + break; + } + + if (match) { + LASSERT(n >= 0 && n < rset->srs_nrule); + + if (rule->sr_flvr.sf_rpc == SPTLRPC_FLVR_INVALID) { + /* remove this rule */ + if (n < rset->srs_nrule - 1) + memmove(&rset->srs_rules[n], + &rset->srs_rules[n + 1], + (rset->srs_nrule - n - 1) * + sizeof(*rule)); + rset->srs_nrule--; + } else { + /* override the rule */ + memcpy(&rset->srs_rules[n], rule, sizeof(*rule)); + } + } else { + LASSERT(n >= 0 && n <= rset->srs_nrule); + + if (rule->sr_flvr.sf_rpc != SPTLRPC_FLVR_INVALID) { + rc = sptlrpc_rule_set_expand(rset); + if (rc) + return rc; + + if (n < rset->srs_nrule) + memmove(&rset->srs_rules[n + 1], + &rset->srs_rules[n], + (rset->srs_nrule - n) * sizeof(*rule)); + memcpy(&rset->srs_rules[n], rule, sizeof(*rule)); + rset->srs_nrule++; + } else { + CDEBUG(D_CONFIG, "ignore the unmatched deletion\n"); + } + } + + return 0; +} +EXPORT_SYMBOL(sptlrpc_rule_set_merge); + +/** + * given from/to/nid, determine a matching flavor in ruleset. + * return 1 if a match found, otherwise return 0. 
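+ *
+ * Editor's note (hypothetical sketch, not from the imported source): a caller
+ * resolving the flavor for a client-to-MDT connection might do
+ *
+ *     struct sptlrpc_flavor sf;
+ *
+ *     if (!sptlrpc_rule_set_choose(rset, LUSTRE_SP_CLI, LUSTRE_SP_MDT,
+ *                                  peer_nid, &sf))
+ *             get_default_flavor(&sf);    (fall back to SPTLRPC_FLVR_NULL)
+ *
+ * where rset and peer_nid are assumed to be supplied by the caller.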
+ */ +int sptlrpc_rule_set_choose(struct sptlrpc_rule_set *rset, + enum lustre_sec_part from, + enum lustre_sec_part to, + lnet_nid_t nid, + struct sptlrpc_flavor *sf) +{ + struct sptlrpc_rule *r; + int n; + + for (n = 0; n < rset->srs_nrule; n++) { + r = &rset->srs_rules[n]; + + if (LNET_NIDNET(nid) != LNET_NIDNET(LNET_NID_ANY) && + r->sr_netid != LNET_NIDNET(LNET_NID_ANY) && + LNET_NIDNET(nid) != r->sr_netid) + continue; + + if (from != LUSTRE_SP_ANY && r->sr_from != LUSTRE_SP_ANY && + from != r->sr_from) + continue; + + if (to != LUSTRE_SP_ANY && r->sr_to != LUSTRE_SP_ANY && + to != r->sr_to) + continue; + + *sf = r->sr_flvr; + return 1; + } + + return 0; +} +EXPORT_SYMBOL(sptlrpc_rule_set_choose); + +void sptlrpc_rule_set_dump(struct sptlrpc_rule_set *rset) +{ + struct sptlrpc_rule *r; + int n; + + for (n = 0; n < rset->srs_nrule; n++) { + r = &rset->srs_rules[n]; + CDEBUG(D_SEC, "<%02d> from %x to %x, net %x, rpc %x\n", n, + r->sr_from, r->sr_to, r->sr_netid, r->sr_flvr.sf_rpc); + } +} +EXPORT_SYMBOL(sptlrpc_rule_set_dump); + +/********************************** + * sptlrpc configuration support * + **********************************/ + +struct sptlrpc_conf_tgt { + struct list_head sct_list; + char sct_name[MAX_OBD_NAME]; + struct sptlrpc_rule_set sct_rset; +}; + +struct sptlrpc_conf { + struct list_head sc_list; + char sc_fsname[MTI_NAME_MAXLEN]; + unsigned int sc_modified; /* modified during updating */ + unsigned int sc_updated:1, /* updated copy from MGS */ + sc_local:1; /* local copy from target */ + struct sptlrpc_rule_set sc_rset; /* fs general rules */ + struct list_head sc_tgts; /* target-specific rules */ +}; + +static struct mutex sptlrpc_conf_lock; +static LIST_HEAD(sptlrpc_confs); + +static inline int is_hex(char c) +{ + return ((c >= '0' && c <= '9') || + (c >= 'a' && c <= 'f')); +} + +static void target2fsname(const char *tgt, char *fsname, int buflen) +{ + const char *ptr; + int len; + + ptr = strrchr(tgt, '-'); + if (ptr) { + if ((strncmp(ptr, "-MDT", 4) != 0 && + strncmp(ptr, "-OST", 4) != 0) || + !is_hex(ptr[4]) || !is_hex(ptr[5]) || + !is_hex(ptr[6]) || !is_hex(ptr[7])) + ptr = NULL; + } + + /* if we didn't find the pattern, treat the whole string as fsname */ + if (ptr == NULL) + len = strlen(tgt); + else + len = ptr - tgt; + + len = min(len, buflen - 1); + memcpy(fsname, tgt, len); + fsname[len] = '\0'; +} + +static void sptlrpc_conf_free_rsets(struct sptlrpc_conf *conf) +{ + struct sptlrpc_conf_tgt *conf_tgt, *conf_tgt_next; + + sptlrpc_rule_set_free(&conf->sc_rset); + + list_for_each_entry_safe(conf_tgt, conf_tgt_next, + &conf->sc_tgts, sct_list) { + sptlrpc_rule_set_free(&conf_tgt->sct_rset); + list_del(&conf_tgt->sct_list); + OBD_FREE_PTR(conf_tgt); + } + LASSERT(list_empty(&conf->sc_tgts)); + + conf->sc_updated = 0; + conf->sc_local = 0; +} + +static void sptlrpc_conf_free(struct sptlrpc_conf *conf) +{ + CDEBUG(D_SEC, "free sptlrpc conf %s\n", conf->sc_fsname); + + sptlrpc_conf_free_rsets(conf); + list_del(&conf->sc_list); + OBD_FREE_PTR(conf); +} + +static +struct sptlrpc_conf_tgt *sptlrpc_conf_get_tgt(struct sptlrpc_conf *conf, + const char *name, + int create) +{ + struct sptlrpc_conf_tgt *conf_tgt; + + list_for_each_entry(conf_tgt, &conf->sc_tgts, sct_list) { + if (strcmp(conf_tgt->sct_name, name) == 0) + return conf_tgt; + } + + if (!create) + return NULL; + + OBD_ALLOC_PTR(conf_tgt); + if (conf_tgt) { + strlcpy(conf_tgt->sct_name, name, sizeof(conf_tgt->sct_name)); + sptlrpc_rule_set_init(&conf_tgt->sct_rset); + list_add(&conf_tgt->sct_list, 
&conf->sc_tgts); + } + + return conf_tgt; +} + +static +struct sptlrpc_conf *sptlrpc_conf_get(const char *fsname, + int create) +{ + struct sptlrpc_conf *conf; + + list_for_each_entry(conf, &sptlrpc_confs, sc_list) { + if (strcmp(conf->sc_fsname, fsname) == 0) + return conf; + } + + if (!create) + return NULL; + + OBD_ALLOC_PTR(conf); + if (conf == NULL) + return NULL; + + strcpy(conf->sc_fsname, fsname); + sptlrpc_rule_set_init(&conf->sc_rset); + INIT_LIST_HEAD(&conf->sc_tgts); + list_add(&conf->sc_list, &sptlrpc_confs); + + CDEBUG(D_SEC, "create sptlrpc conf %s\n", conf->sc_fsname); + return conf; +} + +/** + * caller must hold conf_lock already. + */ +static int sptlrpc_conf_merge_rule(struct sptlrpc_conf *conf, + const char *target, + struct sptlrpc_rule *rule) +{ + struct sptlrpc_conf_tgt *conf_tgt; + struct sptlrpc_rule_set *rule_set; + + /* fsname == target means general rules for the whole fs */ + if (strcmp(conf->sc_fsname, target) == 0) { + rule_set = &conf->sc_rset; + } else { + conf_tgt = sptlrpc_conf_get_tgt(conf, target, 1); + if (conf_tgt) { + rule_set = &conf_tgt->sct_rset; + } else { + CERROR("out of memory, can't merge rule!\n"); + return -ENOMEM; + } + } + + return sptlrpc_rule_set_merge(rule_set, rule); +} + +/** + * process one LCFG_SPTLRPC_CONF record. if \a conf is NULL, we + * find one through the target name in the record inside conf_lock; + * otherwise means caller already hold conf_lock. + */ +static int __sptlrpc_process_config(struct lustre_cfg *lcfg, + struct sptlrpc_conf *conf) +{ + char *target, *param; + char fsname[MTI_NAME_MAXLEN]; + struct sptlrpc_rule rule; + int rc; + + target = lustre_cfg_string(lcfg, 1); + if (target == NULL) { + CERROR("missing target name\n"); + return -EINVAL; + } + + param = lustre_cfg_string(lcfg, 2); + if (param == NULL) { + CERROR("missing parameter\n"); + return -EINVAL; + } + + CDEBUG(D_SEC, "processing rule: %s.%s\n", target, param); + + /* parse rule to make sure the format is correct */ + if (strncmp(param, PARAM_SRPC_FLVR, sizeof(PARAM_SRPC_FLVR) - 1) != 0) { + CERROR("Invalid sptlrpc parameter: %s\n", param); + return -EINVAL; + } + param += sizeof(PARAM_SRPC_FLVR) - 1; + + rc = sptlrpc_parse_rule(param, &rule); + if (rc) + return -EINVAL; + + if (conf == NULL) { + target2fsname(target, fsname, sizeof(fsname)); + + mutex_lock(&sptlrpc_conf_lock); + conf = sptlrpc_conf_get(fsname, 0); + if (conf == NULL) { + CERROR("can't find conf\n"); + rc = -ENOMEM; + } else { + rc = sptlrpc_conf_merge_rule(conf, target, &rule); + } + mutex_unlock(&sptlrpc_conf_lock); + } else { + LASSERT(mutex_is_locked(&sptlrpc_conf_lock)); + rc = sptlrpc_conf_merge_rule(conf, target, &rule); + } + + if (rc == 0) + conf->sc_modified++; + + return rc; +} + +int sptlrpc_process_config(struct lustre_cfg *lcfg) +{ + return __sptlrpc_process_config(lcfg, NULL); +} +EXPORT_SYMBOL(sptlrpc_process_config); + +static int logname2fsname(const char *logname, char *buf, int buflen) +{ + char *ptr; + int len; + + ptr = strrchr(logname, '-'); + if (ptr == NULL || strcmp(ptr, "-sptlrpc")) { + CERROR("%s is not a sptlrpc config log\n", logname); + return -EINVAL; + } + + len = min((int) (ptr - logname), buflen - 1); + + memcpy(buf, logname, len); + buf[len] = '\0'; + return 0; +} + +void sptlrpc_conf_log_update_begin(const char *logname) +{ + struct sptlrpc_conf *conf; + char fsname[16]; + + if (logname2fsname(logname, fsname, sizeof(fsname))) + return; + + mutex_lock(&sptlrpc_conf_lock); + + conf = sptlrpc_conf_get(fsname, 0); + if (conf) { + if (conf->sc_local) { 
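+ /*
+ * rule sets were loaded from the target's local copy; drop them
+ * here so the incoming MGS config log can repopulate them.
+ */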
+ LASSERT(conf->sc_updated == 0); + sptlrpc_conf_free_rsets(conf); + } + conf->sc_modified = 0; + } + + mutex_unlock(&sptlrpc_conf_lock); +} +EXPORT_SYMBOL(sptlrpc_conf_log_update_begin); + +/** + * mark a config log has been updated + */ +void sptlrpc_conf_log_update_end(const char *logname) +{ + struct sptlrpc_conf *conf; + char fsname[16]; + + if (logname2fsname(logname, fsname, sizeof(fsname))) + return; + + mutex_lock(&sptlrpc_conf_lock); + + conf = sptlrpc_conf_get(fsname, 0); + if (conf) { + /* + * if original state is not updated, make sure the + * modified counter > 0 to enforce updating local copy. + */ + if (conf->sc_updated == 0) + conf->sc_modified++; + + conf->sc_updated = 1; + } + + mutex_unlock(&sptlrpc_conf_lock); +} +EXPORT_SYMBOL(sptlrpc_conf_log_update_end); + +void sptlrpc_conf_log_start(const char *logname) +{ + char fsname[16]; + + if (logname2fsname(logname, fsname, sizeof(fsname))) + return; + + mutex_lock(&sptlrpc_conf_lock); + sptlrpc_conf_get(fsname, 1); + mutex_unlock(&sptlrpc_conf_lock); +} +EXPORT_SYMBOL(sptlrpc_conf_log_start); + +void sptlrpc_conf_log_stop(const char *logname) +{ + struct sptlrpc_conf *conf; + char fsname[16]; + + if (logname2fsname(logname, fsname, sizeof(fsname))) + return; + + mutex_lock(&sptlrpc_conf_lock); + conf = sptlrpc_conf_get(fsname, 0); + if (conf) + sptlrpc_conf_free(conf); + mutex_unlock(&sptlrpc_conf_lock); +} +EXPORT_SYMBOL(sptlrpc_conf_log_stop); + +static inline void flavor_set_flags(struct sptlrpc_flavor *sf, + enum lustre_sec_part from, + enum lustre_sec_part to, + unsigned int fl_udesc) +{ + /* + * null flavor doesn't need to set any flavor, and in fact + * we'd better not do that because everybody share a single sec. + */ + if (sf->sf_rpc == SPTLRPC_FLVR_NULL) + return; + + if (from == LUSTRE_SP_MDT) { + /* MDT->MDT; MDT->OST */ + sf->sf_flags |= PTLRPC_SEC_FL_ROOTONLY; + } else if (from == LUSTRE_SP_CLI && to == LUSTRE_SP_OST) { + /* CLI->OST */ + sf->sf_flags |= PTLRPC_SEC_FL_ROOTONLY | PTLRPC_SEC_FL_BULK; + } else if (from == LUSTRE_SP_CLI && to == LUSTRE_SP_MDT) { + /* CLI->MDT */ + if (fl_udesc && sf->sf_rpc != SPTLRPC_FLVR_NULL) + sf->sf_flags |= PTLRPC_SEC_FL_UDESC; + } +} + +void sptlrpc_conf_choose_flavor(enum lustre_sec_part from, + enum lustre_sec_part to, + struct obd_uuid *target, + lnet_nid_t nid, + struct sptlrpc_flavor *sf) +{ + struct sptlrpc_conf *conf; + struct sptlrpc_conf_tgt *conf_tgt; + char name[MTI_NAME_MAXLEN]; + int len, rc = 0; + + target2fsname(target->uuid, name, sizeof(name)); + + mutex_lock(&sptlrpc_conf_lock); + + conf = sptlrpc_conf_get(name, 0); + if (conf == NULL) + goto out; + + /* convert uuid name (supposed end with _UUID) to target name */ + len = strlen(target->uuid); + LASSERT(len > 5); + memcpy(name, target->uuid, len - 5); + name[len - 5] = '\0'; + + conf_tgt = sptlrpc_conf_get_tgt(conf, name, 0); + if (conf_tgt) { + rc = sptlrpc_rule_set_choose(&conf_tgt->sct_rset, + from, to, nid, sf); + if (rc) + goto out; + } + + rc = sptlrpc_rule_set_choose(&conf->sc_rset, from, to, nid, sf); +out: + mutex_unlock(&sptlrpc_conf_lock); + + if (rc == 0) + get_default_flavor(sf); + + flavor_set_flags(sf, from, to, 1); +} + +/** + * called by target devices, determine the expected flavor from + * certain peer (from, nid). 
+ */ +void sptlrpc_target_choose_flavor(struct sptlrpc_rule_set *rset, + enum lustre_sec_part from, + lnet_nid_t nid, + struct sptlrpc_flavor *sf) +{ + if (sptlrpc_rule_set_choose(rset, from, LUSTRE_SP_ANY, nid, sf) == 0) + get_default_flavor(sf); +} +EXPORT_SYMBOL(sptlrpc_target_choose_flavor); + +#define SEC_ADAPT_DELAY (10) + +/** + * called by client devices, notify the sptlrpc config has changed and + * do import_sec_adapt later. + */ +void sptlrpc_conf_client_adapt(struct obd_device *obd) +{ + struct obd_import *imp; + + LASSERT(strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) == 0 || + strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) == 0); + CDEBUG(D_SEC, "obd %s\n", obd->u.cli.cl_target_uuid.uuid); + + /* serialize with connect/disconnect import */ + down_read(&obd->u.cli.cl_sem); + + imp = obd->u.cli.cl_import; + if (imp) { + spin_lock(&imp->imp_lock); + if (imp->imp_sec) + imp->imp_sec_expire = get_seconds() + + SEC_ADAPT_DELAY; + spin_unlock(&imp->imp_lock); + } + + up_read(&obd->u.cli.cl_sem); +} +EXPORT_SYMBOL(sptlrpc_conf_client_adapt); + +int sptlrpc_conf_init(void) +{ + mutex_init(&sptlrpc_conf_lock); + return 0; +} + +void sptlrpc_conf_fini(void) +{ + struct sptlrpc_conf *conf, *conf_next; + + mutex_lock(&sptlrpc_conf_lock); + list_for_each_entry_safe(conf, conf_next, &sptlrpc_confs, sc_list) { + sptlrpc_conf_free(conf); + } + LASSERT(list_empty(&sptlrpc_confs)); + mutex_unlock(&sptlrpc_conf_lock); +} diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/sec_gc.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/sec_gc.c new file mode 100644 index 000000000..81de68edb --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/sec_gc.c @@ -0,0 +1,252 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/ptlrpc/sec_gc.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include "../../include/linux/libcfs/libcfs.h" + +#include "../include/obd_support.h" +#include "../include/obd_class.h" +#include "../include/lustre_net.h" +#include "../include/lustre_sec.h" + +#include "ptlrpc_internal.h" + +#define SEC_GC_INTERVAL (30 * 60) + + +static struct mutex sec_gc_mutex; +static LIST_HEAD(sec_gc_list); +static spinlock_t sec_gc_list_lock; + +static LIST_HEAD(sec_gc_ctx_list); +static spinlock_t sec_gc_ctx_list_lock; + +static struct ptlrpc_thread sec_gc_thread; +static atomic_t sec_gc_wait_del = ATOMIC_INIT(0); + + +void sptlrpc_gc_add_sec(struct ptlrpc_sec *sec) +{ + LASSERT(sec->ps_policy->sp_cops->gc_ctx); + LASSERT(sec->ps_gc_interval > 0); + LASSERT(list_empty(&sec->ps_gc_list)); + + sec->ps_gc_next = get_seconds() + sec->ps_gc_interval; + + spin_lock(&sec_gc_list_lock); + list_add_tail(&sec_gc_list, &sec->ps_gc_list); + spin_unlock(&sec_gc_list_lock); + + CDEBUG(D_SEC, "added sec %p(%s)\n", sec, sec->ps_policy->sp_name); +} +EXPORT_SYMBOL(sptlrpc_gc_add_sec); + +void sptlrpc_gc_del_sec(struct ptlrpc_sec *sec) +{ + if (list_empty(&sec->ps_gc_list)) + return; + + might_sleep(); + + /* signal before list_del to make iteration in gc thread safe */ + atomic_inc(&sec_gc_wait_del); + + spin_lock(&sec_gc_list_lock); + list_del_init(&sec->ps_gc_list); + spin_unlock(&sec_gc_list_lock); + + /* barrier */ + mutex_lock(&sec_gc_mutex); + mutex_unlock(&sec_gc_mutex); + + atomic_dec(&sec_gc_wait_del); + + CDEBUG(D_SEC, "del sec %p(%s)\n", sec, sec->ps_policy->sp_name); +} +EXPORT_SYMBOL(sptlrpc_gc_del_sec); + +void sptlrpc_gc_add_ctx(struct ptlrpc_cli_ctx *ctx) +{ + LASSERT(list_empty(&ctx->cc_gc_chain)); + + CDEBUG(D_SEC, "hand over ctx %p(%u->%s)\n", + ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec)); + spin_lock(&sec_gc_ctx_list_lock); + list_add(&ctx->cc_gc_chain, &sec_gc_ctx_list); + spin_unlock(&sec_gc_ctx_list_lock); + + thread_add_flags(&sec_gc_thread, SVC_SIGNAL); + wake_up(&sec_gc_thread.t_ctl_waitq); +} +EXPORT_SYMBOL(sptlrpc_gc_add_ctx); + +static void sec_process_ctx_list(void) +{ + struct ptlrpc_cli_ctx *ctx; + + spin_lock(&sec_gc_ctx_list_lock); + + while (!list_empty(&sec_gc_ctx_list)) { + ctx = list_entry(sec_gc_ctx_list.next, + struct ptlrpc_cli_ctx, cc_gc_chain); + list_del_init(&ctx->cc_gc_chain); + spin_unlock(&sec_gc_ctx_list_lock); + + LASSERT(ctx->cc_sec); + LASSERT(atomic_read(&ctx->cc_refcount) == 1); + CDEBUG(D_SEC, "gc pick up ctx %p(%u->%s)\n", + ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec)); + sptlrpc_cli_ctx_put(ctx, 1); + + spin_lock(&sec_gc_ctx_list_lock); + } + + spin_unlock(&sec_gc_ctx_list_lock); +} + +static void sec_do_gc(struct ptlrpc_sec *sec) +{ + LASSERT(sec->ps_policy->sp_cops->gc_ctx); + + if (unlikely(sec->ps_gc_next == 0)) { + CDEBUG(D_SEC, "sec %p(%s) has 0 gc time\n", + sec, sec->ps_policy->sp_name); + return; + } + + CDEBUG(D_SEC, "check on sec %p(%s)\n", sec, sec->ps_policy->sp_name); + + if (cfs_time_after(sec->ps_gc_next, get_seconds())) + return; + + sec->ps_policy->sp_cops->gc_ctx(sec); + sec->ps_gc_next = get_seconds() + sec->ps_gc_interval; +} + +static int sec_gc_main(void *arg) +{ + struct ptlrpc_thread *thread = (struct ptlrpc_thread *) arg; + struct l_wait_info lwi; + + unshare_fs_struct(); + + /* Record that the thread is running */ + thread_set_flags(thread, SVC_RUNNING); + wake_up(&thread->t_ctl_waitq); + + while (1) { + struct ptlrpc_sec *sec; + + thread_clear_flags(thread, SVC_SIGNAL); + 
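+ /*
+ * release contexts handed over by sptlrpc_gc_add_ctx(): this
+ * thread holds their last reference, so they are destroyed here
+ * instead of in the caller's context.
+ */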
sec_process_ctx_list(); +again: + /* go through sec list do gc. + * FIXME here we iterate through the whole list each time which + * is not optimal. we perhaps want to use balanced binary tree + * to trace each sec as order of expiry time. + * another issue here is we wakeup as fixed interval instead of + * according to each sec's expiry time */ + mutex_lock(&sec_gc_mutex); + list_for_each_entry(sec, &sec_gc_list, ps_gc_list) { + /* if someone is waiting to be deleted, let it + * proceed as soon as possible. */ + if (atomic_read(&sec_gc_wait_del)) { + CDEBUG(D_SEC, "deletion pending, start over\n"); + mutex_unlock(&sec_gc_mutex); + goto again; + } + + sec_do_gc(sec); + } + mutex_unlock(&sec_gc_mutex); + + /* check ctx list again before sleep */ + sec_process_ctx_list(); + + lwi = LWI_TIMEOUT(SEC_GC_INTERVAL * HZ, NULL, NULL); + l_wait_event(thread->t_ctl_waitq, + thread_is_stopping(thread) || + thread_is_signal(thread), + &lwi); + + if (thread_test_and_clear_flags(thread, SVC_STOPPING)) + break; + } + + thread_set_flags(thread, SVC_STOPPED); + wake_up(&thread->t_ctl_waitq); + return 0; +} + +int sptlrpc_gc_init(void) +{ + struct l_wait_info lwi = { 0 }; + struct task_struct *task; + + mutex_init(&sec_gc_mutex); + spin_lock_init(&sec_gc_list_lock); + spin_lock_init(&sec_gc_ctx_list_lock); + + /* initialize thread control */ + memset(&sec_gc_thread, 0, sizeof(sec_gc_thread)); + init_waitqueue_head(&sec_gc_thread.t_ctl_waitq); + + task = kthread_run(sec_gc_main, &sec_gc_thread, "sptlrpc_gc"); + if (IS_ERR(task)) { + CERROR("can't start gc thread: %ld\n", PTR_ERR(task)); + return PTR_ERR(task); + } + + l_wait_event(sec_gc_thread.t_ctl_waitq, + thread_is_running(&sec_gc_thread), &lwi); + return 0; +} + +void sptlrpc_gc_fini(void) +{ + struct l_wait_info lwi = { 0 }; + + thread_set_flags(&sec_gc_thread, SVC_STOPPING); + wake_up(&sec_gc_thread.t_ctl_waitq); + + l_wait_event(sec_gc_thread.t_ctl_waitq, + thread_is_stopped(&sec_gc_thread), &lwi); +} diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/sec_lproc.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/sec_lproc.c new file mode 100644 index 000000000..0d08145a6 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/sec_lproc.c @@ -0,0 +1,199 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/ptlrpc/sec_lproc.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include "../../include/linux/libcfs/libcfs.h" +#include + +#include "../include/obd.h" +#include "../include/obd_class.h" +#include "../include/obd_support.h" +#include "../include/lustre_net.h" +#include "../include/lustre_import.h" +#include "../include/lustre_dlm.h" +#include "../include/lustre_sec.h" + +#include "ptlrpc_internal.h" + + +struct proc_dir_entry *sptlrpc_proc_root = NULL; +EXPORT_SYMBOL(sptlrpc_proc_root); + +static char *sec_flags2str(unsigned long flags, char *buf, int bufsize) +{ + buf[0] = '\0'; + + if (flags & PTLRPC_SEC_FL_REVERSE) + strlcat(buf, "reverse,", bufsize); + if (flags & PTLRPC_SEC_FL_ROOTONLY) + strlcat(buf, "rootonly,", bufsize); + if (flags & PTLRPC_SEC_FL_UDESC) + strlcat(buf, "udesc,", bufsize); + if (flags & PTLRPC_SEC_FL_BULK) + strlcat(buf, "bulk,", bufsize); + if (buf[0] == '\0') + strlcat(buf, "-,", bufsize); + + return buf; +} + +static int sptlrpc_info_lprocfs_seq_show(struct seq_file *seq, void *v) +{ + struct obd_device *dev = seq->private; + struct client_obd *cli = &dev->u.cli; + struct ptlrpc_sec *sec = NULL; + char str[32]; + + LASSERT(strcmp(dev->obd_type->typ_name, LUSTRE_OSC_NAME) == 0 || + strcmp(dev->obd_type->typ_name, LUSTRE_MDC_NAME) == 0 || + strcmp(dev->obd_type->typ_name, LUSTRE_MGC_NAME) == 0); + + if (cli->cl_import) + sec = sptlrpc_import_sec_ref(cli->cl_import); + if (sec == NULL) + goto out; + + sec_flags2str(sec->ps_flvr.sf_flags, str, sizeof(str)); + + seq_printf(seq, "rpc flavor: %s\n", + sptlrpc_flavor2name_base(sec->ps_flvr.sf_rpc)); + seq_printf(seq, "bulk flavor: %s\n", + sptlrpc_flavor2name_bulk(&sec->ps_flvr, str, sizeof(str))); + seq_printf(seq, "flags: %s\n", + sec_flags2str(sec->ps_flvr.sf_flags, str, sizeof(str))); + seq_printf(seq, "id: %d\n", sec->ps_id); + seq_printf(seq, "refcount: %d\n", + atomic_read(&sec->ps_refcount)); + seq_printf(seq, "nctx: %d\n", atomic_read(&sec->ps_nctx)); + seq_printf(seq, "gc internal %ld\n", sec->ps_gc_interval); + seq_printf(seq, "gc next %ld\n", + sec->ps_gc_interval ? 
+ sec->ps_gc_next - get_seconds() : 0); + + sptlrpc_sec_put(sec); +out: + return 0; +} +LPROC_SEQ_FOPS_RO(sptlrpc_info_lprocfs); + +static int sptlrpc_ctxs_lprocfs_seq_show(struct seq_file *seq, void *v) +{ + struct obd_device *dev = seq->private; + struct client_obd *cli = &dev->u.cli; + struct ptlrpc_sec *sec = NULL; + + LASSERT(strcmp(dev->obd_type->typ_name, LUSTRE_OSC_NAME) == 0 || + strcmp(dev->obd_type->typ_name, LUSTRE_MDC_NAME) == 0 || + strcmp(dev->obd_type->typ_name, LUSTRE_MGC_NAME) == 0); + + if (cli->cl_import) + sec = sptlrpc_import_sec_ref(cli->cl_import); + if (sec == NULL) + goto out; + + if (sec->ps_policy->sp_cops->display) + sec->ps_policy->sp_cops->display(sec, seq); + + sptlrpc_sec_put(sec); +out: + return 0; +} +LPROC_SEQ_FOPS_RO(sptlrpc_ctxs_lprocfs); + +int sptlrpc_lprocfs_cliobd_attach(struct obd_device *dev) +{ + int rc; + + if (strcmp(dev->obd_type->typ_name, LUSTRE_OSC_NAME) != 0 && + strcmp(dev->obd_type->typ_name, LUSTRE_MDC_NAME) != 0 && + strcmp(dev->obd_type->typ_name, LUSTRE_MGC_NAME) != 0) { + CERROR("can't register lproc for obd type %s\n", + dev->obd_type->typ_name); + return -EINVAL; + } + + rc = lprocfs_obd_seq_create(dev, "srpc_info", 0444, + &sptlrpc_info_lprocfs_fops, dev); + if (rc) { + CERROR("create proc entry srpc_info for %s: %d\n", + dev->obd_name, rc); + return rc; + } + + rc = lprocfs_obd_seq_create(dev, "srpc_contexts", 0444, + &sptlrpc_ctxs_lprocfs_fops, dev); + if (rc) { + CERROR("create proc entry srpc_contexts for %s: %d\n", + dev->obd_name, rc); + return rc; + } + + return 0; +} +EXPORT_SYMBOL(sptlrpc_lprocfs_cliobd_attach); + +LPROC_SEQ_FOPS_RO(sptlrpc_proc_enc_pool); +static struct lprocfs_vars sptlrpc_lprocfs_vars[] = { + { "encrypt_page_pools", &sptlrpc_proc_enc_pool_fops }, + { NULL } +}; + +int sptlrpc_lproc_init(void) +{ + int rc; + + LASSERT(sptlrpc_proc_root == NULL); + + sptlrpc_proc_root = lprocfs_register("sptlrpc", proc_lustre_root, + sptlrpc_lprocfs_vars, NULL); + if (IS_ERR(sptlrpc_proc_root)) { + rc = PTR_ERR(sptlrpc_proc_root); + sptlrpc_proc_root = NULL; + return rc; + } + return 0; +} + +void sptlrpc_lproc_fini(void) +{ + if (sptlrpc_proc_root) { + lprocfs_remove(&sptlrpc_proc_root); + sptlrpc_proc_root = NULL; + } +} diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/sec_null.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/sec_null.c new file mode 100644 index 000000000..4e132435b --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/sec_null.c @@ -0,0 +1,458 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. 
+ * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/sec_null.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC + + +#include "../include/obd_support.h" +#include "../include/obd_cksum.h" +#include "../include/obd_class.h" +#include "../include/lustre_net.h" +#include "../include/lustre_sec.h" + +static struct ptlrpc_sec_policy null_policy; +static struct ptlrpc_sec null_sec; +static struct ptlrpc_cli_ctx null_cli_ctx; +static struct ptlrpc_svc_ctx null_svc_ctx; + +/* + * we can temporarily use the topmost 8-bits of lm_secflvr to identify + * the source sec part. + */ +static inline +void null_encode_sec_part(struct lustre_msg *msg, enum lustre_sec_part sp) +{ + msg->lm_secflvr |= (((__u32) sp) & 0xFF) << 24; +} + +static inline +enum lustre_sec_part null_decode_sec_part(struct lustre_msg *msg) +{ + return (msg->lm_secflvr >> 24) & 0xFF; +} + +static int null_ctx_refresh(struct ptlrpc_cli_ctx *ctx) +{ + /* should never reach here */ + LBUG(); + return 0; +} + +static +int null_ctx_sign(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req) +{ + req->rq_reqbuf->lm_secflvr = SPTLRPC_FLVR_NULL; + + if (!req->rq_import->imp_dlm_fake) { + struct obd_device *obd = req->rq_import->imp_obd; + null_encode_sec_part(req->rq_reqbuf, + obd->u.cli.cl_sp_me); + } + req->rq_reqdata_len = req->rq_reqlen; + return 0; +} + +static +int null_ctx_verify(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req) +{ + __u32 cksums, cksumc; + + LASSERT(req->rq_repdata); + + req->rq_repmsg = req->rq_repdata; + req->rq_replen = req->rq_repdata_len; + + if (req->rq_early) { + cksums = lustre_msg_get_cksum(req->rq_repdata); + cksumc = lustre_msg_calc_cksum(req->rq_repmsg); + if (cksumc != cksums) { + CDEBUG(D_SEC, + "early reply checksum mismatch: %08x != %08x\n", + cksumc, cksums); + return -EINVAL; + } + } + + return 0; +} + +static +struct ptlrpc_sec *null_create_sec(struct obd_import *imp, + struct ptlrpc_svc_ctx *svc_ctx, + struct sptlrpc_flavor *sf) +{ + LASSERT(SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_NULL); + + /* general layer has take a module reference for us, because we never + * really destroy the sec, simply release the reference here. 
+ */ + sptlrpc_policy_put(&null_policy); + return &null_sec; +} + +static +void null_destroy_sec(struct ptlrpc_sec *sec) +{ + LASSERT(sec == &null_sec); +} + +static +struct ptlrpc_cli_ctx *null_lookup_ctx(struct ptlrpc_sec *sec, + struct vfs_cred *vcred, + int create, int remove_dead) +{ + atomic_inc(&null_cli_ctx.cc_refcount); + return &null_cli_ctx; +} + +static +int null_flush_ctx_cache(struct ptlrpc_sec *sec, + uid_t uid, + int grace, int force) +{ + return 0; +} + +static +int null_alloc_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int msgsize) +{ + if (!req->rq_reqbuf) { + int alloc_size = size_roundup_power2(msgsize); + + LASSERT(!req->rq_pool); + OBD_ALLOC_LARGE(req->rq_reqbuf, alloc_size); + if (!req->rq_reqbuf) + return -ENOMEM; + + req->rq_reqbuf_len = alloc_size; + } else { + LASSERT(req->rq_pool); + LASSERT(req->rq_reqbuf_len >= msgsize); + memset(req->rq_reqbuf, 0, msgsize); + } + + req->rq_reqmsg = req->rq_reqbuf; + return 0; +} + +static +void null_free_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req) +{ + if (!req->rq_pool) { + LASSERTF(req->rq_reqmsg == req->rq_reqbuf, + "req %p: reqmsg %p is not reqbuf %p in null sec\n", + req, req->rq_reqmsg, req->rq_reqbuf); + LASSERTF(req->rq_reqbuf_len >= req->rq_reqlen, + "req %p: reqlen %d should smaller than buflen %d\n", + req, req->rq_reqlen, req->rq_reqbuf_len); + + OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len); + req->rq_reqbuf = NULL; + req->rq_reqbuf_len = 0; + } +} + +static +int null_alloc_repbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int msgsize) +{ + /* add space for early replied */ + msgsize += lustre_msg_early_size(); + + msgsize = size_roundup_power2(msgsize); + + OBD_ALLOC_LARGE(req->rq_repbuf, msgsize); + if (!req->rq_repbuf) + return -ENOMEM; + + req->rq_repbuf_len = msgsize; + return 0; +} + +static +void null_free_repbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req) +{ + LASSERT(req->rq_repbuf); + + OBD_FREE_LARGE(req->rq_repbuf, req->rq_repbuf_len); + req->rq_repbuf = NULL; + req->rq_repbuf_len = 0; +} + +static +int null_enlarge_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int segment, int newsize) +{ + struct lustre_msg *newbuf; + struct lustre_msg *oldbuf = req->rq_reqmsg; + int oldsize, newmsg_size, alloc_size; + + LASSERT(req->rq_reqbuf); + LASSERT(req->rq_reqbuf == req->rq_reqmsg); + LASSERT(req->rq_reqbuf_len >= req->rq_reqlen); + LASSERT(req->rq_reqlen == lustre_packed_msg_size(oldbuf)); + + /* compute new message size */ + oldsize = req->rq_reqbuf->lm_buflens[segment]; + req->rq_reqbuf->lm_buflens[segment] = newsize; + newmsg_size = lustre_packed_msg_size(oldbuf); + req->rq_reqbuf->lm_buflens[segment] = oldsize; + + /* request from pool should always have enough buffer */ + LASSERT(!req->rq_pool || req->rq_reqbuf_len >= newmsg_size); + + if (req->rq_reqbuf_len < newmsg_size) { + alloc_size = size_roundup_power2(newmsg_size); + + OBD_ALLOC_LARGE(newbuf, alloc_size); + if (newbuf == NULL) + return -ENOMEM; + + /* Must lock this, so that otherwise unprotected change of + * rq_reqmsg is not racing with parallel processing of + * imp_replay_list traversing threads. 
See LU-3333 + * This is a bandaid at best, we really need to deal with this + * in request enlarging code before unpacking that's already + * there */ + if (req->rq_import) + spin_lock(&req->rq_import->imp_lock); + memcpy(newbuf, req->rq_reqbuf, req->rq_reqlen); + + OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len); + req->rq_reqbuf = req->rq_reqmsg = newbuf; + req->rq_reqbuf_len = alloc_size; + + if (req->rq_import) + spin_unlock(&req->rq_import->imp_lock); + } + + _sptlrpc_enlarge_msg_inplace(req->rq_reqmsg, segment, newsize); + req->rq_reqlen = newmsg_size; + + return 0; +} + +static struct ptlrpc_svc_ctx null_svc_ctx = { + .sc_refcount = ATOMIC_INIT(1), + .sc_policy = &null_policy, +}; + +static +int null_accept(struct ptlrpc_request *req) +{ + LASSERT(SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) == + SPTLRPC_POLICY_NULL); + + if (req->rq_flvr.sf_rpc != SPTLRPC_FLVR_NULL) { + CERROR("Invalid rpc flavor 0x%x\n", req->rq_flvr.sf_rpc); + return SECSVC_DROP; + } + + req->rq_sp_from = null_decode_sec_part(req->rq_reqbuf); + + req->rq_reqmsg = req->rq_reqbuf; + req->rq_reqlen = req->rq_reqdata_len; + + req->rq_svc_ctx = &null_svc_ctx; + atomic_inc(&req->rq_svc_ctx->sc_refcount); + + return SECSVC_OK; +} + +static +int null_alloc_rs(struct ptlrpc_request *req, int msgsize) +{ + struct ptlrpc_reply_state *rs; + int rs_size = sizeof(*rs) + msgsize; + + LASSERT(msgsize % 8 == 0); + + rs = req->rq_reply_state; + + if (rs) { + /* pre-allocated */ + LASSERT(rs->rs_size >= rs_size); + } else { + OBD_ALLOC_LARGE(rs, rs_size); + if (rs == NULL) + return -ENOMEM; + + rs->rs_size = rs_size; + } + + rs->rs_svc_ctx = req->rq_svc_ctx; + atomic_inc(&req->rq_svc_ctx->sc_refcount); + + rs->rs_repbuf = (struct lustre_msg *) (rs + 1); + rs->rs_repbuf_len = rs_size - sizeof(*rs); + rs->rs_msg = rs->rs_repbuf; + + req->rq_reply_state = rs; + return 0; +} + +static +void null_free_rs(struct ptlrpc_reply_state *rs) +{ + LASSERT_ATOMIC_GT(&rs->rs_svc_ctx->sc_refcount, 1); + atomic_dec(&rs->rs_svc_ctx->sc_refcount); + + if (!rs->rs_prealloc) + OBD_FREE_LARGE(rs, rs->rs_size); +} + +static +int null_authorize(struct ptlrpc_request *req) +{ + struct ptlrpc_reply_state *rs = req->rq_reply_state; + + LASSERT(rs); + + rs->rs_repbuf->lm_secflvr = SPTLRPC_FLVR_NULL; + rs->rs_repdata_len = req->rq_replen; + + if (likely(req->rq_packed_final)) { + if (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) + req->rq_reply_off = lustre_msg_early_size(); + else + req->rq_reply_off = 0; + } else { + __u32 cksum; + + cksum = lustre_msg_calc_cksum(rs->rs_repbuf); + lustre_msg_set_cksum(rs->rs_repbuf, cksum); + req->rq_reply_off = 0; + } + + return 0; +} + +static struct ptlrpc_ctx_ops null_ctx_ops = { + .refresh = null_ctx_refresh, + .sign = null_ctx_sign, + .verify = null_ctx_verify, +}; + +static struct ptlrpc_sec_cops null_sec_cops = { + .create_sec = null_create_sec, + .destroy_sec = null_destroy_sec, + .lookup_ctx = null_lookup_ctx, + .flush_ctx_cache = null_flush_ctx_cache, + .alloc_reqbuf = null_alloc_reqbuf, + .alloc_repbuf = null_alloc_repbuf, + .free_reqbuf = null_free_reqbuf, + .free_repbuf = null_free_repbuf, + .enlarge_reqbuf = null_enlarge_reqbuf, +}; + +static struct ptlrpc_sec_sops null_sec_sops = { + .accept = null_accept, + .alloc_rs = null_alloc_rs, + .authorize = null_authorize, + .free_rs = null_free_rs, +}; + +static struct ptlrpc_sec_policy null_policy = { + .sp_owner = THIS_MODULE, + .sp_name = "sec.null", + .sp_policy = SPTLRPC_POLICY_NULL, + .sp_cops = &null_sec_cops, + .sp_sops = &null_sec_sops, +}; + 
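+/*
+ * everything in the null policy is a static singleton: null_sec keeps a
+ * permanent reference (always busy) and the single client context is
+ * marked CACHED | ETERNAL | UPTODATE, so null_lookup_ctx() never has to
+ * create or refresh anything, it only bumps the refcount.
+ *
+ * as an illustration, a flavor rule roughly like
+ * "<fsname>.srpc.flavor.default=null" parsed by sptlrpc_parse_rule()
+ * resolves to SPTLRPC_FLVR_NULL and selects this policy.
+ */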
+static void null_init_internal(void) +{ + static HLIST_HEAD(__list); + + null_sec.ps_policy = &null_policy; + atomic_set(&null_sec.ps_refcount, 1); /* always busy */ + null_sec.ps_id = -1; + null_sec.ps_import = NULL; + null_sec.ps_flvr.sf_rpc = SPTLRPC_FLVR_NULL; + null_sec.ps_flvr.sf_flags = 0; + null_sec.ps_part = LUSTRE_SP_ANY; + null_sec.ps_dying = 0; + spin_lock_init(&null_sec.ps_lock); + atomic_set(&null_sec.ps_nctx, 1); /* for "null_cli_ctx" */ + INIT_LIST_HEAD(&null_sec.ps_gc_list); + null_sec.ps_gc_interval = 0; + null_sec.ps_gc_next = 0; + + hlist_add_head(&null_cli_ctx.cc_cache, &__list); + atomic_set(&null_cli_ctx.cc_refcount, 1); /* for hash */ + null_cli_ctx.cc_sec = &null_sec; + null_cli_ctx.cc_ops = &null_ctx_ops; + null_cli_ctx.cc_expire = 0; + null_cli_ctx.cc_flags = PTLRPC_CTX_CACHED | PTLRPC_CTX_ETERNAL | + PTLRPC_CTX_UPTODATE; + null_cli_ctx.cc_vcred.vc_uid = 0; + spin_lock_init(&null_cli_ctx.cc_lock); + INIT_LIST_HEAD(&null_cli_ctx.cc_req_list); + INIT_LIST_HEAD(&null_cli_ctx.cc_gc_chain); +} + +int sptlrpc_null_init(void) +{ + int rc; + + null_init_internal(); + + rc = sptlrpc_register_policy(&null_policy); + if (rc) + CERROR("failed to register %s: %d\n", null_policy.sp_name, rc); + + return rc; +} + +void sptlrpc_null_fini(void) +{ + int rc; + + rc = sptlrpc_unregister_policy(&null_policy); + if (rc) + CERROR("failed to unregister %s: %d\n", + null_policy.sp_name, rc); +} diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/sec_plain.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/sec_plain.c new file mode 100644 index 000000000..a79cd5301 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/sec_plain.c @@ -0,0 +1,1013 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/ptlrpc/sec_plain.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC + + +#include "../include/obd_support.h" +#include "../include/obd_cksum.h" +#include "../include/obd_class.h" +#include "../include/lustre_net.h" +#include "../include/lustre_sec.h" + +struct plain_sec { + struct ptlrpc_sec pls_base; + rwlock_t pls_lock; + struct ptlrpc_cli_ctx *pls_ctx; +}; + +static inline struct plain_sec *sec2plsec(struct ptlrpc_sec *sec) +{ + return container_of(sec, struct plain_sec, pls_base); +} + +static struct ptlrpc_sec_policy plain_policy; +static struct ptlrpc_ctx_ops plain_ctx_ops; +static struct ptlrpc_svc_ctx plain_svc_ctx; + +static unsigned int plain_at_offset; + +/* + * for simplicity, plain policy rpc use fixed layout. + */ +#define PLAIN_PACK_SEGMENTS (4) + +#define PLAIN_PACK_HDR_OFF (0) +#define PLAIN_PACK_MSG_OFF (1) +#define PLAIN_PACK_USER_OFF (2) +#define PLAIN_PACK_BULK_OFF (3) + +#define PLAIN_FL_USER (0x01) +#define PLAIN_FL_BULK (0x02) + +struct plain_header { + __u8 ph_ver; /* 0 */ + __u8 ph_flags; + __u8 ph_sp; /* source */ + __u8 ph_bulk_hash_alg; /* complete flavor desc */ + __u8 ph_pad[4]; +}; + +struct plain_bulk_token { + __u8 pbt_hash[8]; +}; + +#define PLAIN_BSD_SIZE \ + (sizeof(struct ptlrpc_bulk_sec_desc) + sizeof(struct plain_bulk_token)) + +/**************************************** + * bulk checksum helpers * + ****************************************/ + +static int plain_unpack_bsd(struct lustre_msg *msg, int swabbed) +{ + struct ptlrpc_bulk_sec_desc *bsd; + + if (bulk_sec_desc_unpack(msg, PLAIN_PACK_BULK_OFF, swabbed)) + return -EPROTO; + + bsd = lustre_msg_buf(msg, PLAIN_PACK_BULK_OFF, PLAIN_BSD_SIZE); + if (bsd == NULL) { + CERROR("bulk sec desc has short size %d\n", + lustre_msg_buflen(msg, PLAIN_PACK_BULK_OFF)); + return -EPROTO; + } + + if (bsd->bsd_svc != SPTLRPC_BULK_SVC_NULL && + bsd->bsd_svc != SPTLRPC_BULK_SVC_INTG) { + CERROR("invalid bulk svc %u\n", bsd->bsd_svc); + return -EPROTO; + } + + return 0; +} + +static int plain_generate_bulk_csum(struct ptlrpc_bulk_desc *desc, + __u8 hash_alg, + struct plain_bulk_token *token) +{ + if (hash_alg == BULK_HASH_ALG_NULL) + return 0; + + memset(token->pbt_hash, 0, sizeof(token->pbt_hash)); + return sptlrpc_get_bulk_checksum(desc, hash_alg, token->pbt_hash, + sizeof(token->pbt_hash)); +} + +static int plain_verify_bulk_csum(struct ptlrpc_bulk_desc *desc, + __u8 hash_alg, + struct plain_bulk_token *tokenr) +{ + struct plain_bulk_token tokenv; + int rc; + + if (hash_alg == BULK_HASH_ALG_NULL) + return 0; + + memset(&tokenv.pbt_hash, 0, sizeof(tokenv.pbt_hash)); + rc = sptlrpc_get_bulk_checksum(desc, hash_alg, tokenv.pbt_hash, + sizeof(tokenv.pbt_hash)); + if (rc) + return rc; + + if (memcmp(tokenr->pbt_hash, tokenv.pbt_hash, sizeof(tokenr->pbt_hash))) + return -EACCES; + return 0; +} + +static void corrupt_bulk_data(struct ptlrpc_bulk_desc *desc) +{ + char *ptr; + unsigned int off, i; + + for (i = 0; i < desc->bd_iov_count; i++) { + if (desc->bd_iov[i].kiov_len == 0) + continue; + + ptr = kmap(desc->bd_iov[i].kiov_page); + off = desc->bd_iov[i].kiov_offset & ~CFS_PAGE_MASK; + ptr[off] ^= 0x1; + kunmap(desc->bd_iov[i].kiov_page); + return; + } +} + +/**************************************** + * cli_ctx apis * + ****************************************/ + +static +int plain_ctx_refresh(struct ptlrpc_cli_ctx *ctx) +{ + /* should never reach here */ + LBUG(); + return 0; +} + +static +int plain_ctx_validate(struct ptlrpc_cli_ctx *ctx) +{ + return 0; +} + +static +int 
plain_ctx_sign(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req) +{ + struct lustre_msg *msg = req->rq_reqbuf; + struct plain_header *phdr; + + msg->lm_secflvr = req->rq_flvr.sf_rpc; + + phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, 0); + phdr->ph_ver = 0; + phdr->ph_flags = 0; + phdr->ph_sp = ctx->cc_sec->ps_part; + phdr->ph_bulk_hash_alg = req->rq_flvr.u_bulk.hash.hash_alg; + + if (req->rq_pack_udesc) + phdr->ph_flags |= PLAIN_FL_USER; + if (req->rq_pack_bulk) + phdr->ph_flags |= PLAIN_FL_BULK; + + req->rq_reqdata_len = lustre_msg_size_v2(msg->lm_bufcount, + msg->lm_buflens); + return 0; +} + +static +int plain_ctx_verify(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req) +{ + struct lustre_msg *msg = req->rq_repdata; + struct plain_header *phdr; + __u32 cksum; + int swabbed; + + if (msg->lm_bufcount != PLAIN_PACK_SEGMENTS) { + CERROR("unexpected reply buf count %u\n", msg->lm_bufcount); + return -EPROTO; + } + + swabbed = ptlrpc_rep_need_swab(req); + + phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, sizeof(*phdr)); + if (phdr == NULL) { + CERROR("missing plain header\n"); + return -EPROTO; + } + + if (phdr->ph_ver != 0) { + CERROR("Invalid header version\n"); + return -EPROTO; + } + + /* expect no user desc in reply */ + if (phdr->ph_flags & PLAIN_FL_USER) { + CERROR("Unexpected udesc flag in reply\n"); + return -EPROTO; + } + + if (phdr->ph_bulk_hash_alg != req->rq_flvr.u_bulk.hash.hash_alg) { + CERROR("reply bulk flavor %u != %u\n", phdr->ph_bulk_hash_alg, + req->rq_flvr.u_bulk.hash.hash_alg); + return -EPROTO; + } + + if (unlikely(req->rq_early)) { + unsigned int hsize = 4; + + cfs_crypto_hash_digest(CFS_HASH_ALG_CRC32, + lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0), + lustre_msg_buflen(msg, PLAIN_PACK_MSG_OFF), + NULL, 0, (unsigned char *)&cksum, &hsize); + if (cksum != msg->lm_cksum) { + CDEBUG(D_SEC, + "early reply checksum mismatch: %08x != %08x\n", + cpu_to_le32(cksum), msg->lm_cksum); + return -EINVAL; + } + } else { + /* whether we sent with bulk or not, we expect the same + * in reply, except for early reply */ + if (!req->rq_early && + !equi(req->rq_pack_bulk == 1, + phdr->ph_flags & PLAIN_FL_BULK)) { + CERROR("%s bulk checksum in reply\n", + req->rq_pack_bulk ? 
"Missing" : "Unexpected"); + return -EPROTO; + } + + if (phdr->ph_flags & PLAIN_FL_BULK) { + if (plain_unpack_bsd(msg, swabbed)) + return -EPROTO; + } + } + + req->rq_repmsg = lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0); + req->rq_replen = lustre_msg_buflen(msg, PLAIN_PACK_MSG_OFF); + return 0; +} + +static +int plain_cli_wrap_bulk(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_bulk_sec_desc *bsd; + struct plain_bulk_token *token; + int rc; + + LASSERT(req->rq_pack_bulk); + LASSERT(req->rq_reqbuf->lm_bufcount == PLAIN_PACK_SEGMENTS); + + bsd = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_BULK_OFF, 0); + token = (struct plain_bulk_token *) bsd->bsd_data; + + bsd->bsd_version = 0; + bsd->bsd_flags = 0; + bsd->bsd_type = SPTLRPC_BULK_DEFAULT; + bsd->bsd_svc = SPTLRPC_FLVR_BULK_SVC(req->rq_flvr.sf_rpc); + + if (bsd->bsd_svc == SPTLRPC_BULK_SVC_NULL) + return 0; + + if (req->rq_bulk_read) + return 0; + + rc = plain_generate_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg, + token); + if (rc) { + CERROR("bulk write: failed to compute checksum: %d\n", rc); + } else { + /* + * for sending we only compute the wrong checksum instead + * of corrupting the data so it is still correct on a redo + */ + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND) && + req->rq_flvr.u_bulk.hash.hash_alg != BULK_HASH_ALG_NULL) + token->pbt_hash[0] ^= 0x1; + } + + return rc; +} + +static +int plain_cli_unwrap_bulk(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_bulk_sec_desc *bsdv; + struct plain_bulk_token *tokenv; + int rc; + int i, nob; + + LASSERT(req->rq_pack_bulk); + LASSERT(req->rq_reqbuf->lm_bufcount == PLAIN_PACK_SEGMENTS); + LASSERT(req->rq_repdata->lm_bufcount == PLAIN_PACK_SEGMENTS); + + bsdv = lustre_msg_buf(req->rq_repdata, PLAIN_PACK_BULK_OFF, 0); + tokenv = (struct plain_bulk_token *) bsdv->bsd_data; + + if (req->rq_bulk_write) { + if (bsdv->bsd_flags & BSD_FL_ERR) + return -EIO; + return 0; + } + + /* fix the actual data size */ + for (i = 0, nob = 0; i < desc->bd_iov_count; i++) { + if (desc->bd_iov[i].kiov_len + nob > desc->bd_nob_transferred) { + desc->bd_iov[i].kiov_len = + desc->bd_nob_transferred - nob; + } + nob += desc->bd_iov[i].kiov_len; + } + + rc = plain_verify_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg, + tokenv); + if (rc) + CERROR("bulk read: client verify failed: %d\n", rc); + + return rc; +} + +/**************************************** + * sec apis * + ****************************************/ + +static +struct ptlrpc_cli_ctx *plain_sec_install_ctx(struct plain_sec *plsec) +{ + struct ptlrpc_cli_ctx *ctx, *ctx_new; + + OBD_ALLOC_PTR(ctx_new); + + write_lock(&plsec->pls_lock); + + ctx = plsec->pls_ctx; + if (ctx) { + atomic_inc(&ctx->cc_refcount); + + if (ctx_new) + OBD_FREE_PTR(ctx_new); + } else if (ctx_new) { + ctx = ctx_new; + + atomic_set(&ctx->cc_refcount, 1); /* for cache */ + ctx->cc_sec = &plsec->pls_base; + ctx->cc_ops = &plain_ctx_ops; + ctx->cc_expire = 0; + ctx->cc_flags = PTLRPC_CTX_CACHED | PTLRPC_CTX_UPTODATE; + ctx->cc_vcred.vc_uid = 0; + spin_lock_init(&ctx->cc_lock); + INIT_LIST_HEAD(&ctx->cc_req_list); + INIT_LIST_HEAD(&ctx->cc_gc_chain); + + plsec->pls_ctx = ctx; + atomic_inc(&plsec->pls_base.ps_nctx); + atomic_inc(&plsec->pls_base.ps_refcount); + + atomic_inc(&ctx->cc_refcount); /* for caller */ + } + + write_unlock(&plsec->pls_lock); + + return ctx; +} + +static +void plain_destroy_sec(struct ptlrpc_sec *sec) +{ + struct plain_sec *plsec = 
sec2plsec(sec); + + LASSERT(sec->ps_policy == &plain_policy); + LASSERT(sec->ps_import); + LASSERT(atomic_read(&sec->ps_refcount) == 0); + LASSERT(atomic_read(&sec->ps_nctx) == 0); + LASSERT(plsec->pls_ctx == NULL); + + class_import_put(sec->ps_import); + + OBD_FREE_PTR(plsec); +} + +static +void plain_kill_sec(struct ptlrpc_sec *sec) +{ + sec->ps_dying = 1; +} + +static +struct ptlrpc_sec *plain_create_sec(struct obd_import *imp, + struct ptlrpc_svc_ctx *svc_ctx, + struct sptlrpc_flavor *sf) +{ + struct plain_sec *plsec; + struct ptlrpc_sec *sec; + struct ptlrpc_cli_ctx *ctx; + + LASSERT(SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_PLAIN); + + OBD_ALLOC_PTR(plsec); + if (plsec == NULL) + return NULL; + + /* + * initialize plain_sec + */ + rwlock_init(&plsec->pls_lock); + plsec->pls_ctx = NULL; + + sec = &plsec->pls_base; + sec->ps_policy = &plain_policy; + atomic_set(&sec->ps_refcount, 0); + atomic_set(&sec->ps_nctx, 0); + sec->ps_id = sptlrpc_get_next_secid(); + sec->ps_import = class_import_get(imp); + sec->ps_flvr = *sf; + spin_lock_init(&sec->ps_lock); + INIT_LIST_HEAD(&sec->ps_gc_list); + sec->ps_gc_interval = 0; + sec->ps_gc_next = 0; + + /* install ctx immediately if this is a reverse sec */ + if (svc_ctx) { + ctx = plain_sec_install_ctx(plsec); + if (ctx == NULL) { + plain_destroy_sec(sec); + return NULL; + } + sptlrpc_cli_ctx_put(ctx, 1); + } + + return sec; +} + +static +struct ptlrpc_cli_ctx *plain_lookup_ctx(struct ptlrpc_sec *sec, + struct vfs_cred *vcred, + int create, int remove_dead) +{ + struct plain_sec *plsec = sec2plsec(sec); + struct ptlrpc_cli_ctx *ctx; + + read_lock(&plsec->pls_lock); + ctx = plsec->pls_ctx; + if (ctx) + atomic_inc(&ctx->cc_refcount); + read_unlock(&plsec->pls_lock); + + if (unlikely(ctx == NULL)) + ctx = plain_sec_install_ctx(plsec); + + return ctx; +} + +static +void plain_release_ctx(struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx, int sync) +{ + LASSERT(atomic_read(&sec->ps_refcount) > 0); + LASSERT(atomic_read(&sec->ps_nctx) > 0); + LASSERT(atomic_read(&ctx->cc_refcount) == 0); + LASSERT(ctx->cc_sec == sec); + + OBD_FREE_PTR(ctx); + + atomic_dec(&sec->ps_nctx); + sptlrpc_sec_put(sec); +} + +static +int plain_flush_ctx_cache(struct ptlrpc_sec *sec, + uid_t uid, int grace, int force) +{ + struct plain_sec *plsec = sec2plsec(sec); + struct ptlrpc_cli_ctx *ctx; + + /* do nothing unless caller want to flush for 'all' */ + if (uid != -1) + return 0; + + write_lock(&plsec->pls_lock); + ctx = plsec->pls_ctx; + plsec->pls_ctx = NULL; + write_unlock(&plsec->pls_lock); + + if (ctx) + sptlrpc_cli_ctx_put(ctx, 1); + return 0; +} + +static +int plain_alloc_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int msgsize) +{ + __u32 buflens[PLAIN_PACK_SEGMENTS] = { 0, }; + int alloc_len; + + buflens[PLAIN_PACK_HDR_OFF] = sizeof(struct plain_header); + buflens[PLAIN_PACK_MSG_OFF] = msgsize; + + if (req->rq_pack_udesc) + buflens[PLAIN_PACK_USER_OFF] = sptlrpc_current_user_desc_size(); + + if (req->rq_pack_bulk) { + LASSERT(req->rq_bulk_read || req->rq_bulk_write); + buflens[PLAIN_PACK_BULK_OFF] = PLAIN_BSD_SIZE; + } + + alloc_len = lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens); + + if (!req->rq_reqbuf) { + LASSERT(!req->rq_pool); + + alloc_len = size_roundup_power2(alloc_len); + OBD_ALLOC_LARGE(req->rq_reqbuf, alloc_len); + if (!req->rq_reqbuf) + return -ENOMEM; + + req->rq_reqbuf_len = alloc_len; + } else { + LASSERT(req->rq_pool); + LASSERT(req->rq_reqbuf_len >= alloc_len); + memset(req->rq_reqbuf, 0, alloc_len); + } + + 
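+ /*
+ * lay out the fixed four plain segments: header, embedded message,
+ * optional user descriptor and optional bulk descriptor.
+ */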
lustre_init_msg_v2(req->rq_reqbuf, PLAIN_PACK_SEGMENTS, buflens, NULL); + req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_MSG_OFF, 0); + + if (req->rq_pack_udesc) + sptlrpc_pack_user_desc(req->rq_reqbuf, PLAIN_PACK_USER_OFF); + + return 0; +} + +static +void plain_free_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req) +{ + if (!req->rq_pool) { + OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len); + req->rq_reqbuf = NULL; + req->rq_reqbuf_len = 0; + } +} + +static +int plain_alloc_repbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int msgsize) +{ + __u32 buflens[PLAIN_PACK_SEGMENTS] = { 0, }; + int alloc_len; + + buflens[PLAIN_PACK_HDR_OFF] = sizeof(struct plain_header); + buflens[PLAIN_PACK_MSG_OFF] = msgsize; + + if (req->rq_pack_bulk) { + LASSERT(req->rq_bulk_read || req->rq_bulk_write); + buflens[PLAIN_PACK_BULK_OFF] = PLAIN_BSD_SIZE; + } + + alloc_len = lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens); + + /* add space for early reply */ + alloc_len += plain_at_offset; + + alloc_len = size_roundup_power2(alloc_len); + + OBD_ALLOC_LARGE(req->rq_repbuf, alloc_len); + if (!req->rq_repbuf) + return -ENOMEM; + + req->rq_repbuf_len = alloc_len; + return 0; +} + +static +void plain_free_repbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req) +{ + OBD_FREE_LARGE(req->rq_repbuf, req->rq_repbuf_len); + req->rq_repbuf = NULL; + req->rq_repbuf_len = 0; +} + +static +int plain_enlarge_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int segment, int newsize) +{ + struct lustre_msg *newbuf; + int oldsize; + int newmsg_size, newbuf_size; + + LASSERT(req->rq_reqbuf); + LASSERT(req->rq_reqbuf_len >= req->rq_reqlen); + LASSERT(lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_MSG_OFF, 0) == + req->rq_reqmsg); + + /* compute new embedded msg size. */ + oldsize = req->rq_reqmsg->lm_buflens[segment]; + req->rq_reqmsg->lm_buflens[segment] = newsize; + newmsg_size = lustre_msg_size_v2(req->rq_reqmsg->lm_bufcount, + req->rq_reqmsg->lm_buflens); + req->rq_reqmsg->lm_buflens[segment] = oldsize; + + /* compute new wrapper msg size. */ + oldsize = req->rq_reqbuf->lm_buflens[PLAIN_PACK_MSG_OFF]; + req->rq_reqbuf->lm_buflens[PLAIN_PACK_MSG_OFF] = newmsg_size; + newbuf_size = lustre_msg_size_v2(req->rq_reqbuf->lm_bufcount, + req->rq_reqbuf->lm_buflens); + req->rq_reqbuf->lm_buflens[PLAIN_PACK_MSG_OFF] = oldsize; + + /* request from pool should always have enough buffer */ + LASSERT(!req->rq_pool || req->rq_reqbuf_len >= newbuf_size); + + if (req->rq_reqbuf_len < newbuf_size) { + newbuf_size = size_roundup_power2(newbuf_size); + + OBD_ALLOC_LARGE(newbuf, newbuf_size); + if (newbuf == NULL) + return -ENOMEM; + + /* Must lock this, so that otherwise unprotected change of + * rq_reqmsg is not racing with parallel processing of + * imp_replay_list traversing threads. 
See LU-3333 + * This is a bandaid at best, we really need to deal with this + * in request enlarging code before unpacking that's already + * there */ + if (req->rq_import) + spin_lock(&req->rq_import->imp_lock); + + memcpy(newbuf, req->rq_reqbuf, req->rq_reqbuf_len); + + OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len); + req->rq_reqbuf = newbuf; + req->rq_reqbuf_len = newbuf_size; + req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf, + PLAIN_PACK_MSG_OFF, 0); + + if (req->rq_import) + spin_unlock(&req->rq_import->imp_lock); + } + + _sptlrpc_enlarge_msg_inplace(req->rq_reqbuf, PLAIN_PACK_MSG_OFF, + newmsg_size); + _sptlrpc_enlarge_msg_inplace(req->rq_reqmsg, segment, newsize); + + req->rq_reqlen = newmsg_size; + return 0; +} + +/**************************************** + * service apis * + ****************************************/ + +static struct ptlrpc_svc_ctx plain_svc_ctx = { + .sc_refcount = ATOMIC_INIT(1), + .sc_policy = &plain_policy, +}; + +static +int plain_accept(struct ptlrpc_request *req) +{ + struct lustre_msg *msg = req->rq_reqbuf; + struct plain_header *phdr; + int swabbed; + + LASSERT(SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) == + SPTLRPC_POLICY_PLAIN); + + if (SPTLRPC_FLVR_BASE(req->rq_flvr.sf_rpc) != + SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_PLAIN) || + SPTLRPC_FLVR_BULK_TYPE(req->rq_flvr.sf_rpc) != + SPTLRPC_FLVR_BULK_TYPE(SPTLRPC_FLVR_PLAIN)) { + CERROR("Invalid rpc flavor %x\n", req->rq_flvr.sf_rpc); + return SECSVC_DROP; + } + + if (msg->lm_bufcount < PLAIN_PACK_SEGMENTS) { + CERROR("unexpected request buf count %u\n", msg->lm_bufcount); + return SECSVC_DROP; + } + + swabbed = ptlrpc_req_need_swab(req); + + phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, sizeof(*phdr)); + if (phdr == NULL) { + CERROR("missing plain header\n"); + return -EPROTO; + } + + if (phdr->ph_ver != 0) { + CERROR("Invalid header version\n"); + return -EPROTO; + } + + if (phdr->ph_bulk_hash_alg >= BULK_HASH_ALG_MAX) { + CERROR("invalid hash algorithm: %u\n", phdr->ph_bulk_hash_alg); + return -EPROTO; + } + + req->rq_sp_from = phdr->ph_sp; + req->rq_flvr.u_bulk.hash.hash_alg = phdr->ph_bulk_hash_alg; + + if (phdr->ph_flags & PLAIN_FL_USER) { + if (sptlrpc_unpack_user_desc(msg, PLAIN_PACK_USER_OFF, + swabbed)) { + CERROR("Mal-formed user descriptor\n"); + return SECSVC_DROP; + } + + req->rq_pack_udesc = 1; + req->rq_user_desc = lustre_msg_buf(msg, PLAIN_PACK_USER_OFF, 0); + } + + if (phdr->ph_flags & PLAIN_FL_BULK) { + if (plain_unpack_bsd(msg, swabbed)) + return SECSVC_DROP; + + req->rq_pack_bulk = 1; + } + + req->rq_reqmsg = lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0); + req->rq_reqlen = msg->lm_buflens[PLAIN_PACK_MSG_OFF]; + + req->rq_svc_ctx = &plain_svc_ctx; + atomic_inc(&req->rq_svc_ctx->sc_refcount); + + return SECSVC_OK; +} + +static +int plain_alloc_rs(struct ptlrpc_request *req, int msgsize) +{ + struct ptlrpc_reply_state *rs; + __u32 buflens[PLAIN_PACK_SEGMENTS] = { 0, }; + int rs_size = sizeof(*rs); + + LASSERT(msgsize % 8 == 0); + + buflens[PLAIN_PACK_HDR_OFF] = sizeof(struct plain_header); + buflens[PLAIN_PACK_MSG_OFF] = msgsize; + + if (req->rq_pack_bulk && (req->rq_bulk_read || req->rq_bulk_write)) + buflens[PLAIN_PACK_BULK_OFF] = PLAIN_BSD_SIZE; + + rs_size += lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens); + + rs = req->rq_reply_state; + + if (rs) { + /* pre-allocated */ + LASSERT(rs->rs_size >= rs_size); + } else { + OBD_ALLOC_LARGE(rs, rs_size); + if (rs == NULL) + return -ENOMEM; + + rs->rs_size = rs_size; + } + + rs->rs_svc_ctx = req->rq_svc_ctx; + 
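+ /* the reply state pins the service context; plain_free_rs() drops it */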
atomic_inc(&req->rq_svc_ctx->sc_refcount); + rs->rs_repbuf = (struct lustre_msg *) (rs + 1); + rs->rs_repbuf_len = rs_size - sizeof(*rs); + + lustre_init_msg_v2(rs->rs_repbuf, PLAIN_PACK_SEGMENTS, buflens, NULL); + rs->rs_msg = lustre_msg_buf_v2(rs->rs_repbuf, PLAIN_PACK_MSG_OFF, 0); + + req->rq_reply_state = rs; + return 0; +} + +static +void plain_free_rs(struct ptlrpc_reply_state *rs) +{ + LASSERT(atomic_read(&rs->rs_svc_ctx->sc_refcount) > 1); + atomic_dec(&rs->rs_svc_ctx->sc_refcount); + + if (!rs->rs_prealloc) + OBD_FREE_LARGE(rs, rs->rs_size); +} + +static +int plain_authorize(struct ptlrpc_request *req) +{ + struct ptlrpc_reply_state *rs = req->rq_reply_state; + struct lustre_msg_v2 *msg = rs->rs_repbuf; + struct plain_header *phdr; + int len; + + LASSERT(rs); + LASSERT(msg); + + if (req->rq_replen != msg->lm_buflens[PLAIN_PACK_MSG_OFF]) + len = lustre_shrink_msg(msg, PLAIN_PACK_MSG_OFF, + req->rq_replen, 1); + else + len = lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens); + + msg->lm_secflvr = req->rq_flvr.sf_rpc; + + phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, 0); + phdr->ph_ver = 0; + phdr->ph_flags = 0; + phdr->ph_bulk_hash_alg = req->rq_flvr.u_bulk.hash.hash_alg; + + if (req->rq_pack_bulk) + phdr->ph_flags |= PLAIN_FL_BULK; + + rs->rs_repdata_len = len; + + if (likely(req->rq_packed_final)) { + if (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) + req->rq_reply_off = plain_at_offset; + else + req->rq_reply_off = 0; + } else { + unsigned int hsize = 4; + + cfs_crypto_hash_digest(CFS_HASH_ALG_CRC32, + lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0), + lustre_msg_buflen(msg, PLAIN_PACK_MSG_OFF), + NULL, 0, (unsigned char *)&msg->lm_cksum, &hsize); + req->rq_reply_off = 0; + } + + return 0; +} + +static +int plain_svc_unwrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_reply_state *rs = req->rq_reply_state; + struct ptlrpc_bulk_sec_desc *bsdr, *bsdv; + struct plain_bulk_token *tokenr; + int rc; + + LASSERT(req->rq_bulk_write); + LASSERT(req->rq_pack_bulk); + + bsdr = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_BULK_OFF, 0); + tokenr = (struct plain_bulk_token *) bsdr->bsd_data; + bsdv = lustre_msg_buf(rs->rs_repbuf, PLAIN_PACK_BULK_OFF, 0); + + bsdv->bsd_version = 0; + bsdv->bsd_type = SPTLRPC_BULK_DEFAULT; + bsdv->bsd_svc = bsdr->bsd_svc; + bsdv->bsd_flags = 0; + + if (bsdr->bsd_svc == SPTLRPC_BULK_SVC_NULL) + return 0; + + rc = plain_verify_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg, + tokenr); + if (rc) { + bsdv->bsd_flags |= BSD_FL_ERR; + CERROR("bulk write: server verify failed: %d\n", rc); + } + + return rc; +} + +static +int plain_svc_wrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_reply_state *rs = req->rq_reply_state; + struct ptlrpc_bulk_sec_desc *bsdr, *bsdv; + struct plain_bulk_token *tokenv; + int rc; + + LASSERT(req->rq_bulk_read); + LASSERT(req->rq_pack_bulk); + + bsdr = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_BULK_OFF, 0); + bsdv = lustre_msg_buf(rs->rs_repbuf, PLAIN_PACK_BULK_OFF, 0); + tokenv = (struct plain_bulk_token *) bsdv->bsd_data; + + bsdv->bsd_version = 0; + bsdv->bsd_type = SPTLRPC_BULK_DEFAULT; + bsdv->bsd_svc = bsdr->bsd_svc; + bsdv->bsd_flags = 0; + + if (bsdr->bsd_svc == SPTLRPC_BULK_SVC_NULL) + return 0; + + rc = plain_generate_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg, + tokenv); + if (rc) { + CERROR("bulk read: server failed to compute checksum: %d\n", + rc); + } else { + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_RECEIVE)) + 
corrupt_bulk_data(desc); + } + + return rc; +} + +static struct ptlrpc_ctx_ops plain_ctx_ops = { + .refresh = plain_ctx_refresh, + .validate = plain_ctx_validate, + .sign = plain_ctx_sign, + .verify = plain_ctx_verify, + .wrap_bulk = plain_cli_wrap_bulk, + .unwrap_bulk = plain_cli_unwrap_bulk, +}; + +static struct ptlrpc_sec_cops plain_sec_cops = { + .create_sec = plain_create_sec, + .destroy_sec = plain_destroy_sec, + .kill_sec = plain_kill_sec, + .lookup_ctx = plain_lookup_ctx, + .release_ctx = plain_release_ctx, + .flush_ctx_cache = plain_flush_ctx_cache, + .alloc_reqbuf = plain_alloc_reqbuf, + .free_reqbuf = plain_free_reqbuf, + .alloc_repbuf = plain_alloc_repbuf, + .free_repbuf = plain_free_repbuf, + .enlarge_reqbuf = plain_enlarge_reqbuf, +}; + +static struct ptlrpc_sec_sops plain_sec_sops = { + .accept = plain_accept, + .alloc_rs = plain_alloc_rs, + .authorize = plain_authorize, + .free_rs = plain_free_rs, + .unwrap_bulk = plain_svc_unwrap_bulk, + .wrap_bulk = plain_svc_wrap_bulk, +}; + +static struct ptlrpc_sec_policy plain_policy = { + .sp_owner = THIS_MODULE, + .sp_name = "plain", + .sp_policy = SPTLRPC_POLICY_PLAIN, + .sp_cops = &plain_sec_cops, + .sp_sops = &plain_sec_sops, +}; + +int sptlrpc_plain_init(void) +{ + __u32 buflens[PLAIN_PACK_SEGMENTS] = { 0, }; + int rc; + + buflens[PLAIN_PACK_MSG_OFF] = lustre_msg_early_size(); + plain_at_offset = lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens); + + rc = sptlrpc_register_policy(&plain_policy); + if (rc) + CERROR("failed to register: %d\n", rc); + + return rc; +} + +void sptlrpc_plain_fini(void) +{ + int rc; + + rc = sptlrpc_unregister_policy(&plain_policy); + if (rc) + CERROR("cannot unregister: %d\n", rc); +} diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/service.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/service.c new file mode 100644 index 000000000..8e6142151 --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/service.c @@ -0,0 +1,3105 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#define DEBUG_SUBSYSTEM S_RPC +#include "../include/obd_support.h" +#include "../include/obd_class.h" +#include "../include/lustre_net.h" +#include "../include/lu_object.h" +#include "../../include/linux/lnet/types.h" +#include "ptlrpc_internal.h" + +/* The following are visible and mutable through /sys/module/ptlrpc */ +int test_req_buffer_pressure = 0; +module_param(test_req_buffer_pressure, int, 0444); +MODULE_PARM_DESC(test_req_buffer_pressure, "set non-zero to put pressure on request buffer pools"); +module_param(at_min, int, 0644); +MODULE_PARM_DESC(at_min, "Adaptive timeout minimum (sec)"); +module_param(at_max, int, 0644); +MODULE_PARM_DESC(at_max, "Adaptive timeout maximum (sec)"); +module_param(at_history, int, 0644); +MODULE_PARM_DESC(at_history, + "Adaptive timeouts remember the slowest event that took place within this period (sec)"); +module_param(at_early_margin, int, 0644); +MODULE_PARM_DESC(at_early_margin, "How soon before an RPC deadline to send an early reply"); +module_param(at_extra, int, 0644); +MODULE_PARM_DESC(at_extra, "How much extra time to give with each early reply"); + + +/* forward ref */ +static int ptlrpc_server_post_idle_rqbds(struct ptlrpc_service_part *svcpt); +static void ptlrpc_server_hpreq_fini(struct ptlrpc_request *req); +static void ptlrpc_at_remove_timed(struct ptlrpc_request *req); + +/** Holds a list of all PTLRPC services */ +LIST_HEAD(ptlrpc_all_services); +/** Used to protect the \e ptlrpc_all_services list */ +struct mutex ptlrpc_all_services_mutex; + +struct ptlrpc_request_buffer_desc * +ptlrpc_alloc_rqbd(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_service *svc = svcpt->scp_service; + struct ptlrpc_request_buffer_desc *rqbd; + + OBD_CPT_ALLOC_PTR(rqbd, svc->srv_cptable, svcpt->scp_cpt); + if (rqbd == NULL) + return NULL; + + rqbd->rqbd_svcpt = svcpt; + rqbd->rqbd_refcount = 0; + rqbd->rqbd_cbid.cbid_fn = request_in_callback; + rqbd->rqbd_cbid.cbid_arg = rqbd; + INIT_LIST_HEAD(&rqbd->rqbd_reqs); + OBD_CPT_ALLOC_LARGE(rqbd->rqbd_buffer, svc->srv_cptable, + svcpt->scp_cpt, svc->srv_buf_size); + if (rqbd->rqbd_buffer == NULL) { + OBD_FREE_PTR(rqbd); + return NULL; + } + + spin_lock(&svcpt->scp_lock); + list_add(&rqbd->rqbd_list, &svcpt->scp_rqbd_idle); + svcpt->scp_nrqbds_total++; + spin_unlock(&svcpt->scp_lock); + + return rqbd; +} + +void +ptlrpc_free_rqbd(struct ptlrpc_request_buffer_desc *rqbd) +{ + struct ptlrpc_service_part *svcpt = rqbd->rqbd_svcpt; + + LASSERT(rqbd->rqbd_refcount == 0); + LASSERT(list_empty(&rqbd->rqbd_reqs)); + + spin_lock(&svcpt->scp_lock); + list_del(&rqbd->rqbd_list); + svcpt->scp_nrqbds_total--; + spin_unlock(&svcpt->scp_lock); + + OBD_FREE_LARGE(rqbd->rqbd_buffer, svcpt->scp_service->srv_buf_size); + OBD_FREE_PTR(rqbd); +} + +int +ptlrpc_grow_req_bufs(struct ptlrpc_service_part *svcpt, int post) +{ + struct ptlrpc_service *svc = svcpt->scp_service; + struct ptlrpc_request_buffer_desc *rqbd; + int rc = 0; + int i; + + if (svcpt->scp_rqbd_allocating) + goto try_post; + + spin_lock(&svcpt->scp_lock); + /* check again with lock */ + if (svcpt->scp_rqbd_allocating) { + /* NB: we might allow more than one thread in the future */ + LASSERT(svcpt->scp_rqbd_allocating == 1); + spin_unlock(&svcpt->scp_lock); + goto try_post; + } + + svcpt->scp_rqbd_allocating++; + spin_unlock(&svcpt->scp_lock); + + + for (i = 0; i < svc->srv_nbuf_per_group; i++) { + /* NB: another thread might have recycled enough rqbds, we + * need to make sure it wouldn't over-allocate, see LU-1212. 
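+		 * Re-checking scp_nrqbds_posted on every pass stops this loop
+		 * from allocating once enough buffers are already posted.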
*/ + if (svcpt->scp_nrqbds_posted >= svc->srv_nbuf_per_group) + break; + + rqbd = ptlrpc_alloc_rqbd(svcpt); + + if (rqbd == NULL) { + CERROR("%s: Can't allocate request buffer\n", + svc->srv_name); + rc = -ENOMEM; + break; + } + } + + spin_lock(&svcpt->scp_lock); + + LASSERT(svcpt->scp_rqbd_allocating == 1); + svcpt->scp_rqbd_allocating--; + + spin_unlock(&svcpt->scp_lock); + + CDEBUG(D_RPCTRACE, + "%s: allocate %d new %d-byte reqbufs (%d/%d left), rc = %d\n", + svc->srv_name, i, svc->srv_buf_size, svcpt->scp_nrqbds_posted, + svcpt->scp_nrqbds_total, rc); + + try_post: + if (post && rc == 0) + rc = ptlrpc_server_post_idle_rqbds(svcpt); + + return rc; +} + +/** + * Part of Rep-Ack logic. + * Puts a lock and its mode into reply state associated to request reply. + */ +void +ptlrpc_save_lock(struct ptlrpc_request *req, + struct lustre_handle *lock, int mode, int no_ack) +{ + struct ptlrpc_reply_state *rs = req->rq_reply_state; + int idx; + + LASSERT(rs != NULL); + LASSERT(rs->rs_nlocks < RS_MAX_LOCKS); + + if (req->rq_export->exp_disconnected) { + ldlm_lock_decref(lock, mode); + } else { + idx = rs->rs_nlocks++; + rs->rs_locks[idx] = *lock; + rs->rs_modes[idx] = mode; + rs->rs_difficult = 1; + rs->rs_no_ack = !!no_ack; + } +} +EXPORT_SYMBOL(ptlrpc_save_lock); + + +struct ptlrpc_hr_partition; + +struct ptlrpc_hr_thread { + int hrt_id; /* thread ID */ + spinlock_t hrt_lock; + wait_queue_head_t hrt_waitq; + struct list_head hrt_queue; /* RS queue */ + struct ptlrpc_hr_partition *hrt_partition; +}; + +struct ptlrpc_hr_partition { + /* # of started threads */ + atomic_t hrp_nstarted; + /* # of stopped threads */ + atomic_t hrp_nstopped; + /* cpu partition id */ + int hrp_cpt; + /* round-robin rotor for choosing thread */ + int hrp_rotor; + /* total number of threads on this partition */ + int hrp_nthrs; + /* threads table */ + struct ptlrpc_hr_thread *hrp_thrs; +}; + +#define HRT_RUNNING 0 +#define HRT_STOPPING 1 + +struct ptlrpc_hr_service { + /* CPU partition table, it's just cfs_cpt_table for now */ + struct cfs_cpt_table *hr_cpt_table; + /** controller sleep waitq */ + wait_queue_head_t hr_waitq; + unsigned int hr_stopping; + /** roundrobin rotor for non-affinity service */ + unsigned int hr_rotor; + /* partition data */ + struct ptlrpc_hr_partition **hr_partitions; +}; + +struct rs_batch { + struct list_head rsb_replies; + unsigned int rsb_n_replies; + struct ptlrpc_service_part *rsb_svcpt; +}; + +/** reply handling service. */ +static struct ptlrpc_hr_service ptlrpc_hr; + +/** + * maximum number of replies scheduled in one batch + */ +#define MAX_SCHEDULED 256 + +/** + * Initialize a reply batch. + * + * \param b batch + */ +static void rs_batch_init(struct rs_batch *b) +{ + memset(b, 0, sizeof(*b)); + INIT_LIST_HEAD(&b->rsb_replies); +} + +/** + * Choose an hr thread to dispatch requests to. + */ +static struct ptlrpc_hr_thread * +ptlrpc_hr_select(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_hr_partition *hrp; + unsigned int rotor; + + if (svcpt->scp_cpt >= 0 && + svcpt->scp_service->srv_cptable == ptlrpc_hr.hr_cpt_table) { + /* directly match partition */ + hrp = ptlrpc_hr.hr_partitions[svcpt->scp_cpt]; + + } else { + rotor = ptlrpc_hr.hr_rotor++; + rotor %= cfs_cpt_number(ptlrpc_hr.hr_cpt_table); + + hrp = ptlrpc_hr.hr_partitions[rotor]; + } + + rotor = hrp->hrp_rotor++; + return &hrp->hrp_thrs[rotor % hrp->hrp_nthrs]; +} + +/** + * Dispatch all replies accumulated in the batch to one from + * dedicated reply handling threads. 
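+ * The target thread is picked by ptlrpc_hr_select(), which prefers the
+ * HR partition matching the service's CPT and otherwise round-robins.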
+ * + * \param b batch + */ +static void rs_batch_dispatch(struct rs_batch *b) +{ + if (b->rsb_n_replies != 0) { + struct ptlrpc_hr_thread *hrt; + + hrt = ptlrpc_hr_select(b->rsb_svcpt); + + spin_lock(&hrt->hrt_lock); + list_splice_init(&b->rsb_replies, &hrt->hrt_queue); + spin_unlock(&hrt->hrt_lock); + + wake_up(&hrt->hrt_waitq); + b->rsb_n_replies = 0; + } +} + +/** + * Add a reply to a batch. + * Add one reply object to a batch, schedule batched replies if overload. + * + * \param b batch + * \param rs reply + */ +static void rs_batch_add(struct rs_batch *b, struct ptlrpc_reply_state *rs) +{ + struct ptlrpc_service_part *svcpt = rs->rs_svcpt; + + if (svcpt != b->rsb_svcpt || b->rsb_n_replies >= MAX_SCHEDULED) { + if (b->rsb_svcpt != NULL) { + rs_batch_dispatch(b); + spin_unlock(&b->rsb_svcpt->scp_rep_lock); + } + spin_lock(&svcpt->scp_rep_lock); + b->rsb_svcpt = svcpt; + } + spin_lock(&rs->rs_lock); + rs->rs_scheduled_ever = 1; + if (rs->rs_scheduled == 0) { + list_move(&rs->rs_list, &b->rsb_replies); + rs->rs_scheduled = 1; + b->rsb_n_replies++; + } + rs->rs_committed = 1; + spin_unlock(&rs->rs_lock); +} + +/** + * Reply batch finalization. + * Dispatch remaining replies from the batch + * and release remaining spinlock. + * + * \param b batch + */ +static void rs_batch_fini(struct rs_batch *b) +{ + if (b->rsb_svcpt != NULL) { + rs_batch_dispatch(b); + spin_unlock(&b->rsb_svcpt->scp_rep_lock); + } +} + +#define DECLARE_RS_BATCH(b) struct rs_batch b + + +/** + * Put reply state into a queue for processing because we received + * ACK from the client + */ +void ptlrpc_dispatch_difficult_reply(struct ptlrpc_reply_state *rs) +{ + struct ptlrpc_hr_thread *hrt; + + LASSERT(list_empty(&rs->rs_list)); + + hrt = ptlrpc_hr_select(rs->rs_svcpt); + + spin_lock(&hrt->hrt_lock); + list_add_tail(&rs->rs_list, &hrt->hrt_queue); + spin_unlock(&hrt->hrt_lock); + + wake_up(&hrt->hrt_waitq); +} + +void +ptlrpc_schedule_difficult_reply(struct ptlrpc_reply_state *rs) +{ + assert_spin_locked(&rs->rs_svcpt->scp_rep_lock); + assert_spin_locked(&rs->rs_lock); + LASSERT(rs->rs_difficult); + rs->rs_scheduled_ever = 1; /* flag any notification attempt */ + + if (rs->rs_scheduled) { /* being set up or already notified */ + return; + } + + rs->rs_scheduled = 1; + list_del_init(&rs->rs_list); + ptlrpc_dispatch_difficult_reply(rs); +} +EXPORT_SYMBOL(ptlrpc_schedule_difficult_reply); + +void ptlrpc_commit_replies(struct obd_export *exp) +{ + struct ptlrpc_reply_state *rs, *nxt; + DECLARE_RS_BATCH(batch); + + rs_batch_init(&batch); + /* Find any replies that have been committed and get their service + * to attend to complete them. */ + + /* CAVEAT EMPTOR: spinlock ordering!!! 
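+	 * The nesting here is exp_uncommitted_replies_lock, then (inside
+	 * rs_batch_add) scp_rep_lock, then rs_lock.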
*/ + spin_lock(&exp->exp_uncommitted_replies_lock); + list_for_each_entry_safe(rs, nxt, &exp->exp_uncommitted_replies, + rs_obd_list) { + LASSERT(rs->rs_difficult); + /* VBR: per-export last_committed */ + LASSERT(rs->rs_export); + if (rs->rs_transno <= exp->exp_last_committed) { + list_del_init(&rs->rs_obd_list); + rs_batch_add(&batch, rs); + } + } + spin_unlock(&exp->exp_uncommitted_replies_lock); + rs_batch_fini(&batch); +} +EXPORT_SYMBOL(ptlrpc_commit_replies); + +static int +ptlrpc_server_post_idle_rqbds(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_request_buffer_desc *rqbd; + int rc; + int posted = 0; + + for (;;) { + spin_lock(&svcpt->scp_lock); + + if (list_empty(&svcpt->scp_rqbd_idle)) { + spin_unlock(&svcpt->scp_lock); + return posted; + } + + rqbd = list_entry(svcpt->scp_rqbd_idle.next, + struct ptlrpc_request_buffer_desc, + rqbd_list); + list_del(&rqbd->rqbd_list); + + /* assume we will post successfully */ + svcpt->scp_nrqbds_posted++; + list_add(&rqbd->rqbd_list, &svcpt->scp_rqbd_posted); + + spin_unlock(&svcpt->scp_lock); + + rc = ptlrpc_register_rqbd(rqbd); + if (rc != 0) + break; + + posted = 1; + } + + spin_lock(&svcpt->scp_lock); + + svcpt->scp_nrqbds_posted--; + list_del(&rqbd->rqbd_list); + list_add_tail(&rqbd->rqbd_list, &svcpt->scp_rqbd_idle); + + /* Don't complain if no request buffers are posted right now; LNET + * won't drop requests because we set the portal lazy! */ + + spin_unlock(&svcpt->scp_lock); + + return -1; +} + +static void ptlrpc_at_timer(unsigned long castmeharder) +{ + struct ptlrpc_service_part *svcpt; + + svcpt = (struct ptlrpc_service_part *)castmeharder; + + svcpt->scp_at_check = 1; + svcpt->scp_at_checktime = cfs_time_current(); + wake_up(&svcpt->scp_waitq); +} + +static void +ptlrpc_server_nthreads_check(struct ptlrpc_service *svc, + struct ptlrpc_service_conf *conf) +{ + struct ptlrpc_service_thr_conf *tc = &conf->psc_thr; + unsigned init; + unsigned total; + unsigned nthrs; + int weight; + + /* + * Common code for estimating & validating threads number. + * CPT affinity service could have percpt thread-pool instead + * of a global thread-pool, which means user might not always + * get the threads number they give it in conf::tc_nthrs_user + * even they did set. It's because we need to validate threads + * number for each CPT to guarantee each pool will have enough + * threads to keep the service healthy. + */ + init = PTLRPC_NTHRS_INIT + (svc->srv_ops.so_hpreq_handler != NULL); + init = max_t(int, init, tc->tc_nthrs_init); + + /* NB: please see comments in lustre_lnet.h for definition + * details of these members */ + LASSERT(tc->tc_nthrs_max != 0); + + if (tc->tc_nthrs_user != 0) { + /* In case there is a reason to test a service with many + * threads, we give a less strict check here, it can + * be up to 8 * nthrs_max */ + total = min(tc->tc_nthrs_max * 8, tc->tc_nthrs_user); + nthrs = total / svc->srv_ncpts; + init = max(init, nthrs); + goto out; + } + + total = tc->tc_nthrs_max; + if (tc->tc_nthrs_base == 0) { + /* don't care about base threads number per partition, + * this is most for non-affinity service */ + nthrs = total / svc->srv_ncpts; + goto out; + } + + nthrs = tc->tc_nthrs_base; + if (svc->srv_ncpts == 1) { + int i; + + /* NB: Increase the base number if it's single partition + * and total number of cores/HTs is larger or equal to 4. 
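+		 * (The loop below adds tc_nthrs_base >> i per iteration, a
+		 * halving series, which is what bounds the total.)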
+ * result will always < 2 * nthrs_base */ + weight = cfs_cpt_weight(svc->srv_cptable, CFS_CPT_ANY); + for (i = 1; (weight >> (i + 1)) != 0 && /* >= 4 cores/HTs */ + (tc->tc_nthrs_base >> i) != 0; i++) + nthrs += tc->tc_nthrs_base >> i; + } + + if (tc->tc_thr_factor != 0) { + int factor = tc->tc_thr_factor; + const int fade = 4; + + /* + * User wants to increase number of threads with for + * each CPU core/HT, most likely the factor is larger then + * one thread/core because service threads are supposed to + * be blocked by lock or wait for IO. + */ + /* + * Amdahl's law says that adding processors wouldn't give + * a linear increasing of parallelism, so it's nonsense to + * have too many threads no matter how many cores/HTs + * there are. + */ + /* weight is # of HTs */ + if (cpumask_weight(topology_thread_cpumask(0)) > 1) { + /* depress thread factor for hyper-thread */ + factor = factor - (factor >> 1) + (factor >> 3); + } + + weight = cfs_cpt_weight(svc->srv_cptable, 0); + LASSERT(weight > 0); + + for (; factor > 0 && weight > 0; factor--, weight -= fade) + nthrs += min(weight, fade) * factor; + } + + if (nthrs * svc->srv_ncpts > tc->tc_nthrs_max) { + nthrs = max(tc->tc_nthrs_base, + tc->tc_nthrs_max / svc->srv_ncpts); + } + out: + nthrs = max(nthrs, tc->tc_nthrs_init); + svc->srv_nthrs_cpt_limit = nthrs; + svc->srv_nthrs_cpt_init = init; + + if (nthrs * svc->srv_ncpts > tc->tc_nthrs_max) { + CDEBUG(D_OTHER, "%s: This service may have more threads (%d) than the given soft limit (%d)\n", + svc->srv_name, nthrs * svc->srv_ncpts, + tc->tc_nthrs_max); + } +} + +/** + * Initialize percpt data for a service + */ +static int +ptlrpc_service_part_init(struct ptlrpc_service *svc, + struct ptlrpc_service_part *svcpt, int cpt) +{ + struct ptlrpc_at_array *array; + int size; + int index; + int rc; + + svcpt->scp_cpt = cpt; + INIT_LIST_HEAD(&svcpt->scp_threads); + + /* rqbd and incoming request queue */ + spin_lock_init(&svcpt->scp_lock); + INIT_LIST_HEAD(&svcpt->scp_rqbd_idle); + INIT_LIST_HEAD(&svcpt->scp_rqbd_posted); + INIT_LIST_HEAD(&svcpt->scp_req_incoming); + init_waitqueue_head(&svcpt->scp_waitq); + /* history request & rqbd list */ + INIT_LIST_HEAD(&svcpt->scp_hist_reqs); + INIT_LIST_HEAD(&svcpt->scp_hist_rqbds); + + /* active requests and hp requests */ + spin_lock_init(&svcpt->scp_req_lock); + + /* reply states */ + spin_lock_init(&svcpt->scp_rep_lock); + INIT_LIST_HEAD(&svcpt->scp_rep_active); + INIT_LIST_HEAD(&svcpt->scp_rep_idle); + init_waitqueue_head(&svcpt->scp_rep_waitq); + atomic_set(&svcpt->scp_nreps_difficult, 0); + + /* adaptive timeout */ + spin_lock_init(&svcpt->scp_at_lock); + array = &svcpt->scp_at_array; + + size = at_est2timeout(at_max); + array->paa_size = size; + array->paa_count = 0; + array->paa_deadline = -1; + + /* allocate memory for scp_at_array (ptlrpc_at_array) */ + OBD_CPT_ALLOC(array->paa_reqs_array, + svc->srv_cptable, cpt, sizeof(struct list_head) * size); + if (array->paa_reqs_array == NULL) + return -ENOMEM; + + for (index = 0; index < size; index++) + INIT_LIST_HEAD(&array->paa_reqs_array[index]); + + OBD_CPT_ALLOC(array->paa_reqs_count, + svc->srv_cptable, cpt, sizeof(__u32) * size); + if (array->paa_reqs_count == NULL) + goto failed; + + cfs_timer_init(&svcpt->scp_at_timer, ptlrpc_at_timer, svcpt); + /* At SOW, service time should be quick; 10s seems generous. If client + * timeout is less than this, we'll be sending an early reply. 
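+	 * The at_init() call below seeds scp_at_estimate with this 10s value.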
*/ + at_init(&svcpt->scp_at_estimate, 10, 0); + + /* assign this before call ptlrpc_grow_req_bufs */ + svcpt->scp_service = svc; + /* Now allocate the request buffers, but don't post them now */ + rc = ptlrpc_grow_req_bufs(svcpt, 0); + /* We shouldn't be under memory pressure at startup, so + * fail if we can't allocate all our buffers at this time. */ + if (rc != 0) + goto failed; + + return 0; + + failed: + if (array->paa_reqs_count != NULL) { + OBD_FREE(array->paa_reqs_count, sizeof(__u32) * size); + array->paa_reqs_count = NULL; + } + + if (array->paa_reqs_array != NULL) { + OBD_FREE(array->paa_reqs_array, + sizeof(struct list_head) * array->paa_size); + array->paa_reqs_array = NULL; + } + + return -ENOMEM; +} + +/** + * Initialize service on a given portal. + * This includes starting serving threads , allocating and posting rqbds and + * so on. + */ +struct ptlrpc_service * +ptlrpc_register_service(struct ptlrpc_service_conf *conf, + struct proc_dir_entry *proc_entry) +{ + struct ptlrpc_service_cpt_conf *cconf = &conf->psc_cpt; + struct ptlrpc_service *service; + struct ptlrpc_service_part *svcpt; + struct cfs_cpt_table *cptable; + __u32 *cpts = NULL; + int ncpts; + int cpt; + int rc; + int i; + + LASSERT(conf->psc_buf.bc_nbufs > 0); + LASSERT(conf->psc_buf.bc_buf_size >= + conf->psc_buf.bc_req_max_size + SPTLRPC_MAX_PAYLOAD); + LASSERT(conf->psc_thr.tc_ctx_tags != 0); + + cptable = cconf->cc_cptable; + if (cptable == NULL) + cptable = cfs_cpt_table; + + if (!conf->psc_thr.tc_cpu_affinity) { + ncpts = 1; + } else { + ncpts = cfs_cpt_number(cptable); + if (cconf->cc_pattern != NULL) { + struct cfs_expr_list *el; + + rc = cfs_expr_list_parse(cconf->cc_pattern, + strlen(cconf->cc_pattern), + 0, ncpts - 1, &el); + if (rc != 0) { + CERROR("%s: invalid CPT pattern string: %s", + conf->psc_name, cconf->cc_pattern); + return ERR_PTR(-EINVAL); + } + + rc = cfs_expr_list_values(el, ncpts, &cpts); + cfs_expr_list_free(el); + if (rc <= 0) { + CERROR("%s: failed to parse CPT array %s: %d\n", + conf->psc_name, cconf->cc_pattern, rc); + if (cpts != NULL) + OBD_FREE(cpts, sizeof(*cpts) * ncpts); + return ERR_PTR(rc < 0 ? rc : -EINVAL); + } + ncpts = rc; + } + } + + OBD_ALLOC(service, offsetof(struct ptlrpc_service, srv_parts[ncpts])); + if (service == NULL) { + if (cpts != NULL) + OBD_FREE(cpts, sizeof(*cpts) * ncpts); + return ERR_PTR(-ENOMEM); + } + + service->srv_cptable = cptable; + service->srv_cpts = cpts; + service->srv_ncpts = ncpts; + + service->srv_cpt_bits = 0; /* it's zero already, easy to read... */ + while ((1 << service->srv_cpt_bits) < cfs_cpt_number(cptable)) + service->srv_cpt_bits++; + + /* public members */ + spin_lock_init(&service->srv_lock); + service->srv_name = conf->psc_name; + service->srv_watchdog_factor = conf->psc_watchdog_factor; + INIT_LIST_HEAD(&service->srv_list); /* for safety of cleanup */ + + /* buffer configuration */ + service->srv_nbuf_per_group = test_req_buffer_pressure ? 
+ 1 : conf->psc_buf.bc_nbufs; + service->srv_max_req_size = conf->psc_buf.bc_req_max_size + + SPTLRPC_MAX_PAYLOAD; + service->srv_buf_size = conf->psc_buf.bc_buf_size; + service->srv_rep_portal = conf->psc_buf.bc_rep_portal; + service->srv_req_portal = conf->psc_buf.bc_req_portal; + + /* Increase max reply size to next power of two */ + service->srv_max_reply_size = 1; + while (service->srv_max_reply_size < + conf->psc_buf.bc_rep_max_size + SPTLRPC_MAX_PAYLOAD) + service->srv_max_reply_size <<= 1; + + service->srv_thread_name = conf->psc_thr.tc_thr_name; + service->srv_ctx_tags = conf->psc_thr.tc_ctx_tags; + service->srv_hpreq_ratio = PTLRPC_SVC_HP_RATIO; + service->srv_ops = conf->psc_ops; + + for (i = 0; i < ncpts; i++) { + if (!conf->psc_thr.tc_cpu_affinity) + cpt = CFS_CPT_ANY; + else + cpt = cpts != NULL ? cpts[i] : i; + + OBD_CPT_ALLOC(svcpt, cptable, cpt, sizeof(*svcpt)); + if (svcpt == NULL) { + rc = -ENOMEM; + goto failed; + } + + service->srv_parts[i] = svcpt; + rc = ptlrpc_service_part_init(service, svcpt, cpt); + if (rc != 0) + goto failed; + } + + ptlrpc_server_nthreads_check(service, conf); + + rc = LNetSetLazyPortal(service->srv_req_portal); + LASSERT(rc == 0); + + mutex_lock(&ptlrpc_all_services_mutex); + list_add(&service->srv_list, &ptlrpc_all_services); + mutex_unlock(&ptlrpc_all_services_mutex); + + if (proc_entry != NULL) + ptlrpc_lprocfs_register_service(proc_entry, service); + + rc = ptlrpc_service_nrs_setup(service); + if (rc != 0) + goto failed; + + CDEBUG(D_NET, "%s: Started, listening on portal %d\n", + service->srv_name, service->srv_req_portal); + + rc = ptlrpc_start_threads(service); + if (rc != 0) { + CERROR("Failed to start threads for service %s: %d\n", + service->srv_name, rc); + goto failed; + } + + return service; +failed: + ptlrpc_unregister_service(service); + return ERR_PTR(rc); +} +EXPORT_SYMBOL(ptlrpc_register_service); + +/** + * to actually free the request, must be called without holding svc_lock. + * note it's caller's responsibility to unlink req->rq_list. + */ +static void ptlrpc_server_free_request(struct ptlrpc_request *req) +{ + LASSERT(atomic_read(&req->rq_refcount) == 0); + LASSERT(list_empty(&req->rq_timed_list)); + + /* DEBUG_REQ() assumes the reply state of a request with a valid + * ref will not be destroyed until that reference is dropped. */ + ptlrpc_req_drop_rs(req); + + sptlrpc_svc_ctx_decref(req); + + if (req != &req->rq_rqbd->rqbd_req) { + /* NB request buffers use an embedded + * req if the incoming req unlinked the + * MD; this isn't one of them! */ + ptlrpc_request_cache_free(req); + } +} + +/** + * drop a reference count of the request. if it reaches 0, we either + * put it into history list, or free it immediately. 
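+ * Requests whose reply state came from the pre-allocated emergency pool
+ * skip the history list, since we are already short of memory.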
+ */ +void ptlrpc_server_drop_request(struct ptlrpc_request *req) +{ + struct ptlrpc_request_buffer_desc *rqbd = req->rq_rqbd; + struct ptlrpc_service_part *svcpt = rqbd->rqbd_svcpt; + struct ptlrpc_service *svc = svcpt->scp_service; + int refcount; + struct list_head *tmp; + struct list_head *nxt; + + if (!atomic_dec_and_test(&req->rq_refcount)) + return; + + if (req->rq_at_linked) { + spin_lock(&svcpt->scp_at_lock); + /* recheck with lock, in case it's unlinked by + * ptlrpc_at_check_timed() */ + if (likely(req->rq_at_linked)) + ptlrpc_at_remove_timed(req); + spin_unlock(&svcpt->scp_at_lock); + } + + LASSERT(list_empty(&req->rq_timed_list)); + + /* finalize request */ + if (req->rq_export) { + class_export_put(req->rq_export); + req->rq_export = NULL; + } + + spin_lock(&svcpt->scp_lock); + + list_add(&req->rq_list, &rqbd->rqbd_reqs); + + refcount = --(rqbd->rqbd_refcount); + if (refcount == 0) { + /* request buffer is now idle: add to history */ + list_del(&rqbd->rqbd_list); + + list_add_tail(&rqbd->rqbd_list, &svcpt->scp_hist_rqbds); + svcpt->scp_hist_nrqbds++; + + /* cull some history? + * I expect only about 1 or 2 rqbds need to be recycled here */ + while (svcpt->scp_hist_nrqbds > svc->srv_hist_nrqbds_cpt_max) { + rqbd = list_entry(svcpt->scp_hist_rqbds.next, + struct ptlrpc_request_buffer_desc, + rqbd_list); + + list_del(&rqbd->rqbd_list); + svcpt->scp_hist_nrqbds--; + + /* remove rqbd's reqs from svc's req history while + * I've got the service lock */ + list_for_each(tmp, &rqbd->rqbd_reqs) { + req = list_entry(tmp, struct ptlrpc_request, + rq_list); + /* Track the highest culled req seq */ + if (req->rq_history_seq > + svcpt->scp_hist_seq_culled) { + svcpt->scp_hist_seq_culled = + req->rq_history_seq; + } + list_del(&req->rq_history_list); + } + + spin_unlock(&svcpt->scp_lock); + + list_for_each_safe(tmp, nxt, &rqbd->rqbd_reqs) { + req = list_entry(rqbd->rqbd_reqs.next, + struct ptlrpc_request, + rq_list); + list_del(&req->rq_list); + ptlrpc_server_free_request(req); + } + + spin_lock(&svcpt->scp_lock); + /* + * now all reqs including the embedded req has been + * disposed, schedule request buffer for re-use. 
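+			 * (it goes back onto scp_rqbd_idle and is re-posted
+			 * later by ptlrpc_server_post_idle_rqbds()).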
+ */ + LASSERT(atomic_read(&rqbd->rqbd_req.rq_refcount) == + 0); + list_add_tail(&rqbd->rqbd_list, + &svcpt->scp_rqbd_idle); + } + + spin_unlock(&svcpt->scp_lock); + } else if (req->rq_reply_state && req->rq_reply_state->rs_prealloc) { + /* If we are low on memory, we are not interested in history */ + list_del(&req->rq_list); + list_del_init(&req->rq_history_list); + + /* Track the highest culled req seq */ + if (req->rq_history_seq > svcpt->scp_hist_seq_culled) + svcpt->scp_hist_seq_culled = req->rq_history_seq; + + spin_unlock(&svcpt->scp_lock); + + ptlrpc_server_free_request(req); + } else { + spin_unlock(&svcpt->scp_lock); + } +} + +/** Change request export and move hp request from old export to new */ +void ptlrpc_request_change_export(struct ptlrpc_request *req, + struct obd_export *export) +{ + if (req->rq_export != NULL) { + if (!list_empty(&req->rq_exp_list)) { + /* remove rq_exp_list from last export */ + spin_lock_bh(&req->rq_export->exp_rpc_lock); + list_del_init(&req->rq_exp_list); + spin_unlock_bh(&req->rq_export->exp_rpc_lock); + + /* export has one reference already, so it`s safe to + * add req to export queue here and get another + * reference for request later */ + spin_lock_bh(&export->exp_rpc_lock); + list_add(&req->rq_exp_list, &export->exp_hp_rpcs); + spin_unlock_bh(&export->exp_rpc_lock); + } + class_export_rpc_dec(req->rq_export); + class_export_put(req->rq_export); + } + + /* request takes one export refcount */ + req->rq_export = class_export_get(export); + class_export_rpc_inc(export); + + return; +} + +/** + * to finish a request: stop sending more early replies, and release + * the request. + */ +static void ptlrpc_server_finish_request(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req) +{ + ptlrpc_server_hpreq_fini(req); + + ptlrpc_server_drop_request(req); +} + +/** + * to finish a active request: stop sending more early replies, and release + * the request. should be called after we finished handling the request. + */ +static void ptlrpc_server_finish_active_request( + struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req) +{ + spin_lock(&svcpt->scp_req_lock); + ptlrpc_nrs_req_stop_nolock(req); + svcpt->scp_nreqs_active--; + if (req->rq_hp) + svcpt->scp_nhreqs_active--; + spin_unlock(&svcpt->scp_req_lock); + + ptlrpc_nrs_req_finalize(req); + + if (req->rq_export != NULL) + class_export_rpc_dec(req->rq_export); + + ptlrpc_server_finish_request(svcpt, req); +} + +/** + * This function makes sure dead exports are evicted in a timely manner. + * This function is only called when some export receives a message (i.e., + * the network is up.) + */ +static void ptlrpc_update_export_timer(struct obd_export *exp, long extra_delay) +{ + struct obd_export *oldest_exp; + time_t oldest_time, new_time; + + LASSERT(exp); + + /* Compensate for slow machines, etc, by faking our request time + into the future. Although this can break the strict time-ordering + of the list, we can be really lazy here - we don't have to evict + at the exact right moment. Eventually, all silent exports + will make it to the top of the list. */ + + /* Do not pay attention on 1sec or smaller renewals. 
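+	 * i.e. the export's last-request time is only refreshed when more
+	 * than a second has passed, so the timed-export list is not
+	 * reshuffled on every single RPC.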
*/ + new_time = get_seconds() + extra_delay; + if (exp->exp_last_request_time + 1 /*second */ >= new_time) + return; + + exp->exp_last_request_time = new_time; + + /* exports may get disconnected from the chain even though the + export has references, so we must keep the spin lock while + manipulating the lists */ + spin_lock(&exp->exp_obd->obd_dev_lock); + + if (list_empty(&exp->exp_obd_chain_timed)) { + /* this one is not timed */ + spin_unlock(&exp->exp_obd->obd_dev_lock); + return; + } + + list_move_tail(&exp->exp_obd_chain_timed, + &exp->exp_obd->obd_exports_timed); + + oldest_exp = list_entry(exp->exp_obd->obd_exports_timed.next, + struct obd_export, exp_obd_chain_timed); + oldest_time = oldest_exp->exp_last_request_time; + spin_unlock(&exp->exp_obd->obd_dev_lock); + + if (exp->exp_obd->obd_recovering) { + /* be nice to everyone during recovery */ + return; + } + + /* Note - racing to start/reset the obd_eviction timer is safe */ + if (exp->exp_obd->obd_eviction_timer == 0) { + /* Check if the oldest entry is expired. */ + if (get_seconds() > (oldest_time + PING_EVICT_TIMEOUT + + extra_delay)) { + /* We need a second timer, in case the net was down and + * it just came back. Since the pinger may skip every + * other PING_INTERVAL (see note in ptlrpc_pinger_main), + * we better wait for 3. */ + exp->exp_obd->obd_eviction_timer = + get_seconds() + 3 * PING_INTERVAL; + CDEBUG(D_HA, "%s: Think about evicting %s from "CFS_TIME_T"\n", + exp->exp_obd->obd_name, + obd_export_nid2str(oldest_exp), oldest_time); + } + } else { + if (get_seconds() > + (exp->exp_obd->obd_eviction_timer + extra_delay)) { + /* The evictor won't evict anyone who we've heard from + * recently, so we don't have to check before we start + * it. */ + if (!ping_evictor_wake(exp)) + exp->exp_obd->obd_eviction_timer = 0; + } + } +} + +/** + * Sanity check request \a req. + * Return 0 if all is ok, error code otherwise. + */ +static int ptlrpc_check_req(struct ptlrpc_request *req) +{ + struct obd_device *obd = req->rq_export->exp_obd; + int rc = 0; + + if (unlikely(lustre_msg_get_conn_cnt(req->rq_reqmsg) < + req->rq_export->exp_conn_cnt)) { + DEBUG_REQ(D_RPCTRACE, req, + "DROPPING req from old connection %d < %d", + lustre_msg_get_conn_cnt(req->rq_reqmsg), + req->rq_export->exp_conn_cnt); + return -EEXIST; + } + if (unlikely(obd == NULL || obd->obd_fail)) { + /* + * Failing over, don't handle any more reqs, send + * error response instead. + */ + CDEBUG(D_RPCTRACE, "Dropping req %p for failed obd %s\n", + req, (obd != NULL) ? 
obd->obd_name : "unknown"); + rc = -ENODEV; + } else if (lustre_msg_get_flags(req->rq_reqmsg) & + (MSG_REPLAY | MSG_REQ_REPLAY_DONE) && + !obd->obd_recovering) { + DEBUG_REQ(D_ERROR, req, + "Invalid replay without recovery"); + class_fail_export(req->rq_export); + rc = -ENODEV; + } else if (lustre_msg_get_transno(req->rq_reqmsg) != 0 && + !obd->obd_recovering) { + DEBUG_REQ(D_ERROR, req, "Invalid req with transno %llu without recovery", + lustre_msg_get_transno(req->rq_reqmsg)); + class_fail_export(req->rq_export); + rc = -ENODEV; + } + + if (unlikely(rc < 0)) { + req->rq_status = rc; + ptlrpc_error(req); + } + return rc; +} + +static void ptlrpc_at_set_timer(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_at_array *array = &svcpt->scp_at_array; + __s32 next; + + if (array->paa_count == 0) { + cfs_timer_disarm(&svcpt->scp_at_timer); + return; + } + + /* Set timer for closest deadline */ + next = (__s32)(array->paa_deadline - get_seconds() - + at_early_margin); + if (next <= 0) { + ptlrpc_at_timer((unsigned long)svcpt); + } else { + cfs_timer_arm(&svcpt->scp_at_timer, cfs_time_shift(next)); + CDEBUG(D_INFO, "armed %s at %+ds\n", + svcpt->scp_service->srv_name, next); + } +} + +/* Add rpc to early reply check list */ +static int ptlrpc_at_add_timed(struct ptlrpc_request *req) +{ + struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt; + struct ptlrpc_at_array *array = &svcpt->scp_at_array; + struct ptlrpc_request *rq = NULL; + __u32 index; + + if (AT_OFF) + return 0; + + if (req->rq_no_reply) + return 0; + + if ((lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) == 0) + return -ENOSYS; + + spin_lock(&svcpt->scp_at_lock); + LASSERT(list_empty(&req->rq_timed_list)); + + index = (unsigned long)req->rq_deadline % array->paa_size; + if (array->paa_reqs_count[index] > 0) { + /* latest rpcs will have the latest deadlines in the list, + * so search backward. 
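+		 * The array is a ring of deadline buckets indexed by
+		 * rq_deadline % paa_size, so only the chosen bucket has to be
+		 * kept sorted here.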
*/ + list_for_each_entry_reverse(rq, + &array->paa_reqs_array[index], + rq_timed_list) { + if (req->rq_deadline >= rq->rq_deadline) { + list_add(&req->rq_timed_list, + &rq->rq_timed_list); + break; + } + } + } + + /* Add the request at the head of the list */ + if (list_empty(&req->rq_timed_list)) + list_add(&req->rq_timed_list, + &array->paa_reqs_array[index]); + + spin_lock(&req->rq_lock); + req->rq_at_linked = 1; + spin_unlock(&req->rq_lock); + req->rq_at_index = index; + array->paa_reqs_count[index]++; + array->paa_count++; + if (array->paa_count == 1 || array->paa_deadline > req->rq_deadline) { + array->paa_deadline = req->rq_deadline; + ptlrpc_at_set_timer(svcpt); + } + spin_unlock(&svcpt->scp_at_lock); + + return 0; +} + +static void +ptlrpc_at_remove_timed(struct ptlrpc_request *req) +{ + struct ptlrpc_at_array *array; + + array = &req->rq_rqbd->rqbd_svcpt->scp_at_array; + + /* NB: must call with hold svcpt::scp_at_lock */ + LASSERT(!list_empty(&req->rq_timed_list)); + list_del_init(&req->rq_timed_list); + + spin_lock(&req->rq_lock); + req->rq_at_linked = 0; + spin_unlock(&req->rq_lock); + + array->paa_reqs_count[req->rq_at_index]--; + array->paa_count--; +} + +static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req) +{ + struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt; + struct ptlrpc_request *reqcopy; + struct lustre_msg *reqmsg; + long olddl = req->rq_deadline - get_seconds(); + time_t newdl; + int rc; + + /* deadline is when the client expects us to reply, margin is the + difference between clients' and servers' expectations */ + DEBUG_REQ(D_ADAPTTO, req, + "%ssending early reply (deadline %+lds, margin %+lds) for %d+%d", + AT_OFF ? "AT off - not " : "", + olddl, olddl - at_get(&svcpt->scp_at_estimate), + at_get(&svcpt->scp_at_estimate), at_extra); + + if (AT_OFF) + return 0; + + if (olddl < 0) { + DEBUG_REQ(D_WARNING, req, "Already past deadline (%+lds), not sending early reply. Consider increasing at_early_margin (%d)?", + olddl, at_early_margin); + + /* Return an error so we're not re-added to the timed list. */ + return -ETIMEDOUT; + } + + if (!(lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)) { + DEBUG_REQ(D_INFO, req, "Wanted to ask client for more time, but no AT support"); + return -ENOSYS; + } + + if (req->rq_export && + lustre_msg_get_flags(req->rq_reqmsg) & + (MSG_REPLAY | MSG_REQ_REPLAY_DONE | MSG_LOCK_REPLAY_DONE)) { + /* During recovery, we don't want to send too many early + * replies, but on the other hand we want to make sure the + * client has enough time to resend if the rpc is lost. So + * during the recovery period send at least 4 early replies, + * spacing them every at_extra if we can. at_estimate should + * always equal this fixed value during recovery. 
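+		 * Hence the call below feeds min(at_extra,
+		 * obd_recovery_timeout / 4) into at_measured() instead of the
+		 * real processing time.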
*/ + at_measured(&svcpt->scp_at_estimate, min(at_extra, + req->rq_export->exp_obd->obd_recovery_timeout / 4)); + } else { + /* Fake our processing time into the future to ask the clients + * for some extra amount of time */ + at_measured(&svcpt->scp_at_estimate, at_extra + + get_seconds() - + req->rq_arrival_time.tv_sec); + + /* Check to see if we've actually increased the deadline - + * we may be past adaptive_max */ + if (req->rq_deadline >= req->rq_arrival_time.tv_sec + + at_get(&svcpt->scp_at_estimate)) { + DEBUG_REQ(D_WARNING, req, "Couldn't add any time (%ld/%ld), not sending early reply\n", + olddl, req->rq_arrival_time.tv_sec + + at_get(&svcpt->scp_at_estimate) - + get_seconds()); + return -ETIMEDOUT; + } + } + newdl = get_seconds() + at_get(&svcpt->scp_at_estimate); + + reqcopy = ptlrpc_request_cache_alloc(GFP_NOFS); + if (reqcopy == NULL) + return -ENOMEM; + OBD_ALLOC_LARGE(reqmsg, req->rq_reqlen); + if (!reqmsg) { + rc = -ENOMEM; + goto out_free; + } + + *reqcopy = *req; + reqcopy->rq_reply_state = NULL; + reqcopy->rq_rep_swab_mask = 0; + reqcopy->rq_pack_bulk = 0; + reqcopy->rq_pack_udesc = 0; + reqcopy->rq_packed_final = 0; + sptlrpc_svc_ctx_addref(reqcopy); + /* We only need the reqmsg for the magic */ + reqcopy->rq_reqmsg = reqmsg; + memcpy(reqmsg, req->rq_reqmsg, req->rq_reqlen); + + LASSERT(atomic_read(&req->rq_refcount)); + /** if it is last refcount then early reply isn't needed */ + if (atomic_read(&req->rq_refcount) == 1) { + DEBUG_REQ(D_ADAPTTO, reqcopy, "Normal reply already sent out, abort sending early reply\n"); + rc = -EINVAL; + goto out; + } + + /* Connection ref */ + reqcopy->rq_export = class_conn2export( + lustre_msg_get_handle(reqcopy->rq_reqmsg)); + if (reqcopy->rq_export == NULL) { + rc = -ENODEV; + goto out; + } + + /* RPC ref */ + class_export_rpc_inc(reqcopy->rq_export); + if (reqcopy->rq_export->exp_obd && + reqcopy->rq_export->exp_obd->obd_fail) { + rc = -ENODEV; + goto out_put; + } + + rc = lustre_pack_reply_flags(reqcopy, 1, NULL, NULL, LPRFL_EARLY_REPLY); + if (rc) + goto out_put; + + rc = ptlrpc_send_reply(reqcopy, PTLRPC_REPLY_EARLY); + + if (!rc) { + /* Adjust our own deadline to what we told the client */ + req->rq_deadline = newdl; + req->rq_early_count++; /* number sent, server side */ + } else { + DEBUG_REQ(D_ERROR, req, "Early reply send failed %d", rc); + } + + /* Free the (early) reply state from lustre_pack_reply. + (ptlrpc_send_reply takes it's own rs ref, so this is safe here) */ + ptlrpc_req_drop_rs(reqcopy); + +out_put: + class_export_rpc_dec(reqcopy->rq_export); + class_export_put(reqcopy->rq_export); +out: + sptlrpc_svc_ctx_decref(reqcopy); + OBD_FREE_LARGE(reqmsg, req->rq_reqlen); +out_free: + ptlrpc_request_cache_free(reqcopy); + return rc; +} + +/* Send early replies to everybody expiring within at_early_margin + asking for at_extra time */ +static int ptlrpc_at_check_timed(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_at_array *array = &svcpt->scp_at_array; + struct ptlrpc_request *rq, *n; + struct list_head work_list; + __u32 index, count; + time_t deadline; + time_t now = get_seconds(); + long delay; + int first, counter = 0; + + spin_lock(&svcpt->scp_at_lock); + if (svcpt->scp_at_check == 0) { + spin_unlock(&svcpt->scp_at_lock); + return 0; + } + delay = cfs_time_sub(cfs_time_current(), svcpt->scp_at_checktime); + svcpt->scp_at_check = 0; + + if (array->paa_count == 0) { + spin_unlock(&svcpt->scp_at_lock); + return 0; + } + + /* The timer went off, but maybe the nearest rpc already completed. 
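+	 * Either way ptlrpc_at_set_timer() is called again below to match
+	 * whatever deadlines remain in the array.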
*/ + first = array->paa_deadline - now; + if (first > at_early_margin) { + /* We've still got plenty of time. Reset the timer. */ + ptlrpc_at_set_timer(svcpt); + spin_unlock(&svcpt->scp_at_lock); + return 0; + } + + /* We're close to a timeout, and we don't know how much longer the + server will take. Send early replies to everyone expiring soon. */ + INIT_LIST_HEAD(&work_list); + deadline = -1; + index = (unsigned long)array->paa_deadline % array->paa_size; + count = array->paa_count; + while (count > 0) { + count -= array->paa_reqs_count[index]; + list_for_each_entry_safe(rq, n, + &array->paa_reqs_array[index], + rq_timed_list) { + if (rq->rq_deadline > now + at_early_margin) { + /* update the earliest deadline */ + if (deadline == -1 || + rq->rq_deadline < deadline) + deadline = rq->rq_deadline; + break; + } + + ptlrpc_at_remove_timed(rq); + /** + * ptlrpc_server_drop_request() may drop + * refcount to 0 already. Let's check this and + * don't add entry to work_list + */ + if (likely(atomic_inc_not_zero(&rq->rq_refcount))) + list_add(&rq->rq_timed_list, &work_list); + counter++; + } + + if (++index >= array->paa_size) + index = 0; + } + array->paa_deadline = deadline; + /* we have a new earliest deadline, restart the timer */ + ptlrpc_at_set_timer(svcpt); + + spin_unlock(&svcpt->scp_at_lock); + + CDEBUG(D_ADAPTTO, "timeout in %+ds, asking for %d secs on %d early replies\n", + first, at_extra, counter); + if (first < 0) { + /* We're already past request deadlines before we even get a + chance to send early replies */ + LCONSOLE_WARN("%s: This server is not able to keep up with request traffic (cpu-bound).\n", + svcpt->scp_service->srv_name); + CWARN("earlyQ=%d reqQ=%d recA=%d, svcEst=%d, delay=" CFS_DURATION_T "(jiff)\n", + counter, svcpt->scp_nreqs_incoming, + svcpt->scp_nreqs_active, + at_get(&svcpt->scp_at_estimate), delay); + } + + /* we took additional refcount so entries can't be deleted from list, no + * locking is needed */ + while (!list_empty(&work_list)) { + rq = list_entry(work_list.next, struct ptlrpc_request, + rq_timed_list); + list_del_init(&rq->rq_timed_list); + + if (ptlrpc_at_send_early_reply(rq) == 0) + ptlrpc_at_add_timed(rq); + + ptlrpc_server_drop_request(rq); + } + + return 1; /* return "did_something" for liblustre */ +} + +/** + * Put the request to the export list if the request may become + * a high priority one. + */ +static int ptlrpc_server_hpreq_init(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req) +{ + int rc = 0; + + if (svcpt->scp_service->srv_ops.so_hpreq_handler) { + rc = svcpt->scp_service->srv_ops.so_hpreq_handler(req); + if (rc < 0) + return rc; + LASSERT(rc == 0); + } + if (req->rq_export && req->rq_ops) { + /* Perform request specific check. We should do this check + * before the request is added into exp_hp_rpcs list otherwise + * it may hit swab race at LU-1044. */ + if (req->rq_ops->hpreq_check) { + rc = req->rq_ops->hpreq_check(req); + /** + * XXX: Out of all current + * ptlrpc_hpreq_ops::hpreq_check(), only + * ldlm_cancel_hpreq_check() can return an error code; + * other functions assert in similar places, which seems + * odd. What also does not seem right is that handlers + * for those RPCs do not assert on the same checks, but + * rather handle the error cases. e.g. see + * ost_rw_hpreq_check(), and ost_brw_read(), + * ost_brw_write(). 
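+			 * As used below, rc < 0 is treated as a hard failure,
+			 * 0 as a normal request and 1 as a high-priority one.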
+ */ + if (rc < 0) + return rc; + LASSERT(rc == 0 || rc == 1); + } + + spin_lock_bh(&req->rq_export->exp_rpc_lock); + list_add(&req->rq_exp_list, + &req->rq_export->exp_hp_rpcs); + spin_unlock_bh(&req->rq_export->exp_rpc_lock); + } + + ptlrpc_nrs_req_initialize(svcpt, req, rc); + + return rc; +} + +/** Remove the request from the export list. */ +static void ptlrpc_server_hpreq_fini(struct ptlrpc_request *req) +{ + if (req->rq_export && req->rq_ops) { + /* refresh lock timeout again so that client has more + * room to send lock cancel RPC. */ + if (req->rq_ops->hpreq_fini) + req->rq_ops->hpreq_fini(req); + + spin_lock_bh(&req->rq_export->exp_rpc_lock); + list_del_init(&req->rq_exp_list); + spin_unlock_bh(&req->rq_export->exp_rpc_lock); + } +} + +static int ptlrpc_hpreq_check(struct ptlrpc_request *req) +{ + return 1; +} + +static struct ptlrpc_hpreq_ops ptlrpc_hpreq_common = { + .hpreq_check = ptlrpc_hpreq_check, +}; + +/* Hi-Priority RPC check by RPC operation code. */ +int ptlrpc_hpreq_handler(struct ptlrpc_request *req) +{ + int opc = lustre_msg_get_opc(req->rq_reqmsg); + + /* Check for export to let only reconnects for not yet evicted + * export to become a HP rpc. */ + if ((req->rq_export != NULL) && + (opc == OBD_PING || opc == MDS_CONNECT || opc == OST_CONNECT)) + req->rq_ops = &ptlrpc_hpreq_common; + + return 0; +} +EXPORT_SYMBOL(ptlrpc_hpreq_handler); + +static int ptlrpc_server_request_add(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req) +{ + int rc; + + rc = ptlrpc_server_hpreq_init(svcpt, req); + if (rc < 0) + return rc; + + ptlrpc_nrs_req_add(svcpt, req, !!rc); + + return 0; +} + +/** + * Allow to handle high priority request + * User can call it w/o any lock but need to hold + * ptlrpc_service_part::scp_req_lock to get reliable result + */ +static bool ptlrpc_server_allow_high(struct ptlrpc_service_part *svcpt, + bool force) +{ + int running = svcpt->scp_nthrs_running; + + if (!nrs_svcpt_has_hp(svcpt)) + return false; + + if (force) + return true; + + if (unlikely(svcpt->scp_service->srv_req_portal == MDS_REQUEST_PORTAL && + CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CANCEL_RESEND))) { + /* leave just 1 thread for normal RPCs */ + running = PTLRPC_NTHRS_INIT; + if (svcpt->scp_service->srv_ops.so_hpreq_handler != NULL) + running += 1; + } + + if (svcpt->scp_nreqs_active >= running - 1) + return false; + + if (svcpt->scp_nhreqs_active == 0) + return true; + + return !ptlrpc_nrs_req_pending_nolock(svcpt, false) || + svcpt->scp_hreq_count < svcpt->scp_service->srv_hpreq_ratio; +} + +static bool ptlrpc_server_high_pending(struct ptlrpc_service_part *svcpt, + bool force) +{ + return ptlrpc_server_allow_high(svcpt, force) && + ptlrpc_nrs_req_pending_nolock(svcpt, true); +} + +/** + * Only allow normal priority requests on a service that has a high-priority + * queue if forced (i.e. cleanup), if there are other high priority requests + * already being processed (i.e. those threads can service more high-priority + * requests), or if there are enough idle threads that a later thread can do + * a high priority request. 
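+ * Concretely: always allow when forced or while fewer than running - 2
+ * requests are active; refuse once nreqs_active reaches running - 1;
+ * otherwise allow only if an HP request is already being handled or the
+ * service has no HP queue at all.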
+ * User can call it w/o any lock but need to hold + * ptlrpc_service_part::scp_req_lock to get reliable result + */ +static bool ptlrpc_server_allow_normal(struct ptlrpc_service_part *svcpt, + bool force) +{ + int running = svcpt->scp_nthrs_running; + if (unlikely(svcpt->scp_service->srv_req_portal == MDS_REQUEST_PORTAL && + CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CANCEL_RESEND))) { + /* leave just 1 thread for normal RPCs */ + running = PTLRPC_NTHRS_INIT; + if (svcpt->scp_service->srv_ops.so_hpreq_handler != NULL) + running += 1; + } + + if (force || + svcpt->scp_nreqs_active < running - 2) + return true; + + if (svcpt->scp_nreqs_active >= running - 1) + return false; + + return svcpt->scp_nhreqs_active > 0 || !nrs_svcpt_has_hp(svcpt); +} + +static bool ptlrpc_server_normal_pending(struct ptlrpc_service_part *svcpt, + bool force) +{ + return ptlrpc_server_allow_normal(svcpt, force) && + ptlrpc_nrs_req_pending_nolock(svcpt, false); +} + +/** + * Returns true if there are requests available in incoming + * request queue for processing and it is allowed to fetch them. + * User can call it w/o any lock but need to hold ptlrpc_service::scp_req_lock + * to get reliable result + * \see ptlrpc_server_allow_normal + * \see ptlrpc_server_allow high + */ +static inline bool +ptlrpc_server_request_pending(struct ptlrpc_service_part *svcpt, bool force) +{ + return ptlrpc_server_high_pending(svcpt, force) || + ptlrpc_server_normal_pending(svcpt, force); +} + +/** + * Fetch a request for processing from queue of unprocessed requests. + * Favors high-priority requests. + * Returns a pointer to fetched request. + */ +static struct ptlrpc_request * +ptlrpc_server_request_get(struct ptlrpc_service_part *svcpt, bool force) +{ + struct ptlrpc_request *req = NULL; + + spin_lock(&svcpt->scp_req_lock); + + if (ptlrpc_server_high_pending(svcpt, force)) { + req = ptlrpc_nrs_req_get_nolock(svcpt, true, force); + if (req != NULL) { + svcpt->scp_hreq_count++; + goto got_request; + } + } + + if (ptlrpc_server_normal_pending(svcpt, force)) { + req = ptlrpc_nrs_req_get_nolock(svcpt, false, force); + if (req != NULL) { + svcpt->scp_hreq_count = 0; + goto got_request; + } + } + + spin_unlock(&svcpt->scp_req_lock); + return NULL; + +got_request: + svcpt->scp_nreqs_active++; + if (req->rq_hp) + svcpt->scp_nhreqs_active++; + + spin_unlock(&svcpt->scp_req_lock); + + if (likely(req->rq_export)) + class_export_rpc_inc(req->rq_export); + + return req; +} + +/** + * Handle freshly incoming reqs, add to timed early reply list, + * pass on to regular request queue. + * All incoming requests pass through here before getting into + * ptlrpc_server_handle_req later on. 
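+ * In order that means: sptlrpc unwrap, message and body unpacking with
+ * sanity checks, export lookup plus timer refresh, deadline computation
+ * for the AT list, and finally queueing through the NRS policies.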
+ */ +static int +ptlrpc_server_handle_req_in(struct ptlrpc_service_part *svcpt, + struct ptlrpc_thread *thread) +{ + struct ptlrpc_service *svc = svcpt->scp_service; + struct ptlrpc_request *req; + __u32 deadline; + int rc; + + spin_lock(&svcpt->scp_lock); + if (list_empty(&svcpt->scp_req_incoming)) { + spin_unlock(&svcpt->scp_lock); + return 0; + } + + req = list_entry(svcpt->scp_req_incoming.next, + struct ptlrpc_request, rq_list); + list_del_init(&req->rq_list); + svcpt->scp_nreqs_incoming--; + /* Consider this still a "queued" request as far as stats are + * concerned */ + spin_unlock(&svcpt->scp_lock); + + /* go through security check/transform */ + rc = sptlrpc_svc_unwrap_request(req); + switch (rc) { + case SECSVC_OK: + break; + case SECSVC_COMPLETE: + target_send_reply(req, 0, OBD_FAIL_MDS_ALL_REPLY_NET); + goto err_req; + case SECSVC_DROP: + goto err_req; + default: + LBUG(); + } + + /* + * for null-flavored rpc, msg has been unpacked by sptlrpc, although + * redo it wouldn't be harmful. + */ + if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL) { + rc = ptlrpc_unpack_req_msg(req, req->rq_reqlen); + if (rc != 0) { + CERROR("error unpacking request: ptl %d from %s x%llu\n", + svc->srv_req_portal, libcfs_id2str(req->rq_peer), + req->rq_xid); + goto err_req; + } + } + + rc = lustre_unpack_req_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF); + if (rc) { + CERROR("error unpacking ptlrpc body: ptl %d from %s x%llu\n", + svc->srv_req_portal, libcfs_id2str(req->rq_peer), + req->rq_xid); + goto err_req; + } + + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_REQ_OPC) && + lustre_msg_get_opc(req->rq_reqmsg) == cfs_fail_val) { + CERROR("drop incoming rpc opc %u, x%llu\n", + cfs_fail_val, req->rq_xid); + goto err_req; + } + + rc = -EINVAL; + if (lustre_msg_get_type(req->rq_reqmsg) != PTL_RPC_MSG_REQUEST) { + CERROR("wrong packet type received (type=%u) from %s\n", + lustre_msg_get_type(req->rq_reqmsg), + libcfs_id2str(req->rq_peer)); + goto err_req; + } + + switch (lustre_msg_get_opc(req->rq_reqmsg)) { + case MDS_WRITEPAGE: + case OST_WRITE: + req->rq_bulk_write = 1; + break; + case MDS_READPAGE: + case OST_READ: + case MGS_CONFIG_READ: + req->rq_bulk_read = 1; + break; + } + + CDEBUG(D_RPCTRACE, "got req x%llu\n", req->rq_xid); + + req->rq_export = class_conn2export( + lustre_msg_get_handle(req->rq_reqmsg)); + if (req->rq_export) { + rc = ptlrpc_check_req(req); + if (rc == 0) { + rc = sptlrpc_target_export_check(req->rq_export, req); + if (rc) + DEBUG_REQ(D_ERROR, req, "DROPPING req with illegal security flavor,"); + } + + if (rc) + goto err_req; + ptlrpc_update_export_timer(req->rq_export, 0); + } + + /* req_in handling should/must be fast */ + if (get_seconds() - req->rq_arrival_time.tv_sec > 5) + DEBUG_REQ(D_WARNING, req, "Slow req_in handling "CFS_DURATION_T"s", + cfs_time_sub(get_seconds(), + req->rq_arrival_time.tv_sec)); + + /* Set rpc server deadline and add it to the timed list */ + deadline = (lustre_msghdr_get_flags(req->rq_reqmsg) & + MSGHDR_AT_SUPPORT) ? 
+ /* The max time the client expects us to take */ + lustre_msg_get_timeout(req->rq_reqmsg) : obd_timeout; + req->rq_deadline = req->rq_arrival_time.tv_sec + deadline; + if (unlikely(deadline == 0)) { + DEBUG_REQ(D_ERROR, req, "Dropping request with 0 timeout"); + goto err_req; + } + + req->rq_svc_thread = thread; + + ptlrpc_at_add_timed(req); + + /* Move it over to the request processing queue */ + rc = ptlrpc_server_request_add(svcpt, req); + if (rc) + goto err_req; + + wake_up(&svcpt->scp_waitq); + return 1; + +err_req: + ptlrpc_server_finish_request(svcpt, req); + + return 1; +} + +/** + * Main incoming request handling logic. + * Calls handler function from service to do actual processing. + */ +static int +ptlrpc_server_handle_request(struct ptlrpc_service_part *svcpt, + struct ptlrpc_thread *thread) +{ + struct ptlrpc_service *svc = svcpt->scp_service; + struct ptlrpc_request *request; + struct timeval work_start; + struct timeval work_end; + long timediff; + int rc; + int fail_opc = 0; + + request = ptlrpc_server_request_get(svcpt, false); + if (request == NULL) + return 0; + + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT)) + fail_opc = OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT; + else if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT)) + fail_opc = OBD_FAIL_PTLRPC_HPREQ_TIMEOUT; + + if (unlikely(fail_opc)) { + if (request->rq_export && request->rq_ops) + OBD_FAIL_TIMEOUT(fail_opc, 4); + } + + ptlrpc_rqphase_move(request, RQ_PHASE_INTERPRET); + + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DUMP_LOG)) + libcfs_debug_dumplog(); + + do_gettimeofday(&work_start); + timediff = cfs_timeval_sub(&work_start, &request->rq_arrival_time, + NULL); + if (likely(svc->srv_stats != NULL)) { + lprocfs_counter_add(svc->srv_stats, PTLRPC_REQWAIT_CNTR, + timediff); + lprocfs_counter_add(svc->srv_stats, PTLRPC_REQQDEPTH_CNTR, + svcpt->scp_nreqs_incoming); + lprocfs_counter_add(svc->srv_stats, PTLRPC_REQACTIVE_CNTR, + svcpt->scp_nreqs_active); + lprocfs_counter_add(svc->srv_stats, PTLRPC_TIMEOUT, + at_get(&svcpt->scp_at_estimate)); + } + + rc = lu_context_init(&request->rq_session, LCT_SESSION | LCT_NOREF); + if (rc) { + CERROR("Failure to initialize session: %d\n", rc); + goto out_req; + } + request->rq_session.lc_thread = thread; + request->rq_session.lc_cookie = 0x5; + lu_context_enter(&request->rq_session); + + CDEBUG(D_NET, "got req %llu\n", request->rq_xid); + + request->rq_svc_thread = thread; + if (thread) + request->rq_svc_thread->t_env->le_ses = &request->rq_session; + + if (likely(request->rq_export)) { + if (unlikely(ptlrpc_check_req(request))) + goto put_conn; + ptlrpc_update_export_timer(request->rq_export, timediff >> 19); + } + + /* Discard requests queued for longer than the deadline. + The deadline is increased if we send an early reply. */ + if (get_seconds() > request->rq_deadline) { + DEBUG_REQ(D_ERROR, request, "Dropping timed-out request from %s: deadline " CFS_DURATION_T ":" CFS_DURATION_T "s ago\n", + libcfs_id2str(request->rq_peer), + cfs_time_sub(request->rq_deadline, + request->rq_arrival_time.tv_sec), + cfs_time_sub(get_seconds(), + request->rq_deadline)); + goto put_conn; + } + + CDEBUG(D_RPCTRACE, "Handling RPC pname:cluuid+ref:pid:xid:nid:opc %s:%s+%d:%d:x%llu:%s:%d\n", + current_comm(), + (request->rq_export ? + (char *)request->rq_export->exp_client_uuid.uuid : "0"), + (request->rq_export ? 
+ atomic_read(&request->rq_export->exp_refcount) : -99), + lustre_msg_get_status(request->rq_reqmsg), request->rq_xid, + libcfs_id2str(request->rq_peer), + lustre_msg_get_opc(request->rq_reqmsg)); + + if (lustre_msg_get_opc(request->rq_reqmsg) != OBD_PING) + CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_PAUSE_REQ, cfs_fail_val); + + rc = svc->srv_ops.so_req_handler(request); + + ptlrpc_rqphase_move(request, RQ_PHASE_COMPLETE); + +put_conn: + lu_context_exit(&request->rq_session); + lu_context_fini(&request->rq_session); + + if (unlikely(get_seconds() > request->rq_deadline)) { + DEBUG_REQ(D_WARNING, request, + "Request took longer than estimated (" + CFS_DURATION_T":"CFS_DURATION_T + "s); client may timeout.", + cfs_time_sub(request->rq_deadline, + request->rq_arrival_time.tv_sec), + cfs_time_sub(get_seconds(), + request->rq_deadline)); + } + + do_gettimeofday(&work_end); + timediff = cfs_timeval_sub(&work_end, &work_start, NULL); + CDEBUG(D_RPCTRACE, "Handled RPC pname:cluuid+ref:pid:xid:nid:opc %s:%s+%d:%d:x%llu:%s:%d Request processed in %ldus (%ldus total) trans %llu rc %d/%d\n", + current_comm(), + (request->rq_export ? + (char *)request->rq_export->exp_client_uuid.uuid : "0"), + (request->rq_export ? + atomic_read(&request->rq_export->exp_refcount) : -99), + lustre_msg_get_status(request->rq_reqmsg), + request->rq_xid, + libcfs_id2str(request->rq_peer), + lustre_msg_get_opc(request->rq_reqmsg), + timediff, + cfs_timeval_sub(&work_end, &request->rq_arrival_time, NULL), + (request->rq_repmsg ? + lustre_msg_get_transno(request->rq_repmsg) : + request->rq_transno), + request->rq_status, + (request->rq_repmsg ? + lustre_msg_get_status(request->rq_repmsg) : -999)); + if (likely(svc->srv_stats != NULL && request->rq_reqmsg != NULL)) { + __u32 op = lustre_msg_get_opc(request->rq_reqmsg); + int opc = opcode_offset(op); + if (opc > 0 && !(op == LDLM_ENQUEUE || op == MDS_REINT)) { + LASSERT(opc < LUSTRE_MAX_OPCODES); + lprocfs_counter_add(svc->srv_stats, + opc + EXTRA_MAX_OPCODES, + timediff); + } + } + if (unlikely(request->rq_early_count)) { + DEBUG_REQ(D_ADAPTTO, request, + "sent %d early replies before finishing in " + CFS_DURATION_T"s", + request->rq_early_count, + cfs_time_sub(work_end.tv_sec, + request->rq_arrival_time.tv_sec)); + } + +out_req: + ptlrpc_server_finish_active_request(svcpt, request); + + return 1; +} + +/** + * An internal function to process a single reply state object. + */ +static int +ptlrpc_handle_rs(struct ptlrpc_reply_state *rs) +{ + struct ptlrpc_service_part *svcpt = rs->rs_svcpt; + struct ptlrpc_service *svc = svcpt->scp_service; + struct obd_export *exp; + int nlocks; + int been_handled; + + exp = rs->rs_export; + + LASSERT(rs->rs_difficult); + LASSERT(rs->rs_scheduled); + LASSERT(list_empty(&rs->rs_list)); + + spin_lock(&exp->exp_lock); + /* Noop if removed already */ + list_del_init(&rs->rs_exp_list); + spin_unlock(&exp->exp_lock); + + /* The disk commit callback holds exp_uncommitted_replies_lock while it + * iterates over newly committed replies, removing them from + * exp_uncommitted_replies. It then drops this lock and schedules the + * replies it found for handling here. + * + * We can avoid contention for exp_uncommitted_replies_lock between the + * HRT threads and further commit callbacks by checking rs_committed + * which is set in the commit callback while it holds both + * rs_lock and exp_uncommitted_reples. 
+ * + * If we see rs_committed clear, the commit callback _may_ not have + * handled this reply yet and we race with it to grab + * exp_uncommitted_replies_lock before removing the reply from + * exp_uncommitted_replies. Note that if we lose the race and the + * reply has already been removed, list_del_init() is a noop. + * + * If we see rs_committed set, we know the commit callback is handling, + * or has handled this reply since store reordering might allow us to + * see rs_committed set out of sequence. But since this is done + * holding rs_lock, we can be sure it has all completed once we hold + * rs_lock, which we do right next. + */ + if (!rs->rs_committed) { + spin_lock(&exp->exp_uncommitted_replies_lock); + list_del_init(&rs->rs_obd_list); + spin_unlock(&exp->exp_uncommitted_replies_lock); + } + + spin_lock(&rs->rs_lock); + + been_handled = rs->rs_handled; + rs->rs_handled = 1; + + nlocks = rs->rs_nlocks; /* atomic "steal", but */ + rs->rs_nlocks = 0; /* locks still on rs_locks! */ + + if (nlocks == 0 && !been_handled) { + /* If we see this, we should already have seen the warning + * in mds_steal_ack_locks() */ + CDEBUG(D_HA, "All locks stolen from rs %p x%lld.t%lld o%d NID %s\n", + rs, + rs->rs_xid, rs->rs_transno, rs->rs_opc, + libcfs_nid2str(exp->exp_connection->c_peer.nid)); + } + + if ((!been_handled && rs->rs_on_net) || nlocks > 0) { + spin_unlock(&rs->rs_lock); + + if (!been_handled && rs->rs_on_net) { + LNetMDUnlink(rs->rs_md_h); + /* Ignore return code; we're racing with completion */ + } + + while (nlocks-- > 0) + ldlm_lock_decref(&rs->rs_locks[nlocks], + rs->rs_modes[nlocks]); + + spin_lock(&rs->rs_lock); + } + + rs->rs_scheduled = 0; + + if (!rs->rs_on_net) { + /* Off the net */ + spin_unlock(&rs->rs_lock); + + class_export_put(exp); + rs->rs_export = NULL; + ptlrpc_rs_decref(rs); + if (atomic_dec_and_test(&svcpt->scp_nreps_difficult) && + svc->srv_is_stopping) + wake_up_all(&svcpt->scp_waitq); + return 1; + } + + /* still on the net; callback will schedule */ + spin_unlock(&rs->rs_lock); + return 1; +} + + +static void +ptlrpc_check_rqbd_pool(struct ptlrpc_service_part *svcpt) +{ + int avail = svcpt->scp_nrqbds_posted; + int low_water = test_req_buffer_pressure ? 0 : + svcpt->scp_service->srv_nbuf_per_group / 2; + + /* NB I'm not locking; just looking. */ + + /* CAVEAT EMPTOR: We might be allocating buffers here because we've + * allowed the request history to grow out of control. We could put a + * sanity check on that here and cull some history if we need the + * space. 
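+ *
+ * Note that low_water is half of srv_nbuf_per_group, or 0 when
+ * test_req_buffer_pressure is set; in the latter case new buffers are
+ * only grown once none are left posted.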
*/ + + if (avail <= low_water) + ptlrpc_grow_req_bufs(svcpt, 1); + + if (svcpt->scp_service->srv_stats) { + lprocfs_counter_add(svcpt->scp_service->srv_stats, + PTLRPC_REQBUF_AVAIL_CNTR, avail); + } +} + +static int +ptlrpc_retry_rqbds(void *arg) +{ + struct ptlrpc_service_part *svcpt = (struct ptlrpc_service_part *)arg; + + svcpt->scp_rqbd_timeout = 0; + return -ETIMEDOUT; +} + +static inline int +ptlrpc_threads_enough(struct ptlrpc_service_part *svcpt) +{ + return svcpt->scp_nreqs_active < + svcpt->scp_nthrs_running - 1 - + (svcpt->scp_service->srv_ops.so_hpreq_handler != NULL); +} + +/** + * allowed to create more threads + * user can call it w/o any lock but need to hold + * ptlrpc_service_part::scp_lock to get reliable result + */ +static inline int +ptlrpc_threads_increasable(struct ptlrpc_service_part *svcpt) +{ + return svcpt->scp_nthrs_running + + svcpt->scp_nthrs_starting < + svcpt->scp_service->srv_nthrs_cpt_limit; +} + +/** + * too many requests and allowed to create more threads + */ +static inline int +ptlrpc_threads_need_create(struct ptlrpc_service_part *svcpt) +{ + return !ptlrpc_threads_enough(svcpt) && + ptlrpc_threads_increasable(svcpt); +} + +static inline int +ptlrpc_thread_stopping(struct ptlrpc_thread *thread) +{ + return thread_is_stopping(thread) || + thread->t_svcpt->scp_service->srv_is_stopping; +} + +static inline int +ptlrpc_rqbd_pending(struct ptlrpc_service_part *svcpt) +{ + return !list_empty(&svcpt->scp_rqbd_idle) && + svcpt->scp_rqbd_timeout == 0; +} + +static inline int +ptlrpc_at_check(struct ptlrpc_service_part *svcpt) +{ + return svcpt->scp_at_check; +} + +/** + * requests wait on preprocessing + * user can call it w/o any lock but need to hold + * ptlrpc_service_part::scp_lock to get reliable result + */ +static inline int +ptlrpc_server_request_incoming(struct ptlrpc_service_part *svcpt) +{ + return !list_empty(&svcpt->scp_req_incoming); +} + +static __attribute__((__noinline__)) int +ptlrpc_wait_event(struct ptlrpc_service_part *svcpt, + struct ptlrpc_thread *thread) +{ + /* Don't exit while there are replies to be handled */ + struct l_wait_info lwi = LWI_TIMEOUT(svcpt->scp_rqbd_timeout, + ptlrpc_retry_rqbds, svcpt); + + /* XXX: Add this back when libcfs watchdog is merged upstream + lc_watchdog_disable(thread->t_watchdog); + */ + + cond_resched(); + + l_wait_event_exclusive_head(svcpt->scp_waitq, + ptlrpc_thread_stopping(thread) || + ptlrpc_server_request_incoming(svcpt) || + ptlrpc_server_request_pending(svcpt, false) || + ptlrpc_rqbd_pending(svcpt) || + ptlrpc_at_check(svcpt), &lwi); + + if (ptlrpc_thread_stopping(thread)) + return -EINTR; + + /* + lc_watchdog_touch(thread->t_watchdog, + ptlrpc_server_get_timeout(svcpt)); + */ + return 0; +} + +/** + * Main thread body for service threads. + * Waits in a loop waiting for new requests to process to appear. + * Every time an incoming requests is added to its queue, a waitq + * is woken up and one of the threads will handle it. 
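+ *
+ * Each pass of the loop below waits for work, tops up the request buffer
+ * pool, forks another service thread if the partition is short-handed,
+ * drains up to 100 freshly arrived requests before handling a queued one,
+ * runs the adaptive-timeout (early reply) check, and reposts idle request
+ * buffers, backing off for a tenth of a second when reposting fails.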
+ */ +static int ptlrpc_main(void *arg) +{ + struct ptlrpc_thread *thread = (struct ptlrpc_thread *)arg; + struct ptlrpc_service_part *svcpt = thread->t_svcpt; + struct ptlrpc_service *svc = svcpt->scp_service; + struct ptlrpc_reply_state *rs; + struct group_info *ginfo = NULL; + struct lu_env *env; + int counter = 0, rc = 0; + + thread->t_pid = current_pid(); + unshare_fs_struct(); + + /* NB: we will call cfs_cpt_bind() for all threads, because we + * might want to run lustre server only on a subset of system CPUs, + * in that case ->scp_cpt is CFS_CPT_ANY */ + rc = cfs_cpt_bind(svc->srv_cptable, svcpt->scp_cpt); + if (rc != 0) { + CWARN("%s: failed to bind %s on CPT %d\n", + svc->srv_name, thread->t_name, svcpt->scp_cpt); + } + + ginfo = groups_alloc(0); + if (!ginfo) { + rc = -ENOMEM; + goto out; + } + + set_current_groups(ginfo); + put_group_info(ginfo); + + if (svc->srv_ops.so_thr_init != NULL) { + rc = svc->srv_ops.so_thr_init(thread); + if (rc) + goto out; + } + + OBD_ALLOC_PTR(env); + if (env == NULL) { + rc = -ENOMEM; + goto out_srv_fini; + } + + rc = lu_context_init(&env->le_ctx, + svc->srv_ctx_tags|LCT_REMEMBER|LCT_NOREF); + if (rc) + goto out_srv_fini; + + thread->t_env = env; + env->le_ctx.lc_thread = thread; + env->le_ctx.lc_cookie = 0x6; + + while (!list_empty(&svcpt->scp_rqbd_idle)) { + rc = ptlrpc_server_post_idle_rqbds(svcpt); + if (rc >= 0) + continue; + + CERROR("Failed to post rqbd for %s on CPT %d: %d\n", + svc->srv_name, svcpt->scp_cpt, rc); + goto out_srv_fini; + } + + /* Alloc reply state structure for this one */ + OBD_ALLOC_LARGE(rs, svc->srv_max_reply_size); + if (!rs) { + rc = -ENOMEM; + goto out_srv_fini; + } + + spin_lock(&svcpt->scp_lock); + + LASSERT(thread_is_starting(thread)); + thread_clear_flags(thread, SVC_STARTING); + + LASSERT(svcpt->scp_nthrs_starting == 1); + svcpt->scp_nthrs_starting--; + + /* SVC_STOPPING may already be set here if someone else is trying + * to stop the service while this new thread has been dynamically + * forked. We still set SVC_RUNNING to let our creator know that + * we are now running, however we will exit as soon as possible */ + thread_add_flags(thread, SVC_RUNNING); + svcpt->scp_nthrs_running++; + spin_unlock(&svcpt->scp_lock); + + /* wake up our creator in case he's still waiting. */ + wake_up(&thread->t_ctl_waitq); + + /* + thread->t_watchdog = lc_watchdog_add(ptlrpc_server_get_timeout(svcpt), + NULL, NULL); + */ + + spin_lock(&svcpt->scp_rep_lock); + list_add(&rs->rs_list, &svcpt->scp_rep_idle); + wake_up(&svcpt->scp_rep_waitq); + spin_unlock(&svcpt->scp_rep_lock); + + CDEBUG(D_NET, "service thread %d (#%d) started\n", thread->t_id, + svcpt->scp_nthrs_running); + + /* XXX maintain a list of all managed devices: insert here */ + while (!ptlrpc_thread_stopping(thread)) { + if (ptlrpc_wait_event(svcpt, thread)) + break; + + ptlrpc_check_rqbd_pool(svcpt); + + if (ptlrpc_threads_need_create(svcpt)) { + /* Ignore return code - we tried... 
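+ * ptlrpc_start_thread() may well return -EMFILE here once the
+ * per-partition thread limit is reached; the threads already running
+ * simply keep serving the queue.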
*/ + ptlrpc_start_thread(svcpt, 0); + } + + /* Process all incoming reqs before handling any */ + if (ptlrpc_server_request_incoming(svcpt)) { + lu_context_enter(&env->le_ctx); + env->le_ses = NULL; + ptlrpc_server_handle_req_in(svcpt, thread); + lu_context_exit(&env->le_ctx); + + /* but limit ourselves in case of flood */ + if (counter++ < 100) + continue; + counter = 0; + } + + if (ptlrpc_at_check(svcpt)) + ptlrpc_at_check_timed(svcpt); + + if (ptlrpc_server_request_pending(svcpt, false)) { + lu_context_enter(&env->le_ctx); + ptlrpc_server_handle_request(svcpt, thread); + lu_context_exit(&env->le_ctx); + } + + if (ptlrpc_rqbd_pending(svcpt) && + ptlrpc_server_post_idle_rqbds(svcpt) < 0) { + /* I just failed to repost request buffers. + * Wait for a timeout (unless something else + * happens) before I try again */ + svcpt->scp_rqbd_timeout = cfs_time_seconds(1) / 10; + CDEBUG(D_RPCTRACE, "Posted buffers: %d\n", + svcpt->scp_nrqbds_posted); + } + } + + /* + lc_watchdog_delete(thread->t_watchdog); + thread->t_watchdog = NULL; + */ + +out_srv_fini: + /* + * deconstruct service specific state created by ptlrpc_start_thread() + */ + if (svc->srv_ops.so_thr_done != NULL) + svc->srv_ops.so_thr_done(thread); + + if (env != NULL) { + lu_context_fini(&env->le_ctx); + OBD_FREE_PTR(env); + } +out: + CDEBUG(D_RPCTRACE, "service thread [ %p : %u ] %d exiting: rc %d\n", + thread, thread->t_pid, thread->t_id, rc); + + spin_lock(&svcpt->scp_lock); + if (thread_test_and_clear_flags(thread, SVC_STARTING)) + svcpt->scp_nthrs_starting--; + + if (thread_test_and_clear_flags(thread, SVC_RUNNING)) { + /* must know immediately */ + svcpt->scp_nthrs_running--; + } + + thread->t_id = rc; + thread_add_flags(thread, SVC_STOPPED); + + wake_up(&thread->t_ctl_waitq); + spin_unlock(&svcpt->scp_lock); + + return rc; +} + +static int hrt_dont_sleep(struct ptlrpc_hr_thread *hrt, + struct list_head *replies) +{ + int result; + + spin_lock(&hrt->hrt_lock); + + list_splice_init(&hrt->hrt_queue, replies); + result = ptlrpc_hr.hr_stopping || !list_empty(replies); + + spin_unlock(&hrt->hrt_lock); + return result; +} + +/** + * Main body of "handle reply" function. 
+ * It processes acked reply states + */ +static int ptlrpc_hr_main(void *arg) +{ + struct ptlrpc_hr_thread *hrt = (struct ptlrpc_hr_thread *)arg; + struct ptlrpc_hr_partition *hrp = hrt->hrt_partition; + LIST_HEAD (replies); + char threadname[20]; + int rc; + + snprintf(threadname, sizeof(threadname), "ptlrpc_hr%02d_%03d", + hrp->hrp_cpt, hrt->hrt_id); + unshare_fs_struct(); + + rc = cfs_cpt_bind(ptlrpc_hr.hr_cpt_table, hrp->hrp_cpt); + if (rc != 0) { + CWARN("Failed to bind %s on CPT %d of CPT table %p: rc = %d\n", + threadname, hrp->hrp_cpt, ptlrpc_hr.hr_cpt_table, rc); + } + + atomic_inc(&hrp->hrp_nstarted); + wake_up(&ptlrpc_hr.hr_waitq); + + while (!ptlrpc_hr.hr_stopping) { + l_wait_condition(hrt->hrt_waitq, hrt_dont_sleep(hrt, &replies)); + + while (!list_empty(&replies)) { + struct ptlrpc_reply_state *rs; + + rs = list_entry(replies.prev, + struct ptlrpc_reply_state, + rs_list); + list_del_init(&rs->rs_list); + ptlrpc_handle_rs(rs); + } + } + + atomic_inc(&hrp->hrp_nstopped); + wake_up(&ptlrpc_hr.hr_waitq); + + return 0; +} + +static void ptlrpc_stop_hr_threads(void) +{ + struct ptlrpc_hr_partition *hrp; + int i; + int j; + + ptlrpc_hr.hr_stopping = 1; + + cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) { + if (hrp->hrp_thrs == NULL) + continue; /* uninitialized */ + for (j = 0; j < hrp->hrp_nthrs; j++) + wake_up_all(&hrp->hrp_thrs[j].hrt_waitq); + } + + cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) { + if (hrp->hrp_thrs == NULL) + continue; /* uninitialized */ + wait_event(ptlrpc_hr.hr_waitq, + atomic_read(&hrp->hrp_nstopped) == + atomic_read(&hrp->hrp_nstarted)); + } +} + +static int ptlrpc_start_hr_threads(void) +{ + struct ptlrpc_hr_partition *hrp; + int i; + int j; + + cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) { + int rc = 0; + + for (j = 0; j < hrp->hrp_nthrs; j++) { + struct ptlrpc_hr_thread *hrt = &hrp->hrp_thrs[j]; + rc = PTR_ERR(kthread_run(ptlrpc_hr_main, + &hrp->hrp_thrs[j], + "ptlrpc_hr%02d_%03d", + hrp->hrp_cpt, + hrt->hrt_id)); + if (IS_ERR_VALUE(rc)) + break; + } + wait_event(ptlrpc_hr.hr_waitq, + atomic_read(&hrp->hrp_nstarted) == j); + if (!IS_ERR_VALUE(rc)) + continue; + + CERROR("Reply handling thread %d:%d Failed on starting: rc = %d\n", + i, j, rc); + ptlrpc_stop_hr_threads(); + return rc; + } + return 0; +} + +static void ptlrpc_svcpt_stop_threads(struct ptlrpc_service_part *svcpt) +{ + struct l_wait_info lwi = { 0 }; + struct ptlrpc_thread *thread; + LIST_HEAD (zombie); + + CDEBUG(D_INFO, "Stopping threads for service %s\n", + svcpt->scp_service->srv_name); + + spin_lock(&svcpt->scp_lock); + /* let the thread know that we would like it to stop asap */ + list_for_each_entry(thread, &svcpt->scp_threads, t_link) { + CDEBUG(D_INFO, "Stopping thread %s #%u\n", + svcpt->scp_service->srv_thread_name, thread->t_id); + thread_add_flags(thread, SVC_STOPPING); + } + + wake_up_all(&svcpt->scp_waitq); + + while (!list_empty(&svcpt->scp_threads)) { + thread = list_entry(svcpt->scp_threads.next, + struct ptlrpc_thread, t_link); + if (thread_is_stopped(thread)) { + list_del(&thread->t_link); + list_add(&thread->t_link, &zombie); + continue; + } + spin_unlock(&svcpt->scp_lock); + + CDEBUG(D_INFO, "waiting for stopping-thread %s #%u\n", + svcpt->scp_service->srv_thread_name, thread->t_id); + l_wait_event(thread->t_ctl_waitq, + thread_is_stopped(thread), &lwi); + + spin_lock(&svcpt->scp_lock); + } + + spin_unlock(&svcpt->scp_lock); + + while (!list_empty(&zombie)) { + thread = list_entry(zombie.next, + struct ptlrpc_thread, t_link); + 
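+ /* Already off scp_threads and marked stopped; drop it from the
+ * private zombie list and free the descriptor without scp_lock. */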
list_del(&thread->t_link); + OBD_FREE_PTR(thread); + } +} + +/** + * Stops all threads of a particular service \a svc + */ +void ptlrpc_stop_all_threads(struct ptlrpc_service *svc) +{ + struct ptlrpc_service_part *svcpt; + int i; + + ptlrpc_service_for_each_part(svcpt, i, svc) { + if (svcpt->scp_service != NULL) + ptlrpc_svcpt_stop_threads(svcpt); + } +} +EXPORT_SYMBOL(ptlrpc_stop_all_threads); + +int ptlrpc_start_threads(struct ptlrpc_service *svc) +{ + int rc = 0; + int i; + int j; + + /* We require 2 threads min, see note in ptlrpc_server_handle_request */ + LASSERT(svc->srv_nthrs_cpt_init >= PTLRPC_NTHRS_INIT); + + for (i = 0; i < svc->srv_ncpts; i++) { + for (j = 0; j < svc->srv_nthrs_cpt_init; j++) { + rc = ptlrpc_start_thread(svc->srv_parts[i], 1); + if (rc == 0) + continue; + + if (rc != -EMFILE) + goto failed; + /* We have enough threads, don't start more. b=15759 */ + break; + } + } + + return 0; + failed: + CERROR("cannot start %s thread #%d_%d: rc %d\n", + svc->srv_thread_name, i, j, rc); + ptlrpc_stop_all_threads(svc); + return rc; +} +EXPORT_SYMBOL(ptlrpc_start_threads); + +int ptlrpc_start_thread(struct ptlrpc_service_part *svcpt, int wait) +{ + struct l_wait_info lwi = { 0 }; + struct ptlrpc_thread *thread; + struct ptlrpc_service *svc; + int rc; + + LASSERT(svcpt != NULL); + + svc = svcpt->scp_service; + + CDEBUG(D_RPCTRACE, "%s[%d] started %d min %d max %d\n", + svc->srv_name, svcpt->scp_cpt, svcpt->scp_nthrs_running, + svc->srv_nthrs_cpt_init, svc->srv_nthrs_cpt_limit); + + again: + if (unlikely(svc->srv_is_stopping)) + return -ESRCH; + + if (!ptlrpc_threads_increasable(svcpt) || + (OBD_FAIL_CHECK(OBD_FAIL_TGT_TOOMANY_THREADS) && + svcpt->scp_nthrs_running == svc->srv_nthrs_cpt_init - 1)) + return -EMFILE; + + OBD_CPT_ALLOC_PTR(thread, svc->srv_cptable, svcpt->scp_cpt); + if (thread == NULL) + return -ENOMEM; + init_waitqueue_head(&thread->t_ctl_waitq); + + spin_lock(&svcpt->scp_lock); + if (!ptlrpc_threads_increasable(svcpt)) { + spin_unlock(&svcpt->scp_lock); + OBD_FREE_PTR(thread); + return -EMFILE; + } + + if (svcpt->scp_nthrs_starting != 0) { + /* serialize starting because some modules (obdfilter) + * might require unique and contiguous t_id */ + LASSERT(svcpt->scp_nthrs_starting == 1); + spin_unlock(&svcpt->scp_lock); + OBD_FREE_PTR(thread); + if (wait) { + CDEBUG(D_INFO, "Waiting for creating thread %s #%d\n", + svc->srv_thread_name, svcpt->scp_thr_nextid); + schedule(); + goto again; + } + + CDEBUG(D_INFO, "Creating thread %s #%d race, retry later\n", + svc->srv_thread_name, svcpt->scp_thr_nextid); + return -EAGAIN; + } + + svcpt->scp_nthrs_starting++; + thread->t_id = svcpt->scp_thr_nextid++; + thread_add_flags(thread, SVC_STARTING); + thread->t_svcpt = svcpt; + + list_add(&thread->t_link, &svcpt->scp_threads); + spin_unlock(&svcpt->scp_lock); + + if (svcpt->scp_cpt >= 0) { + snprintf(thread->t_name, sizeof(thread->t_name), "%s%02d_%03d", + svc->srv_thread_name, svcpt->scp_cpt, thread->t_id); + } else { + snprintf(thread->t_name, sizeof(thread->t_name), "%s_%04d", + svc->srv_thread_name, thread->t_id); + } + + CDEBUG(D_RPCTRACE, "starting thread '%s'\n", thread->t_name); + rc = PTR_ERR(kthread_run(ptlrpc_main, thread, "%s", thread->t_name)); + if (IS_ERR_VALUE(rc)) { + CERROR("cannot start thread '%s': rc %d\n", + thread->t_name, rc); + spin_lock(&svcpt->scp_lock); + --svcpt->scp_nthrs_starting; + if (thread_is_stopping(thread)) { + /* this ptlrpc_thread is being handled + * by ptlrpc_svcpt_stop_threads now + */ + thread_add_flags(thread, SVC_STOPPED); + 
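+ /* ptlrpc_svcpt_stop_threads() is waiting on t_ctl_waitq for this
+ * thread to report SVC_STOPPED, so wake it before unlocking. */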
wake_up(&thread->t_ctl_waitq); + spin_unlock(&svcpt->scp_lock); + } else { + list_del(&thread->t_link); + spin_unlock(&svcpt->scp_lock); + OBD_FREE_PTR(thread); + } + return rc; + } + + if (!wait) + return 0; + + l_wait_event(thread->t_ctl_waitq, + thread_is_running(thread) || thread_is_stopped(thread), + &lwi); + + rc = thread_is_stopped(thread) ? thread->t_id : 0; + return rc; +} + +int ptlrpc_hr_init(void) +{ + struct ptlrpc_hr_partition *hrp; + struct ptlrpc_hr_thread *hrt; + int rc; + int i; + int j; + int weight; + + memset(&ptlrpc_hr, 0, sizeof(ptlrpc_hr)); + ptlrpc_hr.hr_cpt_table = cfs_cpt_table; + + ptlrpc_hr.hr_partitions = cfs_percpt_alloc(ptlrpc_hr.hr_cpt_table, + sizeof(*hrp)); + if (ptlrpc_hr.hr_partitions == NULL) + return -ENOMEM; + + init_waitqueue_head(&ptlrpc_hr.hr_waitq); + + weight = cpumask_weight(topology_thread_cpumask(0)); + + cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) { + hrp->hrp_cpt = i; + + atomic_set(&hrp->hrp_nstarted, 0); + atomic_set(&hrp->hrp_nstopped, 0); + + hrp->hrp_nthrs = cfs_cpt_weight(ptlrpc_hr.hr_cpt_table, i); + hrp->hrp_nthrs /= weight; + + LASSERT(hrp->hrp_nthrs > 0); + OBD_CPT_ALLOC(hrp->hrp_thrs, ptlrpc_hr.hr_cpt_table, i, + hrp->hrp_nthrs * sizeof(*hrt)); + if (hrp->hrp_thrs == NULL) { + rc = -ENOMEM; + goto out; + } + + for (j = 0; j < hrp->hrp_nthrs; j++) { + hrt = &hrp->hrp_thrs[j]; + + hrt->hrt_id = j; + hrt->hrt_partition = hrp; + init_waitqueue_head(&hrt->hrt_waitq); + spin_lock_init(&hrt->hrt_lock); + INIT_LIST_HEAD(&hrt->hrt_queue); + } + } + + rc = ptlrpc_start_hr_threads(); +out: + if (rc != 0) + ptlrpc_hr_fini(); + return rc; +} + +void ptlrpc_hr_fini(void) +{ + struct ptlrpc_hr_partition *hrp; + int i; + + if (ptlrpc_hr.hr_partitions == NULL) + return; + + ptlrpc_stop_hr_threads(); + + cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) { + if (hrp->hrp_thrs != NULL) { + OBD_FREE(hrp->hrp_thrs, + hrp->hrp_nthrs * sizeof(hrp->hrp_thrs[0])); + } + } + + cfs_percpt_free(ptlrpc_hr.hr_partitions); + ptlrpc_hr.hr_partitions = NULL; +} + + +/** + * Wait until all already scheduled replies are processed. + */ +static void ptlrpc_wait_replies(struct ptlrpc_service_part *svcpt) +{ + while (1) { + int rc; + struct l_wait_info lwi = LWI_TIMEOUT(cfs_time_seconds(10), + NULL, NULL); + + rc = l_wait_event(svcpt->scp_waitq, + atomic_read(&svcpt->scp_nreps_difficult) == 0, &lwi); + if (rc == 0) + break; + CWARN("Unexpectedly long timeout %s %p\n", + svcpt->scp_service->srv_name, svcpt->scp_service); + } +} + +static void +ptlrpc_service_del_atimer(struct ptlrpc_service *svc) +{ + struct ptlrpc_service_part *svcpt; + int i; + + /* early disarm AT timer... */ + ptlrpc_service_for_each_part(svcpt, i, svc) { + if (svcpt->scp_service != NULL) + cfs_timer_disarm(&svcpt->scp_at_timer); + } +} + +static void +ptlrpc_service_unlink_rqbd(struct ptlrpc_service *svc) +{ + struct ptlrpc_service_part *svcpt; + struct ptlrpc_request_buffer_desc *rqbd; + struct l_wait_info lwi; + int rc; + int i; + + /* All history will be culled when the next request buffer is + * freed in ptlrpc_service_purge_all() */ + svc->srv_hist_nrqbds_cpt_max = 0; + + rc = LNetClearLazyPortal(svc->srv_req_portal); + LASSERT(rc == 0); + + ptlrpc_service_for_each_part(svcpt, i, svc) { + if (svcpt->scp_service == NULL) + break; + + /* Unlink all the request buffers. 
This forces a 'final' + * event with its 'unlink' flag set for each posted rqbd */ + list_for_each_entry(rqbd, &svcpt->scp_rqbd_posted, + rqbd_list) { + rc = LNetMDUnlink(rqbd->rqbd_md_h); + LASSERT(rc == 0 || rc == -ENOENT); + } + } + + ptlrpc_service_for_each_part(svcpt, i, svc) { + if (svcpt->scp_service == NULL) + break; + + /* Wait for the network to release any buffers + * it's currently filling */ + spin_lock(&svcpt->scp_lock); + while (svcpt->scp_nrqbds_posted != 0) { + spin_unlock(&svcpt->scp_lock); + /* Network access will complete in finite time but + * the HUGE timeout lets us CWARN for visibility + * of sluggish NALs */ + lwi = LWI_TIMEOUT_INTERVAL( + cfs_time_seconds(LONG_UNLINK), + cfs_time_seconds(1), NULL, NULL); + rc = l_wait_event(svcpt->scp_waitq, + svcpt->scp_nrqbds_posted == 0, &lwi); + if (rc == -ETIMEDOUT) { + CWARN("Service %s waiting for request buffers\n", + svcpt->scp_service->srv_name); + } + spin_lock(&svcpt->scp_lock); + } + spin_unlock(&svcpt->scp_lock); + } +} + +static void +ptlrpc_service_purge_all(struct ptlrpc_service *svc) +{ + struct ptlrpc_service_part *svcpt; + struct ptlrpc_request_buffer_desc *rqbd; + struct ptlrpc_request *req; + struct ptlrpc_reply_state *rs; + int i; + + ptlrpc_service_for_each_part(svcpt, i, svc) { + if (svcpt->scp_service == NULL) + break; + + spin_lock(&svcpt->scp_rep_lock); + while (!list_empty(&svcpt->scp_rep_active)) { + rs = list_entry(svcpt->scp_rep_active.next, + struct ptlrpc_reply_state, rs_list); + spin_lock(&rs->rs_lock); + ptlrpc_schedule_difficult_reply(rs); + spin_unlock(&rs->rs_lock); + } + spin_unlock(&svcpt->scp_rep_lock); + + /* purge the request queue. NB No new replies (rqbds + * all unlinked) and no service threads, so I'm the only + * thread noodling the request queue now */ + while (!list_empty(&svcpt->scp_req_incoming)) { + req = list_entry(svcpt->scp_req_incoming.next, + struct ptlrpc_request, rq_list); + + list_del(&req->rq_list); + svcpt->scp_nreqs_incoming--; + ptlrpc_server_finish_request(svcpt, req); + } + + while (ptlrpc_server_request_pending(svcpt, true)) { + req = ptlrpc_server_request_get(svcpt, true); + ptlrpc_server_finish_active_request(svcpt, req); + } + + LASSERT(list_empty(&svcpt->scp_rqbd_posted)); + LASSERT(svcpt->scp_nreqs_incoming == 0); + LASSERT(svcpt->scp_nreqs_active == 0); + /* history should have been culled by + * ptlrpc_server_finish_request */ + LASSERT(svcpt->scp_hist_nrqbds == 0); + + /* Now free all the request buffers since nothing + * references them any more... 
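+ * (idle reply states parked on scp_rep_idle are released just below as
+ * well, once ptlrpc_wait_replies() has seen the last difficult reply
+ * drain)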
*/ + + while (!list_empty(&svcpt->scp_rqbd_idle)) { + rqbd = list_entry(svcpt->scp_rqbd_idle.next, + struct ptlrpc_request_buffer_desc, + rqbd_list); + ptlrpc_free_rqbd(rqbd); + } + ptlrpc_wait_replies(svcpt); + + while (!list_empty(&svcpt->scp_rep_idle)) { + rs = list_entry(svcpt->scp_rep_idle.next, + struct ptlrpc_reply_state, + rs_list); + list_del(&rs->rs_list); + OBD_FREE_LARGE(rs, svc->srv_max_reply_size); + } + } +} + +static void +ptlrpc_service_free(struct ptlrpc_service *svc) +{ + struct ptlrpc_service_part *svcpt; + struct ptlrpc_at_array *array; + int i; + + ptlrpc_service_for_each_part(svcpt, i, svc) { + if (svcpt->scp_service == NULL) + break; + + /* In case somebody rearmed this in the meantime */ + cfs_timer_disarm(&svcpt->scp_at_timer); + array = &svcpt->scp_at_array; + + if (array->paa_reqs_array != NULL) { + OBD_FREE(array->paa_reqs_array, + sizeof(struct list_head) * array->paa_size); + array->paa_reqs_array = NULL; + } + + if (array->paa_reqs_count != NULL) { + OBD_FREE(array->paa_reqs_count, + sizeof(__u32) * array->paa_size); + array->paa_reqs_count = NULL; + } + } + + ptlrpc_service_for_each_part(svcpt, i, svc) + OBD_FREE_PTR(svcpt); + + if (svc->srv_cpts != NULL) + cfs_expr_list_values_free(svc->srv_cpts, svc->srv_ncpts); + + OBD_FREE(svc, offsetof(struct ptlrpc_service, + srv_parts[svc->srv_ncpts])); +} + +int ptlrpc_unregister_service(struct ptlrpc_service *service) +{ + CDEBUG(D_NET, "%s: tearing down\n", service->srv_name); + + service->srv_is_stopping = 1; + + mutex_lock(&ptlrpc_all_services_mutex); + list_del_init(&service->srv_list); + mutex_unlock(&ptlrpc_all_services_mutex); + + ptlrpc_service_del_atimer(service); + ptlrpc_stop_all_threads(service); + + ptlrpc_service_unlink_rqbd(service); + ptlrpc_service_purge_all(service); + ptlrpc_service_nrs_cleanup(service); + + ptlrpc_lprocfs_unregister_service(service); + + ptlrpc_service_free(service); + + return 0; +} +EXPORT_SYMBOL(ptlrpc_unregister_service); + +/** + * Returns 0 if the service is healthy. + * + * Right now, it just checks to make sure that requests aren't languishing + * in the queue. We'll use this health check to govern whether a node needs + * to be shot, so it's intentionally non-aggressive. */ +int ptlrpc_svcpt_health_check(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_request *request = NULL; + struct timeval right_now; + long timediff; + + do_gettimeofday(&right_now); + + spin_lock(&svcpt->scp_req_lock); + /* How long has the next entry been waiting? */ + if (ptlrpc_server_high_pending(svcpt, true)) + request = ptlrpc_nrs_req_peek_nolock(svcpt, true); + else if (ptlrpc_server_normal_pending(svcpt, true)) + request = ptlrpc_nrs_req_peek_nolock(svcpt, false); + + if (request == NULL) { + spin_unlock(&svcpt->scp_req_lock); + return 0; + } + + timediff = cfs_timeval_sub(&right_now, &request->rq_arrival_time, NULL); + spin_unlock(&svcpt->scp_req_lock); + + if ((timediff / ONE_MILLION) > + (AT_OFF ? 
obd_timeout * 3 / 2 : at_max)) { + CERROR("%s: unhealthy - request has been waiting %lds\n", + svcpt->scp_service->srv_name, timediff / ONE_MILLION); + return -1; + } + + return 0; +} + +int +ptlrpc_service_health_check(struct ptlrpc_service *svc) +{ + struct ptlrpc_service_part *svcpt; + int i; + + if (svc == NULL) + return 0; + + ptlrpc_service_for_each_part(svcpt, i, svc) { + int rc = ptlrpc_svcpt_health_check(svcpt); + + if (rc != 0) + return rc; + } + return 0; +} +EXPORT_SYMBOL(ptlrpc_service_health_check); diff --git a/kernel/drivers/staging/lustre/lustre/ptlrpc/wiretest.c b/kernel/drivers/staging/lustre/lustre/ptlrpc/wiretest.c new file mode 100644 index 000000000..d6d92046c --- /dev/null +++ b/kernel/drivers/staging/lustre/lustre/ptlrpc/wiretest.c @@ -0,0 +1,4492 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include +#include + +#include "../include/obd_support.h" +#include "../include/obd_class.h" +#include "../include/lustre_net.h" +#include "../include/lustre_disk.h" +void lustre_assert_wire_constants(void) +{ + /* Wire protocol assertions generated by 'wirecheck' + * (make -C lustre/utils newwiretest) + * running on Linux centos6-bis 2.6.32-358.0.1.el6-head + * #3 SMP Wed Apr 17 17:37:43 CEST 2013 + * with gcc version 4.4.6 20110731 (Red Hat 4.4.6-3) (GCC) + */ + + /* Constants... 
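+ * These pin opcode numbers, flag bits and magic values to their
+ * on-the-wire encodings; any assertion failing here means the protocol
+ * headers no longer match what peers expect.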
*/ + LASSERTF(PTL_RPC_MSG_REQUEST == 4711, "found %lld\n", + (long long)PTL_RPC_MSG_REQUEST); + LASSERTF(PTL_RPC_MSG_ERR == 4712, "found %lld\n", + (long long)PTL_RPC_MSG_ERR); + LASSERTF(PTL_RPC_MSG_REPLY == 4713, "found %lld\n", + (long long)PTL_RPC_MSG_REPLY); + LASSERTF(MDS_DIR_END_OFF == 0xfffffffffffffffeULL, "found 0x%.16llxULL\n", + MDS_DIR_END_OFF); + LASSERTF(DEAD_HANDLE_MAGIC == 0xdeadbeefcafebabeULL, "found 0x%.16llxULL\n", + DEAD_HANDLE_MAGIC); + CLASSERT(MTI_NAME_MAXLEN == 64); + LASSERTF(OST_REPLY == 0, "found %lld\n", + (long long)OST_REPLY); + LASSERTF(OST_GETATTR == 1, "found %lld\n", + (long long)OST_GETATTR); + LASSERTF(OST_SETATTR == 2, "found %lld\n", + (long long)OST_SETATTR); + LASSERTF(OST_READ == 3, "found %lld\n", + (long long)OST_READ); + LASSERTF(OST_WRITE == 4, "found %lld\n", + (long long)OST_WRITE); + LASSERTF(OST_CREATE == 5, "found %lld\n", + (long long)OST_CREATE); + LASSERTF(OST_DESTROY == 6, "found %lld\n", + (long long)OST_DESTROY); + LASSERTF(OST_GET_INFO == 7, "found %lld\n", + (long long)OST_GET_INFO); + LASSERTF(OST_CONNECT == 8, "found %lld\n", + (long long)OST_CONNECT); + LASSERTF(OST_DISCONNECT == 9, "found %lld\n", + (long long)OST_DISCONNECT); + LASSERTF(OST_PUNCH == 10, "found %lld\n", + (long long)OST_PUNCH); + LASSERTF(OST_OPEN == 11, "found %lld\n", + (long long)OST_OPEN); + LASSERTF(OST_CLOSE == 12, "found %lld\n", + (long long)OST_CLOSE); + LASSERTF(OST_STATFS == 13, "found %lld\n", + (long long)OST_STATFS); + LASSERTF(OST_SYNC == 16, "found %lld\n", + (long long)OST_SYNC); + LASSERTF(OST_SET_INFO == 17, "found %lld\n", + (long long)OST_SET_INFO); + LASSERTF(OST_QUOTACHECK == 18, "found %lld\n", + (long long)OST_QUOTACHECK); + LASSERTF(OST_QUOTACTL == 19, "found %lld\n", + (long long)OST_QUOTACTL); + LASSERTF(OST_QUOTA_ADJUST_QUNIT == 20, "found %lld\n", + (long long)OST_QUOTA_ADJUST_QUNIT); + LASSERTF(OST_LAST_OPC == 21, "found %lld\n", + (long long)OST_LAST_OPC); + LASSERTF(OBD_OBJECT_EOF == 0xffffffffffffffffULL, "found 0x%.16llxULL\n", + OBD_OBJECT_EOF); + LASSERTF(OST_MIN_PRECREATE == 32, "found %lld\n", + (long long)OST_MIN_PRECREATE); + LASSERTF(OST_MAX_PRECREATE == 20000, "found %lld\n", + (long long)OST_MAX_PRECREATE); + LASSERTF(OST_LVB_ERR_INIT == 0xffbadbad80000000ULL, "found 0x%.16llxULL\n", + OST_LVB_ERR_INIT); + LASSERTF(OST_LVB_ERR_MASK == 0xffbadbad00000000ULL, "found 0x%.16llxULL\n", + OST_LVB_ERR_MASK); + LASSERTF(MDS_FIRST_OPC == 33, "found %lld\n", + (long long)MDS_FIRST_OPC); + LASSERTF(MDS_GETATTR == 33, "found %lld\n", + (long long)MDS_GETATTR); + LASSERTF(MDS_GETATTR_NAME == 34, "found %lld\n", + (long long)MDS_GETATTR_NAME); + LASSERTF(MDS_CLOSE == 35, "found %lld\n", + (long long)MDS_CLOSE); + LASSERTF(MDS_REINT == 36, "found %lld\n", + (long long)MDS_REINT); + LASSERTF(MDS_READPAGE == 37, "found %lld\n", + (long long)MDS_READPAGE); + LASSERTF(MDS_CONNECT == 38, "found %lld\n", + (long long)MDS_CONNECT); + LASSERTF(MDS_DISCONNECT == 39, "found %lld\n", + (long long)MDS_DISCONNECT); + LASSERTF(MDS_GETSTATUS == 40, "found %lld\n", + (long long)MDS_GETSTATUS); + LASSERTF(MDS_STATFS == 41, "found %lld\n", + (long long)MDS_STATFS); + LASSERTF(MDS_PIN == 42, "found %lld\n", + (long long)MDS_PIN); + LASSERTF(MDS_UNPIN == 43, "found %lld\n", + (long long)MDS_UNPIN); + LASSERTF(MDS_SYNC == 44, "found %lld\n", + (long long)MDS_SYNC); + LASSERTF(MDS_DONE_WRITING == 45, "found %lld\n", + (long long)MDS_DONE_WRITING); + LASSERTF(MDS_SET_INFO == 46, "found %lld\n", + (long long)MDS_SET_INFO); + LASSERTF(MDS_QUOTACHECK == 47, 
"found %lld\n", + (long long)MDS_QUOTACHECK); + LASSERTF(MDS_QUOTACTL == 48, "found %lld\n", + (long long)MDS_QUOTACTL); + LASSERTF(MDS_GETXATTR == 49, "found %lld\n", + (long long)MDS_GETXATTR); + LASSERTF(MDS_SETXATTR == 50, "found %lld\n", + (long long)MDS_SETXATTR); + LASSERTF(MDS_WRITEPAGE == 51, "found %lld\n", + (long long)MDS_WRITEPAGE); + LASSERTF(MDS_IS_SUBDIR == 52, "found %lld\n", + (long long)MDS_IS_SUBDIR); + LASSERTF(MDS_GET_INFO == 53, "found %lld\n", + (long long)MDS_GET_INFO); + LASSERTF(MDS_HSM_STATE_GET == 54, "found %lld\n", + (long long)MDS_HSM_STATE_GET); + LASSERTF(MDS_HSM_STATE_SET == 55, "found %lld\n", + (long long)MDS_HSM_STATE_SET); + LASSERTF(MDS_HSM_ACTION == 56, "found %lld\n", + (long long)MDS_HSM_ACTION); + LASSERTF(MDS_HSM_PROGRESS == 57, "found %lld\n", + (long long)MDS_HSM_PROGRESS); + LASSERTF(MDS_HSM_REQUEST == 58, "found %lld\n", + (long long)MDS_HSM_REQUEST); + LASSERTF(MDS_HSM_CT_REGISTER == 59, "found %lld\n", + (long long)MDS_HSM_CT_REGISTER); + LASSERTF(MDS_HSM_CT_UNREGISTER == 60, "found %lld\n", + (long long)MDS_HSM_CT_UNREGISTER); + LASSERTF(MDS_SWAP_LAYOUTS == 61, "found %lld\n", + (long long)MDS_SWAP_LAYOUTS); + LASSERTF(MDS_LAST_OPC == 62, "found %lld\n", + (long long)MDS_LAST_OPC); + LASSERTF(REINT_SETATTR == 1, "found %lld\n", + (long long)REINT_SETATTR); + LASSERTF(REINT_CREATE == 2, "found %lld\n", + (long long)REINT_CREATE); + LASSERTF(REINT_LINK == 3, "found %lld\n", + (long long)REINT_LINK); + LASSERTF(REINT_UNLINK == 4, "found %lld\n", + (long long)REINT_UNLINK); + LASSERTF(REINT_RENAME == 5, "found %lld\n", + (long long)REINT_RENAME); + LASSERTF(REINT_OPEN == 6, "found %lld\n", + (long long)REINT_OPEN); + LASSERTF(REINT_SETXATTR == 7, "found %lld\n", + (long long)REINT_SETXATTR); + LASSERTF(REINT_RMENTRY == 8, "found %lld\n", + (long long)REINT_RMENTRY); + LASSERTF(REINT_MAX == 9, "found %lld\n", + (long long)REINT_MAX); + LASSERTF(DISP_IT_EXECD == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)DISP_IT_EXECD); + LASSERTF(DISP_LOOKUP_EXECD == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)DISP_LOOKUP_EXECD); + LASSERTF(DISP_LOOKUP_NEG == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)DISP_LOOKUP_NEG); + LASSERTF(DISP_LOOKUP_POS == 0x00000008UL, "found 0x%.8xUL\n", + (unsigned)DISP_LOOKUP_POS); + LASSERTF(DISP_OPEN_CREATE == 0x00000010UL, "found 0x%.8xUL\n", + (unsigned)DISP_OPEN_CREATE); + LASSERTF(DISP_OPEN_OPEN == 0x00000020UL, "found 0x%.8xUL\n", + (unsigned)DISP_OPEN_OPEN); + LASSERTF(DISP_ENQ_COMPLETE == 0x00400000UL, "found 0x%.8xUL\n", + (unsigned)DISP_ENQ_COMPLETE); + LASSERTF(DISP_ENQ_OPEN_REF == 0x00800000UL, "found 0x%.8xUL\n", + (unsigned)DISP_ENQ_OPEN_REF); + LASSERTF(DISP_ENQ_CREATE_REF == 0x01000000UL, "found 0x%.8xUL\n", + (unsigned)DISP_ENQ_CREATE_REF); + LASSERTF(DISP_OPEN_LOCK == 0x02000000UL, "found 0x%.8xUL\n", + (unsigned)DISP_OPEN_LOCK); + LASSERTF(MDS_STATUS_CONN == 1, "found %lld\n", + (long long)MDS_STATUS_CONN); + LASSERTF(MDS_STATUS_LOV == 2, "found %lld\n", + (long long)MDS_STATUS_LOV); + LASSERTF(LUSTRE_BFLAG_UNCOMMITTED_WRITES == 1, "found %lld\n", + (long long)LUSTRE_BFLAG_UNCOMMITTED_WRITES); + LASSERTF(MF_SOM_CHANGE == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)MF_SOM_CHANGE); + LASSERTF(MF_EPOCH_OPEN == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)MF_EPOCH_OPEN); + LASSERTF(MF_EPOCH_CLOSE == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)MF_EPOCH_CLOSE); + LASSERTF(MF_MDC_CANCEL_FID1 == 0x00000008UL, "found 0x%.8xUL\n", + (unsigned)MF_MDC_CANCEL_FID1); + LASSERTF(MF_MDC_CANCEL_FID2 == 
0x00000010UL, "found 0x%.8xUL\n", + (unsigned)MF_MDC_CANCEL_FID2); + LASSERTF(MF_MDC_CANCEL_FID3 == 0x00000020UL, "found 0x%.8xUL\n", + (unsigned)MF_MDC_CANCEL_FID3); + LASSERTF(MF_MDC_CANCEL_FID4 == 0x00000040UL, "found 0x%.8xUL\n", + (unsigned)MF_MDC_CANCEL_FID4); + LASSERTF(MF_SOM_AU == 0x00000080UL, "found 0x%.8xUL\n", + (unsigned)MF_SOM_AU); + LASSERTF(MF_GETATTR_LOCK == 0x00000100UL, "found 0x%.8xUL\n", + (unsigned)MF_GETATTR_LOCK); + LASSERTF(MDS_ATTR_MODE == 0x0000000000000001ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_MODE); + LASSERTF(MDS_ATTR_UID == 0x0000000000000002ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_UID); + LASSERTF(MDS_ATTR_GID == 0x0000000000000004ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_GID); + LASSERTF(MDS_ATTR_SIZE == 0x0000000000000008ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_SIZE); + LASSERTF(MDS_ATTR_ATIME == 0x0000000000000010ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_ATIME); + LASSERTF(MDS_ATTR_MTIME == 0x0000000000000020ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_MTIME); + LASSERTF(MDS_ATTR_CTIME == 0x0000000000000040ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_CTIME); + LASSERTF(MDS_ATTR_ATIME_SET == 0x0000000000000080ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_ATIME_SET); + LASSERTF(MDS_ATTR_MTIME_SET == 0x0000000000000100ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_MTIME_SET); + LASSERTF(MDS_ATTR_FORCE == 0x0000000000000200ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_FORCE); + LASSERTF(MDS_ATTR_ATTR_FLAG == 0x0000000000000400ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_ATTR_FLAG); + LASSERTF(MDS_ATTR_KILL_SUID == 0x0000000000000800ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_KILL_SUID); + LASSERTF(MDS_ATTR_KILL_SGID == 0x0000000000001000ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_KILL_SGID); + LASSERTF(MDS_ATTR_CTIME_SET == 0x0000000000002000ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_CTIME_SET); + LASSERTF(MDS_ATTR_FROM_OPEN == 0x0000000000004000ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_FROM_OPEN); + LASSERTF(MDS_ATTR_BLOCKS == 0x0000000000008000ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_BLOCKS); + LASSERTF(FLD_QUERY == 900, "found %lld\n", + (long long)FLD_QUERY); + LASSERTF(FLD_FIRST_OPC == 900, "found %lld\n", + (long long)FLD_FIRST_OPC); + LASSERTF(FLD_LAST_OPC == 901, "found %lld\n", + (long long)FLD_LAST_OPC); + LASSERTF(SEQ_QUERY == 700, "found %lld\n", + (long long)SEQ_QUERY); + LASSERTF(SEQ_FIRST_OPC == 700, "found %lld\n", + (long long)SEQ_FIRST_OPC); + LASSERTF(SEQ_LAST_OPC == 701, "found %lld\n", + (long long)SEQ_LAST_OPC); + LASSERTF(SEQ_ALLOC_SUPER == 0, "found %lld\n", + (long long)SEQ_ALLOC_SUPER); + LASSERTF(SEQ_ALLOC_META == 1, "found %lld\n", + (long long)SEQ_ALLOC_META); + LASSERTF(LDLM_ENQUEUE == 101, "found %lld\n", + (long long)LDLM_ENQUEUE); + LASSERTF(LDLM_CONVERT == 102, "found %lld\n", + (long long)LDLM_CONVERT); + LASSERTF(LDLM_CANCEL == 103, "found %lld\n", + (long long)LDLM_CANCEL); + LASSERTF(LDLM_BL_CALLBACK == 104, "found %lld\n", + (long long)LDLM_BL_CALLBACK); + LASSERTF(LDLM_CP_CALLBACK == 105, "found %lld\n", + (long long)LDLM_CP_CALLBACK); + LASSERTF(LDLM_GL_CALLBACK == 106, "found %lld\n", + (long long)LDLM_GL_CALLBACK); + LASSERTF(LDLM_SET_INFO == 107, "found %lld\n", + (long long)LDLM_SET_INFO); + LASSERTF(LDLM_LAST_OPC == 108, "found %lld\n", + (long long)LDLM_LAST_OPC); + LASSERTF(LCK_MINMODE == 0, "found %lld\n", + (long long)LCK_MINMODE); + LASSERTF(LCK_EX == 1, "found %lld\n", + (long 
long)LCK_EX); + LASSERTF(LCK_PW == 2, "found %lld\n", + (long long)LCK_PW); + LASSERTF(LCK_PR == 4, "found %lld\n", + (long long)LCK_PR); + LASSERTF(LCK_CW == 8, "found %lld\n", + (long long)LCK_CW); + LASSERTF(LCK_CR == 16, "found %lld\n", + (long long)LCK_CR); + LASSERTF(LCK_NL == 32, "found %lld\n", + (long long)LCK_NL); + LASSERTF(LCK_GROUP == 64, "found %lld\n", + (long long)LCK_GROUP); + LASSERTF(LCK_COS == 128, "found %lld\n", + (long long)LCK_COS); + LASSERTF(LCK_MAXMODE == 129, "found %lld\n", + (long long)LCK_MAXMODE); + LASSERTF(LCK_MODE_NUM == 8, "found %lld\n", + (long long)LCK_MODE_NUM); + CLASSERT(LDLM_PLAIN == 10); + CLASSERT(LDLM_EXTENT == 11); + CLASSERT(LDLM_FLOCK == 12); + CLASSERT(LDLM_IBITS == 13); + CLASSERT(LDLM_MAX_TYPE == 14); + CLASSERT(LUSTRE_RES_ID_SEQ_OFF == 0); + CLASSERT(LUSTRE_RES_ID_VER_OID_OFF == 1); + LASSERTF(UPDATE_OBJ == 1000, "found %lld\n", + (long long)UPDATE_OBJ); + LASSERTF(UPDATE_LAST_OPC == 1001, "found %lld\n", + (long long)UPDATE_LAST_OPC); + CLASSERT(LUSTRE_RES_ID_QUOTA_SEQ_OFF == 2); + CLASSERT(LUSTRE_RES_ID_QUOTA_VER_OID_OFF == 3); + CLASSERT(LUSTRE_RES_ID_HSH_OFF == 3); + CLASSERT(LQUOTA_TYPE_USR == 0); + CLASSERT(LQUOTA_TYPE_GRP == 1); + CLASSERT(LQUOTA_RES_MD == 1); + CLASSERT(LQUOTA_RES_DT == 2); + LASSERTF(OBD_PING == 400, "found %lld\n", + (long long)OBD_PING); + LASSERTF(OBD_LOG_CANCEL == 401, "found %lld\n", + (long long)OBD_LOG_CANCEL); + LASSERTF(OBD_QC_CALLBACK == 402, "found %lld\n", + (long long)OBD_QC_CALLBACK); + LASSERTF(OBD_IDX_READ == 403, "found %lld\n", + (long long)OBD_IDX_READ); + LASSERTF(OBD_LAST_OPC == 404, "found %lld\n", + (long long)OBD_LAST_OPC); + LASSERTF(QUOTA_DQACQ == 601, "found %lld\n", + (long long)QUOTA_DQACQ); + LASSERTF(QUOTA_DQREL == 602, "found %lld\n", + (long long)QUOTA_DQREL); + LASSERTF(QUOTA_LAST_OPC == 603, "found %lld\n", + (long long)QUOTA_LAST_OPC); + LASSERTF(MGS_CONNECT == 250, "found %lld\n", + (long long)MGS_CONNECT); + LASSERTF(MGS_DISCONNECT == 251, "found %lld\n", + (long long)MGS_DISCONNECT); + LASSERTF(MGS_EXCEPTION == 252, "found %lld\n", + (long long)MGS_EXCEPTION); + LASSERTF(MGS_TARGET_REG == 253, "found %lld\n", + (long long)MGS_TARGET_REG); + LASSERTF(MGS_TARGET_DEL == 254, "found %lld\n", + (long long)MGS_TARGET_DEL); + LASSERTF(MGS_SET_INFO == 255, "found %lld\n", + (long long)MGS_SET_INFO); + LASSERTF(MGS_LAST_OPC == 257, "found %lld\n", + (long long)MGS_LAST_OPC); + LASSERTF(SEC_CTX_INIT == 801, "found %lld\n", + (long long)SEC_CTX_INIT); + LASSERTF(SEC_CTX_INIT_CONT == 802, "found %lld\n", + (long long)SEC_CTX_INIT_CONT); + LASSERTF(SEC_CTX_FINI == 803, "found %lld\n", + (long long)SEC_CTX_FINI); + LASSERTF(SEC_LAST_OPC == 804, "found %lld\n", + (long long)SEC_LAST_OPC); + /* Sizes and Offsets */ + + /* Checks for struct obd_uuid */ + LASSERTF((int)sizeof(struct obd_uuid) == 40, "found %lld\n", + (long long)(int)sizeof(struct obd_uuid)); + + /* Checks for struct lu_seq_range */ + LASSERTF((int)sizeof(struct lu_seq_range) == 24, "found %lld\n", + (long long)(int)sizeof(struct lu_seq_range)); + LASSERTF((int)offsetof(struct lu_seq_range, lsr_start) == 0, "found %lld\n", + (long long)(int)offsetof(struct lu_seq_range, lsr_start)); + LASSERTF((int)sizeof(((struct lu_seq_range *)0)->lsr_start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_seq_range *)0)->lsr_start)); + LASSERTF((int)offsetof(struct lu_seq_range, lsr_end) == 8, "found %lld\n", + (long long)(int)offsetof(struct lu_seq_range, lsr_end)); + LASSERTF((int)sizeof(((struct lu_seq_range *)0)->lsr_end) == 
8, "found %lld\n", + (long long)(int)sizeof(((struct lu_seq_range *)0)->lsr_end)); + LASSERTF((int)offsetof(struct lu_seq_range, lsr_index) == 16, "found %lld\n", + (long long)(int)offsetof(struct lu_seq_range, lsr_index)); + LASSERTF((int)sizeof(((struct lu_seq_range *)0)->lsr_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_seq_range *)0)->lsr_index)); + LASSERTF((int)offsetof(struct lu_seq_range, lsr_flags) == 20, "found %lld\n", + (long long)(int)offsetof(struct lu_seq_range, lsr_flags)); + LASSERTF((int)sizeof(((struct lu_seq_range *)0)->lsr_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_seq_range *)0)->lsr_flags)); + LASSERTF(LU_SEQ_RANGE_MDT == 0, "found %lld\n", + (long long)LU_SEQ_RANGE_MDT); + LASSERTF(LU_SEQ_RANGE_OST == 1, "found %lld\n", + (long long)LU_SEQ_RANGE_OST); + + /* Checks for struct lustre_mdt_attrs */ + LASSERTF((int)sizeof(struct lustre_mdt_attrs) == 24, "found %lld\n", + (long long)(int)sizeof(struct lustre_mdt_attrs)); + LASSERTF((int)offsetof(struct lustre_mdt_attrs, lma_compat) == 0, "found %lld\n", + (long long)(int)offsetof(struct lustre_mdt_attrs, lma_compat)); + LASSERTF((int)sizeof(((struct lustre_mdt_attrs *)0)->lma_compat) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_mdt_attrs *)0)->lma_compat)); + LASSERTF((int)offsetof(struct lustre_mdt_attrs, lma_incompat) == 4, "found %lld\n", + (long long)(int)offsetof(struct lustre_mdt_attrs, lma_incompat)); + LASSERTF((int)sizeof(((struct lustre_mdt_attrs *)0)->lma_incompat) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_mdt_attrs *)0)->lma_incompat)); + LASSERTF((int)offsetof(struct lustre_mdt_attrs, lma_self_fid) == 8, "found %lld\n", + (long long)(int)offsetof(struct lustre_mdt_attrs, lma_self_fid)); + LASSERTF((int)sizeof(((struct lustre_mdt_attrs *)0)->lma_self_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lustre_mdt_attrs *)0)->lma_self_fid)); + LASSERTF(LMAI_RELEASED == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)LMAI_RELEASED); + LASSERTF(LMAC_HSM == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)LMAC_HSM); + LASSERTF(LMAC_SOM == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)LMAC_SOM); + LASSERTF(LMAC_NOT_IN_OI == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)LMAC_NOT_IN_OI); + LASSERTF(LMAC_FID_ON_OST == 0x00000008UL, "found 0x%.8xUL\n", + (unsigned)LMAC_FID_ON_OST); + LASSERTF(OBJ_CREATE == 1, "found %lld\n", + (long long)OBJ_CREATE); + LASSERTF(OBJ_DESTROY == 2, "found %lld\n", + (long long)OBJ_DESTROY); + LASSERTF(OBJ_REF_ADD == 3, "found %lld\n", + (long long)OBJ_REF_ADD); + LASSERTF(OBJ_REF_DEL == 4, "found %lld\n", + (long long)OBJ_REF_DEL); + LASSERTF(OBJ_ATTR_SET == 5, "found %lld\n", + (long long)OBJ_ATTR_SET); + LASSERTF(OBJ_ATTR_GET == 6, "found %lld\n", + (long long)OBJ_ATTR_GET); + LASSERTF(OBJ_XATTR_SET == 7, "found %lld\n", + (long long)OBJ_XATTR_SET); + LASSERTF(OBJ_XATTR_GET == 8, "found %lld\n", + (long long)OBJ_XATTR_GET); + LASSERTF(OBJ_INDEX_LOOKUP == 9, "found %lld\n", + (long long)OBJ_INDEX_LOOKUP); + LASSERTF(OBJ_INDEX_LOOKUP == 9, "found %lld\n", + (long long)OBJ_INDEX_LOOKUP); + LASSERTF(OBJ_INDEX_INSERT == 10, "found %lld\n", + (long long)OBJ_INDEX_INSERT); + LASSERTF(OBJ_INDEX_DELETE == 11, "found %lld\n", + (long long)OBJ_INDEX_DELETE); + + /* Checks for struct ost_id */ + LASSERTF((int)sizeof(struct ost_id) == 16, "found %lld\n", + (long long)(int)sizeof(struct ost_id)); + LASSERTF((int)offsetof(struct ost_id, oi) == 0, "found %lld\n", + (long long)(int)offsetof(struct ost_id, oi)); + 
LASSERTF((int)sizeof(((struct ost_id *)0)->oi) == 16, "found %lld\n", + (long long)(int)sizeof(((struct ost_id *)0)->oi)); + LASSERTF(LUSTRE_FID_INIT_OID == 1, "found %lld\n", + (long long)LUSTRE_FID_INIT_OID); + LASSERTF(FID_SEQ_OST_MDT0 == 0, "found %lld\n", + (long long)FID_SEQ_OST_MDT0); + LASSERTF(FID_SEQ_LLOG == 1, "found %lld\n", + (long long)FID_SEQ_LLOG); + LASSERTF(FID_SEQ_ECHO == 2, "found %lld\n", + (long long)FID_SEQ_ECHO); + LASSERTF(FID_SEQ_OST_MDT1 == 3, "found %lld\n", + (long long)FID_SEQ_OST_MDT1); + LASSERTF(FID_SEQ_OST_MAX == 9, "found %lld\n", + (long long)FID_SEQ_OST_MAX); + LASSERTF(FID_SEQ_RSVD == 11, "found %lld\n", + (long long)FID_SEQ_RSVD); + LASSERTF(FID_SEQ_IGIF == 12, "found %lld\n", + (long long)FID_SEQ_IGIF); + LASSERTF(FID_SEQ_IGIF_MAX == 0x00000000ffffffffULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_IGIF_MAX); + LASSERTF(FID_SEQ_IDIF == 0x0000000100000000ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_IDIF); + LASSERTF(FID_SEQ_IDIF_MAX == 0x00000001ffffffffULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_IDIF_MAX); + LASSERTF(FID_SEQ_START == 0x0000000200000000ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_START); + LASSERTF(FID_SEQ_LOCAL_FILE == 0x0000000200000001ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_LOCAL_FILE); + LASSERTF(FID_SEQ_DOT_LUSTRE == 0x0000000200000002ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_DOT_LUSTRE); + LASSERTF(FID_SEQ_SPECIAL == 0x0000000200000004ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_SPECIAL); + LASSERTF(FID_SEQ_QUOTA == 0x0000000200000005ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_QUOTA); + LASSERTF(FID_SEQ_QUOTA_GLB == 0x0000000200000006ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_QUOTA_GLB); + LASSERTF(FID_SEQ_ROOT == 0x0000000200000007ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_ROOT); + LASSERTF(FID_SEQ_NORMAL == 0x0000000200000400ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_NORMAL); + LASSERTF(FID_SEQ_LOV_DEFAULT == 0xffffffffffffffffULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_LOV_DEFAULT); + LASSERTF(FID_OID_SPECIAL_BFL == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)FID_OID_SPECIAL_BFL); + LASSERTF(FID_OID_DOT_LUSTRE == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)FID_OID_DOT_LUSTRE); + LASSERTF(FID_OID_DOT_LUSTRE_OBF == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)FID_OID_DOT_LUSTRE_OBF); + + /* Checks for struct lu_dirent */ + LASSERTF((int)sizeof(struct lu_dirent) == 32, "found %lld\n", + (long long)(int)sizeof(struct lu_dirent)); + LASSERTF((int)offsetof(struct lu_dirent, lde_fid) == 0, "found %lld\n", + (long long)(int)offsetof(struct lu_dirent, lde_fid)); + LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirent *)0)->lde_fid)); + LASSERTF((int)offsetof(struct lu_dirent, lde_hash) == 16, "found %lld\n", + (long long)(int)offsetof(struct lu_dirent, lde_hash)); + LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_hash) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirent *)0)->lde_hash)); + LASSERTF((int)offsetof(struct lu_dirent, lde_reclen) == 24, "found %lld\n", + (long long)(int)offsetof(struct lu_dirent, lde_reclen)); + LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_reclen) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirent *)0)->lde_reclen)); + LASSERTF((int)offsetof(struct lu_dirent, lde_namelen) == 26, "found %lld\n", + (long long)(int)offsetof(struct lu_dirent, lde_namelen)); + LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_namelen) == 2, "found 
%lld\n", + (long long)(int)sizeof(((struct lu_dirent *)0)->lde_namelen)); + LASSERTF((int)offsetof(struct lu_dirent, lde_attrs) == 28, "found %lld\n", + (long long)(int)offsetof(struct lu_dirent, lde_attrs)); + LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_attrs) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirent *)0)->lde_attrs)); + LASSERTF((int)offsetof(struct lu_dirent, lde_name[0]) == 32, "found %lld\n", + (long long)(int)offsetof(struct lu_dirent, lde_name[0])); + LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_name[0]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirent *)0)->lde_name[0])); + LASSERTF(LUDA_FID == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)LUDA_FID); + LASSERTF(LUDA_TYPE == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)LUDA_TYPE); + LASSERTF(LUDA_64BITHASH == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)LUDA_64BITHASH); + + /* Checks for struct luda_type */ + LASSERTF((int)sizeof(struct luda_type) == 2, "found %lld\n", + (long long)(int)sizeof(struct luda_type)); + LASSERTF((int)offsetof(struct luda_type, lt_type) == 0, "found %lld\n", + (long long)(int)offsetof(struct luda_type, lt_type)); + LASSERTF((int)sizeof(((struct luda_type *)0)->lt_type) == 2, "found %lld\n", + (long long)(int)sizeof(((struct luda_type *)0)->lt_type)); + + /* Checks for struct lu_dirpage */ + LASSERTF((int)sizeof(struct lu_dirpage) == 24, "found %lld\n", + (long long)(int)sizeof(struct lu_dirpage)); + LASSERTF((int)offsetof(struct lu_dirpage, ldp_hash_start) == 0, "found %lld\n", + (long long)(int)offsetof(struct lu_dirpage, ldp_hash_start)); + LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_hash_start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_hash_start)); + LASSERTF((int)offsetof(struct lu_dirpage, ldp_hash_end) == 8, "found %lld\n", + (long long)(int)offsetof(struct lu_dirpage, ldp_hash_end)); + LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_hash_end) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_hash_end)); + LASSERTF((int)offsetof(struct lu_dirpage, ldp_flags) == 16, "found %lld\n", + (long long)(int)offsetof(struct lu_dirpage, ldp_flags)); + LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_flags)); + LASSERTF((int)offsetof(struct lu_dirpage, ldp_pad0) == 20, "found %lld\n", + (long long)(int)offsetof(struct lu_dirpage, ldp_pad0)); + LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_pad0) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_pad0)); + LASSERTF((int)offsetof(struct lu_dirpage, ldp_entries[0]) == 24, "found %lld\n", + (long long)(int)offsetof(struct lu_dirpage, ldp_entries[0])); + LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_entries[0]) == 32, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_entries[0])); + LASSERTF(LDF_EMPTY == 1, "found %lld\n", + (long long)LDF_EMPTY); + LASSERTF(LDF_COLLIDE == 2, "found %lld\n", + (long long)LDF_COLLIDE); + LASSERTF(LU_PAGE_SIZE == 4096, "found %lld\n", + (long long)LU_PAGE_SIZE); + /* Checks for union lu_page */ + LASSERTF((int)sizeof(union lu_page) == 4096, "found %lld\n", + (long long)(int)sizeof(union lu_page)); + + /* Checks for struct lustre_handle */ + LASSERTF((int)sizeof(struct lustre_handle) == 8, "found %lld\n", + (long long)(int)sizeof(struct lustre_handle)); + LASSERTF((int)offsetof(struct lustre_handle, cookie) == 0, "found %lld\n", + (long long)(int)offsetof(struct 
lustre_handle, cookie)); + LASSERTF((int)sizeof(((struct lustre_handle *)0)->cookie) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lustre_handle *)0)->cookie)); + + /* Checks for struct lustre_msg_v2 */ + LASSERTF((int)sizeof(struct lustre_msg_v2) == 32, "found %lld\n", + (long long)(int)sizeof(struct lustre_msg_v2)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_bufcount) == 0, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_bufcount)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_bufcount) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_bufcount)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_secflvr) == 4, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_secflvr)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_secflvr) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_secflvr)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_magic) == 8, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_magic)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_magic)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_repsize) == 12, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_repsize)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_repsize) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_repsize)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_cksum) == 16, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_cksum)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_cksum) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_cksum)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_flags) == 20, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_flags)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_flags)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_padding_2) == 24, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_padding_2)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_2)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_padding_3) == 28, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_padding_3)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_3) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_3)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_buflens[0]) == 32, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_buflens[0])); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_buflens[0]) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_buflens[0])); + LASSERTF(LUSTRE_MSG_MAGIC_V1 == 0x0BD00BD0, "found 0x%.8x\n", + LUSTRE_MSG_MAGIC_V1); + LASSERTF(LUSTRE_MSG_MAGIC_V2 == 0x0BD00BD3, "found 0x%.8x\n", + LUSTRE_MSG_MAGIC_V2); + LASSERTF(LUSTRE_MSG_MAGIC_V1_SWABBED == 0xD00BD00B, "found 0x%.8x\n", + LUSTRE_MSG_MAGIC_V1_SWABBED); + LASSERTF(LUSTRE_MSG_MAGIC_V2_SWABBED == 0xD30BD00B, "found 0x%.8x\n", + LUSTRE_MSG_MAGIC_V2_SWABBED); + + /* Checks for struct ptlrpc_body */ + LASSERTF((int)sizeof(struct ptlrpc_body_v3) == 184, "found %lld\n", + (long long)(int)sizeof(struct 
ptlrpc_body_v3)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_handle) == 0, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_handle)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_handle) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_handle)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_type) == 8, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_type)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_type) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_type)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_version) == 12, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_version)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_version) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_version)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_opc) == 16, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_opc)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_opc) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_opc)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_status) == 20, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_status)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_status) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_status)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_xid) == 24, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_last_xid)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_xid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_xid)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_seen) == 32, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_last_seen)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_seen) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_seen)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_committed) == 40, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_last_committed)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_committed) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_committed)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_transno) == 48, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_transno)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_transno) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_transno)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_flags) == 56, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_flags)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_flags)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_op_flags) == 60, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_op_flags)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_op_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_op_flags)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_conn_cnt) == 64, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_conn_cnt)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 
*)0)->pb_conn_cnt) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_conn_cnt)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_timeout) == 68, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_timeout)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_timeout) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_timeout)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_service_time) == 72, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_service_time)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_service_time) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_service_time)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_limit) == 76, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_limit)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_limit) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_limit)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_slv) == 80, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_slv)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_slv) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_slv)); + CLASSERT(PTLRPC_NUM_VERSIONS == 4); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_pre_versions) == 88, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_pre_versions)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_pre_versions) == 32, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_pre_versions)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding) == 120, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_padding)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding) == 32, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding)); + CLASSERT(JOBSTATS_JOBID_SIZE == 32); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_jobid) == 152, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_jobid)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_jobid) == 32, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_jobid)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_handle) == (int)offsetof(struct ptlrpc_body_v2, pb_handle), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_handle), (int)offsetof(struct ptlrpc_body_v2, pb_handle)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_handle) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_handle), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_handle), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_handle)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_type) == (int)offsetof(struct ptlrpc_body_v2, pb_type), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_type), (int)offsetof(struct ptlrpc_body_v2, pb_type)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_type) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_type), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_type), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_type)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_version) == (int)offsetof(struct ptlrpc_body_v2, pb_version), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_version), (int)offsetof(struct ptlrpc_body_v2, pb_version)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_version) == 
(int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_version), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_version), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_version)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_opc) == (int)offsetof(struct ptlrpc_body_v2, pb_opc), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_opc), (int)offsetof(struct ptlrpc_body_v2, pb_opc)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_opc) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_opc), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_opc), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_opc)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_status) == (int)offsetof(struct ptlrpc_body_v2, pb_status), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_status), (int)offsetof(struct ptlrpc_body_v2, pb_status)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_status) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_status), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_status), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_status)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_xid) == (int)offsetof(struct ptlrpc_body_v2, pb_last_xid), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_last_xid), (int)offsetof(struct ptlrpc_body_v2, pb_last_xid)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_xid) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_xid), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_xid), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_xid)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_seen) == (int)offsetof(struct ptlrpc_body_v2, pb_last_seen), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_last_seen), (int)offsetof(struct ptlrpc_body_v2, pb_last_seen)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_seen) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_seen), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_seen), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_seen)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_committed) == (int)offsetof(struct ptlrpc_body_v2, pb_last_committed), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_last_committed), (int)offsetof(struct ptlrpc_body_v2, pb_last_committed)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_committed) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_committed), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_committed), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_committed)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_transno) == (int)offsetof(struct ptlrpc_body_v2, pb_transno), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_transno), (int)offsetof(struct ptlrpc_body_v2, pb_transno)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_transno) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_transno), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_transno), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_transno)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_flags) == (int)offsetof(struct ptlrpc_body_v2, pb_flags), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_flags), (int)offsetof(struct ptlrpc_body_v2, pb_flags)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_flags) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_flags), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_flags), (int)sizeof(((struct ptlrpc_body_v2 
*)0)->pb_flags)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_op_flags) == (int)offsetof(struct ptlrpc_body_v2, pb_op_flags), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_op_flags), (int)offsetof(struct ptlrpc_body_v2, pb_op_flags)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_op_flags) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_op_flags), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_op_flags), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_op_flags)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_conn_cnt) == (int)offsetof(struct ptlrpc_body_v2, pb_conn_cnt), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_conn_cnt), (int)offsetof(struct ptlrpc_body_v2, pb_conn_cnt)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_conn_cnt) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_conn_cnt), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_conn_cnt), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_conn_cnt)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_timeout) == (int)offsetof(struct ptlrpc_body_v2, pb_timeout), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_timeout), (int)offsetof(struct ptlrpc_body_v2, pb_timeout)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_timeout) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_timeout), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_timeout), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_timeout)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_service_time) == (int)offsetof(struct ptlrpc_body_v2, pb_service_time), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_service_time), (int)offsetof(struct ptlrpc_body_v2, pb_service_time)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_service_time) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_service_time), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_service_time), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_service_time)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_limit) == (int)offsetof(struct ptlrpc_body_v2, pb_limit), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_limit), (int)offsetof(struct ptlrpc_body_v2, pb_limit)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_limit) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_limit), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_limit), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_limit)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_slv) == (int)offsetof(struct ptlrpc_body_v2, pb_slv), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_slv), (int)offsetof(struct ptlrpc_body_v2, pb_slv)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_slv) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_slv), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_slv), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_slv)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_pre_versions) == (int)offsetof(struct ptlrpc_body_v2, pb_pre_versions), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_pre_versions), (int)offsetof(struct ptlrpc_body_v2, pb_pre_versions)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_pre_versions) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_pre_versions), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_pre_versions), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_pre_versions)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding) == (int)offsetof(struct ptlrpc_body_v2, pb_padding), "%d != %d\n", + 
(int)offsetof(struct ptlrpc_body_v3, pb_padding), (int)offsetof(struct ptlrpc_body_v2, pb_padding)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding)); + LASSERTF(MSG_PTLRPC_BODY_OFF == 0, "found %lld\n", + (long long)MSG_PTLRPC_BODY_OFF); + LASSERTF(REQ_REC_OFF == 1, "found %lld\n", + (long long)REQ_REC_OFF); + LASSERTF(REPLY_REC_OFF == 1, "found %lld\n", + (long long)REPLY_REC_OFF); + LASSERTF(DLM_LOCKREQ_OFF == 1, "found %lld\n", + (long long)DLM_LOCKREQ_OFF); + LASSERTF(DLM_REQ_REC_OFF == 2, "found %lld\n", + (long long)DLM_REQ_REC_OFF); + LASSERTF(DLM_INTENT_IT_OFF == 2, "found %lld\n", + (long long)DLM_INTENT_IT_OFF); + LASSERTF(DLM_INTENT_REC_OFF == 3, "found %lld\n", + (long long)DLM_INTENT_REC_OFF); + LASSERTF(DLM_LOCKREPLY_OFF == 1, "found %lld\n", + (long long)DLM_LOCKREPLY_OFF); + LASSERTF(DLM_REPLY_REC_OFF == 2, "found %lld\n", + (long long)DLM_REPLY_REC_OFF); + LASSERTF(MSG_PTLRPC_HEADER_OFF == 31, "found %lld\n", + (long long)MSG_PTLRPC_HEADER_OFF); + LASSERTF(PTLRPC_MSG_VERSION == 0x00000003, "found 0x%.8x\n", + PTLRPC_MSG_VERSION); + LASSERTF(LUSTRE_VERSION_MASK == 0xffff0000, "found 0x%.8x\n", + LUSTRE_VERSION_MASK); + LASSERTF(LUSTRE_OBD_VERSION == 0x00010000, "found 0x%.8x\n", + LUSTRE_OBD_VERSION); + LASSERTF(LUSTRE_MDS_VERSION == 0x00020000, "found 0x%.8x\n", + LUSTRE_MDS_VERSION); + LASSERTF(LUSTRE_OST_VERSION == 0x00030000, "found 0x%.8x\n", + LUSTRE_OST_VERSION); + LASSERTF(LUSTRE_DLM_VERSION == 0x00040000, "found 0x%.8x\n", + LUSTRE_DLM_VERSION); + LASSERTF(LUSTRE_LOG_VERSION == 0x00050000, "found 0x%.8x\n", + LUSTRE_LOG_VERSION); + LASSERTF(LUSTRE_MGS_VERSION == 0x00060000, "found 0x%.8x\n", + LUSTRE_MGS_VERSION); + LASSERTF(MSGHDR_AT_SUPPORT == 1, "found %lld\n", + (long long)MSGHDR_AT_SUPPORT); + LASSERTF(MSGHDR_CKSUM_INCOMPAT18 == 2, "found %lld\n", + (long long)MSGHDR_CKSUM_INCOMPAT18); + LASSERTF(MSG_OP_FLAG_MASK == 0xffff0000UL, "found 0x%.8xUL\n", + (unsigned)MSG_OP_FLAG_MASK); + LASSERTF(MSG_OP_FLAG_SHIFT == 16, "found %lld\n", + (long long)MSG_OP_FLAG_SHIFT); + LASSERTF(MSG_GEN_FLAG_MASK == 0x0000ffffUL, "found 0x%.8xUL\n", + (unsigned)MSG_GEN_FLAG_MASK); + LASSERTF(MSG_LAST_REPLAY == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)MSG_LAST_REPLAY); + LASSERTF(MSG_RESENT == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)MSG_RESENT); + LASSERTF(MSG_REPLAY == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)MSG_REPLAY); + LASSERTF(MSG_DELAY_REPLAY == 0x00000010UL, "found 0x%.8xUL\n", + (unsigned)MSG_DELAY_REPLAY); + LASSERTF(MSG_VERSION_REPLAY == 0x00000020UL, "found 0x%.8xUL\n", + (unsigned)MSG_VERSION_REPLAY); + LASSERTF(MSG_REQ_REPLAY_DONE == 0x00000040UL, "found 0x%.8xUL\n", + (unsigned)MSG_REQ_REPLAY_DONE); + LASSERTF(MSG_LOCK_REPLAY_DONE == 0x00000080UL, "found 0x%.8xUL\n", + (unsigned)MSG_LOCK_REPLAY_DONE); + LASSERTF(MSG_CONNECT_RECOVERING == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_RECOVERING); + LASSERTF(MSG_CONNECT_RECONNECT == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_RECONNECT); + LASSERTF(MSG_CONNECT_REPLAYABLE == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_REPLAYABLE); + LASSERTF(MSG_CONNECT_LIBCLIENT == 0x00000010UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_LIBCLIENT); + LASSERTF(MSG_CONNECT_INITIAL == 0x00000020UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_INITIAL); + LASSERTF(MSG_CONNECT_ASYNC == 
0x00000040UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_ASYNC); + LASSERTF(MSG_CONNECT_NEXT_VER == 0x00000080UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_NEXT_VER); + LASSERTF(MSG_CONNECT_TRANSNO == 0x00000100UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_TRANSNO); + + /* Checks for struct obd_connect_data */ + LASSERTF((int)sizeof(struct obd_connect_data) == 192, "found %lld\n", + (long long)(int)sizeof(struct obd_connect_data)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_connect_flags) == 0, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_connect_flags)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_connect_flags) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_connect_flags)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_version) == 8, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_version)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_version) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_version)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_grant) == 12, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_grant)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_grant) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_grant)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_index) == 16, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_index)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_index)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_brw_size) == 20, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_brw_size)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_brw_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_brw_size)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_ibits_known) == 24, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_ibits_known)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_ibits_known) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_ibits_known)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_blocksize) == 32, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_blocksize)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_blocksize) == 1, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_blocksize)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_inodespace) == 33, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_inodespace)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_inodespace) == 1, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_inodespace)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_grant_extent) == 34, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_grant_extent)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_grant_extent) == 2, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_grant_extent)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_unused) == 36, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_unused)); + LASSERTF((int)sizeof(((struct obd_connect_data 
*)0)->ocd_unused) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_unused)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_transno) == 40, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_transno)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_transno) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_transno)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_group) == 48, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_group)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_group) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_group)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_cksum_types) == 52, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_cksum_types)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_cksum_types) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_cksum_types)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_max_easize) == 56, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_max_easize)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_max_easize) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_max_easize)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_instance) == 60, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_instance)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_instance) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_instance)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_maxbytes) == 64, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_maxbytes)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_maxbytes) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_maxbytes)); + LASSERTF((int)offsetof(struct obd_connect_data, padding1) == 72, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding1)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding1)); + LASSERTF((int)offsetof(struct obd_connect_data, padding2) == 80, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding2)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding2)); + LASSERTF((int)offsetof(struct obd_connect_data, padding3) == 88, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding3)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding3)); + LASSERTF((int)offsetof(struct obd_connect_data, padding4) == 96, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding4)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding4) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding4)); + LASSERTF((int)offsetof(struct obd_connect_data, padding5) == 104, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding5)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding5) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding5)); + 
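The obd_connect_data checks above pin the offset and size of every field so the on-wire layout cannot drift between builds or architectures. A minimal standalone sketch of the same idea, using C11 static_assert and an illustrative struct (wire_hdr and its fields are hypothetical, not Lustre types):

    #include <stddef.h>   /* offsetof */
    #include <stdint.h>
    #include <assert.h>   /* static_assert (C11) */

    /* Illustrative wire structure -- not a Lustre type. */
    struct wire_hdr {
            uint32_t w_magic;       /* bytes 0..3   */
            uint32_t w_len;         /* bytes 4..7   */
            uint64_t w_cookie;      /* bytes 8..15  */
    };

    /* Compile-time layout pinning, analogous to the LASSERTF offset/size
     * checks generated for obd_connect_data: a change in field order,
     * type width or padding breaks the build instead of the protocol. */
    static_assert(sizeof(struct wire_hdr) == 16, "wire_hdr size changed");
    static_assert(offsetof(struct wire_hdr, w_magic) == 0, "w_magic moved");
    static_assert(offsetof(struct wire_hdr, w_len) == 4, "w_len moved");
    static_assert(offsetof(struct wire_hdr, w_cookie) == 8, "w_cookie moved");

The generated Lustre checks use runtime LASSERTF calls instead, so that the offending value can be reported ("found %lld"), but the intent is the same: any change to a wire structure must be made deliberately, with the frozen constants updated to match.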
LASSERTF((int)offsetof(struct obd_connect_data, padding6) == 112, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding6)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding6) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding6)); + LASSERTF((int)offsetof(struct obd_connect_data, padding7) == 120, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding7)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding7) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding7)); + LASSERTF((int)offsetof(struct obd_connect_data, padding8) == 128, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding8)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding8) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding8)); + LASSERTF((int)offsetof(struct obd_connect_data, padding9) == 136, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding9)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding9) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding9)); + LASSERTF((int)offsetof(struct obd_connect_data, paddingA) == 144, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, paddingA)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingA) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingA)); + LASSERTF((int)offsetof(struct obd_connect_data, paddingB) == 152, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, paddingB)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingB) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingB)); + LASSERTF((int)offsetof(struct obd_connect_data, paddingC) == 160, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, paddingC)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingC) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingC)); + LASSERTF((int)offsetof(struct obd_connect_data, paddingD) == 168, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, paddingD)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingD) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingD)); + LASSERTF((int)offsetof(struct obd_connect_data, paddingE) == 176, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, paddingE)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingE) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingE)); + LASSERTF((int)offsetof(struct obd_connect_data, paddingF) == 184, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, paddingF)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingF) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingF)); + LASSERTF(OBD_CONNECT_RDONLY == 0x1ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_RDONLY); + LASSERTF(OBD_CONNECT_INDEX == 0x2ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_INDEX); + LASSERTF(OBD_CONNECT_MDS == 0x4ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_MDS); + LASSERTF(OBD_CONNECT_GRANT == 0x8ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_GRANT); + LASSERTF(OBD_CONNECT_SRVLOCK == 0x10ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_SRVLOCK); + LASSERTF(OBD_CONNECT_VERSION == 0x20ULL, "found 0x%.16llxULL\n", + 
OBD_CONNECT_VERSION); + LASSERTF(OBD_CONNECT_REQPORTAL == 0x40ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_REQPORTAL); + LASSERTF(OBD_CONNECT_ACL == 0x80ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_ACL); + LASSERTF(OBD_CONNECT_XATTR == 0x100ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_XATTR); + LASSERTF(OBD_CONNECT_CROW == 0x200ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_CROW); + LASSERTF(OBD_CONNECT_TRUNCLOCK == 0x400ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_TRUNCLOCK); + LASSERTF(OBD_CONNECT_TRANSNO == 0x800ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_TRANSNO); + LASSERTF(OBD_CONNECT_IBITS == 0x1000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_IBITS); + LASSERTF(OBD_CONNECT_JOIN == 0x2000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_JOIN); + LASSERTF(OBD_CONNECT_ATTRFID == 0x4000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_ATTRFID); + LASSERTF(OBD_CONNECT_NODEVOH == 0x8000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_NODEVOH); + LASSERTF(OBD_CONNECT_RMT_CLIENT == 0x10000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_RMT_CLIENT); + LASSERTF(OBD_CONNECT_RMT_CLIENT_FORCE == 0x20000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_RMT_CLIENT_FORCE); + LASSERTF(OBD_CONNECT_BRW_SIZE == 0x40000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_BRW_SIZE); + LASSERTF(OBD_CONNECT_QUOTA64 == 0x80000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_QUOTA64); + LASSERTF(OBD_CONNECT_MDS_CAPA == 0x100000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_MDS_CAPA); + LASSERTF(OBD_CONNECT_OSS_CAPA == 0x200000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_OSS_CAPA); + LASSERTF(OBD_CONNECT_CANCELSET == 0x400000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_CANCELSET); + LASSERTF(OBD_CONNECT_SOM == 0x800000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_SOM); + LASSERTF(OBD_CONNECT_AT == 0x1000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_AT); + LASSERTF(OBD_CONNECT_LRU_RESIZE == 0x2000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_LRU_RESIZE); + LASSERTF(OBD_CONNECT_MDS_MDS == 0x4000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_MDS_MDS); + LASSERTF(OBD_CONNECT_REAL == 0x8000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_REAL); + LASSERTF(OBD_CONNECT_CHANGE_QS == 0x10000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_CHANGE_QS); + LASSERTF(OBD_CONNECT_CKSUM == 0x20000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_CKSUM); + LASSERTF(OBD_CONNECT_FID == 0x40000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_FID); + LASSERTF(OBD_CONNECT_VBR == 0x80000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_VBR); + LASSERTF(OBD_CONNECT_LOV_V3 == 0x100000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_LOV_V3); + LASSERTF(OBD_CONNECT_GRANT_SHRINK == 0x200000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_GRANT_SHRINK); + LASSERTF(OBD_CONNECT_SKIP_ORPHAN == 0x400000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_SKIP_ORPHAN); + LASSERTF(OBD_CONNECT_MAX_EASIZE == 0x800000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_MAX_EASIZE); + LASSERTF(OBD_CONNECT_FULL20 == 0x1000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_FULL20); + LASSERTF(OBD_CONNECT_LAYOUTLOCK == 0x2000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_LAYOUTLOCK); + LASSERTF(OBD_CONNECT_64BITHASH == 0x4000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_64BITHASH); + LASSERTF(OBD_CONNECT_MAXBYTES == 0x8000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_MAXBYTES); + LASSERTF(OBD_CONNECT_IMP_RECOV == 0x10000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_IMP_RECOV); + LASSERTF(OBD_CONNECT_JOBSTATS == 0x20000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_JOBSTATS); + LASSERTF(OBD_CONNECT_UMASK == 0x40000000000ULL, "found 0x%.16llxULL\n", + 
OBD_CONNECT_UMASK); + LASSERTF(OBD_CONNECT_EINPROGRESS == 0x80000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_EINPROGRESS); + LASSERTF(OBD_CONNECT_GRANT_PARAM == 0x100000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_GRANT_PARAM); + LASSERTF(OBD_CONNECT_FLOCK_OWNER == 0x200000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_FLOCK_OWNER); + LASSERTF(OBD_CONNECT_LVB_TYPE == 0x400000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_LVB_TYPE); + LASSERTF(OBD_CONNECT_NANOSEC_TIME == 0x800000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_NANOSEC_TIME); + LASSERTF(OBD_CONNECT_LIGHTWEIGHT == 0x1000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_LIGHTWEIGHT); + LASSERTF(OBD_CONNECT_SHORTIO == 0x2000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_SHORTIO); + LASSERTF(OBD_CONNECT_PINGLESS == 0x4000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_PINGLESS); + LASSERTF(OBD_CONNECT_FLOCK_DEAD == 0x8000000000000ULL, + "found 0x%.16llxULL\n", OBD_CONNECT_FLOCK_DEAD); + LASSERTF(OBD_CKSUM_CRC32 == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)OBD_CKSUM_CRC32); + LASSERTF(OBD_CKSUM_ADLER == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)OBD_CKSUM_ADLER); + LASSERTF(OBD_CKSUM_CRC32C == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)OBD_CKSUM_CRC32C); + + /* Checks for struct obdo */ + LASSERTF((int)sizeof(struct obdo) == 208, "found %lld\n", + (long long)(int)sizeof(struct obdo)); + LASSERTF((int)offsetof(struct obdo, o_valid) == 0, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_valid)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_valid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_valid)); + LASSERTF((int)offsetof(struct obdo, o_oi) == 8, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_oi)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_oi) == 16, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_oi)); + LASSERTF((int)offsetof(struct obdo, o_parent_seq) == 24, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_parent_seq)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_parent_seq) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_parent_seq)); + LASSERTF((int)offsetof(struct obdo, o_size) == 32, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_size)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_size)); + LASSERTF((int)offsetof(struct obdo, o_mtime) == 40, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_mtime)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_mtime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_mtime)); + LASSERTF((int)offsetof(struct obdo, o_atime) == 48, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_atime)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_atime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_atime)); + LASSERTF((int)offsetof(struct obdo, o_ctime) == 56, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_ctime)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_ctime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_ctime)); + LASSERTF((int)offsetof(struct obdo, o_blocks) == 64, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_blocks)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_blocks) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_blocks)); + LASSERTF((int)offsetof(struct obdo, o_grant) == 72, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_grant)); + 
LASSERTF((int)sizeof(((struct obdo *)0)->o_grant) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_grant)); + LASSERTF((int)offsetof(struct obdo, o_blksize) == 80, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_blksize)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_blksize) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_blksize)); + LASSERTF((int)offsetof(struct obdo, o_mode) == 84, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_mode)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_mode)); + LASSERTF((int)offsetof(struct obdo, o_uid) == 88, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_uid)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_uid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_uid)); + LASSERTF((int)offsetof(struct obdo, o_gid) == 92, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_gid)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_gid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_gid)); + LASSERTF((int)offsetof(struct obdo, o_flags) == 96, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_flags)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_flags)); + LASSERTF((int)offsetof(struct obdo, o_nlink) == 100, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_nlink)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_nlink) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_nlink)); + LASSERTF((int)offsetof(struct obdo, o_parent_oid) == 104, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_parent_oid)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_parent_oid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_parent_oid)); + LASSERTF((int)offsetof(struct obdo, o_misc) == 108, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_misc)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_misc) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_misc)); + LASSERTF((int)offsetof(struct obdo, o_ioepoch) == 112, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_ioepoch)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_ioepoch) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_ioepoch)); + LASSERTF((int)offsetof(struct obdo, o_stripe_idx) == 120, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_stripe_idx)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_stripe_idx) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_stripe_idx)); + LASSERTF((int)offsetof(struct obdo, o_parent_ver) == 124, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_parent_ver)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_parent_ver) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_parent_ver)); + LASSERTF((int)offsetof(struct obdo, o_handle) == 128, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_handle)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_handle) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_handle)); + LASSERTF((int)offsetof(struct obdo, o_lcookie) == 136, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_lcookie)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_lcookie) == 32, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_lcookie)); + LASSERTF((int)offsetof(struct obdo, o_uid_h) == 168, "found %lld\n", + (long 
long)(int)offsetof(struct obdo, o_uid_h)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_uid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_uid_h)); + LASSERTF((int)offsetof(struct obdo, o_gid_h) == 172, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_gid_h)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_gid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_gid_h)); + LASSERTF((int)offsetof(struct obdo, o_data_version) == 176, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_data_version)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_data_version) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_data_version)); + LASSERTF((int)offsetof(struct obdo, o_padding_4) == 184, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_padding_4)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_4) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_padding_4)); + LASSERTF((int)offsetof(struct obdo, o_padding_5) == 192, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_padding_5)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_5) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_padding_5)); + LASSERTF((int)offsetof(struct obdo, o_padding_6) == 200, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_padding_6)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_6) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_padding_6)); + LASSERTF(OBD_MD_FLID == (0x00000001ULL), "found 0x%.16llxULL\n", + OBD_MD_FLID); + LASSERTF(OBD_MD_FLATIME == (0x00000002ULL), "found 0x%.16llxULL\n", + OBD_MD_FLATIME); + LASSERTF(OBD_MD_FLMTIME == (0x00000004ULL), "found 0x%.16llxULL\n", + OBD_MD_FLMTIME); + LASSERTF(OBD_MD_FLCTIME == (0x00000008ULL), "found 0x%.16llxULL\n", + OBD_MD_FLCTIME); + LASSERTF(OBD_MD_FLSIZE == (0x00000010ULL), "found 0x%.16llxULL\n", + OBD_MD_FLSIZE); + LASSERTF(OBD_MD_FLBLOCKS == (0x00000020ULL), "found 0x%.16llxULL\n", + OBD_MD_FLBLOCKS); + LASSERTF(OBD_MD_FLBLKSZ == (0x00000040ULL), "found 0x%.16llxULL\n", + OBD_MD_FLBLKSZ); + LASSERTF(OBD_MD_FLMODE == (0x00000080ULL), "found 0x%.16llxULL\n", + OBD_MD_FLMODE); + LASSERTF(OBD_MD_FLTYPE == (0x00000100ULL), "found 0x%.16llxULL\n", + OBD_MD_FLTYPE); + LASSERTF(OBD_MD_FLUID == (0x00000200ULL), "found 0x%.16llxULL\n", + OBD_MD_FLUID); + LASSERTF(OBD_MD_FLGID == (0x00000400ULL), "found 0x%.16llxULL\n", + OBD_MD_FLGID); + LASSERTF(OBD_MD_FLFLAGS == (0x00000800ULL), "found 0x%.16llxULL\n", + OBD_MD_FLFLAGS); + LASSERTF(OBD_MD_FLNLINK == (0x00002000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLNLINK); + LASSERTF(OBD_MD_FLGENER == (0x00004000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLGENER); + LASSERTF(OBD_MD_FLRDEV == (0x00010000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLRDEV); + LASSERTF(OBD_MD_FLEASIZE == (0x00020000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLEASIZE); + LASSERTF(OBD_MD_LINKNAME == (0x00040000ULL), "found 0x%.16llxULL\n", + OBD_MD_LINKNAME); + LASSERTF(OBD_MD_FLHANDLE == (0x00080000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLHANDLE); + LASSERTF(OBD_MD_FLCKSUM == (0x00100000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLCKSUM); + LASSERTF(OBD_MD_FLQOS == (0x00200000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLQOS); + LASSERTF(OBD_MD_FLCOOKIE == (0x00800000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLCOOKIE); + LASSERTF(OBD_MD_FLGROUP == (0x01000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLGROUP); + LASSERTF(OBD_MD_FLFID == (0x02000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLFID); + 
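Each OBD_MD_* constant above is a bit in obdo.o_valid recording which fields of the obdo actually carry data, and every check follows the same pattern: compare the compiled-in value against the value frozen when the protocol was defined, printing what was actually found on mismatch. A rough, self-contained illustration of that pattern with a hypothetical CHECKF macro standing in for LASSERTF (the real macro lives in libcfs and hooks into the kernel debugging machinery):

    #include <stdio.h>
    #include <stdlib.h>

    /* Hypothetical stand-in for LASSERTF: report the mismatch, then abort.
     * The generated checks pass the observed value as an argument so the
     * message shows what was expected (the condition) and what was found. */
    #define CHECKF(cond, fmt, ...)                                          \
            do {                                                            \
                    if (!(cond)) {                                          \
                            fprintf(stderr, "check '%s' failed: " fmt,      \
                                    #cond, __VA_ARGS__);                    \
                            abort();                                        \
                    }                                                       \
            } while (0)

    #define OBD_MD_FLID (0x00000001ULL)   /* value frozen on the wire */

    int main(void)
    {
            CHECKF(OBD_MD_FLID == 0x00000001ULL, "found 0x%.16llxULL\n",
                   (unsigned long long)OBD_MD_FLID);
            return 0;
    }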
LASSERTF(OBD_MD_FLEPOCH == (0x04000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLEPOCH); + LASSERTF(OBD_MD_FLGRANT == (0x08000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLGRANT); + LASSERTF(OBD_MD_FLDIREA == (0x10000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLDIREA); + LASSERTF(OBD_MD_FLUSRQUOTA == (0x20000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLUSRQUOTA); + LASSERTF(OBD_MD_FLGRPQUOTA == (0x40000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLGRPQUOTA); + LASSERTF(OBD_MD_FLMODEASIZE == (0x80000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLMODEASIZE); + LASSERTF(OBD_MD_MDS == (0x0000000100000000ULL), "found 0x%.16llxULL\n", + OBD_MD_MDS); + LASSERTF(OBD_MD_REINT == (0x0000000200000000ULL), "found 0x%.16llxULL\n", + OBD_MD_REINT); + LASSERTF(OBD_MD_MEA == (0x0000000400000000ULL), "found 0x%.16llxULL\n", + OBD_MD_MEA); + LASSERTF(OBD_MD_TSTATE == (0x0000000800000000ULL), + "found 0x%.16llxULL\n", OBD_MD_TSTATE); + LASSERTF(OBD_MD_FLXATTR == (0x0000001000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLXATTR); + LASSERTF(OBD_MD_FLXATTRLS == (0x0000002000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLXATTRLS); + LASSERTF(OBD_MD_FLXATTRRM == (0x0000004000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLXATTRRM); + LASSERTF(OBD_MD_FLACL == (0x0000008000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLACL); + LASSERTF(OBD_MD_FLRMTPERM == (0x0000010000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLRMTPERM); + LASSERTF(OBD_MD_FLMDSCAPA == (0x0000020000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLMDSCAPA); + LASSERTF(OBD_MD_FLOSSCAPA == (0x0000040000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLOSSCAPA); + LASSERTF(OBD_MD_FLCKSPLIT == (0x0000080000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLCKSPLIT); + LASSERTF(OBD_MD_FLCROSSREF == (0x0000100000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLCROSSREF); + LASSERTF(OBD_MD_FLGETATTRLOCK == (0x0000200000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLGETATTRLOCK); + LASSERTF(OBD_MD_FLRMTLSETFACL == (0x0001000000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLRMTLSETFACL); + LASSERTF(OBD_MD_FLRMTLGETFACL == (0x0002000000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLRMTLGETFACL); + LASSERTF(OBD_MD_FLRMTRSETFACL == (0x0004000000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLRMTRSETFACL); + LASSERTF(OBD_MD_FLRMTRGETFACL == (0x0008000000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLRMTRGETFACL); + LASSERTF(OBD_MD_FLDATAVERSION == (0x0010000000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLDATAVERSION); + CLASSERT(OBD_FL_INLINEDATA == 0x00000001); + CLASSERT(OBD_FL_OBDMDEXISTS == 0x00000002); + CLASSERT(OBD_FL_DELORPHAN == 0x00000004); + CLASSERT(OBD_FL_NORPC == 0x00000008); + CLASSERT(OBD_FL_IDONLY == 0x00000010); + CLASSERT(OBD_FL_RECREATE_OBJS == 0x00000020); + CLASSERT(OBD_FL_DEBUG_CHECK == 0x00000040); + CLASSERT(OBD_FL_NO_USRQUOTA == 0x00000100); + CLASSERT(OBD_FL_NO_GRPQUOTA == 0x00000200); + CLASSERT(OBD_FL_CREATE_CROW == 0x00000400); + CLASSERT(OBD_FL_SRVLOCK == 0x00000800); + CLASSERT(OBD_FL_CKSUM_CRC32 == 0x00001000); + CLASSERT(OBD_FL_CKSUM_ADLER == 0x00002000); + CLASSERT(OBD_FL_CKSUM_CRC32C == 0x00004000); + CLASSERT(OBD_FL_CKSUM_RSVD2 == 0x00008000); + CLASSERT(OBD_FL_CKSUM_RSVD3 == 0x00010000); + CLASSERT(OBD_FL_SHRINK_GRANT == 0x00020000); + CLASSERT(OBD_FL_MMAP == 0x00040000); + CLASSERT(OBD_FL_RECOV_RESEND == 0x00080000); + CLASSERT(OBD_FL_NOSPC_BLK == 0x00100000); + CLASSERT(OBD_FL_LOCAL_MASK == 0xf0000000); + + /* Checks for struct lov_ost_data_v1 */ + LASSERTF((int)sizeof(struct lov_ost_data_v1) == 24, "found %lld\n", + (long 
long)(int)sizeof(struct lov_ost_data_v1)); + LASSERTF((int)offsetof(struct lov_ost_data_v1, l_ost_oi) == 0, "found %lld\n", + (long long)(int)offsetof(struct lov_ost_data_v1, l_ost_oi)); + LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_oi) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_oi)); + LASSERTF((int)offsetof(struct lov_ost_data_v1, l_ost_gen) == 16, "found %lld\n", + (long long)(int)offsetof(struct lov_ost_data_v1, l_ost_gen)); + LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_gen) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_gen)); + LASSERTF((int)offsetof(struct lov_ost_data_v1, l_ost_idx) == 20, "found %lld\n", + (long long)(int)offsetof(struct lov_ost_data_v1, l_ost_idx)); + LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_idx) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_idx)); + + /* Checks for struct lov_mds_md_v1 */ + LASSERTF((int)sizeof(struct lov_mds_md_v1) == 32, "found %lld\n", + (long long)(int)sizeof(struct lov_mds_md_v1)); + LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v1, lmm_magic)); + LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_magic)); + LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_pattern) == 4, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v1, lmm_pattern)); + LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_pattern) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_pattern)); + LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_oi) == 8, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v1, lmm_oi)); + LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_oi) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_oi)); + LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_stripe_size) == 24, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v1, lmm_stripe_size)); + LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_stripe_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_stripe_size)); + LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_stripe_count) == 28, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v1, lmm_stripe_count)); + LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_stripe_count) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_stripe_count)); + LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_layout_gen) == 30, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v1, lmm_layout_gen)); + LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_layout_gen) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_layout_gen)); + LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_objects[0]) == 32, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v1, lmm_objects[0])); + LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_objects[0]) == 24, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_objects[0])); + CLASSERT(LOV_MAGIC_V1 == 0x0BD10BD0); + + /* Checks for struct lov_mds_md_v3 */ + LASSERTF((int)sizeof(struct lov_mds_md_v3) == 48, "found %lld\n", + (long long)(int)sizeof(struct lov_mds_md_v3)); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_magic) == 0, "found %lld\n", + (long 
long)(int)offsetof(struct lov_mds_md_v3, lmm_magic)); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_magic)); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_pattern) == 4, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_pattern)); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pattern) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pattern)); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_oi) == 8, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_oi)); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_oi) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_oi)); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_stripe_size) == 24, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_stripe_size)); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_size)); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_stripe_count) == 28, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_stripe_count)); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_count) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_count)); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_layout_gen) == 30, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_layout_gen)); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_layout_gen) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_layout_gen)); + CLASSERT(LOV_MAXPOOLNAME == 16); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_pool_name[16]) == 48, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_pool_name[16])); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pool_name[16]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pool_name[16])); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_objects[0]) == 48, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_objects[0])); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_objects[0]) == 24, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_objects[0])); + CLASSERT(LOV_MAGIC_V3 == 0x0BD30BD0); + LASSERTF(LOV_PATTERN_RAID0 == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)LOV_PATTERN_RAID0); + LASSERTF(LOV_PATTERN_RAID1 == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)LOV_PATTERN_RAID1); + LASSERTF(LOV_PATTERN_FIRST == 0x00000100UL, "found 0x%.8xUL\n", + (unsigned)LOV_PATTERN_FIRST); + LASSERTF(LOV_PATTERN_CMOBD == 0x00000200UL, "found 0x%.8xUL\n", + (unsigned)LOV_PATTERN_CMOBD); + + /* Checks for struct obd_statfs */ + LASSERTF((int)sizeof(struct obd_statfs) == 144, "found %lld\n", + (long long)(int)sizeof(struct obd_statfs)); + LASSERTF((int)offsetof(struct obd_statfs, os_type) == 0, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_type)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_type) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_type)); + LASSERTF((int)offsetof(struct obd_statfs, os_blocks) == 8, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_blocks)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_blocks) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs 
*)0)->os_blocks)); + LASSERTF((int)offsetof(struct obd_statfs, os_bfree) == 16, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_bfree)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_bfree) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_bfree)); + LASSERTF((int)offsetof(struct obd_statfs, os_bavail) == 24, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_bavail)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_bavail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_bavail)); + LASSERTF((int)offsetof(struct obd_statfs, os_ffree) == 40, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_ffree)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_ffree) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_ffree)); + LASSERTF((int)offsetof(struct obd_statfs, os_fsid) == 48, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_fsid)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_fsid) == 40, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_fsid)); + LASSERTF((int)offsetof(struct obd_statfs, os_bsize) == 88, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_bsize)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_bsize) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_bsize)); + LASSERTF((int)offsetof(struct obd_statfs, os_namelen) == 92, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_namelen)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_namelen) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_namelen)); + LASSERTF((int)offsetof(struct obd_statfs, os_state) == 104, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_state)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_state) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_state)); + LASSERTF((int)offsetof(struct obd_statfs, os_fprecreated) == 108, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_fprecreated)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_fprecreated) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_fprecreated)); + LASSERTF((int)offsetof(struct obd_statfs, os_spare2) == 112, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_spare2)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare2)); + LASSERTF((int)offsetof(struct obd_statfs, os_spare3) == 116, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_spare3)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare3) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare3)); + LASSERTF((int)offsetof(struct obd_statfs, os_spare4) == 120, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_spare4)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare4) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare4)); + LASSERTF((int)offsetof(struct obd_statfs, os_spare5) == 124, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_spare5)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare5) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare5)); + LASSERTF((int)offsetof(struct obd_statfs, os_spare6) == 128, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, 
os_spare6)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare6) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare6)); + LASSERTF((int)offsetof(struct obd_statfs, os_spare7) == 132, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_spare7)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare7) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare7)); + LASSERTF((int)offsetof(struct obd_statfs, os_spare8) == 136, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_spare8)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare8) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare8)); + LASSERTF((int)offsetof(struct obd_statfs, os_spare9) == 140, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_spare9)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare9) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare9)); + + /* Checks for struct obd_ioobj */ + LASSERTF((int)sizeof(struct obd_ioobj) == 24, "found %lld\n", + (long long)(int)sizeof(struct obd_ioobj)); + LASSERTF((int)offsetof(struct obd_ioobj, ioo_oid) == 0, "found %lld\n", + (long long)(int)offsetof(struct obd_ioobj, ioo_oid)); + LASSERTF((int)sizeof(((struct obd_ioobj *)0)->ioo_oid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct obd_ioobj *)0)->ioo_oid)); + LASSERTF((int)offsetof(struct obd_ioobj, ioo_max_brw) == 16, "found %lld\n", + (long long)(int)offsetof(struct obd_ioobj, ioo_max_brw)); + LASSERTF((int)sizeof(((struct obd_ioobj *)0)->ioo_max_brw) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_ioobj *)0)->ioo_max_brw)); + LASSERTF((int)offsetof(struct obd_ioobj, ioo_bufcnt) == 20, "found %lld\n", + (long long)(int)offsetof(struct obd_ioobj, ioo_bufcnt)); + LASSERTF((int)sizeof(((struct obd_ioobj *)0)->ioo_bufcnt) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_ioobj *)0)->ioo_bufcnt)); + + /* Checks for union lquota_id */ + LASSERTF((int)sizeof(union lquota_id) == 16, "found %lld\n", + (long long)(int)sizeof(union lquota_id)); + + LASSERTF(QUOTABLOCK_BITS == 10, "found %lld\n", + (long long)QUOTABLOCK_BITS); + LASSERTF(QUOTABLOCK_SIZE == 1024, "found %lld\n", + (long long)QUOTABLOCK_SIZE); + + /* Checks for struct obd_quotactl */ + LASSERTF((int)sizeof(struct obd_quotactl) == 112, "found %lld\n", + (long long)(int)sizeof(struct obd_quotactl)); + LASSERTF((int)offsetof(struct obd_quotactl, qc_cmd) == 0, "found %lld\n", + (long long)(int)offsetof(struct obd_quotactl, qc_cmd)); + LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_cmd) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_cmd)); + LASSERTF((int)offsetof(struct obd_quotactl, qc_type) == 4, "found %lld\n", + (long long)(int)offsetof(struct obd_quotactl, qc_type)); + LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_type) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_type)); + LASSERTF((int)offsetof(struct obd_quotactl, qc_id) == 8, "found %lld\n", + (long long)(int)offsetof(struct obd_quotactl, qc_id)); + LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_id)); + LASSERTF((int)offsetof(struct obd_quotactl, qc_stat) == 12, "found %lld\n", + (long long)(int)offsetof(struct obd_quotactl, qc_stat)); + LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_stat) == 4, "found %lld\n", + (long long)(int)sizeof(((struct 
obd_quotactl *)0)->qc_stat)); + LASSERTF((int)offsetof(struct obd_quotactl, qc_dqinfo) == 16, "found %lld\n", + (long long)(int)offsetof(struct obd_quotactl, qc_dqinfo)); + LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_dqinfo) == 24, "found %lld\n", + (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_dqinfo)); + LASSERTF((int)offsetof(struct obd_quotactl, qc_dqblk) == 40, "found %lld\n", + (long long)(int)offsetof(struct obd_quotactl, qc_dqblk)); + LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_dqblk) == 72, "found %lld\n", + (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_dqblk)); + + /* Checks for struct obd_dqinfo */ + LASSERTF((int)sizeof(struct obd_dqinfo) == 24, "found %lld\n", + (long long)(int)sizeof(struct obd_dqinfo)); + LASSERTF((int)offsetof(struct obd_dqinfo, dqi_bgrace) == 0, "found %lld\n", + (long long)(int)offsetof(struct obd_dqinfo, dqi_bgrace)); + LASSERTF((int)sizeof(((struct obd_dqinfo *)0)->dqi_bgrace) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqinfo *)0)->dqi_bgrace)); + LASSERTF((int)offsetof(struct obd_dqinfo, dqi_igrace) == 8, "found %lld\n", + (long long)(int)offsetof(struct obd_dqinfo, dqi_igrace)); + LASSERTF((int)sizeof(((struct obd_dqinfo *)0)->dqi_igrace) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqinfo *)0)->dqi_igrace)); + LASSERTF((int)offsetof(struct obd_dqinfo, dqi_flags) == 16, "found %lld\n", + (long long)(int)offsetof(struct obd_dqinfo, dqi_flags)); + LASSERTF((int)sizeof(((struct obd_dqinfo *)0)->dqi_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqinfo *)0)->dqi_flags)); + LASSERTF((int)offsetof(struct obd_dqinfo, dqi_valid) == 20, "found %lld\n", + (long long)(int)offsetof(struct obd_dqinfo, dqi_valid)); + LASSERTF((int)sizeof(((struct obd_dqinfo *)0)->dqi_valid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqinfo *)0)->dqi_valid)); + + /* Checks for struct obd_dqblk */ + LASSERTF((int)sizeof(struct obd_dqblk) == 72, "found %lld\n", + (long long)(int)sizeof(struct obd_dqblk)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_bhardlimit) == 0, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_bhardlimit)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_bhardlimit) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_bhardlimit)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_bsoftlimit) == 8, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_bsoftlimit)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_bsoftlimit) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_bsoftlimit)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_curspace) == 16, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_curspace)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_curspace) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_curspace)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_ihardlimit) == 24, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_ihardlimit)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_ihardlimit) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_ihardlimit)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_isoftlimit) == 32, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_isoftlimit)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_isoftlimit) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_isoftlimit)); + 
LASSERTF((int)offsetof(struct obd_dqblk, dqb_curinodes) == 40, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_curinodes)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_curinodes) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_curinodes)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_btime) == 48, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_btime)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_btime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_btime)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_itime) == 56, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_itime)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_itime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_itime)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_valid) == 64, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_valid)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_valid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_valid)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_padding) == 68, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_padding)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_padding)); + LASSERTF(Q_QUOTACHECK == 0x800100, "found 0x%.8x\n", + Q_QUOTACHECK); + LASSERTF(Q_INITQUOTA == 0x800101, "found 0x%.8x\n", + Q_INITQUOTA); + LASSERTF(Q_GETOINFO == 0x800102, "found 0x%.8x\n", + Q_GETOINFO); + LASSERTF(Q_GETOQUOTA == 0x800103, "found 0x%.8x\n", + Q_GETOQUOTA); + LASSERTF(Q_FINVALIDATE == 0x800104, "found 0x%.8x\n", + Q_FINVALIDATE); + + /* Checks for struct lquota_acct_rec */ + LASSERTF((int)sizeof(struct lquota_acct_rec) == 16, "found %lld\n", + (long long)(int)sizeof(struct lquota_acct_rec)); + LASSERTF((int)offsetof(struct lquota_acct_rec, bspace) == 0, "found %lld\n", + (long long)(int)offsetof(struct lquota_acct_rec, bspace)); + LASSERTF((int)sizeof(((struct lquota_acct_rec *)0)->bspace) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_acct_rec *)0)->bspace)); + LASSERTF((int)offsetof(struct lquota_acct_rec, ispace) == 8, "found %lld\n", + (long long)(int)offsetof(struct lquota_acct_rec, ispace)); + LASSERTF((int)sizeof(((struct lquota_acct_rec *)0)->ispace) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_acct_rec *)0)->ispace)); + + /* Checks for struct lquota_glb_rec */ + LASSERTF((int)sizeof(struct lquota_glb_rec) == 32, "found %lld\n", + (long long)(int)sizeof(struct lquota_glb_rec)); + LASSERTF((int)offsetof(struct lquota_glb_rec, qbr_hardlimit) == 0, "found %lld\n", + (long long)(int)offsetof(struct lquota_glb_rec, qbr_hardlimit)); + LASSERTF((int)sizeof(((struct lquota_glb_rec *)0)->qbr_hardlimit) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_glb_rec *)0)->qbr_hardlimit)); + LASSERTF((int)offsetof(struct lquota_glb_rec, qbr_softlimit) == 8, "found %lld\n", + (long long)(int)offsetof(struct lquota_glb_rec, qbr_softlimit)); + LASSERTF((int)sizeof(((struct lquota_glb_rec *)0)->qbr_softlimit) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_glb_rec *)0)->qbr_softlimit)); + LASSERTF((int)offsetof(struct lquota_glb_rec, qbr_time) == 16, "found %lld\n", + (long long)(int)offsetof(struct lquota_glb_rec, qbr_time)); + LASSERTF((int)sizeof(((struct lquota_glb_rec *)0)->qbr_time) == 8, "found %lld\n", + (long 
long)(int)sizeof(((struct lquota_glb_rec *)0)->qbr_time)); + LASSERTF((int)offsetof(struct lquota_glb_rec, qbr_granted) == 24, "found %lld\n", + (long long)(int)offsetof(struct lquota_glb_rec, qbr_granted)); + LASSERTF((int)sizeof(((struct lquota_glb_rec *)0)->qbr_granted) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_glb_rec *)0)->qbr_granted)); + + /* Checks for struct lquota_slv_rec */ + LASSERTF((int)sizeof(struct lquota_slv_rec) == 8, "found %lld\n", + (long long)(int)sizeof(struct lquota_slv_rec)); + LASSERTF((int)offsetof(struct lquota_slv_rec, qsr_granted) == 0, "found %lld\n", + (long long)(int)offsetof(struct lquota_slv_rec, qsr_granted)); + LASSERTF((int)sizeof(((struct lquota_slv_rec *)0)->qsr_granted) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_slv_rec *)0)->qsr_granted)); + + /* Checks for struct idx_info */ + LASSERTF((int)sizeof(struct idx_info) == 80, "found %lld\n", + (long long)(int)sizeof(struct idx_info)); + LASSERTF((int)offsetof(struct idx_info, ii_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_magic)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_magic)); + LASSERTF((int)offsetof(struct idx_info, ii_flags) == 4, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_flags)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_flags)); + LASSERTF((int)offsetof(struct idx_info, ii_count) == 8, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_count)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_count) == 2, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_count)); + LASSERTF((int)offsetof(struct idx_info, ii_pad0) == 10, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_pad0)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_pad0) == 2, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_pad0)); + LASSERTF((int)offsetof(struct idx_info, ii_attrs) == 12, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_attrs)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_attrs) == 4, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_attrs)); + LASSERTF((int)offsetof(struct idx_info, ii_fid) == 16, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_fid)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_fid)); + LASSERTF((int)offsetof(struct idx_info, ii_version) == 32, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_version)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_version) == 8, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_version)); + LASSERTF((int)offsetof(struct idx_info, ii_hash_start) == 40, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_hash_start)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_hash_start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_hash_start)); + LASSERTF((int)offsetof(struct idx_info, ii_hash_end) == 48, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_hash_end)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_hash_end) == 8, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_hash_end)); + LASSERTF((int)offsetof(struct idx_info, ii_keysize) == 56, "found %lld\n", + (long long)(int)offsetof(struct 
idx_info, ii_keysize)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_keysize) == 2, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_keysize)); + LASSERTF((int)offsetof(struct idx_info, ii_recsize) == 58, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_recsize)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_recsize) == 2, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_recsize)); + LASSERTF((int)offsetof(struct idx_info, ii_pad1) == 60, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_pad1)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_pad1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_pad1)); + LASSERTF((int)offsetof(struct idx_info, ii_pad2) == 64, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_pad2)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_pad2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_pad2)); + LASSERTF((int)offsetof(struct idx_info, ii_pad3) == 72, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_pad3)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_pad3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_pad3)); + CLASSERT(IDX_INFO_MAGIC == 0x3D37CC37); + + /* Checks for struct lu_idxpage */ + LASSERTF((int)sizeof(struct lu_idxpage) == 16, "found %lld\n", + (long long)(int)sizeof(struct lu_idxpage)); + LASSERTF((int)offsetof(struct lu_idxpage, lip_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct lu_idxpage, lip_magic)); + LASSERTF((int)sizeof(((struct lu_idxpage *)0)->lip_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_idxpage *)0)->lip_magic)); + LASSERTF((int)offsetof(struct lu_idxpage, lip_flags) == 4, "found %lld\n", + (long long)(int)offsetof(struct lu_idxpage, lip_flags)); + LASSERTF((int)sizeof(((struct lu_idxpage *)0)->lip_flags) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lu_idxpage *)0)->lip_flags)); + LASSERTF((int)offsetof(struct lu_idxpage, lip_nr) == 6, "found %lld\n", + (long long)(int)offsetof(struct lu_idxpage, lip_nr)); + LASSERTF((int)sizeof(((struct lu_idxpage *)0)->lip_nr) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lu_idxpage *)0)->lip_nr)); + LASSERTF((int)offsetof(struct lu_idxpage, lip_pad0) == 8, "found %lld\n", + (long long)(int)offsetof(struct lu_idxpage, lip_pad0)); + LASSERTF((int)sizeof(((struct lu_idxpage *)0)->lip_pad0) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_idxpage *)0)->lip_pad0)); + CLASSERT(LIP_MAGIC == 0x8A6D6B6C); + LASSERTF(LIP_HDR_SIZE == 16, "found %lld\n", + (long long)LIP_HDR_SIZE); + LASSERTF(II_FL_NOHASH == 1, "found %lld\n", + (long long)II_FL_NOHASH); + LASSERTF(II_FL_VARKEY == 2, "found %lld\n", + (long long)II_FL_VARKEY); + LASSERTF(II_FL_VARREC == 4, "found %lld\n", + (long long)II_FL_VARREC); + LASSERTF(II_FL_NONUNQ == 8, "found %lld\n", + (long long)II_FL_NONUNQ); + + /* Checks for struct niobuf_remote */ + LASSERTF((int)sizeof(struct niobuf_remote) == 16, "found %lld\n", + (long long)(int)sizeof(struct niobuf_remote)); + LASSERTF((int)offsetof(struct niobuf_remote, offset) == 0, "found %lld\n", + (long long)(int)offsetof(struct niobuf_remote, offset)); + LASSERTF((int)sizeof(((struct niobuf_remote *)0)->offset) == 8, "found %lld\n", + (long long)(int)sizeof(((struct niobuf_remote *)0)->offset)); + LASSERTF((int)offsetof(struct niobuf_remote, len) == 8, "found %lld\n", + (long long)(int)offsetof(struct niobuf_remote, len)); + 
LASSERTF((int)sizeof(((struct niobuf_remote *)0)->len) == 4, "found %lld\n", + (long long)(int)sizeof(((struct niobuf_remote *)0)->len)); + LASSERTF((int)offsetof(struct niobuf_remote, flags) == 12, "found %lld\n", + (long long)(int)offsetof(struct niobuf_remote, flags)); + LASSERTF((int)sizeof(((struct niobuf_remote *)0)->flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct niobuf_remote *)0)->flags)); + LASSERTF(OBD_BRW_READ == 0x01, "found 0x%.8x\n", + OBD_BRW_READ); + LASSERTF(OBD_BRW_WRITE == 0x02, "found 0x%.8x\n", + OBD_BRW_WRITE); + LASSERTF(OBD_BRW_SYNC == 0x08, "found 0x%.8x\n", + OBD_BRW_SYNC); + LASSERTF(OBD_BRW_CHECK == 0x10, "found 0x%.8x\n", + OBD_BRW_CHECK); + LASSERTF(OBD_BRW_FROM_GRANT == 0x20, "found 0x%.8x\n", + OBD_BRW_FROM_GRANT); + LASSERTF(OBD_BRW_GRANTED == 0x40, "found 0x%.8x\n", + OBD_BRW_GRANTED); + LASSERTF(OBD_BRW_NOCACHE == 0x80, "found 0x%.8x\n", + OBD_BRW_NOCACHE); + LASSERTF(OBD_BRW_NOQUOTA == 0x100, "found 0x%.8x\n", + OBD_BRW_NOQUOTA); + LASSERTF(OBD_BRW_SRVLOCK == 0x200, "found 0x%.8x\n", + OBD_BRW_SRVLOCK); + LASSERTF(OBD_BRW_ASYNC == 0x400, "found 0x%.8x\n", + OBD_BRW_ASYNC); + LASSERTF(OBD_BRW_MEMALLOC == 0x800, "found 0x%.8x\n", + OBD_BRW_MEMALLOC); + + /* Checks for struct ost_body */ + LASSERTF((int)sizeof(struct ost_body) == 208, "found %lld\n", + (long long)(int)sizeof(struct ost_body)); + LASSERTF((int)offsetof(struct ost_body, oa) == 0, "found %lld\n", + (long long)(int)offsetof(struct ost_body, oa)); + LASSERTF((int)sizeof(((struct ost_body *)0)->oa) == 208, "found %lld\n", + (long long)(int)sizeof(((struct ost_body *)0)->oa)); + + /* Checks for struct ll_fid */ + LASSERTF((int)sizeof(struct ll_fid) == 16, "found %lld\n", + (long long)(int)sizeof(struct ll_fid)); + LASSERTF((int)offsetof(struct ll_fid, id) == 0, "found %lld\n", + (long long)(int)offsetof(struct ll_fid, id)); + LASSERTF((int)sizeof(((struct ll_fid *)0)->id) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ll_fid *)0)->id)); + LASSERTF((int)offsetof(struct ll_fid, generation) == 8, "found %lld\n", + (long long)(int)offsetof(struct ll_fid, generation)); + LASSERTF((int)sizeof(((struct ll_fid *)0)->generation) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ll_fid *)0)->generation)); + LASSERTF((int)offsetof(struct ll_fid, f_type) == 12, "found %lld\n", + (long long)(int)offsetof(struct ll_fid, f_type)); + LASSERTF((int)sizeof(((struct ll_fid *)0)->f_type) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ll_fid *)0)->f_type)); + + /* Checks for struct mdt_body */ + LASSERTF((int)sizeof(struct mdt_body) == 216, "found %lld\n", + (long long)(int)sizeof(struct mdt_body)); + LASSERTF((int)offsetof(struct mdt_body, fid1) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, fid1)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->fid1) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->fid1)); + LASSERTF((int)offsetof(struct mdt_body, fid2) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, fid2)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->fid2) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->fid2)); + LASSERTF((int)offsetof(struct mdt_body, handle) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, handle)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->handle) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->handle)); + LASSERTF((int)offsetof(struct mdt_body, valid) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, 
valid)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->valid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->valid)); + LASSERTF((int)offsetof(struct mdt_body, size) == 48, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, size)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->size)); + LASSERTF((int)offsetof(struct mdt_body, mtime) == 56, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mtime)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mtime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mtime)); + LASSERTF((int)offsetof(struct mdt_body, atime) == 64, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, atime)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->atime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->atime)); + LASSERTF((int)offsetof(struct mdt_body, ctime) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, ctime)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->ctime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->ctime)); + LASSERTF((int)offsetof(struct mdt_body, blocks) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, blocks)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->blocks) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->blocks)); + LASSERTF((int)offsetof(struct mdt_body, t_state) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, t_state)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->t_state) == 8, + "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->t_state)); + LASSERTF((int)offsetof(struct mdt_body, fsuid) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, fsuid)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->fsuid)); + LASSERTF((int)offsetof(struct mdt_body, fsgid) == 108, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, fsgid)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->fsgid)); + LASSERTF((int)offsetof(struct mdt_body, capability) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, capability)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->capability) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->capability)); + LASSERTF((int)offsetof(struct mdt_body, mode) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mode)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mode)); + LASSERTF((int)offsetof(struct mdt_body, uid) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, uid)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->uid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->uid)); + LASSERTF((int)offsetof(struct mdt_body, gid) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, gid)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->gid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->gid)); + LASSERTF((int)offsetof(struct mdt_body, flags) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, flags)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->flags)); + 
LASSERTF((int)offsetof(struct mdt_body, rdev) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, rdev)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->rdev) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->rdev)); + LASSERTF((int)offsetof(struct mdt_body, nlink) == 136, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, nlink)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->nlink) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->nlink)); + LASSERTF((int)offsetof(struct mdt_body, unused2) == 140, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, unused2)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->unused2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->unused2)); + LASSERTF((int)offsetof(struct mdt_body, suppgid) == 144, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, suppgid)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->suppgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->suppgid)); + LASSERTF((int)offsetof(struct mdt_body, eadatasize) == 148, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, eadatasize)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->eadatasize) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->eadatasize)); + LASSERTF((int)offsetof(struct mdt_body, aclsize) == 152, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, aclsize)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->aclsize) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->aclsize)); + LASSERTF((int)offsetof(struct mdt_body, max_mdsize) == 156, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, max_mdsize)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->max_mdsize) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->max_mdsize)); + LASSERTF((int)offsetof(struct mdt_body, max_cookiesize) == 160, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, max_cookiesize)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->max_cookiesize) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->max_cookiesize)); + LASSERTF((int)offsetof(struct mdt_body, uid_h) == 164, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, uid_h)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->uid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->uid_h)); + LASSERTF((int)offsetof(struct mdt_body, gid_h) == 168, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, gid_h)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->gid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->gid_h)); + LASSERTF((int)offsetof(struct mdt_body, padding_5) == 172, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, padding_5)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_5) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->padding_5)); + LASSERTF((int)offsetof(struct mdt_body, padding_6) == 176, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, padding_6)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_6) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->padding_6)); + LASSERTF((int)offsetof(struct mdt_body, padding_7) == 184, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, padding_7)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_7) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->padding_7)); + LASSERTF((int)offsetof(struct 
mdt_body, padding_8) == 192, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, padding_8)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_8) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->padding_8)); + LASSERTF((int)offsetof(struct mdt_body, padding_9) == 200, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, padding_9)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_9) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->padding_9)); + LASSERTF((int)offsetof(struct mdt_body, padding_10) == 208, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, padding_10)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_10) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->padding_10)); + LASSERTF(MDS_FMODE_CLOSED == 000000000000UL, "found 0%.11oUL\n", + MDS_FMODE_CLOSED); + LASSERTF(MDS_FMODE_EXEC == 000000000004UL, "found 0%.11oUL\n", + MDS_FMODE_EXEC); + LASSERTF(MDS_FMODE_EPOCH == 000001000000UL, "found 0%.11oUL\n", + MDS_FMODE_EPOCH); + LASSERTF(MDS_FMODE_TRUNC == 000002000000UL, "found 0%.11oUL\n", + MDS_FMODE_TRUNC); + LASSERTF(MDS_FMODE_SOM == 000004000000UL, "found 0%.11oUL\n", + MDS_FMODE_SOM); + LASSERTF(MDS_OPEN_CREATED == 000000000010UL, "found 0%.11oUL\n", + MDS_OPEN_CREATED); + LASSERTF(MDS_OPEN_CROSS == 000000000020UL, "found 0%.11oUL\n", + MDS_OPEN_CROSS); + LASSERTF(MDS_OPEN_CREAT == 000000000100UL, "found 0%.11oUL\n", + MDS_OPEN_CREAT); + LASSERTF(MDS_OPEN_EXCL == 000000000200UL, "found 0%.11oUL\n", + MDS_OPEN_EXCL); + LASSERTF(MDS_OPEN_TRUNC == 000000001000UL, "found 0%.11oUL\n", + MDS_OPEN_TRUNC); + LASSERTF(MDS_OPEN_APPEND == 000000002000UL, "found 0%.11oUL\n", + MDS_OPEN_APPEND); + LASSERTF(MDS_OPEN_SYNC == 000000010000UL, "found 0%.11oUL\n", + MDS_OPEN_SYNC); + LASSERTF(MDS_OPEN_DIRECTORY == 000000200000UL, "found 0%.11oUL\n", + MDS_OPEN_DIRECTORY); + LASSERTF(MDS_OPEN_BY_FID == 000040000000UL, "found 0%.11oUL\n", + MDS_OPEN_BY_FID); + LASSERTF(MDS_OPEN_DELAY_CREATE == 000100000000UL, "found 0%.11oUL\n", + MDS_OPEN_DELAY_CREATE); + LASSERTF(MDS_OPEN_OWNEROVERRIDE == 000200000000UL, "found 0%.11oUL\n", + MDS_OPEN_OWNEROVERRIDE); + LASSERTF(MDS_OPEN_JOIN_FILE == 000400000000UL, "found 0%.11oUL\n", + MDS_OPEN_JOIN_FILE); + LASSERTF(MDS_OPEN_LOCK == 004000000000UL, "found 0%.11oUL\n", + MDS_OPEN_LOCK); + LASSERTF(MDS_OPEN_HAS_EA == 010000000000UL, "found 0%.11oUL\n", + MDS_OPEN_HAS_EA); + LASSERTF(MDS_OPEN_HAS_OBJS == 020000000000UL, "found 0%.11oUL\n", + MDS_OPEN_HAS_OBJS); + LASSERTF(MDS_OPEN_NORESTORE == 00000000000100000000000ULL, "found 0%.22lloULL\n", + (long long)MDS_OPEN_NORESTORE); + LASSERTF(MDS_OPEN_NEWSTRIPE == 00000000000200000000000ULL, "found 0%.22lloULL\n", + (long long)MDS_OPEN_NEWSTRIPE); + LASSERTF(MDS_OPEN_VOLATILE == 00000000000400000000000ULL, "found 0%.22lloULL\n", + (long long)MDS_OPEN_VOLATILE); + LASSERTF(LUSTRE_SYNC_FL == 0x00000008, "found 0x%.8x\n", + LUSTRE_SYNC_FL); + LASSERTF(LUSTRE_IMMUTABLE_FL == 0x00000010, "found 0x%.8x\n", + LUSTRE_IMMUTABLE_FL); + LASSERTF(LUSTRE_APPEND_FL == 0x00000020, "found 0x%.8x\n", + LUSTRE_APPEND_FL); + LASSERTF(LUSTRE_NOATIME_FL == 0x00000080, "found 0x%.8x\n", + LUSTRE_NOATIME_FL); + LASSERTF(LUSTRE_DIRSYNC_FL == 0x00010000, "found 0x%.8x\n", + LUSTRE_DIRSYNC_FL); + LASSERTF(MDS_INODELOCK_LOOKUP == 0x000001, "found 0x%.8x\n", + MDS_INODELOCK_LOOKUP); + LASSERTF(MDS_INODELOCK_UPDATE == 0x000002, "found 0x%.8x\n", + MDS_INODELOCK_UPDATE); + LASSERTF(MDS_INODELOCK_OPEN == 0x000004, "found 0x%.8x\n", + 
MDS_INODELOCK_OPEN); + LASSERTF(MDS_INODELOCK_LAYOUT == 0x000008, "found 0x%.8x\n", + MDS_INODELOCK_LAYOUT); + + /* Checks for struct mdt_ioepoch */ + LASSERTF((int)sizeof(struct mdt_ioepoch) == 24, "found %lld\n", + (long long)(int)sizeof(struct mdt_ioepoch)); + LASSERTF((int)offsetof(struct mdt_ioepoch, handle) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_ioepoch, handle)); + LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->handle) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_ioepoch *)0)->handle)); + LASSERTF((int)offsetof(struct mdt_ioepoch, ioepoch) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_ioepoch, ioepoch)); + LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->ioepoch) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_ioepoch *)0)->ioepoch)); + LASSERTF((int)offsetof(struct mdt_ioepoch, flags) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_ioepoch, flags)); + LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_ioepoch *)0)->flags)); + LASSERTF((int)offsetof(struct mdt_ioepoch, padding) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_ioepoch, padding)); + LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_ioepoch *)0)->padding)); + + /* Checks for struct mdt_remote_perm */ + LASSERTF((int)sizeof(struct mdt_remote_perm) == 32, "found %lld\n", + (long long)(int)sizeof(struct mdt_remote_perm)); + LASSERTF((int)offsetof(struct mdt_remote_perm, rp_uid) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_remote_perm, rp_uid)); + LASSERTF((int)sizeof(((struct mdt_remote_perm *)0)->rp_uid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_remote_perm *)0)->rp_uid)); + LASSERTF((int)offsetof(struct mdt_remote_perm, rp_gid) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_remote_perm, rp_gid)); + LASSERTF((int)sizeof(((struct mdt_remote_perm *)0)->rp_gid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_remote_perm *)0)->rp_gid)); + LASSERTF((int)offsetof(struct mdt_remote_perm, rp_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_remote_perm, rp_fsuid)); + LASSERTF((int)sizeof(((struct mdt_remote_perm *)0)->rp_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_remote_perm *)0)->rp_fsuid)); + LASSERTF((int)offsetof(struct mdt_remote_perm, rp_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_remote_perm, rp_fsgid)); + LASSERTF((int)sizeof(((struct mdt_remote_perm *)0)->rp_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_remote_perm *)0)->rp_fsgid)); + LASSERTF((int)offsetof(struct mdt_remote_perm, rp_access_perm) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_remote_perm, rp_access_perm)); + LASSERTF((int)sizeof(((struct mdt_remote_perm *)0)->rp_access_perm) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_remote_perm *)0)->rp_access_perm)); + LASSERTF((int)offsetof(struct mdt_remote_perm, rp_padding) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_remote_perm, rp_padding)); + LASSERTF((int)sizeof(((struct mdt_remote_perm *)0)->rp_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_remote_perm *)0)->rp_padding)); + LASSERTF(CFS_SETUID_PERM == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)CFS_SETUID_PERM); + LASSERTF(CFS_SETGID_PERM == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)CFS_SETGID_PERM); + LASSERTF(CFS_SETGRP_PERM == 
0x00000004UL, "found 0x%.8xUL\n", + (unsigned)CFS_SETGRP_PERM); + LASSERTF(CFS_RMTACL_PERM == 0x00000008UL, "found 0x%.8xUL\n", + (unsigned)CFS_RMTACL_PERM); + LASSERTF(CFS_RMTOWN_PERM == 0x00000010UL, "found 0x%.8xUL\n", + (unsigned)CFS_RMTOWN_PERM); + + /* Checks for struct mdt_rec_setattr */ + LASSERTF((int)sizeof(struct mdt_rec_setattr) == 136, "found %lld\n", + (long long)(int)sizeof(struct mdt_rec_setattr)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_opcode) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_opcode)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_opcode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_opcode)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_cap) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_cap)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_cap) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_cap)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_fsuid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsuid)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fsuid_h) == 12, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_fsuid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsuid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsuid_h)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_fsgid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsgid)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fsgid_h) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_fsgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_suppgid) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_suppgid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_suppgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_suppgid)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_suppgid_h) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_suppgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_suppgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_suppgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_1) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_1)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_1)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_1_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_1_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_1_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_1_h)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fid) == 40, "found 
%lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_fid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fid)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_valid) == 56, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_valid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_valid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_valid)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_uid) == 64, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_uid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_uid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_uid)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_gid) == 68, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_gid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_gid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_gid)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_size) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_size)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_size)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_blocks) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_blocks)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_blocks) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_blocks)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_mtime) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_mtime)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_mtime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_mtime)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_atime) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_atime)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_atime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_atime)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_ctime) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_ctime)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_ctime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_ctime)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_attr_flags) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_attr_flags)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_attr_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_attr_flags)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_mode) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_mode)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_mode)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_bias) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_bias)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_bias) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_bias)); + LASSERTF((int)offsetof(struct 
mdt_rec_setattr, sa_padding_3) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_3)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_3) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_3)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_4) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_4)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_4) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_4)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_5) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_5)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_5) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_5)); + + /* Checks for struct mdt_rec_create */ + LASSERTF((int)sizeof(struct mdt_rec_create) == 136, "found %lld\n", + (long long)(int)sizeof(struct mdt_rec_create)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_opcode) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_opcode)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_opcode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_opcode)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_cap) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_cap)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_cap) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_cap)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_fsuid)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fsuid)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_fsuid_h) == 12, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_fsuid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fsuid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fsuid_h)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_fsgid)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fsgid)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_fsgid_h) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_fsgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fsgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fsgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_suppgid1) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_suppgid1)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid1)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_suppgid1_h) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_suppgid1_h)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid1_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid1_h)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_suppgid2) == 32, "found 
%lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_suppgid2)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid2)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_suppgid2_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_suppgid2_h)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid2_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid2_h)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_fid1) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_fid1)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fid1) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fid1)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_fid2) == 56, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_fid2)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fid2) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fid2)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_old_handle) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_old_handle)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_old_handle) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_old_handle)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_time) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_time)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_time)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_rdev) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_rdev)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_rdev) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_rdev)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_ioepoch) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_ioepoch)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_ioepoch) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_ioepoch)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_padding_1) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_padding_1)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_padding_1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_padding_1)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_mode) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_mode)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_mode)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_bias) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_bias)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_bias) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_bias)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_flags_l) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_flags_l)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_flags_l) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_flags_l)); + 
LASSERTF((int)offsetof(struct mdt_rec_create, cr_flags_h) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_flags_h)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_flags_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_flags_h)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_umask) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_umask)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_umask) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_umask)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_padding_4) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_padding_4)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_padding_4) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_padding_4)); + + /* Checks for struct mdt_rec_link */ + LASSERTF((int)sizeof(struct mdt_rec_link) == 136, "found %lld\n", + (long long)(int)sizeof(struct mdt_rec_link)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_opcode) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_opcode)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_opcode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_opcode)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_cap) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_cap)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_cap) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_cap)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_fsuid)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fsuid)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_fsuid_h) == 12, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_fsuid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fsuid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fsuid_h)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_fsgid)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fsgid)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_fsgid_h) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_fsgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fsgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fsgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_suppgid1) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_suppgid1)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid1)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_suppgid1_h) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_suppgid1_h)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid1_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid1_h)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_suppgid2) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_suppgid2)); + 
LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid2)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_suppgid2_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_suppgid2_h)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid2_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid2_h)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_fid1) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_fid1)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fid1) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fid1)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_fid2) == 56, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_fid2)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fid2) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fid2)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_time) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_time)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_time)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_1) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_1)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_1)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_2) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_2)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_2)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_3) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_3)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_3)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_4) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_4)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_4) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_4)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_bias) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_bias)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_bias) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_bias)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_5) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_5)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_5) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_5)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_6) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_6)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_6) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_6)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_7) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, 
lk_padding_7)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_7) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_7)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_8) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_8)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_8) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_8)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_9) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_9)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_9) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_9)); + + /* Checks for struct mdt_rec_unlink */ + LASSERTF((int)sizeof(struct mdt_rec_unlink) == 136, "found %lld\n", + (long long)(int)sizeof(struct mdt_rec_unlink)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_opcode) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_opcode)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_opcode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_opcode)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_cap) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_cap)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_cap) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_cap)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_fsuid)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsuid)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fsuid_h) == 12, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_fsuid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsuid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsuid_h)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_fsgid)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsgid)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fsgid_h) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_fsgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_suppgid1) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_suppgid1)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid1)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_suppgid1_h) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_suppgid1_h)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid1_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid1_h)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_suppgid2) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_suppgid2)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink 
*)0)->ul_suppgid2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid2)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_suppgid2_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_suppgid2_h)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid2_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid2_h)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fid1) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_fid1)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fid1) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fid1)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fid2) == 56, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_fid2)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fid2) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fid2)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_time) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_time)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_time)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_2) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_2)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_2)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_3) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_3)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_3)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_4) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_4)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_4) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_4)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_5) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_5)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_5) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_5)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_bias) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_bias)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_bias) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_bias)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_mode) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_mode)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_mode)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_6) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_6)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_6) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_6)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_7) == 124, "found %lld\n", + (long 
long)(int)offsetof(struct mdt_rec_unlink, ul_padding_7)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_7) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_7)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_8) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_8)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_8) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_8)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_9) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_9)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_9) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_9)); + + /* Checks for struct mdt_rec_rename */ + LASSERTF((int)sizeof(struct mdt_rec_rename) == 136, "found %lld\n", + (long long)(int)sizeof(struct mdt_rec_rename)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_opcode) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_opcode)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_opcode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_opcode)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_cap) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_cap)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_cap) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_cap)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_fsuid)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fsuid)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fsuid_h) == 12, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_fsuid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fsuid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fsuid_h)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_fsgid)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fsgid)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fsgid_h) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_fsgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fsgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fsgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_suppgid1) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_suppgid1)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid1)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_suppgid1_h) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_suppgid1_h)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid1_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid1_h)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_suppgid2) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_suppgid2)); 
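
The mdt_rec_* reintegration records checked in this block are all pinned to the same 136-byte layout: a shared credential prefix (opcode, capability, fsuid/fsgid plus supplementary groups), two 16-byte FIDs at offsets 40 and 56, a 64-bit time at 72, and per-operation fields padded out to 136 bytes so the variants stay interchangeable on the wire. The following is a minimal sketch of the same layout-pinning idea using only standard C11 _Static_assert; demo_rec and its member names are illustrative assumptions, not Lustre types, and the stated offsets assume an LP64 ABI with naturally aligned 64-bit members.

/*
 * Illustrative sketch only, not the Lustre LASSERTF macros: a wire record
 * is declared with fixed-width types and every member is nailed to the
 * byte offset the protocol expects, so a compiler or arch that lays it
 * out differently fails to build instead of corrupting traffic.
 */
#include <stddef.h>
#include <stdint.h>

struct demo_rec {              /* hypothetical record, mirrors the mdt_rec_* shape */
	uint32_t opcode;       /* offset  0 */
	uint32_t cap;          /* offset  4 */
	uint64_t fid[2];       /* offset  8 */
	uint64_t time;         /* offset 24 */
	uint32_t bias;         /* offset 32 */
	uint32_t padding;      /* offset 36, keeps the size a multiple of 8 */
};

_Static_assert(sizeof(struct demo_rec) == 40,           "demo_rec size changed");
_Static_assert(offsetof(struct demo_rec, opcode) == 0,  "opcode moved");
_Static_assert(offsetof(struct demo_rec, fid)    == 8,  "fid moved");
_Static_assert(offsetof(struct demo_rec, time)   == 24, "time moved");
_Static_assert(offsetof(struct demo_rec, bias)   == 32, "bias moved");
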
+ LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid2)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_suppgid2_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_suppgid2_h)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid2_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid2_h)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fid1) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_fid1)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fid1) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fid1)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fid2) == 56, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_fid2)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fid2) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fid2)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_time) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_time)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_time)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_1) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_1)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_1)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_2) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_2)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_2)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_3) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_3)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_3)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_4) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_4)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_4) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_4)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_bias) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_bias)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_bias) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_bias)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_mode) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_mode)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_mode)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_5) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_5)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_5) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_5)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_6) == 
124, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_6)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_6) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_6)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_7) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_7)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_7) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_7)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_8) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_8)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_8) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_8)); + + /* Checks for struct mdt_rec_setxattr */ + LASSERTF((int)sizeof(struct mdt_rec_setxattr) == 136, "found %lld\n", + (long long)(int)sizeof(struct mdt_rec_setxattr)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_opcode) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_opcode)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_opcode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_opcode)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_cap) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_cap)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_cap) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_cap)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fsuid)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsuid)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fsuid_h) == 12, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fsuid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsuid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsuid_h)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fsgid)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsgid)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fsgid_h) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fsgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_suppgid1) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_suppgid1)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid1)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_suppgid1_h) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_suppgid1_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid1_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid1_h)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, 
sx_suppgid2) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_suppgid2)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid2)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_suppgid2_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_suppgid2_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid2_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid2_h)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fid) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fid)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fid)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_1) == 56, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_1)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_1)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_2) == 64, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_2)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_2)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_3) == 68, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_3)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_3) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_3)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_valid) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_valid)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_valid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_valid)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_time) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_time)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_time)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_5) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_5)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_5) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_5)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_6) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_6)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_6) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_6)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_7) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_7)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_7) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_7)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_size) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_size)); + 
LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_size)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_flags) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_flags)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_flags)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_8) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_8)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_8) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_8)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_9) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_9)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_9) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_9)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_10) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_10)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_10) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_10)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_11) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_11)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_11) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_11)); + + /* Checks for struct mdt_rec_reint */ + LASSERTF((int)sizeof(struct mdt_rec_reint) == 136, "found %lld\n", + (long long)(int)sizeof(struct mdt_rec_reint)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_opcode) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_opcode)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_opcode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_opcode)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_cap) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_cap)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_cap) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_cap)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_fsuid)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fsuid)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fsuid_h) == 12, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_fsuid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fsuid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fsuid_h)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_fsgid)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fsgid)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fsgid_h) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_fsgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_reint 
*)0)->rr_fsgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fsgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_suppgid1) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_suppgid1)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid1)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_suppgid1_h) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_suppgid1_h)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid1_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid1_h)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_suppgid2) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_suppgid2)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid2)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_suppgid2_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_suppgid2_h)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid2_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid2_h)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fid1) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_fid1)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fid1) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fid1)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fid2) == 56, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_fid2)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fid2) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fid2)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_mtime) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_mtime)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_mtime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_mtime)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_atime) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_atime)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_atime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_atime)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_ctime) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_ctime)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_ctime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_ctime)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_size) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_size)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_size)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_blocks) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_blocks)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_blocks) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_blocks)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_bias) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_bias)); + LASSERTF((int)sizeof(((struct mdt_rec_reint 
*)0)->rr_bias) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_bias)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_mode) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_mode)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_mode)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_flags) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_flags)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_flags)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_flags_h) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_flags_h)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_flags_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_flags_h)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_umask) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_umask)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_umask) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_umask)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_padding_4) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_padding_4)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_padding_4) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_padding_4)); + + /* Checks for struct lmv_desc */ + LASSERTF((int)sizeof(struct lmv_desc) == 88, "found %lld\n", + (long long)(int)sizeof(struct lmv_desc)); + LASSERTF((int)offsetof(struct lmv_desc, ld_tgt_count) == 0, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_tgt_count)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_tgt_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_tgt_count)); + LASSERTF((int)offsetof(struct lmv_desc, ld_active_tgt_count) == 4, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_active_tgt_count)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_active_tgt_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_active_tgt_count)); + LASSERTF((int)offsetof(struct lmv_desc, ld_default_stripe_count) == 8, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_default_stripe_count)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_default_stripe_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_default_stripe_count)); + LASSERTF((int)offsetof(struct lmv_desc, ld_pattern) == 12, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_pattern)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_pattern) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_pattern)); + LASSERTF((int)offsetof(struct lmv_desc, ld_default_hash_size) == 16, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_default_hash_size)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_default_hash_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_default_hash_size)); + LASSERTF((int)offsetof(struct lmv_desc, ld_padding_1) == 24, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_padding_1)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_padding_1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_padding_1)); + 
LASSERTF((int)offsetof(struct lmv_desc, ld_padding_2) == 32, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_padding_2)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_padding_2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_padding_2)); + LASSERTF((int)offsetof(struct lmv_desc, ld_qos_maxage) == 36, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_qos_maxage)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_qos_maxage) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_qos_maxage)); + LASSERTF((int)offsetof(struct lmv_desc, ld_padding_3) == 40, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_padding_3)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_padding_3) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_padding_3)); + LASSERTF((int)offsetof(struct lmv_desc, ld_padding_4) == 44, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_padding_4)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_padding_4) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_padding_4)); + LASSERTF((int)offsetof(struct lmv_desc, ld_uuid) == 48, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_uuid)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_uuid) == 40, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_uuid)); + + /* Checks for struct lmv_stripe_md */ + LASSERTF((int)sizeof(struct lmv_stripe_md) == 32, "found %lld\n", + (long long)(int)sizeof(struct lmv_stripe_md)); + LASSERTF((int)offsetof(struct lmv_stripe_md, mea_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct lmv_stripe_md, mea_magic)); + LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_magic)); + LASSERTF((int)offsetof(struct lmv_stripe_md, mea_count) == 4, "found %lld\n", + (long long)(int)offsetof(struct lmv_stripe_md, mea_count)); + LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_count)); + LASSERTF((int)offsetof(struct lmv_stripe_md, mea_master) == 8, "found %lld\n", + (long long)(int)offsetof(struct lmv_stripe_md, mea_master)); + LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_master) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_master)); + LASSERTF((int)offsetof(struct lmv_stripe_md, mea_padding) == 12, "found %lld\n", + (long long)(int)offsetof(struct lmv_stripe_md, mea_padding)); + LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_padding)); + CLASSERT(LOV_MAXPOOLNAME == 16); + LASSERTF((int)offsetof(struct lmv_stripe_md, mea_pool_name[16]) == 32, "found %lld\n", + (long long)(int)offsetof(struct lmv_stripe_md, mea_pool_name[16])); + LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_pool_name[16]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_pool_name[16])); + LASSERTF((int)offsetof(struct lmv_stripe_md, mea_ids[0]) == 32, "found %lld\n", + (long long)(int)offsetof(struct lmv_stripe_md, mea_ids[0])); + LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_ids[0]) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_ids[0])); + + /* Checks for struct lov_desc */ + LASSERTF((int)sizeof(struct lov_desc) == 88, "found %lld\n", + (long 
long)(int)sizeof(struct lov_desc)); + LASSERTF((int)offsetof(struct lov_desc, ld_tgt_count) == 0, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_tgt_count)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_tgt_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_tgt_count)); + LASSERTF((int)offsetof(struct lov_desc, ld_active_tgt_count) == 4, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_active_tgt_count)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_active_tgt_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_active_tgt_count)); + LASSERTF((int)offsetof(struct lov_desc, ld_default_stripe_count) == 8, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_default_stripe_count)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_count)); + LASSERTF((int)offsetof(struct lov_desc, ld_pattern) == 12, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_pattern)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_pattern) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_pattern)); + LASSERTF((int)offsetof(struct lov_desc, ld_default_stripe_size) == 16, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_default_stripe_size)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_size)); + LASSERTF((int)offsetof(struct lov_desc, ld_default_stripe_offset) == 24, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_default_stripe_offset)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset)); + LASSERTF((int)offsetof(struct lov_desc, ld_padding_0) == 32, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_padding_0)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_0) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_0)); + LASSERTF((int)offsetof(struct lov_desc, ld_qos_maxage) == 36, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_qos_maxage)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_qos_maxage) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_qos_maxage)); + LASSERTF((int)offsetof(struct lov_desc, ld_padding_1) == 40, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_padding_1)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_1)); + LASSERTF((int)offsetof(struct lov_desc, ld_padding_2) == 44, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_padding_2)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_2)); + LASSERTF((int)offsetof(struct lov_desc, ld_uuid) == 48, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_uuid)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_uuid) == 40, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_uuid)); + CLASSERT(LOV_DESC_MAGIC == 0xB0CCDE5C); + + /* Checks for struct ldlm_res_id */ + LASSERTF((int)sizeof(struct ldlm_res_id) == 32, "found %lld\n", + (long long)(int)sizeof(struct ldlm_res_id)); + CLASSERT(RES_NAME_SIZE == 4); 
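
Two idioms recur in these checks. CLASSERT() is used for values that must hold at build time (LOV_MAXPOOLNAME, LOV_DESC_MAGIC and RES_NAME_SIZE above), while LASSERTF() runs at module load and logs the value actually found when a layout drifts. The other idiom is indexing one element past the end of a trailing array, as in offsetof(struct ldlm_res_id, name[4]) == 32: the offset just past name[RES_NAME_SIZE] must equal the structure size, which proves the array spans the whole tail of the record. A small sketch of that idiom follows, under the same caveats as before; res_id_demo is a hypothetical stand-in, and taking offsetof of an array element relies on the usual GCC/Clang constant-folding of __builtin_offsetof.

/*
 * Sketch of the "index one past the array" idiom (assumed names, not the
 * real headers): offsetof(st, arr[N]) is the byte offset just past an
 * N-element array, so comparing it with the struct size shows the array
 * really fills the rest of the record.
 */
#include <stddef.h>
#include <stdint.h>

#define RES_NAME_SIZE_DEMO 4

struct res_id_demo {
	uint64_t name[RES_NAME_SIZE_DEMO];
};

_Static_assert(offsetof(struct res_id_demo, name[RES_NAME_SIZE_DEMO]) ==
	       sizeof(struct res_id_demo),
	       "name[] must fill the whole record");
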
+ LASSERTF((int)offsetof(struct ldlm_res_id, name[4]) == 32, "found %lld\n", + (long long)(int)offsetof(struct ldlm_res_id, name[4])); + LASSERTF((int)sizeof(((struct ldlm_res_id *)0)->name[4]) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_res_id *)0)->name[4])); + + /* Checks for struct ldlm_extent */ + LASSERTF((int)sizeof(struct ldlm_extent) == 24, "found %lld\n", + (long long)(int)sizeof(struct ldlm_extent)); + LASSERTF((int)offsetof(struct ldlm_extent, start) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_extent, start)); + LASSERTF((int)sizeof(((struct ldlm_extent *)0)->start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_extent *)0)->start)); + LASSERTF((int)offsetof(struct ldlm_extent, end) == 8, "found %lld\n", + (long long)(int)offsetof(struct ldlm_extent, end)); + LASSERTF((int)sizeof(((struct ldlm_extent *)0)->end) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_extent *)0)->end)); + LASSERTF((int)offsetof(struct ldlm_extent, gid) == 16, "found %lld\n", + (long long)(int)offsetof(struct ldlm_extent, gid)); + LASSERTF((int)sizeof(((struct ldlm_extent *)0)->gid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_extent *)0)->gid)); + + /* Checks for struct ldlm_inodebits */ + LASSERTF((int)sizeof(struct ldlm_inodebits) == 8, "found %lld\n", + (long long)(int)sizeof(struct ldlm_inodebits)); + LASSERTF((int)offsetof(struct ldlm_inodebits, bits) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_inodebits, bits)); + LASSERTF((int)sizeof(((struct ldlm_inodebits *)0)->bits) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_inodebits *)0)->bits)); + + /* Checks for struct ldlm_flock_wire */ + LASSERTF((int)sizeof(struct ldlm_flock_wire) == 32, "found %lld\n", + (long long)(int)sizeof(struct ldlm_flock_wire)); + LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_start) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_flock_wire, lfw_start)); + LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_start)); + LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_end) == 8, "found %lld\n", + (long long)(int)offsetof(struct ldlm_flock_wire, lfw_end)); + LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_end) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_end)); + LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_owner) == 16, "found %lld\n", + (long long)(int)offsetof(struct ldlm_flock_wire, lfw_owner)); + LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_owner) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_owner)); + LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_padding) == 24, "found %lld\n", + (long long)(int)offsetof(struct ldlm_flock_wire, lfw_padding)); + LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_padding)); + LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_pid) == 28, "found %lld\n", + (long long)(int)offsetof(struct ldlm_flock_wire, lfw_pid)); + LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_pid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_pid)); + + /* Checks for struct ldlm_intent */ + LASSERTF((int)sizeof(struct ldlm_intent) == 8, "found %lld\n", + (long long)(int)sizeof(struct ldlm_intent)); + LASSERTF((int)offsetof(struct ldlm_intent, opc) == 0, "found 
%lld\n", + (long long)(int)offsetof(struct ldlm_intent, opc)); + LASSERTF((int)sizeof(((struct ldlm_intent *)0)->opc) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_intent *)0)->opc)); + + /* Checks for struct ldlm_resource_desc */ + LASSERTF((int)sizeof(struct ldlm_resource_desc) == 40, "found %lld\n", + (long long)(int)sizeof(struct ldlm_resource_desc)); + LASSERTF((int)offsetof(struct ldlm_resource_desc, lr_type) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_resource_desc, lr_type)); + LASSERTF((int)sizeof(((struct ldlm_resource_desc *)0)->lr_type) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_resource_desc *)0)->lr_type)); + LASSERTF((int)offsetof(struct ldlm_resource_desc, lr_padding) == 4, "found %lld\n", + (long long)(int)offsetof(struct ldlm_resource_desc, lr_padding)); + LASSERTF((int)sizeof(((struct ldlm_resource_desc *)0)->lr_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_resource_desc *)0)->lr_padding)); + LASSERTF((int)offsetof(struct ldlm_resource_desc, lr_name) == 8, "found %lld\n", + (long long)(int)offsetof(struct ldlm_resource_desc, lr_name)); + LASSERTF((int)sizeof(((struct ldlm_resource_desc *)0)->lr_name) == 32, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_resource_desc *)0)->lr_name)); + + /* Checks for struct ldlm_lock_desc */ + LASSERTF((int)sizeof(struct ldlm_lock_desc) == 80, "found %lld\n", + (long long)(int)sizeof(struct ldlm_lock_desc)); + LASSERTF((int)offsetof(struct ldlm_lock_desc, l_resource) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_lock_desc, l_resource)); + LASSERTF((int)sizeof(((struct ldlm_lock_desc *)0)->l_resource) == 40, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_lock_desc *)0)->l_resource)); + LASSERTF((int)offsetof(struct ldlm_lock_desc, l_req_mode) == 40, "found %lld\n", + (long long)(int)offsetof(struct ldlm_lock_desc, l_req_mode)); + LASSERTF((int)sizeof(((struct ldlm_lock_desc *)0)->l_req_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_lock_desc *)0)->l_req_mode)); + LASSERTF((int)offsetof(struct ldlm_lock_desc, l_granted_mode) == 44, "found %lld\n", + (long long)(int)offsetof(struct ldlm_lock_desc, l_granted_mode)); + LASSERTF((int)sizeof(((struct ldlm_lock_desc *)0)->l_granted_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_lock_desc *)0)->l_granted_mode)); + LASSERTF((int)offsetof(struct ldlm_lock_desc, l_policy_data) == 48, "found %lld\n", + (long long)(int)offsetof(struct ldlm_lock_desc, l_policy_data)); + LASSERTF((int)sizeof(((struct ldlm_lock_desc *)0)->l_policy_data) == 32, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_lock_desc *)0)->l_policy_data)); + + /* Checks for struct ldlm_request */ + LASSERTF((int)sizeof(struct ldlm_request) == 104, "found %lld\n", + (long long)(int)sizeof(struct ldlm_request)); + LASSERTF((int)offsetof(struct ldlm_request, lock_flags) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_request, lock_flags)); + LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_request *)0)->lock_flags)); + LASSERTF((int)offsetof(struct ldlm_request, lock_count) == 4, "found %lld\n", + (long long)(int)offsetof(struct ldlm_request, lock_count)); + LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_request *)0)->lock_count)); + LASSERTF((int)offsetof(struct ldlm_request, lock_desc) == 8, "found %lld\n", + (long 
long)(int)offsetof(struct ldlm_request, lock_desc)); + LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_desc) == 80, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_request *)0)->lock_desc)); + LASSERTF((int)offsetof(struct ldlm_request, lock_handle) == 88, "found %lld\n", + (long long)(int)offsetof(struct ldlm_request, lock_handle)); + LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_handle) == 16, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_request *)0)->lock_handle)); + + /* Checks for struct ldlm_reply */ + LASSERTF((int)sizeof(struct ldlm_reply) == 112, "found %lld\n", + (long long)(int)sizeof(struct ldlm_reply)); + LASSERTF((int)offsetof(struct ldlm_reply, lock_flags) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_reply, lock_flags)); + LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_flags)); + LASSERTF((int)offsetof(struct ldlm_reply, lock_padding) == 4, "found %lld\n", + (long long)(int)offsetof(struct ldlm_reply, lock_padding)); + LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_padding)); + LASSERTF((int)offsetof(struct ldlm_reply, lock_desc) == 8, "found %lld\n", + (long long)(int)offsetof(struct ldlm_reply, lock_desc)); + LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_desc) == 80, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_desc)); + LASSERTF((int)offsetof(struct ldlm_reply, lock_handle) == 88, "found %lld\n", + (long long)(int)offsetof(struct ldlm_reply, lock_handle)); + LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_handle) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_handle)); + LASSERTF((int)offsetof(struct ldlm_reply, lock_policy_res1) == 96, "found %lld\n", + (long long)(int)offsetof(struct ldlm_reply, lock_policy_res1)); + LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_policy_res1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_policy_res1)); + LASSERTF((int)offsetof(struct ldlm_reply, lock_policy_res2) == 104, "found %lld\n", + (long long)(int)offsetof(struct ldlm_reply, lock_policy_res2)); + LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_policy_res2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_policy_res2)); + + /* Checks for struct ost_lvb_v1 */ + LASSERTF((int)sizeof(struct ost_lvb_v1) == 40, "found %lld\n", + (long long)(int)sizeof(struct ost_lvb_v1)); + LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_size) == 0, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb_v1, lvb_size)); + LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_size)); + LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_mtime) == 8, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb_v1, lvb_mtime)); + LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_mtime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_mtime)); + LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_atime) == 16, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb_v1, lvb_atime)); + LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_atime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_atime)); + LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_ctime) == 24, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb_v1, 
lvb_ctime)); + LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_ctime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_ctime)); + LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_blocks) == 32, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb_v1, lvb_blocks)); + LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_blocks) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_blocks)); + + /* Checks for struct ost_lvb */ + LASSERTF((int)sizeof(struct ost_lvb) == 56, "found %lld\n", + (long long)(int)sizeof(struct ost_lvb)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_size) == 0, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_size)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_size)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_mtime) == 8, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_mtime)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_mtime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_mtime)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_atime) == 16, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_atime)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_atime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_atime)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_ctime) == 24, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_ctime)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_ctime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_ctime)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_blocks) == 32, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_blocks)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_blocks) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_blocks)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_mtime_ns) == 40, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_mtime_ns)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_mtime_ns) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_mtime_ns)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_atime_ns) == 44, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_atime_ns)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_atime_ns) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_atime_ns)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_ctime_ns) == 48, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_ctime_ns)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_ctime_ns) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_ctime_ns)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_padding) == 52, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_padding)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_padding)); + + /* Checks for struct lquota_lvb */ + LASSERTF((int)sizeof(struct lquota_lvb) == 40, "found %lld\n", + (long long)(int)sizeof(struct lquota_lvb)); + LASSERTF((int)offsetof(struct lquota_lvb, lvb_flags) == 0, "found %lld\n", + (long long)(int)offsetof(struct lquota_lvb, lvb_flags)); + LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_flags) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_flags)); + LASSERTF((int)offsetof(struct lquota_lvb, 
lvb_id_may_rel) == 8, "found %lld\n", + (long long)(int)offsetof(struct lquota_lvb, lvb_id_may_rel)); + LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_id_may_rel) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_id_may_rel)); + LASSERTF((int)offsetof(struct lquota_lvb, lvb_id_rel) == 16, "found %lld\n", + (long long)(int)offsetof(struct lquota_lvb, lvb_id_rel)); + LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_id_rel) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_id_rel)); + LASSERTF((int)offsetof(struct lquota_lvb, lvb_id_qunit) == 24, "found %lld\n", + (long long)(int)offsetof(struct lquota_lvb, lvb_id_qunit)); + LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_id_qunit) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_id_qunit)); + LASSERTF((int)offsetof(struct lquota_lvb, lvb_pad1) == 32, "found %lld\n", + (long long)(int)offsetof(struct lquota_lvb, lvb_pad1)); + LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_pad1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_pad1)); + LASSERTF(LQUOTA_FL_EDQUOT == 1, "found %lld\n", + (long long)LQUOTA_FL_EDQUOT); + + /* Checks for struct ldlm_gl_lquota_desc */ + LASSERTF((int)sizeof(struct ldlm_gl_lquota_desc) == 64, "found %lld\n", + (long long)(int)sizeof(struct ldlm_gl_lquota_desc)); + LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_id) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_id)); + LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_id) == 16, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_id)); + LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_flags) == 16, "found %lld\n", + (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_flags)); + LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_flags) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_flags)); + LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_ver) == 24, "found %lld\n", + (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_ver)); + LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_ver) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_ver)); + LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_hardlimit) == 32, "found %lld\n", + (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_hardlimit)); + LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_hardlimit) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_hardlimit)); + LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_softlimit) == 40, "found %lld\n", + (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_softlimit)); + LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_softlimit) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_softlimit)); + LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_time) == 48, "found %lld\n", + (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_time)); + LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_time)); + LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_pad2) == 56, "found %lld\n", + (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_pad2)); + LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_pad2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct 
ldlm_gl_lquota_desc *)0)->gl_pad2)); + + /* Checks for struct mgs_send_param */ + LASSERTF((int)sizeof(struct mgs_send_param) == 1024, "found %lld\n", + (long long)(int)sizeof(struct mgs_send_param)); + CLASSERT(MGS_PARAM_MAXLEN == 1024); + LASSERTF((int)offsetof(struct mgs_send_param, mgs_param[1024]) == 1024, "found %lld\n", + (long long)(int)offsetof(struct mgs_send_param, mgs_param[1024])); + LASSERTF((int)sizeof(((struct mgs_send_param *)0)->mgs_param[1024]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct mgs_send_param *)0)->mgs_param[1024])); + + /* Checks for struct cfg_marker */ + LASSERTF((int)sizeof(struct cfg_marker) == 160, "found %lld\n", + (long long)(int)sizeof(struct cfg_marker)); + LASSERTF((int)offsetof(struct cfg_marker, cm_step) == 0, "found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_step)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_step) == 4, "found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_step)); + LASSERTF((int)offsetof(struct cfg_marker, cm_flags) == 4, "found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_flags)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_flags)); + LASSERTF((int)offsetof(struct cfg_marker, cm_vers) == 8, "found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_vers)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_vers) == 4, "found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_vers)); + LASSERTF((int)offsetof(struct cfg_marker, cm_padding) == 12, "found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_padding)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_padding)); + LASSERTF((int)offsetof(struct cfg_marker, cm_createtime) == 16, "found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_createtime)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_createtime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_createtime)); + LASSERTF((int)offsetof(struct cfg_marker, cm_canceltime) == 24, "found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_canceltime)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_canceltime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_canceltime)); + LASSERTF((int)offsetof(struct cfg_marker, cm_tgtname) == 32, "found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_tgtname)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_tgtname) == 64, "found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_tgtname)); + LASSERTF((int)offsetof(struct cfg_marker, cm_comment) == 96, "found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_comment)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_comment) == 64, "found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_comment)); + + /* Checks for struct llog_logid */ + LASSERTF((int)sizeof(struct llog_logid) == 20, "found %lld\n", + (long long)(int)sizeof(struct llog_logid)); + LASSERTF((int)offsetof(struct llog_logid, lgl_oi) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_logid, lgl_oi)); + LASSERTF((int)sizeof(((struct llog_logid *)0)->lgl_oi) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_logid *)0)->lgl_oi)); + LASSERTF((int)offsetof(struct llog_logid, lgl_ogen) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_logid, 
lgl_ogen)); + LASSERTF((int)sizeof(((struct llog_logid *)0)->lgl_ogen) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_logid *)0)->lgl_ogen)); + CLASSERT(OST_SZ_REC == 274730752); + CLASSERT(MDS_UNLINK_REC == 274801668); + CLASSERT(MDS_UNLINK64_REC == 275325956); + CLASSERT(MDS_SETATTR64_REC == 275325953); + CLASSERT(OBD_CFG_REC == 274857984); + CLASSERT(LLOG_GEN_REC == 274989056); + CLASSERT(CHANGELOG_REC == 275120128); + CLASSERT(CHANGELOG_USER_REC == 275185664); + CLASSERT(LLOG_HDR_MAGIC == 275010873); + CLASSERT(LLOG_LOGID_MAGIC == 275010875); + + /* Checks for struct llog_catid */ + LASSERTF((int)sizeof(struct llog_catid) == 32, "found %lld\n", + (long long)(int)sizeof(struct llog_catid)); + LASSERTF((int)offsetof(struct llog_catid, lci_logid) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_catid, lci_logid)); + LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_logid) == 20, "found %lld\n", + (long long)(int)sizeof(((struct llog_catid *)0)->lci_logid)); + LASSERTF((int)offsetof(struct llog_catid, lci_padding1) == 20, "found %lld\n", + (long long)(int)offsetof(struct llog_catid, lci_padding1)); + LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_padding1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_catid *)0)->lci_padding1)); + LASSERTF((int)offsetof(struct llog_catid, lci_padding2) == 24, "found %lld\n", + (long long)(int)offsetof(struct llog_catid, lci_padding2)); + LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_padding2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_catid *)0)->lci_padding2)); + LASSERTF((int)offsetof(struct llog_catid, lci_padding3) == 28, "found %lld\n", + (long long)(int)offsetof(struct llog_catid, lci_padding3)); + LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_padding3) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_catid *)0)->lci_padding3)); + + /* Checks for struct llog_rec_hdr */ + LASSERTF((int)sizeof(struct llog_rec_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(struct llog_rec_hdr)); + LASSERTF((int)offsetof(struct llog_rec_hdr, lrh_len) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_rec_hdr, lrh_len)); + LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_len) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_len)); + LASSERTF((int)offsetof(struct llog_rec_hdr, lrh_index) == 4, "found %lld\n", + (long long)(int)offsetof(struct llog_rec_hdr, lrh_index)); + LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_index)); + LASSERTF((int)offsetof(struct llog_rec_hdr, lrh_type) == 8, "found %lld\n", + (long long)(int)offsetof(struct llog_rec_hdr, lrh_type)); + LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_type) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_type)); + LASSERTF((int)offsetof(struct llog_rec_hdr, lrh_id) == 12, "found %lld\n", + (long long)(int)offsetof(struct llog_rec_hdr, lrh_id)); + LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_id)); + + /* Checks for struct llog_rec_tail */ + LASSERTF((int)sizeof(struct llog_rec_tail) == 8, "found %lld\n", + (long long)(int)sizeof(struct llog_rec_tail)); + LASSERTF((int)offsetof(struct llog_rec_tail, lrt_len) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_rec_tail, lrt_len)); + LASSERTF((int)sizeof(((struct llog_rec_tail *)0)->lrt_len) == 4, "found 
%lld\n", + (long long)(int)sizeof(((struct llog_rec_tail *)0)->lrt_len)); + LASSERTF((int)offsetof(struct llog_rec_tail, lrt_index) == 4, "found %lld\n", + (long long)(int)offsetof(struct llog_rec_tail, lrt_index)); + LASSERTF((int)sizeof(((struct llog_rec_tail *)0)->lrt_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_rec_tail *)0)->lrt_index)); + + /* Checks for struct llog_logid_rec */ + LASSERTF((int)sizeof(struct llog_logid_rec) == 64, "found %lld\n", + (long long)(int)sizeof(struct llog_logid_rec)); + LASSERTF((int)offsetof(struct llog_logid_rec, lid_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_logid_rec, lid_hdr)); + LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_hdr)); + LASSERTF((int)offsetof(struct llog_logid_rec, lid_id) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_logid_rec, lid_id)); + LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_id) == 20, "found %lld\n", + (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_id)); + LASSERTF((int)offsetof(struct llog_logid_rec, lid_padding1) == 36, "found %lld\n", + (long long)(int)offsetof(struct llog_logid_rec, lid_padding1)); + LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_padding1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_padding1)); + LASSERTF((int)offsetof(struct llog_logid_rec, lid_padding2) == 40, "found %lld\n", + (long long)(int)offsetof(struct llog_logid_rec, lid_padding2)); + LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_padding2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_padding2)); + LASSERTF((int)offsetof(struct llog_logid_rec, lid_padding3) == 48, "found %lld\n", + (long long)(int)offsetof(struct llog_logid_rec, lid_padding3)); + LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_padding3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_padding3)); + LASSERTF((int)offsetof(struct llog_logid_rec, lid_tail) == 56, "found %lld\n", + (long long)(int)offsetof(struct llog_logid_rec, lid_tail)); + LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_tail)); + + /* Checks for struct llog_unlink_rec */ + LASSERTF((int)sizeof(struct llog_unlink_rec) == 40, "found %lld\n", + (long long)(int)sizeof(struct llog_unlink_rec)); + LASSERTF((int)offsetof(struct llog_unlink_rec, lur_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink_rec, lur_hdr)); + LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_hdr)); + LASSERTF((int)offsetof(struct llog_unlink_rec, lur_oid) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink_rec, lur_oid)); + LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_oid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_oid)); + LASSERTF((int)offsetof(struct llog_unlink_rec, lur_oseq) == 24, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink_rec, lur_oseq)); + LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_oseq) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_oseq)); + LASSERTF((int)offsetof(struct llog_unlink_rec, lur_count) == 28, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink_rec, lur_count)); + 
LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_count)); + LASSERTF((int)offsetof(struct llog_unlink_rec, lur_tail) == 32, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink_rec, lur_tail)); + LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_tail)); + /* Checks for struct llog_unlink64_rec */ + LASSERTF((int)sizeof(struct llog_unlink64_rec) == 64, "found %lld\n", + (long long)(int)sizeof(struct llog_unlink64_rec)); + LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink64_rec, lur_hdr)); + LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_hdr)); + LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_fid) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink64_rec, lur_fid)); + LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_fid)); + LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_count) == 32, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink64_rec, lur_count)); + LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_count)); + LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_tail) == 56, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink64_rec, lur_tail)); + LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_tail)); + LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_padding1) == 36, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink64_rec, lur_padding1)); + LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding1)); + LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_padding2) == 40, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink64_rec, lur_padding2)); + LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding2)); + LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_padding3) == 48, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink64_rec, lur_padding3)); + LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding3)); + + /* Checks for struct llog_setattr64_rec */ + LASSERTF((int)sizeof(struct llog_setattr64_rec) == 64, "found %lld\n", + (long long)(int)sizeof(struct llog_setattr64_rec)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_hdr)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_hdr)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_oi) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_oi)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_oi) == 16, "found %lld\n", + (long 
long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_oi)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_uid) == 32, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_uid)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_uid_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_uid_h)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid_h)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_gid) == 40, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_gid)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_gid_h) == 44, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_gid_h)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid_h)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_padding) == 48, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_padding)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_padding) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_padding)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_tail) == 56, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_tail)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_tail)); + + /* Checks for struct llog_size_change_rec */ + LASSERTF((int)sizeof(struct llog_size_change_rec) == 64, "found %lld\n", + (long long)(int)sizeof(struct llog_size_change_rec)); + LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_size_change_rec, lsc_hdr)); + LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_hdr)); + LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_fid) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_size_change_rec, lsc_fid)); + LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_fid)); + LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_ioepoch) == 32, "found %lld\n", + (long long)(int)offsetof(struct llog_size_change_rec, lsc_ioepoch)); + LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_ioepoch) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_ioepoch)); + LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_padding1) == 36, "found %lld\n", + (long long)(int)offsetof(struct llog_size_change_rec, lsc_padding1)); + LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding1)); + LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_padding2) == 40, "found %lld\n", + (long long)(int)offsetof(struct 
llog_size_change_rec, lsc_padding2)); + LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding2)); + LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_padding3) == 48, "found %lld\n", + (long long)(int)offsetof(struct llog_size_change_rec, lsc_padding3)); + LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding3)); + LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_tail) == 56, "found %lld\n", + (long long)(int)offsetof(struct llog_size_change_rec, lsc_tail)); + LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_tail)); + + /* Checks for struct changelog_rec */ + LASSERTF((int)sizeof(struct changelog_rec) == 64, "found %lld\n", + (long long)(int)sizeof(struct changelog_rec)); + LASSERTF((int)offsetof(struct changelog_rec, cr_namelen) == 0, "found %lld\n", + (long long)(int)offsetof(struct changelog_rec, cr_namelen)); + LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_namelen) == 2, "found %lld\n", + (long long)(int)sizeof(((struct changelog_rec *)0)->cr_namelen)); + LASSERTF((int)offsetof(struct changelog_rec, cr_flags) == 2, "found %lld\n", + (long long)(int)offsetof(struct changelog_rec, cr_flags)); + LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_flags) == 2, "found %lld\n", + (long long)(int)sizeof(((struct changelog_rec *)0)->cr_flags)); + LASSERTF((int)offsetof(struct changelog_rec, cr_type) == 4, "found %lld\n", + (long long)(int)offsetof(struct changelog_rec, cr_type)); + LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_type) == 4, "found %lld\n", + (long long)(int)sizeof(((struct changelog_rec *)0)->cr_type)); + LASSERTF((int)offsetof(struct changelog_rec, cr_index) == 8, "found %lld\n", + (long long)(int)offsetof(struct changelog_rec, cr_index)); + LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_index) == 8, "found %lld\n", + (long long)(int)sizeof(((struct changelog_rec *)0)->cr_index)); + LASSERTF((int)offsetof(struct changelog_rec, cr_prev) == 16, "found %lld\n", + (long long)(int)offsetof(struct changelog_rec, cr_prev)); + LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_prev) == 8, "found %lld\n", + (long long)(int)sizeof(((struct changelog_rec *)0)->cr_prev)); + LASSERTF((int)offsetof(struct changelog_rec, cr_time) == 24, "found %lld\n", + (long long)(int)offsetof(struct changelog_rec, cr_time)); + LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct changelog_rec *)0)->cr_time)); + LASSERTF((int)offsetof(struct changelog_rec, cr_tfid) == 32, "found %lld\n", + (long long)(int)offsetof(struct changelog_rec, cr_tfid)); + LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_tfid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct changelog_rec *)0)->cr_tfid)); + LASSERTF((int)offsetof(struct changelog_rec, cr_pfid) == 48, "found %lld\n", + (long long)(int)offsetof(struct changelog_rec, cr_pfid)); + LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_pfid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct changelog_rec *)0)->cr_pfid)); + + /* Checks for struct changelog_ext_rec */ + LASSERTF((int)sizeof(struct changelog_ext_rec) == 96, "found %lld\n", + (long long)(int)sizeof(struct changelog_ext_rec)); + LASSERTF((int)offsetof(struct 
changelog_ext_rec, cr_namelen) == 0, "found %lld\n", + (long long)(int)offsetof(struct changelog_ext_rec, cr_namelen)); + LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_namelen) == 2, "found %lld\n", + (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_namelen)); + LASSERTF((int)offsetof(struct changelog_ext_rec, cr_flags) == 2, "found %lld\n", + (long long)(int)offsetof(struct changelog_ext_rec, cr_flags)); + LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_flags) == 2, "found %lld\n", + (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_flags)); + LASSERTF((int)offsetof(struct changelog_ext_rec, cr_type) == 4, "found %lld\n", + (long long)(int)offsetof(struct changelog_ext_rec, cr_type)); + LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_type) == 4, "found %lld\n", + (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_type)); + LASSERTF((int)offsetof(struct changelog_ext_rec, cr_index) == 8, "found %lld\n", + (long long)(int)offsetof(struct changelog_ext_rec, cr_index)); + LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_index) == 8, "found %lld\n", + (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_index)); + LASSERTF((int)offsetof(struct changelog_ext_rec, cr_prev) == 16, "found %lld\n", + (long long)(int)offsetof(struct changelog_ext_rec, cr_prev)); + LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_prev) == 8, "found %lld\n", + (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_prev)); + LASSERTF((int)offsetof(struct changelog_ext_rec, cr_time) == 24, "found %lld\n", + (long long)(int)offsetof(struct changelog_ext_rec, cr_time)); + LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_time)); + LASSERTF((int)offsetof(struct changelog_ext_rec, cr_tfid) == 32, "found %lld\n", + (long long)(int)offsetof(struct changelog_ext_rec, cr_tfid)); + LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_tfid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_tfid)); + LASSERTF((int)offsetof(struct changelog_ext_rec, cr_pfid) == 48, "found %lld\n", + (long long)(int)offsetof(struct changelog_ext_rec, cr_pfid)); + LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_pfid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_pfid)); + LASSERTF((int)offsetof(struct changelog_ext_rec, cr_sfid) == 64, "found %lld\n", + (long long)(int)offsetof(struct changelog_ext_rec, cr_sfid)); + LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_sfid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_sfid)); + LASSERTF((int)offsetof(struct changelog_ext_rec, cr_spfid) == 80, "found %lld\n", + (long long)(int)offsetof(struct changelog_ext_rec, cr_spfid)); + LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_spfid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_spfid)); + + /* Checks for struct changelog_setinfo */ + LASSERTF((int)sizeof(struct changelog_setinfo) == 12, "found %lld\n", + (long long)(int)sizeof(struct changelog_setinfo)); + LASSERTF((int)offsetof(struct changelog_setinfo, cs_recno) == 0, "found %lld\n", + (long long)(int)offsetof(struct changelog_setinfo, cs_recno)); + LASSERTF((int)sizeof(((struct changelog_setinfo *)0)->cs_recno) == 8, "found %lld\n", + (long long)(int)sizeof(((struct changelog_setinfo *)0)->cs_recno)); + LASSERTF((int)offsetof(struct changelog_setinfo, 
cs_id) == 8, "found %lld\n", + (long long)(int)offsetof(struct changelog_setinfo, cs_id)); + LASSERTF((int)sizeof(((struct changelog_setinfo *)0)->cs_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct changelog_setinfo *)0)->cs_id)); + + /* Checks for struct llog_changelog_rec */ + LASSERTF((int)sizeof(struct llog_changelog_rec) == 88, "found %lld\n", + (long long)(int)sizeof(struct llog_changelog_rec)); + LASSERTF((int)offsetof(struct llog_changelog_rec, cr_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_rec, cr_hdr)); + LASSERTF((int)sizeof(((struct llog_changelog_rec *)0)->cr_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_rec *)0)->cr_hdr)); + LASSERTF((int)offsetof(struct llog_changelog_rec, cr) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_rec, cr)); + LASSERTF((int)sizeof(((struct llog_changelog_rec *)0)->cr) == 64, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_rec *)0)->cr)); + LASSERTF((int)offsetof(struct llog_changelog_rec, cr_tail) == 80, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_rec, cr_tail)); + LASSERTF((int)sizeof(((struct llog_changelog_rec *)0)->cr_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_rec *)0)->cr_tail)); + + /* Checks for struct llog_changelog_user_rec */ + LASSERTF((int)sizeof(struct llog_changelog_user_rec) == 40, "found %lld\n", + (long long)(int)sizeof(struct llog_changelog_user_rec)); + LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_user_rec, cur_hdr)); + LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_hdr)); + LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_id) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_user_rec, cur_id)); + LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_id)); + LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_padding) == 20, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_user_rec, cur_padding)); + LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_padding)); + LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_endrec) == 24, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_user_rec, cur_endrec)); + LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_endrec) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_endrec)); + LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_tail) == 32, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_user_rec, cur_tail)); + LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_tail)); + + /* Checks for struct llog_gen */ + LASSERTF((int)sizeof(struct llog_gen) == 16, "found %lld\n", + (long long)(int)sizeof(struct llog_gen)); + LASSERTF((int)offsetof(struct llog_gen, mnt_cnt) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_gen, mnt_cnt)); + LASSERTF((int)sizeof(((struct llog_gen *)0)->mnt_cnt) == 8, "found %lld\n", + (long long)(int)sizeof(((struct 
llog_gen *)0)->mnt_cnt)); + LASSERTF((int)offsetof(struct llog_gen, conn_cnt) == 8, "found %lld\n", + (long long)(int)offsetof(struct llog_gen, conn_cnt)); + LASSERTF((int)sizeof(((struct llog_gen *)0)->conn_cnt) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_gen *)0)->conn_cnt)); + + /* Checks for struct llog_gen_rec */ + LASSERTF((int)sizeof(struct llog_gen_rec) == 64, "found %lld\n", + (long long)(int)sizeof(struct llog_gen_rec)); + LASSERTF((int)offsetof(struct llog_gen_rec, lgr_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_gen_rec, lgr_hdr)); + LASSERTF((int)sizeof(((struct llog_gen_rec *)0)->lgr_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_gen_rec *)0)->lgr_hdr)); + LASSERTF((int)offsetof(struct llog_gen_rec, lgr_gen) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_gen_rec, lgr_gen)); + LASSERTF((int)sizeof(((struct llog_gen_rec *)0)->lgr_gen) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_gen_rec *)0)->lgr_gen)); + LASSERTF((int)offsetof(struct llog_gen_rec, lgr_tail) == 56, "found %lld\n", + (long long)(int)offsetof(struct llog_gen_rec, lgr_tail)); + LASSERTF((int)sizeof(((struct llog_gen_rec *)0)->lgr_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_gen_rec *)0)->lgr_tail)); + + /* Checks for struct llog_log_hdr */ + LASSERTF((int)sizeof(struct llog_log_hdr) == 8192, "found %lld\n", + (long long)(int)sizeof(struct llog_log_hdr)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_hdr)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_hdr)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_timestamp) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_timestamp)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_timestamp) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_timestamp)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_count) == 24, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_count)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_count)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_bitmap_offset) == 28, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_bitmap_offset)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_bitmap_offset) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_bitmap_offset)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_size) == 32, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_size)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_size)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_flags) == 36, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_flags)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_flags)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_cat_idx) == 40, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_cat_idx)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_cat_idx) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_cat_idx)); + 
LASSERTF((int)offsetof(struct llog_log_hdr, llh_tgtuuid) == 44, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_tgtuuid)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_tgtuuid) == 40, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_tgtuuid)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_reserved) == 84, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_reserved)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_reserved) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_reserved)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_bitmap) == 88, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_bitmap)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_bitmap) == 8096, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_bitmap)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_tail) == 8184, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_tail)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_tail)); + + /* Checks for struct llog_cookie */ + LASSERTF((int)sizeof(struct llog_cookie) == 32, "found %lld\n", + (long long)(int)sizeof(struct llog_cookie)); + LASSERTF((int)offsetof(struct llog_cookie, lgc_lgl) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_cookie, lgc_lgl)); + LASSERTF((int)sizeof(((struct llog_cookie *)0)->lgc_lgl) == 20, "found %lld\n", + (long long)(int)sizeof(((struct llog_cookie *)0)->lgc_lgl)); + LASSERTF((int)offsetof(struct llog_cookie, lgc_subsys) == 20, "found %lld\n", + (long long)(int)offsetof(struct llog_cookie, lgc_subsys)); + LASSERTF((int)sizeof(((struct llog_cookie *)0)->lgc_subsys) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_cookie *)0)->lgc_subsys)); + LASSERTF((int)offsetof(struct llog_cookie, lgc_index) == 24, "found %lld\n", + (long long)(int)offsetof(struct llog_cookie, lgc_index)); + LASSERTF((int)sizeof(((struct llog_cookie *)0)->lgc_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_cookie *)0)->lgc_index)); + LASSERTF((int)offsetof(struct llog_cookie, lgc_padding) == 28, "found %lld\n", + (long long)(int)offsetof(struct llog_cookie, lgc_padding)); + LASSERTF((int)sizeof(((struct llog_cookie *)0)->lgc_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_cookie *)0)->lgc_padding)); + + /* Checks for struct llogd_body */ + LASSERTF((int)sizeof(struct llogd_body) == 48, "found %lld\n", + (long long)(int)sizeof(struct llogd_body)); + LASSERTF((int)offsetof(struct llogd_body, lgd_logid) == 0, "found %lld\n", + (long long)(int)offsetof(struct llogd_body, lgd_logid)); + LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_logid) == 20, "found %lld\n", + (long long)(int)sizeof(((struct llogd_body *)0)->lgd_logid)); + LASSERTF((int)offsetof(struct llogd_body, lgd_ctxt_idx) == 20, "found %lld\n", + (long long)(int)offsetof(struct llogd_body, lgd_ctxt_idx)); + LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_ctxt_idx) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llogd_body *)0)->lgd_ctxt_idx)); + LASSERTF((int)offsetof(struct llogd_body, lgd_llh_flags) == 24, "found %lld\n", + (long long)(int)offsetof(struct llogd_body, lgd_llh_flags)); + LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_llh_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llogd_body *)0)->lgd_llh_flags)); + LASSERTF((int)offsetof(struct 
llogd_body, lgd_index) == 28, "found %lld\n", + (long long)(int)offsetof(struct llogd_body, lgd_index)); + LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llogd_body *)0)->lgd_index)); + LASSERTF((int)offsetof(struct llogd_body, lgd_saved_index) == 32, "found %lld\n", + (long long)(int)offsetof(struct llogd_body, lgd_saved_index)); + LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_saved_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llogd_body *)0)->lgd_saved_index)); + LASSERTF((int)offsetof(struct llogd_body, lgd_len) == 36, "found %lld\n", + (long long)(int)offsetof(struct llogd_body, lgd_len)); + LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_len) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llogd_body *)0)->lgd_len)); + LASSERTF((int)offsetof(struct llogd_body, lgd_cur_offset) == 40, "found %lld\n", + (long long)(int)offsetof(struct llogd_body, lgd_cur_offset)); + LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_cur_offset) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llogd_body *)0)->lgd_cur_offset)); + CLASSERT(LLOG_ORIGIN_HANDLE_CREATE == 501); + CLASSERT(LLOG_ORIGIN_HANDLE_NEXT_BLOCK == 502); + CLASSERT(LLOG_ORIGIN_HANDLE_READ_HEADER == 503); + CLASSERT(LLOG_ORIGIN_HANDLE_WRITE_REC == 504); + CLASSERT(LLOG_ORIGIN_HANDLE_CLOSE == 505); + CLASSERT(LLOG_ORIGIN_CONNECT == 506); + CLASSERT(LLOG_CATINFO == 507); + CLASSERT(LLOG_ORIGIN_HANDLE_PREV_BLOCK == 508); + CLASSERT(LLOG_ORIGIN_HANDLE_DESTROY == 509); + CLASSERT(LLOG_FIRST_OPC == 501); + CLASSERT(LLOG_LAST_OPC == 510); + + /* Checks for struct llogd_conn_body */ + LASSERTF((int)sizeof(struct llogd_conn_body) == 40, "found %lld\n", + (long long)(int)sizeof(struct llogd_conn_body)); + LASSERTF((int)offsetof(struct llogd_conn_body, lgdc_gen) == 0, "found %lld\n", + (long long)(int)offsetof(struct llogd_conn_body, lgdc_gen)); + LASSERTF((int)sizeof(((struct llogd_conn_body *)0)->lgdc_gen) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llogd_conn_body *)0)->lgdc_gen)); + LASSERTF((int)offsetof(struct llogd_conn_body, lgdc_logid) == 16, "found %lld\n", + (long long)(int)offsetof(struct llogd_conn_body, lgdc_logid)); + LASSERTF((int)sizeof(((struct llogd_conn_body *)0)->lgdc_logid) == 20, "found %lld\n", + (long long)(int)sizeof(((struct llogd_conn_body *)0)->lgdc_logid)); + LASSERTF((int)offsetof(struct llogd_conn_body, lgdc_ctxt_idx) == 36, "found %lld\n", + (long long)(int)offsetof(struct llogd_conn_body, lgdc_ctxt_idx)); + LASSERTF((int)sizeof(((struct llogd_conn_body *)0)->lgdc_ctxt_idx) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llogd_conn_body *)0)->lgdc_ctxt_idx)); + + /* Checks for struct ll_fiemap_info_key */ + LASSERTF((int)sizeof(struct ll_fiemap_info_key) == 248, "found %lld\n", + (long long)(int)sizeof(struct ll_fiemap_info_key)); + LASSERTF((int)offsetof(struct ll_fiemap_info_key, name[8]) == 8, "found %lld\n", + (long long)(int)offsetof(struct ll_fiemap_info_key, name[8])); + LASSERTF((int)sizeof(((struct ll_fiemap_info_key *)0)->name[8]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct ll_fiemap_info_key *)0)->name[8])); + LASSERTF((int)offsetof(struct ll_fiemap_info_key, oa) == 8, "found %lld\n", + (long long)(int)offsetof(struct ll_fiemap_info_key, oa)); + LASSERTF((int)sizeof(((struct ll_fiemap_info_key *)0)->oa) == 208, "found %lld\n", + (long long)(int)sizeof(((struct ll_fiemap_info_key *)0)->oa)); + LASSERTF((int)offsetof(struct ll_fiemap_info_key, fiemap) == 216, "found 
%lld\n", + (long long)(int)offsetof(struct ll_fiemap_info_key, fiemap)); + LASSERTF((int)sizeof(((struct ll_fiemap_info_key *)0)->fiemap) == 32, "found %lld\n", + (long long)(int)sizeof(((struct ll_fiemap_info_key *)0)->fiemap)); + + /* Checks for struct quota_body */ + LASSERTF((int)sizeof(struct quota_body) == 112, "found %lld\n", + (long long)(int)sizeof(struct quota_body)); + LASSERTF((int)offsetof(struct quota_body, qb_fid) == 0, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_fid)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_fid)); + LASSERTF((int)offsetof(struct quota_body, qb_id) == 16, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_id)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_id) == 16, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_id)); + LASSERTF((int)offsetof(struct quota_body, qb_flags) == 32, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_flags)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_flags)); + LASSERTF((int)offsetof(struct quota_body, qb_padding) == 36, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_padding)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_padding)); + LASSERTF((int)offsetof(struct quota_body, qb_count) == 40, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_count)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_count) == 8, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_count)); + LASSERTF((int)offsetof(struct quota_body, qb_usage) == 48, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_usage)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_usage) == 8, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_usage)); + LASSERTF((int)offsetof(struct quota_body, qb_slv_ver) == 56, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_slv_ver)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_slv_ver) == 8, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_slv_ver)); + LASSERTF((int)offsetof(struct quota_body, qb_lockh) == 64, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_lockh)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_lockh) == 8, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_lockh)); + LASSERTF((int)offsetof(struct quota_body, qb_glb_lockh) == 72, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_glb_lockh)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_glb_lockh) == 8, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_glb_lockh)); + LASSERTF((int)offsetof(struct quota_body, qb_padding1[4]) == 112, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_padding1[4])); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_padding1[4]) == 8, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_padding1[4])); + + /* Checks for struct mgs_target_info */ + LASSERTF((int)sizeof(struct mgs_target_info) == 4544, "found %lld\n", + (long long)(int)sizeof(struct mgs_target_info)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_lustre_ver) == 0, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_lustre_ver)); + 
LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_lustre_ver) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_lustre_ver)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_stripe_index) == 4, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_stripe_index)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_stripe_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_stripe_index)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_config_ver) == 8, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_config_ver)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_config_ver) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_config_ver)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_flags) == 12, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_flags)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_flags)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_nid_count) == 16, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_nid_count)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_nid_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_nid_count)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_instance) == 20, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_instance)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_instance) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_instance)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_fsname) == 24, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_fsname)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_fsname) == 64, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_fsname)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_svname) == 88, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_svname)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_svname) == 64, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_svname)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_uuid) == 152, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_uuid)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_uuid) == 40, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_uuid)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_nids) == 192, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_nids)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_nids) == 256, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_nids)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_params) == 448, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_params)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_params) == 4096, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_params)); + + /* Checks for struct lustre_capa */ + LASSERTF((int)sizeof(struct lustre_capa) == 120, "found %lld\n", + (long long)(int)sizeof(struct lustre_capa)); + LASSERTF((int)offsetof(struct lustre_capa, lc_fid) == 0, "found %lld\n", + (long long)(int)offsetof(struct 
lustre_capa, lc_fid)); + LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa *)0)->lc_fid)); + LASSERTF((int)offsetof(struct lustre_capa, lc_opc) == 16, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa, lc_opc)); + LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_opc) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa *)0)->lc_opc)); + LASSERTF((int)offsetof(struct lustre_capa, lc_uid) == 24, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa, lc_uid)); + LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_uid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa *)0)->lc_uid)); + LASSERTF((int)offsetof(struct lustre_capa, lc_gid) == 32, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa, lc_gid)); + LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_gid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa *)0)->lc_gid)); + LASSERTF((int)offsetof(struct lustre_capa, lc_flags) == 40, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa, lc_flags)); + LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa *)0)->lc_flags)); + LASSERTF((int)offsetof(struct lustre_capa, lc_keyid) == 44, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa, lc_keyid)); + LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_keyid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa *)0)->lc_keyid)); + LASSERTF((int)offsetof(struct lustre_capa, lc_timeout) == 48, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa, lc_timeout)); + LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_timeout) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa *)0)->lc_timeout)); + LASSERTF((int)offsetof(struct lustre_capa, lc_expiry) == 52, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa, lc_expiry)); + LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_expiry) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa *)0)->lc_expiry)); + CLASSERT(CAPA_HMAC_MAX_LEN == 64); + LASSERTF((int)offsetof(struct lustre_capa, lc_hmac[64]) == 120, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa, lc_hmac[64])); + LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_hmac[64]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa *)0)->lc_hmac[64])); + + /* Checks for struct lustre_capa_key */ + LASSERTF((int)sizeof(struct lustre_capa_key) == 72, "found %lld\n", + (long long)(int)sizeof(struct lustre_capa_key)); + LASSERTF((int)offsetof(struct lustre_capa_key, lk_seq) == 0, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa_key, lk_seq)); + LASSERTF((int)sizeof(((struct lustre_capa_key *)0)->lk_seq) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa_key *)0)->lk_seq)); + LASSERTF((int)offsetof(struct lustre_capa_key, lk_keyid) == 8, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa_key, lk_keyid)); + LASSERTF((int)sizeof(((struct lustre_capa_key *)0)->lk_keyid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa_key *)0)->lk_keyid)); + LASSERTF((int)offsetof(struct lustre_capa_key, lk_padding) == 12, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa_key, lk_padding)); + LASSERTF((int)sizeof(((struct lustre_capa_key *)0)->lk_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa_key *)0)->lk_padding)); + 
CLASSERT(CAPA_HMAC_KEY_MAX_LEN == 56); + LASSERTF((int)offsetof(struct lustre_capa_key, lk_key[56]) == 72, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa_key, lk_key[56])); + LASSERTF((int)sizeof(((struct lustre_capa_key *)0)->lk_key[56]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa_key *)0)->lk_key[56])); + + /* Checks for struct getinfo_fid2path */ + LASSERTF((int)sizeof(struct getinfo_fid2path) == 32, "found %lld\n", + (long long)(int)sizeof(struct getinfo_fid2path)); + LASSERTF((int)offsetof(struct getinfo_fid2path, gf_fid) == 0, "found %lld\n", + (long long)(int)offsetof(struct getinfo_fid2path, gf_fid)); + LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_fid)); + LASSERTF((int)offsetof(struct getinfo_fid2path, gf_recno) == 16, "found %lld\n", + (long long)(int)offsetof(struct getinfo_fid2path, gf_recno)); + LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_recno) == 8, "found %lld\n", + (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_recno)); + LASSERTF((int)offsetof(struct getinfo_fid2path, gf_linkno) == 24, "found %lld\n", + (long long)(int)offsetof(struct getinfo_fid2path, gf_linkno)); + LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_linkno) == 4, "found %lld\n", + (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_linkno)); + LASSERTF((int)offsetof(struct getinfo_fid2path, gf_pathlen) == 28, "found %lld\n", + (long long)(int)offsetof(struct getinfo_fid2path, gf_pathlen)); + LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_pathlen) == 4, "found %lld\n", + (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_pathlen)); + LASSERTF((int)offsetof(struct getinfo_fid2path, gf_path[0]) == 32, "found %lld\n", + (long long)(int)offsetof(struct getinfo_fid2path, gf_path[0])); + LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_path[0]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_path[0])); + + /* Checks for struct ll_user_fiemap */ + LASSERTF((int)sizeof(struct ll_user_fiemap) == 32, "found %lld\n", + (long long)(int)sizeof(struct ll_user_fiemap)); + LASSERTF((int)offsetof(struct ll_user_fiemap, fm_start) == 0, "found %lld\n", + (long long)(int)offsetof(struct ll_user_fiemap, fm_start)); + LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_start)); + LASSERTF((int)offsetof(struct ll_user_fiemap, fm_length) == 8, "found %lld\n", + (long long)(int)offsetof(struct ll_user_fiemap, fm_length)); + LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_length) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_length)); + LASSERTF((int)offsetof(struct ll_user_fiemap, fm_flags) == 16, "found %lld\n", + (long long)(int)offsetof(struct ll_user_fiemap, fm_flags)); + LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_flags)); + LASSERTF((int)offsetof(struct ll_user_fiemap, fm_mapped_extents) == 20, "found %lld\n", + (long long)(int)offsetof(struct ll_user_fiemap, fm_mapped_extents)); + LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_mapped_extents) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_mapped_extents)); + LASSERTF((int)offsetof(struct ll_user_fiemap, fm_extent_count) == 24, "found %lld\n", + (long long)(int)offsetof(struct 
ll_user_fiemap, fm_extent_count)); + LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_extent_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_extent_count)); + LASSERTF((int)offsetof(struct ll_user_fiemap, fm_reserved) == 28, "found %lld\n", + (long long)(int)offsetof(struct ll_user_fiemap, fm_reserved)); + LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_reserved) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_reserved)); + LASSERTF((int)offsetof(struct ll_user_fiemap, fm_extents) == 32, "found %lld\n", + (long long)(int)offsetof(struct ll_user_fiemap, fm_extents)); + LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_extents) == 0, "found %lld\n", + (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_extents)); + CLASSERT(FIEMAP_FLAG_SYNC == 0x00000001); + CLASSERT(FIEMAP_FLAG_XATTR == 0x00000002); + CLASSERT(FIEMAP_FLAG_DEVICE_ORDER == 0x40000000); + + /* Checks for struct ll_fiemap_extent */ + LASSERTF((int)sizeof(struct ll_fiemap_extent) == 56, "found %lld\n", + (long long)(int)sizeof(struct ll_fiemap_extent)); + LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_logical) == 0, "found %lld\n", + (long long)(int)offsetof(struct ll_fiemap_extent, fe_logical)); + LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_logical) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_logical)); + LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_physical) == 8, "found %lld\n", + (long long)(int)offsetof(struct ll_fiemap_extent, fe_physical)); + LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_physical) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_physical)); + LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_length) == 16, "found %lld\n", + (long long)(int)offsetof(struct ll_fiemap_extent, fe_length)); + LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_length) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_length)); + LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_flags) == 40, "found %lld\n", + (long long)(int)offsetof(struct ll_fiemap_extent, fe_flags)); + LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_flags)); + LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_device) == 44, "found %lld\n", + (long long)(int)offsetof(struct ll_fiemap_extent, fe_device)); + LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_device) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_device)); + CLASSERT(FIEMAP_EXTENT_LAST == 0x00000001); + CLASSERT(FIEMAP_EXTENT_UNKNOWN == 0x00000002); + CLASSERT(FIEMAP_EXTENT_DELALLOC == 0x00000004); + CLASSERT(FIEMAP_EXTENT_ENCODED == 0x00000008); + CLASSERT(FIEMAP_EXTENT_DATA_ENCRYPTED == 0x00000080); + CLASSERT(FIEMAP_EXTENT_NOT_ALIGNED == 0x00000100); + CLASSERT(FIEMAP_EXTENT_DATA_INLINE == 0x00000200); + CLASSERT(FIEMAP_EXTENT_DATA_TAIL == 0x00000400); + CLASSERT(FIEMAP_EXTENT_UNWRITTEN == 0x00000800); + CLASSERT(FIEMAP_EXTENT_MERGED == 0x00001000); + CLASSERT(FIEMAP_EXTENT_NO_DIRECT == 0x40000000); + CLASSERT(FIEMAP_EXTENT_NET == 0x80000000); + + /* Checks for type posix_acl_xattr_entry */ + LASSERTF((int)sizeof(posix_acl_xattr_entry) == 8, "found %lld\n", + (long long)(int)sizeof(posix_acl_xattr_entry)); + LASSERTF((int)offsetof(posix_acl_xattr_entry, e_tag) == 0, "found %lld\n", + (long 
long)(int)offsetof(posix_acl_xattr_entry, e_tag));
+	LASSERTF((int)sizeof(((posix_acl_xattr_entry *)0)->e_tag) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((posix_acl_xattr_entry *)0)->e_tag));
+	LASSERTF((int)offsetof(posix_acl_xattr_entry, e_perm) == 2, "found %lld\n",
+		 (long long)(int)offsetof(posix_acl_xattr_entry, e_perm));
+	LASSERTF((int)sizeof(((posix_acl_xattr_entry *)0)->e_perm) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((posix_acl_xattr_entry *)0)->e_perm));
+	LASSERTF((int)offsetof(posix_acl_xattr_entry, e_id) == 4, "found %lld\n",
+		 (long long)(int)offsetof(posix_acl_xattr_entry, e_id));
+	LASSERTF((int)sizeof(((posix_acl_xattr_entry *)0)->e_id) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((posix_acl_xattr_entry *)0)->e_id));
+
+	/* Checks for type posix_acl_xattr_header */
+	LASSERTF((int)sizeof(posix_acl_xattr_header) == 4, "found %lld\n",
+		 (long long)(int)sizeof(posix_acl_xattr_header));
+	LASSERTF((int)offsetof(posix_acl_xattr_header, a_version) == 0, "found %lld\n",
+		 (long long)(int)offsetof(posix_acl_xattr_header, a_version));
+	LASSERTF((int)sizeof(((posix_acl_xattr_header *)0)->a_version) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((posix_acl_xattr_header *)0)->a_version));
+	LASSERTF((int)offsetof(posix_acl_xattr_header, a_entries) == 4, "found %lld\n",
+		 (long long)(int)offsetof(posix_acl_xattr_header, a_entries));
+	LASSERTF((int)sizeof(((posix_acl_xattr_header *)0)->a_entries) == 0, "found %lld\n",
+		 (long long)(int)sizeof(((posix_acl_xattr_header *)0)->a_entries));
+
+	/* Checks for struct link_ea_header */
+	LASSERTF((int)sizeof(struct link_ea_header) == 24, "found %lld\n",
+		 (long long)(int)sizeof(struct link_ea_header));
+	LASSERTF((int)offsetof(struct link_ea_header, leh_magic) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct link_ea_header, leh_magic));
+	LASSERTF((int)sizeof(((struct link_ea_header *)0)->leh_magic) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct link_ea_header *)0)->leh_magic));
+	LASSERTF((int)offsetof(struct link_ea_header, leh_reccount) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct link_ea_header, leh_reccount));
+	LASSERTF((int)sizeof(((struct link_ea_header *)0)->leh_reccount) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct link_ea_header *)0)->leh_reccount));
+	LASSERTF((int)offsetof(struct link_ea_header, leh_len) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct link_ea_header, leh_len));
+	LASSERTF((int)sizeof(((struct link_ea_header *)0)->leh_len) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct link_ea_header *)0)->leh_len));
+	LASSERTF((int)offsetof(struct link_ea_header, padding1) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct link_ea_header, padding1));
+	LASSERTF((int)sizeof(((struct link_ea_header *)0)->padding1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct link_ea_header *)0)->padding1));
+	LASSERTF((int)offsetof(struct link_ea_header, padding2) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct link_ea_header, padding2));
+	LASSERTF((int)sizeof(((struct link_ea_header *)0)->padding2) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct link_ea_header *)0)->padding2));
+	CLASSERT(LINK_EA_MAGIC == 0x11EAF1DFUL);
+
+	/* Checks for struct link_ea_entry */
+	LASSERTF((int)sizeof(struct link_ea_entry) == 18, "found %lld\n",
+		 (long long)(int)sizeof(struct link_ea_entry));
+	LASSERTF((int)offsetof(struct link_ea_entry, lee_reclen) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct link_ea_entry, lee_reclen));
+	LASSERTF((int)sizeof(((struct link_ea_entry *)0)->lee_reclen) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct link_ea_entry *)0)->lee_reclen));
+	LASSERTF((int)offsetof(struct link_ea_entry, lee_parent_fid) == 2, "found %lld\n",
+		 (long long)(int)offsetof(struct link_ea_entry, lee_parent_fid));
+	LASSERTF((int)sizeof(((struct link_ea_entry *)0)->lee_parent_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct link_ea_entry *)0)->lee_parent_fid));
+	LASSERTF((int)offsetof(struct link_ea_entry, lee_name) == 18, "found %lld\n",
+		 (long long)(int)offsetof(struct link_ea_entry, lee_name));
+	LASSERTF((int)sizeof(((struct link_ea_entry *)0)->lee_name) == 0, "found %lld\n",
+		 (long long)(int)sizeof(((struct link_ea_entry *)0)->lee_name));
+
+	/* Checks for struct layout_intent */
+	LASSERTF((int)sizeof(struct layout_intent) == 24, "found %lld\n",
+		 (long long)(int)sizeof(struct layout_intent));
+	LASSERTF((int)offsetof(struct layout_intent, li_opc) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct layout_intent, li_opc));
+	LASSERTF((int)sizeof(((struct layout_intent *)0)->li_opc) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct layout_intent *)0)->li_opc));
+	LASSERTF((int)offsetof(struct layout_intent, li_flags) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct layout_intent, li_flags));
+	LASSERTF((int)sizeof(((struct layout_intent *)0)->li_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct layout_intent *)0)->li_flags));
+	LASSERTF((int)offsetof(struct layout_intent, li_start) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct layout_intent, li_start));
+	LASSERTF((int)sizeof(((struct layout_intent *)0)->li_start) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct layout_intent *)0)->li_start));
+	LASSERTF((int)offsetof(struct layout_intent, li_end) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct layout_intent, li_end));
+	LASSERTF((int)sizeof(((struct layout_intent *)0)->li_end) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct layout_intent *)0)->li_end));
+	LASSERTF(LAYOUT_INTENT_ACCESS == 0, "found %lld\n",
+		 (long long)LAYOUT_INTENT_ACCESS);
+	LASSERTF(LAYOUT_INTENT_READ == 1, "found %lld\n",
+		 (long long)LAYOUT_INTENT_READ);
+	LASSERTF(LAYOUT_INTENT_WRITE == 2, "found %lld\n",
+		 (long long)LAYOUT_INTENT_WRITE);
+	LASSERTF(LAYOUT_INTENT_GLIMPSE == 3, "found %lld\n",
+		 (long long)LAYOUT_INTENT_GLIMPSE);
+	LASSERTF(LAYOUT_INTENT_TRUNC == 4, "found %lld\n",
+		 (long long)LAYOUT_INTENT_TRUNC);
+	LASSERTF(LAYOUT_INTENT_RELEASE == 5, "found %lld\n",
+		 (long long)LAYOUT_INTENT_RELEASE);
+	LASSERTF(LAYOUT_INTENT_RESTORE == 6, "found %lld\n",
+		 (long long)LAYOUT_INTENT_RESTORE);
+
+	/* Checks for struct hsm_action_item */
+	LASSERTF((int)sizeof(struct hsm_action_item) == 72, "found %lld\n",
+		 (long long)(int)sizeof(struct hsm_action_item));
+	LASSERTF((int)offsetof(struct hsm_action_item, hai_len) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_item, hai_len));
+	LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_len) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_len));
+	LASSERTF((int)offsetof(struct hsm_action_item, hai_action) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_item, hai_action));
+	LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_action) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_action));
+	LASSERTF((int)offsetof(struct hsm_action_item, hai_fid) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_item, hai_fid));
+	LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_fid));
+	LASSERTF((int)offsetof(struct hsm_action_item, hai_dfid) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_item, hai_dfid));
+	LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_dfid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_dfid));
+	LASSERTF((int)offsetof(struct hsm_action_item, hai_extent) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_item, hai_extent));
+	LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_extent) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_extent));
+	LASSERTF((int)offsetof(struct hsm_action_item, hai_cookie) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_item, hai_cookie));
+	LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_cookie) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_cookie));
+	LASSERTF((int)offsetof(struct hsm_action_item, hai_gid) == 64, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_item, hai_gid));
+	LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_gid) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_gid));
+	LASSERTF((int)offsetof(struct hsm_action_item, hai_data) == 72, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_item, hai_data));
+	LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_data) == 0, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_data));
+
+	/* Checks for struct hsm_action_list */
+	LASSERTF((int)sizeof(struct hsm_action_list) == 32, "found %lld\n",
+		 (long long)(int)sizeof(struct hsm_action_list));
+	LASSERTF((int)offsetof(struct hsm_action_list, hal_version) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_list, hal_version));
+	LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_version) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_version));
+	LASSERTF((int)offsetof(struct hsm_action_list, hal_count) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_list, hal_count));
+	LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_count) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_count));
+	LASSERTF((int)offsetof(struct hsm_action_list, hal_compound_id) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_list, hal_compound_id));
+	LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_compound_id) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_compound_id));
+	LASSERTF((int)offsetof(struct hsm_action_list, hal_flags) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_list, hal_flags));
+	LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_flags) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_flags));
+	LASSERTF((int)offsetof(struct hsm_action_list, hal_archive_id) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_list, hal_archive_id));
+	LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_archive_id) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_archive_id));
+	LASSERTF((int)offsetof(struct hsm_action_list, padding1) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_list, padding1));
+	LASSERTF((int)sizeof(((struct hsm_action_list *)0)->padding1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_list *)0)->padding1));
+	LASSERTF((int)offsetof(struct hsm_action_list, hal_fsname) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_list, hal_fsname));
+	LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_fsname) == 0, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_fsname));
+
+	/* Checks for struct hsm_progress */
+	LASSERTF((int)sizeof(struct hsm_progress) == 48, "found %lld\n",
+		 (long long)(int)sizeof(struct hsm_progress));
+	LASSERTF((int)offsetof(struct hsm_progress, hp_fid) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress, hp_fid));
+	LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress *)0)->hp_fid));
+	LASSERTF((int)offsetof(struct hsm_progress, hp_cookie) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress, hp_cookie));
+	LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_cookie) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress *)0)->hp_cookie));
+	LASSERTF((int)offsetof(struct hsm_progress, hp_extent) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress, hp_extent));
+	LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_extent) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress *)0)->hp_extent));
+	LASSERTF((int)offsetof(struct hsm_progress, hp_flags) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress, hp_flags));
+	LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_flags) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress *)0)->hp_flags));
+	LASSERTF((int)offsetof(struct hsm_progress, hp_errval) == 42, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress, hp_errval));
+	LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_errval) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress *)0)->hp_errval));
+	LASSERTF((int)offsetof(struct hsm_progress, padding) == 44, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress, padding));
+	LASSERTF((int)sizeof(((struct hsm_progress *)0)->padding) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress *)0)->padding));
+	LASSERTF(HP_FLAG_COMPLETED == 0x01, "found 0x%.8x\n",
+		 HP_FLAG_COMPLETED);
+	LASSERTF(HP_FLAG_RETRY == 0x02, "found 0x%.8x\n",
+		 HP_FLAG_RETRY);
+
+	LASSERTF((int)offsetof(struct hsm_copy, hc_data_version) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_copy, hc_data_version));
+	LASSERTF((int)sizeof(((struct hsm_copy *)0)->hc_data_version) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_copy *)0)->hc_data_version));
+	LASSERTF((int)offsetof(struct hsm_copy, hc_flags) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_copy, hc_flags));
+	LASSERTF((int)sizeof(((struct hsm_copy *)0)->hc_flags) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_copy *)0)->hc_flags));
+	LASSERTF((int)offsetof(struct hsm_copy, hc_errval) == 10, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_copy, hc_errval));
+	LASSERTF((int)sizeof(((struct hsm_copy *)0)->hc_errval) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_copy *)0)->hc_errval));
+	LASSERTF((int)offsetof(struct hsm_copy, padding) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_copy, padding));
+	LASSERTF((int)sizeof(((struct hsm_copy *)0)->padding) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_copy *)0)->padding));
+	LASSERTF((int)offsetof(struct hsm_copy, hc_hai) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_copy, hc_hai));
+	LASSERTF((int)sizeof(((struct hsm_copy *)0)->hc_hai) == 72, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_copy *)0)->hc_hai));
+
+	/* Checks for struct hsm_progress_kernel */
+	LASSERTF((int)sizeof(struct hsm_progress_kernel) == 64, "found %lld\n",
+		 (long long)(int)sizeof(struct hsm_progress_kernel));
+	LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_fid) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress_kernel, hpk_fid));
+	LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_fid));
+	LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_cookie) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress_kernel, hpk_cookie));
+	LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_cookie) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_cookie));
+	LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_extent) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress_kernel, hpk_extent));
+	LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_extent) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_extent));
+	LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_flags) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress_kernel, hpk_flags));
+	LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_flags) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_flags));
+	LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_errval) == 42, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress_kernel, hpk_errval));
+	LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_errval) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_errval));
+	LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_padding1) == 44, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress_kernel, hpk_padding1));
+	LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_padding1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_padding1));
+	LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_data_version) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress_kernel, hpk_data_version));
+	LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_data_version) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_data_version));
+	LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_padding2) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress_kernel, hpk_padding2));
+	LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_padding2) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_padding2));
+
+	/* Checks for struct hsm_user_item */
+	LASSERTF((int)sizeof(struct hsm_user_item) == 32, "found %lld\n",
+		 (long long)(int)sizeof(struct hsm_user_item));
+	LASSERTF((int)offsetof(struct hsm_user_item, hui_fid) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_item, hui_fid));
+	LASSERTF((int)sizeof(((struct hsm_user_item *)0)->hui_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_item *)0)->hui_fid));
+	LASSERTF((int)offsetof(struct hsm_user_item, hui_extent) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_item, hui_extent));
+	LASSERTF((int)sizeof(((struct hsm_user_item *)0)->hui_extent) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_item *)0)->hui_extent));
+
+	/* Checks for struct hsm_user_state */
+	LASSERTF((int)sizeof(struct hsm_user_state) == 32, "found %lld\n",
+		 (long long)(int)sizeof(struct hsm_user_state));
+	LASSERTF((int)offsetof(struct hsm_user_state, hus_states) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_state, hus_states));
+	LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_states) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_states));
+	LASSERTF((int)offsetof(struct hsm_user_state, hus_archive_id) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_state, hus_archive_id));
+	LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_archive_id) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_archive_id));
+	LASSERTF((int)offsetof(struct hsm_user_state, hus_in_progress_state) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_state, hus_in_progress_state));
+	LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_state) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_state));
+	LASSERTF((int)offsetof(struct hsm_user_state, hus_in_progress_action) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_state, hus_in_progress_action));
+	LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_action) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_action));
+	LASSERTF((int)offsetof(struct hsm_user_state, hus_in_progress_location) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_state, hus_in_progress_location));
+	LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_location) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_location));
+
+	/* Checks for struct hsm_state_set */
+	LASSERTF((int)sizeof(struct hsm_state_set) == 24, "found %lld\n",
+		 (long long)(int)sizeof(struct hsm_state_set));
+	LASSERTF((int)offsetof(struct hsm_state_set, hss_valid) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_state_set, hss_valid));
+	LASSERTF((int)sizeof(((struct hsm_state_set *)0)->hss_valid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_state_set *)0)->hss_valid));
+	LASSERTF((int)offsetof(struct hsm_state_set, hss_archive_id) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_state_set, hss_archive_id));
+	LASSERTF((int)sizeof(((struct hsm_state_set *)0)->hss_archive_id) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_state_set *)0)->hss_archive_id));
+	LASSERTF((int)offsetof(struct hsm_state_set, hss_setmask) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_state_set, hss_setmask));
+	LASSERTF((int)sizeof(((struct hsm_state_set *)0)->hss_setmask) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_state_set *)0)->hss_setmask));
+	LASSERTF((int)offsetof(struct hsm_state_set, hss_clearmask) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_state_set, hss_clearmask));
+	LASSERTF((int)sizeof(((struct hsm_state_set *)0)->hss_clearmask) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_state_set *)0)->hss_clearmask));
+
+	/* Checks for struct hsm_current_action */
+	LASSERTF((int)sizeof(struct hsm_current_action) == 24, "found %lld\n",
+		 (long long)(int)sizeof(struct hsm_current_action));
+	LASSERTF((int)offsetof(struct hsm_current_action, hca_state) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_current_action, hca_state));
+	LASSERTF((int)sizeof(((struct hsm_current_action *)0)->hca_state) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_current_action *)0)->hca_state));
+	LASSERTF((int)offsetof(struct hsm_current_action, hca_action) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_current_action, hca_action));
+	LASSERTF((int)sizeof(((struct hsm_current_action *)0)->hca_action) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_current_action *)0)->hca_action));
+	LASSERTF((int)offsetof(struct hsm_current_action, hca_location) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_current_action, hca_location));
+	LASSERTF((int)sizeof(((struct hsm_current_action *)0)->hca_location) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_current_action *)0)->hca_location));
+
+	/* Checks for struct hsm_request */
+	LASSERTF((int)sizeof(struct hsm_request) == 24, "found %lld\n",
+		 (long long)(int)sizeof(struct hsm_request));
+	LASSERTF((int)offsetof(struct hsm_request, hr_action) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_request, hr_action));
+	LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_action) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_request *)0)->hr_action));
+	LASSERTF((int)offsetof(struct hsm_request, hr_archive_id) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_request, hr_archive_id));
+	LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_archive_id) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_request *)0)->hr_archive_id));
+	LASSERTF((int)offsetof(struct hsm_request, hr_flags) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_request, hr_flags));
+	LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_flags) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_request *)0)->hr_flags));
+	LASSERTF((int)offsetof(struct hsm_request, hr_itemcount) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_request, hr_itemcount));
+	LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_itemcount) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_request *)0)->hr_itemcount));
+	LASSERTF((int)offsetof(struct hsm_request, hr_data_len) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_request, hr_data_len));
+	LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_data_len) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_request *)0)->hr_data_len));
+	LASSERTF(HSM_FORCE_ACTION == 0x00000001UL, "found 0x%.8xUL\n",
+		 (unsigned)HSM_FORCE_ACTION);
+	LASSERTF(HSM_GHOST_COPY == 0x00000002UL, "found 0x%.8xUL\n",
+		 (unsigned)HSM_GHOST_COPY);
+
+	/* Checks for struct hsm_user_request */
+	LASSERTF((int)sizeof(struct hsm_user_request) == 24, "found %lld\n",
+		 (long long)(int)sizeof(struct hsm_user_request));
+	LASSERTF((int)offsetof(struct hsm_user_request, hur_request) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_request, hur_request));
+	LASSERTF((int)sizeof(((struct hsm_user_request *)0)->hur_request) == 24, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_request *)0)->hur_request));
+	LASSERTF((int)offsetof(struct hsm_user_request, hur_user_item) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_request, hur_user_item));
+	LASSERTF((int)sizeof(((struct hsm_user_request *)0)->hur_user_item) == 0, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_request *)0)->hur_user_item));
+
+	/* Checks for struct hsm_user_import */
+	LASSERTF(sizeof(struct hsm_user_import) == 48, "found %lld\n",
+		 (long long)sizeof(struct hsm_user_import));
+	LASSERTF(offsetof(struct hsm_user_import, hui_size) == 0,
+		 "found %lld\n",
+		 (long long)offsetof(struct hsm_user_import, hui_size));
+	LASSERTF(sizeof(((struct hsm_user_import *)0)->hui_size) == 8,
+		 "found %lld\n",
+		 (long long)sizeof(((struct hsm_user_import *)0)->hui_size));
+	LASSERTF(offsetof(struct hsm_user_import, hui_uid) == 32,
+		 "found %lld\n",
+		 (long long)offsetof(struct hsm_user_import, hui_uid));
+	LASSERTF(sizeof(((struct hsm_user_import *)0)->hui_uid) == 4,
+		 "found %lld\n",
+		 (long long)sizeof(((struct hsm_user_import *)0)->hui_uid));
+	LASSERTF(offsetof(struct hsm_user_import, hui_gid) == 36,
+		 "found %lld\n",
+		 (long long)offsetof(struct hsm_user_import, hui_gid));
+	LASSERTF(sizeof(((struct hsm_user_import *)0)->hui_gid) == 4,
+		 "found %lld\n",
+		 (long long)sizeof(((struct hsm_user_import *)0)->hui_gid));
+	LASSERTF(offsetof(struct hsm_user_import, hui_mode) == 40,
+		 "found %lld\n",
+		 (long long)offsetof(struct hsm_user_import, hui_mode));
+	LASSERTF(sizeof(((struct hsm_user_import *)0)->hui_mode) == 4,
+		 "found %lld\n",
+		 (long long)sizeof(((struct hsm_user_import *)0)->hui_mode));
+	LASSERTF(offsetof(struct hsm_user_import, hui_atime) == 8,
+		 "found %lld\n",
+		 (long long)offsetof(struct hsm_user_import, hui_atime));
+	LASSERTF(sizeof(((struct hsm_user_import *)0)->hui_atime) == 8,
+		 "found %lld\n",
+		 (long long)sizeof(((struct hsm_user_import *)0)->hui_atime));
+	LASSERTF(offsetof(struct hsm_user_import, hui_atime_ns) == 24,
+		 "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_import, hui_atime_ns));
+	LASSERTF(sizeof(((struct hsm_user_import *)0)->hui_atime_ns) == 4,
+		 "found %lld\n",
+		 (long long)sizeof(((struct hsm_user_import *)0)->hui_atime_ns));
+	LASSERTF(offsetof(struct hsm_user_import, hui_mtime) == 16,
+		 "found %lld\n",
+		 (long long)offsetof(struct hsm_user_import, hui_mtime));
+	LASSERTF(sizeof(((struct hsm_user_import *)0)->hui_mtime) == 8,
+		 "found %lld\n",
+		 (long long)sizeof(((struct hsm_user_import *)0)->hui_mtime));
+	LASSERTF(offsetof(struct hsm_user_import, hui_mtime_ns) == 28,
+		 "found %lld\n",
+		 (long long)offsetof(struct hsm_user_import, hui_mtime_ns));
+	LASSERTF(sizeof(((struct hsm_user_import *)0)->hui_mtime_ns) == 4,
+		 "found %lld\n",
+		 (long long)sizeof(((struct hsm_user_import *)0)->hui_mtime_ns));
+	LASSERTF(offsetof(struct hsm_user_import, hui_archive_id) == 44,
+		 "found %lld\n",
+		 (long long)offsetof(struct hsm_user_import, hui_archive_id));
+	LASSERTF(sizeof(((struct hsm_user_import *)0)->hui_archive_id) == 4,
+		 "found %lld\n",
+		 (long long)sizeof(((struct hsm_user_import *)0)->hui_archive_id));
+
+	/* Checks for struct update_buf */
+	LASSERTF((int)sizeof(struct update_buf) == 8, "found %lld\n",
+		 (long long)(int)sizeof(struct update_buf));
+	LASSERTF((int)offsetof(struct update_buf, ub_magic) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct update_buf, ub_magic));
+	LASSERTF((int)sizeof(((struct update_buf *)0)->ub_magic) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct update_buf *)0)->ub_magic));
+	LASSERTF((int)offsetof(struct update_buf, ub_count) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct update_buf, ub_count));
+	LASSERTF((int)sizeof(((struct update_buf *)0)->ub_count) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct update_buf *)0)->ub_count));
+	LASSERTF((int)offsetof(struct update_buf, ub_bufs) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct update_buf, ub_bufs));
+	LASSERTF((int)sizeof(((struct update_buf *)0)->ub_bufs) == 0, "found %lld\n",
+		 (long long)(int)sizeof(((struct update_buf *)0)->ub_bufs));
+
+	/* Checks for struct update_reply */
+	LASSERTF((int)sizeof(struct update_reply) == 8, "found %lld\n",
+		 (long long)(int)sizeof(struct update_reply));
+	LASSERTF((int)offsetof(struct update_reply, ur_version) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct update_reply, ur_version));
+	LASSERTF((int)sizeof(((struct update_reply *)0)->ur_version) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct update_reply *)0)->ur_version));
+	LASSERTF((int)offsetof(struct update_reply, ur_count) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct update_reply, ur_count));
+	LASSERTF((int)sizeof(((struct update_reply *)0)->ur_count) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct update_reply *)0)->ur_count));
+	LASSERTF((int)offsetof(struct update_reply, ur_lens) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct update_reply, ur_lens));
+	LASSERTF((int)sizeof(((struct update_reply *)0)->ur_lens) == 0, "found %lld\n",
+		 (long long)(int)sizeof(((struct update_reply *)0)->ur_lens));
+
+	/* Checks for struct update */
+	LASSERTF((int)sizeof(struct update) == 56, "found %lld\n",
+		 (long long)(int)sizeof(struct update));
+	LASSERTF((int)offsetof(struct update, u_type) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct update, u_type));
+	LASSERTF((int)sizeof(((struct update *)0)->u_type) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct update *)0)->u_type));
+	LASSERTF((int)offsetof(struct update, u_batchid) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct update, u_batchid));
+	LASSERTF((int)sizeof(((struct update *)0)->u_batchid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct update *)0)->u_batchid));
+	LASSERTF((int)offsetof(struct update, u_fid) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct update, u_fid));
+	LASSERTF((int)sizeof(((struct update *)0)->u_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct update *)0)->u_fid));
+	LASSERTF((int)offsetof(struct update, u_lens) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct update, u_lens));
+	LASSERTF((int)sizeof(((struct update *)0)->u_lens) == 32, "found %lld\n",
+		 (long long)(int)sizeof(((struct update *)0)->u_lens));
+	LASSERTF((int)offsetof(struct update, u_bufs) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct update, u_bufs));
+	LASSERTF((int)sizeof(((struct update *)0)->u_bufs) == 0, "found %lld\n",
+		 (long long)(int)sizeof(((struct update *)0)->u_bufs));
+}

--
cgit 1.2.3-korg
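Note (editorial, not part of the patch): the block above is the tail of Lustre's auto-generated wire-protocol check routine, which pins every on-wire struct to fixed offsetof()/sizeof() values via LASSERTF() so that differently built clients and servers agree on message layout. As a rough, hedged illustration of the same pattern only, a stand-alone sketch in plain C is given below; struct example_wire, its fields, and the CHECK() macro are hypothetical stand-ins and do not appear in the Lustre sources.

/* Illustrative sketch of the offsetof()/sizeof() layout-check pattern. */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical wire struct; fields mimic an hsm_state_set-style layout. */
struct example_wire {
	uint32_t ew_valid;	/* expected at offset 0, size 4 */
	uint32_t ew_archive_id;	/* expected at offset 4, size 4 */
	uint64_t ew_setmask;	/* expected at offset 8, size 8 */
};

/* Simplified stand-in for LASSERTF(): report any layout mismatch. */
#define CHECK(cond, val)						\
	do {								\
		if (!(cond))						\
			printf("layout check failed: found %lld\n",	\
			       (long long)(val));			\
	} while (0)

int main(void)
{
	/* Mirror the style of the generated checks: total size first,
	 * then each member's offset and size against hard-coded values. */
	CHECK((int)sizeof(struct example_wire) == 16,
	      (int)sizeof(struct example_wire));
	CHECK((int)offsetof(struct example_wire, ew_archive_id) == 4,
	      (int)offsetof(struct example_wire, ew_archive_id));
	CHECK((int)offsetof(struct example_wire, ew_setmask) == 8,
	      (int)offsetof(struct example_wire, ew_setmask));
	CHECK((int)sizeof(((struct example_wire *)0)->ew_setmask) == 8,
	      (int)sizeof(((struct example_wire *)0)->ew_setmask));
	return 0;
}

On a typical ABI with natural alignment the checks above pass silently; any compiler or architecture that packs the struct differently would print a diagnostic, which is exactly the failure mode the generated LASSERTF() calls are guarding against.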