26 files changed, 11277 insertions, 0 deletions
diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/Makefile b/kernel/drivers/staging/lustre/lustre/libcfs/Makefile
new file mode 100644
index 000000000..2996a48a3
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lustre/libcfs/Makefile
@@ -0,0 +1,18 @@
+obj-$(CONFIG_LUSTRE_FS) += libcfs.o
+
+libcfs-linux-objs := linux-tracefile.o linux-debug.o
+libcfs-linux-objs += linux-prim.o linux-cpu.o
+libcfs-linux-objs += linux-tcpip.o
+libcfs-linux-objs += linux-curproc.o
+libcfs-linux-objs += linux-module.o
+libcfs-linux-objs += linux-crypto.o
+libcfs-linux-objs += linux-crypto-adler.o
+
+libcfs-linux-objs := $(addprefix linux/,$(libcfs-linux-objs))
+
+libcfs-all-objs := debug.o fail.o nidstrings.o module.o tracefile.o \
+		   libcfs_string.o hash.o kernel_user_comm.o \
+		   prng.o workitem.o libcfs_cpu.o \
+		   libcfs_mem.o libcfs_lock.o
+
+libcfs-objs := $(libcfs-linux-objs) $(libcfs-all-objs)
diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/debug.c b/kernel/drivers/staging/lustre/lustre/libcfs/debug.c
new file mode 100644
index 000000000..021c92fa0
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lustre/libcfs/debug.c
@@ -0,0 +1,460 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/debug.c
+ *
+ * Author: Phil Schwan <phil@clusterfs.com>
+ *
+ */
+
+# define DEBUG_SUBSYSTEM S_LNET
+
+#include "../../include/linux/libcfs/libcfs.h"
+#include "tracefile.h"
+
+static char debug_file_name[1024];
+
+unsigned int libcfs_subsystem_debug = ~0;
+module_param(libcfs_subsystem_debug, int, 0644);
+MODULE_PARM_DESC(libcfs_subsystem_debug, "Lustre kernel debug subsystem mask");
+EXPORT_SYMBOL(libcfs_subsystem_debug);
+
+unsigned int libcfs_debug = (D_CANTMASK |
+			     D_NETERROR | D_HA | D_CONFIG | D_IOCTL);
+module_param(libcfs_debug, int, 0644);
+MODULE_PARM_DESC(libcfs_debug, "Lustre kernel debug mask");
+EXPORT_SYMBOL(libcfs_debug);
+
+static unsigned int libcfs_debug_mb;
+module_param(libcfs_debug_mb, uint, 0644);
+MODULE_PARM_DESC(libcfs_debug_mb, "Total debug buffer size.");
+EXPORT_SYMBOL(libcfs_debug_mb);
+
+unsigned int libcfs_printk = D_CANTMASK;
+module_param(libcfs_printk, uint, 0644);
+MODULE_PARM_DESC(libcfs_printk, "Lustre kernel debug console mask");
+EXPORT_SYMBOL(libcfs_printk);
+
+unsigned int libcfs_console_ratelimit = 1;
+module_param(libcfs_console_ratelimit, uint, 0644);
+MODULE_PARM_DESC(libcfs_console_ratelimit, "Lustre kernel debug console ratelimit (0 to disable)");
+EXPORT_SYMBOL(libcfs_console_ratelimit);
+
+unsigned int libcfs_console_max_delay;
+module_param(libcfs_console_max_delay, uint, 0644);
+MODULE_PARM_DESC(libcfs_console_max_delay, "Lustre kernel debug console max delay (jiffies)");
+EXPORT_SYMBOL(libcfs_console_max_delay);
+
+unsigned int libcfs_console_min_delay;
+module_param(libcfs_console_min_delay, uint, 0644);
+MODULE_PARM_DESC(libcfs_console_min_delay, "Lustre kernel debug console min delay (jiffies)");
+EXPORT_SYMBOL(libcfs_console_min_delay);
+
+unsigned int libcfs_console_backoff = CDEBUG_DEFAULT_BACKOFF;
+module_param(libcfs_console_backoff, uint, 0644);
+MODULE_PARM_DESC(libcfs_console_backoff, "Lustre kernel debug console backoff factor");
+EXPORT_SYMBOL(libcfs_console_backoff);
+
+unsigned int libcfs_debug_binary = 1;
+EXPORT_SYMBOL(libcfs_debug_binary);
+
+unsigned int libcfs_stack = 3 * THREAD_SIZE / 4;
+EXPORT_SYMBOL(libcfs_stack);
+
+static unsigned int portal_enter_debugger;
+EXPORT_SYMBOL(portal_enter_debugger);
+
+unsigned int libcfs_catastrophe;
+EXPORT_SYMBOL(libcfs_catastrophe);
+
+unsigned int libcfs_watchdog_ratelimit = 300;
+EXPORT_SYMBOL(libcfs_watchdog_ratelimit);
+
+unsigned int libcfs_panic_on_lbug = 1;
+module_param(libcfs_panic_on_lbug, uint, 0644);
+MODULE_PARM_DESC(libcfs_panic_on_lbug, "Lustre kernel panic on LBUG");
+EXPORT_SYMBOL(libcfs_panic_on_lbug);
+
+atomic_t libcfs_kmemory = ATOMIC_INIT(0);
+EXPORT_SYMBOL(libcfs_kmemory);
+
+static wait_queue_head_t debug_ctlwq;
+
+char libcfs_debug_file_path_arr[PATH_MAX] = LIBCFS_DEBUG_FILE_PATH_DEFAULT;
+
+/* We need to pass a pointer here, but elsewhere this must be a const */
+static char *libcfs_debug_file_path;
+module_param(libcfs_debug_file_path, charp, 0644);
+MODULE_PARM_DESC(libcfs_debug_file_path,
+		 "Path for dumping debug logs, set 'NONE' to prevent log dumping");
+
+int libcfs_panic_in_progress;
+
+/* libcfs_debug_token2mask() expects the returned
+ * string in lower-case */
+static const char *
+libcfs_debug_subsys2str(int subsys)
+{
+	switch (1 << subsys) {
+	default:
+		return NULL;
+	case S_UNDEFINED:
+		return "undefined";
+	case S_MDC:
+		return "mdc";
+	case S_MDS:
+		return "mds";
+	case S_OSC:
+		return "osc";
+	case S_OST:
+		return "ost";
+	case S_CLASS:
+		return "class";
+	case S_LOG:
+		return "log";
+	case S_LLITE:
+		return "llite";
+	case S_RPC:
+		return "rpc";
+	case S_LNET:
+		return "lnet";
+	case S_LND:
+		return "lnd";
+	case S_PINGER:
+		return "pinger";
+	case S_FILTER:
+		return "filter";
+	case S_ECHO:
+		return "echo";
+	case S_LDLM:
+		return "ldlm";
+	case S_LOV:
+		return "lov";
+	case S_LQUOTA:
+		return "lquota";
+	case S_OSD:
+		return "osd";
+	case S_LMV:
+		return "lmv";
+	case S_SEC:
+		return "sec";
+	case S_GSS:
+		return "gss";
+	case S_MGC:
+		return "mgc";
+	case S_MGS:
+		return "mgs";
+	case S_FID:
+		return "fid";
+	case S_FLD:
+		return "fld";
+	}
+}
+
+/* libcfs_debug_token2mask() expects the returned
+ * string in lower-case */
+static const char *
+libcfs_debug_dbg2str(int debug)
+{
+	switch (1 << debug) {
+	default:
+		return NULL;
+	case D_TRACE:
+		return "trace";
+	case D_INODE:
+		return "inode";
+	case D_SUPER:
+		return "super";
+	case D_EXT2:
+		return "ext2";
+	case D_MALLOC:
+		return "malloc";
+	case D_CACHE:
+		return "cache";
+	case D_INFO:
+		return "info";
+	case D_IOCTL:
+		return "ioctl";
+	case D_NETERROR:
+		return "neterror";
+	case D_NET:
+		return "net";
+	case D_WARNING:
+		return "warning";
+	case D_BUFFS:
+		return "buffs";
+	case D_OTHER:
+		return "other";
+	case D_DENTRY:
+		return "dentry";
+	case D_NETTRACE:
+		return "nettrace";
+	case D_PAGE:
+		return "page";
+	case D_DLMTRACE:
+		return "dlmtrace";
+	case D_ERROR:
+		return "error";
+	case D_EMERG:
+		return "emerg";
+	case D_HA:
+		return "ha";
+	case D_RPCTRACE:
+		return "rpctrace";
+	case D_VFSTRACE:
+		return "vfstrace";
+	case D_READA:
+		return "reada";
+	case D_MMAP:
+		return "mmap";
+	case D_CONFIG:
+		return "config";
+	case D_CONSOLE:
+		return "console";
+	case D_QUOTA:
+		return "quota";
+	case D_SEC:
+		return "sec";
+	case D_LFSCK:
+		return "lfsck";
+	}
+}
+
+int
+libcfs_debug_mask2str(char *str, int size, int mask, int is_subsys)
+{
+	const char *(*fn)(int bit) = is_subsys ? libcfs_debug_subsys2str :
+						 libcfs_debug_dbg2str;
+	int	   len = 0;
+	const char   *token;
+	int	   i;
+
+	if (mask == 0) {			/* "0" */
+		if (size > 0)
+			str[0] = '0';
+		len = 1;
+	} else {				/* space-separated tokens */
+		for (i = 0; i < 32; i++) {
+			if ((mask & (1 << i)) == 0)
+				continue;
+
+			token = fn(i);
+			if (token == NULL)	      /* unused bit */
+				continue;
+
+			if (len > 0) {		  /* separator? */
+				if (len < size)
+					str[len] = ' ';
+				len++;
+			}
+
+			while (*token != 0) {
+				if (len < size)
+					str[len] = *token;
+				token++;
+				len++;
+			}
+		}
+	}
+
+	/* terminate 'str' */
+	if (len < size)
+		str[len] = 0;
+	else
+		str[size - 1] = 0;
+
+	return len;
+}
+
+int
+libcfs_debug_str2mask(int *mask, const char *str, int is_subsys)
+{
+	const char *(*fn)(int bit) = is_subsys ? libcfs_debug_subsys2str :
+						 libcfs_debug_dbg2str;
+	int	 m = 0;
+	int	 matched;
+	int	 n;
+	int	 t;
+
+	/* Allow a number for backwards compatibility */
+
+	for (n = strlen(str); n > 0; n--)
+		if (!isspace(str[n-1]))
+			break;
+	matched = n;
+	t = sscanf(str, "%i%n", &m, &matched);
+	if (t >= 1 && matched == n) {
+		/* don't print warning for lctl set_param debug=0 or -1 */
+		if (m != 0 && m != -1)
+			CWARN("You are trying to use a numerical value for the mask - this will be deprecated in a future release.\n");
+		*mask = m;
+		return 0;
+	}
+
+	return cfs_str2mask(str, fn, mask, is_subsys ? 0 : D_CANTMASK,
+			    0xffffffff);
+}
+
+/**
+ * Dump Lustre log to ::debug_file_path by calling tracefile_dump_all_pages()
+ */
+void libcfs_debug_dumplog_internal(void *arg)
+{
+	void *journal_info;
+
+	journal_info = current->journal_info;
+	current->journal_info = NULL;
+
+	if (strncmp(libcfs_debug_file_path_arr, "NONE", 4) != 0) {
+		snprintf(debug_file_name, sizeof(debug_file_name) - 1,
+			 "%s.%ld.%ld", libcfs_debug_file_path_arr,
+			 get_seconds(), (long_ptr_t)arg);
+		pr_alert("LustreError: dumping log to %s\n",
+		       debug_file_name);
+		cfs_tracefile_dump_all_pages(debug_file_name);
+		libcfs_run_debug_log_upcall(debug_file_name);
+	}
+
+	current->journal_info = journal_info;
+}
+
+static int libcfs_debug_dumplog_thread(void *arg)
+{
+	libcfs_debug_dumplog_internal(arg);
+	wake_up(&debug_ctlwq);
+	return 0;
+}
+
+void libcfs_debug_dumplog(void)
+{
+	wait_queue_t wait;
+	struct task_struct *dumper;
+
+	/* we're being careful to ensure that the kernel thread is
+	 * able to set our state to running as it exits before we
+	 * get to schedule() */
+	init_waitqueue_entry(&wait, current);
+	set_current_state(TASK_INTERRUPTIBLE);
+	add_wait_queue(&debug_ctlwq, &wait);
+
+	dumper = kthread_run(libcfs_debug_dumplog_thread,
+			     (void *)(long)current_pid(),
+			     "libcfs_debug_dumper");
+	if (IS_ERR(dumper))
+		pr_err("LustreError: cannot start log dump thread: %ld\n",
+		       PTR_ERR(dumper));
+	else
+		schedule();
+
+	/* be sure to teardown if cfs_create_thread() failed */
+	remove_wait_queue(&debug_ctlwq, &wait);
+	set_current_state(TASK_RUNNING);
+}
+EXPORT_SYMBOL(libcfs_debug_dumplog);
+
+int libcfs_debug_init(unsigned long bufsize)
+{
+	int    rc = 0;
+	unsigned int max = libcfs_debug_mb;
+
+	init_waitqueue_head(&debug_ctlwq);
+
+	if (libcfs_console_max_delay <= 0 || /* not set by user or */
+	    libcfs_console_min_delay <= 0 || /* set to invalid values */
+	    libcfs_console_min_delay >= libcfs_console_max_delay) {
+		libcfs_console_max_delay = CDEBUG_DEFAULT_MAX_DELAY;
+		libcfs_console_min_delay = CDEBUG_DEFAULT_MIN_DELAY;
+	}
+
+	if (libcfs_debug_file_path != NULL) {
+		strncpy(libcfs_debug_file_path_arr,
+			libcfs_debug_file_path, PATH_MAX-1);
+		libcfs_debug_file_path_arr[PATH_MAX - 1] = '\0';
+	}
+
+	/* If libcfs_debug_mb is set to an invalid value or uninitialized
+	 * then just make the total buffers smp_num_cpus * TCD_MAX_PAGES */
+	if (max > cfs_trace_max_debug_mb() || max < num_possible_cpus()) {
+		max = TCD_MAX_PAGES;
+	} else {
+		max = max / num_possible_cpus();
+		max <<= (20 - PAGE_CACHE_SHIFT);
+	}
+	rc = cfs_tracefile_init(max);
+
+	if (rc == 0)
+		libcfs_register_panic_notifier();
+
+	return rc;
+}
+
+int libcfs_debug_cleanup(void)
+{
+	libcfs_unregister_panic_notifier();
+	cfs_tracefile_exit();
+	return 0;
+}
+
+int libcfs_debug_clear_buffer(void)
+{
+	cfs_trace_flush_pages();
+	return 0;
+}
+
+/* Debug markers, although printed by S_LNET
+ * should not be be marked as such. */
+#undef DEBUG_SUBSYSTEM
+#define DEBUG_SUBSYSTEM S_UNDEFINED
+int libcfs_debug_mark_buffer(const char *text)
+{
+	CDEBUG(D_TRACE,
+	       "***************************************************\n");
+	LCONSOLE(D_WARNING, "DEBUG MARKER: %s\n", text);
+	CDEBUG(D_TRACE,
+	       "***************************************************\n");
+
+	return 0;
+}
+#undef DEBUG_SUBSYSTEM
+#define DEBUG_SUBSYSTEM S_LNET
+
+void libcfs_debug_set_level(unsigned int debug_level)
+{
+	pr_warn("Lustre: Setting portals debug level to %08x\n",
+	       debug_level);
+	libcfs_debug = debug_level;
+}
+
+EXPORT_SYMBOL(libcfs_debug_set_level);
diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/fail.c b/kernel/drivers/staging/lustre/lustre/libcfs/fail.c
new file mode 100644
index 000000000..92444b0fe
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lustre/libcfs/fail.c
@@ -0,0 +1,138 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please contact Oracle Corporation, Inc., 500 Oracle Parkway, Redwood Shores,
+ * CA 94065 USA or visit www.oracle.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Oracle Corporation, Inc.
+ */
+
+#include "../../include/linux/libcfs/libcfs.h"
+
+unsigned long cfs_fail_loc = 0;
+EXPORT_SYMBOL(cfs_fail_loc);
+
+unsigned int cfs_fail_val = 0;
+EXPORT_SYMBOL(cfs_fail_val);
+
+wait_queue_head_t cfs_race_waitq;
+EXPORT_SYMBOL(cfs_race_waitq);
+
+int cfs_race_state;
+EXPORT_SYMBOL(cfs_race_state);
+
+int __cfs_fail_check_set(__u32 id, __u32 value, int set)
+{
+	static atomic_t cfs_fail_count = ATOMIC_INIT(0);
+
+	LASSERT(!(id & CFS_FAIL_ONCE));
+
+	if ((cfs_fail_loc & (CFS_FAILED | CFS_FAIL_ONCE)) ==
+	    (CFS_FAILED | CFS_FAIL_ONCE)) {
+		atomic_set(&cfs_fail_count, 0); /* paranoia */
+		return 0;
+	}
+
+	/* Fail 1/cfs_fail_val times */
+	if (cfs_fail_loc & CFS_FAIL_RAND) {
+		if (cfs_fail_val < 2 || cfs_rand() % cfs_fail_val > 0)
+			return 0;
+	}
+
+	/* Skip the first cfs_fail_val, then fail */
+	if (cfs_fail_loc & CFS_FAIL_SKIP) {
+		if (atomic_inc_return(&cfs_fail_count) <= cfs_fail_val)
+			return 0;
+	}
+
+	/* check cfs_fail_val... */
+	if (set == CFS_FAIL_LOC_VALUE) {
+		if (cfs_fail_val != -1 && cfs_fail_val != value)
+			return 0;
+	}
+
+	/* Fail cfs_fail_val times, overridden by FAIL_ONCE */
+	if (cfs_fail_loc & CFS_FAIL_SOME &&
+	    (!(cfs_fail_loc & CFS_FAIL_ONCE) || cfs_fail_val <= 1)) {
+		int count = atomic_inc_return(&cfs_fail_count);
+
+		if (count >= cfs_fail_val) {
+			set_bit(CFS_FAIL_ONCE_BIT, &cfs_fail_loc);
+			atomic_set(&cfs_fail_count, 0);
+			/* we are lost race to increase  */
+			if (count > cfs_fail_val)
+				return 0;
+		}
+	}
+
+	if ((set == CFS_FAIL_LOC_ORSET || set == CFS_FAIL_LOC_RESET) &&
+	    (value & CFS_FAIL_ONCE))
+		set_bit(CFS_FAIL_ONCE_BIT, &cfs_fail_loc);
+	/* Lost race to set CFS_FAILED_BIT. */
+	if (test_and_set_bit(CFS_FAILED_BIT, &cfs_fail_loc)) {
+		/* If CFS_FAIL_ONCE is valid, only one process can fail,
+		 * otherwise multi-process can fail at the same time. */
+		if (cfs_fail_loc & CFS_FAIL_ONCE)
+			return 0;
+	}
+
+	switch (set) {
+	case CFS_FAIL_LOC_NOSET:
+	case CFS_FAIL_LOC_VALUE:
+		break;
+	case CFS_FAIL_LOC_ORSET:
+		cfs_fail_loc |= value & ~(CFS_FAILED | CFS_FAIL_ONCE);
+		break;
+	case CFS_FAIL_LOC_RESET:
+		cfs_fail_loc = value;
+		break;
+	default:
+		LASSERTF(0, "called with bad set %u\n", set);
+		break;
+	}
+
+	return 1;
+}
+EXPORT_SYMBOL(__cfs_fail_check_set);
+
+int __cfs_fail_timeout_set(__u32 id, __u32 value, int ms, int set)
+{
+	int ret = 0;
+
+	ret = __cfs_fail_check_set(id, value, set);
+	if (ret) {
+		CERROR("cfs_fail_timeout id %x sleeping for %dms\n",
+		       id, ms);
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		schedule_timeout(cfs_time_seconds(ms) / 1000);
+		CERROR("cfs_fail_timeout id %x awake\n", id);
+	}
+	return ret;
+}
+EXPORT_SYMBOL(__cfs_fail_timeout_set);
diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/hash.c b/kernel/drivers/staging/lustre/lustre/libcfs/hash.c
new file mode 100644
index 000000000..a55567e0d
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lustre/libcfs/hash.c
@@ -0,0 +1,2098 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/hash.c
+ *
+ * Implement a hash class for hash process in lustre system.
+ *
+ * Author: YuZhangyong <yzy@clusterfs.com>
+ *
+ * 2008-08-15: Brian Behlendorf <behlendorf1@llnl.gov>
+ * - Simplified API and improved documentation
+ * - Added per-hash feature flags:
+ *   * CFS_HASH_DEBUG additional validation
+ *   * CFS_HASH_REHASH dynamic rehashing
+ * - Added per-hash statistics
+ * - General performance enhancements
+ *
+ * 2009-07-31: Liang Zhen <zhen.liang@sun.com>
+ * - move all stuff to libcfs
+ * - don't allow cur_bits != max_bits without setting of CFS_HASH_REHASH
+ * - ignore hs_rwlock if without CFS_HASH_REHASH setting
+ * - buckets are allocated one by one(instead of contiguous memory),
+ *   to avoid unnecessary cacheline conflict
+ *
+ * 2010-03-01: Liang Zhen <zhen.liang@sun.com>
+ * - "bucket" is a group of hlist_head now, user can specify bucket size
+ *   by bkt_bits of cfs_hash_create(), all hlist_heads in a bucket share
+ *   one lock for reducing memory overhead.
+ *
+ * - support lockless hash, caller will take care of locks:
+ *   avoid lock overhead for hash tables that are already protected
+ *   by locking in the caller for another reason
+ *
+ * - support both spin_lock/rwlock for bucket:
+ *   overhead of spinlock contention is lower than read/write
+ *   contention of rwlock, so using spinlock to serialize operations on
+ *   bucket is more reasonable for those frequently changed hash tables
+ *
+ * - support one-single lock mode:
+ *   one lock to protect all hash operations to avoid overhead of
+ *   multiple locks if hash table is always small
+ *
+ * - removed a lot of unnecessary addref & decref on hash element:
+ *   addref & decref are atomic operations in many use-cases which
+ *   are expensive.
+ *
+ * - support non-blocking cfs_hash_add() and cfs_hash_findadd():
+ *   some lustre use-cases require these functions to be strictly
+ *   non-blocking, we need to schedule required rehash on a different
+ *   thread on those cases.
+ *
+ * - safer rehash on large hash table
+ *   In old implementation, rehash function will exclusively lock the
+ *   hash table and finish rehash in one batch, it's dangerous on SMP
+ *   system because rehash millions of elements could take long time.
+ *   New implemented rehash can release lock and relax CPU in middle
+ *   of rehash, it's safe for another thread to search/change on the
+ *   hash table even it's in rehasing.
+ *
+ * - support two different refcount modes
+ *   . hash table has refcount on element
+ *   . hash table doesn't change refcount on adding/removing element
+ *
+ * - support long name hash table (for param-tree)
+ *
+ * - fix a bug for cfs_hash_rehash_key:
+ *   in old implementation, cfs_hash_rehash_key could screw up the
+ *   hash-table because @key is overwritten without any protection.
+ *   Now we need user to define hs_keycpy for those rehash enabled
+ *   hash tables, cfs_hash_rehash_key will overwrite hash-key
+ *   inside lock by calling hs_keycpy.
+ *
+ * - better hash iteration:
+ *   Now we support both locked iteration & lockless iteration of hash
+ *   table. Also, user can break the iteration by return 1 in callback.
+ */
+
+#include "../../include/linux/libcfs/libcfs.h"
+#include <linux/seq_file.h>
+
+#if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1
+static unsigned int warn_on_depth = 8;
+module_param(warn_on_depth, uint, 0644);
+MODULE_PARM_DESC(warn_on_depth, "warning when hash depth is high.");
+#endif
+
+struct cfs_wi_sched *cfs_sched_rehash;
+
+static inline void
+cfs_hash_nl_lock(union cfs_hash_lock *lock, int exclusive) {}
+
+static inline void
+cfs_hash_nl_unlock(union cfs_hash_lock *lock, int exclusive) {}
+
+static inline void
+cfs_hash_spin_lock(union cfs_hash_lock *lock, int exclusive)
+	__acquires(&lock->spin)
+{
+	spin_lock(&lock->spin);
+}
+
+static inline void
+cfs_hash_spin_unlock(union cfs_hash_lock *lock, int exclusive)
+	__releases(&lock->spin)
+{
+	spin_unlock(&lock->spin);
+}
+
+static inline void
+cfs_hash_rw_lock(union cfs_hash_lock *lock, int exclusive)
+	__acquires(&lock->rw)
+{
+	if (!exclusive)
+		read_lock(&lock->rw);
+	else
+		write_lock(&lock->rw);
+}
+
+static inline void
+cfs_hash_rw_unlock(union cfs_hash_lock *lock, int exclusive)
+	__releases(&lock->rw)
+{
+	if (!exclusive)
+		read_unlock(&lock->rw);
+	else
+		write_unlock(&lock->rw);
+}
+
+/** No lock hash */
+static cfs_hash_lock_ops_t cfs_hash_nl_lops = {
+	.hs_lock	= cfs_hash_nl_lock,
+	.hs_unlock      = cfs_hash_nl_unlock,
+	.hs_bkt_lock    = cfs_hash_nl_lock,
+	.hs_bkt_unlock  = cfs_hash_nl_unlock,
+};
+
+/** no bucket lock, one spinlock to protect everything */
+static cfs_hash_lock_ops_t cfs_hash_nbl_lops = {
+	.hs_lock	= cfs_hash_spin_lock,
+	.hs_unlock      = cfs_hash_spin_unlock,
+	.hs_bkt_lock    = cfs_hash_nl_lock,
+	.hs_bkt_unlock  = cfs_hash_nl_unlock,
+};
+
+/** spin bucket lock, rehash is enabled */
+static cfs_hash_lock_ops_t cfs_hash_bkt_spin_lops = {
+	.hs_lock	= cfs_hash_rw_lock,
+	.hs_unlock      = cfs_hash_rw_unlock,
+	.hs_bkt_lock    = cfs_hash_spin_lock,
+	.hs_bkt_unlock  = cfs_hash_spin_unlock,
+};
+
+/** rw bucket lock, rehash is enabled */
+static cfs_hash_lock_ops_t cfs_hash_bkt_rw_lops = {
+	.hs_lock	= cfs_hash_rw_lock,
+	.hs_unlock      = cfs_hash_rw_unlock,
+	.hs_bkt_lock    = cfs_hash_rw_lock,
+	.hs_bkt_unlock  = cfs_hash_rw_unlock,
+};
+
+/** spin bucket lock, rehash is disabled */
+static cfs_hash_lock_ops_t cfs_hash_nr_bkt_spin_lops = {
+	.hs_lock	= cfs_hash_nl_lock,
+	.hs_unlock      = cfs_hash_nl_unlock,
+	.hs_bkt_lock    = cfs_hash_spin_lock,
+	.hs_bkt_unlock  = cfs_hash_spin_unlock,
+};
+
+/** rw bucket lock, rehash is disabled */
+static cfs_hash_lock_ops_t cfs_hash_nr_bkt_rw_lops = {
+	.hs_lock	= cfs_hash_nl_lock,
+	.hs_unlock      = cfs_hash_nl_unlock,
+	.hs_bkt_lock    = cfs_hash_rw_lock,
+	.hs_bkt_unlock  = cfs_hash_rw_unlock,
+};
+
+static void
+cfs_hash_lock_setup(struct cfs_hash *hs)
+{
+	if (cfs_hash_with_no_lock(hs)) {
+		hs->hs_lops = &cfs_hash_nl_lops;
+
+	} else if (cfs_hash_with_no_bktlock(hs)) {
+		hs->hs_lops = &cfs_hash_nbl_lops;
+		spin_lock_init(&hs->hs_lock.spin);
+
+	} else if (cfs_hash_with_rehash(hs)) {
+		rwlock_init(&hs->hs_lock.rw);
+
+		if (cfs_hash_with_rw_bktlock(hs))
+			hs->hs_lops = &cfs_hash_bkt_rw_lops;
+		else if (cfs_hash_with_spin_bktlock(hs))
+			hs->hs_lops = &cfs_hash_bkt_spin_lops;
+		else
+			LBUG();
+	} else {
+		if (cfs_hash_with_rw_bktlock(hs))
+			hs->hs_lops = &cfs_hash_nr_bkt_rw_lops;
+		else if (cfs_hash_with_spin_bktlock(hs))
+			hs->hs_lops = &cfs_hash_nr_bkt_spin_lops;
+		else
+			LBUG();
+	}
+}
+
+/**
+ * Simple hash head without depth tracking
+ * new element is always added to head of hlist
+ */
+typedef struct {
+	struct hlist_head	hh_head;	/**< entries list */
+} cfs_hash_head_t;
+
+static int
+cfs_hash_hh_hhead_size(struct cfs_hash *hs)
+{
+	return sizeof(cfs_hash_head_t);
+}
+
+static struct hlist_head *
+cfs_hash_hh_hhead(struct cfs_hash *hs, struct cfs_hash_bd *bd)
+{
+	cfs_hash_head_t *head = (cfs_hash_head_t *)&bd->bd_bucket->hsb_head[0];
+
+	return &head[bd->bd_offset].hh_head;
+}
+
+static int
+cfs_hash_hh_hnode_add(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+		      struct hlist_node *hnode)
+{
+	hlist_add_head(hnode, cfs_hash_hh_hhead(hs, bd));
+	return -1; /* unknown depth */
+}
+
+static int
+cfs_hash_hh_hnode_del(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+		      struct hlist_node *hnode)
+{
+	hlist_del_init(hnode);
+	return -1; /* unknown depth */
+}
+
+/**
+ * Simple hash head with depth tracking
+ * new element is always added to head of hlist
+ */
+typedef struct {
+	struct hlist_head	hd_head;	/**< entries list */
+	unsigned int	    hd_depth;       /**< list length */
+} cfs_hash_head_dep_t;
+
+static int
+cfs_hash_hd_hhead_size(struct cfs_hash *hs)
+{
+	return sizeof(cfs_hash_head_dep_t);
+}
+
+static struct hlist_head *
+cfs_hash_hd_hhead(struct cfs_hash *hs, struct cfs_hash_bd *bd)
+{
+	cfs_hash_head_dep_t   *head;
+
+	head = (cfs_hash_head_dep_t *)&bd->bd_bucket->hsb_head[0];
+	return &head[bd->bd_offset].hd_head;
+}
+
+static int
+cfs_hash_hd_hnode_add(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+		      struct hlist_node *hnode)
+{
+	cfs_hash_head_dep_t *hh = container_of(cfs_hash_hd_hhead(hs, bd),
+					       cfs_hash_head_dep_t, hd_head);
+	hlist_add_head(hnode, &hh->hd_head);
+	return ++hh->hd_depth;
+}
+
+static int
+cfs_hash_hd_hnode_del(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+		      struct hlist_node *hnode)
+{
+	cfs_hash_head_dep_t *hh = container_of(cfs_hash_hd_hhead(hs, bd),
+					       cfs_hash_head_dep_t, hd_head);
+	hlist_del_init(hnode);
+	return --hh->hd_depth;
+}
+
+/**
+ * double links hash head without depth tracking
+ * new element is always added to tail of hlist
+ */
+typedef struct {
+	struct hlist_head	dh_head;	/**< entries list */
+	struct hlist_node       *dh_tail;	/**< the last entry */
+} cfs_hash_dhead_t;
+
+static int
+cfs_hash_dh_hhead_size(struct cfs_hash *hs)
+{
+	return sizeof(cfs_hash_dhead_t);
+}
+
+static struct hlist_head *
+cfs_hash_dh_hhead(struct cfs_hash *hs, struct cfs_hash_bd *bd)
+{
+	cfs_hash_dhead_t *head;
+
+	head = (cfs_hash_dhead_t *)&bd->bd_bucket->hsb_head[0];
+	return &head[bd->bd_offset].dh_head;
+}
+
+static int
+cfs_hash_dh_hnode_add(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+		      struct hlist_node *hnode)
+{
+	cfs_hash_dhead_t *dh = container_of(cfs_hash_dh_hhead(hs, bd),
+					    cfs_hash_dhead_t, dh_head);
+
+	if (dh->dh_tail != NULL) /* not empty */
+		hlist_add_behind(hnode, dh->dh_tail);
+	else /* empty list */
+		hlist_add_head(hnode, &dh->dh_head);
+	dh->dh_tail = hnode;
+	return -1; /* unknown depth */
+}
+
+static int
+cfs_hash_dh_hnode_del(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+		      struct hlist_node *hnd)
+{
+	cfs_hash_dhead_t *dh = container_of(cfs_hash_dh_hhead(hs, bd),
+					    cfs_hash_dhead_t, dh_head);
+
+	if (hnd->next == NULL) { /* it's the tail */
+		dh->dh_tail = (hnd->pprev == &dh->dh_head.first) ? NULL :
+			      container_of(hnd->pprev, struct hlist_node, next);
+	}
+	hlist_del_init(hnd);
+	return -1; /* unknown depth */
+}
+
+/**
+ * double links hash head with depth tracking
+ * new element is always added to tail of hlist
+ */
+typedef struct {
+	struct hlist_head	dd_head;	/**< entries list */
+	struct hlist_node       *dd_tail;	/**< the last entry */
+	unsigned int	    dd_depth;       /**< list length */
+} cfs_hash_dhead_dep_t;
+
+static int
+cfs_hash_dd_hhead_size(struct cfs_hash *hs)
+{
+	return sizeof(cfs_hash_dhead_dep_t);
+}
+
+static struct hlist_head *
+cfs_hash_dd_hhead(struct cfs_hash *hs, struct cfs_hash_bd *bd)
+{
+	cfs_hash_dhead_dep_t *head;
+
+	head = (cfs_hash_dhead_dep_t *)&bd->bd_bucket->hsb_head[0];
+	return &head[bd->bd_offset].dd_head;
+}
+
+static int
+cfs_hash_dd_hnode_add(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+		      struct hlist_node *hnode)
+{
+	cfs_hash_dhead_dep_t *dh = container_of(cfs_hash_dd_hhead(hs, bd),
+						cfs_hash_dhead_dep_t, dd_head);
+
+	if (dh->dd_tail != NULL) /* not empty */
+		hlist_add_behind(hnode, dh->dd_tail);
+	else /* empty list */
+		hlist_add_head(hnode, &dh->dd_head);
+	dh->dd_tail = hnode;
+	return ++dh->dd_depth;
+}
+
+static int
+cfs_hash_dd_hnode_del(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+		      struct hlist_node *hnd)
+{
+	cfs_hash_dhead_dep_t *dh = container_of(cfs_hash_dd_hhead(hs, bd),
+						cfs_hash_dhead_dep_t, dd_head);
+
+	if (hnd->next == NULL) { /* it's the tail */
+		dh->dd_tail = (hnd->pprev == &dh->dd_head.first) ? NULL :
+			      container_of(hnd->pprev, struct hlist_node, next);
+	}
+	hlist_del_init(hnd);
+	return --dh->dd_depth;
+}
+
+static cfs_hash_hlist_ops_t cfs_hash_hh_hops = {
+       .hop_hhead      = cfs_hash_hh_hhead,
+       .hop_hhead_size = cfs_hash_hh_hhead_size,
+       .hop_hnode_add  = cfs_hash_hh_hnode_add,
+       .hop_hnode_del  = cfs_hash_hh_hnode_del,
+};
+
+static cfs_hash_hlist_ops_t cfs_hash_hd_hops = {
+       .hop_hhead      = cfs_hash_hd_hhead,
+       .hop_hhead_size = cfs_hash_hd_hhead_size,
+       .hop_hnode_add  = cfs_hash_hd_hnode_add,
+       .hop_hnode_del  = cfs_hash_hd_hnode_del,
+};
+
+static cfs_hash_hlist_ops_t cfs_hash_dh_hops = {
+       .hop_hhead      = cfs_hash_dh_hhead,
+       .hop_hhead_size = cfs_hash_dh_hhead_size,
+       .hop_hnode_add  = cfs_hash_dh_hnode_add,
+       .hop_hnode_del  = cfs_hash_dh_hnode_del,
+};
+
+static cfs_hash_hlist_ops_t cfs_hash_dd_hops = {
+       .hop_hhead      = cfs_hash_dd_hhead,
+       .hop_hhead_size = cfs_hash_dd_hhead_size,
+       .hop_hnode_add  = cfs_hash_dd_hnode_add,
+       .hop_hnode_del  = cfs_hash_dd_hnode_del,
+};
+
+static void
+cfs_hash_hlist_setup(struct cfs_hash *hs)
+{
+	if (cfs_hash_with_add_tail(hs)) {
+		hs->hs_hops = cfs_hash_with_depth(hs) ?
+			      &cfs_hash_dd_hops : &cfs_hash_dh_hops;
+	} else {
+		hs->hs_hops = cfs_hash_with_depth(hs) ?
+			      &cfs_hash_hd_hops : &cfs_hash_hh_hops;
+	}
+}
+
+static void
+cfs_hash_bd_from_key(struct cfs_hash *hs, struct cfs_hash_bucket **bkts,
+		     unsigned int bits, const void *key, struct cfs_hash_bd *bd)
+{
+	unsigned int index = cfs_hash_id(hs, key, (1U << bits) - 1);
+
+	LASSERT(bits == hs->hs_cur_bits || bits == hs->hs_rehash_bits);
+
+	bd->bd_bucket = bkts[index & ((1U << (bits - hs->hs_bkt_bits)) - 1)];
+	bd->bd_offset = index >> (bits - hs->hs_bkt_bits);
+}
+
+void
+cfs_hash_bd_get(struct cfs_hash *hs, const void *key, struct cfs_hash_bd *bd)
+{
+	/* NB: caller should hold hs->hs_rwlock if REHASH is set */
+	if (likely(hs->hs_rehash_buckets == NULL)) {
+		cfs_hash_bd_from_key(hs, hs->hs_buckets,
+				     hs->hs_cur_bits, key, bd);
+	} else {
+		LASSERT(hs->hs_rehash_bits != 0);
+		cfs_hash_bd_from_key(hs, hs->hs_rehash_buckets,
+				     hs->hs_rehash_bits, key, bd);
+	}
+}
+EXPORT_SYMBOL(cfs_hash_bd_get);
+
+static inline void
+cfs_hash_bd_dep_record(struct cfs_hash *hs, struct cfs_hash_bd *bd, int dep_cur)
+{
+	if (likely(dep_cur <= bd->bd_bucket->hsb_depmax))
+		return;
+
+	bd->bd_bucket->hsb_depmax = dep_cur;
+# if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1
+	if (likely(warn_on_depth == 0 ||
+		   max(warn_on_depth, hs->hs_dep_max) >= dep_cur))
+		return;
+
+	spin_lock(&hs->hs_dep_lock);
+	hs->hs_dep_max  = dep_cur;
+	hs->hs_dep_bkt  = bd->bd_bucket->hsb_index;
+	hs->hs_dep_off  = bd->bd_offset;
+	hs->hs_dep_bits = hs->hs_cur_bits;
+	spin_unlock(&hs->hs_dep_lock);
+
+	cfs_wi_schedule(cfs_sched_rehash, &hs->hs_dep_wi);
+# endif
+}
+
+void
+cfs_hash_bd_add_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+		       struct hlist_node *hnode)
+{
+	int		rc;
+
+	rc = hs->hs_hops->hop_hnode_add(hs, bd, hnode);
+	cfs_hash_bd_dep_record(hs, bd, rc);
+	bd->bd_bucket->hsb_version++;
+	if (unlikely(bd->bd_bucket->hsb_version == 0))
+		bd->bd_bucket->hsb_version++;
+	bd->bd_bucket->hsb_count++;
+
+	if (cfs_hash_with_counter(hs))
+		atomic_inc(&hs->hs_count);
+	if (!cfs_hash_with_no_itemref(hs))
+		cfs_hash_get(hs, hnode);
+}
+EXPORT_SYMBOL(cfs_hash_bd_add_locked);
+
+void
+cfs_hash_bd_del_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+		       struct hlist_node *hnode)
+{
+	hs->hs_hops->hop_hnode_del(hs, bd, hnode);
+
+	LASSERT(bd->bd_bucket->hsb_count > 0);
+	bd->bd_bucket->hsb_count--;
+	bd->bd_bucket->hsb_version++;
+	if (unlikely(bd->bd_bucket->hsb_version == 0))
+		bd->bd_bucket->hsb_version++;
+
+	if (cfs_hash_with_counter(hs)) {
+		LASSERT(atomic_read(&hs->hs_count) > 0);
+		atomic_dec(&hs->hs_count);
+	}
+	if (!cfs_hash_with_no_itemref(hs))
+		cfs_hash_put_locked(hs, hnode);
+}
+EXPORT_SYMBOL(cfs_hash_bd_del_locked);
+
+void
+cfs_hash_bd_move_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd_old,
+			struct cfs_hash_bd *bd_new, struct hlist_node *hnode)
+{
+	struct cfs_hash_bucket *obkt = bd_old->bd_bucket;
+	struct cfs_hash_bucket *nbkt = bd_new->bd_bucket;
+	int		rc;
+
+	if (cfs_hash_bd_compare(bd_old, bd_new) == 0)
+		return;
+
+	/* use cfs_hash_bd_hnode_add/del, to avoid atomic & refcount ops
+	 * in cfs_hash_bd_del/add_locked */
+	hs->hs_hops->hop_hnode_del(hs, bd_old, hnode);
+	rc = hs->hs_hops->hop_hnode_add(hs, bd_new, hnode);
+	cfs_hash_bd_dep_record(hs, bd_new, rc);
+
+	LASSERT(obkt->hsb_count > 0);
+	obkt->hsb_count--;
+	obkt->hsb_version++;
+	if (unlikely(obkt->hsb_version == 0))
+		obkt->hsb_version++;
+	nbkt->hsb_count++;
+	nbkt->hsb_version++;
+	if (unlikely(nbkt->hsb_version == 0))
+		nbkt->hsb_version++;
+}
+EXPORT_SYMBOL(cfs_hash_bd_move_locked);
+
+enum {
+	/** always set, for sanity (avoid ZERO intent) */
+	CFS_HS_LOOKUP_MASK_FIND     = 1 << 0,
+	/** return entry with a ref */
+	CFS_HS_LOOKUP_MASK_REF      = 1 << 1,
+	/** add entry if not existing */
+	CFS_HS_LOOKUP_MASK_ADD      = 1 << 2,
+	/** delete entry, ignore other masks */
+	CFS_HS_LOOKUP_MASK_DEL      = 1 << 3,
+};
+
+typedef enum cfs_hash_lookup_intent {
+	/** return item w/o refcount */
+	CFS_HS_LOOKUP_IT_PEEK       = CFS_HS_LOOKUP_MASK_FIND,
+	/** return item with refcount */
+	CFS_HS_LOOKUP_IT_FIND       = (CFS_HS_LOOKUP_MASK_FIND |
+				       CFS_HS_LOOKUP_MASK_REF),
+	/** return item w/o refcount if existed, otherwise add */
+	CFS_HS_LOOKUP_IT_ADD	= (CFS_HS_LOOKUP_MASK_FIND |
+				       CFS_HS_LOOKUP_MASK_ADD),
+	/** return item with refcount if existed, otherwise add */
+	CFS_HS_LOOKUP_IT_FINDADD    = (CFS_HS_LOOKUP_IT_FIND |
+				       CFS_HS_LOOKUP_MASK_ADD),
+	/** delete if existed */
+	CFS_HS_LOOKUP_IT_FINDDEL    = (CFS_HS_LOOKUP_MASK_FIND |
+				       CFS_HS_LOOKUP_MASK_DEL)
+} cfs_hash_lookup_intent_t;
+
+static struct hlist_node *
+cfs_hash_bd_lookup_intent(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+			  const void *key, struct hlist_node *hnode,
+			  cfs_hash_lookup_intent_t intent)
+
+{
+	struct hlist_head  *hhead = cfs_hash_bd_hhead(hs, bd);
+	struct hlist_node  *ehnode;
+	struct hlist_node  *match;
+	int  intent_add = (intent & CFS_HS_LOOKUP_MASK_ADD) != 0;
+
+	/* with this function, we can avoid a lot of useless refcount ops,
+	 * which are expensive atomic operations most time. */
+	match = intent_add ? NULL : hnode;
+	hlist_for_each(ehnode, hhead) {
+		if (!cfs_hash_keycmp(hs, key, ehnode))
+			continue;
+
+		if (match != NULL && match != ehnode) /* can't match */
+			continue;
+
+		/* match and ... */
+		if ((intent & CFS_HS_LOOKUP_MASK_DEL) != 0) {
+			cfs_hash_bd_del_locked(hs, bd, ehnode);
+			return ehnode;
+		}
+
+		/* caller wants refcount? */
+		if ((intent & CFS_HS_LOOKUP_MASK_REF) != 0)
+			cfs_hash_get(hs, ehnode);
+		return ehnode;
+	}
+	/* no match item */
+	if (!intent_add)
+		return NULL;
+
+	LASSERT(hnode != NULL);
+	cfs_hash_bd_add_locked(hs, bd, hnode);
+	return hnode;
+}
+
+struct hlist_node *
+cfs_hash_bd_lookup_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, const void *key)
+{
+	return cfs_hash_bd_lookup_intent(hs, bd, key, NULL,
+					 CFS_HS_LOOKUP_IT_FIND);
+}
+EXPORT_SYMBOL(cfs_hash_bd_lookup_locked);
+
+struct hlist_node *
+cfs_hash_bd_peek_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, const void *key)
+{
+	return cfs_hash_bd_lookup_intent(hs, bd, key, NULL,
+					 CFS_HS_LOOKUP_IT_PEEK);
+}
+EXPORT_SYMBOL(cfs_hash_bd_peek_locked);
+
+struct hlist_node *
+cfs_hash_bd_findadd_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+			   const void *key, struct hlist_node *hnode,
+			   int noref)
+{
+	return cfs_hash_bd_lookup_intent(hs, bd, key, hnode,
+					 CFS_HS_LOOKUP_IT_ADD |
+					 (!noref * CFS_HS_LOOKUP_MASK_REF));
+}
+EXPORT_SYMBOL(cfs_hash_bd_findadd_locked);
+
+struct hlist_node *
+cfs_hash_bd_finddel_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+			   const void *key, struct hlist_node *hnode)
+{
+	/* hnode can be NULL, we find the first item with @key */
+	return cfs_hash_bd_lookup_intent(hs, bd, key, hnode,
+					 CFS_HS_LOOKUP_IT_FINDDEL);
+}
+EXPORT_SYMBOL(cfs_hash_bd_finddel_locked);
+
+static void
+cfs_hash_multi_bd_lock(struct cfs_hash *hs, struct cfs_hash_bd *bds,
+		       unsigned n, int excl)
+{
+	struct cfs_hash_bucket *prev = NULL;
+	int		i;
+
+	/**
+	 * bds must be ascendantly ordered by bd->bd_bucket->hsb_index.
+	 * NB: it's possible that several bds point to the same bucket but
+	 * have different bd::bd_offset, so need take care of deadlock.
+	 */
+	cfs_hash_for_each_bd(bds, n, i) {
+		if (prev == bds[i].bd_bucket)
+			continue;
+
+		LASSERT(prev == NULL ||
+			prev->hsb_index < bds[i].bd_bucket->hsb_index);
+		cfs_hash_bd_lock(hs, &bds[i], excl);
+		prev = bds[i].bd_bucket;
+	}
+}
+
+static void
+cfs_hash_multi_bd_unlock(struct cfs_hash *hs, struct cfs_hash_bd *bds,
+			 unsigned n, int excl)
+{
+	struct cfs_hash_bucket *prev = NULL;
+	int		i;
+
+	cfs_hash_for_each_bd(bds, n, i) {
+		if (prev != bds[i].bd_bucket) {
+			cfs_hash_bd_unlock(hs, &bds[i], excl);
+			prev = bds[i].bd_bucket;
+		}
+	}
+}
+
+static struct hlist_node *
+cfs_hash_multi_bd_lookup_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds,
+				unsigned n, const void *key)
+{
+	struct hlist_node  *ehnode;
+	unsigned	   i;
+
+	cfs_hash_for_each_bd(bds, n, i) {
+		ehnode = cfs_hash_bd_lookup_intent(hs, &bds[i], key, NULL,
+						   CFS_HS_LOOKUP_IT_FIND);
+		if (ehnode != NULL)
+			return ehnode;
+	}
+	return NULL;
+}
+
+static struct hlist_node *
+cfs_hash_multi_bd_findadd_locked(struct cfs_hash *hs,
+				 struct cfs_hash_bd *bds, unsigned n, const void *key,
+				 struct hlist_node *hnode, int noref)
+{
+	struct hlist_node  *ehnode;
+	int		intent;
+	unsigned	   i;
+
+	LASSERT(hnode != NULL);
+	intent = CFS_HS_LOOKUP_IT_PEEK | (!noref * CFS_HS_LOOKUP_MASK_REF);
+
+	cfs_hash_for_each_bd(bds, n, i) {
+		ehnode = cfs_hash_bd_lookup_intent(hs, &bds[i], key,
+						   NULL, intent);
+		if (ehnode != NULL)
+			return ehnode;
+	}
+
+	if (i == 1) { /* only one bucket */
+		cfs_hash_bd_add_locked(hs, &bds[0], hnode);
+	} else {
+		struct cfs_hash_bd      mybd;
+
+		cfs_hash_bd_get(hs, key, &mybd);
+		cfs_hash_bd_add_locked(hs, &mybd, hnode);
+	}
+
+	return hnode;
+}
+
+static struct hlist_node *
+cfs_hash_multi_bd_finddel_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds,
+				 unsigned n, const void *key,
+				 struct hlist_node *hnode)
+{
+	struct hlist_node  *ehnode;
+	unsigned	   i;
+
+	cfs_hash_for_each_bd(bds, n, i) {
+		ehnode = cfs_hash_bd_lookup_intent(hs, &bds[i], key, hnode,
+						   CFS_HS_LOOKUP_IT_FINDDEL);
+		if (ehnode != NULL)
+			return ehnode;
+	}
+	return NULL;
+}
+
+static void
+cfs_hash_bd_order(struct cfs_hash_bd *bd1, struct cfs_hash_bd *bd2)
+{
+	int     rc;
+
+	if (bd2->bd_bucket == NULL)
+		return;
+
+	if (bd1->bd_bucket == NULL) {
+		*bd1 = *bd2;
+		bd2->bd_bucket = NULL;
+		return;
+	}
+
+	rc = cfs_hash_bd_compare(bd1, bd2);
+	if (rc == 0) {
+		bd2->bd_bucket = NULL;
+
+	} else if (rc > 0) { /* swab bd1 and bd2 */
+		struct cfs_hash_bd tmp;
+
+		tmp = *bd2;
+		*bd2 = *bd1;
+		*bd1 = tmp;
+	}
+}
+
+void
+cfs_hash_dual_bd_get(struct cfs_hash *hs, const void *key, struct cfs_hash_bd *bds)
+{
+	/* NB: caller should hold hs_lock.rw if REHASH is set */
+	cfs_hash_bd_from_key(hs, hs->hs_buckets,
+			     hs->hs_cur_bits, key, &bds[0]);
+	if (likely(hs->hs_rehash_buckets == NULL)) {
+		/* no rehash or not rehashing */
+		bds[1].bd_bucket = NULL;
+		return;
+	}
+
+	LASSERT(hs->hs_rehash_bits != 0);
+	cfs_hash_bd_from_key(hs, hs->hs_rehash_buckets,
+			     hs->hs_rehash_bits, key, &bds[1]);
+
+	cfs_hash_bd_order(&bds[0], &bds[1]);
+}
+EXPORT_SYMBOL(cfs_hash_dual_bd_get);
+
+void
+cfs_hash_dual_bd_lock(struct cfs_hash *hs, struct cfs_hash_bd *bds, int excl)
+{
+	cfs_hash_multi_bd_lock(hs, bds, 2, excl);
+}
+EXPORT_SYMBOL(cfs_hash_dual_bd_lock);
+
+void
+cfs_hash_dual_bd_unlock(struct cfs_hash *hs, struct cfs_hash_bd *bds, int excl)
+{
+	cfs_hash_multi_bd_unlock(hs, bds, 2, excl);
+}
+EXPORT_SYMBOL(cfs_hash_dual_bd_unlock);
+
+struct hlist_node *
+cfs_hash_dual_bd_lookup_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds,
+			       const void *key)
+{
+	return cfs_hash_multi_bd_lookup_locked(hs, bds, 2, key);
+}
+EXPORT_SYMBOL(cfs_hash_dual_bd_lookup_locked);
+
+struct hlist_node *
+cfs_hash_dual_bd_findadd_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds,
+				const void *key, struct hlist_node *hnode,
+				int noref)
+{
+	return cfs_hash_multi_bd_findadd_locked(hs, bds, 2, key,
+						hnode, noref);
+}
+EXPORT_SYMBOL(cfs_hash_dual_bd_findadd_locked);
+
+struct hlist_node *
+cfs_hash_dual_bd_finddel_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds,
+				const void *key, struct hlist_node *hnode)
+{
+	return cfs_hash_multi_bd_finddel_locked(hs, bds, 2, key, hnode);
+}
+EXPORT_SYMBOL(cfs_hash_dual_bd_finddel_locked);
+
+static void
+cfs_hash_buckets_free(struct cfs_hash_bucket **buckets,
+		      int bkt_size, int prev_size, int size)
+{
+	int     i;
+
+	for (i = prev_size; i < size; i++) {
+		if (buckets[i] != NULL)
+			LIBCFS_FREE(buckets[i], bkt_size);
+	}
+
+	LIBCFS_FREE(buckets, sizeof(buckets[0]) * size);
+}
+
+/*
+ * Create or grow bucket memory. Return old_buckets if no allocation was
+ * needed, the newly allocated buckets if allocation was needed and
+ * successful, and NULL on error.
+ */
+static struct cfs_hash_bucket **
+cfs_hash_buckets_realloc(struct cfs_hash *hs, struct cfs_hash_bucket **old_bkts,
+			 unsigned int old_size, unsigned int new_size)
+{
+	struct cfs_hash_bucket **new_bkts;
+	int		 i;
+
+	LASSERT(old_size == 0 || old_bkts != NULL);
+
+	if (old_bkts != NULL && old_size == new_size)
+		return old_bkts;
+
+	LIBCFS_ALLOC(new_bkts, sizeof(new_bkts[0]) * new_size);
+	if (new_bkts == NULL)
+		return NULL;
+
+	if (old_bkts != NULL) {
+		memcpy(new_bkts, old_bkts,
+		       min(old_size, new_size) * sizeof(*old_bkts));
+	}
+
+	for (i = old_size; i < new_size; i++) {
+		struct hlist_head *hhead;
+		struct cfs_hash_bd     bd;
+
+		LIBCFS_ALLOC(new_bkts[i], cfs_hash_bkt_size(hs));
+		if (new_bkts[i] == NULL) {
+			cfs_hash_buckets_free(new_bkts, cfs_hash_bkt_size(hs),
+					      old_size, new_size);
+			return NULL;
+		}
+
+		new_bkts[i]->hsb_index   = i;
+		new_bkts[i]->hsb_version = 1;  /* shouldn't be zero */
+		new_bkts[i]->hsb_depmax  = -1; /* unknown */
+		bd.bd_bucket = new_bkts[i];
+		cfs_hash_bd_for_each_hlist(hs, &bd, hhead)
+			INIT_HLIST_HEAD(hhead);
+
+		if (cfs_hash_with_no_lock(hs) ||
+		    cfs_hash_with_no_bktlock(hs))
+			continue;
+
+		if (cfs_hash_with_rw_bktlock(hs))
+			rwlock_init(&new_bkts[i]->hsb_lock.rw);
+		else if (cfs_hash_with_spin_bktlock(hs))
+			spin_lock_init(&new_bkts[i]->hsb_lock.spin);
+		else
+			LBUG(); /* invalid use-case */
+	}
+	return new_bkts;
+}
+
+/**
+ * Initialize new libcfs hash, where:
+ * @name     - Descriptive hash name
+ * @cur_bits - Initial hash table size, in bits
+ * @max_bits - Maximum allowed hash table resize, in bits
+ * @ops      - Registered hash table operations
+ * @flags    - CFS_HASH_REHASH enable synamic hash resizing
+ *	   - CFS_HASH_SORT enable chained hash sort
+ */
+static int cfs_hash_rehash_worker(cfs_workitem_t *wi);
+
+#if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1
+static int cfs_hash_dep_print(cfs_workitem_t *wi)
+{
+	struct cfs_hash *hs = container_of(wi, struct cfs_hash, hs_dep_wi);
+	int	 dep;
+	int	 bkt;
+	int	 off;
+	int	 bits;
+
+	spin_lock(&hs->hs_dep_lock);
+	dep  = hs->hs_dep_max;
+	bkt  = hs->hs_dep_bkt;
+	off  = hs->hs_dep_off;
+	bits = hs->hs_dep_bits;
+	spin_unlock(&hs->hs_dep_lock);
+
+	LCONSOLE_WARN("#### HASH %s (bits: %d): max depth %d at bucket %d/%d\n",
+		      hs->hs_name, bits, dep, bkt, off);
+	spin_lock(&hs->hs_dep_lock);
+	hs->hs_dep_bits = 0; /* mark as workitem done */
+	spin_unlock(&hs->hs_dep_lock);
+	return 0;
+}
+
+static void cfs_hash_depth_wi_init(struct cfs_hash *hs)
+{
+	spin_lock_init(&hs->hs_dep_lock);
+	cfs_wi_init(&hs->hs_dep_wi, hs, cfs_hash_dep_print);
+}
+
+static void cfs_hash_depth_wi_cancel(struct cfs_hash *hs)
+{
+	if (cfs_wi_deschedule(cfs_sched_rehash, &hs->hs_dep_wi))
+		return;
+
+	spin_lock(&hs->hs_dep_lock);
+	while (hs->hs_dep_bits != 0) {
+		spin_unlock(&hs->hs_dep_lock);
+		cond_resched();
+		spin_lock(&hs->hs_dep_lock);
+	}
+	spin_unlock(&hs->hs_dep_lock);
+}
+
+#else /* CFS_HASH_DEBUG_LEVEL < CFS_HASH_DEBUG_1 */
+
+static inline void cfs_hash_depth_wi_init(struct cfs_hash *hs) {}
+static inline void cfs_hash_depth_wi_cancel(struct cfs_hash *hs) {}
+
+#endif /* CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1 */
+
+struct cfs_hash *
+cfs_hash_create(char *name, unsigned cur_bits, unsigned max_bits,
+		unsigned bkt_bits, unsigned extra_bytes,
+		unsigned min_theta, unsigned max_theta,
+		cfs_hash_ops_t *ops, unsigned flags)
+{
+	struct cfs_hash *hs;
+	int	 len;
+
+	CLASSERT(CFS_HASH_THETA_BITS < 15);
+
+	LASSERT(name != NULL);
+	LASSERT(ops != NULL);
+	LASSERT(ops->hs_key);
+	LASSERT(ops->hs_hash);
+	LASSERT(ops->hs_object);
+	LASSERT(ops->hs_keycmp);
+	LASSERT(ops->hs_get != NULL);
+	LASSERT(ops->hs_put_locked != NULL);
+
+	if ((flags & CFS_HASH_REHASH) != 0)
+		flags |= CFS_HASH_COUNTER; /* must have counter */
+
+	LASSERT(cur_bits > 0);
+	LASSERT(cur_bits >= bkt_bits);
+	LASSERT(max_bits >= cur_bits && max_bits < 31);
+	LASSERT(ergo((flags & CFS_HASH_REHASH) == 0, cur_bits == max_bits));
+	LASSERT(ergo((flags & CFS_HASH_REHASH) != 0,
+		     (flags & CFS_HASH_NO_LOCK) == 0));
+	LASSERT(ergo((flags & CFS_HASH_REHASH_KEY) != 0,
+		      ops->hs_keycpy != NULL));
+
+	len = (flags & CFS_HASH_BIGNAME) == 0 ?
+	      CFS_HASH_NAME_LEN : CFS_HASH_BIGNAME_LEN;
+	LIBCFS_ALLOC(hs, offsetof(struct cfs_hash, hs_name[len]));
+	if (hs == NULL)
+		return NULL;
+
+	strncpy(hs->hs_name, name, len);
+	hs->hs_name[len - 1] = '\0';
+	hs->hs_flags = flags;
+
+	atomic_set(&hs->hs_refcount, 1);
+	atomic_set(&hs->hs_count, 0);
+
+	cfs_hash_lock_setup(hs);
+	cfs_hash_hlist_setup(hs);
+
+	hs->hs_cur_bits = (__u8)cur_bits;
+	hs->hs_min_bits = (__u8)cur_bits;
+	hs->hs_max_bits = (__u8)max_bits;
+	hs->hs_bkt_bits = (__u8)bkt_bits;
+
+	hs->hs_ops	 = ops;
+	hs->hs_extra_bytes = extra_bytes;
+	hs->hs_rehash_bits = 0;
+	cfs_wi_init(&hs->hs_rehash_wi, hs, cfs_hash_rehash_worker);
+	cfs_hash_depth_wi_init(hs);
+
+	if (cfs_hash_with_rehash(hs))
+		__cfs_hash_set_theta(hs, min_theta, max_theta);
+
+	hs->hs_buckets = cfs_hash_buckets_realloc(hs, NULL, 0,
+						  CFS_HASH_NBKT(hs));
+	if (hs->hs_buckets != NULL)
+		return hs;
+
+	LIBCFS_FREE(hs, offsetof(struct cfs_hash, hs_name[len]));
+	return NULL;
+}
+EXPORT_SYMBOL(cfs_hash_create);
+
+/**
+ * Cleanup libcfs hash @hs.
+ */
+static void
+cfs_hash_destroy(struct cfs_hash *hs)
+{
+	struct hlist_node     *hnode;
+	struct hlist_node     *pos;
+	struct cfs_hash_bd	 bd;
+	int		   i;
+
+	LASSERT(hs != NULL);
+	LASSERT(!cfs_hash_is_exiting(hs) &&
+		!cfs_hash_is_iterating(hs));
+
+	/**
+	 * prohibit further rehashes, don't need any lock because
+	 * I'm the only (last) one can change it.
+	 */
+	hs->hs_exiting = 1;
+	if (cfs_hash_with_rehash(hs))
+		cfs_hash_rehash_cancel(hs);
+
+	cfs_hash_depth_wi_cancel(hs);
+	/* rehash should be done/canceled */
+	LASSERT(hs->hs_buckets != NULL &&
+		hs->hs_rehash_buckets == NULL);
+
+	cfs_hash_for_each_bucket(hs, &bd, i) {
+		struct hlist_head *hhead;
+
+		LASSERT(bd.bd_bucket != NULL);
+		/* no need to take this lock, just for consistent code */
+		cfs_hash_bd_lock(hs, &bd, 1);
+
+		cfs_hash_bd_for_each_hlist(hs, &bd, hhead) {
+			hlist_for_each_safe(hnode, pos, hhead) {
+				LASSERTF(!cfs_hash_with_assert_empty(hs),
+					 "hash %s bucket %u(%u) is not empty: %u items left\n",
+					 hs->hs_name, bd.bd_bucket->hsb_index,
+					 bd.bd_offset, bd.bd_bucket->hsb_count);
+				/* can't assert key valicate, because we
+				 * can interrupt rehash */
+				cfs_hash_bd_del_locked(hs, &bd, hnode);
+				cfs_hash_exit(hs, hnode);
+			}
+		}
+		LASSERT(bd.bd_bucket->hsb_count == 0);
+		cfs_hash_bd_unlock(hs, &bd, 1);
+		cond_resched();
+	}
+
+	LASSERT(atomic_read(&hs->hs_count) == 0);
+
+	cfs_hash_buckets_free(hs->hs_buckets, cfs_hash_bkt_size(hs),
+			      0, CFS_HASH_NBKT(hs));
+	i = cfs_hash_with_bigname(hs) ?
+	    CFS_HASH_BIGNAME_LEN : CFS_HASH_NAME_LEN;
+	LIBCFS_FREE(hs, offsetof(struct cfs_hash, hs_name[i]));
+}
+
+struct cfs_hash *cfs_hash_getref(struct cfs_hash *hs)
+{
+	if (atomic_inc_not_zero(&hs->hs_refcount))
+		return hs;
+	return NULL;
+}
+EXPORT_SYMBOL(cfs_hash_getref);
+
+void cfs_hash_putref(struct cfs_hash *hs)
+{
+	if (atomic_dec_and_test(&hs->hs_refcount))
+		cfs_hash_destroy(hs);
+}
+EXPORT_SYMBOL(cfs_hash_putref);
+
+static inline int
+cfs_hash_rehash_bits(struct cfs_hash *hs)
+{
+	if (cfs_hash_with_no_lock(hs) ||
+	    !cfs_hash_with_rehash(hs))
+		return -EOPNOTSUPP;
+
+	if (unlikely(cfs_hash_is_exiting(hs)))
+		return -ESRCH;
+
+	if (unlikely(cfs_hash_is_rehashing(hs)))
+		return -EALREADY;
+
+	if (unlikely(cfs_hash_is_iterating(hs)))
+		return -EAGAIN;
+
+	/* XXX: need to handle case with max_theta != 2.0
+	 *      and the case with min_theta != 0.5 */
+	if ((hs->hs_cur_bits < hs->hs_max_bits) &&
+	    (__cfs_hash_theta(hs) > hs->hs_max_theta))
+		return hs->hs_cur_bits + 1;
+
+	if (!cfs_hash_with_shrink(hs))
+		return 0;
+
+	if ((hs->hs_cur_bits > hs->hs_min_bits) &&
+	    (__cfs_hash_theta(hs) < hs->hs_min_theta))
+		return hs->hs_cur_bits - 1;
+
+	return 0;
+}
+
+/**
+ * don't allow inline rehash if:
+ * - user wants non-blocking change (add/del) on hash table
+ * - too many elements
+ */
+static inline int
+cfs_hash_rehash_inline(struct cfs_hash *hs)
+{
+	return !cfs_hash_with_nblk_change(hs) &&
+	       atomic_read(&hs->hs_count) < CFS_HASH_LOOP_HOG;
+}
+
+/**
+ * Add item @hnode to libcfs hash @hs using @key.  The registered
+ * ops->hs_get function will be called when the item is added.
+ */
+void
+cfs_hash_add(struct cfs_hash *hs, const void *key, struct hlist_node *hnode)
+{
+	struct cfs_hash_bd   bd;
+	int	     bits;
+
+	LASSERT(hlist_unhashed(hnode));
+
+	cfs_hash_lock(hs, 0);
+	cfs_hash_bd_get_and_lock(hs, key, &bd, 1);
+
+	cfs_hash_key_validate(hs, key, hnode);
+	cfs_hash_bd_add_locked(hs, &bd, hnode);
+
+	cfs_hash_bd_unlock(hs, &bd, 1);
+
+	bits = cfs_hash_rehash_bits(hs);
+	cfs_hash_unlock(hs, 0);
+	if (bits > 0)
+		cfs_hash_rehash(hs, cfs_hash_rehash_inline(hs));
+}
+EXPORT_SYMBOL(cfs_hash_add);
+
+static struct hlist_node *
+cfs_hash_find_or_add(struct cfs_hash *hs, const void *key,
+		     struct hlist_node *hnode, int noref)
+{
+	struct hlist_node *ehnode;
+	struct cfs_hash_bd     bds[2];
+	int	       bits = 0;
+
+	LASSERT(hlist_unhashed(hnode));
+
+	cfs_hash_lock(hs, 0);
+	cfs_hash_dual_bd_get_and_lock(hs, key, bds, 1);
+
+	cfs_hash_key_validate(hs, key, hnode);
+	ehnode = cfs_hash_dual_bd_findadd_locked(hs, bds, key,
+						 hnode, noref);
+	cfs_hash_dual_bd_unlock(hs, bds, 1);
+
+	if (ehnode == hnode) /* new item added */
+		bits = cfs_hash_rehash_bits(hs);
+	cfs_hash_unlock(hs, 0);
+	if (bits > 0)
+		cfs_hash_rehash(hs, cfs_hash_rehash_inline(hs));
+
+	return ehnode;
+}
+
+/**
+ * Add item @hnode to libcfs hash @hs using @key.  The registered
+ * ops->hs_get function will be called if the item was added.
+ * Returns 0 on success or -EALREADY on key collisions.
+ */
+int
+cfs_hash_add_unique(struct cfs_hash *hs, const void *key, struct hlist_node *hnode)
+{
+	return cfs_hash_find_or_add(hs, key, hnode, 1) != hnode ?
+	       -EALREADY : 0;
+}
+EXPORT_SYMBOL(cfs_hash_add_unique);
+
+/**
+ * Add item @hnode to libcfs hash @hs using @key.  If this @key
+ * already exists in the hash then ops->hs_get will be called on the
+ * conflicting entry and that entry will be returned to the caller.
+ * Otherwise ops->hs_get is called on the item which was added.
+ */
+void *
+cfs_hash_findadd_unique(struct cfs_hash *hs, const void *key,
+			struct hlist_node *hnode)
+{
+	hnode = cfs_hash_find_or_add(hs, key, hnode, 0);
+
+	return cfs_hash_object(hs, hnode);
+}
+EXPORT_SYMBOL(cfs_hash_findadd_unique);
+
+/**
+ * Delete item @hnode from the libcfs hash @hs using @key.  The @key
+ * is required to ensure the correct hash bucket is locked since there
+ * is no direct linkage from the item to the bucket.  The object
+ * removed from the hash will be returned and obs->hs_put is called
+ * on the removed object.
+ */
+void *
+cfs_hash_del(struct cfs_hash *hs, const void *key, struct hlist_node *hnode)
+{
+	void	   *obj  = NULL;
+	int	     bits = 0;
+	struct cfs_hash_bd   bds[2];
+
+	cfs_hash_lock(hs, 0);
+	cfs_hash_dual_bd_get_and_lock(hs, key, bds, 1);
+
+	/* NB: do nothing if @hnode is not in hash table */
+	if (hnode == NULL || !hlist_unhashed(hnode)) {
+		if (bds[1].bd_bucket == NULL && hnode != NULL) {
+			cfs_hash_bd_del_locked(hs, &bds[0], hnode);
+		} else {
+			hnode = cfs_hash_dual_bd_finddel_locked(hs, bds,
+								key, hnode);
+		}
+	}
+
+	if (hnode != NULL) {
+		obj  = cfs_hash_object(hs, hnode);
+		bits = cfs_hash_rehash_bits(hs);
+	}
+
+	cfs_hash_dual_bd_unlock(hs, bds, 1);
+	cfs_hash_unlock(hs, 0);
+	if (bits > 0)
+		cfs_hash_rehash(hs, cfs_hash_rehash_inline(hs));
+
+	return obj;
+}
+EXPORT_SYMBOL(cfs_hash_del);
+
+/**
+ * Delete item given @key in libcfs hash @hs.  The first @key found in
+ * the hash will be removed, if the key exists multiple times in the hash
+ * @hs this function must be called once per key.  The removed object
+ * will be returned and ops->hs_put is called on the removed object.
+ */
+void *
+cfs_hash_del_key(struct cfs_hash *hs, const void *key)
+{
+	return cfs_hash_del(hs, key, NULL);
+}
+EXPORT_SYMBOL(cfs_hash_del_key);
+
+/**
+ * Lookup an item using @key in the libcfs hash @hs and return it.
+ * If the @key is found in the hash hs->hs_get() is called and the
+ * matching objects is returned.  It is the callers responsibility
+ * to call the counterpart ops->hs_put using the cfs_hash_put() macro
+ * when when finished with the object.  If the @key was not found
+ * in the hash @hs NULL is returned.
+ */
+void *
+cfs_hash_lookup(struct cfs_hash *hs, const void *key)
+{
+	void		 *obj = NULL;
+	struct hlist_node     *hnode;
+	struct cfs_hash_bd	 bds[2];
+
+	cfs_hash_lock(hs, 0);
+	cfs_hash_dual_bd_get_and_lock(hs, key, bds, 0);
+
+	hnode = cfs_hash_dual_bd_lookup_locked(hs, bds, key);
+	if (hnode != NULL)
+		obj = cfs_hash_object(hs, hnode);
+
+	cfs_hash_dual_bd_unlock(hs, bds, 0);
+	cfs_hash_unlock(hs, 0);
+
+	return obj;
+}
+EXPORT_SYMBOL(cfs_hash_lookup);
+
+static void
+cfs_hash_for_each_enter(struct cfs_hash *hs) {
+	LASSERT(!cfs_hash_is_exiting(hs));
+
+	if (!cfs_hash_with_rehash(hs))
+		return;
+	/*
+	 * NB: it's race on cfs_has_t::hs_iterating, but doesn't matter
+	 * because it's just an unreliable signal to rehash-thread,
+	 * rehash-thread will try to finish rehash ASAP when seeing this.
+	 */
+	hs->hs_iterating = 1;
+
+	cfs_hash_lock(hs, 1);
+	hs->hs_iterators++;
+
+	/* NB: iteration is mostly called by service thread,
+	 * we tend to cancel pending rehash-request, instead of
+	 * blocking service thread, we will relaunch rehash request
+	 * after iteration */
+	if (cfs_hash_is_rehashing(hs))
+		cfs_hash_rehash_cancel_locked(hs);
+	cfs_hash_unlock(hs, 1);
+}
+
+static void
+cfs_hash_for_each_exit(struct cfs_hash *hs) {
+	int remained;
+	int bits;
+
+	if (!cfs_hash_with_rehash(hs))
+		return;
+	cfs_hash_lock(hs, 1);
+	remained = --hs->hs_iterators;
+	bits = cfs_hash_rehash_bits(hs);
+	cfs_hash_unlock(hs, 1);
+	/* NB: it's race on cfs_has_t::hs_iterating, see above */
+	if (remained == 0)
+		hs->hs_iterating = 0;
+	if (bits > 0) {
+		cfs_hash_rehash(hs, atomic_read(&hs->hs_count) <
+				    CFS_HASH_LOOP_HOG);
+	}
+}
+
+/**
+ * For each item in the libcfs hash @hs call the passed callback @func
+ * and pass to it as an argument each hash item and the private @data.
+ *
+ * a) the function may sleep!
+ * b) during the callback:
+ *    . the bucket lock is held so the callback must never sleep.
+ *    . if @removal_safe is true, use can remove current item by
+ *      cfs_hash_bd_del_locked
+ */
+static __u64
+cfs_hash_for_each_tight(struct cfs_hash *hs, cfs_hash_for_each_cb_t func,
+			void *data, int remove_safe) {
+	struct hlist_node     *hnode;
+	struct hlist_node     *pos;
+	struct cfs_hash_bd	 bd;
+	__u64		 count = 0;
+	int		   excl  = !!remove_safe;
+	int		   loop  = 0;
+	int		   i;
+
+	cfs_hash_for_each_enter(hs);
+
+	cfs_hash_lock(hs, 0);
+	LASSERT(!cfs_hash_is_rehashing(hs));
+
+	cfs_hash_for_each_bucket(hs, &bd, i) {
+		struct hlist_head *hhead;
+
+		cfs_hash_bd_lock(hs, &bd, excl);
+		if (func == NULL) { /* only glimpse size */
+			count += bd.bd_bucket->hsb_count;
+			cfs_hash_bd_unlock(hs, &bd, excl);
+			continue;
+		}
+
+		cfs_hash_bd_for_each_hlist(hs, &bd, hhead) {
+			hlist_for_each_safe(hnode, pos, hhead) {
+				cfs_hash_bucket_validate(hs, &bd, hnode);
+				count++;
+				loop++;
+				if (func(hs, &bd, hnode, data)) {
+					cfs_hash_bd_unlock(hs, &bd, excl);
+					goto out;
+				}
+			}
+		}
+		cfs_hash_bd_unlock(hs, &bd, excl);
+		if (loop < CFS_HASH_LOOP_HOG)
+			continue;
+		loop = 0;
+		cfs_hash_unlock(hs, 0);
+		cond_resched();
+		cfs_hash_lock(hs, 0);
+	}
+ out:
+	cfs_hash_unlock(hs, 0);
+
+	cfs_hash_for_each_exit(hs);
+	return count;
+}
+
+typedef struct {
+	cfs_hash_cond_opt_cb_t  func;
+	void		   *arg;
+} cfs_hash_cond_arg_t;
+
+static int
+cfs_hash_cond_del_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+			 struct hlist_node *hnode, void *data)
+{
+	cfs_hash_cond_arg_t *cond = data;
+
+	if (cond->func(cfs_hash_object(hs, hnode), cond->arg))
+		cfs_hash_bd_del_locked(hs, bd, hnode);
+	return 0;
+}
+
+/**
+ * Delete item from the libcfs hash @hs when @func return true.
+ * The write lock being hold during loop for each bucket to avoid
+ * any object be reference.
+ */
+void
+cfs_hash_cond_del(struct cfs_hash *hs, cfs_hash_cond_opt_cb_t func, void *data)
+{
+	cfs_hash_cond_arg_t arg = {
+		.func   = func,
+		.arg    = data,
+	};
+
+	cfs_hash_for_each_tight(hs, cfs_hash_cond_del_locked, &arg, 1);
+}
+EXPORT_SYMBOL(cfs_hash_cond_del);
+
+void
+cfs_hash_for_each(struct cfs_hash *hs,
+		  cfs_hash_for_each_cb_t func, void *data)
+{
+	cfs_hash_for_each_tight(hs, func, data, 0);
+}
+EXPORT_SYMBOL(cfs_hash_for_each);
+
+void
+cfs_hash_for_each_safe(struct cfs_hash *hs,
+		       cfs_hash_for_each_cb_t func, void *data) {
+	cfs_hash_for_each_tight(hs, func, data, 1);
+}
+EXPORT_SYMBOL(cfs_hash_for_each_safe);
+
+static int
+cfs_hash_peek(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+	      struct hlist_node *hnode, void *data)
+{
+	*(int *)data = 0;
+	return 1; /* return 1 to break the loop */
+}
+
+int
+cfs_hash_is_empty(struct cfs_hash *hs)
+{
+	int empty = 1;
+
+	cfs_hash_for_each_tight(hs, cfs_hash_peek, &empty, 0);
+	return empty;
+}
+EXPORT_SYMBOL(cfs_hash_is_empty);
+
+__u64
+cfs_hash_size_get(struct cfs_hash *hs)
+{
+	return cfs_hash_with_counter(hs) ?
+	       atomic_read(&hs->hs_count) :
+	       cfs_hash_for_each_tight(hs, NULL, NULL, 0);
+}
+EXPORT_SYMBOL(cfs_hash_size_get);
+
+/*
+ * cfs_hash_for_each_relax:
+ * Iterate the hash table and call @func on each item without
+ * any lock. This function can't guarantee to finish iteration
+ * if these features are enabled:
+ *
+ *  a. if rehash_key is enabled, an item can be moved from
+ *     one bucket to another bucket
+ *  b. user can remove non-zero-ref item from hash-table,
+ *     so the item can be removed from hash-table, even worse,
+ *     it's possible that user changed key and insert to another
+ *     hash bucket.
+ * there's no way for us to finish iteration correctly on previous
+ * two cases, so iteration has to be stopped on change.
+ */
+static int
+cfs_hash_for_each_relax(struct cfs_hash *hs, cfs_hash_for_each_cb_t func,
+			void *data) {
+	struct hlist_node *hnode;
+	struct hlist_node *tmp;
+	struct cfs_hash_bd     bd;
+	__u32	     version;
+	int	       count = 0;
+	int	       stop_on_change;
+	int	       rc;
+	int	       i;
+
+	stop_on_change = cfs_hash_with_rehash_key(hs) ||
+			 !cfs_hash_with_no_itemref(hs) ||
+			 hs->hs_ops->hs_put_locked == NULL;
+	cfs_hash_lock(hs, 0);
+	LASSERT(!cfs_hash_is_rehashing(hs));
+
+	cfs_hash_for_each_bucket(hs, &bd, i) {
+		struct hlist_head *hhead;
+
+		cfs_hash_bd_lock(hs, &bd, 0);
+		version = cfs_hash_bd_version_get(&bd);
+
+		cfs_hash_bd_for_each_hlist(hs, &bd, hhead) {
+			for (hnode = hhead->first; hnode != NULL;) {
+				cfs_hash_bucket_validate(hs, &bd, hnode);
+				cfs_hash_get(hs, hnode);
+				cfs_hash_bd_unlock(hs, &bd, 0);
+				cfs_hash_unlock(hs, 0);
+
+				rc = func(hs, &bd, hnode, data);
+				if (stop_on_change)
+					cfs_hash_put(hs, hnode);
+				cond_resched();
+				count++;
+
+				cfs_hash_lock(hs, 0);
+				cfs_hash_bd_lock(hs, &bd, 0);
+				if (!stop_on_change) {
+					tmp = hnode->next;
+					cfs_hash_put_locked(hs, hnode);
+					hnode = tmp;
+				} else { /* bucket changed? */
+					if (version !=
+					    cfs_hash_bd_version_get(&bd))
+						break;
+					/* safe to continue because no change */
+					hnode = hnode->next;
+				}
+				if (rc) /* callback wants to break iteration */
+					break;
+			}
+		}
+		cfs_hash_bd_unlock(hs, &bd, 0);
+	}
+	cfs_hash_unlock(hs, 0);
+
+	return count;
+}
+
+int
+cfs_hash_for_each_nolock(struct cfs_hash *hs,
+			 cfs_hash_for_each_cb_t func, void *data) {
+	if (cfs_hash_with_no_lock(hs) ||
+	    cfs_hash_with_rehash_key(hs) ||
+	    !cfs_hash_with_no_itemref(hs))
+		return -EOPNOTSUPP;
+
+	if (hs->hs_ops->hs_get == NULL ||
+	    (hs->hs_ops->hs_put == NULL &&
+	     hs->hs_ops->hs_put_locked == NULL))
+		return -EOPNOTSUPP;
+
+	cfs_hash_for_each_enter(hs);
+	cfs_hash_for_each_relax(hs, func, data);
+	cfs_hash_for_each_exit(hs);
+
+	return 0;
+}
+EXPORT_SYMBOL(cfs_hash_for_each_nolock);
+
+/**
+ * For each hash bucket in the libcfs hash @hs call the passed callback
+ * @func until all the hash buckets are empty.  The passed callback @func
+ * or the previously registered callback hs->hs_put must remove the item
+ * from the hash.  You may either use the cfs_hash_del() or hlist_del()
+ * functions.  No rwlocks will be held during the callback @func it is
+ * safe to sleep if needed.  This function will not terminate until the
+ * hash is empty.  Note it is still possible to concurrently add new
+ * items in to the hash.  It is the callers responsibility to ensure
+ * the required locking is in place to prevent concurrent insertions.
+ */
+int
+cfs_hash_for_each_empty(struct cfs_hash *hs,
+			cfs_hash_for_each_cb_t func, void *data) {
+	unsigned  i = 0;
+
+	if (cfs_hash_with_no_lock(hs))
+		return -EOPNOTSUPP;
+
+	if (hs->hs_ops->hs_get == NULL ||
+	    (hs->hs_ops->hs_put == NULL &&
+	     hs->hs_ops->hs_put_locked == NULL))
+		return -EOPNOTSUPP;
+
+	cfs_hash_for_each_enter(hs);
+	while (cfs_hash_for_each_relax(hs, func, data)) {
+		CDEBUG(D_INFO, "Try to empty hash: %s, loop: %u\n",
+		       hs->hs_name, i++);
+	}
+	cfs_hash_for_each_exit(hs);
+	return 0;
+}
+EXPORT_SYMBOL(cfs_hash_for_each_empty);
+
+void
+cfs_hash_hlist_for_each(struct cfs_hash *hs, unsigned hindex,
+			cfs_hash_for_each_cb_t func, void *data)
+{
+	struct hlist_head   *hhead;
+	struct hlist_node   *hnode;
+	struct cfs_hash_bd       bd;
+
+	cfs_hash_for_each_enter(hs);
+	cfs_hash_lock(hs, 0);
+	if (hindex >= CFS_HASH_NHLIST(hs))
+		goto out;
+
+	cfs_hash_bd_index_set(hs, hindex, &bd);
+
+	cfs_hash_bd_lock(hs, &bd, 0);
+	hhead = cfs_hash_bd_hhead(hs, &bd);
+	hlist_for_each(hnode, hhead) {
+		if (func(hs, &bd, hnode, data))
+			break;
+	}
+	cfs_hash_bd_unlock(hs, &bd, 0);
+ out:
+	cfs_hash_unlock(hs, 0);
+	cfs_hash_for_each_exit(hs);
+}
+
+EXPORT_SYMBOL(cfs_hash_hlist_for_each);
+
+/*
+ * For each item in the libcfs hash @hs which matches the @key call
+ * the passed callback @func and pass to it as an argument each hash
+ * item and the private @data. During the callback the bucket lock
+ * is held so the callback must never sleep.
+   */
+void
+cfs_hash_for_each_key(struct cfs_hash *hs, const void *key,
+		      cfs_hash_for_each_cb_t func, void *data) {
+	struct hlist_node   *hnode;
+	struct cfs_hash_bd       bds[2];
+	unsigned	    i;
+
+	cfs_hash_lock(hs, 0);
+
+	cfs_hash_dual_bd_get_and_lock(hs, key, bds, 0);
+
+	cfs_hash_for_each_bd(bds, 2, i) {
+		struct hlist_head *hlist = cfs_hash_bd_hhead(hs, &bds[i]);
+
+		hlist_for_each(hnode, hlist) {
+			cfs_hash_bucket_validate(hs, &bds[i], hnode);
+
+			if (cfs_hash_keycmp(hs, key, hnode)) {
+				if (func(hs, &bds[i], hnode, data))
+					break;
+			}
+		}
+	}
+
+	cfs_hash_dual_bd_unlock(hs, bds, 0);
+	cfs_hash_unlock(hs, 0);
+}
+EXPORT_SYMBOL(cfs_hash_for_each_key);
+
+/**
+ * Rehash the libcfs hash @hs to the given @bits.  This can be used
+ * to grow the hash size when excessive chaining is detected, or to
+ * shrink the hash when it is larger than needed.  When the CFS_HASH_REHASH
+ * flag is set in @hs the libcfs hash may be dynamically rehashed
+ * during addition or removal if the hash's theta value exceeds
+ * either the hs->hs_min_theta or hs->max_theta values.  By default
+ * these values are tuned to keep the chained hash depth small, and
+ * this approach assumes a reasonably uniform hashing function.  The
+ * theta thresholds for @hs are tunable via cfs_hash_set_theta().
+ */
+void
+cfs_hash_rehash_cancel_locked(struct cfs_hash *hs)
+{
+	int     i;
+
+	/* need hold cfs_hash_lock(hs, 1) */
+	LASSERT(cfs_hash_with_rehash(hs) &&
+		!cfs_hash_with_no_lock(hs));
+
+	if (!cfs_hash_is_rehashing(hs))
+		return;
+
+	if (cfs_wi_deschedule(cfs_sched_rehash, &hs->hs_rehash_wi)) {
+		hs->hs_rehash_bits = 0;
+		return;
+	}
+
+	for (i = 2; cfs_hash_is_rehashing(hs); i++) {
+		cfs_hash_unlock(hs, 1);
+		/* raise console warning while waiting too long */
+		CDEBUG(IS_PO2(i >> 3) ? D_WARNING : D_INFO,
+		       "hash %s is still rehashing, rescheded %d\n",
+		       hs->hs_name, i - 1);
+		cond_resched();
+		cfs_hash_lock(hs, 1);
+	}
+}
+EXPORT_SYMBOL(cfs_hash_rehash_cancel_locked);
+
+void
+cfs_hash_rehash_cancel(struct cfs_hash *hs)
+{
+	cfs_hash_lock(hs, 1);
+	cfs_hash_rehash_cancel_locked(hs);
+	cfs_hash_unlock(hs, 1);
+}
+EXPORT_SYMBOL(cfs_hash_rehash_cancel);
+
+int
+cfs_hash_rehash(struct cfs_hash *hs, int do_rehash)
+{
+	int     rc;
+
+	LASSERT(cfs_hash_with_rehash(hs) && !cfs_hash_with_no_lock(hs));
+
+	cfs_hash_lock(hs, 1);
+
+	rc = cfs_hash_rehash_bits(hs);
+	if (rc <= 0) {
+		cfs_hash_unlock(hs, 1);
+		return rc;
+	}
+
+	hs->hs_rehash_bits = rc;
+	if (!do_rehash) {
+		/* launch and return */
+		cfs_wi_schedule(cfs_sched_rehash, &hs->hs_rehash_wi);
+		cfs_hash_unlock(hs, 1);
+		return 0;
+	}
+
+	/* rehash right now */
+	cfs_hash_unlock(hs, 1);
+
+	return cfs_hash_rehash_worker(&hs->hs_rehash_wi);
+}
+EXPORT_SYMBOL(cfs_hash_rehash);
+
+static int
+cfs_hash_rehash_bd(struct cfs_hash *hs, struct cfs_hash_bd *old)
+{
+	struct cfs_hash_bd      new;
+	struct hlist_head  *hhead;
+	struct hlist_node  *hnode;
+	struct hlist_node  *pos;
+	void	      *key;
+	int		c = 0;
+
+	/* hold cfs_hash_lock(hs, 1), so don't need any bucket lock */
+	cfs_hash_bd_for_each_hlist(hs, old, hhead) {
+		hlist_for_each_safe(hnode, pos, hhead) {
+			key = cfs_hash_key(hs, hnode);
+			LASSERT(key != NULL);
+			/* Validate hnode is in the correct bucket. */
+			cfs_hash_bucket_validate(hs, old, hnode);
+			/*
+			 * Delete from old hash bucket; move to new bucket.
+			 * ops->hs_key must be defined.
+			 */
+			cfs_hash_bd_from_key(hs, hs->hs_rehash_buckets,
+					     hs->hs_rehash_bits, key, &new);
+			cfs_hash_bd_move_locked(hs, old, &new, hnode);
+			c++;
+		}
+	}
+
+	return c;
+}
+
+static int
+cfs_hash_rehash_worker(cfs_workitem_t *wi)
+{
+	struct cfs_hash	 *hs = container_of(wi, struct cfs_hash, hs_rehash_wi);
+	struct cfs_hash_bucket **bkts;
+	struct cfs_hash_bd       bd;
+	unsigned int	old_size;
+	unsigned int	new_size;
+	int		 bsize;
+	int		 count = 0;
+	int		 rc = 0;
+	int		 i;
+
+	LASSERT (hs != NULL && cfs_hash_with_rehash(hs));
+
+	cfs_hash_lock(hs, 0);
+	LASSERT(cfs_hash_is_rehashing(hs));
+
+	old_size = CFS_HASH_NBKT(hs);
+	new_size = CFS_HASH_RH_NBKT(hs);
+
+	cfs_hash_unlock(hs, 0);
+
+	/*
+	 * don't need hs::hs_rwlock for hs::hs_buckets,
+	 * because nobody can change bkt-table except me.
+	 */
+	bkts = cfs_hash_buckets_realloc(hs, hs->hs_buckets,
+					old_size, new_size);
+	cfs_hash_lock(hs, 1);
+	if (bkts == NULL) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	if (bkts == hs->hs_buckets) {
+		bkts = NULL; /* do nothing */
+		goto out;
+	}
+
+	rc = __cfs_hash_theta(hs);
+	if ((rc >= hs->hs_min_theta) && (rc <= hs->hs_max_theta)) {
+		/* free the new allocated bkt-table */
+		old_size = new_size;
+		new_size = CFS_HASH_NBKT(hs);
+		rc = -EALREADY;
+		goto out;
+	}
+
+	LASSERT(hs->hs_rehash_buckets == NULL);
+	hs->hs_rehash_buckets = bkts;
+
+	rc = 0;
+	cfs_hash_for_each_bucket(hs, &bd, i) {
+		if (cfs_hash_is_exiting(hs)) {
+			rc = -ESRCH;
+			/* someone wants to destroy the hash, abort now */
+			if (old_size < new_size) /* OK to free old bkt-table */
+				break;
+			/* it's shrinking, need free new bkt-table */
+			hs->hs_rehash_buckets = NULL;
+			old_size = new_size;
+			new_size = CFS_HASH_NBKT(hs);
+			goto out;
+		}
+
+		count += cfs_hash_rehash_bd(hs, &bd);
+		if (count < CFS_HASH_LOOP_HOG ||
+		    cfs_hash_is_iterating(hs)) { /* need to finish ASAP */
+			continue;
+		}
+
+		count = 0;
+		cfs_hash_unlock(hs, 1);
+		cond_resched();
+		cfs_hash_lock(hs, 1);
+	}
+
+	hs->hs_rehash_count++;
+
+	bkts = hs->hs_buckets;
+	hs->hs_buckets = hs->hs_rehash_buckets;
+	hs->hs_rehash_buckets = NULL;
+
+	hs->hs_cur_bits = hs->hs_rehash_bits;
+ out:
+	hs->hs_rehash_bits = 0;
+	if (rc == -ESRCH) /* never be scheduled again */
+		cfs_wi_exit(cfs_sched_rehash, wi);
+	bsize = cfs_hash_bkt_size(hs);
+	cfs_hash_unlock(hs, 1);
+	/* can't refer to @hs anymore because it could be destroyed */
+	if (bkts != NULL)
+		cfs_hash_buckets_free(bkts, bsize, new_size, old_size);
+	if (rc != 0)
+		CDEBUG(D_INFO, "early quit of rehashing: %d\n", rc);
+	/* return 1 only if cfs_wi_exit is called */
+	return rc == -ESRCH;
+}
+
+/**
+ * Rehash the object referenced by @hnode in the libcfs hash @hs.  The
+ * @old_key must be provided to locate the objects previous location
+ * in the hash, and the @new_key will be used to reinsert the object.
+ * Use this function instead of a cfs_hash_add() + cfs_hash_del()
+ * combo when it is critical that there is no window in time where the
+ * object is missing from the hash.  When an object is being rehashed
+ * the registered cfs_hash_get() and cfs_hash_put() functions will
+ * not be called.
+ */
+void cfs_hash_rehash_key(struct cfs_hash *hs, const void *old_key,
+			 void *new_key, struct hlist_node *hnode)
+{
+	struct cfs_hash_bd	bds[3];
+	struct cfs_hash_bd	old_bds[2];
+	struct cfs_hash_bd	new_bd;
+
+	LASSERT(!hlist_unhashed(hnode));
+
+	cfs_hash_lock(hs, 0);
+
+	cfs_hash_dual_bd_get(hs, old_key, old_bds);
+	cfs_hash_bd_get(hs, new_key, &new_bd);
+
+	bds[0] = old_bds[0];
+	bds[1] = old_bds[1];
+	bds[2] = new_bd;
+
+	/* NB: bds[0] and bds[1] are ordered already */
+	cfs_hash_bd_order(&bds[1], &bds[2]);
+	cfs_hash_bd_order(&bds[0], &bds[1]);
+
+	cfs_hash_multi_bd_lock(hs, bds, 3, 1);
+	if (likely(old_bds[1].bd_bucket == NULL)) {
+		cfs_hash_bd_move_locked(hs, &old_bds[0], &new_bd, hnode);
+	} else {
+		cfs_hash_dual_bd_finddel_locked(hs, old_bds, old_key, hnode);
+		cfs_hash_bd_add_locked(hs, &new_bd, hnode);
+	}
+	/* overwrite key inside locks, otherwise may screw up with
+	 * other operations, i.e: rehash */
+	cfs_hash_keycpy(hs, new_key, hnode);
+
+	cfs_hash_multi_bd_unlock(hs, bds, 3, 1);
+	cfs_hash_unlock(hs, 0);
+}
+EXPORT_SYMBOL(cfs_hash_rehash_key);
+
+void cfs_hash_debug_header(struct seq_file *m)
+{
+	seq_printf(m, "%-*s   cur   min   max theta t-min t-max flags rehash   count  maxdep maxdepb distribution\n",
+		   CFS_HASH_BIGNAME_LEN, "name");
+}
+EXPORT_SYMBOL(cfs_hash_debug_header);
+
+static struct cfs_hash_bucket **
+cfs_hash_full_bkts(struct cfs_hash *hs)
+{
+	/* NB: caller should hold hs->hs_rwlock if REHASH is set */
+	if (hs->hs_rehash_buckets == NULL)
+		return hs->hs_buckets;
+
+	LASSERT(hs->hs_rehash_bits != 0);
+	return hs->hs_rehash_bits > hs->hs_cur_bits ?
+	       hs->hs_rehash_buckets : hs->hs_buckets;
+}
+
+static unsigned int
+cfs_hash_full_nbkt(struct cfs_hash *hs)
+{
+	/* NB: caller should hold hs->hs_rwlock if REHASH is set */
+	if (hs->hs_rehash_buckets == NULL)
+		return CFS_HASH_NBKT(hs);
+
+	LASSERT(hs->hs_rehash_bits != 0);
+	return hs->hs_rehash_bits > hs->hs_cur_bits ?
+	       CFS_HASH_RH_NBKT(hs) : CFS_HASH_NBKT(hs);
+}
+
+void cfs_hash_debug_str(struct cfs_hash *hs, struct seq_file *m)
+{
+	int		    dist[8] = { 0, };
+	int		    maxdep  = -1;
+	int		    maxdepb = -1;
+	int		    total   = 0;
+	int		    theta;
+	int		    i;
+
+	cfs_hash_lock(hs, 0);
+	theta = __cfs_hash_theta(hs);
+
+	seq_printf(m, "%-*s %5d %5d %5d %d.%03d %d.%03d %d.%03d  0x%02x %6d ",
+		      CFS_HASH_BIGNAME_LEN, hs->hs_name,
+		      1 << hs->hs_cur_bits, 1 << hs->hs_min_bits,
+		      1 << hs->hs_max_bits,
+		      __cfs_hash_theta_int(theta), __cfs_hash_theta_frac(theta),
+		      __cfs_hash_theta_int(hs->hs_min_theta),
+		      __cfs_hash_theta_frac(hs->hs_min_theta),
+		      __cfs_hash_theta_int(hs->hs_max_theta),
+		      __cfs_hash_theta_frac(hs->hs_max_theta),
+		      hs->hs_flags, hs->hs_rehash_count);
+
+	/*
+	 * The distribution is a summary of the chained hash depth in
+	 * each of the libcfs hash buckets.  Each buckets hsb_count is
+	 * divided by the hash theta value and used to generate a
+	 * histogram of the hash distribution.  A uniform hash will
+	 * result in all hash buckets being close to the average thus
+	 * only the first few entries in the histogram will be non-zero.
+	 * If you hash function results in a non-uniform hash the will
+	 * be observable by outlier bucks in the distribution histogram.
+	 *
+	 * Uniform hash distribution:      128/128/0/0/0/0/0/0
+	 * Non-Uniform hash distribution:  128/125/0/0/0/0/2/1
+	 */
+	for (i = 0; i < cfs_hash_full_nbkt(hs); i++) {
+		struct cfs_hash_bd  bd;
+
+		bd.bd_bucket = cfs_hash_full_bkts(hs)[i];
+		cfs_hash_bd_lock(hs, &bd, 0);
+		if (maxdep < bd.bd_bucket->hsb_depmax) {
+			maxdep  = bd.bd_bucket->hsb_depmax;
+			maxdepb = ffz(~maxdep);
+		}
+		total += bd.bd_bucket->hsb_count;
+		dist[min(fls(bd.bd_bucket->hsb_count / max(theta, 1)), 7)]++;
+		cfs_hash_bd_unlock(hs, &bd, 0);
+	}
+
+	seq_printf(m, "%7d %7d %7d ", total, maxdep, maxdepb);
+	for (i = 0; i < 8; i++)
+		seq_printf(m, "%d%c",  dist[i], (i == 7) ? '\n' : '/');
+
+	cfs_hash_unlock(hs, 0);
+}
+EXPORT_SYMBOL(cfs_hash_debug_str);
diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/kernel_user_comm.c b/kernel/drivers/staging/lustre/lustre/libcfs/kernel_user_comm.c
new file mode 100644
index 000000000..d9b7c6b69
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lustre/libcfs/kernel_user_comm.c
@@ -0,0 +1,240 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Author: Nathan Rutman <nathan.rutman@sun.com>
+ *
+ * Kernel <-> userspace communication routines.
+ * Using pipes for all arches.
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+#define D_KUC D_OTHER
+
+#include "../../include/linux/libcfs/libcfs.h"
+
+/* This is the kernel side (liblustre as well). */
+
+/**
+ * libcfs_kkuc_msg_put - send an message from kernel to userspace
+ * @param fp to send the message to
+ * @param payload Payload data.  First field of payload is always
+ *   struct kuc_hdr
+ */
+int libcfs_kkuc_msg_put(struct file *filp, void *payload)
+{
+	struct kuc_hdr *kuch = (struct kuc_hdr *)payload;
+	ssize_t count = kuch->kuc_msglen;
+	loff_t offset = 0;
+	mm_segment_t fs;
+	int rc = -ENOSYS;
+
+	if (filp == NULL || IS_ERR(filp))
+		return -EBADF;
+
+	if (kuch->kuc_magic != KUC_MAGIC) {
+		CERROR("KernelComm: bad magic %x\n", kuch->kuc_magic);
+		return -ENOSYS;
+	}
+
+	fs = get_fs();
+	set_fs(KERNEL_DS);
+	while (count > 0) {
+		rc = vfs_write(filp, (void __force __user *)payload,
+			       count, &offset);
+		if (rc < 0)
+			break;
+		count -= rc;
+		payload += rc;
+		rc = 0;
+	}
+	set_fs(fs);
+
+	if (rc < 0)
+		CWARN("message send failed (%d)\n", rc);
+	else
+		CDEBUG(D_KUC, "Sent message rc=%d, fp=%p\n", rc, filp);
+
+	return rc;
+}
+EXPORT_SYMBOL(libcfs_kkuc_msg_put);
+
+/* Broadcast groups are global across all mounted filesystems;
+ * i.e. registering for a group on 1 fs will get messages for that
+ * group from any fs */
+/** A single group registration has a uid and a file pointer */
+struct kkuc_reg {
+	struct list_head	kr_chain;
+	int		kr_uid;
+	struct file	*kr_fp;
+	__u32		kr_data;
+};
+static struct list_head kkuc_groups[KUC_GRP_MAX+1] = {};
+/* Protect message sending against remove and adds */
+static DECLARE_RWSEM(kg_sem);
+
+/** Add a receiver to a broadcast group
+ * @param filp pipe to write into
+ * @param uid identifier for this receiver
+ * @param group group number
+ */
+int libcfs_kkuc_group_add(struct file *filp, int uid, int group, __u32 data)
+{
+	struct kkuc_reg *reg;
+
+	if (group > KUC_GRP_MAX) {
+		CDEBUG(D_WARNING, "Kernelcomm: bad group %d\n", group);
+		return -EINVAL;
+	}
+
+	/* fput in group_rem */
+	if (filp == NULL)
+		return -EBADF;
+
+	/* freed in group_rem */
+	reg = kmalloc(sizeof(*reg), 0);
+	if (reg == NULL)
+		return -ENOMEM;
+
+	reg->kr_fp = filp;
+	reg->kr_uid = uid;
+	reg->kr_data = data;
+
+	down_write(&kg_sem);
+	if (kkuc_groups[group].next == NULL)
+		INIT_LIST_HEAD(&kkuc_groups[group]);
+	list_add(&reg->kr_chain, &kkuc_groups[group]);
+	up_write(&kg_sem);
+
+	CDEBUG(D_KUC, "Added uid=%d fp=%p to group %d\n", uid, filp, group);
+
+	return 0;
+}
+EXPORT_SYMBOL(libcfs_kkuc_group_add);
+
+int libcfs_kkuc_group_rem(int uid, int group)
+{
+	struct kkuc_reg *reg, *next;
+
+	if (kkuc_groups[group].next == NULL)
+		return 0;
+
+	if (uid == 0) {
+		/* Broadcast a shutdown message */
+		struct kuc_hdr lh;
+
+		lh.kuc_magic = KUC_MAGIC;
+		lh.kuc_transport = KUC_TRANSPORT_GENERIC;
+		lh.kuc_msgtype = KUC_MSG_SHUTDOWN;
+		lh.kuc_msglen = sizeof(lh);
+		libcfs_kkuc_group_put(group, &lh);
+	}
+
+	down_write(&kg_sem);
+	list_for_each_entry_safe(reg, next, &kkuc_groups[group], kr_chain) {
+		if ((uid == 0) || (uid == reg->kr_uid)) {
+			list_del(&reg->kr_chain);
+			CDEBUG(D_KUC, "Removed uid=%d fp=%p from group %d\n",
+			       reg->kr_uid, reg->kr_fp, group);
+			if (reg->kr_fp != NULL)
+				fput(reg->kr_fp);
+			kfree(reg);
+		}
+	}
+	up_write(&kg_sem);
+
+	return 0;
+}
+EXPORT_SYMBOL(libcfs_kkuc_group_rem);
+
+int libcfs_kkuc_group_put(int group, void *payload)
+{
+	struct kkuc_reg	*reg;
+	int		 rc = 0;
+	int one_success = 0;
+
+	down_read(&kg_sem);
+	list_for_each_entry(reg, &kkuc_groups[group], kr_chain) {
+		if (reg->kr_fp != NULL) {
+			rc = libcfs_kkuc_msg_put(reg->kr_fp, payload);
+			if (rc == 0)
+				one_success = 1;
+			else if (rc == -EPIPE) {
+				fput(reg->kr_fp);
+				reg->kr_fp = NULL;
+			}
+		}
+	}
+	up_read(&kg_sem);
+
+	/* don't return an error if the message has been delivered
+	 * at least to one agent */
+	if (one_success)
+		rc = 0;
+
+	return rc;
+}
+EXPORT_SYMBOL(libcfs_kkuc_group_put);
+
+/**
+ * Calls a callback function for each link of the given kuc group.
+ * @param group the group to call the function on.
+ * @param cb_func the function to be called.
+ * @param cb_arg iextra argument to be passed to the callback function.
+ */
+int libcfs_kkuc_group_foreach(int group, libcfs_kkuc_cb_t cb_func,
+			      void *cb_arg)
+{
+	struct kkuc_reg *reg;
+	int rc = 0;
+
+	if (group > KUC_GRP_MAX) {
+		CDEBUG(D_WARNING, "Kernelcomm: bad group %d\n", group);
+		return -EINVAL;
+	}
+
+	/* no link for this group */
+	if (kkuc_groups[group].next == NULL)
+		return 0;
+
+	down_write(&kg_sem);
+	list_for_each_entry(reg, &kkuc_groups[group], kr_chain) {
+		if (reg->kr_fp != NULL)
+			rc = cb_func(reg->kr_data, cb_arg);
+	}
+	up_write(&kg_sem);
+
+	return rc;
+}
+EXPORT_SYMBOL(libcfs_kkuc_group_foreach);
diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/libcfs_cpu.c b/kernel/drivers/staging/lustre/lustre/libcfs/libcfs_cpu.c
new file mode 100644
index 000000000..31a558115
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lustre/libcfs/libcfs_cpu.c
@@ -0,0 +1,224 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Please see comments in libcfs/include/libcfs/libcfs_cpu.h for introduction
+ *
+ * Author: liang@whamcloud.com
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include "../../include/linux/libcfs/libcfs.h"
+
+/** Global CPU partition table */
+struct cfs_cpt_table   *cfs_cpt_table __read_mostly;
+EXPORT_SYMBOL(cfs_cpt_table);
+
+#ifndef HAVE_LIBCFS_CPT
+
+#define CFS_CPU_VERSION_MAGIC	   0xbabecafe
+
+struct cfs_cpt_table *
+cfs_cpt_table_alloc(unsigned int ncpt)
+{
+	struct cfs_cpt_table *cptab;
+
+	if (ncpt != 1) {
+		CERROR("Can't support cpu partition number %d\n", ncpt);
+		return NULL;
+	}
+
+	LIBCFS_ALLOC(cptab, sizeof(*cptab));
+	if (cptab != NULL) {
+		cptab->ctb_version = CFS_CPU_VERSION_MAGIC;
+		cptab->ctb_nparts  = ncpt;
+	}
+
+	return cptab;
+}
+EXPORT_SYMBOL(cfs_cpt_table_alloc);
+
+void
+cfs_cpt_table_free(struct cfs_cpt_table *cptab)
+{
+	LASSERT(cptab->ctb_version == CFS_CPU_VERSION_MAGIC);
+
+	LIBCFS_FREE(cptab, sizeof(*cptab));
+}
+EXPORT_SYMBOL(cfs_cpt_table_free);
+
+#ifdef CONFIG_SMP
+int
+cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len)
+{
+	int	rc = 0;
+
+	rc = snprintf(buf, len, "%d\t: %d\n", 0, 0);
+	len -= rc;
+	if (len <= 0)
+		return -EFBIG;
+
+	return rc;
+}
+EXPORT_SYMBOL(cfs_cpt_table_print);
+#endif /* CONFIG_SMP */
+
+int
+cfs_cpt_number(struct cfs_cpt_table *cptab)
+{
+	return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_number);
+
+int
+cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt)
+{
+	return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_weight);
+
+int
+cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt)
+{
+	return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_online);
+
+int
+cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
+{
+	return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_set_cpu);
+
+void
+cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
+{
+}
+EXPORT_SYMBOL(cfs_cpt_unset_cpu);
+
+int
+cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask)
+{
+	return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_set_cpumask);
+
+void
+cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask)
+{
+}
+EXPORT_SYMBOL(cfs_cpt_unset_cpumask);
+
+int
+cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node)
+{
+	return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_set_node);
+
+void
+cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node)
+{
+}
+EXPORT_SYMBOL(cfs_cpt_unset_node);
+
+int
+cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask)
+{
+	return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_set_nodemask);
+
+void
+cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask)
+{
+}
+EXPORT_SYMBOL(cfs_cpt_unset_nodemask);
+
+void
+cfs_cpt_clear(struct cfs_cpt_table *cptab, int cpt)
+{
+}
+EXPORT_SYMBOL(cfs_cpt_clear);
+
+int
+cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt)
+{
+	return 0;
+}
+EXPORT_SYMBOL(cfs_cpt_spread_node);
+
+int
+cfs_cpu_ht_nsiblings(int cpu)
+{
+	return 1;
+}
+EXPORT_SYMBOL(cfs_cpu_ht_nsiblings);
+
+int
+cfs_cpt_current(struct cfs_cpt_table *cptab, int remap)
+{
+	return 0;
+}
+EXPORT_SYMBOL(cfs_cpt_current);
+
+int
+cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu)
+{
+	return 0;
+}
+EXPORT_SYMBOL(cfs_cpt_of_cpu);
+
+int
+cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt)
+{
+	return 0;
+}
+EXPORT_SYMBOL(cfs_cpt_bind);
+
+void
+cfs_cpu_fini(void)
+{
+	if (cfs_cpt_table != NULL) {
+		cfs_cpt_table_free(cfs_cpt_table);
+		cfs_cpt_table = NULL;
+	}
+}
+
+int
+cfs_cpu_init(void)
+{
+	cfs_cpt_table = cfs_cpt_table_alloc(1);
+
+	return cfs_cpt_table != NULL ? 0 : -1;
+}
+
+#endif /* HAVE_LIBCFS_CPT */
diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/libcfs_lock.c b/kernel/drivers/staging/lustre/lustre/libcfs/libcfs_lock.c
new file mode 100644
index 000000000..2c199c725
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lustre/libcfs/libcfs_lock.c
@@ -0,0 +1,189 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Author: liang@whamcloud.com
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include "../../include/linux/libcfs/libcfs.h"
+
+
+/** destroy cpu-partition lock, see libcfs_private.h for more detail */
+void
+cfs_percpt_lock_free(struct cfs_percpt_lock *pcl)
+{
+	LASSERT(pcl->pcl_locks != NULL);
+	LASSERT(!pcl->pcl_locked);
+
+	cfs_percpt_free(pcl->pcl_locks);
+	LIBCFS_FREE(pcl, sizeof(*pcl));
+}
+EXPORT_SYMBOL(cfs_percpt_lock_free);
+
+/**
+ * create cpu-partition lock, see libcfs_private.h for more detail.
+ *
+ * cpu-partition lock is designed for large-scale SMP system, so we need to
+ * reduce cacheline conflict as possible as we can, that's the
+ * reason we always allocate cacheline-aligned memory block.
+ */
+struct cfs_percpt_lock *
+cfs_percpt_lock_alloc(struct cfs_cpt_table *cptab)
+{
+	struct cfs_percpt_lock	*pcl;
+	spinlock_t		*lock;
+	int			i;
+
+	/* NB: cptab can be NULL, pcl will be for HW CPUs on that case */
+	LIBCFS_ALLOC(pcl, sizeof(*pcl));
+	if (pcl == NULL)
+		return NULL;
+
+	pcl->pcl_cptab = cptab;
+	pcl->pcl_locks = cfs_percpt_alloc(cptab, sizeof(*lock));
+	if (pcl->pcl_locks == NULL) {
+		LIBCFS_FREE(pcl, sizeof(*pcl));
+		return NULL;
+	}
+
+	cfs_percpt_for_each(lock, i, pcl->pcl_locks)
+		spin_lock_init(lock);
+
+	return pcl;
+}
+EXPORT_SYMBOL(cfs_percpt_lock_alloc);
+
+/**
+ * lock a CPU partition
+ *
+ * \a index != CFS_PERCPT_LOCK_EX
+ *     hold private lock indexed by \a index
+ *
+ * \a index == CFS_PERCPT_LOCK_EX
+ *     exclusively lock @pcl and nobody can take private lock
+ */
+void
+cfs_percpt_lock(struct cfs_percpt_lock *pcl, int index)
+{
+	int	ncpt = cfs_cpt_number(pcl->pcl_cptab);
+	int	i;
+
+	LASSERT(index >= CFS_PERCPT_LOCK_EX && index < ncpt);
+
+	if (ncpt == 1) {
+		index = 0;
+	} else { /* serialize with exclusive lock */
+		while (pcl->pcl_locked)
+			cpu_relax();
+	}
+
+	if (likely(index != CFS_PERCPT_LOCK_EX)) {
+		spin_lock(pcl->pcl_locks[index]);
+		return;
+	}
+
+	/* exclusive lock request */
+	for (i = 0; i < ncpt; i++) {
+		spin_lock(pcl->pcl_locks[i]);
+		if (i == 0) {
+			LASSERT(!pcl->pcl_locked);
+			/* nobody should take private lock after this
+			 * so I wouldn't starve for too long time */
+			pcl->pcl_locked = 1;
+		}
+	}
+}
+EXPORT_SYMBOL(cfs_percpt_lock);
+
+/** unlock a CPU partition */
+void
+cfs_percpt_unlock(struct cfs_percpt_lock *pcl, int index)
+{
+	int	ncpt = cfs_cpt_number(pcl->pcl_cptab);
+	int	i;
+
+	index = ncpt == 1 ? 0 : index;
+
+	if (likely(index != CFS_PERCPT_LOCK_EX)) {
+		spin_unlock(pcl->pcl_locks[index]);
+		return;
+	}
+
+	for (i = ncpt - 1; i >= 0; i--) {
+		if (i == 0) {
+			LASSERT(pcl->pcl_locked);
+			pcl->pcl_locked = 0;
+		}
+		spin_unlock(pcl->pcl_locks[i]);
+	}
+}
+EXPORT_SYMBOL(cfs_percpt_unlock);
+
+
+/** free cpu-partition refcount */
+void
+cfs_percpt_atomic_free(atomic_t **refs)
+{
+	cfs_percpt_free(refs);
+}
+EXPORT_SYMBOL(cfs_percpt_atomic_free);
+
+/** allocate cpu-partition refcount with initial value @init_val */
+atomic_t **
+cfs_percpt_atomic_alloc(struct cfs_cpt_table *cptab, int init_val)
+{
+	atomic_t	**refs;
+	atomic_t	*ref;
+	int		i;
+
+	refs = cfs_percpt_alloc(cptab, sizeof(*ref));
+	if (refs == NULL)
+		return NULL;
+
+	cfs_percpt_for_each(ref, i, refs)
+		atomic_set(ref, init_val);
+	return refs;
+}
+EXPORT_SYMBOL(cfs_percpt_atomic_alloc);
+
+/** return sum of cpu-partition refs */
+int
+cfs_percpt_atomic_summary(atomic_t **refs)
+{
+	atomic_t	*ref;
+	int		i;
+	int		val = 0;
+
+	cfs_percpt_for_each(ref, i, refs)
+		val += atomic_read(ref);
+
+	return val;
+}
+EXPORT_SYMBOL(cfs_percpt_atomic_summary);
diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/libcfs_mem.c b/kernel/drivers/staging/lustre/lustre/libcfs/libcfs_mem.c
new file mode 100644
index 000000000..1debdda72
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lustre/libcfs/libcfs_mem.c
@@ -0,0 +1,202 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Author: liang@whamcloud.com
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include "../../include/linux/libcfs/libcfs.h"
+
+struct cfs_var_array {
+	unsigned int		va_count;	/* # of buffers */
+	unsigned int		va_size;	/* size of each var */
+	struct cfs_cpt_table	*va_cptab;	/* cpu partition table */
+	void			*va_ptrs[0];	/* buffer addresses */
+};
+
+/*
+ * free per-cpu data, see more detail in cfs_percpt_free
+ */
+void
+cfs_percpt_free(void *vars)
+{
+	struct	cfs_var_array *arr;
+	int	i;
+
+	arr = container_of(vars, struct cfs_var_array, va_ptrs[0]);
+
+	for (i = 0; i < arr->va_count; i++) {
+		if (arr->va_ptrs[i] != NULL)
+			LIBCFS_FREE(arr->va_ptrs[i], arr->va_size);
+	}
+
+	LIBCFS_FREE(arr, offsetof(struct cfs_var_array,
+				  va_ptrs[arr->va_count]));
+}
+EXPORT_SYMBOL(cfs_percpt_free);
+
+/*
+ * allocate per cpu-partition variables, returned value is an array of pointers,
+ * variable can be indexed by CPU partition ID, i.e:
+ *
+ *	arr = cfs_percpt_alloc(cfs_cpu_pt, size);
+ *	then caller can access memory block for CPU 0 by arr[0],
+ *	memory block for CPU 1 by arr[1]...
+ *	memory block for CPU N by arr[N]...
+ *
+ * cacheline aligned.
+ */
+void *
+cfs_percpt_alloc(struct cfs_cpt_table *cptab, unsigned int size)
+{
+	struct cfs_var_array	*arr;
+	int			count;
+	int			i;
+
+	count = cfs_cpt_number(cptab);
+
+	LIBCFS_ALLOC(arr, offsetof(struct cfs_var_array, va_ptrs[count]));
+	if (arr == NULL)
+		return NULL;
+
+	arr->va_size	= size = L1_CACHE_ALIGN(size);
+	arr->va_count	= count;
+	arr->va_cptab	= cptab;
+
+	for (i = 0; i < count; i++) {
+		LIBCFS_CPT_ALLOC(arr->va_ptrs[i], cptab, i, size);
+		if (arr->va_ptrs[i] == NULL) {
+			cfs_percpt_free((void *)&arr->va_ptrs[0]);
+			return NULL;
+		}
+	}
+
+	return (void *)&arr->va_ptrs[0];
+}
+EXPORT_SYMBOL(cfs_percpt_alloc);
+
+/*
+ * return number of CPUs (or number of elements in per-cpu data)
+ * according to cptab of @vars
+ */
+int
+cfs_percpt_number(void *vars)
+{
+	struct cfs_var_array *arr;
+
+	arr = container_of(vars, struct cfs_var_array, va_ptrs[0]);
+
+	return arr->va_count;
+}
+EXPORT_SYMBOL(cfs_percpt_number);
+
+/*
+ * return memory block shadowed from current CPU
+ */
+void *
+cfs_percpt_current(void *vars)
+{
+	struct cfs_var_array *arr;
+	int    cpt;
+
+	arr = container_of(vars, struct cfs_var_array, va_ptrs[0]);
+	cpt = cfs_cpt_current(arr->va_cptab, 0);
+	if (cpt < 0)
+		return NULL;
+
+	return arr->va_ptrs[cpt];
+}
+EXPORT_SYMBOL(cfs_percpt_current);
+
+void *
+cfs_percpt_index(void *vars, int idx)
+{
+	struct cfs_var_array *arr;
+
+	arr = container_of(vars, struct cfs_var_array, va_ptrs[0]);
+
+	LASSERT(idx >= 0 && idx < arr->va_count);
+	return arr->va_ptrs[idx];
+}
+EXPORT_SYMBOL(cfs_percpt_index);
+
+/*
+ * free variable array, see more detail in cfs_array_alloc
+ */
+void
+cfs_array_free(void *vars)
+{
+	struct cfs_var_array	*arr;
+	int			i;
+
+	arr = container_of(vars, struct cfs_var_array, va_ptrs[0]);
+
+	for (i = 0; i < arr->va_count; i++) {
+		if (arr->va_ptrs[i] == NULL)
+			continue;
+
+		LIBCFS_FREE(arr->va_ptrs[i], arr->va_size);
+	}
+	LIBCFS_FREE(arr, offsetof(struct cfs_var_array,
+				  va_ptrs[arr->va_count]));
+}
+EXPORT_SYMBOL(cfs_array_free);
+
+/*
+ * allocate a variable array, returned value is an array of pointers.
+ * Caller can specify length of array by @count, @size is size of each
+ * memory block in array.
+ */
+void *
+cfs_array_alloc(int count, unsigned int size)
+{
+	struct cfs_var_array	*arr;
+	int			i;
+
+	LIBCFS_ALLOC(arr, offsetof(struct cfs_var_array, va_ptrs[count]));
+	if (arr == NULL)
+		return NULL;
+
+	arr->va_count	= count;
+	arr->va_size	= size;
+
+	for (i = 0; i < count; i++) {
+		LIBCFS_ALLOC(arr->va_ptrs[i], size);
+
+		if (arr->va_ptrs[i] == NULL) {
+			cfs_array_free((void *)&arr->va_ptrs[0]);
+			return NULL;
+		}
+	}
+
+	return (void *)&arr->va_ptrs[0];
+}
+EXPORT_SYMBOL(cfs_array_alloc);
diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/libcfs_string.c b/kernel/drivers/staging/lustre/lustre/libcfs/libcfs_string.c
new file mode 100644
index 000000000..76d4392bd
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lustre/libcfs/libcfs_string.c
@@ -0,0 +1,562 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * String manipulation functions.
+ *
+ * libcfs/libcfs/libcfs_string.c
+ *
+ * Author: Nathan Rutman <nathan.rutman@sun.com>
+ */
+
+#include "../../include/linux/libcfs/libcfs.h"
+
+/* Convert a text string to a bitmask */
+int cfs_str2mask(const char *str, const char *(*bit2str)(int bit),
+		 int *oldmask, int minmask, int allmask)
+{
+	const char *debugstr;
+	char op = '\0';
+	int newmask = minmask, i, len, found = 0;
+
+	/* <str> must be a list of tokens separated by whitespace
+	 * and optionally an operator ('+' or '-').  If an operator
+	 * appears first in <str>, '*oldmask' is used as the starting point
+	 * (relative), otherwise minmask is used (absolute).  An operator
+	 * applies to all following tokens up to the next operator. */
+	while (*str != '\0') {
+		while (isspace(*str))
+			str++;
+		if (*str == '\0')
+			break;
+		if (*str == '+' || *str == '-') {
+			op = *str++;
+			if (!found)
+				/* only if first token is relative */
+				newmask = *oldmask;
+			while (isspace(*str))
+				str++;
+			if (*str == '\0')  /* trailing op */
+				return -EINVAL;
+		}
+
+		/* find token length */
+		len = 0;
+		while (str[len] != '\0' && !isspace(str[len]) &&
+		       str[len] != '+' && str[len] != '-')
+			len++;
+
+		/* match token */
+		found = 0;
+		for (i = 0; i < 32; i++) {
+			debugstr = bit2str(i);
+			if (debugstr != NULL &&
+			    strlen(debugstr) == len &&
+			    strncasecmp(str, debugstr, len) == 0) {
+				if (op == '-')
+					newmask &= ~(1 << i);
+				else
+					newmask |= (1 << i);
+				found = 1;
+				break;
+			}
+		}
+		if (!found && len == 3 &&
+		    (strncasecmp(str, "ALL", len) == 0)) {
+			if (op == '-')
+				newmask = minmask;
+			else
+				newmask = allmask;
+			found = 1;
+		}
+		if (!found) {
+			CWARN("unknown mask '%.*s'.\n"
+			      "mask usage: [+|-]<all|type> ...\n", len, str);
+			return -EINVAL;
+		}
+		str += len;
+	}
+
+	*oldmask = newmask;
+	return 0;
+}
+
+/* get the first string out of @str */
+char *cfs_firststr(char *str, size_t size)
+{
+	size_t i = 0;
+	char  *end;
+
+	/* trim leading spaces */
+	while (i < size && *str && isspace(*str)) {
+		++i;
+		++str;
+	}
+
+	/* string with all spaces */
+	if (*str == '\0')
+		goto out;
+
+	end = str;
+	while (i < size && *end != '\0' && !isspace(*end)) {
+		++i;
+		++end;
+	}
+
+	*end = '\0';
+out:
+	return str;
+}
+EXPORT_SYMBOL(cfs_firststr);
+
+char *
+cfs_trimwhite(char *str)
+{
+	char *end;
+
+	while (isspace(*str))
+		str++;
+
+	end = str + strlen(str);
+	while (end > str) {
+		if (!isspace(end[-1]))
+			break;
+		end--;
+	}
+
+	*end = 0;
+	return str;
+}
+EXPORT_SYMBOL(cfs_trimwhite);
+
+/**
+ * Extracts tokens from strings.
+ *
+ * Looks for \a delim in string \a next, sets \a res to point to
+ * substring before the delimiter, sets \a next right after the found
+ * delimiter.
+ *
+ * \retval 1 if \a res points to a string of non-whitespace characters
+ * \retval 0 otherwise
+ */
+int
+cfs_gettok(struct cfs_lstr *next, char delim, struct cfs_lstr *res)
+{
+	char *end;
+
+	if (next->ls_str == NULL)
+		return 0;
+
+	/* skip leading white spaces */
+	while (next->ls_len) {
+		if (!isspace(*next->ls_str))
+			break;
+		next->ls_str++;
+		next->ls_len--;
+	}
+
+	if (next->ls_len == 0) /* whitespaces only */
+		return 0;
+
+	if (*next->ls_str == delim) {
+		/* first non-writespace is the delimiter */
+		return 0;
+	}
+
+	res->ls_str = next->ls_str;
+	end = memchr(next->ls_str, delim, next->ls_len);
+	if (end == NULL) {
+		/* there is no the delimeter in the string */
+		end = next->ls_str + next->ls_len;
+		next->ls_str = NULL;
+	} else {
+		next->ls_str = end + 1;
+		next->ls_len -= (end - res->ls_str + 1);
+	}
+
+	/* skip ending whitespaces */
+	while (--end != res->ls_str) {
+		if (!isspace(*end))
+			break;
+	}
+
+	res->ls_len = end - res->ls_str + 1;
+	return 1;
+}
+
+/**
+ * Converts string to integer.
+ *
+ * Accepts decimal and hexadecimal number recordings.
+ *
+ * \retval 1 if first \a nob chars of \a str convert to decimal or
+ * hexadecimal integer in the range [\a min, \a max]
+ * \retval 0 otherwise
+ */
+int
+cfs_str2num_check(char *str, int nob, unsigned *num,
+		  unsigned min, unsigned max)
+{
+	char	*endp;
+
+	str = cfs_trimwhite(str);
+	*num = strtoul(str, &endp, 0);
+	if (endp == str)
+		return 0;
+
+	for (; endp < str + nob; endp++) {
+		if (!isspace(*endp))
+			return 0;
+	}
+
+	return (*num >= min && *num <= max);
+}
+
+/**
+ * Parses \<range_expr\> token of the syntax. If \a bracketed is false,
+ * \a src should only have a single token which can be \<number\> or  \*
+ *
+ * \retval pointer to allocated range_expr and initialized
+ * range_expr::re_lo, range_expr::re_hi and range_expr:re_stride if \a
+ `* src parses to
+ * \<number\> |
+ * \<number\> '-' \<number\> |
+ * \<number\> '-' \<number\> '/' \<number\>
+ * \retval 0 will be returned if it can be parsed, otherwise -EINVAL or
+ * -ENOMEM will be returned.
+ */
+static int
+cfs_range_expr_parse(struct cfs_lstr *src, unsigned min, unsigned max,
+		     int bracketed, struct cfs_range_expr **expr)
+{
+	struct cfs_range_expr	*re;
+	struct cfs_lstr		tok;
+
+	LIBCFS_ALLOC(re, sizeof(*re));
+	if (re == NULL)
+		return -ENOMEM;
+
+	if (src->ls_len == 1 && src->ls_str[0] == '*') {
+		re->re_lo = min;
+		re->re_hi = max;
+		re->re_stride = 1;
+		goto out;
+	}
+
+	if (cfs_str2num_check(src->ls_str, src->ls_len,
+			      &re->re_lo, min, max)) {
+		/* <number> is parsed */
+		re->re_hi = re->re_lo;
+		re->re_stride = 1;
+		goto out;
+	}
+
+	if (!bracketed || !cfs_gettok(src, '-', &tok))
+		goto failed;
+
+	if (!cfs_str2num_check(tok.ls_str, tok.ls_len,
+			       &re->re_lo, min, max))
+		goto failed;
+
+	/* <number> - */
+	if (cfs_str2num_check(src->ls_str, src->ls_len,
+			      &re->re_hi, min, max)) {
+		/* <number> - <number> is parsed */
+		re->re_stride = 1;
+		goto out;
+	}
+
+	/* go to check <number> '-' <number> '/' <number> */
+	if (cfs_gettok(src, '/', &tok)) {
+		if (!cfs_str2num_check(tok.ls_str, tok.ls_len,
+				       &re->re_hi, min, max))
+			goto failed;
+
+		/* <number> - <number> / ... */
+		if (cfs_str2num_check(src->ls_str, src->ls_len,
+				      &re->re_stride, min, max)) {
+			/* <number> - <number> / <number> is parsed */
+			goto out;
+		}
+	}
+
+ out:
+	*expr = re;
+	return 0;
+
+ failed:
+	LIBCFS_FREE(re, sizeof(*re));
+	return -EINVAL;
+}
+
+/**
+ * Matches value (\a value) against ranges expression list \a expr_list.
+ *
+ * \retval 1 if \a value matches
+ * \retval 0 otherwise
+ */
+int
+cfs_expr_list_match(__u32 value, struct cfs_expr_list *expr_list)
+{
+	struct cfs_range_expr	*expr;
+
+	list_for_each_entry(expr, &expr_list->el_exprs, re_link) {
+		if (value >= expr->re_lo && value <= expr->re_hi &&
+		    ((value - expr->re_lo) % expr->re_stride) == 0)
+			return 1;
+	}
+
+	return 0;
+}
+
+/**
+ * Convert express list (\a expr_list) to an array of all matched values
+ *
+ * \retval N N is total number of all matched values
+ * \retval 0 if expression list is empty
+ * \retval < 0 for failure
+ */
+int
+cfs_expr_list_values(struct cfs_expr_list *expr_list, int max, __u32 **valpp)
+{
+	struct cfs_range_expr	*expr;
+	__u32			*val;
+	int			count = 0;
+	int			i;
+
+	list_for_each_entry(expr, &expr_list->el_exprs, re_link) {
+		for (i = expr->re_lo; i <= expr->re_hi; i++) {
+			if (((i - expr->re_lo) % expr->re_stride) == 0)
+				count++;
+		}
+	}
+
+	if (count == 0) /* empty expression list */
+		return 0;
+
+	if (count > max) {
+		CERROR("Number of values %d exceeds max allowed %d\n",
+		       max, count);
+		return -EINVAL;
+	}
+
+	LIBCFS_ALLOC(val, sizeof(val[0]) * count);
+	if (val == NULL)
+		return -ENOMEM;
+
+	count = 0;
+	list_for_each_entry(expr, &expr_list->el_exprs, re_link) {
+		for (i = expr->re_lo; i <= expr->re_hi; i++) {
+			if (((i - expr->re_lo) % expr->re_stride) == 0)
+				val[count++] = i;
+		}
+	}
+
+	*valpp = val;
+	return count;
+}
+EXPORT_SYMBOL(cfs_expr_list_values);
+
+/**
+ * Frees cfs_range_expr structures of \a expr_list.
+ *
+ * \retval none
+ */
+void
+cfs_expr_list_free(struct cfs_expr_list *expr_list)
+{
+	while (!list_empty(&expr_list->el_exprs)) {
+		struct cfs_range_expr *expr;
+
+		expr = list_entry(expr_list->el_exprs.next,
+				      struct cfs_range_expr, re_link),
+		list_del(&expr->re_link);
+		LIBCFS_FREE(expr, sizeof(*expr));
+	}
+
+	LIBCFS_FREE(expr_list, sizeof(*expr_list));
+}
+EXPORT_SYMBOL(cfs_expr_list_free);
+
+/**
+ * Parses \<cfs_expr_list\> token of the syntax.
+ *
+ * \retval 1 if \a str parses to \<number\> | \<expr_list\>
+ * \retval 0 otherwise
+ */
+int
+cfs_expr_list_parse(char *str, int len, unsigned min, unsigned max,
+		    struct cfs_expr_list **elpp)
+{
+	struct cfs_expr_list	*expr_list;
+	struct cfs_range_expr	*expr;
+	struct cfs_lstr		src;
+	int			rc;
+
+	LIBCFS_ALLOC(expr_list, sizeof(*expr_list));
+	if (expr_list == NULL)
+		return -ENOMEM;
+
+	src.ls_str = str;
+	src.ls_len = len;
+
+	INIT_LIST_HEAD(&expr_list->el_exprs);
+
+	if (src.ls_str[0] == '[' &&
+	    src.ls_str[src.ls_len - 1] == ']') {
+		src.ls_str++;
+		src.ls_len -= 2;
+
+		rc = -EINVAL;
+		while (src.ls_str != NULL) {
+			struct cfs_lstr tok;
+
+			if (!cfs_gettok(&src, ',', &tok)) {
+				rc = -EINVAL;
+				break;
+			}
+
+			rc = cfs_range_expr_parse(&tok, min, max, 1, &expr);
+			if (rc != 0)
+				break;
+
+			list_add_tail(&expr->re_link,
+					  &expr_list->el_exprs);
+		}
+	} else {
+		rc = cfs_range_expr_parse(&src, min, max, 0, &expr);
+		if (rc == 0) {
+			list_add_tail(&expr->re_link,
+					  &expr_list->el_exprs);
+		}
+	}
+
+	if (rc != 0)
+		cfs_expr_list_free(expr_list);
+	else
+		*elpp = expr_list;
+
+	return rc;
+}
+EXPORT_SYMBOL(cfs_expr_list_parse);
+
+/**
+ * Frees cfs_expr_list structures of \a list.
+ *
+ * For each struct cfs_expr_list structure found on \a list it frees
+ * range_expr list attached to it and frees the cfs_expr_list itself.
+ *
+ * \retval none
+ */
+void
+cfs_expr_list_free_list(struct list_head *list)
+{
+	struct cfs_expr_list *el;
+
+	while (!list_empty(list)) {
+		el = list_entry(list->next,
+				    struct cfs_expr_list, el_link);
+		list_del(&el->el_link);
+		cfs_expr_list_free(el);
+	}
+}
+
+int
+cfs_ip_addr_parse(char *str, int len, struct list_head *list)
+{
+	struct cfs_expr_list	*el;
+	struct cfs_lstr		src;
+	int			rc;
+	int			i;
+
+	src.ls_str = str;
+	src.ls_len = len;
+	i = 0;
+
+	while (src.ls_str != NULL) {
+		struct cfs_lstr res;
+
+		if (!cfs_gettok(&src, '.', &res)) {
+			rc = -EINVAL;
+			goto out;
+		}
+
+		rc = cfs_expr_list_parse(res.ls_str, res.ls_len, 0, 255, &el);
+		if (rc != 0)
+			goto out;
+
+		list_add_tail(&el->el_link, list);
+		i++;
+	}
+
+	if (i == 4)
+		return 0;
+
+	rc = -EINVAL;
+ out:
+	cfs_expr_list_free_list(list);
+
+	return rc;
+}
+EXPORT_SYMBOL(cfs_ip_addr_parse);
+
+/**
+ * Matches address (\a addr) against address set encoded in \a list.
+ *
+ * \retval 1 if \a addr matches
+ * \retval 0 otherwise
+ */
+int
+cfs_ip_addr_match(__u32 addr, struct list_head *list)
+{
+	struct cfs_expr_list *el;
+	int i = 0;
+
+	list_for_each_entry_reverse(el, list, el_link) {
+		if (!cfs_expr_list_match(addr & 0xff, el))
+			return 0;
+		addr >>= 8;
+		i++;
+	}
+
+	return i == 4;
+}
+EXPORT_SYMBOL(cfs_ip_addr_match);
+
+void
+cfs_ip_addr_free(struct list_head *list)
+{
+	cfs_expr_list_free_list(list);
+}
+EXPORT_SYMBOL(cfs_ip_addr_free);
diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c
new file mode 100644
index 000000000..cc3ab3519
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c
@@ -0,0 +1,1056 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Author: liang@whamcloud.com
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/cpu.h>
+#include <linux/sched.h>
+#include "../../../include/linux/libcfs/libcfs.h"
+
+#ifdef CONFIG_SMP
+
+/**
+ * modparam for setting number of partitions
+ *
+ *  0 : estimate best value based on cores or NUMA nodes
+ *  1 : disable multiple partitions
+ * >1 : specify number of partitions
+ */
+static int	cpu_npartitions;
+module_param(cpu_npartitions, int, 0444);
+MODULE_PARM_DESC(cpu_npartitions, "# of CPU partitions");
+
+/**
+ * modparam for setting CPU partitions patterns:
+ *
+ * i.e: "0[0,1,2,3] 1[4,5,6,7]", number before bracket is CPU partition ID,
+ *      number in bracket is processor ID (core or HT)
+ *
+ * i.e: "N 0[0,1] 1[2,3]" the first character 'N' means numbers in bracket
+ *       are NUMA node ID, number before bracket is CPU partition ID.
+ *
+ * NB: If user specified cpu_pattern, cpu_npartitions will be ignored
+ */
+static char	*cpu_pattern = "";
+module_param(cpu_pattern, charp, 0444);
+MODULE_PARM_DESC(cpu_pattern, "CPU partitions pattern");
+
+struct cfs_cpt_data {
+	/* serialize hotplug etc */
+	spinlock_t		cpt_lock;
+	/* reserved for hotplug */
+	unsigned long		cpt_version;
+	/* mutex to protect cpt_cpumask */
+	struct mutex		cpt_mutex;
+	/* scratch buffer for set/unset_node */
+	cpumask_t		*cpt_cpumask;
+};
+
+static struct cfs_cpt_data	cpt_data;
+
+static void cfs_cpu_core_siblings(int cpu, cpumask_t *mask)
+{
+	/* return cpumask of cores in the same socket */
+	cpumask_copy(mask, topology_core_cpumask(cpu));
+}
+
+/* return cpumask of HTs in the same core */
+static void cfs_cpu_ht_siblings(int cpu, cpumask_t *mask)
+{
+	cpumask_copy(mask, topology_thread_cpumask(cpu));
+}
+
+static void cfs_node_to_cpumask(int node, cpumask_t *mask)
+{
+	cpumask_copy(mask, cpumask_of_node(node));
+}
+
+void
+cfs_cpt_table_free(struct cfs_cpt_table *cptab)
+{
+	int	i;
+
+	if (cptab->ctb_cpu2cpt != NULL) {
+		LIBCFS_FREE(cptab->ctb_cpu2cpt,
+			    num_possible_cpus() *
+			    sizeof(cptab->ctb_cpu2cpt[0]));
+	}
+
+	for (i = 0; cptab->ctb_parts != NULL && i < cptab->ctb_nparts; i++) {
+		struct cfs_cpu_partition *part = &cptab->ctb_parts[i];
+
+		if (part->cpt_nodemask != NULL) {
+			LIBCFS_FREE(part->cpt_nodemask,
+				    sizeof(*part->cpt_nodemask));
+		}
+
+		if (part->cpt_cpumask != NULL)
+			LIBCFS_FREE(part->cpt_cpumask, cpumask_size());
+	}
+
+	if (cptab->ctb_parts != NULL) {
+		LIBCFS_FREE(cptab->ctb_parts,
+			    cptab->ctb_nparts * sizeof(cptab->ctb_parts[0]));
+	}
+
+	if (cptab->ctb_nodemask != NULL)
+		LIBCFS_FREE(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask));
+	if (cptab->ctb_cpumask != NULL)
+		LIBCFS_FREE(cptab->ctb_cpumask, cpumask_size());
+
+	LIBCFS_FREE(cptab, sizeof(*cptab));
+}
+EXPORT_SYMBOL(cfs_cpt_table_free);
+
+struct cfs_cpt_table *
+cfs_cpt_table_alloc(unsigned int ncpt)
+{
+	struct cfs_cpt_table *cptab;
+	int	i;
+
+	LIBCFS_ALLOC(cptab, sizeof(*cptab));
+	if (cptab == NULL)
+		return NULL;
+
+	cptab->ctb_nparts = ncpt;
+
+	LIBCFS_ALLOC(cptab->ctb_cpumask, cpumask_size());
+	LIBCFS_ALLOC(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask));
+
+	if (cptab->ctb_cpumask == NULL || cptab->ctb_nodemask == NULL)
+		goto failed;
+
+	LIBCFS_ALLOC(cptab->ctb_cpu2cpt,
+		     num_possible_cpus() * sizeof(cptab->ctb_cpu2cpt[0]));
+	if (cptab->ctb_cpu2cpt == NULL)
+		goto failed;
+
+	memset(cptab->ctb_cpu2cpt, -1,
+	       num_possible_cpus() * sizeof(cptab->ctb_cpu2cpt[0]));
+
+	LIBCFS_ALLOC(cptab->ctb_parts, ncpt * sizeof(cptab->ctb_parts[0]));
+	if (cptab->ctb_parts == NULL)
+		goto failed;
+
+	for (i = 0; i < ncpt; i++) {
+		struct cfs_cpu_partition *part = &cptab->ctb_parts[i];
+
+		LIBCFS_ALLOC(part->cpt_cpumask, cpumask_size());
+		LIBCFS_ALLOC(part->cpt_nodemask, sizeof(*part->cpt_nodemask));
+		if (part->cpt_cpumask == NULL || part->cpt_nodemask == NULL)
+			goto failed;
+	}
+
+	spin_lock(&cpt_data.cpt_lock);
+	/* Reserved for hotplug */
+	cptab->ctb_version = cpt_data.cpt_version;
+	spin_unlock(&cpt_data.cpt_lock);
+
+	return cptab;
+
+ failed:
+	cfs_cpt_table_free(cptab);
+	return NULL;
+}
+EXPORT_SYMBOL(cfs_cpt_table_alloc);
+
+int
+cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len)
+{
+	char	*tmp = buf;
+	int	rc = 0;
+	int	i;
+	int	j;
+
+	for (i = 0; i < cptab->ctb_nparts; i++) {
+		if (len > 0) {
+			rc = snprintf(tmp, len, "%d\t: ", i);
+			len -= rc;
+		}
+
+		if (len <= 0) {
+			rc = -EFBIG;
+			goto out;
+		}
+
+		tmp += rc;
+		for_each_cpu(j, cptab->ctb_parts[i].cpt_cpumask) {
+			rc = snprintf(tmp, len, "%d ", j);
+			len -= rc;
+			if (len <= 0) {
+				rc = -EFBIG;
+				goto out;
+			}
+			tmp += rc;
+		}
+
+		*tmp = '\n';
+		tmp++;
+		len--;
+	}
+
+ out:
+	if (rc < 0)
+		return rc;
+
+	return tmp - buf;
+}
+EXPORT_SYMBOL(cfs_cpt_table_print);
+
+int
+cfs_cpt_number(struct cfs_cpt_table *cptab)
+{
+	return cptab->ctb_nparts;
+}
+EXPORT_SYMBOL(cfs_cpt_number);
+
+int
+cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt)
+{
+	LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
+
+	return cpt == CFS_CPT_ANY ?
+	       cpumask_weight(cptab->ctb_cpumask) :
+	       cpumask_weight(cptab->ctb_parts[cpt].cpt_cpumask);
+}
+EXPORT_SYMBOL(cfs_cpt_weight);
+
+int
+cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt)
+{
+	LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
+
+	return cpt == CFS_CPT_ANY ?
+	       cpumask_any_and(cptab->ctb_cpumask,
+			       cpu_online_mask) < nr_cpu_ids :
+	       cpumask_any_and(cptab->ctb_parts[cpt].cpt_cpumask,
+			       cpu_online_mask) < nr_cpu_ids;
+}
+EXPORT_SYMBOL(cfs_cpt_online);
+
+cpumask_t *
+cfs_cpt_cpumask(struct cfs_cpt_table *cptab, int cpt)
+{
+	LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
+
+	return cpt == CFS_CPT_ANY ?
+	       cptab->ctb_cpumask : cptab->ctb_parts[cpt].cpt_cpumask;
+}
+EXPORT_SYMBOL(cfs_cpt_cpumask);
+
+nodemask_t *
+cfs_cpt_nodemask(struct cfs_cpt_table *cptab, int cpt)
+{
+	LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
+
+	return cpt == CFS_CPT_ANY ?
+	       cptab->ctb_nodemask : cptab->ctb_parts[cpt].cpt_nodemask;
+}
+EXPORT_SYMBOL(cfs_cpt_nodemask);
+
+int
+cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
+{
+	int	node;
+
+	LASSERT(cpt >= 0 && cpt < cptab->ctb_nparts);
+
+	if (cpu < 0 || cpu >= nr_cpu_ids || !cpu_online(cpu)) {
+		CDEBUG(D_INFO, "CPU %d is invalid or it's offline\n", cpu);
+		return 0;
+	}
+
+	if (cptab->ctb_cpu2cpt[cpu] != -1) {
+		CDEBUG(D_INFO, "CPU %d is already in partition %d\n",
+		       cpu, cptab->ctb_cpu2cpt[cpu]);
+		return 0;
+	}
+
+	cptab->ctb_cpu2cpt[cpu] = cpt;
+
+	LASSERT(!cpumask_test_cpu(cpu, cptab->ctb_cpumask));
+	LASSERT(!cpumask_test_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask));
+
+	cpumask_set_cpu(cpu, cptab->ctb_cpumask);
+	cpumask_set_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask);
+
+	node = cpu_to_node(cpu);
+
+	/* first CPU of @node in this CPT table */
+	if (!node_isset(node, *cptab->ctb_nodemask))
+		node_set(node, *cptab->ctb_nodemask);
+
+	/* first CPU of @node in this partition */
+	if (!node_isset(node, *cptab->ctb_parts[cpt].cpt_nodemask))
+		node_set(node, *cptab->ctb_parts[cpt].cpt_nodemask);
+
+	return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_set_cpu);
+
+void
+cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
+{
+	int	node;
+	int	i;
+
+	LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
+
+	if (cpu < 0 || cpu >= nr_cpu_ids) {
+		CDEBUG(D_INFO, "Invalid CPU id %d\n", cpu);
+		return;
+	}
+
+	if (cpt == CFS_CPT_ANY) {
+		/* caller doesn't know the partition ID */
+		cpt = cptab->ctb_cpu2cpt[cpu];
+		if (cpt < 0) { /* not set in this CPT-table */
+			CDEBUG(D_INFO, "Try to unset cpu %d which is not in CPT-table %p\n",
+			       cpt, cptab);
+			return;
+		}
+
+	} else if (cpt != cptab->ctb_cpu2cpt[cpu]) {
+		CDEBUG(D_INFO,
+		       "CPU %d is not in cpu-partition %d\n", cpu, cpt);
+		return;
+	}
+
+	LASSERT(cpumask_test_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask));
+	LASSERT(cpumask_test_cpu(cpu, cptab->ctb_cpumask));
+
+	cpumask_clear_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask);
+	cpumask_clear_cpu(cpu, cptab->ctb_cpumask);
+	cptab->ctb_cpu2cpt[cpu] = -1;
+
+	node = cpu_to_node(cpu);
+
+	LASSERT(node_isset(node, *cptab->ctb_parts[cpt].cpt_nodemask));
+	LASSERT(node_isset(node, *cptab->ctb_nodemask));
+
+	for_each_cpu(i, cptab->ctb_parts[cpt].cpt_cpumask) {
+		/* this CPT has other CPU belonging to this node? */
+		if (cpu_to_node(i) == node)
+			break;
+	}
+
+	if (i >= nr_cpu_ids)
+		node_clear(node, *cptab->ctb_parts[cpt].cpt_nodemask);
+
+	for_each_cpu(i, cptab->ctb_cpumask) {
+		/* this CPT-table has other CPU belonging to this node? */
+		if (cpu_to_node(i) == node)
+			break;
+	}
+
+	if (i >= nr_cpu_ids)
+		node_clear(node, *cptab->ctb_nodemask);
+
+	return;
+}
+EXPORT_SYMBOL(cfs_cpt_unset_cpu);
+
+int
+cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask)
+{
+	int	i;
+
+	if (cpumask_weight(mask) == 0 ||
+	    cpumask_any_and(mask, cpu_online_mask) >= nr_cpu_ids) {
+		CDEBUG(D_INFO, "No online CPU is found in the CPU mask for CPU partition %d\n",
+		       cpt);
+		return 0;
+	}
+
+	for_each_cpu(i, mask) {
+		if (!cfs_cpt_set_cpu(cptab, cpt, i))
+			return 0;
+	}
+
+	return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_set_cpumask);
+
+void
+cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask)
+{
+	int	i;
+
+	for_each_cpu(i, mask)
+		cfs_cpt_unset_cpu(cptab, cpt, i);
+}
+EXPORT_SYMBOL(cfs_cpt_unset_cpumask);
+
+int
+cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node)
+{
+	cpumask_t	*mask;
+	int		rc;
+
+	if (node < 0 || node >= MAX_NUMNODES) {
+		CDEBUG(D_INFO,
+		       "Invalid NUMA id %d for CPU partition %d\n", node, cpt);
+		return 0;
+	}
+
+	mutex_lock(&cpt_data.cpt_mutex);
+
+	mask = cpt_data.cpt_cpumask;
+	cfs_node_to_cpumask(node, mask);
+
+	rc = cfs_cpt_set_cpumask(cptab, cpt, mask);
+
+	mutex_unlock(&cpt_data.cpt_mutex);
+
+	return rc;
+}
+EXPORT_SYMBOL(cfs_cpt_set_node);
+
+void
+cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node)
+{
+	cpumask_t *mask;
+
+	if (node < 0 || node >= MAX_NUMNODES) {
+		CDEBUG(D_INFO,
+		       "Invalid NUMA id %d for CPU partition %d\n", node, cpt);
+		return;
+	}
+
+	mutex_lock(&cpt_data.cpt_mutex);
+
+	mask = cpt_data.cpt_cpumask;
+	cfs_node_to_cpumask(node, mask);
+
+	cfs_cpt_unset_cpumask(cptab, cpt, mask);
+
+	mutex_unlock(&cpt_data.cpt_mutex);
+}
+EXPORT_SYMBOL(cfs_cpt_unset_node);
+
+int
+cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask)
+{
+	int	i;
+
+	for_each_node_mask(i, *mask) {
+		if (!cfs_cpt_set_node(cptab, cpt, i))
+			return 0;
+	}
+
+	return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_set_nodemask);
+
+void
+cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask)
+{
+	int	i;
+
+	for_each_node_mask(i, *mask)
+		cfs_cpt_unset_node(cptab, cpt, i);
+}
+EXPORT_SYMBOL(cfs_cpt_unset_nodemask);
+
+void
+cfs_cpt_clear(struct cfs_cpt_table *cptab, int cpt)
+{
+	int	last;
+	int	i;
+
+	if (cpt == CFS_CPT_ANY) {
+		last = cptab->ctb_nparts - 1;
+		cpt = 0;
+	} else {
+		last = cpt;
+	}
+
+	for (; cpt <= last; cpt++) {
+		for_each_cpu(i, cptab->ctb_parts[cpt].cpt_cpumask)
+			cfs_cpt_unset_cpu(cptab, cpt, i);
+	}
+}
+EXPORT_SYMBOL(cfs_cpt_clear);
+
+int
+cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt)
+{
+	nodemask_t	*mask;
+	int		weight;
+	int		rotor;
+	int		node;
+
+	/* convert CPU partition ID to HW node id */
+
+	if (cpt < 0 || cpt >= cptab->ctb_nparts) {
+		mask = cptab->ctb_nodemask;
+		rotor = cptab->ctb_spread_rotor++;
+	} else {
+		mask = cptab->ctb_parts[cpt].cpt_nodemask;
+		rotor = cptab->ctb_parts[cpt].cpt_spread_rotor++;
+	}
+
+	weight = nodes_weight(*mask);
+	LASSERT(weight > 0);
+
+	rotor %= weight;
+
+	for_each_node_mask(node, *mask) {
+		if (rotor-- == 0)
+			return node;
+	}
+
+	LBUG();
+	return 0;
+}
+EXPORT_SYMBOL(cfs_cpt_spread_node);
+
+int
+cfs_cpt_current(struct cfs_cpt_table *cptab, int remap)
+{
+	int	cpu = smp_processor_id();
+	int	cpt = cptab->ctb_cpu2cpt[cpu];
+
+	if (cpt < 0) {
+		if (!remap)
+			return cpt;
+
+		/* don't return negative value for safety of upper layer,
+		 * instead we shadow the unknown cpu to a valid partition ID */
+		cpt = cpu % cptab->ctb_nparts;
+	}
+
+	return cpt;
+}
+EXPORT_SYMBOL(cfs_cpt_current);
+
+int
+cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu)
+{
+	LASSERT(cpu >= 0 && cpu < nr_cpu_ids);
+
+	return cptab->ctb_cpu2cpt[cpu];
+}
+EXPORT_SYMBOL(cfs_cpt_of_cpu);
+
+int
+cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt)
+{
+	cpumask_t	*cpumask;
+	nodemask_t	*nodemask;
+	int		rc;
+	int		i;
+
+	LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
+
+	if (cpt == CFS_CPT_ANY) {
+		cpumask = cptab->ctb_cpumask;
+		nodemask = cptab->ctb_nodemask;
+	} else {
+		cpumask = cptab->ctb_parts[cpt].cpt_cpumask;
+		nodemask = cptab->ctb_parts[cpt].cpt_nodemask;
+	}
+
+	if (cpumask_any_and(cpumask, cpu_online_mask) >= nr_cpu_ids) {
+		CERROR("No online CPU found in CPU partition %d, did someone do CPU hotplug on system? You might need to reload Lustre modules to keep system working well.\n",
+		       cpt);
+		return -EINVAL;
+	}
+
+	for_each_online_cpu(i) {
+		if (cpumask_test_cpu(i, cpumask))
+			continue;
+
+		rc = set_cpus_allowed_ptr(current, cpumask);
+		set_mems_allowed(*nodemask);
+		if (rc == 0)
+			schedule(); /* switch to allowed CPU */
+
+		return rc;
+	}
+
+	/* don't need to set affinity because all online CPUs are covered */
+	return 0;
+}
+EXPORT_SYMBOL(cfs_cpt_bind);
+
+/**
+ * Choose max to \a number CPUs from \a node and set them in \a cpt.
+ * We always prefer to choose CPU in the same core/socket.
+ */
+static int
+cfs_cpt_choose_ncpus(struct cfs_cpt_table *cptab, int cpt,
+		     cpumask_t *node, int number)
+{
+	cpumask_t	*socket = NULL;
+	cpumask_t	*core = NULL;
+	int		rc = 0;
+	int		cpu;
+
+	LASSERT(number > 0);
+
+	if (number >= cpumask_weight(node)) {
+		while (!cpumask_empty(node)) {
+			cpu = cpumask_first(node);
+
+			rc = cfs_cpt_set_cpu(cptab, cpt, cpu);
+			if (!rc)
+				return -EINVAL;
+			cpumask_clear_cpu(cpu, node);
+		}
+		return 0;
+	}
+
+	/* allocate scratch buffer */
+	LIBCFS_ALLOC(socket, cpumask_size());
+	LIBCFS_ALLOC(core, cpumask_size());
+	if (socket == NULL || core == NULL) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	while (!cpumask_empty(node)) {
+		cpu = cpumask_first(node);
+
+		/* get cpumask for cores in the same socket */
+		cfs_cpu_core_siblings(cpu, socket);
+		cpumask_and(socket, socket, node);
+
+		LASSERT(!cpumask_empty(socket));
+
+		while (!cpumask_empty(socket)) {
+			int     i;
+
+			/* get cpumask for hts in the same core */
+			cfs_cpu_ht_siblings(cpu, core);
+			cpumask_and(core, core, node);
+
+			LASSERT(!cpumask_empty(core));
+
+			for_each_cpu(i, core) {
+				cpumask_clear_cpu(i, socket);
+				cpumask_clear_cpu(i, node);
+
+				rc = cfs_cpt_set_cpu(cptab, cpt, i);
+				if (!rc) {
+					rc = -EINVAL;
+					goto out;
+				}
+
+				if (--number == 0)
+					goto out;
+			}
+			cpu = cpumask_first(socket);
+		}
+	}
+
+ out:
+	if (socket != NULL)
+		LIBCFS_FREE(socket, cpumask_size());
+	if (core != NULL)
+		LIBCFS_FREE(core, cpumask_size());
+	return rc;
+}
+
+#define CPT_WEIGHT_MIN  4u
+
+static unsigned int
+cfs_cpt_num_estimate(void)
+{
+	unsigned nnode = num_online_nodes();
+	unsigned ncpu  = num_online_cpus();
+	unsigned ncpt;
+
+	if (ncpu <= CPT_WEIGHT_MIN) {
+		ncpt = 1;
+		goto out;
+	}
+
+	/* generate reasonable number of CPU partitions based on total number
+	 * of CPUs, Preferred N should be power2 and match this condition:
+	 * 2 * (N - 1)^2 < NCPUS <= 2 * N^2 */
+	for (ncpt = 2; ncpu > 2 * ncpt * ncpt; ncpt <<= 1) {}
+
+	if (ncpt <= nnode) { /* fat numa system */
+		while (nnode > ncpt)
+			nnode >>= 1;
+
+	} else { /* ncpt > nnode */
+		while ((nnode << 1) <= ncpt)
+			nnode <<= 1;
+	}
+
+	ncpt = nnode;
+
+ out:
+#if (BITS_PER_LONG == 32)
+	/* config many CPU partitions on 32-bit system could consume
+	 * too much memory */
+	ncpt = min(2U, ncpt);
+#endif
+	while (ncpu % ncpt != 0)
+		ncpt--; /* worst case is 1 */
+
+	return ncpt;
+}
+
+static struct cfs_cpt_table *
+cfs_cpt_table_create(int ncpt)
+{
+	struct cfs_cpt_table *cptab = NULL;
+	cpumask_t	*mask = NULL;
+	int		cpt = 0;
+	int		num;
+	int		rc;
+	int		i;
+
+	rc = cfs_cpt_num_estimate();
+	if (ncpt <= 0)
+		ncpt = rc;
+
+	if (ncpt > num_online_cpus() || ncpt > 4 * rc) {
+		CWARN("CPU partition number %d is larger than suggested value (%d), your system may have performance issue or run out of memory while under pressure\n",
+		      ncpt, rc);
+	}
+
+	if (num_online_cpus() % ncpt != 0) {
+		CERROR("CPU number %d is not multiple of cpu_npartition %d, please try different cpu_npartitions value or set pattern string by cpu_pattern=STRING\n",
+		       (int)num_online_cpus(), ncpt);
+		goto failed;
+	}
+
+	cptab = cfs_cpt_table_alloc(ncpt);
+	if (cptab == NULL) {
+		CERROR("Failed to allocate CPU map(%d)\n", ncpt);
+		goto failed;
+	}
+
+	num = num_online_cpus() / ncpt;
+	if (num == 0) {
+		CERROR("CPU changed while setting CPU partition\n");
+		goto failed;
+	}
+
+	LIBCFS_ALLOC(mask, cpumask_size());
+	if (mask == NULL) {
+		CERROR("Failed to allocate scratch cpumask\n");
+		goto failed;
+	}
+
+	for_each_online_node(i) {
+		cfs_node_to_cpumask(i, mask);
+
+		while (!cpumask_empty(mask)) {
+			struct cfs_cpu_partition *part;
+			int    n;
+
+			if (cpt >= ncpt)
+				goto failed;
+
+			part = &cptab->ctb_parts[cpt];
+
+			n = num - cpumask_weight(part->cpt_cpumask);
+			LASSERT(n > 0);
+
+			rc = cfs_cpt_choose_ncpus(cptab, cpt, mask, n);
+			if (rc < 0)
+				goto failed;
+
+			LASSERT(num >= cpumask_weight(part->cpt_cpumask));
+			if (num == cpumask_weight(part->cpt_cpumask))
+				cpt++;
+		}
+	}
+
+	if (cpt != ncpt ||
+	    num != cpumask_weight(cptab->ctb_parts[ncpt - 1].cpt_cpumask)) {
+		CERROR("Expect %d(%d) CPU partitions but got %d(%d), CPU hotplug/unplug while setting?\n",
+		       cptab->ctb_nparts, num, cpt,
+		       cpumask_weight(cptab->ctb_parts[ncpt - 1].cpt_cpumask));
+		goto failed;
+	}
+
+	LIBCFS_FREE(mask, cpumask_size());
+
+	return cptab;
+
+ failed:
+	CERROR("Failed to setup CPU-partition-table with %d CPU-partitions, online HW nodes: %d, HW cpus: %d.\n",
+	       ncpt, num_online_nodes(), num_online_cpus());
+
+	if (mask != NULL)
+		LIBCFS_FREE(mask, cpumask_size());
+
+	if (cptab != NULL)
+		cfs_cpt_table_free(cptab);
+
+	return NULL;
+}
+
+static struct cfs_cpt_table *
+cfs_cpt_table_create_pattern(char *pattern)
+{
+	struct cfs_cpt_table	*cptab;
+	char			*str	= pattern;
+	int			node	= 0;
+	int			high;
+	int			ncpt;
+	int			c;
+
+	for (ncpt = 0;; ncpt++) { /* quick scan bracket */
+		str = strchr(str, '[');
+		if (str == NULL)
+			break;
+		str++;
+	}
+
+	str = cfs_trimwhite(pattern);
+	if (*str == 'n' || *str == 'N') {
+		pattern = str + 1;
+		node = 1;
+	}
+
+	if (ncpt == 0 ||
+	    (node && ncpt > num_online_nodes()) ||
+	    (!node && ncpt > num_online_cpus())) {
+		CERROR("Invalid pattern %s, or too many partitions %d\n",
+		       pattern, ncpt);
+		return NULL;
+	}
+
+	high = node ? MAX_NUMNODES - 1 : nr_cpu_ids - 1;
+
+	cptab = cfs_cpt_table_alloc(ncpt);
+	if (cptab == NULL) {
+		CERROR("Failed to allocate cpu partition table\n");
+		return NULL;
+	}
+
+	for (str = cfs_trimwhite(pattern), c = 0;; c++) {
+		struct cfs_range_expr	*range;
+		struct cfs_expr_list	*el;
+		char			*bracket = strchr(str, '[');
+		int			cpt;
+		int			rc;
+		int			i;
+		int			n;
+
+		if (bracket == NULL) {
+			if (*str != 0) {
+				CERROR("Invalid pattern %s\n", str);
+				goto failed;
+			} else if (c != ncpt) {
+				CERROR("expect %d partitions but found %d\n",
+				       ncpt, c);
+				goto failed;
+			}
+			break;
+		}
+
+		if (sscanf(str, "%d%n", &cpt, &n) < 1) {
+			CERROR("Invalid cpu pattern %s\n", str);
+			goto failed;
+		}
+
+		if (cpt < 0 || cpt >= ncpt) {
+			CERROR("Invalid partition id %d, total partitions %d\n",
+			       cpt, ncpt);
+			goto failed;
+		}
+
+		if (cfs_cpt_weight(cptab, cpt) != 0) {
+			CERROR("Partition %d has already been set.\n", cpt);
+			goto failed;
+		}
+
+		str = cfs_trimwhite(str + n);
+		if (str != bracket) {
+			CERROR("Invalid pattern %s\n", str);
+			goto failed;
+		}
+
+		bracket = strchr(str, ']');
+		if (bracket == NULL) {
+			CERROR("missing right bracket for cpt %d, %s\n",
+			       cpt, str);
+			goto failed;
+		}
+
+		if (cfs_expr_list_parse(str, (bracket - str) + 1,
+					0, high, &el) != 0) {
+			CERROR("Can't parse number range: %s\n", str);
+			goto failed;
+		}
+
+		list_for_each_entry(range, &el->el_exprs, re_link) {
+			for (i = range->re_lo; i <= range->re_hi; i++) {
+				if ((i - range->re_lo) % range->re_stride != 0)
+					continue;
+
+				rc = node ? cfs_cpt_set_node(cptab, cpt, i) :
+					    cfs_cpt_set_cpu(cptab, cpt, i);
+				if (!rc) {
+					cfs_expr_list_free(el);
+					goto failed;
+				}
+			}
+		}
+
+		cfs_expr_list_free(el);
+
+		if (!cfs_cpt_online(cptab, cpt)) {
+			CERROR("No online CPU is found on partition %d\n", cpt);
+			goto failed;
+		}
+
+		str = cfs_trimwhite(bracket + 1);
+	}
+
+	return cptab;
+
+ failed:
+	cfs_cpt_table_free(cptab);
+	return NULL;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static int
+cfs_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
+{
+	unsigned int  cpu = (unsigned long)hcpu;
+	bool	     warn;
+
+	switch (action) {
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
+		spin_lock(&cpt_data.cpt_lock);
+		cpt_data.cpt_version++;
+		spin_unlock(&cpt_data.cpt_lock);
+	default:
+		if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) {
+			CDEBUG(D_INFO, "CPU changed [cpu %u action %lx]\n",
+			       cpu, action);
+			break;
+		}
+
+		mutex_lock(&cpt_data.cpt_mutex);
+		/* if all HTs in a core are offline, it may break affinity */
+		cfs_cpu_ht_siblings(cpu, cpt_data.cpt_cpumask);
+		warn = cpumask_any_and(cpt_data.cpt_cpumask,
+				       cpu_online_mask) >= nr_cpu_ids;
+		mutex_unlock(&cpt_data.cpt_mutex);
+		CDEBUG(warn ? D_WARNING : D_INFO,
+		       "Lustre: can't support CPU plug-out well now, performance and stability could be impacted [CPU %u action: %lx]\n",
+		       cpu, action);
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block cfs_cpu_notifier = {
+	.notifier_call	= cfs_cpu_notify,
+	.priority	= 0
+};
+
+#endif
+
+void
+cfs_cpu_fini(void)
+{
+	if (cfs_cpt_table != NULL)
+		cfs_cpt_table_free(cfs_cpt_table);
+
+#ifdef CONFIG_HOTPLUG_CPU
+	unregister_hotcpu_notifier(&cfs_cpu_notifier);
+#endif
+	if (cpt_data.cpt_cpumask != NULL)
+		LIBCFS_FREE(cpt_data.cpt_cpumask, cpumask_size());
+}
+
+int
+cfs_cpu_init(void)
+{
+	LASSERT(cfs_cpt_table == NULL);
+
+	memset(&cpt_data, 0, sizeof(cpt_data));
+
+	LIBCFS_ALLOC(cpt_data.cpt_cpumask, cpumask_size());
+	if (cpt_data.cpt_cpumask == NULL) {
+		CERROR("Failed to allocate scratch buffer\n");
+		return -1;
+	}
+
+	spin_lock_init(&cpt_data.cpt_lock);
+	mutex_init(&cpt_data.cpt_mutex);
+
+#ifdef CONFIG_HOTPLUG_CPU
+	register_hotcpu_notifier(&cfs_cpu_notifier);
+#endif
+
+	if (*cpu_pattern != 0) {
+		cfs_cpt_table = cfs_cpt_table_create_pattern(cpu_pattern);
+		if (cfs_cpt_table == NULL) {
+			CERROR("Failed to create cptab from pattern %s\n",
+			       cpu_pattern);
+			goto failed;
+		}
+
+	} else {
+		cfs_cpt_table = cfs_cpt_table_create(cpu_npartitions);
+		if (cfs_cpt_table == NULL) {
+			CERROR("Failed to create ptable with npartitions %d\n",
+			       cpu_npartitions);
+			goto failed;
+		}
+	}
+
+	spin_lock(&cpt_data.cpt_lock);
+	if (cfs_cpt_table->ctb_version != cpt_data.cpt_version) {
+		spin_unlock(&cpt_data.cpt_lock);
+		CERROR("CPU hotplug/unplug during setup\n");
+		goto failed;
+	}
+	spin_unlock(&cpt_data.cpt_lock);
+
+	LCONSOLE(0, "HW CPU cores: %d, npartitions: %d\n",
+		 num_online_cpus(), cfs_cpt_number(cfs_cpt_table));
+	return 0;
+
+ failed:
+	cfs_cpu_fini();
+	return -1;
+}
+
+#endif
diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto-adler.c b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto-adler.c
new file mode 100644
index 000000000..5e185fa59
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto-adler.c
@@ -0,0 +1,141 @@
+/* GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please  visit http://www.xyratex.com/contact if you need additional
+ * information or have any questions.
+ *
+ * GPL HEADER END
+ */
+
+/*
+ * Copyright 2012 Xyratex Technology Limited
+ */
+
+/*
+ * This is crypto api shash wrappers to zlib_adler32.
+ */
+
+#include <linux/module.h>
+#include <linux/zutil.h>
+#include <crypto/internal/hash.h>
+#include "linux-crypto.h"
+
+#define CHKSUM_BLOCK_SIZE	1
+#define CHKSUM_DIGEST_SIZE	4
+
+static u32 __adler32(u32 cksum, unsigned char const *p, size_t len)
+{
+	return zlib_adler32(cksum, p, len);
+}
+
+static int adler32_cra_init(struct crypto_tfm *tfm)
+{
+	u32 *key = crypto_tfm_ctx(tfm);
+
+	*key = 1;
+
+	return 0;
+}
+
+static int adler32_setkey(struct crypto_shash *hash, const u8 *key,
+			  unsigned int keylen)
+{
+	u32 *mctx = crypto_shash_ctx(hash);
+
+	if (keylen != sizeof(u32)) {
+		crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+	*mctx = *(u32 *)key;
+	return 0;
+}
+
+static int adler32_init(struct shash_desc *desc)
+{
+	u32 *mctx = crypto_shash_ctx(desc->tfm);
+	u32 *cksump = shash_desc_ctx(desc);
+
+	*cksump = *mctx;
+
+	return 0;
+}
+
+static int adler32_update(struct shash_desc *desc, const u8 *data,
+			  unsigned int len)
+{
+	u32 *cksump = shash_desc_ctx(desc);
+
+	*cksump = __adler32(*cksump, data, len);
+	return 0;
+}
+static int __adler32_finup(u32 *cksump, const u8 *data, unsigned int len,
+			   u8 *out)
+{
+	*(u32 *)out = __adler32(*cksump, data, len);
+	return 0;
+}
+
+static int adler32_finup(struct shash_desc *desc, const u8 *data,
+			 unsigned int len, u8 *out)
+{
+	return __adler32_finup(shash_desc_ctx(desc), data, len, out);
+}
+
+static int adler32_final(struct shash_desc *desc, u8 *out)
+{
+	u32 *cksump = shash_desc_ctx(desc);
+
+	*(u32 *)out = *cksump;
+	return 0;
+}
+
+static int adler32_digest(struct shash_desc *desc, const u8 *data,
+			  unsigned int len, u8 *out)
+{
+	return __adler32_finup(crypto_shash_ctx(desc->tfm), data, len,
+				    out);
+}
+static struct shash_alg alg = {
+	.setkey		= adler32_setkey,
+	.init		= adler32_init,
+	.update		= adler32_update,
+	.final		= adler32_final,
+	.finup		= adler32_finup,
+	.digest		= adler32_digest,
+	.descsize	= sizeof(u32),
+	.digestsize	= CHKSUM_DIGEST_SIZE,
+	.base		= {
+		.cra_name		= "adler32",
+		.cra_driver_name	= "adler32-zlib",
+		.cra_priority		= 100,
+		.cra_blocksize		= CHKSUM_BLOCK_SIZE,
+		.cra_ctxsize		= sizeof(u32),
+		.cra_module		= THIS_MODULE,
+		.cra_init		= adler32_cra_init,
+	}
+};
+
+
+int cfs_crypto_adler32_register(void)
+{
+	return crypto_register_shash(&alg);
+}
+
+void cfs_crypto_adler32_unregister(void)
+{
+	crypto_unregister_shash(&alg);
+}
diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto.c b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto.c
new file mode 100644
index 000000000..aa3fffed1
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto.c
@@ -0,0 +1,291 @@
+/* GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please  visit http://www.xyratex.com/contact if you need additional
+ * information or have any questions.
+ *
+ * GPL HEADER END
+ */
+
+/*
+ * Copyright 2012 Xyratex Technology Limited
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+
+#include <linux/crypto.h>
+#include <linux/scatterlist.h>
+#include "../../../include/linux/libcfs/libcfs.h"
+#include "linux-crypto.h"
+/**
+ *  Array of  hash algorithm speed in MByte per second
+ */
+static int cfs_crypto_hash_speeds[CFS_HASH_ALG_MAX];
+
+
+
+static int cfs_crypto_hash_alloc(unsigned char alg_id,
+				 const struct cfs_crypto_hash_type **type,
+				 struct hash_desc *desc, unsigned char *key,
+				 unsigned int key_len)
+{
+	int     err = 0;
+
+	*type = cfs_crypto_hash_type(alg_id);
+
+	if (*type == NULL) {
+		CWARN("Unsupported hash algorithm id = %d, max id is %d\n",
+		      alg_id, CFS_HASH_ALG_MAX);
+		return -EINVAL;
+	}
+	desc->tfm = crypto_alloc_hash((*type)->cht_name, 0, 0);
+
+	if (desc->tfm == NULL)
+		return -EINVAL;
+
+	if (IS_ERR(desc->tfm)) {
+		CDEBUG(D_INFO, "Failed to alloc crypto hash %s\n",
+		       (*type)->cht_name);
+		return PTR_ERR(desc->tfm);
+	}
+
+	desc->flags = 0;
+
+	/** Shash have different logic for initialization then digest
+	 * shash: crypto_hash_setkey, crypto_hash_init
+	 * digest: crypto_digest_init, crypto_digest_setkey
+	 * Skip this function for digest, because we use shash logic at
+	 * cfs_crypto_hash_alloc.
+	 */
+	if (key != NULL) {
+		err = crypto_hash_setkey(desc->tfm, key, key_len);
+	} else if ((*type)->cht_key != 0) {
+		err = crypto_hash_setkey(desc->tfm,
+					 (unsigned char *)&((*type)->cht_key),
+					 (*type)->cht_size);
+	}
+
+	if (err != 0) {
+		crypto_free_hash(desc->tfm);
+		return err;
+	}
+
+	CDEBUG(D_INFO, "Using crypto hash: %s (%s) speed %d MB/s\n",
+	       (crypto_hash_tfm(desc->tfm))->__crt_alg->cra_name,
+	       (crypto_hash_tfm(desc->tfm))->__crt_alg->cra_driver_name,
+	       cfs_crypto_hash_speeds[alg_id]);
+
+	return crypto_hash_init(desc);
+}
+
+int cfs_crypto_hash_digest(unsigned char alg_id,
+			   const void *buf, unsigned int buf_len,
+			   unsigned char *key, unsigned int key_len,
+			   unsigned char *hash, unsigned int *hash_len)
+{
+	struct scatterlist	sl;
+	struct hash_desc	hdesc;
+	int			err;
+	const struct cfs_crypto_hash_type	*type;
+
+	if (buf == NULL || buf_len == 0 || hash_len == NULL)
+		return -EINVAL;
+
+	err = cfs_crypto_hash_alloc(alg_id, &type, &hdesc, key, key_len);
+	if (err != 0)
+		return err;
+
+	if (hash == NULL || *hash_len < type->cht_size) {
+		*hash_len = type->cht_size;
+		crypto_free_hash(hdesc.tfm);
+		return -ENOSPC;
+	}
+	sg_init_one(&sl, (void *)buf, buf_len);
+
+	hdesc.flags = 0;
+	err = crypto_hash_digest(&hdesc, &sl, sl.length, hash);
+	crypto_free_hash(hdesc.tfm);
+
+	return err;
+}
+EXPORT_SYMBOL(cfs_crypto_hash_digest);
+
+struct cfs_crypto_hash_desc *
+	cfs_crypto_hash_init(unsigned char alg_id,
+			     unsigned char *key, unsigned int key_len)
+{
+
+	struct  hash_desc       *hdesc;
+	int		     err;
+	const struct cfs_crypto_hash_type       *type;
+
+	hdesc = kmalloc(sizeof(*hdesc), 0);
+	if (hdesc == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	err = cfs_crypto_hash_alloc(alg_id, &type, hdesc, key, key_len);
+
+	if (err) {
+		kfree(hdesc);
+		return ERR_PTR(err);
+	}
+	return (struct cfs_crypto_hash_desc *)hdesc;
+}
+EXPORT_SYMBOL(cfs_crypto_hash_init);
+
+int cfs_crypto_hash_update_page(struct cfs_crypto_hash_desc *hdesc,
+				struct page *page, unsigned int offset,
+				unsigned int len)
+{
+	struct scatterlist sl;
+
+	sg_init_table(&sl, 1);
+	sg_set_page(&sl, page, len, offset & ~CFS_PAGE_MASK);
+
+	return crypto_hash_update((struct hash_desc *)hdesc, &sl, sl.length);
+}
+EXPORT_SYMBOL(cfs_crypto_hash_update_page);
+
+int cfs_crypto_hash_update(struct cfs_crypto_hash_desc *hdesc,
+			   const void *buf, unsigned int buf_len)
+{
+	struct scatterlist sl;
+
+	sg_init_one(&sl, (void *)buf, buf_len);
+
+	return crypto_hash_update((struct hash_desc *)hdesc, &sl, sl.length);
+}
+EXPORT_SYMBOL(cfs_crypto_hash_update);
+
+/*      If hash_len pointer is NULL - destroy descriptor. */
+int cfs_crypto_hash_final(struct cfs_crypto_hash_desc *hdesc,
+			  unsigned char *hash, unsigned int *hash_len)
+{
+	int     err;
+	int     size = crypto_hash_digestsize(((struct hash_desc *)hdesc)->tfm);
+
+	if (hash_len == NULL) {
+		crypto_free_hash(((struct hash_desc *)hdesc)->tfm);
+		kfree(hdesc);
+		return 0;
+	}
+	if (hash == NULL || *hash_len < size) {
+		*hash_len = size;
+		return -ENOSPC;
+	}
+	err = crypto_hash_final((struct hash_desc *) hdesc, hash);
+
+	if (err < 0) {
+		/* May be caller can fix error */
+		return err;
+	}
+	crypto_free_hash(((struct hash_desc *)hdesc)->tfm);
+	kfree(hdesc);
+	return err;
+}
+EXPORT_SYMBOL(cfs_crypto_hash_final);
+
+static void cfs_crypto_performance_test(unsigned char alg_id,
+					const unsigned char *buf,
+					unsigned int buf_len)
+{
+	unsigned long		   start, end;
+	int			     bcount, err = 0;
+	int			     sec = 1; /* do test only 1 sec */
+	unsigned char		   hash[64];
+	unsigned int		    hash_len = 64;
+
+	for (start = jiffies, end = start + sec * HZ, bcount = 0;
+	     time_before(jiffies, end); bcount++) {
+		err = cfs_crypto_hash_digest(alg_id, buf, buf_len, NULL, 0,
+					     hash, &hash_len);
+		if (err)
+			break;
+
+	}
+	end = jiffies;
+
+	if (err) {
+		cfs_crypto_hash_speeds[alg_id] =  -1;
+		CDEBUG(D_INFO, "Crypto hash algorithm %s, err = %d\n",
+		       cfs_crypto_hash_name(alg_id), err);
+	} else {
+		unsigned long   tmp;
+		tmp = ((bcount * buf_len / jiffies_to_msecs(end - start)) *
+		       1000) / (1024 * 1024);
+		cfs_crypto_hash_speeds[alg_id] = (int)tmp;
+	}
+	CDEBUG(D_INFO, "Crypto hash algorithm %s speed = %d MB/s\n",
+	       cfs_crypto_hash_name(alg_id), cfs_crypto_hash_speeds[alg_id]);
+}
+
+int cfs_crypto_hash_speed(unsigned char hash_alg)
+{
+	if (hash_alg < CFS_HASH_ALG_MAX)
+		return cfs_crypto_hash_speeds[hash_alg];
+	else
+		return -1;
+}
+EXPORT_SYMBOL(cfs_crypto_hash_speed);
+
+/**
+ * Do performance test for all hash algorithms.
+ */
+static int cfs_crypto_test_hashes(void)
+{
+	unsigned char	   i;
+	unsigned char	   *data;
+	unsigned int	    j;
+	/* Data block size for testing hash. Maximum
+	 * kmalloc size for 2.6.18 kernel is 128K */
+	unsigned int	    data_len = 1 * 128 * 1024;
+
+	data = kmalloc(data_len, 0);
+	if (data == NULL) {
+		CERROR("Failed to allocate mem\n");
+		return -ENOMEM;
+	}
+
+	for (j = 0; j < data_len; j++)
+		data[j] = j & 0xff;
+
+	for (i = 0; i < CFS_HASH_ALG_MAX; i++)
+		cfs_crypto_performance_test(i, data, data_len);
+
+	kfree(data);
+	return 0;
+}
+
+static int adler32;
+
+int cfs_crypto_register(void)
+{
+	request_module("crc32c");
+
+	adler32 = cfs_crypto_adler32_register();
+
+	/* check all algorithms and do performance test */
+	cfs_crypto_test_hashes();
+	return 0;
+}
+void cfs_crypto_unregister(void)
+{
+	if (adler32 == 0)
+		cfs_crypto_adler32_unregister();
+
+	return;
+}
diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto.h b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto.h
new file mode 100644
index 000000000..18e8cd4d8
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto.h
@@ -0,0 +1,29 @@
+ /*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please  visit http://www.xyratex.com/contact if you need additional
+ * information or have any questions.
+ *
+ * GPL HEADER END
+ */
+
+/**
+ * Functions for start/stop shash adler32 algorithm.
+ */
+int cfs_crypto_adler32_register(void);
+void cfs_crypto_adler32_unregister(void);
diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-curproc.c b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-curproc.c
new file mode 100644
index 000000000..277f6b890
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-curproc.c
@@ -0,0 +1,111 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/linux/linux-curproc.c
+ *
+ * Lustre curproc API implementation for Linux kernel
+ *
+ * Author: Nikita Danilov <nikita@clusterfs.com>
+ */
+
+#include <linux/sched.h>
+#include <linux/fs_struct.h>
+
+#include <linux/compat.h>
+#include <linux/thread_info.h>
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include "../../../include/linux/libcfs/libcfs.h"
+
+/*
+ * Implementation of cfs_curproc API (see portals/include/libcfs/curproc.h)
+ * for Linux kernel.
+ */
+
+void cfs_cap_raise(cfs_cap_t cap)
+{
+	struct cred *cred;
+
+	cred = prepare_creds();
+	if (cred) {
+		cap_raise(cred->cap_effective, cap);
+		commit_creds(cred);
+	}
+}
+
+void cfs_cap_lower(cfs_cap_t cap)
+{
+	struct cred *cred;
+
+	cred = prepare_creds();
+	if (cred) {
+		cap_lower(cred->cap_effective, cap);
+		commit_creds(cred);
+	}
+}
+
+int cfs_cap_raised(cfs_cap_t cap)
+{
+	return cap_raised(current_cap(), cap);
+}
+
+static void cfs_kernel_cap_pack(kernel_cap_t kcap, cfs_cap_t *cap)
+{
+	/* XXX lost high byte */
+	*cap = kcap.cap[0];
+}
+
+cfs_cap_t cfs_curproc_cap_pack(void)
+{
+	cfs_cap_t cap;
+	cfs_kernel_cap_pack(current_cap(), &cap);
+	return cap;
+}
+
+EXPORT_SYMBOL(cfs_cap_raise);
+EXPORT_SYMBOL(cfs_cap_lower);
+EXPORT_SYMBOL(cfs_cap_raised);
+EXPORT_SYMBOL(cfs_curproc_cap_pack);
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 80
+ * scroll-step: 1
+ * End:
+ */
diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-debug.c b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-debug.c
new file mode 100644
index 000000000..4545d54f7
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-debug.c
@@ -0,0 +1,200 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/linux/linux-debug.c
+ *
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+#include <linux/module.h>
+#include <linux/kmod.h>
+#include <linux/notifier.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/interrupt.h>
+#include <linux/completion.h>
+#include <linux/fs.h>
+#include <linux/uaccess.h>
+#include <linux/miscdevice.h>
+
+# define DEBUG_SUBSYSTEM S_LNET
+
+#include "../../../include/linux/libcfs/libcfs.h"
+
+#include "../tracefile.h"
+
+#include <linux/kallsyms.h>
+
+char lnet_upcall[1024] = "/usr/lib/lustre/lnet_upcall";
+char lnet_debug_log_upcall[1024] = "/usr/lib/lustre/lnet_debug_log_upcall";
+
+/**
+ * Upcall function once a Lustre log has been dumped.
+ *
+ * \param file  path of the dumped log
+ */
+void libcfs_run_debug_log_upcall(char *file)
+{
+	char *argv[3];
+	int   rc;
+	char *envp[] = {
+		"HOME=/",
+		"PATH=/sbin:/bin:/usr/sbin:/usr/bin",
+		NULL};
+
+	argv[0] = lnet_debug_log_upcall;
+
+	LASSERTF(file != NULL, "called on a null filename\n");
+	argv[1] = file; /* only need to pass the path of the file */
+
+	argv[2] = NULL;
+
+	rc = call_usermodehelper(argv[0], argv, envp, 1);
+	if (rc < 0 && rc != -ENOENT) {
+		CERROR("Error %d invoking LNET debug log upcall %s %s; check /proc/sys/lnet/debug_log_upcall\n",
+		       rc, argv[0], argv[1]);
+	} else {
+		CDEBUG(D_HA, "Invoked LNET debug log upcall %s %s\n",
+		       argv[0], argv[1]);
+	}
+}
+
+void libcfs_run_upcall(char **argv)
+{
+	int   rc;
+	int   argc;
+	char *envp[] = {
+		"HOME=/",
+		"PATH=/sbin:/bin:/usr/sbin:/usr/bin",
+		NULL};
+
+	argv[0] = lnet_upcall;
+	argc = 1;
+	while (argv[argc] != NULL)
+		argc++;
+
+	LASSERT(argc >= 2);
+
+	rc = call_usermodehelper(argv[0], argv, envp, 1);
+	if (rc < 0 && rc != -ENOENT) {
+		CERROR("Error %d invoking LNET upcall %s %s%s%s%s%s%s%s%s; check /proc/sys/lnet/upcall\n",
+		       rc, argv[0], argv[1],
+		       argc < 3 ? "" : ",", argc < 3 ? "" : argv[2],
+		       argc < 4 ? "" : ",", argc < 4 ? "" : argv[3],
+		       argc < 5 ? "" : ",", argc < 5 ? "" : argv[4],
+		       argc < 6 ? "" : ",...");
+	} else {
+		CDEBUG(D_HA, "Invoked LNET upcall %s %s%s%s%s%s%s%s%s\n",
+		       argv[0], argv[1],
+		       argc < 3 ? "" : ",", argc < 3 ? "" : argv[2],
+		       argc < 4 ? "" : ",", argc < 4 ? "" : argv[3],
+		       argc < 5 ? "" : ",", argc < 5 ? "" : argv[4],
+		       argc < 6 ? "" : ",...");
+	}
+}
+
+void libcfs_run_lbug_upcall(struct libcfs_debug_msg_data *msgdata)
+{
+	char *argv[6];
+	char buf[32];
+
+	snprintf(buf, sizeof(buf), "%d", msgdata->msg_line);
+
+	argv[1] = "LBUG";
+	argv[2] = (char *)msgdata->msg_file;
+	argv[3] = (char *)msgdata->msg_fn;
+	argv[4] = buf;
+	argv[5] = NULL;
+
+	libcfs_run_upcall (argv);
+}
+
+/* coverity[+kill] */
+void lbug_with_loc(struct libcfs_debug_msg_data *msgdata)
+{
+	libcfs_catastrophe = 1;
+	libcfs_debug_msg(msgdata, "LBUG\n");
+
+	if (in_interrupt()) {
+		panic("LBUG in interrupt.\n");
+		/* not reached */
+	}
+
+	dump_stack();
+	if (!libcfs_panic_on_lbug)
+		libcfs_debug_dumplog();
+	libcfs_run_lbug_upcall(msgdata);
+	if (libcfs_panic_on_lbug)
+		panic("LBUG");
+	set_task_state(current, TASK_UNINTERRUPTIBLE);
+	while (1)
+		schedule();
+}
+
+static int panic_notifier(struct notifier_block *self, unsigned long unused1,
+			 void *unused2)
+{
+	if (libcfs_panic_in_progress)
+		return 0;
+
+	libcfs_panic_in_progress = 1;
+	mb();
+
+	return 0;
+}
+
+static struct notifier_block libcfs_panic_notifier = {
+	.notifier_call	= panic_notifier,
+	.next		= NULL,
+	.priority	= 10000,
+};
+
+void libcfs_register_panic_notifier(void)
+{
+	atomic_notifier_chain_register(&panic_notifier_list, &libcfs_panic_notifier);
+}
+
+void libcfs_unregister_panic_notifier(void)
+{
+	atomic_notifier_chain_unregister(&panic_notifier_list, &libcfs_panic_notifier);
+}
+
+EXPORT_SYMBOL(libcfs_run_upcall);
+EXPORT_SYMBOL(libcfs_run_lbug_upcall);
+EXPORT_SYMBOL(lbug_with_loc);
diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-module.c b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-module.c
new file mode 100644
index 000000000..e962f8968
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-module.c
@@ -0,0 +1,183 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include "../../../include/linux/libcfs/libcfs.h"
+
+#define LNET_MINOR 240
+
+int libcfs_ioctl_getdata(char *buf, char *end, void *arg)
+{
+	struct libcfs_ioctl_hdr   *hdr;
+	struct libcfs_ioctl_data  *data;
+	int orig_len;
+
+	hdr = (struct libcfs_ioctl_hdr *)buf;
+	data = (struct libcfs_ioctl_data *)buf;
+
+	if (copy_from_user(buf, (void *)arg, sizeof(*hdr)))
+		return -EFAULT;
+
+	if (hdr->ioc_version != LIBCFS_IOCTL_VERSION) {
+		CERROR("PORTALS: version mismatch kernel vs application\n");
+		return -EINVAL;
+	}
+
+	if (hdr->ioc_len >= end - buf) {
+		CERROR("PORTALS: user buffer exceeds kernel buffer\n");
+		return -EINVAL;
+	}
+
+
+	if (hdr->ioc_len < sizeof(struct libcfs_ioctl_data)) {
+		CERROR("PORTALS: user buffer too small for ioctl\n");
+		return -EINVAL;
+	}
+
+	orig_len = hdr->ioc_len;
+	if (copy_from_user(buf, (void *)arg, hdr->ioc_len))
+		return -EFAULT;
+	if (orig_len != data->ioc_len)
+		return -EINVAL;
+
+	if (libcfs_ioctl_is_invalid(data)) {
+		CERROR("PORTALS: ioctl not correctly formatted\n");
+		return -EINVAL;
+	}
+
+	if (data->ioc_inllen1)
+		data->ioc_inlbuf1 = &data->ioc_bulk[0];
+
+	if (data->ioc_inllen2)
+		data->ioc_inlbuf2 = &data->ioc_bulk[0] +
+			cfs_size_round(data->ioc_inllen1);
+
+	return 0;
+}
+
+int libcfs_ioctl_popdata(void *arg, void *data, int size)
+{
+	if (copy_to_user((char *)arg, data, size))
+		return -EFAULT;
+	return 0;
+}
+
+extern struct cfs_psdev_ops	  libcfs_psdev_ops;
+
+static int
+libcfs_psdev_open(struct inode *inode, struct file *file)
+{
+	struct libcfs_device_userstate **pdu = NULL;
+	int    rc = 0;
+
+	if (!inode)
+		return -EINVAL;
+	pdu = (struct libcfs_device_userstate **)&file->private_data;
+	if (libcfs_psdev_ops.p_open != NULL)
+		rc = libcfs_psdev_ops.p_open(0, (void *)pdu);
+	else
+		return -EPERM;
+	return rc;
+}
+
+/* called when closing /dev/device */
+static int
+libcfs_psdev_release(struct inode *inode, struct file *file)
+{
+	struct libcfs_device_userstate *pdu;
+	int    rc = 0;
+
+	if (!inode)
+		return -EINVAL;
+	pdu = file->private_data;
+	if (libcfs_psdev_ops.p_close != NULL)
+		rc = libcfs_psdev_ops.p_close(0, (void *)pdu);
+	else
+		rc = -EPERM;
+	return rc;
+}
+
+static long libcfs_ioctl(struct file *file,
+			 unsigned int cmd, unsigned long arg)
+{
+	struct cfs_psdev_file	 pfile;
+	int    rc = 0;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	if (_IOC_TYPE(cmd) != IOC_LIBCFS_TYPE ||
+	     _IOC_NR(cmd) < IOC_LIBCFS_MIN_NR  ||
+	     _IOC_NR(cmd) > IOC_LIBCFS_MAX_NR) {
+		CDEBUG(D_IOCTL, "invalid ioctl ( type %d, nr %d, size %d )\n",
+		       _IOC_TYPE(cmd), _IOC_NR(cmd), _IOC_SIZE(cmd));
+		return -EINVAL;
+	}
+
+	/* Handle platform-dependent IOC requests */
+	switch (cmd) {
+	case IOC_LIBCFS_PANIC:
+		if (!capable(CFS_CAP_SYS_BOOT))
+			return -EPERM;
+		panic("debugctl-invoked panic");
+		return 0;
+	case IOC_LIBCFS_MEMHOG:
+		if (!capable(CFS_CAP_SYS_ADMIN))
+			return -EPERM;
+		/* go thought */
+	}
+
+	pfile.off = 0;
+	pfile.private_data = file->private_data;
+	if (libcfs_psdev_ops.p_ioctl != NULL)
+		rc = libcfs_psdev_ops.p_ioctl(&pfile, cmd, (void *)arg);
+	else
+		rc = -EPERM;
+	return rc;
+}
+
+static const struct file_operations libcfs_fops = {
+	.unlocked_ioctl	= libcfs_ioctl,
+	.open		= libcfs_psdev_open,
+	.release	= libcfs_psdev_release,
+};
+
+struct miscdevice libcfs_dev = {
+	.minor = LNET_MINOR,
+	.name = "lnet",
+	.fops = &libcfs_fops,
+};
diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-prim.c b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-prim.c
new file mode 100644
index 000000000..838f5f3bd
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-prim.c
@@ -0,0 +1,217 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/fs_struct.h>
+#include <linux/sched.h>
+
+#include "../../../include/linux/libcfs/libcfs.h"
+
+#if defined(CONFIG_KGDB)
+#include <linux/kgdb.h>
+#endif
+
+/**
+ * wait_queue_t of Linux (version < 2.6.34) is a FIFO list for exclusively
+ * waiting threads, which is not always desirable because all threads will
+ * be waken up again and again, even user only needs a few of them to be
+ * active most time. This is not good for performance because cache can
+ * be polluted by different threads.
+ *
+ * LIFO list can resolve this problem because we always wakeup the most
+ * recent active thread by default.
+ *
+ * NB: please don't call non-exclusive & exclusive wait on the same
+ * waitq if add_wait_queue_exclusive_head is used.
+ */
+void
+add_wait_queue_exclusive_head(wait_queue_head_t *waitq, wait_queue_t *link)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&waitq->lock, flags);
+	__add_wait_queue_exclusive(waitq, link);
+	spin_unlock_irqrestore(&waitq->lock, flags);
+}
+EXPORT_SYMBOL(add_wait_queue_exclusive_head);
+
+void cfs_init_timer(struct timer_list *t)
+{
+	init_timer(t);
+}
+EXPORT_SYMBOL(cfs_init_timer);
+
+void cfs_timer_init(struct timer_list *t, cfs_timer_func_t *func, void *arg)
+{
+	init_timer(t);
+	t->function = func;
+	t->data = (unsigned long)arg;
+}
+EXPORT_SYMBOL(cfs_timer_init);
+
+void cfs_timer_done(struct timer_list *t)
+{
+	return;
+}
+EXPORT_SYMBOL(cfs_timer_done);
+
+void cfs_timer_arm(struct timer_list *t, unsigned long deadline)
+{
+	mod_timer(t, deadline);
+}
+EXPORT_SYMBOL(cfs_timer_arm);
+
+void cfs_timer_disarm(struct timer_list *t)
+{
+	del_timer(t);
+}
+EXPORT_SYMBOL(cfs_timer_disarm);
+
+int  cfs_timer_is_armed(struct timer_list *t)
+{
+	return timer_pending(t);
+}
+EXPORT_SYMBOL(cfs_timer_is_armed);
+
+unsigned long cfs_timer_deadline(struct timer_list *t)
+{
+	return t->expires;
+}
+EXPORT_SYMBOL(cfs_timer_deadline);
+
+void cfs_enter_debugger(void)
+{
+#if defined(CONFIG_KGDB)
+	/* BREAKPOINT(); */
+#else
+	/* nothing */
+#endif
+}
+EXPORT_SYMBOL(cfs_enter_debugger);
+
+
+sigset_t
+cfs_block_allsigs(void)
+{
+	unsigned long	  flags;
+	sigset_t	old;
+
+	spin_lock_irqsave(&current->sighand->siglock, flags);
+	old = current->blocked;
+	sigfillset(&current->blocked);
+	recalc_sigpending();
+	spin_unlock_irqrestore(&current->sighand->siglock, flags);
+
+	return old;
+}
+EXPORT_SYMBOL(cfs_block_allsigs);
+
+sigset_t cfs_block_sigs(unsigned long sigs)
+{
+	unsigned long  flags;
+	sigset_t	old;
+
+	spin_lock_irqsave(&current->sighand->siglock, flags);
+	old = current->blocked;
+	sigaddsetmask(&current->blocked, sigs);
+	recalc_sigpending();
+	spin_unlock_irqrestore(&current->sighand->siglock, flags);
+	return old;
+}
+EXPORT_SYMBOL(cfs_block_sigs);
+
+/* Block all signals except for the @sigs */
+sigset_t cfs_block_sigsinv(unsigned long sigs)
+{
+	unsigned long flags;
+	sigset_t old;
+
+	spin_lock_irqsave(&current->sighand->siglock, flags);
+	old = current->blocked;
+	sigaddsetmask(&current->blocked, ~sigs);
+	recalc_sigpending();
+	spin_unlock_irqrestore(&current->sighand->siglock, flags);
+
+	return old;
+}
+EXPORT_SYMBOL(cfs_block_sigsinv);
+
+void
+cfs_restore_sigs(sigset_t old)
+{
+	unsigned long  flags;
+
+	spin_lock_irqsave(&current->sighand->siglock, flags);
+	current->blocked = old;
+	recalc_sigpending();
+	spin_unlock_irqrestore(&current->sighand->siglock, flags);
+}
+EXPORT_SYMBOL(cfs_restore_sigs);
+
+int
+cfs_signal_pending(void)
+{
+	return signal_pending(current);
+}
+EXPORT_SYMBOL(cfs_signal_pending);
+
+void
+cfs_clear_sigpending(void)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&current->sighand->siglock, flags);
+	clear_tsk_thread_flag(current, TIF_SIGPENDING);
+	spin_unlock_irqrestore(&current->sighand->siglock, flags);
+}
+EXPORT_SYMBOL(cfs_clear_sigpending);
+
+int
+libcfs_arch_init(void)
+{
+	return 0;
+}
+EXPORT_SYMBOL(libcfs_arch_init);
+
+void
+libcfs_arch_cleanup(void)
+{
+	return;
+}
+EXPORT_SYMBOL(libcfs_arch_cleanup);
+
diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-tcpip.c b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-tcpip.c
new file mode 100644
index 000000000..f2462e7f0
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-tcpip.c
@@ -0,0 +1,623 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include "../../../include/linux/libcfs/libcfs.h"
+
+#include <linux/if.h>
+#include <linux/in.h>
+#include <linux/file.h>
+/* For sys_open & sys_close */
+#include <linux/syscalls.h>
+
+static int
+libcfs_sock_ioctl(int cmd, unsigned long arg)
+{
+	mm_segment_t	oldmm = get_fs();
+	struct socket  *sock;
+	int		rc;
+	struct file    *sock_filp;
+
+	rc = sock_create (PF_INET, SOCK_STREAM, 0, &sock);
+	if (rc != 0) {
+		CERROR ("Can't create socket: %d\n", rc);
+		return rc;
+	}
+
+	sock_filp = sock_alloc_file(sock, 0, NULL);
+	if (IS_ERR(sock_filp)) {
+		sock_release(sock);
+		rc = PTR_ERR(sock_filp);
+		goto out;
+	}
+
+	set_fs(KERNEL_DS);
+	if (sock_filp->f_op->unlocked_ioctl)
+		rc = sock_filp->f_op->unlocked_ioctl(sock_filp, cmd, arg);
+	set_fs(oldmm);
+
+	fput(sock_filp);
+out:
+	return rc;
+}
+
+int
+libcfs_ipif_query (char *name, int *up, __u32 *ip, __u32 *mask)
+{
+	struct ifreq   ifr;
+	int	    nob;
+	int	    rc;
+	__u32	  val;
+
+	nob = strnlen(name, IFNAMSIZ);
+	if (nob == IFNAMSIZ) {
+		CERROR("Interface name %s too long\n", name);
+		return -EINVAL;
+	}
+
+	CLASSERT (sizeof(ifr.ifr_name) >= IFNAMSIZ);
+
+	strcpy(ifr.ifr_name, name);
+	rc = libcfs_sock_ioctl(SIOCGIFFLAGS, (unsigned long)&ifr);
+
+	if (rc != 0) {
+		CERROR("Can't get flags for interface %s\n", name);
+		return rc;
+	}
+
+	if ((ifr.ifr_flags & IFF_UP) == 0) {
+		CDEBUG(D_NET, "Interface %s down\n", name);
+		*up = 0;
+		*ip = *mask = 0;
+		return 0;
+	}
+
+	*up = 1;
+
+	strcpy(ifr.ifr_name, name);
+	ifr.ifr_addr.sa_family = AF_INET;
+	rc = libcfs_sock_ioctl(SIOCGIFADDR, (unsigned long)&ifr);
+
+	if (rc != 0) {
+		CERROR("Can't get IP address for interface %s\n", name);
+		return rc;
+	}
+
+	val = ((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr.s_addr;
+	*ip = ntohl(val);
+
+	strcpy(ifr.ifr_name, name);
+	ifr.ifr_addr.sa_family = AF_INET;
+	rc = libcfs_sock_ioctl(SIOCGIFNETMASK, (unsigned long)&ifr);
+
+	if (rc != 0) {
+		CERROR("Can't get netmask for interface %s\n", name);
+		return rc;
+	}
+
+	val = ((struct sockaddr_in *)&ifr.ifr_netmask)->sin_addr.s_addr;
+	*mask = ntohl(val);
+
+	return 0;
+}
+
+EXPORT_SYMBOL(libcfs_ipif_query);
+
+int
+libcfs_ipif_enumerate (char ***namesp)
+{
+	/* Allocate and fill in 'names', returning # interfaces/error */
+	char	   **names;
+	int	     toobig;
+	int	     nalloc;
+	int	     nfound;
+	struct ifreq   *ifr;
+	struct ifconf   ifc;
+	int	     rc;
+	int	     nob;
+	int	     i;
+
+
+	nalloc = 16;	/* first guess at max interfaces */
+	toobig = 0;
+	for (;;) {
+		if (nalloc * sizeof(*ifr) > PAGE_CACHE_SIZE) {
+			toobig = 1;
+			nalloc = PAGE_CACHE_SIZE/sizeof(*ifr);
+			CWARN("Too many interfaces: only enumerating first %d\n",
+			      nalloc);
+		}
+
+		LIBCFS_ALLOC(ifr, nalloc * sizeof(*ifr));
+		if (ifr == NULL) {
+			CERROR ("ENOMEM enumerating up to %d interfaces\n", nalloc);
+			rc = -ENOMEM;
+			goto out0;
+		}
+
+		ifc.ifc_buf = (char *)ifr;
+		ifc.ifc_len = nalloc * sizeof(*ifr);
+
+		rc = libcfs_sock_ioctl(SIOCGIFCONF, (unsigned long)&ifc);
+
+		if (rc < 0) {
+			CERROR ("Error %d enumerating interfaces\n", rc);
+			goto out1;
+		}
+
+		LASSERT (rc == 0);
+
+		nfound = ifc.ifc_len/sizeof(*ifr);
+		LASSERT (nfound <= nalloc);
+
+		if (nfound < nalloc || toobig)
+			break;
+
+		LIBCFS_FREE(ifr, nalloc * sizeof(*ifr));
+		nalloc *= 2;
+	}
+
+	if (nfound == 0)
+		goto out1;
+
+	LIBCFS_ALLOC(names, nfound * sizeof(*names));
+	if (names == NULL) {
+		rc = -ENOMEM;
+		goto out1;
+	}
+
+	for (i = 0; i < nfound; i++) {
+
+		nob = strnlen (ifr[i].ifr_name, IFNAMSIZ);
+		if (nob == IFNAMSIZ) {
+			/* no space for terminating NULL */
+			CERROR("interface name %.*s too long (%d max)\n",
+			       nob, ifr[i].ifr_name, IFNAMSIZ);
+			rc = -ENAMETOOLONG;
+			goto out2;
+		}
+
+		LIBCFS_ALLOC(names[i], IFNAMSIZ);
+		if (names[i] == NULL) {
+			rc = -ENOMEM;
+			goto out2;
+		}
+
+		memcpy(names[i], ifr[i].ifr_name, nob);
+		names[i][nob] = 0;
+	}
+
+	*namesp = names;
+	rc = nfound;
+
+ out2:
+	if (rc < 0)
+		libcfs_ipif_free_enumeration(names, nfound);
+ out1:
+	LIBCFS_FREE(ifr, nalloc * sizeof(*ifr));
+ out0:
+	return rc;
+}
+
+EXPORT_SYMBOL(libcfs_ipif_enumerate);
+
+void
+libcfs_ipif_free_enumeration (char **names, int n)
+{
+	int      i;
+
+	LASSERT (n > 0);
+
+	for (i = 0; i < n && names[i] != NULL; i++)
+		LIBCFS_FREE(names[i], IFNAMSIZ);
+
+	LIBCFS_FREE(names, n * sizeof(*names));
+}
+
+EXPORT_SYMBOL(libcfs_ipif_free_enumeration);
+
+int
+libcfs_sock_write (struct socket *sock, void *buffer, int nob, int timeout)
+{
+	int	    rc;
+	long	   ticks = timeout * HZ;
+	unsigned long  then;
+	struct timeval tv;
+
+	LASSERT (nob > 0);
+	/* Caller may pass a zero timeout if she thinks the socket buffer is
+	 * empty enough to take the whole message immediately */
+
+	for (;;) {
+		struct kvec  iov = {
+			.iov_base = buffer,
+			.iov_len  = nob
+		};
+		struct msghdr msg = {
+			.msg_flags      = (timeout == 0) ? MSG_DONTWAIT : 0
+		};
+
+		if (timeout != 0) {
+			/* Set send timeout to remaining time */
+			tv = (struct timeval) {
+				.tv_sec = ticks / HZ,
+				.tv_usec = ((ticks % HZ) * 1000000) / HZ
+			};
+			rc = kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO,
+					     (char *)&tv, sizeof(tv));
+			if (rc != 0) {
+				CERROR("Can't set socket send timeout %ld.%06d: %d\n",
+				       (long)tv.tv_sec, (int)tv.tv_usec, rc);
+				return rc;
+			}
+		}
+
+		then = jiffies;
+		rc = kernel_sendmsg(sock, &msg, &iov, 1, nob);
+		ticks -= jiffies - then;
+
+		if (rc == nob)
+			return 0;
+
+		if (rc < 0)
+			return rc;
+
+		if (rc == 0) {
+			CERROR ("Unexpected zero rc\n");
+			return -ECONNABORTED;
+		}
+
+		if (ticks <= 0)
+			return -EAGAIN;
+
+		buffer = ((char *)buffer) + rc;
+		nob -= rc;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(libcfs_sock_write);
+
+int
+libcfs_sock_read (struct socket *sock, void *buffer, int nob, int timeout)
+{
+	int	    rc;
+	long	   ticks = timeout * HZ;
+	unsigned long  then;
+	struct timeval tv;
+
+	LASSERT (nob > 0);
+	LASSERT (ticks > 0);
+
+	for (;;) {
+		struct kvec  iov = {
+			.iov_base = buffer,
+			.iov_len  = nob
+		};
+		struct msghdr msg = {
+			.msg_flags      = 0
+		};
+
+		/* Set receive timeout to remaining time */
+		tv = (struct timeval) {
+			.tv_sec = ticks / HZ,
+			.tv_usec = ((ticks % HZ) * 1000000) / HZ
+		};
+		rc = kernel_setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO,
+				     (char *)&tv, sizeof(tv));
+		if (rc != 0) {
+			CERROR("Can't set socket recv timeout %ld.%06d: %d\n",
+			       (long)tv.tv_sec, (int)tv.tv_usec, rc);
+			return rc;
+		}
+
+		then = jiffies;
+		rc = kernel_recvmsg(sock, &msg, &iov, 1, nob, 0);
+		ticks -= jiffies - then;
+
+		if (rc < 0)
+			return rc;
+
+		if (rc == 0)
+			return -ECONNRESET;
+
+		buffer = ((char *)buffer) + rc;
+		nob -= rc;
+
+		if (nob == 0)
+			return 0;
+
+		if (ticks <= 0)
+			return -ETIMEDOUT;
+	}
+}
+
+EXPORT_SYMBOL(libcfs_sock_read);
+
+static int
+libcfs_sock_create (struct socket **sockp, int *fatal,
+		    __u32 local_ip, int local_port)
+{
+	struct sockaddr_in  locaddr;
+	struct socket      *sock;
+	int		 rc;
+	int		 option;
+
+	/* All errors are fatal except bind failure if the port is in use */
+	*fatal = 1;
+
+	rc = sock_create (PF_INET, SOCK_STREAM, 0, &sock);
+	*sockp = sock;
+	if (rc != 0) {
+		CERROR ("Can't create socket: %d\n", rc);
+		return rc;
+	}
+
+	option = 1;
+	rc = kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
+			     (char *)&option, sizeof (option));
+	if (rc != 0) {
+		CERROR("Can't set SO_REUSEADDR for socket: %d\n", rc);
+		goto failed;
+	}
+
+	if (local_ip != 0 || local_port != 0) {
+		memset(&locaddr, 0, sizeof(locaddr));
+		locaddr.sin_family = AF_INET;
+		locaddr.sin_port = htons(local_port);
+		locaddr.sin_addr.s_addr = (local_ip == 0) ?
+					  INADDR_ANY : htonl(local_ip);
+
+		rc = sock->ops->bind(sock, (struct sockaddr *)&locaddr,
+				     sizeof(locaddr));
+		if (rc == -EADDRINUSE) {
+			CDEBUG(D_NET, "Port %d already in use\n", local_port);
+			*fatal = 0;
+			goto failed;
+		}
+		if (rc != 0) {
+			CERROR("Error trying to bind to port %d: %d\n",
+			       local_port, rc);
+			goto failed;
+		}
+	}
+
+	return 0;
+
+ failed:
+	sock_release(sock);
+	return rc;
+}
+
+int
+libcfs_sock_setbuf (struct socket *sock, int txbufsize, int rxbufsize)
+{
+	int		 option;
+	int		 rc;
+
+	if (txbufsize != 0) {
+		option = txbufsize;
+		rc = kernel_setsockopt(sock, SOL_SOCKET, SO_SNDBUF,
+				     (char *)&option, sizeof (option));
+		if (rc != 0) {
+			CERROR ("Can't set send buffer %d: %d\n",
+				option, rc);
+			return rc;
+		}
+	}
+
+	if (rxbufsize != 0) {
+		option = rxbufsize;
+		rc = kernel_setsockopt(sock, SOL_SOCKET, SO_RCVBUF,
+				      (char *)&option, sizeof (option));
+		if (rc != 0) {
+			CERROR ("Can't set receive buffer %d: %d\n",
+				option, rc);
+			return rc;
+		}
+	}
+
+	return 0;
+}
+
+EXPORT_SYMBOL(libcfs_sock_setbuf);
+
+int
+libcfs_sock_getaddr (struct socket *sock, int remote, __u32 *ip, int *port)
+{
+	struct sockaddr_in sin;
+	int		len = sizeof (sin);
+	int		rc;
+
+	rc = sock->ops->getname (sock, (struct sockaddr *)&sin, &len,
+				 remote ? 2 : 0);
+	if (rc != 0) {
+		CERROR ("Error %d getting sock %s IP/port\n",
+			rc, remote ? "peer" : "local");
+		return rc;
+	}
+
+	if (ip != NULL)
+		*ip = ntohl (sin.sin_addr.s_addr);
+
+	if (port != NULL)
+		*port = ntohs (sin.sin_port);
+
+	return 0;
+}
+
+EXPORT_SYMBOL(libcfs_sock_getaddr);
+
+int
+libcfs_sock_getbuf (struct socket *sock, int *txbufsize, int *rxbufsize)
+{
+
+	if (txbufsize != NULL) {
+		*txbufsize = sock->sk->sk_sndbuf;
+	}
+
+	if (rxbufsize != NULL) {
+		*rxbufsize = sock->sk->sk_rcvbuf;
+	}
+
+	return 0;
+}
+
+EXPORT_SYMBOL(libcfs_sock_getbuf);
+
+int
+libcfs_sock_listen (struct socket **sockp,
+		    __u32 local_ip, int local_port, int backlog)
+{
+	int      fatal;
+	int      rc;
+
+	rc = libcfs_sock_create(sockp, &fatal, local_ip, local_port);
+	if (rc != 0) {
+		if (!fatal)
+			CERROR("Can't create socket: port %d already in use\n",
+			       local_port);
+		return rc;
+	}
+
+	rc = (*sockp)->ops->listen(*sockp, backlog);
+	if (rc == 0)
+		return 0;
+
+	CERROR("Can't set listen backlog %d: %d\n", backlog, rc);
+	sock_release(*sockp);
+	return rc;
+}
+
+EXPORT_SYMBOL(libcfs_sock_listen);
+
+int
+libcfs_sock_accept (struct socket **newsockp, struct socket *sock)
+{
+	wait_queue_t   wait;
+	struct socket *newsock;
+	int	    rc;
+
+	init_waitqueue_entry(&wait, current);
+
+	/* XXX this should add a ref to sock->ops->owner, if
+	 * TCP could be a module */
+	rc = sock_create_lite(PF_PACKET, sock->type, IPPROTO_TCP, &newsock);
+	if (rc) {
+		CERROR("Can't allocate socket\n");
+		return rc;
+	}
+
+	newsock->ops = sock->ops;
+
+	rc = sock->ops->accept(sock, newsock, O_NONBLOCK);
+	if (rc == -EAGAIN) {
+		/* Nothing ready, so wait for activity */
+		set_current_state(TASK_INTERRUPTIBLE);
+		add_wait_queue(sk_sleep(sock->sk), &wait);
+		schedule();
+		remove_wait_queue(sk_sleep(sock->sk), &wait);
+		set_current_state(TASK_RUNNING);
+		rc = sock->ops->accept(sock, newsock, O_NONBLOCK);
+	}
+
+	if (rc != 0)
+		goto failed;
+
+	*newsockp = newsock;
+	return 0;
+
+ failed:
+	sock_release(newsock);
+	return rc;
+}
+
+EXPORT_SYMBOL(libcfs_sock_accept);
+
+void
+libcfs_sock_abort_accept (struct socket *sock)
+{
+	wake_up_all(sk_sleep(sock->sk));
+}
+
+EXPORT_SYMBOL(libcfs_sock_abort_accept);
+
+int
+libcfs_sock_connect (struct socket **sockp, int *fatal,
+		     __u32 local_ip, int local_port,
+		     __u32 peer_ip, int peer_port)
+{
+	struct sockaddr_in  srvaddr;
+	int		 rc;
+
+	rc = libcfs_sock_create(sockp, fatal, local_ip, local_port);
+	if (rc != 0)
+		return rc;
+
+	memset (&srvaddr, 0, sizeof (srvaddr));
+	srvaddr.sin_family = AF_INET;
+	srvaddr.sin_port = htons(peer_port);
+	srvaddr.sin_addr.s_addr = htonl(peer_ip);
+
+	rc = (*sockp)->ops->connect(*sockp,
+				    (struct sockaddr *)&srvaddr, sizeof(srvaddr),
+				    0);
+	if (rc == 0)
+		return 0;
+
+	/* EADDRNOTAVAIL probably means we're already connected to the same
+	 * peer/port on the same local port on a differently typed
+	 * connection.  Let our caller retry with a different local
+	 * port... */
+	*fatal = !(rc == -EADDRNOTAVAIL);
+
+	CDEBUG_LIMIT(*fatal ? D_NETERROR : D_NET,
+	       "Error %d connecting %pI4h/%d -> %pI4h/%d\n", rc,
+	       &local_ip, local_port, &peer_ip, peer_port);
+
+	sock_release(*sockp);
+	return rc;
+}
+
+EXPORT_SYMBOL(libcfs_sock_connect);
+
+void
+libcfs_sock_release (struct socket *sock)
+{
+	sock_release(sock);
+}
+
+EXPORT_SYMBOL(libcfs_sock_release);
diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.c b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.c
new file mode 100644
index 000000000..c8e293002
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.c
@@ -0,0 +1,275 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#define LUSTRE_TRACEFILE_PRIVATE
+
+#include "../../../include/linux/libcfs/libcfs.h"
+#include "../tracefile.h"
+
+/* percents to share the total debug memory for each type */
+static unsigned int pages_factor[CFS_TCD_TYPE_MAX] = {
+	80,  /* 80% pages for CFS_TCD_TYPE_PROC */
+	10,  /* 10% pages for CFS_TCD_TYPE_SOFTIRQ */
+	10   /* 10% pages for CFS_TCD_TYPE_IRQ */
+};
+
+char *cfs_trace_console_buffers[NR_CPUS][CFS_TCD_TYPE_MAX];
+
+struct rw_semaphore cfs_tracefile_sem;
+
+int cfs_tracefile_init_arch(void)
+{
+	int    i;
+	int    j;
+	struct cfs_trace_cpu_data *tcd;
+
+	init_rwsem(&cfs_tracefile_sem);
+
+	/* initialize trace_data */
+	memset(cfs_trace_data, 0, sizeof(cfs_trace_data));
+	for (i = 0; i < CFS_TCD_TYPE_MAX; i++) {
+		cfs_trace_data[i] =
+			kmalloc(sizeof(union cfs_trace_data_union) *
+				num_possible_cpus(), GFP_KERNEL);
+		if (cfs_trace_data[i] == NULL)
+			goto out;
+
+	}
+
+	/* arch related info initialized */
+	cfs_tcd_for_each(tcd, i, j) {
+		spin_lock_init(&tcd->tcd_lock);
+		tcd->tcd_pages_factor = pages_factor[i];
+		tcd->tcd_type = i;
+		tcd->tcd_cpu = j;
+	}
+
+	for (i = 0; i < num_possible_cpus(); i++)
+		for (j = 0; j < 3; j++) {
+			cfs_trace_console_buffers[i][j] =
+				kmalloc(CFS_TRACE_CONSOLE_BUFFER_SIZE,
+					GFP_KERNEL);
+
+			if (cfs_trace_console_buffers[i][j] == NULL)
+				goto out;
+		}
+
+	return 0;
+
+out:
+	cfs_tracefile_fini_arch();
+	printk(KERN_ERR "lnet: Not enough memory\n");
+	return -ENOMEM;
+}
+
+void cfs_tracefile_fini_arch(void)
+{
+	int    i;
+	int    j;
+
+	for (i = 0; i < num_possible_cpus(); i++)
+		for (j = 0; j < 3; j++)
+			if (cfs_trace_console_buffers[i][j] != NULL) {
+				kfree(cfs_trace_console_buffers[i][j]);
+				cfs_trace_console_buffers[i][j] = NULL;
+			}
+
+	for (i = 0; cfs_trace_data[i] != NULL; i++) {
+		kfree(cfs_trace_data[i]);
+		cfs_trace_data[i] = NULL;
+	}
+}
+
+void cfs_tracefile_read_lock(void)
+{
+	down_read(&cfs_tracefile_sem);
+}
+
+void cfs_tracefile_read_unlock(void)
+{
+	up_read(&cfs_tracefile_sem);
+}
+
+void cfs_tracefile_write_lock(void)
+{
+	down_write(&cfs_tracefile_sem);
+}
+
+void cfs_tracefile_write_unlock(void)
+{
+	up_write(&cfs_tracefile_sem);
+}
+
+cfs_trace_buf_type_t cfs_trace_buf_idx_get(void)
+{
+	if (in_irq())
+		return CFS_TCD_TYPE_IRQ;
+	else if (in_softirq())
+		return CFS_TCD_TYPE_SOFTIRQ;
+	else
+		return CFS_TCD_TYPE_PROC;
+}
+
+/*
+ * The walking argument indicates the locking comes from all tcd types
+ * iterator and we must lock it and dissable local irqs to avoid deadlocks
+ * with other interrupt locks that might be happening. See LU-1311
+ * for details.
+ */
+int cfs_trace_lock_tcd(struct cfs_trace_cpu_data *tcd, int walking)
+	__acquires(&tcd->tc_lock)
+{
+	__LASSERT(tcd->tcd_type < CFS_TCD_TYPE_MAX);
+	if (tcd->tcd_type == CFS_TCD_TYPE_IRQ)
+		spin_lock_irqsave(&tcd->tcd_lock, tcd->tcd_lock_flags);
+	else if (tcd->tcd_type == CFS_TCD_TYPE_SOFTIRQ)
+		spin_lock_bh(&tcd->tcd_lock);
+	else if (unlikely(walking))
+		spin_lock_irq(&tcd->tcd_lock);
+	else
+		spin_lock(&tcd->tcd_lock);
+	return 1;
+}
+
+void cfs_trace_unlock_tcd(struct cfs_trace_cpu_data *tcd, int walking)
+	__releases(&tcd->tcd_lock)
+{
+	__LASSERT(tcd->tcd_type < CFS_TCD_TYPE_MAX);
+	if (tcd->tcd_type == CFS_TCD_TYPE_IRQ)
+		spin_unlock_irqrestore(&tcd->tcd_lock, tcd->tcd_lock_flags);
+	else if (tcd->tcd_type == CFS_TCD_TYPE_SOFTIRQ)
+		spin_unlock_bh(&tcd->tcd_lock);
+	else if (unlikely(walking))
+		spin_unlock_irq(&tcd->tcd_lock);
+	else
+		spin_unlock(&tcd->tcd_lock);
+}
+
+int cfs_tcd_owns_tage(struct cfs_trace_cpu_data *tcd,
+		      struct cfs_trace_page *tage)
+{
+	/*
+	 * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT)
+	 * from here: this will lead to infinite recursion.
+	 */
+	return tcd->tcd_cpu == tage->cpu;
+}
+
+void
+cfs_set_ptldebug_header(struct ptldebug_header *header,
+			struct libcfs_debug_msg_data *msgdata,
+			unsigned long stack)
+{
+	struct timeval tv;
+
+	do_gettimeofday(&tv);
+
+	header->ph_subsys = msgdata->msg_subsys;
+	header->ph_mask = msgdata->msg_mask;
+	header->ph_cpu_id = smp_processor_id();
+	header->ph_type = cfs_trace_buf_idx_get();
+	header->ph_sec = (__u32)tv.tv_sec;
+	header->ph_usec = tv.tv_usec;
+	header->ph_stack = stack;
+	header->ph_pid = current->pid;
+	header->ph_line_num = msgdata->msg_line;
+	header->ph_extern_pid = 0;
+	return;
+}
+
+static char *
+dbghdr_to_err_string(struct ptldebug_header *hdr)
+{
+	switch (hdr->ph_subsys) {
+
+		case S_LND:
+		case S_LNET:
+			return "LNetError";
+		default:
+			return "LustreError";
+	}
+}
+
+static char *
+dbghdr_to_info_string(struct ptldebug_header *hdr)
+{
+	switch (hdr->ph_subsys) {
+
+		case S_LND:
+		case S_LNET:
+			return "LNet";
+		default:
+			return "Lustre";
+	}
+}
+
+void cfs_print_to_console(struct ptldebug_header *hdr, int mask,
+			  const char *buf, int len, const char *file,
+			  const char *fn)
+{
+	char *prefix = "Lustre", *ptype = NULL;
+
+	if ((mask & D_EMERG) != 0) {
+		prefix = dbghdr_to_err_string(hdr);
+		ptype = KERN_EMERG;
+	} else if ((mask & D_ERROR) != 0) {
+		prefix = dbghdr_to_err_string(hdr);
+		ptype = KERN_ERR;
+	} else if ((mask & D_WARNING) != 0) {
+		prefix = dbghdr_to_info_string(hdr);
+		ptype = KERN_WARNING;
+	} else if ((mask & (D_CONSOLE | libcfs_printk)) != 0) {
+		prefix = dbghdr_to_info_string(hdr);
+		ptype = KERN_INFO;
+	}
+
+	if ((mask & D_CONSOLE) != 0) {
+		printk("%s%s: %.*s", ptype, prefix, len, buf);
+	} else {
+		printk("%s%s: %d:%d:(%s:%d:%s()) %.*s", ptype, prefix,
+		       hdr->ph_pid, hdr->ph_extern_pid, file, hdr->ph_line_num,
+		       fn, len, buf);
+	}
+	return;
+}
+
+int cfs_trace_max_debug_mb(void)
+{
+	int  total_mb = (totalram_pages >> (20 - PAGE_SHIFT));
+
+	return max(512, (total_mb * 80)/100);
+}
diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.h b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.h
new file mode 100644
index 000000000..ba84e4ffd
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.h
@@ -0,0 +1,48 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LIBCFS_LINUX_TRACEFILE_H__
+#define __LIBCFS_LINUX_TRACEFILE_H__
+
+/**
+ * three types of trace_data in linux
+ */
+typedef enum {
+	CFS_TCD_TYPE_PROC = 0,
+	CFS_TCD_TYPE_SOFTIRQ,
+	CFS_TCD_TYPE_IRQ,
+	CFS_TCD_TYPE_MAX
+} cfs_trace_buf_type_t;
+
+#endif
diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/module.c b/kernel/drivers/staging/lustre/lustre/libcfs/module.c
new file mode 100644
index 000000000..f0ee76abf
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lustre/libcfs/module.c
@@ -0,0 +1,976 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <net/sock.h>
+#include <linux/uio.h>
+
+#include <linux/uaccess.h>
+
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/list.h>
+
+#include <linux/proc_fs.h>
+#include <linux/sysctl.h>
+
+# define DEBUG_SUBSYSTEM S_LNET
+
+#include "../../include/linux/libcfs/libcfs.h"
+#include <asm/div64.h>
+
+#include "../../include/linux/libcfs/libcfs_crypto.h"
+#include "../../include/linux/lnet/lib-lnet.h"
+#include "../../include/linux/lnet/lnet.h"
+#include "tracefile.h"
+
+MODULE_AUTHOR("Peter J. Braam <braam@clusterfs.com>");
+MODULE_DESCRIPTION("Portals v3.1");
+MODULE_LICENSE("GPL");
+
+extern struct miscdevice libcfs_dev;
+extern struct rw_semaphore cfs_tracefile_sem;
+extern struct mutex cfs_trace_thread_mutex;
+extern struct cfs_wi_sched *cfs_sched_rehash;
+extern void libcfs_init_nidstrings(void);
+
+static int insert_proc(void);
+static void remove_proc(void);
+
+static struct ctl_table_header *lnet_table_header;
+extern char lnet_upcall[1024];
+/**
+ * The path of debug log dump upcall script.
+ */
+extern char lnet_debug_log_upcall[1024];
+
+#define CTL_LNET	(0x100)
+
+enum {
+	PSDEV_DEBUG = 1,	  /* control debugging */
+	PSDEV_SUBSYSTEM_DEBUG,    /* control debugging */
+	PSDEV_PRINTK,	     /* force all messages to console */
+	PSDEV_CONSOLE_RATELIMIT,  /* ratelimit console messages */
+	PSDEV_CONSOLE_MAX_DELAY_CS, /* maximum delay over which we skip messages */
+	PSDEV_CONSOLE_MIN_DELAY_CS, /* initial delay over which we skip messages */
+	PSDEV_CONSOLE_BACKOFF,    /* delay increase factor */
+	PSDEV_DEBUG_PATH,	 /* crashdump log location */
+	PSDEV_DEBUG_DUMP_PATH,    /* crashdump tracelog location */
+	PSDEV_CPT_TABLE,	  /* information about cpu partitions */
+	PSDEV_LNET_UPCALL,	/* User mode upcall script  */
+	PSDEV_LNET_MEMUSED,       /* bytes currently PORTAL_ALLOCated */
+	PSDEV_LNET_CATASTROPHE,   /* if we have LBUGged or panic'd */
+	PSDEV_LNET_PANIC_ON_LBUG, /* flag to panic on LBUG */
+	PSDEV_LNET_DUMP_KERNEL,   /* snapshot kernel debug buffer to file */
+	PSDEV_LNET_DAEMON_FILE,   /* spool kernel debug buffer to file */
+	PSDEV_LNET_DEBUG_MB,      /* size of debug buffer */
+	PSDEV_LNET_DEBUG_LOG_UPCALL, /* debug log upcall script */
+	PSDEV_LNET_WATCHDOG_RATELIMIT,  /* ratelimit watchdog messages  */
+	PSDEV_LNET_FORCE_LBUG,    /* hook to force an LBUG */
+	PSDEV_LNET_FAIL_LOC,      /* control test failures instrumentation */
+	PSDEV_LNET_FAIL_VAL,      /* userdata for fail loc */
+};
+
+static void kportal_memhog_free (struct libcfs_device_userstate *ldu)
+{
+	struct page **level0p = &ldu->ldu_memhog_root_page;
+	struct page **level1p;
+	struct page **level2p;
+	int	   count1;
+	int	   count2;
+
+	if (*level0p != NULL) {
+
+		level1p = (struct page **)page_address(*level0p);
+		count1 = 0;
+
+		while (count1 < PAGE_CACHE_SIZE/sizeof(struct page *) &&
+		       *level1p != NULL) {
+
+			level2p = (struct page **)page_address(*level1p);
+			count2 = 0;
+
+			while (count2 < PAGE_CACHE_SIZE/sizeof(struct page *) &&
+			       *level2p != NULL) {
+
+				__free_page(*level2p);
+				ldu->ldu_memhog_pages--;
+				level2p++;
+				count2++;
+			}
+
+			__free_page(*level1p);
+			ldu->ldu_memhog_pages--;
+			level1p++;
+			count1++;
+		}
+
+		__free_page(*level0p);
+		ldu->ldu_memhog_pages--;
+
+		*level0p = NULL;
+	}
+
+	LASSERT (ldu->ldu_memhog_pages == 0);
+}
+
+static int kportal_memhog_alloc(struct libcfs_device_userstate *ldu, int npages,
+		     gfp_t flags)
+{
+	struct page **level0p;
+	struct page **level1p;
+	struct page **level2p;
+	int	   count1;
+	int	   count2;
+
+	LASSERT (ldu->ldu_memhog_pages == 0);
+	LASSERT (ldu->ldu_memhog_root_page == NULL);
+
+	if (npages < 0)
+		return -EINVAL;
+
+	if (npages == 0)
+		return 0;
+
+	level0p = &ldu->ldu_memhog_root_page;
+	*level0p = alloc_page(flags);
+	if (*level0p == NULL)
+		return -ENOMEM;
+	ldu->ldu_memhog_pages++;
+
+	level1p = (struct page **)page_address(*level0p);
+	count1 = 0;
+	memset(level1p, 0, PAGE_CACHE_SIZE);
+
+	while (ldu->ldu_memhog_pages < npages &&
+	       count1 < PAGE_CACHE_SIZE/sizeof(struct page *)) {
+
+		if (cfs_signal_pending())
+			return -EINTR;
+
+		*level1p = alloc_page(flags);
+		if (*level1p == NULL)
+			return -ENOMEM;
+		ldu->ldu_memhog_pages++;
+
+		level2p = (struct page **)page_address(*level1p);
+		count2 = 0;
+		memset(level2p, 0, PAGE_CACHE_SIZE);
+
+		while (ldu->ldu_memhog_pages < npages &&
+		       count2 < PAGE_CACHE_SIZE/sizeof(struct page *)) {
+
+			if (cfs_signal_pending())
+				return -EINTR;
+
+			*level2p = alloc_page(flags);
+			if (*level2p == NULL)
+				return -ENOMEM;
+			ldu->ldu_memhog_pages++;
+
+			level2p++;
+			count2++;
+		}
+
+		level1p++;
+		count1++;
+	}
+
+	return 0;
+}
+
+/* called when opening /dev/device */
+static int libcfs_psdev_open(unsigned long flags, void *args)
+{
+	struct libcfs_device_userstate *ldu;
+
+	try_module_get(THIS_MODULE);
+
+	LIBCFS_ALLOC(ldu, sizeof(*ldu));
+	if (ldu != NULL) {
+		ldu->ldu_memhog_pages = 0;
+		ldu->ldu_memhog_root_page = NULL;
+	}
+	*(struct libcfs_device_userstate **)args = ldu;
+
+	return 0;
+}
+
+/* called when closing /dev/device */
+static int libcfs_psdev_release(unsigned long flags, void *args)
+{
+	struct libcfs_device_userstate *ldu;
+
+	ldu = (struct libcfs_device_userstate *)args;
+	if (ldu != NULL) {
+		kportal_memhog_free(ldu);
+		LIBCFS_FREE(ldu, sizeof(*ldu));
+	}
+
+	module_put(THIS_MODULE);
+	return 0;
+}
+
+static struct rw_semaphore ioctl_list_sem;
+static struct list_head ioctl_list;
+
+int libcfs_register_ioctl(struct libcfs_ioctl_handler *hand)
+{
+	int rc = 0;
+
+	down_write(&ioctl_list_sem);
+	if (!list_empty(&hand->item))
+		rc = -EBUSY;
+	else
+		list_add_tail(&hand->item, &ioctl_list);
+	up_write(&ioctl_list_sem);
+
+	return rc;
+}
+EXPORT_SYMBOL(libcfs_register_ioctl);
+
+int libcfs_deregister_ioctl(struct libcfs_ioctl_handler *hand)
+{
+	int rc = 0;
+
+	down_write(&ioctl_list_sem);
+	if (list_empty(&hand->item))
+		rc = -ENOENT;
+	else
+		list_del_init(&hand->item);
+	up_write(&ioctl_list_sem);
+
+	return rc;
+}
+EXPORT_SYMBOL(libcfs_deregister_ioctl);
+
+static int libcfs_ioctl_int(struct cfs_psdev_file *pfile, unsigned long cmd,
+			    void *arg, struct libcfs_ioctl_data *data)
+{
+	int err = -EINVAL;
+
+	switch (cmd) {
+	case IOC_LIBCFS_CLEAR_DEBUG:
+		libcfs_debug_clear_buffer();
+		return 0;
+	/*
+	 * case IOC_LIBCFS_PANIC:
+	 * Handled in arch/cfs_module.c
+	 */
+	case IOC_LIBCFS_MARK_DEBUG:
+		if (data->ioc_inlbuf1 == NULL ||
+		    data->ioc_inlbuf1[data->ioc_inllen1 - 1] != '\0')
+			return -EINVAL;
+		libcfs_debug_mark_buffer(data->ioc_inlbuf1);
+		return 0;
+	case IOC_LIBCFS_MEMHOG:
+		if (pfile->private_data == NULL) {
+			err = -EINVAL;
+		} else {
+			kportal_memhog_free(pfile->private_data);
+			/* XXX The ioc_flags is not GFP flags now, need to be fixed */
+			err = kportal_memhog_alloc(pfile->private_data,
+						   data->ioc_count,
+						   data->ioc_flags);
+			if (err != 0)
+				kportal_memhog_free(pfile->private_data);
+		}
+		break;
+
+	case IOC_LIBCFS_PING_TEST: {
+		extern void (kping_client)(struct libcfs_ioctl_data *);
+		void (*ping)(struct libcfs_ioctl_data *);
+
+		CDEBUG(D_IOCTL, "doing %d pings to nid %s (%s)\n",
+		       data->ioc_count, libcfs_nid2str(data->ioc_nid),
+		       libcfs_nid2str(data->ioc_nid));
+		ping = symbol_get(kping_client);
+		if (!ping)
+			CERROR("symbol_get failed\n");
+		else {
+			ping(data);
+			symbol_put(kping_client);
+		}
+		return 0;
+	}
+
+	default: {
+		struct libcfs_ioctl_handler *hand;
+		err = -EINVAL;
+		down_read(&ioctl_list_sem);
+		list_for_each_entry(hand, &ioctl_list, item) {
+			err = hand->handle_ioctl(cmd, data);
+			if (err != -EINVAL) {
+				if (err == 0)
+					err = libcfs_ioctl_popdata(arg,
+							data, sizeof (*data));
+				break;
+			}
+		}
+		up_read(&ioctl_list_sem);
+		break;
+	}
+	}
+
+	return err;
+}
+
+static int libcfs_ioctl(struct cfs_psdev_file *pfile, unsigned long cmd, void *arg)
+{
+	char    *buf;
+	struct libcfs_ioctl_data *data;
+	int err = 0;
+
+	LIBCFS_ALLOC_GFP(buf, 1024, GFP_IOFS);
+	if (buf == NULL)
+		return -ENOMEM;
+
+	/* 'cmd' and permissions get checked in our arch-specific caller */
+	if (libcfs_ioctl_getdata(buf, buf + 800, (void *)arg)) {
+		CERROR("PORTALS ioctl: data error\n");
+		err = -EINVAL;
+		goto out;
+	}
+	data = (struct libcfs_ioctl_data *)buf;
+
+	err = libcfs_ioctl_int(pfile, cmd, arg, data);
+
+out:
+	LIBCFS_FREE(buf, 1024);
+	return err;
+}
+
+
+struct cfs_psdev_ops libcfs_psdev_ops = {
+	libcfs_psdev_open,
+	libcfs_psdev_release,
+	NULL,
+	NULL,
+	libcfs_ioctl
+};
+
+static int init_libcfs_module(void)
+{
+	int rc;
+
+	libcfs_arch_init();
+	libcfs_init_nidstrings();
+	init_rwsem(&cfs_tracefile_sem);
+	mutex_init(&cfs_trace_thread_mutex);
+	init_rwsem(&ioctl_list_sem);
+	INIT_LIST_HEAD(&ioctl_list);
+	init_waitqueue_head(&cfs_race_waitq);
+
+	rc = libcfs_debug_init(5 * 1024 * 1024);
+	if (rc < 0) {
+		pr_err("LustreError: libcfs_debug_init: %d\n", rc);
+		return rc;
+	}
+
+	rc = cfs_cpu_init();
+	if (rc != 0)
+		goto cleanup_debug;
+
+	rc = misc_register(&libcfs_dev);
+	if (rc) {
+		CERROR("misc_register: error %d\n", rc);
+		goto cleanup_cpu;
+	}
+
+	rc = cfs_wi_startup();
+	if (rc) {
+		CERROR("initialize workitem: error %d\n", rc);
+		goto cleanup_deregister;
+	}
+
+	/* max to 4 threads, should be enough for rehash */
+	rc = min(cfs_cpt_weight(cfs_cpt_table, CFS_CPT_ANY), 4);
+	rc = cfs_wi_sched_create("cfs_rh", cfs_cpt_table, CFS_CPT_ANY,
+				 rc, &cfs_sched_rehash);
+	if (rc != 0) {
+		CERROR("Startup workitem scheduler: error: %d\n", rc);
+		goto cleanup_deregister;
+	}
+
+	rc = cfs_crypto_register();
+	if (rc) {
+		CERROR("cfs_crypto_register: error %d\n", rc);
+		goto cleanup_wi;
+	}
+
+
+	rc = insert_proc();
+	if (rc) {
+		CERROR("insert_proc: error %d\n", rc);
+		goto cleanup_crypto;
+	}
+
+	CDEBUG (D_OTHER, "portals setup OK\n");
+	return 0;
+ cleanup_crypto:
+	cfs_crypto_unregister();
+ cleanup_wi:
+	cfs_wi_shutdown();
+ cleanup_deregister:
+	misc_deregister(&libcfs_dev);
+cleanup_cpu:
+	cfs_cpu_fini();
+ cleanup_debug:
+	libcfs_debug_cleanup();
+	return rc;
+}
+
+static void exit_libcfs_module(void)
+{
+	int rc;
+
+	remove_proc();
+
+	CDEBUG(D_MALLOC, "before Portals cleanup: kmem %d\n",
+	       atomic_read(&libcfs_kmemory));
+
+	if (cfs_sched_rehash != NULL) {
+		cfs_wi_sched_destroy(cfs_sched_rehash);
+		cfs_sched_rehash = NULL;
+	}
+
+	cfs_crypto_unregister();
+	cfs_wi_shutdown();
+
+	rc = misc_deregister(&libcfs_dev);
+	if (rc)
+		CERROR("misc_deregister error %d\n", rc);
+
+	cfs_cpu_fini();
+
+	if (atomic_read(&libcfs_kmemory) != 0)
+		CERROR("Portals memory leaked: %d bytes\n",
+		       atomic_read(&libcfs_kmemory));
+
+	rc = libcfs_debug_cleanup();
+	if (rc)
+		pr_err("LustreError: libcfs_debug_cleanup: %d\n", rc);
+
+	libcfs_arch_cleanup();
+}
+
+static int proc_call_handler(void *data, int write, loff_t *ppos,
+		void __user *buffer, size_t *lenp,
+		int (*handler)(void *data, int write,
+		loff_t pos, void __user *buffer, int len))
+{
+	int rc = handler(data, write, *ppos, buffer, *lenp);
+
+	if (rc < 0)
+		return rc;
+
+	if (write) {
+		*ppos += *lenp;
+	} else {
+		*lenp = rc;
+		*ppos += rc;
+	}
+	return 0;
+}
+
+static int __proc_dobitmasks(void *data, int write,
+			     loff_t pos, void __user *buffer, int nob)
+{
+	const int     tmpstrlen = 512;
+	char	 *tmpstr;
+	int	   rc;
+	unsigned int *mask = data;
+	int	   is_subsys = (mask == &libcfs_subsystem_debug) ? 1 : 0;
+	int	   is_printk = (mask == &libcfs_printk) ? 1 : 0;
+
+	rc = cfs_trace_allocate_string_buffer(&tmpstr, tmpstrlen);
+	if (rc < 0)
+		return rc;
+
+	if (!write) {
+		libcfs_debug_mask2str(tmpstr, tmpstrlen, *mask, is_subsys);
+		rc = strlen(tmpstr);
+
+		if (pos >= rc) {
+			rc = 0;
+		} else {
+			rc = cfs_trace_copyout_string(buffer, nob,
+						      tmpstr + pos, "\n");
+		}
+	} else {
+		rc = cfs_trace_copyin_string(tmpstr, tmpstrlen, buffer, nob);
+		if (rc < 0) {
+			cfs_trace_free_string_buffer(tmpstr, tmpstrlen);
+			return rc;
+		}
+
+		rc = libcfs_debug_str2mask(mask, tmpstr, is_subsys);
+		/* Always print LBUG/LASSERT to console, so keep this mask */
+		if (is_printk)
+			*mask |= D_EMERG;
+	}
+
+	cfs_trace_free_string_buffer(tmpstr, tmpstrlen);
+	return rc;
+}
+
+static int proc_dobitmasks(struct ctl_table *table, int write,
+			   void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return proc_call_handler(table->data, write, ppos, buffer, lenp,
+				 __proc_dobitmasks);
+}
+
+static int min_watchdog_ratelimit;	  /* disable ratelimiting */
+static int max_watchdog_ratelimit = (24*60*60); /* limit to once per day */
+
+static int __proc_dump_kernel(void *data, int write,
+			      loff_t pos, void __user *buffer, int nob)
+{
+	if (!write)
+		return 0;
+
+	return cfs_trace_dump_debug_buffer_usrstr(buffer, nob);
+}
+
+static int proc_dump_kernel(struct ctl_table *table, int write,
+			    void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return proc_call_handler(table->data, write, ppos, buffer, lenp,
+				 __proc_dump_kernel);
+}
+
+static int __proc_daemon_file(void *data, int write,
+			      loff_t pos, void __user *buffer, int nob)
+{
+	if (!write) {
+		int len = strlen(cfs_tracefile);
+
+		if (pos >= len)
+			return 0;
+
+		return cfs_trace_copyout_string(buffer, nob,
+						cfs_tracefile + pos, "\n");
+	}
+
+	return cfs_trace_daemon_command_usrstr(buffer, nob);
+}
+
+static int proc_daemon_file(struct ctl_table *table, int write,
+			    void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return proc_call_handler(table->data, write, ppos, buffer, lenp,
+				 __proc_daemon_file);
+}
+
+static int __proc_debug_mb(void *data, int write,
+			   loff_t pos, void __user *buffer, int nob)
+{
+	if (!write) {
+		char tmpstr[32];
+		int  len = snprintf(tmpstr, sizeof(tmpstr), "%d",
+				    cfs_trace_get_debug_mb());
+
+		if (pos >= len)
+			return 0;
+
+		return cfs_trace_copyout_string(buffer, nob, tmpstr + pos,
+		       "\n");
+	}
+
+	return cfs_trace_set_debug_mb_usrstr(buffer, nob);
+}
+
+static int proc_debug_mb(struct ctl_table *table, int write,
+			 void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return proc_call_handler(table->data, write, ppos, buffer, lenp,
+				 __proc_debug_mb);
+}
+
+static int proc_console_max_delay_cs(struct ctl_table *table, int write,
+				     void __user *buffer, size_t *lenp,
+				     loff_t *ppos)
+{
+	int rc, max_delay_cs;
+	struct ctl_table dummy = *table;
+	long d;
+
+	dummy.data = &max_delay_cs;
+	dummy.proc_handler = &proc_dointvec;
+
+	if (!write) { /* read */
+		max_delay_cs = cfs_duration_sec(libcfs_console_max_delay * 100);
+		rc = proc_dointvec(&dummy, write, buffer, lenp, ppos);
+		return rc;
+	}
+
+	/* write */
+	max_delay_cs = 0;
+	rc = proc_dointvec(&dummy, write, buffer, lenp, ppos);
+	if (rc < 0)
+		return rc;
+	if (max_delay_cs <= 0)
+		return -EINVAL;
+
+	d = cfs_time_seconds(max_delay_cs) / 100;
+	if (d == 0 || d < libcfs_console_min_delay)
+		return -EINVAL;
+	libcfs_console_max_delay = d;
+
+	return rc;
+}
+
+static int proc_console_min_delay_cs(struct ctl_table *table, int write,
+				     void __user *buffer, size_t *lenp,
+				     loff_t *ppos)
+{
+	int rc, min_delay_cs;
+	struct ctl_table dummy = *table;
+	long d;
+
+	dummy.data = &min_delay_cs;
+	dummy.proc_handler = &proc_dointvec;
+
+	if (!write) { /* read */
+		min_delay_cs = cfs_duration_sec(libcfs_console_min_delay * 100);
+		rc = proc_dointvec(&dummy, write, buffer, lenp, ppos);
+		return rc;
+	}
+
+	/* write */
+	min_delay_cs = 0;
+	rc = proc_dointvec(&dummy, write, buffer, lenp, ppos);
+	if (rc < 0)
+		return rc;
+	if (min_delay_cs <= 0)
+		return -EINVAL;
+
+	d = cfs_time_seconds(min_delay_cs) / 100;
+	if (d == 0 || d > libcfs_console_max_delay)
+		return -EINVAL;
+	libcfs_console_min_delay = d;
+
+	return rc;
+}
+
+static int proc_console_backoff(struct ctl_table *table, int write,
+				void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int rc, backoff;
+	struct ctl_table dummy = *table;
+
+	dummy.data = &backoff;
+	dummy.proc_handler = &proc_dointvec;
+
+	if (!write) { /* read */
+		backoff = libcfs_console_backoff;
+		rc = proc_dointvec(&dummy, write, buffer, lenp, ppos);
+		return rc;
+	}
+
+	/* write */
+	backoff = 0;
+	rc = proc_dointvec(&dummy, write, buffer, lenp, ppos);
+	if (rc < 0)
+		return rc;
+	if (backoff <= 0)
+		return -EINVAL;
+
+	libcfs_console_backoff = backoff;
+
+	return rc;
+}
+
+static int libcfs_force_lbug(struct ctl_table *table, int write,
+			     void __user *buffer,
+			     size_t *lenp, loff_t *ppos)
+{
+	if (write)
+		LBUG();
+	return 0;
+}
+
+static int proc_fail_loc(struct ctl_table *table, int write,
+			 void __user *buffer,
+			 size_t *lenp, loff_t *ppos)
+{
+	int rc;
+	long old_fail_loc = cfs_fail_loc;
+
+	rc = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
+	if (old_fail_loc != cfs_fail_loc)
+		wake_up(&cfs_race_waitq);
+	return rc;
+}
+
+static int __proc_cpt_table(void *data, int write,
+			    loff_t pos, void __user *buffer, int nob)
+{
+	char *buf = NULL;
+	int   len = 4096;
+	int   rc  = 0;
+
+	if (write)
+		return -EPERM;
+
+	LASSERT(cfs_cpt_table != NULL);
+
+	while (1) {
+		LIBCFS_ALLOC(buf, len);
+		if (buf == NULL)
+			return -ENOMEM;
+
+		rc = cfs_cpt_table_print(cfs_cpt_table, buf, len);
+		if (rc >= 0)
+			break;
+
+		if (rc == -EFBIG) {
+			LIBCFS_FREE(buf, len);
+			len <<= 1;
+			continue;
+		}
+		goto out;
+	}
+
+	if (pos >= rc) {
+		rc = 0;
+		goto out;
+	}
+
+	rc = cfs_trace_copyout_string(buffer, nob, buf + pos, NULL);
+ out:
+	if (buf != NULL)
+		LIBCFS_FREE(buf, len);
+	return rc;
+}
+
+static int proc_cpt_table(struct ctl_table *table, int write,
+			   void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return proc_call_handler(table->data, write, ppos, buffer, lenp,
+				 __proc_cpt_table);
+}
+
+static struct ctl_table lnet_table[] = {
+	/*
+	 * NB No .strategy entries have been provided since sysctl(8) prefers
+	 * to go via /proc for portability.
+	 */
+	{
+		.procname = "debug",
+		.data     = &libcfs_debug,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dobitmasks,
+	},
+	{
+		.procname = "subsystem_debug",
+		.data     = &libcfs_subsystem_debug,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dobitmasks,
+	},
+	{
+		.procname = "printk",
+		.data     = &libcfs_printk,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dobitmasks,
+	},
+	{
+		.procname = "console_ratelimit",
+		.data     = &libcfs_console_ratelimit,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.procname = "console_max_delay_centisecs",
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_console_max_delay_cs
+	},
+	{
+		.procname = "console_min_delay_centisecs",
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_console_min_delay_cs
+	},
+	{
+		.procname = "console_backoff",
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_console_backoff
+	},
+
+	{
+		.procname = "debug_path",
+		.data     = libcfs_debug_file_path_arr,
+		.maxlen   = sizeof(libcfs_debug_file_path_arr),
+		.mode     = 0644,
+		.proc_handler = &proc_dostring,
+	},
+
+	{
+		.procname = "cpu_partition_table",
+		.maxlen   = 128,
+		.mode     = 0444,
+		.proc_handler = &proc_cpt_table,
+	},
+
+	{
+		.procname = "upcall",
+		.data     = lnet_upcall,
+		.maxlen   = sizeof(lnet_upcall),
+		.mode     = 0644,
+		.proc_handler = &proc_dostring,
+	},
+	{
+		.procname = "debug_log_upcall",
+		.data     = lnet_debug_log_upcall,
+		.maxlen   = sizeof(lnet_debug_log_upcall),
+		.mode     = 0644,
+		.proc_handler = &proc_dostring,
+	},
+	{
+		.procname = "lnet_memused",
+		.data     = (int *)&libcfs_kmemory.counter,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec,
+	},
+	{
+		.procname = "catastrophe",
+		.data     = &libcfs_catastrophe,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec,
+	},
+	{
+		.procname = "panic_on_lbug",
+		.data     = &libcfs_panic_on_lbug,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec,
+	},
+	{
+		.procname = "dump_kernel",
+		.maxlen   = 256,
+		.mode     = 0200,
+		.proc_handler = &proc_dump_kernel,
+	},
+	{
+		.procname = "daemon_file",
+		.mode     = 0644,
+		.maxlen   = 256,
+		.proc_handler = &proc_daemon_file,
+	},
+	{
+		.procname = "debug_mb",
+		.mode     = 0644,
+		.proc_handler = &proc_debug_mb,
+	},
+	{
+		.procname = "watchdog_ratelimit",
+		.data     = &libcfs_watchdog_ratelimit,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec_minmax,
+		.extra1   = &min_watchdog_ratelimit,
+		.extra2   = &max_watchdog_ratelimit,
+	},
+	{
+		.procname = "force_lbug",
+		.data     = NULL,
+		.maxlen   = 0,
+		.mode     = 0200,
+		.proc_handler = &libcfs_force_lbug
+	},
+	{
+		.procname = "fail_loc",
+		.data     = &cfs_fail_loc,
+		.maxlen   = sizeof(cfs_fail_loc),
+		.mode     = 0644,
+		.proc_handler = &proc_fail_loc
+	},
+	{
+		.procname = "fail_val",
+		.data     = &cfs_fail_val,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec
+	},
+	{
+	}
+};
+
+static struct ctl_table top_table[] = {
+	{
+		.procname = "lnet",
+		.mode     = 0555,
+		.data     = NULL,
+		.maxlen   = 0,
+		.child    = lnet_table,
+	},
+	{
+	}
+};
+
+static int insert_proc(void)
+{
+	if (lnet_table_header == NULL)
+		lnet_table_header = register_sysctl_table(top_table);
+	return 0;
+}
+
+static void remove_proc(void)
+{
+	if (lnet_table_header != NULL)
+		unregister_sysctl_table(lnet_table_header);
+
+	lnet_table_header = NULL;
+}
+
+MODULE_VERSION("1.0.0");
+
+module_init(init_libcfs_module);
+module_exit(exit_libcfs_module);
diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/nidstrings.c b/kernel/drivers/staging/lustre/lustre/libcfs/nidstrings.c
new file mode 100644
index 000000000..087449f4e
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lustre/libcfs/nidstrings.c
@@ -0,0 +1,842 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/nidstrings.c
+ *
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include "../../include/linux/libcfs/libcfs.h"
+#include "../../include/linux/lnet/lnet.h"
+
+/* CAVEAT VENDITOR! Keep the canonical string representation of nets/nids
+ * consistent in all conversion functions.  Some code fragments are copied
+ * around for the sake of clarity...
+ */
+
+/* CAVEAT EMPTOR! Racey temporary buffer allocation!
+ * Choose the number of nidstrings to support the MAXIMUM expected number of
+ * concurrent users.  If there are more, the returned string will be volatile.
+ * NB this number must allow for a process to be descheduled for a timeslice
+ * between getting its string and using it.
+ */
+
+static char      libcfs_nidstrings[LNET_NIDSTR_COUNT][LNET_NIDSTR_SIZE];
+static int       libcfs_nidstring_idx;
+
+static spinlock_t libcfs_nidstring_lock;
+
+void libcfs_init_nidstrings(void)
+{
+	spin_lock_init(&libcfs_nidstring_lock);
+}
+
+static char *
+libcfs_next_nidstring(void)
+{
+	char	  *str;
+	unsigned long  flags;
+
+	spin_lock_irqsave(&libcfs_nidstring_lock, flags);
+
+	str = libcfs_nidstrings[libcfs_nidstring_idx++];
+	if (libcfs_nidstring_idx == ARRAY_SIZE(libcfs_nidstrings))
+		libcfs_nidstring_idx = 0;
+
+	spin_unlock_irqrestore(&libcfs_nidstring_lock, flags);
+	return str;
+}
+
+static int libcfs_lo_str2addr(const char *str, int nob, __u32 *addr)
+{
+	*addr = 0;
+	return 1;
+}
+
+static void libcfs_ip_addr2str(__u32 addr, char *str)
+{
+	snprintf(str, LNET_NIDSTR_SIZE, "%u.%u.%u.%u",
+		 (addr >> 24) & 0xff, (addr >> 16) & 0xff,
+		 (addr >> 8) & 0xff, addr & 0xff);
+}
+
+static int libcfs_ip_str2addr(const char *str, int nob, __u32 *addr)
+{
+	unsigned int	a;
+	unsigned int	b;
+	unsigned int	c;
+	unsigned int	d;
+	int		n = nob; /* XscanfX */
+
+	/* numeric IP? */
+	if (sscanf(str, "%u.%u.%u.%u%n", &a, &b, &c, &d, &n) >= 4 &&
+	    n == nob &&
+	    (a & ~0xff) == 0 && (b & ~0xff) == 0 &&
+	    (c & ~0xff) == 0 && (d & ~0xff) == 0) {
+		*addr = ((a<<24)|(b<<16)|(c<<8)|d);
+		return 1;
+	}
+
+	return 0;
+}
+
+static void libcfs_decnum_addr2str(__u32 addr, char *str)
+{
+	snprintf(str, LNET_NIDSTR_SIZE, "%u", addr);
+}
+
+static void libcfs_hexnum_addr2str(__u32 addr, char *str)
+{
+	snprintf(str, LNET_NIDSTR_SIZE, "0x%x", addr);
+}
+
+static int libcfs_num_str2addr(const char *str, int nob, __u32 *addr)
+{
+	int     n;
+
+	n = nob;
+	if (sscanf(str, "0x%x%n", addr, &n) >= 1 && n == nob)
+		return 1;
+
+	n = nob;
+	if (sscanf(str, "0X%x%n", addr, &n) >= 1 && n == nob)
+		return 1;
+
+	n = nob;
+	if (sscanf(str, "%u%n", addr, &n) >= 1 && n == nob)
+		return 1;
+
+	return 0;
+}
+
+/**
+ * Nf_parse_addrlist method for networks using numeric addresses.
+ *
+ * Examples of such networks are gm and elan.
+ *
+ * \retval 0 if \a str parsed to numeric address
+ * \retval errno otherwise
+ */
+static int
+libcfs_num_parse(char *str, int len, struct list_head *list)
+{
+	struct cfs_expr_list *el;
+	int	rc;
+
+	rc = cfs_expr_list_parse(str, len, 0, MAX_NUMERIC_VALUE, &el);
+	if (rc == 0)
+		list_add_tail(&el->el_link, list);
+
+	return rc;
+}
+
+/*
+ * Nf_match_addr method for networks using numeric addresses
+ *
+ * \retval 1 on match
+ * \retval 0 otherwise
+ */
+static int
+libcfs_num_match(__u32 addr, struct list_head *numaddr)
+{
+	struct cfs_expr_list *el;
+
+	LASSERT(!list_empty(numaddr));
+	el = list_entry(numaddr->next, struct cfs_expr_list, el_link);
+
+	return cfs_expr_list_match(addr, el);
+}
+
+struct netstrfns {
+	int	  nf_type;
+	char	*nf_name;
+	char	*nf_modname;
+	void       (*nf_addr2str)(__u32 addr, char *str);
+	int	(*nf_str2addr)(const char *str, int nob, __u32 *addr);
+	int	(*nf_parse_addrlist)(char *str, int len,
+					struct list_head *list);
+	int	(*nf_match_addr)(__u32 addr, struct list_head *list);
+};
+
+static struct netstrfns  libcfs_netstrfns[] = {
+	{/* .nf_type      */  LOLND,
+	 /* .nf_name      */  "lo",
+	 /* .nf_modname   */  "klolnd",
+	 /* .nf_addr2str  */  libcfs_decnum_addr2str,
+	 /* .nf_str2addr  */  libcfs_lo_str2addr,
+	 /* .nf_parse_addr*/  libcfs_num_parse,
+	 /* .nf_match_addr*/  libcfs_num_match},
+	{/* .nf_type      */  SOCKLND,
+	 /* .nf_name      */  "tcp",
+	 /* .nf_modname   */  "ksocklnd",
+	 /* .nf_addr2str  */  libcfs_ip_addr2str,
+	 /* .nf_str2addr  */  libcfs_ip_str2addr,
+	 /* .nf_parse_addrlist*/  cfs_ip_addr_parse,
+	 /* .nf_match_addr*/  cfs_ip_addr_match},
+	{/* .nf_type      */  O2IBLND,
+	 /* .nf_name      */  "o2ib",
+	 /* .nf_modname   */  "ko2iblnd",
+	 /* .nf_addr2str  */  libcfs_ip_addr2str,
+	 /* .nf_str2addr  */  libcfs_ip_str2addr,
+	 /* .nf_parse_addrlist*/  cfs_ip_addr_parse,
+	 /* .nf_match_addr*/  cfs_ip_addr_match},
+	{/* .nf_type      */  CIBLND,
+	 /* .nf_name      */  "cib",
+	 /* .nf_modname   */  "kciblnd",
+	 /* .nf_addr2str  */  libcfs_ip_addr2str,
+	 /* .nf_str2addr  */  libcfs_ip_str2addr,
+	 /* .nf_parse_addrlist*/  cfs_ip_addr_parse,
+	 /* .nf_match_addr*/  cfs_ip_addr_match},
+	{/* .nf_type      */  OPENIBLND,
+	 /* .nf_name      */  "openib",
+	 /* .nf_modname   */  "kopeniblnd",
+	 /* .nf_addr2str  */  libcfs_ip_addr2str,
+	 /* .nf_str2addr  */  libcfs_ip_str2addr,
+	 /* .nf_parse_addrlist*/  cfs_ip_addr_parse,
+	 /* .nf_match_addr*/  cfs_ip_addr_match},
+	{/* .nf_type      */  IIBLND,
+	 /* .nf_name      */  "iib",
+	 /* .nf_modname   */  "kiiblnd",
+	 /* .nf_addr2str  */  libcfs_ip_addr2str,
+	 /* .nf_str2addr  */  libcfs_ip_str2addr,
+	 /* .nf_parse_addrlist*/  cfs_ip_addr_parse,
+	 /* .nf_match_addr*/  cfs_ip_addr_match},
+	{/* .nf_type      */  VIBLND,
+	 /* .nf_name      */  "vib",
+	 /* .nf_modname   */  "kviblnd",
+	 /* .nf_addr2str  */  libcfs_ip_addr2str,
+	 /* .nf_str2addr  */  libcfs_ip_str2addr,
+	 /* .nf_parse_addrlist*/  cfs_ip_addr_parse,
+	 /* .nf_match_addr*/  cfs_ip_addr_match},
+	{/* .nf_type      */  RALND,
+	 /* .nf_name      */  "ra",
+	 /* .nf_modname   */  "kralnd",
+	 /* .nf_addr2str  */  libcfs_ip_addr2str,
+	 /* .nf_str2addr  */  libcfs_ip_str2addr,
+	 /* .nf_parse_addrlist*/  cfs_ip_addr_parse,
+	 /* .nf_match_addr*/  cfs_ip_addr_match},
+	{/* .nf_type      */  QSWLND,
+	 /* .nf_name      */  "elan",
+	 /* .nf_modname   */  "kqswlnd",
+	 /* .nf_addr2str  */  libcfs_decnum_addr2str,
+	 /* .nf_str2addr  */  libcfs_num_str2addr,
+	 /* .nf_parse_addrlist*/  libcfs_num_parse,
+	 /* .nf_match_addr*/  libcfs_num_match},
+	{/* .nf_type      */  GMLND,
+	 /* .nf_name      */  "gm",
+	 /* .nf_modname   */  "kgmlnd",
+	 /* .nf_addr2str  */  libcfs_hexnum_addr2str,
+	 /* .nf_str2addr  */  libcfs_num_str2addr,
+	 /* .nf_parse_addrlist*/  libcfs_num_parse,
+	 /* .nf_match_addr*/  libcfs_num_match},
+	{/* .nf_type      */  MXLND,
+	 /* .nf_name      */  "mx",
+	 /* .nf_modname   */  "kmxlnd",
+	 /* .nf_addr2str  */  libcfs_ip_addr2str,
+	 /* .nf_str2addr  */  libcfs_ip_str2addr,
+	 /* .nf_parse_addrlist*/  cfs_ip_addr_parse,
+	 /* .nf_match_addr*/  cfs_ip_addr_match},
+	{/* .nf_type      */  PTLLND,
+	 /* .nf_name      */  "ptl",
+	 /* .nf_modname   */  "kptllnd",
+	 /* .nf_addr2str  */  libcfs_decnum_addr2str,
+	 /* .nf_str2addr  */  libcfs_num_str2addr,
+	 /* .nf_parse_addrlist*/  libcfs_num_parse,
+	 /* .nf_match_addr*/  libcfs_num_match},
+	{/* .nf_type      */  GNILND,
+	 /* .nf_name      */  "gni",
+	 /* .nf_modname   */  "kgnilnd",
+	 /* .nf_addr2str  */  libcfs_decnum_addr2str,
+	 /* .nf_str2addr  */  libcfs_num_str2addr,
+	 /* .nf_parse_addrlist*/  libcfs_num_parse,
+	 /* .nf_match_addr*/  libcfs_num_match},
+	/* placeholder for net0 alias.  It MUST BE THE LAST ENTRY */
+	{/* .nf_type      */  -1},
+};
+
+static const int libcfs_nnetstrfns = ARRAY_SIZE(libcfs_netstrfns);
+
+/* CAVEAT EMPTOR XscanfX
+ * I use "%n" at the end of a sscanf format to detect trailing junk.  However
+ * sscanf may return immediately if it sees the terminating '0' in a string, so
+ * I initialise the %n variable to the expected length.  If sscanf sets it;
+ * fine, if it doesn't, then the scan ended at the end of the string, which is
+ * fine too :) */
+
+static struct netstrfns *
+libcfs_lnd2netstrfns(int lnd)
+{
+	int    i;
+
+	if (lnd >= 0)
+		for (i = 0; i < libcfs_nnetstrfns; i++)
+			if (lnd == libcfs_netstrfns[i].nf_type)
+				return &libcfs_netstrfns[i];
+
+	return NULL;
+}
+
+static struct netstrfns *
+libcfs_namenum2netstrfns(const char *name)
+{
+	struct netstrfns *nf;
+	int	       i;
+
+	for (i = 0; i < libcfs_nnetstrfns; i++) {
+		nf = &libcfs_netstrfns[i];
+		if (nf->nf_type >= 0 &&
+		    !strncmp(name, nf->nf_name, strlen(nf->nf_name)))
+			return nf;
+	}
+	return NULL;
+}
+
+static struct netstrfns *
+libcfs_name2netstrfns(const char *name)
+{
+	int    i;
+
+	for (i = 0; i < libcfs_nnetstrfns; i++)
+		if (libcfs_netstrfns[i].nf_type >= 0 &&
+		    !strcmp(libcfs_netstrfns[i].nf_name, name))
+			return &libcfs_netstrfns[i];
+
+	return NULL;
+}
+
+int
+libcfs_isknown_lnd(int type)
+{
+	return libcfs_lnd2netstrfns(type) != NULL;
+}
+EXPORT_SYMBOL(libcfs_isknown_lnd);
+
+char *
+libcfs_lnd2modname(int lnd)
+{
+	struct netstrfns *nf = libcfs_lnd2netstrfns(lnd);
+
+	return (nf == NULL) ? NULL : nf->nf_modname;
+}
+EXPORT_SYMBOL(libcfs_lnd2modname);
+
+char *
+libcfs_lnd2str(int lnd)
+{
+	char	   *str;
+	struct netstrfns *nf = libcfs_lnd2netstrfns(lnd);
+
+	if (nf != NULL)
+		return nf->nf_name;
+
+	str = libcfs_next_nidstring();
+	snprintf(str, LNET_NIDSTR_SIZE, "?%d?", lnd);
+	return str;
+}
+EXPORT_SYMBOL(libcfs_lnd2str);
+
+int
+libcfs_str2lnd(const char *str)
+{
+	struct netstrfns *nf = libcfs_name2netstrfns(str);
+
+	if (nf != NULL)
+		return nf->nf_type;
+
+	return -1;
+}
+EXPORT_SYMBOL(libcfs_str2lnd);
+
+char *
+libcfs_net2str(__u32 net)
+{
+	int	       lnd = LNET_NETTYP(net);
+	int	       num = LNET_NETNUM(net);
+	struct netstrfns *nf  = libcfs_lnd2netstrfns(lnd);
+	char	     *str = libcfs_next_nidstring();
+
+	if (nf == NULL)
+		snprintf(str, LNET_NIDSTR_SIZE, "<%d:%d>", lnd, num);
+	else if (num == 0)
+		snprintf(str, LNET_NIDSTR_SIZE, "%s", nf->nf_name);
+	else
+		snprintf(str, LNET_NIDSTR_SIZE, "%s%d", nf->nf_name, num);
+
+	return str;
+}
+EXPORT_SYMBOL(libcfs_net2str);
+
+char *
+libcfs_nid2str(lnet_nid_t nid)
+{
+	__u32	     addr = LNET_NIDADDR(nid);
+	__u32	     net = LNET_NIDNET(nid);
+	int	       lnd = LNET_NETTYP(net);
+	int	       nnum = LNET_NETNUM(net);
+	struct netstrfns *nf;
+	char	     *str;
+	int	       nob;
+
+	if (nid == LNET_NID_ANY)
+		return "<?>";
+
+	nf = libcfs_lnd2netstrfns(lnd);
+	str = libcfs_next_nidstring();
+
+	if (nf == NULL)
+		snprintf(str, LNET_NIDSTR_SIZE, "%x@<%d:%d>", addr, lnd, nnum);
+	else {
+		nf->nf_addr2str(addr, str);
+		nob = strlen(str);
+		if (nnum == 0)
+			snprintf(str + nob, LNET_NIDSTR_SIZE - nob, "@%s",
+				 nf->nf_name);
+		else
+			snprintf(str + nob, LNET_NIDSTR_SIZE - nob, "@%s%d",
+				 nf->nf_name, nnum);
+	}
+
+	return str;
+}
+EXPORT_SYMBOL(libcfs_nid2str);
+
+static struct netstrfns *
+libcfs_str2net_internal(const char *str, __u32 *net)
+{
+	struct netstrfns *uninitialized_var(nf);
+	int	       nob;
+	unsigned int   netnum;
+	int	       i;
+
+	for (i = 0; i < libcfs_nnetstrfns; i++) {
+		nf = &libcfs_netstrfns[i];
+		if (nf->nf_type >= 0 &&
+		    !strncmp(str, nf->nf_name, strlen(nf->nf_name)))
+			break;
+	}
+
+	if (i == libcfs_nnetstrfns)
+		return NULL;
+
+	nob = strlen(nf->nf_name);
+
+	if (strlen(str) == (unsigned int)nob) {
+		netnum = 0;
+	} else {
+		if (nf->nf_type == LOLND) /* net number not allowed */
+			return NULL;
+
+		str += nob;
+		i = strlen(str);
+		if (sscanf(str, "%u%n", &netnum, &i) < 1 ||
+		    i != (int)strlen(str))
+			return NULL;
+	}
+
+	*net = LNET_MKNET(nf->nf_type, netnum);
+	return nf;
+}
+
+__u32
+libcfs_str2net(const char *str)
+{
+	__u32  net;
+
+	if (libcfs_str2net_internal(str, &net) != NULL)
+		return net;
+
+	return LNET_NIDNET(LNET_NID_ANY);
+}
+EXPORT_SYMBOL(libcfs_str2net);
+
+lnet_nid_t
+libcfs_str2nid(const char *str)
+{
+	const char       *sep = strchr(str, '@');
+	struct netstrfns *nf;
+	__u32	     net;
+	__u32	     addr;
+
+	if (sep != NULL) {
+		nf = libcfs_str2net_internal(sep + 1, &net);
+		if (nf == NULL)
+			return LNET_NID_ANY;
+	} else {
+		sep = str + strlen(str);
+		net = LNET_MKNET(SOCKLND, 0);
+		nf = libcfs_lnd2netstrfns(SOCKLND);
+		LASSERT(nf != NULL);
+	}
+
+	if (!nf->nf_str2addr(str, (int)(sep - str), &addr))
+		return LNET_NID_ANY;
+
+	return LNET_MKNID(net, addr);
+}
+EXPORT_SYMBOL(libcfs_str2nid);
+
+char *
+libcfs_id2str(lnet_process_id_t id)
+{
+	char *str = libcfs_next_nidstring();
+
+	if (id.pid == LNET_PID_ANY) {
+		snprintf(str, LNET_NIDSTR_SIZE,
+			 "LNET_PID_ANY-%s", libcfs_nid2str(id.nid));
+		return str;
+	}
+
+	snprintf(str, LNET_NIDSTR_SIZE, "%s%u-%s",
+		 ((id.pid & LNET_PID_USERFLAG) != 0) ? "U" : "",
+		 (id.pid & ~LNET_PID_USERFLAG), libcfs_nid2str(id.nid));
+	return str;
+}
+EXPORT_SYMBOL(libcfs_id2str);
+
+int
+libcfs_str2anynid(lnet_nid_t *nidp, const char *str)
+{
+	if (!strcmp(str, "*")) {
+		*nidp = LNET_NID_ANY;
+		return 1;
+	}
+
+	*nidp = libcfs_str2nid(str);
+	return *nidp != LNET_NID_ANY;
+}
+EXPORT_SYMBOL(libcfs_str2anynid);
+
+/**
+ * Nid range list syntax.
+ * \verbatim
+ *
+ * <nidlist>	 :== <nidrange> [ ' ' <nidrange> ]
+ * <nidrange>	:== <addrrange> '@' <net>
+ * <addrrange>       :== '*' |
+ *		       <ipaddr_range> |
+ *			 <cfs_expr_list>
+ * <ipaddr_range>    :== <cfs_expr_list>.<cfs_expr_list>.<cfs_expr_list>.
+ *			 <cfs_expr_list>
+ * <cfs_expr_list>   :== <number> |
+ *		       <expr_list>
+ * <expr_list>       :== '[' <range_expr> [ ',' <range_expr>] ']'
+ * <range_expr>      :== <number> |
+ *		       <number> '-' <number> |
+ *		       <number> '-' <number> '/' <number>
+ * <net>	     :== <netname> | <netname><number>
+ * <netname>	 :== "lo" | "tcp" | "o2ib" | "cib" | "openib" | "iib" |
+ *		       "vib" | "ra" | "elan" | "mx" | "ptl"
+ * \endverbatim
+ */
+
+/**
+ * Structure to represent \<nidrange\> token of the syntax.
+ *
+ * One of this is created for each \<net\> parsed.
+ */
+struct nidrange {
+	/**
+	 * Link to list of this structures which is built on nid range
+	 * list parsing.
+	 */
+	struct list_head nr_link;
+	/**
+	 * List head for addrrange::ar_link.
+	 */
+	struct list_head nr_addrranges;
+	/**
+	 * Flag indicating that *@<net> is found.
+	 */
+	int nr_all;
+	/**
+	 * Pointer to corresponding element of libcfs_netstrfns.
+	 */
+	struct netstrfns *nr_netstrfns;
+	/**
+	 * Number of network. E.g. 5 if \<net\> is "elan5".
+	 */
+	int nr_netnum;
+};
+
+/**
+ * Structure to represent \<addrrange\> token of the syntax.
+ */
+struct addrrange {
+	/**
+	 * Link to nidrange::nr_addrranges.
+	 */
+	struct list_head ar_link;
+	/**
+	 * List head for cfs_expr_list::el_list.
+	 */
+	struct list_head ar_numaddr_ranges;
+};
+
+/**
+ * Parses \<addrrange\> token on the syntax.
+ *
+ * Allocates struct addrrange and links to \a nidrange via
+ * (nidrange::nr_addrranges)
+ *
+ * \retval 1 if \a src parses to '*' | \<ipaddr_range\> | \<cfs_expr_list\>
+ * \retval 0 otherwise
+ */
+static int
+parse_addrange(const struct cfs_lstr *src, struct nidrange *nidrange)
+{
+	struct addrrange *addrrange;
+
+	if (src->ls_len == 1 && src->ls_str[0] == '*') {
+		nidrange->nr_all = 1;
+		return 1;
+	}
+
+	LIBCFS_ALLOC(addrrange, sizeof(struct addrrange));
+	if (addrrange == NULL)
+		return 0;
+	list_add_tail(&addrrange->ar_link, &nidrange->nr_addrranges);
+	INIT_LIST_HEAD(&addrrange->ar_numaddr_ranges);
+
+	return nidrange->nr_netstrfns->nf_parse_addrlist(src->ls_str,
+						src->ls_len,
+						&addrrange->ar_numaddr_ranges);
+}
+
+/**
+ * Finds or creates struct nidrange.
+ *
+ * Checks if \a src is a valid network name, looks for corresponding
+ * nidrange on the ist of nidranges (\a nidlist), creates new struct
+ * nidrange if it is not found.
+ *
+ * \retval pointer to struct nidrange matching network specified via \a src
+ * \retval NULL if \a src does not match any network
+ */
+static struct nidrange *
+add_nidrange(const struct cfs_lstr *src,
+	     struct list_head *nidlist)
+{
+	struct netstrfns *nf;
+	struct nidrange *nr;
+	int endlen;
+	unsigned netnum;
+
+	if (src->ls_len >= LNET_NIDSTR_SIZE)
+		return NULL;
+
+	nf = libcfs_namenum2netstrfns(src->ls_str);
+	if (nf == NULL)
+		return NULL;
+	endlen = src->ls_len - strlen(nf->nf_name);
+	if (endlen == 0)
+		/* network name only, e.g. "elan" or "tcp" */
+		netnum = 0;
+	else {
+		/* e.g. "elan25" or "tcp23", refuse to parse if
+		 * network name is not appended with decimal or
+		 * hexadecimal number */
+		if (!cfs_str2num_check(src->ls_str + strlen(nf->nf_name),
+				       endlen, &netnum, 0, MAX_NUMERIC_VALUE))
+			return NULL;
+	}
+
+	list_for_each_entry(nr, nidlist, nr_link) {
+		if (nr->nr_netstrfns != nf)
+			continue;
+		if (nr->nr_netnum != netnum)
+			continue;
+		return nr;
+	}
+
+	LIBCFS_ALLOC(nr, sizeof(struct nidrange));
+	if (nr == NULL)
+		return NULL;
+	list_add_tail(&nr->nr_link, nidlist);
+	INIT_LIST_HEAD(&nr->nr_addrranges);
+	nr->nr_netstrfns = nf;
+	nr->nr_all = 0;
+	nr->nr_netnum = netnum;
+
+	return nr;
+}
+
+/**
+ * Parses \<nidrange\> token of the syntax.
+ *
+ * \retval 1 if \a src parses to \<addrrange\> '@' \<net\>
+ * \retval 0 otherwise
+ */
+static int
+parse_nidrange(struct cfs_lstr *src, struct list_head *nidlist)
+{
+	struct cfs_lstr addrrange;
+	struct cfs_lstr net;
+	struct cfs_lstr tmp;
+	struct nidrange *nr;
+
+	tmp = *src;
+	if (cfs_gettok(src, '@', &addrrange) == 0)
+		goto failed;
+
+	if (cfs_gettok(src, '@', &net) == 0 || src->ls_str != NULL)
+		goto failed;
+
+	nr = add_nidrange(&net, nidlist);
+	if (nr == NULL)
+		goto failed;
+
+	if (parse_addrange(&addrrange, nr) != 0)
+		goto failed;
+
+	return 1;
+ failed:
+	CWARN("can't parse nidrange: \"%.*s\"\n", tmp.ls_len, tmp.ls_str);
+	return 0;
+}
+
+/**
+ * Frees addrrange structures of \a list.
+ *
+ * For each struct addrrange structure found on \a list it frees
+ * cfs_expr_list list attached to it and frees the addrrange itself.
+ *
+ * \retval none
+ */
+static void
+free_addrranges(struct list_head *list)
+{
+	while (!list_empty(list)) {
+		struct addrrange *ar;
+
+		ar = list_entry(list->next, struct addrrange, ar_link);
+
+		cfs_expr_list_free_list(&ar->ar_numaddr_ranges);
+		list_del(&ar->ar_link);
+		LIBCFS_FREE(ar, sizeof(struct addrrange));
+	}
+}
+
+/**
+ * Frees nidrange strutures of \a list.
+ *
+ * For each struct nidrange structure found on \a list it frees
+ * addrrange list attached to it and frees the nidrange itself.
+ *
+ * \retval none
+ */
+void
+cfs_free_nidlist(struct list_head *list)
+{
+	struct list_head *pos, *next;
+	struct nidrange *nr;
+
+	list_for_each_safe(pos, next, list) {
+		nr = list_entry(pos, struct nidrange, nr_link);
+		free_addrranges(&nr->nr_addrranges);
+		list_del(pos);
+		LIBCFS_FREE(nr, sizeof(struct nidrange));
+	}
+}
+EXPORT_SYMBOL(cfs_free_nidlist);
+
+/**
+ * Parses nid range list.
+ *
+ * Parses with rigorous syntax and overflow checking \a str into
+ * \<nidrange\> [ ' ' \<nidrange\> ], compiles \a str into set of
+ * structures and links that structure to \a nidlist. The resulting
+ * list can be used to match a NID againts set of NIDS defined by \a
+ * str.
+ * \see cfs_match_nid
+ *
+ * \retval 1 on success
+ * \retval 0 otherwise
+ */
+int
+cfs_parse_nidlist(char *str, int len, struct list_head *nidlist)
+{
+	struct cfs_lstr src;
+	struct cfs_lstr res;
+	int rc;
+
+	src.ls_str = str;
+	src.ls_len = len;
+	INIT_LIST_HEAD(nidlist);
+	while (src.ls_str) {
+		rc = cfs_gettok(&src, ' ', &res);
+		if (rc == 0) {
+			cfs_free_nidlist(nidlist);
+			return 0;
+		}
+		rc = parse_nidrange(&res, nidlist);
+		if (rc == 0) {
+			cfs_free_nidlist(nidlist);
+			return 0;
+		}
+	}
+	return 1;
+}
+EXPORT_SYMBOL(cfs_parse_nidlist);
+
+/**
+ * Matches a nid (\a nid) against the compiled list of nidranges (\a nidlist).
+ *
+ * \see cfs_parse_nidlist()
+ *
+ * \retval 1 on match
+ * \retval 0  otherwises
+ */
+int cfs_match_nid(lnet_nid_t nid, struct list_head *nidlist)
+{
+	struct nidrange *nr;
+	struct addrrange *ar;
+
+	list_for_each_entry(nr, nidlist, nr_link) {
+		if (nr->nr_netstrfns->nf_type != LNET_NETTYP(LNET_NIDNET(nid)))
+			continue;
+		if (nr->nr_netnum != LNET_NETNUM(LNET_NIDNET(nid)))
+			continue;
+		if (nr->nr_all)
+			return 1;
+		list_for_each_entry(ar, &nr->nr_addrranges, ar_link)
+			if (nr->nr_netstrfns->nf_match_addr(LNET_NIDADDR(nid),
+						       &ar->ar_numaddr_ranges))
+				return 1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(cfs_match_nid);
diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/prng.c b/kernel/drivers/staging/lustre/lustre/libcfs/prng.c
new file mode 100644
index 000000000..4147664ff
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lustre/libcfs/prng.c
@@ -0,0 +1,139 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/prng.c
+ *
+ * concatenation of following two 16-bit multiply with carry generators
+ * x(n)=a*x(n-1)+carry mod 2^16 and y(n)=b*y(n-1)+carry mod 2^16,
+ * number and carry packed within the same 32 bit integer.
+ * algorithm recommended by Marsaglia
+*/
+
+#include "../../include/linux/libcfs/libcfs.h"
+
+/*
+From: George Marsaglia <geo@stat.fsu.edu>
+Newsgroups: sci.math
+Subject: Re: A RANDOM NUMBER GENERATOR FOR C
+Date: Tue, 30 Sep 1997 05:29:35 -0700
+
+ * You may replace the two constants 36969 and 18000 by any
+ * pair of distinct constants from this list:
+ * 18000 18030 18273 18513 18879 19074 19098 19164 19215 19584
+ * 19599 19950 20088 20508 20544 20664 20814 20970 21153 21243
+ * 21423 21723 21954 22125 22188 22293 22860 22938 22965 22974
+ * 23109 23124 23163 23208 23508 23520 23553 23658 23865 24114
+ * 24219 24660 24699 24864 24948 25023 25308 25443 26004 26088
+ * 26154 26550 26679 26838 27183 27258 27753 27795 27810 27834
+ * 27960 28320 28380 28689 28710 28794 28854 28959 28980 29013
+ * 29379 29889 30135 30345 30459 30714 30903 30963 31059 31083
+ * (or any other 16-bit constants k for which both k*2^16-1
+ * and k*2^15-1 are prime) */
+
+#define RANDOM_CONST_A 18030
+#define RANDOM_CONST_B 29013
+
+static unsigned int seed_x = 521288629;
+static unsigned int seed_y = 362436069;
+
+/**
+ * cfs_rand - creates new seeds
+ *
+ * First it creates new seeds from the previous seeds. Then it generates a
+ * new pseudo random number for use.
+ *
+ * Returns a pseudo-random 32-bit integer
+ */
+unsigned int cfs_rand(void)
+{
+	seed_x = RANDOM_CONST_A * (seed_x & 65535) + (seed_x >> 16);
+	seed_y = RANDOM_CONST_B * (seed_y & 65535) + (seed_y >> 16);
+
+	return ((seed_x << 16) + (seed_y & 65535));
+}
+EXPORT_SYMBOL(cfs_rand);
+
+/**
+ * cfs_srand - sets the initial seed
+ * @seed1 : (seed_x) should have the most entropy in the low bits of the word
+ * @seed2 : (seed_y) should have the most entropy in the high bits of the word
+ *
+ * Replaces the original seeds with new values. Used to generate a new pseudo
+ * random numbers.
+ */
+void cfs_srand(unsigned int seed1, unsigned int seed2)
+{
+	if (seed1)
+		seed_x = seed1; /* use default seeds if parameter is 0 */
+	if (seed2)
+		seed_y = seed2;
+}
+EXPORT_SYMBOL(cfs_srand);
+
+/**
+ * cfs_get_random_bytes - generate a bunch of random numbers
+ * @buf : buffer to fill with random numbers
+ * @size: size of passed in buffer
+ *
+ * Fills a buffer with random bytes
+ */
+void cfs_get_random_bytes(void *buf, int size)
+{
+	int *p = buf;
+	int rem, tmp;
+
+	LASSERT(size >= 0);
+
+	rem = min((int)((unsigned long)buf & (sizeof(int) - 1)), size);
+	if (rem) {
+		get_random_bytes(&tmp, sizeof(tmp));
+		tmp ^= cfs_rand();
+		memcpy(buf, &tmp, rem);
+		p = buf + rem;
+		size -= rem;
+	}
+
+	while (size >= sizeof(int)) {
+		get_random_bytes(&tmp, sizeof(tmp));
+		*p = cfs_rand() ^ tmp;
+		size -= sizeof(int);
+		p++;
+	}
+	buf = p;
+	if (size) {
+		get_random_bytes(&tmp, sizeof(tmp));
+		tmp ^= cfs_rand();
+		memcpy(buf, &tmp, size);
+	}
+}
+EXPORT_SYMBOL(cfs_get_random_bytes);
diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/tracefile.c b/kernel/drivers/staging/lustre/lustre/libcfs/tracefile.c
new file mode 100644
index 000000000..c86394f7f
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lustre/libcfs/tracefile.c
@@ -0,0 +1,1196 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/tracefile.c
+ *
+ * Author: Zach Brown <zab@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+
+#define DEBUG_SUBSYSTEM S_LNET
+#define LUSTRE_TRACEFILE_PRIVATE
+#include "tracefile.h"
+
+#include "../../include/linux/libcfs/libcfs.h"
+
+/* XXX move things up to the top, comment */
+union cfs_trace_data_union (*cfs_trace_data[TCD_MAX_TYPES])[NR_CPUS] __cacheline_aligned;
+
+char cfs_tracefile[TRACEFILE_NAME_SIZE];
+long long cfs_tracefile_size = CFS_TRACEFILE_SIZE;
+static struct tracefiled_ctl trace_tctl;
+struct mutex cfs_trace_thread_mutex;
+static int thread_running;
+
+static atomic_t cfs_tage_allocated = ATOMIC_INIT(0);
+
+static void put_pages_on_tcd_daemon_list(struct page_collection *pc,
+					 struct cfs_trace_cpu_data *tcd);
+
+static inline struct cfs_trace_page *
+cfs_tage_from_list(struct list_head *list)
+{
+	return list_entry(list, struct cfs_trace_page, linkage);
+}
+
+static struct cfs_trace_page *cfs_tage_alloc(gfp_t gfp)
+{
+	struct page	    *page;
+	struct cfs_trace_page *tage;
+
+	/* My caller is trying to free memory */
+	if (!in_interrupt() && memory_pressure_get())
+		return NULL;
+
+	/*
+	 * Don't spam console with allocation failures: they will be reported
+	 * by upper layer anyway.
+	 */
+	gfp |= __GFP_NOWARN;
+	page = alloc_page(gfp);
+	if (page == NULL)
+		return NULL;
+
+	tage = kmalloc(sizeof(*tage), gfp);
+	if (tage == NULL) {
+		__free_page(page);
+		return NULL;
+	}
+
+	tage->page = page;
+	atomic_inc(&cfs_tage_allocated);
+	return tage;
+}
+
+static void cfs_tage_free(struct cfs_trace_page *tage)
+{
+	__LASSERT(tage != NULL);
+	__LASSERT(tage->page != NULL);
+
+	__free_page(tage->page);
+	kfree(tage);
+	atomic_dec(&cfs_tage_allocated);
+}
+
+static void cfs_tage_to_tail(struct cfs_trace_page *tage,
+			     struct list_head *queue)
+{
+	__LASSERT(tage != NULL);
+	__LASSERT(queue != NULL);
+
+	list_move_tail(&tage->linkage, queue);
+}
+
+int cfs_trace_refill_stock(struct cfs_trace_cpu_data *tcd, gfp_t gfp,
+			   struct list_head *stock)
+{
+	int i;
+
+	/*
+	 * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT)
+	 * from here: this will lead to infinite recursion.
+	 */
+
+	for (i = 0; i + tcd->tcd_cur_stock_pages < TCD_STOCK_PAGES ; ++ i) {
+		struct cfs_trace_page *tage;
+
+		tage = cfs_tage_alloc(gfp);
+		if (tage == NULL)
+			break;
+		list_add_tail(&tage->linkage, stock);
+	}
+	return i;
+}
+
+/* return a page that has 'len' bytes left at the end */
+static struct cfs_trace_page *
+cfs_trace_get_tage_try(struct cfs_trace_cpu_data *tcd, unsigned long len)
+{
+	struct cfs_trace_page *tage;
+
+	if (tcd->tcd_cur_pages > 0) {
+		__LASSERT(!list_empty(&tcd->tcd_pages));
+		tage = cfs_tage_from_list(tcd->tcd_pages.prev);
+		if (tage->used + len <= PAGE_CACHE_SIZE)
+			return tage;
+	}
+
+	if (tcd->tcd_cur_pages < tcd->tcd_max_pages) {
+		if (tcd->tcd_cur_stock_pages > 0) {
+			tage = cfs_tage_from_list(tcd->tcd_stock_pages.prev);
+			--tcd->tcd_cur_stock_pages;
+			list_del_init(&tage->linkage);
+		} else {
+			tage = cfs_tage_alloc(GFP_ATOMIC);
+			if (unlikely(tage == NULL)) {
+				if ((!memory_pressure_get() ||
+				     in_interrupt()) && printk_ratelimit())
+					printk(KERN_WARNING
+					       "cannot allocate a tage (%ld)\n",
+					       tcd->tcd_cur_pages);
+				return NULL;
+			}
+		}
+
+		tage->used = 0;
+		tage->cpu = smp_processor_id();
+		tage->type = tcd->tcd_type;
+		list_add_tail(&tage->linkage, &tcd->tcd_pages);
+		tcd->tcd_cur_pages++;
+
+		if (tcd->tcd_cur_pages > 8 && thread_running) {
+			struct tracefiled_ctl *tctl = &trace_tctl;
+			/*
+			 * wake up tracefiled to process some pages.
+			 */
+			wake_up(&tctl->tctl_waitq);
+		}
+		return tage;
+	}
+	return NULL;
+}
+
+static void cfs_tcd_shrink(struct cfs_trace_cpu_data *tcd)
+{
+	int pgcount = tcd->tcd_cur_pages / 10;
+	struct page_collection pc;
+	struct cfs_trace_page *tage;
+	struct cfs_trace_page *tmp;
+
+	/*
+	 * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT)
+	 * from here: this will lead to infinite recursion.
+	 */
+
+	if (printk_ratelimit())
+		printk(KERN_WARNING "debug daemon buffer overflowed; discarding 10%% of pages (%d of %ld)\n",
+		       pgcount + 1, tcd->tcd_cur_pages);
+
+	INIT_LIST_HEAD(&pc.pc_pages);
+	spin_lock_init(&pc.pc_lock);
+
+	list_for_each_entry_safe(tage, tmp, &tcd->tcd_pages, linkage) {
+		if (pgcount-- == 0)
+			break;
+
+		list_move_tail(&tage->linkage, &pc.pc_pages);
+		tcd->tcd_cur_pages--;
+	}
+	put_pages_on_tcd_daemon_list(&pc, tcd);
+}
+
+/* return a page that has 'len' bytes left at the end */
+static struct cfs_trace_page *cfs_trace_get_tage(struct cfs_trace_cpu_data *tcd,
+						 unsigned long len)
+{
+	struct cfs_trace_page *tage;
+
+	/*
+	 * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT)
+	 * from here: this will lead to infinite recursion.
+	 */
+
+	if (len > PAGE_CACHE_SIZE) {
+		pr_err("cowardly refusing to write %lu bytes in a page\n", len);
+		return NULL;
+	}
+
+	tage = cfs_trace_get_tage_try(tcd, len);
+	if (tage != NULL)
+		return tage;
+	if (thread_running)
+		cfs_tcd_shrink(tcd);
+	if (tcd->tcd_cur_pages > 0) {
+		tage = cfs_tage_from_list(tcd->tcd_pages.next);
+		tage->used = 0;
+		cfs_tage_to_tail(tage, &tcd->tcd_pages);
+	}
+	return tage;
+}
+
+int libcfs_debug_msg(struct libcfs_debug_msg_data *msgdata,
+		     const char *format, ...)
+{
+	va_list args;
+	int     rc;
+
+	va_start(args, format);
+	rc = libcfs_debug_vmsg2(msgdata, format, args, NULL);
+	va_end(args);
+
+	return rc;
+}
+EXPORT_SYMBOL(libcfs_debug_msg);
+
+int libcfs_debug_vmsg2(struct libcfs_debug_msg_data *msgdata,
+		       const char *format1, va_list args,
+		       const char *format2, ...)
+{
+	struct cfs_trace_cpu_data *tcd = NULL;
+	struct ptldebug_header     header = {0};
+	struct cfs_trace_page     *tage;
+	/* string_buf is used only if tcd != NULL, and is always set then */
+	char		      *string_buf = NULL;
+	char		      *debug_buf;
+	int			known_size;
+	int			needed = 85; /* average message length */
+	int			max_nob;
+	va_list		    ap;
+	int			depth;
+	int			i;
+	int			remain;
+	int			mask = msgdata->msg_mask;
+	const char		*file = kbasename(msgdata->msg_file);
+	struct cfs_debug_limit_state   *cdls = msgdata->msg_cdls;
+
+	tcd = cfs_trace_get_tcd();
+
+	/* cfs_trace_get_tcd() grabs a lock, which disables preemption and
+	 * pins us to a particular CPU.  This avoids an smp_processor_id()
+	 * warning on Linux when debugging is enabled. */
+	cfs_set_ptldebug_header(&header, msgdata, CDEBUG_STACK());
+
+	if (tcd == NULL)		/* arch may not log in IRQ context */
+		goto console;
+
+	if (tcd->tcd_cur_pages == 0)
+		header.ph_flags |= PH_FLAG_FIRST_RECORD;
+
+	if (tcd->tcd_shutting_down) {
+		cfs_trace_put_tcd(tcd);
+		tcd = NULL;
+		goto console;
+	}
+
+	depth = __current_nesting_level();
+	known_size = strlen(file) + 1 + depth;
+	if (msgdata->msg_fn)
+		known_size += strlen(msgdata->msg_fn) + 1;
+
+	if (libcfs_debug_binary)
+		known_size += sizeof(header);
+
+	/*/
+	 * '2' used because vsnprintf return real size required for output
+	 * _without_ terminating NULL.
+	 * if needed is to small for this format.
+	 */
+	for (i = 0; i < 2; i++) {
+		tage = cfs_trace_get_tage(tcd, needed + known_size + 1);
+		if (tage == NULL) {
+			if (needed + known_size > PAGE_CACHE_SIZE)
+				mask |= D_ERROR;
+
+			cfs_trace_put_tcd(tcd);
+			tcd = NULL;
+			goto console;
+		}
+
+		string_buf = (char *)page_address(tage->page) +
+					tage->used + known_size;
+
+		max_nob = PAGE_CACHE_SIZE - tage->used - known_size;
+		if (max_nob <= 0) {
+			printk(KERN_EMERG "negative max_nob: %d\n",
+			       max_nob);
+			mask |= D_ERROR;
+			cfs_trace_put_tcd(tcd);
+			tcd = NULL;
+			goto console;
+		}
+
+		needed = 0;
+		if (format1) {
+			va_copy(ap, args);
+			needed = vsnprintf(string_buf, max_nob, format1, ap);
+			va_end(ap);
+		}
+
+		if (format2) {
+			remain = max_nob - needed;
+			if (remain < 0)
+				remain = 0;
+
+			va_start(ap, format2);
+			needed += vsnprintf(string_buf + needed, remain,
+					    format2, ap);
+			va_end(ap);
+		}
+
+		if (needed < max_nob) /* well. printing ok.. */
+			break;
+	}
+
+	if (*(string_buf+needed-1) != '\n')
+		printk(KERN_INFO "format at %s:%d:%s doesn't end in newline\n",
+		       file, msgdata->msg_line, msgdata->msg_fn);
+
+	header.ph_len = known_size + needed;
+	debug_buf = (char *)page_address(tage->page) + tage->used;
+
+	if (libcfs_debug_binary) {
+		memcpy(debug_buf, &header, sizeof(header));
+		tage->used += sizeof(header);
+		debug_buf += sizeof(header);
+	}
+
+	/* indent message according to the nesting level */
+	while (depth-- > 0) {
+		*(debug_buf++) = '.';
+		++ tage->used;
+	}
+
+	strcpy(debug_buf, file);
+	tage->used += strlen(file) + 1;
+	debug_buf += strlen(file) + 1;
+
+	if (msgdata->msg_fn) {
+		strcpy(debug_buf, msgdata->msg_fn);
+		tage->used += strlen(msgdata->msg_fn) + 1;
+		debug_buf += strlen(msgdata->msg_fn) + 1;
+	}
+
+	__LASSERT(debug_buf == string_buf);
+
+	tage->used += needed;
+	__LASSERT (tage->used <= PAGE_CACHE_SIZE);
+
+console:
+	if ((mask & libcfs_printk) == 0) {
+		/* no console output requested */
+		if (tcd != NULL)
+			cfs_trace_put_tcd(tcd);
+		return 1;
+	}
+
+	if (cdls != NULL) {
+		if (libcfs_console_ratelimit &&
+		    cdls->cdls_next != 0 &&     /* not first time ever */
+		    !cfs_time_after(cfs_time_current(), cdls->cdls_next)) {
+			/* skipping a console message */
+			cdls->cdls_count++;
+			if (tcd != NULL)
+				cfs_trace_put_tcd(tcd);
+			return 1;
+		}
+
+		if (cfs_time_after(cfs_time_current(), cdls->cdls_next +
+						       libcfs_console_max_delay
+						       + cfs_time_seconds(10))) {
+			/* last timeout was a long time ago */
+			cdls->cdls_delay /= libcfs_console_backoff * 4;
+		} else {
+			cdls->cdls_delay *= libcfs_console_backoff;
+		}
+
+		if (cdls->cdls_delay < libcfs_console_min_delay)
+			cdls->cdls_delay = libcfs_console_min_delay;
+		else if (cdls->cdls_delay > libcfs_console_max_delay)
+			cdls->cdls_delay = libcfs_console_max_delay;
+
+		/* ensure cdls_next is never zero after it's been seen */
+		cdls->cdls_next = (cfs_time_current() + cdls->cdls_delay) | 1;
+	}
+
+	if (tcd != NULL) {
+		cfs_print_to_console(&header, mask, string_buf, needed, file,
+				     msgdata->msg_fn);
+		cfs_trace_put_tcd(tcd);
+	} else {
+		string_buf = cfs_trace_get_console_buffer();
+
+		needed = 0;
+		if (format1 != NULL) {
+			va_copy(ap, args);
+			needed = vsnprintf(string_buf,
+					   CFS_TRACE_CONSOLE_BUFFER_SIZE,
+					   format1, ap);
+			va_end(ap);
+		}
+		if (format2 != NULL) {
+			remain = CFS_TRACE_CONSOLE_BUFFER_SIZE - needed;
+			if (remain > 0) {
+				va_start(ap, format2);
+				needed += vsnprintf(string_buf+needed, remain,
+						    format2, ap);
+				va_end(ap);
+			}
+		}
+		cfs_print_to_console(&header, mask,
+				     string_buf, needed, file, msgdata->msg_fn);
+
+		cfs_trace_put_console_buffer(string_buf);
+	}
+
+	if (cdls != NULL && cdls->cdls_count != 0) {
+		string_buf = cfs_trace_get_console_buffer();
+
+		needed = snprintf(string_buf, CFS_TRACE_CONSOLE_BUFFER_SIZE,
+				  "Skipped %d previous similar message%s\n",
+				  cdls->cdls_count,
+				  (cdls->cdls_count > 1) ? "s" : "");
+
+		cfs_print_to_console(&header, mask,
+				     string_buf, needed, file, msgdata->msg_fn);
+
+		cfs_trace_put_console_buffer(string_buf);
+		cdls->cdls_count = 0;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(libcfs_debug_vmsg2);
+
+void
+cfs_trace_assertion_failed(const char *str,
+			   struct libcfs_debug_msg_data *msgdata)
+{
+	struct ptldebug_header hdr;
+
+	libcfs_panic_in_progress = 1;
+	libcfs_catastrophe = 1;
+	mb();
+
+	cfs_set_ptldebug_header(&hdr, msgdata, CDEBUG_STACK());
+
+	cfs_print_to_console(&hdr, D_EMERG, str, strlen(str),
+			     msgdata->msg_file, msgdata->msg_fn);
+
+	panic("Lustre debug assertion failure\n");
+
+	/* not reached */
+}
+
+static void
+panic_collect_pages(struct page_collection *pc)
+{
+	/* Do the collect_pages job on a single CPU: assumes that all other
+	 * CPUs have been stopped during a panic.  If this isn't true for some
+	 * arch, this will have to be implemented separately in each arch.  */
+	int			i;
+	int			j;
+	struct cfs_trace_cpu_data *tcd;
+
+	INIT_LIST_HEAD(&pc->pc_pages);
+
+	cfs_tcd_for_each(tcd, i, j) {
+		list_splice_init(&tcd->tcd_pages, &pc->pc_pages);
+		tcd->tcd_cur_pages = 0;
+
+		if (pc->pc_want_daemon_pages) {
+			list_splice_init(&tcd->tcd_daemon_pages,
+					     &pc->pc_pages);
+			tcd->tcd_cur_daemon_pages = 0;
+		}
+	}
+}
+
+static void collect_pages_on_all_cpus(struct page_collection *pc)
+{
+	struct cfs_trace_cpu_data *tcd;
+	int i, cpu;
+
+	spin_lock(&pc->pc_lock);
+	for_each_possible_cpu(cpu) {
+		cfs_tcd_for_each_type_lock(tcd, i, cpu) {
+			list_splice_init(&tcd->tcd_pages, &pc->pc_pages);
+			tcd->tcd_cur_pages = 0;
+			if (pc->pc_want_daemon_pages) {
+				list_splice_init(&tcd->tcd_daemon_pages,
+						     &pc->pc_pages);
+				tcd->tcd_cur_daemon_pages = 0;
+			}
+		}
+	}
+	spin_unlock(&pc->pc_lock);
+}
+
+static void collect_pages(struct page_collection *pc)
+{
+	INIT_LIST_HEAD(&pc->pc_pages);
+
+	if (libcfs_panic_in_progress)
+		panic_collect_pages(pc);
+	else
+		collect_pages_on_all_cpus(pc);
+}
+
+static void put_pages_back_on_all_cpus(struct page_collection *pc)
+{
+	struct cfs_trace_cpu_data *tcd;
+	struct list_head *cur_head;
+	struct cfs_trace_page *tage;
+	struct cfs_trace_page *tmp;
+	int i, cpu;
+
+	spin_lock(&pc->pc_lock);
+	for_each_possible_cpu(cpu) {
+		cfs_tcd_for_each_type_lock(tcd, i, cpu) {
+			cur_head = tcd->tcd_pages.next;
+
+			list_for_each_entry_safe(tage, tmp, &pc->pc_pages,
+						 linkage) {
+
+				__LASSERT_TAGE_INVARIANT(tage);
+
+				if (tage->cpu != cpu || tage->type != i)
+					continue;
+
+				cfs_tage_to_tail(tage, cur_head);
+				tcd->tcd_cur_pages++;
+			}
+		}
+	}
+	spin_unlock(&pc->pc_lock);
+}
+
+static void put_pages_back(struct page_collection *pc)
+{
+	if (!libcfs_panic_in_progress)
+		put_pages_back_on_all_cpus(pc);
+}
+
+/* Add pages to a per-cpu debug daemon ringbuffer.  This buffer makes sure that
+ * we have a good amount of data at all times for dumping during an LBUG, even
+ * if we have been steadily writing (and otherwise discarding) pages via the
+ * debug daemon. */
+static void put_pages_on_tcd_daemon_list(struct page_collection *pc,
+					 struct cfs_trace_cpu_data *tcd)
+{
+	struct cfs_trace_page *tage;
+	struct cfs_trace_page *tmp;
+
+	spin_lock(&pc->pc_lock);
+	list_for_each_entry_safe(tage, tmp, &pc->pc_pages, linkage) {
+
+		__LASSERT_TAGE_INVARIANT(tage);
+
+		if (tage->cpu != tcd->tcd_cpu || tage->type != tcd->tcd_type)
+			continue;
+
+		cfs_tage_to_tail(tage, &tcd->tcd_daemon_pages);
+		tcd->tcd_cur_daemon_pages++;
+
+		if (tcd->tcd_cur_daemon_pages > tcd->tcd_max_pages) {
+			struct cfs_trace_page *victim;
+
+			__LASSERT(!list_empty(&tcd->tcd_daemon_pages));
+			victim = cfs_tage_from_list(tcd->tcd_daemon_pages.next);
+
+			__LASSERT_TAGE_INVARIANT(victim);
+
+			list_del(&victim->linkage);
+			cfs_tage_free(victim);
+			tcd->tcd_cur_daemon_pages--;
+		}
+	}
+	spin_unlock(&pc->pc_lock);
+}
+
+static void put_pages_on_daemon_list(struct page_collection *pc)
+{
+	struct cfs_trace_cpu_data *tcd;
+	int i, cpu;
+
+	for_each_possible_cpu(cpu) {
+		cfs_tcd_for_each_type_lock(tcd, i, cpu)
+			put_pages_on_tcd_daemon_list(pc, tcd);
+	}
+}
+
+void cfs_trace_debug_print(void)
+{
+	struct page_collection pc;
+	struct cfs_trace_page *tage;
+	struct cfs_trace_page *tmp;
+
+	spin_lock_init(&pc.pc_lock);
+
+	pc.pc_want_daemon_pages = 1;
+	collect_pages(&pc);
+	list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) {
+		char *p, *file, *fn;
+		struct page *page;
+
+		__LASSERT_TAGE_INVARIANT(tage);
+
+		page = tage->page;
+		p = page_address(page);
+		while (p < ((char *)page_address(page) + tage->used)) {
+			struct ptldebug_header *hdr;
+			int len;
+			hdr = (void *)p;
+			p += sizeof(*hdr);
+			file = p;
+			p += strlen(file) + 1;
+			fn = p;
+			p += strlen(fn) + 1;
+			len = hdr->ph_len - (int)(p - (char *)hdr);
+
+			cfs_print_to_console(hdr, D_EMERG, p, len, file, fn);
+
+			p += len;
+		}
+
+		list_del(&tage->linkage);
+		cfs_tage_free(tage);
+	}
+}
+
+int cfs_tracefile_dump_all_pages(char *filename)
+{
+	struct page_collection	pc;
+	struct file		*filp;
+	struct cfs_trace_page	*tage;
+	struct cfs_trace_page	*tmp;
+	char			*buf;
+	int rc;
+
+	DECL_MMSPACE;
+
+	cfs_tracefile_write_lock();
+
+	filp = filp_open(filename, O_CREAT|O_EXCL|O_WRONLY|O_LARGEFILE, 0600);
+	if (IS_ERR(filp)) {
+		rc = PTR_ERR(filp);
+		filp = NULL;
+		pr_err("LustreError: can't open %s for dump: rc %d\n",
+			filename, rc);
+		goto out;
+	}
+
+	spin_lock_init(&pc.pc_lock);
+	pc.pc_want_daemon_pages = 1;
+	collect_pages(&pc);
+	if (list_empty(&pc.pc_pages)) {
+		rc = 0;
+		goto close;
+	}
+
+	/* ok, for now, just write the pages.  in the future we'll be building
+	 * iobufs with the pages and calling generic_direct_IO */
+	MMSPACE_OPEN;
+	list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) {
+
+		__LASSERT_TAGE_INVARIANT(tage);
+
+		buf = kmap(tage->page);
+		rc = vfs_write(filp, (__force const char __user *)buf,
+			       tage->used, &filp->f_pos);
+		kunmap(tage->page);
+
+		if (rc != (int)tage->used) {
+			printk(KERN_WARNING "wanted to write %u but wrote %d\n",
+			       tage->used, rc);
+			put_pages_back(&pc);
+			__LASSERT(list_empty(&pc.pc_pages));
+			break;
+		}
+		list_del(&tage->linkage);
+		cfs_tage_free(tage);
+	}
+	MMSPACE_CLOSE;
+	rc = vfs_fsync(filp, 1);
+	if (rc)
+		pr_err("sync returns %d\n", rc);
+close:
+	filp_close(filp, NULL);
+out:
+	cfs_tracefile_write_unlock();
+	return rc;
+}
+
+void cfs_trace_flush_pages(void)
+{
+	struct page_collection pc;
+	struct cfs_trace_page *tage;
+	struct cfs_trace_page *tmp;
+
+	spin_lock_init(&pc.pc_lock);
+
+	pc.pc_want_daemon_pages = 1;
+	collect_pages(&pc);
+	list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) {
+
+		__LASSERT_TAGE_INVARIANT(tage);
+
+		list_del(&tage->linkage);
+		cfs_tage_free(tage);
+	}
+}
+
+int cfs_trace_copyin_string(char *knl_buffer, int knl_buffer_nob,
+			    const char __user *usr_buffer, int usr_buffer_nob)
+{
+	int    nob;
+
+	if (usr_buffer_nob > knl_buffer_nob)
+		return -EOVERFLOW;
+
+	if (copy_from_user((void *)knl_buffer,
+			   usr_buffer, usr_buffer_nob))
+		return -EFAULT;
+
+	nob = strnlen(knl_buffer, usr_buffer_nob);
+	while (nob-- >= 0)		      /* strip trailing whitespace */
+		if (!isspace(knl_buffer[nob]))
+			break;
+
+	if (nob < 0)			    /* empty string */
+		return -EINVAL;
+
+	if (nob == knl_buffer_nob)	      /* no space to terminate */
+		return -EOVERFLOW;
+
+	knl_buffer[nob + 1] = 0;		/* terminate */
+	return 0;
+}
+EXPORT_SYMBOL(cfs_trace_copyin_string);
+
+int cfs_trace_copyout_string(char __user *usr_buffer, int usr_buffer_nob,
+			     const char *knl_buffer, char *append)
+{
+	/* NB if 'append' != NULL, it's a single character to append to the
+	 * copied out string - usually "\n", for /proc entries and "" (i.e. a
+	 * terminating zero byte) for sysctl entries */
+	int   nob = strlen(knl_buffer);
+
+	if (nob > usr_buffer_nob)
+		nob = usr_buffer_nob;
+
+	if (copy_to_user(usr_buffer, knl_buffer, nob))
+		return -EFAULT;
+
+	if (append != NULL && nob < usr_buffer_nob) {
+		if (copy_to_user(usr_buffer + nob, append, 1))
+			return -EFAULT;
+
+		nob++;
+	}
+
+	return nob;
+}
+EXPORT_SYMBOL(cfs_trace_copyout_string);
+
+int cfs_trace_allocate_string_buffer(char **str, int nob)
+{
+	if (nob > 2 * PAGE_CACHE_SIZE)	    /* string must be "sensible" */
+		return -EINVAL;
+
+	*str = kmalloc(nob, GFP_IOFS | __GFP_ZERO);
+	if (*str == NULL)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void cfs_trace_free_string_buffer(char *str, int nob)
+{
+	kfree(str);
+}
+
+int cfs_trace_dump_debug_buffer_usrstr(void __user *usr_str, int usr_str_nob)
+{
+	char	 *str;
+	int	   rc;
+
+	rc = cfs_trace_allocate_string_buffer(&str, usr_str_nob + 1);
+	if (rc != 0)
+		return rc;
+
+	rc = cfs_trace_copyin_string(str, usr_str_nob + 1,
+				     usr_str, usr_str_nob);
+	if (rc != 0)
+		goto out;
+
+	if (str[0] != '/') {
+		rc = -EINVAL;
+		goto out;
+	}
+	rc = cfs_tracefile_dump_all_pages(str);
+out:
+	cfs_trace_free_string_buffer(str, usr_str_nob + 1);
+	return rc;
+}
+
+int cfs_trace_daemon_command(char *str)
+{
+	int       rc = 0;
+
+	cfs_tracefile_write_lock();
+
+	if (strcmp(str, "stop") == 0) {
+		cfs_tracefile_write_unlock();
+		cfs_trace_stop_thread();
+		cfs_tracefile_write_lock();
+		memset(cfs_tracefile, 0, sizeof(cfs_tracefile));
+
+	} else if (strncmp(str, "size=", 5) == 0) {
+		cfs_tracefile_size = simple_strtoul(str + 5, NULL, 0);
+		if (cfs_tracefile_size < 10 || cfs_tracefile_size > 20480)
+			cfs_tracefile_size = CFS_TRACEFILE_SIZE;
+		else
+			cfs_tracefile_size <<= 20;
+
+	} else if (strlen(str) >= sizeof(cfs_tracefile)) {
+		rc = -ENAMETOOLONG;
+	} else if (str[0] != '/') {
+		rc = -EINVAL;
+	} else {
+		strcpy(cfs_tracefile, str);
+
+		printk(KERN_INFO
+		       "Lustre: debug daemon will attempt to start writing to %s (%lukB max)\n",
+		       cfs_tracefile,
+		       (long)(cfs_tracefile_size >> 10));
+
+		cfs_trace_start_thread();
+	}
+
+	cfs_tracefile_write_unlock();
+	return rc;
+}
+
+int cfs_trace_daemon_command_usrstr(void __user *usr_str, int usr_str_nob)
+{
+	char *str;
+	int   rc;
+
+	rc = cfs_trace_allocate_string_buffer(&str, usr_str_nob + 1);
+	if (rc != 0)
+		return rc;
+
+	rc = cfs_trace_copyin_string(str, usr_str_nob + 1,
+				 usr_str, usr_str_nob);
+	if (rc == 0)
+		rc = cfs_trace_daemon_command(str);
+
+	cfs_trace_free_string_buffer(str, usr_str_nob + 1);
+	return rc;
+}
+
+int cfs_trace_set_debug_mb(int mb)
+{
+	int i;
+	int j;
+	int pages;
+	int limit = cfs_trace_max_debug_mb();
+	struct cfs_trace_cpu_data *tcd;
+
+	if (mb < num_possible_cpus()) {
+		printk(KERN_WARNING
+		       "Lustre: %d MB is too small for debug buffer size, setting it to %d MB.\n",
+		       mb, num_possible_cpus());
+		mb = num_possible_cpus();
+	}
+
+	if (mb > limit) {
+		printk(KERN_WARNING
+		       "Lustre: %d MB is too large for debug buffer size, setting it to %d MB.\n",
+		       mb, limit);
+		mb = limit;
+	}
+
+	mb /= num_possible_cpus();
+	pages = mb << (20 - PAGE_CACHE_SHIFT);
+
+	cfs_tracefile_write_lock();
+
+	cfs_tcd_for_each(tcd, i, j)
+		tcd->tcd_max_pages = (pages * tcd->tcd_pages_factor) / 100;
+
+	cfs_tracefile_write_unlock();
+
+	return 0;
+}
+
+int cfs_trace_set_debug_mb_usrstr(void __user *usr_str, int usr_str_nob)
+{
+	char     str[32];
+	int      rc;
+
+	rc = cfs_trace_copyin_string(str, sizeof(str), usr_str, usr_str_nob);
+	if (rc < 0)
+		return rc;
+
+	return cfs_trace_set_debug_mb(simple_strtoul(str, NULL, 0));
+}
+
+int cfs_trace_get_debug_mb(void)
+{
+	int i;
+	int j;
+	struct cfs_trace_cpu_data *tcd;
+	int total_pages = 0;
+
+	cfs_tracefile_read_lock();
+
+	cfs_tcd_for_each(tcd, i, j)
+		total_pages += tcd->tcd_max_pages;
+
+	cfs_tracefile_read_unlock();
+
+	return (total_pages >> (20 - PAGE_CACHE_SHIFT)) + 1;
+}
+
+static int tracefiled(void *arg)
+{
+	struct page_collection pc;
+	struct tracefiled_ctl *tctl = arg;
+	struct cfs_trace_page *tage;
+	struct cfs_trace_page *tmp;
+	struct file *filp;
+	char *buf;
+	int last_loop = 0;
+	int rc;
+
+	DECL_MMSPACE;
+
+	/* we're started late enough that we pick up init's fs context */
+	/* this is so broken in uml?  what on earth is going on? */
+
+	spin_lock_init(&pc.pc_lock);
+	complete(&tctl->tctl_start);
+
+	while (1) {
+		wait_queue_t __wait;
+
+		pc.pc_want_daemon_pages = 0;
+		collect_pages(&pc);
+		if (list_empty(&pc.pc_pages))
+			goto end_loop;
+
+		filp = NULL;
+		cfs_tracefile_read_lock();
+		if (cfs_tracefile[0] != 0) {
+			filp = filp_open(cfs_tracefile,
+					 O_CREAT | O_RDWR | O_LARGEFILE,
+					 0600);
+			if (IS_ERR(filp)) {
+				rc = PTR_ERR(filp);
+				filp = NULL;
+				printk(KERN_WARNING "couldn't open %s: %d\n",
+				       cfs_tracefile, rc);
+			}
+		}
+		cfs_tracefile_read_unlock();
+		if (filp == NULL) {
+			put_pages_on_daemon_list(&pc);
+			__LASSERT(list_empty(&pc.pc_pages));
+			goto end_loop;
+		}
+
+		MMSPACE_OPEN;
+
+		list_for_each_entry_safe(tage, tmp, &pc.pc_pages,
+						   linkage) {
+			static loff_t f_pos;
+
+			__LASSERT_TAGE_INVARIANT(tage);
+
+			if (f_pos >= (off_t)cfs_tracefile_size)
+				f_pos = 0;
+			else if (f_pos > i_size_read(file_inode(filp)))
+				f_pos = i_size_read(file_inode(filp));
+
+			buf = kmap(tage->page);
+			rc = vfs_write(filp, (__force const char __user *)buf,
+				       tage->used, &f_pos);
+			kunmap(tage->page);
+
+			if (rc != (int)tage->used) {
+				printk(KERN_WARNING "wanted to write %u but wrote %d\n",
+				       tage->used, rc);
+				put_pages_back(&pc);
+				__LASSERT(list_empty(&pc.pc_pages));
+				break;
+			}
+		}
+		MMSPACE_CLOSE;
+
+		filp_close(filp, NULL);
+		put_pages_on_daemon_list(&pc);
+		if (!list_empty(&pc.pc_pages)) {
+			int i;
+
+			printk(KERN_ALERT "Lustre: trace pages aren't empty\n");
+			pr_err("total cpus(%d): ",
+				num_possible_cpus());
+			for (i = 0; i < num_possible_cpus(); i++)
+				if (cpu_online(i))
+					pr_cont("%d(on) ", i);
+				else
+					pr_cont("%d(off) ", i);
+			pr_cont("\n");
+
+			i = 0;
+			list_for_each_entry_safe(tage, tmp, &pc.pc_pages,
+						     linkage)
+				pr_err("page %d belongs to cpu %d\n",
+					++i, tage->cpu);
+			pr_err("There are %d pages unwritten\n", i);
+		}
+		__LASSERT(list_empty(&pc.pc_pages));
+end_loop:
+		if (atomic_read(&tctl->tctl_shutdown)) {
+			if (last_loop == 0) {
+				last_loop = 1;
+				continue;
+			} else {
+				break;
+			}
+		}
+		init_waitqueue_entry(&__wait, current);
+		add_wait_queue(&tctl->tctl_waitq, &__wait);
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(cfs_time_seconds(1));
+		remove_wait_queue(&tctl->tctl_waitq, &__wait);
+	}
+	complete(&tctl->tctl_stop);
+	return 0;
+}
+
+int cfs_trace_start_thread(void)
+{
+	struct tracefiled_ctl *tctl = &trace_tctl;
+	int rc = 0;
+
+	mutex_lock(&cfs_trace_thread_mutex);
+	if (thread_running)
+		goto out;
+
+	init_completion(&tctl->tctl_start);
+	init_completion(&tctl->tctl_stop);
+	init_waitqueue_head(&tctl->tctl_waitq);
+	atomic_set(&tctl->tctl_shutdown, 0);
+
+	if (IS_ERR(kthread_run(tracefiled, tctl, "ktracefiled"))) {
+		rc = -ECHILD;
+		goto out;
+	}
+
+	wait_for_completion(&tctl->tctl_start);
+	thread_running = 1;
+out:
+	mutex_unlock(&cfs_trace_thread_mutex);
+	return rc;
+}
+
+void cfs_trace_stop_thread(void)
+{
+	struct tracefiled_ctl *tctl = &trace_tctl;
+
+	mutex_lock(&cfs_trace_thread_mutex);
+	if (thread_running) {
+		printk(KERN_INFO
+		       "Lustre: shutting down debug daemon thread...\n");
+		atomic_set(&tctl->tctl_shutdown, 1);
+		wait_for_completion(&tctl->tctl_stop);
+		thread_running = 0;
+	}
+	mutex_unlock(&cfs_trace_thread_mutex);
+}
+
+int cfs_tracefile_init(int max_pages)
+{
+	struct cfs_trace_cpu_data *tcd;
+	int		    i;
+	int		    j;
+	int		    rc;
+	int		    factor;
+
+	rc = cfs_tracefile_init_arch();
+	if (rc != 0)
+		return rc;
+
+	cfs_tcd_for_each(tcd, i, j) {
+		/* tcd_pages_factor is initialized int tracefile_init_arch. */
+		factor = tcd->tcd_pages_factor;
+		INIT_LIST_HEAD(&tcd->tcd_pages);
+		INIT_LIST_HEAD(&tcd->tcd_stock_pages);
+		INIT_LIST_HEAD(&tcd->tcd_daemon_pages);
+		tcd->tcd_cur_pages = 0;
+		tcd->tcd_cur_stock_pages = 0;
+		tcd->tcd_cur_daemon_pages = 0;
+		tcd->tcd_max_pages = (max_pages * factor) / 100;
+		LASSERT(tcd->tcd_max_pages > 0);
+		tcd->tcd_shutting_down = 0;
+	}
+
+	return 0;
+}
+
+static void trace_cleanup_on_all_cpus(void)
+{
+	struct cfs_trace_cpu_data *tcd;
+	struct cfs_trace_page *tage;
+	struct cfs_trace_page *tmp;
+	int i, cpu;
+
+	for_each_possible_cpu(cpu) {
+		cfs_tcd_for_each_type_lock(tcd, i, cpu) {
+			tcd->tcd_shutting_down = 1;
+
+			list_for_each_entry_safe(tage, tmp, &tcd->tcd_pages,
+							   linkage) {
+				__LASSERT_TAGE_INVARIANT(tage);
+
+				list_del(&tage->linkage);
+				cfs_tage_free(tage);
+			}
+
+			tcd->tcd_cur_pages = 0;
+		}
+	}
+}
+
+static void cfs_trace_cleanup(void)
+{
+	struct page_collection pc;
+
+	INIT_LIST_HEAD(&pc.pc_pages);
+	spin_lock_init(&pc.pc_lock);
+
+	trace_cleanup_on_all_cpus();
+
+	cfs_tracefile_fini_arch();
+}
+
+void cfs_tracefile_exit(void)
+{
+	cfs_trace_stop_thread();
+	cfs_trace_cleanup();
+}
diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/tracefile.h b/kernel/drivers/staging/lustre/lustre/libcfs/tracefile.h
new file mode 100644
index 000000000..0601476e1
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lustre/libcfs/tracefile.h
@@ -0,0 +1,340 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LIBCFS_TRACEFILE_H__
+#define __LIBCFS_TRACEFILE_H__
+
+#include "../../include/linux/libcfs/libcfs.h"
+
+#include "linux/linux-tracefile.h"
+
+/* trace file lock routines */
+
+#define TRACEFILE_NAME_SIZE 1024
+extern char      cfs_tracefile[TRACEFILE_NAME_SIZE];
+extern long long cfs_tracefile_size;
+
+extern void libcfs_run_debug_log_upcall(char *file);
+
+int  cfs_tracefile_init_arch(void);
+void cfs_tracefile_fini_arch(void);
+
+void cfs_tracefile_read_lock(void);
+void cfs_tracefile_read_unlock(void);
+void cfs_tracefile_write_lock(void);
+void cfs_tracefile_write_unlock(void);
+
+int cfs_tracefile_dump_all_pages(char *filename);
+void cfs_trace_debug_print(void);
+void cfs_trace_flush_pages(void);
+int cfs_trace_start_thread(void);
+void cfs_trace_stop_thread(void);
+int cfs_tracefile_init(int max_pages);
+void cfs_tracefile_exit(void);
+
+
+
+int cfs_trace_copyin_string(char *knl_buffer, int knl_buffer_nob,
+			    const char __user *usr_buffer, int usr_buffer_nob);
+int cfs_trace_copyout_string(char __user *usr_buffer, int usr_buffer_nob,
+			     const char *knl_str, char *append);
+int cfs_trace_allocate_string_buffer(char **str, int nob);
+void cfs_trace_free_string_buffer(char *str, int nob);
+int cfs_trace_dump_debug_buffer_usrstr(void __user *usr_str, int usr_str_nob);
+int cfs_trace_daemon_command(char *str);
+int cfs_trace_daemon_command_usrstr(void __user *usr_str, int usr_str_nob);
+int cfs_trace_set_debug_mb(int mb);
+int cfs_trace_set_debug_mb_usrstr(void __user *usr_str, int usr_str_nob);
+int cfs_trace_get_debug_mb(void);
+
+extern void libcfs_debug_dumplog_internal(void *arg);
+extern void libcfs_register_panic_notifier(void);
+extern void libcfs_unregister_panic_notifier(void);
+extern int  libcfs_panic_in_progress;
+extern int  cfs_trace_max_debug_mb(void);
+
+#define TCD_MAX_PAGES (5 << (20 - PAGE_CACHE_SHIFT))
+#define TCD_STOCK_PAGES (TCD_MAX_PAGES)
+#define CFS_TRACEFILE_SIZE (500 << 20)
+
+#ifdef LUSTRE_TRACEFILE_PRIVATE
+
+/*
+ * Private declare for tracefile
+ */
+#define TCD_MAX_PAGES (5 << (20 - PAGE_CACHE_SHIFT))
+#define TCD_STOCK_PAGES (TCD_MAX_PAGES)
+
+#define CFS_TRACEFILE_SIZE (500 << 20)
+
+/* Size of a buffer for sprinting console messages if we can't get a page
+ * from system */
+#define CFS_TRACE_CONSOLE_BUFFER_SIZE   1024
+
+union cfs_trace_data_union {
+	struct cfs_trace_cpu_data {
+		/*
+		 * Even though this structure is meant to be per-CPU, locking
+		 * is needed because in some places the data may be accessed
+		 * from other CPUs. This lock is directly used in trace_get_tcd
+		 * and trace_put_tcd, which are called in libcfs_debug_vmsg2 and
+		 * tcd_for_each_type_lock
+		 */
+		spinlock_t		tcd_lock;
+		unsigned long	   tcd_lock_flags;
+
+		/*
+		 * pages with trace records not yet processed by tracefiled.
+		 */
+		struct list_head	      tcd_pages;
+		/* number of pages on ->tcd_pages */
+		unsigned long	   tcd_cur_pages;
+
+		/*
+		 * pages with trace records already processed by
+		 * tracefiled. These pages are kept in memory, so that some
+		 * portion of log can be written in the event of LBUG. This
+		 * list is maintained in LRU order.
+		 *
+		 * Pages are moved to ->tcd_daemon_pages by tracefiled()
+		 * (put_pages_on_daemon_list()). LRU pages from this list are
+		 * discarded when list grows too large.
+		 */
+		struct list_head	      tcd_daemon_pages;
+		/* number of pages on ->tcd_daemon_pages */
+		unsigned long	   tcd_cur_daemon_pages;
+
+		/*
+		 * Maximal number of pages allowed on ->tcd_pages and
+		 * ->tcd_daemon_pages each.
+		 * Always TCD_MAX_PAGES * tcd_pages_factor / 100 in current
+		 * implementation.
+		 */
+		unsigned long	   tcd_max_pages;
+
+		/*
+		 * preallocated pages to write trace records into. Pages from
+		 * ->tcd_stock_pages are moved to ->tcd_pages by
+		 * portals_debug_msg().
+		 *
+		 * This list is necessary, because on some platforms it's
+		 * impossible to perform efficient atomic page allocation in a
+		 * non-blockable context.
+		 *
+		 * Such platforms fill ->tcd_stock_pages "on occasion", when
+		 * tracing code is entered in blockable context.
+		 *
+		 * trace_get_tage_try() tries to get a page from
+		 * ->tcd_stock_pages first and resorts to atomic page
+		 * allocation only if this queue is empty. ->tcd_stock_pages
+		 * is replenished when tracing code is entered in blocking
+		 * context (darwin-tracefile.c:trace_get_tcd()). We try to
+		 * maintain TCD_STOCK_PAGES (40 by default) pages in this
+		 * queue. Atomic allocation is only required if more than
+		 * TCD_STOCK_PAGES pagesful are consumed by trace records all
+		 * emitted in non-blocking contexts. Which is quite unlikely.
+		 */
+		struct list_head	      tcd_stock_pages;
+		/* number of pages on ->tcd_stock_pages */
+		unsigned long	   tcd_cur_stock_pages;
+
+		unsigned short	  tcd_shutting_down;
+		unsigned short	  tcd_cpu;
+		unsigned short	  tcd_type;
+		/* The factors to share debug memory. */
+		unsigned short	  tcd_pages_factor;
+	} tcd;
+	char __pad[L1_CACHE_ALIGN(sizeof(struct cfs_trace_cpu_data))];
+};
+
+#define TCD_MAX_TYPES      8
+extern union cfs_trace_data_union (*cfs_trace_data[TCD_MAX_TYPES])[NR_CPUS];
+
+#define cfs_tcd_for_each(tcd, i, j)				       \
+    for (i = 0; cfs_trace_data[i] != NULL; i++)			   \
+	for (j = 0, ((tcd) = &(*cfs_trace_data[i])[j].tcd);	       \
+	     j < num_possible_cpus();				 \
+	     j++, (tcd) = &(*cfs_trace_data[i])[j].tcd)
+
+#define cfs_tcd_for_each_type_lock(tcd, i, cpu)			   \
+    for (i = 0; cfs_trace_data[i] &&				      \
+	 (tcd = &(*cfs_trace_data[i])[cpu].tcd) &&			\
+	 cfs_trace_lock_tcd(tcd, 1); cfs_trace_unlock_tcd(tcd, 1), i++)
+
+/* XXX nikita: this declaration is internal to tracefile.c and should probably
+ * be moved there */
+struct page_collection {
+	struct list_head	pc_pages;
+	/*
+	 * spin-lock protecting ->pc_pages. It is taken by smp_call_function()
+	 * call-back functions. XXX nikita: Which is horrible: all processors
+	 * receive NMI at the same time only to be serialized by this
+	 * lock. Probably ->pc_pages should be replaced with an array of
+	 * NR_CPUS elements accessed locklessly.
+	 */
+	spinlock_t	pc_lock;
+	/*
+	 * if this flag is set, collect_pages() will spill both
+	 * ->tcd_daemon_pages and ->tcd_pages to the ->pc_pages. Otherwise,
+	 * only ->tcd_pages are spilled.
+	 */
+	int		pc_want_daemon_pages;
+};
+
+/* XXX nikita: this declaration is internal to tracefile.c and should probably
+ * be moved there */
+struct tracefiled_ctl {
+	struct completion	tctl_start;
+	struct completion	tctl_stop;
+	wait_queue_head_t		tctl_waitq;
+	pid_t			tctl_pid;
+	atomic_t		tctl_shutdown;
+};
+
+/*
+ * small data-structure for each page owned by tracefiled.
+ */
+/* XXX nikita: this declaration is internal to tracefile.c and should probably
+ * be moved there */
+struct cfs_trace_page {
+	/*
+	 * page itself
+	 */
+	struct page	  *page;
+	/*
+	 * linkage into one of the lists in trace_data_union or
+	 * page_collection
+	 */
+	struct list_head	   linkage;
+	/*
+	 * number of bytes used within this page
+	 */
+	unsigned int	 used;
+	/*
+	 * cpu that owns this page
+	 */
+	unsigned short       cpu;
+	/*
+	 * type(context) of this page
+	 */
+	unsigned short       type;
+};
+
+extern void cfs_set_ptldebug_header(struct ptldebug_header *header,
+				    struct libcfs_debug_msg_data *m,
+				    unsigned long stack);
+extern void cfs_print_to_console(struct ptldebug_header *hdr, int mask,
+				 const char *buf, int len, const char *file,
+				 const char *fn);
+
+extern int cfs_trace_lock_tcd(struct cfs_trace_cpu_data *tcd, int walking);
+extern void cfs_trace_unlock_tcd(struct cfs_trace_cpu_data *tcd, int walking);
+
+/**
+ * trace_buf_type_t, trace_buf_idx_get() and trace_console_buffers[][]
+ * are not public libcfs API; they should be defined in
+ * platform-specific tracefile include files
+ * (see, for example, linux-tracefile.h).
+ */
+
+extern char *cfs_trace_console_buffers[NR_CPUS][CFS_TCD_TYPE_MAX];
+extern cfs_trace_buf_type_t cfs_trace_buf_idx_get(void);
+
+static inline char *
+cfs_trace_get_console_buffer(void)
+{
+	unsigned int i = get_cpu();
+	unsigned int j = cfs_trace_buf_idx_get();
+
+	return cfs_trace_console_buffers[i][j];
+}
+
+static inline void
+cfs_trace_put_console_buffer(char *buffer)
+{
+	put_cpu();
+}
+
+static inline struct cfs_trace_cpu_data *
+cfs_trace_get_tcd(void)
+{
+	struct cfs_trace_cpu_data *tcd =
+		&(*cfs_trace_data[cfs_trace_buf_idx_get()])[get_cpu()].tcd;
+
+	cfs_trace_lock_tcd(tcd, 0);
+
+	return tcd;
+}
+
+static inline void
+cfs_trace_put_tcd (struct cfs_trace_cpu_data *tcd)
+{
+	cfs_trace_unlock_tcd(tcd, 0);
+
+	put_cpu();
+}
+
+int cfs_trace_refill_stock(struct cfs_trace_cpu_data *tcd, gfp_t gfp,
+			   struct list_head *stock);
+
+
+int cfs_tcd_owns_tage(struct cfs_trace_cpu_data *tcd,
+		      struct cfs_trace_page *tage);
+
+extern void cfs_trace_assertion_failed(const char *str,
+				       struct libcfs_debug_msg_data *m);
+
+/* ASSERTION that is safe to use within the debug system */
+#define __LASSERT(cond)						 \
+do {								    \
+	if (unlikely(!(cond))) {					\
+		LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_EMERG, NULL);     \
+		cfs_trace_assertion_failed("ASSERTION("#cond") failed", \
+					   &msgdata);		   \
+	}							       \
+} while (0)
+
+#define __LASSERT_TAGE_INVARIANT(tage)				  \
+do {								    \
+	__LASSERT(tage != NULL);					\
+	__LASSERT(tage->page != NULL);				  \
+	__LASSERT(tage->used <= PAGE_CACHE_SIZE);			 \
+	__LASSERT(page_count(tage->page) > 0);		      \
+} while (0)
+
+#endif	/* LUSTRE_TRACEFILE_PRIVATE */
+
+#endif /* __LIBCFS_TRACEFILE_H__ */
diff --git a/kernel/drivers/staging/lustre/lustre/libcfs/workitem.c b/kernel/drivers/staging/lustre/lustre/libcfs/workitem.c
new file mode 100644
index 000000000..48009b775
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lustre/libcfs/workitem.c
@@ -0,0 +1,479 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/workitem.c
+ *
+ * Author: Isaac Huang <isaac@clusterfs.com>
+ *	 Liang Zhen  <zhen.liang@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include "../../include/linux/libcfs/libcfs.h"
+
+#define CFS_WS_NAME_LEN	 16
+
+typedef struct cfs_wi_sched {
+	struct list_head		ws_list;	/* chain on global list */
+	/** serialised workitems */
+	spinlock_t		ws_lock;
+	/** where schedulers sleep */
+	wait_queue_head_t		ws_waitq;
+	/** concurrent workitems */
+	struct list_head		ws_runq;
+	/** rescheduled running-workitems, a workitem can be rescheduled
+	 * while running in wi_action(), but we don't to execute it again
+	 * unless it returns from wi_action(), so we put it on ws_rerunq
+	 * while rescheduling, and move it to runq after it returns
+	 * from wi_action() */
+	struct list_head		ws_rerunq;
+	/** CPT-table for this scheduler */
+	struct cfs_cpt_table	*ws_cptab;
+	/** CPT id for affinity */
+	int			ws_cpt;
+	/** number of scheduled workitems */
+	int			ws_nscheduled;
+	/** started scheduler thread, protected by cfs_wi_data::wi_glock */
+	unsigned int		ws_nthreads:30;
+	/** shutting down, protected by cfs_wi_data::wi_glock */
+	unsigned int		ws_stopping:1;
+	/** serialize starting thread, protected by cfs_wi_data::wi_glock */
+	unsigned int		ws_starting:1;
+	/** scheduler name */
+	char			ws_name[CFS_WS_NAME_LEN];
+} cfs_wi_sched_t;
+
+static struct cfs_workitem_data {
+	/** serialize */
+	spinlock_t		wi_glock;
+	/** list of all schedulers */
+	struct list_head		wi_scheds;
+	/** WI module is initialized */
+	int			wi_init;
+	/** shutting down the whole WI module */
+	int			wi_stopping;
+} cfs_wi_data;
+
+static inline void
+cfs_wi_sched_lock(cfs_wi_sched_t *sched)
+{
+	spin_lock(&sched->ws_lock);
+}
+
+static inline void
+cfs_wi_sched_unlock(cfs_wi_sched_t *sched)
+{
+	spin_unlock(&sched->ws_lock);
+}
+
+static inline int
+cfs_wi_sched_cansleep(cfs_wi_sched_t *sched)
+{
+	cfs_wi_sched_lock(sched);
+	if (sched->ws_stopping) {
+		cfs_wi_sched_unlock(sched);
+		return 0;
+	}
+
+	if (!list_empty(&sched->ws_runq)) {
+		cfs_wi_sched_unlock(sched);
+		return 0;
+	}
+	cfs_wi_sched_unlock(sched);
+	return 1;
+}
+
+
+/* XXX:
+ * 0. it only works when called from wi->wi_action.
+ * 1. when it returns no one shall try to schedule the workitem.
+ */
+void
+cfs_wi_exit(struct cfs_wi_sched *sched, cfs_workitem_t *wi)
+{
+	LASSERT(!in_interrupt()); /* because we use plain spinlock */
+	LASSERT(!sched->ws_stopping);
+
+	cfs_wi_sched_lock(sched);
+
+	LASSERT(wi->wi_running);
+	if (wi->wi_scheduled) { /* cancel pending schedules */
+		LASSERT(!list_empty(&wi->wi_list));
+		list_del_init(&wi->wi_list);
+
+		LASSERT(sched->ws_nscheduled > 0);
+		sched->ws_nscheduled--;
+	}
+
+	LASSERT(list_empty(&wi->wi_list));
+
+	wi->wi_scheduled = 1; /* LBUG future schedule attempts */
+	cfs_wi_sched_unlock(sched);
+
+	return;
+}
+EXPORT_SYMBOL(cfs_wi_exit);
+
+/**
+ * cancel schedule request of workitem \a wi
+ */
+int
+cfs_wi_deschedule(struct cfs_wi_sched *sched, cfs_workitem_t *wi)
+{
+	int	rc;
+
+	LASSERT(!in_interrupt()); /* because we use plain spinlock */
+	LASSERT(!sched->ws_stopping);
+
+	/*
+	 * return 0 if it's running already, otherwise return 1, which
+	 * means the workitem will not be scheduled and will not have
+	 * any race with wi_action.
+	 */
+	cfs_wi_sched_lock(sched);
+
+	rc = !(wi->wi_running);
+
+	if (wi->wi_scheduled) { /* cancel pending schedules */
+		LASSERT(!list_empty(&wi->wi_list));
+		list_del_init(&wi->wi_list);
+
+		LASSERT(sched->ws_nscheduled > 0);
+		sched->ws_nscheduled--;
+
+		wi->wi_scheduled = 0;
+	}
+
+	LASSERT (list_empty(&wi->wi_list));
+
+	cfs_wi_sched_unlock(sched);
+	return rc;
+}
+EXPORT_SYMBOL(cfs_wi_deschedule);
+
+/*
+ * Workitem scheduled with (serial == 1) is strictly serialised not only with
+ * itself, but also with others scheduled this way.
+ *
+ * Now there's only one static serialised queue, but in the future more might
+ * be added, and even dynamic creation of serialised queues might be supported.
+ */
+void
+cfs_wi_schedule(struct cfs_wi_sched *sched, cfs_workitem_t *wi)
+{
+	LASSERT(!in_interrupt()); /* because we use plain spinlock */
+	LASSERT(!sched->ws_stopping);
+
+	cfs_wi_sched_lock(sched);
+
+	if (!wi->wi_scheduled) {
+		LASSERT (list_empty(&wi->wi_list));
+
+		wi->wi_scheduled = 1;
+		sched->ws_nscheduled++;
+		if (!wi->wi_running) {
+			list_add_tail(&wi->wi_list, &sched->ws_runq);
+			wake_up(&sched->ws_waitq);
+		} else {
+			list_add(&wi->wi_list, &sched->ws_rerunq);
+		}
+	}
+
+	LASSERT (!list_empty(&wi->wi_list));
+	cfs_wi_sched_unlock(sched);
+	return;
+}
+EXPORT_SYMBOL(cfs_wi_schedule);
+
+
+static int
+cfs_wi_scheduler (void *arg)
+{
+	struct cfs_wi_sched	*sched = (cfs_wi_sched_t *)arg;
+
+	cfs_block_allsigs();
+
+	/* CPT affinity scheduler? */
+	if (sched->ws_cptab != NULL)
+		cfs_cpt_bind(sched->ws_cptab, sched->ws_cpt);
+
+	spin_lock(&cfs_wi_data.wi_glock);
+
+	LASSERT(sched->ws_starting == 1);
+	sched->ws_starting--;
+	sched->ws_nthreads++;
+
+	spin_unlock(&cfs_wi_data.wi_glock);
+
+	cfs_wi_sched_lock(sched);
+
+	while (!sched->ws_stopping) {
+		int	     nloops = 0;
+		int	     rc;
+		cfs_workitem_t *wi;
+
+		while (!list_empty(&sched->ws_runq) &&
+		       nloops < CFS_WI_RESCHED) {
+			wi = list_entry(sched->ws_runq.next,
+					    cfs_workitem_t, wi_list);
+			LASSERT(wi->wi_scheduled && !wi->wi_running);
+
+			list_del_init(&wi->wi_list);
+
+			LASSERT(sched->ws_nscheduled > 0);
+			sched->ws_nscheduled--;
+
+			wi->wi_running   = 1;
+			wi->wi_scheduled = 0;
+
+
+			cfs_wi_sched_unlock(sched);
+			nloops++;
+
+			rc = (*wi->wi_action) (wi);
+
+			cfs_wi_sched_lock(sched);
+			if (rc != 0) /* WI should be dead, even be freed! */
+				continue;
+
+			wi->wi_running = 0;
+			if (list_empty(&wi->wi_list))
+				continue;
+
+			LASSERT(wi->wi_scheduled);
+			/* wi is rescheduled, should be on rerunq now, we
+			 * move it to runq so it can run action now */
+			list_move_tail(&wi->wi_list, &sched->ws_runq);
+		}
+
+		if (!list_empty(&sched->ws_runq)) {
+			cfs_wi_sched_unlock(sched);
+			/* don't sleep because some workitems still
+			 * expect me to come back soon */
+			cond_resched();
+			cfs_wi_sched_lock(sched);
+			continue;
+		}
+
+		cfs_wi_sched_unlock(sched);
+		rc = wait_event_interruptible_exclusive(sched->ws_waitq,
+						!cfs_wi_sched_cansleep(sched));
+		cfs_wi_sched_lock(sched);
+	}
+
+	cfs_wi_sched_unlock(sched);
+
+	spin_lock(&cfs_wi_data.wi_glock);
+	sched->ws_nthreads--;
+	spin_unlock(&cfs_wi_data.wi_glock);
+
+	return 0;
+}
+
+
+void
+cfs_wi_sched_destroy(struct cfs_wi_sched *sched)
+{
+	int	i;
+
+	LASSERT(cfs_wi_data.wi_init);
+	LASSERT(!cfs_wi_data.wi_stopping);
+
+	spin_lock(&cfs_wi_data.wi_glock);
+	if (sched->ws_stopping) {
+		CDEBUG(D_INFO, "%s is in progress of stopping\n",
+		       sched->ws_name);
+		spin_unlock(&cfs_wi_data.wi_glock);
+		return;
+	}
+
+	LASSERT(!list_empty(&sched->ws_list));
+	sched->ws_stopping = 1;
+
+	spin_unlock(&cfs_wi_data.wi_glock);
+
+	i = 2;
+	wake_up_all(&sched->ws_waitq);
+
+	spin_lock(&cfs_wi_data.wi_glock);
+	while (sched->ws_nthreads > 0) {
+		CDEBUG(IS_PO2(++i) ? D_WARNING : D_NET,
+		       "waiting for %d threads of WI sched[%s] to terminate\n",
+		       sched->ws_nthreads, sched->ws_name);
+
+		spin_unlock(&cfs_wi_data.wi_glock);
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		schedule_timeout(cfs_time_seconds(1) / 20);
+		spin_lock(&cfs_wi_data.wi_glock);
+	}
+
+	list_del(&sched->ws_list);
+
+	spin_unlock(&cfs_wi_data.wi_glock);
+	LASSERT(sched->ws_nscheduled == 0);
+
+	LIBCFS_FREE(sched, sizeof(*sched));
+}
+EXPORT_SYMBOL(cfs_wi_sched_destroy);
+
+int
+cfs_wi_sched_create(char *name, struct cfs_cpt_table *cptab,
+		    int cpt, int nthrs, struct cfs_wi_sched **sched_pp)
+{
+	struct cfs_wi_sched	*sched;
+	int			rc;
+
+	LASSERT(cfs_wi_data.wi_init);
+	LASSERT(!cfs_wi_data.wi_stopping);
+	LASSERT(cptab == NULL || cpt == CFS_CPT_ANY ||
+		(cpt >= 0 && cpt < cfs_cpt_number(cptab)));
+
+	LIBCFS_ALLOC(sched, sizeof(*sched));
+	if (sched == NULL)
+		return -ENOMEM;
+
+	strncpy(sched->ws_name, name, CFS_WS_NAME_LEN);
+	sched->ws_name[CFS_WS_NAME_LEN - 1] = '\0';
+	sched->ws_cptab = cptab;
+	sched->ws_cpt = cpt;
+
+	spin_lock_init(&sched->ws_lock);
+	init_waitqueue_head(&sched->ws_waitq);
+	INIT_LIST_HEAD(&sched->ws_runq);
+	INIT_LIST_HEAD(&sched->ws_rerunq);
+	INIT_LIST_HEAD(&sched->ws_list);
+
+	rc = 0;
+	while (nthrs > 0)  {
+		char	name[16];
+		struct task_struct *task;
+
+		spin_lock(&cfs_wi_data.wi_glock);
+		while (sched->ws_starting > 0) {
+			spin_unlock(&cfs_wi_data.wi_glock);
+			schedule();
+			spin_lock(&cfs_wi_data.wi_glock);
+		}
+
+		sched->ws_starting++;
+		spin_unlock(&cfs_wi_data.wi_glock);
+
+		if (sched->ws_cptab != NULL && sched->ws_cpt >= 0) {
+			snprintf(name, sizeof(name), "%s_%02d_%02u",
+				 sched->ws_name, sched->ws_cpt,
+				 sched->ws_nthreads);
+		} else {
+			snprintf(name, sizeof(name), "%s_%02u",
+				 sched->ws_name, sched->ws_nthreads);
+		}
+
+		task = kthread_run(cfs_wi_scheduler, sched, "%s", name);
+		if (!IS_ERR(task)) {
+			nthrs--;
+			continue;
+		}
+		rc = PTR_ERR(task);
+
+		CERROR("Failed to create thread for WI scheduler %s: %d\n",
+		       name, rc);
+
+		spin_lock(&cfs_wi_data.wi_glock);
+
+		/* make up for cfs_wi_sched_destroy */
+		list_add(&sched->ws_list, &cfs_wi_data.wi_scheds);
+		sched->ws_starting--;
+
+		spin_unlock(&cfs_wi_data.wi_glock);
+
+		cfs_wi_sched_destroy(sched);
+		return rc;
+	}
+	spin_lock(&cfs_wi_data.wi_glock);
+	list_add(&sched->ws_list, &cfs_wi_data.wi_scheds);
+	spin_unlock(&cfs_wi_data.wi_glock);
+
+	*sched_pp = sched;
+	return 0;
+}
+EXPORT_SYMBOL(cfs_wi_sched_create);
+
+int
+cfs_wi_startup(void)
+{
+	memset(&cfs_wi_data, 0, sizeof(cfs_wi_data));
+
+	spin_lock_init(&cfs_wi_data.wi_glock);
+	INIT_LIST_HEAD(&cfs_wi_data.wi_scheds);
+	cfs_wi_data.wi_init = 1;
+
+	return 0;
+}
+
+void
+cfs_wi_shutdown(void)
+{
+	struct cfs_wi_sched	*sched;
+
+	spin_lock(&cfs_wi_data.wi_glock);
+	cfs_wi_data.wi_stopping = 1;
+	spin_unlock(&cfs_wi_data.wi_glock);
+
+	/* nobody should contend on this list */
+	list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) {
+		sched->ws_stopping = 1;
+		wake_up_all(&sched->ws_waitq);
+	}
+
+	list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) {
+		spin_lock(&cfs_wi_data.wi_glock);
+
+		while (sched->ws_nthreads != 0) {
+			spin_unlock(&cfs_wi_data.wi_glock);
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			schedule_timeout(cfs_time_seconds(1) / 20);
+			spin_lock(&cfs_wi_data.wi_glock);
+		}
+		spin_unlock(&cfs_wi_data.wi_glock);
+	}
+	while (!list_empty(&cfs_wi_data.wi_scheds)) {
+		sched = list_entry(cfs_wi_data.wi_scheds.next,
+				       struct cfs_wi_sched, ws_list);
+		list_del(&sched->ws_list);
+		LIBCFS_FREE(sched, sizeof(*sched));
+	}
+
+	cfs_wi_data.wi_stopping = 0;
+	cfs_wi_data.wi_init = 0;
+}