summaryrefslogtreecommitdiffstats
path: root/kernel/drivers/staging/lustre/lnet
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/drivers/staging/lustre/lnet')
-rw-r--r--kernel/drivers/staging/lustre/lnet/Kconfig40
-rw-r--r--kernel/drivers/staging/lustre/lnet/Makefile1
-rw-r--r--kernel/drivers/staging/lustre/lnet/klnds/Makefile1
-rw-r--r--kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/Makefile2
-rw-r--r--kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c3118
-rw-r--r--kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h1030
-rw-r--r--kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c3519
-rw-r--r--kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c230
-rw-r--r--kernel/drivers/staging/lustre/lnet/klnds/socklnd/Makefile3
-rw-r--r--kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c2886
-rw-r--r--kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h588
-rw-r--r--kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c2634
-rw-r--r--kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c714
-rw-r--r--kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.h86
-rw-r--r--kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c188
-rw-r--r--kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c797
-rw-r--r--kernel/drivers/staging/lustre/lnet/lnet/Makefile5
-rw-r--r--kernel/drivers/staging/lustre/lnet/lnet/acceptor.c500
-rw-r--r--kernel/drivers/staging/lustre/lnet/lnet/api-ni.c1940
-rw-r--r--kernel/drivers/staging/lustre/lnet/lnet/config.c1292
-rw-r--r--kernel/drivers/staging/lustre/lnet/lnet/lib-eq.c441
-rw-r--r--kernel/drivers/staging/lustre/lnet/lnet/lib-md.c454
-rw-r--r--kernel/drivers/staging/lustre/lnet/lnet/lib-me.c298
-rw-r--r--kernel/drivers/staging/lustre/lnet/lnet/lib-move.c2460
-rw-r--r--kernel/drivers/staging/lustre/lnet/lnet/lib-msg.c647
-rw-r--r--kernel/drivers/staging/lustre/lnet/lnet/lib-ptl.c935
-rw-r--r--kernel/drivers/staging/lustre/lnet/lnet/lo.c120
-rw-r--r--kernel/drivers/staging/lustre/lnet/lnet/module.c155
-rw-r--r--kernel/drivers/staging/lustre/lnet/lnet/peer.c338
-rw-r--r--kernel/drivers/staging/lustre/lnet/lnet/router.c1706
-rw-r--r--kernel/drivers/staging/lustre/lnet/lnet/router_proc.c968
-rw-r--r--kernel/drivers/staging/lustre/lnet/selftest/Makefile4
-rw-r--r--kernel/drivers/staging/lustre/lnet/selftest/brw_test.c508
-rw-r--r--kernel/drivers/staging/lustre/lnet/selftest/conctl.c929
-rw-r--r--kernel/drivers/staging/lustre/lnet/selftest/conrpc.c1396
-rw-r--r--kernel/drivers/staging/lustre/lnet/selftest/conrpc.h146
-rw-r--r--kernel/drivers/staging/lustre/lnet/selftest/console.c2096
-rw-r--r--kernel/drivers/staging/lustre/lnet/selftest/console.h235
-rw-r--r--kernel/drivers/staging/lustre/lnet/selftest/framework.c1804
-rw-r--r--kernel/drivers/staging/lustre/lnet/selftest/module.c159
-rw-r--r--kernel/drivers/staging/lustre/lnet/selftest/ping_test.c230
-rw-r--r--kernel/drivers/staging/lustre/lnet/selftest/rpc.c1673
-rw-r--r--kernel/drivers/staging/lustre/lnet/selftest/rpc.h302
-rw-r--r--kernel/drivers/staging/lustre/lnet/selftest/selftest.h624
-rw-r--r--kernel/drivers/staging/lustre/lnet/selftest/timer.c248
-rw-r--r--kernel/drivers/staging/lustre/lnet/selftest/timer.h53
46 files changed, 38503 insertions, 0 deletions
diff --git a/kernel/drivers/staging/lustre/lnet/Kconfig b/kernel/drivers/staging/lustre/lnet/Kconfig
new file mode 100644
index 000000000..00850eeb6
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lnet/Kconfig
@@ -0,0 +1,40 @@
+config LNET
+ tristate "Lustre networking subsystem"
+ depends on LUSTRE_FS
+
+config LNET_MAX_PAYLOAD
+ int "Lustre lnet max transfer payload (default 2MB)"
+ depends on LUSTRE_FS
+ default "1048576"
+ help
+ This option defines the maximum size of payload in bytes that lnet
+ can put into its transport.
+
+ If unsure, use default.
+
+config LNET_SELFTEST
+ tristate "Lustre networking self testing"
+ depends on LNET
+ help
+ Choose Y here if you want to do lnet self testing. To compile this
+ as a module, choose M here: the module will be called lnet_selftest.
+
+ To compile this as a kernel modules, choose M here and it will be
+ called lnet_selftest.
+
+ If unsure, say N.
+
+ See also http://wiki.lustre.org/
+
+config LNET_XPRT_IB
+ tristate "LNET infiniband support"
+ depends on LNET && INFINIBAND && INFINIBAND_ADDR_TRANS
+ default LNET && INFINIBAND
+ help
+ This option allows the LNET users to use infiniband as an
+ RDMA-enabled transport.
+
+ To compile this as a kernel module, choose M here and it will be
+ called ko2iblnd.
+
+ If unsure, say N.
diff --git a/kernel/drivers/staging/lustre/lnet/Makefile b/kernel/drivers/staging/lustre/lnet/Makefile
new file mode 100644
index 000000000..f6f03e304
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lnet/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_LNET) += lnet/ klnds/ selftest/
diff --git a/kernel/drivers/staging/lustre/lnet/klnds/Makefile b/kernel/drivers/staging/lustre/lnet/klnds/Makefile
new file mode 100644
index 000000000..c23e4f67f
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lnet/klnds/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_LNET) += o2iblnd/ socklnd/
diff --git a/kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/Makefile b/kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/Makefile
new file mode 100644
index 000000000..e0a7aa72b
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/Makefile
@@ -0,0 +1,2 @@
+obj-$(CONFIG_LNET_XPRT_IB) += ko2iblnd.o
+ko2iblnd-y := o2iblnd.o o2iblnd_cb.o o2iblnd_modparams.o
diff --git a/kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c b/kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c
new file mode 100644
index 000000000..3bad441de
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c
@@ -0,0 +1,3118 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/klnds/o2iblnd/o2iblnd.c
+ *
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ */
+
+#include "o2iblnd.h"
+#include <asm/div64.h>
+
+static lnd_t the_o2iblnd = {
+ .lnd_type = O2IBLND,
+ .lnd_startup = kiblnd_startup,
+ .lnd_shutdown = kiblnd_shutdown,
+ .lnd_ctl = kiblnd_ctl,
+ .lnd_query = kiblnd_query,
+ .lnd_send = kiblnd_send,
+ .lnd_recv = kiblnd_recv,
+};
+
+kib_data_t kiblnd_data;
+
+static __u32 kiblnd_cksum(void *ptr, int nob)
+{
+ char *c = ptr;
+ __u32 sum = 0;
+
+ while (nob-- > 0)
+ sum = ((sum << 1) | (sum >> 31)) + *c++;
+
+ /* ensure I don't return 0 (== no checksum) */
+ return (sum == 0) ? 1 : sum;
+}
+
+static char *kiblnd_msgtype2str(int type)
+{
+ switch (type) {
+ case IBLND_MSG_CONNREQ:
+ return "CONNREQ";
+
+ case IBLND_MSG_CONNACK:
+ return "CONNACK";
+
+ case IBLND_MSG_NOOP:
+ return "NOOP";
+
+ case IBLND_MSG_IMMEDIATE:
+ return "IMMEDIATE";
+
+ case IBLND_MSG_PUT_REQ:
+ return "PUT_REQ";
+
+ case IBLND_MSG_PUT_NAK:
+ return "PUT_NAK";
+
+ case IBLND_MSG_PUT_ACK:
+ return "PUT_ACK";
+
+ case IBLND_MSG_PUT_DONE:
+ return "PUT_DONE";
+
+ case IBLND_MSG_GET_REQ:
+ return "GET_REQ";
+
+ case IBLND_MSG_GET_DONE:
+ return "GET_DONE";
+
+ default:
+ return "???";
+ }
+}
+
+static int kiblnd_msgtype2size(int type)
+{
+ const int hdr_size = offsetof(kib_msg_t, ibm_u);
+
+ switch (type) {
+ case IBLND_MSG_CONNREQ:
+ case IBLND_MSG_CONNACK:
+ return hdr_size + sizeof(kib_connparams_t);
+
+ case IBLND_MSG_NOOP:
+ return hdr_size;
+
+ case IBLND_MSG_IMMEDIATE:
+ return offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0]);
+
+ case IBLND_MSG_PUT_REQ:
+ return hdr_size + sizeof(kib_putreq_msg_t);
+
+ case IBLND_MSG_PUT_ACK:
+ return hdr_size + sizeof(kib_putack_msg_t);
+
+ case IBLND_MSG_GET_REQ:
+ return hdr_size + sizeof(kib_get_msg_t);
+
+ case IBLND_MSG_PUT_NAK:
+ case IBLND_MSG_PUT_DONE:
+ case IBLND_MSG_GET_DONE:
+ return hdr_size + sizeof(kib_completion_msg_t);
+ default:
+ return -1;
+ }
+}
+
+static int kiblnd_unpack_rd(kib_msg_t *msg, int flip)
+{
+ kib_rdma_desc_t *rd;
+ int nob;
+ int n;
+ int i;
+
+ LASSERT(msg->ibm_type == IBLND_MSG_GET_REQ ||
+ msg->ibm_type == IBLND_MSG_PUT_ACK);
+
+ rd = msg->ibm_type == IBLND_MSG_GET_REQ ?
+ &msg->ibm_u.get.ibgm_rd :
+ &msg->ibm_u.putack.ibpam_rd;
+
+ if (flip) {
+ __swab32s(&rd->rd_key);
+ __swab32s(&rd->rd_nfrags);
+ }
+
+ n = rd->rd_nfrags;
+
+ if (n <= 0 || n > IBLND_MAX_RDMA_FRAGS) {
+ CERROR("Bad nfrags: %d, should be 0 < n <= %d\n",
+ n, IBLND_MAX_RDMA_FRAGS);
+ return 1;
+ }
+
+ nob = offsetof(kib_msg_t, ibm_u) +
+ kiblnd_rd_msg_size(rd, msg->ibm_type, n);
+
+ if (msg->ibm_nob < nob) {
+ CERROR("Short %s: %d(%d)\n",
+ kiblnd_msgtype2str(msg->ibm_type), msg->ibm_nob, nob);
+ return 1;
+ }
+
+ if (!flip)
+ return 0;
+
+ for (i = 0; i < n; i++) {
+ __swab32s(&rd->rd_frags[i].rf_nob);
+ __swab64s(&rd->rd_frags[i].rf_addr);
+ }
+
+ return 0;
+}
+
+void kiblnd_pack_msg(lnet_ni_t *ni, kib_msg_t *msg, int version,
+ int credits, lnet_nid_t dstnid, __u64 dststamp)
+{
+ kib_net_t *net = ni->ni_data;
+
+ /* CAVEAT EMPTOR! all message fields not set here should have been
+ * initialised previously. */
+ msg->ibm_magic = IBLND_MSG_MAGIC;
+ msg->ibm_version = version;
+ /* ibm_type */
+ msg->ibm_credits = credits;
+ /* ibm_nob */
+ msg->ibm_cksum = 0;
+ msg->ibm_srcnid = ni->ni_nid;
+ msg->ibm_srcstamp = net->ibn_incarnation;
+ msg->ibm_dstnid = dstnid;
+ msg->ibm_dststamp = dststamp;
+
+ if (*kiblnd_tunables.kib_cksum) {
+ /* NB ibm_cksum zero while computing cksum */
+ msg->ibm_cksum = kiblnd_cksum(msg, msg->ibm_nob);
+ }
+}
+
+int kiblnd_unpack_msg(kib_msg_t *msg, int nob)
+{
+ const int hdr_size = offsetof(kib_msg_t, ibm_u);
+ __u32 msg_cksum;
+ __u16 version;
+ int msg_nob;
+ int flip;
+
+ /* 6 bytes are enough to have received magic + version */
+ if (nob < 6) {
+ CERROR("Short message: %d\n", nob);
+ return -EPROTO;
+ }
+
+ if (msg->ibm_magic == IBLND_MSG_MAGIC) {
+ flip = 0;
+ } else if (msg->ibm_magic == __swab32(IBLND_MSG_MAGIC)) {
+ flip = 1;
+ } else {
+ CERROR("Bad magic: %08x\n", msg->ibm_magic);
+ return -EPROTO;
+ }
+
+ version = flip ? __swab16(msg->ibm_version) : msg->ibm_version;
+ if (version != IBLND_MSG_VERSION &&
+ version != IBLND_MSG_VERSION_1) {
+ CERROR("Bad version: %x\n", version);
+ return -EPROTO;
+ }
+
+ if (nob < hdr_size) {
+ CERROR("Short message: %d\n", nob);
+ return -EPROTO;
+ }
+
+ msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob;
+ if (msg_nob > nob) {
+ CERROR("Short message: got %d, wanted %d\n", nob, msg_nob);
+ return -EPROTO;
+ }
+
+ /* checksum must be computed with ibm_cksum zero and BEFORE anything
+ * gets flipped */
+ msg_cksum = flip ? __swab32(msg->ibm_cksum) : msg->ibm_cksum;
+ msg->ibm_cksum = 0;
+ if (msg_cksum != 0 &&
+ msg_cksum != kiblnd_cksum(msg, msg_nob)) {
+ CERROR("Bad checksum\n");
+ return -EPROTO;
+ }
+
+ msg->ibm_cksum = msg_cksum;
+
+ if (flip) {
+ /* leave magic unflipped as a clue to peer endianness */
+ msg->ibm_version = version;
+ CLASSERT(sizeof(msg->ibm_type) == 1);
+ CLASSERT(sizeof(msg->ibm_credits) == 1);
+ msg->ibm_nob = msg_nob;
+ __swab64s(&msg->ibm_srcnid);
+ __swab64s(&msg->ibm_srcstamp);
+ __swab64s(&msg->ibm_dstnid);
+ __swab64s(&msg->ibm_dststamp);
+ }
+
+ if (msg->ibm_srcnid == LNET_NID_ANY) {
+ CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid));
+ return -EPROTO;
+ }
+
+ if (msg_nob < kiblnd_msgtype2size(msg->ibm_type)) {
+ CERROR("Short %s: %d(%d)\n", kiblnd_msgtype2str(msg->ibm_type),
+ msg_nob, kiblnd_msgtype2size(msg->ibm_type));
+ return -EPROTO;
+ }
+
+ switch (msg->ibm_type) {
+ default:
+ CERROR("Unknown message type %x\n", msg->ibm_type);
+ return -EPROTO;
+
+ case IBLND_MSG_NOOP:
+ case IBLND_MSG_IMMEDIATE:
+ case IBLND_MSG_PUT_REQ:
+ break;
+
+ case IBLND_MSG_PUT_ACK:
+ case IBLND_MSG_GET_REQ:
+ if (kiblnd_unpack_rd(msg, flip))
+ return -EPROTO;
+ break;
+
+ case IBLND_MSG_PUT_NAK:
+ case IBLND_MSG_PUT_DONE:
+ case IBLND_MSG_GET_DONE:
+ if (flip)
+ __swab32s(&msg->ibm_u.completion.ibcm_status);
+ break;
+
+ case IBLND_MSG_CONNREQ:
+ case IBLND_MSG_CONNACK:
+ if (flip) {
+ __swab16s(&msg->ibm_u.connparams.ibcp_queue_depth);
+ __swab16s(&msg->ibm_u.connparams.ibcp_max_frags);
+ __swab32s(&msg->ibm_u.connparams.ibcp_max_msg_size);
+ }
+ break;
+ }
+ return 0;
+}
+
+int kiblnd_create_peer(lnet_ni_t *ni, kib_peer_t **peerp, lnet_nid_t nid)
+{
+ kib_peer_t *peer;
+ kib_net_t *net = ni->ni_data;
+ int cpt = lnet_cpt_of_nid(nid);
+ unsigned long flags;
+
+ LASSERT(net != NULL);
+ LASSERT(nid != LNET_NID_ANY);
+
+ LIBCFS_CPT_ALLOC(peer, lnet_cpt_table(), cpt, sizeof(*peer));
+ if (peer == NULL) {
+ CERROR("Cannot allocate peer\n");
+ return -ENOMEM;
+ }
+
+ memset(peer, 0, sizeof(*peer)); /* zero flags etc */
+
+ peer->ibp_ni = ni;
+ peer->ibp_nid = nid;
+ peer->ibp_error = 0;
+ peer->ibp_last_alive = 0;
+ atomic_set(&peer->ibp_refcount, 1); /* 1 ref for caller */
+
+ INIT_LIST_HEAD(&peer->ibp_list); /* not in the peer table yet */
+ INIT_LIST_HEAD(&peer->ibp_conns);
+ INIT_LIST_HEAD(&peer->ibp_tx_queue);
+
+ write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+ /* always called with a ref on ni, which prevents ni being shutdown */
+ LASSERT(net->ibn_shutdown == 0);
+
+ /* npeers only grows with the global lock held */
+ atomic_inc(&net->ibn_npeers);
+
+ write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+ *peerp = peer;
+ return 0;
+}
+
+void kiblnd_destroy_peer(kib_peer_t *peer)
+{
+ kib_net_t *net = peer->ibp_ni->ni_data;
+
+ LASSERT(net != NULL);
+ LASSERT(atomic_read(&peer->ibp_refcount) == 0);
+ LASSERT(!kiblnd_peer_active(peer));
+ LASSERT(peer->ibp_connecting == 0);
+ LASSERT(peer->ibp_accepting == 0);
+ LASSERT(list_empty(&peer->ibp_conns));
+ LASSERT(list_empty(&peer->ibp_tx_queue));
+
+ LIBCFS_FREE(peer, sizeof(*peer));
+
+ /* NB a peer's connections keep a reference on their peer until
+ * they are destroyed, so we can be assured that _all_ state to do
+ * with this peer has been cleaned up when its refcount drops to
+ * zero. */
+ atomic_dec(&net->ibn_npeers);
+}
+
+kib_peer_t *kiblnd_find_peer_locked(lnet_nid_t nid)
+{
+ /* the caller is responsible for accounting the additional reference
+ * that this creates */
+ struct list_head *peer_list = kiblnd_nid2peerlist(nid);
+ struct list_head *tmp;
+ kib_peer_t *peer;
+
+ list_for_each(tmp, peer_list) {
+
+ peer = list_entry(tmp, kib_peer_t, ibp_list);
+
+ LASSERT(peer->ibp_connecting > 0 || /* creating conns */
+ peer->ibp_accepting > 0 ||
+ !list_empty(&peer->ibp_conns)); /* active conn */
+
+ if (peer->ibp_nid != nid)
+ continue;
+
+ CDEBUG(D_NET, "got peer [%p] -> %s (%d) version: %x\n",
+ peer, libcfs_nid2str(nid),
+ atomic_read(&peer->ibp_refcount),
+ peer->ibp_version);
+ return peer;
+ }
+ return NULL;
+}
+
+void kiblnd_unlink_peer_locked(kib_peer_t *peer)
+{
+ LASSERT(list_empty(&peer->ibp_conns));
+
+ LASSERT(kiblnd_peer_active(peer));
+ list_del_init(&peer->ibp_list);
+ /* lose peerlist's ref */
+ kiblnd_peer_decref(peer);
+}
+
+static int kiblnd_get_peer_info(lnet_ni_t *ni, int index,
+ lnet_nid_t *nidp, int *count)
+{
+ kib_peer_t *peer;
+ struct list_head *ptmp;
+ int i;
+ unsigned long flags;
+
+ read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+ for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) {
+
+ list_for_each(ptmp, &kiblnd_data.kib_peers[i]) {
+
+ peer = list_entry(ptmp, kib_peer_t, ibp_list);
+ LASSERT(peer->ibp_connecting > 0 ||
+ peer->ibp_accepting > 0 ||
+ !list_empty(&peer->ibp_conns));
+
+ if (peer->ibp_ni != ni)
+ continue;
+
+ if (index-- > 0)
+ continue;
+
+ *nidp = peer->ibp_nid;
+ *count = atomic_read(&peer->ibp_refcount);
+
+ read_unlock_irqrestore(&kiblnd_data.kib_global_lock,
+ flags);
+ return 0;
+ }
+ }
+
+ read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+ return -ENOENT;
+}
+
+static void kiblnd_del_peer_locked(kib_peer_t *peer)
+{
+ struct list_head *ctmp;
+ struct list_head *cnxt;
+ kib_conn_t *conn;
+
+ if (list_empty(&peer->ibp_conns)) {
+ kiblnd_unlink_peer_locked(peer);
+ } else {
+ list_for_each_safe(ctmp, cnxt, &peer->ibp_conns) {
+ conn = list_entry(ctmp, kib_conn_t, ibc_list);
+
+ kiblnd_close_conn_locked(conn, 0);
+ }
+ /* NB closing peer's last conn unlinked it. */
+ }
+ /* NB peer now unlinked; might even be freed if the peer table had the
+ * last ref on it. */
+}
+
+static int kiblnd_del_peer(lnet_ni_t *ni, lnet_nid_t nid)
+{
+ LIST_HEAD(zombies);
+ struct list_head *ptmp;
+ struct list_head *pnxt;
+ kib_peer_t *peer;
+ int lo;
+ int hi;
+ int i;
+ unsigned long flags;
+ int rc = -ENOENT;
+
+ write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+ if (nid != LNET_NID_ANY) {
+ lo = hi = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers;
+ } else {
+ lo = 0;
+ hi = kiblnd_data.kib_peer_hash_size - 1;
+ }
+
+ for (i = lo; i <= hi; i++) {
+ list_for_each_safe(ptmp, pnxt, &kiblnd_data.kib_peers[i]) {
+ peer = list_entry(ptmp, kib_peer_t, ibp_list);
+ LASSERT(peer->ibp_connecting > 0 ||
+ peer->ibp_accepting > 0 ||
+ !list_empty(&peer->ibp_conns));
+
+ if (peer->ibp_ni != ni)
+ continue;
+
+ if (!(nid == LNET_NID_ANY || peer->ibp_nid == nid))
+ continue;
+
+ if (!list_empty(&peer->ibp_tx_queue)) {
+ LASSERT(list_empty(&peer->ibp_conns));
+
+ list_splice_init(&peer->ibp_tx_queue,
+ &zombies);
+ }
+
+ kiblnd_del_peer_locked(peer);
+ rc = 0; /* matched something */
+ }
+ }
+
+ write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+ kiblnd_txlist_done(ni, &zombies, -EIO);
+
+ return rc;
+}
+
+static kib_conn_t *kiblnd_get_conn_by_idx(lnet_ni_t *ni, int index)
+{
+ kib_peer_t *peer;
+ struct list_head *ptmp;
+ kib_conn_t *conn;
+ struct list_head *ctmp;
+ int i;
+ unsigned long flags;
+
+ read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+ for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) {
+ list_for_each(ptmp, &kiblnd_data.kib_peers[i]) {
+
+ peer = list_entry(ptmp, kib_peer_t, ibp_list);
+ LASSERT(peer->ibp_connecting > 0 ||
+ peer->ibp_accepting > 0 ||
+ !list_empty(&peer->ibp_conns));
+
+ if (peer->ibp_ni != ni)
+ continue;
+
+ list_for_each(ctmp, &peer->ibp_conns) {
+ if (index-- > 0)
+ continue;
+
+ conn = list_entry(ctmp, kib_conn_t,
+ ibc_list);
+ kiblnd_conn_addref(conn);
+ read_unlock_irqrestore(
+ &kiblnd_data.kib_global_lock,
+ flags);
+ return conn;
+ }
+ }
+ }
+
+ read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+ return NULL;
+}
+
+int kiblnd_translate_mtu(int value)
+{
+ switch (value) {
+ default:
+ return -1;
+ case 0:
+ return 0;
+ case 256:
+ return IB_MTU_256;
+ case 512:
+ return IB_MTU_512;
+ case 1024:
+ return IB_MTU_1024;
+ case 2048:
+ return IB_MTU_2048;
+ case 4096:
+ return IB_MTU_4096;
+ }
+}
+
+static void kiblnd_setup_mtu_locked(struct rdma_cm_id *cmid)
+{
+ int mtu;
+
+ /* XXX There is no path record for iWARP, set by netdev->change_mtu? */
+ if (cmid->route.path_rec == NULL)
+ return;
+
+ mtu = kiblnd_translate_mtu(*kiblnd_tunables.kib_ib_mtu);
+ LASSERT(mtu >= 0);
+ if (mtu != 0)
+ cmid->route.path_rec->mtu = mtu;
+}
+
+static int kiblnd_get_completion_vector(kib_conn_t *conn, int cpt)
+{
+ cpumask_t *mask;
+ int vectors;
+ int off;
+ int i;
+ lnet_nid_t nid = conn->ibc_peer->ibp_nid;
+
+ vectors = conn->ibc_cmid->device->num_comp_vectors;
+ if (vectors <= 1)
+ return 0;
+
+ mask = cfs_cpt_cpumask(lnet_cpt_table(), cpt);
+ if (mask == NULL)
+ return 0;
+
+ /* hash NID to CPU id in this partition... */
+ off = do_div(nid, cpumask_weight(mask));
+ for_each_cpu(i, mask) {
+ if (off-- == 0)
+ return i % vectors;
+ }
+
+ LBUG();
+ return 1;
+}
+
+kib_conn_t *kiblnd_create_conn(kib_peer_t *peer, struct rdma_cm_id *cmid,
+ int state, int version)
+{
+ /* CAVEAT EMPTOR:
+ * If the new conn is created successfully it takes over the caller's
+ * ref on 'peer'. It also "owns" 'cmid' and destroys it when it itself
+ * is destroyed. On failure, the caller's ref on 'peer' remains and
+ * she must dispose of 'cmid'. (Actually I'd block forever if I tried
+ * to destroy 'cmid' here since I'm called from the CM which still has
+ * its ref on 'cmid'). */
+ rwlock_t *glock = &kiblnd_data.kib_global_lock;
+ kib_net_t *net = peer->ibp_ni->ni_data;
+ kib_dev_t *dev;
+ struct ib_qp_init_attr *init_qp_attr;
+ struct kib_sched_info *sched;
+ kib_conn_t *conn;
+ struct ib_cq *cq;
+ unsigned long flags;
+ int cpt;
+ int rc;
+ int i;
+
+ LASSERT(net != NULL);
+ LASSERT(!in_interrupt());
+
+ dev = net->ibn_dev;
+
+ cpt = lnet_cpt_of_nid(peer->ibp_nid);
+ sched = kiblnd_data.kib_scheds[cpt];
+
+ LASSERT(sched->ibs_nthreads > 0);
+
+ LIBCFS_CPT_ALLOC(init_qp_attr, lnet_cpt_table(), cpt,
+ sizeof(*init_qp_attr));
+ if (init_qp_attr == NULL) {
+ CERROR("Can't allocate qp_attr for %s\n",
+ libcfs_nid2str(peer->ibp_nid));
+ goto failed_0;
+ }
+
+ LIBCFS_CPT_ALLOC(conn, lnet_cpt_table(), cpt, sizeof(*conn));
+ if (conn == NULL) {
+ CERROR("Can't allocate connection for %s\n",
+ libcfs_nid2str(peer->ibp_nid));
+ goto failed_1;
+ }
+
+ conn->ibc_state = IBLND_CONN_INIT;
+ conn->ibc_version = version;
+ conn->ibc_peer = peer; /* I take the caller's ref */
+ cmid->context = conn; /* for future CM callbacks */
+ conn->ibc_cmid = cmid;
+
+ INIT_LIST_HEAD(&conn->ibc_early_rxs);
+ INIT_LIST_HEAD(&conn->ibc_tx_noops);
+ INIT_LIST_HEAD(&conn->ibc_tx_queue);
+ INIT_LIST_HEAD(&conn->ibc_tx_queue_rsrvd);
+ INIT_LIST_HEAD(&conn->ibc_tx_queue_nocred);
+ INIT_LIST_HEAD(&conn->ibc_active_txs);
+ spin_lock_init(&conn->ibc_lock);
+
+ LIBCFS_CPT_ALLOC(conn->ibc_connvars, lnet_cpt_table(), cpt,
+ sizeof(*conn->ibc_connvars));
+ if (conn->ibc_connvars == NULL) {
+ CERROR("Can't allocate in-progress connection state\n");
+ goto failed_2;
+ }
+
+ write_lock_irqsave(glock, flags);
+ if (dev->ibd_failover) {
+ write_unlock_irqrestore(glock, flags);
+ CERROR("%s: failover in progress\n", dev->ibd_ifname);
+ goto failed_2;
+ }
+
+ if (dev->ibd_hdev->ibh_ibdev != cmid->device) {
+ /* wakeup failover thread and teardown connection */
+ if (kiblnd_dev_can_failover(dev)) {
+ list_add_tail(&dev->ibd_fail_list,
+ &kiblnd_data.kib_failed_devs);
+ wake_up(&kiblnd_data.kib_failover_waitq);
+ }
+
+ write_unlock_irqrestore(glock, flags);
+ CERROR("cmid HCA(%s), kib_dev(%s) need failover\n",
+ cmid->device->name, dev->ibd_ifname);
+ goto failed_2;
+ }
+
+ kiblnd_hdev_addref_locked(dev->ibd_hdev);
+ conn->ibc_hdev = dev->ibd_hdev;
+
+ kiblnd_setup_mtu_locked(cmid);
+
+ write_unlock_irqrestore(glock, flags);
+
+ LIBCFS_CPT_ALLOC(conn->ibc_rxs, lnet_cpt_table(), cpt,
+ IBLND_RX_MSGS(version) * sizeof(kib_rx_t));
+ if (conn->ibc_rxs == NULL) {
+ CERROR("Cannot allocate RX buffers\n");
+ goto failed_2;
+ }
+
+ rc = kiblnd_alloc_pages(&conn->ibc_rx_pages, cpt,
+ IBLND_RX_MSG_PAGES(version));
+ if (rc != 0)
+ goto failed_2;
+
+ kiblnd_map_rx_descs(conn);
+
+ cq = ib_create_cq(cmid->device,
+ kiblnd_cq_completion, kiblnd_cq_event, conn,
+ IBLND_CQ_ENTRIES(version),
+ kiblnd_get_completion_vector(conn, cpt));
+ if (IS_ERR(cq)) {
+ CERROR("Can't create CQ: %ld, cqe: %d\n",
+ PTR_ERR(cq), IBLND_CQ_ENTRIES(version));
+ goto failed_2;
+ }
+
+ conn->ibc_cq = cq;
+
+ rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+ if (rc != 0) {
+ CERROR("Can't request completion notificiation: %d\n", rc);
+ goto failed_2;
+ }
+
+ init_qp_attr->event_handler = kiblnd_qp_event;
+ init_qp_attr->qp_context = conn;
+ init_qp_attr->cap.max_send_wr = IBLND_SEND_WRS(version);
+ init_qp_attr->cap.max_recv_wr = IBLND_RECV_WRS(version);
+ init_qp_attr->cap.max_send_sge = 1;
+ init_qp_attr->cap.max_recv_sge = 1;
+ init_qp_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
+ init_qp_attr->qp_type = IB_QPT_RC;
+ init_qp_attr->send_cq = cq;
+ init_qp_attr->recv_cq = cq;
+
+ conn->ibc_sched = sched;
+
+ rc = rdma_create_qp(cmid, conn->ibc_hdev->ibh_pd, init_qp_attr);
+ if (rc != 0) {
+ CERROR("Can't create QP: %d, send_wr: %d, recv_wr: %d\n",
+ rc, init_qp_attr->cap.max_send_wr,
+ init_qp_attr->cap.max_recv_wr);
+ goto failed_2;
+ }
+
+ LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr));
+
+ /* 1 ref for caller and each rxmsg */
+ atomic_set(&conn->ibc_refcount, 1 + IBLND_RX_MSGS(version));
+ conn->ibc_nrx = IBLND_RX_MSGS(version);
+
+ /* post receives */
+ for (i = 0; i < IBLND_RX_MSGS(version); i++) {
+ rc = kiblnd_post_rx(&conn->ibc_rxs[i],
+ IBLND_POSTRX_NO_CREDIT);
+ if (rc != 0) {
+ CERROR("Can't post rxmsg: %d\n", rc);
+
+ /* Make posted receives complete */
+ kiblnd_abort_receives(conn);
+
+ /* correct # of posted buffers
+ * NB locking needed now I'm racing with completion */
+ spin_lock_irqsave(&sched->ibs_lock, flags);
+ conn->ibc_nrx -= IBLND_RX_MSGS(version) - i;
+ spin_unlock_irqrestore(&sched->ibs_lock, flags);
+
+ /* cmid will be destroyed by CM(ofed) after cm_callback
+ * returned, so we can't refer it anymore
+ * (by kiblnd_connd()->kiblnd_destroy_conn) */
+ rdma_destroy_qp(conn->ibc_cmid);
+ conn->ibc_cmid = NULL;
+
+ /* Drop my own and unused rxbuffer refcounts */
+ while (i++ <= IBLND_RX_MSGS(version))
+ kiblnd_conn_decref(conn);
+
+ return NULL;
+ }
+ }
+
+ /* Init successful! */
+ LASSERT(state == IBLND_CONN_ACTIVE_CONNECT ||
+ state == IBLND_CONN_PASSIVE_WAIT);
+ conn->ibc_state = state;
+
+ /* 1 more conn */
+ atomic_inc(&net->ibn_nconns);
+ return conn;
+
+ failed_2:
+ kiblnd_destroy_conn(conn);
+ failed_1:
+ LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr));
+ failed_0:
+ return NULL;
+}
+
+void kiblnd_destroy_conn(kib_conn_t *conn)
+{
+ struct rdma_cm_id *cmid = conn->ibc_cmid;
+ kib_peer_t *peer = conn->ibc_peer;
+ int rc;
+
+ LASSERT(!in_interrupt());
+ LASSERT(atomic_read(&conn->ibc_refcount) == 0);
+ LASSERT(list_empty(&conn->ibc_early_rxs));
+ LASSERT(list_empty(&conn->ibc_tx_noops));
+ LASSERT(list_empty(&conn->ibc_tx_queue));
+ LASSERT(list_empty(&conn->ibc_tx_queue_rsrvd));
+ LASSERT(list_empty(&conn->ibc_tx_queue_nocred));
+ LASSERT(list_empty(&conn->ibc_active_txs));
+ LASSERT(conn->ibc_noops_posted == 0);
+ LASSERT(conn->ibc_nsends_posted == 0);
+
+ switch (conn->ibc_state) {
+ default:
+ /* conn must be completely disengaged from the network */
+ LBUG();
+
+ case IBLND_CONN_DISCONNECTED:
+ /* connvars should have been freed already */
+ LASSERT(conn->ibc_connvars == NULL);
+ break;
+
+ case IBLND_CONN_INIT:
+ break;
+ }
+
+ /* conn->ibc_cmid might be destroyed by CM already */
+ if (cmid != NULL && cmid->qp != NULL)
+ rdma_destroy_qp(cmid);
+
+ if (conn->ibc_cq != NULL) {
+ rc = ib_destroy_cq(conn->ibc_cq);
+ if (rc != 0)
+ CWARN("Error destroying CQ: %d\n", rc);
+ }
+
+ if (conn->ibc_rx_pages != NULL)
+ kiblnd_unmap_rx_descs(conn);
+
+ if (conn->ibc_rxs != NULL) {
+ LIBCFS_FREE(conn->ibc_rxs,
+ IBLND_RX_MSGS(conn->ibc_version)
+ * sizeof(kib_rx_t));
+ }
+
+ if (conn->ibc_connvars != NULL)
+ LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
+
+ if (conn->ibc_hdev != NULL)
+ kiblnd_hdev_decref(conn->ibc_hdev);
+
+ /* See CAVEAT EMPTOR above in kiblnd_create_conn */
+ if (conn->ibc_state != IBLND_CONN_INIT) {
+ kib_net_t *net = peer->ibp_ni->ni_data;
+
+ kiblnd_peer_decref(peer);
+ rdma_destroy_id(cmid);
+ atomic_dec(&net->ibn_nconns);
+ }
+
+ LIBCFS_FREE(conn, sizeof(*conn));
+}
+
+int kiblnd_close_peer_conns_locked(kib_peer_t *peer, int why)
+{
+ kib_conn_t *conn;
+ struct list_head *ctmp;
+ struct list_head *cnxt;
+ int count = 0;
+
+ list_for_each_safe(ctmp, cnxt, &peer->ibp_conns) {
+ conn = list_entry(ctmp, kib_conn_t, ibc_list);
+
+ CDEBUG(D_NET, "Closing conn -> %s, version: %x, reason: %d\n",
+ libcfs_nid2str(peer->ibp_nid),
+ conn->ibc_version, why);
+
+ kiblnd_close_conn_locked(conn, why);
+ count++;
+ }
+
+ return count;
+}
+
+int kiblnd_close_stale_conns_locked(kib_peer_t *peer,
+ int version, __u64 incarnation)
+{
+ kib_conn_t *conn;
+ struct list_head *ctmp;
+ struct list_head *cnxt;
+ int count = 0;
+
+ list_for_each_safe(ctmp, cnxt, &peer->ibp_conns) {
+ conn = list_entry(ctmp, kib_conn_t, ibc_list);
+
+ if (conn->ibc_version == version &&
+ conn->ibc_incarnation == incarnation)
+ continue;
+
+ CDEBUG(D_NET,
+ "Closing stale conn -> %s version: %x, incarnation:%#llx(%x, %#llx)\n",
+ libcfs_nid2str(peer->ibp_nid),
+ conn->ibc_version, conn->ibc_incarnation,
+ version, incarnation);
+
+ kiblnd_close_conn_locked(conn, -ESTALE);
+ count++;
+ }
+
+ return count;
+}
+
+static int kiblnd_close_matching_conns(lnet_ni_t *ni, lnet_nid_t nid)
+{
+ kib_peer_t *peer;
+ struct list_head *ptmp;
+ struct list_head *pnxt;
+ int lo;
+ int hi;
+ int i;
+ unsigned long flags;
+ int count = 0;
+
+ write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+ if (nid != LNET_NID_ANY)
+ lo = hi = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers;
+ else {
+ lo = 0;
+ hi = kiblnd_data.kib_peer_hash_size - 1;
+ }
+
+ for (i = lo; i <= hi; i++) {
+ list_for_each_safe(ptmp, pnxt, &kiblnd_data.kib_peers[i]) {
+
+ peer = list_entry(ptmp, kib_peer_t, ibp_list);
+ LASSERT(peer->ibp_connecting > 0 ||
+ peer->ibp_accepting > 0 ||
+ !list_empty(&peer->ibp_conns));
+
+ if (peer->ibp_ni != ni)
+ continue;
+
+ if (!(nid == LNET_NID_ANY || nid == peer->ibp_nid))
+ continue;
+
+ count += kiblnd_close_peer_conns_locked(peer, 0);
+ }
+ }
+
+ write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+ /* wildcards always succeed */
+ if (nid == LNET_NID_ANY)
+ return 0;
+
+ return (count == 0) ? -ENOENT : 0;
+}
+
+int kiblnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
+{
+ struct libcfs_ioctl_data *data = arg;
+ int rc = -EINVAL;
+
+ switch (cmd) {
+ case IOC_LIBCFS_GET_PEER: {
+ lnet_nid_t nid = 0;
+ int count = 0;
+
+ rc = kiblnd_get_peer_info(ni, data->ioc_count,
+ &nid, &count);
+ data->ioc_nid = nid;
+ data->ioc_count = count;
+ break;
+ }
+
+ case IOC_LIBCFS_DEL_PEER: {
+ rc = kiblnd_del_peer(ni, data->ioc_nid);
+ break;
+ }
+ case IOC_LIBCFS_GET_CONN: {
+ kib_conn_t *conn;
+
+ rc = 0;
+ conn = kiblnd_get_conn_by_idx(ni, data->ioc_count);
+ if (conn == NULL) {
+ rc = -ENOENT;
+ break;
+ }
+
+ LASSERT(conn->ibc_cmid != NULL);
+ data->ioc_nid = conn->ibc_peer->ibp_nid;
+ if (conn->ibc_cmid->route.path_rec == NULL)
+ data->ioc_u32[0] = 0; /* iWarp has no path MTU */
+ else
+ data->ioc_u32[0] =
+ ib_mtu_enum_to_int(conn->ibc_cmid->route.path_rec->mtu);
+ kiblnd_conn_decref(conn);
+ break;
+ }
+ case IOC_LIBCFS_CLOSE_CONNECTION: {
+ rc = kiblnd_close_matching_conns(ni, data->ioc_nid);
+ break;
+ }
+
+ default:
+ break;
+ }
+
+ return rc;
+}
+
+void kiblnd_query(lnet_ni_t *ni, lnet_nid_t nid, unsigned long *when)
+{
+ unsigned long last_alive = 0;
+ unsigned long now = cfs_time_current();
+ rwlock_t *glock = &kiblnd_data.kib_global_lock;
+ kib_peer_t *peer;
+ unsigned long flags;
+
+ read_lock_irqsave(glock, flags);
+
+ peer = kiblnd_find_peer_locked(nid);
+ if (peer != NULL) {
+ LASSERT(peer->ibp_connecting > 0 || /* creating conns */
+ peer->ibp_accepting > 0 ||
+ !list_empty(&peer->ibp_conns)); /* active conn */
+ last_alive = peer->ibp_last_alive;
+ }
+
+ read_unlock_irqrestore(glock, flags);
+
+ if (last_alive != 0)
+ *when = last_alive;
+
+ /* peer is not persistent in hash, trigger peer creation
+ * and connection establishment with a NULL tx */
+ if (peer == NULL)
+ kiblnd_launch_tx(ni, NULL, nid);
+
+ CDEBUG(D_NET, "Peer %s %p, alive %ld secs ago\n",
+ libcfs_nid2str(nid), peer,
+ last_alive ? cfs_duration_sec(now - last_alive) : -1);
+}
+
+void kiblnd_free_pages(kib_pages_t *p)
+{
+ int npages = p->ibp_npages;
+ int i;
+
+ for (i = 0; i < npages; i++) {
+ if (p->ibp_pages[i] != NULL)
+ __free_page(p->ibp_pages[i]);
+ }
+
+ LIBCFS_FREE(p, offsetof(kib_pages_t, ibp_pages[npages]));
+}
+
+int kiblnd_alloc_pages(kib_pages_t **pp, int cpt, int npages)
+{
+ kib_pages_t *p;
+ int i;
+
+ LIBCFS_CPT_ALLOC(p, lnet_cpt_table(), cpt,
+ offsetof(kib_pages_t, ibp_pages[npages]));
+ if (p == NULL) {
+ CERROR("Can't allocate descriptor for %d pages\n", npages);
+ return -ENOMEM;
+ }
+
+ memset(p, 0, offsetof(kib_pages_t, ibp_pages[npages]));
+ p->ibp_npages = npages;
+
+ for (i = 0; i < npages; i++) {
+ p->ibp_pages[i] = alloc_pages_node(
+ cfs_cpt_spread_node(lnet_cpt_table(), cpt),
+ GFP_NOFS, 0);
+ if (p->ibp_pages[i] == NULL) {
+ CERROR("Can't allocate page %d of %d\n", i, npages);
+ kiblnd_free_pages(p);
+ return -ENOMEM;
+ }
+ }
+
+ *pp = p;
+ return 0;
+}
+
+void kiblnd_unmap_rx_descs(kib_conn_t *conn)
+{
+ kib_rx_t *rx;
+ int i;
+
+ LASSERT(conn->ibc_rxs != NULL);
+ LASSERT(conn->ibc_hdev != NULL);
+
+ for (i = 0; i < IBLND_RX_MSGS(conn->ibc_version); i++) {
+ rx = &conn->ibc_rxs[i];
+
+ LASSERT(rx->rx_nob >= 0); /* not posted */
+
+ kiblnd_dma_unmap_single(conn->ibc_hdev->ibh_ibdev,
+ KIBLND_UNMAP_ADDR(rx, rx_msgunmap,
+ rx->rx_msgaddr),
+ IBLND_MSG_SIZE, DMA_FROM_DEVICE);
+ }
+
+ kiblnd_free_pages(conn->ibc_rx_pages);
+
+ conn->ibc_rx_pages = NULL;
+}
+
+void kiblnd_map_rx_descs(kib_conn_t *conn)
+{
+ kib_rx_t *rx;
+ struct page *pg;
+ int pg_off;
+ int ipg;
+ int i;
+
+ for (pg_off = ipg = i = 0;
+ i < IBLND_RX_MSGS(conn->ibc_version); i++) {
+ pg = conn->ibc_rx_pages->ibp_pages[ipg];
+ rx = &conn->ibc_rxs[i];
+
+ rx->rx_conn = conn;
+ rx->rx_msg = (kib_msg_t *)(((char *)page_address(pg)) + pg_off);
+
+ rx->rx_msgaddr = kiblnd_dma_map_single(conn->ibc_hdev->ibh_ibdev,
+ rx->rx_msg,
+ IBLND_MSG_SIZE,
+ DMA_FROM_DEVICE);
+ LASSERT(!kiblnd_dma_mapping_error(conn->ibc_hdev->ibh_ibdev,
+ rx->rx_msgaddr));
+ KIBLND_UNMAP_ADDR_SET(rx, rx_msgunmap, rx->rx_msgaddr);
+
+ CDEBUG(D_NET, "rx %d: %p %#llx(%#llx)\n",
+ i, rx->rx_msg, rx->rx_msgaddr,
+ lnet_page2phys(pg) + pg_off);
+
+ pg_off += IBLND_MSG_SIZE;
+ LASSERT(pg_off <= PAGE_SIZE);
+
+ if (pg_off == PAGE_SIZE) {
+ pg_off = 0;
+ ipg++;
+ LASSERT(ipg <= IBLND_RX_MSG_PAGES(conn->ibc_version));
+ }
+ }
+}
+
+static void kiblnd_unmap_tx_pool(kib_tx_pool_t *tpo)
+{
+ kib_hca_dev_t *hdev = tpo->tpo_hdev;
+ kib_tx_t *tx;
+ int i;
+
+ LASSERT(tpo->tpo_pool.po_allocated == 0);
+
+ if (hdev == NULL)
+ return;
+
+ for (i = 0; i < tpo->tpo_pool.po_size; i++) {
+ tx = &tpo->tpo_tx_descs[i];
+ kiblnd_dma_unmap_single(hdev->ibh_ibdev,
+ KIBLND_UNMAP_ADDR(tx, tx_msgunmap,
+ tx->tx_msgaddr),
+ IBLND_MSG_SIZE, DMA_TO_DEVICE);
+ }
+
+ kiblnd_hdev_decref(hdev);
+ tpo->tpo_hdev = NULL;
+}
+
+static kib_hca_dev_t *kiblnd_current_hdev(kib_dev_t *dev)
+{
+ kib_hca_dev_t *hdev;
+ unsigned long flags;
+ int i = 0;
+
+ read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+ while (dev->ibd_failover) {
+ read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+ if (i++ % 50 == 0)
+ CDEBUG(D_NET, "%s: Wait for failover\n",
+ dev->ibd_ifname);
+ schedule_timeout(cfs_time_seconds(1) / 100);
+
+ read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+ }
+
+ kiblnd_hdev_addref_locked(dev->ibd_hdev);
+ hdev = dev->ibd_hdev;
+
+ read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+ return hdev;
+}
+
+static void kiblnd_map_tx_pool(kib_tx_pool_t *tpo)
+{
+ kib_pages_t *txpgs = tpo->tpo_tx_pages;
+ kib_pool_t *pool = &tpo->tpo_pool;
+ kib_net_t *net = pool->po_owner->ps_net;
+ kib_dev_t *dev;
+ struct page *page;
+ kib_tx_t *tx;
+ int page_offset;
+ int ipage;
+ int i;
+
+ LASSERT(net != NULL);
+
+ dev = net->ibn_dev;
+
+ /* pre-mapped messages are not bigger than 1 page */
+ CLASSERT(IBLND_MSG_SIZE <= PAGE_SIZE);
+
+ /* No fancy arithmetic when we do the buffer calculations */
+ CLASSERT(PAGE_SIZE % IBLND_MSG_SIZE == 0);
+
+ tpo->tpo_hdev = kiblnd_current_hdev(dev);
+
+ for (ipage = page_offset = i = 0; i < pool->po_size; i++) {
+ page = txpgs->ibp_pages[ipage];
+ tx = &tpo->tpo_tx_descs[i];
+
+ tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) +
+ page_offset);
+
+ tx->tx_msgaddr = kiblnd_dma_map_single(
+ tpo->tpo_hdev->ibh_ibdev, tx->tx_msg,
+ IBLND_MSG_SIZE, DMA_TO_DEVICE);
+ LASSERT(!kiblnd_dma_mapping_error(tpo->tpo_hdev->ibh_ibdev,
+ tx->tx_msgaddr));
+ KIBLND_UNMAP_ADDR_SET(tx, tx_msgunmap, tx->tx_msgaddr);
+
+ list_add(&tx->tx_list, &pool->po_free_list);
+
+ page_offset += IBLND_MSG_SIZE;
+ LASSERT(page_offset <= PAGE_SIZE);
+
+ if (page_offset == PAGE_SIZE) {
+ page_offset = 0;
+ ipage++;
+ LASSERT(ipage <= txpgs->ibp_npages);
+ }
+ }
+}
+
+struct ib_mr *kiblnd_find_dma_mr(kib_hca_dev_t *hdev, __u64 addr, __u64 size)
+{
+ __u64 index;
+
+ LASSERT(hdev->ibh_mrs[0] != NULL);
+
+ if (hdev->ibh_nmrs == 1)
+ return hdev->ibh_mrs[0];
+
+ index = addr >> hdev->ibh_mr_shift;
+
+ if (index < hdev->ibh_nmrs &&
+ index == ((addr + size - 1) >> hdev->ibh_mr_shift))
+ return hdev->ibh_mrs[index];
+
+ return NULL;
+}
+
+struct ib_mr *kiblnd_find_rd_dma_mr(kib_hca_dev_t *hdev, kib_rdma_desc_t *rd)
+{
+ struct ib_mr *prev_mr;
+ struct ib_mr *mr;
+ int i;
+
+ LASSERT(hdev->ibh_mrs[0] != NULL);
+
+ if (*kiblnd_tunables.kib_map_on_demand > 0 &&
+ *kiblnd_tunables.kib_map_on_demand <= rd->rd_nfrags)
+ return NULL;
+
+ if (hdev->ibh_nmrs == 1)
+ return hdev->ibh_mrs[0];
+
+ for (i = 0, mr = prev_mr = NULL;
+ i < rd->rd_nfrags; i++) {
+ mr = kiblnd_find_dma_mr(hdev,
+ rd->rd_frags[i].rf_addr,
+ rd->rd_frags[i].rf_nob);
+ if (prev_mr == NULL)
+ prev_mr = mr;
+
+ if (mr == NULL || prev_mr != mr) {
+ /* Can't covered by one single MR */
+ mr = NULL;
+ break;
+ }
+ }
+
+ return mr;
+}
+
+static void kiblnd_destroy_fmr_pool(kib_fmr_pool_t *pool)
+{
+ LASSERT(pool->fpo_map_count == 0);
+
+ if (pool->fpo_fmr_pool != NULL)
+ ib_destroy_fmr_pool(pool->fpo_fmr_pool);
+
+ if (pool->fpo_hdev != NULL)
+ kiblnd_hdev_decref(pool->fpo_hdev);
+
+ LIBCFS_FREE(pool, sizeof(kib_fmr_pool_t));
+}
+
+static void kiblnd_destroy_fmr_pool_list(struct list_head *head)
+{
+ kib_fmr_pool_t *pool;
+
+ while (!list_empty(head)) {
+ pool = list_entry(head->next, kib_fmr_pool_t, fpo_list);
+ list_del(&pool->fpo_list);
+ kiblnd_destroy_fmr_pool(pool);
+ }
+}
+
+static int kiblnd_fmr_pool_size(int ncpts)
+{
+ int size = *kiblnd_tunables.kib_fmr_pool_size / ncpts;
+
+ return max(IBLND_FMR_POOL, size);
+}
+
+static int kiblnd_fmr_flush_trigger(int ncpts)
+{
+ int size = *kiblnd_tunables.kib_fmr_flush_trigger / ncpts;
+
+ return max(IBLND_FMR_POOL_FLUSH, size);
+}
+
+static int kiblnd_create_fmr_pool(kib_fmr_poolset_t *fps,
+ kib_fmr_pool_t **pp_fpo)
+{
+ /* FMR pool for RDMA */
+ kib_dev_t *dev = fps->fps_net->ibn_dev;
+ kib_fmr_pool_t *fpo;
+ struct ib_fmr_pool_param param = {
+ .max_pages_per_fmr = LNET_MAX_PAYLOAD/PAGE_SIZE,
+ .page_shift = PAGE_SHIFT,
+ .access = (IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_WRITE),
+ .pool_size = fps->fps_pool_size,
+ .dirty_watermark = fps->fps_flush_trigger,
+ .flush_function = NULL,
+ .flush_arg = NULL,
+ .cache = !!*kiblnd_tunables.kib_fmr_cache};
+ int rc;
+
+ LIBCFS_CPT_ALLOC(fpo, lnet_cpt_table(), fps->fps_cpt, sizeof(*fpo));
+ if (fpo == NULL)
+ return -ENOMEM;
+
+ fpo->fpo_hdev = kiblnd_current_hdev(dev);
+
+ fpo->fpo_fmr_pool = ib_create_fmr_pool(fpo->fpo_hdev->ibh_pd, &param);
+ if (IS_ERR(fpo->fpo_fmr_pool)) {
+ rc = PTR_ERR(fpo->fpo_fmr_pool);
+ CERROR("Failed to create FMR pool: %d\n", rc);
+
+ kiblnd_hdev_decref(fpo->fpo_hdev);
+ LIBCFS_FREE(fpo, sizeof(kib_fmr_pool_t));
+ return rc;
+ }
+
+ fpo->fpo_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
+ fpo->fpo_owner = fps;
+ *pp_fpo = fpo;
+
+ return 0;
+}
+
+static void kiblnd_fail_fmr_poolset(kib_fmr_poolset_t *fps,
+ struct list_head *zombies)
+{
+ if (fps->fps_net == NULL) /* intialized? */
+ return;
+
+ spin_lock(&fps->fps_lock);
+
+ while (!list_empty(&fps->fps_pool_list)) {
+ kib_fmr_pool_t *fpo = list_entry(fps->fps_pool_list.next,
+ kib_fmr_pool_t, fpo_list);
+ fpo->fpo_failed = 1;
+ list_del(&fpo->fpo_list);
+ if (fpo->fpo_map_count == 0)
+ list_add(&fpo->fpo_list, zombies);
+ else
+ list_add(&fpo->fpo_list, &fps->fps_failed_pool_list);
+ }
+
+ spin_unlock(&fps->fps_lock);
+}
+
+static void kiblnd_fini_fmr_poolset(kib_fmr_poolset_t *fps)
+{
+ if (fps->fps_net != NULL) { /* initialized? */
+ kiblnd_destroy_fmr_pool_list(&fps->fps_failed_pool_list);
+ kiblnd_destroy_fmr_pool_list(&fps->fps_pool_list);
+ }
+}
+
+static int kiblnd_init_fmr_poolset(kib_fmr_poolset_t *fps, int cpt,
+ kib_net_t *net, int pool_size,
+ int flush_trigger)
+{
+ kib_fmr_pool_t *fpo;
+ int rc;
+
+ memset(fps, 0, sizeof(kib_fmr_poolset_t));
+
+ fps->fps_net = net;
+ fps->fps_cpt = cpt;
+ fps->fps_pool_size = pool_size;
+ fps->fps_flush_trigger = flush_trigger;
+ spin_lock_init(&fps->fps_lock);
+ INIT_LIST_HEAD(&fps->fps_pool_list);
+ INIT_LIST_HEAD(&fps->fps_failed_pool_list);
+
+ rc = kiblnd_create_fmr_pool(fps, &fpo);
+ if (rc == 0)
+ list_add_tail(&fpo->fpo_list, &fps->fps_pool_list);
+
+ return rc;
+}
+
+static int kiblnd_fmr_pool_is_idle(kib_fmr_pool_t *fpo, unsigned long now)
+{
+ if (fpo->fpo_map_count != 0) /* still in use */
+ return 0;
+ if (fpo->fpo_failed)
+ return 1;
+ return cfs_time_aftereq(now, fpo->fpo_deadline);
+}
+
+void kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status)
+{
+ LIST_HEAD(zombies);
+ kib_fmr_pool_t *fpo = fmr->fmr_pool;
+ kib_fmr_poolset_t *fps = fpo->fpo_owner;
+ unsigned long now = cfs_time_current();
+ kib_fmr_pool_t *tmp;
+ int rc;
+
+ rc = ib_fmr_pool_unmap(fmr->fmr_pfmr);
+ LASSERT(rc == 0);
+
+ if (status != 0) {
+ rc = ib_flush_fmr_pool(fpo->fpo_fmr_pool);
+ LASSERT(rc == 0);
+ }
+
+ fmr->fmr_pool = NULL;
+ fmr->fmr_pfmr = NULL;
+
+ spin_lock(&fps->fps_lock);
+ fpo->fpo_map_count--; /* decref the pool */
+
+ list_for_each_entry_safe(fpo, tmp, &fps->fps_pool_list, fpo_list) {
+ /* the first pool is persistent */
+ if (fps->fps_pool_list.next == &fpo->fpo_list)
+ continue;
+
+ if (kiblnd_fmr_pool_is_idle(fpo, now)) {
+ list_move(&fpo->fpo_list, &zombies);
+ fps->fps_version++;
+ }
+ }
+ spin_unlock(&fps->fps_lock);
+
+ if (!list_empty(&zombies))
+ kiblnd_destroy_fmr_pool_list(&zombies);
+}
+
+int kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, __u64 *pages, int npages,
+ __u64 iov, kib_fmr_t *fmr)
+{
+ struct ib_pool_fmr *pfmr;
+ kib_fmr_pool_t *fpo;
+ __u64 version;
+ int rc;
+
+ again:
+ spin_lock(&fps->fps_lock);
+ version = fps->fps_version;
+ list_for_each_entry(fpo, &fps->fps_pool_list, fpo_list) {
+ fpo->fpo_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
+ fpo->fpo_map_count++;
+ spin_unlock(&fps->fps_lock);
+
+ pfmr = ib_fmr_pool_map_phys(fpo->fpo_fmr_pool,
+ pages, npages, iov);
+ if (likely(!IS_ERR(pfmr))) {
+ fmr->fmr_pool = fpo;
+ fmr->fmr_pfmr = pfmr;
+ return 0;
+ }
+
+ spin_lock(&fps->fps_lock);
+ fpo->fpo_map_count--;
+ if (PTR_ERR(pfmr) != -EAGAIN) {
+ spin_unlock(&fps->fps_lock);
+ return PTR_ERR(pfmr);
+ }
+
+ /* EAGAIN and ... */
+ if (version != fps->fps_version) {
+ spin_unlock(&fps->fps_lock);
+ goto again;
+ }
+ }
+
+ if (fps->fps_increasing) {
+ spin_unlock(&fps->fps_lock);
+ CDEBUG(D_NET,
+ "Another thread is allocating new FMR pool, waiting for her to complete\n");
+ schedule();
+ goto again;
+
+ }
+
+ if (time_before(cfs_time_current(), fps->fps_next_retry)) {
+ /* someone failed recently */
+ spin_unlock(&fps->fps_lock);
+ return -EAGAIN;
+ }
+
+ fps->fps_increasing = 1;
+ spin_unlock(&fps->fps_lock);
+
+ CDEBUG(D_NET, "Allocate new FMR pool\n");
+ rc = kiblnd_create_fmr_pool(fps, &fpo);
+ spin_lock(&fps->fps_lock);
+ fps->fps_increasing = 0;
+ if (rc == 0) {
+ fps->fps_version++;
+ list_add_tail(&fpo->fpo_list, &fps->fps_pool_list);
+ } else {
+ fps->fps_next_retry = cfs_time_shift(IBLND_POOL_RETRY);
+ }
+ spin_unlock(&fps->fps_lock);
+
+ goto again;
+}
+
+static void kiblnd_fini_pool(kib_pool_t *pool)
+{
+ LASSERT(list_empty(&pool->po_free_list));
+ LASSERT(pool->po_allocated == 0);
+
+ CDEBUG(D_NET, "Finalize %s pool\n", pool->po_owner->ps_name);
+}
+
+static void kiblnd_init_pool(kib_poolset_t *ps, kib_pool_t *pool, int size)
+{
+ CDEBUG(D_NET, "Initialize %s pool\n", ps->ps_name);
+
+ memset(pool, 0, sizeof(kib_pool_t));
+ INIT_LIST_HEAD(&pool->po_free_list);
+ pool->po_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
+ pool->po_owner = ps;
+ pool->po_size = size;
+}
+
+static void kiblnd_destroy_pool_list(struct list_head *head)
+{
+ kib_pool_t *pool;
+
+ while (!list_empty(head)) {
+ pool = list_entry(head->next, kib_pool_t, po_list);
+ list_del(&pool->po_list);
+
+ LASSERT(pool->po_owner != NULL);
+ pool->po_owner->ps_pool_destroy(pool);
+ }
+}
+
+static void kiblnd_fail_poolset(kib_poolset_t *ps, struct list_head *zombies)
+{
+ if (ps->ps_net == NULL) /* intialized? */
+ return;
+
+ spin_lock(&ps->ps_lock);
+ while (!list_empty(&ps->ps_pool_list)) {
+ kib_pool_t *po = list_entry(ps->ps_pool_list.next,
+ kib_pool_t, po_list);
+ po->po_failed = 1;
+ list_del(&po->po_list);
+ if (po->po_allocated == 0)
+ list_add(&po->po_list, zombies);
+ else
+ list_add(&po->po_list, &ps->ps_failed_pool_list);
+ }
+ spin_unlock(&ps->ps_lock);
+}
+
+static void kiblnd_fini_poolset(kib_poolset_t *ps)
+{
+ if (ps->ps_net != NULL) { /* initialized? */
+ kiblnd_destroy_pool_list(&ps->ps_failed_pool_list);
+ kiblnd_destroy_pool_list(&ps->ps_pool_list);
+ }
+}
+
+static int kiblnd_init_poolset(kib_poolset_t *ps, int cpt,
+ kib_net_t *net, char *name, int size,
+ kib_ps_pool_create_t po_create,
+ kib_ps_pool_destroy_t po_destroy,
+ kib_ps_node_init_t nd_init,
+ kib_ps_node_fini_t nd_fini)
+{
+ kib_pool_t *pool;
+ int rc;
+
+ memset(ps, 0, sizeof(kib_poolset_t));
+
+ ps->ps_cpt = cpt;
+ ps->ps_net = net;
+ ps->ps_pool_create = po_create;
+ ps->ps_pool_destroy = po_destroy;
+ ps->ps_node_init = nd_init;
+ ps->ps_node_fini = nd_fini;
+ ps->ps_pool_size = size;
+ if (strlcpy(ps->ps_name, name, sizeof(ps->ps_name))
+ >= sizeof(ps->ps_name))
+ return -E2BIG;
+ spin_lock_init(&ps->ps_lock);
+ INIT_LIST_HEAD(&ps->ps_pool_list);
+ INIT_LIST_HEAD(&ps->ps_failed_pool_list);
+
+ rc = ps->ps_pool_create(ps, size, &pool);
+ if (rc == 0)
+ list_add(&pool->po_list, &ps->ps_pool_list);
+ else
+ CERROR("Failed to create the first pool for %s\n", ps->ps_name);
+
+ return rc;
+}
+
+static int kiblnd_pool_is_idle(kib_pool_t *pool, unsigned long now)
+{
+ if (pool->po_allocated != 0) /* still in use */
+ return 0;
+ if (pool->po_failed)
+ return 1;
+ return cfs_time_aftereq(now, pool->po_deadline);
+}
+
+void kiblnd_pool_free_node(kib_pool_t *pool, struct list_head *node)
+{
+ LIST_HEAD(zombies);
+ kib_poolset_t *ps = pool->po_owner;
+ kib_pool_t *tmp;
+ unsigned long now = cfs_time_current();
+
+ spin_lock(&ps->ps_lock);
+
+ if (ps->ps_node_fini != NULL)
+ ps->ps_node_fini(pool, node);
+
+ LASSERT(pool->po_allocated > 0);
+ list_add(node, &pool->po_free_list);
+ pool->po_allocated--;
+
+ list_for_each_entry_safe(pool, tmp, &ps->ps_pool_list, po_list) {
+ /* the first pool is persistent */
+ if (ps->ps_pool_list.next == &pool->po_list)
+ continue;
+
+ if (kiblnd_pool_is_idle(pool, now))
+ list_move(&pool->po_list, &zombies);
+ }
+ spin_unlock(&ps->ps_lock);
+
+ if (!list_empty(&zombies))
+ kiblnd_destroy_pool_list(&zombies);
+}
+
+struct list_head *kiblnd_pool_alloc_node(kib_poolset_t *ps)
+{
+ struct list_head *node;
+ kib_pool_t *pool;
+ int rc;
+
+ again:
+ spin_lock(&ps->ps_lock);
+ list_for_each_entry(pool, &ps->ps_pool_list, po_list) {
+ if (list_empty(&pool->po_free_list))
+ continue;
+
+ pool->po_allocated++;
+ pool->po_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
+ node = pool->po_free_list.next;
+ list_del(node);
+
+ if (ps->ps_node_init != NULL) {
+ /* still hold the lock */
+ ps->ps_node_init(pool, node);
+ }
+ spin_unlock(&ps->ps_lock);
+ return node;
+ }
+
+ /* no available tx pool and ... */
+ if (ps->ps_increasing) {
+ /* another thread is allocating a new pool */
+ spin_unlock(&ps->ps_lock);
+ CDEBUG(D_NET, "Another thread is allocating new %s pool, waiting for her to complete\n",
+ ps->ps_name);
+ schedule();
+ goto again;
+ }
+
+ if (time_before(cfs_time_current(), ps->ps_next_retry)) {
+ /* someone failed recently */
+ spin_unlock(&ps->ps_lock);
+ return NULL;
+ }
+
+ ps->ps_increasing = 1;
+ spin_unlock(&ps->ps_lock);
+
+ CDEBUG(D_NET, "%s pool exhausted, allocate new pool\n", ps->ps_name);
+
+ rc = ps->ps_pool_create(ps, ps->ps_pool_size, &pool);
+
+ spin_lock(&ps->ps_lock);
+ ps->ps_increasing = 0;
+ if (rc == 0) {
+ list_add_tail(&pool->po_list, &ps->ps_pool_list);
+ } else {
+ ps->ps_next_retry = cfs_time_shift(IBLND_POOL_RETRY);
+ CERROR("Can't allocate new %s pool because out of memory\n",
+ ps->ps_name);
+ }
+ spin_unlock(&ps->ps_lock);
+
+ goto again;
+}
+
+void kiblnd_pmr_pool_unmap(kib_phys_mr_t *pmr)
+{
+ kib_pmr_pool_t *ppo = pmr->pmr_pool;
+ struct ib_mr *mr = pmr->pmr_mr;
+
+ pmr->pmr_mr = NULL;
+ kiblnd_pool_free_node(&ppo->ppo_pool, &pmr->pmr_list);
+ if (mr != NULL)
+ ib_dereg_mr(mr);
+}
+
+int kiblnd_pmr_pool_map(kib_pmr_poolset_t *pps, kib_hca_dev_t *hdev,
+ kib_rdma_desc_t *rd, __u64 *iova, kib_phys_mr_t **pp_pmr)
+{
+ kib_phys_mr_t *pmr;
+ struct list_head *node;
+ int rc;
+ int i;
+
+ node = kiblnd_pool_alloc_node(&pps->pps_poolset);
+ if (node == NULL) {
+ CERROR("Failed to allocate PMR descriptor\n");
+ return -ENOMEM;
+ }
+
+ pmr = container_of(node, kib_phys_mr_t, pmr_list);
+ if (pmr->pmr_pool->ppo_hdev != hdev) {
+ kiblnd_pool_free_node(&pmr->pmr_pool->ppo_pool, node);
+ return -EAGAIN;
+ }
+
+ for (i = 0; i < rd->rd_nfrags; i++) {
+ pmr->pmr_ipb[i].addr = rd->rd_frags[i].rf_addr;
+ pmr->pmr_ipb[i].size = rd->rd_frags[i].rf_nob;
+ }
+
+ pmr->pmr_mr = ib_reg_phys_mr(hdev->ibh_pd,
+ pmr->pmr_ipb, rd->rd_nfrags,
+ IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_WRITE,
+ iova);
+ if (!IS_ERR(pmr->pmr_mr)) {
+ pmr->pmr_iova = *iova;
+ *pp_pmr = pmr;
+ return 0;
+ }
+
+ rc = PTR_ERR(pmr->pmr_mr);
+ CERROR("Failed ib_reg_phys_mr: %d\n", rc);
+
+ pmr->pmr_mr = NULL;
+ kiblnd_pool_free_node(&pmr->pmr_pool->ppo_pool, node);
+
+ return rc;
+}
+
+static void kiblnd_destroy_pmr_pool(kib_pool_t *pool)
+{
+ kib_pmr_pool_t *ppo = container_of(pool, kib_pmr_pool_t, ppo_pool);
+ kib_phys_mr_t *pmr;
+ kib_phys_mr_t *tmp;
+
+ LASSERT(pool->po_allocated == 0);
+
+ list_for_each_entry_safe(pmr, tmp, &pool->po_free_list, pmr_list) {
+ LASSERT(pmr->pmr_mr == NULL);
+ list_del(&pmr->pmr_list);
+
+ if (pmr->pmr_ipb != NULL) {
+ LIBCFS_FREE(pmr->pmr_ipb,
+ IBLND_MAX_RDMA_FRAGS *
+ sizeof(struct ib_phys_buf));
+ }
+
+ LIBCFS_FREE(pmr, sizeof(kib_phys_mr_t));
+ }
+
+ kiblnd_fini_pool(pool);
+ if (ppo->ppo_hdev != NULL)
+ kiblnd_hdev_decref(ppo->ppo_hdev);
+
+ LIBCFS_FREE(ppo, sizeof(kib_pmr_pool_t));
+}
+
+static inline int kiblnd_pmr_pool_size(int ncpts)
+{
+ int size = *kiblnd_tunables.kib_pmr_pool_size / ncpts;
+
+ return max(IBLND_PMR_POOL, size);
+}
+
+static int kiblnd_create_pmr_pool(kib_poolset_t *ps, int size,
+ kib_pool_t **pp_po)
+{
+ struct kib_pmr_pool *ppo;
+ struct kib_pool *pool;
+ kib_phys_mr_t *pmr;
+ int i;
+
+ LIBCFS_CPT_ALLOC(ppo, lnet_cpt_table(),
+ ps->ps_cpt, sizeof(kib_pmr_pool_t));
+ if (ppo == NULL) {
+ CERROR("Failed to allocate PMR pool\n");
+ return -ENOMEM;
+ }
+
+ pool = &ppo->ppo_pool;
+ kiblnd_init_pool(ps, pool, size);
+
+ for (i = 0; i < size; i++) {
+ LIBCFS_CPT_ALLOC(pmr, lnet_cpt_table(),
+ ps->ps_cpt, sizeof(kib_phys_mr_t));
+ if (pmr == NULL)
+ break;
+
+ pmr->pmr_pool = ppo;
+ LIBCFS_CPT_ALLOC(pmr->pmr_ipb, lnet_cpt_table(), ps->ps_cpt,
+ IBLND_MAX_RDMA_FRAGS * sizeof(*pmr->pmr_ipb));
+ if (pmr->pmr_ipb == NULL)
+ break;
+
+ list_add(&pmr->pmr_list, &pool->po_free_list);
+ }
+
+ if (i < size) {
+ ps->ps_pool_destroy(pool);
+ return -ENOMEM;
+ }
+
+ ppo->ppo_hdev = kiblnd_current_hdev(ps->ps_net->ibn_dev);
+ *pp_po = pool;
+ return 0;
+}
+
+static void kiblnd_destroy_tx_pool(kib_pool_t *pool)
+{
+ kib_tx_pool_t *tpo = container_of(pool, kib_tx_pool_t, tpo_pool);
+ int i;
+
+ LASSERT(pool->po_allocated == 0);
+
+ if (tpo->tpo_tx_pages != NULL) {
+ kiblnd_unmap_tx_pool(tpo);
+ kiblnd_free_pages(tpo->tpo_tx_pages);
+ }
+
+ if (tpo->tpo_tx_descs == NULL)
+ goto out;
+
+ for (i = 0; i < pool->po_size; i++) {
+ kib_tx_t *tx = &tpo->tpo_tx_descs[i];
+
+ list_del(&tx->tx_list);
+ if (tx->tx_pages != NULL)
+ LIBCFS_FREE(tx->tx_pages,
+ LNET_MAX_IOV *
+ sizeof(*tx->tx_pages));
+ if (tx->tx_frags != NULL)
+ LIBCFS_FREE(tx->tx_frags,
+ IBLND_MAX_RDMA_FRAGS *
+ sizeof(*tx->tx_frags));
+ if (tx->tx_wrq != NULL)
+ LIBCFS_FREE(tx->tx_wrq,
+ (1 + IBLND_MAX_RDMA_FRAGS) *
+ sizeof(*tx->tx_wrq));
+ if (tx->tx_sge != NULL)
+ LIBCFS_FREE(tx->tx_sge,
+ (1 + IBLND_MAX_RDMA_FRAGS) *
+ sizeof(*tx->tx_sge));
+ if (tx->tx_rd != NULL)
+ LIBCFS_FREE(tx->tx_rd,
+ offsetof(kib_rdma_desc_t,
+ rd_frags[IBLND_MAX_RDMA_FRAGS]));
+ }
+
+ LIBCFS_FREE(tpo->tpo_tx_descs,
+ pool->po_size * sizeof(kib_tx_t));
+out:
+ kiblnd_fini_pool(pool);
+ LIBCFS_FREE(tpo, sizeof(kib_tx_pool_t));
+}
+
+static int kiblnd_tx_pool_size(int ncpts)
+{
+ int ntx = *kiblnd_tunables.kib_ntx / ncpts;
+
+ return max(IBLND_TX_POOL, ntx);
+}
+
+static int kiblnd_create_tx_pool(kib_poolset_t *ps, int size,
+ kib_pool_t **pp_po)
+{
+ int i;
+ int npg;
+ kib_pool_t *pool;
+ kib_tx_pool_t *tpo;
+
+ LIBCFS_CPT_ALLOC(tpo, lnet_cpt_table(), ps->ps_cpt, sizeof(*tpo));
+ if (tpo == NULL) {
+ CERROR("Failed to allocate TX pool\n");
+ return -ENOMEM;
+ }
+
+ pool = &tpo->tpo_pool;
+ kiblnd_init_pool(ps, pool, size);
+ tpo->tpo_tx_descs = NULL;
+ tpo->tpo_tx_pages = NULL;
+
+ npg = (size * IBLND_MSG_SIZE + PAGE_SIZE - 1) / PAGE_SIZE;
+ if (kiblnd_alloc_pages(&tpo->tpo_tx_pages, ps->ps_cpt, npg) != 0) {
+ CERROR("Can't allocate tx pages: %d\n", npg);
+ LIBCFS_FREE(tpo, sizeof(kib_tx_pool_t));
+ return -ENOMEM;
+ }
+
+ LIBCFS_CPT_ALLOC(tpo->tpo_tx_descs, lnet_cpt_table(), ps->ps_cpt,
+ size * sizeof(kib_tx_t));
+ if (tpo->tpo_tx_descs == NULL) {
+ CERROR("Can't allocate %d tx descriptors\n", size);
+ ps->ps_pool_destroy(pool);
+ return -ENOMEM;
+ }
+
+ memset(tpo->tpo_tx_descs, 0, size * sizeof(kib_tx_t));
+
+ for (i = 0; i < size; i++) {
+ kib_tx_t *tx = &tpo->tpo_tx_descs[i];
+
+ tx->tx_pool = tpo;
+ if (ps->ps_net->ibn_fmr_ps != NULL) {
+ LIBCFS_CPT_ALLOC(tx->tx_pages,
+ lnet_cpt_table(), ps->ps_cpt,
+ LNET_MAX_IOV * sizeof(*tx->tx_pages));
+ if (tx->tx_pages == NULL)
+ break;
+ }
+
+ LIBCFS_CPT_ALLOC(tx->tx_frags, lnet_cpt_table(), ps->ps_cpt,
+ IBLND_MAX_RDMA_FRAGS * sizeof(*tx->tx_frags));
+ if (tx->tx_frags == NULL)
+ break;
+
+ sg_init_table(tx->tx_frags, IBLND_MAX_RDMA_FRAGS);
+
+ LIBCFS_CPT_ALLOC(tx->tx_wrq, lnet_cpt_table(), ps->ps_cpt,
+ (1 + IBLND_MAX_RDMA_FRAGS) *
+ sizeof(*tx->tx_wrq));
+ if (tx->tx_wrq == NULL)
+ break;
+
+ LIBCFS_CPT_ALLOC(tx->tx_sge, lnet_cpt_table(), ps->ps_cpt,
+ (1 + IBLND_MAX_RDMA_FRAGS) *
+ sizeof(*tx->tx_sge));
+ if (tx->tx_sge == NULL)
+ break;
+
+ LIBCFS_CPT_ALLOC(tx->tx_rd, lnet_cpt_table(), ps->ps_cpt,
+ offsetof(kib_rdma_desc_t,
+ rd_frags[IBLND_MAX_RDMA_FRAGS]));
+ if (tx->tx_rd == NULL)
+ break;
+ }
+
+ if (i == size) {
+ kiblnd_map_tx_pool(tpo);
+ *pp_po = pool;
+ return 0;
+ }
+
+ ps->ps_pool_destroy(pool);
+ return -ENOMEM;
+}
+
+static void kiblnd_tx_init(kib_pool_t *pool, struct list_head *node)
+{
+ kib_tx_poolset_t *tps = container_of(pool->po_owner, kib_tx_poolset_t,
+ tps_poolset);
+ kib_tx_t *tx = list_entry(node, kib_tx_t, tx_list);
+
+ tx->tx_cookie = tps->tps_next_tx_cookie++;
+}
+
+static void kiblnd_net_fini_pools(kib_net_t *net)
+{
+ int i;
+
+ cfs_cpt_for_each(i, lnet_cpt_table()) {
+ kib_tx_poolset_t *tps;
+ kib_fmr_poolset_t *fps;
+ kib_pmr_poolset_t *pps;
+
+ if (net->ibn_tx_ps != NULL) {
+ tps = net->ibn_tx_ps[i];
+ kiblnd_fini_poolset(&tps->tps_poolset);
+ }
+
+ if (net->ibn_fmr_ps != NULL) {
+ fps = net->ibn_fmr_ps[i];
+ kiblnd_fini_fmr_poolset(fps);
+ }
+
+ if (net->ibn_pmr_ps != NULL) {
+ pps = net->ibn_pmr_ps[i];
+ kiblnd_fini_poolset(&pps->pps_poolset);
+ }
+ }
+
+ if (net->ibn_tx_ps != NULL) {
+ cfs_percpt_free(net->ibn_tx_ps);
+ net->ibn_tx_ps = NULL;
+ }
+
+ if (net->ibn_fmr_ps != NULL) {
+ cfs_percpt_free(net->ibn_fmr_ps);
+ net->ibn_fmr_ps = NULL;
+ }
+
+ if (net->ibn_pmr_ps != NULL) {
+ cfs_percpt_free(net->ibn_pmr_ps);
+ net->ibn_pmr_ps = NULL;
+ }
+}
+
+static int kiblnd_net_init_pools(kib_net_t *net, __u32 *cpts, int ncpts)
+{
+ unsigned long flags;
+ int cpt;
+ int rc;
+ int i;
+
+ read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+ if (*kiblnd_tunables.kib_map_on_demand == 0 &&
+ net->ibn_dev->ibd_hdev->ibh_nmrs == 1) {
+ read_unlock_irqrestore(&kiblnd_data.kib_global_lock,
+ flags);
+ goto create_tx_pool;
+ }
+
+ read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+ if (*kiblnd_tunables.kib_fmr_pool_size <
+ *kiblnd_tunables.kib_ntx / 4) {
+ CERROR("Can't set fmr pool size (%d) < ntx / 4(%d)\n",
+ *kiblnd_tunables.kib_fmr_pool_size,
+ *kiblnd_tunables.kib_ntx / 4);
+ rc = -EINVAL;
+ goto failed;
+ }
+
+ /* TX pool must be created later than FMR/PMR, see LU-2268
+ * for details */
+ LASSERT(net->ibn_tx_ps == NULL);
+
+ /* premapping can fail if ibd_nmr > 1, so we always create
+ * FMR/PMR pool and map-on-demand if premapping failed */
+
+ net->ibn_fmr_ps = cfs_percpt_alloc(lnet_cpt_table(),
+ sizeof(kib_fmr_poolset_t));
+ if (net->ibn_fmr_ps == NULL) {
+ CERROR("Failed to allocate FMR pool array\n");
+ rc = -ENOMEM;
+ goto failed;
+ }
+
+ for (i = 0; i < ncpts; i++) {
+ cpt = (cpts == NULL) ? i : cpts[i];
+ rc = kiblnd_init_fmr_poolset(net->ibn_fmr_ps[cpt], cpt, net,
+ kiblnd_fmr_pool_size(ncpts),
+ kiblnd_fmr_flush_trigger(ncpts));
+ if (rc == -ENOSYS && i == 0) /* no FMR */
+ break; /* create PMR pool */
+
+ if (rc != 0) { /* a real error */
+ CERROR("Can't initialize FMR pool for CPT %d: %d\n",
+ cpt, rc);
+ goto failed;
+ }
+ }
+
+ if (i > 0) {
+ LASSERT(i == ncpts);
+ goto create_tx_pool;
+ }
+
+ cfs_percpt_free(net->ibn_fmr_ps);
+ net->ibn_fmr_ps = NULL;
+
+ CWARN("Device does not support FMR, failing back to PMR\n");
+
+ if (*kiblnd_tunables.kib_pmr_pool_size <
+ *kiblnd_tunables.kib_ntx / 4) {
+ CERROR("Can't set pmr pool size (%d) < ntx / 4(%d)\n",
+ *kiblnd_tunables.kib_pmr_pool_size,
+ *kiblnd_tunables.kib_ntx / 4);
+ rc = -EINVAL;
+ goto failed;
+ }
+
+ net->ibn_pmr_ps = cfs_percpt_alloc(lnet_cpt_table(),
+ sizeof(kib_pmr_poolset_t));
+ if (net->ibn_pmr_ps == NULL) {
+ CERROR("Failed to allocate PMR pool array\n");
+ rc = -ENOMEM;
+ goto failed;
+ }
+
+ for (i = 0; i < ncpts; i++) {
+ cpt = (cpts == NULL) ? i : cpts[i];
+ rc = kiblnd_init_poolset(&net->ibn_pmr_ps[cpt]->pps_poolset,
+ cpt, net, "PMR",
+ kiblnd_pmr_pool_size(ncpts),
+ kiblnd_create_pmr_pool,
+ kiblnd_destroy_pmr_pool, NULL, NULL);
+ if (rc != 0) {
+ CERROR("Can't initialize PMR pool for CPT %d: %d\n",
+ cpt, rc);
+ goto failed;
+ }
+ }
+
+ create_tx_pool:
+ net->ibn_tx_ps = cfs_percpt_alloc(lnet_cpt_table(),
+ sizeof(kib_tx_poolset_t));
+ if (net->ibn_tx_ps == NULL) {
+ CERROR("Failed to allocate tx pool array\n");
+ rc = -ENOMEM;
+ goto failed;
+ }
+
+ for (i = 0; i < ncpts; i++) {
+ cpt = (cpts == NULL) ? i : cpts[i];
+ rc = kiblnd_init_poolset(&net->ibn_tx_ps[cpt]->tps_poolset,
+ cpt, net, "TX",
+ kiblnd_tx_pool_size(ncpts),
+ kiblnd_create_tx_pool,
+ kiblnd_destroy_tx_pool,
+ kiblnd_tx_init, NULL);
+ if (rc != 0) {
+ CERROR("Can't initialize TX pool for CPT %d: %d\n",
+ cpt, rc);
+ goto failed;
+ }
+ }
+
+ return 0;
+ failed:
+ kiblnd_net_fini_pools(net);
+ LASSERT(rc != 0);
+ return rc;
+}
+
+static int kiblnd_hdev_get_attr(kib_hca_dev_t *hdev)
+{
+ struct ib_device_attr *attr;
+ int rc;
+
+ /* It's safe to assume a HCA can handle a page size
+ * matching that of the native system */
+ hdev->ibh_page_shift = PAGE_SHIFT;
+ hdev->ibh_page_size = 1 << PAGE_SHIFT;
+ hdev->ibh_page_mask = ~((__u64)hdev->ibh_page_size - 1);
+
+ LIBCFS_ALLOC(attr, sizeof(*attr));
+ if (attr == NULL) {
+ CERROR("Out of memory\n");
+ return -ENOMEM;
+ }
+
+ rc = ib_query_device(hdev->ibh_ibdev, attr);
+ if (rc == 0)
+ hdev->ibh_mr_size = attr->max_mr_size;
+
+ LIBCFS_FREE(attr, sizeof(*attr));
+
+ if (rc != 0) {
+ CERROR("Failed to query IB device: %d\n", rc);
+ return rc;
+ }
+
+ if (hdev->ibh_mr_size == ~0ULL) {
+ hdev->ibh_mr_shift = 64;
+ return 0;
+ }
+
+ for (hdev->ibh_mr_shift = 0;
+ hdev->ibh_mr_shift < 64; hdev->ibh_mr_shift++) {
+ if (hdev->ibh_mr_size == (1ULL << hdev->ibh_mr_shift) ||
+ hdev->ibh_mr_size == (1ULL << hdev->ibh_mr_shift) - 1)
+ return 0;
+ }
+
+ CERROR("Invalid mr size: %#llx\n", hdev->ibh_mr_size);
+ return -EINVAL;
+}
+
+static void kiblnd_hdev_cleanup_mrs(kib_hca_dev_t *hdev)
+{
+ int i;
+
+ if (hdev->ibh_nmrs == 0 || hdev->ibh_mrs == NULL)
+ return;
+
+ for (i = 0; i < hdev->ibh_nmrs; i++) {
+ if (hdev->ibh_mrs[i] == NULL)
+ break;
+
+ ib_dereg_mr(hdev->ibh_mrs[i]);
+ }
+
+ LIBCFS_FREE(hdev->ibh_mrs, sizeof(*hdev->ibh_mrs) * hdev->ibh_nmrs);
+ hdev->ibh_mrs = NULL;
+ hdev->ibh_nmrs = 0;
+}
+
+void kiblnd_hdev_destroy(kib_hca_dev_t *hdev)
+{
+ kiblnd_hdev_cleanup_mrs(hdev);
+
+ if (hdev->ibh_pd != NULL)
+ ib_dealloc_pd(hdev->ibh_pd);
+
+ if (hdev->ibh_cmid != NULL)
+ rdma_destroy_id(hdev->ibh_cmid);
+
+ LIBCFS_FREE(hdev, sizeof(*hdev));
+}
+
+static int kiblnd_hdev_setup_mrs(kib_hca_dev_t *hdev)
+{
+ struct ib_mr *mr;
+ int i;
+ int rc;
+ __u64 mm_size;
+ __u64 mr_size;
+ int acflags = IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_WRITE;
+
+ rc = kiblnd_hdev_get_attr(hdev);
+ if (rc != 0)
+ return rc;
+
+ if (hdev->ibh_mr_shift == 64) {
+ LIBCFS_ALLOC(hdev->ibh_mrs, 1 * sizeof(*hdev->ibh_mrs));
+ if (hdev->ibh_mrs == NULL) {
+ CERROR("Failed to allocate MRs table\n");
+ return -ENOMEM;
+ }
+
+ hdev->ibh_mrs[0] = NULL;
+ hdev->ibh_nmrs = 1;
+
+ mr = ib_get_dma_mr(hdev->ibh_pd, acflags);
+ if (IS_ERR(mr)) {
+ CERROR("Failed ib_get_dma_mr : %ld\n", PTR_ERR(mr));
+ kiblnd_hdev_cleanup_mrs(hdev);
+ return PTR_ERR(mr);
+ }
+
+ hdev->ibh_mrs[0] = mr;
+
+ goto out;
+ }
+
+ mr_size = 1ULL << hdev->ibh_mr_shift;
+ mm_size = (unsigned long)high_memory - PAGE_OFFSET;
+
+ hdev->ibh_nmrs = (int)((mm_size + mr_size - 1) >> hdev->ibh_mr_shift);
+
+ if (hdev->ibh_mr_shift < 32 || hdev->ibh_nmrs > 1024) {
+ /* it's 4T..., assume we will re-code at that time */
+ CERROR("Can't support memory size: x%#llx with MR size: x%#llx\n",
+ mm_size, mr_size);
+ return -EINVAL;
+ }
+
+ /* create an array of MRs to cover all memory */
+ LIBCFS_ALLOC(hdev->ibh_mrs, sizeof(*hdev->ibh_mrs) * hdev->ibh_nmrs);
+ if (hdev->ibh_mrs == NULL) {
+ CERROR("Failed to allocate MRs' table\n");
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < hdev->ibh_nmrs; i++) {
+ struct ib_phys_buf ipb;
+ __u64 iova;
+
+ ipb.size = hdev->ibh_mr_size;
+ ipb.addr = i * mr_size;
+ iova = ipb.addr;
+
+ mr = ib_reg_phys_mr(hdev->ibh_pd, &ipb, 1, acflags, &iova);
+ if (IS_ERR(mr)) {
+ CERROR("Failed ib_reg_phys_mr addr %#llx size %#llx : %ld\n",
+ ipb.addr, ipb.size, PTR_ERR(mr));
+ kiblnd_hdev_cleanup_mrs(hdev);
+ return PTR_ERR(mr);
+ }
+
+ LASSERT(iova == ipb.addr);
+
+ hdev->ibh_mrs[i] = mr;
+ }
+
+out:
+ if (hdev->ibh_mr_size != ~0ULL || hdev->ibh_nmrs != 1)
+ LCONSOLE_INFO("Register global MR array, MR size: %#llx, array size: %d\n",
+ hdev->ibh_mr_size, hdev->ibh_nmrs);
+ return 0;
+}
+
+/* DUMMY */
+static int kiblnd_dummy_callback(struct rdma_cm_id *cmid,
+ struct rdma_cm_event *event)
+{
+ return 0;
+}
+
+static int kiblnd_dev_need_failover(kib_dev_t *dev)
+{
+ struct rdma_cm_id *cmid;
+ struct sockaddr_in srcaddr;
+ struct sockaddr_in dstaddr;
+ int rc;
+
+ if (dev->ibd_hdev == NULL || /* initializing */
+ dev->ibd_hdev->ibh_cmid == NULL || /* listener is dead */
+ *kiblnd_tunables.kib_dev_failover > 1) /* debugging */
+ return 1;
+
+ /* XXX: it's UGLY, but I don't have better way to find
+ * ib-bonding HCA failover because:
+ *
+ * a. no reliable CM event for HCA failover...
+ * b. no OFED API to get ib_device for current net_device...
+ *
+ * We have only two choices at this point:
+ *
+ * a. rdma_bind_addr(), it will conflict with listener cmid
+ * b. rdma_resolve_addr() to zero addr */
+ cmid = kiblnd_rdma_create_id(kiblnd_dummy_callback, dev, RDMA_PS_TCP,
+ IB_QPT_RC);
+ if (IS_ERR(cmid)) {
+ rc = PTR_ERR(cmid);
+ CERROR("Failed to create cmid for failover: %d\n", rc);
+ return rc;
+ }
+
+ memset(&srcaddr, 0, sizeof(srcaddr));
+ srcaddr.sin_family = AF_INET;
+ srcaddr.sin_addr.s_addr = (__force u32)htonl(dev->ibd_ifip);
+
+ memset(&dstaddr, 0, sizeof(dstaddr));
+ dstaddr.sin_family = AF_INET;
+ rc = rdma_resolve_addr(cmid, (struct sockaddr *)&srcaddr,
+ (struct sockaddr *)&dstaddr, 1);
+ if (rc != 0 || cmid->device == NULL) {
+ CERROR("Failed to bind %s:%pI4h to device(%p): %d\n",
+ dev->ibd_ifname, &dev->ibd_ifip,
+ cmid->device, rc);
+ rdma_destroy_id(cmid);
+ return rc;
+ }
+
+ if (dev->ibd_hdev->ibh_ibdev == cmid->device) {
+ /* don't need device failover */
+ rdma_destroy_id(cmid);
+ return 0;
+ }
+
+ return 1;
+}
+
+int kiblnd_dev_failover(kib_dev_t *dev)
+{
+ LIST_HEAD(zombie_tpo);
+ LIST_HEAD(zombie_ppo);
+ LIST_HEAD(zombie_fpo);
+ struct rdma_cm_id *cmid = NULL;
+ kib_hca_dev_t *hdev = NULL;
+ kib_hca_dev_t *old;
+ struct ib_pd *pd;
+ kib_net_t *net;
+ struct sockaddr_in addr;
+ unsigned long flags;
+ int rc = 0;
+ int i;
+
+ LASSERT(*kiblnd_tunables.kib_dev_failover > 1 ||
+ dev->ibd_can_failover ||
+ dev->ibd_hdev == NULL);
+
+ rc = kiblnd_dev_need_failover(dev);
+ if (rc <= 0)
+ goto out;
+
+ if (dev->ibd_hdev != NULL &&
+ dev->ibd_hdev->ibh_cmid != NULL) {
+ /* XXX it's not good to close old listener at here,
+ * because we can fail to create new listener.
+ * But we have to close it now, otherwise rdma_bind_addr
+ * will return EADDRINUSE... How crap! */
+ write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+ cmid = dev->ibd_hdev->ibh_cmid;
+ /* make next schedule of kiblnd_dev_need_failover()
+ * return 1 for me */
+ dev->ibd_hdev->ibh_cmid = NULL;
+ write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+ rdma_destroy_id(cmid);
+ }
+
+ cmid = kiblnd_rdma_create_id(kiblnd_cm_callback, dev, RDMA_PS_TCP,
+ IB_QPT_RC);
+ if (IS_ERR(cmid)) {
+ rc = PTR_ERR(cmid);
+ CERROR("Failed to create cmid for failover: %d\n", rc);
+ goto out;
+ }
+
+ memset(&addr, 0, sizeof(addr));
+ addr.sin_family = AF_INET;
+ addr.sin_addr.s_addr = (__force u32)htonl(dev->ibd_ifip);
+ addr.sin_port = htons(*kiblnd_tunables.kib_service);
+
+ /* Bind to failover device or port */
+ rc = rdma_bind_addr(cmid, (struct sockaddr *)&addr);
+ if (rc != 0 || cmid->device == NULL) {
+ CERROR("Failed to bind %s:%pI4h to device(%p): %d\n",
+ dev->ibd_ifname, &dev->ibd_ifip,
+ cmid->device, rc);
+ rdma_destroy_id(cmid);
+ goto out;
+ }
+
+ LIBCFS_ALLOC(hdev, sizeof(*hdev));
+ if (hdev == NULL) {
+ CERROR("Failed to allocate kib_hca_dev\n");
+ rdma_destroy_id(cmid);
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ atomic_set(&hdev->ibh_ref, 1);
+ hdev->ibh_dev = dev;
+ hdev->ibh_cmid = cmid;
+ hdev->ibh_ibdev = cmid->device;
+
+ pd = ib_alloc_pd(cmid->device);
+ if (IS_ERR(pd)) {
+ rc = PTR_ERR(pd);
+ CERROR("Can't allocate PD: %d\n", rc);
+ goto out;
+ }
+
+ hdev->ibh_pd = pd;
+
+ rc = rdma_listen(cmid, 0);
+ if (rc != 0) {
+ CERROR("Can't start new listener: %d\n", rc);
+ goto out;
+ }
+
+ rc = kiblnd_hdev_setup_mrs(hdev);
+ if (rc != 0) {
+ CERROR("Can't setup device: %d\n", rc);
+ goto out;
+ }
+
+ write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+ old = dev->ibd_hdev;
+ dev->ibd_hdev = hdev; /* take over the refcount */
+ hdev = old;
+
+ list_for_each_entry(net, &dev->ibd_nets, ibn_list) {
+ cfs_cpt_for_each(i, lnet_cpt_table()) {
+ kiblnd_fail_poolset(&net->ibn_tx_ps[i]->tps_poolset,
+ &zombie_tpo);
+
+ if (net->ibn_fmr_ps != NULL) {
+ kiblnd_fail_fmr_poolset(net->ibn_fmr_ps[i],
+ &zombie_fpo);
+
+ } else if (net->ibn_pmr_ps != NULL) {
+ kiblnd_fail_poolset(&net->ibn_pmr_ps[i]->
+ pps_poolset, &zombie_ppo);
+ }
+ }
+ }
+
+ write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+ out:
+ if (!list_empty(&zombie_tpo))
+ kiblnd_destroy_pool_list(&zombie_tpo);
+ if (!list_empty(&zombie_ppo))
+ kiblnd_destroy_pool_list(&zombie_ppo);
+ if (!list_empty(&zombie_fpo))
+ kiblnd_destroy_fmr_pool_list(&zombie_fpo);
+ if (hdev != NULL)
+ kiblnd_hdev_decref(hdev);
+
+ if (rc != 0)
+ dev->ibd_failed_failover++;
+ else
+ dev->ibd_failed_failover = 0;
+
+ return rc;
+}
+
+void kiblnd_destroy_dev(kib_dev_t *dev)
+{
+ LASSERT(dev->ibd_nnets == 0);
+ LASSERT(list_empty(&dev->ibd_nets));
+
+ list_del(&dev->ibd_fail_list);
+ list_del(&dev->ibd_list);
+
+ if (dev->ibd_hdev != NULL)
+ kiblnd_hdev_decref(dev->ibd_hdev);
+
+ LIBCFS_FREE(dev, sizeof(*dev));
+}
+
+static kib_dev_t *kiblnd_create_dev(char *ifname)
+{
+ struct net_device *netdev;
+ kib_dev_t *dev;
+ __u32 netmask;
+ __u32 ip;
+ int up;
+ int rc;
+
+ rc = libcfs_ipif_query(ifname, &up, &ip, &netmask);
+ if (rc != 0) {
+ CERROR("Can't query IPoIB interface %s: %d\n",
+ ifname, rc);
+ return NULL;
+ }
+
+ if (!up) {
+ CERROR("Can't query IPoIB interface %s: it's down\n", ifname);
+ return NULL;
+ }
+
+ LIBCFS_ALLOC(dev, sizeof(*dev));
+ if (dev == NULL)
+ return NULL;
+
+ netdev = dev_get_by_name(&init_net, ifname);
+ if (netdev == NULL) {
+ dev->ibd_can_failover = 0;
+ } else {
+ dev->ibd_can_failover = !!(netdev->flags & IFF_MASTER);
+ dev_put(netdev);
+ }
+
+ INIT_LIST_HEAD(&dev->ibd_nets);
+ INIT_LIST_HEAD(&dev->ibd_list); /* not yet in kib_devs */
+ INIT_LIST_HEAD(&dev->ibd_fail_list);
+ dev->ibd_ifip = ip;
+ strcpy(&dev->ibd_ifname[0], ifname);
+
+ /* initialize the device */
+ rc = kiblnd_dev_failover(dev);
+ if (rc != 0) {
+ CERROR("Can't initialize device: %d\n", rc);
+ LIBCFS_FREE(dev, sizeof(*dev));
+ return NULL;
+ }
+
+ list_add_tail(&dev->ibd_list,
+ &kiblnd_data.kib_devs);
+ return dev;
+}
+
+static void kiblnd_base_shutdown(void)
+{
+ struct kib_sched_info *sched;
+ int i;
+
+ LASSERT(list_empty(&kiblnd_data.kib_devs));
+
+ CDEBUG(D_MALLOC, "before LND base cleanup: kmem %d\n",
+ atomic_read(&libcfs_kmemory));
+
+ switch (kiblnd_data.kib_init) {
+ default:
+ LBUG();
+
+ case IBLND_INIT_ALL:
+ case IBLND_INIT_DATA:
+ LASSERT(kiblnd_data.kib_peers != NULL);
+ for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++)
+ LASSERT(list_empty(&kiblnd_data.kib_peers[i]));
+ LASSERT(list_empty(&kiblnd_data.kib_connd_zombies));
+ LASSERT(list_empty(&kiblnd_data.kib_connd_conns));
+
+ /* flag threads to terminate; wake and wait for them to die */
+ kiblnd_data.kib_shutdown = 1;
+
+ /* NB: we really want to stop scheduler threads net by net
+ * instead of the whole module, this should be improved
+ * with dynamic configuration LNet */
+ cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds)
+ wake_up_all(&sched->ibs_waitq);
+
+ wake_up_all(&kiblnd_data.kib_connd_waitq);
+ wake_up_all(&kiblnd_data.kib_failover_waitq);
+
+ i = 2;
+ while (atomic_read(&kiblnd_data.kib_nthreads) != 0) {
+ i++;
+ /* power of 2 ? */
+ CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET,
+ "Waiting for %d threads to terminate\n",
+ atomic_read(&kiblnd_data.kib_nthreads));
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(cfs_time_seconds(1));
+ }
+
+ /* fall through */
+
+ case IBLND_INIT_NOTHING:
+ break;
+ }
+
+ if (kiblnd_data.kib_peers != NULL) {
+ LIBCFS_FREE(kiblnd_data.kib_peers,
+ sizeof(struct list_head) *
+ kiblnd_data.kib_peer_hash_size);
+ }
+
+ if (kiblnd_data.kib_scheds != NULL)
+ cfs_percpt_free(kiblnd_data.kib_scheds);
+
+ CDEBUG(D_MALLOC, "after LND base cleanup: kmem %d\n",
+ atomic_read(&libcfs_kmemory));
+
+ kiblnd_data.kib_init = IBLND_INIT_NOTHING;
+ module_put(THIS_MODULE);
+}
+
+void kiblnd_shutdown(lnet_ni_t *ni)
+{
+ kib_net_t *net = ni->ni_data;
+ rwlock_t *g_lock = &kiblnd_data.kib_global_lock;
+ int i;
+ unsigned long flags;
+
+ LASSERT(kiblnd_data.kib_init == IBLND_INIT_ALL);
+
+ if (net == NULL)
+ goto out;
+
+ CDEBUG(D_MALLOC, "before LND net cleanup: kmem %d\n",
+ atomic_read(&libcfs_kmemory));
+
+ write_lock_irqsave(g_lock, flags);
+ net->ibn_shutdown = 1;
+ write_unlock_irqrestore(g_lock, flags);
+
+ switch (net->ibn_init) {
+ default:
+ LBUG();
+
+ case IBLND_INIT_ALL:
+ /* nuke all existing peers within this net */
+ kiblnd_del_peer(ni, LNET_NID_ANY);
+
+ /* Wait for all peer state to clean up */
+ i = 2;
+ while (atomic_read(&net->ibn_npeers) != 0) {
+ i++;
+ CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* 2**n? */
+ "%s: waiting for %d peers to disconnect\n",
+ libcfs_nid2str(ni->ni_nid),
+ atomic_read(&net->ibn_npeers));
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(cfs_time_seconds(1));
+ }
+
+ kiblnd_net_fini_pools(net);
+
+ write_lock_irqsave(g_lock, flags);
+ LASSERT(net->ibn_dev->ibd_nnets > 0);
+ net->ibn_dev->ibd_nnets--;
+ list_del(&net->ibn_list);
+ write_unlock_irqrestore(g_lock, flags);
+
+ /* fall through */
+
+ case IBLND_INIT_NOTHING:
+ LASSERT(atomic_read(&net->ibn_nconns) == 0);
+
+ if (net->ibn_dev != NULL &&
+ net->ibn_dev->ibd_nnets == 0)
+ kiblnd_destroy_dev(net->ibn_dev);
+
+ break;
+ }
+
+ CDEBUG(D_MALLOC, "after LND net cleanup: kmem %d\n",
+ atomic_read(&libcfs_kmemory));
+
+ net->ibn_init = IBLND_INIT_NOTHING;
+ ni->ni_data = NULL;
+
+ LIBCFS_FREE(net, sizeof(*net));
+
+out:
+ if (list_empty(&kiblnd_data.kib_devs))
+ kiblnd_base_shutdown();
+}
+
+static int kiblnd_base_startup(void)
+{
+ struct kib_sched_info *sched;
+ int rc;
+ int i;
+
+ LASSERT(kiblnd_data.kib_init == IBLND_INIT_NOTHING);
+
+ try_module_get(THIS_MODULE);
+ /* zero pointers, flags etc */
+ memset(&kiblnd_data, 0, sizeof(kiblnd_data));
+
+ rwlock_init(&kiblnd_data.kib_global_lock);
+
+ INIT_LIST_HEAD(&kiblnd_data.kib_devs);
+ INIT_LIST_HEAD(&kiblnd_data.kib_failed_devs);
+
+ kiblnd_data.kib_peer_hash_size = IBLND_PEER_HASH_SIZE;
+ LIBCFS_ALLOC(kiblnd_data.kib_peers,
+ sizeof(struct list_head) *
+ kiblnd_data.kib_peer_hash_size);
+ if (kiblnd_data.kib_peers == NULL)
+ goto failed;
+ for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++)
+ INIT_LIST_HEAD(&kiblnd_data.kib_peers[i]);
+
+ spin_lock_init(&kiblnd_data.kib_connd_lock);
+ INIT_LIST_HEAD(&kiblnd_data.kib_connd_conns);
+ INIT_LIST_HEAD(&kiblnd_data.kib_connd_zombies);
+ init_waitqueue_head(&kiblnd_data.kib_connd_waitq);
+ init_waitqueue_head(&kiblnd_data.kib_failover_waitq);
+
+ kiblnd_data.kib_scheds = cfs_percpt_alloc(lnet_cpt_table(),
+ sizeof(*sched));
+ if (kiblnd_data.kib_scheds == NULL)
+ goto failed;
+
+ cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds) {
+ int nthrs;
+
+ spin_lock_init(&sched->ibs_lock);
+ INIT_LIST_HEAD(&sched->ibs_conns);
+ init_waitqueue_head(&sched->ibs_waitq);
+
+ nthrs = cfs_cpt_weight(lnet_cpt_table(), i);
+ if (*kiblnd_tunables.kib_nscheds > 0) {
+ nthrs = min(nthrs, *kiblnd_tunables.kib_nscheds);
+ } else {
+ /* max to half of CPUs, another half is reserved for
+ * upper layer modules */
+ nthrs = min(max(IBLND_N_SCHED, nthrs >> 1), nthrs);
+ }
+
+ sched->ibs_nthreads_max = nthrs;
+ sched->ibs_cpt = i;
+ }
+
+ kiblnd_data.kib_error_qpa.qp_state = IB_QPS_ERR;
+
+ /* lists/ptrs/locks initialised */
+ kiblnd_data.kib_init = IBLND_INIT_DATA;
+ /*****************************************************/
+
+ rc = kiblnd_thread_start(kiblnd_connd, NULL, "kiblnd_connd");
+ if (rc != 0) {
+ CERROR("Can't spawn o2iblnd connd: %d\n", rc);
+ goto failed;
+ }
+
+ if (*kiblnd_tunables.kib_dev_failover != 0)
+ rc = kiblnd_thread_start(kiblnd_failover_thread, NULL,
+ "kiblnd_failover");
+
+ if (rc != 0) {
+ CERROR("Can't spawn o2iblnd failover thread: %d\n", rc);
+ goto failed;
+ }
+
+ /* flag everything initialised */
+ kiblnd_data.kib_init = IBLND_INIT_ALL;
+ /*****************************************************/
+
+ return 0;
+
+ failed:
+ kiblnd_base_shutdown();
+ return -ENETDOWN;
+}
+
+static int kiblnd_start_schedulers(struct kib_sched_info *sched)
+{
+ int rc = 0;
+ int nthrs;
+ int i;
+
+ if (sched->ibs_nthreads == 0) {
+ if (*kiblnd_tunables.kib_nscheds > 0) {
+ nthrs = sched->ibs_nthreads_max;
+ } else {
+ nthrs = cfs_cpt_weight(lnet_cpt_table(),
+ sched->ibs_cpt);
+ nthrs = min(max(IBLND_N_SCHED, nthrs >> 1), nthrs);
+ nthrs = min(IBLND_N_SCHED_HIGH, nthrs);
+ }
+ } else {
+ LASSERT(sched->ibs_nthreads <= sched->ibs_nthreads_max);
+ /* increase one thread if there is new interface */
+ nthrs = sched->ibs_nthreads < sched->ibs_nthreads_max;
+ }
+
+ for (i = 0; i < nthrs; i++) {
+ long id;
+ char name[20];
+
+ id = KIB_THREAD_ID(sched->ibs_cpt, sched->ibs_nthreads + i);
+ snprintf(name, sizeof(name), "kiblnd_sd_%02ld_%02ld",
+ KIB_THREAD_CPT(id), KIB_THREAD_TID(id));
+ rc = kiblnd_thread_start(kiblnd_scheduler, (void *)id, name);
+ if (rc == 0)
+ continue;
+
+ CERROR("Can't spawn thread %d for scheduler[%d]: %d\n",
+ sched->ibs_cpt, sched->ibs_nthreads + i, rc);
+ break;
+ }
+
+ sched->ibs_nthreads += i;
+ return rc;
+}
+
+static int kiblnd_dev_start_threads(kib_dev_t *dev, int newdev, __u32 *cpts,
+ int ncpts)
+{
+ int cpt;
+ int rc;
+ int i;
+
+ for (i = 0; i < ncpts; i++) {
+ struct kib_sched_info *sched;
+
+ cpt = (cpts == NULL) ? i : cpts[i];
+ sched = kiblnd_data.kib_scheds[cpt];
+
+ if (!newdev && sched->ibs_nthreads > 0)
+ continue;
+
+ rc = kiblnd_start_schedulers(kiblnd_data.kib_scheds[cpt]);
+ if (rc != 0) {
+ CERROR("Failed to start scheduler threads for %s\n",
+ dev->ibd_ifname);
+ return rc;
+ }
+ }
+ return 0;
+}
+
+static kib_dev_t *kiblnd_dev_search(char *ifname)
+{
+ kib_dev_t *alias = NULL;
+ kib_dev_t *dev;
+ char *colon;
+ char *colon2;
+
+ colon = strchr(ifname, ':');
+ list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) {
+ if (strcmp(&dev->ibd_ifname[0], ifname) == 0)
+ return dev;
+
+ if (alias != NULL)
+ continue;
+
+ colon2 = strchr(dev->ibd_ifname, ':');
+ if (colon != NULL)
+ *colon = 0;
+ if (colon2 != NULL)
+ *colon2 = 0;
+
+ if (strcmp(&dev->ibd_ifname[0], ifname) == 0)
+ alias = dev;
+
+ if (colon != NULL)
+ *colon = ':';
+ if (colon2 != NULL)
+ *colon2 = ':';
+ }
+ return alias;
+}
+
+int kiblnd_startup(lnet_ni_t *ni)
+{
+ char *ifname;
+ kib_dev_t *ibdev = NULL;
+ kib_net_t *net;
+ struct timeval tv;
+ unsigned long flags;
+ int rc;
+ int newdev;
+
+ LASSERT(ni->ni_lnd == &the_o2iblnd);
+
+ if (kiblnd_data.kib_init == IBLND_INIT_NOTHING) {
+ rc = kiblnd_base_startup();
+ if (rc != 0)
+ return rc;
+ }
+
+ LIBCFS_ALLOC(net, sizeof(*net));
+ ni->ni_data = net;
+ if (net == NULL)
+ goto net_failed;
+
+ do_gettimeofday(&tv);
+ net->ibn_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
+
+ ni->ni_peertimeout = *kiblnd_tunables.kib_peertimeout;
+ ni->ni_maxtxcredits = *kiblnd_tunables.kib_credits;
+ ni->ni_peertxcredits = *kiblnd_tunables.kib_peertxcredits;
+ ni->ni_peerrtrcredits = *kiblnd_tunables.kib_peerrtrcredits;
+
+ if (ni->ni_interfaces[0] != NULL) {
+ /* Use the IPoIB interface specified in 'networks=' */
+
+ CLASSERT(LNET_MAX_INTERFACES > 1);
+ if (ni->ni_interfaces[1] != NULL) {
+ CERROR("Multiple interfaces not supported\n");
+ goto failed;
+ }
+
+ ifname = ni->ni_interfaces[0];
+ } else {
+ ifname = *kiblnd_tunables.kib_default_ipif;
+ }
+
+ if (strlen(ifname) >= sizeof(ibdev->ibd_ifname)) {
+ CERROR("IPoIB interface name too long: %s\n", ifname);
+ goto failed;
+ }
+
+ ibdev = kiblnd_dev_search(ifname);
+
+ newdev = ibdev == NULL;
+ /* hmm...create kib_dev even for alias */
+ if (ibdev == NULL || strcmp(&ibdev->ibd_ifname[0], ifname) != 0)
+ ibdev = kiblnd_create_dev(ifname);
+
+ if (ibdev == NULL)
+ goto failed;
+
+ net->ibn_dev = ibdev;
+ ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ibdev->ibd_ifip);
+
+ rc = kiblnd_dev_start_threads(ibdev, newdev,
+ ni->ni_cpts, ni->ni_ncpts);
+ if (rc != 0)
+ goto failed;
+
+ rc = kiblnd_net_init_pools(net, ni->ni_cpts, ni->ni_ncpts);
+ if (rc != 0) {
+ CERROR("Failed to initialize NI pools: %d\n", rc);
+ goto failed;
+ }
+
+ write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+ ibdev->ibd_nnets++;
+ list_add_tail(&net->ibn_list, &ibdev->ibd_nets);
+ write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+ net->ibn_init = IBLND_INIT_ALL;
+
+ return 0;
+
+failed:
+ if (net->ibn_dev == NULL && ibdev != NULL)
+ kiblnd_destroy_dev(ibdev);
+
+net_failed:
+ kiblnd_shutdown(ni);
+
+ CDEBUG(D_NET, "kiblnd_startup failed\n");
+ return -ENETDOWN;
+}
+
+static void __exit kiblnd_module_fini(void)
+{
+ lnet_unregister_lnd(&the_o2iblnd);
+}
+
+static int __init kiblnd_module_init(void)
+{
+ int rc;
+
+ CLASSERT(sizeof(kib_msg_t) <= IBLND_MSG_SIZE);
+ CLASSERT(offsetof(kib_msg_t,
+ ibm_u.get.ibgm_rd.rd_frags[IBLND_MAX_RDMA_FRAGS])
+ <= IBLND_MSG_SIZE);
+ CLASSERT(offsetof(kib_msg_t,
+ ibm_u.putack.ibpam_rd.rd_frags[IBLND_MAX_RDMA_FRAGS])
+ <= IBLND_MSG_SIZE);
+
+ rc = kiblnd_tunables_init();
+ if (rc != 0)
+ return rc;
+
+ lnet_register_lnd(&the_o2iblnd);
+
+ return 0;
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Kernel OpenIB gen2 LND v2.00");
+MODULE_LICENSE("GPL");
+
+module_init(kiblnd_module_init);
+module_exit(kiblnd_module_fini);
diff --git a/kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h b/kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h
new file mode 100644
index 000000000..cd664d025
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h
@@ -0,0 +1,1030 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/klnds/o2iblnd/o2iblnd.h
+ *
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/uio.h>
+#include <linux/uaccess.h>
+
+#include <asm/io.h>
+
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/list.h>
+#include <linux/kmod.h>
+#include <linux/sysctl.h>
+#include <linux/pci.h>
+
+#include <net/sock.h>
+#include <linux/in.h>
+
+#define DEBUG_SUBSYSTEM S_LND
+
+#include "../../../include/linux/libcfs/libcfs.h"
+#include "../../../include/linux/lnet/lnet.h"
+#include "../../../include/linux/lnet/lib-lnet.h"
+#include "../../../include/linux/lnet/lnet-sysctl.h"
+
+#include <rdma/rdma_cm.h>
+#include <rdma/ib_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_fmr_pool.h>
+
+#define IBLND_PEER_HASH_SIZE 101 /* # peer lists */
+/* # scheduler loops before reschedule */
+#define IBLND_RESCHED 100
+
+#define IBLND_N_SCHED 2
+#define IBLND_N_SCHED_HIGH 4
+
+typedef struct {
+ int *kib_dev_failover; /* HCA failover */
+ unsigned int *kib_service; /* IB service number */
+ int *kib_min_reconnect_interval; /* first failed connection retry... */
+ int *kib_max_reconnect_interval; /* ...exponentially increasing to this */
+ int *kib_cksum; /* checksum kib_msg_t? */
+ int *kib_timeout; /* comms timeout (seconds) */
+ int *kib_keepalive; /* keepalive timeout (seconds) */
+ int *kib_ntx; /* # tx descs */
+ int *kib_credits; /* # concurrent sends */
+ int *kib_peertxcredits; /* # concurrent sends to 1 peer */
+ int *kib_peerrtrcredits; /* # per-peer router buffer credits */
+ int *kib_peercredits_hiw; /* # when eagerly to return credits */
+ int *kib_peertimeout; /* seconds to consider peer dead */
+ char **kib_default_ipif; /* default IPoIB interface */
+ int *kib_retry_count;
+ int *kib_rnr_retry_count;
+ int *kib_concurrent_sends; /* send work queue sizing */
+ int *kib_ib_mtu; /* IB MTU */
+ int *kib_map_on_demand; /* map-on-demand if RD has more fragments
+ * than this value, 0 disable map-on-demand */
+ int *kib_pmr_pool_size; /* # physical MR in pool */
+ int *kib_fmr_pool_size; /* # FMRs in pool */
+ int *kib_fmr_flush_trigger; /* When to trigger FMR flush */
+ int *kib_fmr_cache; /* enable FMR pool cache? */
+ int *kib_require_priv_port;/* accept only privileged ports */
+ int *kib_use_priv_port; /* use privileged port for active connect */
+ /* # threads on each CPT */
+ int *kib_nscheds;
+} kib_tunables_t;
+
+extern kib_tunables_t kiblnd_tunables;
+
+#define IBLND_MSG_QUEUE_SIZE_V1 8 /* V1 only : # messages/RDMAs in-flight */
+#define IBLND_CREDIT_HIGHWATER_V1 7 /* V1 only : when eagerly to return credits */
+
+#define IBLND_CREDITS_DEFAULT 8 /* default # of peer credits */
+#define IBLND_CREDITS_MAX ((typeof(((kib_msg_t*) 0)->ibm_credits)) - 1) /* Max # of peer credits */
+
+#define IBLND_MSG_QUEUE_SIZE(v) ((v) == IBLND_MSG_VERSION_1 ? \
+ IBLND_MSG_QUEUE_SIZE_V1 : \
+ *kiblnd_tunables.kib_peertxcredits) /* # messages/RDMAs in-flight */
+#define IBLND_CREDITS_HIGHWATER(v) ((v) == IBLND_MSG_VERSION_1 ? \
+ IBLND_CREDIT_HIGHWATER_V1 : \
+ *kiblnd_tunables.kib_peercredits_hiw) /* when eagerly to return credits */
+
+#define kiblnd_rdma_create_id(cb, dev, ps, qpt) rdma_create_id(cb, dev, ps, qpt)
+
+static inline int
+kiblnd_concurrent_sends_v1(void)
+{
+ if (*kiblnd_tunables.kib_concurrent_sends > IBLND_MSG_QUEUE_SIZE_V1 * 2)
+ return IBLND_MSG_QUEUE_SIZE_V1 * 2;
+
+ if (*kiblnd_tunables.kib_concurrent_sends < IBLND_MSG_QUEUE_SIZE_V1 / 2)
+ return IBLND_MSG_QUEUE_SIZE_V1 / 2;
+
+ return *kiblnd_tunables.kib_concurrent_sends;
+}
+
+#define IBLND_CONCURRENT_SENDS(v) ((v) == IBLND_MSG_VERSION_1 ? \
+ kiblnd_concurrent_sends_v1() : \
+ *kiblnd_tunables.kib_concurrent_sends)
+/* 2 OOB shall suffice for 1 keepalive and 1 returning credits */
+#define IBLND_OOB_CAPABLE(v) ((v) != IBLND_MSG_VERSION_1)
+#define IBLND_OOB_MSGS(v) (IBLND_OOB_CAPABLE(v) ? 2 : 0)
+
+#define IBLND_MSG_SIZE (4<<10) /* max size of queued messages (inc hdr) */
+#define IBLND_MAX_RDMA_FRAGS LNET_MAX_IOV /* max # of fragments supported */
+#define IBLND_CFG_RDMA_FRAGS (*kiblnd_tunables.kib_map_on_demand != 0 ? \
+ *kiblnd_tunables.kib_map_on_demand : \
+ IBLND_MAX_RDMA_FRAGS) /* max # of fragments configured by user */
+#define IBLND_RDMA_FRAGS(v) ((v) == IBLND_MSG_VERSION_1 ? \
+ IBLND_MAX_RDMA_FRAGS : IBLND_CFG_RDMA_FRAGS)
+
+/************************/
+/* derived constants... */
+/* Pools (shared by connections on each CPT) */
+/* These pools can grow at runtime, so don't need give a very large value */
+#define IBLND_TX_POOL 256
+#define IBLND_PMR_POOL 256
+#define IBLND_FMR_POOL 256
+#define IBLND_FMR_POOL_FLUSH 192
+
+/* TX messages (shared by all connections) */
+#define IBLND_TX_MSGS() (*kiblnd_tunables.kib_ntx)
+
+/* RX messages (per connection) */
+#define IBLND_RX_MSGS(v) (IBLND_MSG_QUEUE_SIZE(v) * 2 + IBLND_OOB_MSGS(v))
+#define IBLND_RX_MSG_BYTES(v) (IBLND_RX_MSGS(v) * IBLND_MSG_SIZE)
+#define IBLND_RX_MSG_PAGES(v) ((IBLND_RX_MSG_BYTES(v) + PAGE_SIZE - 1) / PAGE_SIZE)
+
+/* WRs and CQEs (per connection) */
+#define IBLND_RECV_WRS(v) IBLND_RX_MSGS(v)
+#define IBLND_SEND_WRS(v) ((IBLND_RDMA_FRAGS(v) + 1) * IBLND_CONCURRENT_SENDS(v))
+#define IBLND_CQ_ENTRIES(v) (IBLND_RECV_WRS(v) + IBLND_SEND_WRS(v))
+
+struct kib_hca_dev;
+
+/* o2iblnd can run over aliased interface */
+#ifdef IFALIASZ
+#define KIB_IFNAME_SIZE IFALIASZ
+#else
+#define KIB_IFNAME_SIZE 256
+#endif
+
+typedef struct {
+ struct list_head ibd_list; /* chain on kib_devs */
+ struct list_head ibd_fail_list; /* chain on kib_failed_devs */
+ __u32 ibd_ifip; /* IPoIB interface IP */
+ /** IPoIB interface name */
+ char ibd_ifname[KIB_IFNAME_SIZE];
+ int ibd_nnets; /* # nets extant */
+
+ unsigned long ibd_next_failover;
+ int ibd_failed_failover; /* # failover failures */
+ unsigned int ibd_failover; /* failover in progress */
+ unsigned int ibd_can_failover; /* IPoIB interface is a bonding master */
+ struct list_head ibd_nets;
+ struct kib_hca_dev *ibd_hdev;
+} kib_dev_t;
+
+typedef struct kib_hca_dev {
+ struct rdma_cm_id *ibh_cmid; /* listener cmid */
+ struct ib_device *ibh_ibdev; /* IB device */
+ int ibh_page_shift; /* page shift of current HCA */
+ int ibh_page_size; /* page size of current HCA */
+ __u64 ibh_page_mask; /* page mask of current HCA */
+ int ibh_mr_shift; /* bits shift of max MR size */
+ __u64 ibh_mr_size; /* size of MR */
+ int ibh_nmrs; /* # of global MRs */
+ struct ib_mr **ibh_mrs; /* global MR */
+ struct ib_pd *ibh_pd; /* PD */
+ kib_dev_t *ibh_dev; /* owner */
+ atomic_t ibh_ref; /* refcount */
+} kib_hca_dev_t;
+
+/** # of seconds to keep pool alive */
+#define IBLND_POOL_DEADLINE 300
+/** # of seconds to retry if allocation failed */
+#define IBLND_POOL_RETRY 1
+
+typedef struct {
+ int ibp_npages; /* # pages */
+ struct page *ibp_pages[0]; /* page array */
+} kib_pages_t;
+
+struct kib_pmr_pool;
+
+typedef struct {
+ struct list_head pmr_list; /* chain node */
+ struct ib_phys_buf *pmr_ipb; /* physical buffer */
+ struct ib_mr *pmr_mr; /* IB MR */
+ struct kib_pmr_pool *pmr_pool; /* owner of this MR */
+ __u64 pmr_iova; /* Virtual I/O address */
+ int pmr_refcount; /* reference count */
+} kib_phys_mr_t;
+
+struct kib_pool;
+struct kib_poolset;
+
+typedef int (*kib_ps_pool_create_t)(struct kib_poolset *ps,
+ int inc, struct kib_pool **pp_po);
+typedef void (*kib_ps_pool_destroy_t)(struct kib_pool *po);
+typedef void (*kib_ps_node_init_t)(struct kib_pool *po, struct list_head *node);
+typedef void (*kib_ps_node_fini_t)(struct kib_pool *po, struct list_head *node);
+
+struct kib_net;
+
+#define IBLND_POOL_NAME_LEN 32
+
+typedef struct kib_poolset {
+ spinlock_t ps_lock; /* serialize */
+ struct kib_net *ps_net; /* network it belongs to */
+ char ps_name[IBLND_POOL_NAME_LEN]; /* pool set name */
+ struct list_head ps_pool_list; /* list of pools */
+ struct list_head ps_failed_pool_list; /* failed pool list */
+ unsigned long ps_next_retry; /* time stamp for retry if failed to allocate */
+ int ps_increasing; /* is allocating new pool */
+ int ps_pool_size; /* new pool size */
+ int ps_cpt; /* CPT id */
+
+ kib_ps_pool_create_t ps_pool_create; /* create a new pool */
+ kib_ps_pool_destroy_t ps_pool_destroy; /* destroy a pool */
+ kib_ps_node_init_t ps_node_init; /* initialize new allocated node */
+ kib_ps_node_fini_t ps_node_fini; /* finalize node */
+} kib_poolset_t;
+
+typedef struct kib_pool {
+ struct list_head po_list; /* chain on pool list */
+ struct list_head po_free_list; /* pre-allocated node */
+ kib_poolset_t *po_owner; /* pool_set of this pool */
+ unsigned long po_deadline; /* deadline of this pool */
+ int po_allocated; /* # of elements in use */
+ int po_failed; /* pool is created on failed HCA */
+ int po_size; /* # of pre-allocated elements */
+} kib_pool_t;
+
+typedef struct {
+ kib_poolset_t tps_poolset; /* pool-set */
+ __u64 tps_next_tx_cookie; /* cookie of TX */
+} kib_tx_poolset_t;
+
+typedef struct {
+ kib_pool_t tpo_pool; /* pool */
+ struct kib_hca_dev *tpo_hdev; /* device for this pool */
+ struct kib_tx *tpo_tx_descs; /* all the tx descriptors */
+ kib_pages_t *tpo_tx_pages; /* premapped tx msg pages */
+} kib_tx_pool_t;
+
+typedef struct {
+ kib_poolset_t pps_poolset; /* pool-set */
+} kib_pmr_poolset_t;
+
+typedef struct kib_pmr_pool {
+ struct kib_hca_dev *ppo_hdev; /* device for this pool */
+ kib_pool_t ppo_pool; /* pool */
+} kib_pmr_pool_t;
+
+typedef struct {
+ spinlock_t fps_lock; /* serialize */
+ struct kib_net *fps_net; /* IB network */
+ struct list_head fps_pool_list; /* FMR pool list */
+ struct list_head fps_failed_pool_list; /* FMR pool list */
+ __u64 fps_version; /* validity stamp */
+ int fps_cpt; /* CPT id */
+ int fps_pool_size;
+ int fps_flush_trigger;
+ /* is allocating new pool */
+ int fps_increasing;
+ /* time stamp for retry if failed to allocate */
+ unsigned long fps_next_retry;
+} kib_fmr_poolset_t;
+
+typedef struct {
+ struct list_head fpo_list; /* chain on pool list */
+ struct kib_hca_dev *fpo_hdev; /* device for this pool */
+ kib_fmr_poolset_t *fpo_owner; /* owner of this pool */
+ struct ib_fmr_pool *fpo_fmr_pool; /* IB FMR pool */
+ unsigned long fpo_deadline; /* deadline of this pool */
+ int fpo_failed; /* fmr pool is failed */
+ int fpo_map_count; /* # of mapped FMR */
+} kib_fmr_pool_t;
+
+typedef struct {
+ struct ib_pool_fmr *fmr_pfmr; /* IB pool fmr */
+ kib_fmr_pool_t *fmr_pool; /* pool of FMR */
+} kib_fmr_t;
+
+typedef struct kib_net {
+ struct list_head ibn_list; /* chain on kib_dev_t::ibd_nets */
+ __u64 ibn_incarnation; /* my epoch */
+ int ibn_init; /* initialisation state */
+ int ibn_shutdown; /* shutting down? */
+
+ atomic_t ibn_npeers; /* # peers extant */
+ atomic_t ibn_nconns; /* # connections extant */
+
+ kib_tx_poolset_t **ibn_tx_ps; /* tx pool-set */
+ kib_fmr_poolset_t **ibn_fmr_ps; /* fmr pool-set */
+ kib_pmr_poolset_t **ibn_pmr_ps; /* pmr pool-set */
+
+ kib_dev_t *ibn_dev; /* underlying IB device */
+} kib_net_t;
+
+#define KIB_THREAD_SHIFT 16
+#define KIB_THREAD_ID(cpt, tid) ((cpt) << KIB_THREAD_SHIFT | (tid))
+#define KIB_THREAD_CPT(id) ((id) >> KIB_THREAD_SHIFT)
+#define KIB_THREAD_TID(id) ((id) & ((1UL << KIB_THREAD_SHIFT) - 1))
+
+struct kib_sched_info {
+ /* serialise */
+ spinlock_t ibs_lock;
+ /* schedulers sleep here */
+ wait_queue_head_t ibs_waitq;
+ /* conns to check for rx completions */
+ struct list_head ibs_conns;
+ /* number of scheduler threads */
+ int ibs_nthreads;
+ /* max allowed scheduler threads */
+ int ibs_nthreads_max;
+ int ibs_cpt; /* CPT id */
+};
+
+typedef struct {
+ int kib_init; /* initialisation state */
+ int kib_shutdown; /* shut down? */
+ struct list_head kib_devs; /* IB devices extant */
+ /* list head of failed devices */
+ struct list_head kib_failed_devs;
+ /* schedulers sleep here */
+ wait_queue_head_t kib_failover_waitq;
+ atomic_t kib_nthreads; /* # live threads */
+ /* stabilize net/dev/peer/conn ops */
+ rwlock_t kib_global_lock;
+ /* hash table of all my known peers */
+ struct list_head *kib_peers;
+ /* size of kib_peers */
+ int kib_peer_hash_size;
+ /* the connd task (serialisation assertions) */
+ void *kib_connd;
+ /* connections to setup/teardown */
+ struct list_head kib_connd_conns;
+ /* connections with zero refcount */
+ struct list_head kib_connd_zombies;
+ /* connection daemon sleeps here */
+ wait_queue_head_t kib_connd_waitq;
+ spinlock_t kib_connd_lock; /* serialise */
+ struct ib_qp_attr kib_error_qpa; /* QP->ERROR */
+ /* percpt data for schedulers */
+ struct kib_sched_info **kib_scheds;
+} kib_data_t;
+
+#define IBLND_INIT_NOTHING 0
+#define IBLND_INIT_DATA 1
+#define IBLND_INIT_ALL 2
+
+/************************************************************************
+ * IB Wire message format.
+ * These are sent in sender's byte order (i.e. receiver flips).
+ */
+
+typedef struct kib_connparams {
+ __u16 ibcp_queue_depth;
+ __u16 ibcp_max_frags;
+ __u32 ibcp_max_msg_size;
+} WIRE_ATTR kib_connparams_t;
+
+typedef struct {
+ lnet_hdr_t ibim_hdr; /* portals header */
+ char ibim_payload[0]; /* piggy-backed payload */
+} WIRE_ATTR kib_immediate_msg_t;
+
+typedef struct {
+ __u32 rf_nob; /* # bytes this frag */
+ __u64 rf_addr; /* CAVEAT EMPTOR: misaligned!! */
+} WIRE_ATTR kib_rdma_frag_t;
+
+typedef struct {
+ __u32 rd_key; /* local/remote key */
+ __u32 rd_nfrags; /* # fragments */
+ kib_rdma_frag_t rd_frags[0]; /* buffer frags */
+} WIRE_ATTR kib_rdma_desc_t;
+
+typedef struct {
+ lnet_hdr_t ibprm_hdr; /* portals header */
+ __u64 ibprm_cookie; /* opaque completion cookie */
+} WIRE_ATTR kib_putreq_msg_t;
+
+typedef struct {
+ __u64 ibpam_src_cookie; /* reflected completion cookie */
+ __u64 ibpam_dst_cookie; /* opaque completion cookie */
+ kib_rdma_desc_t ibpam_rd; /* sender's sink buffer */
+} WIRE_ATTR kib_putack_msg_t;
+
+typedef struct {
+ lnet_hdr_t ibgm_hdr; /* portals header */
+ __u64 ibgm_cookie; /* opaque completion cookie */
+ kib_rdma_desc_t ibgm_rd; /* rdma descriptor */
+} WIRE_ATTR kib_get_msg_t;
+
+typedef struct {
+ __u64 ibcm_cookie; /* opaque completion cookie */
+ __s32 ibcm_status; /* < 0 failure: >= 0 length */
+} WIRE_ATTR kib_completion_msg_t;
+
+typedef struct {
+ /* First 2 fields fixed FOR ALL TIME */
+ __u32 ibm_magic; /* I'm an ibnal message */
+ __u16 ibm_version; /* this is my version number */
+
+ __u8 ibm_type; /* msg type */
+ __u8 ibm_credits; /* returned credits */
+ __u32 ibm_nob; /* # bytes in whole message */
+ __u32 ibm_cksum; /* checksum (0 == no checksum) */
+ __u64 ibm_srcnid; /* sender's NID */
+ __u64 ibm_srcstamp; /* sender's incarnation */
+ __u64 ibm_dstnid; /* destination's NID */
+ __u64 ibm_dststamp; /* destination's incarnation */
+
+ union {
+ kib_connparams_t connparams;
+ kib_immediate_msg_t immediate;
+ kib_putreq_msg_t putreq;
+ kib_putack_msg_t putack;
+ kib_get_msg_t get;
+ kib_completion_msg_t completion;
+ } WIRE_ATTR ibm_u;
+} WIRE_ATTR kib_msg_t;
+
+#define IBLND_MSG_MAGIC LNET_PROTO_IB_MAGIC /* unique magic */
+
+#define IBLND_MSG_VERSION_1 0x11
+#define IBLND_MSG_VERSION_2 0x12
+#define IBLND_MSG_VERSION IBLND_MSG_VERSION_2
+
+#define IBLND_MSG_CONNREQ 0xc0 /* connection request */
+#define IBLND_MSG_CONNACK 0xc1 /* connection acknowledge */
+#define IBLND_MSG_NOOP 0xd0 /* nothing (just credits) */
+#define IBLND_MSG_IMMEDIATE 0xd1 /* immediate */
+#define IBLND_MSG_PUT_REQ 0xd2 /* putreq (src->sink) */
+#define IBLND_MSG_PUT_NAK 0xd3 /* completion (sink->src) */
+#define IBLND_MSG_PUT_ACK 0xd4 /* putack (sink->src) */
+#define IBLND_MSG_PUT_DONE 0xd5 /* completion (src->sink) */
+#define IBLND_MSG_GET_REQ 0xd6 /* getreq (sink->src) */
+#define IBLND_MSG_GET_DONE 0xd7 /* completion (src->sink: all OK) */
+
+typedef struct {
+ __u32 ibr_magic; /* sender's magic */
+ __u16 ibr_version; /* sender's version */
+ __u8 ibr_why; /* reject reason */
+ __u8 ibr_padding; /* padding */
+ __u64 ibr_incarnation; /* incarnation of peer */
+ kib_connparams_t ibr_cp; /* connection parameters */
+} WIRE_ATTR kib_rej_t;
+
+/* connection rejection reasons */
+#define IBLND_REJECT_CONN_RACE 1 /* You lost connection race */
+#define IBLND_REJECT_NO_RESOURCES 2 /* Out of memory/conns etc */
+#define IBLND_REJECT_FATAL 3 /* Anything else */
+
+#define IBLND_REJECT_CONN_UNCOMPAT 4 /* incompatible version peer */
+#define IBLND_REJECT_CONN_STALE 5 /* stale peer */
+
+#define IBLND_REJECT_RDMA_FRAGS 6 /* Fatal: peer's rdma frags can't match mine */
+#define IBLND_REJECT_MSG_QUEUE_SIZE 7 /* Fatal: peer's msg queue size can't match mine */
+
+/***********************************************************************/
+
+typedef struct kib_rx /* receive message */
+{
+ struct list_head rx_list; /* queue for attention */
+ struct kib_conn *rx_conn; /* owning conn */
+ int rx_nob; /* # bytes received (-1 while posted) */
+ enum ib_wc_status rx_status; /* completion status */
+ kib_msg_t *rx_msg; /* message buffer (host vaddr) */
+ __u64 rx_msgaddr; /* message buffer (I/O addr) */
+ DECLARE_PCI_UNMAP_ADDR (rx_msgunmap); /* for dma_unmap_single() */
+ struct ib_recv_wr rx_wrq; /* receive work item... */
+ struct ib_sge rx_sge; /* ...and its memory */
+} kib_rx_t;
+
+#define IBLND_POSTRX_DONT_POST 0 /* don't post */
+#define IBLND_POSTRX_NO_CREDIT 1 /* post: no credits */
+#define IBLND_POSTRX_PEER_CREDIT 2 /* post: give peer back 1 credit */
+#define IBLND_POSTRX_RSRVD_CREDIT 3 /* post: give myself back 1 reserved credit */
+
+typedef struct kib_tx /* transmit message */
+{
+ struct list_head tx_list; /* queue on idle_txs ibc_tx_queue etc. */
+ kib_tx_pool_t *tx_pool; /* pool I'm from */
+ struct kib_conn *tx_conn; /* owning conn */
+ short tx_sending; /* # tx callbacks outstanding */
+ short tx_queued; /* queued for sending */
+ short tx_waiting; /* waiting for peer */
+ int tx_status; /* LNET completion status */
+ unsigned long tx_deadline; /* completion deadline */
+ __u64 tx_cookie; /* completion cookie */
+ lnet_msg_t *tx_lntmsg[2]; /* lnet msgs to finalize on completion */
+ kib_msg_t *tx_msg; /* message buffer (host vaddr) */
+ __u64 tx_msgaddr; /* message buffer (I/O addr) */
+ DECLARE_PCI_UNMAP_ADDR (tx_msgunmap); /* for dma_unmap_single() */
+ int tx_nwrq; /* # send work items */
+ struct ib_send_wr *tx_wrq; /* send work items... */
+ struct ib_sge *tx_sge; /* ...and their memory */
+ kib_rdma_desc_t *tx_rd; /* rdma descriptor */
+ int tx_nfrags; /* # entries in... */
+ struct scatterlist *tx_frags; /* dma_map_sg descriptor */
+ __u64 *tx_pages; /* rdma phys page addrs */
+ union {
+ kib_phys_mr_t *pmr; /* MR for physical buffer */
+ kib_fmr_t fmr; /* FMR */
+ } tx_u;
+ int tx_dmadir; /* dma direction */
+} kib_tx_t;
+
+typedef struct kib_connvars {
+ /* connection-in-progress variables */
+ kib_msg_t cv_msg;
+} kib_connvars_t;
+
+typedef struct kib_conn {
+ struct kib_sched_info *ibc_sched; /* scheduler information */
+ struct kib_peer *ibc_peer; /* owning peer */
+ kib_hca_dev_t *ibc_hdev; /* HCA bound on */
+ struct list_head ibc_list; /* stash on peer's conn list */
+ struct list_head ibc_sched_list; /* schedule for attention */
+ __u16 ibc_version; /* version of connection */
+ __u64 ibc_incarnation; /* which instance of the peer */
+ atomic_t ibc_refcount; /* # users */
+ int ibc_state; /* what's happening */
+ int ibc_nsends_posted; /* # uncompleted sends */
+ int ibc_noops_posted; /* # uncompleted NOOPs */
+ int ibc_credits; /* # credits I have */
+ int ibc_outstanding_credits; /* # credits to return */
+ int ibc_reserved_credits;/* # ACK/DONE msg credits */
+ int ibc_comms_error; /* set on comms error */
+ unsigned int ibc_nrx:16; /* receive buffers owned */
+ unsigned int ibc_scheduled:1; /* scheduled for attention */
+ unsigned int ibc_ready:1; /* CQ callback fired */
+ /* time of last send */
+ unsigned long ibc_last_send;
+ /** link chain for kiblnd_check_conns only */
+ struct list_head ibc_connd_list;
+ /** rxs completed before ESTABLISHED */
+ struct list_head ibc_early_rxs;
+ /** IBLND_MSG_NOOPs for IBLND_MSG_VERSION_1 */
+ struct list_head ibc_tx_noops;
+ struct list_head ibc_tx_queue; /* sends that need a credit */
+ struct list_head ibc_tx_queue_nocred;/* sends that don't need a credit */
+ struct list_head ibc_tx_queue_rsrvd; /* sends that need to reserve an ACK/DONE msg */
+ struct list_head ibc_active_txs; /* active tx awaiting completion */
+ spinlock_t ibc_lock; /* serialise */
+ kib_rx_t *ibc_rxs; /* the rx descs */
+ kib_pages_t *ibc_rx_pages; /* premapped rx msg pages */
+
+ struct rdma_cm_id *ibc_cmid; /* CM id */
+ struct ib_cq *ibc_cq; /* completion queue */
+
+ kib_connvars_t *ibc_connvars; /* in-progress connection state */
+} kib_conn_t;
+
+#define IBLND_CONN_INIT 0 /* being initialised */
+#define IBLND_CONN_ACTIVE_CONNECT 1 /* active sending req */
+#define IBLND_CONN_PASSIVE_WAIT 2 /* passive waiting for rtu */
+#define IBLND_CONN_ESTABLISHED 3 /* connection established */
+#define IBLND_CONN_CLOSING 4 /* being closed */
+#define IBLND_CONN_DISCONNECTED 5 /* disconnected */
+
+typedef struct kib_peer {
+ struct list_head ibp_list; /* stash on global peer list */
+ lnet_nid_t ibp_nid; /* who's on the other end(s) */
+ lnet_ni_t *ibp_ni; /* LNet interface */
+ atomic_t ibp_refcount; /* # users */
+ struct list_head ibp_conns; /* all active connections */
+ struct list_head ibp_tx_queue; /* msgs waiting for a conn */
+ __u16 ibp_version; /* version of peer */
+ __u64 ibp_incarnation; /* incarnation of peer */
+ int ibp_connecting; /* current active connection attempts */
+ int ibp_accepting; /* current passive connection attempts */
+ int ibp_error; /* errno on closing this peer */
+ unsigned long ibp_last_alive; /* when (in jiffies) I was last alive */
+} kib_peer_t;
+
+extern kib_data_t kiblnd_data;
+
+extern void kiblnd_hdev_destroy(kib_hca_dev_t *hdev);
+
+static inline void
+kiblnd_hdev_addref_locked(kib_hca_dev_t *hdev)
+{
+ LASSERT (atomic_read(&hdev->ibh_ref) > 0);
+ atomic_inc(&hdev->ibh_ref);
+}
+
+static inline void
+kiblnd_hdev_decref(kib_hca_dev_t *hdev)
+{
+ LASSERT (atomic_read(&hdev->ibh_ref) > 0);
+ if (atomic_dec_and_test(&hdev->ibh_ref))
+ kiblnd_hdev_destroy(hdev);
+}
+
+static inline int
+kiblnd_dev_can_failover(kib_dev_t *dev)
+{
+ if (!list_empty(&dev->ibd_fail_list)) /* already scheduled */
+ return 0;
+
+ if (*kiblnd_tunables.kib_dev_failover == 0) /* disabled */
+ return 0;
+
+ if (*kiblnd_tunables.kib_dev_failover > 1) /* force failover */
+ return 1;
+
+ return dev->ibd_can_failover;
+}
+
+#define kiblnd_conn_addref(conn) \
+do { \
+ CDEBUG(D_NET, "conn[%p] (%d)++\n", \
+ (conn), atomic_read(&(conn)->ibc_refcount)); \
+ atomic_inc(&(conn)->ibc_refcount); \
+} while (0)
+
+#define kiblnd_conn_decref(conn) \
+do { \
+ unsigned long flags; \
+ \
+ CDEBUG(D_NET, "conn[%p] (%d)--\n", \
+ (conn), atomic_read(&(conn)->ibc_refcount)); \
+ LASSERT_ATOMIC_POS(&(conn)->ibc_refcount); \
+ if (atomic_dec_and_test(&(conn)->ibc_refcount)) { \
+ spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags); \
+ list_add_tail(&(conn)->ibc_list, \
+ &kiblnd_data.kib_connd_zombies); \
+ wake_up(&kiblnd_data.kib_connd_waitq); \
+ spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);\
+ } \
+} while (0)
+
+#define kiblnd_peer_addref(peer) \
+do { \
+ CDEBUG(D_NET, "peer[%p] -> %s (%d)++\n", \
+ (peer), libcfs_nid2str((peer)->ibp_nid), \
+ atomic_read (&(peer)->ibp_refcount)); \
+ atomic_inc(&(peer)->ibp_refcount); \
+} while (0)
+
+#define kiblnd_peer_decref(peer) \
+do { \
+ CDEBUG(D_NET, "peer[%p] -> %s (%d)--\n", \
+ (peer), libcfs_nid2str((peer)->ibp_nid), \
+ atomic_read (&(peer)->ibp_refcount)); \
+ LASSERT_ATOMIC_POS(&(peer)->ibp_refcount); \
+ if (atomic_dec_and_test(&(peer)->ibp_refcount)) \
+ kiblnd_destroy_peer(peer); \
+} while (0)
+
+static inline struct list_head *
+kiblnd_nid2peerlist (lnet_nid_t nid)
+{
+ unsigned int hash =
+ ((unsigned int)nid) % kiblnd_data.kib_peer_hash_size;
+
+ return (&kiblnd_data.kib_peers [hash]);
+}
+
+static inline int
+kiblnd_peer_active (kib_peer_t *peer)
+{
+ /* Am I in the peer hash table? */
+ return (!list_empty(&peer->ibp_list));
+}
+
+static inline kib_conn_t *
+kiblnd_get_conn_locked (kib_peer_t *peer)
+{
+ LASSERT (!list_empty(&peer->ibp_conns));
+
+ /* just return the first connection */
+ return list_entry(peer->ibp_conns.next, kib_conn_t, ibc_list);
+}
+
+static inline int
+kiblnd_send_keepalive(kib_conn_t *conn)
+{
+ return (*kiblnd_tunables.kib_keepalive > 0) &&
+ cfs_time_after(jiffies, conn->ibc_last_send +
+ *kiblnd_tunables.kib_keepalive*HZ);
+}
+
+static inline int
+kiblnd_need_noop(kib_conn_t *conn)
+{
+ LASSERT (conn->ibc_state >= IBLND_CONN_ESTABLISHED);
+
+ if (conn->ibc_outstanding_credits <
+ IBLND_CREDITS_HIGHWATER(conn->ibc_version) &&
+ !kiblnd_send_keepalive(conn))
+ return 0; /* No need to send NOOP */
+
+ if (IBLND_OOB_CAPABLE(conn->ibc_version)) {
+ if (!list_empty(&conn->ibc_tx_queue_nocred))
+ return 0; /* NOOP can be piggybacked */
+
+ /* No tx to piggyback NOOP onto or no credit to send a tx */
+ return (list_empty(&conn->ibc_tx_queue) ||
+ conn->ibc_credits == 0);
+ }
+
+ if (!list_empty(&conn->ibc_tx_noops) || /* NOOP already queued */
+ !list_empty(&conn->ibc_tx_queue_nocred) || /* piggyback NOOP */
+ conn->ibc_credits == 0) /* no credit */
+ return 0;
+
+ if (conn->ibc_credits == 1 && /* last credit reserved for */
+ conn->ibc_outstanding_credits == 0) /* giving back credits */
+ return 0;
+
+ /* No tx to piggyback NOOP onto or no credit to send a tx */
+ return (list_empty(&conn->ibc_tx_queue) || conn->ibc_credits == 1);
+}
+
+static inline void
+kiblnd_abort_receives(kib_conn_t *conn)
+{
+ ib_modify_qp(conn->ibc_cmid->qp,
+ &kiblnd_data.kib_error_qpa, IB_QP_STATE);
+}
+
+static inline const char *
+kiblnd_queue2str (kib_conn_t *conn, struct list_head *q)
+{
+ if (q == &conn->ibc_tx_queue)
+ return "tx_queue";
+
+ if (q == &conn->ibc_tx_queue_rsrvd)
+ return "tx_queue_rsrvd";
+
+ if (q == &conn->ibc_tx_queue_nocred)
+ return "tx_queue_nocred";
+
+ if (q == &conn->ibc_active_txs)
+ return "active_txs";
+
+ LBUG();
+ return NULL;
+}
+
+/* CAVEAT EMPTOR: We rely on descriptor alignment to allow us to use the
+ * lowest bits of the work request id to stash the work item type. */
+
+#define IBLND_WID_TX 0
+#define IBLND_WID_RDMA 1
+#define IBLND_WID_RX 2
+#define IBLND_WID_MASK 3UL
+
+static inline __u64
+kiblnd_ptr2wreqid (void *ptr, int type)
+{
+ unsigned long lptr = (unsigned long)ptr;
+
+ LASSERT ((lptr & IBLND_WID_MASK) == 0);
+ LASSERT ((type & ~IBLND_WID_MASK) == 0);
+ return (__u64)(lptr | type);
+}
+
+static inline void *
+kiblnd_wreqid2ptr (__u64 wreqid)
+{
+ return (void *)(((unsigned long)wreqid) & ~IBLND_WID_MASK);
+}
+
+static inline int
+kiblnd_wreqid2type (__u64 wreqid)
+{
+ return (wreqid & IBLND_WID_MASK);
+}
+
+static inline void
+kiblnd_set_conn_state (kib_conn_t *conn, int state)
+{
+ conn->ibc_state = state;
+ mb();
+}
+
+static inline void
+kiblnd_init_msg (kib_msg_t *msg, int type, int body_nob)
+{
+ msg->ibm_type = type;
+ msg->ibm_nob = offsetof(kib_msg_t, ibm_u) + body_nob;
+}
+
+static inline int
+kiblnd_rd_size (kib_rdma_desc_t *rd)
+{
+ int i;
+ int size;
+
+ for (i = size = 0; i < rd->rd_nfrags; i++)
+ size += rd->rd_frags[i].rf_nob;
+
+ return size;
+}
+
+static inline __u64
+kiblnd_rd_frag_addr(kib_rdma_desc_t *rd, int index)
+{
+ return rd->rd_frags[index].rf_addr;
+}
+
+static inline __u32
+kiblnd_rd_frag_size(kib_rdma_desc_t *rd, int index)
+{
+ return rd->rd_frags[index].rf_nob;
+}
+
+static inline __u32
+kiblnd_rd_frag_key(kib_rdma_desc_t *rd, int index)
+{
+ return rd->rd_key;
+}
+
+static inline int
+kiblnd_rd_consume_frag(kib_rdma_desc_t *rd, int index, __u32 nob)
+{
+ if (nob < rd->rd_frags[index].rf_nob) {
+ rd->rd_frags[index].rf_addr += nob;
+ rd->rd_frags[index].rf_nob -= nob;
+ } else {
+ index ++;
+ }
+
+ return index;
+}
+
+static inline int
+kiblnd_rd_msg_size(kib_rdma_desc_t *rd, int msgtype, int n)
+{
+ LASSERT (msgtype == IBLND_MSG_GET_REQ ||
+ msgtype == IBLND_MSG_PUT_ACK);
+
+ return msgtype == IBLND_MSG_GET_REQ ?
+ offsetof(kib_get_msg_t, ibgm_rd.rd_frags[n]) :
+ offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[n]);
+}
+
+
+static inline __u64
+kiblnd_dma_mapping_error(struct ib_device *dev, u64 dma_addr)
+{
+ return ib_dma_mapping_error(dev, dma_addr);
+}
+
+static inline __u64 kiblnd_dma_map_single(struct ib_device *dev,
+ void *msg, size_t size,
+ enum dma_data_direction direction)
+{
+ return ib_dma_map_single(dev, msg, size, direction);
+}
+
+static inline void kiblnd_dma_unmap_single(struct ib_device *dev,
+ __u64 addr, size_t size,
+ enum dma_data_direction direction)
+{
+ ib_dma_unmap_single(dev, addr, size, direction);
+}
+
+#define KIBLND_UNMAP_ADDR_SET(p, m, a) do {} while (0)
+#define KIBLND_UNMAP_ADDR(p, m, a) (a)
+
+static inline int kiblnd_dma_map_sg(struct ib_device *dev,
+ struct scatterlist *sg, int nents,
+ enum dma_data_direction direction)
+{
+ return ib_dma_map_sg(dev, sg, nents, direction);
+}
+
+static inline void kiblnd_dma_unmap_sg(struct ib_device *dev,
+ struct scatterlist *sg, int nents,
+ enum dma_data_direction direction)
+{
+ ib_dma_unmap_sg(dev, sg, nents, direction);
+}
+
+static inline __u64 kiblnd_sg_dma_address(struct ib_device *dev,
+ struct scatterlist *sg)
+{
+ return ib_sg_dma_address(dev, sg);
+}
+
+static inline unsigned int kiblnd_sg_dma_len(struct ib_device *dev,
+ struct scatterlist *sg)
+{
+ return ib_sg_dma_len(dev, sg);
+}
+
+/* XXX We use KIBLND_CONN_PARAM(e) as writable buffer, it's not strictly
+ * right because OFED1.2 defines it as const, to use it we have to add
+ * (void *) cast to overcome "const" */
+
+#define KIBLND_CONN_PARAM(e) ((e)->param.conn.private_data)
+#define KIBLND_CONN_PARAM_LEN(e) ((e)->param.conn.private_data_len)
+
+
+struct ib_mr *kiblnd_find_rd_dma_mr(kib_hca_dev_t *hdev,
+ kib_rdma_desc_t *rd);
+struct ib_mr *kiblnd_find_dma_mr(kib_hca_dev_t *hdev,
+ __u64 addr, __u64 size);
+void kiblnd_map_rx_descs(kib_conn_t *conn);
+void kiblnd_unmap_rx_descs(kib_conn_t *conn);
+int kiblnd_map_tx(lnet_ni_t *ni, kib_tx_t *tx,
+ kib_rdma_desc_t *rd, int nfrags);
+void kiblnd_unmap_tx(lnet_ni_t *ni, kib_tx_t *tx);
+void kiblnd_pool_free_node(kib_pool_t *pool, struct list_head *node);
+struct list_head *kiblnd_pool_alloc_node(kib_poolset_t *ps);
+
+int kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, __u64 *pages,
+ int npages, __u64 iov, kib_fmr_t *fmr);
+void kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status);
+
+int kiblnd_pmr_pool_map(kib_pmr_poolset_t *pps, kib_hca_dev_t *hdev,
+ kib_rdma_desc_t *rd, __u64 *iova, kib_phys_mr_t **pp_pmr);
+void kiblnd_pmr_pool_unmap(kib_phys_mr_t *pmr);
+
+int kiblnd_startup (lnet_ni_t *ni);
+void kiblnd_shutdown (lnet_ni_t *ni);
+int kiblnd_ctl (lnet_ni_t *ni, unsigned int cmd, void *arg);
+void kiblnd_query (struct lnet_ni *ni, lnet_nid_t nid, unsigned long *when);
+
+int kiblnd_tunables_init(void);
+void kiblnd_tunables_fini(void);
+
+int kiblnd_connd (void *arg);
+int kiblnd_scheduler(void *arg);
+int kiblnd_thread_start(int (*fn)(void *arg), void *arg, char *name);
+int kiblnd_failover_thread (void *arg);
+
+int kiblnd_alloc_pages(kib_pages_t **pp, int cpt, int npages);
+void kiblnd_free_pages (kib_pages_t *p);
+
+int kiblnd_cm_callback(struct rdma_cm_id *cmid,
+ struct rdma_cm_event *event);
+int kiblnd_translate_mtu(int value);
+
+int kiblnd_dev_failover(kib_dev_t *dev);
+int kiblnd_create_peer (lnet_ni_t *ni, kib_peer_t **peerp, lnet_nid_t nid);
+void kiblnd_destroy_peer (kib_peer_t *peer);
+void kiblnd_destroy_dev (kib_dev_t *dev);
+void kiblnd_unlink_peer_locked (kib_peer_t *peer);
+void kiblnd_peer_alive (kib_peer_t *peer);
+kib_peer_t *kiblnd_find_peer_locked (lnet_nid_t nid);
+void kiblnd_peer_connect_failed (kib_peer_t *peer, int active, int error);
+int kiblnd_close_stale_conns_locked (kib_peer_t *peer,
+ int version, __u64 incarnation);
+int kiblnd_close_peer_conns_locked (kib_peer_t *peer, int why);
+
+void kiblnd_connreq_done(kib_conn_t *conn, int status);
+kib_conn_t *kiblnd_create_conn (kib_peer_t *peer, struct rdma_cm_id *cmid,
+ int state, int version);
+void kiblnd_destroy_conn (kib_conn_t *conn);
+void kiblnd_close_conn (kib_conn_t *conn, int error);
+void kiblnd_close_conn_locked (kib_conn_t *conn, int error);
+
+int kiblnd_init_rdma (kib_conn_t *conn, kib_tx_t *tx, int type,
+ int nob, kib_rdma_desc_t *dstrd, __u64 dstcookie);
+
+void kiblnd_launch_tx (lnet_ni_t *ni, kib_tx_t *tx, lnet_nid_t nid);
+void kiblnd_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn);
+void kiblnd_queue_tx (kib_tx_t *tx, kib_conn_t *conn);
+void kiblnd_init_tx_msg (lnet_ni_t *ni, kib_tx_t *tx, int type, int body_nob);
+void kiblnd_txlist_done (lnet_ni_t *ni, struct list_head *txlist,
+ int status);
+void kiblnd_check_sends (kib_conn_t *conn);
+
+void kiblnd_qp_event(struct ib_event *event, void *arg);
+void kiblnd_cq_event(struct ib_event *event, void *arg);
+void kiblnd_cq_completion(struct ib_cq *cq, void *arg);
+
+void kiblnd_pack_msg (lnet_ni_t *ni, kib_msg_t *msg, int version,
+ int credits, lnet_nid_t dstnid, __u64 dststamp);
+int kiblnd_unpack_msg(kib_msg_t *msg, int nob);
+int kiblnd_post_rx (kib_rx_t *rx, int credit);
+
+int kiblnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg);
+int kiblnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
+ unsigned int niov, struct kvec *iov, lnet_kiov_t *kiov,
+ unsigned int offset, unsigned int mlen, unsigned int rlen);
diff --git a/kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c b/kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c
new file mode 100644
index 000000000..dbf374983
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c
@@ -0,0 +1,3519 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/klnds/o2iblnd/o2iblnd_cb.c
+ *
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ */
+
+#include "o2iblnd.h"
+
+static void
+kiblnd_tx_done(lnet_ni_t *ni, kib_tx_t *tx)
+{
+ lnet_msg_t *lntmsg[2];
+ kib_net_t *net = ni->ni_data;
+ int rc;
+ int i;
+
+ LASSERT(net != NULL);
+ LASSERT(!in_interrupt());
+ LASSERT(!tx->tx_queued); /* mustn't be queued for sending */
+ LASSERT(tx->tx_sending == 0); /* mustn't be awaiting sent callback */
+ LASSERT(!tx->tx_waiting); /* mustn't be awaiting peer response */
+ LASSERT(tx->tx_pool != NULL);
+
+ kiblnd_unmap_tx(ni, tx);
+
+ /* tx may have up to 2 lnet msgs to finalise */
+ lntmsg[0] = tx->tx_lntmsg[0]; tx->tx_lntmsg[0] = NULL;
+ lntmsg[1] = tx->tx_lntmsg[1]; tx->tx_lntmsg[1] = NULL;
+ rc = tx->tx_status;
+
+ if (tx->tx_conn != NULL) {
+ LASSERT(ni == tx->tx_conn->ibc_peer->ibp_ni);
+
+ kiblnd_conn_decref(tx->tx_conn);
+ tx->tx_conn = NULL;
+ }
+
+ tx->tx_nwrq = 0;
+ tx->tx_status = 0;
+
+ kiblnd_pool_free_node(&tx->tx_pool->tpo_pool, &tx->tx_list);
+
+ /* delay finalize until my descs have been freed */
+ for (i = 0; i < 2; i++) {
+ if (lntmsg[i] == NULL)
+ continue;
+
+ lnet_finalize(ni, lntmsg[i], rc);
+ }
+}
+
+void
+kiblnd_txlist_done(lnet_ni_t *ni, struct list_head *txlist, int status)
+{
+ kib_tx_t *tx;
+
+ while (!list_empty(txlist)) {
+ tx = list_entry(txlist->next, kib_tx_t, tx_list);
+
+ list_del(&tx->tx_list);
+ /* complete now */
+ tx->tx_waiting = 0;
+ tx->tx_status = status;
+ kiblnd_tx_done(ni, tx);
+ }
+}
+
+static kib_tx_t *
+kiblnd_get_idle_tx(lnet_ni_t *ni, lnet_nid_t target)
+{
+ kib_net_t *net = (kib_net_t *)ni->ni_data;
+ struct list_head *node;
+ kib_tx_t *tx;
+ kib_tx_poolset_t *tps;
+
+ tps = net->ibn_tx_ps[lnet_cpt_of_nid(target)];
+ node = kiblnd_pool_alloc_node(&tps->tps_poolset);
+ if (node == NULL)
+ return NULL;
+ tx = container_of(node, kib_tx_t, tx_list);
+
+ LASSERT(tx->tx_nwrq == 0);
+ LASSERT(!tx->tx_queued);
+ LASSERT(tx->tx_sending == 0);
+ LASSERT(!tx->tx_waiting);
+ LASSERT(tx->tx_status == 0);
+ LASSERT(tx->tx_conn == NULL);
+ LASSERT(tx->tx_lntmsg[0] == NULL);
+ LASSERT(tx->tx_lntmsg[1] == NULL);
+ LASSERT(tx->tx_u.pmr == NULL);
+ LASSERT(tx->tx_nfrags == 0);
+
+ return tx;
+}
+
+static void
+kiblnd_drop_rx(kib_rx_t *rx)
+{
+ kib_conn_t *conn = rx->rx_conn;
+ struct kib_sched_info *sched = conn->ibc_sched;
+ unsigned long flags;
+
+ spin_lock_irqsave(&sched->ibs_lock, flags);
+ LASSERT(conn->ibc_nrx > 0);
+ conn->ibc_nrx--;
+ spin_unlock_irqrestore(&sched->ibs_lock, flags);
+
+ kiblnd_conn_decref(conn);
+}
+
+int
+kiblnd_post_rx(kib_rx_t *rx, int credit)
+{
+ kib_conn_t *conn = rx->rx_conn;
+ kib_net_t *net = conn->ibc_peer->ibp_ni->ni_data;
+ struct ib_recv_wr *bad_wrq = NULL;
+ struct ib_mr *mr;
+ int rc;
+
+ LASSERT(net != NULL);
+ LASSERT(!in_interrupt());
+ LASSERT(credit == IBLND_POSTRX_NO_CREDIT ||
+ credit == IBLND_POSTRX_PEER_CREDIT ||
+ credit == IBLND_POSTRX_RSRVD_CREDIT);
+
+ mr = kiblnd_find_dma_mr(conn->ibc_hdev, rx->rx_msgaddr, IBLND_MSG_SIZE);
+ LASSERT(mr != NULL);
+
+ rx->rx_sge.lkey = mr->lkey;
+ rx->rx_sge.addr = rx->rx_msgaddr;
+ rx->rx_sge.length = IBLND_MSG_SIZE;
+
+ rx->rx_wrq.next = NULL;
+ rx->rx_wrq.sg_list = &rx->rx_sge;
+ rx->rx_wrq.num_sge = 1;
+ rx->rx_wrq.wr_id = kiblnd_ptr2wreqid(rx, IBLND_WID_RX);
+
+ LASSERT(conn->ibc_state >= IBLND_CONN_INIT);
+ LASSERT(rx->rx_nob >= 0); /* not posted */
+
+ if (conn->ibc_state > IBLND_CONN_ESTABLISHED) {
+ kiblnd_drop_rx(rx); /* No more posts for this rx */
+ return 0;
+ }
+
+ rx->rx_nob = -1; /* flag posted */
+
+ rc = ib_post_recv(conn->ibc_cmid->qp, &rx->rx_wrq, &bad_wrq);
+ if (rc != 0) {
+ CERROR("Can't post rx for %s: %d, bad_wrq: %p\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid), rc, bad_wrq);
+ rx->rx_nob = 0;
+ }
+
+ if (conn->ibc_state < IBLND_CONN_ESTABLISHED) /* Initial post */
+ return rc;
+
+ if (rc != 0) {
+ kiblnd_close_conn(conn, rc);
+ kiblnd_drop_rx(rx); /* No more posts for this rx */
+ return rc;
+ }
+
+ if (credit == IBLND_POSTRX_NO_CREDIT)
+ return 0;
+
+ spin_lock(&conn->ibc_lock);
+ if (credit == IBLND_POSTRX_PEER_CREDIT)
+ conn->ibc_outstanding_credits++;
+ else
+ conn->ibc_reserved_credits++;
+ spin_unlock(&conn->ibc_lock);
+
+ kiblnd_check_sends(conn);
+ return 0;
+}
+
+static kib_tx_t *
+kiblnd_find_waiting_tx_locked(kib_conn_t *conn, int txtype, __u64 cookie)
+{
+ struct list_head *tmp;
+
+ list_for_each(tmp, &conn->ibc_active_txs) {
+ kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list);
+
+ LASSERT(!tx->tx_queued);
+ LASSERT(tx->tx_sending != 0 || tx->tx_waiting);
+
+ if (tx->tx_cookie != cookie)
+ continue;
+
+ if (tx->tx_waiting &&
+ tx->tx_msg->ibm_type == txtype)
+ return tx;
+
+ CWARN("Bad completion: %swaiting, type %x (wanted %x)\n",
+ tx->tx_waiting ? "" : "NOT ",
+ tx->tx_msg->ibm_type, txtype);
+ }
+ return NULL;
+}
+
+static void
+kiblnd_handle_completion(kib_conn_t *conn, int txtype, int status, __u64 cookie)
+{
+ kib_tx_t *tx;
+ lnet_ni_t *ni = conn->ibc_peer->ibp_ni;
+ int idle;
+
+ spin_lock(&conn->ibc_lock);
+
+ tx = kiblnd_find_waiting_tx_locked(conn, txtype, cookie);
+ if (tx == NULL) {
+ spin_unlock(&conn->ibc_lock);
+
+ CWARN("Unmatched completion type %x cookie %#llx from %s\n",
+ txtype, cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ kiblnd_close_conn(conn, -EPROTO);
+ return;
+ }
+
+ if (tx->tx_status == 0) { /* success so far */
+ if (status < 0) { /* failed? */
+ tx->tx_status = status;
+ } else if (txtype == IBLND_MSG_GET_REQ) {
+ lnet_set_reply_msg_len(ni, tx->tx_lntmsg[1], status);
+ }
+ }
+
+ tx->tx_waiting = 0;
+
+ idle = !tx->tx_queued && (tx->tx_sending == 0);
+ if (idle)
+ list_del(&tx->tx_list);
+
+ spin_unlock(&conn->ibc_lock);
+
+ if (idle)
+ kiblnd_tx_done(ni, tx);
+}
+
+static void
+kiblnd_send_completion(kib_conn_t *conn, int type, int status, __u64 cookie)
+{
+ lnet_ni_t *ni = conn->ibc_peer->ibp_ni;
+ kib_tx_t *tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid);
+
+ if (tx == NULL) {
+ CERROR("Can't get tx for completion %x for %s\n",
+ type, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ return;
+ }
+
+ tx->tx_msg->ibm_u.completion.ibcm_status = status;
+ tx->tx_msg->ibm_u.completion.ibcm_cookie = cookie;
+ kiblnd_init_tx_msg(ni, tx, type, sizeof(kib_completion_msg_t));
+
+ kiblnd_queue_tx(tx, conn);
+}
+
+static void
+kiblnd_handle_rx(kib_rx_t *rx)
+{
+ kib_msg_t *msg = rx->rx_msg;
+ kib_conn_t *conn = rx->rx_conn;
+ lnet_ni_t *ni = conn->ibc_peer->ibp_ni;
+ int credits = msg->ibm_credits;
+ kib_tx_t *tx;
+ int rc = 0;
+ int rc2;
+ int post_credit;
+
+ LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED);
+
+ CDEBUG(D_NET, "Received %x[%d] from %s\n",
+ msg->ibm_type, credits,
+ libcfs_nid2str(conn->ibc_peer->ibp_nid));
+
+ if (credits != 0) {
+ /* Have I received credits that will let me send? */
+ spin_lock(&conn->ibc_lock);
+
+ if (conn->ibc_credits + credits >
+ IBLND_MSG_QUEUE_SIZE(conn->ibc_version)) {
+ rc2 = conn->ibc_credits;
+ spin_unlock(&conn->ibc_lock);
+
+ CERROR("Bad credits from %s: %d + %d > %d\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid),
+ rc2, credits,
+ IBLND_MSG_QUEUE_SIZE(conn->ibc_version));
+
+ kiblnd_close_conn(conn, -EPROTO);
+ kiblnd_post_rx(rx, IBLND_POSTRX_NO_CREDIT);
+ return;
+ }
+
+ conn->ibc_credits += credits;
+
+ /* This ensures the credit taken by NOOP can be returned */
+ if (msg->ibm_type == IBLND_MSG_NOOP &&
+ !IBLND_OOB_CAPABLE(conn->ibc_version)) /* v1 only */
+ conn->ibc_outstanding_credits++;
+
+ spin_unlock(&conn->ibc_lock);
+ kiblnd_check_sends(conn);
+ }
+
+ switch (msg->ibm_type) {
+ default:
+ CERROR("Bad IBLND message type %x from %s\n",
+ msg->ibm_type, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ post_credit = IBLND_POSTRX_NO_CREDIT;
+ rc = -EPROTO;
+ break;
+
+ case IBLND_MSG_NOOP:
+ if (IBLND_OOB_CAPABLE(conn->ibc_version)) {
+ post_credit = IBLND_POSTRX_NO_CREDIT;
+ break;
+ }
+
+ if (credits != 0) /* credit already posted */
+ post_credit = IBLND_POSTRX_NO_CREDIT;
+ else /* a keepalive NOOP */
+ post_credit = IBLND_POSTRX_PEER_CREDIT;
+ break;
+
+ case IBLND_MSG_IMMEDIATE:
+ post_credit = IBLND_POSTRX_DONT_POST;
+ rc = lnet_parse(ni, &msg->ibm_u.immediate.ibim_hdr,
+ msg->ibm_srcnid, rx, 0);
+ if (rc < 0) /* repost on error */
+ post_credit = IBLND_POSTRX_PEER_CREDIT;
+ break;
+
+ case IBLND_MSG_PUT_REQ:
+ post_credit = IBLND_POSTRX_DONT_POST;
+ rc = lnet_parse(ni, &msg->ibm_u.putreq.ibprm_hdr,
+ msg->ibm_srcnid, rx, 1);
+ if (rc < 0) /* repost on error */
+ post_credit = IBLND_POSTRX_PEER_CREDIT;
+ break;
+
+ case IBLND_MSG_PUT_NAK:
+ CWARN("PUT_NACK from %s\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ post_credit = IBLND_POSTRX_RSRVD_CREDIT;
+ kiblnd_handle_completion(conn, IBLND_MSG_PUT_REQ,
+ msg->ibm_u.completion.ibcm_status,
+ msg->ibm_u.completion.ibcm_cookie);
+ break;
+
+ case IBLND_MSG_PUT_ACK:
+ post_credit = IBLND_POSTRX_RSRVD_CREDIT;
+
+ spin_lock(&conn->ibc_lock);
+ tx = kiblnd_find_waiting_tx_locked(conn, IBLND_MSG_PUT_REQ,
+ msg->ibm_u.putack.ibpam_src_cookie);
+ if (tx != NULL)
+ list_del(&tx->tx_list);
+ spin_unlock(&conn->ibc_lock);
+
+ if (tx == NULL) {
+ CERROR("Unmatched PUT_ACK from %s\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ rc = -EPROTO;
+ break;
+ }
+
+ LASSERT(tx->tx_waiting);
+ /* CAVEAT EMPTOR: I could be racing with tx_complete, but...
+ * (a) I can overwrite tx_msg since my peer has received it!
+ * (b) tx_waiting set tells tx_complete() it's not done. */
+
+ tx->tx_nwrq = 0; /* overwrite PUT_REQ */
+
+ rc2 = kiblnd_init_rdma(conn, tx, IBLND_MSG_PUT_DONE,
+ kiblnd_rd_size(&msg->ibm_u.putack.ibpam_rd),
+ &msg->ibm_u.putack.ibpam_rd,
+ msg->ibm_u.putack.ibpam_dst_cookie);
+ if (rc2 < 0)
+ CERROR("Can't setup rdma for PUT to %s: %d\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid), rc2);
+
+ spin_lock(&conn->ibc_lock);
+ tx->tx_waiting = 0; /* clear waiting and queue atomically */
+ kiblnd_queue_tx_locked(tx, conn);
+ spin_unlock(&conn->ibc_lock);
+ break;
+
+ case IBLND_MSG_PUT_DONE:
+ post_credit = IBLND_POSTRX_PEER_CREDIT;
+ kiblnd_handle_completion(conn, IBLND_MSG_PUT_ACK,
+ msg->ibm_u.completion.ibcm_status,
+ msg->ibm_u.completion.ibcm_cookie);
+ break;
+
+ case IBLND_MSG_GET_REQ:
+ post_credit = IBLND_POSTRX_DONT_POST;
+ rc = lnet_parse(ni, &msg->ibm_u.get.ibgm_hdr,
+ msg->ibm_srcnid, rx, 1);
+ if (rc < 0) /* repost on error */
+ post_credit = IBLND_POSTRX_PEER_CREDIT;
+ break;
+
+ case IBLND_MSG_GET_DONE:
+ post_credit = IBLND_POSTRX_RSRVD_CREDIT;
+ kiblnd_handle_completion(conn, IBLND_MSG_GET_REQ,
+ msg->ibm_u.completion.ibcm_status,
+ msg->ibm_u.completion.ibcm_cookie);
+ break;
+ }
+
+ if (rc < 0) /* protocol error */
+ kiblnd_close_conn(conn, rc);
+
+ if (post_credit != IBLND_POSTRX_DONT_POST)
+ kiblnd_post_rx(rx, post_credit);
+}
+
+static void
+kiblnd_rx_complete(kib_rx_t *rx, int status, int nob)
+{
+ kib_msg_t *msg = rx->rx_msg;
+ kib_conn_t *conn = rx->rx_conn;
+ lnet_ni_t *ni = conn->ibc_peer->ibp_ni;
+ kib_net_t *net = ni->ni_data;
+ int rc;
+ int err = -EIO;
+
+ LASSERT(net != NULL);
+ LASSERT(rx->rx_nob < 0); /* was posted */
+ rx->rx_nob = 0; /* isn't now */
+
+ if (conn->ibc_state > IBLND_CONN_ESTABLISHED)
+ goto ignore;
+
+ if (status != IB_WC_SUCCESS) {
+ CNETERR("Rx from %s failed: %d\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid), status);
+ goto failed;
+ }
+
+ LASSERT(nob >= 0);
+ rx->rx_nob = nob;
+
+ rc = kiblnd_unpack_msg(msg, rx->rx_nob);
+ if (rc != 0) {
+ CERROR("Error %d unpacking rx from %s\n",
+ rc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ goto failed;
+ }
+
+ if (msg->ibm_srcnid != conn->ibc_peer->ibp_nid ||
+ msg->ibm_dstnid != ni->ni_nid ||
+ msg->ibm_srcstamp != conn->ibc_incarnation ||
+ msg->ibm_dststamp != net->ibn_incarnation) {
+ CERROR("Stale rx from %s\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ err = -ESTALE;
+ goto failed;
+ }
+
+ /* set time last known alive */
+ kiblnd_peer_alive(conn->ibc_peer);
+
+ /* racing with connection establishment/teardown! */
+
+ if (conn->ibc_state < IBLND_CONN_ESTABLISHED) {
+ rwlock_t *g_lock = &kiblnd_data.kib_global_lock;
+ unsigned long flags;
+
+ write_lock_irqsave(g_lock, flags);
+ /* must check holding global lock to eliminate race */
+ if (conn->ibc_state < IBLND_CONN_ESTABLISHED) {
+ list_add_tail(&rx->rx_list, &conn->ibc_early_rxs);
+ write_unlock_irqrestore(g_lock, flags);
+ return;
+ }
+ write_unlock_irqrestore(g_lock, flags);
+ }
+ kiblnd_handle_rx(rx);
+ return;
+
+ failed:
+ CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
+ kiblnd_close_conn(conn, err);
+ ignore:
+ kiblnd_drop_rx(rx); /* Don't re-post rx. */
+}
+
+static struct page *
+kiblnd_kvaddr_to_page(unsigned long vaddr)
+{
+ struct page *page;
+
+ if (is_vmalloc_addr((void *)vaddr)) {
+ page = vmalloc_to_page((void *)vaddr);
+ LASSERT(page != NULL);
+ return page;
+ }
+#ifdef CONFIG_HIGHMEM
+ if (vaddr >= PKMAP_BASE &&
+ vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) {
+ /* No highmem pages only used for bulk (kiov) I/O */
+ CERROR("find page for address in highmem\n");
+ LBUG();
+ }
+#endif
+ page = virt_to_page(vaddr);
+ LASSERT(page != NULL);
+ return page;
+}
+
+static int
+kiblnd_fmr_map_tx(kib_net_t *net, kib_tx_t *tx, kib_rdma_desc_t *rd, int nob)
+{
+ kib_hca_dev_t *hdev;
+ __u64 *pages = tx->tx_pages;
+ kib_fmr_poolset_t *fps;
+ int npages;
+ int size;
+ int cpt;
+ int rc;
+ int i;
+
+ LASSERT(tx->tx_pool != NULL);
+ LASSERT(tx->tx_pool->tpo_pool.po_owner != NULL);
+
+ hdev = tx->tx_pool->tpo_hdev;
+
+ for (i = 0, npages = 0; i < rd->rd_nfrags; i++) {
+ for (size = 0; size < rd->rd_frags[i].rf_nob;
+ size += hdev->ibh_page_size) {
+ pages[npages++] = (rd->rd_frags[i].rf_addr &
+ hdev->ibh_page_mask) + size;
+ }
+ }
+
+ cpt = tx->tx_pool->tpo_pool.po_owner->ps_cpt;
+
+ fps = net->ibn_fmr_ps[cpt];
+ rc = kiblnd_fmr_pool_map(fps, pages, npages, 0, &tx->tx_u.fmr);
+ if (rc != 0) {
+ CERROR("Can't map %d pages: %d\n", npages, rc);
+ return rc;
+ }
+
+ /* If rd is not tx_rd, it's going to get sent to a peer, who will need
+ * the rkey */
+ rd->rd_key = (rd != tx->tx_rd) ? tx->tx_u.fmr.fmr_pfmr->fmr->rkey :
+ tx->tx_u.fmr.fmr_pfmr->fmr->lkey;
+ rd->rd_frags[0].rf_addr &= ~hdev->ibh_page_mask;
+ rd->rd_frags[0].rf_nob = nob;
+ rd->rd_nfrags = 1;
+
+ return 0;
+}
+
+static int
+kiblnd_pmr_map_tx(kib_net_t *net, kib_tx_t *tx, kib_rdma_desc_t *rd, int nob)
+{
+ kib_hca_dev_t *hdev;
+ kib_pmr_poolset_t *pps;
+ __u64 iova;
+ int cpt;
+ int rc;
+
+ LASSERT(tx->tx_pool != NULL);
+ LASSERT(tx->tx_pool->tpo_pool.po_owner != NULL);
+
+ hdev = tx->tx_pool->tpo_hdev;
+
+ iova = rd->rd_frags[0].rf_addr & ~hdev->ibh_page_mask;
+
+ cpt = tx->tx_pool->tpo_pool.po_owner->ps_cpt;
+
+ pps = net->ibn_pmr_ps[cpt];
+ rc = kiblnd_pmr_pool_map(pps, hdev, rd, &iova, &tx->tx_u.pmr);
+ if (rc != 0) {
+ CERROR("Failed to create MR by phybuf: %d\n", rc);
+ return rc;
+ }
+
+ /* If rd is not tx_rd, it's going to get sent to a peer, who will need
+ * the rkey */
+ rd->rd_key = (rd != tx->tx_rd) ? tx->tx_u.pmr->pmr_mr->rkey :
+ tx->tx_u.pmr->pmr_mr->lkey;
+ rd->rd_nfrags = 1;
+ rd->rd_frags[0].rf_addr = iova;
+ rd->rd_frags[0].rf_nob = nob;
+
+ return 0;
+}
+
+void
+kiblnd_unmap_tx(lnet_ni_t *ni, kib_tx_t *tx)
+{
+ kib_net_t *net = ni->ni_data;
+
+ LASSERT(net != NULL);
+
+ if (net->ibn_fmr_ps != NULL && tx->tx_u.fmr.fmr_pfmr != NULL) {
+ kiblnd_fmr_pool_unmap(&tx->tx_u.fmr, tx->tx_status);
+ tx->tx_u.fmr.fmr_pfmr = NULL;
+
+ } else if (net->ibn_pmr_ps != NULL && tx->tx_u.pmr != NULL) {
+ kiblnd_pmr_pool_unmap(tx->tx_u.pmr);
+ tx->tx_u.pmr = NULL;
+ }
+
+ if (tx->tx_nfrags != 0) {
+ kiblnd_dma_unmap_sg(tx->tx_pool->tpo_hdev->ibh_ibdev,
+ tx->tx_frags, tx->tx_nfrags, tx->tx_dmadir);
+ tx->tx_nfrags = 0;
+ }
+}
+
+int
+kiblnd_map_tx(lnet_ni_t *ni, kib_tx_t *tx,
+ kib_rdma_desc_t *rd, int nfrags)
+{
+ kib_hca_dev_t *hdev = tx->tx_pool->tpo_hdev;
+ kib_net_t *net = ni->ni_data;
+ struct ib_mr *mr = NULL;
+ __u32 nob;
+ int i;
+
+ /* If rd is not tx_rd, it's going to get sent to a peer and I'm the
+ * RDMA sink */
+ tx->tx_dmadir = (rd != tx->tx_rd) ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
+ tx->tx_nfrags = nfrags;
+
+ rd->rd_nfrags =
+ kiblnd_dma_map_sg(hdev->ibh_ibdev,
+ tx->tx_frags, tx->tx_nfrags, tx->tx_dmadir);
+
+ for (i = 0, nob = 0; i < rd->rd_nfrags; i++) {
+ rd->rd_frags[i].rf_nob = kiblnd_sg_dma_len(
+ hdev->ibh_ibdev, &tx->tx_frags[i]);
+ rd->rd_frags[i].rf_addr = kiblnd_sg_dma_address(
+ hdev->ibh_ibdev, &tx->tx_frags[i]);
+ nob += rd->rd_frags[i].rf_nob;
+ }
+
+ /* looking for pre-mapping MR */
+ mr = kiblnd_find_rd_dma_mr(hdev, rd);
+ if (mr != NULL) {
+ /* found pre-mapping MR */
+ rd->rd_key = (rd != tx->tx_rd) ? mr->rkey : mr->lkey;
+ return 0;
+ }
+
+ if (net->ibn_fmr_ps != NULL)
+ return kiblnd_fmr_map_tx(net, tx, rd, nob);
+ else if (net->ibn_pmr_ps != NULL)
+ return kiblnd_pmr_map_tx(net, tx, rd, nob);
+
+ return -EINVAL;
+}
+
+
+static int
+kiblnd_setup_rd_iov(lnet_ni_t *ni, kib_tx_t *tx, kib_rdma_desc_t *rd,
+ unsigned int niov, struct kvec *iov, int offset, int nob)
+{
+ kib_net_t *net = ni->ni_data;
+ struct page *page;
+ struct scatterlist *sg;
+ unsigned long vaddr;
+ int fragnob;
+ int page_offset;
+
+ LASSERT(nob > 0);
+ LASSERT(niov > 0);
+ LASSERT(net != NULL);
+
+ while (offset >= iov->iov_len) {
+ offset -= iov->iov_len;
+ niov--;
+ iov++;
+ LASSERT(niov > 0);
+ }
+
+ sg = tx->tx_frags;
+ do {
+ LASSERT(niov > 0);
+
+ vaddr = ((unsigned long)iov->iov_base) + offset;
+ page_offset = vaddr & (PAGE_SIZE - 1);
+ page = kiblnd_kvaddr_to_page(vaddr);
+ if (page == NULL) {
+ CERROR("Can't find page\n");
+ return -EFAULT;
+ }
+
+ fragnob = min((int)(iov->iov_len - offset), nob);
+ fragnob = min(fragnob, (int)PAGE_SIZE - page_offset);
+
+ sg_set_page(sg, page, fragnob, page_offset);
+ sg++;
+
+ if (offset + fragnob < iov->iov_len) {
+ offset += fragnob;
+ } else {
+ offset = 0;
+ iov++;
+ niov--;
+ }
+ nob -= fragnob;
+ } while (nob > 0);
+
+ return kiblnd_map_tx(ni, tx, rd, sg - tx->tx_frags);
+}
+
+static int
+kiblnd_setup_rd_kiov(lnet_ni_t *ni, kib_tx_t *tx, kib_rdma_desc_t *rd,
+ int nkiov, lnet_kiov_t *kiov, int offset, int nob)
+{
+ kib_net_t *net = ni->ni_data;
+ struct scatterlist *sg;
+ int fragnob;
+
+ CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);
+
+ LASSERT(nob > 0);
+ LASSERT(nkiov > 0);
+ LASSERT(net != NULL);
+
+ while (offset >= kiov->kiov_len) {
+ offset -= kiov->kiov_len;
+ nkiov--;
+ kiov++;
+ LASSERT(nkiov > 0);
+ }
+
+ sg = tx->tx_frags;
+ do {
+ LASSERT(nkiov > 0);
+
+ fragnob = min((int)(kiov->kiov_len - offset), nob);
+
+ sg_set_page(sg, kiov->kiov_page, fragnob,
+ kiov->kiov_offset + offset);
+ sg++;
+
+ offset = 0;
+ kiov++;
+ nkiov--;
+ nob -= fragnob;
+ } while (nob > 0);
+
+ return kiblnd_map_tx(ni, tx, rd, sg - tx->tx_frags);
+}
+
+static int
+kiblnd_post_tx_locked(kib_conn_t *conn, kib_tx_t *tx, int credit)
+ __releases(conn->ibc_lock)
+ __acquires(conn->ibc_lock)
+{
+ kib_msg_t *msg = tx->tx_msg;
+ kib_peer_t *peer = conn->ibc_peer;
+ int ver = conn->ibc_version;
+ int rc;
+ int done;
+ struct ib_send_wr *bad_wrq;
+
+ LASSERT(tx->tx_queued);
+ /* We rely on this for QP sizing */
+ LASSERT(tx->tx_nwrq > 0);
+ LASSERT(tx->tx_nwrq <= 1 + IBLND_RDMA_FRAGS(ver));
+
+ LASSERT(credit == 0 || credit == 1);
+ LASSERT(conn->ibc_outstanding_credits >= 0);
+ LASSERT(conn->ibc_outstanding_credits <= IBLND_MSG_QUEUE_SIZE(ver));
+ LASSERT(conn->ibc_credits >= 0);
+ LASSERT(conn->ibc_credits <= IBLND_MSG_QUEUE_SIZE(ver));
+
+ if (conn->ibc_nsends_posted == IBLND_CONCURRENT_SENDS(ver)) {
+ /* tx completions outstanding... */
+ CDEBUG(D_NET, "%s: posted enough\n",
+ libcfs_nid2str(peer->ibp_nid));
+ return -EAGAIN;
+ }
+
+ if (credit != 0 && conn->ibc_credits == 0) { /* no credits */
+ CDEBUG(D_NET, "%s: no credits\n",
+ libcfs_nid2str(peer->ibp_nid));
+ return -EAGAIN;
+ }
+
+ if (credit != 0 && !IBLND_OOB_CAPABLE(ver) &&
+ conn->ibc_credits == 1 && /* last credit reserved */
+ msg->ibm_type != IBLND_MSG_NOOP) { /* for NOOP */
+ CDEBUG(D_NET, "%s: not using last credit\n",
+ libcfs_nid2str(peer->ibp_nid));
+ return -EAGAIN;
+ }
+
+ /* NB don't drop ibc_lock before bumping tx_sending */
+ list_del(&tx->tx_list);
+ tx->tx_queued = 0;
+
+ if (msg->ibm_type == IBLND_MSG_NOOP &&
+ (!kiblnd_need_noop(conn) || /* redundant NOOP */
+ (IBLND_OOB_CAPABLE(ver) && /* posted enough NOOP */
+ conn->ibc_noops_posted == IBLND_OOB_MSGS(ver)))) {
+ /* OK to drop when posted enough NOOPs, since
+ * kiblnd_check_sends will queue NOOP again when
+ * posted NOOPs complete */
+ spin_unlock(&conn->ibc_lock);
+ kiblnd_tx_done(peer->ibp_ni, tx);
+ spin_lock(&conn->ibc_lock);
+ CDEBUG(D_NET, "%s(%d): redundant or enough NOOP\n",
+ libcfs_nid2str(peer->ibp_nid),
+ conn->ibc_noops_posted);
+ return 0;
+ }
+
+ kiblnd_pack_msg(peer->ibp_ni, msg, ver, conn->ibc_outstanding_credits,
+ peer->ibp_nid, conn->ibc_incarnation);
+
+ conn->ibc_credits -= credit;
+ conn->ibc_outstanding_credits = 0;
+ conn->ibc_nsends_posted++;
+ if (msg->ibm_type == IBLND_MSG_NOOP)
+ conn->ibc_noops_posted++;
+
+ /* CAVEAT EMPTOR! This tx could be the PUT_DONE of an RDMA
+ * PUT. If so, it was first queued here as a PUT_REQ, sent and
+ * stashed on ibc_active_txs, matched by an incoming PUT_ACK,
+ * and then re-queued here. It's (just) possible that
+ * tx_sending is non-zero if we've not done the tx_complete()
+ * from the first send; hence the ++ rather than = below. */
+ tx->tx_sending++;
+ list_add(&tx->tx_list, &conn->ibc_active_txs);
+
+ /* I'm still holding ibc_lock! */
+ if (conn->ibc_state != IBLND_CONN_ESTABLISHED) {
+ rc = -ECONNABORTED;
+ } else if (tx->tx_pool->tpo_pool.po_failed ||
+ conn->ibc_hdev != tx->tx_pool->tpo_hdev) {
+ /* close_conn will launch failover */
+ rc = -ENETDOWN;
+ } else {
+ rc = ib_post_send(conn->ibc_cmid->qp,
+ tx->tx_wrq, &bad_wrq);
+ }
+
+ conn->ibc_last_send = jiffies;
+
+ if (rc == 0)
+ return 0;
+
+ /* NB credits are transferred in the actual
+ * message, which can only be the last work item */
+ conn->ibc_credits += credit;
+ conn->ibc_outstanding_credits += msg->ibm_credits;
+ conn->ibc_nsends_posted--;
+ if (msg->ibm_type == IBLND_MSG_NOOP)
+ conn->ibc_noops_posted--;
+
+ tx->tx_status = rc;
+ tx->tx_waiting = 0;
+ tx->tx_sending--;
+
+ done = (tx->tx_sending == 0);
+ if (done)
+ list_del(&tx->tx_list);
+
+ spin_unlock(&conn->ibc_lock);
+
+ if (conn->ibc_state == IBLND_CONN_ESTABLISHED)
+ CERROR("Error %d posting transmit to %s\n",
+ rc, libcfs_nid2str(peer->ibp_nid));
+ else
+ CDEBUG(D_NET, "Error %d posting transmit to %s\n",
+ rc, libcfs_nid2str(peer->ibp_nid));
+
+ kiblnd_close_conn(conn, rc);
+
+ if (done)
+ kiblnd_tx_done(peer->ibp_ni, tx);
+
+ spin_lock(&conn->ibc_lock);
+
+ return -EIO;
+}
+
+void
+kiblnd_check_sends(kib_conn_t *conn)
+{
+ int ver = conn->ibc_version;
+ lnet_ni_t *ni = conn->ibc_peer->ibp_ni;
+ kib_tx_t *tx;
+
+ /* Don't send anything until after the connection is established */
+ if (conn->ibc_state < IBLND_CONN_ESTABLISHED) {
+ CDEBUG(D_NET, "%s too soon\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ return;
+ }
+
+ spin_lock(&conn->ibc_lock);
+
+ LASSERT(conn->ibc_nsends_posted <= IBLND_CONCURRENT_SENDS(ver));
+ LASSERT(!IBLND_OOB_CAPABLE(ver) ||
+ conn->ibc_noops_posted <= IBLND_OOB_MSGS(ver));
+ LASSERT(conn->ibc_reserved_credits >= 0);
+
+ while (conn->ibc_reserved_credits > 0 &&
+ !list_empty(&conn->ibc_tx_queue_rsrvd)) {
+ tx = list_entry(conn->ibc_tx_queue_rsrvd.next,
+ kib_tx_t, tx_list);
+ list_del(&tx->tx_list);
+ list_add_tail(&tx->tx_list, &conn->ibc_tx_queue);
+ conn->ibc_reserved_credits--;
+ }
+
+ if (kiblnd_need_noop(conn)) {
+ spin_unlock(&conn->ibc_lock);
+
+ tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid);
+ if (tx != NULL)
+ kiblnd_init_tx_msg(ni, tx, IBLND_MSG_NOOP, 0);
+
+ spin_lock(&conn->ibc_lock);
+ if (tx != NULL)
+ kiblnd_queue_tx_locked(tx, conn);
+ }
+
+ kiblnd_conn_addref(conn); /* 1 ref for me.... (see b21911) */
+
+ for (;;) {
+ int credit;
+
+ if (!list_empty(&conn->ibc_tx_queue_nocred)) {
+ credit = 0;
+ tx = list_entry(conn->ibc_tx_queue_nocred.next,
+ kib_tx_t, tx_list);
+ } else if (!list_empty(&conn->ibc_tx_noops)) {
+ LASSERT(!IBLND_OOB_CAPABLE(ver));
+ credit = 1;
+ tx = list_entry(conn->ibc_tx_noops.next,
+ kib_tx_t, tx_list);
+ } else if (!list_empty(&conn->ibc_tx_queue)) {
+ credit = 1;
+ tx = list_entry(conn->ibc_tx_queue.next,
+ kib_tx_t, tx_list);
+ } else
+ break;
+
+ if (kiblnd_post_tx_locked(conn, tx, credit) != 0)
+ break;
+ }
+
+ spin_unlock(&conn->ibc_lock);
+
+ kiblnd_conn_decref(conn); /* ...until here */
+}
+
+static void
+kiblnd_tx_complete(kib_tx_t *tx, int status)
+{
+ int failed = (status != IB_WC_SUCCESS);
+ kib_conn_t *conn = tx->tx_conn;
+ int idle;
+
+ LASSERT(tx->tx_sending > 0);
+
+ if (failed) {
+ if (conn->ibc_state == IBLND_CONN_ESTABLISHED)
+ CNETERR("Tx -> %s cookie %#llx sending %d waiting %d: failed %d\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid),
+ tx->tx_cookie, tx->tx_sending, tx->tx_waiting,
+ status);
+
+ kiblnd_close_conn(conn, -EIO);
+ } else {
+ kiblnd_peer_alive(conn->ibc_peer);
+ }
+
+ spin_lock(&conn->ibc_lock);
+
+ /* I could be racing with rdma completion. Whoever makes 'tx' idle
+ * gets to free it, which also drops its ref on 'conn'. */
+
+ tx->tx_sending--;
+ conn->ibc_nsends_posted--;
+ if (tx->tx_msg->ibm_type == IBLND_MSG_NOOP)
+ conn->ibc_noops_posted--;
+
+ if (failed) {
+ tx->tx_waiting = 0; /* don't wait for peer */
+ tx->tx_status = -EIO;
+ }
+
+ idle = (tx->tx_sending == 0) && /* This is the final callback */
+ !tx->tx_waiting && /* Not waiting for peer */
+ !tx->tx_queued; /* Not re-queued (PUT_DONE) */
+ if (idle)
+ list_del(&tx->tx_list);
+
+ kiblnd_conn_addref(conn); /* 1 ref for me.... */
+
+ spin_unlock(&conn->ibc_lock);
+
+ if (idle)
+ kiblnd_tx_done(conn->ibc_peer->ibp_ni, tx);
+
+ kiblnd_check_sends(conn);
+
+ kiblnd_conn_decref(conn); /* ...until here */
+}
+
+void
+kiblnd_init_tx_msg(lnet_ni_t *ni, kib_tx_t *tx, int type, int body_nob)
+{
+ kib_hca_dev_t *hdev = tx->tx_pool->tpo_hdev;
+ struct ib_sge *sge = &tx->tx_sge[tx->tx_nwrq];
+ struct ib_send_wr *wrq = &tx->tx_wrq[tx->tx_nwrq];
+ int nob = offsetof(kib_msg_t, ibm_u) + body_nob;
+ struct ib_mr *mr;
+
+ LASSERT(tx->tx_nwrq >= 0);
+ LASSERT(tx->tx_nwrq < IBLND_MAX_RDMA_FRAGS + 1);
+ LASSERT(nob <= IBLND_MSG_SIZE);
+
+ kiblnd_init_msg(tx->tx_msg, type, body_nob);
+
+ mr = kiblnd_find_dma_mr(hdev, tx->tx_msgaddr, nob);
+ LASSERT(mr != NULL);
+
+ sge->lkey = mr->lkey;
+ sge->addr = tx->tx_msgaddr;
+ sge->length = nob;
+
+ memset(wrq, 0, sizeof(*wrq));
+
+ wrq->next = NULL;
+ wrq->wr_id = kiblnd_ptr2wreqid(tx, IBLND_WID_TX);
+ wrq->sg_list = sge;
+ wrq->num_sge = 1;
+ wrq->opcode = IB_WR_SEND;
+ wrq->send_flags = IB_SEND_SIGNALED;
+
+ tx->tx_nwrq++;
+}
+
+int
+kiblnd_init_rdma(kib_conn_t *conn, kib_tx_t *tx, int type,
+ int resid, kib_rdma_desc_t *dstrd, __u64 dstcookie)
+{
+ kib_msg_t *ibmsg = tx->tx_msg;
+ kib_rdma_desc_t *srcrd = tx->tx_rd;
+ struct ib_sge *sge = &tx->tx_sge[0];
+ struct ib_send_wr *wrq = &tx->tx_wrq[0];
+ int rc = resid;
+ int srcidx;
+ int dstidx;
+ int wrknob;
+
+ LASSERT(!in_interrupt());
+ LASSERT(tx->tx_nwrq == 0);
+ LASSERT(type == IBLND_MSG_GET_DONE ||
+ type == IBLND_MSG_PUT_DONE);
+
+ srcidx = dstidx = 0;
+
+ while (resid > 0) {
+ if (srcidx >= srcrd->rd_nfrags) {
+ CERROR("Src buffer exhausted: %d frags\n", srcidx);
+ rc = -EPROTO;
+ break;
+ }
+
+ if (dstidx == dstrd->rd_nfrags) {
+ CERROR("Dst buffer exhausted: %d frags\n", dstidx);
+ rc = -EPROTO;
+ break;
+ }
+
+ if (tx->tx_nwrq == IBLND_RDMA_FRAGS(conn->ibc_version)) {
+ CERROR("RDMA too fragmented for %s (%d): %d/%d src %d/%d dst frags\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid),
+ IBLND_RDMA_FRAGS(conn->ibc_version),
+ srcidx, srcrd->rd_nfrags,
+ dstidx, dstrd->rd_nfrags);
+ rc = -EMSGSIZE;
+ break;
+ }
+
+ wrknob = min(min(kiblnd_rd_frag_size(srcrd, srcidx),
+ kiblnd_rd_frag_size(dstrd, dstidx)),
+ (__u32) resid);
+
+ sge = &tx->tx_sge[tx->tx_nwrq];
+ sge->addr = kiblnd_rd_frag_addr(srcrd, srcidx);
+ sge->lkey = kiblnd_rd_frag_key(srcrd, srcidx);
+ sge->length = wrknob;
+
+ wrq = &tx->tx_wrq[tx->tx_nwrq];
+
+ wrq->next = wrq + 1;
+ wrq->wr_id = kiblnd_ptr2wreqid(tx, IBLND_WID_RDMA);
+ wrq->sg_list = sge;
+ wrq->num_sge = 1;
+ wrq->opcode = IB_WR_RDMA_WRITE;
+ wrq->send_flags = 0;
+
+ wrq->wr.rdma.remote_addr = kiblnd_rd_frag_addr(dstrd, dstidx);
+ wrq->wr.rdma.rkey = kiblnd_rd_frag_key(dstrd, dstidx);
+
+ srcidx = kiblnd_rd_consume_frag(srcrd, srcidx, wrknob);
+ dstidx = kiblnd_rd_consume_frag(dstrd, dstidx, wrknob);
+
+ resid -= wrknob;
+
+ tx->tx_nwrq++;
+ wrq++;
+ sge++;
+ }
+
+ if (rc < 0) /* no RDMA if completing with failure */
+ tx->tx_nwrq = 0;
+
+ ibmsg->ibm_u.completion.ibcm_status = rc;
+ ibmsg->ibm_u.completion.ibcm_cookie = dstcookie;
+ kiblnd_init_tx_msg(conn->ibc_peer->ibp_ni, tx,
+ type, sizeof(kib_completion_msg_t));
+
+ return rc;
+}
+
+void
+kiblnd_queue_tx_locked(kib_tx_t *tx, kib_conn_t *conn)
+{
+ struct list_head *q;
+
+ LASSERT(tx->tx_nwrq > 0); /* work items set up */
+ LASSERT(!tx->tx_queued); /* not queued for sending already */
+ LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED);
+
+ tx->tx_queued = 1;
+ tx->tx_deadline = jiffies + (*kiblnd_tunables.kib_timeout * HZ);
+
+ if (tx->tx_conn == NULL) {
+ kiblnd_conn_addref(conn);
+ tx->tx_conn = conn;
+ LASSERT(tx->tx_msg->ibm_type != IBLND_MSG_PUT_DONE);
+ } else {
+ /* PUT_DONE first attached to conn as a PUT_REQ */
+ LASSERT(tx->tx_conn == conn);
+ LASSERT(tx->tx_msg->ibm_type == IBLND_MSG_PUT_DONE);
+ }
+
+ switch (tx->tx_msg->ibm_type) {
+ default:
+ LBUG();
+
+ case IBLND_MSG_PUT_REQ:
+ case IBLND_MSG_GET_REQ:
+ q = &conn->ibc_tx_queue_rsrvd;
+ break;
+
+ case IBLND_MSG_PUT_NAK:
+ case IBLND_MSG_PUT_ACK:
+ case IBLND_MSG_PUT_DONE:
+ case IBLND_MSG_GET_DONE:
+ q = &conn->ibc_tx_queue_nocred;
+ break;
+
+ case IBLND_MSG_NOOP:
+ if (IBLND_OOB_CAPABLE(conn->ibc_version))
+ q = &conn->ibc_tx_queue_nocred;
+ else
+ q = &conn->ibc_tx_noops;
+ break;
+
+ case IBLND_MSG_IMMEDIATE:
+ q = &conn->ibc_tx_queue;
+ break;
+ }
+
+ list_add_tail(&tx->tx_list, q);
+}
+
+void
+kiblnd_queue_tx(kib_tx_t *tx, kib_conn_t *conn)
+{
+ spin_lock(&conn->ibc_lock);
+ kiblnd_queue_tx_locked(tx, conn);
+ spin_unlock(&conn->ibc_lock);
+
+ kiblnd_check_sends(conn);
+}
+
+static int kiblnd_resolve_addr(struct rdma_cm_id *cmid,
+ struct sockaddr_in *srcaddr,
+ struct sockaddr_in *dstaddr,
+ int timeout_ms)
+{
+ unsigned short port;
+ int rc;
+
+ /* allow the port to be reused */
+ rc = rdma_set_reuseaddr(cmid, 1);
+ if (rc != 0) {
+ CERROR("Unable to set reuse on cmid: %d\n", rc);
+ return rc;
+ }
+
+ /* look for a free privileged port */
+ for (port = PROT_SOCK-1; port > 0; port--) {
+ srcaddr->sin_port = htons(port);
+ rc = rdma_resolve_addr(cmid,
+ (struct sockaddr *)srcaddr,
+ (struct sockaddr *)dstaddr,
+ timeout_ms);
+ if (rc == 0) {
+ CDEBUG(D_NET, "bound to port %hu\n", port);
+ return 0;
+ } else if (rc == -EADDRINUSE || rc == -EADDRNOTAVAIL) {
+ CDEBUG(D_NET, "bind to port %hu failed: %d\n",
+ port, rc);
+ } else {
+ return rc;
+ }
+ }
+
+ CERROR("Failed to bind to a free privileged port\n");
+ return rc;
+}
+
+static void
+kiblnd_connect_peer(kib_peer_t *peer)
+{
+ struct rdma_cm_id *cmid;
+ kib_dev_t *dev;
+ kib_net_t *net = peer->ibp_ni->ni_data;
+ struct sockaddr_in srcaddr;
+ struct sockaddr_in dstaddr;
+ int rc;
+
+ LASSERT(net != NULL);
+ LASSERT(peer->ibp_connecting > 0);
+
+ cmid = kiblnd_rdma_create_id(kiblnd_cm_callback, peer, RDMA_PS_TCP,
+ IB_QPT_RC);
+
+ if (IS_ERR(cmid)) {
+ CERROR("Can't create CMID for %s: %ld\n",
+ libcfs_nid2str(peer->ibp_nid), PTR_ERR(cmid));
+ rc = PTR_ERR(cmid);
+ goto failed;
+ }
+
+ dev = net->ibn_dev;
+ memset(&srcaddr, 0, sizeof(srcaddr));
+ srcaddr.sin_family = AF_INET;
+ srcaddr.sin_addr.s_addr = htonl(dev->ibd_ifip);
+
+ memset(&dstaddr, 0, sizeof(dstaddr));
+ dstaddr.sin_family = AF_INET;
+ dstaddr.sin_port = htons(*kiblnd_tunables.kib_service);
+ dstaddr.sin_addr.s_addr = htonl(LNET_NIDADDR(peer->ibp_nid));
+
+ kiblnd_peer_addref(peer); /* cmid's ref */
+
+ if (*kiblnd_tunables.kib_use_priv_port) {
+ rc = kiblnd_resolve_addr(cmid, &srcaddr, &dstaddr,
+ *kiblnd_tunables.kib_timeout * 1000);
+ } else {
+ rc = rdma_resolve_addr(cmid,
+ (struct sockaddr *)&srcaddr,
+ (struct sockaddr *)&dstaddr,
+ *kiblnd_tunables.kib_timeout * 1000);
+ }
+ if (rc != 0) {
+ /* Can't initiate address resolution: */
+ CERROR("Can't resolve addr for %s: %d\n",
+ libcfs_nid2str(peer->ibp_nid), rc);
+ goto failed2;
+ }
+
+ LASSERT(cmid->device != NULL);
+ CDEBUG(D_NET, "%s: connection bound to %s:%pI4h:%s\n",
+ libcfs_nid2str(peer->ibp_nid), dev->ibd_ifname,
+ &dev->ibd_ifip, cmid->device->name);
+
+ return;
+
+ failed2:
+ kiblnd_peer_decref(peer); /* cmid's ref */
+ rdma_destroy_id(cmid);
+ failed:
+ kiblnd_peer_connect_failed(peer, 1, rc);
+}
+
+void
+kiblnd_launch_tx(lnet_ni_t *ni, kib_tx_t *tx, lnet_nid_t nid)
+{
+ kib_peer_t *peer;
+ kib_peer_t *peer2;
+ kib_conn_t *conn;
+ rwlock_t *g_lock = &kiblnd_data.kib_global_lock;
+ unsigned long flags;
+ int rc;
+
+ /* If I get here, I've committed to send, so I complete the tx with
+ * failure on any problems */
+
+ LASSERT(tx == NULL || tx->tx_conn == NULL); /* only set when assigned a conn */
+ LASSERT(tx == NULL || tx->tx_nwrq > 0); /* work items have been set up */
+
+ /* First time, just use a read lock since I expect to find my peer
+ * connected */
+ read_lock_irqsave(g_lock, flags);
+
+ peer = kiblnd_find_peer_locked(nid);
+ if (peer != NULL && !list_empty(&peer->ibp_conns)) {
+ /* Found a peer with an established connection */
+ conn = kiblnd_get_conn_locked(peer);
+ kiblnd_conn_addref(conn); /* 1 ref for me... */
+
+ read_unlock_irqrestore(g_lock, flags);
+
+ if (tx != NULL)
+ kiblnd_queue_tx(tx, conn);
+ kiblnd_conn_decref(conn); /* ...to here */
+ return;
+ }
+
+ read_unlock(g_lock);
+ /* Re-try with a write lock */
+ write_lock(g_lock);
+
+ peer = kiblnd_find_peer_locked(nid);
+ if (peer != NULL) {
+ if (list_empty(&peer->ibp_conns)) {
+ /* found a peer, but it's still connecting... */
+ LASSERT(peer->ibp_connecting != 0 ||
+ peer->ibp_accepting != 0);
+ if (tx != NULL)
+ list_add_tail(&tx->tx_list,
+ &peer->ibp_tx_queue);
+ write_unlock_irqrestore(g_lock, flags);
+ } else {
+ conn = kiblnd_get_conn_locked(peer);
+ kiblnd_conn_addref(conn); /* 1 ref for me... */
+
+ write_unlock_irqrestore(g_lock, flags);
+
+ if (tx != NULL)
+ kiblnd_queue_tx(tx, conn);
+ kiblnd_conn_decref(conn); /* ...to here */
+ }
+ return;
+ }
+
+ write_unlock_irqrestore(g_lock, flags);
+
+ /* Allocate a peer ready to add to the peer table and retry */
+ rc = kiblnd_create_peer(ni, &peer, nid);
+ if (rc != 0) {
+ CERROR("Can't create peer %s\n", libcfs_nid2str(nid));
+ if (tx != NULL) {
+ tx->tx_status = -EHOSTUNREACH;
+ tx->tx_waiting = 0;
+ kiblnd_tx_done(ni, tx);
+ }
+ return;
+ }
+
+ write_lock_irqsave(g_lock, flags);
+
+ peer2 = kiblnd_find_peer_locked(nid);
+ if (peer2 != NULL) {
+ if (list_empty(&peer2->ibp_conns)) {
+ /* found a peer, but it's still connecting... */
+ LASSERT(peer2->ibp_connecting != 0 ||
+ peer2->ibp_accepting != 0);
+ if (tx != NULL)
+ list_add_tail(&tx->tx_list,
+ &peer2->ibp_tx_queue);
+ write_unlock_irqrestore(g_lock, flags);
+ } else {
+ conn = kiblnd_get_conn_locked(peer2);
+ kiblnd_conn_addref(conn); /* 1 ref for me... */
+
+ write_unlock_irqrestore(g_lock, flags);
+
+ if (tx != NULL)
+ kiblnd_queue_tx(tx, conn);
+ kiblnd_conn_decref(conn); /* ...to here */
+ }
+
+ kiblnd_peer_decref(peer);
+ return;
+ }
+
+ /* Brand new peer */
+ LASSERT(peer->ibp_connecting == 0);
+ peer->ibp_connecting = 1;
+
+ /* always called with a ref on ni, which prevents ni being shutdown */
+ LASSERT(((kib_net_t *)ni->ni_data)->ibn_shutdown == 0);
+
+ if (tx != NULL)
+ list_add_tail(&tx->tx_list, &peer->ibp_tx_queue);
+
+ kiblnd_peer_addref(peer);
+ list_add_tail(&peer->ibp_list, kiblnd_nid2peerlist(nid));
+
+ write_unlock_irqrestore(g_lock, flags);
+
+ kiblnd_connect_peer(peer);
+ kiblnd_peer_decref(peer);
+}
+
+int
+kiblnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
+{
+ lnet_hdr_t *hdr = &lntmsg->msg_hdr;
+ int type = lntmsg->msg_type;
+ lnet_process_id_t target = lntmsg->msg_target;
+ int target_is_router = lntmsg->msg_target_is_router;
+ int routing = lntmsg->msg_routing;
+ unsigned int payload_niov = lntmsg->msg_niov;
+ struct kvec *payload_iov = lntmsg->msg_iov;
+ lnet_kiov_t *payload_kiov = lntmsg->msg_kiov;
+ unsigned int payload_offset = lntmsg->msg_offset;
+ unsigned int payload_nob = lntmsg->msg_len;
+ kib_msg_t *ibmsg;
+ kib_tx_t *tx;
+ int nob;
+ int rc;
+
+ /* NB 'private' is different depending on what we're sending.... */
+
+ CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n",
+ payload_nob, payload_niov, libcfs_id2str(target));
+
+ LASSERT(payload_nob == 0 || payload_niov > 0);
+ LASSERT(payload_niov <= LNET_MAX_IOV);
+
+ /* Thread context */
+ LASSERT(!in_interrupt());
+ /* payload is either all vaddrs or all pages */
+ LASSERT(!(payload_kiov != NULL && payload_iov != NULL));
+
+ switch (type) {
+ default:
+ LBUG();
+ return -EIO;
+
+ case LNET_MSG_ACK:
+ LASSERT(payload_nob == 0);
+ break;
+
+ case LNET_MSG_GET:
+ if (routing || target_is_router)
+ break; /* send IMMEDIATE */
+
+ /* is the REPLY message too small for RDMA? */
+ nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]);
+ if (nob <= IBLND_MSG_SIZE)
+ break; /* send IMMEDIATE */
+
+ tx = kiblnd_get_idle_tx(ni, target.nid);
+ if (tx == NULL) {
+ CERROR("Can't allocate txd for GET to %s\n",
+ libcfs_nid2str(target.nid));
+ return -ENOMEM;
+ }
+
+ ibmsg = tx->tx_msg;
+
+ if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0)
+ rc = kiblnd_setup_rd_iov(ni, tx,
+ &ibmsg->ibm_u.get.ibgm_rd,
+ lntmsg->msg_md->md_niov,
+ lntmsg->msg_md->md_iov.iov,
+ 0, lntmsg->msg_md->md_length);
+ else
+ rc = kiblnd_setup_rd_kiov(ni, tx,
+ &ibmsg->ibm_u.get.ibgm_rd,
+ lntmsg->msg_md->md_niov,
+ lntmsg->msg_md->md_iov.kiov,
+ 0, lntmsg->msg_md->md_length);
+ if (rc != 0) {
+ CERROR("Can't setup GET sink for %s: %d\n",
+ libcfs_nid2str(target.nid), rc);
+ kiblnd_tx_done(ni, tx);
+ return -EIO;
+ }
+
+ nob = offsetof(kib_get_msg_t, ibgm_rd.rd_frags[tx->tx_nfrags]);
+ ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie;
+ ibmsg->ibm_u.get.ibgm_hdr = *hdr;
+
+ kiblnd_init_tx_msg(ni, tx, IBLND_MSG_GET_REQ, nob);
+
+ tx->tx_lntmsg[1] = lnet_create_reply_msg(ni, lntmsg);
+ if (tx->tx_lntmsg[1] == NULL) {
+ CERROR("Can't create reply for GET -> %s\n",
+ libcfs_nid2str(target.nid));
+ kiblnd_tx_done(ni, tx);
+ return -EIO;
+ }
+
+ tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg[0,1] on completion */
+ tx->tx_waiting = 1; /* waiting for GET_DONE */
+ kiblnd_launch_tx(ni, tx, target.nid);
+ return 0;
+
+ case LNET_MSG_REPLY:
+ case LNET_MSG_PUT:
+ /* Is the payload small enough not to need RDMA? */
+ nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
+ if (nob <= IBLND_MSG_SIZE)
+ break; /* send IMMEDIATE */
+
+ tx = kiblnd_get_idle_tx(ni, target.nid);
+ if (tx == NULL) {
+ CERROR("Can't allocate %s txd for %s\n",
+ type == LNET_MSG_PUT ? "PUT" : "REPLY",
+ libcfs_nid2str(target.nid));
+ return -ENOMEM;
+ }
+
+ if (payload_kiov == NULL)
+ rc = kiblnd_setup_rd_iov(ni, tx, tx->tx_rd,
+ payload_niov, payload_iov,
+ payload_offset, payload_nob);
+ else
+ rc = kiblnd_setup_rd_kiov(ni, tx, tx->tx_rd,
+ payload_niov, payload_kiov,
+ payload_offset, payload_nob);
+ if (rc != 0) {
+ CERROR("Can't setup PUT src for %s: %d\n",
+ libcfs_nid2str(target.nid), rc);
+ kiblnd_tx_done(ni, tx);
+ return -EIO;
+ }
+
+ ibmsg = tx->tx_msg;
+ ibmsg->ibm_u.putreq.ibprm_hdr = *hdr;
+ ibmsg->ibm_u.putreq.ibprm_cookie = tx->tx_cookie;
+ kiblnd_init_tx_msg(ni, tx, IBLND_MSG_PUT_REQ, sizeof(kib_putreq_msg_t));
+
+ tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */
+ tx->tx_waiting = 1; /* waiting for PUT_{ACK,NAK} */
+ kiblnd_launch_tx(ni, tx, target.nid);
+ return 0;
+ }
+
+ /* send IMMEDIATE */
+
+ LASSERT(offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob])
+ <= IBLND_MSG_SIZE);
+
+ tx = kiblnd_get_idle_tx(ni, target.nid);
+ if (tx == NULL) {
+ CERROR("Can't send %d to %s: tx descs exhausted\n",
+ type, libcfs_nid2str(target.nid));
+ return -ENOMEM;
+ }
+
+ ibmsg = tx->tx_msg;
+ ibmsg->ibm_u.immediate.ibim_hdr = *hdr;
+
+ if (payload_kiov != NULL)
+ lnet_copy_kiov2flat(IBLND_MSG_SIZE, ibmsg,
+ offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
+ payload_niov, payload_kiov,
+ payload_offset, payload_nob);
+ else
+ lnet_copy_iov2flat(IBLND_MSG_SIZE, ibmsg,
+ offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
+ payload_niov, payload_iov,
+ payload_offset, payload_nob);
+
+ nob = offsetof(kib_immediate_msg_t, ibim_payload[payload_nob]);
+ kiblnd_init_tx_msg(ni, tx, IBLND_MSG_IMMEDIATE, nob);
+
+ tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */
+ kiblnd_launch_tx(ni, tx, target.nid);
+ return 0;
+}
+
+static void
+kiblnd_reply(lnet_ni_t *ni, kib_rx_t *rx, lnet_msg_t *lntmsg)
+{
+ lnet_process_id_t target = lntmsg->msg_target;
+ unsigned int niov = lntmsg->msg_niov;
+ struct kvec *iov = lntmsg->msg_iov;
+ lnet_kiov_t *kiov = lntmsg->msg_kiov;
+ unsigned int offset = lntmsg->msg_offset;
+ unsigned int nob = lntmsg->msg_len;
+ kib_tx_t *tx;
+ int rc;
+
+ tx = kiblnd_get_idle_tx(ni, rx->rx_conn->ibc_peer->ibp_nid);
+ if (tx == NULL) {
+ CERROR("Can't get tx for REPLY to %s\n",
+ libcfs_nid2str(target.nid));
+ goto failed_0;
+ }
+
+ if (nob == 0)
+ rc = 0;
+ else if (kiov == NULL)
+ rc = kiblnd_setup_rd_iov(ni, tx, tx->tx_rd,
+ niov, iov, offset, nob);
+ else
+ rc = kiblnd_setup_rd_kiov(ni, tx, tx->tx_rd,
+ niov, kiov, offset, nob);
+
+ if (rc != 0) {
+ CERROR("Can't setup GET src for %s: %d\n",
+ libcfs_nid2str(target.nid), rc);
+ goto failed_1;
+ }
+
+ rc = kiblnd_init_rdma(rx->rx_conn, tx,
+ IBLND_MSG_GET_DONE, nob,
+ &rx->rx_msg->ibm_u.get.ibgm_rd,
+ rx->rx_msg->ibm_u.get.ibgm_cookie);
+ if (rc < 0) {
+ CERROR("Can't setup rdma for GET from %s: %d\n",
+ libcfs_nid2str(target.nid), rc);
+ goto failed_1;
+ }
+
+ if (nob == 0) {
+ /* No RDMA: local completion may happen now! */
+ lnet_finalize(ni, lntmsg, 0);
+ } else {
+ /* RDMA: lnet_finalize(lntmsg) when it
+ * completes */
+ tx->tx_lntmsg[0] = lntmsg;
+ }
+
+ kiblnd_queue_tx(tx, rx->rx_conn);
+ return;
+
+ failed_1:
+ kiblnd_tx_done(ni, tx);
+ failed_0:
+ lnet_finalize(ni, lntmsg, -EIO);
+}
+
+int
+kiblnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
+ unsigned int niov, struct kvec *iov, lnet_kiov_t *kiov,
+ unsigned int offset, unsigned int mlen, unsigned int rlen)
+{
+ kib_rx_t *rx = private;
+ kib_msg_t *rxmsg = rx->rx_msg;
+ kib_conn_t *conn = rx->rx_conn;
+ kib_tx_t *tx;
+ kib_msg_t *txmsg;
+ int nob;
+ int post_credit = IBLND_POSTRX_PEER_CREDIT;
+ int rc = 0;
+
+ LASSERT(mlen <= rlen);
+ LASSERT(!in_interrupt());
+ /* Either all pages or all vaddrs */
+ LASSERT(!(kiov != NULL && iov != NULL));
+
+ switch (rxmsg->ibm_type) {
+ default:
+ LBUG();
+
+ case IBLND_MSG_IMMEDIATE:
+ nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]);
+ if (nob > rx->rx_nob) {
+ CERROR("Immediate message from %s too big: %d(%d)\n",
+ libcfs_nid2str(rxmsg->ibm_u.immediate.ibim_hdr.src_nid),
+ nob, rx->rx_nob);
+ rc = -EPROTO;
+ break;
+ }
+
+ if (kiov != NULL)
+ lnet_copy_flat2kiov(niov, kiov, offset,
+ IBLND_MSG_SIZE, rxmsg,
+ offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
+ mlen);
+ else
+ lnet_copy_flat2iov(niov, iov, offset,
+ IBLND_MSG_SIZE, rxmsg,
+ offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
+ mlen);
+ lnet_finalize(ni, lntmsg, 0);
+ break;
+
+ case IBLND_MSG_PUT_REQ:
+ if (mlen == 0) {
+ lnet_finalize(ni, lntmsg, 0);
+ kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK, 0,
+ rxmsg->ibm_u.putreq.ibprm_cookie);
+ break;
+ }
+
+ tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid);
+ if (tx == NULL) {
+ CERROR("Can't allocate tx for %s\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ /* Not replying will break the connection */
+ rc = -ENOMEM;
+ break;
+ }
+
+ txmsg = tx->tx_msg;
+ if (kiov == NULL)
+ rc = kiblnd_setup_rd_iov(ni, tx,
+ &txmsg->ibm_u.putack.ibpam_rd,
+ niov, iov, offset, mlen);
+ else
+ rc = kiblnd_setup_rd_kiov(ni, tx,
+ &txmsg->ibm_u.putack.ibpam_rd,
+ niov, kiov, offset, mlen);
+ if (rc != 0) {
+ CERROR("Can't setup PUT sink for %s: %d\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
+ kiblnd_tx_done(ni, tx);
+ /* tell peer it's over */
+ kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK, rc,
+ rxmsg->ibm_u.putreq.ibprm_cookie);
+ break;
+ }
+
+ nob = offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[tx->tx_nfrags]);
+ txmsg->ibm_u.putack.ibpam_src_cookie = rxmsg->ibm_u.putreq.ibprm_cookie;
+ txmsg->ibm_u.putack.ibpam_dst_cookie = tx->tx_cookie;
+
+ kiblnd_init_tx_msg(ni, tx, IBLND_MSG_PUT_ACK, nob);
+
+ tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */
+ tx->tx_waiting = 1; /* waiting for PUT_DONE */
+ kiblnd_queue_tx(tx, conn);
+
+ /* reposted buffer reserved for PUT_DONE */
+ post_credit = IBLND_POSTRX_NO_CREDIT;
+ break;
+
+ case IBLND_MSG_GET_REQ:
+ if (lntmsg != NULL) {
+ /* Optimized GET; RDMA lntmsg's payload */
+ kiblnd_reply(ni, rx, lntmsg);
+ } else {
+ /* GET didn't match anything */
+ kiblnd_send_completion(rx->rx_conn, IBLND_MSG_GET_DONE,
+ -ENODATA,
+ rxmsg->ibm_u.get.ibgm_cookie);
+ }
+ break;
+ }
+
+ kiblnd_post_rx(rx, post_credit);
+ return rc;
+}
+
+int
+kiblnd_thread_start(int (*fn)(void *arg), void *arg, char *name)
+{
+ struct task_struct *task = kthread_run(fn, arg, "%s", name);
+
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+
+ atomic_inc(&kiblnd_data.kib_nthreads);
+ return 0;
+}
+
+static void
+kiblnd_thread_fini(void)
+{
+ atomic_dec(&kiblnd_data.kib_nthreads);
+}
+
+void
+kiblnd_peer_alive(kib_peer_t *peer)
+{
+ /* This is racy, but everyone's only writing cfs_time_current() */
+ peer->ibp_last_alive = cfs_time_current();
+ mb();
+}
+
+static void
+kiblnd_peer_notify(kib_peer_t *peer)
+{
+ int error = 0;
+ unsigned long last_alive = 0;
+ unsigned long flags;
+
+ read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+ if (list_empty(&peer->ibp_conns) &&
+ peer->ibp_accepting == 0 &&
+ peer->ibp_connecting == 0 &&
+ peer->ibp_error != 0) {
+ error = peer->ibp_error;
+ peer->ibp_error = 0;
+
+ last_alive = peer->ibp_last_alive;
+ }
+
+ read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+ if (error != 0)
+ lnet_notify(peer->ibp_ni,
+ peer->ibp_nid, 0, last_alive);
+}
+
+void
+kiblnd_close_conn_locked(kib_conn_t *conn, int error)
+{
+ /* This just does the immediate housekeeping. 'error' is zero for a
+ * normal shutdown which can happen only after the connection has been
+ * established. If the connection is established, schedule the
+ * connection to be finished off by the connd. Otherwise the connd is
+ * already dealing with it (either to set it up or tear it down).
+ * Caller holds kib_global_lock exclusively in irq context */
+ kib_peer_t *peer = conn->ibc_peer;
+ kib_dev_t *dev;
+ unsigned long flags;
+
+ LASSERT(error != 0 || conn->ibc_state >= IBLND_CONN_ESTABLISHED);
+
+ if (error != 0 && conn->ibc_comms_error == 0)
+ conn->ibc_comms_error = error;
+
+ if (conn->ibc_state != IBLND_CONN_ESTABLISHED)
+ return; /* already being handled */
+
+ if (error == 0 &&
+ list_empty(&conn->ibc_tx_noops) &&
+ list_empty(&conn->ibc_tx_queue) &&
+ list_empty(&conn->ibc_tx_queue_rsrvd) &&
+ list_empty(&conn->ibc_tx_queue_nocred) &&
+ list_empty(&conn->ibc_active_txs)) {
+ CDEBUG(D_NET, "closing conn to %s\n",
+ libcfs_nid2str(peer->ibp_nid));
+ } else {
+ CNETERR("Closing conn to %s: error %d%s%s%s%s%s\n",
+ libcfs_nid2str(peer->ibp_nid), error,
+ list_empty(&conn->ibc_tx_queue) ? "" : "(sending)",
+ list_empty(&conn->ibc_tx_noops) ? "" : "(sending_noops)",
+ list_empty(&conn->ibc_tx_queue_rsrvd) ? "" : "(sending_rsrvd)",
+ list_empty(&conn->ibc_tx_queue_nocred) ? "" : "(sending_nocred)",
+ list_empty(&conn->ibc_active_txs) ? "" : "(waiting)");
+ }
+
+ dev = ((kib_net_t *)peer->ibp_ni->ni_data)->ibn_dev;
+ list_del(&conn->ibc_list);
+ /* connd (see below) takes over ibc_list's ref */
+
+ if (list_empty(&peer->ibp_conns) && /* no more conns */
+ kiblnd_peer_active(peer)) { /* still in peer table */
+ kiblnd_unlink_peer_locked(peer);
+
+ /* set/clear error on last conn */
+ peer->ibp_error = conn->ibc_comms_error;
+ }
+
+ kiblnd_set_conn_state(conn, IBLND_CONN_CLOSING);
+
+ if (error != 0 &&
+ kiblnd_dev_can_failover(dev)) {
+ list_add_tail(&dev->ibd_fail_list,
+ &kiblnd_data.kib_failed_devs);
+ wake_up(&kiblnd_data.kib_failover_waitq);
+ }
+
+ spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
+
+ list_add_tail(&conn->ibc_list, &kiblnd_data.kib_connd_conns);
+ wake_up(&kiblnd_data.kib_connd_waitq);
+
+ spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);
+}
+
+void
+kiblnd_close_conn(kib_conn_t *conn, int error)
+{
+ unsigned long flags;
+
+ write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+ kiblnd_close_conn_locked(conn, error);
+
+ write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+}
+
+static void
+kiblnd_handle_early_rxs(kib_conn_t *conn)
+{
+ unsigned long flags;
+ kib_rx_t *rx;
+ kib_rx_t *tmp;
+
+ LASSERT(!in_interrupt());
+ LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED);
+
+ write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+ list_for_each_entry_safe(rx, tmp, &conn->ibc_early_rxs, rx_list) {
+ list_del(&rx->rx_list);
+ write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+ kiblnd_handle_rx(rx);
+
+ write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+ }
+ write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+}
+
+static void
+kiblnd_abort_txs(kib_conn_t *conn, struct list_head *txs)
+{
+ LIST_HEAD(zombies);
+ struct list_head *tmp;
+ struct list_head *nxt;
+ kib_tx_t *tx;
+
+ spin_lock(&conn->ibc_lock);
+
+ list_for_each_safe(tmp, nxt, txs) {
+ tx = list_entry(tmp, kib_tx_t, tx_list);
+
+ if (txs == &conn->ibc_active_txs) {
+ LASSERT(!tx->tx_queued);
+ LASSERT(tx->tx_waiting ||
+ tx->tx_sending != 0);
+ } else {
+ LASSERT(tx->tx_queued);
+ }
+
+ tx->tx_status = -ECONNABORTED;
+ tx->tx_waiting = 0;
+
+ if (tx->tx_sending == 0) {
+ tx->tx_queued = 0;
+ list_del(&tx->tx_list);
+ list_add(&tx->tx_list, &zombies);
+ }
+ }
+
+ spin_unlock(&conn->ibc_lock);
+
+ kiblnd_txlist_done(conn->ibc_peer->ibp_ni, &zombies, -ECONNABORTED);
+}
+
+static void
+kiblnd_finalise_conn(kib_conn_t *conn)
+{
+ LASSERT(!in_interrupt());
+ LASSERT(conn->ibc_state > IBLND_CONN_INIT);
+
+ kiblnd_set_conn_state(conn, IBLND_CONN_DISCONNECTED);
+
+ /* abort_receives moves QP state to IB_QPS_ERR. This is only required
+ * for connections that didn't get as far as being connected, because
+ * rdma_disconnect() does this for free. */
+ kiblnd_abort_receives(conn);
+
+ /* Complete all tx descs not waiting for sends to complete.
+ * NB we should be safe from RDMA now that the QP has changed state */
+
+ kiblnd_abort_txs(conn, &conn->ibc_tx_noops);
+ kiblnd_abort_txs(conn, &conn->ibc_tx_queue);
+ kiblnd_abort_txs(conn, &conn->ibc_tx_queue_rsrvd);
+ kiblnd_abort_txs(conn, &conn->ibc_tx_queue_nocred);
+ kiblnd_abort_txs(conn, &conn->ibc_active_txs);
+
+ kiblnd_handle_early_rxs(conn);
+}
+
+void
+kiblnd_peer_connect_failed(kib_peer_t *peer, int active, int error)
+{
+ LIST_HEAD(zombies);
+ unsigned long flags;
+
+ LASSERT(error != 0);
+ LASSERT(!in_interrupt());
+
+ write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+ if (active) {
+ LASSERT(peer->ibp_connecting > 0);
+ peer->ibp_connecting--;
+ } else {
+ LASSERT(peer->ibp_accepting > 0);
+ peer->ibp_accepting--;
+ }
+
+ if (peer->ibp_connecting != 0 ||
+ peer->ibp_accepting != 0) {
+ /* another connection attempt under way... */
+ write_unlock_irqrestore(&kiblnd_data.kib_global_lock,
+ flags);
+ return;
+ }
+
+ if (list_empty(&peer->ibp_conns)) {
+ /* Take peer's blocked transmits to complete with error */
+ list_add(&zombies, &peer->ibp_tx_queue);
+ list_del_init(&peer->ibp_tx_queue);
+
+ if (kiblnd_peer_active(peer))
+ kiblnd_unlink_peer_locked(peer);
+
+ peer->ibp_error = error;
+ } else {
+ /* Can't have blocked transmits if there are connections */
+ LASSERT(list_empty(&peer->ibp_tx_queue));
+ }
+
+ write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+ kiblnd_peer_notify(peer);
+
+ if (list_empty(&zombies))
+ return;
+
+ CNETERR("Deleting messages for %s: connection failed\n",
+ libcfs_nid2str(peer->ibp_nid));
+
+ kiblnd_txlist_done(peer->ibp_ni, &zombies, -EHOSTUNREACH);
+}
+
+void
+kiblnd_connreq_done(kib_conn_t *conn, int status)
+{
+ kib_peer_t *peer = conn->ibc_peer;
+ kib_tx_t *tx;
+ kib_tx_t *tmp;
+ struct list_head txs;
+ unsigned long flags;
+ int active;
+
+ active = (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT);
+
+ CDEBUG(D_NET, "%s: active(%d), version(%x), status(%d)\n",
+ libcfs_nid2str(peer->ibp_nid), active,
+ conn->ibc_version, status);
+
+ LASSERT(!in_interrupt());
+ LASSERT((conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT &&
+ peer->ibp_connecting > 0) ||
+ (conn->ibc_state == IBLND_CONN_PASSIVE_WAIT &&
+ peer->ibp_accepting > 0));
+
+ LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
+ conn->ibc_connvars = NULL;
+
+ if (status != 0) {
+ /* failed to establish connection */
+ kiblnd_peer_connect_failed(peer, active, status);
+ kiblnd_finalise_conn(conn);
+ return;
+ }
+
+ /* connection established */
+ write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+ conn->ibc_last_send = jiffies;
+ kiblnd_set_conn_state(conn, IBLND_CONN_ESTABLISHED);
+ kiblnd_peer_alive(peer);
+
+ /* Add conn to peer's list and nuke any dangling conns from a different
+ * peer instance... */
+ kiblnd_conn_addref(conn); /* +1 ref for ibc_list */
+ list_add(&conn->ibc_list, &peer->ibp_conns);
+ if (active)
+ peer->ibp_connecting--;
+ else
+ peer->ibp_accepting--;
+
+ if (peer->ibp_version == 0) {
+ peer->ibp_version = conn->ibc_version;
+ peer->ibp_incarnation = conn->ibc_incarnation;
+ }
+
+ if (peer->ibp_version != conn->ibc_version ||
+ peer->ibp_incarnation != conn->ibc_incarnation) {
+ kiblnd_close_stale_conns_locked(peer, conn->ibc_version,
+ conn->ibc_incarnation);
+ peer->ibp_version = conn->ibc_version;
+ peer->ibp_incarnation = conn->ibc_incarnation;
+ }
+
+ /* grab pending txs while I have the lock */
+ list_add(&txs, &peer->ibp_tx_queue);
+ list_del_init(&peer->ibp_tx_queue);
+
+ if (!kiblnd_peer_active(peer) || /* peer has been deleted */
+ conn->ibc_comms_error != 0) { /* error has happened already */
+ lnet_ni_t *ni = peer->ibp_ni;
+
+ /* start to shut down connection */
+ kiblnd_close_conn_locked(conn, -ECONNABORTED);
+ write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+ kiblnd_txlist_done(ni, &txs, -ECONNABORTED);
+
+ return;
+ }
+
+ write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+ /* Schedule blocked txs */
+ spin_lock(&conn->ibc_lock);
+ list_for_each_entry_safe(tx, tmp, &txs, tx_list) {
+ list_del(&tx->tx_list);
+
+ kiblnd_queue_tx_locked(tx, conn);
+ }
+ spin_unlock(&conn->ibc_lock);
+
+ kiblnd_check_sends(conn);
+
+ /* schedule blocked rxs */
+ kiblnd_handle_early_rxs(conn);
+}
+
+static void
+kiblnd_reject(struct rdma_cm_id *cmid, kib_rej_t *rej)
+{
+ int rc;
+
+ rc = rdma_reject(cmid, rej, sizeof(*rej));
+
+ if (rc != 0)
+ CWARN("Error %d sending reject\n", rc);
+}
+
+static int
+kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob)
+{
+ rwlock_t *g_lock = &kiblnd_data.kib_global_lock;
+ kib_msg_t *reqmsg = priv;
+ kib_msg_t *ackmsg;
+ kib_dev_t *ibdev;
+ kib_peer_t *peer;
+ kib_peer_t *peer2;
+ kib_conn_t *conn;
+ lnet_ni_t *ni = NULL;
+ kib_net_t *net = NULL;
+ lnet_nid_t nid;
+ struct rdma_conn_param cp;
+ kib_rej_t rej;
+ int version = IBLND_MSG_VERSION;
+ unsigned long flags;
+ int rc;
+ struct sockaddr_in *peer_addr;
+ LASSERT(!in_interrupt());
+
+ /* cmid inherits 'context' from the corresponding listener id */
+ ibdev = (kib_dev_t *)cmid->context;
+ LASSERT(ibdev != NULL);
+
+ memset(&rej, 0, sizeof(rej));
+ rej.ibr_magic = IBLND_MSG_MAGIC;
+ rej.ibr_why = IBLND_REJECT_FATAL;
+ rej.ibr_cp.ibcp_max_msg_size = IBLND_MSG_SIZE;
+
+ peer_addr = (struct sockaddr_in *)&(cmid->route.addr.dst_addr);
+ if (*kiblnd_tunables.kib_require_priv_port &&
+ ntohs(peer_addr->sin_port) >= PROT_SOCK) {
+ __u32 ip = ntohl(peer_addr->sin_addr.s_addr);
+ CERROR("Peer's port (%pI4h:%hu) is not privileged\n",
+ &ip, ntohs(peer_addr->sin_port));
+ goto failed;
+ }
+
+ if (priv_nob < offsetof(kib_msg_t, ibm_type)) {
+ CERROR("Short connection request\n");
+ goto failed;
+ }
+
+ /* Future protocol version compatibility support! If the
+ * o2iblnd-specific protocol changes, or when LNET unifies
+ * protocols over all LNDs, the initial connection will
+ * negotiate a protocol version. I trap this here to avoid
+ * console errors; the reject tells the peer which protocol I
+ * speak. */
+ if (reqmsg->ibm_magic == LNET_PROTO_MAGIC ||
+ reqmsg->ibm_magic == __swab32(LNET_PROTO_MAGIC))
+ goto failed;
+ if (reqmsg->ibm_magic == IBLND_MSG_MAGIC &&
+ reqmsg->ibm_version != IBLND_MSG_VERSION &&
+ reqmsg->ibm_version != IBLND_MSG_VERSION_1)
+ goto failed;
+ if (reqmsg->ibm_magic == __swab32(IBLND_MSG_MAGIC) &&
+ reqmsg->ibm_version != __swab16(IBLND_MSG_VERSION) &&
+ reqmsg->ibm_version != __swab16(IBLND_MSG_VERSION_1))
+ goto failed;
+
+ rc = kiblnd_unpack_msg(reqmsg, priv_nob);
+ if (rc != 0) {
+ CERROR("Can't parse connection request: %d\n", rc);
+ goto failed;
+ }
+
+ nid = reqmsg->ibm_srcnid;
+ ni = lnet_net2ni(LNET_NIDNET(reqmsg->ibm_dstnid));
+
+ if (ni != NULL) {
+ net = (kib_net_t *)ni->ni_data;
+ rej.ibr_incarnation = net->ibn_incarnation;
+ }
+
+ if (ni == NULL || /* no matching net */
+ ni->ni_nid != reqmsg->ibm_dstnid || /* right NET, wrong NID! */
+ net->ibn_dev != ibdev) { /* wrong device */
+ CERROR("Can't accept %s on %s (%s:%d:%pI4h): bad dst nid %s\n",
+ libcfs_nid2str(nid),
+ ni == NULL ? "NA" : libcfs_nid2str(ni->ni_nid),
+ ibdev->ibd_ifname, ibdev->ibd_nnets,
+ &ibdev->ibd_ifip,
+ libcfs_nid2str(reqmsg->ibm_dstnid));
+
+ goto failed;
+ }
+
+ /* check time stamp as soon as possible */
+ if (reqmsg->ibm_dststamp != 0 &&
+ reqmsg->ibm_dststamp != net->ibn_incarnation) {
+ CWARN("Stale connection request\n");
+ rej.ibr_why = IBLND_REJECT_CONN_STALE;
+ goto failed;
+ }
+
+ /* I can accept peer's version */
+ version = reqmsg->ibm_version;
+
+ if (reqmsg->ibm_type != IBLND_MSG_CONNREQ) {
+ CERROR("Unexpected connreq msg type: %x from %s\n",
+ reqmsg->ibm_type, libcfs_nid2str(nid));
+ goto failed;
+ }
+
+ if (reqmsg->ibm_u.connparams.ibcp_queue_depth !=
+ IBLND_MSG_QUEUE_SIZE(version)) {
+ CERROR("Can't accept %s: incompatible queue depth %d (%d wanted)\n",
+ libcfs_nid2str(nid), reqmsg->ibm_u.connparams.ibcp_queue_depth,
+ IBLND_MSG_QUEUE_SIZE(version));
+
+ if (version == IBLND_MSG_VERSION)
+ rej.ibr_why = IBLND_REJECT_MSG_QUEUE_SIZE;
+
+ goto failed;
+ }
+
+ if (reqmsg->ibm_u.connparams.ibcp_max_frags !=
+ IBLND_RDMA_FRAGS(version)) {
+ CERROR("Can't accept %s(version %x): incompatible max_frags %d (%d wanted)\n",
+ libcfs_nid2str(nid), version,
+ reqmsg->ibm_u.connparams.ibcp_max_frags,
+ IBLND_RDMA_FRAGS(version));
+
+ if (version == IBLND_MSG_VERSION)
+ rej.ibr_why = IBLND_REJECT_RDMA_FRAGS;
+
+ goto failed;
+
+ }
+
+ if (reqmsg->ibm_u.connparams.ibcp_max_msg_size > IBLND_MSG_SIZE) {
+ CERROR("Can't accept %s: message size %d too big (%d max)\n",
+ libcfs_nid2str(nid),
+ reqmsg->ibm_u.connparams.ibcp_max_msg_size,
+ IBLND_MSG_SIZE);
+ goto failed;
+ }
+
+ /* assume 'nid' is a new peer; create */
+ rc = kiblnd_create_peer(ni, &peer, nid);
+ if (rc != 0) {
+ CERROR("Can't create peer for %s\n", libcfs_nid2str(nid));
+ rej.ibr_why = IBLND_REJECT_NO_RESOURCES;
+ goto failed;
+ }
+
+ write_lock_irqsave(g_lock, flags);
+
+ peer2 = kiblnd_find_peer_locked(nid);
+ if (peer2 != NULL) {
+ if (peer2->ibp_version == 0) {
+ peer2->ibp_version = version;
+ peer2->ibp_incarnation = reqmsg->ibm_srcstamp;
+ }
+
+ /* not the guy I've talked with */
+ if (peer2->ibp_incarnation != reqmsg->ibm_srcstamp ||
+ peer2->ibp_version != version) {
+ kiblnd_close_peer_conns_locked(peer2, -ESTALE);
+ write_unlock_irqrestore(g_lock, flags);
+
+ CWARN("Conn stale %s [old ver: %x, new ver: %x]\n",
+ libcfs_nid2str(nid), peer2->ibp_version, version);
+
+ kiblnd_peer_decref(peer);
+ rej.ibr_why = IBLND_REJECT_CONN_STALE;
+ goto failed;
+ }
+
+ /* tie-break connection race in favour of the higher NID */
+ if (peer2->ibp_connecting != 0 &&
+ nid < ni->ni_nid) {
+ write_unlock_irqrestore(g_lock, flags);
+
+ CWARN("Conn race %s\n", libcfs_nid2str(peer2->ibp_nid));
+
+ kiblnd_peer_decref(peer);
+ rej.ibr_why = IBLND_REJECT_CONN_RACE;
+ goto failed;
+ }
+
+ peer2->ibp_accepting++;
+ kiblnd_peer_addref(peer2);
+
+ write_unlock_irqrestore(g_lock, flags);
+ kiblnd_peer_decref(peer);
+ peer = peer2;
+ } else {
+ /* Brand new peer */
+ LASSERT(peer->ibp_accepting == 0);
+ LASSERT(peer->ibp_version == 0 &&
+ peer->ibp_incarnation == 0);
+
+ peer->ibp_accepting = 1;
+ peer->ibp_version = version;
+ peer->ibp_incarnation = reqmsg->ibm_srcstamp;
+
+ /* I have a ref on ni that prevents it being shutdown */
+ LASSERT(net->ibn_shutdown == 0);
+
+ kiblnd_peer_addref(peer);
+ list_add_tail(&peer->ibp_list, kiblnd_nid2peerlist(nid));
+
+ write_unlock_irqrestore(g_lock, flags);
+ }
+
+ conn = kiblnd_create_conn(peer, cmid, IBLND_CONN_PASSIVE_WAIT, version);
+ if (conn == NULL) {
+ kiblnd_peer_connect_failed(peer, 0, -ENOMEM);
+ kiblnd_peer_decref(peer);
+ rej.ibr_why = IBLND_REJECT_NO_RESOURCES;
+ goto failed;
+ }
+
+ /* conn now "owns" cmid, so I return success from here on to ensure the
+ * CM callback doesn't destroy cmid. */
+
+ conn->ibc_incarnation = reqmsg->ibm_srcstamp;
+ conn->ibc_credits = IBLND_MSG_QUEUE_SIZE(version);
+ conn->ibc_reserved_credits = IBLND_MSG_QUEUE_SIZE(version);
+ LASSERT(conn->ibc_credits + conn->ibc_reserved_credits + IBLND_OOB_MSGS(version)
+ <= IBLND_RX_MSGS(version));
+
+ ackmsg = &conn->ibc_connvars->cv_msg;
+ memset(ackmsg, 0, sizeof(*ackmsg));
+
+ kiblnd_init_msg(ackmsg, IBLND_MSG_CONNACK,
+ sizeof(ackmsg->ibm_u.connparams));
+ ackmsg->ibm_u.connparams.ibcp_queue_depth = IBLND_MSG_QUEUE_SIZE(version);
+ ackmsg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE;
+ ackmsg->ibm_u.connparams.ibcp_max_frags = IBLND_RDMA_FRAGS(version);
+
+ kiblnd_pack_msg(ni, ackmsg, version, 0, nid, reqmsg->ibm_srcstamp);
+
+ memset(&cp, 0, sizeof(cp));
+ cp.private_data = ackmsg;
+ cp.private_data_len = ackmsg->ibm_nob;
+ cp.responder_resources = 0; /* No atomic ops or RDMA reads */
+ cp.initiator_depth = 0;
+ cp.flow_control = 1;
+ cp.retry_count = *kiblnd_tunables.kib_retry_count;
+ cp.rnr_retry_count = *kiblnd_tunables.kib_rnr_retry_count;
+
+ CDEBUG(D_NET, "Accept %s\n", libcfs_nid2str(nid));
+
+ rc = rdma_accept(cmid, &cp);
+ if (rc != 0) {
+ CERROR("Can't accept %s: %d\n", libcfs_nid2str(nid), rc);
+ rej.ibr_version = version;
+ rej.ibr_why = IBLND_REJECT_FATAL;
+
+ kiblnd_reject(cmid, &rej);
+ kiblnd_connreq_done(conn, rc);
+ kiblnd_conn_decref(conn);
+ }
+
+ lnet_ni_decref(ni);
+ return 0;
+
+ failed:
+ if (ni != NULL)
+ lnet_ni_decref(ni);
+
+ rej.ibr_version = version;
+ rej.ibr_cp.ibcp_queue_depth = IBLND_MSG_QUEUE_SIZE(version);
+ rej.ibr_cp.ibcp_max_frags = IBLND_RDMA_FRAGS(version);
+ kiblnd_reject(cmid, &rej);
+
+ return -ECONNREFUSED;
+}
+
+static void
+kiblnd_reconnect(kib_conn_t *conn, int version,
+ __u64 incarnation, int why, kib_connparams_t *cp)
+{
+ kib_peer_t *peer = conn->ibc_peer;
+ char *reason;
+ int retry = 0;
+ unsigned long flags;
+
+ LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT);
+ LASSERT(peer->ibp_connecting > 0); /* 'conn' at least */
+
+ write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+ /* retry connection if it's still needed and no other connection
+ * attempts (active or passive) are in progress
+ * NB: reconnect is still needed even when ibp_tx_queue is
+ * empty if ibp_version != version because reconnect may be
+ * initiated by kiblnd_query() */
+ if ((!list_empty(&peer->ibp_tx_queue) ||
+ peer->ibp_version != version) &&
+ peer->ibp_connecting == 1 &&
+ peer->ibp_accepting == 0) {
+ retry = 1;
+ peer->ibp_connecting++;
+
+ peer->ibp_version = version;
+ peer->ibp_incarnation = incarnation;
+ }
+
+ write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+ if (!retry)
+ return;
+
+ switch (why) {
+ default:
+ reason = "Unknown";
+ break;
+
+ case IBLND_REJECT_CONN_STALE:
+ reason = "stale";
+ break;
+
+ case IBLND_REJECT_CONN_RACE:
+ reason = "conn race";
+ break;
+
+ case IBLND_REJECT_CONN_UNCOMPAT:
+ reason = "version negotiation";
+ break;
+ }
+
+ CNETERR("%s: retrying (%s), %x, %x, queue_dep: %d, max_frag: %d, msg_size: %d\n",
+ libcfs_nid2str(peer->ibp_nid),
+ reason, IBLND_MSG_VERSION, version,
+ cp != NULL ? cp->ibcp_queue_depth : IBLND_MSG_QUEUE_SIZE(version),
+ cp != NULL ? cp->ibcp_max_frags : IBLND_RDMA_FRAGS(version),
+ cp != NULL ? cp->ibcp_max_msg_size : IBLND_MSG_SIZE);
+
+ kiblnd_connect_peer(peer);
+}
+
+static void
+kiblnd_rejected(kib_conn_t *conn, int reason, void *priv, int priv_nob)
+{
+ kib_peer_t *peer = conn->ibc_peer;
+
+ LASSERT(!in_interrupt());
+ LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT);
+
+ switch (reason) {
+ case IB_CM_REJ_STALE_CONN:
+ kiblnd_reconnect(conn, IBLND_MSG_VERSION, 0,
+ IBLND_REJECT_CONN_STALE, NULL);
+ break;
+
+ case IB_CM_REJ_INVALID_SERVICE_ID:
+ CNETERR("%s rejected: no listener at %d\n",
+ libcfs_nid2str(peer->ibp_nid),
+ *kiblnd_tunables.kib_service);
+ break;
+
+ case IB_CM_REJ_CONSUMER_DEFINED:
+ if (priv_nob >= offsetof(kib_rej_t, ibr_padding)) {
+ kib_rej_t *rej = priv;
+ kib_connparams_t *cp = NULL;
+ int flip = 0;
+ __u64 incarnation = -1;
+
+ /* NB. default incarnation is -1 because:
+ * a) V1 will ignore dst incarnation in connreq.
+ * b) V2 will provide incarnation while rejecting me,
+ * -1 will be overwrote.
+ *
+ * if I try to connect to a V1 peer with V2 protocol,
+ * it rejected me then upgrade to V2, I have no idea
+ * about the upgrading and try to reconnect with V1,
+ * in this case upgraded V2 can find out I'm trying to
+ * talk to the old guy and reject me(incarnation is -1).
+ */
+
+ if (rej->ibr_magic == __swab32(IBLND_MSG_MAGIC) ||
+ rej->ibr_magic == __swab32(LNET_PROTO_MAGIC)) {
+ __swab32s(&rej->ibr_magic);
+ __swab16s(&rej->ibr_version);
+ flip = 1;
+ }
+
+ if (priv_nob >= sizeof(kib_rej_t) &&
+ rej->ibr_version > IBLND_MSG_VERSION_1) {
+ /* priv_nob is always 148 in current version
+ * of OFED, so we still need to check version.
+ * (define of IB_CM_REJ_PRIVATE_DATA_SIZE) */
+ cp = &rej->ibr_cp;
+
+ if (flip) {
+ __swab64s(&rej->ibr_incarnation);
+ __swab16s(&cp->ibcp_queue_depth);
+ __swab16s(&cp->ibcp_max_frags);
+ __swab32s(&cp->ibcp_max_msg_size);
+ }
+
+ incarnation = rej->ibr_incarnation;
+ }
+
+ if (rej->ibr_magic != IBLND_MSG_MAGIC &&
+ rej->ibr_magic != LNET_PROTO_MAGIC) {
+ CERROR("%s rejected: consumer defined fatal error\n",
+ libcfs_nid2str(peer->ibp_nid));
+ break;
+ }
+
+ if (rej->ibr_version != IBLND_MSG_VERSION &&
+ rej->ibr_version != IBLND_MSG_VERSION_1) {
+ CERROR("%s rejected: o2iblnd version %x error\n",
+ libcfs_nid2str(peer->ibp_nid),
+ rej->ibr_version);
+ break;
+ }
+
+ if (rej->ibr_why == IBLND_REJECT_FATAL &&
+ rej->ibr_version == IBLND_MSG_VERSION_1) {
+ CDEBUG(D_NET, "rejected by old version peer %s: %x\n",
+ libcfs_nid2str(peer->ibp_nid), rej->ibr_version);
+
+ if (conn->ibc_version != IBLND_MSG_VERSION_1)
+ rej->ibr_why = IBLND_REJECT_CONN_UNCOMPAT;
+ }
+
+ switch (rej->ibr_why) {
+ case IBLND_REJECT_CONN_RACE:
+ case IBLND_REJECT_CONN_STALE:
+ case IBLND_REJECT_CONN_UNCOMPAT:
+ kiblnd_reconnect(conn, rej->ibr_version,
+ incarnation, rej->ibr_why, cp);
+ break;
+
+ case IBLND_REJECT_MSG_QUEUE_SIZE:
+ CERROR("%s rejected: incompatible message queue depth %d, %d\n",
+ libcfs_nid2str(peer->ibp_nid),
+ cp != NULL ? cp->ibcp_queue_depth :
+ IBLND_MSG_QUEUE_SIZE(rej->ibr_version),
+ IBLND_MSG_QUEUE_SIZE(conn->ibc_version));
+ break;
+
+ case IBLND_REJECT_RDMA_FRAGS:
+ CERROR("%s rejected: incompatible # of RDMA fragments %d, %d\n",
+ libcfs_nid2str(peer->ibp_nid),
+ cp != NULL ? cp->ibcp_max_frags :
+ IBLND_RDMA_FRAGS(rej->ibr_version),
+ IBLND_RDMA_FRAGS(conn->ibc_version));
+ break;
+
+ case IBLND_REJECT_NO_RESOURCES:
+ CERROR("%s rejected: o2iblnd no resources\n",
+ libcfs_nid2str(peer->ibp_nid));
+ break;
+
+ case IBLND_REJECT_FATAL:
+ CERROR("%s rejected: o2iblnd fatal error\n",
+ libcfs_nid2str(peer->ibp_nid));
+ break;
+
+ default:
+ CERROR("%s rejected: o2iblnd reason %d\n",
+ libcfs_nid2str(peer->ibp_nid),
+ rej->ibr_why);
+ break;
+ }
+ break;
+ }
+ /* fall through */
+ default:
+ CNETERR("%s rejected: reason %d, size %d\n",
+ libcfs_nid2str(peer->ibp_nid), reason, priv_nob);
+ break;
+ }
+
+ kiblnd_connreq_done(conn, -ECONNREFUSED);
+}
+
+static void
+kiblnd_check_connreply(kib_conn_t *conn, void *priv, int priv_nob)
+{
+ kib_peer_t *peer = conn->ibc_peer;
+ lnet_ni_t *ni = peer->ibp_ni;
+ kib_net_t *net = ni->ni_data;
+ kib_msg_t *msg = priv;
+ int ver = conn->ibc_version;
+ int rc = kiblnd_unpack_msg(msg, priv_nob);
+ unsigned long flags;
+
+ LASSERT(net != NULL);
+
+ if (rc != 0) {
+ CERROR("Can't unpack connack from %s: %d\n",
+ libcfs_nid2str(peer->ibp_nid), rc);
+ goto failed;
+ }
+
+ if (msg->ibm_type != IBLND_MSG_CONNACK) {
+ CERROR("Unexpected message %d from %s\n",
+ msg->ibm_type, libcfs_nid2str(peer->ibp_nid));
+ rc = -EPROTO;
+ goto failed;
+ }
+
+ if (ver != msg->ibm_version) {
+ CERROR("%s replied version %x is different with requested version %x\n",
+ libcfs_nid2str(peer->ibp_nid), msg->ibm_version, ver);
+ rc = -EPROTO;
+ goto failed;
+ }
+
+ if (msg->ibm_u.connparams.ibcp_queue_depth !=
+ IBLND_MSG_QUEUE_SIZE(ver)) {
+ CERROR("%s has incompatible queue depth %d(%d wanted)\n",
+ libcfs_nid2str(peer->ibp_nid),
+ msg->ibm_u.connparams.ibcp_queue_depth,
+ IBLND_MSG_QUEUE_SIZE(ver));
+ rc = -EPROTO;
+ goto failed;
+ }
+
+ if (msg->ibm_u.connparams.ibcp_max_frags !=
+ IBLND_RDMA_FRAGS(ver)) {
+ CERROR("%s has incompatible max_frags %d (%d wanted)\n",
+ libcfs_nid2str(peer->ibp_nid),
+ msg->ibm_u.connparams.ibcp_max_frags,
+ IBLND_RDMA_FRAGS(ver));
+ rc = -EPROTO;
+ goto failed;
+ }
+
+ if (msg->ibm_u.connparams.ibcp_max_msg_size > IBLND_MSG_SIZE) {
+ CERROR("%s max message size %d too big (%d max)\n",
+ libcfs_nid2str(peer->ibp_nid),
+ msg->ibm_u.connparams.ibcp_max_msg_size,
+ IBLND_MSG_SIZE);
+ rc = -EPROTO;
+ goto failed;
+ }
+
+ read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+ if (msg->ibm_dstnid == ni->ni_nid &&
+ msg->ibm_dststamp == net->ibn_incarnation)
+ rc = 0;
+ else
+ rc = -ESTALE;
+ read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+ if (rc != 0) {
+ CERROR("Bad connection reply from %s, rc = %d, version: %x max_frags: %d\n",
+ libcfs_nid2str(peer->ibp_nid), rc,
+ msg->ibm_version, msg->ibm_u.connparams.ibcp_max_frags);
+ goto failed;
+ }
+
+ conn->ibc_incarnation = msg->ibm_srcstamp;
+ conn->ibc_credits =
+ conn->ibc_reserved_credits = IBLND_MSG_QUEUE_SIZE(ver);
+ LASSERT(conn->ibc_credits + conn->ibc_reserved_credits + IBLND_OOB_MSGS(ver)
+ <= IBLND_RX_MSGS(ver));
+
+ kiblnd_connreq_done(conn, 0);
+ return;
+
+ failed:
+ /* NB My QP has already established itself, so I handle anything going
+ * wrong here by setting ibc_comms_error.
+ * kiblnd_connreq_done(0) moves the conn state to ESTABLISHED, but then
+ * immediately tears it down. */
+
+ LASSERT(rc != 0);
+ conn->ibc_comms_error = rc;
+ kiblnd_connreq_done(conn, 0);
+}
+
+static int
+kiblnd_active_connect(struct rdma_cm_id *cmid)
+{
+ kib_peer_t *peer = (kib_peer_t *)cmid->context;
+ kib_conn_t *conn;
+ kib_msg_t *msg;
+ struct rdma_conn_param cp;
+ int version;
+ __u64 incarnation;
+ unsigned long flags;
+ int rc;
+
+ read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+ incarnation = peer->ibp_incarnation;
+ version = (peer->ibp_version == 0) ? IBLND_MSG_VERSION :
+ peer->ibp_version;
+
+ read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+ conn = kiblnd_create_conn(peer, cmid, IBLND_CONN_ACTIVE_CONNECT, version);
+ if (conn == NULL) {
+ kiblnd_peer_connect_failed(peer, 1, -ENOMEM);
+ kiblnd_peer_decref(peer); /* lose cmid's ref */
+ return -ENOMEM;
+ }
+
+ /* conn "owns" cmid now, so I return success from here on to ensure the
+ * CM callback doesn't destroy cmid. conn also takes over cmid's ref
+ * on peer */
+
+ msg = &conn->ibc_connvars->cv_msg;
+
+ memset(msg, 0, sizeof(*msg));
+ kiblnd_init_msg(msg, IBLND_MSG_CONNREQ, sizeof(msg->ibm_u.connparams));
+ msg->ibm_u.connparams.ibcp_queue_depth = IBLND_MSG_QUEUE_SIZE(version);
+ msg->ibm_u.connparams.ibcp_max_frags = IBLND_RDMA_FRAGS(version);
+ msg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE;
+
+ kiblnd_pack_msg(peer->ibp_ni, msg, version,
+ 0, peer->ibp_nid, incarnation);
+
+ memset(&cp, 0, sizeof(cp));
+ cp.private_data = msg;
+ cp.private_data_len = msg->ibm_nob;
+ cp.responder_resources = 0; /* No atomic ops or RDMA reads */
+ cp.initiator_depth = 0;
+ cp.flow_control = 1;
+ cp.retry_count = *kiblnd_tunables.kib_retry_count;
+ cp.rnr_retry_count = *kiblnd_tunables.kib_rnr_retry_count;
+
+ LASSERT(cmid->context == (void *)conn);
+ LASSERT(conn->ibc_cmid == cmid);
+
+ rc = rdma_connect(cmid, &cp);
+ if (rc != 0) {
+ CERROR("Can't connect to %s: %d\n",
+ libcfs_nid2str(peer->ibp_nid), rc);
+ kiblnd_connreq_done(conn, rc);
+ kiblnd_conn_decref(conn);
+ }
+
+ return 0;
+}
+
+int
+kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event)
+{
+ kib_peer_t *peer;
+ kib_conn_t *conn;
+ int rc;
+
+ switch (event->event) {
+ default:
+ CERROR("Unexpected event: %d, status: %d\n",
+ event->event, event->status);
+ LBUG();
+
+ case RDMA_CM_EVENT_CONNECT_REQUEST:
+ /* destroy cmid on failure */
+ rc = kiblnd_passive_connect(cmid,
+ (void *)KIBLND_CONN_PARAM(event),
+ KIBLND_CONN_PARAM_LEN(event));
+ CDEBUG(D_NET, "connreq: %d\n", rc);
+ return rc;
+
+ case RDMA_CM_EVENT_ADDR_ERROR:
+ peer = (kib_peer_t *)cmid->context;
+ CNETERR("%s: ADDR ERROR %d\n",
+ libcfs_nid2str(peer->ibp_nid), event->status);
+ kiblnd_peer_connect_failed(peer, 1, -EHOSTUNREACH);
+ kiblnd_peer_decref(peer);
+ return -EHOSTUNREACH; /* rc != 0 destroys cmid */
+
+ case RDMA_CM_EVENT_ADDR_RESOLVED:
+ peer = (kib_peer_t *)cmid->context;
+
+ CDEBUG(D_NET, "%s Addr resolved: %d\n",
+ libcfs_nid2str(peer->ibp_nid), event->status);
+
+ if (event->status != 0) {
+ CNETERR("Can't resolve address for %s: %d\n",
+ libcfs_nid2str(peer->ibp_nid), event->status);
+ rc = event->status;
+ } else {
+ rc = rdma_resolve_route(
+ cmid, *kiblnd_tunables.kib_timeout * 1000);
+ if (rc == 0)
+ return 0;
+ /* Can't initiate route resolution */
+ CERROR("Can't resolve route for %s: %d\n",
+ libcfs_nid2str(peer->ibp_nid), rc);
+ }
+ kiblnd_peer_connect_failed(peer, 1, rc);
+ kiblnd_peer_decref(peer);
+ return rc; /* rc != 0 destroys cmid */
+
+ case RDMA_CM_EVENT_ROUTE_ERROR:
+ peer = (kib_peer_t *)cmid->context;
+ CNETERR("%s: ROUTE ERROR %d\n",
+ libcfs_nid2str(peer->ibp_nid), event->status);
+ kiblnd_peer_connect_failed(peer, 1, -EHOSTUNREACH);
+ kiblnd_peer_decref(peer);
+ return -EHOSTUNREACH; /* rc != 0 destroys cmid */
+
+ case RDMA_CM_EVENT_ROUTE_RESOLVED:
+ peer = (kib_peer_t *)cmid->context;
+ CDEBUG(D_NET, "%s Route resolved: %d\n",
+ libcfs_nid2str(peer->ibp_nid), event->status);
+
+ if (event->status == 0)
+ return kiblnd_active_connect(cmid);
+
+ CNETERR("Can't resolve route for %s: %d\n",
+ libcfs_nid2str(peer->ibp_nid), event->status);
+ kiblnd_peer_connect_failed(peer, 1, event->status);
+ kiblnd_peer_decref(peer);
+ return event->status; /* rc != 0 destroys cmid */
+
+ case RDMA_CM_EVENT_UNREACHABLE:
+ conn = (kib_conn_t *)cmid->context;
+ LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT ||
+ conn->ibc_state == IBLND_CONN_PASSIVE_WAIT);
+ CNETERR("%s: UNREACHABLE %d\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid), event->status);
+ kiblnd_connreq_done(conn, -ENETDOWN);
+ kiblnd_conn_decref(conn);
+ return 0;
+
+ case RDMA_CM_EVENT_CONNECT_ERROR:
+ conn = (kib_conn_t *)cmid->context;
+ LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT ||
+ conn->ibc_state == IBLND_CONN_PASSIVE_WAIT);
+ CNETERR("%s: CONNECT ERROR %d\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid), event->status);
+ kiblnd_connreq_done(conn, -ENOTCONN);
+ kiblnd_conn_decref(conn);
+ return 0;
+
+ case RDMA_CM_EVENT_REJECTED:
+ conn = (kib_conn_t *)cmid->context;
+ switch (conn->ibc_state) {
+ default:
+ LBUG();
+
+ case IBLND_CONN_PASSIVE_WAIT:
+ CERROR("%s: REJECTED %d\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid),
+ event->status);
+ kiblnd_connreq_done(conn, -ECONNRESET);
+ break;
+
+ case IBLND_CONN_ACTIVE_CONNECT:
+ kiblnd_rejected(conn, event->status,
+ (void *)KIBLND_CONN_PARAM(event),
+ KIBLND_CONN_PARAM_LEN(event));
+ break;
+ }
+ kiblnd_conn_decref(conn);
+ return 0;
+
+ case RDMA_CM_EVENT_ESTABLISHED:
+ conn = (kib_conn_t *)cmid->context;
+ switch (conn->ibc_state) {
+ default:
+ LBUG();
+
+ case IBLND_CONN_PASSIVE_WAIT:
+ CDEBUG(D_NET, "ESTABLISHED (passive): %s\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ kiblnd_connreq_done(conn, 0);
+ break;
+
+ case IBLND_CONN_ACTIVE_CONNECT:
+ CDEBUG(D_NET, "ESTABLISHED(active): %s\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ kiblnd_check_connreply(conn,
+ (void *)KIBLND_CONN_PARAM(event),
+ KIBLND_CONN_PARAM_LEN(event));
+ break;
+ }
+ /* net keeps its ref on conn! */
+ return 0;
+
+ case RDMA_CM_EVENT_TIMEWAIT_EXIT:
+ CDEBUG(D_NET, "Ignore TIMEWAIT_EXIT event\n");
+ return 0;
+ case RDMA_CM_EVENT_DISCONNECTED:
+ conn = (kib_conn_t *)cmid->context;
+ if (conn->ibc_state < IBLND_CONN_ESTABLISHED) {
+ CERROR("%s DISCONNECTED\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ kiblnd_connreq_done(conn, -ECONNRESET);
+ } else {
+ kiblnd_close_conn(conn, 0);
+ }
+ kiblnd_conn_decref(conn);
+ cmid->context = NULL;
+ return 0;
+
+ case RDMA_CM_EVENT_DEVICE_REMOVAL:
+ LCONSOLE_ERROR_MSG(0x131,
+ "Received notification of device removal\n"
+ "Please shutdown LNET to allow this to proceed\n");
+ /* Can't remove network from underneath LNET for now, so I have
+ * to ignore this */
+ return 0;
+
+ case RDMA_CM_EVENT_ADDR_CHANGE:
+ LCONSOLE_INFO("Physical link changed (eg hca/port)\n");
+ return 0;
+ }
+}
+
+static int
+kiblnd_check_txs_locked(kib_conn_t *conn, struct list_head *txs)
+{
+ kib_tx_t *tx;
+ struct list_head *ttmp;
+
+ list_for_each(ttmp, txs) {
+ tx = list_entry(ttmp, kib_tx_t, tx_list);
+
+ if (txs != &conn->ibc_active_txs) {
+ LASSERT(tx->tx_queued);
+ } else {
+ LASSERT(!tx->tx_queued);
+ LASSERT(tx->tx_waiting || tx->tx_sending != 0);
+ }
+
+ if (cfs_time_aftereq(jiffies, tx->tx_deadline)) {
+ CERROR("Timed out tx: %s, %lu seconds\n",
+ kiblnd_queue2str(conn, txs),
+ cfs_duration_sec(jiffies - tx->tx_deadline));
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+static int
+kiblnd_conn_timed_out_locked(kib_conn_t *conn)
+{
+ return kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue) ||
+ kiblnd_check_txs_locked(conn, &conn->ibc_tx_noops) ||
+ kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue_rsrvd) ||
+ kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue_nocred) ||
+ kiblnd_check_txs_locked(conn, &conn->ibc_active_txs);
+}
+
+static void
+kiblnd_check_conns(int idx)
+{
+ LIST_HEAD(closes);
+ LIST_HEAD(checksends);
+ struct list_head *peers = &kiblnd_data.kib_peers[idx];
+ struct list_head *ptmp;
+ kib_peer_t *peer;
+ kib_conn_t *conn;
+ kib_conn_t *tmp;
+ struct list_head *ctmp;
+ unsigned long flags;
+
+ /* NB. We expect to have a look at all the peers and not find any
+ * RDMAs to time out, so we just use a shared lock while we
+ * take a look... */
+ read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+ list_for_each(ptmp, peers) {
+ peer = list_entry(ptmp, kib_peer_t, ibp_list);
+
+ list_for_each(ctmp, &peer->ibp_conns) {
+ int timedout;
+ int sendnoop;
+
+ conn = list_entry(ctmp, kib_conn_t, ibc_list);
+
+ LASSERT(conn->ibc_state == IBLND_CONN_ESTABLISHED);
+
+ spin_lock(&conn->ibc_lock);
+
+ sendnoop = kiblnd_need_noop(conn);
+ timedout = kiblnd_conn_timed_out_locked(conn);
+ if (!sendnoop && !timedout) {
+ spin_unlock(&conn->ibc_lock);
+ continue;
+ }
+
+ if (timedout) {
+ CERROR("Timed out RDMA with %s (%lu): c: %u, oc: %u, rc: %u\n",
+ libcfs_nid2str(peer->ibp_nid),
+ cfs_duration_sec(cfs_time_current() -
+ peer->ibp_last_alive),
+ conn->ibc_credits,
+ conn->ibc_outstanding_credits,
+ conn->ibc_reserved_credits);
+ list_add(&conn->ibc_connd_list, &closes);
+ } else {
+ list_add(&conn->ibc_connd_list,
+ &checksends);
+ }
+ /* +ref for 'closes' or 'checksends' */
+ kiblnd_conn_addref(conn);
+
+ spin_unlock(&conn->ibc_lock);
+ }
+ }
+
+ read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+ /* Handle timeout by closing the whole
+ * connection. We can only be sure RDMA activity
+ * has ceased once the QP has been modified. */
+ list_for_each_entry_safe(conn, tmp, &closes, ibc_connd_list) {
+ list_del(&conn->ibc_connd_list);
+ kiblnd_close_conn(conn, -ETIMEDOUT);
+ kiblnd_conn_decref(conn);
+ }
+
+ /* In case we have enough credits to return via a
+ * NOOP, but there were no non-blocking tx descs
+ * free to do it last time... */
+ while (!list_empty(&checksends)) {
+ conn = list_entry(checksends.next,
+ kib_conn_t, ibc_connd_list);
+ list_del(&conn->ibc_connd_list);
+ kiblnd_check_sends(conn);
+ kiblnd_conn_decref(conn);
+ }
+}
+
+static void
+kiblnd_disconnect_conn(kib_conn_t *conn)
+{
+ LASSERT(!in_interrupt());
+ LASSERT(current == kiblnd_data.kib_connd);
+ LASSERT(conn->ibc_state == IBLND_CONN_CLOSING);
+
+ rdma_disconnect(conn->ibc_cmid);
+ kiblnd_finalise_conn(conn);
+
+ kiblnd_peer_notify(conn->ibc_peer);
+}
+
+int
+kiblnd_connd(void *arg)
+{
+ wait_queue_t wait;
+ unsigned long flags;
+ kib_conn_t *conn;
+ int timeout;
+ int i;
+ int dropped_lock;
+ int peer_index = 0;
+ unsigned long deadline = jiffies;
+
+ cfs_block_allsigs();
+
+ init_waitqueue_entry(&wait, current);
+ kiblnd_data.kib_connd = current;
+
+ spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
+
+ while (!kiblnd_data.kib_shutdown) {
+
+ dropped_lock = 0;
+
+ if (!list_empty(&kiblnd_data.kib_connd_zombies)) {
+ conn = list_entry(kiblnd_data. \
+ kib_connd_zombies.next,
+ kib_conn_t, ibc_list);
+ list_del(&conn->ibc_list);
+
+ spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock,
+ flags);
+ dropped_lock = 1;
+
+ kiblnd_destroy_conn(conn);
+
+ spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
+ }
+
+ if (!list_empty(&kiblnd_data.kib_connd_conns)) {
+ conn = list_entry(kiblnd_data.kib_connd_conns.next,
+ kib_conn_t, ibc_list);
+ list_del(&conn->ibc_list);
+
+ spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock,
+ flags);
+ dropped_lock = 1;
+
+ kiblnd_disconnect_conn(conn);
+ kiblnd_conn_decref(conn);
+
+ spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
+ }
+
+ /* careful with the jiffy wrap... */
+ timeout = (int)(deadline - jiffies);
+ if (timeout <= 0) {
+ const int n = 4;
+ const int p = 1;
+ int chunk = kiblnd_data.kib_peer_hash_size;
+
+ spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);
+ dropped_lock = 1;
+
+ /* Time to check for RDMA timeouts on a few more
+ * peers: I do checks every 'p' seconds on a
+ * proportion of the peer table and I need to check
+ * every connection 'n' times within a timeout
+ * interval, to ensure I detect a timeout on any
+ * connection within (n+1)/n times the timeout
+ * interval. */
+
+ if (*kiblnd_tunables.kib_timeout > n * p)
+ chunk = (chunk * n * p) /
+ *kiblnd_tunables.kib_timeout;
+ if (chunk == 0)
+ chunk = 1;
+
+ for (i = 0; i < chunk; i++) {
+ kiblnd_check_conns(peer_index);
+ peer_index = (peer_index + 1) %
+ kiblnd_data.kib_peer_hash_size;
+ }
+
+ deadline += p * HZ;
+ spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
+ }
+
+ if (dropped_lock)
+ continue;
+
+ /* Nothing to do for 'timeout' */
+ set_current_state(TASK_INTERRUPTIBLE);
+ add_wait_queue(&kiblnd_data.kib_connd_waitq, &wait);
+ spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);
+
+ schedule_timeout(timeout);
+
+ remove_wait_queue(&kiblnd_data.kib_connd_waitq, &wait);
+ spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
+ }
+
+ spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);
+
+ kiblnd_thread_fini();
+ return 0;
+}
+
+void
+kiblnd_qp_event(struct ib_event *event, void *arg)
+{
+ kib_conn_t *conn = arg;
+
+ switch (event->event) {
+ case IB_EVENT_COMM_EST:
+ CDEBUG(D_NET, "%s established\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ return;
+
+ default:
+ CERROR("%s: Async QP event type %d\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event);
+ return;
+ }
+}
+
+static void
+kiblnd_complete(struct ib_wc *wc)
+{
+ switch (kiblnd_wreqid2type(wc->wr_id)) {
+ default:
+ LBUG();
+
+ case IBLND_WID_RDMA:
+ /* We only get RDMA completion notification if it fails. All
+ * subsequent work items, including the final SEND will fail
+ * too. However we can't print out any more info about the
+ * failing RDMA because 'tx' might be back on the idle list or
+ * even reused already if we didn't manage to post all our work
+ * items */
+ CNETERR("RDMA (tx: %p) failed: %d\n",
+ kiblnd_wreqid2ptr(wc->wr_id), wc->status);
+ return;
+
+ case IBLND_WID_TX:
+ kiblnd_tx_complete(kiblnd_wreqid2ptr(wc->wr_id), wc->status);
+ return;
+
+ case IBLND_WID_RX:
+ kiblnd_rx_complete(kiblnd_wreqid2ptr(wc->wr_id), wc->status,
+ wc->byte_len);
+ return;
+ }
+}
+
+void
+kiblnd_cq_completion(struct ib_cq *cq, void *arg)
+{
+ /* NB I'm not allowed to schedule this conn once its refcount has
+ * reached 0. Since fundamentally I'm racing with scheduler threads
+ * consuming my CQ I could be called after all completions have
+ * occurred. But in this case, ibc_nrx == 0 && ibc_nsends_posted == 0
+ * and this CQ is about to be destroyed so I NOOP. */
+ kib_conn_t *conn = (kib_conn_t *)arg;
+ struct kib_sched_info *sched = conn->ibc_sched;
+ unsigned long flags;
+
+ LASSERT(cq == conn->ibc_cq);
+
+ spin_lock_irqsave(&sched->ibs_lock, flags);
+
+ conn->ibc_ready = 1;
+
+ if (!conn->ibc_scheduled &&
+ (conn->ibc_nrx > 0 ||
+ conn->ibc_nsends_posted > 0)) {
+ kiblnd_conn_addref(conn); /* +1 ref for sched_conns */
+ conn->ibc_scheduled = 1;
+ list_add_tail(&conn->ibc_sched_list, &sched->ibs_conns);
+
+ if (waitqueue_active(&sched->ibs_waitq))
+ wake_up(&sched->ibs_waitq);
+ }
+
+ spin_unlock_irqrestore(&sched->ibs_lock, flags);
+}
+
+void
+kiblnd_cq_event(struct ib_event *event, void *arg)
+{
+ kib_conn_t *conn = arg;
+
+ CERROR("%s: async CQ event type %d\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event);
+}
+
+int
+kiblnd_scheduler(void *arg)
+{
+ long id = (long)arg;
+ struct kib_sched_info *sched;
+ kib_conn_t *conn;
+ wait_queue_t wait;
+ unsigned long flags;
+ struct ib_wc wc;
+ int did_something;
+ int busy_loops = 0;
+ int rc;
+
+ cfs_block_allsigs();
+
+ init_waitqueue_entry(&wait, current);
+
+ sched = kiblnd_data.kib_scheds[KIB_THREAD_CPT(id)];
+
+ rc = cfs_cpt_bind(lnet_cpt_table(), sched->ibs_cpt);
+ if (rc != 0) {
+ CWARN("Failed to bind on CPT %d, please verify whether all CPUs are healthy and reload modules if necessary, otherwise your system might under risk of low performance\n",
+ sched->ibs_cpt);
+ }
+
+ spin_lock_irqsave(&sched->ibs_lock, flags);
+
+ while (!kiblnd_data.kib_shutdown) {
+ if (busy_loops++ >= IBLND_RESCHED) {
+ spin_unlock_irqrestore(&sched->ibs_lock, flags);
+
+ cond_resched();
+ busy_loops = 0;
+
+ spin_lock_irqsave(&sched->ibs_lock, flags);
+ }
+
+ did_something = 0;
+
+ if (!list_empty(&sched->ibs_conns)) {
+ conn = list_entry(sched->ibs_conns.next,
+ kib_conn_t, ibc_sched_list);
+ /* take over kib_sched_conns' ref on conn... */
+ LASSERT(conn->ibc_scheduled);
+ list_del(&conn->ibc_sched_list);
+ conn->ibc_ready = 0;
+
+ spin_unlock_irqrestore(&sched->ibs_lock, flags);
+
+ rc = ib_poll_cq(conn->ibc_cq, 1, &wc);
+ if (rc == 0) {
+ rc = ib_req_notify_cq(conn->ibc_cq,
+ IB_CQ_NEXT_COMP);
+ if (rc < 0) {
+ CWARN("%s: ib_req_notify_cq failed: %d, closing connection\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
+ kiblnd_close_conn(conn, -EIO);
+ kiblnd_conn_decref(conn);
+ spin_lock_irqsave(&sched->ibs_lock,
+ flags);
+ continue;
+ }
+
+ rc = ib_poll_cq(conn->ibc_cq, 1, &wc);
+ }
+
+ if (rc < 0) {
+ CWARN("%s: ib_poll_cq failed: %d, closing connection\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid),
+ rc);
+ kiblnd_close_conn(conn, -EIO);
+ kiblnd_conn_decref(conn);
+ spin_lock_irqsave(&sched->ibs_lock, flags);
+ continue;
+ }
+
+ spin_lock_irqsave(&sched->ibs_lock, flags);
+
+ if (rc != 0 || conn->ibc_ready) {
+ /* There may be another completion waiting; get
+ * another scheduler to check while I handle
+ * this one... */
+ /* +1 ref for sched_conns */
+ kiblnd_conn_addref(conn);
+ list_add_tail(&conn->ibc_sched_list,
+ &sched->ibs_conns);
+ if (waitqueue_active(&sched->ibs_waitq))
+ wake_up(&sched->ibs_waitq);
+ } else {
+ conn->ibc_scheduled = 0;
+ }
+
+ if (rc != 0) {
+ spin_unlock_irqrestore(&sched->ibs_lock, flags);
+ kiblnd_complete(&wc);
+
+ spin_lock_irqsave(&sched->ibs_lock, flags);
+ }
+
+ kiblnd_conn_decref(conn); /* ...drop my ref from above */
+ did_something = 1;
+ }
+
+ if (did_something)
+ continue;
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ add_wait_queue_exclusive(&sched->ibs_waitq, &wait);
+ spin_unlock_irqrestore(&sched->ibs_lock, flags);
+
+ schedule();
+ busy_loops = 0;
+
+ remove_wait_queue(&sched->ibs_waitq, &wait);
+ spin_lock_irqsave(&sched->ibs_lock, flags);
+ }
+
+ spin_unlock_irqrestore(&sched->ibs_lock, flags);
+
+ kiblnd_thread_fini();
+ return 0;
+}
+
+int
+kiblnd_failover_thread(void *arg)
+{
+ rwlock_t *glock = &kiblnd_data.kib_global_lock;
+ kib_dev_t *dev;
+ wait_queue_t wait;
+ unsigned long flags;
+ int rc;
+
+ LASSERT(*kiblnd_tunables.kib_dev_failover != 0);
+
+ cfs_block_allsigs();
+
+ init_waitqueue_entry(&wait, current);
+ write_lock_irqsave(glock, flags);
+
+ while (!kiblnd_data.kib_shutdown) {
+ int do_failover = 0;
+ int long_sleep;
+
+ list_for_each_entry(dev, &kiblnd_data.kib_failed_devs,
+ ibd_fail_list) {
+ if (time_before(cfs_time_current(),
+ dev->ibd_next_failover))
+ continue;
+ do_failover = 1;
+ break;
+ }
+
+ if (do_failover) {
+ list_del_init(&dev->ibd_fail_list);
+ dev->ibd_failover = 1;
+ write_unlock_irqrestore(glock, flags);
+
+ rc = kiblnd_dev_failover(dev);
+
+ write_lock_irqsave(glock, flags);
+
+ LASSERT(dev->ibd_failover);
+ dev->ibd_failover = 0;
+ if (rc >= 0) { /* Device is OK or failover succeed */
+ dev->ibd_next_failover = cfs_time_shift(3);
+ continue;
+ }
+
+ /* failed to failover, retry later */
+ dev->ibd_next_failover =
+ cfs_time_shift(min(dev->ibd_failed_failover, 10));
+ if (kiblnd_dev_can_failover(dev)) {
+ list_add_tail(&dev->ibd_fail_list,
+ &kiblnd_data.kib_failed_devs);
+ }
+
+ continue;
+ }
+
+ /* long sleep if no more pending failover */
+ long_sleep = list_empty(&kiblnd_data.kib_failed_devs);
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ add_wait_queue(&kiblnd_data.kib_failover_waitq, &wait);
+ write_unlock_irqrestore(glock, flags);
+
+ rc = schedule_timeout(long_sleep ? cfs_time_seconds(10) :
+ cfs_time_seconds(1));
+ remove_wait_queue(&kiblnd_data.kib_failover_waitq, &wait);
+ write_lock_irqsave(glock, flags);
+
+ if (!long_sleep || rc != 0)
+ continue;
+
+ /* have a long sleep, routine check all active devices,
+ * we need checking like this because if there is not active
+ * connection on the dev and no SEND from local, we may listen
+ * on wrong HCA for ever while there is a bonding failover */
+ list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) {
+ if (kiblnd_dev_can_failover(dev)) {
+ list_add_tail(&dev->ibd_fail_list,
+ &kiblnd_data.kib_failed_devs);
+ }
+ }
+ }
+
+ write_unlock_irqrestore(glock, flags);
+
+ kiblnd_thread_fini();
+ return 0;
+}
diff --git a/kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c b/kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c
new file mode 100644
index 000000000..eedf01afd
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c
@@ -0,0 +1,230 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/klnds/o2iblnd/o2iblnd_modparams.c
+ *
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ */
+
+#include "o2iblnd.h"
+
+static int service = 987;
+module_param(service, int, 0444);
+MODULE_PARM_DESC(service, "service number (within RDMA_PS_TCP)");
+
+static int cksum;
+module_param(cksum, int, 0644);
+MODULE_PARM_DESC(cksum, "set non-zero to enable message (not RDMA) checksums");
+
+static int timeout = 50;
+module_param(timeout, int, 0644);
+MODULE_PARM_DESC(timeout, "timeout (seconds)");
+
+/* Number of threads in each scheduler pool which is percpt,
+ * we will estimate reasonable value based on CPUs if it's set to zero. */
+static int nscheds;
+module_param(nscheds, int, 0444);
+MODULE_PARM_DESC(nscheds, "number of threads in each scheduler pool");
+
+/* NB: this value is shared by all CPTs, it can grow at runtime */
+static int ntx = 512;
+module_param(ntx, int, 0444);
+MODULE_PARM_DESC(ntx, "# of message descriptors allocated for each pool");
+
+/* NB: this value is shared by all CPTs */
+static int credits = 256;
+module_param(credits, int, 0444);
+MODULE_PARM_DESC(credits, "# concurrent sends");
+
+static int peer_credits = 8;
+module_param(peer_credits, int, 0444);
+MODULE_PARM_DESC(peer_credits, "# concurrent sends to 1 peer");
+
+static int peer_credits_hiw;
+module_param(peer_credits_hiw, int, 0444);
+MODULE_PARM_DESC(peer_credits_hiw, "when eagerly to return credits");
+
+static int peer_buffer_credits;
+module_param(peer_buffer_credits, int, 0444);
+MODULE_PARM_DESC(peer_buffer_credits, "# per-peer router buffer credits");
+
+static int peer_timeout = 180;
+module_param(peer_timeout, int, 0444);
+MODULE_PARM_DESC(peer_timeout, "Seconds without aliveness news to declare peer dead (<=0 to disable)");
+
+static char *ipif_name = "ib0";
+module_param(ipif_name, charp, 0444);
+MODULE_PARM_DESC(ipif_name, "IPoIB interface name");
+
+static int retry_count = 5;
+module_param(retry_count, int, 0644);
+MODULE_PARM_DESC(retry_count, "Retransmissions when no ACK received");
+
+static int rnr_retry_count = 6;
+module_param(rnr_retry_count, int, 0644);
+MODULE_PARM_DESC(rnr_retry_count, "RNR retransmissions");
+
+static int keepalive = 100;
+module_param(keepalive, int, 0644);
+MODULE_PARM_DESC(keepalive, "Idle time in seconds before sending a keepalive");
+
+static int ib_mtu;
+module_param(ib_mtu, int, 0444);
+MODULE_PARM_DESC(ib_mtu, "IB MTU 256/512/1024/2048/4096");
+
+static int concurrent_sends;
+module_param(concurrent_sends, int, 0444);
+MODULE_PARM_DESC(concurrent_sends, "send work-queue sizing");
+
+static int map_on_demand;
+module_param(map_on_demand, int, 0444);
+MODULE_PARM_DESC(map_on_demand, "map on demand");
+
+/* NB: this value is shared by all CPTs, it can grow at runtime */
+static int fmr_pool_size = 512;
+module_param(fmr_pool_size, int, 0444);
+MODULE_PARM_DESC(fmr_pool_size, "size of fmr pool on each CPT (>= ntx / 4)");
+
+/* NB: this value is shared by all CPTs, it can grow at runtime */
+static int fmr_flush_trigger = 384;
+module_param(fmr_flush_trigger, int, 0444);
+MODULE_PARM_DESC(fmr_flush_trigger, "# dirty FMRs that triggers pool flush");
+
+static int fmr_cache = 1;
+module_param(fmr_cache, int, 0444);
+MODULE_PARM_DESC(fmr_cache, "non-zero to enable FMR caching");
+
+/* NB: this value is shared by all CPTs, it can grow at runtime */
+static int pmr_pool_size = 512;
+module_param(pmr_pool_size, int, 0444);
+MODULE_PARM_DESC(pmr_pool_size, "size of MR cache pmr pool on each CPT");
+
+/*
+ * 0: disable failover
+ * 1: enable failover if necessary
+ * 2: force to failover (for debug)
+ */
+static int dev_failover;
+module_param(dev_failover, int, 0444);
+MODULE_PARM_DESC(dev_failover, "HCA failover for bonding (0 off, 1 on, other values reserved)");
+
+
+static int require_privileged_port;
+module_param(require_privileged_port, int, 0644);
+MODULE_PARM_DESC(require_privileged_port, "require privileged port when accepting connection");
+
+static int use_privileged_port = 1;
+module_param(use_privileged_port, int, 0644);
+MODULE_PARM_DESC(use_privileged_port, "use privileged port when initiating connection");
+
+kib_tunables_t kiblnd_tunables = {
+ .kib_dev_failover = &dev_failover,
+ .kib_service = &service,
+ .kib_cksum = &cksum,
+ .kib_timeout = &timeout,
+ .kib_keepalive = &keepalive,
+ .kib_ntx = &ntx,
+ .kib_credits = &credits,
+ .kib_peertxcredits = &peer_credits,
+ .kib_peercredits_hiw = &peer_credits_hiw,
+ .kib_peerrtrcredits = &peer_buffer_credits,
+ .kib_peertimeout = &peer_timeout,
+ .kib_default_ipif = &ipif_name,
+ .kib_retry_count = &retry_count,
+ .kib_rnr_retry_count = &rnr_retry_count,
+ .kib_concurrent_sends = &concurrent_sends,
+ .kib_ib_mtu = &ib_mtu,
+ .kib_map_on_demand = &map_on_demand,
+ .kib_fmr_pool_size = &fmr_pool_size,
+ .kib_fmr_flush_trigger = &fmr_flush_trigger,
+ .kib_fmr_cache = &fmr_cache,
+ .kib_pmr_pool_size = &pmr_pool_size,
+ .kib_require_priv_port = &require_privileged_port,
+ .kib_use_priv_port = &use_privileged_port,
+ .kib_nscheds = &nscheds
+};
+
+int
+kiblnd_tunables_init(void)
+{
+ if (kiblnd_translate_mtu(*kiblnd_tunables.kib_ib_mtu) < 0) {
+ CERROR("Invalid ib_mtu %d, expected 256/512/1024/2048/4096\n",
+ *kiblnd_tunables.kib_ib_mtu);
+ return -EINVAL;
+ }
+
+ if (*kiblnd_tunables.kib_peertxcredits < IBLND_CREDITS_DEFAULT)
+ *kiblnd_tunables.kib_peertxcredits = IBLND_CREDITS_DEFAULT;
+
+ if (*kiblnd_tunables.kib_peertxcredits > IBLND_CREDITS_MAX)
+ *kiblnd_tunables.kib_peertxcredits = IBLND_CREDITS_MAX;
+
+ if (*kiblnd_tunables.kib_peertxcredits > *kiblnd_tunables.kib_credits)
+ *kiblnd_tunables.kib_peertxcredits = *kiblnd_tunables.kib_credits;
+
+ if (*kiblnd_tunables.kib_peercredits_hiw < *kiblnd_tunables.kib_peertxcredits / 2)
+ *kiblnd_tunables.kib_peercredits_hiw = *kiblnd_tunables.kib_peertxcredits / 2;
+
+ if (*kiblnd_tunables.kib_peercredits_hiw >= *kiblnd_tunables.kib_peertxcredits)
+ *kiblnd_tunables.kib_peercredits_hiw = *kiblnd_tunables.kib_peertxcredits - 1;
+
+ if (*kiblnd_tunables.kib_map_on_demand < 0 ||
+ *kiblnd_tunables.kib_map_on_demand > IBLND_MAX_RDMA_FRAGS)
+ *kiblnd_tunables.kib_map_on_demand = 0; /* disable map-on-demand */
+
+ if (*kiblnd_tunables.kib_map_on_demand == 1)
+ *kiblnd_tunables.kib_map_on_demand = 2; /* don't make sense to create map if only one fragment */
+
+ if (*kiblnd_tunables.kib_concurrent_sends == 0) {
+ if (*kiblnd_tunables.kib_map_on_demand > 0 &&
+ *kiblnd_tunables.kib_map_on_demand <= IBLND_MAX_RDMA_FRAGS / 8)
+ *kiblnd_tunables.kib_concurrent_sends = (*kiblnd_tunables.kib_peertxcredits) * 2;
+ else
+ *kiblnd_tunables.kib_concurrent_sends = (*kiblnd_tunables.kib_peertxcredits);
+ }
+
+ if (*kiblnd_tunables.kib_concurrent_sends > *kiblnd_tunables.kib_peertxcredits * 2)
+ *kiblnd_tunables.kib_concurrent_sends = *kiblnd_tunables.kib_peertxcredits * 2;
+
+ if (*kiblnd_tunables.kib_concurrent_sends < *kiblnd_tunables.kib_peertxcredits / 2)
+ *kiblnd_tunables.kib_concurrent_sends = *kiblnd_tunables.kib_peertxcredits / 2;
+
+ if (*kiblnd_tunables.kib_concurrent_sends < *kiblnd_tunables.kib_peertxcredits) {
+ CWARN("Concurrent sends %d is lower than message queue size: %d, performance may drop slightly.\n",
+ *kiblnd_tunables.kib_concurrent_sends, *kiblnd_tunables.kib_peertxcredits);
+ }
+
+ return 0;
+}
diff --git a/kernel/drivers/staging/lustre/lnet/klnds/socklnd/Makefile b/kernel/drivers/staging/lustre/lnet/klnds/socklnd/Makefile
new file mode 100644
index 000000000..f3fb8778c
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lnet/klnds/socklnd/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_LNET) += ksocklnd.o
+
+ksocklnd-y := socklnd.o socklnd_cb.o socklnd_proto.o socklnd_modparams.o socklnd_lib-linux.o
diff --git a/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c b/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c
new file mode 100644
index 000000000..7586b7e40
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c
@@ -0,0 +1,2886 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/klnds/socklnd/socklnd.c
+ *
+ * Author: Zach Brown <zab@zabbo.net>
+ * Author: Peter J. Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ */
+
+#include "socklnd.h"
+
+static lnd_t the_ksocklnd;
+ksock_nal_data_t ksocknal_data;
+
+static ksock_interface_t *
+ksocknal_ip2iface(lnet_ni_t *ni, __u32 ip)
+{
+ ksock_net_t *net = ni->ni_data;
+ int i;
+ ksock_interface_t *iface;
+
+ for (i = 0; i < net->ksnn_ninterfaces; i++) {
+ LASSERT(i < LNET_MAX_INTERFACES);
+ iface = &net->ksnn_interfaces[i];
+
+ if (iface->ksni_ipaddr == ip)
+ return iface;
+ }
+
+ return NULL;
+}
+
+static ksock_route_t *
+ksocknal_create_route(__u32 ipaddr, int port)
+{
+ ksock_route_t *route;
+
+ LIBCFS_ALLOC(route, sizeof(*route));
+ if (route == NULL)
+ return NULL;
+
+ atomic_set(&route->ksnr_refcount, 1);
+ route->ksnr_peer = NULL;
+ route->ksnr_retry_interval = 0; /* OK to connect at any time */
+ route->ksnr_ipaddr = ipaddr;
+ route->ksnr_port = port;
+ route->ksnr_scheduled = 0;
+ route->ksnr_connecting = 0;
+ route->ksnr_connected = 0;
+ route->ksnr_deleted = 0;
+ route->ksnr_conn_count = 0;
+ route->ksnr_share_count = 0;
+
+ return route;
+}
+
+void
+ksocknal_destroy_route(ksock_route_t *route)
+{
+ LASSERT(atomic_read(&route->ksnr_refcount) == 0);
+
+ if (route->ksnr_peer != NULL)
+ ksocknal_peer_decref(route->ksnr_peer);
+
+ LIBCFS_FREE(route, sizeof(*route));
+}
+
+static int
+ksocknal_create_peer(ksock_peer_t **peerp, lnet_ni_t *ni, lnet_process_id_t id)
+{
+ ksock_net_t *net = ni->ni_data;
+ ksock_peer_t *peer;
+
+ LASSERT(id.nid != LNET_NID_ANY);
+ LASSERT(id.pid != LNET_PID_ANY);
+ LASSERT(!in_interrupt());
+
+ LIBCFS_ALLOC(peer, sizeof(*peer));
+ if (peer == NULL)
+ return -ENOMEM;
+
+ peer->ksnp_ni = ni;
+ peer->ksnp_id = id;
+ atomic_set(&peer->ksnp_refcount, 1); /* 1 ref for caller */
+ peer->ksnp_closing = 0;
+ peer->ksnp_accepting = 0;
+ peer->ksnp_proto = NULL;
+ peer->ksnp_last_alive = 0;
+ peer->ksnp_zc_next_cookie = SOCKNAL_KEEPALIVE_PING + 1;
+
+ INIT_LIST_HEAD(&peer->ksnp_conns);
+ INIT_LIST_HEAD(&peer->ksnp_routes);
+ INIT_LIST_HEAD(&peer->ksnp_tx_queue);
+ INIT_LIST_HEAD(&peer->ksnp_zc_req_list);
+ spin_lock_init(&peer->ksnp_lock);
+
+ spin_lock_bh(&net->ksnn_lock);
+
+ if (net->ksnn_shutdown) {
+ spin_unlock_bh(&net->ksnn_lock);
+
+ LIBCFS_FREE(peer, sizeof(*peer));
+ CERROR("Can't create peer: network shutdown\n");
+ return -ESHUTDOWN;
+ }
+
+ net->ksnn_npeers++;
+
+ spin_unlock_bh(&net->ksnn_lock);
+
+ *peerp = peer;
+ return 0;
+}
+
+void
+ksocknal_destroy_peer(ksock_peer_t *peer)
+{
+ ksock_net_t *net = peer->ksnp_ni->ni_data;
+
+ CDEBUG(D_NET, "peer %s %p deleted\n",
+ libcfs_id2str(peer->ksnp_id), peer);
+
+ LASSERT(atomic_read(&peer->ksnp_refcount) == 0);
+ LASSERT(peer->ksnp_accepting == 0);
+ LASSERT(list_empty(&peer->ksnp_conns));
+ LASSERT(list_empty(&peer->ksnp_routes));
+ LASSERT(list_empty(&peer->ksnp_tx_queue));
+ LASSERT(list_empty(&peer->ksnp_zc_req_list));
+
+ LIBCFS_FREE(peer, sizeof(*peer));
+
+ /* NB a peer's connections and routes keep a reference on their peer
+ * until they are destroyed, so we can be assured that _all_ state to
+ * do with this peer has been cleaned up when its refcount drops to
+ * zero. */
+ spin_lock_bh(&net->ksnn_lock);
+ net->ksnn_npeers--;
+ spin_unlock_bh(&net->ksnn_lock);
+}
+
+ksock_peer_t *
+ksocknal_find_peer_locked(lnet_ni_t *ni, lnet_process_id_t id)
+{
+ struct list_head *peer_list = ksocknal_nid2peerlist(id.nid);
+ struct list_head *tmp;
+ ksock_peer_t *peer;
+
+ list_for_each(tmp, peer_list) {
+
+ peer = list_entry(tmp, ksock_peer_t, ksnp_list);
+
+ LASSERT(!peer->ksnp_closing);
+
+ if (peer->ksnp_ni != ni)
+ continue;
+
+ if (peer->ksnp_id.nid != id.nid ||
+ peer->ksnp_id.pid != id.pid)
+ continue;
+
+ CDEBUG(D_NET, "got peer [%p] -> %s (%d)\n",
+ peer, libcfs_id2str(id),
+ atomic_read(&peer->ksnp_refcount));
+ return peer;
+ }
+ return NULL;
+}
+
+ksock_peer_t *
+ksocknal_find_peer(lnet_ni_t *ni, lnet_process_id_t id)
+{
+ ksock_peer_t *peer;
+
+ read_lock(&ksocknal_data.ksnd_global_lock);
+ peer = ksocknal_find_peer_locked(ni, id);
+ if (peer != NULL) /* +1 ref for caller? */
+ ksocknal_peer_addref(peer);
+ read_unlock(&ksocknal_data.ksnd_global_lock);
+
+ return peer;
+}
+
+static void
+ksocknal_unlink_peer_locked(ksock_peer_t *peer)
+{
+ int i;
+ __u32 ip;
+ ksock_interface_t *iface;
+
+ for (i = 0; i < peer->ksnp_n_passive_ips; i++) {
+ LASSERT(i < LNET_MAX_INTERFACES);
+ ip = peer->ksnp_passive_ips[i];
+
+ iface = ksocknal_ip2iface(peer->ksnp_ni, ip);
+ /* All IPs in peer->ksnp_passive_ips[] come from the
+ * interface list, therefore the call must succeed. */
+ LASSERT(iface != NULL);
+
+ CDEBUG(D_NET, "peer=%p iface=%p ksni_nroutes=%d\n",
+ peer, iface, iface->ksni_nroutes);
+ iface->ksni_npeers--;
+ }
+
+ LASSERT(list_empty(&peer->ksnp_conns));
+ LASSERT(list_empty(&peer->ksnp_routes));
+ LASSERT(!peer->ksnp_closing);
+ peer->ksnp_closing = 1;
+ list_del(&peer->ksnp_list);
+ /* lose peerlist's ref */
+ ksocknal_peer_decref(peer);
+}
+
+static int
+ksocknal_get_peer_info(lnet_ni_t *ni, int index,
+ lnet_process_id_t *id, __u32 *myip, __u32 *peer_ip,
+ int *port, int *conn_count, int *share_count)
+{
+ ksock_peer_t *peer;
+ struct list_head *ptmp;
+ ksock_route_t *route;
+ struct list_head *rtmp;
+ int i;
+ int j;
+ int rc = -ENOENT;
+
+ read_lock(&ksocknal_data.ksnd_global_lock);
+
+ for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
+
+ list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) {
+ peer = list_entry(ptmp, ksock_peer_t, ksnp_list);
+
+ if (peer->ksnp_ni != ni)
+ continue;
+
+ if (peer->ksnp_n_passive_ips == 0 &&
+ list_empty(&peer->ksnp_routes)) {
+ if (index-- > 0)
+ continue;
+
+ *id = peer->ksnp_id;
+ *myip = 0;
+ *peer_ip = 0;
+ *port = 0;
+ *conn_count = 0;
+ *share_count = 0;
+ rc = 0;
+ goto out;
+ }
+
+ for (j = 0; j < peer->ksnp_n_passive_ips; j++) {
+ if (index-- > 0)
+ continue;
+
+ *id = peer->ksnp_id;
+ *myip = peer->ksnp_passive_ips[j];
+ *peer_ip = 0;
+ *port = 0;
+ *conn_count = 0;
+ *share_count = 0;
+ rc = 0;
+ goto out;
+ }
+
+ list_for_each(rtmp, &peer->ksnp_routes) {
+ if (index-- > 0)
+ continue;
+
+ route = list_entry(rtmp, ksock_route_t,
+ ksnr_list);
+
+ *id = peer->ksnp_id;
+ *myip = route->ksnr_myipaddr;
+ *peer_ip = route->ksnr_ipaddr;
+ *port = route->ksnr_port;
+ *conn_count = route->ksnr_conn_count;
+ *share_count = route->ksnr_share_count;
+ rc = 0;
+ goto out;
+ }
+ }
+ }
+ out:
+ read_unlock(&ksocknal_data.ksnd_global_lock);
+ return rc;
+}
+
+static void
+ksocknal_associate_route_conn_locked(ksock_route_t *route, ksock_conn_t *conn)
+{
+ ksock_peer_t *peer = route->ksnr_peer;
+ int type = conn->ksnc_type;
+ ksock_interface_t *iface;
+
+ conn->ksnc_route = route;
+ ksocknal_route_addref(route);
+
+ if (route->ksnr_myipaddr != conn->ksnc_myipaddr) {
+ if (route->ksnr_myipaddr == 0) {
+ /* route wasn't bound locally yet (the initial route) */
+ CDEBUG(D_NET, "Binding %s %pI4h to %pI4h\n",
+ libcfs_id2str(peer->ksnp_id),
+ &route->ksnr_ipaddr,
+ &conn->ksnc_myipaddr);
+ } else {
+ CDEBUG(D_NET, "Rebinding %s %pI4h from %pI4h to %pI4h\n",
+ libcfs_id2str(peer->ksnp_id),
+ &route->ksnr_ipaddr,
+ &route->ksnr_myipaddr,
+ &conn->ksnc_myipaddr);
+
+ iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni,
+ route->ksnr_myipaddr);
+ if (iface != NULL)
+ iface->ksni_nroutes--;
+ }
+ route->ksnr_myipaddr = conn->ksnc_myipaddr;
+ iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni,
+ route->ksnr_myipaddr);
+ if (iface != NULL)
+ iface->ksni_nroutes++;
+ }
+
+ route->ksnr_connected |= (1<<type);
+ route->ksnr_conn_count++;
+
+ /* Successful connection => further attempts can
+ * proceed immediately */
+ route->ksnr_retry_interval = 0;
+}
+
+static void
+ksocknal_add_route_locked(ksock_peer_t *peer, ksock_route_t *route)
+{
+ struct list_head *tmp;
+ ksock_conn_t *conn;
+ ksock_route_t *route2;
+
+ LASSERT(!peer->ksnp_closing);
+ LASSERT(route->ksnr_peer == NULL);
+ LASSERT(!route->ksnr_scheduled);
+ LASSERT(!route->ksnr_connecting);
+ LASSERT(route->ksnr_connected == 0);
+
+ /* LASSERT(unique) */
+ list_for_each(tmp, &peer->ksnp_routes) {
+ route2 = list_entry(tmp, ksock_route_t, ksnr_list);
+
+ if (route2->ksnr_ipaddr == route->ksnr_ipaddr) {
+ CERROR("Duplicate route %s %pI4h\n",
+ libcfs_id2str(peer->ksnp_id),
+ &route->ksnr_ipaddr);
+ LBUG();
+ }
+ }
+
+ route->ksnr_peer = peer;
+ ksocknal_peer_addref(peer);
+ /* peer's routelist takes over my ref on 'route' */
+ list_add_tail(&route->ksnr_list, &peer->ksnp_routes);
+
+ list_for_each(tmp, &peer->ksnp_conns) {
+ conn = list_entry(tmp, ksock_conn_t, ksnc_list);
+
+ if (conn->ksnc_ipaddr != route->ksnr_ipaddr)
+ continue;
+
+ ksocknal_associate_route_conn_locked(route, conn);
+ /* keep going (typed routes) */
+ }
+}
+
+static void
+ksocknal_del_route_locked(ksock_route_t *route)
+{
+ ksock_peer_t *peer = route->ksnr_peer;
+ ksock_interface_t *iface;
+ ksock_conn_t *conn;
+ struct list_head *ctmp;
+ struct list_head *cnxt;
+
+ LASSERT(!route->ksnr_deleted);
+
+ /* Close associated conns */
+ list_for_each_safe(ctmp, cnxt, &peer->ksnp_conns) {
+ conn = list_entry(ctmp, ksock_conn_t, ksnc_list);
+
+ if (conn->ksnc_route != route)
+ continue;
+
+ ksocknal_close_conn_locked(conn, 0);
+ }
+
+ if (route->ksnr_myipaddr != 0) {
+ iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni,
+ route->ksnr_myipaddr);
+ if (iface != NULL)
+ iface->ksni_nroutes--;
+ }
+
+ route->ksnr_deleted = 1;
+ list_del(&route->ksnr_list);
+ ksocknal_route_decref(route); /* drop peer's ref */
+
+ if (list_empty(&peer->ksnp_routes) &&
+ list_empty(&peer->ksnp_conns)) {
+ /* I've just removed the last route to a peer with no active
+ * connections */
+ ksocknal_unlink_peer_locked(peer);
+ }
+}
+
+int
+ksocknal_add_peer(lnet_ni_t *ni, lnet_process_id_t id, __u32 ipaddr, int port)
+{
+ struct list_head *tmp;
+ ksock_peer_t *peer;
+ ksock_peer_t *peer2;
+ ksock_route_t *route;
+ ksock_route_t *route2;
+ int rc;
+
+ if (id.nid == LNET_NID_ANY ||
+ id.pid == LNET_PID_ANY)
+ return -EINVAL;
+
+ /* Have a brand new peer ready... */
+ rc = ksocknal_create_peer(&peer, ni, id);
+ if (rc != 0)
+ return rc;
+
+ route = ksocknal_create_route(ipaddr, port);
+ if (route == NULL) {
+ ksocknal_peer_decref(peer);
+ return -ENOMEM;
+ }
+
+ write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+ /* always called with a ref on ni, so shutdown can't have started */
+ LASSERT(((ksock_net_t *) ni->ni_data)->ksnn_shutdown == 0);
+
+ peer2 = ksocknal_find_peer_locked(ni, id);
+ if (peer2 != NULL) {
+ ksocknal_peer_decref(peer);
+ peer = peer2;
+ } else {
+ /* peer table takes my ref on peer */
+ list_add_tail(&peer->ksnp_list,
+ ksocknal_nid2peerlist(id.nid));
+ }
+
+ route2 = NULL;
+ list_for_each(tmp, &peer->ksnp_routes) {
+ route2 = list_entry(tmp, ksock_route_t, ksnr_list);
+
+ if (route2->ksnr_ipaddr == ipaddr)
+ break;
+
+ route2 = NULL;
+ }
+ if (route2 == NULL) {
+ ksocknal_add_route_locked(peer, route);
+ route->ksnr_share_count++;
+ } else {
+ ksocknal_route_decref(route);
+ route2->ksnr_share_count++;
+ }
+
+ write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+ return 0;
+}
+
+static void
+ksocknal_del_peer_locked(ksock_peer_t *peer, __u32 ip)
+{
+ ksock_conn_t *conn;
+ ksock_route_t *route;
+ struct list_head *tmp;
+ struct list_head *nxt;
+ int nshared;
+
+ LASSERT(!peer->ksnp_closing);
+
+ /* Extra ref prevents peer disappearing until I'm done with it */
+ ksocknal_peer_addref(peer);
+
+ list_for_each_safe(tmp, nxt, &peer->ksnp_routes) {
+ route = list_entry(tmp, ksock_route_t, ksnr_list);
+
+ /* no match */
+ if (!(ip == 0 || route->ksnr_ipaddr == ip))
+ continue;
+
+ route->ksnr_share_count = 0;
+ /* This deletes associated conns too */
+ ksocknal_del_route_locked(route);
+ }
+
+ nshared = 0;
+ list_for_each_safe(tmp, nxt, &peer->ksnp_routes) {
+ route = list_entry(tmp, ksock_route_t, ksnr_list);
+ nshared += route->ksnr_share_count;
+ }
+
+ if (nshared == 0) {
+ /* remove everything else if there are no explicit entries
+ * left */
+
+ list_for_each_safe(tmp, nxt, &peer->ksnp_routes) {
+ route = list_entry(tmp, ksock_route_t, ksnr_list);
+
+ /* we should only be removing auto-entries */
+ LASSERT(route->ksnr_share_count == 0);
+ ksocknal_del_route_locked(route);
+ }
+
+ list_for_each_safe(tmp, nxt, &peer->ksnp_conns) {
+ conn = list_entry(tmp, ksock_conn_t, ksnc_list);
+
+ ksocknal_close_conn_locked(conn, 0);
+ }
+ }
+
+ ksocknal_peer_decref(peer);
+ /* NB peer unlinks itself when last conn/route is removed */
+}
+
+static int
+ksocknal_del_peer(lnet_ni_t *ni, lnet_process_id_t id, __u32 ip)
+{
+ LIST_HEAD(zombies);
+ struct list_head *ptmp;
+ struct list_head *pnxt;
+ ksock_peer_t *peer;
+ int lo;
+ int hi;
+ int i;
+ int rc = -ENOENT;
+
+ write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+ if (id.nid != LNET_NID_ANY)
+ lo = hi = (int)(ksocknal_nid2peerlist(id.nid) - ksocknal_data.ksnd_peers);
+ else {
+ lo = 0;
+ hi = ksocknal_data.ksnd_peer_hash_size - 1;
+ }
+
+ for (i = lo; i <= hi; i++) {
+ list_for_each_safe(ptmp, pnxt,
+ &ksocknal_data.ksnd_peers[i]) {
+ peer = list_entry(ptmp, ksock_peer_t, ksnp_list);
+
+ if (peer->ksnp_ni != ni)
+ continue;
+
+ if (!((id.nid == LNET_NID_ANY || peer->ksnp_id.nid == id.nid) &&
+ (id.pid == LNET_PID_ANY || peer->ksnp_id.pid == id.pid)))
+ continue;
+
+ ksocknal_peer_addref(peer); /* a ref for me... */
+
+ ksocknal_del_peer_locked(peer, ip);
+
+ if (peer->ksnp_closing &&
+ !list_empty(&peer->ksnp_tx_queue)) {
+ LASSERT(list_empty(&peer->ksnp_conns));
+ LASSERT(list_empty(&peer->ksnp_routes));
+
+ list_splice_init(&peer->ksnp_tx_queue,
+ &zombies);
+ }
+
+ ksocknal_peer_decref(peer); /* ...till here */
+
+ rc = 0; /* matched! */
+ }
+ }
+
+ write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+ ksocknal_txlist_done(ni, &zombies, 1);
+
+ return rc;
+}
+
+static ksock_conn_t *
+ksocknal_get_conn_by_idx(lnet_ni_t *ni, int index)
+{
+ ksock_peer_t *peer;
+ struct list_head *ptmp;
+ ksock_conn_t *conn;
+ struct list_head *ctmp;
+ int i;
+
+ read_lock(&ksocknal_data.ksnd_global_lock);
+
+ for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
+ list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) {
+ peer = list_entry(ptmp, ksock_peer_t, ksnp_list);
+
+ LASSERT(!peer->ksnp_closing);
+
+ if (peer->ksnp_ni != ni)
+ continue;
+
+ list_for_each(ctmp, &peer->ksnp_conns) {
+ if (index-- > 0)
+ continue;
+
+ conn = list_entry(ctmp, ksock_conn_t,
+ ksnc_list);
+ ksocknal_conn_addref(conn);
+ read_unlock(&ksocknal_data.ksnd_global_lock);
+ return conn;
+ }
+ }
+ }
+
+ read_unlock(&ksocknal_data.ksnd_global_lock);
+ return NULL;
+}
+
+static ksock_sched_t *
+ksocknal_choose_scheduler_locked(unsigned int cpt)
+{
+ struct ksock_sched_info *info = ksocknal_data.ksnd_sched_info[cpt];
+ ksock_sched_t *sched;
+ int i;
+
+ LASSERT(info->ksi_nthreads > 0);
+
+ sched = &info->ksi_scheds[0];
+ /*
+ * NB: it's safe so far, but info->ksi_nthreads could be changed
+ * at runtime when we have dynamic LNet configuration, then we
+ * need to take care of this.
+ */
+ for (i = 1; i < info->ksi_nthreads; i++) {
+ if (sched->kss_nconns > info->ksi_scheds[i].kss_nconns)
+ sched = &info->ksi_scheds[i];
+ }
+
+ return sched;
+}
+
+static int
+ksocknal_local_ipvec(lnet_ni_t *ni, __u32 *ipaddrs)
+{
+ ksock_net_t *net = ni->ni_data;
+ int i;
+ int nip;
+
+ read_lock(&ksocknal_data.ksnd_global_lock);
+
+ nip = net->ksnn_ninterfaces;
+ LASSERT(nip <= LNET_MAX_INTERFACES);
+
+ /* Only offer interfaces for additional connections if I have
+ * more than one. */
+ if (nip < 2) {
+ read_unlock(&ksocknal_data.ksnd_global_lock);
+ return 0;
+ }
+
+ for (i = 0; i < nip; i++) {
+ ipaddrs[i] = net->ksnn_interfaces[i].ksni_ipaddr;
+ LASSERT(ipaddrs[i] != 0);
+ }
+
+ read_unlock(&ksocknal_data.ksnd_global_lock);
+ return nip;
+}
+
+static int
+ksocknal_match_peerip(ksock_interface_t *iface, __u32 *ips, int nips)
+{
+ int best_netmatch = 0;
+ int best_xor = 0;
+ int best = -1;
+ int this_xor;
+ int this_netmatch;
+ int i;
+
+ for (i = 0; i < nips; i++) {
+ if (ips[i] == 0)
+ continue;
+
+ this_xor = ips[i] ^ iface->ksni_ipaddr;
+ this_netmatch = ((this_xor & iface->ksni_netmask) == 0) ? 1 : 0;
+
+ if (!(best < 0 ||
+ best_netmatch < this_netmatch ||
+ (best_netmatch == this_netmatch &&
+ best_xor > this_xor)))
+ continue;
+
+ best = i;
+ best_netmatch = this_netmatch;
+ best_xor = this_xor;
+ }
+
+ LASSERT(best >= 0);
+ return best;
+}
+
+static int
+ksocknal_select_ips(ksock_peer_t *peer, __u32 *peerips, int n_peerips)
+{
+ rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock;
+ ksock_net_t *net = peer->ksnp_ni->ni_data;
+ ksock_interface_t *iface;
+ ksock_interface_t *best_iface;
+ int n_ips;
+ int i;
+ int j;
+ int k;
+ __u32 ip;
+ __u32 xor;
+ int this_netmatch;
+ int best_netmatch;
+ int best_npeers;
+
+ /* CAVEAT EMPTOR: We do all our interface matching with an
+ * exclusive hold of global lock at IRQ priority. We're only
+ * expecting to be dealing with small numbers of interfaces, so the
+ * O(n**3)-ness shouldn't matter */
+
+ /* Also note that I'm not going to return more than n_peerips
+ * interfaces, even if I have more myself */
+
+ write_lock_bh(global_lock);
+
+ LASSERT(n_peerips <= LNET_MAX_INTERFACES);
+ LASSERT(net->ksnn_ninterfaces <= LNET_MAX_INTERFACES);
+
+ /* Only match interfaces for additional connections
+ * if I have > 1 interface */
+ n_ips = (net->ksnn_ninterfaces < 2) ? 0 :
+ min(n_peerips, net->ksnn_ninterfaces);
+
+ for (i = 0; peer->ksnp_n_passive_ips < n_ips; i++) {
+ /* ^ yes really... */
+
+ /* If we have any new interfaces, first tick off all the
+ * peer IPs that match old interfaces, then choose new
+ * interfaces to match the remaining peer IPS.
+ * We don't forget interfaces we've stopped using; we might
+ * start using them again... */
+
+ if (i < peer->ksnp_n_passive_ips) {
+ /* Old interface. */
+ ip = peer->ksnp_passive_ips[i];
+ best_iface = ksocknal_ip2iface(peer->ksnp_ni, ip);
+
+ } else {
+ /* choose a new interface */
+ LASSERT(i == peer->ksnp_n_passive_ips);
+
+ best_iface = NULL;
+ best_netmatch = 0;
+ best_npeers = 0;
+
+ for (j = 0; j < net->ksnn_ninterfaces; j++) {
+ iface = &net->ksnn_interfaces[j];
+ ip = iface->ksni_ipaddr;
+
+ for (k = 0; k < peer->ksnp_n_passive_ips; k++)
+ if (peer->ksnp_passive_ips[k] == ip)
+ break;
+
+ if (k < peer->ksnp_n_passive_ips) /* using it already */
+ continue;
+
+ k = ksocknal_match_peerip(iface, peerips, n_peerips);
+ xor = ip ^ peerips[k];
+ this_netmatch = ((xor & iface->ksni_netmask) == 0) ? 1 : 0;
+
+ if (!(best_iface == NULL ||
+ best_netmatch < this_netmatch ||
+ (best_netmatch == this_netmatch &&
+ best_npeers > iface->ksni_npeers)))
+ continue;
+
+ best_iface = iface;
+ best_netmatch = this_netmatch;
+ best_npeers = iface->ksni_npeers;
+ }
+
+ best_iface->ksni_npeers++;
+ ip = best_iface->ksni_ipaddr;
+ peer->ksnp_passive_ips[i] = ip;
+ peer->ksnp_n_passive_ips = i+1;
+ }
+
+ /* mark the best matching peer IP used */
+ j = ksocknal_match_peerip(best_iface, peerips, n_peerips);
+ peerips[j] = 0;
+ }
+
+ /* Overwrite input peer IP addresses */
+ memcpy(peerips, peer->ksnp_passive_ips, n_ips * sizeof(*peerips));
+
+ write_unlock_bh(global_lock);
+
+ return n_ips;
+}
+
+static void
+ksocknal_create_routes(ksock_peer_t *peer, int port,
+ __u32 *peer_ipaddrs, int npeer_ipaddrs)
+{
+ ksock_route_t *newroute = NULL;
+ rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock;
+ lnet_ni_t *ni = peer->ksnp_ni;
+ ksock_net_t *net = ni->ni_data;
+ struct list_head *rtmp;
+ ksock_route_t *route;
+ ksock_interface_t *iface;
+ ksock_interface_t *best_iface;
+ int best_netmatch;
+ int this_netmatch;
+ int best_nroutes;
+ int i;
+ int j;
+
+ /* CAVEAT EMPTOR: We do all our interface matching with an
+ * exclusive hold of global lock at IRQ priority. We're only
+ * expecting to be dealing with small numbers of interfaces, so the
+ * O(n**3)-ness here shouldn't matter */
+
+ write_lock_bh(global_lock);
+
+ if (net->ksnn_ninterfaces < 2) {
+ /* Only create additional connections
+ * if I have > 1 interface */
+ write_unlock_bh(global_lock);
+ return;
+ }
+
+ LASSERT(npeer_ipaddrs <= LNET_MAX_INTERFACES);
+
+ for (i = 0; i < npeer_ipaddrs; i++) {
+ if (newroute != NULL) {
+ newroute->ksnr_ipaddr = peer_ipaddrs[i];
+ } else {
+ write_unlock_bh(global_lock);
+
+ newroute = ksocknal_create_route(peer_ipaddrs[i], port);
+ if (newroute == NULL)
+ return;
+
+ write_lock_bh(global_lock);
+ }
+
+ if (peer->ksnp_closing) {
+ /* peer got closed under me */
+ break;
+ }
+
+ /* Already got a route? */
+ route = NULL;
+ list_for_each(rtmp, &peer->ksnp_routes) {
+ route = list_entry(rtmp, ksock_route_t, ksnr_list);
+
+ if (route->ksnr_ipaddr == newroute->ksnr_ipaddr)
+ break;
+
+ route = NULL;
+ }
+ if (route != NULL)
+ continue;
+
+ best_iface = NULL;
+ best_nroutes = 0;
+ best_netmatch = 0;
+
+ LASSERT(net->ksnn_ninterfaces <= LNET_MAX_INTERFACES);
+
+ /* Select interface to connect from */
+ for (j = 0; j < net->ksnn_ninterfaces; j++) {
+ iface = &net->ksnn_interfaces[j];
+
+ /* Using this interface already? */
+ list_for_each(rtmp, &peer->ksnp_routes) {
+ route = list_entry(rtmp, ksock_route_t,
+ ksnr_list);
+
+ if (route->ksnr_myipaddr == iface->ksni_ipaddr)
+ break;
+
+ route = NULL;
+ }
+ if (route != NULL)
+ continue;
+
+ this_netmatch = (((iface->ksni_ipaddr ^
+ newroute->ksnr_ipaddr) &
+ iface->ksni_netmask) == 0) ? 1 : 0;
+
+ if (!(best_iface == NULL ||
+ best_netmatch < this_netmatch ||
+ (best_netmatch == this_netmatch &&
+ best_nroutes > iface->ksni_nroutes)))
+ continue;
+
+ best_iface = iface;
+ best_netmatch = this_netmatch;
+ best_nroutes = iface->ksni_nroutes;
+ }
+
+ if (best_iface == NULL)
+ continue;
+
+ newroute->ksnr_myipaddr = best_iface->ksni_ipaddr;
+ best_iface->ksni_nroutes++;
+
+ ksocknal_add_route_locked(peer, newroute);
+ newroute = NULL;
+ }
+
+ write_unlock_bh(global_lock);
+ if (newroute != NULL)
+ ksocknal_route_decref(newroute);
+}
+
+int
+ksocknal_accept(lnet_ni_t *ni, struct socket *sock)
+{
+ ksock_connreq_t *cr;
+ int rc;
+ __u32 peer_ip;
+ int peer_port;
+
+ rc = libcfs_sock_getaddr(sock, 1, &peer_ip, &peer_port);
+ LASSERT(rc == 0); /* we succeeded before */
+
+ LIBCFS_ALLOC(cr, sizeof(*cr));
+ if (cr == NULL) {
+ LCONSOLE_ERROR_MSG(0x12f, "Dropping connection request from %pI4h: memory exhausted\n",
+ &peer_ip);
+ return -ENOMEM;
+ }
+
+ lnet_ni_addref(ni);
+ cr->ksncr_ni = ni;
+ cr->ksncr_sock = sock;
+
+ spin_lock_bh(&ksocknal_data.ksnd_connd_lock);
+
+ list_add_tail(&cr->ksncr_list, &ksocknal_data.ksnd_connd_connreqs);
+ wake_up(&ksocknal_data.ksnd_connd_waitq);
+
+ spin_unlock_bh(&ksocknal_data.ksnd_connd_lock);
+ return 0;
+}
+
+static int
+ksocknal_connecting(ksock_peer_t *peer, __u32 ipaddr)
+{
+ ksock_route_t *route;
+
+ list_for_each_entry(route, &peer->ksnp_routes, ksnr_list) {
+
+ if (route->ksnr_ipaddr == ipaddr)
+ return route->ksnr_connecting;
+ }
+ return 0;
+}
+
+int
+ksocknal_create_conn(lnet_ni_t *ni, ksock_route_t *route,
+ struct socket *sock, int type)
+{
+ rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock;
+ LIST_HEAD(zombies);
+ lnet_process_id_t peerid;
+ struct list_head *tmp;
+ __u64 incarnation;
+ ksock_conn_t *conn;
+ ksock_conn_t *conn2;
+ ksock_peer_t *peer = NULL;
+ ksock_peer_t *peer2;
+ ksock_sched_t *sched;
+ ksock_hello_msg_t *hello;
+ int cpt;
+ ksock_tx_t *tx;
+ ksock_tx_t *txtmp;
+ int rc;
+ int active;
+ char *warn = NULL;
+
+ active = (route != NULL);
+
+ LASSERT(active == (type != SOCKLND_CONN_NONE));
+
+ LIBCFS_ALLOC(conn, sizeof(*conn));
+ if (conn == NULL) {
+ rc = -ENOMEM;
+ goto failed_0;
+ }
+
+ conn->ksnc_peer = NULL;
+ conn->ksnc_route = NULL;
+ conn->ksnc_sock = sock;
+ /* 2 ref, 1 for conn, another extra ref prevents socket
+ * being closed before establishment of connection */
+ atomic_set(&conn->ksnc_sock_refcount, 2);
+ conn->ksnc_type = type;
+ ksocknal_lib_save_callback(sock, conn);
+ atomic_set(&conn->ksnc_conn_refcount, 1); /* 1 ref for me */
+
+ conn->ksnc_rx_ready = 0;
+ conn->ksnc_rx_scheduled = 0;
+
+ INIT_LIST_HEAD(&conn->ksnc_tx_queue);
+ conn->ksnc_tx_ready = 0;
+ conn->ksnc_tx_scheduled = 0;
+ conn->ksnc_tx_carrier = NULL;
+ atomic_set(&conn->ksnc_tx_nob, 0);
+
+ LIBCFS_ALLOC(hello, offsetof(ksock_hello_msg_t,
+ kshm_ips[LNET_MAX_INTERFACES]));
+ if (hello == NULL) {
+ rc = -ENOMEM;
+ goto failed_1;
+ }
+
+ /* stash conn's local and remote addrs */
+ rc = ksocknal_lib_get_conn_addrs(conn);
+ if (rc != 0)
+ goto failed_1;
+
+ /* Find out/confirm peer's NID and connection type and get the
+ * vector of interfaces she's willing to let me connect to.
+ * Passive connections use the listener timeout since the peer sends
+ * eagerly */
+
+ if (active) {
+ peer = route->ksnr_peer;
+ LASSERT(ni == peer->ksnp_ni);
+
+ /* Active connection sends HELLO eagerly */
+ hello->kshm_nips = ksocknal_local_ipvec(ni, hello->kshm_ips);
+ peerid = peer->ksnp_id;
+
+ write_lock_bh(global_lock);
+ conn->ksnc_proto = peer->ksnp_proto;
+ write_unlock_bh(global_lock);
+
+ if (conn->ksnc_proto == NULL) {
+ conn->ksnc_proto = &ksocknal_protocol_v3x;
+#if SOCKNAL_VERSION_DEBUG
+ if (*ksocknal_tunables.ksnd_protocol == 2)
+ conn->ksnc_proto = &ksocknal_protocol_v2x;
+ else if (*ksocknal_tunables.ksnd_protocol == 1)
+ conn->ksnc_proto = &ksocknal_protocol_v1x;
+#endif
+ }
+
+ rc = ksocknal_send_hello(ni, conn, peerid.nid, hello);
+ if (rc != 0)
+ goto failed_1;
+ } else {
+ peerid.nid = LNET_NID_ANY;
+ peerid.pid = LNET_PID_ANY;
+
+ /* Passive, get protocol from peer */
+ conn->ksnc_proto = NULL;
+ }
+
+ rc = ksocknal_recv_hello(ni, conn, hello, &peerid, &incarnation);
+ if (rc < 0)
+ goto failed_1;
+
+ LASSERT(rc == 0 || active);
+ LASSERT(conn->ksnc_proto != NULL);
+ LASSERT(peerid.nid != LNET_NID_ANY);
+
+ cpt = lnet_cpt_of_nid(peerid.nid);
+
+ if (active) {
+ ksocknal_peer_addref(peer);
+ write_lock_bh(global_lock);
+ } else {
+ rc = ksocknal_create_peer(&peer, ni, peerid);
+ if (rc != 0)
+ goto failed_1;
+
+ write_lock_bh(global_lock);
+
+ /* called with a ref on ni, so shutdown can't have started */
+ LASSERT(((ksock_net_t *) ni->ni_data)->ksnn_shutdown == 0);
+
+ peer2 = ksocknal_find_peer_locked(ni, peerid);
+ if (peer2 == NULL) {
+ /* NB this puts an "empty" peer in the peer
+ * table (which takes my ref) */
+ list_add_tail(&peer->ksnp_list,
+ ksocknal_nid2peerlist(peerid.nid));
+ } else {
+ ksocknal_peer_decref(peer);
+ peer = peer2;
+ }
+
+ /* +1 ref for me */
+ ksocknal_peer_addref(peer);
+ peer->ksnp_accepting++;
+
+ /* Am I already connecting to this guy? Resolve in
+ * favour of higher NID... */
+ if (peerid.nid < ni->ni_nid &&
+ ksocknal_connecting(peer, conn->ksnc_ipaddr)) {
+ rc = EALREADY;
+ warn = "connection race resolution";
+ goto failed_2;
+ }
+ }
+
+ if (peer->ksnp_closing ||
+ (active && route->ksnr_deleted)) {
+ /* peer/route got closed under me */
+ rc = -ESTALE;
+ warn = "peer/route removed";
+ goto failed_2;
+ }
+
+ if (peer->ksnp_proto == NULL) {
+ /* Never connected before.
+ * NB recv_hello may have returned EPROTO to signal my peer
+ * wants a different protocol than the one I asked for.
+ */
+ LASSERT(list_empty(&peer->ksnp_conns));
+
+ peer->ksnp_proto = conn->ksnc_proto;
+ peer->ksnp_incarnation = incarnation;
+ }
+
+ if (peer->ksnp_proto != conn->ksnc_proto ||
+ peer->ksnp_incarnation != incarnation) {
+ /* Peer rebooted or I've got the wrong protocol version */
+ ksocknal_close_peer_conns_locked(peer, 0, 0);
+
+ peer->ksnp_proto = NULL;
+ rc = ESTALE;
+ warn = peer->ksnp_incarnation != incarnation ?
+ "peer rebooted" :
+ "wrong proto version";
+ goto failed_2;
+ }
+
+ switch (rc) {
+ default:
+ LBUG();
+ case 0:
+ break;
+ case EALREADY:
+ warn = "lost conn race";
+ goto failed_2;
+ case EPROTO:
+ warn = "retry with different protocol version";
+ goto failed_2;
+ }
+
+ /* Refuse to duplicate an existing connection, unless this is a
+ * loopback connection */
+ if (conn->ksnc_ipaddr != conn->ksnc_myipaddr) {
+ list_for_each(tmp, &peer->ksnp_conns) {
+ conn2 = list_entry(tmp, ksock_conn_t, ksnc_list);
+
+ if (conn2->ksnc_ipaddr != conn->ksnc_ipaddr ||
+ conn2->ksnc_myipaddr != conn->ksnc_myipaddr ||
+ conn2->ksnc_type != conn->ksnc_type)
+ continue;
+
+ /* Reply on a passive connection attempt so the peer
+ * realises we're connected. */
+ LASSERT(rc == 0);
+ if (!active)
+ rc = EALREADY;
+
+ warn = "duplicate";
+ goto failed_2;
+ }
+ }
+
+ /* If the connection created by this route didn't bind to the IP
+ * address the route connected to, the connection/route matching
+ * code below probably isn't going to work. */
+ if (active &&
+ route->ksnr_ipaddr != conn->ksnc_ipaddr) {
+ CERROR("Route %s %pI4h connected to %pI4h\n",
+ libcfs_id2str(peer->ksnp_id),
+ &route->ksnr_ipaddr,
+ &conn->ksnc_ipaddr);
+ }
+
+ /* Search for a route corresponding to the new connection and
+ * create an association. This allows incoming connections created
+ * by routes in my peer to match my own route entries so I don't
+ * continually create duplicate routes. */
+ list_for_each(tmp, &peer->ksnp_routes) {
+ route = list_entry(tmp, ksock_route_t, ksnr_list);
+
+ if (route->ksnr_ipaddr != conn->ksnc_ipaddr)
+ continue;
+
+ ksocknal_associate_route_conn_locked(route, conn);
+ break;
+ }
+
+ conn->ksnc_peer = peer; /* conn takes my ref on peer */
+ peer->ksnp_last_alive = cfs_time_current();
+ peer->ksnp_send_keepalive = 0;
+ peer->ksnp_error = 0;
+
+ sched = ksocknal_choose_scheduler_locked(cpt);
+ sched->kss_nconns++;
+ conn->ksnc_scheduler = sched;
+
+ conn->ksnc_tx_last_post = cfs_time_current();
+ /* Set the deadline for the outgoing HELLO to drain */
+ conn->ksnc_tx_bufnob = sock->sk->sk_wmem_queued;
+ conn->ksnc_tx_deadline = cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+ mb(); /* order with adding to peer's conn list */
+
+ list_add(&conn->ksnc_list, &peer->ksnp_conns);
+ ksocknal_conn_addref(conn);
+
+ ksocknal_new_packet(conn, 0);
+
+ conn->ksnc_zc_capable = ksocknal_lib_zc_capable(conn);
+
+ /* Take packets blocking for this connection. */
+ list_for_each_entry_safe(tx, txtmp, &peer->ksnp_tx_queue, tx_list) {
+ if (conn->ksnc_proto->pro_match_tx(conn, tx, tx->tx_nonblk) == SOCKNAL_MATCH_NO)
+ continue;
+
+ list_del(&tx->tx_list);
+ ksocknal_queue_tx_locked(tx, conn);
+ }
+
+ write_unlock_bh(global_lock);
+
+ /* We've now got a new connection. Any errors from here on are just
+ * like "normal" comms errors and we close the connection normally.
+ * NB (a) we still have to send the reply HELLO for passive
+ * connections,
+ * (b) normal I/O on the conn is blocked until I setup and call the
+ * socket callbacks.
+ */
+
+ CDEBUG(D_NET, "New conn %s p %d.x %pI4h -> %pI4h/%d incarnation:%lld sched[%d:%d]\n",
+ libcfs_id2str(peerid), conn->ksnc_proto->pro_version,
+ &conn->ksnc_myipaddr, &conn->ksnc_ipaddr,
+ conn->ksnc_port, incarnation, cpt,
+ (int)(sched - &sched->kss_info->ksi_scheds[0]));
+
+ if (active) {
+ /* additional routes after interface exchange? */
+ ksocknal_create_routes(peer, conn->ksnc_port,
+ hello->kshm_ips, hello->kshm_nips);
+ } else {
+ hello->kshm_nips = ksocknal_select_ips(peer, hello->kshm_ips,
+ hello->kshm_nips);
+ rc = ksocknal_send_hello(ni, conn, peerid.nid, hello);
+ }
+
+ LIBCFS_FREE(hello, offsetof(ksock_hello_msg_t,
+ kshm_ips[LNET_MAX_INTERFACES]));
+
+ /* setup the socket AFTER I've received hello (it disables
+ * SO_LINGER). I might call back to the acceptor who may want
+ * to send a protocol version response and then close the
+ * socket; this ensures the socket only tears down after the
+ * response has been sent. */
+ if (rc == 0)
+ rc = ksocknal_lib_setup_sock(sock);
+
+ write_lock_bh(global_lock);
+
+ /* NB my callbacks block while I hold ksnd_global_lock */
+ ksocknal_lib_set_callback(sock, conn);
+
+ if (!active)
+ peer->ksnp_accepting--;
+
+ write_unlock_bh(global_lock);
+
+ if (rc != 0) {
+ write_lock_bh(global_lock);
+ if (!conn->ksnc_closing) {
+ /* could be closed by another thread */
+ ksocknal_close_conn_locked(conn, rc);
+ }
+ write_unlock_bh(global_lock);
+ } else if (ksocknal_connsock_addref(conn) == 0) {
+ /* Allow I/O to proceed. */
+ ksocknal_read_callback(conn);
+ ksocknal_write_callback(conn);
+ ksocknal_connsock_decref(conn);
+ }
+
+ ksocknal_connsock_decref(conn);
+ ksocknal_conn_decref(conn);
+ return rc;
+
+ failed_2:
+ if (!peer->ksnp_closing &&
+ list_empty(&peer->ksnp_conns) &&
+ list_empty(&peer->ksnp_routes)) {
+ list_add(&zombies, &peer->ksnp_tx_queue);
+ list_del_init(&peer->ksnp_tx_queue);
+ ksocknal_unlink_peer_locked(peer);
+ }
+
+ write_unlock_bh(global_lock);
+
+ if (warn != NULL) {
+ if (rc < 0)
+ CERROR("Not creating conn %s type %d: %s\n",
+ libcfs_id2str(peerid), conn->ksnc_type, warn);
+ else
+ CDEBUG(D_NET, "Not creating conn %s type %d: %s\n",
+ libcfs_id2str(peerid), conn->ksnc_type, warn);
+ }
+
+ if (!active) {
+ if (rc > 0) {
+ /* Request retry by replying with CONN_NONE
+ * ksnc_proto has been set already */
+ conn->ksnc_type = SOCKLND_CONN_NONE;
+ hello->kshm_nips = 0;
+ ksocknal_send_hello(ni, conn, peerid.nid, hello);
+ }
+
+ write_lock_bh(global_lock);
+ peer->ksnp_accepting--;
+ write_unlock_bh(global_lock);
+ }
+
+ ksocknal_txlist_done(ni, &zombies, 1);
+ ksocknal_peer_decref(peer);
+
+ failed_1:
+ if (hello != NULL)
+ LIBCFS_FREE(hello, offsetof(ksock_hello_msg_t,
+ kshm_ips[LNET_MAX_INTERFACES]));
+
+ LIBCFS_FREE(conn, sizeof(*conn));
+
+ failed_0:
+ libcfs_sock_release(sock);
+ return rc;
+}
+
+void
+ksocknal_close_conn_locked(ksock_conn_t *conn, int error)
+{
+ /* This just does the immmediate housekeeping, and queues the
+ * connection for the reaper to terminate.
+ * Caller holds ksnd_global_lock exclusively in irq context */
+ ksock_peer_t *peer = conn->ksnc_peer;
+ ksock_route_t *route;
+ ksock_conn_t *conn2;
+ struct list_head *tmp;
+
+ LASSERT(peer->ksnp_error == 0);
+ LASSERT(!conn->ksnc_closing);
+ conn->ksnc_closing = 1;
+
+ /* ksnd_deathrow_conns takes over peer's ref */
+ list_del(&conn->ksnc_list);
+
+ route = conn->ksnc_route;
+ if (route != NULL) {
+ /* dissociate conn from route... */
+ LASSERT(!route->ksnr_deleted);
+ LASSERT((route->ksnr_connected & (1 << conn->ksnc_type)) != 0);
+
+ conn2 = NULL;
+ list_for_each(tmp, &peer->ksnp_conns) {
+ conn2 = list_entry(tmp, ksock_conn_t, ksnc_list);
+
+ if (conn2->ksnc_route == route &&
+ conn2->ksnc_type == conn->ksnc_type)
+ break;
+
+ conn2 = NULL;
+ }
+ if (conn2 == NULL)
+ route->ksnr_connected &= ~(1 << conn->ksnc_type);
+
+ conn->ksnc_route = NULL;
+
+#if 0 /* irrelevant with only eager routes */
+ /* make route least favourite */
+ list_del(&route->ksnr_list);
+ list_add_tail(&route->ksnr_list, &peer->ksnp_routes);
+#endif
+ ksocknal_route_decref(route); /* drop conn's ref on route */
+ }
+
+ if (list_empty(&peer->ksnp_conns)) {
+ /* No more connections to this peer */
+
+ if (!list_empty(&peer->ksnp_tx_queue)) {
+ ksock_tx_t *tx;
+
+ LASSERT(conn->ksnc_proto == &ksocknal_protocol_v3x);
+
+ /* throw them to the last connection...,
+ * these TXs will be send to /dev/null by scheduler */
+ list_for_each_entry(tx, &peer->ksnp_tx_queue,
+ tx_list)
+ ksocknal_tx_prep(conn, tx);
+
+ spin_lock_bh(&conn->ksnc_scheduler->kss_lock);
+ list_splice_init(&peer->ksnp_tx_queue,
+ &conn->ksnc_tx_queue);
+ spin_unlock_bh(&conn->ksnc_scheduler->kss_lock);
+ }
+
+ peer->ksnp_proto = NULL; /* renegotiate protocol version */
+ peer->ksnp_error = error; /* stash last conn close reason */
+
+ if (list_empty(&peer->ksnp_routes)) {
+ /* I've just closed last conn belonging to a
+ * peer with no routes to it */
+ ksocknal_unlink_peer_locked(peer);
+ }
+ }
+
+ spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+ list_add_tail(&conn->ksnc_list,
+ &ksocknal_data.ksnd_deathrow_conns);
+ wake_up(&ksocknal_data.ksnd_reaper_waitq);
+
+ spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
+}
+
+void
+ksocknal_peer_failed(ksock_peer_t *peer)
+{
+ int notify = 0;
+ unsigned long last_alive = 0;
+
+ /* There has been a connection failure or comms error; but I'll only
+ * tell LNET I think the peer is dead if it's to another kernel and
+ * there are no connections or connection attempts in existence. */
+
+ read_lock(&ksocknal_data.ksnd_global_lock);
+
+ if ((peer->ksnp_id.pid & LNET_PID_USERFLAG) == 0 &&
+ list_empty(&peer->ksnp_conns) &&
+ peer->ksnp_accepting == 0 &&
+ ksocknal_find_connecting_route_locked(peer) == NULL) {
+ notify = 1;
+ last_alive = peer->ksnp_last_alive;
+ }
+
+ read_unlock(&ksocknal_data.ksnd_global_lock);
+
+ if (notify)
+ lnet_notify(peer->ksnp_ni, peer->ksnp_id.nid, 0,
+ last_alive);
+}
+
+void
+ksocknal_finalize_zcreq(ksock_conn_t *conn)
+{
+ ksock_peer_t *peer = conn->ksnc_peer;
+ ksock_tx_t *tx;
+ ksock_tx_t *tmp;
+ LIST_HEAD(zlist);
+
+ /* NB safe to finalize TXs because closing of socket will
+ * abort all buffered data */
+ LASSERT(conn->ksnc_sock == NULL);
+
+ spin_lock(&peer->ksnp_lock);
+
+ list_for_each_entry_safe(tx, tmp, &peer->ksnp_zc_req_list, tx_zc_list) {
+ if (tx->tx_conn != conn)
+ continue;
+
+ LASSERT(tx->tx_msg.ksm_zc_cookies[0] != 0);
+
+ tx->tx_msg.ksm_zc_cookies[0] = 0;
+ tx->tx_zc_aborted = 1; /* mark it as not-acked */
+ list_del(&tx->tx_zc_list);
+ list_add(&tx->tx_zc_list, &zlist);
+ }
+
+ spin_unlock(&peer->ksnp_lock);
+
+ while (!list_empty(&zlist)) {
+ tx = list_entry(zlist.next, ksock_tx_t, tx_zc_list);
+
+ list_del(&tx->tx_zc_list);
+ ksocknal_tx_decref(tx);
+ }
+}
+
+void
+ksocknal_terminate_conn(ksock_conn_t *conn)
+{
+ /* This gets called by the reaper (guaranteed thread context) to
+ * disengage the socket from its callbacks and close it.
+ * ksnc_refcount will eventually hit zero, and then the reaper will
+ * destroy it. */
+ ksock_peer_t *peer = conn->ksnc_peer;
+ ksock_sched_t *sched = conn->ksnc_scheduler;
+ int failed = 0;
+
+ LASSERT(conn->ksnc_closing);
+
+ /* wake up the scheduler to "send" all remaining packets to /dev/null */
+ spin_lock_bh(&sched->kss_lock);
+
+ /* a closing conn is always ready to tx */
+ conn->ksnc_tx_ready = 1;
+
+ if (!conn->ksnc_tx_scheduled &&
+ !list_empty(&conn->ksnc_tx_queue)) {
+ list_add_tail(&conn->ksnc_tx_list,
+ &sched->kss_tx_conns);
+ conn->ksnc_tx_scheduled = 1;
+ /* extra ref for scheduler */
+ ksocknal_conn_addref(conn);
+
+ wake_up(&sched->kss_waitq);
+ }
+
+ spin_unlock_bh(&sched->kss_lock);
+
+ /* serialise with callbacks */
+ write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+ ksocknal_lib_reset_callback(conn->ksnc_sock, conn);
+
+ /* OK, so this conn may not be completely disengaged from its
+ * scheduler yet, but it _has_ committed to terminate... */
+ conn->ksnc_scheduler->kss_nconns--;
+
+ if (peer->ksnp_error != 0) {
+ /* peer's last conn closed in error */
+ LASSERT(list_empty(&peer->ksnp_conns));
+ failed = 1;
+ peer->ksnp_error = 0; /* avoid multiple notifications */
+ }
+
+ write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+ if (failed)
+ ksocknal_peer_failed(peer);
+
+ /* The socket is closed on the final put; either here, or in
+ * ksocknal_{send,recv}msg(). Since we set up the linger2 option
+ * when the connection was established, this will close the socket
+ * immediately, aborting anything buffered in it. Any hung
+ * zero-copy transmits will therefore complete in finite time. */
+ ksocknal_connsock_decref(conn);
+}
+
+void
+ksocknal_queue_zombie_conn(ksock_conn_t *conn)
+{
+ /* Queue the conn for the reaper to destroy */
+
+ LASSERT(atomic_read(&conn->ksnc_conn_refcount) == 0);
+ spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+ list_add_tail(&conn->ksnc_list, &ksocknal_data.ksnd_zombie_conns);
+ wake_up(&ksocknal_data.ksnd_reaper_waitq);
+
+ spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
+}
+
+void
+ksocknal_destroy_conn(ksock_conn_t *conn)
+{
+ unsigned long last_rcv;
+
+ /* Final coup-de-grace of the reaper */
+ CDEBUG(D_NET, "connection %p\n", conn);
+
+ LASSERT(atomic_read(&conn->ksnc_conn_refcount) == 0);
+ LASSERT(atomic_read(&conn->ksnc_sock_refcount) == 0);
+ LASSERT(conn->ksnc_sock == NULL);
+ LASSERT(conn->ksnc_route == NULL);
+ LASSERT(!conn->ksnc_tx_scheduled);
+ LASSERT(!conn->ksnc_rx_scheduled);
+ LASSERT(list_empty(&conn->ksnc_tx_queue));
+
+ /* complete current receive if any */
+ switch (conn->ksnc_rx_state) {
+ case SOCKNAL_RX_LNET_PAYLOAD:
+ last_rcv = conn->ksnc_rx_deadline -
+ cfs_time_seconds(*ksocknal_tunables.ksnd_timeout);
+ CERROR("Completing partial receive from %s[%d], ip %pI4h:%d, with error, wanted: %d, left: %d, last alive is %ld secs ago\n",
+ libcfs_id2str(conn->ksnc_peer->ksnp_id), conn->ksnc_type,
+ &conn->ksnc_ipaddr, conn->ksnc_port,
+ conn->ksnc_rx_nob_wanted, conn->ksnc_rx_nob_left,
+ cfs_duration_sec(cfs_time_sub(cfs_time_current(),
+ last_rcv)));
+ lnet_finalize(conn->ksnc_peer->ksnp_ni,
+ conn->ksnc_cookie, -EIO);
+ break;
+ case SOCKNAL_RX_LNET_HEADER:
+ if (conn->ksnc_rx_started)
+ CERROR("Incomplete receive of lnet header from %s, ip %pI4h:%d, with error, protocol: %d.x.\n",
+ libcfs_id2str(conn->ksnc_peer->ksnp_id),
+ &conn->ksnc_ipaddr, conn->ksnc_port,
+ conn->ksnc_proto->pro_version);
+ break;
+ case SOCKNAL_RX_KSM_HEADER:
+ if (conn->ksnc_rx_started)
+ CERROR("Incomplete receive of ksock message from %s, ip %pI4h:%d, with error, protocol: %d.x.\n",
+ libcfs_id2str(conn->ksnc_peer->ksnp_id),
+ &conn->ksnc_ipaddr, conn->ksnc_port,
+ conn->ksnc_proto->pro_version);
+ break;
+ case SOCKNAL_RX_SLOP:
+ if (conn->ksnc_rx_started)
+ CERROR("Incomplete receive of slops from %s, ip %pI4h:%d, with error\n",
+ libcfs_id2str(conn->ksnc_peer->ksnp_id),
+ &conn->ksnc_ipaddr, conn->ksnc_port);
+ break;
+ default:
+ LBUG();
+ break;
+ }
+
+ ksocknal_peer_decref(conn->ksnc_peer);
+
+ LIBCFS_FREE(conn, sizeof(*conn));
+}
+
+int
+ksocknal_close_peer_conns_locked(ksock_peer_t *peer, __u32 ipaddr, int why)
+{
+ ksock_conn_t *conn;
+ struct list_head *ctmp;
+ struct list_head *cnxt;
+ int count = 0;
+
+ list_for_each_safe(ctmp, cnxt, &peer->ksnp_conns) {
+ conn = list_entry(ctmp, ksock_conn_t, ksnc_list);
+
+ if (ipaddr == 0 ||
+ conn->ksnc_ipaddr == ipaddr) {
+ count++;
+ ksocknal_close_conn_locked(conn, why);
+ }
+ }
+
+ return count;
+}
+
+int
+ksocknal_close_conn_and_siblings(ksock_conn_t *conn, int why)
+{
+ ksock_peer_t *peer = conn->ksnc_peer;
+ __u32 ipaddr = conn->ksnc_ipaddr;
+ int count;
+
+ write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+ count = ksocknal_close_peer_conns_locked(peer, ipaddr, why);
+
+ write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+ return count;
+}
+
+int
+ksocknal_close_matching_conns(lnet_process_id_t id, __u32 ipaddr)
+{
+ ksock_peer_t *peer;
+ struct list_head *ptmp;
+ struct list_head *pnxt;
+ int lo;
+ int hi;
+ int i;
+ int count = 0;
+
+ write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+ if (id.nid != LNET_NID_ANY)
+ lo = hi = (int)(ksocknal_nid2peerlist(id.nid) - ksocknal_data.ksnd_peers);
+ else {
+ lo = 0;
+ hi = ksocknal_data.ksnd_peer_hash_size - 1;
+ }
+
+ for (i = lo; i <= hi; i++) {
+ list_for_each_safe(ptmp, pnxt,
+ &ksocknal_data.ksnd_peers[i]) {
+
+ peer = list_entry(ptmp, ksock_peer_t, ksnp_list);
+
+ if (!((id.nid == LNET_NID_ANY || id.nid == peer->ksnp_id.nid) &&
+ (id.pid == LNET_PID_ANY || id.pid == peer->ksnp_id.pid)))
+ continue;
+
+ count += ksocknal_close_peer_conns_locked(peer, ipaddr, 0);
+ }
+ }
+
+ write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+ /* wildcards always succeed */
+ if (id.nid == LNET_NID_ANY || id.pid == LNET_PID_ANY || ipaddr == 0)
+ return 0;
+
+ if (count == 0)
+ return -ENOENT;
+ else
+ return 0;
+}
+
+void
+ksocknal_notify(lnet_ni_t *ni, lnet_nid_t gw_nid, int alive)
+{
+ /* The router is telling me she's been notified of a change in
+ * gateway state.... */
+ lnet_process_id_t id = {0};
+
+ id.nid = gw_nid;
+ id.pid = LNET_PID_ANY;
+
+ CDEBUG(D_NET, "gw %s %s\n", libcfs_nid2str(gw_nid),
+ alive ? "up" : "down");
+
+ if (!alive) {
+ /* If the gateway crashed, close all open connections... */
+ ksocknal_close_matching_conns(id, 0);
+ return;
+ }
+
+ /* ...otherwise do nothing. We can only establish new connections
+ * if we have autroutes, and these connect on demand. */
+}
+
+void
+ksocknal_query(lnet_ni_t *ni, lnet_nid_t nid, unsigned long *when)
+{
+ int connect = 1;
+ unsigned long last_alive = 0;
+ unsigned long now = cfs_time_current();
+ ksock_peer_t *peer = NULL;
+ rwlock_t *glock = &ksocknal_data.ksnd_global_lock;
+ lnet_process_id_t id = {.nid = nid, .pid = LUSTRE_SRV_LNET_PID};
+
+ read_lock(glock);
+
+ peer = ksocknal_find_peer_locked(ni, id);
+ if (peer != NULL) {
+ struct list_head *tmp;
+ ksock_conn_t *conn;
+ int bufnob;
+
+ list_for_each(tmp, &peer->ksnp_conns) {
+ conn = list_entry(tmp, ksock_conn_t, ksnc_list);
+ bufnob = conn->ksnc_sock->sk->sk_wmem_queued;
+
+ if (bufnob < conn->ksnc_tx_bufnob) {
+ /* something got ACKed */
+ conn->ksnc_tx_deadline =
+ cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+ peer->ksnp_last_alive = now;
+ conn->ksnc_tx_bufnob = bufnob;
+ }
+ }
+
+ last_alive = peer->ksnp_last_alive;
+ if (ksocknal_find_connectable_route_locked(peer) == NULL)
+ connect = 0;
+ }
+
+ read_unlock(glock);
+
+ if (last_alive != 0)
+ *when = last_alive;
+
+ CDEBUG(D_NET, "Peer %s %p, alive %ld secs ago, connect %d\n",
+ libcfs_nid2str(nid), peer,
+ last_alive ? cfs_duration_sec(now - last_alive) : -1,
+ connect);
+
+ if (!connect)
+ return;
+
+ ksocknal_add_peer(ni, id, LNET_NIDADDR(nid), lnet_acceptor_port());
+
+ write_lock_bh(glock);
+
+ peer = ksocknal_find_peer_locked(ni, id);
+ if (peer != NULL)
+ ksocknal_launch_all_connections_locked(peer);
+
+ write_unlock_bh(glock);
+ return;
+}
+
+static void
+ksocknal_push_peer(ksock_peer_t *peer)
+{
+ int index;
+ int i;
+ struct list_head *tmp;
+ ksock_conn_t *conn;
+
+ for (index = 0; ; index++) {
+ read_lock(&ksocknal_data.ksnd_global_lock);
+
+ i = 0;
+ conn = NULL;
+
+ list_for_each(tmp, &peer->ksnp_conns) {
+ if (i++ == index) {
+ conn = list_entry(tmp, ksock_conn_t,
+ ksnc_list);
+ ksocknal_conn_addref(conn);
+ break;
+ }
+ }
+
+ read_unlock(&ksocknal_data.ksnd_global_lock);
+
+ if (conn == NULL)
+ break;
+
+ ksocknal_lib_push_conn(conn);
+ ksocknal_conn_decref(conn);
+ }
+}
+
+static int
+ksocknal_push(lnet_ni_t *ni, lnet_process_id_t id)
+{
+ ksock_peer_t *peer;
+ struct list_head *tmp;
+ int index;
+ int i;
+ int j;
+ int rc = -ENOENT;
+
+ for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
+ for (j = 0; ; j++) {
+ read_lock(&ksocknal_data.ksnd_global_lock);
+
+ index = 0;
+ peer = NULL;
+
+ list_for_each(tmp, &ksocknal_data.ksnd_peers[i]) {
+ peer = list_entry(tmp, ksock_peer_t,
+ ksnp_list);
+
+ if (!((id.nid == LNET_NID_ANY ||
+ id.nid == peer->ksnp_id.nid) &&
+ (id.pid == LNET_PID_ANY ||
+ id.pid == peer->ksnp_id.pid))) {
+ peer = NULL;
+ continue;
+ }
+
+ if (index++ == j) {
+ ksocknal_peer_addref(peer);
+ break;
+ }
+ }
+
+ read_unlock(&ksocknal_data.ksnd_global_lock);
+
+ if (peer != NULL) {
+ rc = 0;
+ ksocknal_push_peer(peer);
+ ksocknal_peer_decref(peer);
+ }
+ }
+
+ }
+
+ return rc;
+}
+
+static int
+ksocknal_add_interface(lnet_ni_t *ni, __u32 ipaddress, __u32 netmask)
+{
+ ksock_net_t *net = ni->ni_data;
+ ksock_interface_t *iface;
+ int rc;
+ int i;
+ int j;
+ struct list_head *ptmp;
+ ksock_peer_t *peer;
+ struct list_head *rtmp;
+ ksock_route_t *route;
+
+ if (ipaddress == 0 ||
+ netmask == 0)
+ return -EINVAL;
+
+ write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+ iface = ksocknal_ip2iface(ni, ipaddress);
+ if (iface != NULL) {
+ /* silently ignore dups */
+ rc = 0;
+ } else if (net->ksnn_ninterfaces == LNET_MAX_INTERFACES) {
+ rc = -ENOSPC;
+ } else {
+ iface = &net->ksnn_interfaces[net->ksnn_ninterfaces++];
+
+ iface->ksni_ipaddr = ipaddress;
+ iface->ksni_netmask = netmask;
+ iface->ksni_nroutes = 0;
+ iface->ksni_npeers = 0;
+
+ for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
+ list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) {
+ peer = list_entry(ptmp, ksock_peer_t,
+ ksnp_list);
+
+ for (j = 0; j < peer->ksnp_n_passive_ips; j++)
+ if (peer->ksnp_passive_ips[j] == ipaddress)
+ iface->ksni_npeers++;
+
+ list_for_each(rtmp, &peer->ksnp_routes) {
+ route = list_entry(rtmp,
+ ksock_route_t,
+ ksnr_list);
+
+ if (route->ksnr_myipaddr == ipaddress)
+ iface->ksni_nroutes++;
+ }
+ }
+ }
+
+ rc = 0;
+ /* NB only new connections will pay attention to the new interface! */
+ }
+
+ write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+ return rc;
+}
+
+static void
+ksocknal_peer_del_interface_locked(ksock_peer_t *peer, __u32 ipaddr)
+{
+ struct list_head *tmp;
+ struct list_head *nxt;
+ ksock_route_t *route;
+ ksock_conn_t *conn;
+ int i;
+ int j;
+
+ for (i = 0; i < peer->ksnp_n_passive_ips; i++)
+ if (peer->ksnp_passive_ips[i] == ipaddr) {
+ for (j = i+1; j < peer->ksnp_n_passive_ips; j++)
+ peer->ksnp_passive_ips[j-1] =
+ peer->ksnp_passive_ips[j];
+ peer->ksnp_n_passive_ips--;
+ break;
+ }
+
+ list_for_each_safe(tmp, nxt, &peer->ksnp_routes) {
+ route = list_entry(tmp, ksock_route_t, ksnr_list);
+
+ if (route->ksnr_myipaddr != ipaddr)
+ continue;
+
+ if (route->ksnr_share_count != 0) {
+ /* Manually created; keep, but unbind */
+ route->ksnr_myipaddr = 0;
+ } else {
+ ksocknal_del_route_locked(route);
+ }
+ }
+
+ list_for_each_safe(tmp, nxt, &peer->ksnp_conns) {
+ conn = list_entry(tmp, ksock_conn_t, ksnc_list);
+
+ if (conn->ksnc_myipaddr == ipaddr)
+ ksocknal_close_conn_locked(conn, 0);
+ }
+}
+
+static int
+ksocknal_del_interface(lnet_ni_t *ni, __u32 ipaddress)
+{
+ ksock_net_t *net = ni->ni_data;
+ int rc = -ENOENT;
+ struct list_head *tmp;
+ struct list_head *nxt;
+ ksock_peer_t *peer;
+ __u32 this_ip;
+ int i;
+ int j;
+
+ write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+ for (i = 0; i < net->ksnn_ninterfaces; i++) {
+ this_ip = net->ksnn_interfaces[i].ksni_ipaddr;
+
+ if (!(ipaddress == 0 ||
+ ipaddress == this_ip))
+ continue;
+
+ rc = 0;
+
+ for (j = i+1; j < net->ksnn_ninterfaces; j++)
+ net->ksnn_interfaces[j-1] =
+ net->ksnn_interfaces[j];
+
+ net->ksnn_ninterfaces--;
+
+ for (j = 0; j < ksocknal_data.ksnd_peer_hash_size; j++) {
+ list_for_each_safe(tmp, nxt,
+ &ksocknal_data.ksnd_peers[j]) {
+ peer = list_entry(tmp, ksock_peer_t,
+ ksnp_list);
+
+ if (peer->ksnp_ni != ni)
+ continue;
+
+ ksocknal_peer_del_interface_locked(peer, this_ip);
+ }
+ }
+ }
+
+ write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+ return rc;
+}
+
+int
+ksocknal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
+{
+ lnet_process_id_t id = {0};
+ struct libcfs_ioctl_data *data = arg;
+ int rc;
+
+ switch (cmd) {
+ case IOC_LIBCFS_GET_INTERFACE: {
+ ksock_net_t *net = ni->ni_data;
+ ksock_interface_t *iface;
+
+ read_lock(&ksocknal_data.ksnd_global_lock);
+
+ if (data->ioc_count >= (__u32)net->ksnn_ninterfaces) {
+ rc = -ENOENT;
+ } else {
+ rc = 0;
+ iface = &net->ksnn_interfaces[data->ioc_count];
+
+ data->ioc_u32[0] = iface->ksni_ipaddr;
+ data->ioc_u32[1] = iface->ksni_netmask;
+ data->ioc_u32[2] = iface->ksni_npeers;
+ data->ioc_u32[3] = iface->ksni_nroutes;
+ }
+
+ read_unlock(&ksocknal_data.ksnd_global_lock);
+ return rc;
+ }
+
+ case IOC_LIBCFS_ADD_INTERFACE:
+ return ksocknal_add_interface(ni,
+ data->ioc_u32[0], /* IP address */
+ data->ioc_u32[1]); /* net mask */
+
+ case IOC_LIBCFS_DEL_INTERFACE:
+ return ksocknal_del_interface(ni,
+ data->ioc_u32[0]); /* IP address */
+
+ case IOC_LIBCFS_GET_PEER: {
+ __u32 myip = 0;
+ __u32 ip = 0;
+ int port = 0;
+ int conn_count = 0;
+ int share_count = 0;
+
+ rc = ksocknal_get_peer_info(ni, data->ioc_count,
+ &id, &myip, &ip, &port,
+ &conn_count, &share_count);
+ if (rc != 0)
+ return rc;
+
+ data->ioc_nid = id.nid;
+ data->ioc_count = share_count;
+ data->ioc_u32[0] = ip;
+ data->ioc_u32[1] = port;
+ data->ioc_u32[2] = myip;
+ data->ioc_u32[3] = conn_count;
+ data->ioc_u32[4] = id.pid;
+ return 0;
+ }
+
+ case IOC_LIBCFS_ADD_PEER:
+ id.nid = data->ioc_nid;
+ id.pid = LUSTRE_SRV_LNET_PID;
+ return ksocknal_add_peer(ni, id,
+ data->ioc_u32[0], /* IP */
+ data->ioc_u32[1]); /* port */
+
+ case IOC_LIBCFS_DEL_PEER:
+ id.nid = data->ioc_nid;
+ id.pid = LNET_PID_ANY;
+ return ksocknal_del_peer(ni, id,
+ data->ioc_u32[0]); /* IP */
+
+ case IOC_LIBCFS_GET_CONN: {
+ int txmem;
+ int rxmem;
+ int nagle;
+ ksock_conn_t *conn = ksocknal_get_conn_by_idx(ni, data->ioc_count);
+
+ if (conn == NULL)
+ return -ENOENT;
+
+ ksocknal_lib_get_conn_tunables(conn, &txmem, &rxmem, &nagle);
+
+ data->ioc_count = txmem;
+ data->ioc_nid = conn->ksnc_peer->ksnp_id.nid;
+ data->ioc_flags = nagle;
+ data->ioc_u32[0] = conn->ksnc_ipaddr;
+ data->ioc_u32[1] = conn->ksnc_port;
+ data->ioc_u32[2] = conn->ksnc_myipaddr;
+ data->ioc_u32[3] = conn->ksnc_type;
+ data->ioc_u32[4] = conn->ksnc_scheduler->kss_info->ksi_cpt;
+ data->ioc_u32[5] = rxmem;
+ data->ioc_u32[6] = conn->ksnc_peer->ksnp_id.pid;
+ ksocknal_conn_decref(conn);
+ return 0;
+ }
+
+ case IOC_LIBCFS_CLOSE_CONNECTION:
+ id.nid = data->ioc_nid;
+ id.pid = LNET_PID_ANY;
+ return ksocknal_close_matching_conns(id,
+ data->ioc_u32[0]);
+
+ case IOC_LIBCFS_REGISTER_MYNID:
+ /* Ignore if this is a noop */
+ if (data->ioc_nid == ni->ni_nid)
+ return 0;
+
+ CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n",
+ libcfs_nid2str(data->ioc_nid),
+ libcfs_nid2str(ni->ni_nid));
+ return -EINVAL;
+
+ case IOC_LIBCFS_PUSH_CONNECTION:
+ id.nid = data->ioc_nid;
+ id.pid = LNET_PID_ANY;
+ return ksocknal_push(ni, id);
+
+ default:
+ return -EINVAL;
+ }
+ /* not reached */
+}
+
+static void
+ksocknal_free_buffers(void)
+{
+ LASSERT(atomic_read(&ksocknal_data.ksnd_nactive_txs) == 0);
+
+ if (ksocknal_data.ksnd_sched_info != NULL) {
+ struct ksock_sched_info *info;
+ int i;
+
+ cfs_percpt_for_each(info, i, ksocknal_data.ksnd_sched_info) {
+ if (info->ksi_scheds != NULL) {
+ LIBCFS_FREE(info->ksi_scheds,
+ info->ksi_nthreads_max *
+ sizeof(info->ksi_scheds[0]));
+ }
+ }
+ cfs_percpt_free(ksocknal_data.ksnd_sched_info);
+ }
+
+ LIBCFS_FREE(ksocknal_data.ksnd_peers,
+ sizeof(struct list_head) *
+ ksocknal_data.ksnd_peer_hash_size);
+
+ spin_lock(&ksocknal_data.ksnd_tx_lock);
+
+ if (!list_empty(&ksocknal_data.ksnd_idle_noop_txs)) {
+ struct list_head zlist;
+ ksock_tx_t *tx;
+
+ list_add(&zlist, &ksocknal_data.ksnd_idle_noop_txs);
+ list_del_init(&ksocknal_data.ksnd_idle_noop_txs);
+ spin_unlock(&ksocknal_data.ksnd_tx_lock);
+
+ while (!list_empty(&zlist)) {
+ tx = list_entry(zlist.next, ksock_tx_t, tx_list);
+ list_del(&tx->tx_list);
+ LIBCFS_FREE(tx, tx->tx_desc_size);
+ }
+ } else {
+ spin_unlock(&ksocknal_data.ksnd_tx_lock);
+ }
+}
+
+static void
+ksocknal_base_shutdown(void)
+{
+ struct ksock_sched_info *info;
+ ksock_sched_t *sched;
+ int i;
+ int j;
+
+ CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
+ atomic_read(&libcfs_kmemory));
+ LASSERT(ksocknal_data.ksnd_nnets == 0);
+
+ switch (ksocknal_data.ksnd_init) {
+ default:
+ LASSERT(0);
+
+ case SOCKNAL_INIT_ALL:
+ case SOCKNAL_INIT_DATA:
+ LASSERT(ksocknal_data.ksnd_peers != NULL);
+ for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
+ LASSERT(list_empty(&ksocknal_data.ksnd_peers[i]));
+ }
+
+ LASSERT(list_empty(&ksocknal_data.ksnd_nets));
+ LASSERT(list_empty(&ksocknal_data.ksnd_enomem_conns));
+ LASSERT(list_empty(&ksocknal_data.ksnd_zombie_conns));
+ LASSERT(list_empty(&ksocknal_data.ksnd_connd_connreqs));
+ LASSERT(list_empty(&ksocknal_data.ksnd_connd_routes));
+
+ if (ksocknal_data.ksnd_sched_info != NULL) {
+ cfs_percpt_for_each(info, i,
+ ksocknal_data.ksnd_sched_info) {
+ if (info->ksi_scheds == NULL)
+ continue;
+
+ for (j = 0; j < info->ksi_nthreads_max; j++) {
+
+ sched = &info->ksi_scheds[j];
+ LASSERT(list_empty(
+ &sched->kss_tx_conns));
+ LASSERT(list_empty(
+ &sched->kss_rx_conns));
+ LASSERT(list_empty(
+ &sched->kss_zombie_noop_txs));
+ LASSERT(sched->kss_nconns == 0);
+ }
+ }
+ }
+
+ /* flag threads to terminate; wake and wait for them to die */
+ ksocknal_data.ksnd_shuttingdown = 1;
+ wake_up_all(&ksocknal_data.ksnd_connd_waitq);
+ wake_up_all(&ksocknal_data.ksnd_reaper_waitq);
+
+ if (ksocknal_data.ksnd_sched_info != NULL) {
+ cfs_percpt_for_each(info, i,
+ ksocknal_data.ksnd_sched_info) {
+ if (info->ksi_scheds == NULL)
+ continue;
+
+ for (j = 0; j < info->ksi_nthreads_max; j++) {
+ sched = &info->ksi_scheds[j];
+ wake_up_all(&sched->kss_waitq);
+ }
+ }
+ }
+
+ i = 4;
+ read_lock(&ksocknal_data.ksnd_global_lock);
+ while (ksocknal_data.ksnd_nthreads != 0) {
+ i++;
+ CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+ "waiting for %d threads to terminate\n",
+ ksocknal_data.ksnd_nthreads);
+ read_unlock(&ksocknal_data.ksnd_global_lock);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(cfs_time_seconds(1));
+ read_lock(&ksocknal_data.ksnd_global_lock);
+ }
+ read_unlock(&ksocknal_data.ksnd_global_lock);
+
+ ksocknal_free_buffers();
+
+ ksocknal_data.ksnd_init = SOCKNAL_INIT_NOTHING;
+ break;
+ }
+
+ CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
+ atomic_read(&libcfs_kmemory));
+
+ module_put(THIS_MODULE);
+}
+
+static __u64
+ksocknal_new_incarnation(void)
+{
+
+ /* The incarnation number is the time this module loaded and it
+ * identifies this particular instance of the socknal.
+ */
+ return ktime_get_ns();
+}
+
+static int
+ksocknal_base_startup(void)
+{
+ struct ksock_sched_info *info;
+ int rc;
+ int i;
+
+ LASSERT(ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING);
+ LASSERT(ksocknal_data.ksnd_nnets == 0);
+
+ memset(&ksocknal_data, 0, sizeof(ksocknal_data)); /* zero pointers */
+
+ ksocknal_data.ksnd_peer_hash_size = SOCKNAL_PEER_HASH_SIZE;
+ LIBCFS_ALLOC(ksocknal_data.ksnd_peers,
+ sizeof(struct list_head) *
+ ksocknal_data.ksnd_peer_hash_size);
+ if (ksocknal_data.ksnd_peers == NULL)
+ return -ENOMEM;
+
+ for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++)
+ INIT_LIST_HEAD(&ksocknal_data.ksnd_peers[i]);
+
+ rwlock_init(&ksocknal_data.ksnd_global_lock);
+ INIT_LIST_HEAD(&ksocknal_data.ksnd_nets);
+
+ spin_lock_init(&ksocknal_data.ksnd_reaper_lock);
+ INIT_LIST_HEAD(&ksocknal_data.ksnd_enomem_conns);
+ INIT_LIST_HEAD(&ksocknal_data.ksnd_zombie_conns);
+ INIT_LIST_HEAD(&ksocknal_data.ksnd_deathrow_conns);
+ init_waitqueue_head(&ksocknal_data.ksnd_reaper_waitq);
+
+ spin_lock_init(&ksocknal_data.ksnd_connd_lock);
+ INIT_LIST_HEAD(&ksocknal_data.ksnd_connd_connreqs);
+ INIT_LIST_HEAD(&ksocknal_data.ksnd_connd_routes);
+ init_waitqueue_head(&ksocknal_data.ksnd_connd_waitq);
+
+ spin_lock_init(&ksocknal_data.ksnd_tx_lock);
+ INIT_LIST_HEAD(&ksocknal_data.ksnd_idle_noop_txs);
+
+ /* NB memset above zeros whole of ksocknal_data */
+
+ /* flag lists/ptrs/locks initialised */
+ ksocknal_data.ksnd_init = SOCKNAL_INIT_DATA;
+ try_module_get(THIS_MODULE);
+
+ ksocknal_data.ksnd_sched_info = cfs_percpt_alloc(lnet_cpt_table(),
+ sizeof(*info));
+ if (ksocknal_data.ksnd_sched_info == NULL)
+ goto failed;
+
+ cfs_percpt_for_each(info, i, ksocknal_data.ksnd_sched_info) {
+ ksock_sched_t *sched;
+ int nthrs;
+
+ nthrs = cfs_cpt_weight(lnet_cpt_table(), i);
+ if (*ksocknal_tunables.ksnd_nscheds > 0) {
+ nthrs = min(nthrs, *ksocknal_tunables.ksnd_nscheds);
+ } else {
+ /* max to half of CPUs, assume another half should be
+ * reserved for upper layer modules */
+ nthrs = min(max(SOCKNAL_NSCHEDS, nthrs >> 1), nthrs);
+ }
+
+ info->ksi_nthreads_max = nthrs;
+ info->ksi_cpt = i;
+
+ LIBCFS_CPT_ALLOC(info->ksi_scheds, lnet_cpt_table(), i,
+ info->ksi_nthreads_max * sizeof(*sched));
+ if (info->ksi_scheds == NULL)
+ goto failed;
+
+ for (; nthrs > 0; nthrs--) {
+ sched = &info->ksi_scheds[nthrs - 1];
+
+ sched->kss_info = info;
+ spin_lock_init(&sched->kss_lock);
+ INIT_LIST_HEAD(&sched->kss_rx_conns);
+ INIT_LIST_HEAD(&sched->kss_tx_conns);
+ INIT_LIST_HEAD(&sched->kss_zombie_noop_txs);
+ init_waitqueue_head(&sched->kss_waitq);
+ }
+ }
+
+ ksocknal_data.ksnd_connd_starting = 0;
+ ksocknal_data.ksnd_connd_failed_stamp = 0;
+ ksocknal_data.ksnd_connd_starting_stamp = get_seconds();
+ /* must have at least 2 connds to remain responsive to accepts while
+ * connecting */
+ if (*ksocknal_tunables.ksnd_nconnds < SOCKNAL_CONND_RESV + 1)
+ *ksocknal_tunables.ksnd_nconnds = SOCKNAL_CONND_RESV + 1;
+
+ if (*ksocknal_tunables.ksnd_nconnds_max <
+ *ksocknal_tunables.ksnd_nconnds) {
+ ksocknal_tunables.ksnd_nconnds_max =
+ ksocknal_tunables.ksnd_nconnds;
+ }
+
+ for (i = 0; i < *ksocknal_tunables.ksnd_nconnds; i++) {
+ char name[16];
+ spin_lock_bh(&ksocknal_data.ksnd_connd_lock);
+ ksocknal_data.ksnd_connd_starting++;
+ spin_unlock_bh(&ksocknal_data.ksnd_connd_lock);
+
+
+ snprintf(name, sizeof(name), "socknal_cd%02d", i);
+ rc = ksocknal_thread_start(ksocknal_connd,
+ (void *)((ulong_ptr_t)i), name);
+ if (rc != 0) {
+ spin_lock_bh(&ksocknal_data.ksnd_connd_lock);
+ ksocknal_data.ksnd_connd_starting--;
+ spin_unlock_bh(&ksocknal_data.ksnd_connd_lock);
+ CERROR("Can't spawn socknal connd: %d\n", rc);
+ goto failed;
+ }
+ }
+
+ rc = ksocknal_thread_start(ksocknal_reaper, NULL, "socknal_reaper");
+ if (rc != 0) {
+ CERROR("Can't spawn socknal reaper: %d\n", rc);
+ goto failed;
+ }
+
+ /* flag everything initialised */
+ ksocknal_data.ksnd_init = SOCKNAL_INIT_ALL;
+
+ return 0;
+
+ failed:
+ ksocknal_base_shutdown();
+ return -ENETDOWN;
+}
+
+static void
+ksocknal_debug_peerhash(lnet_ni_t *ni)
+{
+ ksock_peer_t *peer = NULL;
+ struct list_head *tmp;
+ int i;
+
+ read_lock(&ksocknal_data.ksnd_global_lock);
+
+ for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
+ list_for_each(tmp, &ksocknal_data.ksnd_peers[i]) {
+ peer = list_entry(tmp, ksock_peer_t, ksnp_list);
+
+ if (peer->ksnp_ni == ni)
+ break;
+
+ peer = NULL;
+ }
+ }
+
+ if (peer != NULL) {
+ ksock_route_t *route;
+ ksock_conn_t *conn;
+
+ CWARN("Active peer on shutdown: %s, ref %d, scnt %d, closing %d, accepting %d, err %d, zcookie %llu, txq %d, zc_req %d\n",
+ libcfs_id2str(peer->ksnp_id),
+ atomic_read(&peer->ksnp_refcount),
+ peer->ksnp_sharecount, peer->ksnp_closing,
+ peer->ksnp_accepting, peer->ksnp_error,
+ peer->ksnp_zc_next_cookie,
+ !list_empty(&peer->ksnp_tx_queue),
+ !list_empty(&peer->ksnp_zc_req_list));
+
+ list_for_each(tmp, &peer->ksnp_routes) {
+ route = list_entry(tmp, ksock_route_t, ksnr_list);
+ CWARN("Route: ref %d, schd %d, conn %d, cnted %d, del %d\n",
+ atomic_read(&route->ksnr_refcount),
+ route->ksnr_scheduled, route->ksnr_connecting,
+ route->ksnr_connected, route->ksnr_deleted);
+ }
+
+ list_for_each(tmp, &peer->ksnp_conns) {
+ conn = list_entry(tmp, ksock_conn_t, ksnc_list);
+ CWARN("Conn: ref %d, sref %d, t %d, c %d\n",
+ atomic_read(&conn->ksnc_conn_refcount),
+ atomic_read(&conn->ksnc_sock_refcount),
+ conn->ksnc_type, conn->ksnc_closing);
+ }
+ }
+
+ read_unlock(&ksocknal_data.ksnd_global_lock);
+ return;
+}
+
+void
+ksocknal_shutdown(lnet_ni_t *ni)
+{
+ ksock_net_t *net = ni->ni_data;
+ int i;
+ lnet_process_id_t anyid = {0};
+
+ anyid.nid = LNET_NID_ANY;
+ anyid.pid = LNET_PID_ANY;
+
+ LASSERT(ksocknal_data.ksnd_init == SOCKNAL_INIT_ALL);
+ LASSERT(ksocknal_data.ksnd_nnets > 0);
+
+ spin_lock_bh(&net->ksnn_lock);
+ net->ksnn_shutdown = 1; /* prevent new peers */
+ spin_unlock_bh(&net->ksnn_lock);
+
+ /* Delete all peers */
+ ksocknal_del_peer(ni, anyid, 0);
+
+ /* Wait for all peer state to clean up */
+ i = 2;
+ spin_lock_bh(&net->ksnn_lock);
+ while (net->ksnn_npeers != 0) {
+ spin_unlock_bh(&net->ksnn_lock);
+
+ i++;
+ CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+ "waiting for %d peers to disconnect\n",
+ net->ksnn_npeers);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(cfs_time_seconds(1));
+
+ ksocknal_debug_peerhash(ni);
+
+ spin_lock_bh(&net->ksnn_lock);
+ }
+ spin_unlock_bh(&net->ksnn_lock);
+
+ for (i = 0; i < net->ksnn_ninterfaces; i++) {
+ LASSERT(net->ksnn_interfaces[i].ksni_npeers == 0);
+ LASSERT(net->ksnn_interfaces[i].ksni_nroutes == 0);
+ }
+
+ list_del(&net->ksnn_list);
+ LIBCFS_FREE(net, sizeof(*net));
+
+ ksocknal_data.ksnd_nnets--;
+ if (ksocknal_data.ksnd_nnets == 0)
+ ksocknal_base_shutdown();
+}
+
+static int
+ksocknal_enumerate_interfaces(ksock_net_t *net)
+{
+ char **names;
+ int i;
+ int j;
+ int rc;
+ int n;
+
+ n = libcfs_ipif_enumerate(&names);
+ if (n <= 0) {
+ CERROR("Can't enumerate interfaces: %d\n", n);
+ return n;
+ }
+
+ for (i = j = 0; i < n; i++) {
+ int up;
+ __u32 ip;
+ __u32 mask;
+
+ if (!strcmp(names[i], "lo")) /* skip the loopback IF */
+ continue;
+
+ rc = libcfs_ipif_query(names[i], &up, &ip, &mask);
+ if (rc != 0) {
+ CWARN("Can't get interface %s info: %d\n",
+ names[i], rc);
+ continue;
+ }
+
+ if (!up) {
+ CWARN("Ignoring interface %s (down)\n",
+ names[i]);
+ continue;
+ }
+
+ if (j == LNET_MAX_INTERFACES) {
+ CWARN("Ignoring interface %s (too many interfaces)\n",
+ names[i]);
+ continue;
+ }
+
+ net->ksnn_interfaces[j].ksni_ipaddr = ip;
+ net->ksnn_interfaces[j].ksni_netmask = mask;
+ strncpy(&net->ksnn_interfaces[j].ksni_name[0],
+ names[i], IFNAMSIZ);
+ j++;
+ }
+
+ libcfs_ipif_free_enumeration(names, n);
+
+ if (j == 0)
+ CERROR("Can't find any usable interfaces\n");
+
+ return j;
+}
+
+static int
+ksocknal_search_new_ipif(ksock_net_t *net)
+{
+ int new_ipif = 0;
+ int i;
+
+ for (i = 0; i < net->ksnn_ninterfaces; i++) {
+ char *ifnam = &net->ksnn_interfaces[i].ksni_name[0];
+ char *colon = strchr(ifnam, ':');
+ int found = 0;
+ ksock_net_t *tmp;
+ int j;
+
+ if (colon != NULL) /* ignore alias device */
+ *colon = 0;
+
+ list_for_each_entry(tmp, &ksocknal_data.ksnd_nets,
+ ksnn_list) {
+ for (j = 0; !found && j < tmp->ksnn_ninterfaces; j++) {
+ char *ifnam2 =
+ &tmp->ksnn_interfaces[j].ksni_name[0];
+ char *colon2 = strchr(ifnam2, ':');
+
+ if (colon2 != NULL)
+ *colon2 = 0;
+
+ found = strcmp(ifnam, ifnam2) == 0;
+ if (colon2 != NULL)
+ *colon2 = ':';
+ }
+ if (found)
+ break;
+ }
+
+ new_ipif += !found;
+ if (colon != NULL)
+ *colon = ':';
+ }
+
+ return new_ipif;
+}
+
+static int
+ksocknal_start_schedulers(struct ksock_sched_info *info)
+{
+ int nthrs;
+ int rc = 0;
+ int i;
+
+ if (info->ksi_nthreads == 0) {
+ if (*ksocknal_tunables.ksnd_nscheds > 0) {
+ nthrs = info->ksi_nthreads_max;
+ } else {
+ nthrs = cfs_cpt_weight(lnet_cpt_table(),
+ info->ksi_cpt);
+ nthrs = min(max(SOCKNAL_NSCHEDS, nthrs >> 1), nthrs);
+ nthrs = min(SOCKNAL_NSCHEDS_HIGH, nthrs);
+ }
+ nthrs = min(nthrs, info->ksi_nthreads_max);
+ } else {
+ LASSERT(info->ksi_nthreads <= info->ksi_nthreads_max);
+ /* increase two threads if there is new interface */
+ nthrs = min(2, info->ksi_nthreads_max - info->ksi_nthreads);
+ }
+
+ for (i = 0; i < nthrs; i++) {
+ long id;
+ char name[20];
+ ksock_sched_t *sched;
+ id = KSOCK_THREAD_ID(info->ksi_cpt, info->ksi_nthreads + i);
+ sched = &info->ksi_scheds[KSOCK_THREAD_SID(id)];
+ snprintf(name, sizeof(name), "socknal_sd%02d_%02d",
+ info->ksi_cpt, (int)(sched - &info->ksi_scheds[0]));
+
+ rc = ksocknal_thread_start(ksocknal_scheduler,
+ (void *)id, name);
+ if (rc == 0)
+ continue;
+
+ CERROR("Can't spawn thread %d for scheduler[%d]: %d\n",
+ info->ksi_cpt, info->ksi_nthreads + i, rc);
+ break;
+ }
+
+ info->ksi_nthreads += i;
+ return rc;
+}
+
+static int
+ksocknal_net_start_threads(ksock_net_t *net, __u32 *cpts, int ncpts)
+{
+ int newif = ksocknal_search_new_ipif(net);
+ int rc;
+ int i;
+
+ LASSERT(ncpts > 0 && ncpts <= cfs_cpt_number(lnet_cpt_table()));
+
+ for (i = 0; i < ncpts; i++) {
+ struct ksock_sched_info *info;
+ int cpt = (cpts == NULL) ? i : cpts[i];
+
+ LASSERT(cpt < cfs_cpt_number(lnet_cpt_table()));
+ info = ksocknal_data.ksnd_sched_info[cpt];
+
+ if (!newif && info->ksi_nthreads > 0)
+ continue;
+
+ rc = ksocknal_start_schedulers(info);
+ if (rc != 0)
+ return rc;
+ }
+ return 0;
+}
+
+int
+ksocknal_startup(lnet_ni_t *ni)
+{
+ ksock_net_t *net;
+ int rc;
+ int i;
+
+ LASSERT(ni->ni_lnd == &the_ksocklnd);
+
+ if (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING) {
+ rc = ksocknal_base_startup();
+ if (rc != 0)
+ return rc;
+ }
+
+ LIBCFS_ALLOC(net, sizeof(*net));
+ if (net == NULL)
+ goto fail_0;
+
+ spin_lock_init(&net->ksnn_lock);
+ net->ksnn_incarnation = ksocknal_new_incarnation();
+ ni->ni_data = net;
+ ni->ni_peertimeout = *ksocknal_tunables.ksnd_peertimeout;
+ ni->ni_maxtxcredits = *ksocknal_tunables.ksnd_credits;
+ ni->ni_peertxcredits = *ksocknal_tunables.ksnd_peertxcredits;
+ ni->ni_peerrtrcredits = *ksocknal_tunables.ksnd_peerrtrcredits;
+
+ if (ni->ni_interfaces[0] == NULL) {
+ rc = ksocknal_enumerate_interfaces(net);
+ if (rc <= 0)
+ goto fail_1;
+
+ net->ksnn_ninterfaces = 1;
+ } else {
+ for (i = 0; i < LNET_MAX_INTERFACES; i++) {
+ int up;
+
+ if (ni->ni_interfaces[i] == NULL)
+ break;
+
+ rc = libcfs_ipif_query(
+ ni->ni_interfaces[i], &up,
+ &net->ksnn_interfaces[i].ksni_ipaddr,
+ &net->ksnn_interfaces[i].ksni_netmask);
+
+ if (rc != 0) {
+ CERROR("Can't get interface %s info: %d\n",
+ ni->ni_interfaces[i], rc);
+ goto fail_1;
+ }
+
+ if (!up) {
+ CERROR("Interface %s is down\n",
+ ni->ni_interfaces[i]);
+ goto fail_1;
+ }
+
+ strncpy(&net->ksnn_interfaces[i].ksni_name[0],
+ ni->ni_interfaces[i], IFNAMSIZ);
+ }
+ net->ksnn_ninterfaces = i;
+ }
+
+ /* call it before add it to ksocknal_data.ksnd_nets */
+ rc = ksocknal_net_start_threads(net, ni->ni_cpts, ni->ni_ncpts);
+ if (rc != 0)
+ goto fail_1;
+
+ ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid),
+ net->ksnn_interfaces[0].ksni_ipaddr);
+ list_add(&net->ksnn_list, &ksocknal_data.ksnd_nets);
+
+ ksocknal_data.ksnd_nnets++;
+
+ return 0;
+
+ fail_1:
+ LIBCFS_FREE(net, sizeof(*net));
+ fail_0:
+ if (ksocknal_data.ksnd_nnets == 0)
+ ksocknal_base_shutdown();
+
+ return -ENETDOWN;
+}
+
+
+static void __exit
+ksocknal_module_fini(void)
+{
+ lnet_unregister_lnd(&the_ksocklnd);
+}
+
+static int __init
+ksocknal_module_init(void)
+{
+ int rc;
+
+ /* check ksnr_connected/connecting field large enough */
+ CLASSERT(SOCKLND_CONN_NTYPES <= 4);
+ CLASSERT(SOCKLND_CONN_ACK == SOCKLND_CONN_BULK_IN);
+
+ /* initialize the_ksocklnd */
+ the_ksocklnd.lnd_type = SOCKLND;
+ the_ksocklnd.lnd_startup = ksocknal_startup;
+ the_ksocklnd.lnd_shutdown = ksocknal_shutdown;
+ the_ksocklnd.lnd_ctl = ksocknal_ctl;
+ the_ksocklnd.lnd_send = ksocknal_send;
+ the_ksocklnd.lnd_recv = ksocknal_recv;
+ the_ksocklnd.lnd_notify = ksocknal_notify;
+ the_ksocklnd.lnd_query = ksocknal_query;
+ the_ksocklnd.lnd_accept = ksocknal_accept;
+
+ rc = ksocknal_tunables_init();
+ if (rc != 0)
+ return rc;
+
+ lnet_register_lnd(&the_ksocklnd);
+
+ return 0;
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Kernel TCP Socket LND v3.0.0");
+MODULE_LICENSE("GPL");
+MODULE_VERSION("3.0.0");
+
+module_init(ksocknal_module_init);
+module_exit(ksocknal_module_fini);
diff --git a/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h b/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h
new file mode 100644
index 000000000..c54c99551
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h
@@ -0,0 +1,588 @@
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ * Author: Zach Brown <zab@zabbo.net>
+ * Author: Peter J. Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ * This file is part of Lustre, http://www.lustre.org
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#define DEBUG_PORTAL_ALLOC
+#define DEBUG_SUBSYSTEM S_LND
+
+#include "socklnd_lib-linux.h"
+
+#include "../../../include/linux/libcfs/libcfs.h"
+#include "../../../include/linux/lnet/lnet.h"
+#include "../../../include/linux/lnet/lib-lnet.h"
+#include "../../../include/linux/lnet/socklnd.h"
+#include "../../../include/linux/lnet/lnet-sysctl.h"
+
+#define SOCKNAL_PEER_HASH_SIZE 101 /* # peer lists */
+#define SOCKNAL_RESCHED 100 /* # scheduler loops before reschedule */
+#define SOCKNAL_INSANITY_RECONN 5000 /* connd is trying on reconn infinitely */
+#define SOCKNAL_ENOMEM_RETRY CFS_TICK /* jiffies between retries */
+
+#define SOCKNAL_SINGLE_FRAG_TX 0 /* disable multi-fragment sends */
+#define SOCKNAL_SINGLE_FRAG_RX 0 /* disable multi-fragment receives */
+
+#define SOCKNAL_VERSION_DEBUG 0 /* enable protocol version debugging */
+
+/* risk kmap deadlock on multi-frag I/O (backs off to single-frag if disabled).
+ * no risk if we're not running on a CONFIG_HIGHMEM platform. */
+#ifdef CONFIG_HIGHMEM
+# define SOCKNAL_RISK_KMAP_DEADLOCK 0
+#else
+# define SOCKNAL_RISK_KMAP_DEADLOCK 1
+#endif
+
+struct ksock_sched_info;
+
+typedef struct /* per scheduler state */
+{
+ spinlock_t kss_lock; /* serialise */
+ struct list_head kss_rx_conns; /* conn waiting to be read */
+ /* conn waiting to be written */
+ struct list_head kss_tx_conns;
+ /* zombie noop tx list */
+ struct list_head kss_zombie_noop_txs;
+ wait_queue_head_t kss_waitq; /* where scheduler sleeps */
+ /* # connections assigned to this scheduler */
+ int kss_nconns;
+ struct ksock_sched_info *kss_info; /* owner of it */
+ struct page *kss_rx_scratch_pgs[LNET_MAX_IOV];
+ struct kvec kss_scratch_iov[LNET_MAX_IOV];
+} ksock_sched_t;
+
+struct ksock_sched_info {
+ int ksi_nthreads_max; /* max allowed threads */
+ int ksi_nthreads; /* number of threads */
+ int ksi_cpt; /* CPT id */
+ ksock_sched_t *ksi_scheds; /* array of schedulers */
+};
+
+#define KSOCK_CPT_SHIFT 16
+#define KSOCK_THREAD_ID(cpt, sid) (((cpt) << KSOCK_CPT_SHIFT) | (sid))
+#define KSOCK_THREAD_CPT(id) ((id) >> KSOCK_CPT_SHIFT)
+#define KSOCK_THREAD_SID(id) ((id) & ((1UL << KSOCK_CPT_SHIFT) - 1))
+
+typedef struct /* in-use interface */
+{
+ __u32 ksni_ipaddr; /* interface's IP address */
+ __u32 ksni_netmask; /* interface's network mask */
+ int ksni_nroutes; /* # routes using (active) */
+ int ksni_npeers; /* # peers using (passive) */
+ char ksni_name[IFNAMSIZ]; /* interface name */
+} ksock_interface_t;
+
+typedef struct {
+ /* "stuck" socket timeout (seconds) */
+ int *ksnd_timeout;
+ /* # scheduler threads in each pool while starting */
+ int *ksnd_nscheds;
+ int *ksnd_nconnds; /* # connection daemons */
+ int *ksnd_nconnds_max; /* max # connection daemons */
+ int *ksnd_min_reconnectms; /* first connection retry after (ms)... */
+ int *ksnd_max_reconnectms; /* ...exponentially increasing to this */
+ int *ksnd_eager_ack; /* make TCP ack eagerly? */
+ int *ksnd_typed_conns; /* drive sockets by type? */
+ int *ksnd_min_bulk; /* smallest "large" message */
+ int *ksnd_tx_buffer_size; /* socket tx buffer size */
+ int *ksnd_rx_buffer_size; /* socket rx buffer size */
+ int *ksnd_nagle; /* enable NAGLE? */
+ int *ksnd_round_robin; /* round robin for multiple interfaces */
+ int *ksnd_keepalive; /* # secs for sending keepalive NOOP */
+ int *ksnd_keepalive_idle; /* # idle secs before 1st probe */
+ int *ksnd_keepalive_count; /* # probes */
+ int *ksnd_keepalive_intvl; /* time between probes */
+ int *ksnd_credits; /* # concurrent sends */
+ int *ksnd_peertxcredits; /* # concurrent sends to 1 peer */
+ int *ksnd_peerrtrcredits; /* # per-peer router buffer credits */
+ int *ksnd_peertimeout; /* seconds to consider peer dead */
+ int *ksnd_enable_csum; /* enable check sum */
+ int *ksnd_inject_csum_error; /* set non-zero to inject checksum error */
+ int *ksnd_nonblk_zcack; /* always send zc-ack on non-blocking connection */
+ unsigned int *ksnd_zc_min_payload; /* minimum zero copy payload size */
+ int *ksnd_zc_recv; /* enable ZC receive (for Chelsio TOE) */
+ int *ksnd_zc_recv_min_nfrags; /* minimum # of fragments to enable ZC receive */
+} ksock_tunables_t;
+
+typedef struct {
+ __u64 ksnn_incarnation; /* my epoch */
+ spinlock_t ksnn_lock; /* serialise */
+ struct list_head ksnn_list; /* chain on global list */
+ int ksnn_npeers; /* # peers */
+ int ksnn_shutdown; /* shutting down? */
+ int ksnn_ninterfaces; /* IP interfaces */
+ ksock_interface_t ksnn_interfaces[LNET_MAX_INTERFACES];
+} ksock_net_t;
+
+/** connd timeout */
+#define SOCKNAL_CONND_TIMEOUT 120
+/** reserved thread for accepting & creating new connd */
+#define SOCKNAL_CONND_RESV 1
+
+typedef struct {
+ int ksnd_init; /* initialisation state */
+ int ksnd_nnets; /* # networks set up */
+ struct list_head ksnd_nets; /* list of nets */
+ /* stabilize peer/conn ops */
+ rwlock_t ksnd_global_lock;
+ /* hash table of all my known peers */
+ struct list_head *ksnd_peers;
+ int ksnd_peer_hash_size; /* size of ksnd_peers */
+
+ int ksnd_nthreads; /* # live threads */
+ int ksnd_shuttingdown; /* tell threads to exit */
+ /* schedulers information */
+ struct ksock_sched_info **ksnd_sched_info;
+
+ atomic_t ksnd_nactive_txs; /* #active txs */
+
+ struct list_head ksnd_deathrow_conns; /* conns to close: reaper_lock*/
+ struct list_head ksnd_zombie_conns; /* conns to free: reaper_lock */
+ struct list_head ksnd_enomem_conns; /* conns to retry: reaper_lock*/
+ wait_queue_head_t ksnd_reaper_waitq; /* reaper sleeps here */
+ unsigned long ksnd_reaper_waketime;/* when reaper will wake */
+ spinlock_t ksnd_reaper_lock; /* serialise */
+
+ int ksnd_enomem_tx; /* test ENOMEM sender */
+ int ksnd_stall_tx; /* test sluggish sender */
+ int ksnd_stall_rx; /* test sluggish receiver */
+
+ struct list_head ksnd_connd_connreqs; /* incoming connection requests */
+ struct list_head ksnd_connd_routes; /* routes waiting to be connected */
+ wait_queue_head_t ksnd_connd_waitq; /* connds sleep here */
+ int ksnd_connd_connecting;/* # connds connecting */
+ /** time stamp of the last failed connecting attempt */
+ long ksnd_connd_failed_stamp;
+ /** # starting connd */
+ unsigned ksnd_connd_starting;
+ /** time stamp of the last starting connd */
+ long ksnd_connd_starting_stamp;
+ /** # running connd */
+ unsigned ksnd_connd_running;
+ spinlock_t ksnd_connd_lock; /* serialise */
+
+ struct list_head ksnd_idle_noop_txs; /* list head for freed noop tx */
+ spinlock_t ksnd_tx_lock; /* serialise, g_lock unsafe */
+
+} ksock_nal_data_t;
+
+#define SOCKNAL_INIT_NOTHING 0
+#define SOCKNAL_INIT_DATA 1
+#define SOCKNAL_INIT_ALL 2
+
+/* A packet just assembled for transmission is represented by 1 or more
+ * struct iovec fragments (the first frag contains the portals header),
+ * followed by 0 or more lnet_kiov_t fragments.
+ *
+ * On the receive side, initially 1 struct iovec fragment is posted for
+ * receive (the header). Once the header has been received, the payload is
+ * received into either struct iovec or lnet_kiov_t fragments, depending on
+ * what the header matched or whether the message needs forwarding. */
+
+struct ksock_conn; /* forward ref */
+struct ksock_peer; /* forward ref */
+struct ksock_route; /* forward ref */
+struct ksock_proto; /* forward ref */
+
+typedef struct /* transmit packet */
+{
+ struct list_head tx_list; /* queue on conn for transmission etc */
+ struct list_head tx_zc_list; /* queue on peer for ZC request */
+ atomic_t tx_refcount; /* tx reference count */
+ int tx_nob; /* # packet bytes */
+ int tx_resid; /* residual bytes */
+ int tx_niov; /* # packet iovec frags */
+ struct kvec *tx_iov; /* packet iovec frags */
+ int tx_nkiov; /* # packet page frags */
+ unsigned short tx_zc_aborted; /* aborted ZC request */
+ unsigned short tx_zc_capable:1; /* payload is large enough for ZC */
+ unsigned short tx_zc_checked:1; /* Have I checked if I should ZC? */
+ unsigned short tx_nonblk:1; /* it's a non-blocking ACK */
+ lnet_kiov_t *tx_kiov; /* packet page frags */
+ struct ksock_conn *tx_conn; /* owning conn */
+ lnet_msg_t *tx_lnetmsg; /* lnet message for lnet_finalize() */
+ unsigned long tx_deadline; /* when (in jiffies) tx times out */
+ ksock_msg_t tx_msg; /* socklnd message buffer */
+ int tx_desc_size; /* size of this descriptor */
+ union {
+ struct {
+ struct kvec iov; /* virt hdr */
+ lnet_kiov_t kiov[0]; /* paged payload */
+ } paged;
+ struct {
+ struct kvec iov[1]; /* virt hdr + payload */
+ } virt;
+ } tx_frags;
+} ksock_tx_t;
+
+#define KSOCK_NOOP_TX_SIZE ((int)offsetof(ksock_tx_t, tx_frags.paged.kiov[0]))
+
+/* network zero copy callback descriptor embedded in ksock_tx_t */
+
+/* space for the rx frag descriptors; we either read a single contiguous
+ * header, or up to LNET_MAX_IOV frags of payload of either type. */
+typedef union {
+ struct kvec iov[LNET_MAX_IOV];
+ lnet_kiov_t kiov[LNET_MAX_IOV];
+} ksock_rxiovspace_t;
+
+#define SOCKNAL_RX_KSM_HEADER 1 /* reading ksock message header */
+#define SOCKNAL_RX_LNET_HEADER 2 /* reading lnet message header */
+#define SOCKNAL_RX_PARSE 3 /* Calling lnet_parse() */
+#define SOCKNAL_RX_PARSE_WAIT 4 /* waiting to be told to read the body */
+#define SOCKNAL_RX_LNET_PAYLOAD 5 /* reading lnet payload (to deliver here) */
+#define SOCKNAL_RX_SLOP 6 /* skipping body */
+
+typedef struct ksock_conn {
+ struct ksock_peer *ksnc_peer; /* owning peer */
+ struct ksock_route *ksnc_route; /* owning route */
+ struct list_head ksnc_list; /* stash on peer's conn list */
+ struct socket *ksnc_sock; /* actual socket */
+ void *ksnc_saved_data_ready; /* socket's original data_ready() callback */
+ void *ksnc_saved_write_space; /* socket's original write_space() callback */
+ atomic_t ksnc_conn_refcount; /* conn refcount */
+ atomic_t ksnc_sock_refcount; /* sock refcount */
+ ksock_sched_t *ksnc_scheduler; /* who schedules this connection */
+ __u32 ksnc_myipaddr; /* my IP */
+ __u32 ksnc_ipaddr; /* peer's IP */
+ int ksnc_port; /* peer's port */
+ signed int ksnc_type:3; /* type of connection,
+ * should be signed value */
+ unsigned int ksnc_closing:1; /* being shut down */
+ unsigned int ksnc_flip:1; /* flip or not, only for V2.x */
+ unsigned int ksnc_zc_capable:1; /* enable to ZC */
+ struct ksock_proto *ksnc_proto; /* protocol for the connection */
+
+ /* reader */
+ struct list_head ksnc_rx_list; /* where I enq waiting input or a forwarding descriptor */
+ unsigned long ksnc_rx_deadline; /* when (in jiffies) receive times out */
+ __u8 ksnc_rx_started; /* started receiving a message */
+ __u8 ksnc_rx_ready; /* data ready to read */
+ __u8 ksnc_rx_scheduled;/* being progressed */
+ __u8 ksnc_rx_state; /* what is being read */
+ int ksnc_rx_nob_left; /* # bytes to next hdr/body */
+ int ksnc_rx_nob_wanted; /* bytes actually wanted */
+ int ksnc_rx_niov; /* # iovec frags */
+ struct kvec *ksnc_rx_iov; /* the iovec frags */
+ int ksnc_rx_nkiov; /* # page frags */
+ lnet_kiov_t *ksnc_rx_kiov; /* the page frags */
+ ksock_rxiovspace_t ksnc_rx_iov_space;/* space for frag descriptors */
+ __u32 ksnc_rx_csum; /* partial checksum for incoming data */
+ void *ksnc_cookie; /* rx lnet_finalize passthru arg */
+ ksock_msg_t ksnc_msg; /* incoming message buffer:
+ * V2.x message takes the
+ * whole struct
+ * V1.x message is a bare
+ * lnet_hdr_t, it's stored in
+ * ksnc_msg.ksm_u.lnetmsg */
+
+ /* WRITER */
+ struct list_head ksnc_tx_list; /* where I enq waiting for output space */
+ struct list_head ksnc_tx_queue; /* packets waiting to be sent */
+ ksock_tx_t *ksnc_tx_carrier; /* next TX that can carry a LNet message or ZC-ACK */
+ unsigned long ksnc_tx_deadline; /* when (in jiffies) tx times out */
+ int ksnc_tx_bufnob; /* send buffer marker */
+ atomic_t ksnc_tx_nob; /* # bytes queued */
+ int ksnc_tx_ready; /* write space */
+ int ksnc_tx_scheduled; /* being progressed */
+ unsigned long ksnc_tx_last_post; /* time stamp of the last posted TX */
+} ksock_conn_t;
+
+typedef struct ksock_route {
+ struct list_head ksnr_list; /* chain on peer route list */
+ struct list_head ksnr_connd_list; /* chain on ksnr_connd_routes */
+ struct ksock_peer *ksnr_peer; /* owning peer */
+ atomic_t ksnr_refcount; /* # users */
+ unsigned long ksnr_timeout; /* when (in jiffies) reconnection can happen next */
+ long ksnr_retry_interval; /* how long between retries */
+ __u32 ksnr_myipaddr; /* my IP */
+ __u32 ksnr_ipaddr; /* IP address to connect to */
+ int ksnr_port; /* port to connect to */
+ unsigned int ksnr_scheduled:1; /* scheduled for attention */
+ unsigned int ksnr_connecting:1;/* connection establishment in progress */
+ unsigned int ksnr_connected:4; /* connections established by type */
+ unsigned int ksnr_deleted:1; /* been removed from peer? */
+ unsigned int ksnr_share_count; /* created explicitly? */
+ int ksnr_conn_count; /* # conns established by this route */
+} ksock_route_t;
+
+#define SOCKNAL_KEEPALIVE_PING 1 /* cookie for keepalive ping */
+
+typedef struct ksock_peer {
+ struct list_head ksnp_list; /* stash on global peer list */
+ unsigned long ksnp_last_alive; /* when (in jiffies) I was last alive */
+ lnet_process_id_t ksnp_id; /* who's on the other end(s) */
+ atomic_t ksnp_refcount; /* # users */
+ int ksnp_sharecount; /* lconf usage counter */
+ int ksnp_closing; /* being closed */
+ int ksnp_accepting;/* # passive connections pending */
+ int ksnp_error; /* errno on closing last conn */
+ __u64 ksnp_zc_next_cookie;/* ZC completion cookie */
+ __u64 ksnp_incarnation; /* latest known peer incarnation */
+ struct ksock_proto *ksnp_proto; /* latest known peer protocol */
+ struct list_head ksnp_conns; /* all active connections */
+ struct list_head ksnp_routes; /* routes */
+ struct list_head ksnp_tx_queue; /* waiting packets */
+ spinlock_t ksnp_lock; /* serialize, g_lock unsafe */
+ struct list_head ksnp_zc_req_list; /* zero copy requests wait for ACK */
+ unsigned long ksnp_send_keepalive; /* time to send keepalive */
+ lnet_ni_t *ksnp_ni; /* which network */
+ int ksnp_n_passive_ips; /* # of... */
+ __u32 ksnp_passive_ips[LNET_MAX_INTERFACES]; /* preferred local interfaces */
+} ksock_peer_t;
+
+typedef struct ksock_connreq {
+ struct list_head ksncr_list; /* stash on ksnd_connd_connreqs */
+ lnet_ni_t *ksncr_ni; /* chosen NI */
+ struct socket *ksncr_sock; /* accepted socket */
+} ksock_connreq_t;
+
+extern ksock_nal_data_t ksocknal_data;
+extern ksock_tunables_t ksocknal_tunables;
+
+#define SOCKNAL_MATCH_NO 0 /* TX can't match type of connection */
+#define SOCKNAL_MATCH_YES 1 /* TX matches type of connection */
+#define SOCKNAL_MATCH_MAY 2 /* TX can be sent on the connection, but not preferred */
+
+typedef struct ksock_proto {
+ int pro_version; /* version number of protocol */
+ int (*pro_send_hello)(ksock_conn_t *, ksock_hello_msg_t *); /* handshake function */
+ int (*pro_recv_hello)(ksock_conn_t *, ksock_hello_msg_t *, int);/* handshake function */
+ void (*pro_pack)(ksock_tx_t *); /* message pack */
+ void (*pro_unpack)(ksock_msg_t *); /* message unpack */
+ ksock_tx_t *(*pro_queue_tx_msg)(ksock_conn_t *, ksock_tx_t *); /* queue tx on the connection */
+ int (*pro_queue_tx_zcack)(ksock_conn_t *, ksock_tx_t *, __u64); /* queue ZC ack on the connection */
+ int (*pro_handle_zcreq)(ksock_conn_t *, __u64, int); /* handle ZC request */
+ int (*pro_handle_zcack)(ksock_conn_t *, __u64, __u64); /* handle ZC ACK */
+ int (*pro_match_tx)(ksock_conn_t *, ksock_tx_t *, int); /* msg type matches the connection type:
+ * return value:
+ * return MATCH_NO : no
+ * return MATCH_YES : matching type
+ * return MATCH_MAY : can be backup */
+} ksock_proto_t;
+
+extern ksock_proto_t ksocknal_protocol_v1x;
+extern ksock_proto_t ksocknal_protocol_v2x;
+extern ksock_proto_t ksocknal_protocol_v3x;
+
+#define KSOCK_PROTO_V1_MAJOR LNET_PROTO_TCP_VERSION_MAJOR
+#define KSOCK_PROTO_V1_MINOR LNET_PROTO_TCP_VERSION_MINOR
+#define KSOCK_PROTO_V1 KSOCK_PROTO_V1_MAJOR
+
+#ifndef CPU_MASK_NONE
+#define CPU_MASK_NONE 0UL
+#endif
+
+static inline int
+ksocknal_route_mask(void)
+{
+ if (!*ksocknal_tunables.ksnd_typed_conns)
+ return (1 << SOCKLND_CONN_ANY);
+
+ return ((1 << SOCKLND_CONN_CONTROL) |
+ (1 << SOCKLND_CONN_BULK_IN) |
+ (1 << SOCKLND_CONN_BULK_OUT));
+}
+
+static inline struct list_head *
+ksocknal_nid2peerlist(lnet_nid_t nid)
+{
+ unsigned int hash = ((unsigned int)nid) % ksocknal_data.ksnd_peer_hash_size;
+
+ return &ksocknal_data.ksnd_peers[hash];
+}
+
+static inline void
+ksocknal_conn_addref(ksock_conn_t *conn)
+{
+ LASSERT(atomic_read(&conn->ksnc_conn_refcount) > 0);
+ atomic_inc(&conn->ksnc_conn_refcount);
+}
+
+extern void ksocknal_queue_zombie_conn(ksock_conn_t *conn);
+extern void ksocknal_finalize_zcreq(ksock_conn_t *conn);
+
+static inline void
+ksocknal_conn_decref(ksock_conn_t *conn)
+{
+ LASSERT(atomic_read(&conn->ksnc_conn_refcount) > 0);
+ if (atomic_dec_and_test(&conn->ksnc_conn_refcount))
+ ksocknal_queue_zombie_conn(conn);
+}
+
+static inline int
+ksocknal_connsock_addref(ksock_conn_t *conn)
+{
+ int rc = -ESHUTDOWN;
+
+ read_lock(&ksocknal_data.ksnd_global_lock);
+ if (!conn->ksnc_closing) {
+ LASSERT(atomic_read(&conn->ksnc_sock_refcount) > 0);
+ atomic_inc(&conn->ksnc_sock_refcount);
+ rc = 0;
+ }
+ read_unlock(&ksocknal_data.ksnd_global_lock);
+
+ return rc;
+}
+
+static inline void
+ksocknal_connsock_decref(ksock_conn_t *conn)
+{
+ LASSERT(atomic_read(&conn->ksnc_sock_refcount) > 0);
+ if (atomic_dec_and_test(&conn->ksnc_sock_refcount)) {
+ LASSERT(conn->ksnc_closing);
+ libcfs_sock_release(conn->ksnc_sock);
+ conn->ksnc_sock = NULL;
+ ksocknal_finalize_zcreq(conn);
+ }
+}
+
+static inline void
+ksocknal_tx_addref(ksock_tx_t *tx)
+{
+ LASSERT(atomic_read(&tx->tx_refcount) > 0);
+ atomic_inc(&tx->tx_refcount);
+}
+
+extern void ksocknal_tx_prep(ksock_conn_t *, ksock_tx_t *tx);
+extern void ksocknal_tx_done(lnet_ni_t *ni, ksock_tx_t *tx);
+
+static inline void
+ksocknal_tx_decref(ksock_tx_t *tx)
+{
+ LASSERT(atomic_read(&tx->tx_refcount) > 0);
+ if (atomic_dec_and_test(&tx->tx_refcount))
+ ksocknal_tx_done(NULL, tx);
+}
+
+static inline void
+ksocknal_route_addref(ksock_route_t *route)
+{
+ LASSERT(atomic_read(&route->ksnr_refcount) > 0);
+ atomic_inc(&route->ksnr_refcount);
+}
+
+extern void ksocknal_destroy_route(ksock_route_t *route);
+
+static inline void
+ksocknal_route_decref(ksock_route_t *route)
+{
+ LASSERT(atomic_read(&route->ksnr_refcount) > 0);
+ if (atomic_dec_and_test(&route->ksnr_refcount))
+ ksocknal_destroy_route(route);
+}
+
+static inline void
+ksocknal_peer_addref(ksock_peer_t *peer)
+{
+ LASSERT(atomic_read(&peer->ksnp_refcount) > 0);
+ atomic_inc(&peer->ksnp_refcount);
+}
+
+extern void ksocknal_destroy_peer(ksock_peer_t *peer);
+
+static inline void
+ksocknal_peer_decref(ksock_peer_t *peer)
+{
+ LASSERT(atomic_read(&peer->ksnp_refcount) > 0);
+ if (atomic_dec_and_test(&peer->ksnp_refcount))
+ ksocknal_destroy_peer(peer);
+}
+
+int ksocknal_startup(lnet_ni_t *ni);
+void ksocknal_shutdown(lnet_ni_t *ni);
+int ksocknal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg);
+int ksocknal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg);
+int ksocknal_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
+ int delayed, unsigned int niov,
+ struct kvec *iov, lnet_kiov_t *kiov,
+ unsigned int offset, unsigned int mlen, unsigned int rlen);
+int ksocknal_accept(lnet_ni_t *ni, struct socket *sock);
+
+extern int ksocknal_add_peer(lnet_ni_t *ni, lnet_process_id_t id, __u32 ip, int port);
+extern ksock_peer_t *ksocknal_find_peer_locked(lnet_ni_t *ni, lnet_process_id_t id);
+extern ksock_peer_t *ksocknal_find_peer(lnet_ni_t *ni, lnet_process_id_t id);
+extern void ksocknal_peer_failed(ksock_peer_t *peer);
+extern int ksocknal_create_conn(lnet_ni_t *ni, ksock_route_t *route,
+ struct socket *sock, int type);
+extern void ksocknal_close_conn_locked(ksock_conn_t *conn, int why);
+extern void ksocknal_terminate_conn(ksock_conn_t *conn);
+extern void ksocknal_destroy_conn(ksock_conn_t *conn);
+extern int ksocknal_close_peer_conns_locked(ksock_peer_t *peer,
+ __u32 ipaddr, int why);
+extern int ksocknal_close_conn_and_siblings(ksock_conn_t *conn, int why);
+extern int ksocknal_close_matching_conns(lnet_process_id_t id, __u32 ipaddr);
+extern ksock_conn_t *ksocknal_find_conn_locked(ksock_peer_t *peer,
+ ksock_tx_t *tx, int nonblk);
+
+extern int ksocknal_launch_packet(lnet_ni_t *ni, ksock_tx_t *tx,
+ lnet_process_id_t id);
+extern ksock_tx_t *ksocknal_alloc_tx(int type, int size);
+extern void ksocknal_free_tx(ksock_tx_t *tx);
+extern ksock_tx_t *ksocknal_alloc_tx_noop(__u64 cookie, int nonblk);
+extern void ksocknal_next_tx_carrier(ksock_conn_t *conn);
+extern void ksocknal_queue_tx_locked(ksock_tx_t *tx, ksock_conn_t *conn);
+extern void ksocknal_txlist_done(lnet_ni_t *ni, struct list_head *txlist,
+ int error);
+extern void ksocknal_notify(lnet_ni_t *ni, lnet_nid_t gw_nid, int alive);
+extern void ksocknal_query(struct lnet_ni *ni, lnet_nid_t nid, unsigned long *when);
+extern int ksocknal_thread_start(int (*fn)(void *arg), void *arg, char *name);
+extern void ksocknal_thread_fini(void);
+extern void ksocknal_launch_all_connections_locked(ksock_peer_t *peer);
+extern ksock_route_t *ksocknal_find_connectable_route_locked(ksock_peer_t *peer);
+extern ksock_route_t *ksocknal_find_connecting_route_locked(ksock_peer_t *peer);
+extern int ksocknal_new_packet(ksock_conn_t *conn, int skip);
+extern int ksocknal_scheduler(void *arg);
+extern int ksocknal_connd(void *arg);
+extern int ksocknal_reaper(void *arg);
+extern int ksocknal_send_hello(lnet_ni_t *ni, ksock_conn_t *conn,
+ lnet_nid_t peer_nid, ksock_hello_msg_t *hello);
+extern int ksocknal_recv_hello(lnet_ni_t *ni, ksock_conn_t *conn,
+ ksock_hello_msg_t *hello, lnet_process_id_t *id,
+ __u64 *incarnation);
+extern void ksocknal_read_callback(ksock_conn_t *conn);
+extern void ksocknal_write_callback(ksock_conn_t *conn);
+
+extern int ksocknal_lib_zc_capable(ksock_conn_t *conn);
+extern void ksocknal_lib_save_callback(struct socket *sock, ksock_conn_t *conn);
+extern void ksocknal_lib_set_callback(struct socket *sock, ksock_conn_t *conn);
+extern void ksocknal_lib_reset_callback(struct socket *sock, ksock_conn_t *conn);
+extern void ksocknal_lib_push_conn(ksock_conn_t *conn);
+extern int ksocknal_lib_get_conn_addrs(ksock_conn_t *conn);
+extern int ksocknal_lib_setup_sock(struct socket *so);
+extern int ksocknal_lib_send_iov(ksock_conn_t *conn, ksock_tx_t *tx);
+extern int ksocknal_lib_send_kiov(ksock_conn_t *conn, ksock_tx_t *tx);
+extern void ksocknal_lib_eager_ack(ksock_conn_t *conn);
+extern int ksocknal_lib_recv_iov(ksock_conn_t *conn);
+extern int ksocknal_lib_recv_kiov(ksock_conn_t *conn);
+extern int ksocknal_lib_get_conn_tunables(ksock_conn_t *conn, int *txmem,
+ int *rxmem, int *nagle);
+
+extern int ksocknal_tunables_init(void);
+
+extern void ksocknal_lib_csum_tx(ksock_tx_t *tx);
+
+extern int ksocknal_lib_memory_pressure(ksock_conn_t *conn);
+extern int ksocknal_lib_bind_thread_to_cpu(int id);
diff --git a/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c b/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c
new file mode 100644
index 000000000..fa7ad883b
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c
@@ -0,0 +1,2634 @@
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ * Author: Zach Brown <zab@zabbo.net>
+ * Author: Peter J. Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ * This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "socklnd.h"
+
+ksock_tx_t *
+ksocknal_alloc_tx(int type, int size)
+{
+ ksock_tx_t *tx = NULL;
+
+ if (type == KSOCK_MSG_NOOP) {
+ LASSERT(size == KSOCK_NOOP_TX_SIZE);
+
+ /* searching for a noop tx in free list */
+ spin_lock(&ksocknal_data.ksnd_tx_lock);
+
+ if (!list_empty(&ksocknal_data.ksnd_idle_noop_txs)) {
+ tx = list_entry(ksocknal_data.ksnd_idle_noop_txs. \
+ next, ksock_tx_t, tx_list);
+ LASSERT(tx->tx_desc_size == size);
+ list_del(&tx->tx_list);
+ }
+
+ spin_unlock(&ksocknal_data.ksnd_tx_lock);
+ }
+
+ if (tx == NULL)
+ LIBCFS_ALLOC(tx, size);
+
+ if (tx == NULL)
+ return NULL;
+
+ atomic_set(&tx->tx_refcount, 1);
+ tx->tx_zc_aborted = 0;
+ tx->tx_zc_capable = 0;
+ tx->tx_zc_checked = 0;
+ tx->tx_desc_size = size;
+
+ atomic_inc(&ksocknal_data.ksnd_nactive_txs);
+
+ return tx;
+}
+
+ksock_tx_t *
+ksocknal_alloc_tx_noop(__u64 cookie, int nonblk)
+{
+ ksock_tx_t *tx;
+
+ tx = ksocknal_alloc_tx(KSOCK_MSG_NOOP, KSOCK_NOOP_TX_SIZE);
+ if (tx == NULL) {
+ CERROR("Can't allocate noop tx desc\n");
+ return NULL;
+ }
+
+ tx->tx_conn = NULL;
+ tx->tx_lnetmsg = NULL;
+ tx->tx_kiov = NULL;
+ tx->tx_nkiov = 0;
+ tx->tx_iov = tx->tx_frags.virt.iov;
+ tx->tx_niov = 1;
+ tx->tx_nonblk = nonblk;
+
+ socklnd_init_msg(&tx->tx_msg, KSOCK_MSG_NOOP);
+ tx->tx_msg.ksm_zc_cookies[1] = cookie;
+
+ return tx;
+}
+
+
+void
+ksocknal_free_tx (ksock_tx_t *tx)
+{
+ atomic_dec(&ksocknal_data.ksnd_nactive_txs);
+
+ if (tx->tx_lnetmsg == NULL && tx->tx_desc_size == KSOCK_NOOP_TX_SIZE) {
+ /* it's a noop tx */
+ spin_lock(&ksocknal_data.ksnd_tx_lock);
+
+ list_add(&tx->tx_list, &ksocknal_data.ksnd_idle_noop_txs);
+
+ spin_unlock(&ksocknal_data.ksnd_tx_lock);
+ } else {
+ LIBCFS_FREE(tx, tx->tx_desc_size);
+ }
+}
+
+static int
+ksocknal_send_iov (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+ struct kvec *iov = tx->tx_iov;
+ int nob;
+ int rc;
+
+ LASSERT (tx->tx_niov > 0);
+
+ /* Never touch tx->tx_iov inside ksocknal_lib_send_iov() */
+ rc = ksocknal_lib_send_iov(conn, tx);
+
+ if (rc <= 0) /* sent nothing? */
+ return rc;
+
+ nob = rc;
+ LASSERT (nob <= tx->tx_resid);
+ tx->tx_resid -= nob;
+
+ /* "consume" iov */
+ do {
+ LASSERT (tx->tx_niov > 0);
+
+ if (nob < (int) iov->iov_len) {
+ iov->iov_base = (void *)((char *)iov->iov_base + nob);
+ iov->iov_len -= nob;
+ return rc;
+ }
+
+ nob -= iov->iov_len;
+ tx->tx_iov = ++iov;
+ tx->tx_niov--;
+ } while (nob != 0);
+
+ return rc;
+}
+
+static int
+ksocknal_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+ lnet_kiov_t *kiov = tx->tx_kiov;
+ int nob;
+ int rc;
+
+ LASSERT (tx->tx_niov == 0);
+ LASSERT (tx->tx_nkiov > 0);
+
+ /* Never touch tx->tx_kiov inside ksocknal_lib_send_kiov() */
+ rc = ksocknal_lib_send_kiov(conn, tx);
+
+ if (rc <= 0) /* sent nothing? */
+ return rc;
+
+ nob = rc;
+ LASSERT (nob <= tx->tx_resid);
+ tx->tx_resid -= nob;
+
+ /* "consume" kiov */
+ do {
+ LASSERT(tx->tx_nkiov > 0);
+
+ if (nob < (int)kiov->kiov_len) {
+ kiov->kiov_offset += nob;
+ kiov->kiov_len -= nob;
+ return rc;
+ }
+
+ nob -= (int)kiov->kiov_len;
+ tx->tx_kiov = ++kiov;
+ tx->tx_nkiov--;
+ } while (nob != 0);
+
+ return rc;
+}
+
+static int
+ksocknal_transmit (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+ int rc;
+ int bufnob;
+
+ if (ksocknal_data.ksnd_stall_tx != 0) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(cfs_time_seconds(ksocknal_data.ksnd_stall_tx));
+ }
+
+ LASSERT (tx->tx_resid != 0);
+
+ rc = ksocknal_connsock_addref(conn);
+ if (rc != 0) {
+ LASSERT (conn->ksnc_closing);
+ return -ESHUTDOWN;
+ }
+
+ do {
+ if (ksocknal_data.ksnd_enomem_tx > 0) {
+ /* testing... */
+ ksocknal_data.ksnd_enomem_tx--;
+ rc = -EAGAIN;
+ } else if (tx->tx_niov != 0) {
+ rc = ksocknal_send_iov (conn, tx);
+ } else {
+ rc = ksocknal_send_kiov (conn, tx);
+ }
+
+ bufnob = conn->ksnc_sock->sk->sk_wmem_queued;
+ if (rc > 0) /* sent something? */
+ conn->ksnc_tx_bufnob += rc; /* account it */
+
+ if (bufnob < conn->ksnc_tx_bufnob) {
+ /* allocated send buffer bytes < computed; infer
+ * something got ACKed */
+ conn->ksnc_tx_deadline =
+ cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+ conn->ksnc_peer->ksnp_last_alive = cfs_time_current();
+ conn->ksnc_tx_bufnob = bufnob;
+ mb();
+ }
+
+ if (rc <= 0) { /* Didn't write anything? */
+
+ if (rc == 0) /* some stacks return 0 instead of -EAGAIN */
+ rc = -EAGAIN;
+
+ /* Check if EAGAIN is due to memory pressure */
+ if (rc == -EAGAIN && ksocknal_lib_memory_pressure(conn))
+ rc = -ENOMEM;
+
+ break;
+ }
+
+ /* socket's wmem_queued now includes 'rc' bytes */
+ atomic_sub (rc, &conn->ksnc_tx_nob);
+ rc = 0;
+
+ } while (tx->tx_resid != 0);
+
+ ksocknal_connsock_decref(conn);
+ return rc;
+}
+
+static int
+ksocknal_recv_iov (ksock_conn_t *conn)
+{
+ struct kvec *iov = conn->ksnc_rx_iov;
+ int nob;
+ int rc;
+
+ LASSERT (conn->ksnc_rx_niov > 0);
+
+ /* Never touch conn->ksnc_rx_iov or change connection
+ * status inside ksocknal_lib_recv_iov */
+ rc = ksocknal_lib_recv_iov(conn);
+
+ if (rc <= 0)
+ return rc;
+
+ /* received something... */
+ nob = rc;
+
+ conn->ksnc_peer->ksnp_last_alive = cfs_time_current();
+ conn->ksnc_rx_deadline =
+ cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+ mb(); /* order with setting rx_started */
+ conn->ksnc_rx_started = 1;
+
+ conn->ksnc_rx_nob_wanted -= nob;
+ conn->ksnc_rx_nob_left -= nob;
+
+ do {
+ LASSERT (conn->ksnc_rx_niov > 0);
+
+ if (nob < (int)iov->iov_len) {
+ iov->iov_len -= nob;
+ iov->iov_base += nob;
+ return -EAGAIN;
+ }
+
+ nob -= iov->iov_len;
+ conn->ksnc_rx_iov = ++iov;
+ conn->ksnc_rx_niov--;
+ } while (nob != 0);
+
+ return rc;
+}
+
+static int
+ksocknal_recv_kiov (ksock_conn_t *conn)
+{
+ lnet_kiov_t *kiov = conn->ksnc_rx_kiov;
+ int nob;
+ int rc;
+ LASSERT (conn->ksnc_rx_nkiov > 0);
+
+ /* Never touch conn->ksnc_rx_kiov or change connection
+ * status inside ksocknal_lib_recv_iov */
+ rc = ksocknal_lib_recv_kiov(conn);
+
+ if (rc <= 0)
+ return rc;
+
+ /* received something... */
+ nob = rc;
+
+ conn->ksnc_peer->ksnp_last_alive = cfs_time_current();
+ conn->ksnc_rx_deadline =
+ cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+ mb(); /* order with setting rx_started */
+ conn->ksnc_rx_started = 1;
+
+ conn->ksnc_rx_nob_wanted -= nob;
+ conn->ksnc_rx_nob_left -= nob;
+
+ do {
+ LASSERT (conn->ksnc_rx_nkiov > 0);
+
+ if (nob < (int) kiov->kiov_len) {
+ kiov->kiov_offset += nob;
+ kiov->kiov_len -= nob;
+ return -EAGAIN;
+ }
+
+ nob -= kiov->kiov_len;
+ conn->ksnc_rx_kiov = ++kiov;
+ conn->ksnc_rx_nkiov--;
+ } while (nob != 0);
+
+ return 1;
+}
+
+static int
+ksocknal_receive (ksock_conn_t *conn)
+{
+ /* Return 1 on success, 0 on EOF, < 0 on error.
+ * Caller checks ksnc_rx_nob_wanted to determine
+ * progress/completion. */
+ int rc;
+
+ if (ksocknal_data.ksnd_stall_rx != 0) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(cfs_time_seconds(ksocknal_data.ksnd_stall_rx));
+ }
+
+ rc = ksocknal_connsock_addref(conn);
+ if (rc != 0) {
+ LASSERT (conn->ksnc_closing);
+ return -ESHUTDOWN;
+ }
+
+ for (;;) {
+ if (conn->ksnc_rx_niov != 0)
+ rc = ksocknal_recv_iov (conn);
+ else
+ rc = ksocknal_recv_kiov (conn);
+
+ if (rc <= 0) {
+ /* error/EOF or partial receive */
+ if (rc == -EAGAIN) {
+ rc = 1;
+ } else if (rc == 0 && conn->ksnc_rx_started) {
+ /* EOF in the middle of a message */
+ rc = -EPROTO;
+ }
+ break;
+ }
+
+ /* Completed a fragment */
+
+ if (conn->ksnc_rx_nob_wanted == 0) {
+ rc = 1;
+ break;
+ }
+ }
+
+ ksocknal_connsock_decref(conn);
+ return rc;
+}
+
+void
+ksocknal_tx_done (lnet_ni_t *ni, ksock_tx_t *tx)
+{
+ lnet_msg_t *lnetmsg = tx->tx_lnetmsg;
+ int rc = (tx->tx_resid == 0 && !tx->tx_zc_aborted) ? 0 : -EIO;
+
+ LASSERT(ni != NULL || tx->tx_conn != NULL);
+
+ if (tx->tx_conn != NULL)
+ ksocknal_conn_decref(tx->tx_conn);
+
+ if (ni == NULL && tx->tx_conn != NULL)
+ ni = tx->tx_conn->ksnc_peer->ksnp_ni;
+
+ ksocknal_free_tx (tx);
+ if (lnetmsg != NULL) /* KSOCK_MSG_NOOP go without lnetmsg */
+ lnet_finalize (ni, lnetmsg, rc);
+}
+
+void
+ksocknal_txlist_done (lnet_ni_t *ni, struct list_head *txlist, int error)
+{
+ ksock_tx_t *tx;
+
+ while (!list_empty (txlist)) {
+ tx = list_entry (txlist->next, ksock_tx_t, tx_list);
+
+ if (error && tx->tx_lnetmsg != NULL) {
+ CNETERR("Deleting packet type %d len %d %s->%s\n",
+ le32_to_cpu (tx->tx_lnetmsg->msg_hdr.type),
+ le32_to_cpu (tx->tx_lnetmsg->msg_hdr.payload_length),
+ libcfs_nid2str(le64_to_cpu(tx->tx_lnetmsg->msg_hdr.src_nid)),
+ libcfs_nid2str(le64_to_cpu(tx->tx_lnetmsg->msg_hdr.dest_nid)));
+ } else if (error) {
+ CNETERR("Deleting noop packet\n");
+ }
+
+ list_del (&tx->tx_list);
+
+ LASSERT (atomic_read(&tx->tx_refcount) == 1);
+ ksocknal_tx_done (ni, tx);
+ }
+}
+
+static void
+ksocknal_check_zc_req(ksock_tx_t *tx)
+{
+ ksock_conn_t *conn = tx->tx_conn;
+ ksock_peer_t *peer = conn->ksnc_peer;
+
+ /* Set tx_msg.ksm_zc_cookies[0] to a unique non-zero cookie and add tx
+ * to ksnp_zc_req_list if some fragment of this message should be sent
+ * zero-copy. Our peer will send an ACK containing this cookie when
+ * she has received this message to tell us we can signal completion.
+ * tx_msg.ksm_zc_cookies[0] remains non-zero while tx is on
+ * ksnp_zc_req_list. */
+ LASSERT (tx->tx_msg.ksm_type != KSOCK_MSG_NOOP);
+ LASSERT (tx->tx_zc_capable);
+
+ tx->tx_zc_checked = 1;
+
+ if (conn->ksnc_proto == &ksocknal_protocol_v1x ||
+ !conn->ksnc_zc_capable)
+ return;
+
+ /* assign cookie and queue tx to pending list, it will be released when
+ * a matching ack is received. See ksocknal_handle_zcack() */
+
+ ksocknal_tx_addref(tx);
+
+ spin_lock(&peer->ksnp_lock);
+
+ /* ZC_REQ is going to be pinned to the peer */
+ tx->tx_deadline =
+ cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+
+ LASSERT (tx->tx_msg.ksm_zc_cookies[0] == 0);
+
+ tx->tx_msg.ksm_zc_cookies[0] = peer->ksnp_zc_next_cookie++;
+
+ if (peer->ksnp_zc_next_cookie == 0)
+ peer->ksnp_zc_next_cookie = SOCKNAL_KEEPALIVE_PING + 1;
+
+ list_add_tail(&tx->tx_zc_list, &peer->ksnp_zc_req_list);
+
+ spin_unlock(&peer->ksnp_lock);
+}
+
+static void
+ksocknal_uncheck_zc_req(ksock_tx_t *tx)
+{
+ ksock_peer_t *peer = tx->tx_conn->ksnc_peer;
+
+ LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP);
+ LASSERT(tx->tx_zc_capable);
+
+ tx->tx_zc_checked = 0;
+
+ spin_lock(&peer->ksnp_lock);
+
+ if (tx->tx_msg.ksm_zc_cookies[0] == 0) {
+ /* Not waiting for an ACK */
+ spin_unlock(&peer->ksnp_lock);
+ return;
+ }
+
+ tx->tx_msg.ksm_zc_cookies[0] = 0;
+ list_del(&tx->tx_zc_list);
+
+ spin_unlock(&peer->ksnp_lock);
+
+ ksocknal_tx_decref(tx);
+}
+
+static int
+ksocknal_process_transmit (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+ int rc;
+
+ if (tx->tx_zc_capable && !tx->tx_zc_checked)
+ ksocknal_check_zc_req(tx);
+
+ rc = ksocknal_transmit (conn, tx);
+
+ CDEBUG (D_NET, "send(%d) %d\n", tx->tx_resid, rc);
+
+ if (tx->tx_resid == 0) {
+ /* Sent everything OK */
+ LASSERT (rc == 0);
+
+ return 0;
+ }
+
+ if (rc == -EAGAIN)
+ return rc;
+
+ if (rc == -ENOMEM) {
+ static int counter;
+
+ counter++; /* exponential backoff warnings */
+ if ((counter & (-counter)) == counter)
+ CWARN("%u ENOMEM tx %p (%u allocated)\n",
+ counter, conn, atomic_read(&libcfs_kmemory));
+
+ /* Queue on ksnd_enomem_conns for retry after a timeout */
+ spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+ /* enomem list takes over scheduler's ref... */
+ LASSERT (conn->ksnc_tx_scheduled);
+ list_add_tail(&conn->ksnc_tx_list,
+ &ksocknal_data.ksnd_enomem_conns);
+ if (!cfs_time_aftereq(cfs_time_add(cfs_time_current(),
+ SOCKNAL_ENOMEM_RETRY),
+ ksocknal_data.ksnd_reaper_waketime))
+ wake_up (&ksocknal_data.ksnd_reaper_waitq);
+
+ spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
+ return rc;
+ }
+
+ /* Actual error */
+ LASSERT (rc < 0);
+
+ if (!conn->ksnc_closing) {
+ switch (rc) {
+ case -ECONNRESET:
+ LCONSOLE_WARN("Host %pI4h reset our connection while we were sending data; it may have rebooted.\n",
+ &conn->ksnc_ipaddr);
+ break;
+ default:
+ LCONSOLE_WARN("There was an unexpected network error while writing to %pI4h: %d.\n",
+ &conn->ksnc_ipaddr, rc);
+ break;
+ }
+ CDEBUG(D_NET, "[%p] Error %d on write to %s ip %pI4h:%d\n",
+ conn, rc,
+ libcfs_id2str(conn->ksnc_peer->ksnp_id),
+ &conn->ksnc_ipaddr,
+ conn->ksnc_port);
+ }
+
+ if (tx->tx_zc_checked)
+ ksocknal_uncheck_zc_req(tx);
+
+ /* it's not an error if conn is being closed */
+ ksocknal_close_conn_and_siblings (conn,
+ (conn->ksnc_closing) ? 0 : rc);
+
+ return rc;
+}
+
+static void
+ksocknal_launch_connection_locked (ksock_route_t *route)
+{
+
+ /* called holding write lock on ksnd_global_lock */
+
+ LASSERT (!route->ksnr_scheduled);
+ LASSERT (!route->ksnr_connecting);
+ LASSERT ((ksocknal_route_mask() & ~route->ksnr_connected) != 0);
+
+ route->ksnr_scheduled = 1; /* scheduling conn for connd */
+ ksocknal_route_addref(route); /* extra ref for connd */
+
+ spin_lock_bh(&ksocknal_data.ksnd_connd_lock);
+
+ list_add_tail(&route->ksnr_connd_list,
+ &ksocknal_data.ksnd_connd_routes);
+ wake_up(&ksocknal_data.ksnd_connd_waitq);
+
+ spin_unlock_bh(&ksocknal_data.ksnd_connd_lock);
+}
+
+void
+ksocknal_launch_all_connections_locked (ksock_peer_t *peer)
+{
+ ksock_route_t *route;
+
+ /* called holding write lock on ksnd_global_lock */
+ for (;;) {
+ /* launch any/all connections that need it */
+ route = ksocknal_find_connectable_route_locked(peer);
+ if (route == NULL)
+ return;
+
+ ksocknal_launch_connection_locked(route);
+ }
+}
+
+ksock_conn_t *
+ksocknal_find_conn_locked(ksock_peer_t *peer, ksock_tx_t *tx, int nonblk)
+{
+ struct list_head *tmp;
+ ksock_conn_t *conn;
+ ksock_conn_t *typed = NULL;
+ ksock_conn_t *fallback = NULL;
+ int tnob = 0;
+ int fnob = 0;
+
+ list_for_each (tmp, &peer->ksnp_conns) {
+ ksock_conn_t *c = list_entry(tmp, ksock_conn_t, ksnc_list);
+ int nob = atomic_read(&c->ksnc_tx_nob) +
+ c->ksnc_sock->sk->sk_wmem_queued;
+ int rc;
+
+ LASSERT (!c->ksnc_closing);
+ LASSERT (c->ksnc_proto != NULL &&
+ c->ksnc_proto->pro_match_tx != NULL);
+
+ rc = c->ksnc_proto->pro_match_tx(c, tx, nonblk);
+
+ switch (rc) {
+ default:
+ LBUG();
+ case SOCKNAL_MATCH_NO: /* protocol rejected the tx */
+ continue;
+
+ case SOCKNAL_MATCH_YES: /* typed connection */
+ if (typed == NULL || tnob > nob ||
+ (tnob == nob && *ksocknal_tunables.ksnd_round_robin &&
+ cfs_time_after(typed->ksnc_tx_last_post, c->ksnc_tx_last_post))) {
+ typed = c;
+ tnob = nob;
+ }
+ break;
+
+ case SOCKNAL_MATCH_MAY: /* fallback connection */
+ if (fallback == NULL || fnob > nob ||
+ (fnob == nob && *ksocknal_tunables.ksnd_round_robin &&
+ cfs_time_after(fallback->ksnc_tx_last_post, c->ksnc_tx_last_post))) {
+ fallback = c;
+ fnob = nob;
+ }
+ break;
+ }
+ }
+
+ /* prefer the typed selection */
+ conn = (typed != NULL) ? typed : fallback;
+
+ if (conn != NULL)
+ conn->ksnc_tx_last_post = cfs_time_current();
+
+ return conn;
+}
+
+void
+ksocknal_tx_prep(ksock_conn_t *conn, ksock_tx_t *tx)
+{
+ conn->ksnc_proto->pro_pack(tx);
+
+ atomic_add (tx->tx_nob, &conn->ksnc_tx_nob);
+ ksocknal_conn_addref(conn); /* +1 ref for tx */
+ tx->tx_conn = conn;
+}
+
+void
+ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn)
+{
+ ksock_sched_t *sched = conn->ksnc_scheduler;
+ ksock_msg_t *msg = &tx->tx_msg;
+ ksock_tx_t *ztx = NULL;
+ int bufnob = 0;
+
+ /* called holding global lock (read or irq-write) and caller may
+ * not have dropped this lock between finding conn and calling me,
+ * so we don't need the {get,put}connsock dance to deref
+ * ksnc_sock... */
+ LASSERT(!conn->ksnc_closing);
+
+ CDEBUG(D_NET, "Sending to %s ip %pI4h:%d\n",
+ libcfs_id2str(conn->ksnc_peer->ksnp_id),
+ &conn->ksnc_ipaddr,
+ conn->ksnc_port);
+
+ ksocknal_tx_prep(conn, tx);
+
+ /* Ensure the frags we've been given EXACTLY match the number of
+ * bytes we want to send. Many TCP/IP stacks disregard any total
+ * size parameters passed to them and just look at the frags.
+ *
+ * We always expect at least 1 mapped fragment containing the
+ * complete ksocknal message header. */
+ LASSERT (lnet_iov_nob (tx->tx_niov, tx->tx_iov) +
+ lnet_kiov_nob(tx->tx_nkiov, tx->tx_kiov) ==
+ (unsigned int)tx->tx_nob);
+ LASSERT (tx->tx_niov >= 1);
+ LASSERT (tx->tx_resid == tx->tx_nob);
+
+ CDEBUG (D_NET, "Packet %p type %d, nob %d niov %d nkiov %d\n",
+ tx, (tx->tx_lnetmsg != NULL) ? tx->tx_lnetmsg->msg_hdr.type:
+ KSOCK_MSG_NOOP,
+ tx->tx_nob, tx->tx_niov, tx->tx_nkiov);
+
+ /*
+ * FIXME: SOCK_WMEM_QUEUED and SOCK_ERROR could block in __DARWIN8__
+ * but they're used inside spinlocks a lot.
+ */
+ bufnob = conn->ksnc_sock->sk->sk_wmem_queued;
+ spin_lock_bh(&sched->kss_lock);
+
+ if (list_empty(&conn->ksnc_tx_queue) && bufnob == 0) {
+ /* First packet starts the timeout */
+ conn->ksnc_tx_deadline =
+ cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+ if (conn->ksnc_tx_bufnob > 0) /* something got ACKed */
+ conn->ksnc_peer->ksnp_last_alive = cfs_time_current();
+ conn->ksnc_tx_bufnob = 0;
+ mb(); /* order with adding to tx_queue */
+ }
+
+ if (msg->ksm_type == KSOCK_MSG_NOOP) {
+ /* The packet is noop ZC ACK, try to piggyback the ack_cookie
+ * on a normal packet so I don't need to send it */
+ LASSERT (msg->ksm_zc_cookies[1] != 0);
+ LASSERT (conn->ksnc_proto->pro_queue_tx_zcack != NULL);
+
+ if (conn->ksnc_proto->pro_queue_tx_zcack(conn, tx, 0))
+ ztx = tx; /* ZC ACK piggybacked on ztx release tx later */
+
+ } else {
+ /* It's a normal packet - can it piggback a noop zc-ack that
+ * has been queued already? */
+ LASSERT (msg->ksm_zc_cookies[1] == 0);
+ LASSERT (conn->ksnc_proto->pro_queue_tx_msg != NULL);
+
+ ztx = conn->ksnc_proto->pro_queue_tx_msg(conn, tx);
+ /* ztx will be released later */
+ }
+
+ if (ztx != NULL) {
+ atomic_sub (ztx->tx_nob, &conn->ksnc_tx_nob);
+ list_add_tail(&ztx->tx_list, &sched->kss_zombie_noop_txs);
+ }
+
+ if (conn->ksnc_tx_ready && /* able to send */
+ !conn->ksnc_tx_scheduled) { /* not scheduled to send */
+ /* +1 ref for scheduler */
+ ksocknal_conn_addref(conn);
+ list_add_tail (&conn->ksnc_tx_list,
+ &sched->kss_tx_conns);
+ conn->ksnc_tx_scheduled = 1;
+ wake_up (&sched->kss_waitq);
+ }
+
+ spin_unlock_bh(&sched->kss_lock);
+}
+
+
+ksock_route_t *
+ksocknal_find_connectable_route_locked (ksock_peer_t *peer)
+{
+ unsigned long now = cfs_time_current();
+ struct list_head *tmp;
+ ksock_route_t *route;
+
+ list_for_each (tmp, &peer->ksnp_routes) {
+ route = list_entry (tmp, ksock_route_t, ksnr_list);
+
+ LASSERT (!route->ksnr_connecting || route->ksnr_scheduled);
+
+ if (route->ksnr_scheduled) /* connections being established */
+ continue;
+
+ /* all route types connected ? */
+ if ((ksocknal_route_mask() & ~route->ksnr_connected) == 0)
+ continue;
+
+ if (!(route->ksnr_retry_interval == 0 || /* first attempt */
+ cfs_time_aftereq(now, route->ksnr_timeout))) {
+ CDEBUG(D_NET,
+ "Too soon to retry route %pI4h (cnted %d, interval %ld, %ld secs later)\n",
+ &route->ksnr_ipaddr,
+ route->ksnr_connected,
+ route->ksnr_retry_interval,
+ cfs_duration_sec(route->ksnr_timeout - now));
+ continue;
+ }
+
+ return route;
+ }
+
+ return NULL;
+}
+
+ksock_route_t *
+ksocknal_find_connecting_route_locked (ksock_peer_t *peer)
+{
+ struct list_head *tmp;
+ ksock_route_t *route;
+
+ list_for_each (tmp, &peer->ksnp_routes) {
+ route = list_entry (tmp, ksock_route_t, ksnr_list);
+
+ LASSERT (!route->ksnr_connecting || route->ksnr_scheduled);
+
+ if (route->ksnr_scheduled)
+ return route;
+ }
+
+ return NULL;
+}
+
+int
+ksocknal_launch_packet (lnet_ni_t *ni, ksock_tx_t *tx, lnet_process_id_t id)
+{
+ ksock_peer_t *peer;
+ ksock_conn_t *conn;
+ rwlock_t *g_lock;
+ int retry;
+ int rc;
+
+ LASSERT (tx->tx_conn == NULL);
+
+ g_lock = &ksocknal_data.ksnd_global_lock;
+
+ for (retry = 0;; retry = 1) {
+ read_lock(g_lock);
+ peer = ksocknal_find_peer_locked(ni, id);
+ if (peer != NULL) {
+ if (ksocknal_find_connectable_route_locked(peer) == NULL) {
+ conn = ksocknal_find_conn_locked(peer, tx, tx->tx_nonblk);
+ if (conn != NULL) {
+ /* I've got no routes that need to be
+ * connecting and I do have an actual
+ * connection... */
+ ksocknal_queue_tx_locked (tx, conn);
+ read_unlock(g_lock);
+ return 0;
+ }
+ }
+ }
+
+ /* I'll need a write lock... */
+ read_unlock(g_lock);
+
+ write_lock_bh(g_lock);
+
+ peer = ksocknal_find_peer_locked(ni, id);
+ if (peer != NULL)
+ break;
+
+ write_unlock_bh(g_lock);
+
+ if ((id.pid & LNET_PID_USERFLAG) != 0) {
+ CERROR("Refusing to create a connection to userspace process %s\n",
+ libcfs_id2str(id));
+ return -EHOSTUNREACH;
+ }
+
+ if (retry) {
+ CERROR("Can't find peer %s\n", libcfs_id2str(id));
+ return -EHOSTUNREACH;
+ }
+
+ rc = ksocknal_add_peer(ni, id,
+ LNET_NIDADDR(id.nid),
+ lnet_acceptor_port());
+ if (rc != 0) {
+ CERROR("Can't add peer %s: %d\n",
+ libcfs_id2str(id), rc);
+ return rc;
+ }
+ }
+
+ ksocknal_launch_all_connections_locked(peer);
+
+ conn = ksocknal_find_conn_locked(peer, tx, tx->tx_nonblk);
+ if (conn != NULL) {
+ /* Connection exists; queue message on it */
+ ksocknal_queue_tx_locked (tx, conn);
+ write_unlock_bh(g_lock);
+ return 0;
+ }
+
+ if (peer->ksnp_accepting > 0 ||
+ ksocknal_find_connecting_route_locked (peer) != NULL) {
+ /* the message is going to be pinned to the peer */
+ tx->tx_deadline =
+ cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+
+ /* Queue the message until a connection is established */
+ list_add_tail (&tx->tx_list, &peer->ksnp_tx_queue);
+ write_unlock_bh(g_lock);
+ return 0;
+ }
+
+ write_unlock_bh(g_lock);
+
+ /* NB Routes may be ignored if connections to them failed recently */
+ CNETERR("No usable routes to %s\n", libcfs_id2str(id));
+ return -EHOSTUNREACH;
+}
+
+int
+ksocknal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
+{
+ int mpflag = 1;
+ int type = lntmsg->msg_type;
+ lnet_process_id_t target = lntmsg->msg_target;
+ unsigned int payload_niov = lntmsg->msg_niov;
+ struct kvec *payload_iov = lntmsg->msg_iov;
+ lnet_kiov_t *payload_kiov = lntmsg->msg_kiov;
+ unsigned int payload_offset = lntmsg->msg_offset;
+ unsigned int payload_nob = lntmsg->msg_len;
+ ksock_tx_t *tx;
+ int desc_size;
+ int rc;
+
+ /* NB 'private' is different depending on what we're sending.
+ * Just ignore it... */
+
+ CDEBUG(D_NET, "sending %u bytes in %d frags to %s\n",
+ payload_nob, payload_niov, libcfs_id2str(target));
+
+ LASSERT (payload_nob == 0 || payload_niov > 0);
+ LASSERT (payload_niov <= LNET_MAX_IOV);
+ /* payload is either all vaddrs or all pages */
+ LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
+ LASSERT (!in_interrupt ());
+
+ if (payload_iov != NULL)
+ desc_size = offsetof(ksock_tx_t,
+ tx_frags.virt.iov[1 + payload_niov]);
+ else
+ desc_size = offsetof(ksock_tx_t,
+ tx_frags.paged.kiov[payload_niov]);
+
+ if (lntmsg->msg_vmflush)
+ mpflag = cfs_memory_pressure_get_and_set();
+ tx = ksocknal_alloc_tx(KSOCK_MSG_LNET, desc_size);
+ if (tx == NULL) {
+ CERROR("Can't allocate tx desc type %d size %d\n",
+ type, desc_size);
+ if (lntmsg->msg_vmflush)
+ cfs_memory_pressure_restore(mpflag);
+ return -ENOMEM;
+ }
+
+ tx->tx_conn = NULL; /* set when assigned a conn */
+ tx->tx_lnetmsg = lntmsg;
+
+ if (payload_iov != NULL) {
+ tx->tx_kiov = NULL;
+ tx->tx_nkiov = 0;
+ tx->tx_iov = tx->tx_frags.virt.iov;
+ tx->tx_niov = 1 +
+ lnet_extract_iov(payload_niov, &tx->tx_iov[1],
+ payload_niov, payload_iov,
+ payload_offset, payload_nob);
+ } else {
+ tx->tx_niov = 1;
+ tx->tx_iov = &tx->tx_frags.paged.iov;
+ tx->tx_kiov = tx->tx_frags.paged.kiov;
+ tx->tx_nkiov = lnet_extract_kiov(payload_niov, tx->tx_kiov,
+ payload_niov, payload_kiov,
+ payload_offset, payload_nob);
+
+ if (payload_nob >= *ksocknal_tunables.ksnd_zc_min_payload)
+ tx->tx_zc_capable = 1;
+ }
+
+ socklnd_init_msg(&tx->tx_msg, KSOCK_MSG_LNET);
+
+ /* The first fragment will be set later in pro_pack */
+ rc = ksocknal_launch_packet(ni, tx, target);
+ if (!mpflag)
+ cfs_memory_pressure_restore(mpflag);
+
+ if (rc == 0)
+ return 0;
+
+ ksocknal_free_tx(tx);
+ return -EIO;
+}
+
+int
+ksocknal_thread_start(int (*fn)(void *arg), void *arg, char *name)
+{
+ struct task_struct *task = kthread_run(fn, arg, "%s", name);
+
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+
+ write_lock_bh(&ksocknal_data.ksnd_global_lock);
+ ksocknal_data.ksnd_nthreads++;
+ write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+ return 0;
+}
+
+void
+ksocknal_thread_fini (void)
+{
+ write_lock_bh(&ksocknal_data.ksnd_global_lock);
+ ksocknal_data.ksnd_nthreads--;
+ write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+}
+
+int
+ksocknal_new_packet (ksock_conn_t *conn, int nob_to_skip)
+{
+ static char ksocknal_slop_buffer[4096];
+
+ int nob;
+ unsigned int niov;
+ int skipped;
+
+ LASSERT(conn->ksnc_proto != NULL);
+
+ if ((*ksocknal_tunables.ksnd_eager_ack & conn->ksnc_type) != 0) {
+ /* Remind the socket to ack eagerly... */
+ ksocknal_lib_eager_ack(conn);
+ }
+
+ if (nob_to_skip == 0) { /* right at next packet boundary now */
+ conn->ksnc_rx_started = 0;
+ mb(); /* racing with timeout thread */
+
+ switch (conn->ksnc_proto->pro_version) {
+ case KSOCK_PROTO_V2:
+ case KSOCK_PROTO_V3:
+ conn->ksnc_rx_state = SOCKNAL_RX_KSM_HEADER;
+ conn->ksnc_rx_iov = (struct kvec *)&conn->ksnc_rx_iov_space;
+ conn->ksnc_rx_iov[0].iov_base = &conn->ksnc_msg;
+
+ conn->ksnc_rx_nob_wanted = offsetof(ksock_msg_t, ksm_u);
+ conn->ksnc_rx_nob_left = offsetof(ksock_msg_t, ksm_u);
+ conn->ksnc_rx_iov[0].iov_len = offsetof(ksock_msg_t, ksm_u);
+ break;
+
+ case KSOCK_PROTO_V1:
+ /* Receiving bare lnet_hdr_t */
+ conn->ksnc_rx_state = SOCKNAL_RX_LNET_HEADER;
+ conn->ksnc_rx_nob_wanted = sizeof(lnet_hdr_t);
+ conn->ksnc_rx_nob_left = sizeof(lnet_hdr_t);
+
+ conn->ksnc_rx_iov = (struct kvec *)&conn->ksnc_rx_iov_space;
+ conn->ksnc_rx_iov[0].iov_base = &conn->ksnc_msg.ksm_u.lnetmsg;
+ conn->ksnc_rx_iov[0].iov_len = sizeof (lnet_hdr_t);
+ break;
+
+ default:
+ LBUG ();
+ }
+ conn->ksnc_rx_niov = 1;
+
+ conn->ksnc_rx_kiov = NULL;
+ conn->ksnc_rx_nkiov = 0;
+ conn->ksnc_rx_csum = ~0;
+ return 1;
+ }
+
+ /* Set up to skip as much as possible now. If there's more left
+ * (ran out of iov entries) we'll get called again */
+
+ conn->ksnc_rx_state = SOCKNAL_RX_SLOP;
+ conn->ksnc_rx_nob_left = nob_to_skip;
+ conn->ksnc_rx_iov = (struct kvec *)&conn->ksnc_rx_iov_space;
+ skipped = 0;
+ niov = 0;
+
+ do {
+ nob = min_t(int, nob_to_skip, sizeof(ksocknal_slop_buffer));
+
+ conn->ksnc_rx_iov[niov].iov_base = ksocknal_slop_buffer;
+ conn->ksnc_rx_iov[niov].iov_len = nob;
+ niov++;
+ skipped += nob;
+ nob_to_skip -=nob;
+
+ } while (nob_to_skip != 0 && /* mustn't overflow conn's rx iov */
+ niov < sizeof(conn->ksnc_rx_iov_space) / sizeof (struct iovec));
+
+ conn->ksnc_rx_niov = niov;
+ conn->ksnc_rx_kiov = NULL;
+ conn->ksnc_rx_nkiov = 0;
+ conn->ksnc_rx_nob_wanted = skipped;
+ return 0;
+}
+
+static int
+ksocknal_process_receive (ksock_conn_t *conn)
+{
+ lnet_hdr_t *lhdr;
+ lnet_process_id_t *id;
+ int rc;
+
+ LASSERT (atomic_read(&conn->ksnc_conn_refcount) > 0);
+
+ /* NB: sched lock NOT held */
+ /* SOCKNAL_RX_LNET_HEADER is here for backward compatibility */
+ LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_KSM_HEADER ||
+ conn->ksnc_rx_state == SOCKNAL_RX_LNET_PAYLOAD ||
+ conn->ksnc_rx_state == SOCKNAL_RX_LNET_HEADER ||
+ conn->ksnc_rx_state == SOCKNAL_RX_SLOP);
+ again:
+ if (conn->ksnc_rx_nob_wanted != 0) {
+ rc = ksocknal_receive(conn);
+
+ if (rc <= 0) {
+ LASSERT (rc != -EAGAIN);
+
+ if (rc == 0)
+ CDEBUG(D_NET, "[%p] EOF from %s ip %pI4h:%d\n",
+ conn,
+ libcfs_id2str(conn->ksnc_peer->ksnp_id),
+ &conn->ksnc_ipaddr,
+ conn->ksnc_port);
+ else if (!conn->ksnc_closing)
+ CERROR("[%p] Error %d on read from %s ip %pI4h:%d\n",
+ conn, rc,
+ libcfs_id2str(conn->ksnc_peer->ksnp_id),
+ &conn->ksnc_ipaddr,
+ conn->ksnc_port);
+
+ /* it's not an error if conn is being closed */
+ ksocknal_close_conn_and_siblings (conn,
+ (conn->ksnc_closing) ? 0 : rc);
+ return (rc == 0 ? -ESHUTDOWN : rc);
+ }
+
+ if (conn->ksnc_rx_nob_wanted != 0) {
+ /* short read */
+ return -EAGAIN;
+ }
+ }
+ switch (conn->ksnc_rx_state) {
+ case SOCKNAL_RX_KSM_HEADER:
+ if (conn->ksnc_flip) {
+ __swab32s(&conn->ksnc_msg.ksm_type);
+ __swab32s(&conn->ksnc_msg.ksm_csum);
+ __swab64s(&conn->ksnc_msg.ksm_zc_cookies[0]);
+ __swab64s(&conn->ksnc_msg.ksm_zc_cookies[1]);
+ }
+
+ if (conn->ksnc_msg.ksm_type != KSOCK_MSG_NOOP &&
+ conn->ksnc_msg.ksm_type != KSOCK_MSG_LNET) {
+ CERROR("%s: Unknown message type: %x\n",
+ libcfs_id2str(conn->ksnc_peer->ksnp_id),
+ conn->ksnc_msg.ksm_type);
+ ksocknal_new_packet(conn, 0);
+ ksocknal_close_conn_and_siblings(conn, -EPROTO);
+ return -EPROTO;
+ }
+
+ if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP &&
+ conn->ksnc_msg.ksm_csum != 0 && /* has checksum */
+ conn->ksnc_msg.ksm_csum != conn->ksnc_rx_csum) {
+ /* NOOP Checksum error */
+ CERROR("%s: Checksum error, wire:0x%08X data:0x%08X\n",
+ libcfs_id2str(conn->ksnc_peer->ksnp_id),
+ conn->ksnc_msg.ksm_csum, conn->ksnc_rx_csum);
+ ksocknal_new_packet(conn, 0);
+ ksocknal_close_conn_and_siblings(conn, -EPROTO);
+ return -EIO;
+ }
+
+ if (conn->ksnc_msg.ksm_zc_cookies[1] != 0) {
+ __u64 cookie = 0;
+
+ LASSERT (conn->ksnc_proto != &ksocknal_protocol_v1x);
+
+ if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP)
+ cookie = conn->ksnc_msg.ksm_zc_cookies[0];
+
+ rc = conn->ksnc_proto->pro_handle_zcack(conn, cookie,
+ conn->ksnc_msg.ksm_zc_cookies[1]);
+
+ if (rc != 0) {
+ CERROR("%s: Unknown ZC-ACK cookie: %llu, %llu\n",
+ libcfs_id2str(conn->ksnc_peer->ksnp_id),
+ cookie, conn->ksnc_msg.ksm_zc_cookies[1]);
+ ksocknal_new_packet(conn, 0);
+ ksocknal_close_conn_and_siblings(conn, -EPROTO);
+ return rc;
+ }
+ }
+
+ if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP) {
+ ksocknal_new_packet (conn, 0);
+ return 0; /* NOOP is done and just return */
+ }
+
+ conn->ksnc_rx_state = SOCKNAL_RX_LNET_HEADER;
+ conn->ksnc_rx_nob_wanted = sizeof(ksock_lnet_msg_t);
+ conn->ksnc_rx_nob_left = sizeof(ksock_lnet_msg_t);
+
+ conn->ksnc_rx_iov = (struct kvec *)&conn->ksnc_rx_iov_space;
+ conn->ksnc_rx_iov[0].iov_base = &conn->ksnc_msg.ksm_u.lnetmsg;
+ conn->ksnc_rx_iov[0].iov_len = sizeof(ksock_lnet_msg_t);
+
+ conn->ksnc_rx_niov = 1;
+ conn->ksnc_rx_kiov = NULL;
+ conn->ksnc_rx_nkiov = 0;
+
+ goto again; /* read lnet header now */
+
+ case SOCKNAL_RX_LNET_HEADER:
+ /* unpack message header */
+ conn->ksnc_proto->pro_unpack(&conn->ksnc_msg);
+
+ if ((conn->ksnc_peer->ksnp_id.pid & LNET_PID_USERFLAG) != 0) {
+ /* Userspace peer */
+ lhdr = &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr;
+ id = &conn->ksnc_peer->ksnp_id;
+
+ /* Substitute process ID assigned at connection time */
+ lhdr->src_pid = cpu_to_le32(id->pid);
+ lhdr->src_nid = cpu_to_le64(id->nid);
+ }
+
+ conn->ksnc_rx_state = SOCKNAL_RX_PARSE;
+ ksocknal_conn_addref(conn); /* ++ref while parsing */
+
+ rc = lnet_parse(conn->ksnc_peer->ksnp_ni,
+ &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr,
+ conn->ksnc_peer->ksnp_id.nid, conn, 0);
+ if (rc < 0) {
+ /* I just received garbage: give up on this conn */
+ ksocknal_new_packet(conn, 0);
+ ksocknal_close_conn_and_siblings (conn, rc);
+ ksocknal_conn_decref(conn);
+ return -EPROTO;
+ }
+
+ /* I'm racing with ksocknal_recv() */
+ LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_PARSE ||
+ conn->ksnc_rx_state == SOCKNAL_RX_LNET_PAYLOAD);
+
+ if (conn->ksnc_rx_state != SOCKNAL_RX_LNET_PAYLOAD)
+ return 0;
+
+ /* ksocknal_recv() got called */
+ goto again;
+
+ case SOCKNAL_RX_LNET_PAYLOAD:
+ /* payload all received */
+ rc = 0;
+
+ if (conn->ksnc_rx_nob_left == 0 && /* not truncating */
+ conn->ksnc_msg.ksm_csum != 0 && /* has checksum */
+ conn->ksnc_msg.ksm_csum != conn->ksnc_rx_csum) {
+ CERROR("%s: Checksum error, wire:0x%08X data:0x%08X\n",
+ libcfs_id2str(conn->ksnc_peer->ksnp_id),
+ conn->ksnc_msg.ksm_csum, conn->ksnc_rx_csum);
+ rc = -EIO;
+ }
+
+ if (rc == 0 && conn->ksnc_msg.ksm_zc_cookies[0] != 0) {
+ LASSERT(conn->ksnc_proto != &ksocknal_protocol_v1x);
+
+ lhdr = &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr;
+ id = &conn->ksnc_peer->ksnp_id;
+
+ rc = conn->ksnc_proto->pro_handle_zcreq(conn,
+ conn->ksnc_msg.ksm_zc_cookies[0],
+ *ksocknal_tunables.ksnd_nonblk_zcack ||
+ le64_to_cpu(lhdr->src_nid) != id->nid);
+ }
+
+ lnet_finalize(conn->ksnc_peer->ksnp_ni, conn->ksnc_cookie, rc);
+
+ if (rc != 0) {
+ ksocknal_new_packet(conn, 0);
+ ksocknal_close_conn_and_siblings (conn, rc);
+ return -EPROTO;
+ }
+ /* Fall through */
+
+ case SOCKNAL_RX_SLOP:
+ /* starting new packet? */
+ if (ksocknal_new_packet (conn, conn->ksnc_rx_nob_left))
+ return 0; /* come back later */
+ goto again; /* try to finish reading slop now */
+
+ default:
+ break;
+ }
+
+ /* Not Reached */
+ LBUG ();
+ return -EINVAL; /* keep gcc happy */
+}
+
+int
+ksocknal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed,
+ unsigned int niov, struct kvec *iov, lnet_kiov_t *kiov,
+ unsigned int offset, unsigned int mlen, unsigned int rlen)
+{
+ ksock_conn_t *conn = (ksock_conn_t *)private;
+ ksock_sched_t *sched = conn->ksnc_scheduler;
+
+ LASSERT (mlen <= rlen);
+ LASSERT (niov <= LNET_MAX_IOV);
+
+ conn->ksnc_cookie = msg;
+ conn->ksnc_rx_nob_wanted = mlen;
+ conn->ksnc_rx_nob_left = rlen;
+
+ if (mlen == 0 || iov != NULL) {
+ conn->ksnc_rx_nkiov = 0;
+ conn->ksnc_rx_kiov = NULL;
+ conn->ksnc_rx_iov = conn->ksnc_rx_iov_space.iov;
+ conn->ksnc_rx_niov =
+ lnet_extract_iov(LNET_MAX_IOV, conn->ksnc_rx_iov,
+ niov, iov, offset, mlen);
+ } else {
+ conn->ksnc_rx_niov = 0;
+ conn->ksnc_rx_iov = NULL;
+ conn->ksnc_rx_kiov = conn->ksnc_rx_iov_space.kiov;
+ conn->ksnc_rx_nkiov =
+ lnet_extract_kiov(LNET_MAX_IOV, conn->ksnc_rx_kiov,
+ niov, kiov, offset, mlen);
+ }
+
+ LASSERT (mlen ==
+ lnet_iov_nob (conn->ksnc_rx_niov, conn->ksnc_rx_iov) +
+ lnet_kiov_nob (conn->ksnc_rx_nkiov, conn->ksnc_rx_kiov));
+
+ LASSERT (conn->ksnc_rx_scheduled);
+
+ spin_lock_bh(&sched->kss_lock);
+
+ switch (conn->ksnc_rx_state) {
+ case SOCKNAL_RX_PARSE_WAIT:
+ list_add_tail(&conn->ksnc_rx_list, &sched->kss_rx_conns);
+ wake_up (&sched->kss_waitq);
+ LASSERT (conn->ksnc_rx_ready);
+ break;
+
+ case SOCKNAL_RX_PARSE:
+ /* scheduler hasn't noticed I'm parsing yet */
+ break;
+ }
+
+ conn->ksnc_rx_state = SOCKNAL_RX_LNET_PAYLOAD;
+
+ spin_unlock_bh(&sched->kss_lock);
+ ksocknal_conn_decref(conn);
+ return 0;
+}
+
+static inline int
+ksocknal_sched_cansleep(ksock_sched_t *sched)
+{
+ int rc;
+
+ spin_lock_bh(&sched->kss_lock);
+
+ rc = !ksocknal_data.ksnd_shuttingdown &&
+ list_empty(&sched->kss_rx_conns) &&
+ list_empty(&sched->kss_tx_conns);
+
+ spin_unlock_bh(&sched->kss_lock);
+ return rc;
+}
+
+int ksocknal_scheduler(void *arg)
+{
+ struct ksock_sched_info *info;
+ ksock_sched_t *sched;
+ ksock_conn_t *conn;
+ ksock_tx_t *tx;
+ int rc;
+ int nloops = 0;
+ long id = (long)arg;
+
+ info = ksocknal_data.ksnd_sched_info[KSOCK_THREAD_CPT(id)];
+ sched = &info->ksi_scheds[KSOCK_THREAD_SID(id)];
+
+ cfs_block_allsigs();
+
+ rc = cfs_cpt_bind(lnet_cpt_table(), info->ksi_cpt);
+ if (rc != 0) {
+ CERROR("Can't set CPT affinity to %d: %d\n",
+ info->ksi_cpt, rc);
+ }
+
+ spin_lock_bh(&sched->kss_lock);
+
+ while (!ksocknal_data.ksnd_shuttingdown) {
+ int did_something = 0;
+
+ /* Ensure I progress everything semi-fairly */
+
+ if (!list_empty (&sched->kss_rx_conns)) {
+ conn = list_entry(sched->kss_rx_conns.next,
+ ksock_conn_t, ksnc_rx_list);
+ list_del(&conn->ksnc_rx_list);
+
+ LASSERT(conn->ksnc_rx_scheduled);
+ LASSERT(conn->ksnc_rx_ready);
+
+ /* clear rx_ready in case receive isn't complete.
+ * Do it BEFORE we call process_recv, since
+ * data_ready can set it any time after we release
+ * kss_lock. */
+ conn->ksnc_rx_ready = 0;
+ spin_unlock_bh(&sched->kss_lock);
+
+ rc = ksocknal_process_receive(conn);
+
+ spin_lock_bh(&sched->kss_lock);
+
+ /* I'm the only one that can clear this flag */
+ LASSERT(conn->ksnc_rx_scheduled);
+
+ /* Did process_receive get everything it wanted? */
+ if (rc == 0)
+ conn->ksnc_rx_ready = 1;
+
+ if (conn->ksnc_rx_state == SOCKNAL_RX_PARSE) {
+ /* Conn blocked waiting for ksocknal_recv()
+ * I change its state (under lock) to signal
+ * it can be rescheduled */
+ conn->ksnc_rx_state = SOCKNAL_RX_PARSE_WAIT;
+ } else if (conn->ksnc_rx_ready) {
+ /* reschedule for rx */
+ list_add_tail (&conn->ksnc_rx_list,
+ &sched->kss_rx_conns);
+ } else {
+ conn->ksnc_rx_scheduled = 0;
+ /* drop my ref */
+ ksocknal_conn_decref(conn);
+ }
+
+ did_something = 1;
+ }
+
+ if (!list_empty (&sched->kss_tx_conns)) {
+ LIST_HEAD (zlist);
+
+ if (!list_empty(&sched->kss_zombie_noop_txs)) {
+ list_add(&zlist,
+ &sched->kss_zombie_noop_txs);
+ list_del_init(&sched->kss_zombie_noop_txs);
+ }
+
+ conn = list_entry(sched->kss_tx_conns.next,
+ ksock_conn_t, ksnc_tx_list);
+ list_del (&conn->ksnc_tx_list);
+
+ LASSERT(conn->ksnc_tx_scheduled);
+ LASSERT(conn->ksnc_tx_ready);
+ LASSERT(!list_empty(&conn->ksnc_tx_queue));
+
+ tx = list_entry(conn->ksnc_tx_queue.next,
+ ksock_tx_t, tx_list);
+
+ if (conn->ksnc_tx_carrier == tx)
+ ksocknal_next_tx_carrier(conn);
+
+ /* dequeue now so empty list => more to send */
+ list_del(&tx->tx_list);
+
+ /* Clear tx_ready in case send isn't complete. Do
+ * it BEFORE we call process_transmit, since
+ * write_space can set it any time after we release
+ * kss_lock. */
+ conn->ksnc_tx_ready = 0;
+ spin_unlock_bh(&sched->kss_lock);
+
+ if (!list_empty(&zlist)) {
+ /* free zombie noop txs, it's fast because
+ * noop txs are just put in freelist */
+ ksocknal_txlist_done(NULL, &zlist, 0);
+ }
+
+ rc = ksocknal_process_transmit(conn, tx);
+
+ if (rc == -ENOMEM || rc == -EAGAIN) {
+ /* Incomplete send: replace tx on HEAD of tx_queue */
+ spin_lock_bh(&sched->kss_lock);
+ list_add(&tx->tx_list,
+ &conn->ksnc_tx_queue);
+ } else {
+ /* Complete send; tx -ref */
+ ksocknal_tx_decref(tx);
+
+ spin_lock_bh(&sched->kss_lock);
+ /* assume space for more */
+ conn->ksnc_tx_ready = 1;
+ }
+
+ if (rc == -ENOMEM) {
+ /* Do nothing; after a short timeout, this
+ * conn will be reposted on kss_tx_conns. */
+ } else if (conn->ksnc_tx_ready &&
+ !list_empty (&conn->ksnc_tx_queue)) {
+ /* reschedule for tx */
+ list_add_tail (&conn->ksnc_tx_list,
+ &sched->kss_tx_conns);
+ } else {
+ conn->ksnc_tx_scheduled = 0;
+ /* drop my ref */
+ ksocknal_conn_decref(conn);
+ }
+
+ did_something = 1;
+ }
+ if (!did_something || /* nothing to do */
+ ++nloops == SOCKNAL_RESCHED) { /* hogging CPU? */
+ spin_unlock_bh(&sched->kss_lock);
+
+ nloops = 0;
+
+ if (!did_something) { /* wait for something to do */
+ rc = wait_event_interruptible_exclusive(
+ sched->kss_waitq,
+ !ksocknal_sched_cansleep(sched));
+ LASSERT (rc == 0);
+ } else {
+ cond_resched();
+ }
+
+ spin_lock_bh(&sched->kss_lock);
+ }
+ }
+
+ spin_unlock_bh(&sched->kss_lock);
+ ksocknal_thread_fini();
+ return 0;
+}
+
+/*
+ * Add connection to kss_rx_conns of scheduler
+ * and wakeup the scheduler.
+ */
+void ksocknal_read_callback (ksock_conn_t *conn)
+{
+ ksock_sched_t *sched;
+
+ sched = conn->ksnc_scheduler;
+
+ spin_lock_bh(&sched->kss_lock);
+
+ conn->ksnc_rx_ready = 1;
+
+ if (!conn->ksnc_rx_scheduled) { /* not being progressed */
+ list_add_tail(&conn->ksnc_rx_list,
+ &sched->kss_rx_conns);
+ conn->ksnc_rx_scheduled = 1;
+ /* extra ref for scheduler */
+ ksocknal_conn_addref(conn);
+
+ wake_up (&sched->kss_waitq);
+ }
+ spin_unlock_bh(&sched->kss_lock);
+}
+
+/*
+ * Add connection to kss_tx_conns of scheduler
+ * and wakeup the scheduler.
+ */
+void ksocknal_write_callback (ksock_conn_t *conn)
+{
+ ksock_sched_t *sched;
+
+ sched = conn->ksnc_scheduler;
+
+ spin_lock_bh(&sched->kss_lock);
+
+ conn->ksnc_tx_ready = 1;
+
+ if (!conn->ksnc_tx_scheduled && /* not being progressed */
+ !list_empty(&conn->ksnc_tx_queue)) { /* packets to send */
+ list_add_tail (&conn->ksnc_tx_list,
+ &sched->kss_tx_conns);
+ conn->ksnc_tx_scheduled = 1;
+ /* extra ref for scheduler */
+ ksocknal_conn_addref(conn);
+
+ wake_up (&sched->kss_waitq);
+ }
+
+ spin_unlock_bh(&sched->kss_lock);
+}
+
+static ksock_proto_t *
+ksocknal_parse_proto_version (ksock_hello_msg_t *hello)
+{
+ __u32 version = 0;
+
+ if (hello->kshm_magic == LNET_PROTO_MAGIC)
+ version = hello->kshm_version;
+ else if (hello->kshm_magic == __swab32(LNET_PROTO_MAGIC))
+ version = __swab32(hello->kshm_version);
+
+ if (version != 0) {
+#if SOCKNAL_VERSION_DEBUG
+ if (*ksocknal_tunables.ksnd_protocol == 1)
+ return NULL;
+
+ if (*ksocknal_tunables.ksnd_protocol == 2 &&
+ version == KSOCK_PROTO_V3)
+ return NULL;
+#endif
+ if (version == KSOCK_PROTO_V2)
+ return &ksocknal_protocol_v2x;
+
+ if (version == KSOCK_PROTO_V3)
+ return &ksocknal_protocol_v3x;
+
+ return NULL;
+ }
+
+ if (hello->kshm_magic == le32_to_cpu(LNET_PROTO_TCP_MAGIC)) {
+ lnet_magicversion_t *hmv = (lnet_magicversion_t *)hello;
+
+ CLASSERT (sizeof (lnet_magicversion_t) ==
+ offsetof (ksock_hello_msg_t, kshm_src_nid));
+
+ if (hmv->version_major == cpu_to_le16 (KSOCK_PROTO_V1_MAJOR) &&
+ hmv->version_minor == cpu_to_le16 (KSOCK_PROTO_V1_MINOR))
+ return &ksocknal_protocol_v1x;
+ }
+
+ return NULL;
+}
+
+int
+ksocknal_send_hello (lnet_ni_t *ni, ksock_conn_t *conn,
+ lnet_nid_t peer_nid, ksock_hello_msg_t *hello)
+{
+ /* CAVEAT EMPTOR: this byte flips 'ipaddrs' */
+ ksock_net_t *net = (ksock_net_t *)ni->ni_data;
+
+ LASSERT (hello->kshm_nips <= LNET_MAX_INTERFACES);
+
+ /* rely on caller to hold a ref on socket so it wouldn't disappear */
+ LASSERT (conn->ksnc_proto != NULL);
+
+ hello->kshm_src_nid = ni->ni_nid;
+ hello->kshm_dst_nid = peer_nid;
+ hello->kshm_src_pid = the_lnet.ln_pid;
+
+ hello->kshm_src_incarnation = net->ksnn_incarnation;
+ hello->kshm_ctype = conn->ksnc_type;
+
+ return conn->ksnc_proto->pro_send_hello(conn, hello);
+}
+
+static int
+ksocknal_invert_type(int type)
+{
+ switch (type) {
+ case SOCKLND_CONN_ANY:
+ case SOCKLND_CONN_CONTROL:
+ return type;
+ case SOCKLND_CONN_BULK_IN:
+ return SOCKLND_CONN_BULK_OUT;
+ case SOCKLND_CONN_BULK_OUT:
+ return SOCKLND_CONN_BULK_IN;
+ default:
+ return SOCKLND_CONN_NONE;
+ }
+}
+
+int
+ksocknal_recv_hello (lnet_ni_t *ni, ksock_conn_t *conn,
+ ksock_hello_msg_t *hello, lnet_process_id_t *peerid,
+ __u64 *incarnation)
+{
+ /* Return < 0 fatal error
+ * 0 success
+ * EALREADY lost connection race
+ * EPROTO protocol version mismatch
+ */
+ struct socket *sock = conn->ksnc_sock;
+ int active = (conn->ksnc_proto != NULL);
+ int timeout;
+ int proto_match;
+ int rc;
+ ksock_proto_t *proto;
+ lnet_process_id_t recv_id;
+
+ /* socket type set on active connections - not set on passive */
+ LASSERT (!active == !(conn->ksnc_type != SOCKLND_CONN_NONE));
+
+ timeout = active ? *ksocknal_tunables.ksnd_timeout :
+ lnet_acceptor_timeout();
+
+ rc = libcfs_sock_read(sock, &hello->kshm_magic, sizeof (hello->kshm_magic), timeout);
+ if (rc != 0) {
+ CERROR("Error %d reading HELLO from %pI4h\n",
+ rc, &conn->ksnc_ipaddr);
+ LASSERT (rc < 0);
+ return rc;
+ }
+
+ if (hello->kshm_magic != LNET_PROTO_MAGIC &&
+ hello->kshm_magic != __swab32(LNET_PROTO_MAGIC) &&
+ hello->kshm_magic != le32_to_cpu (LNET_PROTO_TCP_MAGIC)) {
+ /* Unexpected magic! */
+ CERROR("Bad magic(1) %#08x (%#08x expected) from %pI4h\n",
+ __cpu_to_le32 (hello->kshm_magic),
+ LNET_PROTO_TCP_MAGIC,
+ &conn->ksnc_ipaddr);
+ return -EPROTO;
+ }
+
+ rc = libcfs_sock_read(sock, &hello->kshm_version,
+ sizeof(hello->kshm_version), timeout);
+ if (rc != 0) {
+ CERROR("Error %d reading HELLO from %pI4h\n",
+ rc, &conn->ksnc_ipaddr);
+ LASSERT (rc < 0);
+ return rc;
+ }
+
+ proto = ksocknal_parse_proto_version(hello);
+ if (proto == NULL) {
+ if (!active) {
+ /* unknown protocol from peer, tell peer my protocol */
+ conn->ksnc_proto = &ksocknal_protocol_v3x;
+#if SOCKNAL_VERSION_DEBUG
+ if (*ksocknal_tunables.ksnd_protocol == 2)
+ conn->ksnc_proto = &ksocknal_protocol_v2x;
+ else if (*ksocknal_tunables.ksnd_protocol == 1)
+ conn->ksnc_proto = &ksocknal_protocol_v1x;
+#endif
+ hello->kshm_nips = 0;
+ ksocknal_send_hello(ni, conn, ni->ni_nid, hello);
+ }
+
+ CERROR("Unknown protocol version (%d.x expected) from %pI4h\n",
+ conn->ksnc_proto->pro_version,
+ &conn->ksnc_ipaddr);
+
+ return -EPROTO;
+ }
+
+ proto_match = (conn->ksnc_proto == proto);
+ conn->ksnc_proto = proto;
+
+ /* receive the rest of hello message anyway */
+ rc = conn->ksnc_proto->pro_recv_hello(conn, hello, timeout);
+ if (rc != 0) {
+ CERROR("Error %d reading or checking hello from from %pI4h\n",
+ rc, &conn->ksnc_ipaddr);
+ LASSERT (rc < 0);
+ return rc;
+ }
+
+ *incarnation = hello->kshm_src_incarnation;
+
+ if (hello->kshm_src_nid == LNET_NID_ANY) {
+ CERROR("Expecting a HELLO hdr with a NID, but got LNET_NID_ANY from %pI4h\n",
+ &conn->ksnc_ipaddr);
+ return -EPROTO;
+ }
+
+ if (!active &&
+ conn->ksnc_port > LNET_ACCEPTOR_MAX_RESERVED_PORT) {
+ /* Userspace NAL assigns peer process ID from socket */
+ recv_id.pid = conn->ksnc_port | LNET_PID_USERFLAG;
+ recv_id.nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), conn->ksnc_ipaddr);
+ } else {
+ recv_id.nid = hello->kshm_src_nid;
+ recv_id.pid = hello->kshm_src_pid;
+ }
+
+ if (!active) {
+ *peerid = recv_id;
+
+ /* peer determines type */
+ conn->ksnc_type = ksocknal_invert_type(hello->kshm_ctype);
+ if (conn->ksnc_type == SOCKLND_CONN_NONE) {
+ CERROR("Unexpected type %d from %s ip %pI4h\n",
+ hello->kshm_ctype, libcfs_id2str(*peerid),
+ &conn->ksnc_ipaddr);
+ return -EPROTO;
+ }
+
+ return 0;
+ }
+
+ if (peerid->pid != recv_id.pid ||
+ peerid->nid != recv_id.nid) {
+ LCONSOLE_ERROR_MSG(0x130, "Connected successfully to %s on host %pI4h, but they claimed they were %s; please check your Lustre configuration.\n",
+ libcfs_id2str(*peerid),
+ &conn->ksnc_ipaddr,
+ libcfs_id2str(recv_id));
+ return -EPROTO;
+ }
+
+ if (hello->kshm_ctype == SOCKLND_CONN_NONE) {
+ /* Possible protocol mismatch or I lost the connection race */
+ return proto_match ? EALREADY : EPROTO;
+ }
+
+ if (ksocknal_invert_type(hello->kshm_ctype) != conn->ksnc_type) {
+ CERROR("Mismatched types: me %d, %s ip %pI4h %d\n",
+ conn->ksnc_type, libcfs_id2str(*peerid),
+ &conn->ksnc_ipaddr,
+ hello->kshm_ctype);
+ return -EPROTO;
+ }
+
+ return 0;
+}
+
+static int
+ksocknal_connect (ksock_route_t *route)
+{
+ LIST_HEAD (zombies);
+ ksock_peer_t *peer = route->ksnr_peer;
+ int type;
+ int wanted;
+ struct socket *sock;
+ unsigned long deadline;
+ int retry_later = 0;
+ int rc = 0;
+
+ deadline = cfs_time_add(cfs_time_current(),
+ cfs_time_seconds(*ksocknal_tunables.ksnd_timeout));
+
+ write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+ LASSERT (route->ksnr_scheduled);
+ LASSERT (!route->ksnr_connecting);
+
+ route->ksnr_connecting = 1;
+
+ for (;;) {
+ wanted = ksocknal_route_mask() & ~route->ksnr_connected;
+
+ /* stop connecting if peer/route got closed under me, or
+ * route got connected while queued */
+ if (peer->ksnp_closing || route->ksnr_deleted ||
+ wanted == 0) {
+ retry_later = 0;
+ break;
+ }
+
+ /* reschedule if peer is connecting to me */
+ if (peer->ksnp_accepting > 0) {
+ CDEBUG(D_NET,
+ "peer %s(%d) already connecting to me, retry later.\n",
+ libcfs_nid2str(peer->ksnp_id.nid), peer->ksnp_accepting);
+ retry_later = 1;
+ }
+
+ if (retry_later) /* needs reschedule */
+ break;
+
+ if ((wanted & (1 << SOCKLND_CONN_ANY)) != 0) {
+ type = SOCKLND_CONN_ANY;
+ } else if ((wanted & (1 << SOCKLND_CONN_CONTROL)) != 0) {
+ type = SOCKLND_CONN_CONTROL;
+ } else if ((wanted & (1 << SOCKLND_CONN_BULK_IN)) != 0) {
+ type = SOCKLND_CONN_BULK_IN;
+ } else {
+ LASSERT ((wanted & (1 << SOCKLND_CONN_BULK_OUT)) != 0);
+ type = SOCKLND_CONN_BULK_OUT;
+ }
+
+ write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+ if (cfs_time_aftereq(cfs_time_current(), deadline)) {
+ rc = -ETIMEDOUT;
+ lnet_connect_console_error(rc, peer->ksnp_id.nid,
+ route->ksnr_ipaddr,
+ route->ksnr_port);
+ goto failed;
+ }
+
+ rc = lnet_connect(&sock, peer->ksnp_id.nid,
+ route->ksnr_myipaddr,
+ route->ksnr_ipaddr, route->ksnr_port);
+ if (rc != 0)
+ goto failed;
+
+ rc = ksocknal_create_conn(peer->ksnp_ni, route, sock, type);
+ if (rc < 0) {
+ lnet_connect_console_error(rc, peer->ksnp_id.nid,
+ route->ksnr_ipaddr,
+ route->ksnr_port);
+ goto failed;
+ }
+
+ /* A +ve RC means I have to retry because I lost the connection
+ * race or I have to renegotiate protocol version */
+ retry_later = (rc != 0);
+ if (retry_later)
+ CDEBUG(D_NET, "peer %s: conn race, retry later.\n",
+ libcfs_nid2str(peer->ksnp_id.nid));
+
+ write_lock_bh(&ksocknal_data.ksnd_global_lock);
+ }
+
+ route->ksnr_scheduled = 0;
+ route->ksnr_connecting = 0;
+
+ if (retry_later) {
+ /* re-queue for attention; this frees me up to handle
+ * the peer's incoming connection request */
+
+ if (rc == EALREADY ||
+ (rc == 0 && peer->ksnp_accepting > 0)) {
+ /* We want to introduce a delay before next
+ * attempt to connect if we lost conn race,
+ * but the race is resolved quickly usually,
+ * so min_reconnectms should be good heuristic */
+ route->ksnr_retry_interval =
+ cfs_time_seconds(*ksocknal_tunables.ksnd_min_reconnectms)/1000;
+ route->ksnr_timeout = cfs_time_add(cfs_time_current(),
+ route->ksnr_retry_interval);
+ }
+
+ ksocknal_launch_connection_locked(route);
+ }
+
+ write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+ return retry_later;
+
+ failed:
+ write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+ route->ksnr_scheduled = 0;
+ route->ksnr_connecting = 0;
+
+ /* This is a retry rather than a new connection */
+ route->ksnr_retry_interval *= 2;
+ route->ksnr_retry_interval =
+ max(route->ksnr_retry_interval,
+ cfs_time_seconds(*ksocknal_tunables.ksnd_min_reconnectms)/1000);
+ route->ksnr_retry_interval =
+ min(route->ksnr_retry_interval,
+ cfs_time_seconds(*ksocknal_tunables.ksnd_max_reconnectms)/1000);
+
+ LASSERT (route->ksnr_retry_interval != 0);
+ route->ksnr_timeout = cfs_time_add(cfs_time_current(),
+ route->ksnr_retry_interval);
+
+ if (!list_empty(&peer->ksnp_tx_queue) &&
+ peer->ksnp_accepting == 0 &&
+ ksocknal_find_connecting_route_locked(peer) == NULL) {
+ ksock_conn_t *conn;
+
+ /* ksnp_tx_queue is queued on a conn on successful
+ * connection for V1.x and V2.x */
+ if (!list_empty (&peer->ksnp_conns)) {
+ conn = list_entry(peer->ksnp_conns.next,
+ ksock_conn_t, ksnc_list);
+ LASSERT (conn->ksnc_proto == &ksocknal_protocol_v3x);
+ }
+
+ /* take all the blocked packets while I've got the lock and
+ * complete below... */
+ list_splice_init(&peer->ksnp_tx_queue, &zombies);
+ }
+
+#if 0 /* irrelevant with only eager routes */
+ if (!route->ksnr_deleted) {
+ /* make this route least-favourite for re-selection */
+ list_del(&route->ksnr_list);
+ list_add_tail(&route->ksnr_list, &peer->ksnp_routes);
+ }
+#endif
+ write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+ ksocknal_peer_failed(peer);
+ ksocknal_txlist_done(peer->ksnp_ni, &zombies, 1);
+ return 0;
+}
+
+/*
+ * check whether we need to create more connds.
+ * It will try to create new thread if it's necessary, @timeout can
+ * be updated if failed to create, so caller wouldn't keep try while
+ * running out of resource.
+ */
+static int
+ksocknal_connd_check_start(long sec, long *timeout)
+{
+ char name[16];
+ int rc;
+ int total = ksocknal_data.ksnd_connd_starting +
+ ksocknal_data.ksnd_connd_running;
+
+ if (unlikely(ksocknal_data.ksnd_init < SOCKNAL_INIT_ALL)) {
+ /* still in initializing */
+ return 0;
+ }
+
+ if (total >= *ksocknal_tunables.ksnd_nconnds_max ||
+ total > ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV) {
+ /* can't create more connd, or still have enough
+ * threads to handle more connecting */
+ return 0;
+ }
+
+ if (list_empty(&ksocknal_data.ksnd_connd_routes)) {
+ /* no pending connecting request */
+ return 0;
+ }
+
+ if (sec - ksocknal_data.ksnd_connd_failed_stamp <= 1) {
+ /* may run out of resource, retry later */
+ *timeout = cfs_time_seconds(1);
+ return 0;
+ }
+
+ if (ksocknal_data.ksnd_connd_starting > 0) {
+ /* serialize starting to avoid flood */
+ return 0;
+ }
+
+ ksocknal_data.ksnd_connd_starting_stamp = sec;
+ ksocknal_data.ksnd_connd_starting++;
+ spin_unlock_bh(&ksocknal_data.ksnd_connd_lock);
+
+ /* NB: total is the next id */
+ snprintf(name, sizeof(name), "socknal_cd%02d", total);
+ rc = ksocknal_thread_start(ksocknal_connd, NULL, name);
+
+ spin_lock_bh(&ksocknal_data.ksnd_connd_lock);
+ if (rc == 0)
+ return 1;
+
+ /* we tried ... */
+ LASSERT(ksocknal_data.ksnd_connd_starting > 0);
+ ksocknal_data.ksnd_connd_starting--;
+ ksocknal_data.ksnd_connd_failed_stamp = get_seconds();
+
+ return 1;
+}
+
+/*
+ * check whether current thread can exit, it will return 1 if there are too
+ * many threads and no creating in past 120 seconds.
+ * Also, this function may update @timeout to make caller come back
+ * again to recheck these conditions.
+ */
+static int
+ksocknal_connd_check_stop(long sec, long *timeout)
+{
+ int val;
+
+ if (unlikely(ksocknal_data.ksnd_init < SOCKNAL_INIT_ALL)) {
+ /* still in initializing */
+ return 0;
+ }
+
+ if (ksocknal_data.ksnd_connd_starting > 0) {
+ /* in progress of starting new thread */
+ return 0;
+ }
+
+ if (ksocknal_data.ksnd_connd_running <=
+ *ksocknal_tunables.ksnd_nconnds) { /* can't shrink */
+ return 0;
+ }
+
+ /* created thread in past 120 seconds? */
+ val = (int)(ksocknal_data.ksnd_connd_starting_stamp +
+ SOCKNAL_CONND_TIMEOUT - sec);
+
+ *timeout = (val > 0) ? cfs_time_seconds(val) :
+ cfs_time_seconds(SOCKNAL_CONND_TIMEOUT);
+ if (val > 0)
+ return 0;
+
+ /* no creating in past 120 seconds */
+
+ return ksocknal_data.ksnd_connd_running >
+ ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV;
+}
+
+/* Go through connd_routes queue looking for a route that we can process
+ * right now, @timeout_p can be updated if we need to come back later */
+static ksock_route_t *
+ksocknal_connd_get_route_locked(signed long *timeout_p)
+{
+ ksock_route_t *route;
+ unsigned long now;
+
+ now = cfs_time_current();
+
+ /* connd_routes can contain both pending and ordinary routes */
+ list_for_each_entry (route, &ksocknal_data.ksnd_connd_routes,
+ ksnr_connd_list) {
+
+ if (route->ksnr_retry_interval == 0 ||
+ cfs_time_aftereq(now, route->ksnr_timeout))
+ return route;
+
+ if (*timeout_p == MAX_SCHEDULE_TIMEOUT ||
+ (int)*timeout_p > (int)(route->ksnr_timeout - now))
+ *timeout_p = (int)(route->ksnr_timeout - now);
+ }
+
+ return NULL;
+}
+
+int
+ksocknal_connd (void *arg)
+{
+ spinlock_t *connd_lock = &ksocknal_data.ksnd_connd_lock;
+ ksock_connreq_t *cr;
+ wait_queue_t wait;
+ int nloops = 0;
+ int cons_retry = 0;
+
+ cfs_block_allsigs ();
+
+ init_waitqueue_entry(&wait, current);
+
+ spin_lock_bh(connd_lock);
+
+ LASSERT(ksocknal_data.ksnd_connd_starting > 0);
+ ksocknal_data.ksnd_connd_starting--;
+ ksocknal_data.ksnd_connd_running++;
+
+ while (!ksocknal_data.ksnd_shuttingdown) {
+ ksock_route_t *route = NULL;
+ long sec = get_seconds();
+ long timeout = MAX_SCHEDULE_TIMEOUT;
+ int dropped_lock = 0;
+
+ if (ksocknal_connd_check_stop(sec, &timeout)) {
+ /* wakeup another one to check stop */
+ wake_up(&ksocknal_data.ksnd_connd_waitq);
+ break;
+ }
+
+ if (ksocknal_connd_check_start(sec, &timeout)) {
+ /* created new thread */
+ dropped_lock = 1;
+ }
+
+ if (!list_empty(&ksocknal_data.ksnd_connd_connreqs)) {
+ /* Connection accepted by the listener */
+ cr = list_entry(ksocknal_data.ksnd_connd_connreqs. \
+ next, ksock_connreq_t, ksncr_list);
+
+ list_del(&cr->ksncr_list);
+ spin_unlock_bh(connd_lock);
+ dropped_lock = 1;
+
+ ksocknal_create_conn(cr->ksncr_ni, NULL,
+ cr->ksncr_sock, SOCKLND_CONN_NONE);
+ lnet_ni_decref(cr->ksncr_ni);
+ LIBCFS_FREE(cr, sizeof(*cr));
+
+ spin_lock_bh(connd_lock);
+ }
+
+ /* Only handle an outgoing connection request if there
+ * is a thread left to handle incoming connections and
+ * create new connd */
+ if (ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV <
+ ksocknal_data.ksnd_connd_running) {
+ route = ksocknal_connd_get_route_locked(&timeout);
+ }
+ if (route != NULL) {
+ list_del (&route->ksnr_connd_list);
+ ksocknal_data.ksnd_connd_connecting++;
+ spin_unlock_bh(connd_lock);
+ dropped_lock = 1;
+
+ if (ksocknal_connect(route)) {
+ /* consecutive retry */
+ if (cons_retry++ > SOCKNAL_INSANITY_RECONN) {
+ CWARN("massive consecutive re-connecting to %pI4h\n",
+ &route->ksnr_ipaddr);
+ cons_retry = 0;
+ }
+ } else {
+ cons_retry = 0;
+ }
+
+ ksocknal_route_decref(route);
+
+ spin_lock_bh(connd_lock);
+ ksocknal_data.ksnd_connd_connecting--;
+ }
+
+ if (dropped_lock) {
+ if (++nloops < SOCKNAL_RESCHED)
+ continue;
+ spin_unlock_bh(connd_lock);
+ nloops = 0;
+ cond_resched();
+ spin_lock_bh(connd_lock);
+ continue;
+ }
+
+ /* Nothing to do for 'timeout' */
+ set_current_state(TASK_INTERRUPTIBLE);
+ add_wait_queue_exclusive(&ksocknal_data.ksnd_connd_waitq, &wait);
+ spin_unlock_bh(connd_lock);
+
+ nloops = 0;
+ schedule_timeout(timeout);
+
+ remove_wait_queue(&ksocknal_data.ksnd_connd_waitq, &wait);
+ spin_lock_bh(connd_lock);
+ }
+ ksocknal_data.ksnd_connd_running--;
+ spin_unlock_bh(connd_lock);
+
+ ksocknal_thread_fini();
+ return 0;
+}
+
+static ksock_conn_t *
+ksocknal_find_timed_out_conn (ksock_peer_t *peer)
+{
+ /* We're called with a shared lock on ksnd_global_lock */
+ ksock_conn_t *conn;
+ struct list_head *ctmp;
+
+ list_for_each (ctmp, &peer->ksnp_conns) {
+ int error;
+ conn = list_entry (ctmp, ksock_conn_t, ksnc_list);
+
+ /* Don't need the {get,put}connsock dance to deref ksnc_sock */
+ LASSERT (!conn->ksnc_closing);
+
+ /* SOCK_ERROR will reset error code of socket in
+ * some platform (like Darwin8.x) */
+ error = conn->ksnc_sock->sk->sk_err;
+ if (error != 0) {
+ ksocknal_conn_addref(conn);
+
+ switch (error) {
+ case ECONNRESET:
+ CNETERR("A connection with %s (%pI4h:%d) was reset; it may have rebooted.\n",
+ libcfs_id2str(peer->ksnp_id),
+ &conn->ksnc_ipaddr,
+ conn->ksnc_port);
+ break;
+ case ETIMEDOUT:
+ CNETERR("A connection with %s (%pI4h:%d) timed out; the network or node may be down.\n",
+ libcfs_id2str(peer->ksnp_id),
+ &conn->ksnc_ipaddr,
+ conn->ksnc_port);
+ break;
+ default:
+ CNETERR("An unexpected network error %d occurred with %s (%pI4h:%d\n",
+ error,
+ libcfs_id2str(peer->ksnp_id),
+ &conn->ksnc_ipaddr,
+ conn->ksnc_port);
+ break;
+ }
+
+ return conn;
+ }
+
+ if (conn->ksnc_rx_started &&
+ cfs_time_aftereq(cfs_time_current(),
+ conn->ksnc_rx_deadline)) {
+ /* Timed out incomplete incoming message */
+ ksocknal_conn_addref(conn);
+ CNETERR("Timeout receiving from %s (%pI4h:%d), state %d wanted %d left %d\n",
+ libcfs_id2str(peer->ksnp_id),
+ &conn->ksnc_ipaddr,
+ conn->ksnc_port,
+ conn->ksnc_rx_state,
+ conn->ksnc_rx_nob_wanted,
+ conn->ksnc_rx_nob_left);
+ return conn;
+ }
+
+ if ((!list_empty(&conn->ksnc_tx_queue) ||
+ conn->ksnc_sock->sk->sk_wmem_queued != 0) &&
+ cfs_time_aftereq(cfs_time_current(),
+ conn->ksnc_tx_deadline)) {
+ /* Timed out messages queued for sending or
+ * buffered in the socket's send buffer */
+ ksocknal_conn_addref(conn);
+ CNETERR("Timeout sending data to %s (%pI4h:%d) the network or that node may be down.\n",
+ libcfs_id2str(peer->ksnp_id),
+ &conn->ksnc_ipaddr,
+ conn->ksnc_port);
+ return conn;
+ }
+ }
+
+ return NULL;
+}
+
+static inline void
+ksocknal_flush_stale_txs(ksock_peer_t *peer)
+{
+ ksock_tx_t *tx;
+ LIST_HEAD (stale_txs);
+
+ write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+ while (!list_empty (&peer->ksnp_tx_queue)) {
+ tx = list_entry (peer->ksnp_tx_queue.next,
+ ksock_tx_t, tx_list);
+
+ if (!cfs_time_aftereq(cfs_time_current(),
+ tx->tx_deadline))
+ break;
+
+ list_del (&tx->tx_list);
+ list_add_tail (&tx->tx_list, &stale_txs);
+ }
+
+ write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+ ksocknal_txlist_done(peer->ksnp_ni, &stale_txs, 1);
+}
+
+static int
+ksocknal_send_keepalive_locked(ksock_peer_t *peer)
+{
+ ksock_sched_t *sched;
+ ksock_conn_t *conn;
+ ksock_tx_t *tx;
+
+ if (list_empty(&peer->ksnp_conns)) /* last_alive will be updated by create_conn */
+ return 0;
+
+ if (peer->ksnp_proto != &ksocknal_protocol_v3x)
+ return 0;
+
+ if (*ksocknal_tunables.ksnd_keepalive <= 0 ||
+ time_before(cfs_time_current(),
+ cfs_time_add(peer->ksnp_last_alive,
+ cfs_time_seconds(*ksocknal_tunables.ksnd_keepalive))))
+ return 0;
+
+ if (time_before(cfs_time_current(), peer->ksnp_send_keepalive))
+ return 0;
+
+ /* retry 10 secs later, so we wouldn't put pressure
+ * on this peer if we failed to send keepalive this time */
+ peer->ksnp_send_keepalive = cfs_time_shift(10);
+
+ conn = ksocknal_find_conn_locked(peer, NULL, 1);
+ if (conn != NULL) {
+ sched = conn->ksnc_scheduler;
+
+ spin_lock_bh(&sched->kss_lock);
+ if (!list_empty(&conn->ksnc_tx_queue)) {
+ spin_unlock_bh(&sched->kss_lock);
+ /* there is an queued ACK, don't need keepalive */
+ return 0;
+ }
+
+ spin_unlock_bh(&sched->kss_lock);
+ }
+
+ read_unlock(&ksocknal_data.ksnd_global_lock);
+
+ /* cookie = 1 is reserved for keepalive PING */
+ tx = ksocknal_alloc_tx_noop(1, 1);
+ if (tx == NULL) {
+ read_lock(&ksocknal_data.ksnd_global_lock);
+ return -ENOMEM;
+ }
+
+ if (ksocknal_launch_packet(peer->ksnp_ni, tx, peer->ksnp_id) == 0) {
+ read_lock(&ksocknal_data.ksnd_global_lock);
+ return 1;
+ }
+
+ ksocknal_free_tx(tx);
+ read_lock(&ksocknal_data.ksnd_global_lock);
+
+ return -EIO;
+}
+
+
+static void
+ksocknal_check_peer_timeouts (int idx)
+{
+ struct list_head *peers = &ksocknal_data.ksnd_peers[idx];
+ ksock_peer_t *peer;
+ ksock_conn_t *conn;
+ ksock_tx_t *tx;
+
+ again:
+ /* NB. We expect to have a look at all the peers and not find any
+ * connections to time out, so we just use a shared lock while we
+ * take a look... */
+ read_lock(&ksocknal_data.ksnd_global_lock);
+
+ list_for_each_entry(peer, peers, ksnp_list) {
+ unsigned long deadline = 0;
+ int resid = 0;
+ int n = 0;
+
+ if (ksocknal_send_keepalive_locked(peer) != 0) {
+ read_unlock(&ksocknal_data.ksnd_global_lock);
+ goto again;
+ }
+
+ conn = ksocknal_find_timed_out_conn (peer);
+
+ if (conn != NULL) {
+ read_unlock(&ksocknal_data.ksnd_global_lock);
+
+ ksocknal_close_conn_and_siblings (conn, -ETIMEDOUT);
+
+ /* NB we won't find this one again, but we can't
+ * just proceed with the next peer, since we dropped
+ * ksnd_global_lock and it might be dead already! */
+ ksocknal_conn_decref(conn);
+ goto again;
+ }
+
+ /* we can't process stale txs right here because we're
+ * holding only shared lock */
+ if (!list_empty (&peer->ksnp_tx_queue)) {
+ ksock_tx_t *tx =
+ list_entry (peer->ksnp_tx_queue.next,
+ ksock_tx_t, tx_list);
+
+ if (cfs_time_aftereq(cfs_time_current(),
+ tx->tx_deadline)) {
+
+ ksocknal_peer_addref(peer);
+ read_unlock(&ksocknal_data.ksnd_global_lock);
+
+ ksocknal_flush_stale_txs(peer);
+
+ ksocknal_peer_decref(peer);
+ goto again;
+ }
+ }
+
+ if (list_empty(&peer->ksnp_zc_req_list))
+ continue;
+
+ spin_lock(&peer->ksnp_lock);
+ list_for_each_entry(tx, &peer->ksnp_zc_req_list, tx_zc_list) {
+ if (!cfs_time_aftereq(cfs_time_current(),
+ tx->tx_deadline))
+ break;
+ /* ignore the TX if connection is being closed */
+ if (tx->tx_conn->ksnc_closing)
+ continue;
+ n++;
+ }
+
+ if (n == 0) {
+ spin_unlock(&peer->ksnp_lock);
+ continue;
+ }
+
+ tx = list_entry(peer->ksnp_zc_req_list.next,
+ ksock_tx_t, tx_zc_list);
+ deadline = tx->tx_deadline;
+ resid = tx->tx_resid;
+ conn = tx->tx_conn;
+ ksocknal_conn_addref(conn);
+
+ spin_unlock(&peer->ksnp_lock);
+ read_unlock(&ksocknal_data.ksnd_global_lock);
+
+ CERROR("Total %d stale ZC_REQs for peer %s detected; the oldest(%p) timed out %ld secs ago, resid: %d, wmem: %d\n",
+ n, libcfs_nid2str(peer->ksnp_id.nid), tx,
+ cfs_duration_sec(cfs_time_current() - deadline),
+ resid, conn->ksnc_sock->sk->sk_wmem_queued);
+
+ ksocknal_close_conn_and_siblings (conn, -ETIMEDOUT);
+ ksocknal_conn_decref(conn);
+ goto again;
+ }
+
+ read_unlock(&ksocknal_data.ksnd_global_lock);
+}
+
+int
+ksocknal_reaper (void *arg)
+{
+ wait_queue_t wait;
+ ksock_conn_t *conn;
+ ksock_sched_t *sched;
+ struct list_head enomem_conns;
+ int nenomem_conns;
+ long timeout;
+ int i;
+ int peer_index = 0;
+ unsigned long deadline = cfs_time_current();
+
+ cfs_block_allsigs ();
+
+ INIT_LIST_HEAD(&enomem_conns);
+ init_waitqueue_entry(&wait, current);
+
+ spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+ while (!ksocknal_data.ksnd_shuttingdown) {
+
+ if (!list_empty (&ksocknal_data.ksnd_deathrow_conns)) {
+ conn = list_entry (ksocknal_data. \
+ ksnd_deathrow_conns.next,
+ ksock_conn_t, ksnc_list);
+ list_del (&conn->ksnc_list);
+
+ spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+ ksocknal_terminate_conn(conn);
+ ksocknal_conn_decref(conn);
+
+ spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
+ continue;
+ }
+
+ if (!list_empty (&ksocknal_data.ksnd_zombie_conns)) {
+ conn = list_entry (ksocknal_data.ksnd_zombie_conns.\
+ next, ksock_conn_t, ksnc_list);
+ list_del (&conn->ksnc_list);
+
+ spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+ ksocknal_destroy_conn(conn);
+
+ spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
+ continue;
+ }
+
+ if (!list_empty (&ksocknal_data.ksnd_enomem_conns)) {
+ list_add(&enomem_conns,
+ &ksocknal_data.ksnd_enomem_conns);
+ list_del_init(&ksocknal_data.ksnd_enomem_conns);
+ }
+
+ spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+ /* reschedule all the connections that stalled with ENOMEM... */
+ nenomem_conns = 0;
+ while (!list_empty (&enomem_conns)) {
+ conn = list_entry (enomem_conns.next,
+ ksock_conn_t, ksnc_tx_list);
+ list_del (&conn->ksnc_tx_list);
+
+ sched = conn->ksnc_scheduler;
+
+ spin_lock_bh(&sched->kss_lock);
+
+ LASSERT(conn->ksnc_tx_scheduled);
+ conn->ksnc_tx_ready = 1;
+ list_add_tail(&conn->ksnc_tx_list,
+ &sched->kss_tx_conns);
+ wake_up(&sched->kss_waitq);
+
+ spin_unlock_bh(&sched->kss_lock);
+ nenomem_conns++;
+ }
+
+ /* careful with the jiffy wrap... */
+ while ((timeout = cfs_time_sub(deadline,
+ cfs_time_current())) <= 0) {
+ const int n = 4;
+ const int p = 1;
+ int chunk = ksocknal_data.ksnd_peer_hash_size;
+
+ /* Time to check for timeouts on a few more peers: I do
+ * checks every 'p' seconds on a proportion of the peer
+ * table and I need to check every connection 'n' times
+ * within a timeout interval, to ensure I detect a
+ * timeout on any connection within (n+1)/n times the
+ * timeout interval. */
+
+ if (*ksocknal_tunables.ksnd_timeout > n * p)
+ chunk = (chunk * n * p) /
+ *ksocknal_tunables.ksnd_timeout;
+ if (chunk == 0)
+ chunk = 1;
+
+ for (i = 0; i < chunk; i++) {
+ ksocknal_check_peer_timeouts (peer_index);
+ peer_index = (peer_index + 1) %
+ ksocknal_data.ksnd_peer_hash_size;
+ }
+
+ deadline = cfs_time_add(deadline, cfs_time_seconds(p));
+ }
+
+ if (nenomem_conns != 0) {
+ /* Reduce my timeout if I rescheduled ENOMEM conns.
+ * This also prevents me getting woken immediately
+ * if any go back on my enomem list. */
+ timeout = SOCKNAL_ENOMEM_RETRY;
+ }
+ ksocknal_data.ksnd_reaper_waketime =
+ cfs_time_add(cfs_time_current(), timeout);
+
+ set_current_state (TASK_INTERRUPTIBLE);
+ add_wait_queue (&ksocknal_data.ksnd_reaper_waitq, &wait);
+
+ if (!ksocknal_data.ksnd_shuttingdown &&
+ list_empty (&ksocknal_data.ksnd_deathrow_conns) &&
+ list_empty (&ksocknal_data.ksnd_zombie_conns))
+ schedule_timeout(timeout);
+
+ set_current_state (TASK_RUNNING);
+ remove_wait_queue (&ksocknal_data.ksnd_reaper_waitq, &wait);
+
+ spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
+ }
+
+ spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+ ksocknal_thread_fini();
+ return 0;
+}
diff --git a/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c b/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c
new file mode 100644
index 000000000..f5e8ab060
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c
@@ -0,0 +1,714 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#include "socklnd.h"
+
+int
+ksocknal_lib_get_conn_addrs(ksock_conn_t *conn)
+{
+ int rc = libcfs_sock_getaddr(conn->ksnc_sock, 1,
+ &conn->ksnc_ipaddr,
+ &conn->ksnc_port);
+
+ /* Didn't need the {get,put}connsock dance to deref ksnc_sock... */
+ LASSERT(!conn->ksnc_closing);
+
+ if (rc != 0) {
+ CERROR("Error %d getting sock peer IP\n", rc);
+ return rc;
+ }
+
+ rc = libcfs_sock_getaddr(conn->ksnc_sock, 0,
+ &conn->ksnc_myipaddr, NULL);
+ if (rc != 0) {
+ CERROR("Error %d getting sock local IP\n", rc);
+ return rc;
+ }
+
+ return 0;
+}
+
+int
+ksocknal_lib_zc_capable(ksock_conn_t *conn)
+{
+ int caps = conn->ksnc_sock->sk->sk_route_caps;
+
+ if (conn->ksnc_proto == &ksocknal_protocol_v1x)
+ return 0;
+
+ /* ZC if the socket supports scatter/gather and doesn't need software
+ * checksums */
+ return ((caps & NETIF_F_SG) != 0 && (caps & NETIF_F_ALL_CSUM) != 0);
+}
+
+int
+ksocknal_lib_send_iov(ksock_conn_t *conn, ksock_tx_t *tx)
+{
+ struct socket *sock = conn->ksnc_sock;
+ int nob;
+ int rc;
+
+ if (*ksocknal_tunables.ksnd_enable_csum && /* checksum enabled */
+ conn->ksnc_proto == &ksocknal_protocol_v2x && /* V2.x connection */
+ tx->tx_nob == tx->tx_resid && /* frist sending */
+ tx->tx_msg.ksm_csum == 0) /* not checksummed */
+ ksocknal_lib_csum_tx(tx);
+
+ /* NB we can't trust socket ops to either consume our iovs
+ * or leave them alone. */
+
+ {
+#if SOCKNAL_SINGLE_FRAG_TX
+ struct kvec scratch;
+ struct kvec *scratchiov = &scratch;
+ unsigned int niov = 1;
+#else
+ struct kvec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
+ unsigned int niov = tx->tx_niov;
+#endif
+ struct msghdr msg = {.msg_flags = MSG_DONTWAIT};
+ int i;
+
+ for (nob = i = 0; i < niov; i++) {
+ scratchiov[i] = tx->tx_iov[i];
+ nob += scratchiov[i].iov_len;
+ }
+
+ if (!list_empty(&conn->ksnc_tx_queue) ||
+ nob < tx->tx_resid)
+ msg.msg_flags |= MSG_MORE;
+
+ rc = kernel_sendmsg(sock, &msg, scratchiov, niov, nob);
+ }
+ return rc;
+}
+
+int
+ksocknal_lib_send_kiov(ksock_conn_t *conn, ksock_tx_t *tx)
+{
+ struct socket *sock = conn->ksnc_sock;
+ lnet_kiov_t *kiov = tx->tx_kiov;
+ int rc;
+ int nob;
+
+ /* Not NOOP message */
+ LASSERT(tx->tx_lnetmsg != NULL);
+
+ /* NB we can't trust socket ops to either consume our iovs
+ * or leave them alone. */
+ if (tx->tx_msg.ksm_zc_cookies[0] != 0) {
+ /* Zero copy is enabled */
+ struct sock *sk = sock->sk;
+ struct page *page = kiov->kiov_page;
+ int offset = kiov->kiov_offset;
+ int fragsize = kiov->kiov_len;
+ int msgflg = MSG_DONTWAIT;
+
+ CDEBUG(D_NET, "page %p + offset %x for %d\n",
+ page, offset, kiov->kiov_len);
+
+ if (!list_empty(&conn->ksnc_tx_queue) ||
+ fragsize < tx->tx_resid)
+ msgflg |= MSG_MORE;
+
+ if (sk->sk_prot->sendpage != NULL) {
+ rc = sk->sk_prot->sendpage(sk, page,
+ offset, fragsize, msgflg);
+ } else {
+ rc = cfs_tcp_sendpage(sk, page, offset, fragsize,
+ msgflg);
+ }
+ } else {
+#if SOCKNAL_SINGLE_FRAG_TX || !SOCKNAL_RISK_KMAP_DEADLOCK
+ struct kvec scratch;
+ struct kvec *scratchiov = &scratch;
+ unsigned int niov = 1;
+#else
+#ifdef CONFIG_HIGHMEM
+#warning "XXX risk of kmap deadlock on multiple frags..."
+#endif
+ struct kvec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
+ unsigned int niov = tx->tx_nkiov;
+#endif
+ struct msghdr msg = {.msg_flags = MSG_DONTWAIT};
+ int i;
+
+ for (nob = i = 0; i < niov; i++) {
+ scratchiov[i].iov_base = kmap(kiov[i].kiov_page) +
+ kiov[i].kiov_offset;
+ nob += scratchiov[i].iov_len = kiov[i].kiov_len;
+ }
+
+ if (!list_empty(&conn->ksnc_tx_queue) ||
+ nob < tx->tx_resid)
+ msg.msg_flags |= MSG_MORE;
+
+ rc = kernel_sendmsg(sock, &msg, (struct kvec *)scratchiov, niov, nob);
+
+ for (i = 0; i < niov; i++)
+ kunmap(kiov[i].kiov_page);
+ }
+ return rc;
+}
+
+void
+ksocknal_lib_eager_ack(ksock_conn_t *conn)
+{
+ int opt = 1;
+ struct socket *sock = conn->ksnc_sock;
+
+ /* Remind the socket to ACK eagerly. If I don't, the socket might
+ * think I'm about to send something it could piggy-back the ACK
+ * on, introducing delay in completing zero-copy sends in my
+ * peer. */
+
+ kernel_setsockopt(sock, SOL_TCP, TCP_QUICKACK,
+ (char *)&opt, sizeof(opt));
+}
+
+int
+ksocknal_lib_recv_iov(ksock_conn_t *conn)
+{
+#if SOCKNAL_SINGLE_FRAG_RX
+ struct kvec scratch;
+ struct kvec *scratchiov = &scratch;
+ unsigned int niov = 1;
+#else
+ struct kvec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
+ unsigned int niov = conn->ksnc_rx_niov;
+#endif
+ struct kvec *iov = conn->ksnc_rx_iov;
+ struct msghdr msg = {
+ .msg_flags = 0
+ };
+ int nob;
+ int i;
+ int rc;
+ int fragnob;
+ int sum;
+ __u32 saved_csum;
+
+ /* NB we can't trust socket ops to either consume our iovs
+ * or leave them alone. */
+ LASSERT(niov > 0);
+
+ for (nob = i = 0; i < niov; i++) {
+ scratchiov[i] = iov[i];
+ nob += scratchiov[i].iov_len;
+ }
+ LASSERT(nob <= conn->ksnc_rx_nob_wanted);
+
+ rc = kernel_recvmsg(conn->ksnc_sock, &msg,
+ scratchiov, niov, nob, MSG_DONTWAIT);
+
+ saved_csum = 0;
+ if (conn->ksnc_proto == &ksocknal_protocol_v2x) {
+ saved_csum = conn->ksnc_msg.ksm_csum;
+ conn->ksnc_msg.ksm_csum = 0;
+ }
+
+ if (saved_csum != 0) {
+ /* accumulate checksum */
+ for (i = 0, sum = rc; sum > 0; i++, sum -= fragnob) {
+ LASSERT(i < niov);
+
+ fragnob = iov[i].iov_len;
+ if (fragnob > sum)
+ fragnob = sum;
+
+ conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum,
+ iov[i].iov_base, fragnob);
+ }
+ conn->ksnc_msg.ksm_csum = saved_csum;
+ }
+
+ return rc;
+}
+
+static void
+ksocknal_lib_kiov_vunmap(void *addr)
+{
+ if (addr == NULL)
+ return;
+
+ vunmap(addr);
+}
+
+static void *
+ksocknal_lib_kiov_vmap(lnet_kiov_t *kiov, int niov,
+ struct kvec *iov, struct page **pages)
+{
+ void *addr;
+ int nob;
+ int i;
+
+ if (!*ksocknal_tunables.ksnd_zc_recv || pages == NULL)
+ return NULL;
+
+ LASSERT(niov <= LNET_MAX_IOV);
+
+ if (niov < 2 ||
+ niov < *ksocknal_tunables.ksnd_zc_recv_min_nfrags)
+ return NULL;
+
+ for (nob = i = 0; i < niov; i++) {
+ if ((kiov[i].kiov_offset != 0 && i > 0) ||
+ (kiov[i].kiov_offset + kiov[i].kiov_len != PAGE_CACHE_SIZE && i < niov - 1))
+ return NULL;
+
+ pages[i] = kiov[i].kiov_page;
+ nob += kiov[i].kiov_len;
+ }
+
+ addr = vmap(pages, niov, VM_MAP, PAGE_KERNEL);
+ if (addr == NULL)
+ return NULL;
+
+ iov->iov_base = addr + kiov[0].kiov_offset;
+ iov->iov_len = nob;
+
+ return addr;
+}
+
+int
+ksocknal_lib_recv_kiov(ksock_conn_t *conn)
+{
+#if SOCKNAL_SINGLE_FRAG_RX || !SOCKNAL_RISK_KMAP_DEADLOCK
+ struct kvec scratch;
+ struct kvec *scratchiov = &scratch;
+ struct page **pages = NULL;
+ unsigned int niov = 1;
+#else
+#ifdef CONFIG_HIGHMEM
+#warning "XXX risk of kmap deadlock on multiple frags..."
+#endif
+ struct kvec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
+ struct page **pages = conn->ksnc_scheduler->kss_rx_scratch_pgs;
+ unsigned int niov = conn->ksnc_rx_nkiov;
+#endif
+ lnet_kiov_t *kiov = conn->ksnc_rx_kiov;
+ struct msghdr msg = {
+ .msg_flags = 0
+ };
+ int nob;
+ int i;
+ int rc;
+ void *base;
+ void *addr;
+ int sum;
+ int fragnob;
+ int n;
+
+ /* NB we can't trust socket ops to either consume our iovs
+ * or leave them alone. */
+ addr = ksocknal_lib_kiov_vmap(kiov, niov, scratchiov, pages);
+ if (addr != NULL) {
+ nob = scratchiov[0].iov_len;
+ n = 1;
+
+ } else {
+ for (nob = i = 0; i < niov; i++) {
+ nob += scratchiov[i].iov_len = kiov[i].kiov_len;
+ scratchiov[i].iov_base = kmap(kiov[i].kiov_page) +
+ kiov[i].kiov_offset;
+ }
+ n = niov;
+ }
+
+ LASSERT(nob <= conn->ksnc_rx_nob_wanted);
+
+ rc = kernel_recvmsg(conn->ksnc_sock, &msg,
+ (struct kvec *)scratchiov, n, nob, MSG_DONTWAIT);
+
+ if (conn->ksnc_msg.ksm_csum != 0) {
+ for (i = 0, sum = rc; sum > 0; i++, sum -= fragnob) {
+ LASSERT(i < niov);
+
+ /* Dang! have to kmap again because I have nowhere to stash the
+ * mapped address. But by doing it while the page is still
+ * mapped, the kernel just bumps the map count and returns me
+ * the address it stashed. */
+ base = kmap(kiov[i].kiov_page) + kiov[i].kiov_offset;
+ fragnob = kiov[i].kiov_len;
+ if (fragnob > sum)
+ fragnob = sum;
+
+ conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum,
+ base, fragnob);
+
+ kunmap(kiov[i].kiov_page);
+ }
+ }
+
+ if (addr != NULL) {
+ ksocknal_lib_kiov_vunmap(addr);
+ } else {
+ for (i = 0; i < niov; i++)
+ kunmap(kiov[i].kiov_page);
+ }
+
+ return rc;
+}
+
+void
+ksocknal_lib_csum_tx(ksock_tx_t *tx)
+{
+ int i;
+ __u32 csum;
+ void *base;
+
+ LASSERT(tx->tx_iov[0].iov_base == &tx->tx_msg);
+ LASSERT(tx->tx_conn != NULL);
+ LASSERT(tx->tx_conn->ksnc_proto == &ksocknal_protocol_v2x);
+
+ tx->tx_msg.ksm_csum = 0;
+
+ csum = ksocknal_csum(~0, tx->tx_iov[0].iov_base,
+ tx->tx_iov[0].iov_len);
+
+ if (tx->tx_kiov != NULL) {
+ for (i = 0; i < tx->tx_nkiov; i++) {
+ base = kmap(tx->tx_kiov[i].kiov_page) +
+ tx->tx_kiov[i].kiov_offset;
+
+ csum = ksocknal_csum(csum, base, tx->tx_kiov[i].kiov_len);
+
+ kunmap(tx->tx_kiov[i].kiov_page);
+ }
+ } else {
+ for (i = 1; i < tx->tx_niov; i++)
+ csum = ksocknal_csum(csum, tx->tx_iov[i].iov_base,
+ tx->tx_iov[i].iov_len);
+ }
+
+ if (*ksocknal_tunables.ksnd_inject_csum_error) {
+ csum++;
+ *ksocknal_tunables.ksnd_inject_csum_error = 0;
+ }
+
+ tx->tx_msg.ksm_csum = csum;
+}
+
+int
+ksocknal_lib_get_conn_tunables(ksock_conn_t *conn, int *txmem, int *rxmem, int *nagle)
+{
+ struct socket *sock = conn->ksnc_sock;
+ int len;
+ int rc;
+
+ rc = ksocknal_connsock_addref(conn);
+ if (rc != 0) {
+ LASSERT(conn->ksnc_closing);
+ *txmem = *rxmem = *nagle = 0;
+ return -ESHUTDOWN;
+ }
+
+ rc = libcfs_sock_getbuf(sock, txmem, rxmem);
+ if (rc == 0) {
+ len = sizeof(*nagle);
+ rc = kernel_getsockopt(sock, SOL_TCP, TCP_NODELAY,
+ (char *)nagle, &len);
+ }
+
+ ksocknal_connsock_decref(conn);
+
+ if (rc == 0)
+ *nagle = !*nagle;
+ else
+ *txmem = *rxmem = *nagle = 0;
+
+ return rc;
+}
+
+int
+ksocknal_lib_setup_sock(struct socket *sock)
+{
+ int rc;
+ int option;
+ int keep_idle;
+ int keep_intvl;
+ int keep_count;
+ int do_keepalive;
+ struct linger linger;
+
+ sock->sk->sk_allocation = GFP_NOFS;
+
+ /* Ensure this socket aborts active sends immediately when we close
+ * it. */
+
+ linger.l_onoff = 0;
+ linger.l_linger = 0;
+
+ rc = kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER,
+ (char *)&linger, sizeof(linger));
+ if (rc != 0) {
+ CERROR("Can't set SO_LINGER: %d\n", rc);
+ return rc;
+ }
+
+ option = -1;
+ rc = kernel_setsockopt(sock, SOL_TCP, TCP_LINGER2,
+ (char *)&option, sizeof(option));
+ if (rc != 0) {
+ CERROR("Can't set SO_LINGER2: %d\n", rc);
+ return rc;
+ }
+
+ if (!*ksocknal_tunables.ksnd_nagle) {
+ option = 1;
+
+ rc = kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY,
+ (char *)&option, sizeof(option));
+ if (rc != 0) {
+ CERROR("Can't disable nagle: %d\n", rc);
+ return rc;
+ }
+ }
+
+ rc = libcfs_sock_setbuf(sock,
+ *ksocknal_tunables.ksnd_tx_buffer_size,
+ *ksocknal_tunables.ksnd_rx_buffer_size);
+ if (rc != 0) {
+ CERROR("Can't set buffer tx %d, rx %d buffers: %d\n",
+ *ksocknal_tunables.ksnd_tx_buffer_size,
+ *ksocknal_tunables.ksnd_rx_buffer_size, rc);
+ return rc;
+ }
+
+/* TCP_BACKOFF_* sockopt tunables unsupported in stock kernels */
+
+ /* snapshot tunables */
+ keep_idle = *ksocknal_tunables.ksnd_keepalive_idle;
+ keep_count = *ksocknal_tunables.ksnd_keepalive_count;
+ keep_intvl = *ksocknal_tunables.ksnd_keepalive_intvl;
+
+ do_keepalive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0);
+
+ option = (do_keepalive ? 1 : 0);
+ rc = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
+ (char *)&option, sizeof(option));
+ if (rc != 0) {
+ CERROR("Can't set SO_KEEPALIVE: %d\n", rc);
+ return rc;
+ }
+
+ if (!do_keepalive)
+ return 0;
+
+ rc = kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE,
+ (char *)&keep_idle, sizeof(keep_idle));
+ if (rc != 0) {
+ CERROR("Can't set TCP_KEEPIDLE: %d\n", rc);
+ return rc;
+ }
+
+ rc = kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL,
+ (char *)&keep_intvl, sizeof(keep_intvl));
+ if (rc != 0) {
+ CERROR("Can't set TCP_KEEPINTVL: %d\n", rc);
+ return rc;
+ }
+
+ rc = kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT,
+ (char *)&keep_count, sizeof(keep_count));
+ if (rc != 0) {
+ CERROR("Can't set TCP_KEEPCNT: %d\n", rc);
+ return rc;
+ }
+
+ return 0;
+}
+
+void
+ksocknal_lib_push_conn(ksock_conn_t *conn)
+{
+ struct sock *sk;
+ struct tcp_sock *tp;
+ int nonagle;
+ int val = 1;
+ int rc;
+
+ rc = ksocknal_connsock_addref(conn);
+ if (rc != 0) /* being shut down */
+ return;
+
+ sk = conn->ksnc_sock->sk;
+ tp = tcp_sk(sk);
+
+ lock_sock(sk);
+ nonagle = tp->nonagle;
+ tp->nonagle = 1;
+ release_sock(sk);
+
+ rc = kernel_setsockopt(conn->ksnc_sock, SOL_TCP, TCP_NODELAY,
+ (char *)&val, sizeof(val));
+ LASSERT(rc == 0);
+
+ lock_sock(sk);
+ tp->nonagle = nonagle;
+ release_sock(sk);
+
+ ksocknal_connsock_decref(conn);
+}
+
+extern void ksocknal_read_callback(ksock_conn_t *conn);
+extern void ksocknal_write_callback(ksock_conn_t *conn);
+/*
+ * socket call back in Linux
+ */
+static void
+ksocknal_data_ready(struct sock *sk)
+{
+ ksock_conn_t *conn;
+
+ /* interleave correctly with closing sockets... */
+ LASSERT(!in_irq());
+ read_lock(&ksocknal_data.ksnd_global_lock);
+
+ conn = sk->sk_user_data;
+ if (conn == NULL) { /* raced with ksocknal_terminate_conn */
+ LASSERT(sk->sk_data_ready != &ksocknal_data_ready);
+ sk->sk_data_ready(sk);
+ } else
+ ksocknal_read_callback(conn);
+
+ read_unlock(&ksocknal_data.ksnd_global_lock);
+}
+
+static void
+ksocknal_write_space(struct sock *sk)
+{
+ ksock_conn_t *conn;
+ int wspace;
+ int min_wpace;
+
+ /* interleave correctly with closing sockets... */
+ LASSERT(!in_irq());
+ read_lock(&ksocknal_data.ksnd_global_lock);
+
+ conn = sk->sk_user_data;
+ wspace = SOCKNAL_WSPACE(sk);
+ min_wpace = SOCKNAL_MIN_WSPACE(sk);
+
+ CDEBUG(D_NET, "sk %p wspace %d low water %d conn %p%s%s%s\n",
+ sk, wspace, min_wpace, conn,
+ (conn == NULL) ? "" : (conn->ksnc_tx_ready ?
+ " ready" : " blocked"),
+ (conn == NULL) ? "" : (conn->ksnc_tx_scheduled ?
+ " scheduled" : " idle"),
+ (conn == NULL) ? "" : (list_empty(&conn->ksnc_tx_queue) ?
+ " empty" : " queued"));
+
+ if (conn == NULL) { /* raced with ksocknal_terminate_conn */
+ LASSERT(sk->sk_write_space != &ksocknal_write_space);
+ sk->sk_write_space(sk);
+
+ read_unlock(&ksocknal_data.ksnd_global_lock);
+ return;
+ }
+
+ if (wspace >= min_wpace) { /* got enough space */
+ ksocknal_write_callback(conn);
+
+ /* Clear SOCK_NOSPACE _after_ ksocknal_write_callback so the
+ * ENOMEM check in ksocknal_transmit is race-free (think about
+ * it). */
+
+ clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ }
+
+ read_unlock(&ksocknal_data.ksnd_global_lock);
+}
+
+void
+ksocknal_lib_save_callback(struct socket *sock, ksock_conn_t *conn)
+{
+ conn->ksnc_saved_data_ready = sock->sk->sk_data_ready;
+ conn->ksnc_saved_write_space = sock->sk->sk_write_space;
+}
+
+void
+ksocknal_lib_set_callback(struct socket *sock, ksock_conn_t *conn)
+{
+ sock->sk->sk_user_data = conn;
+ sock->sk->sk_data_ready = ksocknal_data_ready;
+ sock->sk->sk_write_space = ksocknal_write_space;
+ return;
+}
+
+void
+ksocknal_lib_reset_callback(struct socket *sock, ksock_conn_t *conn)
+{
+ /* Remove conn's network callbacks.
+ * NB I _have_ to restore the callback, rather than storing a noop,
+ * since the socket could survive past this module being unloaded!! */
+ sock->sk->sk_data_ready = conn->ksnc_saved_data_ready;
+ sock->sk->sk_write_space = conn->ksnc_saved_write_space;
+
+ /* A callback could be in progress already; they hold a read lock
+ * on ksnd_global_lock (to serialise with me) and NOOP if
+ * sk_user_data is NULL. */
+ sock->sk->sk_user_data = NULL;
+
+ return ;
+}
+
+int
+ksocknal_lib_memory_pressure(ksock_conn_t *conn)
+{
+ int rc = 0;
+ ksock_sched_t *sched;
+
+ sched = conn->ksnc_scheduler;
+ spin_lock_bh(&sched->kss_lock);
+
+ if (!test_bit(SOCK_NOSPACE, &conn->ksnc_sock->flags) &&
+ !conn->ksnc_tx_ready) {
+ /* SOCK_NOSPACE is set when the socket fills
+ * and cleared in the write_space callback
+ * (which also sets ksnc_tx_ready). If
+ * SOCK_NOSPACE and ksnc_tx_ready are BOTH
+ * zero, I didn't fill the socket and
+ * write_space won't reschedule me, so I
+ * return -ENOMEM to get my caller to retry
+ * after a timeout */
+ rc = -ENOMEM;
+ }
+
+ spin_unlock_bh(&sched->kss_lock);
+
+ return rc;
+}
diff --git a/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.h b/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.h
new file mode 100644
index 000000000..f5563881b
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.h
@@ -0,0 +1,86 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_PORTAL_ALLOC
+
+#ifndef __LINUX_SOCKNAL_LIB_H__
+#define __LINUX_SOCKNAL_LIB_H__
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <linux/uio.h>
+#include <linux/if.h>
+#include <linux/uaccess.h>
+
+#include <asm/irq.h>
+
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/list.h>
+#include <linux/kmod.h>
+#include <linux/sysctl.h>
+#include <asm/div64.h>
+#include <linux/syscalls.h>
+
+#include "../../../include/linux/libcfs/libcfs.h"
+
+#include <linux/crc32.h>
+static inline __u32 ksocknal_csum(__u32 crc, unsigned char const *p, size_t len)
+{
+#if 1
+ return crc32_le(crc, p, len);
+#else
+ while (len-- > 0)
+ crc = ((crc + 0x100) & ~0xff) | ((crc + *p++) & 0xff) ;
+ return crc;
+#endif
+}
+
+#define SOCKNAL_WSPACE(sk) sk_stream_wspace(sk)
+#define SOCKNAL_MIN_WSPACE(sk) sk_stream_min_wspace(sk)
+
+/* assume one thread for each connection type */
+#define SOCKNAL_NSCHEDS 3
+#define SOCKNAL_NSCHEDS_HIGH (SOCKNAL_NSCHEDS << 1)
+
+#endif
diff --git a/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c b/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c
new file mode 100644
index 000000000..86b88db1c
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "socklnd.h"
+
+static int sock_timeout = 50;
+module_param(sock_timeout, int, 0644);
+MODULE_PARM_DESC(sock_timeout, "dead socket timeout (seconds)");
+
+static int credits = 256;
+module_param(credits, int, 0444);
+MODULE_PARM_DESC(credits, "# concurrent sends");
+
+static int peer_credits = 8;
+module_param(peer_credits, int, 0444);
+MODULE_PARM_DESC(peer_credits, "# concurrent sends to 1 peer");
+
+static int peer_buffer_credits;
+module_param(peer_buffer_credits, int, 0444);
+MODULE_PARM_DESC(peer_buffer_credits, "# per-peer router buffer credits");
+
+static int peer_timeout = 180;
+module_param(peer_timeout, int, 0444);
+MODULE_PARM_DESC(peer_timeout, "Seconds without aliveness news to declare peer dead (<=0 to disable)");
+
+/* Number of daemons in each thread pool which is percpt,
+ * we will estimate reasonable value based on CPUs if it's not set. */
+static unsigned int nscheds;
+module_param(nscheds, int, 0444);
+MODULE_PARM_DESC(nscheds, "# scheduler daemons in each pool while starting");
+
+static int nconnds = 4;
+module_param(nconnds, int, 0444);
+MODULE_PARM_DESC(nconnds, "# connection daemons while starting");
+
+static int nconnds_max = 64;
+module_param(nconnds_max, int, 0444);
+MODULE_PARM_DESC(nconnds_max, "max # connection daemons");
+
+static int min_reconnectms = 1000;
+module_param(min_reconnectms, int, 0644);
+MODULE_PARM_DESC(min_reconnectms, "min connection retry interval (mS)");
+
+static int max_reconnectms = 60000;
+module_param(max_reconnectms, int, 0644);
+MODULE_PARM_DESC(max_reconnectms, "max connection retry interval (mS)");
+
+# define DEFAULT_EAGER_ACK 0
+static int eager_ack = DEFAULT_EAGER_ACK;
+module_param(eager_ack, int, 0644);
+MODULE_PARM_DESC(eager_ack, "send tcp ack packets eagerly");
+
+static int typed_conns = 1;
+module_param(typed_conns, int, 0444);
+MODULE_PARM_DESC(typed_conns, "use different sockets for bulk");
+
+static int min_bulk = 1<<10;
+module_param(min_bulk, int, 0644);
+MODULE_PARM_DESC(min_bulk, "smallest 'large' message");
+
+# define DEFAULT_BUFFER_SIZE 0
+static int tx_buffer_size = DEFAULT_BUFFER_SIZE;
+module_param(tx_buffer_size, int, 0644);
+MODULE_PARM_DESC(tx_buffer_size, "socket tx buffer size (0 for system default)");
+
+static int rx_buffer_size = DEFAULT_BUFFER_SIZE;
+module_param(rx_buffer_size, int, 0644);
+MODULE_PARM_DESC(rx_buffer_size, "socket rx buffer size (0 for system default)");
+
+static int nagle;
+module_param(nagle, int, 0644);
+MODULE_PARM_DESC(nagle, "enable NAGLE?");
+
+static int round_robin = 1;
+module_param(round_robin, int, 0644);
+MODULE_PARM_DESC(round_robin, "Round robin for multiple interfaces");
+
+static int keepalive = 30;
+module_param(keepalive, int, 0644);
+MODULE_PARM_DESC(keepalive, "# seconds before send keepalive");
+
+static int keepalive_idle = 30;
+module_param(keepalive_idle, int, 0644);
+MODULE_PARM_DESC(keepalive_idle, "# idle seconds before probe");
+
+#define DEFAULT_KEEPALIVE_COUNT 5
+static int keepalive_count = DEFAULT_KEEPALIVE_COUNT;
+module_param(keepalive_count, int, 0644);
+MODULE_PARM_DESC(keepalive_count, "# missed probes == dead");
+
+static int keepalive_intvl = 5;
+module_param(keepalive_intvl, int, 0644);
+MODULE_PARM_DESC(keepalive_intvl, "seconds between probes");
+
+static int enable_csum;
+module_param(enable_csum, int, 0644);
+MODULE_PARM_DESC(enable_csum, "enable check sum");
+
+static int inject_csum_error;
+module_param(inject_csum_error, int, 0644);
+MODULE_PARM_DESC(inject_csum_error, "set non-zero to inject a checksum error");
+
+static int nonblk_zcack = 1;
+module_param(nonblk_zcack, int, 0644);
+MODULE_PARM_DESC(nonblk_zcack, "always send ZC-ACK on non-blocking connection");
+
+static unsigned int zc_min_payload = 16 << 10;
+module_param(zc_min_payload, int, 0644);
+MODULE_PARM_DESC(zc_min_payload, "minimum payload size to zero copy");
+
+static unsigned int zc_recv;
+module_param(zc_recv, int, 0644);
+MODULE_PARM_DESC(zc_recv, "enable ZC recv for Chelsio driver");
+
+static unsigned int zc_recv_min_nfrags = 16;
+module_param(zc_recv_min_nfrags, int, 0644);
+MODULE_PARM_DESC(zc_recv_min_nfrags, "minimum # of fragments to enable ZC recv");
+
+
+#if SOCKNAL_VERSION_DEBUG
+static int protocol = 3;
+module_param(protocol, int, 0644);
+MODULE_PARM_DESC(protocol, "protocol version");
+#endif
+
+ksock_tunables_t ksocknal_tunables;
+
+int ksocknal_tunables_init(void)
+{
+
+ /* initialize ksocknal_tunables structure */
+ ksocknal_tunables.ksnd_timeout = &sock_timeout;
+ ksocknal_tunables.ksnd_nscheds = &nscheds;
+ ksocknal_tunables.ksnd_nconnds = &nconnds;
+ ksocknal_tunables.ksnd_nconnds_max = &nconnds_max;
+ ksocknal_tunables.ksnd_min_reconnectms = &min_reconnectms;
+ ksocknal_tunables.ksnd_max_reconnectms = &max_reconnectms;
+ ksocknal_tunables.ksnd_eager_ack = &eager_ack;
+ ksocknal_tunables.ksnd_typed_conns = &typed_conns;
+ ksocknal_tunables.ksnd_min_bulk = &min_bulk;
+ ksocknal_tunables.ksnd_tx_buffer_size = &tx_buffer_size;
+ ksocknal_tunables.ksnd_rx_buffer_size = &rx_buffer_size;
+ ksocknal_tunables.ksnd_nagle = &nagle;
+ ksocknal_tunables.ksnd_round_robin = &round_robin;
+ ksocknal_tunables.ksnd_keepalive = &keepalive;
+ ksocknal_tunables.ksnd_keepalive_idle = &keepalive_idle;
+ ksocknal_tunables.ksnd_keepalive_count = &keepalive_count;
+ ksocknal_tunables.ksnd_keepalive_intvl = &keepalive_intvl;
+ ksocknal_tunables.ksnd_credits = &credits;
+ ksocknal_tunables.ksnd_peertxcredits = &peer_credits;
+ ksocknal_tunables.ksnd_peerrtrcredits = &peer_buffer_credits;
+ ksocknal_tunables.ksnd_peertimeout = &peer_timeout;
+ ksocknal_tunables.ksnd_enable_csum = &enable_csum;
+ ksocknal_tunables.ksnd_inject_csum_error = &inject_csum_error;
+ ksocknal_tunables.ksnd_nonblk_zcack = &nonblk_zcack;
+ ksocknal_tunables.ksnd_zc_min_payload = &zc_min_payload;
+ ksocknal_tunables.ksnd_zc_recv = &zc_recv;
+ ksocknal_tunables.ksnd_zc_recv_min_nfrags = &zc_recv_min_nfrags;
+
+
+
+#if SOCKNAL_VERSION_DEBUG
+ ksocknal_tunables.ksnd_protocol = &protocol;
+#endif
+
+ if (*ksocknal_tunables.ksnd_zc_min_payload < (2 << 10))
+ *ksocknal_tunables.ksnd_zc_min_payload = 2 << 10;
+
+ return 0;
+};
diff --git a/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c b/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c
new file mode 100644
index 000000000..8596581f5
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c
@@ -0,0 +1,797 @@
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ *
+ * Author: Zach Brown <zab@zabbo.net>
+ * Author: Peter J. Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ * This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "socklnd.h"
+
+/*
+ * Protocol entries :
+ * pro_send_hello : send hello message
+ * pro_recv_hello : receive hello message
+ * pro_pack : pack message header
+ * pro_unpack : unpack message header
+ * pro_queue_tx_zcack() : Called holding BH lock: kss_lock
+ * return 1 if ACK is piggybacked, otherwise return 0
+ * pro_queue_tx_msg() : Called holding BH lock: kss_lock
+ * return the ACK that piggybacked by my message, or NULL
+ * pro_handle_zcreq() : handler of incoming ZC-REQ
+ * pro_handle_zcack() : handler of incoming ZC-ACK
+ * pro_match_tx() : Called holding glock
+ */
+
+static ksock_tx_t *
+ksocknal_queue_tx_msg_v1(ksock_conn_t *conn, ksock_tx_t *tx_msg)
+{
+ /* V1.x, just enqueue it */
+ list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue);
+ return NULL;
+}
+
+void
+ksocknal_next_tx_carrier(ksock_conn_t *conn)
+{
+ ksock_tx_t *tx = conn->ksnc_tx_carrier;
+
+ /* Called holding BH lock: conn->ksnc_scheduler->kss_lock */
+ LASSERT(!list_empty(&conn->ksnc_tx_queue));
+ LASSERT(tx != NULL);
+
+ /* Next TX that can carry ZC-ACK or LNet message */
+ if (tx->tx_list.next == &conn->ksnc_tx_queue) {
+ /* no more packets queued */
+ conn->ksnc_tx_carrier = NULL;
+ } else {
+ conn->ksnc_tx_carrier = list_entry(tx->tx_list.next,
+ ksock_tx_t, tx_list);
+ LASSERT(conn->ksnc_tx_carrier->tx_msg.ksm_type == tx->tx_msg.ksm_type);
+ }
+}
+
+static int
+ksocknal_queue_tx_zcack_v2(ksock_conn_t *conn,
+ ksock_tx_t *tx_ack, __u64 cookie)
+{
+ ksock_tx_t *tx = conn->ksnc_tx_carrier;
+
+ LASSERT(tx_ack == NULL ||
+ tx_ack->tx_msg.ksm_type == KSOCK_MSG_NOOP);
+
+ /*
+ * Enqueue or piggyback tx_ack / cookie
+ * . no tx can piggyback cookie of tx_ack (or cookie), just
+ * enqueue the tx_ack (if tx_ack != NUL) and return NULL.
+ * . There is tx can piggyback cookie of tx_ack (or cookie),
+ * piggyback the cookie and return the tx.
+ */
+ if (tx == NULL) {
+ if (tx_ack != NULL) {
+ list_add_tail(&tx_ack->tx_list,
+ &conn->ksnc_tx_queue);
+ conn->ksnc_tx_carrier = tx_ack;
+ }
+ return 0;
+ }
+
+ if (tx->tx_msg.ksm_type == KSOCK_MSG_NOOP) {
+ /* tx is noop zc-ack, can't piggyback zc-ack cookie */
+ if (tx_ack != NULL)
+ list_add_tail(&tx_ack->tx_list,
+ &conn->ksnc_tx_queue);
+ return 0;
+ }
+
+ LASSERT(tx->tx_msg.ksm_type == KSOCK_MSG_LNET);
+ LASSERT(tx->tx_msg.ksm_zc_cookies[1] == 0);
+
+ if (tx_ack != NULL)
+ cookie = tx_ack->tx_msg.ksm_zc_cookies[1];
+
+ /* piggyback the zc-ack cookie */
+ tx->tx_msg.ksm_zc_cookies[1] = cookie;
+ /* move on to the next TX which can carry cookie */
+ ksocknal_next_tx_carrier(conn);
+
+ return 1;
+}
+
+static ksock_tx_t *
+ksocknal_queue_tx_msg_v2(ksock_conn_t *conn, ksock_tx_t *tx_msg)
+{
+ ksock_tx_t *tx = conn->ksnc_tx_carrier;
+
+ /*
+ * Enqueue tx_msg:
+ * . If there is no NOOP on the connection, just enqueue
+ * tx_msg and return NULL
+ * . If there is NOOP on the connection, piggyback the cookie
+ * and replace the NOOP tx, and return the NOOP tx.
+ */
+ if (tx == NULL) { /* nothing on queue */
+ list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue);
+ conn->ksnc_tx_carrier = tx_msg;
+ return NULL;
+ }
+
+ if (tx->tx_msg.ksm_type == KSOCK_MSG_LNET) { /* nothing to carry */
+ list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue);
+ return NULL;
+ }
+
+ LASSERT(tx->tx_msg.ksm_type == KSOCK_MSG_NOOP);
+
+ /* There is a noop zc-ack can be piggybacked */
+ tx_msg->tx_msg.ksm_zc_cookies[1] = tx->tx_msg.ksm_zc_cookies[1];
+ ksocknal_next_tx_carrier(conn);
+
+ /* use new_tx to replace the noop zc-ack packet */
+ list_add(&tx_msg->tx_list, &tx->tx_list);
+ list_del(&tx->tx_list);
+
+ return tx;
+}
+
+static int
+ksocknal_queue_tx_zcack_v3(ksock_conn_t *conn,
+ ksock_tx_t *tx_ack, __u64 cookie)
+{
+ ksock_tx_t *tx;
+
+ if (conn->ksnc_type != SOCKLND_CONN_ACK)
+ return ksocknal_queue_tx_zcack_v2(conn, tx_ack, cookie);
+
+ /* non-blocking ZC-ACK (to router) */
+ LASSERT(tx_ack == NULL ||
+ tx_ack->tx_msg.ksm_type == KSOCK_MSG_NOOP);
+
+ tx = conn->ksnc_tx_carrier;
+ if (tx == NULL) {
+ if (tx_ack != NULL) {
+ list_add_tail(&tx_ack->tx_list,
+ &conn->ksnc_tx_queue);
+ conn->ksnc_tx_carrier = tx_ack;
+ }
+ return 0;
+ }
+
+ /* conn->ksnc_tx_carrier != NULL */
+
+ if (tx_ack != NULL)
+ cookie = tx_ack->tx_msg.ksm_zc_cookies[1];
+
+ if (cookie == SOCKNAL_KEEPALIVE_PING) /* ignore keepalive PING */
+ return 1;
+
+ if (tx->tx_msg.ksm_zc_cookies[1] == SOCKNAL_KEEPALIVE_PING) {
+ /* replace the keepalive PING with a real ACK */
+ LASSERT(tx->tx_msg.ksm_zc_cookies[0] == 0);
+ tx->tx_msg.ksm_zc_cookies[1] = cookie;
+ return 1;
+ }
+
+ if (cookie == tx->tx_msg.ksm_zc_cookies[0] ||
+ cookie == tx->tx_msg.ksm_zc_cookies[1]) {
+ CWARN("%s: duplicated ZC cookie: %llu\n",
+ libcfs_id2str(conn->ksnc_peer->ksnp_id), cookie);
+ return 1; /* XXX return error in the future */
+ }
+
+ if (tx->tx_msg.ksm_zc_cookies[0] == 0) {
+ /* NOOP tx has only one ZC-ACK cookie, can carry at least one more */
+ if (tx->tx_msg.ksm_zc_cookies[1] > cookie) {
+ tx->tx_msg.ksm_zc_cookies[0] = tx->tx_msg.ksm_zc_cookies[1];
+ tx->tx_msg.ksm_zc_cookies[1] = cookie;
+ } else {
+ tx->tx_msg.ksm_zc_cookies[0] = cookie;
+ }
+
+ if (tx->tx_msg.ksm_zc_cookies[0] - tx->tx_msg.ksm_zc_cookies[1] > 2) {
+ /* not likely to carry more ACKs, skip it to simplify logic */
+ ksocknal_next_tx_carrier(conn);
+ }
+
+ return 1;
+ }
+
+ /* takes two or more cookies already */
+
+ if (tx->tx_msg.ksm_zc_cookies[0] > tx->tx_msg.ksm_zc_cookies[1]) {
+ __u64 tmp = 0;
+
+ /* two separated cookies: (a+2, a) or (a+1, a) */
+ LASSERT(tx->tx_msg.ksm_zc_cookies[0] -
+ tx->tx_msg.ksm_zc_cookies[1] <= 2);
+
+ if (tx->tx_msg.ksm_zc_cookies[0] -
+ tx->tx_msg.ksm_zc_cookies[1] == 2) {
+ if (cookie == tx->tx_msg.ksm_zc_cookies[1] + 1)
+ tmp = cookie;
+ } else if (cookie == tx->tx_msg.ksm_zc_cookies[1] - 1) {
+ tmp = tx->tx_msg.ksm_zc_cookies[1];
+ } else if (cookie == tx->tx_msg.ksm_zc_cookies[0] + 1) {
+ tmp = tx->tx_msg.ksm_zc_cookies[0];
+ }
+
+ if (tmp != 0) {
+ /* range of cookies */
+ tx->tx_msg.ksm_zc_cookies[0] = tmp - 1;
+ tx->tx_msg.ksm_zc_cookies[1] = tmp + 1;
+ return 1;
+ }
+
+ } else {
+ /* ksm_zc_cookies[0] < ksm_zc_cookies[1], it is range of cookies */
+ if (cookie >= tx->tx_msg.ksm_zc_cookies[0] &&
+ cookie <= tx->tx_msg.ksm_zc_cookies[1]) {
+ CWARN("%s: duplicated ZC cookie: %llu\n",
+ libcfs_id2str(conn->ksnc_peer->ksnp_id), cookie);
+ return 1; /* XXX: return error in the future */
+ }
+
+ if (cookie == tx->tx_msg.ksm_zc_cookies[1] + 1) {
+ tx->tx_msg.ksm_zc_cookies[1] = cookie;
+ return 1;
+ }
+
+ if (cookie == tx->tx_msg.ksm_zc_cookies[0] - 1) {
+ tx->tx_msg.ksm_zc_cookies[0] = cookie;
+ return 1;
+ }
+ }
+
+ /* failed to piggyback ZC-ACK */
+ if (tx_ack != NULL) {
+ list_add_tail(&tx_ack->tx_list, &conn->ksnc_tx_queue);
+ /* the next tx can piggyback at least 1 ACK */
+ ksocknal_next_tx_carrier(conn);
+ }
+
+ return 0;
+}
+
+static int
+ksocknal_match_tx(ksock_conn_t *conn, ksock_tx_t *tx, int nonblk)
+{
+ int nob;
+
+#if SOCKNAL_VERSION_DEBUG
+ if (!*ksocknal_tunables.ksnd_typed_conns)
+ return SOCKNAL_MATCH_YES;
+#endif
+
+ if (tx == NULL || tx->tx_lnetmsg == NULL) {
+ /* noop packet */
+ nob = offsetof(ksock_msg_t, ksm_u);
+ } else {
+ nob = tx->tx_lnetmsg->msg_len +
+ ((conn->ksnc_proto == &ksocknal_protocol_v1x) ?
+ sizeof(lnet_hdr_t) : sizeof(ksock_msg_t));
+ }
+
+ /* default checking for typed connection */
+ switch (conn->ksnc_type) {
+ default:
+ CERROR("ksnc_type bad: %u\n", conn->ksnc_type);
+ LBUG();
+ case SOCKLND_CONN_ANY:
+ return SOCKNAL_MATCH_YES;
+
+ case SOCKLND_CONN_BULK_IN:
+ return SOCKNAL_MATCH_MAY;
+
+ case SOCKLND_CONN_BULK_OUT:
+ if (nob < *ksocknal_tunables.ksnd_min_bulk)
+ return SOCKNAL_MATCH_MAY;
+ else
+ return SOCKNAL_MATCH_YES;
+
+ case SOCKLND_CONN_CONTROL:
+ if (nob >= *ksocknal_tunables.ksnd_min_bulk)
+ return SOCKNAL_MATCH_MAY;
+ else
+ return SOCKNAL_MATCH_YES;
+ }
+}
+
+static int
+ksocknal_match_tx_v3(ksock_conn_t *conn, ksock_tx_t *tx, int nonblk)
+{
+ int nob;
+
+ if (tx == NULL || tx->tx_lnetmsg == NULL)
+ nob = offsetof(ksock_msg_t, ksm_u);
+ else
+ nob = tx->tx_lnetmsg->msg_len + sizeof(ksock_msg_t);
+
+ switch (conn->ksnc_type) {
+ default:
+ CERROR("ksnc_type bad: %u\n", conn->ksnc_type);
+ LBUG();
+ case SOCKLND_CONN_ANY:
+ return SOCKNAL_MATCH_NO;
+
+ case SOCKLND_CONN_ACK:
+ if (nonblk)
+ return SOCKNAL_MATCH_YES;
+ else if (tx == NULL || tx->tx_lnetmsg == NULL)
+ return SOCKNAL_MATCH_MAY;
+ else
+ return SOCKNAL_MATCH_NO;
+
+ case SOCKLND_CONN_BULK_OUT:
+ if (nonblk)
+ return SOCKNAL_MATCH_NO;
+ else if (nob < *ksocknal_tunables.ksnd_min_bulk)
+ return SOCKNAL_MATCH_MAY;
+ else
+ return SOCKNAL_MATCH_YES;
+
+ case SOCKLND_CONN_CONTROL:
+ if (nonblk)
+ return SOCKNAL_MATCH_NO;
+ else if (nob >= *ksocknal_tunables.ksnd_min_bulk)
+ return SOCKNAL_MATCH_MAY;
+ else
+ return SOCKNAL_MATCH_YES;
+ }
+}
+
+/* (Sink) handle incoming ZC request from sender */
+static int
+ksocknal_handle_zcreq(ksock_conn_t *c, __u64 cookie, int remote)
+{
+ ksock_peer_t *peer = c->ksnc_peer;
+ ksock_conn_t *conn;
+ ksock_tx_t *tx;
+ int rc;
+
+ read_lock(&ksocknal_data.ksnd_global_lock);
+
+ conn = ksocknal_find_conn_locked(peer, NULL, !!remote);
+ if (conn != NULL) {
+ ksock_sched_t *sched = conn->ksnc_scheduler;
+
+ LASSERT(conn->ksnc_proto->pro_queue_tx_zcack != NULL);
+
+ spin_lock_bh(&sched->kss_lock);
+
+ rc = conn->ksnc_proto->pro_queue_tx_zcack(conn, NULL, cookie);
+
+ spin_unlock_bh(&sched->kss_lock);
+
+ if (rc) { /* piggybacked */
+ read_unlock(&ksocknal_data.ksnd_global_lock);
+ return 0;
+ }
+ }
+
+ read_unlock(&ksocknal_data.ksnd_global_lock);
+
+ /* ACK connection is not ready, or can't piggyback the ACK */
+ tx = ksocknal_alloc_tx_noop(cookie, !!remote);
+ if (tx == NULL)
+ return -ENOMEM;
+
+ rc = ksocknal_launch_packet(peer->ksnp_ni, tx, peer->ksnp_id);
+ if (rc == 0)
+ return 0;
+
+ ksocknal_free_tx(tx);
+ return rc;
+}
+
+/* (Sender) handle ZC_ACK from sink */
+static int
+ksocknal_handle_zcack(ksock_conn_t *conn, __u64 cookie1, __u64 cookie2)
+{
+ ksock_peer_t *peer = conn->ksnc_peer;
+ ksock_tx_t *tx;
+ ksock_tx_t *tmp;
+ LIST_HEAD(zlist);
+ int count;
+
+ if (cookie1 == 0)
+ cookie1 = cookie2;
+
+ count = (cookie1 > cookie2) ? 2 : (cookie2 - cookie1 + 1);
+
+ if (cookie2 == SOCKNAL_KEEPALIVE_PING &&
+ conn->ksnc_proto == &ksocknal_protocol_v3x) {
+ /* keepalive PING for V3.x, just ignore it */
+ return count == 1 ? 0 : -EPROTO;
+ }
+
+ spin_lock(&peer->ksnp_lock);
+
+ list_for_each_entry_safe(tx, tmp,
+ &peer->ksnp_zc_req_list, tx_zc_list) {
+ __u64 c = tx->tx_msg.ksm_zc_cookies[0];
+
+ if (c == cookie1 || c == cookie2 || (cookie1 < c && c < cookie2)) {
+ tx->tx_msg.ksm_zc_cookies[0] = 0;
+ list_del(&tx->tx_zc_list);
+ list_add(&tx->tx_zc_list, &zlist);
+
+ if (--count == 0)
+ break;
+ }
+ }
+
+ spin_unlock(&peer->ksnp_lock);
+
+ while (!list_empty(&zlist)) {
+ tx = list_entry(zlist.next, ksock_tx_t, tx_zc_list);
+ list_del(&tx->tx_zc_list);
+ ksocknal_tx_decref(tx);
+ }
+
+ return count == 0 ? 0 : -EPROTO;
+}
+
+static int
+ksocknal_send_hello_v1(ksock_conn_t *conn, ksock_hello_msg_t *hello)
+{
+ struct socket *sock = conn->ksnc_sock;
+ lnet_hdr_t *hdr;
+ lnet_magicversion_t *hmv;
+ int rc;
+ int i;
+
+ CLASSERT(sizeof(lnet_magicversion_t) == offsetof(lnet_hdr_t, src_nid));
+
+ LIBCFS_ALLOC(hdr, sizeof(*hdr));
+ if (hdr == NULL) {
+ CERROR("Can't allocate lnet_hdr_t\n");
+ return -ENOMEM;
+ }
+
+ hmv = (lnet_magicversion_t *)&hdr->dest_nid;
+
+ /* Re-organize V2.x message header to V1.x (lnet_hdr_t)
+ * header and send out */
+ hmv->magic = cpu_to_le32 (LNET_PROTO_TCP_MAGIC);
+ hmv->version_major = cpu_to_le16 (KSOCK_PROTO_V1_MAJOR);
+ hmv->version_minor = cpu_to_le16 (KSOCK_PROTO_V1_MINOR);
+
+ if (the_lnet.ln_testprotocompat != 0) {
+ /* single-shot proto check */
+ LNET_LOCK();
+ if ((the_lnet.ln_testprotocompat & 1) != 0) {
+ hmv->version_major++; /* just different! */
+ the_lnet.ln_testprotocompat &= ~1;
+ }
+ if ((the_lnet.ln_testprotocompat & 2) != 0) {
+ hmv->magic = LNET_PROTO_MAGIC;
+ the_lnet.ln_testprotocompat &= ~2;
+ }
+ LNET_UNLOCK();
+ }
+
+ hdr->src_nid = cpu_to_le64 (hello->kshm_src_nid);
+ hdr->src_pid = cpu_to_le32 (hello->kshm_src_pid);
+ hdr->type = cpu_to_le32 (LNET_MSG_HELLO);
+ hdr->payload_length = cpu_to_le32 (hello->kshm_nips * sizeof(__u32));
+ hdr->msg.hello.type = cpu_to_le32 (hello->kshm_ctype);
+ hdr->msg.hello.incarnation = cpu_to_le64 (hello->kshm_src_incarnation);
+
+ rc = libcfs_sock_write(sock, hdr, sizeof(*hdr),
+ lnet_acceptor_timeout());
+
+ if (rc != 0) {
+ CNETERR("Error %d sending HELLO hdr to %pI4h/%d\n",
+ rc, &conn->ksnc_ipaddr, conn->ksnc_port);
+ goto out;
+ }
+
+ if (hello->kshm_nips == 0)
+ goto out;
+
+ for (i = 0; i < (int) hello->kshm_nips; i++) {
+ hello->kshm_ips[i] = __cpu_to_le32 (hello->kshm_ips[i]);
+ }
+
+ rc = libcfs_sock_write(sock, hello->kshm_ips,
+ hello->kshm_nips * sizeof(__u32),
+ lnet_acceptor_timeout());
+ if (rc != 0) {
+ CNETERR("Error %d sending HELLO payload (%d) to %pI4h/%d\n",
+ rc, hello->kshm_nips,
+ &conn->ksnc_ipaddr, conn->ksnc_port);
+ }
+out:
+ LIBCFS_FREE(hdr, sizeof(*hdr));
+
+ return rc;
+}
+
+static int
+ksocknal_send_hello_v2(ksock_conn_t *conn, ksock_hello_msg_t *hello)
+{
+ struct socket *sock = conn->ksnc_sock;
+ int rc;
+
+ hello->kshm_magic = LNET_PROTO_MAGIC;
+ hello->kshm_version = conn->ksnc_proto->pro_version;
+
+ if (the_lnet.ln_testprotocompat != 0) {
+ /* single-shot proto check */
+ LNET_LOCK();
+ if ((the_lnet.ln_testprotocompat & 1) != 0) {
+ hello->kshm_version++; /* just different! */
+ the_lnet.ln_testprotocompat &= ~1;
+ }
+ LNET_UNLOCK();
+ }
+
+ rc = libcfs_sock_write(sock, hello, offsetof(ksock_hello_msg_t, kshm_ips),
+ lnet_acceptor_timeout());
+
+ if (rc != 0) {
+ CNETERR("Error %d sending HELLO hdr to %pI4h/%d\n",
+ rc, &conn->ksnc_ipaddr, conn->ksnc_port);
+ return rc;
+ }
+
+ if (hello->kshm_nips == 0)
+ return 0;
+
+ rc = libcfs_sock_write(sock, hello->kshm_ips,
+ hello->kshm_nips * sizeof(__u32),
+ lnet_acceptor_timeout());
+ if (rc != 0) {
+ CNETERR("Error %d sending HELLO payload (%d) to %pI4h/%d\n",
+ rc, hello->kshm_nips,
+ &conn->ksnc_ipaddr, conn->ksnc_port);
+ }
+
+ return rc;
+}
+
+static int
+ksocknal_recv_hello_v1(ksock_conn_t *conn, ksock_hello_msg_t *hello,
+ int timeout)
+{
+ struct socket *sock = conn->ksnc_sock;
+ lnet_hdr_t *hdr;
+ int rc;
+ int i;
+
+ LIBCFS_ALLOC(hdr, sizeof(*hdr));
+ if (hdr == NULL) {
+ CERROR("Can't allocate lnet_hdr_t\n");
+ return -ENOMEM;
+ }
+
+ rc = libcfs_sock_read(sock, &hdr->src_nid,
+ sizeof(*hdr) - offsetof(lnet_hdr_t, src_nid),
+ timeout);
+ if (rc != 0) {
+ CERROR("Error %d reading rest of HELLO hdr from %pI4h\n",
+ rc, &conn->ksnc_ipaddr);
+ LASSERT(rc < 0 && rc != -EALREADY);
+ goto out;
+ }
+
+ /* ...and check we got what we expected */
+ if (hdr->type != cpu_to_le32 (LNET_MSG_HELLO)) {
+ CERROR("Expecting a HELLO hdr, but got type %d from %pI4h\n",
+ le32_to_cpu(hdr->type),
+ &conn->ksnc_ipaddr);
+ rc = -EPROTO;
+ goto out;
+ }
+
+ hello->kshm_src_nid = le64_to_cpu(hdr->src_nid);
+ hello->kshm_src_pid = le32_to_cpu(hdr->src_pid);
+ hello->kshm_src_incarnation = le64_to_cpu(hdr->msg.hello.incarnation);
+ hello->kshm_ctype = le32_to_cpu(hdr->msg.hello.type);
+ hello->kshm_nips = le32_to_cpu(hdr->payload_length) /
+ sizeof(__u32);
+
+ if (hello->kshm_nips > LNET_MAX_INTERFACES) {
+ CERROR("Bad nips %d from ip %pI4h\n",
+ hello->kshm_nips, &conn->ksnc_ipaddr);
+ rc = -EPROTO;
+ goto out;
+ }
+
+ if (hello->kshm_nips == 0)
+ goto out;
+
+ rc = libcfs_sock_read(sock, hello->kshm_ips,
+ hello->kshm_nips * sizeof(__u32), timeout);
+ if (rc != 0) {
+ CERROR("Error %d reading IPs from ip %pI4h\n",
+ rc, &conn->ksnc_ipaddr);
+ LASSERT(rc < 0 && rc != -EALREADY);
+ goto out;
+ }
+
+ for (i = 0; i < (int) hello->kshm_nips; i++) {
+ hello->kshm_ips[i] = __le32_to_cpu(hello->kshm_ips[i]);
+
+ if (hello->kshm_ips[i] == 0) {
+ CERROR("Zero IP[%d] from ip %pI4h\n",
+ i, &conn->ksnc_ipaddr);
+ rc = -EPROTO;
+ break;
+ }
+ }
+out:
+ LIBCFS_FREE(hdr, sizeof(*hdr));
+
+ return rc;
+}
+
+static int
+ksocknal_recv_hello_v2(ksock_conn_t *conn, ksock_hello_msg_t *hello, int timeout)
+{
+ struct socket *sock = conn->ksnc_sock;
+ int rc;
+ int i;
+
+ if (hello->kshm_magic == LNET_PROTO_MAGIC)
+ conn->ksnc_flip = 0;
+ else
+ conn->ksnc_flip = 1;
+
+ rc = libcfs_sock_read(sock, &hello->kshm_src_nid,
+ offsetof(ksock_hello_msg_t, kshm_ips) -
+ offsetof(ksock_hello_msg_t, kshm_src_nid),
+ timeout);
+ if (rc != 0) {
+ CERROR("Error %d reading HELLO from %pI4h\n",
+ rc, &conn->ksnc_ipaddr);
+ LASSERT(rc < 0 && rc != -EALREADY);
+ return rc;
+ }
+
+ if (conn->ksnc_flip) {
+ __swab32s(&hello->kshm_src_pid);
+ __swab64s(&hello->kshm_src_nid);
+ __swab32s(&hello->kshm_dst_pid);
+ __swab64s(&hello->kshm_dst_nid);
+ __swab64s(&hello->kshm_src_incarnation);
+ __swab64s(&hello->kshm_dst_incarnation);
+ __swab32s(&hello->kshm_ctype);
+ __swab32s(&hello->kshm_nips);
+ }
+
+ if (hello->kshm_nips > LNET_MAX_INTERFACES) {
+ CERROR("Bad nips %d from ip %pI4h\n",
+ hello->kshm_nips, &conn->ksnc_ipaddr);
+ return -EPROTO;
+ }
+
+ if (hello->kshm_nips == 0)
+ return 0;
+
+ rc = libcfs_sock_read(sock, hello->kshm_ips,
+ hello->kshm_nips * sizeof(__u32), timeout);
+ if (rc != 0) {
+ CERROR("Error %d reading IPs from ip %pI4h\n",
+ rc, &conn->ksnc_ipaddr);
+ LASSERT(rc < 0 && rc != -EALREADY);
+ return rc;
+ }
+
+ for (i = 0; i < (int) hello->kshm_nips; i++) {
+ if (conn->ksnc_flip)
+ __swab32s(&hello->kshm_ips[i]);
+
+ if (hello->kshm_ips[i] == 0) {
+ CERROR("Zero IP[%d] from ip %pI4h\n",
+ i, &conn->ksnc_ipaddr);
+ return -EPROTO;
+ }
+ }
+
+ return 0;
+}
+
+static void
+ksocknal_pack_msg_v1(ksock_tx_t *tx)
+{
+ /* V1.x has no KSOCK_MSG_NOOP */
+ LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP);
+ LASSERT(tx->tx_lnetmsg != NULL);
+
+ tx->tx_iov[0].iov_base = &tx->tx_lnetmsg->msg_hdr;
+ tx->tx_iov[0].iov_len = sizeof(lnet_hdr_t);
+
+ tx->tx_resid = tx->tx_nob = tx->tx_lnetmsg->msg_len + sizeof(lnet_hdr_t);
+}
+
+static void
+ksocknal_pack_msg_v2(ksock_tx_t *tx)
+{
+ tx->tx_iov[0].iov_base = &tx->tx_msg;
+
+ if (tx->tx_lnetmsg != NULL) {
+ LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP);
+
+ tx->tx_msg.ksm_u.lnetmsg.ksnm_hdr = tx->tx_lnetmsg->msg_hdr;
+ tx->tx_iov[0].iov_len = sizeof(ksock_msg_t);
+ tx->tx_resid = tx->tx_nob = sizeof(ksock_msg_t) + tx->tx_lnetmsg->msg_len;
+ } else {
+ LASSERT(tx->tx_msg.ksm_type == KSOCK_MSG_NOOP);
+
+ tx->tx_iov[0].iov_len = offsetof(ksock_msg_t, ksm_u.lnetmsg.ksnm_hdr);
+ tx->tx_resid = tx->tx_nob = offsetof(ksock_msg_t, ksm_u.lnetmsg.ksnm_hdr);
+ }
+ /* Don't checksum before start sending, because packet can be piggybacked with ACK */
+}
+
+static void
+ksocknal_unpack_msg_v1(ksock_msg_t *msg)
+{
+ msg->ksm_csum = 0;
+ msg->ksm_type = KSOCK_MSG_LNET;
+ msg->ksm_zc_cookies[0] = msg->ksm_zc_cookies[1] = 0;
+}
+
+static void
+ksocknal_unpack_msg_v2(ksock_msg_t *msg)
+{
+ return; /* Do nothing */
+}
+
+ksock_proto_t ksocknal_protocol_v1x = {
+ .pro_version = KSOCK_PROTO_V1,
+ .pro_send_hello = ksocknal_send_hello_v1,
+ .pro_recv_hello = ksocknal_recv_hello_v1,
+ .pro_pack = ksocknal_pack_msg_v1,
+ .pro_unpack = ksocknal_unpack_msg_v1,
+ .pro_queue_tx_msg = ksocknal_queue_tx_msg_v1,
+ .pro_handle_zcreq = NULL,
+ .pro_handle_zcack = NULL,
+ .pro_queue_tx_zcack = NULL,
+ .pro_match_tx = ksocknal_match_tx
+};
+
+ksock_proto_t ksocknal_protocol_v2x = {
+ .pro_version = KSOCK_PROTO_V2,
+ .pro_send_hello = ksocknal_send_hello_v2,
+ .pro_recv_hello = ksocknal_recv_hello_v2,
+ .pro_pack = ksocknal_pack_msg_v2,
+ .pro_unpack = ksocknal_unpack_msg_v2,
+ .pro_queue_tx_msg = ksocknal_queue_tx_msg_v2,
+ .pro_queue_tx_zcack = ksocknal_queue_tx_zcack_v2,
+ .pro_handle_zcreq = ksocknal_handle_zcreq,
+ .pro_handle_zcack = ksocknal_handle_zcack,
+ .pro_match_tx = ksocknal_match_tx
+};
+
+ksock_proto_t ksocknal_protocol_v3x = {
+ .pro_version = KSOCK_PROTO_V3,
+ .pro_send_hello = ksocknal_send_hello_v2,
+ .pro_recv_hello = ksocknal_recv_hello_v2,
+ .pro_pack = ksocknal_pack_msg_v2,
+ .pro_unpack = ksocknal_unpack_msg_v2,
+ .pro_queue_tx_msg = ksocknal_queue_tx_msg_v2,
+ .pro_queue_tx_zcack = ksocknal_queue_tx_zcack_v3,
+ .pro_handle_zcreq = ksocknal_handle_zcreq,
+ .pro_handle_zcack = ksocknal_handle_zcack,
+ .pro_match_tx = ksocknal_match_tx_v3
+};
diff --git a/kernel/drivers/staging/lustre/lnet/lnet/Makefile b/kernel/drivers/staging/lustre/lnet/lnet/Makefile
new file mode 100644
index 000000000..336b8ea4f
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lnet/lnet/Makefile
@@ -0,0 +1,5 @@
+obj-$(CONFIG_LNET) += lnet.o
+
+lnet-y := api-ni.o config.o lib-me.o lib-msg.o lib-eq.o \
+ lib-md.o lib-ptl.o lib-move.o module.o lo.o router.o \
+ router_proc.o acceptor.o peer.o
diff --git a/kernel/drivers/staging/lustre/lnet/lnet/acceptor.c b/kernel/drivers/staging/lustre/lnet/lnet/acceptor.c
new file mode 100644
index 000000000..72fd1bf70
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lnet/lnet/acceptor.c
@@ -0,0 +1,500 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include "../../include/linux/lnet/lib-lnet.h"
+
+
+static int accept_port = 988;
+static int accept_backlog = 127;
+static int accept_timeout = 5;
+
+static struct {
+ int pta_shutdown;
+ struct socket *pta_sock;
+ struct completion pta_signal;
+} lnet_acceptor_state;
+
+int
+lnet_acceptor_port(void)
+{
+ return accept_port;
+}
+EXPORT_SYMBOL(lnet_acceptor_port);
+
+static inline int
+lnet_accept_magic(__u32 magic, __u32 constant)
+{
+ return (magic == constant ||
+ magic == __swab32(constant));
+}
+
+static char *accept = "secure";
+
+module_param(accept, charp, 0444);
+MODULE_PARM_DESC(accept, "Accept connections (secure|all|none)");
+module_param(accept_port, int, 0444);
+MODULE_PARM_DESC(accept_port, "Acceptor's port (same on all nodes)");
+module_param(accept_backlog, int, 0444);
+MODULE_PARM_DESC(accept_backlog, "Acceptor's listen backlog");
+module_param(accept_timeout, int, 0644);
+MODULE_PARM_DESC(accept_timeout, "Acceptor's timeout (seconds)");
+
+static char *accept_type;
+
+static int
+lnet_acceptor_get_tunables(void)
+{
+ /* Userland acceptor uses 'accept_type' instead of 'accept', due to
+ * conflict with 'accept(2)', but kernel acceptor still uses 'accept'
+ * for compatibility. Hence the trick. */
+ accept_type = accept;
+ return 0;
+}
+
+int
+lnet_acceptor_timeout(void)
+{
+ return accept_timeout;
+}
+EXPORT_SYMBOL(lnet_acceptor_timeout);
+
+void
+lnet_connect_console_error(int rc, lnet_nid_t peer_nid,
+ __u32 peer_ip, int peer_port)
+{
+ switch (rc) {
+ /* "normal" errors */
+ case -ECONNREFUSED:
+ CNETERR("Connection to %s at host %pI4h on port %d was refused: check that Lustre is running on that node.\n",
+ libcfs_nid2str(peer_nid),
+ &peer_ip, peer_port);
+ break;
+ case -EHOSTUNREACH:
+ case -ENETUNREACH:
+ CNETERR("Connection to %s at host %pI4h was unreachable: the network or that node may be down, or Lustre may be misconfigured.\n",
+ libcfs_nid2str(peer_nid), &peer_ip);
+ break;
+ case -ETIMEDOUT:
+ CNETERR("Connection to %s at host %pI4h on port %d took too long: that node may be hung or experiencing high load.\n",
+ libcfs_nid2str(peer_nid),
+ &peer_ip, peer_port);
+ break;
+ case -ECONNRESET:
+ LCONSOLE_ERROR_MSG(0x11b, "Connection to %s at host %pI4h on port %d was reset: is it running a compatible version of Lustre and is %s one of its NIDs?\n",
+ libcfs_nid2str(peer_nid),
+ &peer_ip, peer_port,
+ libcfs_nid2str(peer_nid));
+ break;
+ case -EPROTO:
+ LCONSOLE_ERROR_MSG(0x11c, "Protocol error connecting to %s at host %pI4h on port %d: is it running a compatible version of Lustre?\n",
+ libcfs_nid2str(peer_nid),
+ &peer_ip, peer_port);
+ break;
+ case -EADDRINUSE:
+ LCONSOLE_ERROR_MSG(0x11d, "No privileged ports available to connect to %s at host %pI4h on port %d\n",
+ libcfs_nid2str(peer_nid),
+ &peer_ip, peer_port);
+ break;
+ default:
+ LCONSOLE_ERROR_MSG(0x11e, "Unexpected error %d connecting to %s at host %pI4h on port %d\n",
+ rc, libcfs_nid2str(peer_nid),
+ &peer_ip, peer_port);
+ break;
+ }
+}
+EXPORT_SYMBOL(lnet_connect_console_error);
+
+int
+lnet_connect(struct socket **sockp, lnet_nid_t peer_nid,
+ __u32 local_ip, __u32 peer_ip, int peer_port)
+{
+ lnet_acceptor_connreq_t cr;
+ struct socket *sock;
+ int rc;
+ int port;
+ int fatal;
+
+ CLASSERT(sizeof(cr) <= 16); /* not too big to be on the stack */
+
+ for (port = LNET_ACCEPTOR_MAX_RESERVED_PORT;
+ port >= LNET_ACCEPTOR_MIN_RESERVED_PORT;
+ --port) {
+ /* Iterate through reserved ports. */
+
+ rc = libcfs_sock_connect(&sock, &fatal,
+ local_ip, port,
+ peer_ip, peer_port);
+ if (rc != 0) {
+ if (fatal)
+ goto failed;
+ continue;
+ }
+
+ CLASSERT(LNET_PROTO_ACCEPTOR_VERSION == 1);
+
+ cr.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC;
+ cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION;
+ cr.acr_nid = peer_nid;
+
+ if (the_lnet.ln_testprotocompat != 0) {
+ /* single-shot proto check */
+ lnet_net_lock(LNET_LOCK_EX);
+ if ((the_lnet.ln_testprotocompat & 4) != 0) {
+ cr.acr_version++;
+ the_lnet.ln_testprotocompat &= ~4;
+ }
+ if ((the_lnet.ln_testprotocompat & 8) != 0) {
+ cr.acr_magic = LNET_PROTO_MAGIC;
+ the_lnet.ln_testprotocompat &= ~8;
+ }
+ lnet_net_unlock(LNET_LOCK_EX);
+ }
+
+ rc = libcfs_sock_write(sock, &cr, sizeof(cr),
+ accept_timeout);
+ if (rc != 0)
+ goto failed_sock;
+
+ *sockp = sock;
+ return 0;
+ }
+
+ rc = -EADDRINUSE;
+ goto failed;
+
+ failed_sock:
+ libcfs_sock_release(sock);
+ failed:
+ lnet_connect_console_error(rc, peer_nid, peer_ip, peer_port);
+ return rc;
+}
+EXPORT_SYMBOL(lnet_connect);
+
+
+/* Below is the code common for both kernel and MT user-space */
+
+static int
+lnet_accept(struct socket *sock, __u32 magic)
+{
+ lnet_acceptor_connreq_t cr;
+ __u32 peer_ip;
+ int peer_port;
+ int rc;
+ int flip;
+ lnet_ni_t *ni;
+ char *str;
+
+ LASSERT(sizeof(cr) <= 16); /* not too big for the stack */
+
+ rc = libcfs_sock_getaddr(sock, 1, &peer_ip, &peer_port);
+ LASSERT(rc == 0); /* we succeeded before */
+
+ if (!lnet_accept_magic(magic, LNET_PROTO_ACCEPTOR_MAGIC)) {
+
+ if (lnet_accept_magic(magic, LNET_PROTO_MAGIC)) {
+ /* future version compatibility!
+ * When LNET unifies protocols over all LNDs, the first
+ * thing sent will be a version query. I send back
+ * LNET_PROTO_ACCEPTOR_MAGIC to tell her I'm "old" */
+
+ memset(&cr, 0, sizeof(cr));
+ cr.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC;
+ cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION;
+ rc = libcfs_sock_write(sock, &cr, sizeof(cr),
+ accept_timeout);
+
+ if (rc != 0)
+ CERROR("Error sending magic+version in response to LNET magic from %pI4h: %d\n",
+ &peer_ip, rc);
+ return -EPROTO;
+ }
+
+ if (magic == le32_to_cpu(LNET_PROTO_TCP_MAGIC))
+ str = "'old' socknal/tcpnal";
+ else if (lnet_accept_magic(magic, LNET_PROTO_RA_MAGIC))
+ str = "'old' ranal";
+ else
+ str = "unrecognised";
+
+ LCONSOLE_ERROR_MSG(0x11f, "Refusing connection from %pI4h magic %08x: %s acceptor protocol\n",
+ &peer_ip, magic, str);
+ return -EPROTO;
+ }
+
+ flip = (magic != LNET_PROTO_ACCEPTOR_MAGIC);
+
+ rc = libcfs_sock_read(sock, &cr.acr_version,
+ sizeof(cr.acr_version),
+ accept_timeout);
+ if (rc != 0) {
+ CERROR("Error %d reading connection request version from %pI4h\n",
+ rc, &peer_ip);
+ return -EIO;
+ }
+
+ if (flip)
+ __swab32s(&cr.acr_version);
+
+ if (cr.acr_version != LNET_PROTO_ACCEPTOR_VERSION) {
+ /* future version compatibility!
+ * An acceptor-specific protocol rev will first send a version
+ * query. I send back my current version to tell her I'm
+ * "old". */
+ int peer_version = cr.acr_version;
+
+ memset(&cr, 0, sizeof(cr));
+ cr.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC;
+ cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION;
+
+ rc = libcfs_sock_write(sock, &cr, sizeof(cr),
+ accept_timeout);
+
+ if (rc != 0)
+ CERROR("Error sending magic+version in response to version %d from %pI4h: %d\n",
+ peer_version, &peer_ip, rc);
+ return -EPROTO;
+ }
+
+ rc = libcfs_sock_read(sock, &cr.acr_nid,
+ sizeof(cr) -
+ offsetof(lnet_acceptor_connreq_t, acr_nid),
+ accept_timeout);
+ if (rc != 0) {
+ CERROR("Error %d reading connection request from %pI4h\n",
+ rc, &peer_ip);
+ return -EIO;
+ }
+
+ if (flip)
+ __swab64s(&cr.acr_nid);
+
+ ni = lnet_net2ni(LNET_NIDNET(cr.acr_nid));
+ if (ni == NULL || /* no matching net */
+ ni->ni_nid != cr.acr_nid) { /* right NET, wrong NID! */
+ if (ni != NULL)
+ lnet_ni_decref(ni);
+ LCONSOLE_ERROR_MSG(0x120, "Refusing connection from %pI4h for %s: No matching NI\n",
+ &peer_ip, libcfs_nid2str(cr.acr_nid));
+ return -EPERM;
+ }
+
+ if (ni->ni_lnd->lnd_accept == NULL) {
+ /* This catches a request for the loopback LND */
+ lnet_ni_decref(ni);
+ LCONSOLE_ERROR_MSG(0x121, "Refusing connection from %pI4h for %s: NI doesn not accept IP connections\n",
+ &peer_ip, libcfs_nid2str(cr.acr_nid));
+ return -EPERM;
+ }
+
+ CDEBUG(D_NET, "Accept %s from %pI4h\n",
+ libcfs_nid2str(cr.acr_nid), &peer_ip);
+
+ rc = ni->ni_lnd->lnd_accept(ni, sock);
+
+ lnet_ni_decref(ni);
+ return rc;
+}
+
+static int
+lnet_acceptor(void *arg)
+{
+ struct socket *newsock;
+ int rc;
+ __u32 magic;
+ __u32 peer_ip;
+ int peer_port;
+ int secure = (int)((long_ptr_t)arg);
+
+ LASSERT(lnet_acceptor_state.pta_sock == NULL);
+
+ cfs_block_allsigs();
+
+ rc = libcfs_sock_listen(&lnet_acceptor_state.pta_sock,
+ 0, accept_port, accept_backlog);
+ if (rc != 0) {
+ if (rc == -EADDRINUSE)
+ LCONSOLE_ERROR_MSG(0x122, "Can't start acceptor on port %d: port already in use\n",
+ accept_port);
+ else
+ LCONSOLE_ERROR_MSG(0x123, "Can't start acceptor on port %d: unexpected error %d\n",
+ accept_port, rc);
+
+ lnet_acceptor_state.pta_sock = NULL;
+ } else {
+ LCONSOLE(0, "Accept %s, port %d\n", accept_type, accept_port);
+ }
+
+ /* set init status and unblock parent */
+ lnet_acceptor_state.pta_shutdown = rc;
+ complete(&lnet_acceptor_state.pta_signal);
+
+ if (rc != 0)
+ return rc;
+
+ while (!lnet_acceptor_state.pta_shutdown) {
+
+ rc = libcfs_sock_accept(&newsock, lnet_acceptor_state.pta_sock);
+ if (rc != 0) {
+ if (rc != -EAGAIN) {
+ CWARN("Accept error %d: pausing...\n", rc);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(cfs_time_seconds(1));
+ }
+ continue;
+ }
+
+ /* maybe we're waken up with libcfs_sock_abort_accept() */
+ if (lnet_acceptor_state.pta_shutdown) {
+ libcfs_sock_release(newsock);
+ break;
+ }
+
+ rc = libcfs_sock_getaddr(newsock, 1, &peer_ip, &peer_port);
+ if (rc != 0) {
+ CERROR("Can't determine new connection's address\n");
+ goto failed;
+ }
+
+ if (secure && peer_port > LNET_ACCEPTOR_MAX_RESERVED_PORT) {
+ CERROR("Refusing connection from %pI4h: insecure port %d\n",
+ &peer_ip, peer_port);
+ goto failed;
+ }
+
+ rc = libcfs_sock_read(newsock, &magic, sizeof(magic),
+ accept_timeout);
+ if (rc != 0) {
+ CERROR("Error %d reading connection request from %pI4h\n",
+ rc, &peer_ip);
+ goto failed;
+ }
+
+ rc = lnet_accept(newsock, magic);
+ if (rc != 0)
+ goto failed;
+
+ continue;
+
+failed:
+ libcfs_sock_release(newsock);
+ }
+
+ libcfs_sock_release(lnet_acceptor_state.pta_sock);
+ lnet_acceptor_state.pta_sock = NULL;
+
+ CDEBUG(D_NET, "Acceptor stopping\n");
+
+ /* unblock lnet_acceptor_stop() */
+ complete(&lnet_acceptor_state.pta_signal);
+ return 0;
+}
+
+static inline int
+accept2secure(const char *acc, long *sec)
+{
+ if (!strcmp(acc, "secure")) {
+ *sec = 1;
+ return 1;
+ } else if (!strcmp(acc, "all")) {
+ *sec = 0;
+ return 1;
+ } else if (!strcmp(acc, "none")) {
+ return 0;
+ }
+
+ LCONSOLE_ERROR_MSG(0x124, "Can't parse 'accept=\"%s\"'\n",
+ acc);
+ return -EINVAL;
+}
+
+int
+lnet_acceptor_start(void)
+{
+ int rc;
+ long rc2;
+ long secure;
+
+ LASSERT(lnet_acceptor_state.pta_sock == NULL);
+
+ rc = lnet_acceptor_get_tunables();
+ if (rc != 0)
+ return rc;
+
+
+ init_completion(&lnet_acceptor_state.pta_signal);
+ rc = accept2secure(accept_type, &secure);
+ if (rc <= 0)
+ return rc;
+
+ if (lnet_count_acceptor_nis() == 0) /* not required */
+ return 0;
+
+ rc2 = PTR_ERR(kthread_run(lnet_acceptor,
+ (void *)(ulong_ptr_t)secure,
+ "acceptor_%03ld", secure));
+ if (IS_ERR_VALUE(rc2)) {
+ CERROR("Can't start acceptor thread: %ld\n", rc2);
+
+ return -ESRCH;
+ }
+
+ /* wait for acceptor to startup */
+ wait_for_completion(&lnet_acceptor_state.pta_signal);
+
+ if (!lnet_acceptor_state.pta_shutdown) {
+ /* started OK */
+ LASSERT(lnet_acceptor_state.pta_sock != NULL);
+ return 0;
+ }
+
+ LASSERT(lnet_acceptor_state.pta_sock == NULL);
+
+ return -ENETDOWN;
+}
+
+void
+lnet_acceptor_stop(void)
+{
+ if (lnet_acceptor_state.pta_sock == NULL) /* not running */
+ return;
+
+ lnet_acceptor_state.pta_shutdown = 1;
+ libcfs_sock_abort_accept(lnet_acceptor_state.pta_sock);
+
+ /* block until acceptor signals exit */
+ wait_for_completion(&lnet_acceptor_state.pta_signal);
+}
diff --git a/kernel/drivers/staging/lustre/lnet/lnet/api-ni.c b/kernel/drivers/staging/lustre/lnet/lnet/api-ni.c
new file mode 100644
index 000000000..4a14e5109
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lnet/lnet/api-ni.c
@@ -0,0 +1,1940 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include "../../include/linux/lnet/lib-lnet.h"
+#include <linux/log2.h>
+#include <linux/ktime.h>
+
+#define D_LNI D_CONSOLE
+
+lnet_t the_lnet; /* THE state of the network */
+EXPORT_SYMBOL(the_lnet);
+
+
+static char *ip2nets = "";
+module_param(ip2nets, charp, 0444);
+MODULE_PARM_DESC(ip2nets, "LNET network <- IP table");
+
+static char *networks = "";
+module_param(networks, charp, 0444);
+MODULE_PARM_DESC(networks, "local networks");
+
+static char *routes = "";
+module_param(routes, charp, 0444);
+MODULE_PARM_DESC(routes, "routes to non-local networks");
+
+static int rnet_htable_size = LNET_REMOTE_NETS_HASH_DEFAULT;
+module_param(rnet_htable_size, int, 0444);
+MODULE_PARM_DESC(rnet_htable_size, "size of remote network hash table");
+
+static char *
+lnet_get_routes(void)
+{
+ return routes;
+}
+
+static char *
+lnet_get_networks(void)
+{
+ char *nets;
+ int rc;
+
+ if (*networks != 0 && *ip2nets != 0) {
+ LCONSOLE_ERROR_MSG(0x101, "Please specify EITHER 'networks' or 'ip2nets' but not both at once\n");
+ return NULL;
+ }
+
+ if (*ip2nets != 0) {
+ rc = lnet_parse_ip2nets(&nets, ip2nets);
+ return (rc == 0) ? nets : NULL;
+ }
+
+ if (*networks != 0)
+ return networks;
+
+ return "tcp";
+}
+
+static void
+lnet_init_locks(void)
+{
+ spin_lock_init(&the_lnet.ln_eq_wait_lock);
+ init_waitqueue_head(&the_lnet.ln_eq_waitq);
+ mutex_init(&the_lnet.ln_lnd_mutex);
+ mutex_init(&the_lnet.ln_api_mutex);
+}
+
+static void
+lnet_fini_locks(void)
+{
+}
+
+
+static int
+lnet_create_remote_nets_table(void)
+{
+ int i;
+ struct list_head *hash;
+
+ LASSERT(the_lnet.ln_remote_nets_hash == NULL);
+ LASSERT(the_lnet.ln_remote_nets_hbits > 0);
+ LIBCFS_ALLOC(hash, LNET_REMOTE_NETS_HASH_SIZE * sizeof(*hash));
+ if (hash == NULL) {
+ CERROR("Failed to create remote nets hash table\n");
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++)
+ INIT_LIST_HEAD(&hash[i]);
+ the_lnet.ln_remote_nets_hash = hash;
+ return 0;
+}
+
+static void
+lnet_destroy_remote_nets_table(void)
+{
+ int i;
+
+ if (the_lnet.ln_remote_nets_hash == NULL)
+ return;
+
+ for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++)
+ LASSERT(list_empty(&the_lnet.ln_remote_nets_hash[i]));
+
+ LIBCFS_FREE(the_lnet.ln_remote_nets_hash,
+ LNET_REMOTE_NETS_HASH_SIZE *
+ sizeof(the_lnet.ln_remote_nets_hash[0]));
+ the_lnet.ln_remote_nets_hash = NULL;
+}
+
+static void
+lnet_destroy_locks(void)
+{
+ if (the_lnet.ln_res_lock != NULL) {
+ cfs_percpt_lock_free(the_lnet.ln_res_lock);
+ the_lnet.ln_res_lock = NULL;
+ }
+
+ if (the_lnet.ln_net_lock != NULL) {
+ cfs_percpt_lock_free(the_lnet.ln_net_lock);
+ the_lnet.ln_net_lock = NULL;
+ }
+
+ lnet_fini_locks();
+}
+
+static int
+lnet_create_locks(void)
+{
+ lnet_init_locks();
+
+ the_lnet.ln_res_lock = cfs_percpt_lock_alloc(lnet_cpt_table());
+ if (the_lnet.ln_res_lock == NULL)
+ goto failed;
+
+ the_lnet.ln_net_lock = cfs_percpt_lock_alloc(lnet_cpt_table());
+ if (the_lnet.ln_net_lock == NULL)
+ goto failed;
+
+ return 0;
+
+ failed:
+ lnet_destroy_locks();
+ return -ENOMEM;
+}
+
+static void lnet_assert_wire_constants(void)
+{
+ /* Wire protocol assertions generated by 'wirecheck'
+ * running on Linux robert.bartonsoftware.com 2.6.8-1.521
+ * #1 Mon Aug 16 09:01:18 EDT 2004 i686 athlon i386 GNU/Linux
+ * with gcc version 3.3.3 20040412 (Red Hat Linux 3.3.3-7) */
+
+ /* Constants... */
+ CLASSERT(LNET_PROTO_TCP_MAGIC == 0xeebc0ded);
+ CLASSERT(LNET_PROTO_TCP_VERSION_MAJOR == 1);
+ CLASSERT(LNET_PROTO_TCP_VERSION_MINOR == 0);
+ CLASSERT(LNET_MSG_ACK == 0);
+ CLASSERT(LNET_MSG_PUT == 1);
+ CLASSERT(LNET_MSG_GET == 2);
+ CLASSERT(LNET_MSG_REPLY == 3);
+ CLASSERT(LNET_MSG_HELLO == 4);
+
+ /* Checks for struct ptl_handle_wire_t */
+ CLASSERT((int)sizeof(lnet_handle_wire_t) == 16);
+ CLASSERT((int)offsetof(lnet_handle_wire_t, wh_interface_cookie) == 0);
+ CLASSERT((int)sizeof(((lnet_handle_wire_t *)0)->wh_interface_cookie) == 8);
+ CLASSERT((int)offsetof(lnet_handle_wire_t, wh_object_cookie) == 8);
+ CLASSERT((int)sizeof(((lnet_handle_wire_t *)0)->wh_object_cookie) == 8);
+
+ /* Checks for struct lnet_magicversion_t */
+ CLASSERT((int)sizeof(lnet_magicversion_t) == 8);
+ CLASSERT((int)offsetof(lnet_magicversion_t, magic) == 0);
+ CLASSERT((int)sizeof(((lnet_magicversion_t *)0)->magic) == 4);
+ CLASSERT((int)offsetof(lnet_magicversion_t, version_major) == 4);
+ CLASSERT((int)sizeof(((lnet_magicversion_t *)0)->version_major) == 2);
+ CLASSERT((int)offsetof(lnet_magicversion_t, version_minor) == 6);
+ CLASSERT((int)sizeof(((lnet_magicversion_t *)0)->version_minor) == 2);
+
+ /* Checks for struct lnet_hdr_t */
+ CLASSERT((int)sizeof(lnet_hdr_t) == 72);
+ CLASSERT((int)offsetof(lnet_hdr_t, dest_nid) == 0);
+ CLASSERT((int)sizeof(((lnet_hdr_t *)0)->dest_nid) == 8);
+ CLASSERT((int)offsetof(lnet_hdr_t, src_nid) == 8);
+ CLASSERT((int)sizeof(((lnet_hdr_t *)0)->src_nid) == 8);
+ CLASSERT((int)offsetof(lnet_hdr_t, dest_pid) == 16);
+ CLASSERT((int)sizeof(((lnet_hdr_t *)0)->dest_pid) == 4);
+ CLASSERT((int)offsetof(lnet_hdr_t, src_pid) == 20);
+ CLASSERT((int)sizeof(((lnet_hdr_t *)0)->src_pid) == 4);
+ CLASSERT((int)offsetof(lnet_hdr_t, type) == 24);
+ CLASSERT((int)sizeof(((lnet_hdr_t *)0)->type) == 4);
+ CLASSERT((int)offsetof(lnet_hdr_t, payload_length) == 28);
+ CLASSERT((int)sizeof(((lnet_hdr_t *)0)->payload_length) == 4);
+ CLASSERT((int)offsetof(lnet_hdr_t, msg) == 32);
+ CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg) == 40);
+
+ /* Ack */
+ CLASSERT((int)offsetof(lnet_hdr_t, msg.ack.dst_wmd) == 32);
+ CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.ack.dst_wmd) == 16);
+ CLASSERT((int)offsetof(lnet_hdr_t, msg.ack.match_bits) == 48);
+ CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.ack.match_bits) == 8);
+ CLASSERT((int)offsetof(lnet_hdr_t, msg.ack.mlength) == 56);
+ CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.ack.mlength) == 4);
+
+ /* Put */
+ CLASSERT((int)offsetof(lnet_hdr_t, msg.put.ack_wmd) == 32);
+ CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.put.ack_wmd) == 16);
+ CLASSERT((int)offsetof(lnet_hdr_t, msg.put.match_bits) == 48);
+ CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.put.match_bits) == 8);
+ CLASSERT((int)offsetof(lnet_hdr_t, msg.put.hdr_data) == 56);
+ CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.put.hdr_data) == 8);
+ CLASSERT((int)offsetof(lnet_hdr_t, msg.put.ptl_index) == 64);
+ CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.put.ptl_index) == 4);
+ CLASSERT((int)offsetof(lnet_hdr_t, msg.put.offset) == 68);
+ CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.put.offset) == 4);
+
+ /* Get */
+ CLASSERT((int)offsetof(lnet_hdr_t, msg.get.return_wmd) == 32);
+ CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.get.return_wmd) == 16);
+ CLASSERT((int)offsetof(lnet_hdr_t, msg.get.match_bits) == 48);
+ CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.get.match_bits) == 8);
+ CLASSERT((int)offsetof(lnet_hdr_t, msg.get.ptl_index) == 56);
+ CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.get.ptl_index) == 4);
+ CLASSERT((int)offsetof(lnet_hdr_t, msg.get.src_offset) == 60);
+ CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.get.src_offset) == 4);
+ CLASSERT((int)offsetof(lnet_hdr_t, msg.get.sink_length) == 64);
+ CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.get.sink_length) == 4);
+
+ /* Reply */
+ CLASSERT((int)offsetof(lnet_hdr_t, msg.reply.dst_wmd) == 32);
+ CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.reply.dst_wmd) == 16);
+
+ /* Hello */
+ CLASSERT((int)offsetof(lnet_hdr_t, msg.hello.incarnation) == 32);
+ CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.hello.incarnation) == 8);
+ CLASSERT((int)offsetof(lnet_hdr_t, msg.hello.type) == 40);
+ CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.hello.type) == 4);
+}
+
+static lnd_t *
+lnet_find_lnd_by_type(int type)
+{
+ lnd_t *lnd;
+ struct list_head *tmp;
+
+ /* holding lnd mutex */
+ list_for_each(tmp, &the_lnet.ln_lnds) {
+ lnd = list_entry(tmp, lnd_t, lnd_list);
+
+ if ((int)lnd->lnd_type == type)
+ return lnd;
+ }
+
+ return NULL;
+}
+
+void
+lnet_register_lnd(lnd_t *lnd)
+{
+ LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex);
+
+ LASSERT(the_lnet.ln_init);
+ LASSERT(libcfs_isknown_lnd(lnd->lnd_type));
+ LASSERT(lnet_find_lnd_by_type(lnd->lnd_type) == NULL);
+
+ list_add_tail(&lnd->lnd_list, &the_lnet.ln_lnds);
+ lnd->lnd_refcount = 0;
+
+ CDEBUG(D_NET, "%s LND registered\n", libcfs_lnd2str(lnd->lnd_type));
+
+ LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex);
+}
+EXPORT_SYMBOL(lnet_register_lnd);
+
+void
+lnet_unregister_lnd(lnd_t *lnd)
+{
+ LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex);
+
+ LASSERT(the_lnet.ln_init);
+ LASSERT(lnet_find_lnd_by_type(lnd->lnd_type) == lnd);
+ LASSERT(lnd->lnd_refcount == 0);
+
+ list_del(&lnd->lnd_list);
+ CDEBUG(D_NET, "%s LND unregistered\n", libcfs_lnd2str(lnd->lnd_type));
+
+ LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex);
+}
+EXPORT_SYMBOL(lnet_unregister_lnd);
+
+void
+lnet_counters_get(lnet_counters_t *counters)
+{
+ lnet_counters_t *ctr;
+ int i;
+
+ memset(counters, 0, sizeof(*counters));
+
+ lnet_net_lock(LNET_LOCK_EX);
+
+ cfs_percpt_for_each(ctr, i, the_lnet.ln_counters) {
+ counters->msgs_max += ctr->msgs_max;
+ counters->msgs_alloc += ctr->msgs_alloc;
+ counters->errors += ctr->errors;
+ counters->send_count += ctr->send_count;
+ counters->recv_count += ctr->recv_count;
+ counters->route_count += ctr->route_count;
+ counters->drop_count += ctr->drop_count;
+ counters->send_length += ctr->send_length;
+ counters->recv_length += ctr->recv_length;
+ counters->route_length += ctr->route_length;
+ counters->drop_length += ctr->drop_length;
+
+ }
+ lnet_net_unlock(LNET_LOCK_EX);
+}
+EXPORT_SYMBOL(lnet_counters_get);
+
+void
+lnet_counters_reset(void)
+{
+ lnet_counters_t *counters;
+ int i;
+
+ lnet_net_lock(LNET_LOCK_EX);
+
+ cfs_percpt_for_each(counters, i, the_lnet.ln_counters)
+ memset(counters, 0, sizeof(lnet_counters_t));
+
+ lnet_net_unlock(LNET_LOCK_EX);
+}
+EXPORT_SYMBOL(lnet_counters_reset);
+
+#ifdef LNET_USE_LIB_FREELIST
+
+int
+lnet_freelist_init(lnet_freelist_t *fl, int n, int size)
+{
+ char *space;
+
+ LASSERT(n > 0);
+
+ size += offsetof(lnet_freeobj_t, fo_contents);
+
+ LIBCFS_ALLOC(space, n * size);
+ if (space == NULL)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&fl->fl_list);
+ fl->fl_objs = space;
+ fl->fl_nobjs = n;
+ fl->fl_objsize = size;
+
+ do {
+ memset(space, 0, size);
+ list_add((struct list_head *)space, &fl->fl_list);
+ space += size;
+ } while (--n != 0);
+
+ return 0;
+}
+
+void
+lnet_freelist_fini(lnet_freelist_t *fl)
+{
+ struct list_head *el;
+ int count;
+
+ if (fl->fl_nobjs == 0)
+ return;
+
+ count = 0;
+ for (el = fl->fl_list.next; el != &fl->fl_list; el = el->next)
+ count++;
+
+ LASSERT(count == fl->fl_nobjs);
+
+ LIBCFS_FREE(fl->fl_objs, fl->fl_nobjs * fl->fl_objsize);
+ memset(fl, 0, sizeof(*fl));
+}
+
+#endif /* LNET_USE_LIB_FREELIST */
+
+static __u64
+lnet_create_interface_cookie(void)
+{
+ /* NB the interface cookie in wire handles guards against delayed
+ * replies and ACKs appearing valid after reboot.
+ */
+ return ktime_get_ns();
+}
+
+static char *
+lnet_res_type2str(int type)
+{
+ switch (type) {
+ default:
+ LBUG();
+ case LNET_COOKIE_TYPE_MD:
+ return "MD";
+ case LNET_COOKIE_TYPE_ME:
+ return "ME";
+ case LNET_COOKIE_TYPE_EQ:
+ return "EQ";
+ }
+}
+
+static void
+lnet_res_container_cleanup(struct lnet_res_container *rec)
+{
+ int count = 0;
+
+ if (rec->rec_type == 0) /* not set yet, it's uninitialized */
+ return;
+
+ while (!list_empty(&rec->rec_active)) {
+ struct list_head *e = rec->rec_active.next;
+
+ list_del_init(e);
+ if (rec->rec_type == LNET_COOKIE_TYPE_EQ) {
+ lnet_eq_free(list_entry(e, lnet_eq_t, eq_list));
+
+ } else if (rec->rec_type == LNET_COOKIE_TYPE_MD) {
+ lnet_md_free(list_entry(e, lnet_libmd_t, md_list));
+
+ } else { /* NB: Active MEs should be attached on portals */
+ LBUG();
+ }
+ count++;
+ }
+
+ if (count > 0) {
+ /* Found alive MD/ME/EQ, user really should unlink/free
+ * all of them before finalize LNet, but if someone didn't,
+ * we have to recycle garbage for him */
+ CERROR("%d active elements on exit of %s container\n",
+ count, lnet_res_type2str(rec->rec_type));
+ }
+
+#ifdef LNET_USE_LIB_FREELIST
+ lnet_freelist_fini(&rec->rec_freelist);
+#endif
+ if (rec->rec_lh_hash != NULL) {
+ LIBCFS_FREE(rec->rec_lh_hash,
+ LNET_LH_HASH_SIZE * sizeof(rec->rec_lh_hash[0]));
+ rec->rec_lh_hash = NULL;
+ }
+
+ rec->rec_type = 0; /* mark it as finalized */
+}
+
+static int
+lnet_res_container_setup(struct lnet_res_container *rec,
+ int cpt, int type, int objnum, int objsz)
+{
+ int rc = 0;
+ int i;
+
+ LASSERT(rec->rec_type == 0);
+
+ rec->rec_type = type;
+ INIT_LIST_HEAD(&rec->rec_active);
+
+#ifdef LNET_USE_LIB_FREELIST
+ memset(&rec->rec_freelist, 0, sizeof(rec->rec_freelist));
+ rc = lnet_freelist_init(&rec->rec_freelist, objnum, objsz);
+ if (rc != 0)
+ goto out;
+#endif
+ rec->rec_lh_cookie = (cpt << LNET_COOKIE_TYPE_BITS) | type;
+
+ /* Arbitrary choice of hash table size */
+ LIBCFS_CPT_ALLOC(rec->rec_lh_hash, lnet_cpt_table(), cpt,
+ LNET_LH_HASH_SIZE * sizeof(rec->rec_lh_hash[0]));
+ if (rec->rec_lh_hash == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < LNET_LH_HASH_SIZE; i++)
+ INIT_LIST_HEAD(&rec->rec_lh_hash[i]);
+
+ return 0;
+
+out:
+ CERROR("Failed to setup %s resource container\n",
+ lnet_res_type2str(type));
+ lnet_res_container_cleanup(rec);
+ return rc;
+}
+
+static void
+lnet_res_containers_destroy(struct lnet_res_container **recs)
+{
+ struct lnet_res_container *rec;
+ int i;
+
+ cfs_percpt_for_each(rec, i, recs)
+ lnet_res_container_cleanup(rec);
+
+ cfs_percpt_free(recs);
+}
+
+static struct lnet_res_container **
+lnet_res_containers_create(int type, int objnum, int objsz)
+{
+ struct lnet_res_container **recs;
+ struct lnet_res_container *rec;
+ int rc;
+ int i;
+
+ recs = cfs_percpt_alloc(lnet_cpt_table(), sizeof(*rec));
+ if (recs == NULL) {
+ CERROR("Failed to allocate %s resource containers\n",
+ lnet_res_type2str(type));
+ return NULL;
+ }
+
+ cfs_percpt_for_each(rec, i, recs) {
+ rc = lnet_res_container_setup(rec, i, type, objnum, objsz);
+ if (rc != 0) {
+ lnet_res_containers_destroy(recs);
+ return NULL;
+ }
+ }
+
+ return recs;
+}
+
+lnet_libhandle_t *
+lnet_res_lh_lookup(struct lnet_res_container *rec, __u64 cookie)
+{
+ /* ALWAYS called with lnet_res_lock held */
+ struct list_head *head;
+ lnet_libhandle_t *lh;
+ unsigned int hash;
+
+ if ((cookie & LNET_COOKIE_MASK) != rec->rec_type)
+ return NULL;
+
+ hash = cookie >> (LNET_COOKIE_TYPE_BITS + LNET_CPT_BITS);
+ head = &rec->rec_lh_hash[hash & LNET_LH_HASH_MASK];
+
+ list_for_each_entry(lh, head, lh_hash_chain) {
+ if (lh->lh_cookie == cookie)
+ return lh;
+ }
+
+ return NULL;
+}
+
+void
+lnet_res_lh_initialize(struct lnet_res_container *rec, lnet_libhandle_t *lh)
+{
+ /* ALWAYS called with lnet_res_lock held */
+ unsigned int ibits = LNET_COOKIE_TYPE_BITS + LNET_CPT_BITS;
+ unsigned int hash;
+
+ lh->lh_cookie = rec->rec_lh_cookie;
+ rec->rec_lh_cookie += 1 << ibits;
+
+ hash = (lh->lh_cookie >> ibits) & LNET_LH_HASH_MASK;
+
+ list_add(&lh->lh_hash_chain, &rec->rec_lh_hash[hash]);
+}
+
+
+int lnet_unprepare(void);
+
+static int
+lnet_prepare(lnet_pid_t requested_pid)
+{
+ /* Prepare to bring up the network */
+ struct lnet_res_container **recs;
+ int rc = 0;
+
+ LASSERT(the_lnet.ln_refcount == 0);
+
+ the_lnet.ln_routing = 0;
+
+ LASSERT((requested_pid & LNET_PID_USERFLAG) == 0);
+ the_lnet.ln_pid = requested_pid;
+
+ INIT_LIST_HEAD(&the_lnet.ln_test_peers);
+ INIT_LIST_HEAD(&the_lnet.ln_nis);
+ INIT_LIST_HEAD(&the_lnet.ln_nis_cpt);
+ INIT_LIST_HEAD(&the_lnet.ln_nis_zombie);
+ INIT_LIST_HEAD(&the_lnet.ln_routers);
+
+ rc = lnet_create_remote_nets_table();
+ if (rc != 0)
+ goto failed;
+
+ the_lnet.ln_interface_cookie = lnet_create_interface_cookie();
+
+ the_lnet.ln_counters = cfs_percpt_alloc(lnet_cpt_table(),
+ sizeof(lnet_counters_t));
+ if (the_lnet.ln_counters == NULL) {
+ CERROR("Failed to allocate counters for LNet\n");
+ rc = -ENOMEM;
+ goto failed;
+ }
+
+ rc = lnet_peer_tables_create();
+ if (rc != 0)
+ goto failed;
+
+ rc = lnet_msg_containers_create();
+ if (rc != 0)
+ goto failed;
+
+ rc = lnet_res_container_setup(&the_lnet.ln_eq_container, 0,
+ LNET_COOKIE_TYPE_EQ, LNET_FL_MAX_EQS,
+ sizeof(lnet_eq_t));
+ if (rc != 0)
+ goto failed;
+
+ recs = lnet_res_containers_create(LNET_COOKIE_TYPE_ME, LNET_FL_MAX_MES,
+ sizeof(lnet_me_t));
+ if (recs == NULL) {
+ rc = -ENOMEM;
+ goto failed;
+ }
+
+ the_lnet.ln_me_containers = recs;
+
+ recs = lnet_res_containers_create(LNET_COOKIE_TYPE_MD, LNET_FL_MAX_MDS,
+ sizeof(lnet_libmd_t));
+ if (recs == NULL) {
+ rc = -ENOMEM;
+ goto failed;
+ }
+
+ the_lnet.ln_md_containers = recs;
+
+ rc = lnet_portals_create();
+ if (rc != 0) {
+ CERROR("Failed to create portals for LNet: %d\n", rc);
+ goto failed;
+ }
+
+ return 0;
+
+ failed:
+ lnet_unprepare();
+ return rc;
+}
+
+int
+lnet_unprepare(void)
+{
+ /* NB no LNET_LOCK since this is the last reference. All LND instances
+ * have shut down already, so it is safe to unlink and free all
+ * descriptors, even those that appear committed to a network op (eg MD
+ * with non-zero pending count) */
+
+ lnet_fail_nid(LNET_NID_ANY, 0);
+
+ LASSERT(the_lnet.ln_refcount == 0);
+ LASSERT(list_empty(&the_lnet.ln_test_peers));
+ LASSERT(list_empty(&the_lnet.ln_nis));
+ LASSERT(list_empty(&the_lnet.ln_nis_cpt));
+ LASSERT(list_empty(&the_lnet.ln_nis_zombie));
+
+ lnet_portals_destroy();
+
+ if (the_lnet.ln_md_containers != NULL) {
+ lnet_res_containers_destroy(the_lnet.ln_md_containers);
+ the_lnet.ln_md_containers = NULL;
+ }
+
+ if (the_lnet.ln_me_containers != NULL) {
+ lnet_res_containers_destroy(the_lnet.ln_me_containers);
+ the_lnet.ln_me_containers = NULL;
+ }
+
+ lnet_res_container_cleanup(&the_lnet.ln_eq_container);
+
+ lnet_msg_containers_destroy();
+ lnet_peer_tables_destroy();
+ lnet_rtrpools_free();
+
+ if (the_lnet.ln_counters != NULL) {
+ cfs_percpt_free(the_lnet.ln_counters);
+ the_lnet.ln_counters = NULL;
+ }
+ lnet_destroy_remote_nets_table();
+
+ return 0;
+}
+
+lnet_ni_t *
+lnet_net2ni_locked(__u32 net, int cpt)
+{
+ struct list_head *tmp;
+ lnet_ni_t *ni;
+
+ LASSERT(cpt != LNET_LOCK_EX);
+
+ list_for_each(tmp, &the_lnet.ln_nis) {
+ ni = list_entry(tmp, lnet_ni_t, ni_list);
+
+ if (LNET_NIDNET(ni->ni_nid) == net) {
+ lnet_ni_addref_locked(ni, cpt);
+ return ni;
+ }
+ }
+
+ return NULL;
+}
+
+lnet_ni_t *
+lnet_net2ni(__u32 net)
+{
+ lnet_ni_t *ni;
+
+ lnet_net_lock(0);
+ ni = lnet_net2ni_locked(net, 0);
+ lnet_net_unlock(0);
+
+ return ni;
+}
+EXPORT_SYMBOL(lnet_net2ni);
+
+static unsigned int
+lnet_nid_cpt_hash(lnet_nid_t nid, unsigned int number)
+{
+ __u64 key = nid;
+ unsigned int val;
+
+ LASSERT(number >= 1 && number <= LNET_CPT_NUMBER);
+
+ if (number == 1)
+ return 0;
+
+ val = hash_long(key, LNET_CPT_BITS);
+ /* NB: LNET_CP_NUMBER doesn't have to be PO2 */
+ if (val < number)
+ return val;
+
+ return (unsigned int)(key + val + (val >> 1)) % number;
+}
+
+int
+lnet_cpt_of_nid_locked(lnet_nid_t nid)
+{
+ struct lnet_ni *ni;
+
+ /* must called with hold of lnet_net_lock */
+ if (LNET_CPT_NUMBER == 1)
+ return 0; /* the only one */
+
+ /* take lnet_net_lock(any) would be OK */
+ if (!list_empty(&the_lnet.ln_nis_cpt)) {
+ list_for_each_entry(ni, &the_lnet.ln_nis_cpt, ni_cptlist) {
+ if (LNET_NIDNET(ni->ni_nid) != LNET_NIDNET(nid))
+ continue;
+
+ LASSERT(ni->ni_cpts != NULL);
+ return ni->ni_cpts[lnet_nid_cpt_hash
+ (nid, ni->ni_ncpts)];
+ }
+ }
+
+ return lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER);
+}
+
+int
+lnet_cpt_of_nid(lnet_nid_t nid)
+{
+ int cpt;
+ int cpt2;
+
+ if (LNET_CPT_NUMBER == 1)
+ return 0; /* the only one */
+
+ if (list_empty(&the_lnet.ln_nis_cpt))
+ return lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER);
+
+ cpt = lnet_net_lock_current();
+ cpt2 = lnet_cpt_of_nid_locked(nid);
+ lnet_net_unlock(cpt);
+
+ return cpt2;
+}
+EXPORT_SYMBOL(lnet_cpt_of_nid);
+
+int
+lnet_islocalnet(__u32 net)
+{
+ struct lnet_ni *ni;
+ int cpt;
+
+ cpt = lnet_net_lock_current();
+
+ ni = lnet_net2ni_locked(net, cpt);
+ if (ni != NULL)
+ lnet_ni_decref_locked(ni, cpt);
+
+ lnet_net_unlock(cpt);
+
+ return ni != NULL;
+}
+
+lnet_ni_t *
+lnet_nid2ni_locked(lnet_nid_t nid, int cpt)
+{
+ struct lnet_ni *ni;
+ struct list_head *tmp;
+
+ LASSERT(cpt != LNET_LOCK_EX);
+
+ list_for_each(tmp, &the_lnet.ln_nis) {
+ ni = list_entry(tmp, lnet_ni_t, ni_list);
+
+ if (ni->ni_nid == nid) {
+ lnet_ni_addref_locked(ni, cpt);
+ return ni;
+ }
+ }
+
+ return NULL;
+}
+
+int
+lnet_islocalnid(lnet_nid_t nid)
+{
+ struct lnet_ni *ni;
+ int cpt;
+
+ cpt = lnet_net_lock_current();
+ ni = lnet_nid2ni_locked(nid, cpt);
+ if (ni != NULL)
+ lnet_ni_decref_locked(ni, cpt);
+ lnet_net_unlock(cpt);
+
+ return ni != NULL;
+}
+
+int
+lnet_count_acceptor_nis(void)
+{
+ /* Return the # of NIs that need the acceptor. */
+ int count = 0;
+ struct list_head *tmp;
+ struct lnet_ni *ni;
+ int cpt;
+
+ cpt = lnet_net_lock_current();
+ list_for_each(tmp, &the_lnet.ln_nis) {
+ ni = list_entry(tmp, lnet_ni_t, ni_list);
+
+ if (ni->ni_lnd->lnd_accept != NULL)
+ count++;
+ }
+
+ lnet_net_unlock(cpt);
+
+ return count;
+}
+
+static int
+lnet_ni_tq_credits(lnet_ni_t *ni)
+{
+ int credits;
+
+ LASSERT(ni->ni_ncpts >= 1);
+
+ if (ni->ni_ncpts == 1)
+ return ni->ni_maxtxcredits;
+
+ credits = ni->ni_maxtxcredits / ni->ni_ncpts;
+ credits = max(credits, 8 * ni->ni_peertxcredits);
+ credits = min(credits, ni->ni_maxtxcredits);
+
+ return credits;
+}
+
+static void
+lnet_shutdown_lndnis(void)
+{
+ int i;
+ int islo;
+ lnet_ni_t *ni;
+
+ /* NB called holding the global mutex */
+
+ /* All quiet on the API front */
+ LASSERT(!the_lnet.ln_shutdown);
+ LASSERT(the_lnet.ln_refcount == 0);
+ LASSERT(list_empty(&the_lnet.ln_nis_zombie));
+
+ lnet_net_lock(LNET_LOCK_EX);
+ the_lnet.ln_shutdown = 1; /* flag shutdown */
+
+ /* Unlink NIs from the global table */
+ while (!list_empty(&the_lnet.ln_nis)) {
+ ni = list_entry(the_lnet.ln_nis.next,
+ lnet_ni_t, ni_list);
+ /* move it to zombie list and nobody can find it anymore */
+ list_move(&ni->ni_list, &the_lnet.ln_nis_zombie);
+ lnet_ni_decref_locked(ni, 0); /* drop ln_nis' ref */
+
+ if (!list_empty(&ni->ni_cptlist)) {
+ list_del_init(&ni->ni_cptlist);
+ lnet_ni_decref_locked(ni, 0);
+ }
+ }
+
+ /* Drop the cached eqwait NI. */
+ if (the_lnet.ln_eq_waitni != NULL) {
+ lnet_ni_decref_locked(the_lnet.ln_eq_waitni, 0);
+ the_lnet.ln_eq_waitni = NULL;
+ }
+
+ /* Drop the cached loopback NI. */
+ if (the_lnet.ln_loni != NULL) {
+ lnet_ni_decref_locked(the_lnet.ln_loni, 0);
+ the_lnet.ln_loni = NULL;
+ }
+
+ lnet_net_unlock(LNET_LOCK_EX);
+
+ /* Clear lazy portals and drop delayed messages which hold refs
+ * on their lnet_msg_t::msg_rxpeer */
+ for (i = 0; i < the_lnet.ln_nportals; i++)
+ LNetClearLazyPortal(i);
+
+ /* Clear the peer table and wait for all peers to go (they hold refs on
+ * their NIs) */
+ lnet_peer_tables_cleanup();
+
+ lnet_net_lock(LNET_LOCK_EX);
+ /* Now wait for the NI's I just nuked to show up on ln_zombie_nis
+ * and shut them down in guaranteed thread context */
+ i = 2;
+ while (!list_empty(&the_lnet.ln_nis_zombie)) {
+ int *ref;
+ int j;
+
+ ni = list_entry(the_lnet.ln_nis_zombie.next,
+ lnet_ni_t, ni_list);
+ list_del_init(&ni->ni_list);
+ cfs_percpt_for_each(ref, j, ni->ni_refs) {
+ if (*ref == 0)
+ continue;
+ /* still busy, add it back to zombie list */
+ list_add(&ni->ni_list, &the_lnet.ln_nis_zombie);
+ break;
+ }
+
+ if (!list_empty(&ni->ni_list)) {
+ lnet_net_unlock(LNET_LOCK_EX);
+ ++i;
+ if ((i & (-i)) == i) {
+ CDEBUG(D_WARNING, "Waiting for zombie LNI %s\n",
+ libcfs_nid2str(ni->ni_nid));
+ }
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(cfs_time_seconds(1));
+ lnet_net_lock(LNET_LOCK_EX);
+ continue;
+ }
+
+ ni->ni_lnd->lnd_refcount--;
+ lnet_net_unlock(LNET_LOCK_EX);
+
+ islo = ni->ni_lnd->lnd_type == LOLND;
+
+ LASSERT(!in_interrupt());
+ (ni->ni_lnd->lnd_shutdown)(ni);
+
+ /* can't deref lnd anymore now; it might have unregistered
+ * itself... */
+
+ if (!islo)
+ CDEBUG(D_LNI, "Removed LNI %s\n",
+ libcfs_nid2str(ni->ni_nid));
+
+ lnet_ni_free(ni);
+ i = 2;
+
+ lnet_net_lock(LNET_LOCK_EX);
+ }
+
+ the_lnet.ln_shutdown = 0;
+ lnet_net_unlock(LNET_LOCK_EX);
+
+ if (the_lnet.ln_network_tokens != NULL) {
+ LIBCFS_FREE(the_lnet.ln_network_tokens,
+ the_lnet.ln_network_tokens_nob);
+ the_lnet.ln_network_tokens = NULL;
+ }
+}
+
+static int
+lnet_startup_lndnis(void)
+{
+ lnd_t *lnd;
+ struct lnet_ni *ni;
+ struct lnet_tx_queue *tq;
+ struct list_head nilist;
+ int i;
+ int rc = 0;
+ int lnd_type;
+ int nicount = 0;
+ char *nets = lnet_get_networks();
+
+ INIT_LIST_HEAD(&nilist);
+
+ if (nets == NULL)
+ goto failed;
+
+ rc = lnet_parse_networks(&nilist, nets);
+ if (rc != 0)
+ goto failed;
+
+ while (!list_empty(&nilist)) {
+ ni = list_entry(nilist.next, lnet_ni_t, ni_list);
+ lnd_type = LNET_NETTYP(LNET_NIDNET(ni->ni_nid));
+
+ LASSERT(libcfs_isknown_lnd(lnd_type));
+
+ if (lnd_type == CIBLND ||
+ lnd_type == OPENIBLND ||
+ lnd_type == IIBLND ||
+ lnd_type == VIBLND) {
+ CERROR("LND %s obsoleted\n",
+ libcfs_lnd2str(lnd_type));
+ goto failed;
+ }
+
+ LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex);
+ lnd = lnet_find_lnd_by_type(lnd_type);
+
+ if (lnd == NULL) {
+ LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex);
+ rc = request_module("%s",
+ libcfs_lnd2modname(lnd_type));
+ LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex);
+
+ lnd = lnet_find_lnd_by_type(lnd_type);
+ if (lnd == NULL) {
+ LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex);
+ CERROR("Can't load LND %s, module %s, rc=%d\n",
+ libcfs_lnd2str(lnd_type),
+ libcfs_lnd2modname(lnd_type), rc);
+ goto failed;
+ }
+ }
+
+ lnet_net_lock(LNET_LOCK_EX);
+ lnd->lnd_refcount++;
+ lnet_net_unlock(LNET_LOCK_EX);
+
+ ni->ni_lnd = lnd;
+
+ rc = (lnd->lnd_startup)(ni);
+
+ LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex);
+
+ if (rc != 0) {
+ LCONSOLE_ERROR_MSG(0x105, "Error %d starting up LNI %s\n",
+ rc, libcfs_lnd2str(lnd->lnd_type));
+ lnet_net_lock(LNET_LOCK_EX);
+ lnd->lnd_refcount--;
+ lnet_net_unlock(LNET_LOCK_EX);
+ goto failed;
+ }
+
+ LASSERT(ni->ni_peertimeout <= 0 || lnd->lnd_query != NULL);
+
+ list_del(&ni->ni_list);
+
+ lnet_net_lock(LNET_LOCK_EX);
+ /* refcount for ln_nis */
+ lnet_ni_addref_locked(ni, 0);
+ list_add_tail(&ni->ni_list, &the_lnet.ln_nis);
+ if (ni->ni_cpts != NULL) {
+ list_add_tail(&ni->ni_cptlist,
+ &the_lnet.ln_nis_cpt);
+ lnet_ni_addref_locked(ni, 0);
+ }
+
+ lnet_net_unlock(LNET_LOCK_EX);
+
+ if (lnd->lnd_type == LOLND) {
+ lnet_ni_addref(ni);
+ LASSERT(the_lnet.ln_loni == NULL);
+ the_lnet.ln_loni = ni;
+ continue;
+ }
+
+ if (ni->ni_peertxcredits == 0 ||
+ ni->ni_maxtxcredits == 0) {
+ LCONSOLE_ERROR_MSG(0x107, "LNI %s has no %scredits\n",
+ libcfs_lnd2str(lnd->lnd_type),
+ ni->ni_peertxcredits == 0 ?
+ "" : "per-peer ");
+ goto failed;
+ }
+
+ cfs_percpt_for_each(tq, i, ni->ni_tx_queues) {
+ tq->tq_credits_min =
+ tq->tq_credits_max =
+ tq->tq_credits = lnet_ni_tq_credits(ni);
+ }
+
+ CDEBUG(D_LNI, "Added LNI %s [%d/%d/%d/%d]\n",
+ libcfs_nid2str(ni->ni_nid), ni->ni_peertxcredits,
+ lnet_ni_tq_credits(ni) * LNET_CPT_NUMBER,
+ ni->ni_peerrtrcredits, ni->ni_peertimeout);
+
+ nicount++;
+ }
+
+ if (the_lnet.ln_eq_waitni != NULL && nicount > 1) {
+ lnd_type = the_lnet.ln_eq_waitni->ni_lnd->lnd_type;
+ LCONSOLE_ERROR_MSG(0x109, "LND %s can only run single-network\n",
+ libcfs_lnd2str(lnd_type));
+ goto failed;
+ }
+
+ return 0;
+
+ failed:
+ lnet_shutdown_lndnis();
+
+ while (!list_empty(&nilist)) {
+ ni = list_entry(nilist.next, lnet_ni_t, ni_list);
+ list_del(&ni->ni_list);
+ lnet_ni_free(ni);
+ }
+
+ return -ENETDOWN;
+}
+
+/**
+ * Initialize LNet library.
+ *
+ * Only userspace program needs to call this function - it's automatically
+ * called in the kernel at module loading time. Caller has to call LNetFini()
+ * after a call to LNetInit(), if and only if the latter returned 0. It must
+ * be called exactly once.
+ *
+ * \return 0 on success, and -ve on failures.
+ */
+int
+LNetInit(void)
+{
+ int rc;
+
+ lnet_assert_wire_constants();
+ LASSERT(!the_lnet.ln_init);
+
+ memset(&the_lnet, 0, sizeof(the_lnet));
+
+ /* refer to global cfs_cpt_table for now */
+ the_lnet.ln_cpt_table = cfs_cpt_table;
+ the_lnet.ln_cpt_number = cfs_cpt_number(cfs_cpt_table);
+
+ LASSERT(the_lnet.ln_cpt_number > 0);
+ if (the_lnet.ln_cpt_number > LNET_CPT_MAX) {
+ /* we are under risk of consuming all lh_cookie */
+ CERROR("Can't have %d CPTs for LNet (max allowed is %d), please change setting of CPT-table and retry\n",
+ the_lnet.ln_cpt_number, LNET_CPT_MAX);
+ return -1;
+ }
+
+ while ((1 << the_lnet.ln_cpt_bits) < the_lnet.ln_cpt_number)
+ the_lnet.ln_cpt_bits++;
+
+ rc = lnet_create_locks();
+ if (rc != 0) {
+ CERROR("Can't create LNet global locks: %d\n", rc);
+ return -1;
+ }
+
+ the_lnet.ln_refcount = 0;
+ the_lnet.ln_init = 1;
+ LNetInvalidateHandle(&the_lnet.ln_rc_eqh);
+ INIT_LIST_HEAD(&the_lnet.ln_lnds);
+ INIT_LIST_HEAD(&the_lnet.ln_rcd_zombie);
+ INIT_LIST_HEAD(&the_lnet.ln_rcd_deathrow);
+
+ /* The hash table size is the number of bits it takes to express the set
+ * ln_num_routes, minus 1 (better to under estimate than over so we
+ * don't waste memory). */
+ if (rnet_htable_size <= 0)
+ rnet_htable_size = LNET_REMOTE_NETS_HASH_DEFAULT;
+ else if (rnet_htable_size > LNET_REMOTE_NETS_HASH_MAX)
+ rnet_htable_size = LNET_REMOTE_NETS_HASH_MAX;
+ the_lnet.ln_remote_nets_hbits = max_t(int, 1,
+ order_base_2(rnet_htable_size) - 1);
+
+ /* All LNDs apart from the LOLND are in separate modules. They
+ * register themselves when their module loads, and unregister
+ * themselves when their module is unloaded. */
+ lnet_register_lnd(&the_lolnd);
+ return 0;
+}
+EXPORT_SYMBOL(LNetInit);
+
+/**
+ * Finalize LNet library.
+ *
+ * Only userspace program needs to call this function. It can be called
+ * at most once.
+ *
+ * \pre LNetInit() called with success.
+ * \pre All LNet users called LNetNIFini() for matching LNetNIInit() calls.
+ */
+void
+LNetFini(void)
+{
+ LASSERT(the_lnet.ln_init);
+ LASSERT(the_lnet.ln_refcount == 0);
+
+ while (!list_empty(&the_lnet.ln_lnds))
+ lnet_unregister_lnd(list_entry(the_lnet.ln_lnds.next,
+ lnd_t, lnd_list));
+ lnet_destroy_locks();
+
+ the_lnet.ln_init = 0;
+}
+EXPORT_SYMBOL(LNetFini);
+
+/**
+ * Set LNet PID and start LNet interfaces, routing, and forwarding.
+ *
+ * Userspace program should call this after a successful call to LNetInit().
+ * Users must call this function at least once before any other functions.
+ * For each successful call there must be a corresponding call to
+ * LNetNIFini(). For subsequent calls to LNetNIInit(), \a requested_pid is
+ * ignored.
+ *
+ * The PID used by LNet may be different from the one requested.
+ * See LNetGetId().
+ *
+ * \param requested_pid PID requested by the caller.
+ *
+ * \return >= 0 on success, and < 0 error code on failures.
+ */
+int
+LNetNIInit(lnet_pid_t requested_pid)
+{
+ int im_a_router = 0;
+ int rc;
+
+ LNET_MUTEX_LOCK(&the_lnet.ln_api_mutex);
+
+ LASSERT(the_lnet.ln_init);
+ CDEBUG(D_OTHER, "refs %d\n", the_lnet.ln_refcount);
+
+ if (the_lnet.ln_refcount > 0) {
+ rc = the_lnet.ln_refcount++;
+ goto out;
+ }
+
+ lnet_get_tunables();
+
+ if (requested_pid == LNET_PID_ANY) {
+ /* Don't instantiate LNET just for me */
+ rc = -ENETDOWN;
+ goto failed0;
+ }
+
+ rc = lnet_prepare(requested_pid);
+ if (rc != 0)
+ goto failed0;
+
+ rc = lnet_startup_lndnis();
+ if (rc != 0)
+ goto failed1;
+
+ rc = lnet_parse_routes(lnet_get_routes(), &im_a_router);
+ if (rc != 0)
+ goto failed2;
+
+ rc = lnet_check_routes();
+ if (rc != 0)
+ goto failed2;
+
+ rc = lnet_rtrpools_alloc(im_a_router);
+ if (rc != 0)
+ goto failed2;
+
+ rc = lnet_acceptor_start();
+ if (rc != 0)
+ goto failed2;
+
+ the_lnet.ln_refcount = 1;
+ /* Now I may use my own API functions... */
+
+ /* NB router checker needs the_lnet.ln_ping_info in
+ * lnet_router_checker -> lnet_update_ni_status_locked */
+ rc = lnet_ping_target_init();
+ if (rc != 0)
+ goto failed3;
+
+ rc = lnet_router_checker_start();
+ if (rc != 0)
+ goto failed4;
+
+ lnet_proc_init();
+ goto out;
+
+ failed4:
+ lnet_ping_target_fini();
+ failed3:
+ the_lnet.ln_refcount = 0;
+ lnet_acceptor_stop();
+ failed2:
+ lnet_destroy_routes();
+ lnet_shutdown_lndnis();
+ failed1:
+ lnet_unprepare();
+ failed0:
+ LASSERT(rc < 0);
+ out:
+ LNET_MUTEX_UNLOCK(&the_lnet.ln_api_mutex);
+ return rc;
+}
+EXPORT_SYMBOL(LNetNIInit);
+
+/**
+ * Stop LNet interfaces, routing, and forwarding.
+ *
+ * Users must call this function once for each successful call to LNetNIInit().
+ * Once the LNetNIFini() operation has been started, the results of pending
+ * API operations are undefined.
+ *
+ * \return always 0 for current implementation.
+ */
+int
+LNetNIFini(void)
+{
+ LNET_MUTEX_LOCK(&the_lnet.ln_api_mutex);
+
+ LASSERT(the_lnet.ln_init);
+ LASSERT(the_lnet.ln_refcount > 0);
+
+ if (the_lnet.ln_refcount != 1) {
+ the_lnet.ln_refcount--;
+ } else {
+ LASSERT(!the_lnet.ln_niinit_self);
+
+ lnet_proc_fini();
+ lnet_router_checker_stop();
+ lnet_ping_target_fini();
+
+ /* Teardown fns that use my own API functions BEFORE here */
+ the_lnet.ln_refcount = 0;
+
+ lnet_acceptor_stop();
+ lnet_destroy_routes();
+ lnet_shutdown_lndnis();
+ lnet_unprepare();
+ }
+
+ LNET_MUTEX_UNLOCK(&the_lnet.ln_api_mutex);
+ return 0;
+}
+EXPORT_SYMBOL(LNetNIFini);
+
+/**
+ * This is an ugly hack to export IOC_LIBCFS_DEBUG_PEER and
+ * IOC_LIBCFS_PORTALS_COMPATIBILITY commands to users, by tweaking the LNet
+ * internal ioctl handler.
+ *
+ * IOC_LIBCFS_PORTALS_COMPATIBILITY is now deprecated, don't use it.
+ *
+ * \param cmd IOC_LIBCFS_DEBUG_PEER to print debugging data about a peer.
+ * The data will be printed to system console. Don't use it excessively.
+ * \param arg A pointer to lnet_process_id_t, process ID of the peer.
+ *
+ * \return Always return 0 when called by users directly (i.e., not via ioctl).
+ */
+int
+LNetCtl(unsigned int cmd, void *arg)
+{
+ struct libcfs_ioctl_data *data = arg;
+ lnet_process_id_t id = {0};
+ lnet_ni_t *ni;
+ int rc;
+
+ LASSERT(the_lnet.ln_init);
+ LASSERT(the_lnet.ln_refcount > 0);
+
+ switch (cmd) {
+ case IOC_LIBCFS_GET_NI:
+ rc = LNetGetId(data->ioc_count, &id);
+ data->ioc_nid = id.nid;
+ return rc;
+
+ case IOC_LIBCFS_FAIL_NID:
+ return lnet_fail_nid(data->ioc_nid, data->ioc_count);
+
+ case IOC_LIBCFS_ADD_ROUTE:
+ rc = lnet_add_route(data->ioc_net, data->ioc_count,
+ data->ioc_nid, data->ioc_priority);
+ return (rc != 0) ? rc : lnet_check_routes();
+
+ case IOC_LIBCFS_DEL_ROUTE:
+ return lnet_del_route(data->ioc_net, data->ioc_nid);
+
+ case IOC_LIBCFS_GET_ROUTE:
+ return lnet_get_route(data->ioc_count,
+ &data->ioc_net, &data->ioc_count,
+ &data->ioc_nid, &data->ioc_flags,
+ &data->ioc_priority);
+ case IOC_LIBCFS_NOTIFY_ROUTER:
+ return lnet_notify(NULL, data->ioc_nid, data->ioc_flags,
+ cfs_time_current() -
+ cfs_time_seconds(get_seconds() -
+ (time_t)data->ioc_u64[0]));
+
+ case IOC_LIBCFS_PORTALS_COMPATIBILITY:
+ /* This can be removed once lustre stops calling it */
+ return 0;
+
+ case IOC_LIBCFS_LNET_DIST:
+ rc = LNetDist(data->ioc_nid, &data->ioc_nid, &data->ioc_u32[1]);
+ if (rc < 0 && rc != -EHOSTUNREACH)
+ return rc;
+
+ data->ioc_u32[0] = rc;
+ return 0;
+
+ case IOC_LIBCFS_TESTPROTOCOMPAT:
+ lnet_net_lock(LNET_LOCK_EX);
+ the_lnet.ln_testprotocompat = data->ioc_flags;
+ lnet_net_unlock(LNET_LOCK_EX);
+ return 0;
+
+ case IOC_LIBCFS_PING:
+ id.nid = data->ioc_nid;
+ id.pid = data->ioc_u32[0];
+ rc = lnet_ping(id, data->ioc_u32[1], /* timeout */
+ (lnet_process_id_t *)data->ioc_pbuf1,
+ data->ioc_plen1/sizeof(lnet_process_id_t));
+ if (rc < 0)
+ return rc;
+ data->ioc_count = rc;
+ return 0;
+
+ case IOC_LIBCFS_DEBUG_PEER: {
+ /* CAVEAT EMPTOR: this one designed for calling directly; not
+ * via an ioctl */
+ id = *((lnet_process_id_t *) arg);
+
+ lnet_debug_peer(id.nid);
+
+ ni = lnet_net2ni(LNET_NIDNET(id.nid));
+ if (ni == NULL) {
+ CDEBUG(D_WARNING, "No NI for %s\n", libcfs_id2str(id));
+ } else {
+ if (ni->ni_lnd->lnd_ctl == NULL) {
+ CDEBUG(D_WARNING, "No ctl for %s\n",
+ libcfs_id2str(id));
+ } else {
+ (void)ni->ni_lnd->lnd_ctl(ni, cmd, arg);
+ }
+
+ lnet_ni_decref(ni);
+ }
+ return 0;
+ }
+
+ default:
+ ni = lnet_net2ni(data->ioc_net);
+ if (ni == NULL)
+ return -EINVAL;
+
+ if (ni->ni_lnd->lnd_ctl == NULL)
+ rc = -EINVAL;
+ else
+ rc = ni->ni_lnd->lnd_ctl(ni, cmd, arg);
+
+ lnet_ni_decref(ni);
+ return rc;
+ }
+ /* not reached */
+}
+EXPORT_SYMBOL(LNetCtl);
+
+/**
+ * Retrieve the lnet_process_id_t ID of LNet interface at \a index. Note that
+ * all interfaces share a same PID, as requested by LNetNIInit().
+ *
+ * \param index Index of the interface to look up.
+ * \param id On successful return, this location will hold the
+ * lnet_process_id_t ID of the interface.
+ *
+ * \retval 0 If an interface exists at \a index.
+ * \retval -ENOENT If no interface has been found.
+ */
+int
+LNetGetId(unsigned int index, lnet_process_id_t *id)
+{
+ struct lnet_ni *ni;
+ struct list_head *tmp;
+ int cpt;
+ int rc = -ENOENT;
+
+ LASSERT(the_lnet.ln_init);
+
+ /* LNetNI initilization failed? */
+ if (the_lnet.ln_refcount == 0)
+ return rc;
+
+ cpt = lnet_net_lock_current();
+
+ list_for_each(tmp, &the_lnet.ln_nis) {
+ if (index-- != 0)
+ continue;
+
+ ni = list_entry(tmp, lnet_ni_t, ni_list);
+
+ id->nid = ni->ni_nid;
+ id->pid = the_lnet.ln_pid;
+ rc = 0;
+ break;
+ }
+
+ lnet_net_unlock(cpt);
+ return rc;
+}
+EXPORT_SYMBOL(LNetGetId);
+
+/**
+ * Print a string representation of handle \a h into buffer \a str of
+ * \a len bytes.
+ */
+void
+LNetSnprintHandle(char *str, int len, lnet_handle_any_t h)
+{
+ snprintf(str, len, "%#llx", h.cookie);
+}
+EXPORT_SYMBOL(LNetSnprintHandle);
+
+static int
+lnet_create_ping_info(void)
+{
+ int i;
+ int n;
+ int rc;
+ unsigned int infosz;
+ lnet_ni_t *ni;
+ lnet_process_id_t id;
+ lnet_ping_info_t *pinfo;
+
+ for (n = 0; ; n++) {
+ rc = LNetGetId(n, &id);
+ if (rc == -ENOENT)
+ break;
+
+ LASSERT(rc == 0);
+ }
+
+ infosz = offsetof(lnet_ping_info_t, pi_ni[n]);
+ LIBCFS_ALLOC(pinfo, infosz);
+ if (pinfo == NULL) {
+ CERROR("Can't allocate ping info[%d]\n", n);
+ return -ENOMEM;
+ }
+
+ pinfo->pi_nnis = n;
+ pinfo->pi_pid = the_lnet.ln_pid;
+ pinfo->pi_magic = LNET_PROTO_PING_MAGIC;
+ pinfo->pi_features = LNET_PING_FEAT_NI_STATUS;
+
+ for (i = 0; i < n; i++) {
+ lnet_ni_status_t *ns = &pinfo->pi_ni[i];
+
+ rc = LNetGetId(i, &id);
+ LASSERT(rc == 0);
+
+ ns->ns_nid = id.nid;
+ ns->ns_status = LNET_NI_STATUS_UP;
+
+ lnet_net_lock(0);
+
+ ni = lnet_nid2ni_locked(id.nid, 0);
+ LASSERT(ni != NULL);
+
+ lnet_ni_lock(ni);
+ LASSERT(ni->ni_status == NULL);
+ ni->ni_status = ns;
+ lnet_ni_unlock(ni);
+
+ lnet_ni_decref_locked(ni, 0);
+ lnet_net_unlock(0);
+ }
+
+ the_lnet.ln_ping_info = pinfo;
+ return 0;
+}
+
+static void
+lnet_destroy_ping_info(void)
+{
+ struct lnet_ni *ni;
+
+ lnet_net_lock(0);
+
+ list_for_each_entry(ni, &the_lnet.ln_nis, ni_list) {
+ lnet_ni_lock(ni);
+ ni->ni_status = NULL;
+ lnet_ni_unlock(ni);
+ }
+
+ lnet_net_unlock(0);
+
+ LIBCFS_FREE(the_lnet.ln_ping_info,
+ offsetof(lnet_ping_info_t,
+ pi_ni[the_lnet.ln_ping_info->pi_nnis]));
+ the_lnet.ln_ping_info = NULL;
+}
+
+int
+lnet_ping_target_init(void)
+{
+ lnet_md_t md = { NULL };
+ lnet_handle_me_t meh;
+ lnet_process_id_t id;
+ int rc;
+ int rc2;
+ int infosz;
+
+ rc = lnet_create_ping_info();
+ if (rc != 0)
+ return rc;
+
+ /* We can have a tiny EQ since we only need to see the unlink event on
+ * teardown, which by definition is the last one! */
+ rc = LNetEQAlloc(2, LNET_EQ_HANDLER_NONE, &the_lnet.ln_ping_target_eq);
+ if (rc != 0) {
+ CERROR("Can't allocate ping EQ: %d\n", rc);
+ goto failed_0;
+ }
+
+ memset(&id, 0, sizeof(lnet_process_id_t));
+ id.nid = LNET_NID_ANY;
+ id.pid = LNET_PID_ANY;
+
+ rc = LNetMEAttach(LNET_RESERVED_PORTAL, id,
+ LNET_PROTO_PING_MATCHBITS, 0,
+ LNET_UNLINK, LNET_INS_AFTER,
+ &meh);
+ if (rc != 0) {
+ CERROR("Can't create ping ME: %d\n", rc);
+ goto failed_1;
+ }
+
+ /* initialize md content */
+ infosz = offsetof(lnet_ping_info_t,
+ pi_ni[the_lnet.ln_ping_info->pi_nnis]);
+ md.start = the_lnet.ln_ping_info;
+ md.length = infosz;
+ md.threshold = LNET_MD_THRESH_INF;
+ md.max_size = 0;
+ md.options = LNET_MD_OP_GET | LNET_MD_TRUNCATE |
+ LNET_MD_MANAGE_REMOTE;
+ md.user_ptr = NULL;
+ md.eq_handle = the_lnet.ln_ping_target_eq;
+
+ rc = LNetMDAttach(meh, md,
+ LNET_RETAIN,
+ &the_lnet.ln_ping_target_md);
+ if (rc != 0) {
+ CERROR("Can't attach ping MD: %d\n", rc);
+ goto failed_2;
+ }
+
+ return 0;
+
+ failed_2:
+ rc2 = LNetMEUnlink(meh);
+ LASSERT(rc2 == 0);
+ failed_1:
+ rc2 = LNetEQFree(the_lnet.ln_ping_target_eq);
+ LASSERT(rc2 == 0);
+ failed_0:
+ lnet_destroy_ping_info();
+ return rc;
+}
+
+void
+lnet_ping_target_fini(void)
+{
+ lnet_event_t event;
+ int rc;
+ int which;
+ int timeout_ms = 1000;
+ sigset_t blocked = cfs_block_allsigs();
+
+ LNetMDUnlink(the_lnet.ln_ping_target_md);
+ /* NB md could be busy; this just starts the unlink */
+
+ for (;;) {
+ rc = LNetEQPoll(&the_lnet.ln_ping_target_eq, 1,
+ timeout_ms, &event, &which);
+
+ /* I expect overflow... */
+ LASSERT(rc >= 0 || rc == -EOVERFLOW);
+
+ if (rc == 0) {
+ /* timed out: provide a diagnostic */
+ CWARN("Still waiting for ping MD to unlink\n");
+ timeout_ms *= 2;
+ continue;
+ }
+
+ /* Got a valid event */
+ if (event.unlinked)
+ break;
+ }
+
+ rc = LNetEQFree(the_lnet.ln_ping_target_eq);
+ LASSERT(rc == 0);
+ lnet_destroy_ping_info();
+ cfs_restore_sigs(blocked);
+}
+
+int
+lnet_ping(lnet_process_id_t id, int timeout_ms, lnet_process_id_t *ids, int n_ids)
+{
+ lnet_handle_eq_t eqh;
+ lnet_handle_md_t mdh;
+ lnet_event_t event;
+ lnet_md_t md = { NULL };
+ int which;
+ int unlinked = 0;
+ int replied = 0;
+ const int a_long_time = 60000; /* mS */
+ int infosz = offsetof(lnet_ping_info_t, pi_ni[n_ids]);
+ lnet_ping_info_t *info;
+ lnet_process_id_t tmpid;
+ int i;
+ int nob;
+ int rc;
+ int rc2;
+ sigset_t blocked;
+
+ if (n_ids <= 0 ||
+ id.nid == LNET_NID_ANY ||
+ timeout_ms > 500000 || /* arbitrary limit! */
+ n_ids > 20) /* arbitrary limit! */
+ return -EINVAL;
+
+ if (id.pid == LNET_PID_ANY)
+ id.pid = LUSTRE_SRV_LNET_PID;
+
+ LIBCFS_ALLOC(info, infosz);
+ if (info == NULL)
+ return -ENOMEM;
+
+ /* NB 2 events max (including any unlink event) */
+ rc = LNetEQAlloc(2, LNET_EQ_HANDLER_NONE, &eqh);
+ if (rc != 0) {
+ CERROR("Can't allocate EQ: %d\n", rc);
+ goto out_0;
+ }
+
+ /* initialize md content */
+ md.start = info;
+ md.length = infosz;
+ md.threshold = 2; /*GET/REPLY*/
+ md.max_size = 0;
+ md.options = LNET_MD_TRUNCATE;
+ md.user_ptr = NULL;
+ md.eq_handle = eqh;
+
+ rc = LNetMDBind(md, LNET_UNLINK, &mdh);
+ if (rc != 0) {
+ CERROR("Can't bind MD: %d\n", rc);
+ goto out_1;
+ }
+
+ rc = LNetGet(LNET_NID_ANY, mdh, id,
+ LNET_RESERVED_PORTAL,
+ LNET_PROTO_PING_MATCHBITS, 0);
+
+ if (rc != 0) {
+ /* Don't CERROR; this could be deliberate! */
+
+ rc2 = LNetMDUnlink(mdh);
+ LASSERT(rc2 == 0);
+
+ /* NB must wait for the UNLINK event below... */
+ unlinked = 1;
+ timeout_ms = a_long_time;
+ }
+
+ do {
+ /* MUST block for unlink to complete */
+ if (unlinked)
+ blocked = cfs_block_allsigs();
+
+ rc2 = LNetEQPoll(&eqh, 1, timeout_ms, &event, &which);
+
+ if (unlinked)
+ cfs_restore_sigs(blocked);
+
+ CDEBUG(D_NET, "poll %d(%d %d)%s\n", rc2,
+ (rc2 <= 0) ? -1 : event.type,
+ (rc2 <= 0) ? -1 : event.status,
+ (rc2 > 0 && event.unlinked) ? " unlinked" : "");
+
+ LASSERT(rc2 != -EOVERFLOW); /* can't miss anything */
+
+ if (rc2 <= 0 || event.status != 0) {
+ /* timeout or error */
+ if (!replied && rc == 0)
+ rc = (rc2 < 0) ? rc2 :
+ (rc2 == 0) ? -ETIMEDOUT :
+ event.status;
+
+ if (!unlinked) {
+ /* Ensure completion in finite time... */
+ LNetMDUnlink(mdh);
+ /* No assertion (racing with network) */
+ unlinked = 1;
+ timeout_ms = a_long_time;
+ } else if (rc2 == 0) {
+ /* timed out waiting for unlink */
+ CWARN("ping %s: late network completion\n",
+ libcfs_id2str(id));
+ }
+ } else if (event.type == LNET_EVENT_REPLY) {
+ replied = 1;
+ rc = event.mlength;
+ }
+
+ } while (rc2 <= 0 || !event.unlinked);
+
+ if (!replied) {
+ if (rc >= 0)
+ CWARN("%s: Unexpected rc >= 0 but no reply!\n",
+ libcfs_id2str(id));
+ rc = -EIO;
+ goto out_1;
+ }
+
+ nob = rc;
+ LASSERT(nob >= 0 && nob <= infosz);
+
+ rc = -EPROTO; /* if I can't parse... */
+
+ if (nob < 8) {
+ /* can't check magic/version */
+ CERROR("%s: ping info too short %d\n",
+ libcfs_id2str(id), nob);
+ goto out_1;
+ }
+
+ if (info->pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) {
+ lnet_swap_pinginfo(info);
+ } else if (info->pi_magic != LNET_PROTO_PING_MAGIC) {
+ CERROR("%s: Unexpected magic %08x\n",
+ libcfs_id2str(id), info->pi_magic);
+ goto out_1;
+ }
+
+ if ((info->pi_features & LNET_PING_FEAT_NI_STATUS) == 0) {
+ CERROR("%s: ping w/o NI status: 0x%x\n",
+ libcfs_id2str(id), info->pi_features);
+ goto out_1;
+ }
+
+ if (nob < offsetof(lnet_ping_info_t, pi_ni[0])) {
+ CERROR("%s: Short reply %d(%d min)\n", libcfs_id2str(id),
+ nob, (int)offsetof(lnet_ping_info_t, pi_ni[0]));
+ goto out_1;
+ }
+
+ if (info->pi_nnis < n_ids)
+ n_ids = info->pi_nnis;
+
+ if (nob < offsetof(lnet_ping_info_t, pi_ni[n_ids])) {
+ CERROR("%s: Short reply %d(%d expected)\n", libcfs_id2str(id),
+ nob, (int)offsetof(lnet_ping_info_t, pi_ni[n_ids]));
+ goto out_1;
+ }
+
+ rc = -EFAULT; /* If I SEGV... */
+
+ memset(&tmpid, 0, sizeof(tmpid));
+ for (i = 0; i < n_ids; i++) {
+ tmpid.pid = info->pi_pid;
+ tmpid.nid = info->pi_ni[i].ns_nid;
+ if (copy_to_user(&ids[i], &tmpid, sizeof(tmpid)))
+ goto out_1;
+ }
+ rc = info->pi_nnis;
+
+ out_1:
+ rc2 = LNetEQFree(eqh);
+ if (rc2 != 0)
+ CERROR("rc2 %d\n", rc2);
+ LASSERT(rc2 == 0);
+
+ out_0:
+ LIBCFS_FREE(info, infosz);
+ return rc;
+}
diff --git a/kernel/drivers/staging/lustre/lnet/lnet/config.c b/kernel/drivers/staging/lustre/lnet/lnet/config.c
new file mode 100644
index 000000000..2dc4c4a1a
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lnet/lnet/config.c
@@ -0,0 +1,1292 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include "../../include/linux/lnet/lib-lnet.h"
+
+struct lnet_text_buf_t { /* tmp struct for parsing routes */
+ struct list_head ltb_list; /* stash on lists */
+ int ltb_size; /* allocated size */
+ char ltb_text[0]; /* text buffer */
+};
+
+static int lnet_tbnob; /* track text buf allocation */
+#define LNET_MAX_TEXTBUF_NOB (64<<10) /* bound allocation */
+#define LNET_SINGLE_TEXTBUF_NOB (4<<10)
+
+static void
+lnet_syntax(char *name, char *str, int offset, int width)
+{
+ static char dots[LNET_SINGLE_TEXTBUF_NOB];
+ static char dashes[LNET_SINGLE_TEXTBUF_NOB];
+
+ memset(dots, '.', sizeof(dots));
+ dots[sizeof(dots)-1] = 0;
+ memset(dashes, '-', sizeof(dashes));
+ dashes[sizeof(dashes)-1] = 0;
+
+ LCONSOLE_ERROR_MSG(0x10f, "Error parsing '%s=\"%s\"'\n", name, str);
+ LCONSOLE_ERROR_MSG(0x110, "here...........%.*s..%.*s|%.*s|\n",
+ (int)strlen(name), dots, offset, dots,
+ (width < 1) ? 0 : width - 1, dashes);
+}
+
+static int
+lnet_issep(char c)
+{
+ switch (c) {
+ case '\n':
+ case '\r':
+ case ';':
+ return 1;
+ default:
+ return 0;
+ }
+}
+
+static int
+lnet_net_unique(__u32 net, struct list_head *nilist)
+{
+ struct list_head *tmp;
+ lnet_ni_t *ni;
+
+ list_for_each(tmp, nilist) {
+ ni = list_entry(tmp, lnet_ni_t, ni_list);
+
+ if (LNET_NIDNET(ni->ni_nid) == net)
+ return 0;
+ }
+
+ return 1;
+}
+
+void
+lnet_ni_free(struct lnet_ni *ni)
+{
+ if (ni->ni_refs != NULL)
+ cfs_percpt_free(ni->ni_refs);
+
+ if (ni->ni_tx_queues != NULL)
+ cfs_percpt_free(ni->ni_tx_queues);
+
+ if (ni->ni_cpts != NULL)
+ cfs_expr_list_values_free(ni->ni_cpts, ni->ni_ncpts);
+
+ LIBCFS_FREE(ni, sizeof(*ni));
+}
+
+static lnet_ni_t *
+lnet_ni_alloc(__u32 net, struct cfs_expr_list *el, struct list_head *nilist)
+{
+ struct lnet_tx_queue *tq;
+ struct lnet_ni *ni;
+ int rc;
+ int i;
+
+ if (!lnet_net_unique(net, nilist)) {
+ LCONSOLE_ERROR_MSG(0x111, "Duplicate network specified: %s\n",
+ libcfs_net2str(net));
+ return NULL;
+ }
+
+ LIBCFS_ALLOC(ni, sizeof(*ni));
+ if (ni == NULL) {
+ CERROR("Out of memory creating network %s\n",
+ libcfs_net2str(net));
+ return NULL;
+ }
+
+ spin_lock_init(&ni->ni_lock);
+ INIT_LIST_HEAD(&ni->ni_cptlist);
+ ni->ni_refs = cfs_percpt_alloc(lnet_cpt_table(),
+ sizeof(*ni->ni_refs[0]));
+ if (ni->ni_refs == NULL)
+ goto failed;
+
+ ni->ni_tx_queues = cfs_percpt_alloc(lnet_cpt_table(),
+ sizeof(*ni->ni_tx_queues[0]));
+ if (ni->ni_tx_queues == NULL)
+ goto failed;
+
+ cfs_percpt_for_each(tq, i, ni->ni_tx_queues)
+ INIT_LIST_HEAD(&tq->tq_delayed);
+
+ if (el == NULL) {
+ ni->ni_cpts = NULL;
+ ni->ni_ncpts = LNET_CPT_NUMBER;
+ } else {
+ rc = cfs_expr_list_values(el, LNET_CPT_NUMBER, &ni->ni_cpts);
+ if (rc <= 0) {
+ CERROR("Failed to set CPTs for NI %s: %d\n",
+ libcfs_net2str(net), rc);
+ goto failed;
+ }
+
+ LASSERT(rc <= LNET_CPT_NUMBER);
+ if (rc == LNET_CPT_NUMBER) {
+ LIBCFS_FREE(ni->ni_cpts, rc * sizeof(ni->ni_cpts[0]));
+ ni->ni_cpts = NULL;
+ }
+
+ ni->ni_ncpts = rc;
+ }
+
+ /* LND will fill in the address part of the NID */
+ ni->ni_nid = LNET_MKNID(net, 0);
+ ni->ni_last_alive = get_seconds();
+ list_add_tail(&ni->ni_list, nilist);
+ return ni;
+ failed:
+ lnet_ni_free(ni);
+ return NULL;
+}
+
+int
+lnet_parse_networks(struct list_head *nilist, char *networks)
+{
+ struct cfs_expr_list *el = NULL;
+ int tokensize = strlen(networks) + 1;
+ char *tokens;
+ char *str;
+ char *tmp;
+ struct lnet_ni *ni;
+ __u32 net;
+ int nnets = 0;
+
+ if (strlen(networks) > LNET_SINGLE_TEXTBUF_NOB) {
+ /* _WAY_ conservative */
+ LCONSOLE_ERROR_MSG(0x112,
+ "Can't parse networks: string too long\n");
+ return -EINVAL;
+ }
+
+ LIBCFS_ALLOC(tokens, tokensize);
+ if (tokens == NULL) {
+ CERROR("Can't allocate net tokens\n");
+ return -ENOMEM;
+ }
+
+ the_lnet.ln_network_tokens = tokens;
+ the_lnet.ln_network_tokens_nob = tokensize;
+ memcpy(tokens, networks, tokensize);
+ str = tmp = tokens;
+
+ /* Add in the loopback network */
+ ni = lnet_ni_alloc(LNET_MKNET(LOLND, 0), NULL, nilist);
+ if (ni == NULL)
+ goto failed;
+
+ while (str != NULL && *str != 0) {
+ char *comma = strchr(str, ',');
+ char *bracket = strchr(str, '(');
+ char *square = strchr(str, '[');
+ char *iface;
+ int niface;
+ int rc;
+
+ /* NB we don't check interface conflicts here; it's the LNDs
+ * responsibility (if it cares at all) */
+
+ if (square != NULL && (comma == NULL || square < comma)) {
+ /* i.e: o2ib0(ib0)[1,2], number between square
+ * brackets are CPTs this NI needs to be bond */
+ if (bracket != NULL && bracket > square) {
+ tmp = square;
+ goto failed_syntax;
+ }
+
+ tmp = strchr(square, ']');
+ if (tmp == NULL) {
+ tmp = square;
+ goto failed_syntax;
+ }
+
+ rc = cfs_expr_list_parse(square, tmp - square + 1,
+ 0, LNET_CPT_NUMBER - 1, &el);
+ if (rc != 0) {
+ tmp = square;
+ goto failed_syntax;
+ }
+
+ while (square <= tmp)
+ *square++ = ' ';
+ }
+
+ if (bracket == NULL ||
+ (comma != NULL && comma < bracket)) {
+
+ /* no interface list specified */
+
+ if (comma != NULL)
+ *comma++ = 0;
+ net = libcfs_str2net(cfs_trimwhite(str));
+
+ if (net == LNET_NIDNET(LNET_NID_ANY)) {
+ LCONSOLE_ERROR_MSG(0x113,
+ "Unrecognised network type\n");
+ tmp = str;
+ goto failed_syntax;
+ }
+
+ if (LNET_NETTYP(net) != LOLND && /* LO is implicit */
+ lnet_ni_alloc(net, el, nilist) == NULL)
+ goto failed;
+
+ if (el != NULL) {
+ cfs_expr_list_free(el);
+ el = NULL;
+ }
+
+ str = comma;
+ continue;
+ }
+
+ *bracket = 0;
+ net = libcfs_str2net(cfs_trimwhite(str));
+ if (net == LNET_NIDNET(LNET_NID_ANY)) {
+ tmp = str;
+ goto failed_syntax;
+ }
+
+ nnets++;
+ ni = lnet_ni_alloc(net, el, nilist);
+ if (ni == NULL)
+ goto failed;
+
+ if (el != NULL) {
+ cfs_expr_list_free(el);
+ el = NULL;
+ }
+
+ niface = 0;
+ iface = bracket + 1;
+
+ bracket = strchr(iface, ')');
+ if (bracket == NULL) {
+ tmp = iface;
+ goto failed_syntax;
+ }
+
+ *bracket = 0;
+ do {
+ comma = strchr(iface, ',');
+ if (comma != NULL)
+ *comma++ = 0;
+
+ iface = cfs_trimwhite(iface);
+ if (*iface == 0) {
+ tmp = iface;
+ goto failed_syntax;
+ }
+
+ if (niface == LNET_MAX_INTERFACES) {
+ LCONSOLE_ERROR_MSG(0x115,
+ "Too many interfaces for net %s\n",
+ libcfs_net2str(net));
+ goto failed;
+ }
+
+ ni->ni_interfaces[niface++] = iface;
+ iface = comma;
+ } while (iface != NULL);
+
+ str = bracket + 1;
+ comma = strchr(bracket + 1, ',');
+ if (comma != NULL) {
+ *comma = 0;
+ str = cfs_trimwhite(str);
+ if (*str != 0) {
+ tmp = str;
+ goto failed_syntax;
+ }
+ str = comma + 1;
+ continue;
+ }
+
+ str = cfs_trimwhite(str);
+ if (*str != 0) {
+ tmp = str;
+ goto failed_syntax;
+ }
+ }
+
+ LASSERT(!list_empty(nilist));
+ return 0;
+
+ failed_syntax:
+ lnet_syntax("networks", networks, (int)(tmp - tokens), strlen(tmp));
+ failed:
+ while (!list_empty(nilist)) {
+ ni = list_entry(nilist->next, lnet_ni_t, ni_list);
+
+ list_del(&ni->ni_list);
+ lnet_ni_free(ni);
+ }
+
+ if (el != NULL)
+ cfs_expr_list_free(el);
+
+ LIBCFS_FREE(tokens, tokensize);
+ the_lnet.ln_network_tokens = NULL;
+
+ return -EINVAL;
+}
+
+static struct lnet_text_buf_t *
+lnet_new_text_buf(int str_len)
+{
+ struct lnet_text_buf_t *ltb;
+ int nob;
+
+ /* NB allocate space for the terminating 0 */
+ nob = offsetof(struct lnet_text_buf_t, ltb_text[str_len + 1]);
+ if (nob > LNET_SINGLE_TEXTBUF_NOB) {
+ /* _way_ conservative for "route net gateway..." */
+ CERROR("text buffer too big\n");
+ return NULL;
+ }
+
+ if (lnet_tbnob + nob > LNET_MAX_TEXTBUF_NOB) {
+ CERROR("Too many text buffers\n");
+ return NULL;
+ }
+
+ LIBCFS_ALLOC(ltb, nob);
+ if (ltb == NULL)
+ return NULL;
+
+ ltb->ltb_size = nob;
+ ltb->ltb_text[0] = 0;
+ lnet_tbnob += nob;
+ return ltb;
+}
+
+static void
+lnet_free_text_buf(struct lnet_text_buf_t *ltb)
+{
+ lnet_tbnob -= ltb->ltb_size;
+ LIBCFS_FREE(ltb, ltb->ltb_size);
+}
+
+static void
+lnet_free_text_bufs(struct list_head *tbs)
+{
+ struct lnet_text_buf_t *ltb;
+
+ while (!list_empty(tbs)) {
+ ltb = list_entry(tbs->next, struct lnet_text_buf_t, ltb_list);
+
+ list_del(&ltb->ltb_list);
+ lnet_free_text_buf(ltb);
+ }
+}
+
+static int
+lnet_str2tbs_sep(struct list_head *tbs, char *str)
+{
+ struct list_head pending;
+ char *sep;
+ int nob;
+ int i;
+ struct lnet_text_buf_t *ltb;
+
+ INIT_LIST_HEAD(&pending);
+
+ /* Split 'str' into separate commands */
+ for (;;) {
+ /* skip leading whitespace */
+ while (isspace(*str))
+ str++;
+
+ /* scan for separator or comment */
+ for (sep = str; *sep != 0; sep++)
+ if (lnet_issep(*sep) || *sep == '#')
+ break;
+
+ nob = (int)(sep - str);
+ if (nob > 0) {
+ ltb = lnet_new_text_buf(nob);
+ if (ltb == NULL) {
+ lnet_free_text_bufs(&pending);
+ return -1;
+ }
+
+ for (i = 0; i < nob; i++)
+ if (isspace(str[i]))
+ ltb->ltb_text[i] = ' ';
+ else
+ ltb->ltb_text[i] = str[i];
+
+ ltb->ltb_text[nob] = 0;
+
+ list_add_tail(&ltb->ltb_list, &pending);
+ }
+
+ if (*sep == '#') {
+ /* scan for separator */
+ do {
+ sep++;
+ } while (*sep != 0 && !lnet_issep(*sep));
+ }
+
+ if (*sep == 0)
+ break;
+
+ str = sep + 1;
+ }
+
+ list_splice(&pending, tbs->prev);
+ return 0;
+}
+
+static int
+lnet_expand1tb(struct list_head *list,
+ char *str, char *sep1, char *sep2,
+ char *item, int itemlen)
+{
+ int len1 = (int)(sep1 - str);
+ int len2 = strlen(sep2 + 1);
+ struct lnet_text_buf_t *ltb;
+
+ LASSERT(*sep1 == '[');
+ LASSERT(*sep2 == ']');
+
+ ltb = lnet_new_text_buf(len1 + itemlen + len2);
+ if (ltb == NULL)
+ return -ENOMEM;
+
+ memcpy(ltb->ltb_text, str, len1);
+ memcpy(&ltb->ltb_text[len1], item, itemlen);
+ memcpy(&ltb->ltb_text[len1+itemlen], sep2 + 1, len2);
+ ltb->ltb_text[len1 + itemlen + len2] = 0;
+
+ list_add_tail(&ltb->ltb_list, list);
+ return 0;
+}
+
+static int
+lnet_str2tbs_expand(struct list_head *tbs, char *str)
+{
+ char num[16];
+ struct list_head pending;
+ char *sep;
+ char *sep2;
+ char *parsed;
+ char *enditem;
+ int lo;
+ int hi;
+ int stride;
+ int i;
+ int nob;
+ int scanned;
+
+ INIT_LIST_HEAD(&pending);
+
+ sep = strchr(str, '[');
+ if (sep == NULL) /* nothing to expand */
+ return 0;
+
+ sep2 = strchr(sep, ']');
+ if (sep2 == NULL)
+ goto failed;
+
+ for (parsed = sep; parsed < sep2; parsed = enditem) {
+
+ enditem = ++parsed;
+ while (enditem < sep2 && *enditem != ',')
+ enditem++;
+
+ if (enditem == parsed) /* no empty items */
+ goto failed;
+
+ if (sscanf(parsed, "%d-%d/%d%n", &lo, &hi,
+ &stride, &scanned) < 3) {
+
+ if (sscanf(parsed, "%d-%d%n", &lo, &hi, &scanned) < 2) {
+
+ /* simple string enumeration */
+ if (lnet_expand1tb(
+ &pending, str, sep, sep2,
+ parsed,
+ (int)(enditem - parsed)) != 0) {
+ goto failed;
+ }
+
+ continue;
+ }
+
+ stride = 1;
+ }
+
+ /* range expansion */
+
+ if (enditem != parsed + scanned) /* no trailing junk */
+ goto failed;
+
+ if (hi < 0 || lo < 0 || stride < 0 || hi < lo ||
+ (hi - lo) % stride != 0)
+ goto failed;
+
+ for (i = lo; i <= hi; i += stride) {
+
+ snprintf(num, sizeof(num), "%d", i);
+ nob = strlen(num);
+ if (nob + 1 == sizeof(num))
+ goto failed;
+
+ if (lnet_expand1tb(&pending, str, sep, sep2,
+ num, nob) != 0)
+ goto failed;
+ }
+ }
+
+ list_splice(&pending, tbs->prev);
+ return 1;
+
+ failed:
+ lnet_free_text_bufs(&pending);
+ return -1;
+}
+
+static int
+lnet_parse_hops(char *str, unsigned int *hops)
+{
+ int len = strlen(str);
+ int nob = len;
+
+ return (sscanf(str, "%u%n", hops, &nob) >= 1 &&
+ nob == len &&
+ *hops > 0 && *hops < 256);
+}
+
+#define LNET_PRIORITY_SEPARATOR (':')
+
+static int
+lnet_parse_priority(char *str, unsigned int *priority, char **token)
+{
+ int nob;
+ char *sep;
+ int len;
+
+ sep = strchr(str, LNET_PRIORITY_SEPARATOR);
+ if (sep == NULL) {
+ *priority = 0;
+ return 0;
+ }
+ len = strlen(sep + 1);
+
+ if ((sscanf((sep+1), "%u%n", priority, &nob) < 1) || (len != nob)) {
+ /* Update the caller's token pointer so it treats the found
+ priority as the token to report in the error message. */
+ *token += sep - str + 1;
+ return -1;
+ }
+
+ CDEBUG(D_NET, "gateway %s, priority %d, nob %d\n", str, *priority, nob);
+
+ /*
+ * Change priority separator to \0 to be able to parse NID
+ */
+ *sep = '\0';
+ return 0;
+}
+
+static int
+lnet_parse_route(char *str, int *im_a_router)
+{
+ /* static scratch buffer OK (single threaded) */
+ static char cmd[LNET_SINGLE_TEXTBUF_NOB];
+
+ struct list_head nets;
+ struct list_head gateways;
+ struct list_head *tmp1;
+ struct list_head *tmp2;
+ __u32 net;
+ lnet_nid_t nid;
+ struct lnet_text_buf_t *ltb;
+ int rc;
+ char *sep;
+ char *token = str;
+ int ntokens = 0;
+ int myrc = -1;
+ unsigned int hops;
+ int got_hops = 0;
+ unsigned int priority = 0;
+
+ INIT_LIST_HEAD(&gateways);
+ INIT_LIST_HEAD(&nets);
+
+ /* save a copy of the string for error messages */
+ strncpy(cmd, str, sizeof(cmd) - 1);
+ cmd[sizeof(cmd) - 1] = 0;
+
+ sep = str;
+ for (;;) {
+ /* scan for token start */
+ while (isspace(*sep))
+ sep++;
+ if (*sep == 0) {
+ if (ntokens < (got_hops ? 3 : 2))
+ goto token_error;
+ break;
+ }
+
+ ntokens++;
+ token = sep++;
+
+ /* scan for token end */
+ while (*sep != 0 && !isspace(*sep))
+ sep++;
+ if (*sep != 0)
+ *sep++ = 0;
+
+ if (ntokens == 1) {
+ tmp2 = &nets; /* expanding nets */
+ } else if (ntokens == 2 &&
+ lnet_parse_hops(token, &hops)) {
+ got_hops = 1; /* got a hop count */
+ continue;
+ } else {
+ tmp2 = &gateways; /* expanding gateways */
+ }
+
+ ltb = lnet_new_text_buf(strlen(token));
+ if (ltb == NULL)
+ goto out;
+
+ strcpy(ltb->ltb_text, token);
+ tmp1 = &ltb->ltb_list;
+ list_add_tail(tmp1, tmp2);
+
+ while (tmp1 != tmp2) {
+ ltb = list_entry(tmp1, struct lnet_text_buf_t,
+ ltb_list);
+
+ rc = lnet_str2tbs_expand(tmp1->next, ltb->ltb_text);
+ if (rc < 0)
+ goto token_error;
+
+ tmp1 = tmp1->next;
+
+ if (rc > 0) { /* expanded! */
+ list_del(&ltb->ltb_list);
+ lnet_free_text_buf(ltb);
+ continue;
+ }
+
+ if (ntokens == 1) {
+ net = libcfs_str2net(ltb->ltb_text);
+ if (net == LNET_NIDNET(LNET_NID_ANY) ||
+ LNET_NETTYP(net) == LOLND)
+ goto token_error;
+ } else {
+ rc = lnet_parse_priority(ltb->ltb_text,
+ &priority, &token);
+ if (rc < 0)
+ goto token_error;
+
+ nid = libcfs_str2nid(ltb->ltb_text);
+ if (nid == LNET_NID_ANY ||
+ LNET_NETTYP(LNET_NIDNET(nid)) == LOLND)
+ goto token_error;
+ }
+ }
+ }
+
+ if (!got_hops)
+ hops = 1;
+
+ LASSERT(!list_empty(&nets));
+ LASSERT(!list_empty(&gateways));
+
+ list_for_each(tmp1, &nets) {
+ ltb = list_entry(tmp1, struct lnet_text_buf_t, ltb_list);
+ net = libcfs_str2net(ltb->ltb_text);
+ LASSERT(net != LNET_NIDNET(LNET_NID_ANY));
+
+ list_for_each(tmp2, &gateways) {
+ ltb = list_entry(tmp2, struct lnet_text_buf_t,
+ ltb_list);
+ nid = libcfs_str2nid(ltb->ltb_text);
+ LASSERT(nid != LNET_NID_ANY);
+
+ if (lnet_islocalnid(nid)) {
+ *im_a_router = 1;
+ continue;
+ }
+
+ rc = lnet_add_route(net, hops, nid, priority);
+ if (rc != 0) {
+ CERROR("Can't create route to %s via %s\n",
+ libcfs_net2str(net),
+ libcfs_nid2str(nid));
+ goto out;
+ }
+ }
+ }
+
+ myrc = 0;
+ goto out;
+
+ token_error:
+ lnet_syntax("routes", cmd, (int)(token - str), strlen(token));
+ out:
+ lnet_free_text_bufs(&nets);
+ lnet_free_text_bufs(&gateways);
+ return myrc;
+}
+
+static int
+lnet_parse_route_tbs(struct list_head *tbs, int *im_a_router)
+{
+ struct lnet_text_buf_t *ltb;
+
+ while (!list_empty(tbs)) {
+ ltb = list_entry(tbs->next, struct lnet_text_buf_t, ltb_list);
+
+ if (lnet_parse_route(ltb->ltb_text, im_a_router) < 0) {
+ lnet_free_text_bufs(tbs);
+ return -EINVAL;
+ }
+
+ list_del(&ltb->ltb_list);
+ lnet_free_text_buf(ltb);
+ }
+
+ return 0;
+}
+
+int
+lnet_parse_routes(char *routes, int *im_a_router)
+{
+ struct list_head tbs;
+ int rc = 0;
+
+ *im_a_router = 0;
+
+ INIT_LIST_HEAD(&tbs);
+
+ if (lnet_str2tbs_sep(&tbs, routes) < 0) {
+ CERROR("Error parsing routes\n");
+ rc = -EINVAL;
+ } else {
+ rc = lnet_parse_route_tbs(&tbs, im_a_router);
+ }
+
+ LASSERT(lnet_tbnob == 0);
+ return rc;
+}
+
+static int
+lnet_match_network_token(char *token, int len, __u32 *ipaddrs, int nip)
+{
+ LIST_HEAD(list);
+ int rc;
+ int i;
+
+ rc = cfs_ip_addr_parse(token, len, &list);
+ if (rc != 0)
+ return rc;
+
+ for (rc = i = 0; !rc && i < nip; i++)
+ rc = cfs_ip_addr_match(ipaddrs[i], &list);
+
+ cfs_ip_addr_free(&list);
+
+ return rc;
+}
+
+static int
+lnet_match_network_tokens(char *net_entry, __u32 *ipaddrs, int nip)
+{
+ static char tokens[LNET_SINGLE_TEXTBUF_NOB];
+
+ int matched = 0;
+ int ntokens = 0;
+ int len;
+ char *net = NULL;
+ char *sep;
+ char *token;
+ int rc;
+
+ LASSERT(strlen(net_entry) < sizeof(tokens));
+
+ /* work on a copy of the string */
+ strcpy(tokens, net_entry);
+ sep = tokens;
+ for (;;) {
+ /* scan for token start */
+ while (isspace(*sep))
+ sep++;
+ if (*sep == 0)
+ break;
+
+ token = sep++;
+
+ /* scan for token end */
+ while (*sep != 0 && !isspace(*sep))
+ sep++;
+ if (*sep != 0)
+ *sep++ = 0;
+
+ if (ntokens++ == 0) {
+ net = token;
+ continue;
+ }
+
+ len = strlen(token);
+
+ rc = lnet_match_network_token(token, len, ipaddrs, nip);
+ if (rc < 0) {
+ lnet_syntax("ip2nets", net_entry,
+ (int)(token - tokens), len);
+ return rc;
+ }
+
+ matched |= (rc != 0);
+ }
+
+ if (!matched)
+ return 0;
+
+ strcpy(net_entry, net); /* replace with matched net */
+ return 1;
+}
+
+static __u32
+lnet_netspec2net(char *netspec)
+{
+ char *bracket = strchr(netspec, '(');
+ __u32 net;
+
+ if (bracket != NULL)
+ *bracket = 0;
+
+ net = libcfs_str2net(netspec);
+
+ if (bracket != NULL)
+ *bracket = '(';
+
+ return net;
+}
+
+static int
+lnet_splitnets(char *source, struct list_head *nets)
+{
+ int offset = 0;
+ int offset2;
+ int len;
+ struct lnet_text_buf_t *tb;
+ struct lnet_text_buf_t *tb2;
+ struct list_head *t;
+ char *sep;
+ char *bracket;
+ __u32 net;
+
+ LASSERT(!list_empty(nets));
+ LASSERT(nets->next == nets->prev); /* single entry */
+
+ tb = list_entry(nets->next, struct lnet_text_buf_t, ltb_list);
+
+ for (;;) {
+ sep = strchr(tb->ltb_text, ',');
+ bracket = strchr(tb->ltb_text, '(');
+
+ if (sep != NULL &&
+ bracket != NULL &&
+ bracket < sep) {
+ /* netspec lists interfaces... */
+
+ offset2 = offset + (int)(bracket - tb->ltb_text);
+ len = strlen(bracket);
+
+ bracket = strchr(bracket + 1, ')');
+
+ if (bracket == NULL ||
+ !(bracket[1] == ',' || bracket[1] == 0)) {
+ lnet_syntax("ip2nets", source, offset2, len);
+ return -EINVAL;
+ }
+
+ sep = (bracket[1] == 0) ? NULL : bracket + 1;
+ }
+
+ if (sep != NULL)
+ *sep++ = 0;
+
+ net = lnet_netspec2net(tb->ltb_text);
+ if (net == LNET_NIDNET(LNET_NID_ANY)) {
+ lnet_syntax("ip2nets", source, offset,
+ strlen(tb->ltb_text));
+ return -EINVAL;
+ }
+
+ list_for_each(t, nets) {
+ tb2 = list_entry(t, struct lnet_text_buf_t, ltb_list);
+
+ if (tb2 == tb)
+ continue;
+
+ if (net == lnet_netspec2net(tb2->ltb_text)) {
+ /* duplicate network */
+ lnet_syntax("ip2nets", source, offset,
+ strlen(tb->ltb_text));
+ return -EINVAL;
+ }
+ }
+
+ if (sep == NULL)
+ return 0;
+
+ offset += (int)(sep - tb->ltb_text);
+ tb2 = lnet_new_text_buf(strlen(sep));
+ if (tb2 == NULL)
+ return -ENOMEM;
+
+ strcpy(tb2->ltb_text, sep);
+ list_add_tail(&tb2->ltb_list, nets);
+
+ tb = tb2;
+ }
+}
+
+static int
+lnet_match_networks(char **networksp, char *ip2nets, __u32 *ipaddrs, int nip)
+{
+ static char networks[LNET_SINGLE_TEXTBUF_NOB];
+ static char source[LNET_SINGLE_TEXTBUF_NOB];
+
+ struct list_head raw_entries;
+ struct list_head matched_nets;
+ struct list_head current_nets;
+ struct list_head *t;
+ struct list_head *t2;
+ struct lnet_text_buf_t *tb;
+ struct lnet_text_buf_t *tb2;
+ __u32 net1;
+ __u32 net2;
+ int len;
+ int count;
+ int dup;
+ int rc;
+
+ INIT_LIST_HEAD(&raw_entries);
+ if (lnet_str2tbs_sep(&raw_entries, ip2nets) < 0) {
+ CERROR("Error parsing ip2nets\n");
+ LASSERT(lnet_tbnob == 0);
+ return -EINVAL;
+ }
+
+ INIT_LIST_HEAD(&matched_nets);
+ INIT_LIST_HEAD(&current_nets);
+ networks[0] = 0;
+ count = 0;
+ len = 0;
+ rc = 0;
+
+ while (!list_empty(&raw_entries)) {
+ tb = list_entry(raw_entries.next, struct lnet_text_buf_t,
+ ltb_list);
+
+ strncpy(source, tb->ltb_text, sizeof(source)-1);
+ source[sizeof(source)-1] = 0;
+
+ /* replace ltb_text with the network(s) add on match */
+ rc = lnet_match_network_tokens(tb->ltb_text, ipaddrs, nip);
+ if (rc < 0)
+ break;
+
+ list_del(&tb->ltb_list);
+
+ if (rc == 0) { /* no match */
+ lnet_free_text_buf(tb);
+ continue;
+ }
+
+ /* split into separate networks */
+ INIT_LIST_HEAD(&current_nets);
+ list_add(&tb->ltb_list, &current_nets);
+ rc = lnet_splitnets(source, &current_nets);
+ if (rc < 0)
+ break;
+
+ dup = 0;
+ list_for_each(t, &current_nets) {
+ tb = list_entry(t, struct lnet_text_buf_t, ltb_list);
+ net1 = lnet_netspec2net(tb->ltb_text);
+ LASSERT(net1 != LNET_NIDNET(LNET_NID_ANY));
+
+ list_for_each(t2, &matched_nets) {
+ tb2 = list_entry(t2, struct lnet_text_buf_t,
+ ltb_list);
+ net2 = lnet_netspec2net(tb2->ltb_text);
+ LASSERT(net2 != LNET_NIDNET(LNET_NID_ANY));
+
+ if (net1 == net2) {
+ dup = 1;
+ break;
+ }
+ }
+
+ if (dup)
+ break;
+ }
+
+ if (dup) {
+ lnet_free_text_bufs(&current_nets);
+ continue;
+ }
+
+ list_for_each_safe(t, t2, &current_nets) {
+ tb = list_entry(t, struct lnet_text_buf_t, ltb_list);
+
+ list_del(&tb->ltb_list);
+ list_add_tail(&tb->ltb_list, &matched_nets);
+
+ len += snprintf(networks + len, sizeof(networks) - len,
+ "%s%s", (len == 0) ? "" : ",",
+ tb->ltb_text);
+
+ if (len >= sizeof(networks)) {
+ CERROR("Too many matched networks\n");
+ rc = -E2BIG;
+ goto out;
+ }
+ }
+
+ count++;
+ }
+
+ out:
+ lnet_free_text_bufs(&raw_entries);
+ lnet_free_text_bufs(&matched_nets);
+ lnet_free_text_bufs(&current_nets);
+ LASSERT(lnet_tbnob == 0);
+
+ if (rc < 0)
+ return rc;
+
+ *networksp = networks;
+ return count;
+}
+
+static void
+lnet_ipaddr_free_enumeration(__u32 *ipaddrs, int nip)
+{
+ LIBCFS_FREE(ipaddrs, nip * sizeof(*ipaddrs));
+}
+
+static int
+lnet_ipaddr_enumerate(__u32 **ipaddrsp)
+{
+ int up;
+ __u32 netmask;
+ __u32 *ipaddrs;
+ __u32 *ipaddrs2;
+ int nip;
+ char **ifnames;
+ int nif = libcfs_ipif_enumerate(&ifnames);
+ int i;
+ int rc;
+
+ if (nif <= 0)
+ return nif;
+
+ LIBCFS_ALLOC(ipaddrs, nif * sizeof(*ipaddrs));
+ if (ipaddrs == NULL) {
+ CERROR("Can't allocate ipaddrs[%d]\n", nif);
+ libcfs_ipif_free_enumeration(ifnames, nif);
+ return -ENOMEM;
+ }
+
+ for (i = nip = 0; i < nif; i++) {
+ if (!strcmp(ifnames[i], "lo"))
+ continue;
+
+ rc = libcfs_ipif_query(ifnames[i], &up,
+ &ipaddrs[nip], &netmask);
+ if (rc != 0) {
+ CWARN("Can't query interface %s: %d\n",
+ ifnames[i], rc);
+ continue;
+ }
+
+ if (!up) {
+ CWARN("Ignoring interface %s: it's down\n",
+ ifnames[i]);
+ continue;
+ }
+
+ nip++;
+ }
+
+ libcfs_ipif_free_enumeration(ifnames, nif);
+
+ if (nip == nif) {
+ *ipaddrsp = ipaddrs;
+ } else {
+ if (nip > 0) {
+ LIBCFS_ALLOC(ipaddrs2, nip * sizeof(*ipaddrs2));
+ if (ipaddrs2 == NULL) {
+ CERROR("Can't allocate ipaddrs[%d]\n", nip);
+ nip = -ENOMEM;
+ } else {
+ memcpy(ipaddrs2, ipaddrs,
+ nip * sizeof(*ipaddrs));
+ *ipaddrsp = ipaddrs2;
+ rc = nip;
+ }
+ }
+ lnet_ipaddr_free_enumeration(ipaddrs, nif);
+ }
+ return nip;
+}
+
+int
+lnet_parse_ip2nets(char **networksp, char *ip2nets)
+{
+ __u32 *ipaddrs = NULL;
+ int nip = lnet_ipaddr_enumerate(&ipaddrs);
+ int rc;
+
+ if (nip < 0) {
+ LCONSOLE_ERROR_MSG(0x117,
+ "Error %d enumerating local IP interfaces for ip2nets to match\n",
+ nip);
+ return nip;
+ }
+
+ if (nip == 0) {
+ LCONSOLE_ERROR_MSG(0x118,
+ "No local IP interfaces for ip2nets to match\n");
+ return -ENOENT;
+ }
+
+ rc = lnet_match_networks(networksp, ip2nets, ipaddrs, nip);
+ lnet_ipaddr_free_enumeration(ipaddrs, nip);
+
+ if (rc < 0) {
+ LCONSOLE_ERROR_MSG(0x119, "Error %d parsing ip2nets\n", rc);
+ return rc;
+ }
+
+ if (rc == 0) {
+ LCONSOLE_ERROR_MSG(0x11a,
+ "ip2nets does not match any local IP interfaces\n");
+ return -ENOENT;
+ }
+
+ return 0;
+}
+
+int
+lnet_set_ip_niaddr(lnet_ni_t *ni)
+{
+ __u32 net = LNET_NIDNET(ni->ni_nid);
+ char **names;
+ int n;
+ __u32 ip;
+ __u32 netmask;
+ int up;
+ int i;
+ int rc;
+
+ /* Convenience for LNDs that use the IP address of a local interface as
+ * the local address part of their NID */
+
+ if (ni->ni_interfaces[0] != NULL) {
+
+ CLASSERT(LNET_MAX_INTERFACES > 1);
+
+ if (ni->ni_interfaces[1] != NULL) {
+ CERROR("Net %s doesn't support multiple interfaces\n",
+ libcfs_net2str(net));
+ return -EPERM;
+ }
+
+ rc = libcfs_ipif_query(ni->ni_interfaces[0],
+ &up, &ip, &netmask);
+ if (rc != 0) {
+ CERROR("Net %s can't query interface %s: %d\n",
+ libcfs_net2str(net), ni->ni_interfaces[0], rc);
+ return -EPERM;
+ }
+
+ if (!up) {
+ CERROR("Net %s can't use interface %s: it's down\n",
+ libcfs_net2str(net), ni->ni_interfaces[0]);
+ return -ENETDOWN;
+ }
+
+ ni->ni_nid = LNET_MKNID(net, ip);
+ return 0;
+ }
+
+ n = libcfs_ipif_enumerate(&names);
+ if (n <= 0) {
+ CERROR("Net %s can't enumerate interfaces: %d\n",
+ libcfs_net2str(net), n);
+ return 0;
+ }
+
+ for (i = 0; i < n; i++) {
+ if (!strcmp(names[i], "lo")) /* skip the loopback IF */
+ continue;
+
+ rc = libcfs_ipif_query(names[i], &up, &ip, &netmask);
+
+ if (rc != 0) {
+ CWARN("Net %s can't query interface %s: %d\n",
+ libcfs_net2str(net), names[i], rc);
+ continue;
+ }
+
+ if (!up) {
+ CWARN("Net %s ignoring interface %s (down)\n",
+ libcfs_net2str(net), names[i]);
+ continue;
+ }
+
+ libcfs_ipif_free_enumeration(names, n);
+ ni->ni_nid = LNET_MKNID(net, ip);
+ return 0;
+ }
+
+ CERROR("Net %s can't find any interfaces\n", libcfs_net2str(net));
+ libcfs_ipif_free_enumeration(names, n);
+ return -ENOENT;
+}
+EXPORT_SYMBOL(lnet_set_ip_niaddr);
diff --git a/kernel/drivers/staging/lustre/lnet/lnet/lib-eq.c b/kernel/drivers/staging/lustre/lnet/lnet/lib-eq.c
new file mode 100644
index 000000000..5470148f5
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lnet/lnet/lib-eq.c
@@ -0,0 +1,441 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/lib-eq.c
+ *
+ * Library level Event queue management routines
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include "../../include/linux/lnet/lib-lnet.h"
+
+/**
+ * Create an event queue that has room for \a count number of events.
+ *
+ * The event queue is circular and older events will be overwritten by new
+ * ones if they are not removed in time by the user using the functions
+ * LNetEQGet(), LNetEQWait(), or LNetEQPoll(). It is up to the user to
+ * determine the appropriate size of the event queue to prevent this loss
+ * of events. Note that when EQ handler is specified in \a callback, no
+ * event loss can happen, since the handler is run for each event deposited
+ * into the EQ.
+ *
+ * \param count The number of events to be stored in the event queue. It
+ * will be rounded up to the next power of two.
+ * \param callback A handler function that runs when an event is deposited
+ * into the EQ. The constant value LNET_EQ_HANDLER_NONE can be used to
+ * indicate that no event handler is desired.
+ * \param handle On successful return, this location will hold a handle for
+ * the newly created EQ.
+ *
+ * \retval 0 On success.
+ * \retval -EINVAL If an parameter is not valid.
+ * \retval -ENOMEM If memory for the EQ can't be allocated.
+ *
+ * \see lnet_eq_handler_t for the discussion on EQ handler semantics.
+ */
+int
+LNetEQAlloc(unsigned int count, lnet_eq_handler_t callback,
+ lnet_handle_eq_t *handle)
+{
+ lnet_eq_t *eq;
+
+ LASSERT(the_lnet.ln_init);
+ LASSERT(the_lnet.ln_refcount > 0);
+
+ /* We need count to be a power of 2 so that when eq_{enq,deq}_seq
+ * overflow, they don't skip entries, so the queue has the same
+ * apparent capacity at all times */
+
+ count = cfs_power2_roundup(count);
+
+ if (callback != LNET_EQ_HANDLER_NONE && count != 0)
+ CWARN("EQ callback is guaranteed to get every event, do you still want to set eqcount %d for polling event which will have locking overhead? Please contact with developer to confirm\n", count);
+
+ /* count can be 0 if only need callback, we can eliminate
+ * overhead of enqueue event */
+ if (count == 0 && callback == LNET_EQ_HANDLER_NONE)
+ return -EINVAL;
+
+ eq = lnet_eq_alloc();
+ if (eq == NULL)
+ return -ENOMEM;
+
+ if (count != 0) {
+ LIBCFS_ALLOC(eq->eq_events, count * sizeof(lnet_event_t));
+ if (eq->eq_events == NULL)
+ goto failed;
+ /* NB allocator has set all event sequence numbers to 0,
+ * so all them should be earlier than eq_deq_seq */
+ }
+
+ eq->eq_deq_seq = 1;
+ eq->eq_enq_seq = 1;
+ eq->eq_size = count;
+ eq->eq_callback = callback;
+
+ eq->eq_refs = cfs_percpt_alloc(lnet_cpt_table(),
+ sizeof(*eq->eq_refs[0]));
+ if (eq->eq_refs == NULL)
+ goto failed;
+
+ /* MUST hold both exclusive lnet_res_lock */
+ lnet_res_lock(LNET_LOCK_EX);
+ /* NB: hold lnet_eq_wait_lock for EQ link/unlink, so we can do
+ * both EQ lookup and poll event with only lnet_eq_wait_lock */
+ lnet_eq_wait_lock();
+
+ lnet_res_lh_initialize(&the_lnet.ln_eq_container, &eq->eq_lh);
+ list_add(&eq->eq_list, &the_lnet.ln_eq_container.rec_active);
+
+ lnet_eq_wait_unlock();
+ lnet_res_unlock(LNET_LOCK_EX);
+
+ lnet_eq2handle(handle, eq);
+ return 0;
+
+failed:
+ if (eq->eq_events != NULL)
+ LIBCFS_FREE(eq->eq_events, count * sizeof(lnet_event_t));
+
+ if (eq->eq_refs != NULL)
+ cfs_percpt_free(eq->eq_refs);
+
+ lnet_eq_free(eq);
+ return -ENOMEM;
+}
+EXPORT_SYMBOL(LNetEQAlloc);
+
+/**
+ * Release the resources associated with an event queue if it's idle;
+ * otherwise do nothing and it's up to the user to try again.
+ *
+ * \param eqh A handle for the event queue to be released.
+ *
+ * \retval 0 If the EQ is not in use and freed.
+ * \retval -ENOENT If \a eqh does not point to a valid EQ.
+ * \retval -EBUSY If the EQ is still in use by some MDs.
+ */
+int
+LNetEQFree(lnet_handle_eq_t eqh)
+{
+ struct lnet_eq *eq;
+ lnet_event_t *events = NULL;
+ int **refs = NULL;
+ int *ref;
+ int rc = 0;
+ int size = 0;
+ int i;
+
+ LASSERT(the_lnet.ln_init);
+ LASSERT(the_lnet.ln_refcount > 0);
+
+ lnet_res_lock(LNET_LOCK_EX);
+ /* NB: hold lnet_eq_wait_lock for EQ link/unlink, so we can do
+ * both EQ lookup and poll event with only lnet_eq_wait_lock */
+ lnet_eq_wait_lock();
+
+ eq = lnet_handle2eq(&eqh);
+ if (eq == NULL) {
+ rc = -ENOENT;
+ goto out;
+ }
+
+ cfs_percpt_for_each(ref, i, eq->eq_refs) {
+ LASSERT(*ref >= 0);
+ if (*ref == 0)
+ continue;
+
+ CDEBUG(D_NET, "Event equeue (%d: %d) busy on destroy.\n",
+ i, *ref);
+ rc = -EBUSY;
+ goto out;
+ }
+
+ /* stash for free after lock dropped */
+ events = eq->eq_events;
+ size = eq->eq_size;
+ refs = eq->eq_refs;
+
+ lnet_res_lh_invalidate(&eq->eq_lh);
+ list_del(&eq->eq_list);
+ lnet_eq_free_locked(eq);
+ out:
+ lnet_eq_wait_unlock();
+ lnet_res_unlock(LNET_LOCK_EX);
+
+ if (events != NULL)
+ LIBCFS_FREE(events, size * sizeof(lnet_event_t));
+ if (refs != NULL)
+ cfs_percpt_free(refs);
+
+ return rc;
+}
+EXPORT_SYMBOL(LNetEQFree);
+
+void
+lnet_eq_enqueue_event(lnet_eq_t *eq, lnet_event_t *ev)
+{
+ /* MUST called with resource lock hold but w/o lnet_eq_wait_lock */
+ int index;
+
+ if (eq->eq_size == 0) {
+ LASSERT(eq->eq_callback != LNET_EQ_HANDLER_NONE);
+ eq->eq_callback(ev);
+ return;
+ }
+
+ lnet_eq_wait_lock();
+ ev->sequence = eq->eq_enq_seq++;
+
+ LASSERT(eq->eq_size == LOWEST_BIT_SET(eq->eq_size));
+ index = ev->sequence & (eq->eq_size - 1);
+
+ eq->eq_events[index] = *ev;
+
+ if (eq->eq_callback != LNET_EQ_HANDLER_NONE)
+ eq->eq_callback(ev);
+
+ /* Wake anyone waiting in LNetEQPoll() */
+ if (waitqueue_active(&the_lnet.ln_eq_waitq))
+ wake_up_all(&the_lnet.ln_eq_waitq);
+ lnet_eq_wait_unlock();
+}
+
+static int
+lnet_eq_dequeue_event(lnet_eq_t *eq, lnet_event_t *ev)
+{
+ int new_index = eq->eq_deq_seq & (eq->eq_size - 1);
+ lnet_event_t *new_event = &eq->eq_events[new_index];
+ int rc;
+
+ /* must called with lnet_eq_wait_lock hold */
+ if (LNET_SEQ_GT(eq->eq_deq_seq, new_event->sequence))
+ return 0;
+
+ /* We've got a new event... */
+ *ev = *new_event;
+
+ CDEBUG(D_INFO, "event: %p, sequence: %lu, eq->size: %u\n",
+ new_event, eq->eq_deq_seq, eq->eq_size);
+
+ /* ...but did it overwrite an event we've not seen yet? */
+ if (eq->eq_deq_seq == new_event->sequence) {
+ rc = 1;
+ } else {
+ /* don't complain with CERROR: some EQs are sized small
+ * anyway; if it's important, the caller should complain */
+ CDEBUG(D_NET, "Event Queue Overflow: eq seq %lu ev seq %lu\n",
+ eq->eq_deq_seq, new_event->sequence);
+ rc = -EOVERFLOW;
+ }
+
+ eq->eq_deq_seq = new_event->sequence + 1;
+ return rc;
+}
+
+/**
+ * A nonblocking function that can be used to get the next event in an EQ.
+ * If an event handler is associated with the EQ, the handler will run before
+ * this function returns successfully. The event is removed from the queue.
+ *
+ * \param eventq A handle for the event queue.
+ * \param event On successful return (1 or -EOVERFLOW), this location will
+ * hold the next event in the EQ.
+ *
+ * \retval 0 No pending event in the EQ.
+ * \retval 1 Indicates success.
+ * \retval -ENOENT If \a eventq does not point to a valid EQ.
+ * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that
+ * at least one event between this event and the last event obtained from the
+ * EQ has been dropped due to limited space in the EQ.
+ */
+int
+LNetEQGet(lnet_handle_eq_t eventq, lnet_event_t *event)
+{
+ int which;
+
+ return LNetEQPoll(&eventq, 1, 0,
+ event, &which);
+}
+EXPORT_SYMBOL(LNetEQGet);
+
+/**
+ * Block the calling process until there is an event in the EQ.
+ * If an event handler is associated with the EQ, the handler will run before
+ * this function returns successfully. This function returns the next event
+ * in the EQ and removes it from the EQ.
+ *
+ * \param eventq A handle for the event queue.
+ * \param event On successful return (1 or -EOVERFLOW), this location will
+ * hold the next event in the EQ.
+ *
+ * \retval 1 Indicates success.
+ * \retval -ENOENT If \a eventq does not point to a valid EQ.
+ * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that
+ * at least one event between this event and the last event obtained from the
+ * EQ has been dropped due to limited space in the EQ.
+ */
+int
+LNetEQWait(lnet_handle_eq_t eventq, lnet_event_t *event)
+{
+ int which;
+
+ return LNetEQPoll(&eventq, 1, LNET_TIME_FOREVER,
+ event, &which);
+}
+EXPORT_SYMBOL(LNetEQWait);
+
+
+static int
+lnet_eq_wait_locked(int *timeout_ms)
+__must_hold(&the_lnet.ln_eq_wait_lock)
+{
+ int tms = *timeout_ms;
+ int wait;
+ wait_queue_t wl;
+ unsigned long now;
+
+ if (tms == 0)
+ return -1; /* don't want to wait and no new event */
+
+ init_waitqueue_entry(&wl, current);
+ set_current_state(TASK_INTERRUPTIBLE);
+ add_wait_queue(&the_lnet.ln_eq_waitq, &wl);
+
+ lnet_eq_wait_unlock();
+
+ if (tms < 0) {
+ schedule();
+
+ } else {
+ struct timeval tv;
+
+ now = cfs_time_current();
+ schedule_timeout(cfs_time_seconds(tms) / 1000);
+ cfs_duration_usec(cfs_time_sub(cfs_time_current(), now), &tv);
+ tms -= (int)(tv.tv_sec * 1000 + tv.tv_usec / 1000);
+ if (tms < 0) /* no more wait but may have new event */
+ tms = 0;
+ }
+
+ wait = tms != 0; /* might need to call here again */
+ *timeout_ms = tms;
+
+ lnet_eq_wait_lock();
+ remove_wait_queue(&the_lnet.ln_eq_waitq, &wl);
+
+ return wait;
+}
+
+
+
+/**
+ * Block the calling process until there's an event from a set of EQs or
+ * timeout happens.
+ *
+ * If an event handler is associated with the EQ, the handler will run before
+ * this function returns successfully, in which case the corresponding event
+ * is consumed.
+ *
+ * LNetEQPoll() provides a timeout to allow applications to poll, block for a
+ * fixed period, or block indefinitely.
+ *
+ * \param eventqs,neq An array of EQ handles, and size of the array.
+ * \param timeout_ms Time in milliseconds to wait for an event to occur on
+ * one of the EQs. The constant LNET_TIME_FOREVER can be used to indicate an
+ * infinite timeout.
+ * \param event,which On successful return (1 or -EOVERFLOW), \a event will
+ * hold the next event in the EQs, and \a which will contain the index of the
+ * EQ from which the event was taken.
+ *
+ * \retval 0 No pending event in the EQs after timeout.
+ * \retval 1 Indicates success.
+ * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that
+ * at least one event between this event and the last event obtained from the
+ * EQ indicated by \a which has been dropped due to limited space in the EQ.
+ * \retval -ENOENT If there's an invalid handle in \a eventqs.
+ */
+int
+LNetEQPoll(lnet_handle_eq_t *eventqs, int neq, int timeout_ms,
+ lnet_event_t *event, int *which)
+{
+ int wait = 1;
+ int rc;
+ int i;
+
+ LASSERT(the_lnet.ln_init);
+ LASSERT(the_lnet.ln_refcount > 0);
+
+ if (neq < 1)
+ return -ENOENT;
+
+ lnet_eq_wait_lock();
+
+ for (;;) {
+ for (i = 0; i < neq; i++) {
+ lnet_eq_t *eq = lnet_handle2eq(&eventqs[i]);
+
+ if (eq == NULL) {
+ lnet_eq_wait_unlock();
+ return -ENOENT;
+ }
+
+ rc = lnet_eq_dequeue_event(eq, event);
+ if (rc != 0) {
+ lnet_eq_wait_unlock();
+ *which = i;
+ return rc;
+ }
+ }
+
+ if (wait == 0)
+ break;
+
+ /*
+ * return value of lnet_eq_wait_locked:
+ * -1 : did nothing and it's sure no new event
+ * 1 : sleep inside and wait until new event
+ * 0 : don't want to wait anymore, but might have new event
+ * so need to call dequeue again
+ */
+ wait = lnet_eq_wait_locked(&timeout_ms);
+ if (wait < 0) /* no new event */
+ break;
+ }
+
+ lnet_eq_wait_unlock();
+ return 0;
+}
diff --git a/kernel/drivers/staging/lustre/lnet/lnet/lib-md.c b/kernel/drivers/staging/lustre/lnet/lnet/lib-md.c
new file mode 100644
index 000000000..89d660fef
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lnet/lnet/lib-md.c
@@ -0,0 +1,454 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/lib-md.c
+ *
+ * Memory Descriptor management routines
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include "../../include/linux/lnet/lib-lnet.h"
+
+/* must be called with lnet_res_lock held */
+void
+lnet_md_unlink(lnet_libmd_t *md)
+{
+ if ((md->md_flags & LNET_MD_FLAG_ZOMBIE) == 0) {
+ /* first unlink attempt... */
+ lnet_me_t *me = md->md_me;
+
+ md->md_flags |= LNET_MD_FLAG_ZOMBIE;
+
+ /* Disassociate from ME (if any),
+ * and unlink it if it was created
+ * with LNET_UNLINK */
+ if (me != NULL) {
+ /* detach MD from portal */
+ lnet_ptl_detach_md(me, md);
+ if (me->me_unlink == LNET_UNLINK)
+ lnet_me_unlink(me);
+ }
+
+ /* ensure all future handle lookups fail */
+ lnet_res_lh_invalidate(&md->md_lh);
+ }
+
+ if (md->md_refcount != 0) {
+ CDEBUG(D_NET, "Queueing unlink of md %p\n", md);
+ return;
+ }
+
+ CDEBUG(D_NET, "Unlinking md %p\n", md);
+
+ if (md->md_eq != NULL) {
+ int cpt = lnet_cpt_of_cookie(md->md_lh.lh_cookie);
+
+ LASSERT(*md->md_eq->eq_refs[cpt] > 0);
+ (*md->md_eq->eq_refs[cpt])--;
+ }
+
+ LASSERT(!list_empty(&md->md_list));
+ list_del_init(&md->md_list);
+ lnet_md_free_locked(md);
+}
+
+static int
+lnet_md_build(lnet_libmd_t *lmd, lnet_md_t *umd, int unlink)
+{
+ int i;
+ unsigned int niov;
+ int total_length = 0;
+
+ lmd->md_me = NULL;
+ lmd->md_start = umd->start;
+ lmd->md_offset = 0;
+ lmd->md_max_size = umd->max_size;
+ lmd->md_options = umd->options;
+ lmd->md_user_ptr = umd->user_ptr;
+ lmd->md_eq = NULL;
+ lmd->md_threshold = umd->threshold;
+ lmd->md_refcount = 0;
+ lmd->md_flags = (unlink == LNET_UNLINK) ? LNET_MD_FLAG_AUTO_UNLINK : 0;
+
+ if ((umd->options & LNET_MD_IOVEC) != 0) {
+
+ if ((umd->options & LNET_MD_KIOV) != 0) /* Can't specify both */
+ return -EINVAL;
+
+ lmd->md_niov = niov = umd->length;
+ memcpy(lmd->md_iov.iov, umd->start,
+ niov * sizeof(lmd->md_iov.iov[0]));
+
+ for (i = 0; i < (int)niov; i++) {
+ /* We take the base address on trust */
+ /* invalid length */
+ if (lmd->md_iov.iov[i].iov_len <= 0)
+ return -EINVAL;
+
+ total_length += lmd->md_iov.iov[i].iov_len;
+ }
+
+ lmd->md_length = total_length;
+
+ if ((umd->options & LNET_MD_MAX_SIZE) != 0 && /* use max size */
+ (umd->max_size < 0 ||
+ umd->max_size > total_length)) /* illegal max_size */
+ return -EINVAL;
+
+ } else if ((umd->options & LNET_MD_KIOV) != 0) {
+ lmd->md_niov = niov = umd->length;
+ memcpy(lmd->md_iov.kiov, umd->start,
+ niov * sizeof(lmd->md_iov.kiov[0]));
+
+ for (i = 0; i < (int)niov; i++) {
+ /* We take the page pointer on trust */
+ if (lmd->md_iov.kiov[i].kiov_offset +
+ lmd->md_iov.kiov[i].kiov_len > PAGE_CACHE_SIZE)
+ return -EINVAL; /* invalid length */
+
+ total_length += lmd->md_iov.kiov[i].kiov_len;
+ }
+
+ lmd->md_length = total_length;
+
+ if ((umd->options & LNET_MD_MAX_SIZE) != 0 && /* max size used */
+ (umd->max_size < 0 ||
+ umd->max_size > total_length)) /* illegal max_size */
+ return -EINVAL;
+ } else { /* contiguous */
+ lmd->md_length = umd->length;
+ lmd->md_niov = niov = 1;
+ lmd->md_iov.iov[0].iov_base = umd->start;
+ lmd->md_iov.iov[0].iov_len = umd->length;
+
+ if ((umd->options & LNET_MD_MAX_SIZE) != 0 && /* max size used */
+ (umd->max_size < 0 ||
+ umd->max_size > (int)umd->length)) /* illegal max_size */
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/* must be called with resource lock held */
+static int
+lnet_md_link(lnet_libmd_t *md, lnet_handle_eq_t eq_handle, int cpt)
+{
+ struct lnet_res_container *container = the_lnet.ln_md_containers[cpt];
+
+ /* NB we are passed an allocated, but inactive md.
+ * if we return success, caller may lnet_md_unlink() it.
+ * otherwise caller may only lnet_md_free() it.
+ */
+ /* This implementation doesn't know how to create START events or
+ * disable END events. Best to LASSERT our caller is compliant so
+ * we find out quickly... */
+ /* TODO - reevaluate what should be here in light of
+ * the removal of the start and end events
+ * maybe there we shouldn't even allow LNET_EQ_NONE!)
+ * LASSERT (eq == NULL);
+ */
+ if (!LNetHandleIsInvalid(eq_handle)) {
+ md->md_eq = lnet_handle2eq(&eq_handle);
+
+ if (md->md_eq == NULL)
+ return -ENOENT;
+
+ (*md->md_eq->eq_refs[cpt])++;
+ }
+
+ lnet_res_lh_initialize(container, &md->md_lh);
+
+ LASSERT(list_empty(&md->md_list));
+ list_add(&md->md_list, &container->rec_active);
+
+ return 0;
+}
+
+/* must be called with lnet_res_lock held */
+void
+lnet_md_deconstruct(lnet_libmd_t *lmd, lnet_md_t *umd)
+{
+ /* NB this doesn't copy out all the iov entries so when a
+ * discontiguous MD is copied out, the target gets to know the
+ * original iov pointer (in start) and the number of entries it had
+ * and that's all.
+ */
+ umd->start = lmd->md_start;
+ umd->length = ((lmd->md_options &
+ (LNET_MD_IOVEC | LNET_MD_KIOV)) == 0) ?
+ lmd->md_length : lmd->md_niov;
+ umd->threshold = lmd->md_threshold;
+ umd->max_size = lmd->md_max_size;
+ umd->options = lmd->md_options;
+ umd->user_ptr = lmd->md_user_ptr;
+ lnet_eq2handle(&umd->eq_handle, lmd->md_eq);
+}
+
+static int
+lnet_md_validate(lnet_md_t *umd)
+{
+ if (umd->start == NULL && umd->length != 0) {
+ CERROR("MD start pointer can not be NULL with length %u\n",
+ umd->length);
+ return -EINVAL;
+ }
+
+ if ((umd->options & (LNET_MD_KIOV | LNET_MD_IOVEC)) != 0 &&
+ umd->length > LNET_MAX_IOV) {
+ CERROR("Invalid option: too many fragments %u, %d max\n",
+ umd->length, LNET_MAX_IOV);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/**
+ * Create a memory descriptor and attach it to a ME
+ *
+ * \param meh A handle for a ME to associate the new MD with.
+ * \param umd Provides initial values for the user-visible parts of a MD.
+ * Other than its use for initialization, there is no linkage between this
+ * structure and the MD maintained by the LNet.
+ * \param unlink A flag to indicate whether the MD is automatically unlinked
+ * when it becomes inactive, either because the operation threshold drops to
+ * zero or because the available memory becomes less than \a umd.max_size.
+ * (Note that the check for unlinking a MD only occurs after the completion
+ * of a successful operation on the MD.) The value LNET_UNLINK enables auto
+ * unlinking; the value LNET_RETAIN disables it.
+ * \param handle On successful returns, a handle to the newly created MD is
+ * saved here. This handle can be used later in LNetMDUnlink().
+ *
+ * \retval 0 On success.
+ * \retval -EINVAL If \a umd is not valid.
+ * \retval -ENOMEM If new MD cannot be allocated.
+ * \retval -ENOENT Either \a meh or \a umd.eq_handle does not point to a
+ * valid object. Note that it's OK to supply a NULL \a umd.eq_handle by
+ * calling LNetInvalidateHandle() on it.
+ * \retval -EBUSY If the ME pointed to by \a meh is already associated with
+ * a MD.
+ */
+int
+LNetMDAttach(lnet_handle_me_t meh, lnet_md_t umd,
+ lnet_unlink_t unlink, lnet_handle_md_t *handle)
+{
+ LIST_HEAD(matches);
+ LIST_HEAD(drops);
+ struct lnet_me *me;
+ struct lnet_libmd *md;
+ int cpt;
+ int rc;
+
+ LASSERT(the_lnet.ln_init);
+ LASSERT(the_lnet.ln_refcount > 0);
+
+ if (lnet_md_validate(&umd) != 0)
+ return -EINVAL;
+
+ if ((umd.options & (LNET_MD_OP_GET | LNET_MD_OP_PUT)) == 0) {
+ CERROR("Invalid option: no MD_OP set\n");
+ return -EINVAL;
+ }
+
+ md = lnet_md_alloc(&umd);
+ if (md == NULL)
+ return -ENOMEM;
+
+ rc = lnet_md_build(md, &umd, unlink);
+ cpt = lnet_cpt_of_cookie(meh.cookie);
+
+ lnet_res_lock(cpt);
+ if (rc != 0)
+ goto failed;
+
+ me = lnet_handle2me(&meh);
+ if (me == NULL)
+ rc = -ENOENT;
+ else if (me->me_md != NULL)
+ rc = -EBUSY;
+ else
+ rc = lnet_md_link(md, umd.eq_handle, cpt);
+
+ if (rc != 0)
+ goto failed;
+
+ /* attach this MD to portal of ME and check if it matches any
+ * blocked msgs on this portal */
+ lnet_ptl_attach_md(me, md, &matches, &drops);
+
+ lnet_md2handle(handle, md);
+
+ lnet_res_unlock(cpt);
+
+ lnet_drop_delayed_msg_list(&drops, "Bad match");
+ lnet_recv_delayed_msg_list(&matches);
+
+ return 0;
+
+ failed:
+ lnet_md_free_locked(md);
+
+ lnet_res_unlock(cpt);
+ return rc;
+}
+EXPORT_SYMBOL(LNetMDAttach);
+
+/**
+ * Create a "free floating" memory descriptor - a MD that is not associated
+ * with a ME. Such MDs are usually used in LNetPut() and LNetGet() operations.
+ *
+ * \param umd,unlink See the discussion for LNetMDAttach().
+ * \param handle On successful returns, a handle to the newly created MD is
+ * saved here. This handle can be used later in LNetMDUnlink(), LNetPut(),
+ * and LNetGet() operations.
+ *
+ * \retval 0 On success.
+ * \retval -EINVAL If \a umd is not valid.
+ * \retval -ENOMEM If new MD cannot be allocated.
+ * \retval -ENOENT \a umd.eq_handle does not point to a valid EQ. Note that
+ * it's OK to supply a NULL \a umd.eq_handle by calling
+ * LNetInvalidateHandle() on it.
+ */
+int
+LNetMDBind(lnet_md_t umd, lnet_unlink_t unlink, lnet_handle_md_t *handle)
+{
+ lnet_libmd_t *md;
+ int cpt;
+ int rc;
+
+ LASSERT(the_lnet.ln_init);
+ LASSERT(the_lnet.ln_refcount > 0);
+
+ if (lnet_md_validate(&umd) != 0)
+ return -EINVAL;
+
+ if ((umd.options & (LNET_MD_OP_GET | LNET_MD_OP_PUT)) != 0) {
+ CERROR("Invalid option: GET|PUT illegal on active MDs\n");
+ return -EINVAL;
+ }
+
+ md = lnet_md_alloc(&umd);
+ if (md == NULL)
+ return -ENOMEM;
+
+ rc = lnet_md_build(md, &umd, unlink);
+
+ cpt = lnet_res_lock_current();
+ if (rc != 0)
+ goto failed;
+
+ rc = lnet_md_link(md, umd.eq_handle, cpt);
+ if (rc != 0)
+ goto failed;
+
+ lnet_md2handle(handle, md);
+
+ lnet_res_unlock(cpt);
+ return 0;
+
+ failed:
+ lnet_md_free_locked(md);
+
+ lnet_res_unlock(cpt);
+ return rc;
+}
+EXPORT_SYMBOL(LNetMDBind);
+
+/**
+ * Unlink the memory descriptor from any ME it may be linked to and release
+ * the internal resources associated with it. As a result, active messages
+ * associated with the MD may get aborted.
+ *
+ * This function does not free the memory region associated with the MD;
+ * i.e., the memory the user allocated for this MD. If the ME associated with
+ * this MD is not NULL and was created with auto unlink enabled, the ME is
+ * unlinked as well (see LNetMEAttach()).
+ *
+ * Explicitly unlinking a MD via this function call has the same behavior as
+ * a MD that has been automatically unlinked, except that no LNET_EVENT_UNLINK
+ * is generated in the latter case.
+ *
+ * An unlinked event can be reported in two ways:
+ * - If there's no pending operations on the MD, it's unlinked immediately
+ * and an LNET_EVENT_UNLINK event is logged before this function returns.
+ * - Otherwise, the MD is only marked for deletion when this function
+ * returns, and the unlinked event will be piggybacked on the event of
+ * the completion of the last operation by setting the unlinked field of
+ * the event. No dedicated LNET_EVENT_UNLINK event is generated.
+ *
+ * Note that in both cases the unlinked field of the event is always set; no
+ * more event will happen on the MD after such an event is logged.
+ *
+ * \param mdh A handle for the MD to be unlinked.
+ *
+ * \retval 0 On success.
+ * \retval -ENOENT If \a mdh does not point to a valid MD object.
+ */
+int
+LNetMDUnlink(lnet_handle_md_t mdh)
+{
+ lnet_event_t ev;
+ lnet_libmd_t *md;
+ int cpt;
+
+ LASSERT(the_lnet.ln_init);
+ LASSERT(the_lnet.ln_refcount > 0);
+
+ cpt = lnet_cpt_of_cookie(mdh.cookie);
+ lnet_res_lock(cpt);
+
+ md = lnet_handle2md(&mdh);
+ if (md == NULL) {
+ lnet_res_unlock(cpt);
+ return -ENOENT;
+ }
+
+ md->md_flags |= LNET_MD_FLAG_ABORTED;
+ /* If the MD is busy, lnet_md_unlink just marks it for deletion, and
+ * when the LND is done, the completion event flags that the MD was
+ * unlinked. Otherwise, we enqueue an event now... */
+ if (md->md_eq != NULL && md->md_refcount == 0) {
+ lnet_build_unlink_event(md, &ev);
+ lnet_eq_enqueue_event(md->md_eq, &ev);
+ }
+
+ lnet_md_unlink(md);
+
+ lnet_res_unlock(cpt);
+ return 0;
+}
+EXPORT_SYMBOL(LNetMDUnlink);
diff --git a/kernel/drivers/staging/lustre/lnet/lnet/lib-me.c b/kernel/drivers/staging/lustre/lnet/lnet/lib-me.c
new file mode 100644
index 000000000..a3f929244
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lnet/lnet/lib-me.c
@@ -0,0 +1,298 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/lib-me.c
+ *
+ * Match Entry management routines
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include "../../include/linux/lnet/lib-lnet.h"
+
+/**
+ * Create and attach a match entry to the match list of \a portal. The new
+ * ME is empty, i.e. not associated with a memory descriptor. LNetMDAttach()
+ * can be used to attach a MD to an empty ME.
+ *
+ * \param portal The portal table index where the ME should be attached.
+ * \param match_id Specifies the match criteria for the process ID of
+ * the requester. The constants LNET_PID_ANY and LNET_NID_ANY can be
+ * used to wildcard either of the identifiers in the lnet_process_id_t
+ * structure.
+ * \param match_bits,ignore_bits Specify the match criteria to apply
+ * to the match bits in the incoming request. The ignore bits are used
+ * to mask out insignificant bits in the incoming match bits. The resulting
+ * bits are then compared to the ME's match bits to determine if the
+ * incoming request meets the match criteria.
+ * \param unlink Indicates whether the ME should be unlinked when the memory
+ * descriptor associated with it is unlinked (Note that the check for
+ * unlinking a ME only occurs when the memory descriptor is unlinked.).
+ * Valid values are LNET_RETAIN and LNET_UNLINK.
+ * \param pos Indicates whether the new ME should be prepended or
+ * appended to the match list. Allowed constants: LNET_INS_BEFORE,
+ * LNET_INS_AFTER.
+ * \param handle On successful returns, a handle to the newly created ME
+ * object is saved here. This handle can be used later in LNetMEInsert(),
+ * LNetMEUnlink(), or LNetMDAttach() functions.
+ *
+ * \retval 0 On success.
+ * \retval -EINVAL If \a portal is invalid.
+ * \retval -ENOMEM If new ME object cannot be allocated.
+ */
+int
+LNetMEAttach(unsigned int portal,
+ lnet_process_id_t match_id,
+ __u64 match_bits, __u64 ignore_bits,
+ lnet_unlink_t unlink, lnet_ins_pos_t pos,
+ lnet_handle_me_t *handle)
+{
+ struct lnet_match_table *mtable;
+ struct lnet_me *me;
+ struct list_head *head;
+
+ LASSERT(the_lnet.ln_init);
+ LASSERT(the_lnet.ln_refcount > 0);
+
+ if ((int)portal >= the_lnet.ln_nportals)
+ return -EINVAL;
+
+ mtable = lnet_mt_of_attach(portal, match_id,
+ match_bits, ignore_bits, pos);
+ if (mtable == NULL) /* can't match portal type */
+ return -EPERM;
+
+ me = lnet_me_alloc();
+ if (me == NULL)
+ return -ENOMEM;
+
+ lnet_res_lock(mtable->mt_cpt);
+
+ me->me_portal = portal;
+ me->me_match_id = match_id;
+ me->me_match_bits = match_bits;
+ me->me_ignore_bits = ignore_bits;
+ me->me_unlink = unlink;
+ me->me_md = NULL;
+
+ lnet_res_lh_initialize(the_lnet.ln_me_containers[mtable->mt_cpt],
+ &me->me_lh);
+ if (ignore_bits != 0)
+ head = &mtable->mt_mhash[LNET_MT_HASH_IGNORE];
+ else
+ head = lnet_mt_match_head(mtable, match_id, match_bits);
+
+ me->me_pos = head - &mtable->mt_mhash[0];
+ if (pos == LNET_INS_AFTER || pos == LNET_INS_LOCAL)
+ list_add_tail(&me->me_list, head);
+ else
+ list_add(&me->me_list, head);
+
+ lnet_me2handle(handle, me);
+
+ lnet_res_unlock(mtable->mt_cpt);
+ return 0;
+}
+EXPORT_SYMBOL(LNetMEAttach);
+
+/**
+ * Create and a match entry and insert it before or after the ME pointed to by
+ * \a current_meh. The new ME is empty, i.e. not associated with a memory
+ * descriptor. LNetMDAttach() can be used to attach a MD to an empty ME.
+ *
+ * This function is identical to LNetMEAttach() except for the position
+ * where the new ME is inserted.
+ *
+ * \param current_meh A handle for a ME. The new ME will be inserted
+ * immediately before or immediately after this ME.
+ * \param match_id,match_bits,ignore_bits,unlink,pos,handle See the discussion
+ * for LNetMEAttach().
+ *
+ * \retval 0 On success.
+ * \retval -ENOMEM If new ME object cannot be allocated.
+ * \retval -ENOENT If \a current_meh does not point to a valid match entry.
+ */
+int
+LNetMEInsert(lnet_handle_me_t current_meh,
+ lnet_process_id_t match_id,
+ __u64 match_bits, __u64 ignore_bits,
+ lnet_unlink_t unlink, lnet_ins_pos_t pos,
+ lnet_handle_me_t *handle)
+{
+ struct lnet_me *current_me;
+ struct lnet_me *new_me;
+ struct lnet_portal *ptl;
+ int cpt;
+
+ LASSERT(the_lnet.ln_init);
+ LASSERT(the_lnet.ln_refcount > 0);
+
+ if (pos == LNET_INS_LOCAL)
+ return -EPERM;
+
+ new_me = lnet_me_alloc();
+ if (new_me == NULL)
+ return -ENOMEM;
+
+ cpt = lnet_cpt_of_cookie(current_meh.cookie);
+
+ lnet_res_lock(cpt);
+
+ current_me = lnet_handle2me(&current_meh);
+ if (current_me == NULL) {
+ lnet_me_free_locked(new_me);
+
+ lnet_res_unlock(cpt);
+ return -ENOENT;
+ }
+
+ LASSERT(current_me->me_portal < the_lnet.ln_nportals);
+
+ ptl = the_lnet.ln_portals[current_me->me_portal];
+ if (lnet_ptl_is_unique(ptl)) {
+ /* nosense to insertion on unique portal */
+ lnet_me_free_locked(new_me);
+ lnet_res_unlock(cpt);
+ return -EPERM;
+ }
+
+ new_me->me_pos = current_me->me_pos;
+ new_me->me_portal = current_me->me_portal;
+ new_me->me_match_id = match_id;
+ new_me->me_match_bits = match_bits;
+ new_me->me_ignore_bits = ignore_bits;
+ new_me->me_unlink = unlink;
+ new_me->me_md = NULL;
+
+ lnet_res_lh_initialize(the_lnet.ln_me_containers[cpt], &new_me->me_lh);
+
+ if (pos == LNET_INS_AFTER)
+ list_add(&new_me->me_list, &current_me->me_list);
+ else
+ list_add_tail(&new_me->me_list, &current_me->me_list);
+
+ lnet_me2handle(handle, new_me);
+
+ lnet_res_unlock(cpt);
+
+ return 0;
+}
+EXPORT_SYMBOL(LNetMEInsert);
+
+/**
+ * Unlink a match entry from its match list.
+ *
+ * This operation also releases any resources associated with the ME. If a
+ * memory descriptor is attached to the ME, then it will be unlinked as well
+ * and an unlink event will be generated. It is an error to use the ME handle
+ * after calling LNetMEUnlink().
+ *
+ * \param meh A handle for the ME to be unlinked.
+ *
+ * \retval 0 On success.
+ * \retval -ENOENT If \a meh does not point to a valid ME.
+ * \see LNetMDUnlink() for the discussion on delivering unlink event.
+ */
+int
+LNetMEUnlink(lnet_handle_me_t meh)
+{
+ lnet_me_t *me;
+ lnet_libmd_t *md;
+ lnet_event_t ev;
+ int cpt;
+
+ LASSERT(the_lnet.ln_init);
+ LASSERT(the_lnet.ln_refcount > 0);
+
+ cpt = lnet_cpt_of_cookie(meh.cookie);
+ lnet_res_lock(cpt);
+
+ me = lnet_handle2me(&meh);
+ if (me == NULL) {
+ lnet_res_unlock(cpt);
+ return -ENOENT;
+ }
+
+ md = me->me_md;
+ if (md != NULL) {
+ md->md_flags |= LNET_MD_FLAG_ABORTED;
+ if (md->md_eq != NULL && md->md_refcount == 0) {
+ lnet_build_unlink_event(md, &ev);
+ lnet_eq_enqueue_event(md->md_eq, &ev);
+ }
+ }
+
+ lnet_me_unlink(me);
+
+ lnet_res_unlock(cpt);
+ return 0;
+}
+EXPORT_SYMBOL(LNetMEUnlink);
+
+/* call with lnet_res_lock please */
+void
+lnet_me_unlink(lnet_me_t *me)
+{
+ list_del(&me->me_list);
+
+ if (me->me_md != NULL) {
+ lnet_libmd_t *md = me->me_md;
+
+ /* detach MD from portal of this ME */
+ lnet_ptl_detach_md(me, md);
+ lnet_md_unlink(md);
+ }
+
+ lnet_res_lh_invalidate(&me->me_lh);
+ lnet_me_free_locked(me);
+}
+
+#if 0
+static void
+lib_me_dump(lnet_me_t *me)
+{
+ CWARN("Match Entry %p (%#llx)\n", me,
+ me->me_lh.lh_cookie);
+
+ CWARN("\tMatch/Ignore\t= %016lx / %016lx\n",
+ me->me_match_bits, me->me_ignore_bits);
+
+ CWARN("\tMD\t= %p\n", me->md);
+ CWARN("\tprev\t= %p\n",
+ list_entry(me->me_list.prev, lnet_me_t, me_list));
+ CWARN("\tnext\t= %p\n",
+ list_entry(me->me_list.next, lnet_me_t, me_list));
+}
+#endif
diff --git a/kernel/drivers/staging/lustre/lnet/lnet/lib-move.c b/kernel/drivers/staging/lustre/lnet/lnet/lib-move.c
new file mode 100644
index 000000000..c2fb70e5f
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lnet/lnet/lib-move.c
@@ -0,0 +1,2460 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/lib-move.c
+ *
+ * Data movement routines
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include "../../include/linux/lnet/lib-lnet.h"
+
+static int local_nid_dist_zero = 1;
+module_param(local_nid_dist_zero, int, 0444);
+MODULE_PARM_DESC(local_nid_dist_zero, "Reserved");
+
+int
+lnet_fail_nid(lnet_nid_t nid, unsigned int threshold)
+{
+ lnet_test_peer_t *tp;
+ struct list_head *el;
+ struct list_head *next;
+ struct list_head cull;
+
+ LASSERT(the_lnet.ln_init);
+
+ /* NB: use lnet_net_lock(0) to serialize operations on test peers */
+ if (threshold != 0) {
+ /* Adding a new entry */
+ LIBCFS_ALLOC(tp, sizeof(*tp));
+ if (tp == NULL)
+ return -ENOMEM;
+
+ tp->tp_nid = nid;
+ tp->tp_threshold = threshold;
+
+ lnet_net_lock(0);
+ list_add_tail(&tp->tp_list, &the_lnet.ln_test_peers);
+ lnet_net_unlock(0);
+ return 0;
+ }
+
+ /* removing entries */
+ INIT_LIST_HEAD(&cull);
+
+ lnet_net_lock(0);
+
+ list_for_each_safe(el, next, &the_lnet.ln_test_peers) {
+ tp = list_entry(el, lnet_test_peer_t, tp_list);
+
+ if (tp->tp_threshold == 0 || /* needs culling anyway */
+ nid == LNET_NID_ANY || /* removing all entries */
+ tp->tp_nid == nid) { /* matched this one */
+ list_del(&tp->tp_list);
+ list_add(&tp->tp_list, &cull);
+ }
+ }
+
+ lnet_net_unlock(0);
+
+ while (!list_empty(&cull)) {
+ tp = list_entry(cull.next, lnet_test_peer_t, tp_list);
+
+ list_del(&tp->tp_list);
+ LIBCFS_FREE(tp, sizeof(*tp));
+ }
+ return 0;
+}
+
+static int
+fail_peer(lnet_nid_t nid, int outgoing)
+{
+ lnet_test_peer_t *tp;
+ struct list_head *el;
+ struct list_head *next;
+ struct list_head cull;
+ int fail = 0;
+
+ INIT_LIST_HEAD(&cull);
+
+ /* NB: use lnet_net_lock(0) to serialize operations on test peers */
+ lnet_net_lock(0);
+
+ list_for_each_safe(el, next, &the_lnet.ln_test_peers) {
+ tp = list_entry(el, lnet_test_peer_t, tp_list);
+
+ if (tp->tp_threshold == 0) {
+ /* zombie entry */
+ if (outgoing) {
+ /* only cull zombies on outgoing tests,
+ * since we may be at interrupt priority on
+ * incoming messages. */
+ list_del(&tp->tp_list);
+ list_add(&tp->tp_list, &cull);
+ }
+ continue;
+ }
+
+ if (tp->tp_nid == LNET_NID_ANY || /* fail every peer */
+ nid == tp->tp_nid) { /* fail this peer */
+ fail = 1;
+
+ if (tp->tp_threshold != LNET_MD_THRESH_INF) {
+ tp->tp_threshold--;
+ if (outgoing &&
+ tp->tp_threshold == 0) {
+ /* see above */
+ list_del(&tp->tp_list);
+ list_add(&tp->tp_list, &cull);
+ }
+ }
+ break;
+ }
+ }
+
+ lnet_net_unlock(0);
+
+ while (!list_empty(&cull)) {
+ tp = list_entry(cull.next, lnet_test_peer_t, tp_list);
+ list_del(&tp->tp_list);
+
+ LIBCFS_FREE(tp, sizeof(*tp));
+ }
+
+ return fail;
+}
+
+unsigned int
+lnet_iov_nob(unsigned int niov, struct kvec *iov)
+{
+ unsigned int nob = 0;
+
+ while (niov-- > 0)
+ nob += (iov++)->iov_len;
+
+ return nob;
+}
+EXPORT_SYMBOL(lnet_iov_nob);
+
+void
+lnet_copy_iov2iov(unsigned int ndiov, struct kvec *diov, unsigned int doffset,
+ unsigned int nsiov, struct kvec *siov, unsigned int soffset,
+ unsigned int nob)
+{
+ /* NB diov, siov are READ-ONLY */
+ unsigned int this_nob;
+
+ if (nob == 0)
+ return;
+
+ /* skip complete frags before 'doffset' */
+ LASSERT(ndiov > 0);
+ while (doffset >= diov->iov_len) {
+ doffset -= diov->iov_len;
+ diov++;
+ ndiov--;
+ LASSERT(ndiov > 0);
+ }
+
+ /* skip complete frags before 'soffset' */
+ LASSERT(nsiov > 0);
+ while (soffset >= siov->iov_len) {
+ soffset -= siov->iov_len;
+ siov++;
+ nsiov--;
+ LASSERT(nsiov > 0);
+ }
+
+ do {
+ LASSERT(ndiov > 0);
+ LASSERT(nsiov > 0);
+ this_nob = min(diov->iov_len - doffset,
+ siov->iov_len - soffset);
+ this_nob = min(this_nob, nob);
+
+ memcpy((char *)diov->iov_base + doffset,
+ (char *)siov->iov_base + soffset, this_nob);
+ nob -= this_nob;
+
+ if (diov->iov_len > doffset + this_nob) {
+ doffset += this_nob;
+ } else {
+ diov++;
+ ndiov--;
+ doffset = 0;
+ }
+
+ if (siov->iov_len > soffset + this_nob) {
+ soffset += this_nob;
+ } else {
+ siov++;
+ nsiov--;
+ soffset = 0;
+ }
+ } while (nob > 0);
+}
+EXPORT_SYMBOL(lnet_copy_iov2iov);
+
+int
+lnet_extract_iov(int dst_niov, struct kvec *dst,
+ int src_niov, struct kvec *src,
+ unsigned int offset, unsigned int len)
+{
+ /* Initialise 'dst' to the subset of 'src' starting at 'offset',
+ * for exactly 'len' bytes, and return the number of entries.
+ * NB not destructive to 'src' */
+ unsigned int frag_len;
+ unsigned int niov;
+
+ if (len == 0) /* no data => */
+ return 0; /* no frags */
+
+ LASSERT(src_niov > 0);
+ while (offset >= src->iov_len) { /* skip initial frags */
+ offset -= src->iov_len;
+ src_niov--;
+ src++;
+ LASSERT(src_niov > 0);
+ }
+
+ niov = 1;
+ for (;;) {
+ LASSERT(src_niov > 0);
+ LASSERT((int)niov <= dst_niov);
+
+ frag_len = src->iov_len - offset;
+ dst->iov_base = ((char *)src->iov_base) + offset;
+
+ if (len <= frag_len) {
+ dst->iov_len = len;
+ return niov;
+ }
+
+ dst->iov_len = frag_len;
+
+ len -= frag_len;
+ dst++;
+ src++;
+ niov++;
+ src_niov--;
+ offset = 0;
+ }
+}
+EXPORT_SYMBOL(lnet_extract_iov);
+
+
+unsigned int
+lnet_kiov_nob(unsigned int niov, lnet_kiov_t *kiov)
+{
+ unsigned int nob = 0;
+
+ while (niov-- > 0)
+ nob += (kiov++)->kiov_len;
+
+ return nob;
+}
+EXPORT_SYMBOL(lnet_kiov_nob);
+
+void
+lnet_copy_kiov2kiov(unsigned int ndiov, lnet_kiov_t *diov, unsigned int doffset,
+ unsigned int nsiov, lnet_kiov_t *siov, unsigned int soffset,
+ unsigned int nob)
+{
+ /* NB diov, siov are READ-ONLY */
+ unsigned int this_nob;
+ char *daddr = NULL;
+ char *saddr = NULL;
+
+ if (nob == 0)
+ return;
+
+ LASSERT(!in_interrupt());
+
+ LASSERT(ndiov > 0);
+ while (doffset >= diov->kiov_len) {
+ doffset -= diov->kiov_len;
+ diov++;
+ ndiov--;
+ LASSERT(ndiov > 0);
+ }
+
+ LASSERT(nsiov > 0);
+ while (soffset >= siov->kiov_len) {
+ soffset -= siov->kiov_len;
+ siov++;
+ nsiov--;
+ LASSERT(nsiov > 0);
+ }
+
+ do {
+ LASSERT(ndiov > 0);
+ LASSERT(nsiov > 0);
+ this_nob = min(diov->kiov_len - doffset,
+ siov->kiov_len - soffset);
+ this_nob = min(this_nob, nob);
+
+ if (daddr == NULL)
+ daddr = ((char *)kmap(diov->kiov_page)) +
+ diov->kiov_offset + doffset;
+ if (saddr == NULL)
+ saddr = ((char *)kmap(siov->kiov_page)) +
+ siov->kiov_offset + soffset;
+
+ /* Vanishing risk of kmap deadlock when mapping 2 pages.
+ * However in practice at least one of the kiovs will be mapped
+ * kernel pages and the map/unmap will be NOOPs */
+
+ memcpy(daddr, saddr, this_nob);
+ nob -= this_nob;
+
+ if (diov->kiov_len > doffset + this_nob) {
+ daddr += this_nob;
+ doffset += this_nob;
+ } else {
+ kunmap(diov->kiov_page);
+ daddr = NULL;
+ diov++;
+ ndiov--;
+ doffset = 0;
+ }
+
+ if (siov->kiov_len > soffset + this_nob) {
+ saddr += this_nob;
+ soffset += this_nob;
+ } else {
+ kunmap(siov->kiov_page);
+ saddr = NULL;
+ siov++;
+ nsiov--;
+ soffset = 0;
+ }
+ } while (nob > 0);
+
+ if (daddr != NULL)
+ kunmap(diov->kiov_page);
+ if (saddr != NULL)
+ kunmap(siov->kiov_page);
+}
+EXPORT_SYMBOL(lnet_copy_kiov2kiov);
+
+void
+lnet_copy_kiov2iov(unsigned int niov, struct kvec *iov, unsigned int iovoffset,
+ unsigned int nkiov, lnet_kiov_t *kiov,
+ unsigned int kiovoffset, unsigned int nob)
+{
+ /* NB iov, kiov are READ-ONLY */
+ unsigned int this_nob;
+ char *addr = NULL;
+
+ if (nob == 0)
+ return;
+
+ LASSERT(!in_interrupt());
+
+ LASSERT(niov > 0);
+ while (iovoffset >= iov->iov_len) {
+ iovoffset -= iov->iov_len;
+ iov++;
+ niov--;
+ LASSERT(niov > 0);
+ }
+
+ LASSERT(nkiov > 0);
+ while (kiovoffset >= kiov->kiov_len) {
+ kiovoffset -= kiov->kiov_len;
+ kiov++;
+ nkiov--;
+ LASSERT(nkiov > 0);
+ }
+
+ do {
+ LASSERT(niov > 0);
+ LASSERT(nkiov > 0);
+ this_nob = min(iov->iov_len - iovoffset,
+ (__kernel_size_t) kiov->kiov_len - kiovoffset);
+ this_nob = min(this_nob, nob);
+
+ if (addr == NULL)
+ addr = ((char *)kmap(kiov->kiov_page)) +
+ kiov->kiov_offset + kiovoffset;
+
+ memcpy((char *)iov->iov_base + iovoffset, addr, this_nob);
+ nob -= this_nob;
+
+ if (iov->iov_len > iovoffset + this_nob) {
+ iovoffset += this_nob;
+ } else {
+ iov++;
+ niov--;
+ iovoffset = 0;
+ }
+
+ if (kiov->kiov_len > kiovoffset + this_nob) {
+ addr += this_nob;
+ kiovoffset += this_nob;
+ } else {
+ kunmap(kiov->kiov_page);
+ addr = NULL;
+ kiov++;
+ nkiov--;
+ kiovoffset = 0;
+ }
+
+ } while (nob > 0);
+
+ if (addr != NULL)
+ kunmap(kiov->kiov_page);
+}
+EXPORT_SYMBOL(lnet_copy_kiov2iov);
+
+void
+lnet_copy_iov2kiov(unsigned int nkiov, lnet_kiov_t *kiov,
+ unsigned int kiovoffset, unsigned int niov,
+ struct kvec *iov, unsigned int iovoffset,
+ unsigned int nob)
+{
+ /* NB kiov, iov are READ-ONLY */
+ unsigned int this_nob;
+ char *addr = NULL;
+
+ if (nob == 0)
+ return;
+
+ LASSERT(!in_interrupt());
+
+ LASSERT(nkiov > 0);
+ while (kiovoffset >= kiov->kiov_len) {
+ kiovoffset -= kiov->kiov_len;
+ kiov++;
+ nkiov--;
+ LASSERT(nkiov > 0);
+ }
+
+ LASSERT(niov > 0);
+ while (iovoffset >= iov->iov_len) {
+ iovoffset -= iov->iov_len;
+ iov++;
+ niov--;
+ LASSERT(niov > 0);
+ }
+
+ do {
+ LASSERT(nkiov > 0);
+ LASSERT(niov > 0);
+ this_nob = min((__kernel_size_t) kiov->kiov_len - kiovoffset,
+ iov->iov_len - iovoffset);
+ this_nob = min(this_nob, nob);
+
+ if (addr == NULL)
+ addr = ((char *)kmap(kiov->kiov_page)) +
+ kiov->kiov_offset + kiovoffset;
+
+ memcpy(addr, (char *)iov->iov_base + iovoffset, this_nob);
+ nob -= this_nob;
+
+ if (kiov->kiov_len > kiovoffset + this_nob) {
+ addr += this_nob;
+ kiovoffset += this_nob;
+ } else {
+ kunmap(kiov->kiov_page);
+ addr = NULL;
+ kiov++;
+ nkiov--;
+ kiovoffset = 0;
+ }
+
+ if (iov->iov_len > iovoffset + this_nob) {
+ iovoffset += this_nob;
+ } else {
+ iov++;
+ niov--;
+ iovoffset = 0;
+ }
+ } while (nob > 0);
+
+ if (addr != NULL)
+ kunmap(kiov->kiov_page);
+}
+EXPORT_SYMBOL(lnet_copy_iov2kiov);
+
+int
+lnet_extract_kiov(int dst_niov, lnet_kiov_t *dst,
+ int src_niov, lnet_kiov_t *src,
+ unsigned int offset, unsigned int len)
+{
+ /* Initialise 'dst' to the subset of 'src' starting at 'offset',
+ * for exactly 'len' bytes, and return the number of entries.
+ * NB not destructive to 'src' */
+ unsigned int frag_len;
+ unsigned int niov;
+
+ if (len == 0) /* no data => */
+ return 0; /* no frags */
+
+ LASSERT(src_niov > 0);
+ while (offset >= src->kiov_len) { /* skip initial frags */
+ offset -= src->kiov_len;
+ src_niov--;
+ src++;
+ LASSERT(src_niov > 0);
+ }
+
+ niov = 1;
+ for (;;) {
+ LASSERT(src_niov > 0);
+ LASSERT((int)niov <= dst_niov);
+
+ frag_len = src->kiov_len - offset;
+ dst->kiov_page = src->kiov_page;
+ dst->kiov_offset = src->kiov_offset + offset;
+
+ if (len <= frag_len) {
+ dst->kiov_len = len;
+ LASSERT(dst->kiov_offset + dst->kiov_len
+ <= PAGE_CACHE_SIZE);
+ return niov;
+ }
+
+ dst->kiov_len = frag_len;
+ LASSERT(dst->kiov_offset + dst->kiov_len <= PAGE_CACHE_SIZE);
+
+ len -= frag_len;
+ dst++;
+ src++;
+ niov++;
+ src_niov--;
+ offset = 0;
+ }
+}
+EXPORT_SYMBOL(lnet_extract_kiov);
+
+static void
+lnet_ni_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed,
+ unsigned int offset, unsigned int mlen, unsigned int rlen)
+{
+ unsigned int niov = 0;
+ struct kvec *iov = NULL;
+ lnet_kiov_t *kiov = NULL;
+ int rc;
+
+ LASSERT(!in_interrupt());
+ LASSERT(mlen == 0 || msg != NULL);
+
+ if (msg != NULL) {
+ LASSERT(msg->msg_receiving);
+ LASSERT(!msg->msg_sending);
+ LASSERT(rlen == msg->msg_len);
+ LASSERT(mlen <= msg->msg_len);
+ LASSERT(msg->msg_offset == offset);
+ LASSERT(msg->msg_wanted == mlen);
+
+ msg->msg_receiving = 0;
+
+ if (mlen != 0) {
+ niov = msg->msg_niov;
+ iov = msg->msg_iov;
+ kiov = msg->msg_kiov;
+
+ LASSERT(niov > 0);
+ LASSERT((iov == NULL) != (kiov == NULL));
+ }
+ }
+
+ rc = (ni->ni_lnd->lnd_recv)(ni, private, msg, delayed,
+ niov, iov, kiov, offset, mlen, rlen);
+ if (rc < 0)
+ lnet_finalize(ni, msg, rc);
+}
+
+static void
+lnet_setpayloadbuffer(lnet_msg_t *msg)
+{
+ lnet_libmd_t *md = msg->msg_md;
+
+ LASSERT(msg->msg_len > 0);
+ LASSERT(!msg->msg_routing);
+ LASSERT(md != NULL);
+ LASSERT(msg->msg_niov == 0);
+ LASSERT(msg->msg_iov == NULL);
+ LASSERT(msg->msg_kiov == NULL);
+
+ msg->msg_niov = md->md_niov;
+ if ((md->md_options & LNET_MD_KIOV) != 0)
+ msg->msg_kiov = md->md_iov.kiov;
+ else
+ msg->msg_iov = md->md_iov.iov;
+}
+
+void
+lnet_prep_send(lnet_msg_t *msg, int type, lnet_process_id_t target,
+ unsigned int offset, unsigned int len)
+{
+ msg->msg_type = type;
+ msg->msg_target = target;
+ msg->msg_len = len;
+ msg->msg_offset = offset;
+
+ if (len != 0)
+ lnet_setpayloadbuffer(msg);
+
+ memset(&msg->msg_hdr, 0, sizeof(msg->msg_hdr));
+ msg->msg_hdr.type = cpu_to_le32(type);
+ msg->msg_hdr.dest_nid = cpu_to_le64(target.nid);
+ msg->msg_hdr.dest_pid = cpu_to_le32(target.pid);
+ /* src_nid will be set later */
+ msg->msg_hdr.src_pid = cpu_to_le32(the_lnet.ln_pid);
+ msg->msg_hdr.payload_length = cpu_to_le32(len);
+}
+
+static void
+lnet_ni_send(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+ void *priv = msg->msg_private;
+ int rc;
+
+ LASSERT(!in_interrupt());
+ LASSERT(LNET_NETTYP(LNET_NIDNET(ni->ni_nid)) == LOLND ||
+ (msg->msg_txcredit && msg->msg_peertxcredit));
+
+ rc = (ni->ni_lnd->lnd_send)(ni, priv, msg);
+ if (rc < 0)
+ lnet_finalize(ni, msg, rc);
+}
+
+static int
+lnet_ni_eager_recv(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+ int rc;
+
+ LASSERT(!msg->msg_sending);
+ LASSERT(msg->msg_receiving);
+ LASSERT(!msg->msg_rx_ready_delay);
+ LASSERT(ni->ni_lnd->lnd_eager_recv != NULL);
+
+ msg->msg_rx_ready_delay = 1;
+ rc = (ni->ni_lnd->lnd_eager_recv)(ni, msg->msg_private, msg,
+ &msg->msg_private);
+ if (rc != 0) {
+ CERROR("recv from %s / send to %s aborted: eager_recv failed %d\n",
+ libcfs_nid2str(msg->msg_rxpeer->lp_nid),
+ libcfs_id2str(msg->msg_target), rc);
+ LASSERT(rc < 0); /* required by my callers */
+ }
+
+ return rc;
+}
+
+/* NB: caller shall hold a ref on 'lp' as I'd drop lnet_net_lock */
+static void
+lnet_ni_query_locked(lnet_ni_t *ni, lnet_peer_t *lp)
+{
+ unsigned long last_alive = 0;
+
+ LASSERT(lnet_peer_aliveness_enabled(lp));
+ LASSERT(ni->ni_lnd->lnd_query != NULL);
+
+ lnet_net_unlock(lp->lp_cpt);
+ (ni->ni_lnd->lnd_query)(ni, lp->lp_nid, &last_alive);
+ lnet_net_lock(lp->lp_cpt);
+
+ lp->lp_last_query = cfs_time_current();
+
+ if (last_alive != 0) /* NI has updated timestamp */
+ lp->lp_last_alive = last_alive;
+}
+
+/* NB: always called with lnet_net_lock held */
+static inline int
+lnet_peer_is_alive(lnet_peer_t *lp, unsigned long now)
+{
+ int alive;
+ unsigned long deadline;
+
+ LASSERT(lnet_peer_aliveness_enabled(lp));
+
+ /* Trust lnet_notify() if it has more recent aliveness news, but
+ * ignore the initial assumed death (see lnet_peers_start_down()).
+ */
+ if (!lp->lp_alive && lp->lp_alive_count > 0 &&
+ cfs_time_aftereq(lp->lp_timestamp, lp->lp_last_alive))
+ return 0;
+
+ deadline = cfs_time_add(lp->lp_last_alive,
+ cfs_time_seconds(lp->lp_ni->ni_peertimeout));
+ alive = cfs_time_after(deadline, now);
+
+ /* Update obsolete lp_alive except for routers assumed to be dead
+ * initially, because router checker would update aliveness in this
+ * case, and moreover lp_last_alive at peer creation is assumed.
+ */
+ if (alive && !lp->lp_alive &&
+ !(lnet_isrouter(lp) && lp->lp_alive_count == 0))
+ lnet_notify_locked(lp, 0, 1, lp->lp_last_alive);
+
+ return alive;
+}
+
+
+/* NB: returns 1 when alive, 0 when dead, negative when error;
+ * may drop the lnet_net_lock */
+static int
+lnet_peer_alive_locked(lnet_peer_t *lp)
+{
+ unsigned long now = cfs_time_current();
+
+ if (!lnet_peer_aliveness_enabled(lp))
+ return -ENODEV;
+
+ if (lnet_peer_is_alive(lp, now))
+ return 1;
+
+ /* Peer appears dead, but we should avoid frequent NI queries (at
+ * most once per lnet_queryinterval seconds). */
+ if (lp->lp_last_query != 0) {
+ static const int lnet_queryinterval = 1;
+
+ unsigned long next_query =
+ cfs_time_add(lp->lp_last_query,
+ cfs_time_seconds(lnet_queryinterval));
+
+ if (time_before(now, next_query)) {
+ if (lp->lp_alive)
+ CWARN("Unexpected aliveness of peer %s: %d < %d (%d/%d)\n",
+ libcfs_nid2str(lp->lp_nid),
+ (int)now, (int)next_query,
+ lnet_queryinterval,
+ lp->lp_ni->ni_peertimeout);
+ return 0;
+ }
+ }
+
+ /* query NI for latest aliveness news */
+ lnet_ni_query_locked(lp->lp_ni, lp);
+
+ if (lnet_peer_is_alive(lp, now))
+ return 1;
+
+ lnet_notify_locked(lp, 0, 0, lp->lp_last_alive);
+ return 0;
+}
+
+/**
+ * \param msg The message to be sent.
+ * \param do_send True if lnet_ni_send() should be called in this function.
+ * lnet_send() is going to lnet_net_unlock immediately after this, so
+ * it sets do_send FALSE and I don't do the unlock/send/lock bit.
+ *
+ * \retval 0 If \a msg sent or OK to send.
+ * \retval EAGAIN If \a msg blocked for credit.
+ * \retval EHOSTUNREACH If the next hop of the message appears dead.
+ * \retval ECANCELED If the MD of the message has been unlinked.
+ */
+static int
+lnet_post_send_locked(lnet_msg_t *msg, int do_send)
+{
+ lnet_peer_t *lp = msg->msg_txpeer;
+ lnet_ni_t *ni = lp->lp_ni;
+ int cpt = msg->msg_tx_cpt;
+ struct lnet_tx_queue *tq = ni->ni_tx_queues[cpt];
+
+ /* non-lnet_send() callers have checked before */
+ LASSERT(!do_send || msg->msg_tx_delayed);
+ LASSERT(!msg->msg_receiving);
+ LASSERT(msg->msg_tx_committed);
+
+ /* NB 'lp' is always the next hop */
+ if ((msg->msg_target.pid & LNET_PID_USERFLAG) == 0 &&
+ lnet_peer_alive_locked(lp) == 0) {
+ the_lnet.ln_counters[cpt]->drop_count++;
+ the_lnet.ln_counters[cpt]->drop_length += msg->msg_len;
+ lnet_net_unlock(cpt);
+
+ CNETERR("Dropping message for %s: peer not alive\n",
+ libcfs_id2str(msg->msg_target));
+ if (do_send)
+ lnet_finalize(ni, msg, -EHOSTUNREACH);
+
+ lnet_net_lock(cpt);
+ return EHOSTUNREACH;
+ }
+
+ if (msg->msg_md != NULL &&
+ (msg->msg_md->md_flags & LNET_MD_FLAG_ABORTED) != 0) {
+ lnet_net_unlock(cpt);
+
+ CNETERR("Aborting message for %s: LNetM[DE]Unlink() already called on the MD/ME.\n",
+ libcfs_id2str(msg->msg_target));
+ if (do_send)
+ lnet_finalize(ni, msg, -ECANCELED);
+
+ lnet_net_lock(cpt);
+ return ECANCELED;
+ }
+
+ if (!msg->msg_peertxcredit) {
+ LASSERT((lp->lp_txcredits < 0) ==
+ !list_empty(&lp->lp_txq));
+
+ msg->msg_peertxcredit = 1;
+ lp->lp_txqnob += msg->msg_len + sizeof(lnet_hdr_t);
+ lp->lp_txcredits--;
+
+ if (lp->lp_txcredits < lp->lp_mintxcredits)
+ lp->lp_mintxcredits = lp->lp_txcredits;
+
+ if (lp->lp_txcredits < 0) {
+ msg->msg_tx_delayed = 1;
+ list_add_tail(&msg->msg_list, &lp->lp_txq);
+ return EAGAIN;
+ }
+ }
+
+ if (!msg->msg_txcredit) {
+ LASSERT((tq->tq_credits < 0) ==
+ !list_empty(&tq->tq_delayed));
+
+ msg->msg_txcredit = 1;
+ tq->tq_credits--;
+
+ if (tq->tq_credits < tq->tq_credits_min)
+ tq->tq_credits_min = tq->tq_credits;
+
+ if (tq->tq_credits < 0) {
+ msg->msg_tx_delayed = 1;
+ list_add_tail(&msg->msg_list, &tq->tq_delayed);
+ return EAGAIN;
+ }
+ }
+
+ if (do_send) {
+ lnet_net_unlock(cpt);
+ lnet_ni_send(ni, msg);
+ lnet_net_lock(cpt);
+ }
+ return 0;
+}
+
+
+static lnet_rtrbufpool_t *
+lnet_msg2bufpool(lnet_msg_t *msg)
+{
+ lnet_rtrbufpool_t *rbp;
+ int cpt;
+
+ LASSERT(msg->msg_rx_committed);
+
+ cpt = msg->msg_rx_cpt;
+ rbp = &the_lnet.ln_rtrpools[cpt][0];
+
+ LASSERT(msg->msg_len <= LNET_MTU);
+ while (msg->msg_len > (unsigned int)rbp->rbp_npages * PAGE_CACHE_SIZE) {
+ rbp++;
+ LASSERT(rbp < &the_lnet.ln_rtrpools[cpt][LNET_NRBPOOLS]);
+ }
+
+ return rbp;
+}
+
+static int
+lnet_post_routed_recv_locked(lnet_msg_t *msg, int do_recv)
+{
+ /* lnet_parse is going to lnet_net_unlock immediately after this, so it
+ * sets do_recv FALSE and I don't do the unlock/send/lock bit. I
+ * return EAGAIN if msg blocked and 0 if received or OK to receive */
+ lnet_peer_t *lp = msg->msg_rxpeer;
+ lnet_rtrbufpool_t *rbp;
+ lnet_rtrbuf_t *rb;
+
+ LASSERT(msg->msg_iov == NULL);
+ LASSERT(msg->msg_kiov == NULL);
+ LASSERT(msg->msg_niov == 0);
+ LASSERT(msg->msg_routing);
+ LASSERT(msg->msg_receiving);
+ LASSERT(!msg->msg_sending);
+
+ /* non-lnet_parse callers only receive delayed messages */
+ LASSERT(!do_recv || msg->msg_rx_delayed);
+
+ if (!msg->msg_peerrtrcredit) {
+ LASSERT((lp->lp_rtrcredits < 0) ==
+ !list_empty(&lp->lp_rtrq));
+
+ msg->msg_peerrtrcredit = 1;
+ lp->lp_rtrcredits--;
+ if (lp->lp_rtrcredits < lp->lp_minrtrcredits)
+ lp->lp_minrtrcredits = lp->lp_rtrcredits;
+
+ if (lp->lp_rtrcredits < 0) {
+ /* must have checked eager_recv before here */
+ LASSERT(msg->msg_rx_ready_delay);
+ msg->msg_rx_delayed = 1;
+ list_add_tail(&msg->msg_list, &lp->lp_rtrq);
+ return EAGAIN;
+ }
+ }
+
+ rbp = lnet_msg2bufpool(msg);
+
+ if (!msg->msg_rtrcredit) {
+ LASSERT((rbp->rbp_credits < 0) ==
+ !list_empty(&rbp->rbp_msgs));
+
+ msg->msg_rtrcredit = 1;
+ rbp->rbp_credits--;
+ if (rbp->rbp_credits < rbp->rbp_mincredits)
+ rbp->rbp_mincredits = rbp->rbp_credits;
+
+ if (rbp->rbp_credits < 0) {
+ /* must have checked eager_recv before here */
+ LASSERT(msg->msg_rx_ready_delay);
+ msg->msg_rx_delayed = 1;
+ list_add_tail(&msg->msg_list, &rbp->rbp_msgs);
+ return EAGAIN;
+ }
+ }
+
+ LASSERT(!list_empty(&rbp->rbp_bufs));
+ rb = list_entry(rbp->rbp_bufs.next, lnet_rtrbuf_t, rb_list);
+ list_del(&rb->rb_list);
+
+ msg->msg_niov = rbp->rbp_npages;
+ msg->msg_kiov = &rb->rb_kiov[0];
+
+ if (do_recv) {
+ int cpt = msg->msg_rx_cpt;
+
+ lnet_net_unlock(cpt);
+ lnet_ni_recv(lp->lp_ni, msg->msg_private, msg, 1,
+ 0, msg->msg_len, msg->msg_len);
+ lnet_net_lock(cpt);
+ }
+ return 0;
+}
+
+void
+lnet_return_tx_credits_locked(lnet_msg_t *msg)
+{
+ lnet_peer_t *txpeer = msg->msg_txpeer;
+ lnet_msg_t *msg2;
+
+ if (msg->msg_txcredit) {
+ struct lnet_ni *ni = txpeer->lp_ni;
+ struct lnet_tx_queue *tq = ni->ni_tx_queues[msg->msg_tx_cpt];
+
+ /* give back NI txcredits */
+ msg->msg_txcredit = 0;
+
+ LASSERT((tq->tq_credits < 0) ==
+ !list_empty(&tq->tq_delayed));
+
+ tq->tq_credits++;
+ if (tq->tq_credits <= 0) {
+ msg2 = list_entry(tq->tq_delayed.next,
+ lnet_msg_t, msg_list);
+ list_del(&msg2->msg_list);
+
+ LASSERT(msg2->msg_txpeer->lp_ni == ni);
+ LASSERT(msg2->msg_tx_delayed);
+
+ (void) lnet_post_send_locked(msg2, 1);
+ }
+ }
+
+ if (msg->msg_peertxcredit) {
+ /* give back peer txcredits */
+ msg->msg_peertxcredit = 0;
+
+ LASSERT((txpeer->lp_txcredits < 0) ==
+ !list_empty(&txpeer->lp_txq));
+
+ txpeer->lp_txqnob -= msg->msg_len + sizeof(lnet_hdr_t);
+ LASSERT(txpeer->lp_txqnob >= 0);
+
+ txpeer->lp_txcredits++;
+ if (txpeer->lp_txcredits <= 0) {
+ msg2 = list_entry(txpeer->lp_txq.next,
+ lnet_msg_t, msg_list);
+ list_del(&msg2->msg_list);
+
+ LASSERT(msg2->msg_txpeer == txpeer);
+ LASSERT(msg2->msg_tx_delayed);
+
+ (void) lnet_post_send_locked(msg2, 1);
+ }
+ }
+
+ if (txpeer != NULL) {
+ msg->msg_txpeer = NULL;
+ lnet_peer_decref_locked(txpeer);
+ }
+}
+
+void
+lnet_return_rx_credits_locked(lnet_msg_t *msg)
+{
+ lnet_peer_t *rxpeer = msg->msg_rxpeer;
+ lnet_msg_t *msg2;
+
+ if (msg->msg_rtrcredit) {
+ /* give back global router credits */
+ lnet_rtrbuf_t *rb;
+ lnet_rtrbufpool_t *rbp;
+
+ /* NB If a msg ever blocks for a buffer in rbp_msgs, it stays
+ * there until it gets one allocated, or aborts the wait
+ * itself */
+ LASSERT(msg->msg_kiov != NULL);
+
+ rb = list_entry(msg->msg_kiov, lnet_rtrbuf_t, rb_kiov[0]);
+ rbp = rb->rb_pool;
+ LASSERT(rbp == lnet_msg2bufpool(msg));
+
+ msg->msg_kiov = NULL;
+ msg->msg_rtrcredit = 0;
+
+ LASSERT((rbp->rbp_credits < 0) ==
+ !list_empty(&rbp->rbp_msgs));
+ LASSERT((rbp->rbp_credits > 0) ==
+ !list_empty(&rbp->rbp_bufs));
+
+ list_add(&rb->rb_list, &rbp->rbp_bufs);
+ rbp->rbp_credits++;
+ if (rbp->rbp_credits <= 0) {
+ msg2 = list_entry(rbp->rbp_msgs.next,
+ lnet_msg_t, msg_list);
+ list_del(&msg2->msg_list);
+
+ (void) lnet_post_routed_recv_locked(msg2, 1);
+ }
+ }
+
+ if (msg->msg_peerrtrcredit) {
+ /* give back peer router credits */
+ msg->msg_peerrtrcredit = 0;
+
+ LASSERT((rxpeer->lp_rtrcredits < 0) ==
+ !list_empty(&rxpeer->lp_rtrq));
+
+ rxpeer->lp_rtrcredits++;
+ if (rxpeer->lp_rtrcredits <= 0) {
+ msg2 = list_entry(rxpeer->lp_rtrq.next,
+ lnet_msg_t, msg_list);
+ list_del(&msg2->msg_list);
+
+ (void) lnet_post_routed_recv_locked(msg2, 1);
+ }
+ }
+ if (rxpeer != NULL) {
+ msg->msg_rxpeer = NULL;
+ lnet_peer_decref_locked(rxpeer);
+ }
+}
+
+static int
+lnet_compare_routes(lnet_route_t *r1, lnet_route_t *r2)
+{
+ lnet_peer_t *p1 = r1->lr_gateway;
+ lnet_peer_t *p2 = r2->lr_gateway;
+
+ if (r1->lr_priority < r2->lr_priority)
+ return 1;
+
+ if (r1->lr_priority > r2->lr_priority)
+ return -1;
+
+ if (r1->lr_hops < r2->lr_hops)
+ return 1;
+
+ if (r1->lr_hops > r2->lr_hops)
+ return -1;
+
+ if (p1->lp_txqnob < p2->lp_txqnob)
+ return 1;
+
+ if (p1->lp_txqnob > p2->lp_txqnob)
+ return -1;
+
+ if (p1->lp_txcredits > p2->lp_txcredits)
+ return 1;
+
+ if (p1->lp_txcredits < p2->lp_txcredits)
+ return -1;
+
+ if (r1->lr_seq - r2->lr_seq <= 0)
+ return 1;
+
+ return -1;
+}
+
+static lnet_peer_t *
+lnet_find_route_locked(lnet_ni_t *ni, lnet_nid_t target, lnet_nid_t rtr_nid)
+{
+ lnet_remotenet_t *rnet;
+ lnet_route_t *rtr;
+ lnet_route_t *rtr_best;
+ lnet_route_t *rtr_last;
+ struct lnet_peer *lp_best;
+ struct lnet_peer *lp;
+ int rc;
+
+ /* If @rtr_nid is not LNET_NID_ANY, return the gateway with
+ * rtr_nid nid, otherwise find the best gateway I can use */
+
+ rnet = lnet_find_net_locked(LNET_NIDNET(target));
+ if (rnet == NULL)
+ return NULL;
+
+ lp_best = NULL;
+ rtr_best = rtr_last = NULL;
+ list_for_each_entry(rtr, &rnet->lrn_routes, lr_list) {
+ lp = rtr->lr_gateway;
+
+ if (!lp->lp_alive || /* gateway is down */
+ ((lp->lp_ping_feats & LNET_PING_FEAT_NI_STATUS) != 0 &&
+ rtr->lr_downis != 0)) /* NI to target is down */
+ continue;
+
+ if (ni != NULL && lp->lp_ni != ni)
+ continue;
+
+ if (lp->lp_nid == rtr_nid) /* it's pre-determined router */
+ return lp;
+
+ if (lp_best == NULL) {
+ rtr_best = rtr_last = rtr;
+ lp_best = lp;
+ continue;
+ }
+
+ /* no protection on below fields, but it's harmless */
+ if (rtr_last->lr_seq - rtr->lr_seq < 0)
+ rtr_last = rtr;
+
+ rc = lnet_compare_routes(rtr, rtr_best);
+ if (rc < 0)
+ continue;
+
+ rtr_best = rtr;
+ lp_best = lp;
+ }
+
+ /* set sequence number on the best router to the latest sequence + 1
+ * so we can round-robin all routers, it's race and inaccurate but
+ * harmless and functional */
+ if (rtr_best != NULL)
+ rtr_best->lr_seq = rtr_last->lr_seq + 1;
+ return lp_best;
+}
+
+int
+lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg, lnet_nid_t rtr_nid)
+{
+ lnet_nid_t dst_nid = msg->msg_target.nid;
+ struct lnet_ni *src_ni;
+ struct lnet_ni *local_ni;
+ struct lnet_peer *lp;
+ int cpt;
+ int cpt2;
+ int rc;
+
+ /* NB: rtr_nid is set to LNET_NID_ANY for all current use-cases,
+ * but we might want to use pre-determined router for ACK/REPLY
+ * in the future */
+ /* NB: ni != NULL == interface pre-determined (ACK/REPLY) */
+ LASSERT(msg->msg_txpeer == NULL);
+ LASSERT(!msg->msg_sending);
+ LASSERT(!msg->msg_target_is_router);
+ LASSERT(!msg->msg_receiving);
+
+ msg->msg_sending = 1;
+
+ LASSERT(!msg->msg_tx_committed);
+ cpt = lnet_cpt_of_nid(rtr_nid == LNET_NID_ANY ? dst_nid : rtr_nid);
+ again:
+ lnet_net_lock(cpt);
+
+ if (the_lnet.ln_shutdown) {
+ lnet_net_unlock(cpt);
+ return -ESHUTDOWN;
+ }
+
+ if (src_nid == LNET_NID_ANY) {
+ src_ni = NULL;
+ } else {
+ src_ni = lnet_nid2ni_locked(src_nid, cpt);
+ if (src_ni == NULL) {
+ lnet_net_unlock(cpt);
+ LCONSOLE_WARN("Can't send to %s: src %s is not a local nid\n",
+ libcfs_nid2str(dst_nid),
+ libcfs_nid2str(src_nid));
+ return -EINVAL;
+ }
+ LASSERT(!msg->msg_routing);
+ }
+
+ /* Is this for someone on a local network? */
+ local_ni = lnet_net2ni_locked(LNET_NIDNET(dst_nid), cpt);
+
+ if (local_ni != NULL) {
+ if (src_ni == NULL) {
+ src_ni = local_ni;
+ src_nid = src_ni->ni_nid;
+ } else if (src_ni == local_ni) {
+ lnet_ni_decref_locked(local_ni, cpt);
+ } else {
+ lnet_ni_decref_locked(local_ni, cpt);
+ lnet_ni_decref_locked(src_ni, cpt);
+ lnet_net_unlock(cpt);
+ LCONSOLE_WARN("No route to %s via from %s\n",
+ libcfs_nid2str(dst_nid),
+ libcfs_nid2str(src_nid));
+ return -EINVAL;
+ }
+
+ LASSERT(src_nid != LNET_NID_ANY);
+ lnet_msg_commit(msg, cpt);
+
+ if (!msg->msg_routing)
+ msg->msg_hdr.src_nid = cpu_to_le64(src_nid);
+
+ if (src_ni == the_lnet.ln_loni) {
+ /* No send credit hassles with LOLND */
+ lnet_net_unlock(cpt);
+ lnet_ni_send(src_ni, msg);
+
+ lnet_net_lock(cpt);
+ lnet_ni_decref_locked(src_ni, cpt);
+ lnet_net_unlock(cpt);
+ return 0;
+ }
+
+ rc = lnet_nid2peer_locked(&lp, dst_nid, cpt);
+ /* lp has ref on src_ni; lose mine */
+ lnet_ni_decref_locked(src_ni, cpt);
+ if (rc != 0) {
+ lnet_net_unlock(cpt);
+ LCONSOLE_WARN("Error %d finding peer %s\n", rc,
+ libcfs_nid2str(dst_nid));
+ /* ENOMEM or shutting down */
+ return rc;
+ }
+ LASSERT(lp->lp_ni == src_ni);
+ } else {
+ /* sending to a remote network */
+ lp = lnet_find_route_locked(src_ni, dst_nid, rtr_nid);
+ if (lp == NULL) {
+ if (src_ni != NULL)
+ lnet_ni_decref_locked(src_ni, cpt);
+ lnet_net_unlock(cpt);
+
+ LCONSOLE_WARN("No route to %s via %s (all routers down)\n",
+ libcfs_id2str(msg->msg_target),
+ libcfs_nid2str(src_nid));
+ return -EHOSTUNREACH;
+ }
+
+ /* rtr_nid is LNET_NID_ANY or NID of pre-determined router,
+ * it's possible that rtr_nid isn't LNET_NID_ANY and lp isn't
+ * pre-determined router, this can happen if router table
+ * was changed when we release the lock */
+ if (rtr_nid != lp->lp_nid) {
+ cpt2 = lnet_cpt_of_nid_locked(lp->lp_nid);
+ if (cpt2 != cpt) {
+ if (src_ni != NULL)
+ lnet_ni_decref_locked(src_ni, cpt);
+ lnet_net_unlock(cpt);
+
+ rtr_nid = lp->lp_nid;
+ cpt = cpt2;
+ goto again;
+ }
+ }
+
+ CDEBUG(D_NET, "Best route to %s via %s for %s %d\n",
+ libcfs_nid2str(dst_nid), libcfs_nid2str(lp->lp_nid),
+ lnet_msgtyp2str(msg->msg_type), msg->msg_len);
+
+ if (src_ni == NULL) {
+ src_ni = lp->lp_ni;
+ src_nid = src_ni->ni_nid;
+ } else {
+ LASSERT(src_ni == lp->lp_ni);
+ lnet_ni_decref_locked(src_ni, cpt);
+ }
+
+ lnet_peer_addref_locked(lp);
+
+ LASSERT(src_nid != LNET_NID_ANY);
+ lnet_msg_commit(msg, cpt);
+
+ if (!msg->msg_routing) {
+ /* I'm the source and now I know which NI to send on */
+ msg->msg_hdr.src_nid = cpu_to_le64(src_nid);
+ }
+
+ msg->msg_target_is_router = 1;
+ msg->msg_target.nid = lp->lp_nid;
+ msg->msg_target.pid = LUSTRE_SRV_LNET_PID;
+ }
+
+ /* 'lp' is our best choice of peer */
+
+ LASSERT(!msg->msg_peertxcredit);
+ LASSERT(!msg->msg_txcredit);
+ LASSERT(msg->msg_txpeer == NULL);
+
+ msg->msg_txpeer = lp; /* msg takes my ref on lp */
+
+ rc = lnet_post_send_locked(msg, 0);
+ lnet_net_unlock(cpt);
+
+ if (rc == EHOSTUNREACH || rc == ECANCELED)
+ return -rc;
+
+ if (rc == 0)
+ lnet_ni_send(src_ni, msg);
+
+ return 0; /* rc == 0 or EAGAIN */
+}
+
+static void
+lnet_drop_message(lnet_ni_t *ni, int cpt, void *private, unsigned int nob)
+{
+ lnet_net_lock(cpt);
+ the_lnet.ln_counters[cpt]->drop_count++;
+ the_lnet.ln_counters[cpt]->drop_length += nob;
+ lnet_net_unlock(cpt);
+
+ lnet_ni_recv(ni, private, NULL, 0, 0, 0, nob);
+}
+
+static void
+lnet_recv_put(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+ lnet_hdr_t *hdr = &msg->msg_hdr;
+
+ if (msg->msg_wanted != 0)
+ lnet_setpayloadbuffer(msg);
+
+ lnet_build_msg_event(msg, LNET_EVENT_PUT);
+
+ /* Must I ACK? If so I'll grab the ack_wmd out of the header and put
+ * it back into the ACK during lnet_finalize() */
+ msg->msg_ack = (!lnet_is_wire_handle_none(&hdr->msg.put.ack_wmd) &&
+ (msg->msg_md->md_options & LNET_MD_ACK_DISABLE) == 0);
+
+ lnet_ni_recv(ni, msg->msg_private, msg, msg->msg_rx_delayed,
+ msg->msg_offset, msg->msg_wanted, hdr->payload_length);
+}
+
+static int
+lnet_parse_put(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+ lnet_hdr_t *hdr = &msg->msg_hdr;
+ struct lnet_match_info info;
+ int rc;
+
+ /* Convert put fields to host byte order */
+ hdr->msg.put.match_bits = le64_to_cpu(hdr->msg.put.match_bits);
+ hdr->msg.put.ptl_index = le32_to_cpu(hdr->msg.put.ptl_index);
+ hdr->msg.put.offset = le32_to_cpu(hdr->msg.put.offset);
+
+ info.mi_id.nid = hdr->src_nid;
+ info.mi_id.pid = hdr->src_pid;
+ info.mi_opc = LNET_MD_OP_PUT;
+ info.mi_portal = hdr->msg.put.ptl_index;
+ info.mi_rlength = hdr->payload_length;
+ info.mi_roffset = hdr->msg.put.offset;
+ info.mi_mbits = hdr->msg.put.match_bits;
+
+ msg->msg_rx_ready_delay = ni->ni_lnd->lnd_eager_recv == NULL;
+
+ again:
+ rc = lnet_ptl_match_md(&info, msg);
+ switch (rc) {
+ default:
+ LBUG();
+
+ case LNET_MATCHMD_OK:
+ lnet_recv_put(ni, msg);
+ return 0;
+
+ case LNET_MATCHMD_NONE:
+ if (msg->msg_rx_delayed) /* attached on delayed list */
+ return 0;
+
+ rc = lnet_ni_eager_recv(ni, msg);
+ if (rc == 0)
+ goto again;
+ /* fall through */
+
+ case LNET_MATCHMD_DROP:
+ CNETERR("Dropping PUT from %s portal %d match %llu offset %d length %d: %d\n",
+ libcfs_id2str(info.mi_id), info.mi_portal,
+ info.mi_mbits, info.mi_roffset, info.mi_rlength, rc);
+
+ return ENOENT; /* +ve: OK but no match */
+ }
+}
+
+static int
+lnet_parse_get(lnet_ni_t *ni, lnet_msg_t *msg, int rdma_get)
+{
+ struct lnet_match_info info;
+ lnet_hdr_t *hdr = &msg->msg_hdr;
+ lnet_handle_wire_t reply_wmd;
+ int rc;
+
+ /* Convert get fields to host byte order */
+ hdr->msg.get.match_bits = le64_to_cpu(hdr->msg.get.match_bits);
+ hdr->msg.get.ptl_index = le32_to_cpu(hdr->msg.get.ptl_index);
+ hdr->msg.get.sink_length = le32_to_cpu(hdr->msg.get.sink_length);
+ hdr->msg.get.src_offset = le32_to_cpu(hdr->msg.get.src_offset);
+
+ info.mi_id.nid = hdr->src_nid;
+ info.mi_id.pid = hdr->src_pid;
+ info.mi_opc = LNET_MD_OP_GET;
+ info.mi_portal = hdr->msg.get.ptl_index;
+ info.mi_rlength = hdr->msg.get.sink_length;
+ info.mi_roffset = hdr->msg.get.src_offset;
+ info.mi_mbits = hdr->msg.get.match_bits;
+
+ rc = lnet_ptl_match_md(&info, msg);
+ if (rc == LNET_MATCHMD_DROP) {
+ CNETERR("Dropping GET from %s portal %d match %llu offset %d length %d\n",
+ libcfs_id2str(info.mi_id), info.mi_portal,
+ info.mi_mbits, info.mi_roffset, info.mi_rlength);
+ return ENOENT; /* +ve: OK but no match */
+ }
+
+ LASSERT(rc == LNET_MATCHMD_OK);
+
+ lnet_build_msg_event(msg, LNET_EVENT_GET);
+
+ reply_wmd = hdr->msg.get.return_wmd;
+
+ lnet_prep_send(msg, LNET_MSG_REPLY, info.mi_id,
+ msg->msg_offset, msg->msg_wanted);
+
+ msg->msg_hdr.msg.reply.dst_wmd = reply_wmd;
+
+ if (rdma_get) {
+ /* The LND completes the REPLY from her recv procedure */
+ lnet_ni_recv(ni, msg->msg_private, msg, 0,
+ msg->msg_offset, msg->msg_len, msg->msg_len);
+ return 0;
+ }
+
+ lnet_ni_recv(ni, msg->msg_private, NULL, 0, 0, 0, 0);
+ msg->msg_receiving = 0;
+
+ rc = lnet_send(ni->ni_nid, msg, LNET_NID_ANY);
+ if (rc < 0) {
+ /* didn't get as far as lnet_ni_send() */
+ CERROR("%s: Unable to send REPLY for GET from %s: %d\n",
+ libcfs_nid2str(ni->ni_nid),
+ libcfs_id2str(info.mi_id), rc);
+
+ lnet_finalize(ni, msg, rc);
+ }
+
+ return 0;
+}
+
+static int
+lnet_parse_reply(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+ void *private = msg->msg_private;
+ lnet_hdr_t *hdr = &msg->msg_hdr;
+ lnet_process_id_t src = {0};
+ lnet_libmd_t *md;
+ int rlength;
+ int mlength;
+ int cpt;
+
+ cpt = lnet_cpt_of_cookie(hdr->msg.reply.dst_wmd.wh_object_cookie);
+ lnet_res_lock(cpt);
+
+ src.nid = hdr->src_nid;
+ src.pid = hdr->src_pid;
+
+ /* NB handles only looked up by creator (no flips) */
+ md = lnet_wire_handle2md(&hdr->msg.reply.dst_wmd);
+ if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) {
+ CNETERR("%s: Dropping REPLY from %s for %s MD %#llx.%#llx\n",
+ libcfs_nid2str(ni->ni_nid), libcfs_id2str(src),
+ (md == NULL) ? "invalid" : "inactive",
+ hdr->msg.reply.dst_wmd.wh_interface_cookie,
+ hdr->msg.reply.dst_wmd.wh_object_cookie);
+ if (md != NULL && md->md_me != NULL)
+ CERROR("REPLY MD also attached to portal %d\n",
+ md->md_me->me_portal);
+
+ lnet_res_unlock(cpt);
+ return ENOENT; /* +ve: OK but no match */
+ }
+
+ LASSERT(md->md_offset == 0);
+
+ rlength = hdr->payload_length;
+ mlength = min_t(uint, rlength, md->md_length);
+
+ if (mlength < rlength &&
+ (md->md_options & LNET_MD_TRUNCATE) == 0) {
+ CNETERR("%s: Dropping REPLY from %s length %d for MD %#llx would overflow (%d)\n",
+ libcfs_nid2str(ni->ni_nid), libcfs_id2str(src),
+ rlength, hdr->msg.reply.dst_wmd.wh_object_cookie,
+ mlength);
+ lnet_res_unlock(cpt);
+ return ENOENT; /* +ve: OK but no match */
+ }
+
+ CDEBUG(D_NET, "%s: Reply from %s of length %d/%d into md %#llx\n",
+ libcfs_nid2str(ni->ni_nid), libcfs_id2str(src),
+ mlength, rlength, hdr->msg.reply.dst_wmd.wh_object_cookie);
+
+ lnet_msg_attach_md(msg, md, 0, mlength);
+
+ if (mlength != 0)
+ lnet_setpayloadbuffer(msg);
+
+ lnet_res_unlock(cpt);
+
+ lnet_build_msg_event(msg, LNET_EVENT_REPLY);
+
+ lnet_ni_recv(ni, private, msg, 0, 0, mlength, rlength);
+ return 0;
+}
+
+static int
+lnet_parse_ack(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+ lnet_hdr_t *hdr = &msg->msg_hdr;
+ lnet_process_id_t src = {0};
+ lnet_libmd_t *md;
+ int cpt;
+
+ src.nid = hdr->src_nid;
+ src.pid = hdr->src_pid;
+
+ /* Convert ack fields to host byte order */
+ hdr->msg.ack.match_bits = le64_to_cpu(hdr->msg.ack.match_bits);
+ hdr->msg.ack.mlength = le32_to_cpu(hdr->msg.ack.mlength);
+
+ cpt = lnet_cpt_of_cookie(hdr->msg.ack.dst_wmd.wh_object_cookie);
+ lnet_res_lock(cpt);
+
+ /* NB handles only looked up by creator (no flips) */
+ md = lnet_wire_handle2md(&hdr->msg.ack.dst_wmd);
+ if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) {
+ /* Don't moan; this is expected */
+ CDEBUG(D_NET,
+ "%s: Dropping ACK from %s to %s MD %#llx.%#llx\n",
+ libcfs_nid2str(ni->ni_nid), libcfs_id2str(src),
+ (md == NULL) ? "invalid" : "inactive",
+ hdr->msg.ack.dst_wmd.wh_interface_cookie,
+ hdr->msg.ack.dst_wmd.wh_object_cookie);
+ if (md != NULL && md->md_me != NULL)
+ CERROR("Source MD also attached to portal %d\n",
+ md->md_me->me_portal);
+
+ lnet_res_unlock(cpt);
+ return ENOENT; /* +ve! */
+ }
+
+ CDEBUG(D_NET, "%s: ACK from %s into md %#llx\n",
+ libcfs_nid2str(ni->ni_nid), libcfs_id2str(src),
+ hdr->msg.ack.dst_wmd.wh_object_cookie);
+
+ lnet_msg_attach_md(msg, md, 0, 0);
+
+ lnet_res_unlock(cpt);
+
+ lnet_build_msg_event(msg, LNET_EVENT_ACK);
+
+ lnet_ni_recv(ni, msg->msg_private, msg, 0, 0, 0, msg->msg_len);
+ return 0;
+}
+
+static int
+lnet_parse_forward_locked(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+ int rc = 0;
+
+ if (msg->msg_rxpeer->lp_rtrcredits <= 0 ||
+ lnet_msg2bufpool(msg)->rbp_credits <= 0) {
+ if (ni->ni_lnd->lnd_eager_recv == NULL) {
+ msg->msg_rx_ready_delay = 1;
+ } else {
+ lnet_net_unlock(msg->msg_rx_cpt);
+ rc = lnet_ni_eager_recv(ni, msg);
+ lnet_net_lock(msg->msg_rx_cpt);
+ }
+ }
+
+ if (rc == 0)
+ rc = lnet_post_routed_recv_locked(msg, 0);
+ return rc;
+}
+
+char *
+lnet_msgtyp2str(int type)
+{
+ switch (type) {
+ case LNET_MSG_ACK:
+ return "ACK";
+ case LNET_MSG_PUT:
+ return "PUT";
+ case LNET_MSG_GET:
+ return "GET";
+ case LNET_MSG_REPLY:
+ return "REPLY";
+ case LNET_MSG_HELLO:
+ return "HELLO";
+ default:
+ return "<UNKNOWN>";
+ }
+}
+EXPORT_SYMBOL(lnet_msgtyp2str);
+
+void
+lnet_print_hdr(lnet_hdr_t *hdr)
+{
+ lnet_process_id_t src = {0};
+ lnet_process_id_t dst = {0};
+ char *type_str = lnet_msgtyp2str(hdr->type);
+
+ src.nid = hdr->src_nid;
+ src.pid = hdr->src_pid;
+
+ dst.nid = hdr->dest_nid;
+ dst.pid = hdr->dest_pid;
+
+ CWARN("P3 Header at %p of type %s\n", hdr, type_str);
+ CWARN(" From %s\n", libcfs_id2str(src));
+ CWARN(" To %s\n", libcfs_id2str(dst));
+
+ switch (hdr->type) {
+ default:
+ break;
+
+ case LNET_MSG_PUT:
+ CWARN(" Ptl index %d, ack md %#llx.%#llx, match bits %llu\n",
+ hdr->msg.put.ptl_index,
+ hdr->msg.put.ack_wmd.wh_interface_cookie,
+ hdr->msg.put.ack_wmd.wh_object_cookie,
+ hdr->msg.put.match_bits);
+ CWARN(" Length %d, offset %d, hdr data %#llx\n",
+ hdr->payload_length, hdr->msg.put.offset,
+ hdr->msg.put.hdr_data);
+ break;
+
+ case LNET_MSG_GET:
+ CWARN(" Ptl index %d, return md %#llx.%#llx, match bits %llu\n",
+ hdr->msg.get.ptl_index,
+ hdr->msg.get.return_wmd.wh_interface_cookie,
+ hdr->msg.get.return_wmd.wh_object_cookie,
+ hdr->msg.get.match_bits);
+ CWARN(" Length %d, src offset %d\n",
+ hdr->msg.get.sink_length,
+ hdr->msg.get.src_offset);
+ break;
+
+ case LNET_MSG_ACK:
+ CWARN(" dst md %#llx.%#llx, manipulated length %d\n",
+ hdr->msg.ack.dst_wmd.wh_interface_cookie,
+ hdr->msg.ack.dst_wmd.wh_object_cookie,
+ hdr->msg.ack.mlength);
+ break;
+
+ case LNET_MSG_REPLY:
+ CWARN(" dst md %#llx.%#llx, length %d\n",
+ hdr->msg.reply.dst_wmd.wh_interface_cookie,
+ hdr->msg.reply.dst_wmd.wh_object_cookie,
+ hdr->payload_length);
+ }
+
+}
+
+int
+lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid,
+ void *private, int rdma_req)
+{
+ int rc = 0;
+ int cpt;
+ int for_me;
+ struct lnet_msg *msg;
+ lnet_pid_t dest_pid;
+ lnet_nid_t dest_nid;
+ lnet_nid_t src_nid;
+ __u32 payload_length;
+ __u32 type;
+
+ LASSERT(!in_interrupt());
+
+ type = le32_to_cpu(hdr->type);
+ src_nid = le64_to_cpu(hdr->src_nid);
+ dest_nid = le64_to_cpu(hdr->dest_nid);
+ dest_pid = le32_to_cpu(hdr->dest_pid);
+ payload_length = le32_to_cpu(hdr->payload_length);
+
+ for_me = (ni->ni_nid == dest_nid);
+ cpt = lnet_cpt_of_nid(from_nid);
+
+ switch (type) {
+ case LNET_MSG_ACK:
+ case LNET_MSG_GET:
+ if (payload_length > 0) {
+ CERROR("%s, src %s: bad %s payload %d (0 expected)\n",
+ libcfs_nid2str(from_nid),
+ libcfs_nid2str(src_nid),
+ lnet_msgtyp2str(type), payload_length);
+ return -EPROTO;
+ }
+ break;
+
+ case LNET_MSG_PUT:
+ case LNET_MSG_REPLY:
+ if (payload_length >
+ (__u32)(for_me ? LNET_MAX_PAYLOAD : LNET_MTU)) {
+ CERROR("%s, src %s: bad %s payload %d (%d max expected)\n",
+ libcfs_nid2str(from_nid),
+ libcfs_nid2str(src_nid),
+ lnet_msgtyp2str(type),
+ payload_length,
+ for_me ? LNET_MAX_PAYLOAD : LNET_MTU);
+ return -EPROTO;
+ }
+ break;
+
+ default:
+ CERROR("%s, src %s: Bad message type 0x%x\n",
+ libcfs_nid2str(from_nid),
+ libcfs_nid2str(src_nid), type);
+ return -EPROTO;
+ }
+
+ if (the_lnet.ln_routing &&
+ ni->ni_last_alive != get_seconds()) {
+ lnet_ni_lock(ni);
+
+ /* NB: so far here is the only place to set NI status to "up */
+ ni->ni_last_alive = get_seconds();
+ if (ni->ni_status != NULL &&
+ ni->ni_status->ns_status == LNET_NI_STATUS_DOWN)
+ ni->ni_status->ns_status = LNET_NI_STATUS_UP;
+ lnet_ni_unlock(ni);
+ }
+
+ /* Regard a bad destination NID as a protocol error. Senders should
+ * know what they're doing; if they don't they're misconfigured, buggy
+ * or malicious so we chop them off at the knees :) */
+
+ if (!for_me) {
+ if (LNET_NIDNET(dest_nid) == LNET_NIDNET(ni->ni_nid)) {
+ /* should have gone direct */
+ CERROR("%s, src %s: Bad dest nid %s (should have been sent direct)\n",
+ libcfs_nid2str(from_nid),
+ libcfs_nid2str(src_nid),
+ libcfs_nid2str(dest_nid));
+ return -EPROTO;
+ }
+
+ if (lnet_islocalnid(dest_nid)) {
+ /* dest is another local NI; sender should have used
+ * this node's NID on its own network */
+ CERROR("%s, src %s: Bad dest nid %s (it's my nid but on a different network)\n",
+ libcfs_nid2str(from_nid),
+ libcfs_nid2str(src_nid),
+ libcfs_nid2str(dest_nid));
+ return -EPROTO;
+ }
+
+ if (rdma_req && type == LNET_MSG_GET) {
+ CERROR("%s, src %s: Bad optimized GET for %s (final destination must be me)\n",
+ libcfs_nid2str(from_nid),
+ libcfs_nid2str(src_nid),
+ libcfs_nid2str(dest_nid));
+ return -EPROTO;
+ }
+
+ if (!the_lnet.ln_routing) {
+ CERROR("%s, src %s: Dropping message for %s (routing not enabled)\n",
+ libcfs_nid2str(from_nid),
+ libcfs_nid2str(src_nid),
+ libcfs_nid2str(dest_nid));
+ goto drop;
+ }
+ }
+
+ /* Message looks OK; we're not going to return an error, so we MUST
+ * call back lnd_recv() come what may... */
+
+ if (!list_empty(&the_lnet.ln_test_peers) && /* normally we don't */
+ fail_peer(src_nid, 0)) { /* shall we now? */
+ CERROR("%s, src %s: Dropping %s to simulate failure\n",
+ libcfs_nid2str(from_nid), libcfs_nid2str(src_nid),
+ lnet_msgtyp2str(type));
+ goto drop;
+ }
+
+ msg = lnet_msg_alloc();
+ if (msg == NULL) {
+ CERROR("%s, src %s: Dropping %s (out of memory)\n",
+ libcfs_nid2str(from_nid), libcfs_nid2str(src_nid),
+ lnet_msgtyp2str(type));
+ goto drop;
+ }
+
+ /* msg zeroed in lnet_msg_alloc;
+ * i.e. flags all clear, pointers NULL etc
+ */
+
+ msg->msg_type = type;
+ msg->msg_private = private;
+ msg->msg_receiving = 1;
+ msg->msg_len = msg->msg_wanted = payload_length;
+ msg->msg_offset = 0;
+ msg->msg_hdr = *hdr;
+ /* for building message event */
+ msg->msg_from = from_nid;
+ if (!for_me) {
+ msg->msg_target.pid = dest_pid;
+ msg->msg_target.nid = dest_nid;
+ msg->msg_routing = 1;
+
+ } else {
+ /* convert common msg->hdr fields to host byteorder */
+ msg->msg_hdr.type = type;
+ msg->msg_hdr.src_nid = src_nid;
+ msg->msg_hdr.src_pid = le32_to_cpu(msg->msg_hdr.src_pid);
+ msg->msg_hdr.dest_nid = dest_nid;
+ msg->msg_hdr.dest_pid = dest_pid;
+ msg->msg_hdr.payload_length = payload_length;
+ }
+
+ lnet_net_lock(cpt);
+ rc = lnet_nid2peer_locked(&msg->msg_rxpeer, from_nid, cpt);
+ if (rc != 0) {
+ lnet_net_unlock(cpt);
+ CERROR("%s, src %s: Dropping %s (error %d looking up sender)\n",
+ libcfs_nid2str(from_nid), libcfs_nid2str(src_nid),
+ lnet_msgtyp2str(type), rc);
+ lnet_msg_free(msg);
+ goto drop;
+ }
+
+ if (lnet_isrouter(msg->msg_rxpeer)) {
+ lnet_peer_set_alive(msg->msg_rxpeer);
+ if (avoid_asym_router_failure &&
+ LNET_NIDNET(src_nid) != LNET_NIDNET(from_nid)) {
+ /* received a remote message from router, update
+ * remote NI status on this router.
+ * NB: multi-hop routed message will be ignored.
+ */
+ lnet_router_ni_update_locked(msg->msg_rxpeer,
+ LNET_NIDNET(src_nid));
+ }
+ }
+
+ lnet_msg_commit(msg, cpt);
+
+ if (!for_me) {
+ rc = lnet_parse_forward_locked(ni, msg);
+ lnet_net_unlock(cpt);
+
+ if (rc < 0)
+ goto free_drop;
+ if (rc == 0) {
+ lnet_ni_recv(ni, msg->msg_private, msg, 0,
+ 0, payload_length, payload_length);
+ }
+ return 0;
+ }
+
+ lnet_net_unlock(cpt);
+
+ switch (type) {
+ case LNET_MSG_ACK:
+ rc = lnet_parse_ack(ni, msg);
+ break;
+ case LNET_MSG_PUT:
+ rc = lnet_parse_put(ni, msg);
+ break;
+ case LNET_MSG_GET:
+ rc = lnet_parse_get(ni, msg, rdma_req);
+ break;
+ case LNET_MSG_REPLY:
+ rc = lnet_parse_reply(ni, msg);
+ break;
+ default:
+ LASSERT(0);
+ rc = -EPROTO;
+ goto free_drop; /* prevent an unused label if !kernel */
+ }
+
+ if (rc == 0)
+ return 0;
+
+ LASSERT(rc == ENOENT);
+
+ free_drop:
+ LASSERT(msg->msg_md == NULL);
+ lnet_finalize(ni, msg, rc);
+
+ drop:
+ lnet_drop_message(ni, cpt, private, payload_length);
+ return 0;
+}
+EXPORT_SYMBOL(lnet_parse);
+
+void
+lnet_drop_delayed_msg_list(struct list_head *head, char *reason)
+{
+ while (!list_empty(head)) {
+ lnet_process_id_t id = {0};
+ lnet_msg_t *msg;
+
+ msg = list_entry(head->next, lnet_msg_t, msg_list);
+ list_del(&msg->msg_list);
+
+ id.nid = msg->msg_hdr.src_nid;
+ id.pid = msg->msg_hdr.src_pid;
+
+ LASSERT(msg->msg_md == NULL);
+ LASSERT(msg->msg_rx_delayed);
+ LASSERT(msg->msg_rxpeer != NULL);
+ LASSERT(msg->msg_hdr.type == LNET_MSG_PUT);
+
+ CWARN("Dropping delayed PUT from %s portal %d match %llu offset %d length %d: %s\n",
+ libcfs_id2str(id),
+ msg->msg_hdr.msg.put.ptl_index,
+ msg->msg_hdr.msg.put.match_bits,
+ msg->msg_hdr.msg.put.offset,
+ msg->msg_hdr.payload_length, reason);
+
+ /* NB I can't drop msg's ref on msg_rxpeer until after I've
+ * called lnet_drop_message(), so I just hang onto msg as well
+ * until that's done */
+
+ lnet_drop_message(msg->msg_rxpeer->lp_ni,
+ msg->msg_rxpeer->lp_cpt,
+ msg->msg_private, msg->msg_len);
+ /*
+ * NB: message will not generate event because w/o attached MD,
+ * but we still should give error code so lnet_msg_decommit()
+ * can skip counters operations and other checks.
+ */
+ lnet_finalize(msg->msg_rxpeer->lp_ni, msg, -ENOENT);
+ }
+}
+
+void
+lnet_recv_delayed_msg_list(struct list_head *head)
+{
+ while (!list_empty(head)) {
+ lnet_msg_t *msg;
+ lnet_process_id_t id;
+
+ msg = list_entry(head->next, lnet_msg_t, msg_list);
+ list_del(&msg->msg_list);
+
+ /* md won't disappear under me, since each msg
+ * holds a ref on it */
+
+ id.nid = msg->msg_hdr.src_nid;
+ id.pid = msg->msg_hdr.src_pid;
+
+ LASSERT(msg->msg_rx_delayed);
+ LASSERT(msg->msg_md != NULL);
+ LASSERT(msg->msg_rxpeer != NULL);
+ LASSERT(msg->msg_hdr.type == LNET_MSG_PUT);
+
+ CDEBUG(D_NET, "Resuming delayed PUT from %s portal %d match %llu offset %d length %d.\n",
+ libcfs_id2str(id), msg->msg_hdr.msg.put.ptl_index,
+ msg->msg_hdr.msg.put.match_bits,
+ msg->msg_hdr.msg.put.offset,
+ msg->msg_hdr.payload_length);
+
+ lnet_recv_put(msg->msg_rxpeer->lp_ni, msg);
+ }
+}
+
+/**
+ * Initiate an asynchronous PUT operation.
+ *
+ * There are several events associated with a PUT: completion of the send on
+ * the initiator node (LNET_EVENT_SEND), and when the send completes
+ * successfully, the receipt of an acknowledgment (LNET_EVENT_ACK) indicating
+ * that the operation was accepted by the target. The event LNET_EVENT_PUT is
+ * used at the target node to indicate the completion of incoming data
+ * delivery.
+ *
+ * The local events will be logged in the EQ associated with the MD pointed to
+ * by \a mdh handle. Using a MD without an associated EQ results in these
+ * events being discarded. In this case, the caller must have another
+ * mechanism (e.g., a higher level protocol) for determining when it is safe
+ * to modify the memory region associated with the MD.
+ *
+ * Note that LNet does not guarantee the order of LNET_EVENT_SEND and
+ * LNET_EVENT_ACK, though intuitively ACK should happen after SEND.
+ *
+ * \param self Indicates the NID of a local interface through which to send
+ * the PUT request. Use LNET_NID_ANY to let LNet choose one by itself.
+ * \param mdh A handle for the MD that describes the memory to be sent. The MD
+ * must be "free floating" (See LNetMDBind()).
+ * \param ack Controls whether an acknowledgment is requested.
+ * Acknowledgments are only sent when they are requested by the initiating
+ * process and the target MD enables them.
+ * \param target A process identifier for the target process.
+ * \param portal The index in the \a target's portal table.
+ * \param match_bits The match bits to use for MD selection at the target
+ * process.
+ * \param offset The offset into the target MD (only used when the target
+ * MD has the LNET_MD_MANAGE_REMOTE option set).
+ * \param hdr_data 64 bits of user data that can be included in the message
+ * header. This data is written to an event queue entry at the target if an
+ * EQ is present on the matching MD.
+ *
+ * \retval 0 Success, and only in this case events will be generated
+ * and logged to EQ (if it exists).
+ * \retval -EIO Simulated failure.
+ * \retval -ENOMEM Memory allocation failure.
+ * \retval -ENOENT Invalid MD object.
+ *
+ * \see lnet_event_t::hdr_data and lnet_event_kind_t.
+ */
+int
+LNetPut(lnet_nid_t self, lnet_handle_md_t mdh, lnet_ack_req_t ack,
+ lnet_process_id_t target, unsigned int portal,
+ __u64 match_bits, unsigned int offset,
+ __u64 hdr_data)
+{
+ struct lnet_msg *msg;
+ struct lnet_libmd *md;
+ int cpt;
+ int rc;
+
+ LASSERT(the_lnet.ln_init);
+ LASSERT(the_lnet.ln_refcount > 0);
+
+ if (!list_empty(&the_lnet.ln_test_peers) && /* normally we don't */
+ fail_peer(target.nid, 1)) { /* shall we now? */
+ CERROR("Dropping PUT to %s: simulated failure\n",
+ libcfs_id2str(target));
+ return -EIO;
+ }
+
+ msg = lnet_msg_alloc();
+ if (msg == NULL) {
+ CERROR("Dropping PUT to %s: ENOMEM on lnet_msg_t\n",
+ libcfs_id2str(target));
+ return -ENOMEM;
+ }
+ msg->msg_vmflush = !!memory_pressure_get();
+
+ cpt = lnet_cpt_of_cookie(mdh.cookie);
+ lnet_res_lock(cpt);
+
+ md = lnet_handle2md(&mdh);
+ if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) {
+ CERROR("Dropping PUT (%llu:%d:%s): MD (%d) invalid\n",
+ match_bits, portal, libcfs_id2str(target),
+ md == NULL ? -1 : md->md_threshold);
+ if (md != NULL && md->md_me != NULL)
+ CERROR("Source MD also attached to portal %d\n",
+ md->md_me->me_portal);
+ lnet_res_unlock(cpt);
+
+ lnet_msg_free(msg);
+ return -ENOENT;
+ }
+
+ CDEBUG(D_NET, "LNetPut -> %s\n", libcfs_id2str(target));
+
+ lnet_msg_attach_md(msg, md, 0, 0);
+
+ lnet_prep_send(msg, LNET_MSG_PUT, target, 0, md->md_length);
+
+ msg->msg_hdr.msg.put.match_bits = cpu_to_le64(match_bits);
+ msg->msg_hdr.msg.put.ptl_index = cpu_to_le32(portal);
+ msg->msg_hdr.msg.put.offset = cpu_to_le32(offset);
+ msg->msg_hdr.msg.put.hdr_data = hdr_data;
+
+ /* NB handles only looked up by creator (no flips) */
+ if (ack == LNET_ACK_REQ) {
+ msg->msg_hdr.msg.put.ack_wmd.wh_interface_cookie =
+ the_lnet.ln_interface_cookie;
+ msg->msg_hdr.msg.put.ack_wmd.wh_object_cookie =
+ md->md_lh.lh_cookie;
+ } else {
+ msg->msg_hdr.msg.put.ack_wmd.wh_interface_cookie =
+ LNET_WIRE_HANDLE_COOKIE_NONE;
+ msg->msg_hdr.msg.put.ack_wmd.wh_object_cookie =
+ LNET_WIRE_HANDLE_COOKIE_NONE;
+ }
+
+ lnet_res_unlock(cpt);
+
+ lnet_build_msg_event(msg, LNET_EVENT_SEND);
+
+ rc = lnet_send(self, msg, LNET_NID_ANY);
+ if (rc != 0) {
+ CNETERR("Error sending PUT to %s: %d\n",
+ libcfs_id2str(target), rc);
+ lnet_finalize(NULL, msg, rc);
+ }
+
+ /* completion will be signalled by an event */
+ return 0;
+}
+EXPORT_SYMBOL(LNetPut);
+
+lnet_msg_t *
+lnet_create_reply_msg(lnet_ni_t *ni, lnet_msg_t *getmsg)
+{
+ /* The LND can DMA direct to the GET md (i.e. no REPLY msg). This
+ * returns a msg for the LND to pass to lnet_finalize() when the sink
+ * data has been received.
+ *
+ * CAVEAT EMPTOR: 'getmsg' is the original GET, which is freed when
+ * lnet_finalize() is called on it, so the LND must call this first */
+
+ struct lnet_msg *msg = lnet_msg_alloc();
+ struct lnet_libmd *getmd = getmsg->msg_md;
+ lnet_process_id_t peer_id = getmsg->msg_target;
+ int cpt;
+
+ LASSERT(!getmsg->msg_target_is_router);
+ LASSERT(!getmsg->msg_routing);
+
+ cpt = lnet_cpt_of_cookie(getmd->md_lh.lh_cookie);
+ lnet_res_lock(cpt);
+
+ LASSERT(getmd->md_refcount > 0);
+
+ if (msg == NULL) {
+ CERROR("%s: Dropping REPLY from %s: can't allocate msg\n",
+ libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id));
+ goto drop;
+ }
+
+ if (getmd->md_threshold == 0) {
+ CERROR("%s: Dropping REPLY from %s for inactive MD %p\n",
+ libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id),
+ getmd);
+ lnet_res_unlock(cpt);
+ goto drop;
+ }
+
+ LASSERT(getmd->md_offset == 0);
+
+ CDEBUG(D_NET, "%s: Reply from %s md %p\n",
+ libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id), getmd);
+
+ /* setup information for lnet_build_msg_event */
+ msg->msg_from = peer_id.nid;
+ msg->msg_type = LNET_MSG_GET; /* flag this msg as an "optimized" GET */
+ msg->msg_hdr.src_nid = peer_id.nid;
+ msg->msg_hdr.payload_length = getmd->md_length;
+ msg->msg_receiving = 1; /* required by lnet_msg_attach_md */
+
+ lnet_msg_attach_md(msg, getmd, getmd->md_offset, getmd->md_length);
+ lnet_res_unlock(cpt);
+
+ cpt = lnet_cpt_of_nid(peer_id.nid);
+
+ lnet_net_lock(cpt);
+ lnet_msg_commit(msg, cpt);
+ lnet_net_unlock(cpt);
+
+ lnet_build_msg_event(msg, LNET_EVENT_REPLY);
+
+ return msg;
+
+ drop:
+ cpt = lnet_cpt_of_nid(peer_id.nid);
+
+ lnet_net_lock(cpt);
+ the_lnet.ln_counters[cpt]->drop_count++;
+ the_lnet.ln_counters[cpt]->drop_length += getmd->md_length;
+ lnet_net_unlock(cpt);
+
+ if (msg != NULL)
+ lnet_msg_free(msg);
+
+ return NULL;
+}
+EXPORT_SYMBOL(lnet_create_reply_msg);
+
+void
+lnet_set_reply_msg_len(lnet_ni_t *ni, lnet_msg_t *reply, unsigned int len)
+{
+ /* Set the REPLY length, now the RDMA that elides the REPLY message has
+ * completed and I know it. */
+ LASSERT(reply != NULL);
+ LASSERT(reply->msg_type == LNET_MSG_GET);
+ LASSERT(reply->msg_ev.type == LNET_EVENT_REPLY);
+
+ /* NB I trusted my peer to RDMA. If she tells me she's written beyond
+ * the end of my buffer, I might as well be dead. */
+ LASSERT(len <= reply->msg_ev.mlength);
+
+ reply->msg_ev.mlength = len;
+}
+EXPORT_SYMBOL(lnet_set_reply_msg_len);
+
+/**
+ * Initiate an asynchronous GET operation.
+ *
+ * On the initiator node, an LNET_EVENT_SEND is logged when the GET request
+ * is sent, and an LNET_EVENT_REPLY is logged when the data returned from
+ * the target node in the REPLY has been written to local MD.
+ *
+ * On the target node, an LNET_EVENT_GET is logged when the GET request
+ * arrives and is accepted into a MD.
+ *
+ * \param self,target,portal,match_bits,offset See the discussion in LNetPut().
+ * \param mdh A handle for the MD that describes the memory into which the
+ * requested data will be received. The MD must be "free floating"
+ * (See LNetMDBind()).
+ *
+ * \retval 0 Success, and only in this case events will be generated
+ * and logged to EQ (if it exists) of the MD.
+ * \retval -EIO Simulated failure.
+ * \retval -ENOMEM Memory allocation failure.
+ * \retval -ENOENT Invalid MD object.
+ */
+int
+LNetGet(lnet_nid_t self, lnet_handle_md_t mdh,
+ lnet_process_id_t target, unsigned int portal,
+ __u64 match_bits, unsigned int offset)
+{
+ struct lnet_msg *msg;
+ struct lnet_libmd *md;
+ int cpt;
+ int rc;
+
+ LASSERT(the_lnet.ln_init);
+ LASSERT(the_lnet.ln_refcount > 0);
+
+ if (!list_empty(&the_lnet.ln_test_peers) && /* normally we don't */
+ fail_peer(target.nid, 1)) { /* shall we now? */
+ CERROR("Dropping GET to %s: simulated failure\n",
+ libcfs_id2str(target));
+ return -EIO;
+ }
+
+ msg = lnet_msg_alloc();
+ if (msg == NULL) {
+ CERROR("Dropping GET to %s: ENOMEM on lnet_msg_t\n",
+ libcfs_id2str(target));
+ return -ENOMEM;
+ }
+
+ cpt = lnet_cpt_of_cookie(mdh.cookie);
+ lnet_res_lock(cpt);
+
+ md = lnet_handle2md(&mdh);
+ if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) {
+ CERROR("Dropping GET (%llu:%d:%s): MD (%d) invalid\n",
+ match_bits, portal, libcfs_id2str(target),
+ md == NULL ? -1 : md->md_threshold);
+ if (md != NULL && md->md_me != NULL)
+ CERROR("REPLY MD also attached to portal %d\n",
+ md->md_me->me_portal);
+
+ lnet_res_unlock(cpt);
+
+ lnet_msg_free(msg);
+ return -ENOENT;
+ }
+
+ CDEBUG(D_NET, "LNetGet -> %s\n", libcfs_id2str(target));
+
+ lnet_msg_attach_md(msg, md, 0, 0);
+
+ lnet_prep_send(msg, LNET_MSG_GET, target, 0, 0);
+
+ msg->msg_hdr.msg.get.match_bits = cpu_to_le64(match_bits);
+ msg->msg_hdr.msg.get.ptl_index = cpu_to_le32(portal);
+ msg->msg_hdr.msg.get.src_offset = cpu_to_le32(offset);
+ msg->msg_hdr.msg.get.sink_length = cpu_to_le32(md->md_length);
+
+ /* NB handles only looked up by creator (no flips) */
+ msg->msg_hdr.msg.get.return_wmd.wh_interface_cookie =
+ the_lnet.ln_interface_cookie;
+ msg->msg_hdr.msg.get.return_wmd.wh_object_cookie =
+ md->md_lh.lh_cookie;
+
+ lnet_res_unlock(cpt);
+
+ lnet_build_msg_event(msg, LNET_EVENT_SEND);
+
+ rc = lnet_send(self, msg, LNET_NID_ANY);
+ if (rc < 0) {
+ CNETERR("Error sending GET to %s: %d\n",
+ libcfs_id2str(target), rc);
+ lnet_finalize(NULL, msg, rc);
+ }
+
+ /* completion will be signalled by an event */
+ return 0;
+}
+EXPORT_SYMBOL(LNetGet);
+
+/**
+ * Calculate distance to node at \a dstnid.
+ *
+ * \param dstnid Target NID.
+ * \param srcnidp If not NULL, NID of the local interface to reach \a dstnid
+ * is saved here.
+ * \param orderp If not NULL, order of the route to reach \a dstnid is saved
+ * here.
+ *
+ * \retval 0 If \a dstnid belongs to a local interface, and reserved option
+ * local_nid_dist_zero is set, which is the default.
+ * \retval positives Distance to target NID, i.e. number of hops plus one.
+ * \retval -EHOSTUNREACH If \a dstnid is not reachable.
+ */
+int
+LNetDist(lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp)
+{
+ struct list_head *e;
+ struct lnet_ni *ni;
+ lnet_remotenet_t *rnet;
+ __u32 dstnet = LNET_NIDNET(dstnid);
+ int hops;
+ int cpt;
+ __u32 order = 2;
+ struct list_head *rn_list;
+
+ /* if !local_nid_dist_zero, I don't return a distance of 0 ever
+ * (when lustre sees a distance of 0, it substitutes 0@lo), so I
+ * keep order 0 free for 0@lo and order 1 free for a local NID
+ * match */
+
+ LASSERT(the_lnet.ln_init);
+ LASSERT(the_lnet.ln_refcount > 0);
+
+ cpt = lnet_net_lock_current();
+
+ list_for_each(e, &the_lnet.ln_nis) {
+ ni = list_entry(e, lnet_ni_t, ni_list);
+
+ if (ni->ni_nid == dstnid) {
+ if (srcnidp != NULL)
+ *srcnidp = dstnid;
+ if (orderp != NULL) {
+ if (LNET_NETTYP(LNET_NIDNET(dstnid)) == LOLND)
+ *orderp = 0;
+ else
+ *orderp = 1;
+ }
+ lnet_net_unlock(cpt);
+
+ return local_nid_dist_zero ? 0 : 1;
+ }
+
+ if (LNET_NIDNET(ni->ni_nid) == dstnet) {
+ if (srcnidp != NULL)
+ *srcnidp = ni->ni_nid;
+ if (orderp != NULL)
+ *orderp = order;
+ lnet_net_unlock(cpt);
+ return 1;
+ }
+
+ order++;
+ }
+
+ rn_list = lnet_net2rnethash(dstnet);
+ list_for_each(e, rn_list) {
+ rnet = list_entry(e, lnet_remotenet_t, lrn_list);
+
+ if (rnet->lrn_net == dstnet) {
+ lnet_route_t *route;
+ lnet_route_t *shortest = NULL;
+
+ LASSERT(!list_empty(&rnet->lrn_routes));
+
+ list_for_each_entry(route, &rnet->lrn_routes,
+ lr_list) {
+ if (shortest == NULL ||
+ route->lr_hops < shortest->lr_hops)
+ shortest = route;
+ }
+
+ LASSERT(shortest != NULL);
+ hops = shortest->lr_hops;
+ if (srcnidp != NULL)
+ *srcnidp = shortest->lr_gateway->lp_ni->ni_nid;
+ if (orderp != NULL)
+ *orderp = order;
+ lnet_net_unlock(cpt);
+ return hops + 1;
+ }
+ order++;
+ }
+
+ lnet_net_unlock(cpt);
+ return -EHOSTUNREACH;
+}
+EXPORT_SYMBOL(LNetDist);
+
+/**
+ * Set the number of asynchronous messages expected from a target process.
+ *
+ * This function is only meaningful for userspace callers. It's a no-op when
+ * called from kernel.
+ *
+ * Asynchronous messages are those that can come from a target when the
+ * userspace process is not waiting for IO to complete; e.g., AST callbacks
+ * from Lustre servers. Specifying the expected number of such messages
+ * allows them to be eagerly received when user process is not running in
+ * LNet; otherwise network errors may occur.
+ *
+ * \param id Process ID of the target process.
+ * \param nasync Number of asynchronous messages expected from the target.
+ *
+ * \return 0 on success, and an error code otherwise.
+ */
+int
+LNetSetAsync(lnet_process_id_t id, int nasync)
+{
+ return 0;
+}
+EXPORT_SYMBOL(LNetSetAsync);
diff --git a/kernel/drivers/staging/lustre/lnet/lnet/lib-msg.c b/kernel/drivers/staging/lustre/lnet/lnet/lib-msg.c
new file mode 100644
index 000000000..a46ccbf66
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lnet/lnet/lib-msg.c
@@ -0,0 +1,647 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/lib-msg.c
+ *
+ * Message decoding, parsing and finalizing routines
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include "../../include/linux/lnet/lib-lnet.h"
+
+void
+lnet_build_unlink_event(lnet_libmd_t *md, lnet_event_t *ev)
+{
+ memset(ev, 0, sizeof(*ev));
+
+ ev->status = 0;
+ ev->unlinked = 1;
+ ev->type = LNET_EVENT_UNLINK;
+ lnet_md_deconstruct(md, &ev->md);
+ lnet_md2handle(&ev->md_handle, md);
+}
+
+/*
+ * Don't need any lock, must be called after lnet_commit_md
+ */
+void
+lnet_build_msg_event(lnet_msg_t *msg, lnet_event_kind_t ev_type)
+{
+ lnet_hdr_t *hdr = &msg->msg_hdr;
+ lnet_event_t *ev = &msg->msg_ev;
+
+ LASSERT(!msg->msg_routing);
+
+ ev->type = ev_type;
+
+ if (ev_type == LNET_EVENT_SEND) {
+ /* event for active message */
+ ev->target.nid = le64_to_cpu(hdr->dest_nid);
+ ev->target.pid = le32_to_cpu(hdr->dest_pid);
+ ev->initiator.nid = LNET_NID_ANY;
+ ev->initiator.pid = the_lnet.ln_pid;
+ ev->sender = LNET_NID_ANY;
+
+ } else {
+ /* event for passive message */
+ ev->target.pid = hdr->dest_pid;
+ ev->target.nid = hdr->dest_nid;
+ ev->initiator.pid = hdr->src_pid;
+ ev->initiator.nid = hdr->src_nid;
+ ev->rlength = hdr->payload_length;
+ ev->sender = msg->msg_from;
+ ev->mlength = msg->msg_wanted;
+ ev->offset = msg->msg_offset;
+ }
+
+ switch (ev_type) {
+ default:
+ LBUG();
+
+ case LNET_EVENT_PUT: /* passive PUT */
+ ev->pt_index = hdr->msg.put.ptl_index;
+ ev->match_bits = hdr->msg.put.match_bits;
+ ev->hdr_data = hdr->msg.put.hdr_data;
+ return;
+
+ case LNET_EVENT_GET: /* passive GET */
+ ev->pt_index = hdr->msg.get.ptl_index;
+ ev->match_bits = hdr->msg.get.match_bits;
+ ev->hdr_data = 0;
+ return;
+
+ case LNET_EVENT_ACK: /* ACK */
+ ev->match_bits = hdr->msg.ack.match_bits;
+ ev->mlength = hdr->msg.ack.mlength;
+ return;
+
+ case LNET_EVENT_REPLY: /* REPLY */
+ return;
+
+ case LNET_EVENT_SEND: /* active message */
+ if (msg->msg_type == LNET_MSG_PUT) {
+ ev->pt_index = le32_to_cpu(hdr->msg.put.ptl_index);
+ ev->match_bits = le64_to_cpu(hdr->msg.put.match_bits);
+ ev->offset = le32_to_cpu(hdr->msg.put.offset);
+ ev->mlength =
+ ev->rlength = le32_to_cpu(hdr->payload_length);
+ ev->hdr_data = le64_to_cpu(hdr->msg.put.hdr_data);
+
+ } else {
+ LASSERT(msg->msg_type == LNET_MSG_GET);
+ ev->pt_index = le32_to_cpu(hdr->msg.get.ptl_index);
+ ev->match_bits = le64_to_cpu(hdr->msg.get.match_bits);
+ ev->mlength =
+ ev->rlength = le32_to_cpu(hdr->msg.get.sink_length);
+ ev->offset = le32_to_cpu(hdr->msg.get.src_offset);
+ ev->hdr_data = 0;
+ }
+ return;
+ }
+}
+
+void
+lnet_msg_commit(lnet_msg_t *msg, int cpt)
+{
+ struct lnet_msg_container *container = the_lnet.ln_msg_containers[cpt];
+ lnet_counters_t *counters = the_lnet.ln_counters[cpt];
+
+ /* routed message can be committed for both receiving and sending */
+ LASSERT(!msg->msg_tx_committed);
+
+ if (msg->msg_sending) {
+ LASSERT(!msg->msg_receiving);
+
+ msg->msg_tx_cpt = cpt;
+ msg->msg_tx_committed = 1;
+ if (msg->msg_rx_committed) { /* routed message REPLY */
+ LASSERT(msg->msg_onactivelist);
+ return;
+ }
+ } else {
+ LASSERT(!msg->msg_sending);
+ msg->msg_rx_cpt = cpt;
+ msg->msg_rx_committed = 1;
+ }
+
+ LASSERT(!msg->msg_onactivelist);
+ msg->msg_onactivelist = 1;
+ list_add(&msg->msg_activelist, &container->msc_active);
+
+ counters->msgs_alloc++;
+ if (counters->msgs_alloc > counters->msgs_max)
+ counters->msgs_max = counters->msgs_alloc;
+}
+
+static void
+lnet_msg_decommit_tx(lnet_msg_t *msg, int status)
+{
+ lnet_counters_t *counters;
+ lnet_event_t *ev = &msg->msg_ev;
+
+ LASSERT(msg->msg_tx_committed);
+ if (status != 0)
+ goto out;
+
+ counters = the_lnet.ln_counters[msg->msg_tx_cpt];
+ switch (ev->type) {
+ default: /* routed message */
+ LASSERT(msg->msg_routing);
+ LASSERT(msg->msg_rx_committed);
+ LASSERT(ev->type == 0);
+
+ counters->route_length += msg->msg_len;
+ counters->route_count++;
+ goto out;
+
+ case LNET_EVENT_PUT:
+ /* should have been decommitted */
+ LASSERT(!msg->msg_rx_committed);
+ /* overwritten while sending ACK */
+ LASSERT(msg->msg_type == LNET_MSG_ACK);
+ msg->msg_type = LNET_MSG_PUT; /* fix type */
+ break;
+
+ case LNET_EVENT_SEND:
+ LASSERT(!msg->msg_rx_committed);
+ if (msg->msg_type == LNET_MSG_PUT)
+ counters->send_length += msg->msg_len;
+ break;
+
+ case LNET_EVENT_GET:
+ LASSERT(msg->msg_rx_committed);
+ /* overwritten while sending reply, we should never be
+ * here for optimized GET */
+ LASSERT(msg->msg_type == LNET_MSG_REPLY);
+ msg->msg_type = LNET_MSG_GET; /* fix type */
+ break;
+ }
+
+ counters->send_count++;
+ out:
+ lnet_return_tx_credits_locked(msg);
+ msg->msg_tx_committed = 0;
+}
+
+static void
+lnet_msg_decommit_rx(lnet_msg_t *msg, int status)
+{
+ lnet_counters_t *counters;
+ lnet_event_t *ev = &msg->msg_ev;
+
+ LASSERT(!msg->msg_tx_committed); /* decommitted or never committed */
+ LASSERT(msg->msg_rx_committed);
+
+ if (status != 0)
+ goto out;
+
+ counters = the_lnet.ln_counters[msg->msg_rx_cpt];
+ switch (ev->type) {
+ default:
+ LASSERT(ev->type == 0);
+ LASSERT(msg->msg_routing);
+ goto out;
+
+ case LNET_EVENT_ACK:
+ LASSERT(msg->msg_type == LNET_MSG_ACK);
+ break;
+
+ case LNET_EVENT_GET:
+ /* type is "REPLY" if it's an optimized GET on passive side,
+ * because optimized GET will never be committed for sending,
+ * so message type wouldn't be changed back to "GET" by
+ * lnet_msg_decommit_tx(), see details in lnet_parse_get() */
+ LASSERT(msg->msg_type == LNET_MSG_REPLY ||
+ msg->msg_type == LNET_MSG_GET);
+ counters->send_length += msg->msg_wanted;
+ break;
+
+ case LNET_EVENT_PUT:
+ LASSERT(msg->msg_type == LNET_MSG_PUT);
+ break;
+
+ case LNET_EVENT_REPLY:
+ /* type is "GET" if it's an optimized GET on active side,
+ * see details in lnet_create_reply_msg() */
+ LASSERT(msg->msg_type == LNET_MSG_GET ||
+ msg->msg_type == LNET_MSG_REPLY);
+ break;
+ }
+
+ counters->recv_count++;
+ if (ev->type == LNET_EVENT_PUT || ev->type == LNET_EVENT_REPLY)
+ counters->recv_length += msg->msg_wanted;
+
+ out:
+ lnet_return_rx_credits_locked(msg);
+ msg->msg_rx_committed = 0;
+}
+
+void
+lnet_msg_decommit(lnet_msg_t *msg, int cpt, int status)
+{
+ int cpt2 = cpt;
+
+ LASSERT(msg->msg_tx_committed || msg->msg_rx_committed);
+ LASSERT(msg->msg_onactivelist);
+
+ if (msg->msg_tx_committed) { /* always decommit for sending first */
+ LASSERT(cpt == msg->msg_tx_cpt);
+ lnet_msg_decommit_tx(msg, status);
+ }
+
+ if (msg->msg_rx_committed) {
+ /* forwarding msg committed for both receiving and sending */
+ if (cpt != msg->msg_rx_cpt) {
+ lnet_net_unlock(cpt);
+ cpt2 = msg->msg_rx_cpt;
+ lnet_net_lock(cpt2);
+ }
+ lnet_msg_decommit_rx(msg, status);
+ }
+
+ list_del(&msg->msg_activelist);
+ msg->msg_onactivelist = 0;
+
+ the_lnet.ln_counters[cpt2]->msgs_alloc--;
+
+ if (cpt2 != cpt) {
+ lnet_net_unlock(cpt2);
+ lnet_net_lock(cpt);
+ }
+}
+
+void
+lnet_msg_attach_md(lnet_msg_t *msg, lnet_libmd_t *md,
+ unsigned int offset, unsigned int mlen)
+{
+ /* NB: @offset and @len are only useful for receiving */
+ /* Here, we attach the MD on lnet_msg and mark it busy and
+ * decrementing its threshold. Come what may, the lnet_msg "owns"
+ * the MD until a call to lnet_msg_detach_md or lnet_finalize()
+ * signals completion. */
+ LASSERT(!msg->msg_routing);
+
+ msg->msg_md = md;
+ if (msg->msg_receiving) { /* committed for receiving */
+ msg->msg_offset = offset;
+ msg->msg_wanted = mlen;
+ }
+
+ md->md_refcount++;
+ if (md->md_threshold != LNET_MD_THRESH_INF) {
+ LASSERT(md->md_threshold > 0);
+ md->md_threshold--;
+ }
+
+ /* build umd in event */
+ lnet_md2handle(&msg->msg_ev.md_handle, md);
+ lnet_md_deconstruct(md, &msg->msg_ev.md);
+}
+
+void
+lnet_msg_detach_md(lnet_msg_t *msg, int status)
+{
+ lnet_libmd_t *md = msg->msg_md;
+ int unlink;
+
+ /* Now it's safe to drop my caller's ref */
+ md->md_refcount--;
+ LASSERT(md->md_refcount >= 0);
+
+ unlink = lnet_md_unlinkable(md);
+ if (md->md_eq != NULL) {
+ msg->msg_ev.status = status;
+ msg->msg_ev.unlinked = unlink;
+ lnet_eq_enqueue_event(md->md_eq, &msg->msg_ev);
+ }
+
+ if (unlink)
+ lnet_md_unlink(md);
+
+ msg->msg_md = NULL;
+}
+
+static int
+lnet_complete_msg_locked(lnet_msg_t *msg, int cpt)
+{
+ lnet_handle_wire_t ack_wmd;
+ int rc;
+ int status = msg->msg_ev.status;
+
+ LASSERT(msg->msg_onactivelist);
+
+ if (status == 0 && msg->msg_ack) {
+ /* Only send an ACK if the PUT completed successfully */
+
+ lnet_msg_decommit(msg, cpt, 0);
+
+ msg->msg_ack = 0;
+ lnet_net_unlock(cpt);
+
+ LASSERT(msg->msg_ev.type == LNET_EVENT_PUT);
+ LASSERT(!msg->msg_routing);
+
+ ack_wmd = msg->msg_hdr.msg.put.ack_wmd;
+
+ lnet_prep_send(msg, LNET_MSG_ACK, msg->msg_ev.initiator, 0, 0);
+
+ msg->msg_hdr.msg.ack.dst_wmd = ack_wmd;
+ msg->msg_hdr.msg.ack.match_bits = msg->msg_ev.match_bits;
+ msg->msg_hdr.msg.ack.mlength = cpu_to_le32(msg->msg_ev.mlength);
+
+ /* NB: we probably want to use NID of msg::msg_from as 3rd
+ * parameter (router NID) if it's routed message */
+ rc = lnet_send(msg->msg_ev.target.nid, msg, LNET_NID_ANY);
+
+ lnet_net_lock(cpt);
+ /*
+ * NB: message is committed for sending, we should return
+ * on success because LND will finalize this message later.
+ *
+ * Also, there is possibility that message is committed for
+ * sending and also failed before delivering to LND,
+ * i.e: ENOMEM, in that case we can't fall through either
+ * because CPT for sending can be different with CPT for
+ * receiving, so we should return back to lnet_finalize()
+ * to make sure we are locking the correct partition.
+ */
+ return rc;
+
+ } else if (status == 0 && /* OK so far */
+ (msg->msg_routing && !msg->msg_sending)) {
+ /* not forwarded */
+ LASSERT(!msg->msg_receiving); /* called back recv already */
+ lnet_net_unlock(cpt);
+
+ rc = lnet_send(LNET_NID_ANY, msg, LNET_NID_ANY);
+
+ lnet_net_lock(cpt);
+ /*
+ * NB: message is committed for sending, we should return
+ * on success because LND will finalize this message later.
+ *
+ * Also, there is possibility that message is committed for
+ * sending and also failed before delivering to LND,
+ * i.e: ENOMEM, in that case we can't fall through either:
+ * - The rule is message must decommit for sending first if
+ * the it's committed for both sending and receiving
+ * - CPT for sending can be different with CPT for receiving,
+ * so we should return back to lnet_finalize() to make
+ * sure we are locking the correct partition.
+ */
+ return rc;
+ }
+
+ lnet_msg_decommit(msg, cpt, status);
+ lnet_msg_free_locked(msg);
+ return 0;
+}
+
+void
+lnet_finalize(lnet_ni_t *ni, lnet_msg_t *msg, int status)
+{
+ struct lnet_msg_container *container;
+ int my_slot;
+ int cpt;
+ int rc;
+ int i;
+
+ LASSERT(!in_interrupt());
+
+ if (msg == NULL)
+ return;
+#if 0
+ CDEBUG(D_WARNING, "%s msg->%s Flags:%s%s%s%s%s%s%s%s%s%s%s txp %s rxp %s\n",
+ lnet_msgtyp2str(msg->msg_type), libcfs_id2str(msg->msg_target),
+ msg->msg_target_is_router ? "t" : "",
+ msg->msg_routing ? "X" : "",
+ msg->msg_ack ? "A" : "",
+ msg->msg_sending ? "S" : "",
+ msg->msg_receiving ? "R" : "",
+ msg->msg_delayed ? "d" : "",
+ msg->msg_txcredit ? "C" : "",
+ msg->msg_peertxcredit ? "c" : "",
+ msg->msg_rtrcredit ? "F" : "",
+ msg->msg_peerrtrcredit ? "f" : "",
+ msg->msg_onactivelist ? "!" : "",
+ msg->msg_txpeer == NULL ? "<none>" : libcfs_nid2str(msg->msg_txpeer->lp_nid),
+ msg->msg_rxpeer == NULL ? "<none>" : libcfs_nid2str(msg->msg_rxpeer->lp_nid));
+#endif
+ msg->msg_ev.status = status;
+
+ if (msg->msg_md != NULL) {
+ cpt = lnet_cpt_of_cookie(msg->msg_md->md_lh.lh_cookie);
+
+ lnet_res_lock(cpt);
+ lnet_msg_detach_md(msg, status);
+ lnet_res_unlock(cpt);
+ }
+
+ again:
+ rc = 0;
+ if (!msg->msg_tx_committed && !msg->msg_rx_committed) {
+ /* not committed to network yet */
+ LASSERT(!msg->msg_onactivelist);
+ lnet_msg_free(msg);
+ return;
+ }
+
+ /*
+ * NB: routed message can be committed for both receiving and sending,
+ * we should finalize in LIFO order and keep counters correct.
+ * (finalize sending first then finalize receiving)
+ */
+ cpt = msg->msg_tx_committed ? msg->msg_tx_cpt : msg->msg_rx_cpt;
+ lnet_net_lock(cpt);
+
+ container = the_lnet.ln_msg_containers[cpt];
+ list_add_tail(&msg->msg_list, &container->msc_finalizing);
+
+ /* Recursion breaker. Don't complete the message here if I am (or
+ * enough other threads are) already completing messages */
+
+ my_slot = -1;
+ for (i = 0; i < container->msc_nfinalizers; i++) {
+ if (container->msc_finalizers[i] == current)
+ break;
+
+ if (my_slot < 0 && container->msc_finalizers[i] == NULL)
+ my_slot = i;
+ }
+
+ if (i < container->msc_nfinalizers || my_slot < 0) {
+ lnet_net_unlock(cpt);
+ return;
+ }
+
+ container->msc_finalizers[my_slot] = current;
+
+ while (!list_empty(&container->msc_finalizing)) {
+ msg = list_entry(container->msc_finalizing.next,
+ lnet_msg_t, msg_list);
+
+ list_del(&msg->msg_list);
+
+ /* NB drops and regains the lnet lock if it actually does
+ * anything, so my finalizing friends can chomp along too */
+ rc = lnet_complete_msg_locked(msg, cpt);
+ if (rc != 0)
+ break;
+ }
+
+ container->msc_finalizers[my_slot] = NULL;
+ lnet_net_unlock(cpt);
+
+ if (rc != 0)
+ goto again;
+}
+EXPORT_SYMBOL(lnet_finalize);
+
+void
+lnet_msg_container_cleanup(struct lnet_msg_container *container)
+{
+ int count = 0;
+
+ if (container->msc_init == 0)
+ return;
+
+ while (!list_empty(&container->msc_active)) {
+ lnet_msg_t *msg = list_entry(container->msc_active.next,
+ lnet_msg_t, msg_activelist);
+
+ LASSERT(msg->msg_onactivelist);
+ msg->msg_onactivelist = 0;
+ list_del(&msg->msg_activelist);
+ lnet_msg_free(msg);
+ count++;
+ }
+
+ if (count > 0)
+ CERROR("%d active msg on exit\n", count);
+
+ if (container->msc_finalizers != NULL) {
+ LIBCFS_FREE(container->msc_finalizers,
+ container->msc_nfinalizers *
+ sizeof(*container->msc_finalizers));
+ container->msc_finalizers = NULL;
+ }
+#ifdef LNET_USE_LIB_FREELIST
+ lnet_freelist_fini(&container->msc_freelist);
+#endif
+ container->msc_init = 0;
+}
+
+int
+lnet_msg_container_setup(struct lnet_msg_container *container, int cpt)
+{
+ int rc;
+
+ container->msc_init = 1;
+
+ INIT_LIST_HEAD(&container->msc_active);
+ INIT_LIST_HEAD(&container->msc_finalizing);
+
+#ifdef LNET_USE_LIB_FREELIST
+ memset(&container->msc_freelist, 0, sizeof(lnet_freelist_t));
+
+ rc = lnet_freelist_init(&container->msc_freelist,
+ LNET_FL_MAX_MSGS, sizeof(lnet_msg_t));
+ if (rc != 0) {
+ CERROR("Failed to init freelist for message container\n");
+ lnet_msg_container_cleanup(container);
+ return rc;
+ }
+#else
+ rc = 0;
+#endif
+ /* number of CPUs */
+ container->msc_nfinalizers = cfs_cpt_weight(lnet_cpt_table(), cpt);
+
+ LIBCFS_CPT_ALLOC(container->msc_finalizers, lnet_cpt_table(), cpt,
+ container->msc_nfinalizers *
+ sizeof(*container->msc_finalizers));
+
+ if (container->msc_finalizers == NULL) {
+ CERROR("Failed to allocate message finalizers\n");
+ lnet_msg_container_cleanup(container);
+ return -ENOMEM;
+ }
+
+ return rc;
+}
+
+void
+lnet_msg_containers_destroy(void)
+{
+ struct lnet_msg_container *container;
+ int i;
+
+ if (the_lnet.ln_msg_containers == NULL)
+ return;
+
+ cfs_percpt_for_each(container, i, the_lnet.ln_msg_containers)
+ lnet_msg_container_cleanup(container);
+
+ cfs_percpt_free(the_lnet.ln_msg_containers);
+ the_lnet.ln_msg_containers = NULL;
+}
+
+int
+lnet_msg_containers_create(void)
+{
+ struct lnet_msg_container *container;
+ int rc;
+ int i;
+
+ the_lnet.ln_msg_containers = cfs_percpt_alloc(lnet_cpt_table(),
+ sizeof(*container));
+
+ if (the_lnet.ln_msg_containers == NULL) {
+ CERROR("Failed to allocate cpu-partition data for network\n");
+ return -ENOMEM;
+ }
+
+ cfs_percpt_for_each(container, i, the_lnet.ln_msg_containers) {
+ rc = lnet_msg_container_setup(container, i);
+ if (rc != 0) {
+ lnet_msg_containers_destroy();
+ return rc;
+ }
+ }
+
+ return 0;
+}
diff --git a/kernel/drivers/staging/lustre/lnet/lnet/lib-ptl.c b/kernel/drivers/staging/lustre/lnet/lnet/lib-ptl.c
new file mode 100644
index 000000000..3ba0da919
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lnet/lnet/lib-ptl.c
@@ -0,0 +1,935 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/lib-ptl.c
+ *
+ * portal & match routines
+ *
+ * Author: liang@whamcloud.com
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include "../../include/linux/lnet/lib-lnet.h"
+
+/* NB: add /proc interfaces in upcoming patches */
+int portal_rotor = LNET_PTL_ROTOR_HASH_RT;
+module_param(portal_rotor, int, 0644);
+MODULE_PARM_DESC(portal_rotor, "redirect PUTs to different cpu-partitions");
+
+static int
+lnet_ptl_match_type(unsigned int index, lnet_process_id_t match_id,
+ __u64 mbits, __u64 ignore_bits)
+{
+ struct lnet_portal *ptl = the_lnet.ln_portals[index];
+ int unique;
+
+ unique = ignore_bits == 0 &&
+ match_id.nid != LNET_NID_ANY &&
+ match_id.pid != LNET_PID_ANY;
+
+ LASSERT(!lnet_ptl_is_unique(ptl) || !lnet_ptl_is_wildcard(ptl));
+
+ /* prefer to check w/o any lock */
+ if (likely(lnet_ptl_is_unique(ptl) || lnet_ptl_is_wildcard(ptl)))
+ goto match;
+
+ /* unset, new portal */
+ lnet_ptl_lock(ptl);
+ /* check again with lock */
+ if (unlikely(lnet_ptl_is_unique(ptl) || lnet_ptl_is_wildcard(ptl))) {
+ lnet_ptl_unlock(ptl);
+ goto match;
+ }
+
+ /* still not set */
+ if (unique)
+ lnet_ptl_setopt(ptl, LNET_PTL_MATCH_UNIQUE);
+ else
+ lnet_ptl_setopt(ptl, LNET_PTL_MATCH_WILDCARD);
+
+ lnet_ptl_unlock(ptl);
+
+ return 1;
+
+ match:
+ if ((lnet_ptl_is_unique(ptl) && !unique) ||
+ (lnet_ptl_is_wildcard(ptl) && unique))
+ return 0;
+ return 1;
+}
+
+static void
+lnet_ptl_enable_mt(struct lnet_portal *ptl, int cpt)
+{
+ struct lnet_match_table *mtable = ptl->ptl_mtables[cpt];
+ int i;
+
+ /* with hold of both lnet_res_lock(cpt) and lnet_ptl_lock */
+ LASSERT(lnet_ptl_is_wildcard(ptl));
+
+ mtable->mt_enabled = 1;
+
+ ptl->ptl_mt_maps[ptl->ptl_mt_nmaps] = cpt;
+ for (i = ptl->ptl_mt_nmaps - 1; i >= 0; i--) {
+ LASSERT(ptl->ptl_mt_maps[i] != cpt);
+ if (ptl->ptl_mt_maps[i] < cpt)
+ break;
+
+ /* swap to order */
+ ptl->ptl_mt_maps[i + 1] = ptl->ptl_mt_maps[i];
+ ptl->ptl_mt_maps[i] = cpt;
+ }
+
+ ptl->ptl_mt_nmaps++;
+}
+
+static void
+lnet_ptl_disable_mt(struct lnet_portal *ptl, int cpt)
+{
+ struct lnet_match_table *mtable = ptl->ptl_mtables[cpt];
+ int i;
+
+ /* with hold of both lnet_res_lock(cpt) and lnet_ptl_lock */
+ LASSERT(lnet_ptl_is_wildcard(ptl));
+
+ if (LNET_CPT_NUMBER == 1)
+ return; /* never disable the only match-table */
+
+ mtable->mt_enabled = 0;
+
+ LASSERT(ptl->ptl_mt_nmaps > 0 &&
+ ptl->ptl_mt_nmaps <= LNET_CPT_NUMBER);
+
+ /* remove it from mt_maps */
+ ptl->ptl_mt_nmaps--;
+ for (i = 0; i < ptl->ptl_mt_nmaps; i++) {
+ if (ptl->ptl_mt_maps[i] >= cpt) /* overwrite it */
+ ptl->ptl_mt_maps[i] = ptl->ptl_mt_maps[i + 1];
+ }
+}
+
+static int
+lnet_try_match_md(lnet_libmd_t *md,
+ struct lnet_match_info *info, struct lnet_msg *msg)
+{
+ /* ALWAYS called holding the lnet_res_lock, and can't lnet_res_unlock;
+ * lnet_match_blocked_msg() relies on this to avoid races */
+ unsigned int offset;
+ unsigned int mlength;
+ lnet_me_t *me = md->md_me;
+
+ /* MD exhausted */
+ if (lnet_md_exhausted(md))
+ return LNET_MATCHMD_NONE | LNET_MATCHMD_EXHAUSTED;
+
+ /* mismatched MD op */
+ if ((md->md_options & info->mi_opc) == 0)
+ return LNET_MATCHMD_NONE;
+
+ /* mismatched ME nid/pid? */
+ if (me->me_match_id.nid != LNET_NID_ANY &&
+ me->me_match_id.nid != info->mi_id.nid)
+ return LNET_MATCHMD_NONE;
+
+ if (me->me_match_id.pid != LNET_PID_ANY &&
+ me->me_match_id.pid != info->mi_id.pid)
+ return LNET_MATCHMD_NONE;
+
+ /* mismatched ME matchbits? */
+ if (((me->me_match_bits ^ info->mi_mbits) & ~me->me_ignore_bits) != 0)
+ return LNET_MATCHMD_NONE;
+
+ /* Hurrah! This _is_ a match; check it out... */
+
+ if ((md->md_options & LNET_MD_MANAGE_REMOTE) == 0)
+ offset = md->md_offset;
+ else
+ offset = info->mi_roffset;
+
+ if ((md->md_options & LNET_MD_MAX_SIZE) != 0) {
+ mlength = md->md_max_size;
+ LASSERT(md->md_offset + mlength <= md->md_length);
+ } else {
+ mlength = md->md_length - offset;
+ }
+
+ if (info->mi_rlength <= mlength) { /* fits in allowed space */
+ mlength = info->mi_rlength;
+ } else if ((md->md_options & LNET_MD_TRUNCATE) == 0) {
+ /* this packet _really_ is too big */
+ CERROR("Matching packet from %s, match %llu length %d too big: %d left, %d allowed\n",
+ libcfs_id2str(info->mi_id), info->mi_mbits,
+ info->mi_rlength, md->md_length - offset, mlength);
+
+ return LNET_MATCHMD_DROP;
+ }
+
+ /* Commit to this ME/MD */
+ CDEBUG(D_NET, "Incoming %s index %x from %s of length %d/%d into md %#llx [%d] + %d\n",
+ (info->mi_opc == LNET_MD_OP_PUT) ? "put" : "get",
+ info->mi_portal, libcfs_id2str(info->mi_id), mlength,
+ info->mi_rlength, md->md_lh.lh_cookie, md->md_niov, offset);
+
+ lnet_msg_attach_md(msg, md, offset, mlength);
+ md->md_offset = offset + mlength;
+
+ if (!lnet_md_exhausted(md))
+ return LNET_MATCHMD_OK;
+
+ /* Auto-unlink NOW, so the ME gets unlinked if required.
+ * We bumped md->md_refcount above so the MD just gets flagged
+ * for unlink when it is finalized. */
+ if ((md->md_flags & LNET_MD_FLAG_AUTO_UNLINK) != 0)
+ lnet_md_unlink(md);
+
+ return LNET_MATCHMD_OK | LNET_MATCHMD_EXHAUSTED;
+}
+
+static struct lnet_match_table *
+lnet_match2mt(struct lnet_portal *ptl, lnet_process_id_t id, __u64 mbits)
+{
+ if (LNET_CPT_NUMBER == 1)
+ return ptl->ptl_mtables[0]; /* the only one */
+
+ /* if it's a unique portal, return match-table hashed by NID */
+ return lnet_ptl_is_unique(ptl) ?
+ ptl->ptl_mtables[lnet_cpt_of_nid(id.nid)] : NULL;
+}
+
+struct lnet_match_table *
+lnet_mt_of_attach(unsigned int index, lnet_process_id_t id,
+ __u64 mbits, __u64 ignore_bits, lnet_ins_pos_t pos)
+{
+ struct lnet_portal *ptl;
+ struct lnet_match_table *mtable;
+
+ /* NB: called w/o lock */
+ LASSERT(index < the_lnet.ln_nportals);
+
+ if (!lnet_ptl_match_type(index, id, mbits, ignore_bits))
+ return NULL;
+
+ ptl = the_lnet.ln_portals[index];
+
+ mtable = lnet_match2mt(ptl, id, mbits);
+ if (mtable != NULL) /* unique portal or only one match-table */
+ return mtable;
+
+ /* it's a wildcard portal */
+ switch (pos) {
+ default:
+ return NULL;
+ case LNET_INS_BEFORE:
+ case LNET_INS_AFTER:
+ /* posted by no affinity thread, always hash to specific
+ * match-table to avoid buffer stealing which is heavy */
+ return ptl->ptl_mtables[ptl->ptl_index % LNET_CPT_NUMBER];
+ case LNET_INS_LOCAL:
+ /* posted by cpu-affinity thread */
+ return ptl->ptl_mtables[lnet_cpt_current()];
+ }
+}
+
+static struct lnet_match_table *
+lnet_mt_of_match(struct lnet_match_info *info, struct lnet_msg *msg)
+{
+ struct lnet_match_table *mtable;
+ struct lnet_portal *ptl;
+ unsigned int nmaps;
+ unsigned int rotor;
+ unsigned int cpt;
+ bool routed;
+
+ /* NB: called w/o lock */
+ LASSERT(info->mi_portal < the_lnet.ln_nportals);
+ ptl = the_lnet.ln_portals[info->mi_portal];
+
+ LASSERT(lnet_ptl_is_wildcard(ptl) || lnet_ptl_is_unique(ptl));
+
+ mtable = lnet_match2mt(ptl, info->mi_id, info->mi_mbits);
+ if (mtable != NULL)
+ return mtable;
+
+ /* it's a wildcard portal */
+ routed = LNET_NIDNET(msg->msg_hdr.src_nid) !=
+ LNET_NIDNET(msg->msg_hdr.dest_nid);
+
+ if (portal_rotor == LNET_PTL_ROTOR_OFF ||
+ (portal_rotor != LNET_PTL_ROTOR_ON && !routed)) {
+ cpt = lnet_cpt_current();
+ if (ptl->ptl_mtables[cpt]->mt_enabled)
+ return ptl->ptl_mtables[cpt];
+ }
+
+ rotor = ptl->ptl_rotor++; /* get round-robin factor */
+ if (portal_rotor == LNET_PTL_ROTOR_HASH_RT && routed)
+ cpt = lnet_cpt_of_nid(msg->msg_hdr.src_nid);
+ else
+ cpt = rotor % LNET_CPT_NUMBER;
+
+ if (!ptl->ptl_mtables[cpt]->mt_enabled) {
+ /* is there any active entry for this portal? */
+ nmaps = ptl->ptl_mt_nmaps;
+ /* map to an active mtable to avoid heavy "stealing" */
+ if (nmaps != 0) {
+ /* NB: there is possibility that ptl_mt_maps is being
+ * changed because we are not under protection of
+ * lnet_ptl_lock, but it shouldn't hurt anything */
+ cpt = ptl->ptl_mt_maps[rotor % nmaps];
+ }
+ }
+
+ return ptl->ptl_mtables[cpt];
+}
+
+static int
+lnet_mt_test_exhausted(struct lnet_match_table *mtable, int pos)
+{
+ __u64 *bmap;
+ int i;
+
+ if (!lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal]))
+ return 0;
+
+ if (pos < 0) { /* check all bits */
+ for (i = 0; i < LNET_MT_EXHAUSTED_BMAP; i++) {
+ if (mtable->mt_exhausted[i] != (__u64)(-1))
+ return 0;
+ }
+ return 1;
+ }
+
+ LASSERT(pos <= LNET_MT_HASH_IGNORE);
+ /* mtable::mt_mhash[pos] is marked as exhausted or not */
+ bmap = &mtable->mt_exhausted[pos >> LNET_MT_BITS_U64];
+ pos &= (1 << LNET_MT_BITS_U64) - 1;
+
+ return ((*bmap) & (1ULL << pos)) != 0;
+}
+
+static void
+lnet_mt_set_exhausted(struct lnet_match_table *mtable, int pos, int exhausted)
+{
+ __u64 *bmap;
+
+ LASSERT(lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal]));
+ LASSERT(pos <= LNET_MT_HASH_IGNORE);
+
+ /* set mtable::mt_mhash[pos] as exhausted/non-exhausted */
+ bmap = &mtable->mt_exhausted[pos >> LNET_MT_BITS_U64];
+ pos &= (1 << LNET_MT_BITS_U64) - 1;
+
+ if (!exhausted)
+ *bmap &= ~(1ULL << pos);
+ else
+ *bmap |= 1ULL << pos;
+}
+
+struct list_head *
+lnet_mt_match_head(struct lnet_match_table *mtable,
+ lnet_process_id_t id, __u64 mbits)
+{
+ struct lnet_portal *ptl = the_lnet.ln_portals[mtable->mt_portal];
+
+ if (lnet_ptl_is_wildcard(ptl)) {
+ return &mtable->mt_mhash[mbits & LNET_MT_HASH_MASK];
+ } else {
+ unsigned long hash = mbits + id.nid + id.pid;
+
+ LASSERT(lnet_ptl_is_unique(ptl));
+ hash = hash_long(hash, LNET_MT_HASH_BITS);
+ return &mtable->mt_mhash[hash];
+ }
+}
+
+int
+lnet_mt_match_md(struct lnet_match_table *mtable,
+ struct lnet_match_info *info, struct lnet_msg *msg)
+{
+ struct list_head *head;
+ lnet_me_t *me;
+ lnet_me_t *tmp;
+ int exhausted = 0;
+ int rc;
+
+ /* any ME with ignore bits? */
+ if (!list_empty(&mtable->mt_mhash[LNET_MT_HASH_IGNORE]))
+ head = &mtable->mt_mhash[LNET_MT_HASH_IGNORE];
+ else
+ head = lnet_mt_match_head(mtable, info->mi_id, info->mi_mbits);
+ again:
+ /* NB: only wildcard portal needs to return LNET_MATCHMD_EXHAUSTED */
+ if (lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal]))
+ exhausted = LNET_MATCHMD_EXHAUSTED;
+
+ list_for_each_entry_safe(me, tmp, head, me_list) {
+ /* ME attached but MD not attached yet */
+ if (me->me_md == NULL)
+ continue;
+
+ LASSERT(me == me->me_md->md_me);
+
+ rc = lnet_try_match_md(me->me_md, info, msg);
+ if ((rc & LNET_MATCHMD_EXHAUSTED) == 0)
+ exhausted = 0; /* mlist is not empty */
+
+ if ((rc & LNET_MATCHMD_FINISH) != 0) {
+ /* don't return EXHAUSTED bit because we don't know
+ * whether the mlist is empty or not */
+ return rc & ~LNET_MATCHMD_EXHAUSTED;
+ }
+ }
+
+ if (exhausted == LNET_MATCHMD_EXHAUSTED) { /* @head is exhausted */
+ lnet_mt_set_exhausted(mtable, head - mtable->mt_mhash, 1);
+ if (!lnet_mt_test_exhausted(mtable, -1))
+ exhausted = 0;
+ }
+
+ if (exhausted == 0 && head == &mtable->mt_mhash[LNET_MT_HASH_IGNORE]) {
+ head = lnet_mt_match_head(mtable, info->mi_id, info->mi_mbits);
+ goto again; /* re-check MEs w/o ignore-bits */
+ }
+
+ if (info->mi_opc == LNET_MD_OP_GET ||
+ !lnet_ptl_is_lazy(the_lnet.ln_portals[info->mi_portal]))
+ return LNET_MATCHMD_DROP | exhausted;
+
+ return LNET_MATCHMD_NONE | exhausted;
+}
+
+static int
+lnet_ptl_match_early(struct lnet_portal *ptl, struct lnet_msg *msg)
+{
+ int rc;
+
+ /* message arrived before any buffer posting on this portal,
+ * simply delay or drop this message */
+ if (likely(lnet_ptl_is_wildcard(ptl) || lnet_ptl_is_unique(ptl)))
+ return 0;
+
+ lnet_ptl_lock(ptl);
+ /* check it again with hold of lock */
+ if (lnet_ptl_is_wildcard(ptl) || lnet_ptl_is_unique(ptl)) {
+ lnet_ptl_unlock(ptl);
+ return 0;
+ }
+
+ if (lnet_ptl_is_lazy(ptl)) {
+ if (msg->msg_rx_ready_delay) {
+ msg->msg_rx_delayed = 1;
+ list_add_tail(&msg->msg_list,
+ &ptl->ptl_msg_delayed);
+ }
+ rc = LNET_MATCHMD_NONE;
+ } else {
+ rc = LNET_MATCHMD_DROP;
+ }
+
+ lnet_ptl_unlock(ptl);
+ return rc;
+}
+
+static int
+lnet_ptl_match_delay(struct lnet_portal *ptl,
+ struct lnet_match_info *info, struct lnet_msg *msg)
+{
+ int first = ptl->ptl_mt_maps[0]; /* read w/o lock */
+ int rc = 0;
+ int i;
+
+ /* steal buffer from other CPTs, and delay it if nothing to steal,
+ * this function is more expensive than a regular match, but we
+ * don't expect it can happen a lot */
+ LASSERT(lnet_ptl_is_wildcard(ptl));
+
+ for (i = 0; i < LNET_CPT_NUMBER; i++) {
+ struct lnet_match_table *mtable;
+ int cpt;
+
+ cpt = (first + i) % LNET_CPT_NUMBER;
+ mtable = ptl->ptl_mtables[cpt];
+ if (i != 0 && i != LNET_CPT_NUMBER - 1 && !mtable->mt_enabled)
+ continue;
+
+ lnet_res_lock(cpt);
+ lnet_ptl_lock(ptl);
+
+ if (i == 0) { /* the first try, attach on stealing list */
+ list_add_tail(&msg->msg_list,
+ &ptl->ptl_msg_stealing);
+ }
+
+ if (!list_empty(&msg->msg_list)) { /* on stealing list */
+ rc = lnet_mt_match_md(mtable, info, msg);
+
+ if ((rc & LNET_MATCHMD_EXHAUSTED) != 0 &&
+ mtable->mt_enabled)
+ lnet_ptl_disable_mt(ptl, cpt);
+
+ if ((rc & LNET_MATCHMD_FINISH) != 0)
+ list_del_init(&msg->msg_list);
+
+ } else {
+ /* could be matched by lnet_ptl_attach_md()
+ * which is called by another thread */
+ rc = msg->msg_md == NULL ?
+ LNET_MATCHMD_DROP : LNET_MATCHMD_OK;
+ }
+
+ if (!list_empty(&msg->msg_list) && /* not matched yet */
+ (i == LNET_CPT_NUMBER - 1 || /* the last CPT */
+ ptl->ptl_mt_nmaps == 0 || /* no active CPT */
+ (ptl->ptl_mt_nmaps == 1 && /* the only active CPT */
+ ptl->ptl_mt_maps[0] == cpt))) {
+ /* nothing to steal, delay or drop */
+ list_del_init(&msg->msg_list);
+
+ if (lnet_ptl_is_lazy(ptl)) {
+ msg->msg_rx_delayed = 1;
+ list_add_tail(&msg->msg_list,
+ &ptl->ptl_msg_delayed);
+ rc = LNET_MATCHMD_NONE;
+ } else {
+ rc = LNET_MATCHMD_DROP;
+ }
+ }
+
+ lnet_ptl_unlock(ptl);
+ lnet_res_unlock(cpt);
+
+ if ((rc & LNET_MATCHMD_FINISH) != 0 || msg->msg_rx_delayed)
+ break;
+ }
+
+ return rc;
+}
+
+int
+lnet_ptl_match_md(struct lnet_match_info *info, struct lnet_msg *msg)
+{
+ struct lnet_match_table *mtable;
+ struct lnet_portal *ptl;
+ int rc;
+
+ CDEBUG(D_NET, "Request from %s of length %d into portal %d MB=%#llx\n",
+ libcfs_id2str(info->mi_id), info->mi_rlength, info->mi_portal,
+ info->mi_mbits);
+
+ if (info->mi_portal >= the_lnet.ln_nportals) {
+ CERROR("Invalid portal %d not in [0-%d]\n",
+ info->mi_portal, the_lnet.ln_nportals);
+ return LNET_MATCHMD_DROP;
+ }
+
+ ptl = the_lnet.ln_portals[info->mi_portal];
+ rc = lnet_ptl_match_early(ptl, msg);
+ if (rc != 0) /* matched or delayed early message */
+ return rc;
+
+ mtable = lnet_mt_of_match(info, msg);
+ lnet_res_lock(mtable->mt_cpt);
+
+ if (the_lnet.ln_shutdown) {
+ rc = LNET_MATCHMD_DROP;
+ goto out1;
+ }
+
+ rc = lnet_mt_match_md(mtable, info, msg);
+ if ((rc & LNET_MATCHMD_EXHAUSTED) != 0 && mtable->mt_enabled) {
+ lnet_ptl_lock(ptl);
+ lnet_ptl_disable_mt(ptl, mtable->mt_cpt);
+ lnet_ptl_unlock(ptl);
+ }
+
+ if ((rc & LNET_MATCHMD_FINISH) != 0) /* matched or dropping */
+ goto out1;
+
+ if (!msg->msg_rx_ready_delay)
+ goto out1;
+
+ LASSERT(lnet_ptl_is_lazy(ptl));
+ LASSERT(!msg->msg_rx_delayed);
+
+ /* NB: we don't expect "delay" can happen a lot */
+ if (lnet_ptl_is_unique(ptl) || LNET_CPT_NUMBER == 1) {
+ lnet_ptl_lock(ptl);
+
+ msg->msg_rx_delayed = 1;
+ list_add_tail(&msg->msg_list, &ptl->ptl_msg_delayed);
+
+ lnet_ptl_unlock(ptl);
+ lnet_res_unlock(mtable->mt_cpt);
+
+ } else {
+ lnet_res_unlock(mtable->mt_cpt);
+ rc = lnet_ptl_match_delay(ptl, info, msg);
+ }
+
+ if (msg->msg_rx_delayed) {
+ CDEBUG(D_NET,
+ "Delaying %s from %s ptl %d MB %#llx off %d len %d\n",
+ info->mi_opc == LNET_MD_OP_PUT ? "PUT" : "GET",
+ libcfs_id2str(info->mi_id), info->mi_portal,
+ info->mi_mbits, info->mi_roffset, info->mi_rlength);
+ }
+ goto out0;
+ out1:
+ lnet_res_unlock(mtable->mt_cpt);
+ out0:
+ /* EXHAUSTED bit is only meaningful for internal functions */
+ return rc & ~LNET_MATCHMD_EXHAUSTED;
+}
+
+void
+lnet_ptl_detach_md(lnet_me_t *me, lnet_libmd_t *md)
+{
+ LASSERT(me->me_md == md && md->md_me == me);
+
+ me->me_md = NULL;
+ md->md_me = NULL;
+}
+
+/* called with lnet_res_lock held */
+void
+lnet_ptl_attach_md(lnet_me_t *me, lnet_libmd_t *md,
+ struct list_head *matches, struct list_head *drops)
+{
+ struct lnet_portal *ptl = the_lnet.ln_portals[me->me_portal];
+ struct lnet_match_table *mtable;
+ struct list_head *head;
+ lnet_msg_t *tmp;
+ lnet_msg_t *msg;
+ int exhausted = 0;
+ int cpt;
+
+ LASSERT(md->md_refcount == 0); /* a brand new MD */
+
+ me->me_md = md;
+ md->md_me = me;
+
+ cpt = lnet_cpt_of_cookie(md->md_lh.lh_cookie);
+ mtable = ptl->ptl_mtables[cpt];
+
+ if (list_empty(&ptl->ptl_msg_stealing) &&
+ list_empty(&ptl->ptl_msg_delayed) &&
+ !lnet_mt_test_exhausted(mtable, me->me_pos))
+ return;
+
+ lnet_ptl_lock(ptl);
+ head = &ptl->ptl_msg_stealing;
+ again:
+ list_for_each_entry_safe(msg, tmp, head, msg_list) {
+ struct lnet_match_info info;
+ lnet_hdr_t *hdr;
+ int rc;
+
+ LASSERT(msg->msg_rx_delayed || head == &ptl->ptl_msg_stealing);
+
+ hdr = &msg->msg_hdr;
+ info.mi_id.nid = hdr->src_nid;
+ info.mi_id.pid = hdr->src_pid;
+ info.mi_opc = LNET_MD_OP_PUT;
+ info.mi_portal = hdr->msg.put.ptl_index;
+ info.mi_rlength = hdr->payload_length;
+ info.mi_roffset = hdr->msg.put.offset;
+ info.mi_mbits = hdr->msg.put.match_bits;
+
+ rc = lnet_try_match_md(md, &info, msg);
+
+ exhausted = (rc & LNET_MATCHMD_EXHAUSTED) != 0;
+ if ((rc & LNET_MATCHMD_NONE) != 0) {
+ if (exhausted)
+ break;
+ continue;
+ }
+
+ /* Hurrah! This _is_ a match */
+ LASSERT((rc & LNET_MATCHMD_FINISH) != 0);
+ list_del_init(&msg->msg_list);
+
+ if (head == &ptl->ptl_msg_stealing) {
+ if (exhausted)
+ break;
+ /* stealing thread will handle the message */
+ continue;
+ }
+
+ if ((rc & LNET_MATCHMD_OK) != 0) {
+ list_add_tail(&msg->msg_list, matches);
+
+ CDEBUG(D_NET, "Resuming delayed PUT from %s portal %d match %llu offset %d length %d.\n",
+ libcfs_id2str(info.mi_id),
+ info.mi_portal, info.mi_mbits,
+ info.mi_roffset, info.mi_rlength);
+ } else {
+ list_add_tail(&msg->msg_list, drops);
+ }
+
+ if (exhausted)
+ break;
+ }
+
+ if (!exhausted && head == &ptl->ptl_msg_stealing) {
+ head = &ptl->ptl_msg_delayed;
+ goto again;
+ }
+
+ if (lnet_ptl_is_wildcard(ptl) && !exhausted) {
+ lnet_mt_set_exhausted(mtable, me->me_pos, 0);
+ if (!mtable->mt_enabled)
+ lnet_ptl_enable_mt(ptl, cpt);
+ }
+
+ lnet_ptl_unlock(ptl);
+}
+
+static void
+lnet_ptl_cleanup(struct lnet_portal *ptl)
+{
+ struct lnet_match_table *mtable;
+ int i;
+
+ if (ptl->ptl_mtables == NULL) /* uninitialized portal */
+ return;
+
+ LASSERT(list_empty(&ptl->ptl_msg_delayed));
+ LASSERT(list_empty(&ptl->ptl_msg_stealing));
+ cfs_percpt_for_each(mtable, i, ptl->ptl_mtables) {
+ struct list_head *mhash;
+ lnet_me_t *me;
+ int j;
+
+ if (mtable->mt_mhash == NULL) /* uninitialized match-table */
+ continue;
+
+ mhash = mtable->mt_mhash;
+ /* cleanup ME */
+ for (j = 0; j < LNET_MT_HASH_SIZE + 1; j++) {
+ while (!list_empty(&mhash[j])) {
+ me = list_entry(mhash[j].next,
+ lnet_me_t, me_list);
+ CERROR("Active ME %p on exit\n", me);
+ list_del(&me->me_list);
+ lnet_me_free(me);
+ }
+ }
+ /* the extra entry is for MEs with ignore bits */
+ LIBCFS_FREE(mhash, sizeof(*mhash) * (LNET_MT_HASH_SIZE + 1));
+ }
+
+ cfs_percpt_free(ptl->ptl_mtables);
+ ptl->ptl_mtables = NULL;
+}
+
+static int
+lnet_ptl_setup(struct lnet_portal *ptl, int index)
+{
+ struct lnet_match_table *mtable;
+ struct list_head *mhash;
+ int i;
+ int j;
+
+ ptl->ptl_mtables = cfs_percpt_alloc(lnet_cpt_table(),
+ sizeof(struct lnet_match_table));
+ if (ptl->ptl_mtables == NULL) {
+ CERROR("Failed to create match table for portal %d\n", index);
+ return -ENOMEM;
+ }
+
+ ptl->ptl_index = index;
+ INIT_LIST_HEAD(&ptl->ptl_msg_delayed);
+ INIT_LIST_HEAD(&ptl->ptl_msg_stealing);
+ spin_lock_init(&ptl->ptl_lock);
+ cfs_percpt_for_each(mtable, i, ptl->ptl_mtables) {
+ /* the extra entry is for MEs with ignore bits */
+ LIBCFS_CPT_ALLOC(mhash, lnet_cpt_table(), i,
+ sizeof(*mhash) * (LNET_MT_HASH_SIZE + 1));
+ if (mhash == NULL) {
+ CERROR("Failed to create match hash for portal %d\n",
+ index);
+ goto failed;
+ }
+
+ memset(&mtable->mt_exhausted[0], -1,
+ sizeof(mtable->mt_exhausted[0]) *
+ LNET_MT_EXHAUSTED_BMAP);
+ mtable->mt_mhash = mhash;
+ for (j = 0; j < LNET_MT_HASH_SIZE + 1; j++)
+ INIT_LIST_HEAD(&mhash[j]);
+
+ mtable->mt_portal = index;
+ mtable->mt_cpt = i;
+ }
+
+ return 0;
+ failed:
+ lnet_ptl_cleanup(ptl);
+ return -ENOMEM;
+}
+
+void
+lnet_portals_destroy(void)
+{
+ int i;
+
+ if (the_lnet.ln_portals == NULL)
+ return;
+
+ for (i = 0; i < the_lnet.ln_nportals; i++)
+ lnet_ptl_cleanup(the_lnet.ln_portals[i]);
+
+ cfs_array_free(the_lnet.ln_portals);
+ the_lnet.ln_portals = NULL;
+}
+
+int
+lnet_portals_create(void)
+{
+ int size;
+ int i;
+
+ size = offsetof(struct lnet_portal, ptl_mt_maps[LNET_CPT_NUMBER]);
+
+ the_lnet.ln_nportals = MAX_PORTALS;
+ the_lnet.ln_portals = cfs_array_alloc(the_lnet.ln_nportals, size);
+ if (the_lnet.ln_portals == NULL) {
+ CERROR("Failed to allocate portals table\n");
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < the_lnet.ln_nportals; i++) {
+ if (lnet_ptl_setup(the_lnet.ln_portals[i], i)) {
+ lnet_portals_destroy();
+ return -ENOMEM;
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * Turn on the lazy portal attribute. Use with caution!
+ *
+ * This portal attribute only affects incoming PUT requests to the portal,
+ * and is off by default. By default, if there's no matching MD for an
+ * incoming PUT request, it is simply dropped. With the lazy attribute on,
+ * such requests are queued indefinitely until either a matching MD is
+ * posted to the portal or the lazy attribute is turned off.
+ *
+ * It would prevent dropped requests, however it should be regarded as the
+ * last line of defense - i.e. users must keep a close watch on active
+ * buffers on a lazy portal and once it becomes too low post more buffers as
+ * soon as possible. This is because delayed requests usually have detrimental
+ * effects on underlying network connections. A few delayed requests often
+ * suffice to bring an underlying connection to a complete halt, due to flow
+ * control mechanisms.
+ *
+ * There's also a DOS attack risk. If users don't post match-all MDs on a
+ * lazy portal, a malicious peer can easily stop a service by sending some
+ * PUT requests with match bits that won't match any MD. A routed server is
+ * especially vulnerable since the connections to its neighbor routers are
+ * shared among all clients.
+ *
+ * \param portal Index of the portal to enable the lazy attribute on.
+ *
+ * \retval 0 On success.
+ * \retval -EINVAL If \a portal is not a valid index.
+ */
+int
+LNetSetLazyPortal(int portal)
+{
+ struct lnet_portal *ptl;
+
+ if (portal < 0 || portal >= the_lnet.ln_nportals)
+ return -EINVAL;
+
+ CDEBUG(D_NET, "Setting portal %d lazy\n", portal);
+ ptl = the_lnet.ln_portals[portal];
+
+ lnet_res_lock(LNET_LOCK_EX);
+ lnet_ptl_lock(ptl);
+
+ lnet_ptl_setopt(ptl, LNET_PTL_LAZY);
+
+ lnet_ptl_unlock(ptl);
+ lnet_res_unlock(LNET_LOCK_EX);
+
+ return 0;
+}
+EXPORT_SYMBOL(LNetSetLazyPortal);
+
+/**
+ * Turn off the lazy portal attribute. Delayed requests on the portal,
+ * if any, will be all dropped when this function returns.
+ *
+ * \param portal Index of the portal to disable the lazy attribute on.
+ *
+ * \retval 0 On success.
+ * \retval -EINVAL If \a portal is not a valid index.
+ */
+int
+LNetClearLazyPortal(int portal)
+{
+ struct lnet_portal *ptl;
+ LIST_HEAD (zombies);
+
+ if (portal < 0 || portal >= the_lnet.ln_nportals)
+ return -EINVAL;
+
+ ptl = the_lnet.ln_portals[portal];
+
+ lnet_res_lock(LNET_LOCK_EX);
+ lnet_ptl_lock(ptl);
+
+ if (!lnet_ptl_is_lazy(ptl)) {
+ lnet_ptl_unlock(ptl);
+ lnet_res_unlock(LNET_LOCK_EX);
+ return 0;
+ }
+
+ if (the_lnet.ln_shutdown)
+ CWARN("Active lazy portal %d on exit\n", portal);
+ else
+ CDEBUG(D_NET, "clearing portal %d lazy\n", portal);
+
+ /* grab all the blocked messages atomically */
+ list_splice_init(&ptl->ptl_msg_delayed, &zombies);
+
+ lnet_ptl_unsetopt(ptl, LNET_PTL_LAZY);
+
+ lnet_ptl_unlock(ptl);
+ lnet_res_unlock(LNET_LOCK_EX);
+
+ lnet_drop_delayed_msg_list(&zombies, "Clearing lazy portal attr");
+
+ return 0;
+}
+EXPORT_SYMBOL(LNetClearLazyPortal);
diff --git a/kernel/drivers/staging/lustre/lnet/lnet/lo.c b/kernel/drivers/staging/lustre/lnet/lnet/lo.c
new file mode 100644
index 000000000..f708c2e64
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lnet/lnet/lo.c
@@ -0,0 +1,120 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include "../../include/linux/lnet/lib-lnet.h"
+
+static int
+lolnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
+{
+ LASSERT(!lntmsg->msg_routing);
+ LASSERT(!lntmsg->msg_target_is_router);
+
+ return lnet_parse(ni, &lntmsg->msg_hdr, ni->ni_nid, lntmsg, 0);
+}
+
+static int
+lolnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
+ int delayed, unsigned int niov,
+ struct kvec *iov, lnet_kiov_t *kiov,
+ unsigned int offset, unsigned int mlen, unsigned int rlen)
+{
+ lnet_msg_t *sendmsg = private;
+
+ if (lntmsg != NULL) { /* not discarding */
+ if (sendmsg->msg_iov != NULL) {
+ if (iov != NULL)
+ lnet_copy_iov2iov(niov, iov, offset,
+ sendmsg->msg_niov,
+ sendmsg->msg_iov,
+ sendmsg->msg_offset, mlen);
+ else
+ lnet_copy_iov2kiov(niov, kiov, offset,
+ sendmsg->msg_niov,
+ sendmsg->msg_iov,
+ sendmsg->msg_offset, mlen);
+ } else {
+ if (iov != NULL)
+ lnet_copy_kiov2iov(niov, iov, offset,
+ sendmsg->msg_niov,
+ sendmsg->msg_kiov,
+ sendmsg->msg_offset, mlen);
+ else
+ lnet_copy_kiov2kiov(niov, kiov, offset,
+ sendmsg->msg_niov,
+ sendmsg->msg_kiov,
+ sendmsg->msg_offset, mlen);
+ }
+
+ lnet_finalize(ni, lntmsg, 0);
+ }
+
+ lnet_finalize(ni, sendmsg, 0);
+ return 0;
+}
+
+static int lolnd_instanced;
+
+static void
+lolnd_shutdown(lnet_ni_t *ni)
+{
+ CDEBUG(D_NET, "shutdown\n");
+ LASSERT(lolnd_instanced);
+
+ lolnd_instanced = 0;
+}
+
+static int
+lolnd_startup(lnet_ni_t *ni)
+{
+ LASSERT(ni->ni_lnd == &the_lolnd);
+ LASSERT(!lolnd_instanced);
+ lolnd_instanced = 1;
+
+ return 0;
+}
+
+lnd_t the_lolnd = {
+ /* .lnd_list = */ {&the_lolnd.lnd_list, &the_lolnd.lnd_list},
+ /* .lnd_refcount = */ 0,
+ /* .lnd_type = */ LOLND,
+ /* .lnd_startup = */ lolnd_startup,
+ /* .lnd_shutdown = */ lolnd_shutdown,
+ /* .lnt_ctl = */ NULL,
+ /* .lnd_send = */ lolnd_send,
+ /* .lnd_recv = */ lolnd_recv,
+ /* .lnd_eager_recv = */ NULL,
+ /* .lnd_notify = */ NULL,
+ /* .lnd_accept = */ NULL
+};
diff --git a/kernel/drivers/staging/lustre/lnet/lnet/module.c b/kernel/drivers/staging/lustre/lnet/lnet/module.c
new file mode 100644
index 000000000..72b7fbc83
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lnet/lnet/module.c
@@ -0,0 +1,155 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include "../../include/linux/lnet/lib-lnet.h"
+
+static int config_on_load;
+module_param(config_on_load, int, 0444);
+MODULE_PARM_DESC(config_on_load, "configure network at module load");
+
+static struct mutex lnet_config_mutex;
+
+static int
+lnet_configure(void *arg)
+{
+ /* 'arg' only there so I can be passed to cfs_create_thread() */
+ int rc = 0;
+
+ LNET_MUTEX_LOCK(&lnet_config_mutex);
+
+ if (!the_lnet.ln_niinit_self) {
+ rc = LNetNIInit(LUSTRE_SRV_LNET_PID);
+ if (rc >= 0) {
+ the_lnet.ln_niinit_self = 1;
+ rc = 0;
+ }
+ }
+
+ LNET_MUTEX_UNLOCK(&lnet_config_mutex);
+ return rc;
+}
+
+static int
+lnet_unconfigure(void)
+{
+ int refcount;
+
+ LNET_MUTEX_LOCK(&lnet_config_mutex);
+
+ if (the_lnet.ln_niinit_self) {
+ the_lnet.ln_niinit_self = 0;
+ LNetNIFini();
+ }
+
+ LNET_MUTEX_LOCK(&the_lnet.ln_api_mutex);
+ refcount = the_lnet.ln_refcount;
+ LNET_MUTEX_UNLOCK(&the_lnet.ln_api_mutex);
+
+ LNET_MUTEX_UNLOCK(&lnet_config_mutex);
+ return (refcount == 0) ? 0 : -EBUSY;
+}
+
+static int
+lnet_ioctl(unsigned int cmd, struct libcfs_ioctl_data *data)
+{
+ int rc;
+
+ switch (cmd) {
+ case IOC_LIBCFS_CONFIGURE:
+ return lnet_configure(NULL);
+
+ case IOC_LIBCFS_UNCONFIGURE:
+ return lnet_unconfigure();
+
+ default:
+ /* Passing LNET_PID_ANY only gives me a ref if the net is up
+ * already; I'll need it to ensure the net can't go down while
+ * I'm called into it */
+ rc = LNetNIInit(LNET_PID_ANY);
+ if (rc >= 0) {
+ rc = LNetCtl(cmd, data);
+ LNetNIFini();
+ }
+ return rc;
+ }
+}
+
+static DECLARE_IOCTL_HANDLER(lnet_ioctl_handler, lnet_ioctl);
+
+static int __init
+init_lnet(void)
+{
+ int rc;
+
+ mutex_init(&lnet_config_mutex);
+
+ rc = LNetInit();
+ if (rc != 0) {
+ CERROR("LNetInit: error %d\n", rc);
+ return rc;
+ }
+
+ rc = libcfs_register_ioctl(&lnet_ioctl_handler);
+ LASSERT(rc == 0);
+
+ if (config_on_load) {
+ /* Have to schedule a separate thread to avoid deadlocking
+ * in modload */
+ (void) kthread_run(lnet_configure, NULL, "lnet_initd");
+ }
+
+ return 0;
+}
+
+static void __exit
+fini_lnet(void)
+{
+ int rc;
+
+ rc = libcfs_deregister_ioctl(&lnet_ioctl_handler);
+ LASSERT(rc == 0);
+
+ LNetFini();
+}
+
+MODULE_AUTHOR("Peter J. Braam <braam@clusterfs.com>");
+MODULE_DESCRIPTION("Portals v3.1");
+MODULE_LICENSE("GPL");
+MODULE_VERSION("1.0.0");
+
+module_init(init_lnet);
+module_exit(fini_lnet);
diff --git a/kernel/drivers/staging/lustre/lnet/lnet/peer.c b/kernel/drivers/staging/lustre/lnet/lnet/peer.c
new file mode 100644
index 000000000..45b5742f1
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lnet/lnet/peer.c
@@ -0,0 +1,338 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/peer.c
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include "../../include/linux/lnet/lib-lnet.h"
+
+int
+lnet_peer_tables_create(void)
+{
+ struct lnet_peer_table *ptable;
+ struct list_head *hash;
+ int i;
+ int j;
+
+ the_lnet.ln_peer_tables = cfs_percpt_alloc(lnet_cpt_table(),
+ sizeof(*ptable));
+ if (the_lnet.ln_peer_tables == NULL) {
+ CERROR("Failed to allocate cpu-partition peer tables\n");
+ return -ENOMEM;
+ }
+
+ cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) {
+ INIT_LIST_HEAD(&ptable->pt_deathrow);
+
+ LIBCFS_CPT_ALLOC(hash, lnet_cpt_table(), i,
+ LNET_PEER_HASH_SIZE * sizeof(*hash));
+ if (hash == NULL) {
+ CERROR("Failed to create peer hash table\n");
+ lnet_peer_tables_destroy();
+ return -ENOMEM;
+ }
+
+ for (j = 0; j < LNET_PEER_HASH_SIZE; j++)
+ INIT_LIST_HEAD(&hash[j]);
+ ptable->pt_hash = hash; /* sign of initialization */
+ }
+
+ return 0;
+}
+
+void
+lnet_peer_tables_destroy(void)
+{
+ struct lnet_peer_table *ptable;
+ struct list_head *hash;
+ int i;
+ int j;
+
+ if (the_lnet.ln_peer_tables == NULL)
+ return;
+
+ cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) {
+ hash = ptable->pt_hash;
+ if (hash == NULL) /* not initialized */
+ break;
+
+ LASSERT(list_empty(&ptable->pt_deathrow));
+
+ ptable->pt_hash = NULL;
+ for (j = 0; j < LNET_PEER_HASH_SIZE; j++)
+ LASSERT(list_empty(&hash[j]));
+
+ LIBCFS_FREE(hash, LNET_PEER_HASH_SIZE * sizeof(*hash));
+ }
+
+ cfs_percpt_free(the_lnet.ln_peer_tables);
+ the_lnet.ln_peer_tables = NULL;
+}
+
+void
+lnet_peer_tables_cleanup(void)
+{
+ struct lnet_peer_table *ptable;
+ int i;
+ int j;
+
+ LASSERT(the_lnet.ln_shutdown); /* i.e. no new peers */
+
+ cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) {
+ lnet_net_lock(i);
+
+ for (j = 0; j < LNET_PEER_HASH_SIZE; j++) {
+ struct list_head *peers = &ptable->pt_hash[j];
+
+ while (!list_empty(peers)) {
+ lnet_peer_t *lp = list_entry(peers->next,
+ lnet_peer_t,
+ lp_hashlist);
+ list_del_init(&lp->lp_hashlist);
+ /* lose hash table's ref */
+ lnet_peer_decref_locked(lp);
+ }
+ }
+
+ lnet_net_unlock(i);
+ }
+
+ cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) {
+ LIST_HEAD(deathrow);
+ lnet_peer_t *lp;
+
+ lnet_net_lock(i);
+
+ for (j = 3; ptable->pt_number != 0; j++) {
+ lnet_net_unlock(i);
+
+ if ((j & (j - 1)) == 0) {
+ CDEBUG(D_WARNING,
+ "Waiting for %d peers on peer table\n",
+ ptable->pt_number);
+ }
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(cfs_time_seconds(1) / 2);
+ lnet_net_lock(i);
+ }
+ list_splice_init(&ptable->pt_deathrow, &deathrow);
+
+ lnet_net_unlock(i);
+
+ while (!list_empty(&deathrow)) {
+ lp = list_entry(deathrow.next,
+ lnet_peer_t, lp_hashlist);
+ list_del(&lp->lp_hashlist);
+ LIBCFS_FREE(lp, sizeof(*lp));
+ }
+ }
+}
+
+void
+lnet_destroy_peer_locked(lnet_peer_t *lp)
+{
+ struct lnet_peer_table *ptable;
+
+ LASSERT(lp->lp_refcount == 0);
+ LASSERT(lp->lp_rtr_refcount == 0);
+ LASSERT(list_empty(&lp->lp_txq));
+ LASSERT(list_empty(&lp->lp_hashlist));
+ LASSERT(lp->lp_txqnob == 0);
+
+ ptable = the_lnet.ln_peer_tables[lp->lp_cpt];
+ LASSERT(ptable->pt_number > 0);
+ ptable->pt_number--;
+
+ lnet_ni_decref_locked(lp->lp_ni, lp->lp_cpt);
+ lp->lp_ni = NULL;
+
+ list_add(&lp->lp_hashlist, &ptable->pt_deathrow);
+}
+
+lnet_peer_t *
+lnet_find_peer_locked(struct lnet_peer_table *ptable, lnet_nid_t nid)
+{
+ struct list_head *peers;
+ lnet_peer_t *lp;
+
+ LASSERT(!the_lnet.ln_shutdown);
+
+ peers = &ptable->pt_hash[lnet_nid2peerhash(nid)];
+ list_for_each_entry(lp, peers, lp_hashlist) {
+ if (lp->lp_nid == nid) {
+ lnet_peer_addref_locked(lp);
+ return lp;
+ }
+ }
+
+ return NULL;
+}
+
+int
+lnet_nid2peer_locked(lnet_peer_t **lpp, lnet_nid_t nid, int cpt)
+{
+ struct lnet_peer_table *ptable;
+ lnet_peer_t *lp = NULL;
+ lnet_peer_t *lp2;
+ int cpt2;
+ int rc = 0;
+
+ *lpp = NULL;
+ if (the_lnet.ln_shutdown) /* it's shutting down */
+ return -ESHUTDOWN;
+
+ /* cpt can be LNET_LOCK_EX if it's called from router functions */
+ cpt2 = cpt != LNET_LOCK_EX ? cpt : lnet_cpt_of_nid_locked(nid);
+
+ ptable = the_lnet.ln_peer_tables[cpt2];
+ lp = lnet_find_peer_locked(ptable, nid);
+ if (lp != NULL) {
+ *lpp = lp;
+ return 0;
+ }
+
+ if (!list_empty(&ptable->pt_deathrow)) {
+ lp = list_entry(ptable->pt_deathrow.next,
+ lnet_peer_t, lp_hashlist);
+ list_del(&lp->lp_hashlist);
+ }
+
+ /*
+ * take extra refcount in case another thread has shutdown LNet
+ * and destroyed locks and peer-table before I finish the allocation
+ */
+ ptable->pt_number++;
+ lnet_net_unlock(cpt);
+
+ if (lp != NULL)
+ memset(lp, 0, sizeof(*lp));
+ else
+ LIBCFS_CPT_ALLOC(lp, lnet_cpt_table(), cpt2, sizeof(*lp));
+
+ if (lp == NULL) {
+ rc = -ENOMEM;
+ lnet_net_lock(cpt);
+ goto out;
+ }
+
+ INIT_LIST_HEAD(&lp->lp_txq);
+ INIT_LIST_HEAD(&lp->lp_rtrq);
+ INIT_LIST_HEAD(&lp->lp_routes);
+
+ lp->lp_notify = 0;
+ lp->lp_notifylnd = 0;
+ lp->lp_notifying = 0;
+ lp->lp_alive_count = 0;
+ lp->lp_timestamp = 0;
+ lp->lp_alive = !lnet_peers_start_down(); /* 1 bit!! */
+ lp->lp_last_alive = cfs_time_current(); /* assumes alive */
+ lp->lp_last_query = 0; /* haven't asked NI yet */
+ lp->lp_ping_timestamp = 0;
+ lp->lp_ping_feats = LNET_PING_FEAT_INVAL;
+ lp->lp_nid = nid;
+ lp->lp_cpt = cpt2;
+ lp->lp_refcount = 2; /* 1 for caller; 1 for hash */
+ lp->lp_rtr_refcount = 0;
+
+ lnet_net_lock(cpt);
+
+ if (the_lnet.ln_shutdown) {
+ rc = -ESHUTDOWN;
+ goto out;
+ }
+
+ lp2 = lnet_find_peer_locked(ptable, nid);
+ if (lp2 != NULL) {
+ *lpp = lp2;
+ goto out;
+ }
+
+ lp->lp_ni = lnet_net2ni_locked(LNET_NIDNET(nid), cpt2);
+ if (lp->lp_ni == NULL) {
+ rc = -EHOSTUNREACH;
+ goto out;
+ }
+
+ lp->lp_txcredits =
+ lp->lp_mintxcredits = lp->lp_ni->ni_peertxcredits;
+ lp->lp_rtrcredits =
+ lp->lp_minrtrcredits = lnet_peer_buffer_credits(lp->lp_ni);
+
+ list_add_tail(&lp->lp_hashlist,
+ &ptable->pt_hash[lnet_nid2peerhash(nid)]);
+ ptable->pt_version++;
+ *lpp = lp;
+
+ return 0;
+out:
+ if (lp != NULL)
+ list_add(&lp->lp_hashlist, &ptable->pt_deathrow);
+ ptable->pt_number--;
+ return rc;
+}
+
+void
+lnet_debug_peer(lnet_nid_t nid)
+{
+ char *aliveness = "NA";
+ lnet_peer_t *lp;
+ int rc;
+ int cpt;
+
+ cpt = lnet_cpt_of_nid(nid);
+ lnet_net_lock(cpt);
+
+ rc = lnet_nid2peer_locked(&lp, nid, cpt);
+ if (rc != 0) {
+ lnet_net_unlock(cpt);
+ CDEBUG(D_WARNING, "No peer %s\n", libcfs_nid2str(nid));
+ return;
+ }
+
+ if (lnet_isrouter(lp) || lnet_peer_aliveness_enabled(lp))
+ aliveness = lp->lp_alive ? "up" : "down";
+
+ CDEBUG(D_WARNING, "%-24s %4d %5s %5d %5d %5d %5d %5d %ld\n",
+ libcfs_nid2str(lp->lp_nid), lp->lp_refcount,
+ aliveness, lp->lp_ni->ni_peertxcredits,
+ lp->lp_rtrcredits, lp->lp_minrtrcredits,
+ lp->lp_txcredits, lp->lp_mintxcredits, lp->lp_txqnob);
+
+ lnet_peer_decref_locked(lp);
+
+ lnet_net_unlock(cpt);
+}
diff --git a/kernel/drivers/staging/lustre/lnet/lnet/router.c b/kernel/drivers/staging/lustre/lnet/lnet/router.c
new file mode 100644
index 000000000..8510bae48
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lnet/lnet/router.c
@@ -0,0 +1,1706 @@
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ * This file is part of Portals
+ * http://sourceforge.net/projects/sandiaportals/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include "../../include/linux/lnet/lib-lnet.h"
+
+#if defined(LNET_ROUTER)
+
+#define LNET_NRB_TINY_MIN 512 /* min value for each CPT */
+#define LNET_NRB_TINY (LNET_NRB_TINY_MIN * 4)
+#define LNET_NRB_SMALL_MIN 4096 /* min value for each CPT */
+#define LNET_NRB_SMALL (LNET_NRB_SMALL_MIN * 4)
+#define LNET_NRB_LARGE_MIN 256 /* min value for each CPT */
+#define LNET_NRB_LARGE (LNET_NRB_LARGE_MIN * 4)
+
+static char *forwarding = "";
+module_param(forwarding, charp, 0444);
+MODULE_PARM_DESC(forwarding, "Explicitly enable/disable forwarding between networks");
+
+static int tiny_router_buffers;
+module_param(tiny_router_buffers, int, 0444);
+MODULE_PARM_DESC(tiny_router_buffers, "# of 0 payload messages to buffer in the router");
+static int small_router_buffers;
+module_param(small_router_buffers, int, 0444);
+MODULE_PARM_DESC(small_router_buffers, "# of small (1 page) messages to buffer in the router");
+static int large_router_buffers;
+module_param(large_router_buffers, int, 0444);
+MODULE_PARM_DESC(large_router_buffers, "# of large messages to buffer in the router");
+static int peer_buffer_credits;
+module_param(peer_buffer_credits, int, 0444);
+MODULE_PARM_DESC(peer_buffer_credits, "# router buffer credits per peer");
+
+static int auto_down = 1;
+module_param(auto_down, int, 0444);
+MODULE_PARM_DESC(auto_down, "Automatically mark peers down on comms error");
+
+int
+lnet_peer_buffer_credits(lnet_ni_t *ni)
+{
+ /* NI option overrides LNet default */
+ if (ni->ni_peerrtrcredits > 0)
+ return ni->ni_peerrtrcredits;
+ if (peer_buffer_credits > 0)
+ return peer_buffer_credits;
+
+ /* As an approximation, allow this peer the same number of router
+ * buffers as it is allowed outstanding sends */
+ return ni->ni_peertxcredits;
+}
+
+/* forward ref's */
+static int lnet_router_checker(void *);
+#else
+
+int
+lnet_peer_buffer_credits(lnet_ni_t *ni)
+{
+ return 0;
+}
+
+#endif
+
+static int check_routers_before_use;
+module_param(check_routers_before_use, int, 0444);
+MODULE_PARM_DESC(check_routers_before_use, "Assume routers are down and ping them before use");
+
+int avoid_asym_router_failure = 1;
+module_param(avoid_asym_router_failure, int, 0644);
+MODULE_PARM_DESC(avoid_asym_router_failure, "Avoid asymmetrical router failures (0 to disable)");
+
+static int dead_router_check_interval = 60;
+module_param(dead_router_check_interval, int, 0644);
+MODULE_PARM_DESC(dead_router_check_interval, "Seconds between dead router health checks (<= 0 to disable)");
+
+static int live_router_check_interval = 60;
+module_param(live_router_check_interval, int, 0644);
+MODULE_PARM_DESC(live_router_check_interval, "Seconds between live router health checks (<= 0 to disable)");
+
+static int router_ping_timeout = 50;
+module_param(router_ping_timeout, int, 0644);
+MODULE_PARM_DESC(router_ping_timeout, "Seconds to wait for the reply to a router health query");
+
+int
+lnet_peers_start_down(void)
+{
+ return check_routers_before_use;
+}
+
+void
+lnet_notify_locked(lnet_peer_t *lp, int notifylnd, int alive,
+ unsigned long when)
+{
+ if (time_before(when, lp->lp_timestamp)) { /* out of date information */
+ CDEBUG(D_NET, "Out of date\n");
+ return;
+ }
+
+ lp->lp_timestamp = when; /* update timestamp */
+ lp->lp_ping_deadline = 0; /* disable ping timeout */
+
+ if (lp->lp_alive_count != 0 && /* got old news */
+ (!lp->lp_alive) == (!alive)) { /* new date for old news */
+ CDEBUG(D_NET, "Old news\n");
+ return;
+ }
+
+ /* Flag that notification is outstanding */
+
+ lp->lp_alive_count++;
+ lp->lp_alive = !(!alive); /* 1 bit! */
+ lp->lp_notify = 1;
+ lp->lp_notifylnd |= notifylnd;
+ if (lp->lp_alive)
+ lp->lp_ping_feats = LNET_PING_FEAT_INVAL; /* reset */
+
+ CDEBUG(D_NET, "set %s %d\n", libcfs_nid2str(lp->lp_nid), alive);
+}
+
+static void
+lnet_ni_notify_locked(lnet_ni_t *ni, lnet_peer_t *lp)
+{
+ int alive;
+ int notifylnd;
+
+ /* Notify only in 1 thread at any time to ensure ordered notification.
+ * NB individual events can be missed; the only guarantee is that you
+ * always get the most recent news */
+
+ if (lp->lp_notifying || ni == NULL)
+ return;
+
+ lp->lp_notifying = 1;
+
+ while (lp->lp_notify) {
+ alive = lp->lp_alive;
+ notifylnd = lp->lp_notifylnd;
+
+ lp->lp_notifylnd = 0;
+ lp->lp_notify = 0;
+
+ if (notifylnd && ni->ni_lnd->lnd_notify != NULL) {
+ lnet_net_unlock(lp->lp_cpt);
+
+ /* A new notification could happen now; I'll handle it
+ * when control returns to me */
+
+ (ni->ni_lnd->lnd_notify)(ni, lp->lp_nid, alive);
+
+ lnet_net_lock(lp->lp_cpt);
+ }
+ }
+
+ lp->lp_notifying = 0;
+}
+
+
+static void
+lnet_rtr_addref_locked(lnet_peer_t *lp)
+{
+ LASSERT(lp->lp_refcount > 0);
+ LASSERT(lp->lp_rtr_refcount >= 0);
+
+ /* lnet_net_lock must be exclusively locked */
+ lp->lp_rtr_refcount++;
+ if (lp->lp_rtr_refcount == 1) {
+ struct list_head *pos;
+
+ /* a simple insertion sort */
+ list_for_each_prev(pos, &the_lnet.ln_routers) {
+ lnet_peer_t *rtr = list_entry(pos, lnet_peer_t,
+ lp_rtr_list);
+
+ if (rtr->lp_nid < lp->lp_nid)
+ break;
+ }
+
+ list_add(&lp->lp_rtr_list, pos);
+ /* addref for the_lnet.ln_routers */
+ lnet_peer_addref_locked(lp);
+ the_lnet.ln_routers_version++;
+ }
+}
+
+static void
+lnet_rtr_decref_locked(lnet_peer_t *lp)
+{
+ LASSERT(lp->lp_refcount > 0);
+ LASSERT(lp->lp_rtr_refcount > 0);
+
+ /* lnet_net_lock must be exclusively locked */
+ lp->lp_rtr_refcount--;
+ if (lp->lp_rtr_refcount == 0) {
+ LASSERT(list_empty(&lp->lp_routes));
+
+ if (lp->lp_rcd != NULL) {
+ list_add(&lp->lp_rcd->rcd_list,
+ &the_lnet.ln_rcd_deathrow);
+ lp->lp_rcd = NULL;
+ }
+
+ list_del(&lp->lp_rtr_list);
+ /* decref for the_lnet.ln_routers */
+ lnet_peer_decref_locked(lp);
+ the_lnet.ln_routers_version++;
+ }
+}
+
+lnet_remotenet_t *
+lnet_find_net_locked(__u32 net)
+{
+ lnet_remotenet_t *rnet;
+ struct list_head *tmp;
+ struct list_head *rn_list;
+
+ LASSERT(!the_lnet.ln_shutdown);
+
+ rn_list = lnet_net2rnethash(net);
+ list_for_each(tmp, rn_list) {
+ rnet = list_entry(tmp, lnet_remotenet_t, lrn_list);
+
+ if (rnet->lrn_net == net)
+ return rnet;
+ }
+ return NULL;
+}
+
+static void lnet_shuffle_seed(void)
+{
+ static int seeded;
+ int lnd_type, seed[2];
+ struct timeval tv;
+ lnet_ni_t *ni;
+ struct list_head *tmp;
+
+ if (seeded)
+ return;
+
+ cfs_get_random_bytes(seed, sizeof(seed));
+
+ /* Nodes with small feet have little entropy
+ * the NID for this node gives the most entropy in the low bits */
+ list_for_each(tmp, &the_lnet.ln_nis) {
+ ni = list_entry(tmp, lnet_ni_t, ni_list);
+ lnd_type = LNET_NETTYP(LNET_NIDNET(ni->ni_nid));
+
+ if (lnd_type != LOLND)
+ seed[0] ^= (LNET_NIDADDR(ni->ni_nid) | lnd_type);
+ }
+
+ do_gettimeofday(&tv);
+ cfs_srand(tv.tv_sec ^ seed[0], tv.tv_usec ^ seed[1]);
+ seeded = 1;
+}
+
+/* NB expects LNET_LOCK held */
+static void
+lnet_add_route_to_rnet(lnet_remotenet_t *rnet, lnet_route_t *route)
+{
+ unsigned int len = 0;
+ unsigned int offset = 0;
+ struct list_head *e;
+
+ lnet_shuffle_seed();
+
+ list_for_each(e, &rnet->lrn_routes) {
+ len++;
+ }
+
+ /* len+1 positions to add a new entry, also prevents division by 0 */
+ offset = cfs_rand() % (len + 1);
+ list_for_each(e, &rnet->lrn_routes) {
+ if (offset == 0)
+ break;
+ offset--;
+ }
+ list_add(&route->lr_list, e);
+ list_add(&route->lr_gwlist, &route->lr_gateway->lp_routes);
+
+ the_lnet.ln_remote_nets_version++;
+ lnet_rtr_addref_locked(route->lr_gateway);
+}
+
+int
+lnet_add_route(__u32 net, unsigned int hops, lnet_nid_t gateway,
+ unsigned int priority)
+{
+ struct list_head *e;
+ lnet_remotenet_t *rnet;
+ lnet_remotenet_t *rnet2;
+ lnet_route_t *route;
+ lnet_ni_t *ni;
+ int add_route;
+ int rc;
+
+ CDEBUG(D_NET, "Add route: net %s hops %u priority %u gw %s\n",
+ libcfs_net2str(net), hops, priority, libcfs_nid2str(gateway));
+
+ if (gateway == LNET_NID_ANY ||
+ LNET_NETTYP(LNET_NIDNET(gateway)) == LOLND ||
+ net == LNET_NIDNET(LNET_NID_ANY) ||
+ LNET_NETTYP(net) == LOLND ||
+ LNET_NIDNET(gateway) == net ||
+ hops < 1 || hops > 255)
+ return -EINVAL;
+
+ if (lnet_islocalnet(net)) /* it's a local network */
+ return 0; /* ignore the route entry */
+
+ /* Assume net, route, all new */
+ LIBCFS_ALLOC(route, sizeof(*route));
+ LIBCFS_ALLOC(rnet, sizeof(*rnet));
+ if (route == NULL || rnet == NULL) {
+ CERROR("Out of memory creating route %s %d %s\n",
+ libcfs_net2str(net), hops, libcfs_nid2str(gateway));
+ if (route != NULL)
+ LIBCFS_FREE(route, sizeof(*route));
+ if (rnet != NULL)
+ LIBCFS_FREE(rnet, sizeof(*rnet));
+ return -ENOMEM;
+ }
+
+ INIT_LIST_HEAD(&rnet->lrn_routes);
+ rnet->lrn_net = net;
+ route->lr_hops = hops;
+ route->lr_net = net;
+ route->lr_priority = priority;
+
+ lnet_net_lock(LNET_LOCK_EX);
+
+ rc = lnet_nid2peer_locked(&route->lr_gateway, gateway, LNET_LOCK_EX);
+ if (rc != 0) {
+ lnet_net_unlock(LNET_LOCK_EX);
+
+ LIBCFS_FREE(route, sizeof(*route));
+ LIBCFS_FREE(rnet, sizeof(*rnet));
+
+ if (rc == -EHOSTUNREACH) /* gateway is not on a local net */
+ return 0; /* ignore the route entry */
+ CERROR("Error %d creating route %s %d %s\n", rc,
+ libcfs_net2str(net), hops,
+ libcfs_nid2str(gateway));
+
+ return rc;
+ }
+
+ LASSERT(!the_lnet.ln_shutdown);
+
+ rnet2 = lnet_find_net_locked(net);
+ if (rnet2 == NULL) {
+ /* new network */
+ list_add_tail(&rnet->lrn_list, lnet_net2rnethash(net));
+ rnet2 = rnet;
+ }
+
+ /* Search for a duplicate route (it's a NOOP if it is) */
+ add_route = 1;
+ list_for_each(e, &rnet2->lrn_routes) {
+ lnet_route_t *route2 = list_entry(e, lnet_route_t, lr_list);
+
+ if (route2->lr_gateway == route->lr_gateway) {
+ add_route = 0;
+ break;
+ }
+
+ /* our lookups must be true */
+ LASSERT(route2->lr_gateway->lp_nid != gateway);
+ }
+
+ if (add_route) {
+ lnet_peer_addref_locked(route->lr_gateway); /* +1 for notify */
+ lnet_add_route_to_rnet(rnet2, route);
+
+ ni = route->lr_gateway->lp_ni;
+ lnet_net_unlock(LNET_LOCK_EX);
+
+ /* XXX Assume alive */
+ if (ni->ni_lnd->lnd_notify != NULL)
+ (ni->ni_lnd->lnd_notify)(ni, gateway, 1);
+
+ lnet_net_lock(LNET_LOCK_EX);
+ }
+
+ /* -1 for notify or !add_route */
+ lnet_peer_decref_locked(route->lr_gateway);
+ lnet_net_unlock(LNET_LOCK_EX);
+
+ if (!add_route)
+ LIBCFS_FREE(route, sizeof(*route));
+
+ if (rnet != rnet2)
+ LIBCFS_FREE(rnet, sizeof(*rnet));
+
+ return 0;
+}
+
+int
+lnet_check_routes(void)
+{
+ lnet_remotenet_t *rnet;
+ lnet_route_t *route;
+ lnet_route_t *route2;
+ struct list_head *e1;
+ struct list_head *e2;
+ int cpt;
+ struct list_head *rn_list;
+ int i;
+
+ cpt = lnet_net_lock_current();
+
+ for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) {
+ rn_list = &the_lnet.ln_remote_nets_hash[i];
+ list_for_each(e1, rn_list) {
+ rnet = list_entry(e1, lnet_remotenet_t, lrn_list);
+
+ route2 = NULL;
+ list_for_each(e2, &rnet->lrn_routes) {
+ lnet_nid_t nid1;
+ lnet_nid_t nid2;
+ int net;
+
+ route = list_entry(e2, lnet_route_t,
+ lr_list);
+
+ if (route2 == NULL) {
+ route2 = route;
+ continue;
+ }
+
+ if (route->lr_gateway->lp_ni ==
+ route2->lr_gateway->lp_ni)
+ continue;
+
+ nid1 = route->lr_gateway->lp_nid;
+ nid2 = route2->lr_gateway->lp_nid;
+ net = rnet->lrn_net;
+
+ lnet_net_unlock(cpt);
+
+ CERROR("Routes to %s via %s and %s not supported\n",
+ libcfs_net2str(net),
+ libcfs_nid2str(nid1),
+ libcfs_nid2str(nid2));
+ return -EINVAL;
+ }
+ }
+ }
+
+ lnet_net_unlock(cpt);
+ return 0;
+}
+
+int
+lnet_del_route(__u32 net, lnet_nid_t gw_nid)
+{
+ struct lnet_peer *gateway;
+ lnet_remotenet_t *rnet;
+ lnet_route_t *route;
+ struct list_head *e1;
+ struct list_head *e2;
+ int rc = -ENOENT;
+ struct list_head *rn_list;
+ int idx = 0;
+
+ CDEBUG(D_NET, "Del route: net %s : gw %s\n",
+ libcfs_net2str(net), libcfs_nid2str(gw_nid));
+
+ /* NB Caller may specify either all routes via the given gateway
+ * or a specific route entry actual NIDs) */
+
+ lnet_net_lock(LNET_LOCK_EX);
+ if (net == LNET_NIDNET(LNET_NID_ANY))
+ rn_list = &the_lnet.ln_remote_nets_hash[0];
+ else
+ rn_list = lnet_net2rnethash(net);
+
+ again:
+ list_for_each(e1, rn_list) {
+ rnet = list_entry(e1, lnet_remotenet_t, lrn_list);
+
+ if (!(net == LNET_NIDNET(LNET_NID_ANY) ||
+ net == rnet->lrn_net))
+ continue;
+
+ list_for_each(e2, &rnet->lrn_routes) {
+ route = list_entry(e2, lnet_route_t, lr_list);
+
+ gateway = route->lr_gateway;
+ if (!(gw_nid == LNET_NID_ANY ||
+ gw_nid == gateway->lp_nid))
+ continue;
+
+ list_del(&route->lr_list);
+ list_del(&route->lr_gwlist);
+ the_lnet.ln_remote_nets_version++;
+
+ if (list_empty(&rnet->lrn_routes))
+ list_del(&rnet->lrn_list);
+ else
+ rnet = NULL;
+
+ lnet_rtr_decref_locked(gateway);
+ lnet_peer_decref_locked(gateway);
+
+ lnet_net_unlock(LNET_LOCK_EX);
+
+ LIBCFS_FREE(route, sizeof(*route));
+
+ if (rnet != NULL)
+ LIBCFS_FREE(rnet, sizeof(*rnet));
+
+ rc = 0;
+ lnet_net_lock(LNET_LOCK_EX);
+ goto again;
+ }
+ }
+
+ if (net == LNET_NIDNET(LNET_NID_ANY) &&
+ ++idx < LNET_REMOTE_NETS_HASH_SIZE) {
+ rn_list = &the_lnet.ln_remote_nets_hash[idx];
+ goto again;
+ }
+ lnet_net_unlock(LNET_LOCK_EX);
+
+ return rc;
+}
+
+void
+lnet_destroy_routes(void)
+{
+ lnet_del_route(LNET_NIDNET(LNET_NID_ANY), LNET_NID_ANY);
+}
+
+int
+lnet_get_route(int idx, __u32 *net, __u32 *hops,
+ lnet_nid_t *gateway, __u32 *alive, __u32 *priority)
+{
+ struct list_head *e1;
+ struct list_head *e2;
+ lnet_remotenet_t *rnet;
+ lnet_route_t *route;
+ int cpt;
+ int i;
+ struct list_head *rn_list;
+
+ cpt = lnet_net_lock_current();
+
+ for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) {
+ rn_list = &the_lnet.ln_remote_nets_hash[i];
+ list_for_each(e1, rn_list) {
+ rnet = list_entry(e1, lnet_remotenet_t, lrn_list);
+
+ list_for_each(e2, &rnet->lrn_routes) {
+ route = list_entry(e2, lnet_route_t,
+ lr_list);
+
+ if (idx-- == 0) {
+ *net = rnet->lrn_net;
+ *hops = route->lr_hops;
+ *priority = route->lr_priority;
+ *gateway = route->lr_gateway->lp_nid;
+ *alive = route->lr_gateway->lp_alive;
+ lnet_net_unlock(cpt);
+ return 0;
+ }
+ }
+ }
+ }
+
+ lnet_net_unlock(cpt);
+ return -ENOENT;
+}
+
+void
+lnet_swap_pinginfo(lnet_ping_info_t *info)
+{
+ int i;
+ lnet_ni_status_t *stat;
+
+ __swab32s(&info->pi_magic);
+ __swab32s(&info->pi_features);
+ __swab32s(&info->pi_pid);
+ __swab32s(&info->pi_nnis);
+ for (i = 0; i < info->pi_nnis && i < LNET_MAX_RTR_NIS; i++) {
+ stat = &info->pi_ni[i];
+ __swab64s(&stat->ns_nid);
+ __swab32s(&stat->ns_status);
+ }
+}
+
+/**
+ * parse router-checker pinginfo, record number of down NIs for remote
+ * networks on that router.
+ */
+static void
+lnet_parse_rc_info(lnet_rc_data_t *rcd)
+{
+ lnet_ping_info_t *info = rcd->rcd_pinginfo;
+ struct lnet_peer *gw = rcd->rcd_gateway;
+ lnet_route_t *rtr;
+
+ if (!gw->lp_alive)
+ return;
+
+ if (info->pi_magic == __swab32(LNET_PROTO_PING_MAGIC))
+ lnet_swap_pinginfo(info);
+
+ /* NB always racing with network! */
+ if (info->pi_magic != LNET_PROTO_PING_MAGIC) {
+ CDEBUG(D_NET, "%s: Unexpected magic %08x\n",
+ libcfs_nid2str(gw->lp_nid), info->pi_magic);
+ gw->lp_ping_feats = LNET_PING_FEAT_INVAL;
+ return;
+ }
+
+ gw->lp_ping_feats = info->pi_features;
+ if ((gw->lp_ping_feats & LNET_PING_FEAT_MASK) == 0) {
+ CDEBUG(D_NET, "%s: Unexpected features 0x%x\n",
+ libcfs_nid2str(gw->lp_nid), gw->lp_ping_feats);
+ return; /* nothing I can understand */
+ }
+
+ if ((gw->lp_ping_feats & LNET_PING_FEAT_NI_STATUS) == 0)
+ return; /* can't carry NI status info */
+
+ list_for_each_entry(rtr, &gw->lp_routes, lr_gwlist) {
+ int ptl_status = LNET_NI_STATUS_INVALID;
+ int down = 0;
+ int up = 0;
+ int i;
+
+ for (i = 0; i < info->pi_nnis && i < LNET_MAX_RTR_NIS; i++) {
+ lnet_ni_status_t *stat = &info->pi_ni[i];
+ lnet_nid_t nid = stat->ns_nid;
+
+ if (nid == LNET_NID_ANY) {
+ CDEBUG(D_NET, "%s: unexpected LNET_NID_ANY\n",
+ libcfs_nid2str(gw->lp_nid));
+ gw->lp_ping_feats = LNET_PING_FEAT_INVAL;
+ return;
+ }
+
+ if (LNET_NETTYP(LNET_NIDNET(nid)) == LOLND)
+ continue;
+
+ if (stat->ns_status == LNET_NI_STATUS_DOWN) {
+ if (LNET_NETTYP(LNET_NIDNET(nid)) != PTLLND)
+ down++;
+ else if (ptl_status != LNET_NI_STATUS_UP)
+ ptl_status = LNET_NI_STATUS_DOWN;
+ continue;
+ }
+
+ if (stat->ns_status == LNET_NI_STATUS_UP) {
+ if (LNET_NIDNET(nid) == rtr->lr_net) {
+ up = 1;
+ break;
+ }
+ /* ptl NIs are considered down only when
+ * they're all down */
+ if (LNET_NETTYP(LNET_NIDNET(nid)) == PTLLND)
+ ptl_status = LNET_NI_STATUS_UP;
+ continue;
+ }
+
+ CDEBUG(D_NET, "%s: Unexpected status 0x%x\n",
+ libcfs_nid2str(gw->lp_nid), stat->ns_status);
+ gw->lp_ping_feats = LNET_PING_FEAT_INVAL;
+ return;
+ }
+
+ if (up) { /* ignore downed NIs if NI for dest network is up */
+ rtr->lr_downis = 0;
+ continue;
+ }
+ rtr->lr_downis = down + (ptl_status == LNET_NI_STATUS_DOWN);
+ }
+}
+
+static void
+lnet_router_checker_event(lnet_event_t *event)
+{
+ lnet_rc_data_t *rcd = event->md.user_ptr;
+ struct lnet_peer *lp;
+
+ LASSERT(rcd != NULL);
+
+ if (event->unlinked) {
+ LNetInvalidateHandle(&rcd->rcd_mdh);
+ return;
+ }
+
+ LASSERT(event->type == LNET_EVENT_SEND ||
+ event->type == LNET_EVENT_REPLY);
+
+ lp = rcd->rcd_gateway;
+ LASSERT(lp != NULL);
+
+ /* NB: it's called with holding lnet_res_lock, we have a few
+ * places need to hold both locks at the same time, please take
+ * care of lock ordering */
+ lnet_net_lock(lp->lp_cpt);
+ if (!lnet_isrouter(lp) || lp->lp_rcd != rcd) {
+ /* ignore if no longer a router or rcd is replaced */
+ goto out;
+ }
+
+ if (event->type == LNET_EVENT_SEND) {
+ lp->lp_ping_notsent = 0;
+ if (event->status == 0)
+ goto out;
+ }
+
+ /* LNET_EVENT_REPLY */
+ /* A successful REPLY means the router is up. If _any_ comms
+ * to the router fail I assume it's down (this will happen if
+ * we ping alive routers to try to detect router death before
+ * apps get burned). */
+
+ lnet_notify_locked(lp, 1, (event->status == 0), cfs_time_current());
+ /* The router checker will wake up very shortly and do the
+ * actual notification.
+ * XXX If 'lp' stops being a router before then, it will still
+ * have the notification pending!!! */
+
+ if (avoid_asym_router_failure && event->status == 0)
+ lnet_parse_rc_info(rcd);
+
+ out:
+ lnet_net_unlock(lp->lp_cpt);
+}
+
+static void
+lnet_wait_known_routerstate(void)
+{
+ lnet_peer_t *rtr;
+ struct list_head *entry;
+ int all_known;
+
+ LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING);
+
+ for (;;) {
+ int cpt = lnet_net_lock_current();
+
+ all_known = 1;
+ list_for_each(entry, &the_lnet.ln_routers) {
+ rtr = list_entry(entry, lnet_peer_t, lp_rtr_list);
+
+ if (rtr->lp_alive_count == 0) {
+ all_known = 0;
+ break;
+ }
+ }
+
+ lnet_net_unlock(cpt);
+
+ if (all_known)
+ return;
+
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(cfs_time_seconds(1));
+ }
+}
+
+void
+lnet_router_ni_update_locked(lnet_peer_t *gw, __u32 net)
+{
+ lnet_route_t *rte;
+
+ if ((gw->lp_ping_feats & LNET_PING_FEAT_NI_STATUS) != 0) {
+ list_for_each_entry(rte, &gw->lp_routes, lr_gwlist) {
+ if (rte->lr_net == net) {
+ rte->lr_downis = 0;
+ break;
+ }
+ }
+ }
+}
+
+static void
+lnet_update_ni_status_locked(void)
+{
+ lnet_ni_t *ni;
+ long now;
+ int timeout;
+
+ LASSERT(the_lnet.ln_routing);
+
+ timeout = router_ping_timeout +
+ max(live_router_check_interval, dead_router_check_interval);
+
+ now = get_seconds();
+ list_for_each_entry(ni, &the_lnet.ln_nis, ni_list) {
+ if (ni->ni_lnd->lnd_type == LOLND)
+ continue;
+
+ if (now < ni->ni_last_alive + timeout)
+ continue;
+
+ lnet_ni_lock(ni);
+ /* re-check with lock */
+ if (now < ni->ni_last_alive + timeout) {
+ lnet_ni_unlock(ni);
+ continue;
+ }
+
+ LASSERT(ni->ni_status != NULL);
+
+ if (ni->ni_status->ns_status != LNET_NI_STATUS_DOWN) {
+ CDEBUG(D_NET, "NI(%s:%d) status changed to down\n",
+ libcfs_nid2str(ni->ni_nid), timeout);
+ /* NB: so far, this is the only place to set
+ * NI status to "down" */
+ ni->ni_status->ns_status = LNET_NI_STATUS_DOWN;
+ }
+ lnet_ni_unlock(ni);
+ }
+}
+
+static void
+lnet_destroy_rc_data(lnet_rc_data_t *rcd)
+{
+ LASSERT(list_empty(&rcd->rcd_list));
+ /* detached from network */
+ LASSERT(LNetHandleIsInvalid(rcd->rcd_mdh));
+
+ if (rcd->rcd_gateway != NULL) {
+ int cpt = rcd->rcd_gateway->lp_cpt;
+
+ lnet_net_lock(cpt);
+ lnet_peer_decref_locked(rcd->rcd_gateway);
+ lnet_net_unlock(cpt);
+ }
+
+ if (rcd->rcd_pinginfo != NULL)
+ LIBCFS_FREE(rcd->rcd_pinginfo, LNET_PINGINFO_SIZE);
+
+ LIBCFS_FREE(rcd, sizeof(*rcd));
+}
+
+static lnet_rc_data_t *
+lnet_create_rc_data_locked(lnet_peer_t *gateway)
+{
+ lnet_rc_data_t *rcd = NULL;
+ lnet_ping_info_t *pi;
+ int rc;
+ int i;
+
+ lnet_net_unlock(gateway->lp_cpt);
+
+ LIBCFS_ALLOC(rcd, sizeof(*rcd));
+ if (rcd == NULL)
+ goto out;
+
+ LNetInvalidateHandle(&rcd->rcd_mdh);
+ INIT_LIST_HEAD(&rcd->rcd_list);
+
+ LIBCFS_ALLOC(pi, LNET_PINGINFO_SIZE);
+ if (pi == NULL)
+ goto out;
+
+ for (i = 0; i < LNET_MAX_RTR_NIS; i++) {
+ pi->pi_ni[i].ns_nid = LNET_NID_ANY;
+ pi->pi_ni[i].ns_status = LNET_NI_STATUS_INVALID;
+ }
+ rcd->rcd_pinginfo = pi;
+
+ LASSERT(!LNetHandleIsInvalid(the_lnet.ln_rc_eqh));
+ rc = LNetMDBind((lnet_md_t){.start = pi,
+ .user_ptr = rcd,
+ .length = LNET_PINGINFO_SIZE,
+ .threshold = LNET_MD_THRESH_INF,
+ .options = LNET_MD_TRUNCATE,
+ .eq_handle = the_lnet.ln_rc_eqh},
+ LNET_UNLINK,
+ &rcd->rcd_mdh);
+ if (rc < 0) {
+ CERROR("Can't bind MD: %d\n", rc);
+ goto out;
+ }
+ LASSERT(rc == 0);
+
+ lnet_net_lock(gateway->lp_cpt);
+ /* router table changed or someone has created rcd for this gateway */
+ if (!lnet_isrouter(gateway) || gateway->lp_rcd != NULL) {
+ lnet_net_unlock(gateway->lp_cpt);
+ goto out;
+ }
+
+ lnet_peer_addref_locked(gateway);
+ rcd->rcd_gateway = gateway;
+ gateway->lp_rcd = rcd;
+ gateway->lp_ping_notsent = 0;
+
+ return rcd;
+
+ out:
+ if (rcd != NULL) {
+ if (!LNetHandleIsInvalid(rcd->rcd_mdh)) {
+ rc = LNetMDUnlink(rcd->rcd_mdh);
+ LASSERT(rc == 0);
+ }
+ lnet_destroy_rc_data(rcd);
+ }
+
+ lnet_net_lock(gateway->lp_cpt);
+ return gateway->lp_rcd;
+}
+
+static int
+lnet_router_check_interval(lnet_peer_t *rtr)
+{
+ int secs;
+
+ secs = rtr->lp_alive ? live_router_check_interval :
+ dead_router_check_interval;
+ if (secs < 0)
+ secs = 0;
+
+ return secs;
+}
+
+static void
+lnet_ping_router_locked(lnet_peer_t *rtr)
+{
+ lnet_rc_data_t *rcd = NULL;
+ unsigned long now = cfs_time_current();
+ int secs;
+
+ lnet_peer_addref_locked(rtr);
+
+ if (rtr->lp_ping_deadline != 0 && /* ping timed out? */
+ cfs_time_after(now, rtr->lp_ping_deadline))
+ lnet_notify_locked(rtr, 1, 0, now);
+
+ /* Run any outstanding notifications */
+ lnet_ni_notify_locked(rtr->lp_ni, rtr);
+
+ if (!lnet_isrouter(rtr) ||
+ the_lnet.ln_rc_state != LNET_RC_STATE_RUNNING) {
+ /* router table changed or router checker is shutting down */
+ lnet_peer_decref_locked(rtr);
+ return;
+ }
+
+ rcd = rtr->lp_rcd != NULL ?
+ rtr->lp_rcd : lnet_create_rc_data_locked(rtr);
+
+ if (rcd == NULL)
+ return;
+
+ secs = lnet_router_check_interval(rtr);
+
+ CDEBUG(D_NET,
+ "rtr %s %d: deadline %lu ping_notsent %d alive %d alive_count %d lp_ping_timestamp %lu\n",
+ libcfs_nid2str(rtr->lp_nid), secs,
+ rtr->lp_ping_deadline, rtr->lp_ping_notsent,
+ rtr->lp_alive, rtr->lp_alive_count, rtr->lp_ping_timestamp);
+
+ if (secs != 0 && !rtr->lp_ping_notsent &&
+ cfs_time_after(now, cfs_time_add(rtr->lp_ping_timestamp,
+ cfs_time_seconds(secs)))) {
+ int rc;
+ lnet_process_id_t id;
+ lnet_handle_md_t mdh;
+
+ id.nid = rtr->lp_nid;
+ id.pid = LUSTRE_SRV_LNET_PID;
+ CDEBUG(D_NET, "Check: %s\n", libcfs_id2str(id));
+
+ rtr->lp_ping_notsent = 1;
+ rtr->lp_ping_timestamp = now;
+
+ mdh = rcd->rcd_mdh;
+
+ if (rtr->lp_ping_deadline == 0) {
+ rtr->lp_ping_deadline =
+ cfs_time_shift(router_ping_timeout);
+ }
+
+ lnet_net_unlock(rtr->lp_cpt);
+
+ rc = LNetGet(LNET_NID_ANY, mdh, id, LNET_RESERVED_PORTAL,
+ LNET_PROTO_PING_MATCHBITS, 0);
+
+ lnet_net_lock(rtr->lp_cpt);
+ if (rc != 0)
+ rtr->lp_ping_notsent = 0; /* no event pending */
+ }
+
+ lnet_peer_decref_locked(rtr);
+}
+
+int
+lnet_router_checker_start(void)
+{
+ int rc;
+ int eqsz;
+
+ LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN);
+
+ if (check_routers_before_use &&
+ dead_router_check_interval <= 0) {
+ LCONSOLE_ERROR_MSG(0x10a, "'dead_router_check_interval' must be set if 'check_routers_before_use' is set\n");
+ return -EINVAL;
+ }
+
+ if (!the_lnet.ln_routing &&
+ live_router_check_interval <= 0 &&
+ dead_router_check_interval <= 0)
+ return 0;
+
+ sema_init(&the_lnet.ln_rc_signal, 0);
+ /* EQ size doesn't matter; the callback is guaranteed to get every
+ * event */
+ eqsz = 0;
+ rc = LNetEQAlloc(eqsz, lnet_router_checker_event,
+ &the_lnet.ln_rc_eqh);
+ if (rc != 0) {
+ CERROR("Can't allocate EQ(%d): %d\n", eqsz, rc);
+ return -ENOMEM;
+ }
+
+ the_lnet.ln_rc_state = LNET_RC_STATE_RUNNING;
+ rc = PTR_ERR(kthread_run(lnet_router_checker,
+ NULL, "router_checker"));
+ if (IS_ERR_VALUE(rc)) {
+ CERROR("Can't start router checker thread: %d\n", rc);
+ /* block until event callback signals exit */
+ down(&the_lnet.ln_rc_signal);
+ rc = LNetEQFree(the_lnet.ln_rc_eqh);
+ LASSERT(rc == 0);
+ the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN;
+ return -ENOMEM;
+ }
+
+ if (check_routers_before_use) {
+ /* Note that a helpful side-effect of pinging all known routers
+ * at startup is that it makes them drop stale connections they
+ * may have to a previous instance of me. */
+ lnet_wait_known_routerstate();
+ }
+
+ return 0;
+}
+
+void
+lnet_router_checker_stop(void)
+{
+ int rc;
+
+ if (the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN)
+ return;
+
+ LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING);
+ the_lnet.ln_rc_state = LNET_RC_STATE_STOPPING;
+
+ /* block until event callback signals exit */
+ down(&the_lnet.ln_rc_signal);
+ LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN);
+
+ rc = LNetEQFree(the_lnet.ln_rc_eqh);
+ LASSERT(rc == 0);
+}
+
+static void
+lnet_prune_rc_data(int wait_unlink)
+{
+ lnet_rc_data_t *rcd;
+ lnet_rc_data_t *tmp;
+ lnet_peer_t *lp;
+ struct list_head head;
+ int i = 2;
+
+ if (likely(the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING &&
+ list_empty(&the_lnet.ln_rcd_deathrow) &&
+ list_empty(&the_lnet.ln_rcd_zombie)))
+ return;
+
+ INIT_LIST_HEAD(&head);
+
+ lnet_net_lock(LNET_LOCK_EX);
+
+ if (the_lnet.ln_rc_state != LNET_RC_STATE_RUNNING) {
+ /* router checker is stopping, prune all */
+ list_for_each_entry(lp, &the_lnet.ln_routers,
+ lp_rtr_list) {
+ if (lp->lp_rcd == NULL)
+ continue;
+
+ LASSERT(list_empty(&lp->lp_rcd->rcd_list));
+ list_add(&lp->lp_rcd->rcd_list,
+ &the_lnet.ln_rcd_deathrow);
+ lp->lp_rcd = NULL;
+ }
+ }
+
+ /* unlink all RCDs on deathrow list */
+ list_splice_init(&the_lnet.ln_rcd_deathrow, &head);
+
+ if (!list_empty(&head)) {
+ lnet_net_unlock(LNET_LOCK_EX);
+
+ list_for_each_entry(rcd, &head, rcd_list)
+ LNetMDUnlink(rcd->rcd_mdh);
+
+ lnet_net_lock(LNET_LOCK_EX);
+ }
+
+ list_splice_init(&head, &the_lnet.ln_rcd_zombie);
+
+ /* release all zombie RCDs */
+ while (!list_empty(&the_lnet.ln_rcd_zombie)) {
+ list_for_each_entry_safe(rcd, tmp, &the_lnet.ln_rcd_zombie,
+ rcd_list) {
+ if (LNetHandleIsInvalid(rcd->rcd_mdh))
+ list_move(&rcd->rcd_list, &head);
+ }
+
+ wait_unlink = wait_unlink &&
+ !list_empty(&the_lnet.ln_rcd_zombie);
+
+ lnet_net_unlock(LNET_LOCK_EX);
+
+ while (!list_empty(&head)) {
+ rcd = list_entry(head.next,
+ lnet_rc_data_t, rcd_list);
+ list_del_init(&rcd->rcd_list);
+ lnet_destroy_rc_data(rcd);
+ }
+
+ if (!wait_unlink)
+ return;
+
+ i++;
+ CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET,
+ "Waiting for rc buffers to unlink\n");
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(cfs_time_seconds(1) / 4);
+
+ lnet_net_lock(LNET_LOCK_EX);
+ }
+
+ lnet_net_unlock(LNET_LOCK_EX);
+}
+
+
+#if defined(LNET_ROUTER)
+
+static int
+lnet_router_checker(void *arg)
+{
+ lnet_peer_t *rtr;
+ struct list_head *entry;
+
+ cfs_block_allsigs();
+
+ LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING);
+
+ while (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING) {
+ __u64 version;
+ int cpt;
+ int cpt2;
+
+ cpt = lnet_net_lock_current();
+rescan:
+ version = the_lnet.ln_routers_version;
+
+ list_for_each(entry, &the_lnet.ln_routers) {
+ rtr = list_entry(entry, lnet_peer_t, lp_rtr_list);
+
+ cpt2 = lnet_cpt_of_nid_locked(rtr->lp_nid);
+ if (cpt != cpt2) {
+ lnet_net_unlock(cpt);
+ cpt = cpt2;
+ lnet_net_lock(cpt);
+ /* the routers list has changed */
+ if (version != the_lnet.ln_routers_version)
+ goto rescan;
+ }
+
+ lnet_ping_router_locked(rtr);
+
+ /* NB dropped lock */
+ if (version != the_lnet.ln_routers_version) {
+ /* the routers list has changed */
+ goto rescan;
+ }
+ }
+
+ if (the_lnet.ln_routing)
+ lnet_update_ni_status_locked();
+
+ lnet_net_unlock(cpt);
+
+ lnet_prune_rc_data(0); /* don't wait for UNLINK */
+
+ /* Call schedule_timeout() here always adds 1 to load average
+ * because kernel counts # active tasks as nr_running
+ * + nr_uninterruptible. */
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(cfs_time_seconds(1));
+ }
+
+ LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_STOPPING);
+
+ lnet_prune_rc_data(1); /* wait for UNLINK */
+
+ the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN;
+ up(&the_lnet.ln_rc_signal);
+ /* The unlink event callback will signal final completion */
+ return 0;
+}
+
+static void
+lnet_destroy_rtrbuf(lnet_rtrbuf_t *rb, int npages)
+{
+ int sz = offsetof(lnet_rtrbuf_t, rb_kiov[npages]);
+
+ while (--npages >= 0)
+ __free_page(rb->rb_kiov[npages].kiov_page);
+
+ LIBCFS_FREE(rb, sz);
+}
+
+static lnet_rtrbuf_t *
+lnet_new_rtrbuf(lnet_rtrbufpool_t *rbp, int cpt)
+{
+ int npages = rbp->rbp_npages;
+ int sz = offsetof(lnet_rtrbuf_t, rb_kiov[npages]);
+ struct page *page;
+ lnet_rtrbuf_t *rb;
+ int i;
+
+ LIBCFS_CPT_ALLOC(rb, lnet_cpt_table(), cpt, sz);
+ if (rb == NULL)
+ return NULL;
+
+ rb->rb_pool = rbp;
+
+ for (i = 0; i < npages; i++) {
+ page = alloc_pages_node(
+ cfs_cpt_spread_node(lnet_cpt_table(), cpt),
+ __GFP_ZERO | GFP_IOFS, 0);
+ if (page == NULL) {
+ while (--i >= 0)
+ __free_page(rb->rb_kiov[i].kiov_page);
+
+ LIBCFS_FREE(rb, sz);
+ return NULL;
+ }
+
+ rb->rb_kiov[i].kiov_len = PAGE_CACHE_SIZE;
+ rb->rb_kiov[i].kiov_offset = 0;
+ rb->rb_kiov[i].kiov_page = page;
+ }
+
+ return rb;
+}
+
+static void
+lnet_rtrpool_free_bufs(lnet_rtrbufpool_t *rbp)
+{
+ int npages = rbp->rbp_npages;
+ int nbuffers = 0;
+ lnet_rtrbuf_t *rb;
+
+ if (rbp->rbp_nbuffers == 0) /* not initialized or already freed */
+ return;
+
+ LASSERT(list_empty(&rbp->rbp_msgs));
+ LASSERT(rbp->rbp_credits == rbp->rbp_nbuffers);
+
+ while (!list_empty(&rbp->rbp_bufs)) {
+ LASSERT(rbp->rbp_credits > 0);
+
+ rb = list_entry(rbp->rbp_bufs.next,
+ lnet_rtrbuf_t, rb_list);
+ list_del(&rb->rb_list);
+ lnet_destroy_rtrbuf(rb, npages);
+ nbuffers++;
+ }
+
+ LASSERT(rbp->rbp_nbuffers == nbuffers);
+ LASSERT(rbp->rbp_credits == nbuffers);
+
+ rbp->rbp_nbuffers = rbp->rbp_credits = 0;
+}
+
+static int
+lnet_rtrpool_alloc_bufs(lnet_rtrbufpool_t *rbp, int nbufs, int cpt)
+{
+ lnet_rtrbuf_t *rb;
+ int i;
+
+ if (rbp->rbp_nbuffers != 0) {
+ LASSERT(rbp->rbp_nbuffers == nbufs);
+ return 0;
+ }
+
+ for (i = 0; i < nbufs; i++) {
+ rb = lnet_new_rtrbuf(rbp, cpt);
+
+ if (rb == NULL) {
+ CERROR("Failed to allocate %d router bufs of %d pages\n",
+ nbufs, rbp->rbp_npages);
+ return -ENOMEM;
+ }
+
+ rbp->rbp_nbuffers++;
+ rbp->rbp_credits++;
+ rbp->rbp_mincredits++;
+ list_add(&rb->rb_list, &rbp->rbp_bufs);
+
+ /* No allocation "under fire" */
+ /* Otherwise we'd need code to schedule blocked msgs etc */
+ LASSERT(!the_lnet.ln_routing);
+ }
+
+ LASSERT(rbp->rbp_credits == nbufs);
+ return 0;
+}
+
+static void
+lnet_rtrpool_init(lnet_rtrbufpool_t *rbp, int npages)
+{
+ INIT_LIST_HEAD(&rbp->rbp_msgs);
+ INIT_LIST_HEAD(&rbp->rbp_bufs);
+
+ rbp->rbp_npages = npages;
+ rbp->rbp_credits = 0;
+ rbp->rbp_mincredits = 0;
+}
+
+void
+lnet_rtrpools_free(void)
+{
+ lnet_rtrbufpool_t *rtrp;
+ int i;
+
+ if (the_lnet.ln_rtrpools == NULL) /* uninitialized or freed */
+ return;
+
+ cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) {
+ lnet_rtrpool_free_bufs(&rtrp[0]);
+ lnet_rtrpool_free_bufs(&rtrp[1]);
+ lnet_rtrpool_free_bufs(&rtrp[2]);
+ }
+
+ cfs_percpt_free(the_lnet.ln_rtrpools);
+ the_lnet.ln_rtrpools = NULL;
+}
+
+static int
+lnet_nrb_tiny_calculate(int npages)
+{
+ int nrbs = LNET_NRB_TINY;
+
+ if (tiny_router_buffers < 0) {
+ LCONSOLE_ERROR_MSG(0x10c,
+ "tiny_router_buffers=%d invalid when routing enabled\n",
+ tiny_router_buffers);
+ return -1;
+ }
+
+ if (tiny_router_buffers > 0)
+ nrbs = tiny_router_buffers;
+
+ nrbs /= LNET_CPT_NUMBER;
+ return max(nrbs, LNET_NRB_TINY_MIN);
+}
+
+static int
+lnet_nrb_small_calculate(int npages)
+{
+ int nrbs = LNET_NRB_SMALL;
+
+ if (small_router_buffers < 0) {
+ LCONSOLE_ERROR_MSG(0x10c,
+ "small_router_buffers=%d invalid when routing enabled\n",
+ small_router_buffers);
+ return -1;
+ }
+
+ if (small_router_buffers > 0)
+ nrbs = small_router_buffers;
+
+ nrbs /= LNET_CPT_NUMBER;
+ return max(nrbs, LNET_NRB_SMALL_MIN);
+}
+
+static int
+lnet_nrb_large_calculate(int npages)
+{
+ int nrbs = LNET_NRB_LARGE;
+
+ if (large_router_buffers < 0) {
+ LCONSOLE_ERROR_MSG(0x10c,
+ "large_router_buffers=%d invalid when routing enabled\n",
+ large_router_buffers);
+ return -1;
+ }
+
+ if (large_router_buffers > 0)
+ nrbs = large_router_buffers;
+
+ nrbs /= LNET_CPT_NUMBER;
+ return max(nrbs, LNET_NRB_LARGE_MIN);
+}
+
+int
+lnet_rtrpools_alloc(int im_a_router)
+{
+ lnet_rtrbufpool_t *rtrp;
+ int large_pages;
+ int small_pages = 1;
+ int nrb_tiny;
+ int nrb_small;
+ int nrb_large;
+ int rc;
+ int i;
+
+ large_pages = (LNET_MTU + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+
+ if (!strcmp(forwarding, "")) {
+ /* not set either way */
+ if (!im_a_router)
+ return 0;
+ } else if (!strcmp(forwarding, "disabled")) {
+ /* explicitly disabled */
+ return 0;
+ } else if (!strcmp(forwarding, "enabled")) {
+ /* explicitly enabled */
+ } else {
+ LCONSOLE_ERROR_MSG(0x10b, "'forwarding' not set to either 'enabled' or 'disabled'\n");
+ return -EINVAL;
+ }
+
+ nrb_tiny = lnet_nrb_tiny_calculate(0);
+ if (nrb_tiny < 0)
+ return -EINVAL;
+
+ nrb_small = lnet_nrb_small_calculate(small_pages);
+ if (nrb_small < 0)
+ return -EINVAL;
+
+ nrb_large = lnet_nrb_large_calculate(large_pages);
+ if (nrb_large < 0)
+ return -EINVAL;
+
+ the_lnet.ln_rtrpools = cfs_percpt_alloc(lnet_cpt_table(),
+ LNET_NRBPOOLS *
+ sizeof(lnet_rtrbufpool_t));
+ if (the_lnet.ln_rtrpools == NULL) {
+ LCONSOLE_ERROR_MSG(0x10c,
+ "Failed to initialize router buffe pool\n");
+ return -ENOMEM;
+ }
+
+ cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) {
+ lnet_rtrpool_init(&rtrp[0], 0);
+ rc = lnet_rtrpool_alloc_bufs(&rtrp[0], nrb_tiny, i);
+ if (rc != 0)
+ goto failed;
+
+ lnet_rtrpool_init(&rtrp[1], small_pages);
+ rc = lnet_rtrpool_alloc_bufs(&rtrp[1], nrb_small, i);
+ if (rc != 0)
+ goto failed;
+
+ lnet_rtrpool_init(&rtrp[2], large_pages);
+ rc = lnet_rtrpool_alloc_bufs(&rtrp[2], nrb_large, i);
+ if (rc != 0)
+ goto failed;
+ }
+
+ lnet_net_lock(LNET_LOCK_EX);
+ the_lnet.ln_routing = 1;
+ lnet_net_unlock(LNET_LOCK_EX);
+
+ return 0;
+
+ failed:
+ lnet_rtrpools_free();
+ return rc;
+}
+
+int
+lnet_notify(lnet_ni_t *ni, lnet_nid_t nid, int alive, unsigned long when)
+{
+ struct lnet_peer *lp = NULL;
+ unsigned long now = cfs_time_current();
+ int cpt = lnet_cpt_of_nid(nid);
+
+ LASSERT(!in_interrupt ());
+
+ CDEBUG(D_NET, "%s notifying %s: %s\n",
+ (ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid),
+ libcfs_nid2str(nid),
+ alive ? "up" : "down");
+
+ if (ni != NULL &&
+ LNET_NIDNET(ni->ni_nid) != LNET_NIDNET(nid)) {
+ CWARN("Ignoring notification of %s %s by %s (different net)\n",
+ libcfs_nid2str(nid), alive ? "birth" : "death",
+ libcfs_nid2str(ni->ni_nid));
+ return -EINVAL;
+ }
+
+ /* can't do predictions... */
+ if (cfs_time_after(when, now)) {
+ CWARN("Ignoring prediction from %s of %s %s %ld seconds in the future\n",
+ (ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid),
+ libcfs_nid2str(nid), alive ? "up" : "down",
+ cfs_duration_sec(cfs_time_sub(when, now)));
+ return -EINVAL;
+ }
+
+ if (ni != NULL && !alive && /* LND telling me she's down */
+ !auto_down) { /* auto-down disabled */
+ CDEBUG(D_NET, "Auto-down disabled\n");
+ return 0;
+ }
+
+ lnet_net_lock(cpt);
+
+ if (the_lnet.ln_shutdown) {
+ lnet_net_unlock(cpt);
+ return -ESHUTDOWN;
+ }
+
+ lp = lnet_find_peer_locked(the_lnet.ln_peer_tables[cpt], nid);
+ if (lp == NULL) {
+ /* nid not found */
+ lnet_net_unlock(cpt);
+ CDEBUG(D_NET, "%s not found\n", libcfs_nid2str(nid));
+ return 0;
+ }
+
+ /* We can't fully trust LND on reporting exact peer last_alive
+ * if he notifies us about dead peer. For example ksocklnd can
+ * call us with when == _time_when_the_node_was_booted_ if
+ * no connections were successfully established */
+ if (ni != NULL && !alive && when < lp->lp_last_alive)
+ when = lp->lp_last_alive;
+
+ lnet_notify_locked(lp, ni == NULL, alive, when);
+
+ lnet_ni_notify_locked(ni, lp);
+
+ lnet_peer_decref_locked(lp);
+
+ lnet_net_unlock(cpt);
+ return 0;
+}
+EXPORT_SYMBOL(lnet_notify);
+
+void
+lnet_get_tunables(void)
+{
+}
+
+#else
+
+int
+lnet_notify(lnet_ni_t *ni, lnet_nid_t nid, int alive, unsigned long when)
+{
+ return -EOPNOTSUPP;
+}
+
+void
+lnet_router_checker(void)
+{
+ static time_t last;
+ static int running;
+
+ time_t now = get_seconds();
+ int interval = now - last;
+ int rc;
+ __u64 version;
+ lnet_peer_t *rtr;
+
+ /* It's no use to call me again within a sec - all intervals and
+ * timeouts are measured in seconds */
+ if (last != 0 && interval < 2)
+ return;
+
+ if (last != 0 &&
+ interval > max(live_router_check_interval,
+ dead_router_check_interval))
+ CNETERR("Checker(%d/%d) not called for %d seconds\n",
+ live_router_check_interval, dead_router_check_interval,
+ interval);
+
+ LASSERT(LNET_CPT_NUMBER == 1);
+
+ lnet_net_lock(0);
+ LASSERT(!running); /* recursion check */
+ running = 1;
+ lnet_net_unlock(0);
+
+ last = now;
+
+ if (the_lnet.ln_rc_state == LNET_RC_STATE_STOPPING)
+ lnet_prune_rc_data(0); /* unlink all rcd and nowait */
+
+ /* consume all pending events */
+ while (1) {
+ int i;
+ lnet_event_t ev;
+
+ /* NB ln_rc_eqh must be the 1st in 'eventqs' otherwise the
+ * recursion breaker in LNetEQPoll would fail */
+ rc = LNetEQPoll(&the_lnet.ln_rc_eqh, 1, 0, &ev, &i);
+ if (rc == 0) /* no event pending */
+ break;
+
+ /* NB a lost SENT prevents me from pinging a router again */
+ if (rc == -EOVERFLOW) {
+ CERROR("Dropped an event!!!\n");
+ abort();
+ }
+
+ LASSERT(rc == 1);
+
+ lnet_router_checker_event(&ev);
+ }
+
+ if (the_lnet.ln_rc_state == LNET_RC_STATE_STOPPING) {
+ lnet_prune_rc_data(1); /* release rcd */
+ the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN;
+ running = 0;
+ return;
+ }
+
+ LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING);
+
+ lnet_net_lock(0);
+
+ version = the_lnet.ln_routers_version;
+ list_for_each_entry(rtr, &the_lnet.ln_routers, lp_rtr_list) {
+ lnet_ping_router_locked(rtr);
+ LASSERT(version == the_lnet.ln_routers_version);
+ }
+
+ lnet_net_unlock(0);
+
+ running = 0; /* lock only needed for the recursion check */
+}
+
+/* NB lnet_peers_start_down depends on me,
+ * so must be called before any peer creation */
+void
+lnet_get_tunables(void)
+{
+ char *s;
+
+ s = getenv("LNET_ROUTER_PING_TIMEOUT");
+ if (s != NULL)
+ router_ping_timeout = atoi(s);
+
+ s = getenv("LNET_LIVE_ROUTER_CHECK_INTERVAL");
+ if (s != NULL)
+ live_router_check_interval = atoi(s);
+
+ s = getenv("LNET_DEAD_ROUTER_CHECK_INTERVAL");
+ if (s != NULL)
+ dead_router_check_interval = atoi(s);
+
+ /* This replaces old lnd_notify mechanism */
+ check_routers_before_use = 1;
+ if (dead_router_check_interval <= 0)
+ dead_router_check_interval = 30;
+}
+
+void
+lnet_rtrpools_free(void)
+{
+}
+
+int
+lnet_rtrpools_alloc(int im_a_arouter)
+{
+ return 0;
+}
+
+#endif
diff --git a/kernel/drivers/staging/lustre/lnet/lnet/router_proc.c b/kernel/drivers/staging/lustre/lnet/lnet/router_proc.c
new file mode 100644
index 000000000..c055afc86
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lnet/lnet/router_proc.c
@@ -0,0 +1,968 @@
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ * This file is part of Portals
+ * http://sourceforge.net/projects/sandiaportals/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include "../../include/linux/libcfs/libcfs.h"
+#include "../../include/linux/lnet/lib-lnet.h"
+
+#if defined(LNET_ROUTER)
+
+/* This is really lnet_proc.c. You might need to update sanity test 215
+ * if any file format is changed. */
+
+static struct ctl_table_header *lnet_table_header;
+
+#define CTL_LNET (0x100)
+enum {
+ PSDEV_LNET_STATS = 100,
+ PSDEV_LNET_ROUTES,
+ PSDEV_LNET_ROUTERS,
+ PSDEV_LNET_PEERS,
+ PSDEV_LNET_BUFFERS,
+ PSDEV_LNET_NIS,
+ PSDEV_LNET_PTL_ROTOR,
+};
+
+#define LNET_LOFFT_BITS (sizeof(loff_t) * 8)
+/*
+ * NB: max allowed LNET_CPT_BITS is 8 on 64-bit system and 2 on 32-bit system
+ */
+#define LNET_PROC_CPT_BITS (LNET_CPT_BITS + 1)
+/* change version, 16 bits or 8 bits */
+#define LNET_PROC_VER_BITS max_t(size_t, min_t(size_t, LNET_LOFFT_BITS, 64) / 4, 8)
+
+#define LNET_PROC_HASH_BITS LNET_PEER_HASH_BITS
+/*
+ * bits for peer hash offset
+ * NB: we don't use the highest bit of *ppos because it's signed
+ */
+#define LNET_PROC_HOFF_BITS (LNET_LOFFT_BITS - \
+ LNET_PROC_CPT_BITS - \
+ LNET_PROC_VER_BITS - \
+ LNET_PROC_HASH_BITS - 1)
+/* bits for hash index + position */
+#define LNET_PROC_HPOS_BITS (LNET_PROC_HASH_BITS + LNET_PROC_HOFF_BITS)
+/* bits for peer hash table + hash version */
+#define LNET_PROC_VPOS_BITS (LNET_PROC_HPOS_BITS + LNET_PROC_VER_BITS)
+
+#define LNET_PROC_CPT_MASK ((1ULL << LNET_PROC_CPT_BITS) - 1)
+#define LNET_PROC_VER_MASK ((1ULL << LNET_PROC_VER_BITS) - 1)
+#define LNET_PROC_HASH_MASK ((1ULL << LNET_PROC_HASH_BITS) - 1)
+#define LNET_PROC_HOFF_MASK ((1ULL << LNET_PROC_HOFF_BITS) - 1)
+
+#define LNET_PROC_CPT_GET(pos) \
+ (int)(((pos) >> LNET_PROC_VPOS_BITS) & LNET_PROC_CPT_MASK)
+
+#define LNET_PROC_VER_GET(pos) \
+ (int)(((pos) >> LNET_PROC_HPOS_BITS) & LNET_PROC_VER_MASK)
+
+#define LNET_PROC_HASH_GET(pos) \
+ (int)(((pos) >> LNET_PROC_HOFF_BITS) & LNET_PROC_HASH_MASK)
+
+#define LNET_PROC_HOFF_GET(pos) \
+ (int)((pos) & LNET_PROC_HOFF_MASK)
+
+#define LNET_PROC_POS_MAKE(cpt, ver, hash, off) \
+ (((((loff_t)(cpt)) & LNET_PROC_CPT_MASK) << LNET_PROC_VPOS_BITS) | \
+ ((((loff_t)(ver)) & LNET_PROC_VER_MASK) << LNET_PROC_HPOS_BITS) | \
+ ((((loff_t)(hash)) & LNET_PROC_HASH_MASK) << LNET_PROC_HOFF_BITS) | \
+ ((off) & LNET_PROC_HOFF_MASK))
+
+#define LNET_PROC_VERSION(v) ((unsigned int)((v) & LNET_PROC_VER_MASK))
+
+static int proc_call_handler(void *data, int write, loff_t *ppos,
+ void __user *buffer, size_t *lenp,
+ int (*handler)(void *data, int write,
+ loff_t pos, void __user *buffer, int len))
+{
+ int rc = handler(data, write, *ppos, buffer, *lenp);
+
+ if (rc < 0)
+ return rc;
+
+ if (write) {
+ *ppos += *lenp;
+ } else {
+ *lenp = rc;
+ *ppos += rc;
+ }
+ return 0;
+}
+
+static int __proc_lnet_stats(void *data, int write,
+ loff_t pos, void __user *buffer, int nob)
+{
+ int rc;
+ lnet_counters_t *ctrs;
+ int len;
+ char *tmpstr;
+ const int tmpsiz = 256; /* 7 %u and 4 %llu */
+
+ if (write) {
+ lnet_counters_reset();
+ return 0;
+ }
+
+ /* read */
+
+ LIBCFS_ALLOC(ctrs, sizeof(*ctrs));
+ if (ctrs == NULL)
+ return -ENOMEM;
+
+ LIBCFS_ALLOC(tmpstr, tmpsiz);
+ if (tmpstr == NULL) {
+ LIBCFS_FREE(ctrs, sizeof(*ctrs));
+ return -ENOMEM;
+ }
+
+ lnet_counters_get(ctrs);
+
+ len = snprintf(tmpstr, tmpsiz,
+ "%u %u %u %u %u %u %u %llu %llu %llu %llu",
+ ctrs->msgs_alloc, ctrs->msgs_max,
+ ctrs->errors,
+ ctrs->send_count, ctrs->recv_count,
+ ctrs->route_count, ctrs->drop_count,
+ ctrs->send_length, ctrs->recv_length,
+ ctrs->route_length, ctrs->drop_length);
+
+ if (pos >= min_t(int, len, strlen(tmpstr)))
+ rc = 0;
+ else
+ rc = cfs_trace_copyout_string(buffer, nob,
+ tmpstr + pos, "\n");
+
+ LIBCFS_FREE(tmpstr, tmpsiz);
+ LIBCFS_FREE(ctrs, sizeof(*ctrs));
+ return rc;
+}
+
+static int proc_lnet_stats(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return proc_call_handler(table->data, write, ppos, buffer, lenp,
+ __proc_lnet_stats);
+}
+
+static int proc_lnet_routes(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ const int tmpsiz = 256;
+ char *tmpstr;
+ char *s;
+ int rc = 0;
+ int len;
+ int ver;
+ int off;
+
+ CLASSERT(sizeof(loff_t) >= 4);
+
+ off = LNET_PROC_HOFF_GET(*ppos);
+ ver = LNET_PROC_VER_GET(*ppos);
+
+ LASSERT(!write);
+
+ if (*lenp == 0)
+ return 0;
+
+ LIBCFS_ALLOC(tmpstr, tmpsiz);
+ if (tmpstr == NULL)
+ return -ENOMEM;
+
+ s = tmpstr; /* points to current position in tmpstr[] */
+
+ if (*ppos == 0) {
+ s += snprintf(s, tmpstr + tmpsiz - s, "Routing %s\n",
+ the_lnet.ln_routing ? "enabled" : "disabled");
+ LASSERT(tmpstr + tmpsiz - s > 0);
+
+ s += snprintf(s, tmpstr + tmpsiz - s, "%-8s %4s %8s %7s %s\n",
+ "net", "hops", "priority", "state", "router");
+ LASSERT(tmpstr + tmpsiz - s > 0);
+
+ lnet_net_lock(0);
+ ver = (unsigned int)the_lnet.ln_remote_nets_version;
+ lnet_net_unlock(0);
+ *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off);
+ } else {
+ struct list_head *n;
+ struct list_head *r;
+ lnet_route_t *route = NULL;
+ lnet_remotenet_t *rnet = NULL;
+ int skip = off - 1;
+ struct list_head *rn_list;
+ int i;
+
+ lnet_net_lock(0);
+
+ if (ver != LNET_PROC_VERSION(the_lnet.ln_remote_nets_version)) {
+ lnet_net_unlock(0);
+ LIBCFS_FREE(tmpstr, tmpsiz);
+ return -ESTALE;
+ }
+
+ for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE && route == NULL;
+ i++) {
+ rn_list = &the_lnet.ln_remote_nets_hash[i];
+
+ n = rn_list->next;
+
+ while (n != rn_list && route == NULL) {
+ rnet = list_entry(n, lnet_remotenet_t,
+ lrn_list);
+
+ r = rnet->lrn_routes.next;
+
+ while (r != &rnet->lrn_routes) {
+ lnet_route_t *re =
+ list_entry(r, lnet_route_t,
+ lr_list);
+ if (skip == 0) {
+ route = re;
+ break;
+ }
+
+ skip--;
+ r = r->next;
+ }
+
+ n = n->next;
+ }
+ }
+
+ if (route != NULL) {
+ __u32 net = rnet->lrn_net;
+ unsigned int hops = route->lr_hops;
+ unsigned int priority = route->lr_priority;
+ lnet_nid_t nid = route->lr_gateway->lp_nid;
+ int alive = route->lr_gateway->lp_alive;
+
+ s += snprintf(s, tmpstr + tmpsiz - s,
+ "%-8s %4u %8u %7s %s\n",
+ libcfs_net2str(net), hops,
+ priority,
+ alive ? "up" : "down",
+ libcfs_nid2str(nid));
+ LASSERT(tmpstr + tmpsiz - s > 0);
+ }
+
+ lnet_net_unlock(0);
+ }
+
+ len = s - tmpstr; /* how many bytes was written */
+
+ if (len > *lenp) { /* linux-supplied buffer is too small */
+ rc = -EINVAL;
+ } else if (len > 0) { /* wrote something */
+ if (copy_to_user(buffer, tmpstr, len))
+ rc = -EFAULT;
+ else {
+ off += 1;
+ *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off);
+ }
+ }
+
+ LIBCFS_FREE(tmpstr, tmpsiz);
+
+ if (rc == 0)
+ *lenp = len;
+
+ return rc;
+}
+
+static int proc_lnet_routers(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int rc = 0;
+ char *tmpstr;
+ char *s;
+ const int tmpsiz = 256;
+ int len;
+ int ver;
+ int off;
+
+ off = LNET_PROC_HOFF_GET(*ppos);
+ ver = LNET_PROC_VER_GET(*ppos);
+
+ LASSERT(!write);
+
+ if (*lenp == 0)
+ return 0;
+
+ LIBCFS_ALLOC(tmpstr, tmpsiz);
+ if (tmpstr == NULL)
+ return -ENOMEM;
+
+ s = tmpstr; /* points to current position in tmpstr[] */
+
+ if (*ppos == 0) {
+ s += snprintf(s, tmpstr + tmpsiz - s,
+ "%-4s %7s %9s %6s %12s %9s %8s %7s %s\n",
+ "ref", "rtr_ref", "alive_cnt", "state",
+ "last_ping", "ping_sent", "deadline",
+ "down_ni", "router");
+ LASSERT(tmpstr + tmpsiz - s > 0);
+
+ lnet_net_lock(0);
+ ver = (unsigned int)the_lnet.ln_routers_version;
+ lnet_net_unlock(0);
+ *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off);
+ } else {
+ struct list_head *r;
+ struct lnet_peer *peer = NULL;
+ int skip = off - 1;
+
+ lnet_net_lock(0);
+
+ if (ver != LNET_PROC_VERSION(the_lnet.ln_routers_version)) {
+ lnet_net_unlock(0);
+
+ LIBCFS_FREE(tmpstr, tmpsiz);
+ return -ESTALE;
+ }
+
+ r = the_lnet.ln_routers.next;
+
+ while (r != &the_lnet.ln_routers) {
+ lnet_peer_t *lp = list_entry(r, lnet_peer_t,
+ lp_rtr_list);
+
+ if (skip == 0) {
+ peer = lp;
+ break;
+ }
+
+ skip--;
+ r = r->next;
+ }
+
+ if (peer != NULL) {
+ lnet_nid_t nid = peer->lp_nid;
+ unsigned long now = cfs_time_current();
+ unsigned long deadline = peer->lp_ping_deadline;
+ int nrefs = peer->lp_refcount;
+ int nrtrrefs = peer->lp_rtr_refcount;
+ int alive_cnt = peer->lp_alive_count;
+ int alive = peer->lp_alive;
+ int pingsent = !peer->lp_ping_notsent;
+ int last_ping = cfs_duration_sec(cfs_time_sub(now,
+ peer->lp_ping_timestamp));
+ int down_ni = 0;
+ lnet_route_t *rtr;
+
+ if ((peer->lp_ping_feats &
+ LNET_PING_FEAT_NI_STATUS) != 0) {
+ list_for_each_entry(rtr, &peer->lp_routes,
+ lr_gwlist) {
+ /* downis on any route should be the
+ * number of downis on the gateway */
+ if (rtr->lr_downis != 0) {
+ down_ni = rtr->lr_downis;
+ break;
+ }
+ }
+ }
+
+ if (deadline == 0)
+ s += snprintf(s, tmpstr + tmpsiz - s,
+ "%-4d %7d %9d %6s %12d %9d %8s %7d %s\n",
+ nrefs, nrtrrefs, alive_cnt,
+ alive ? "up" : "down", last_ping,
+ pingsent, "NA", down_ni,
+ libcfs_nid2str(nid));
+ else
+ s += snprintf(s, tmpstr + tmpsiz - s,
+ "%-4d %7d %9d %6s %12d %9d %8lu %7d %s\n",
+ nrefs, nrtrrefs, alive_cnt,
+ alive ? "up" : "down", last_ping,
+ pingsent,
+ cfs_duration_sec(cfs_time_sub(deadline, now)),
+ down_ni, libcfs_nid2str(nid));
+ LASSERT(tmpstr + tmpsiz - s > 0);
+ }
+
+ lnet_net_unlock(0);
+ }
+
+ len = s - tmpstr; /* how many bytes was written */
+
+ if (len > *lenp) { /* linux-supplied buffer is too small */
+ rc = -EINVAL;
+ } else if (len > 0) { /* wrote something */
+ if (copy_to_user(buffer, tmpstr, len))
+ rc = -EFAULT;
+ else {
+ off += 1;
+ *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off);
+ }
+ }
+
+ LIBCFS_FREE(tmpstr, tmpsiz);
+
+ if (rc == 0)
+ *lenp = len;
+
+ return rc;
+}
+
+static int proc_lnet_peers(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ const int tmpsiz = 256;
+ struct lnet_peer_table *ptable;
+ char *tmpstr;
+ char *s;
+ int cpt = LNET_PROC_CPT_GET(*ppos);
+ int ver = LNET_PROC_VER_GET(*ppos);
+ int hash = LNET_PROC_HASH_GET(*ppos);
+ int hoff = LNET_PROC_HOFF_GET(*ppos);
+ int rc = 0;
+ int len;
+
+ CLASSERT(LNET_PROC_HASH_BITS >= LNET_PEER_HASH_BITS);
+ LASSERT(!write);
+
+ if (*lenp == 0)
+ return 0;
+
+ if (cpt >= LNET_CPT_NUMBER) {
+ *lenp = 0;
+ return 0;
+ }
+
+ LIBCFS_ALLOC(tmpstr, tmpsiz);
+ if (tmpstr == NULL)
+ return -ENOMEM;
+
+ s = tmpstr; /* points to current position in tmpstr[] */
+
+ if (*ppos == 0) {
+ s += snprintf(s, tmpstr + tmpsiz - s,
+ "%-24s %4s %5s %5s %5s %5s %5s %5s %5s %s\n",
+ "nid", "refs", "state", "last", "max",
+ "rtr", "min", "tx", "min", "queue");
+ LASSERT(tmpstr + tmpsiz - s > 0);
+
+ hoff++;
+ } else {
+ struct lnet_peer *peer;
+ struct list_head *p;
+ int skip;
+ again:
+ p = NULL;
+ peer = NULL;
+ skip = hoff - 1;
+
+ lnet_net_lock(cpt);
+ ptable = the_lnet.ln_peer_tables[cpt];
+ if (hoff == 1)
+ ver = LNET_PROC_VERSION(ptable->pt_version);
+
+ if (ver != LNET_PROC_VERSION(ptable->pt_version)) {
+ lnet_net_unlock(cpt);
+ LIBCFS_FREE(tmpstr, tmpsiz);
+ return -ESTALE;
+ }
+
+ while (hash < LNET_PEER_HASH_SIZE) {
+ if (p == NULL)
+ p = ptable->pt_hash[hash].next;
+
+ while (p != &ptable->pt_hash[hash]) {
+ lnet_peer_t *lp = list_entry(p, lnet_peer_t,
+ lp_hashlist);
+ if (skip == 0) {
+ peer = lp;
+
+ /* minor optimization: start from idx+1
+ * on next iteration if we've just
+ * drained lp_hashlist */
+ if (lp->lp_hashlist.next ==
+ &ptable->pt_hash[hash]) {
+ hoff = 1;
+ hash++;
+ } else {
+ hoff++;
+ }
+
+ break;
+ }
+
+ skip--;
+ p = lp->lp_hashlist.next;
+ }
+
+ if (peer != NULL)
+ break;
+
+ p = NULL;
+ hoff = 1;
+ hash++;
+ }
+
+ if (peer != NULL) {
+ lnet_nid_t nid = peer->lp_nid;
+ int nrefs = peer->lp_refcount;
+ int lastalive = -1;
+ char *aliveness = "NA";
+ int maxcr = peer->lp_ni->ni_peertxcredits;
+ int txcr = peer->lp_txcredits;
+ int mintxcr = peer->lp_mintxcredits;
+ int rtrcr = peer->lp_rtrcredits;
+ int minrtrcr = peer->lp_minrtrcredits;
+ int txqnob = peer->lp_txqnob;
+
+ if (lnet_isrouter(peer) ||
+ lnet_peer_aliveness_enabled(peer))
+ aliveness = peer->lp_alive ? "up" : "down";
+
+ if (lnet_peer_aliveness_enabled(peer)) {
+ unsigned long now = cfs_time_current();
+ long delta;
+
+ delta = cfs_time_sub(now, peer->lp_last_alive);
+ lastalive = cfs_duration_sec(delta);
+
+ /* No need to mess up peers contents with
+ * arbitrarily long integers - it suffices to
+ * know that lastalive is more than 10000s old
+ */
+ if (lastalive >= 10000)
+ lastalive = 9999;
+ }
+
+ lnet_net_unlock(cpt);
+
+ s += snprintf(s, tmpstr + tmpsiz - s,
+ "%-24s %4d %5s %5d %5d %5d %5d %5d %5d %d\n",
+ libcfs_nid2str(nid), nrefs, aliveness,
+ lastalive, maxcr, rtrcr, minrtrcr, txcr,
+ mintxcr, txqnob);
+ LASSERT(tmpstr + tmpsiz - s > 0);
+
+ } else { /* peer is NULL */
+ lnet_net_unlock(cpt);
+ }
+
+ if (hash == LNET_PEER_HASH_SIZE) {
+ cpt++;
+ hash = 0;
+ hoff = 1;
+ if (peer == NULL && cpt < LNET_CPT_NUMBER)
+ goto again;
+ }
+ }
+
+ len = s - tmpstr; /* how many bytes was written */
+
+ if (len > *lenp) { /* linux-supplied buffer is too small */
+ rc = -EINVAL;
+ } else if (len > 0) { /* wrote something */
+ if (copy_to_user(buffer, tmpstr, len))
+ rc = -EFAULT;
+ else
+ *ppos = LNET_PROC_POS_MAKE(cpt, ver, hash, hoff);
+ }
+
+ LIBCFS_FREE(tmpstr, tmpsiz);
+
+ if (rc == 0)
+ *lenp = len;
+
+ return rc;
+}
+
+static int __proc_lnet_buffers(void *data, int write,
+ loff_t pos, void __user *buffer, int nob)
+{
+ char *s;
+ char *tmpstr;
+ int tmpsiz;
+ int idx;
+ int len;
+ int rc;
+ int i;
+
+ LASSERT(!write);
+
+ /* (4 %d) * 4 * LNET_CPT_NUMBER */
+ tmpsiz = 64 * (LNET_NRBPOOLS + 1) * LNET_CPT_NUMBER;
+ LIBCFS_ALLOC(tmpstr, tmpsiz);
+ if (tmpstr == NULL)
+ return -ENOMEM;
+
+ s = tmpstr; /* points to current position in tmpstr[] */
+
+ s += snprintf(s, tmpstr + tmpsiz - s,
+ "%5s %5s %7s %7s\n",
+ "pages", "count", "credits", "min");
+ LASSERT(tmpstr + tmpsiz - s > 0);
+
+ if (the_lnet.ln_rtrpools == NULL)
+ goto out; /* I'm not a router */
+
+ for (idx = 0; idx < LNET_NRBPOOLS; idx++) {
+ lnet_rtrbufpool_t *rbp;
+
+ lnet_net_lock(LNET_LOCK_EX);
+ cfs_percpt_for_each(rbp, i, the_lnet.ln_rtrpools) {
+ s += snprintf(s, tmpstr + tmpsiz - s,
+ "%5d %5d %7d %7d\n",
+ rbp[idx].rbp_npages,
+ rbp[idx].rbp_nbuffers,
+ rbp[idx].rbp_credits,
+ rbp[idx].rbp_mincredits);
+ LASSERT(tmpstr + tmpsiz - s > 0);
+ }
+ lnet_net_unlock(LNET_LOCK_EX);
+ }
+
+ out:
+ len = s - tmpstr;
+
+ if (pos >= min_t(int, len, strlen(tmpstr)))
+ rc = 0;
+ else
+ rc = cfs_trace_copyout_string(buffer, nob,
+ tmpstr + pos, NULL);
+
+ LIBCFS_FREE(tmpstr, tmpsiz);
+ return rc;
+}
+
+static int proc_lnet_buffers(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return proc_call_handler(table->data, write, ppos, buffer, lenp,
+ __proc_lnet_buffers);
+}
+
+static int proc_lnet_nis(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int tmpsiz = 128 * LNET_CPT_NUMBER;
+ int rc = 0;
+ char *tmpstr;
+ char *s;
+ int len;
+
+ LASSERT(!write);
+
+ if (*lenp == 0)
+ return 0;
+
+ LIBCFS_ALLOC(tmpstr, tmpsiz);
+ if (tmpstr == NULL)
+ return -ENOMEM;
+
+ s = tmpstr; /* points to current position in tmpstr[] */
+
+ if (*ppos == 0) {
+ s += snprintf(s, tmpstr + tmpsiz - s,
+ "%-24s %6s %5s %4s %4s %4s %5s %5s %5s\n",
+ "nid", "status", "alive", "refs", "peer",
+ "rtr", "max", "tx", "min");
+ LASSERT(tmpstr + tmpsiz - s > 0);
+ } else {
+ struct list_head *n;
+ lnet_ni_t *ni = NULL;
+ int skip = *ppos - 1;
+
+ lnet_net_lock(0);
+
+ n = the_lnet.ln_nis.next;
+
+ while (n != &the_lnet.ln_nis) {
+ lnet_ni_t *a_ni = list_entry(n, lnet_ni_t, ni_list);
+
+ if (skip == 0) {
+ ni = a_ni;
+ break;
+ }
+
+ skip--;
+ n = n->next;
+ }
+
+ if (ni != NULL) {
+ struct lnet_tx_queue *tq;
+ char *stat;
+ long now = get_seconds();
+ int last_alive = -1;
+ int i;
+ int j;
+
+ if (the_lnet.ln_routing)
+ last_alive = now - ni->ni_last_alive;
+
+ /* @lo forever alive */
+ if (ni->ni_lnd->lnd_type == LOLND)
+ last_alive = 0;
+
+ lnet_ni_lock(ni);
+ LASSERT(ni->ni_status != NULL);
+ stat = (ni->ni_status->ns_status ==
+ LNET_NI_STATUS_UP) ? "up" : "down";
+ lnet_ni_unlock(ni);
+
+ /* we actually output credits information for
+ * TX queue of each partition */
+ cfs_percpt_for_each(tq, i, ni->ni_tx_queues) {
+ for (j = 0; ni->ni_cpts != NULL &&
+ j < ni->ni_ncpts; j++) {
+ if (i == ni->ni_cpts[j])
+ break;
+ }
+
+ if (j == ni->ni_ncpts)
+ continue;
+
+ if (i != 0)
+ lnet_net_lock(i);
+
+ s += snprintf(s, tmpstr + tmpsiz - s,
+ "%-24s %6s %5d %4d %4d %4d %5d %5d %5d\n",
+ libcfs_nid2str(ni->ni_nid), stat,
+ last_alive, *ni->ni_refs[i],
+ ni->ni_peertxcredits,
+ ni->ni_peerrtrcredits,
+ tq->tq_credits_max,
+ tq->tq_credits, tq->tq_credits_min);
+ if (i != 0)
+ lnet_net_unlock(i);
+ }
+ LASSERT(tmpstr + tmpsiz - s > 0);
+ }
+
+ lnet_net_unlock(0);
+ }
+
+ len = s - tmpstr; /* how many bytes was written */
+
+ if (len > *lenp) { /* linux-supplied buffer is too small */
+ rc = -EINVAL;
+ } else if (len > 0) { /* wrote something */
+ if (copy_to_user(buffer, tmpstr, len))
+ rc = -EFAULT;
+ else
+ *ppos += 1;
+ }
+
+ LIBCFS_FREE(tmpstr, tmpsiz);
+
+ if (rc == 0)
+ *lenp = len;
+
+ return rc;
+}
+
+struct lnet_portal_rotors {
+ int pr_value;
+ const char *pr_name;
+ const char *pr_desc;
+};
+
+static struct lnet_portal_rotors portal_rotors[] = {
+ {
+ .pr_value = LNET_PTL_ROTOR_OFF,
+ .pr_name = "OFF",
+ .pr_desc = "Turn off message rotor for wildcard portals"
+ },
+ {
+ .pr_value = LNET_PTL_ROTOR_ON,
+ .pr_name = "ON",
+ .pr_desc = "round-robin dispatch all PUT messages for wildcard portals"
+ },
+ {
+ .pr_value = LNET_PTL_ROTOR_RR_RT,
+ .pr_name = "RR_RT",
+ .pr_desc = "round-robin dispatch routed PUT message for wildcard portals"
+ },
+ {
+ .pr_value = LNET_PTL_ROTOR_HASH_RT,
+ .pr_name = "HASH_RT",
+ .pr_desc = "dispatch routed PUT message by hashing source NID for wildcard portals"
+ },
+ {
+ .pr_value = -1,
+ .pr_name = NULL,
+ .pr_desc = NULL
+ },
+};
+
+extern int portal_rotor;
+
+static int __proc_lnet_portal_rotor(void *data, int write,
+ loff_t pos, void __user *buffer, int nob)
+{
+ const int buf_len = 128;
+ char *buf;
+ char *tmp;
+ int rc;
+ int i;
+
+ LIBCFS_ALLOC(buf, buf_len);
+ if (buf == NULL)
+ return -ENOMEM;
+
+ if (!write) {
+ lnet_res_lock(0);
+
+ for (i = 0; portal_rotors[i].pr_value >= 0; i++) {
+ if (portal_rotors[i].pr_value == portal_rotor)
+ break;
+ }
+
+ LASSERT(portal_rotors[i].pr_value == portal_rotor);
+ lnet_res_unlock(0);
+
+ rc = snprintf(buf, buf_len,
+ "{\n\tportals: all\n"
+ "\trotor: %s\n\tdescription: %s\n}",
+ portal_rotors[i].pr_name,
+ portal_rotors[i].pr_desc);
+
+ if (pos >= min_t(int, rc, buf_len)) {
+ rc = 0;
+ } else {
+ rc = cfs_trace_copyout_string(buffer, nob,
+ buf + pos, "\n");
+ }
+ goto out;
+ }
+
+ rc = cfs_trace_copyin_string(buf, buf_len, buffer, nob);
+ if (rc < 0)
+ goto out;
+
+ tmp = cfs_trimwhite(buf);
+
+ rc = -EINVAL;
+ lnet_res_lock(0);
+ for (i = 0; portal_rotors[i].pr_name != NULL; i++) {
+ if (strncasecmp(portal_rotors[i].pr_name, tmp,
+ strlen(portal_rotors[i].pr_name)) == 0) {
+ portal_rotor = portal_rotors[i].pr_value;
+ rc = 0;
+ break;
+ }
+ }
+ lnet_res_unlock(0);
+out:
+ LIBCFS_FREE(buf, buf_len);
+ return rc;
+}
+
+static int proc_lnet_portal_rotor(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ return proc_call_handler(table->data, write, ppos, buffer, lenp,
+ __proc_lnet_portal_rotor);
+}
+
+static struct ctl_table lnet_table[] = {
+ /*
+ * NB No .strategy entries have been provided since sysctl(8) prefers
+ * to go via /proc for portability.
+ */
+ {
+ .procname = "stats",
+ .mode = 0644,
+ .proc_handler = &proc_lnet_stats,
+ },
+ {
+ .procname = "routes",
+ .mode = 0444,
+ .proc_handler = &proc_lnet_routes,
+ },
+ {
+ .procname = "routers",
+ .mode = 0444,
+ .proc_handler = &proc_lnet_routers,
+ },
+ {
+ .procname = "peers",
+ .mode = 0444,
+ .proc_handler = &proc_lnet_peers,
+ },
+ {
+ .procname = "buffers",
+ .mode = 0444,
+ .proc_handler = &proc_lnet_buffers,
+ },
+ {
+ .procname = "nis",
+ .mode = 0444,
+ .proc_handler = &proc_lnet_nis,
+ },
+ {
+ .procname = "portal_rotor",
+ .mode = 0644,
+ .proc_handler = &proc_lnet_portal_rotor,
+ },
+ {
+ }
+};
+
+static struct ctl_table top_table[] = {
+ {
+ .procname = "lnet",
+ .mode = 0555,
+ .data = NULL,
+ .maxlen = 0,
+ .child = lnet_table,
+ },
+ {
+ }
+};
+
+void
+lnet_proc_init(void)
+{
+ if (lnet_table_header == NULL)
+ lnet_table_header = register_sysctl_table(top_table);
+}
+
+void
+lnet_proc_fini(void)
+{
+ if (lnet_table_header != NULL)
+ unregister_sysctl_table(lnet_table_header);
+
+ lnet_table_header = NULL;
+}
+
+#else
+
+void
+lnet_proc_init(void)
+{
+}
+
+void
+lnet_proc_fini(void)
+{
+}
+
+#endif
diff --git a/kernel/drivers/staging/lustre/lnet/selftest/Makefile b/kernel/drivers/staging/lustre/lnet/selftest/Makefile
new file mode 100644
index 000000000..c0de6e2d9
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lnet/selftest/Makefile
@@ -0,0 +1,4 @@
+obj-$(CONFIG_LNET_SELFTEST) := lnet_selftest.o
+
+lnet_selftest-y := console.o conrpc.o conctl.o framework.o timer.o rpc.o \
+ module.o ping_test.o brw_test.o
diff --git a/kernel/drivers/staging/lustre/lnet/selftest/brw_test.c b/kernel/drivers/staging/lustre/lnet/selftest/brw_test.c
new file mode 100644
index 000000000..658f4584f
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lnet/selftest/brw_test.c
@@ -0,0 +1,508 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/brw_test.c
+ *
+ * Author: Isaac Huang <isaac@clusterfs.com>
+ */
+
+#include "selftest.h"
+
+static int brw_srv_workitems = SFW_TEST_WI_MAX;
+module_param(brw_srv_workitems, int, 0644);
+MODULE_PARM_DESC(brw_srv_workitems, "# BRW server workitems");
+
+static int brw_inject_errors;
+module_param(brw_inject_errors, int, 0644);
+MODULE_PARM_DESC(brw_inject_errors, "# data errors to inject randomly, zero by default");
+
+static void
+brw_client_fini(sfw_test_instance_t *tsi)
+{
+ srpc_bulk_t *bulk;
+ sfw_test_unit_t *tsu;
+
+ LASSERT(tsi->tsi_is_client);
+
+ list_for_each_entry(tsu, &tsi->tsi_units, tsu_list) {
+ bulk = tsu->tsu_private;
+ if (bulk == NULL)
+ continue;
+
+ srpc_free_bulk(bulk);
+ tsu->tsu_private = NULL;
+ }
+}
+
+static int
+brw_client_init(sfw_test_instance_t *tsi)
+{
+ sfw_session_t *sn = tsi->tsi_batch->bat_session;
+ int flags;
+ int npg;
+ int len;
+ int opc;
+ srpc_bulk_t *bulk;
+ sfw_test_unit_t *tsu;
+
+ LASSERT(sn != NULL);
+ LASSERT(tsi->tsi_is_client);
+
+ if ((sn->sn_features & LST_FEAT_BULK_LEN) == 0) {
+ test_bulk_req_t *breq = &tsi->tsi_u.bulk_v0;
+
+ opc = breq->blk_opc;
+ flags = breq->blk_flags;
+ npg = breq->blk_npg;
+ /* NB: this is not going to work for variable page size,
+ * but we have to keep it for compatibility */
+ len = npg * PAGE_CACHE_SIZE;
+
+ } else {
+ test_bulk_req_v1_t *breq = &tsi->tsi_u.bulk_v1;
+
+ /* I should never get this step if it's unknown feature
+ * because make_session will reject unknown feature */
+ LASSERT((sn->sn_features & ~LST_FEATS_MASK) == 0);
+
+ opc = breq->blk_opc;
+ flags = breq->blk_flags;
+ len = breq->blk_len;
+ npg = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ }
+
+ if (npg > LNET_MAX_IOV || npg <= 0)
+ return -EINVAL;
+
+ if (opc != LST_BRW_READ && opc != LST_BRW_WRITE)
+ return -EINVAL;
+
+ if (flags != LST_BRW_CHECK_NONE &&
+ flags != LST_BRW_CHECK_FULL && flags != LST_BRW_CHECK_SIMPLE)
+ return -EINVAL;
+
+ list_for_each_entry(tsu, &tsi->tsi_units, tsu_list) {
+ bulk = srpc_alloc_bulk(lnet_cpt_of_nid(tsu->tsu_dest.nid),
+ npg, len, opc == LST_BRW_READ);
+ if (bulk == NULL) {
+ brw_client_fini(tsi);
+ return -ENOMEM;
+ }
+
+ tsu->tsu_private = bulk;
+ }
+
+ return 0;
+}
+
+#define BRW_POISON 0xbeefbeefbeefbeefULL
+#define BRW_MAGIC 0xeeb0eeb1eeb2eeb3ULL
+#define BRW_MSIZE sizeof(__u64)
+
+static int
+brw_inject_one_error(void)
+{
+ struct timeval tv;
+
+ if (brw_inject_errors <= 0)
+ return 0;
+
+ do_gettimeofday(&tv);
+
+ if ((tv.tv_usec & 1) == 0)
+ return 0;
+
+ return brw_inject_errors--;
+}
+
+static void
+brw_fill_page(struct page *pg, int pattern, __u64 magic)
+{
+ char *addr = page_address(pg);
+ int i;
+
+ LASSERT(addr != NULL);
+
+ if (pattern == LST_BRW_CHECK_NONE)
+ return;
+
+ if (magic == BRW_MAGIC)
+ magic += brw_inject_one_error();
+
+ if (pattern == LST_BRW_CHECK_SIMPLE) {
+ memcpy(addr, &magic, BRW_MSIZE);
+ addr += PAGE_CACHE_SIZE - BRW_MSIZE;
+ memcpy(addr, &magic, BRW_MSIZE);
+ return;
+ }
+
+ if (pattern == LST_BRW_CHECK_FULL) {
+ for (i = 0; i < PAGE_CACHE_SIZE / BRW_MSIZE; i++)
+ memcpy(addr + i * BRW_MSIZE, &magic, BRW_MSIZE);
+ return;
+ }
+
+ LBUG();
+}
+
+static int
+brw_check_page(struct page *pg, int pattern, __u64 magic)
+{
+ char *addr = page_address(pg);
+ __u64 data = 0; /* make compiler happy */
+ int i;
+
+ LASSERT(addr != NULL);
+
+ if (pattern == LST_BRW_CHECK_NONE)
+ return 0;
+
+ if (pattern == LST_BRW_CHECK_SIMPLE) {
+ data = *((__u64 *) addr);
+ if (data != magic)
+ goto bad_data;
+
+ addr += PAGE_CACHE_SIZE - BRW_MSIZE;
+ data = *((__u64 *) addr);
+ if (data != magic)
+ goto bad_data;
+
+ return 0;
+ }
+
+ if (pattern == LST_BRW_CHECK_FULL) {
+ for (i = 0; i < PAGE_CACHE_SIZE / BRW_MSIZE; i++) {
+ data = *(((__u64 *) addr) + i);
+ if (data != magic)
+ goto bad_data;
+ }
+
+ return 0;
+ }
+
+ LBUG();
+
+bad_data:
+ CERROR("Bad data in page %p: %#llx, %#llx expected\n",
+ pg, data, magic);
+ return 1;
+}
+
+static void
+brw_fill_bulk(srpc_bulk_t *bk, int pattern, __u64 magic)
+{
+ int i;
+ struct page *pg;
+
+ for (i = 0; i < bk->bk_niov; i++) {
+ pg = bk->bk_iovs[i].kiov_page;
+ brw_fill_page(pg, pattern, magic);
+ }
+}
+
+static int
+brw_check_bulk(srpc_bulk_t *bk, int pattern, __u64 magic)
+{
+ int i;
+ struct page *pg;
+
+ for (i = 0; i < bk->bk_niov; i++) {
+ pg = bk->bk_iovs[i].kiov_page;
+ if (brw_check_page(pg, pattern, magic) != 0) {
+ CERROR("Bulk page %p (%d/%d) is corrupted!\n",
+ pg, i, bk->bk_niov);
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+static int
+brw_client_prep_rpc(sfw_test_unit_t *tsu,
+ lnet_process_id_t dest, srpc_client_rpc_t **rpcpp)
+{
+ srpc_bulk_t *bulk = tsu->tsu_private;
+ sfw_test_instance_t *tsi = tsu->tsu_instance;
+ sfw_session_t *sn = tsi->tsi_batch->bat_session;
+ srpc_client_rpc_t *rpc;
+ srpc_brw_reqst_t *req;
+ int flags;
+ int npg;
+ int len;
+ int opc;
+ int rc;
+
+ LASSERT(sn != NULL);
+ LASSERT(bulk != NULL);
+
+ if ((sn->sn_features & LST_FEAT_BULK_LEN) == 0) {
+ test_bulk_req_t *breq = &tsi->tsi_u.bulk_v0;
+
+ opc = breq->blk_opc;
+ flags = breq->blk_flags;
+ npg = breq->blk_npg;
+ len = npg * PAGE_CACHE_SIZE;
+
+ } else {
+ test_bulk_req_v1_t *breq = &tsi->tsi_u.bulk_v1;
+
+ /* I should never get this step if it's unknown feature
+ * because make_session will reject unknown feature */
+ LASSERT((sn->sn_features & ~LST_FEATS_MASK) == 0);
+
+ opc = breq->blk_opc;
+ flags = breq->blk_flags;
+ len = breq->blk_len;
+ npg = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ }
+
+ rc = sfw_create_test_rpc(tsu, dest, sn->sn_features, npg, len, &rpc);
+ if (rc != 0)
+ return rc;
+
+ memcpy(&rpc->crpc_bulk, bulk, offsetof(srpc_bulk_t, bk_iovs[npg]));
+ if (opc == LST_BRW_WRITE)
+ brw_fill_bulk(&rpc->crpc_bulk, flags, BRW_MAGIC);
+ else
+ brw_fill_bulk(&rpc->crpc_bulk, flags, BRW_POISON);
+
+ req = &rpc->crpc_reqstmsg.msg_body.brw_reqst;
+ req->brw_flags = flags;
+ req->brw_rw = opc;
+ req->brw_len = len;
+
+ *rpcpp = rpc;
+ return 0;
+}
+
+static void
+brw_client_done_rpc(sfw_test_unit_t *tsu, srpc_client_rpc_t *rpc)
+{
+ __u64 magic = BRW_MAGIC;
+ sfw_test_instance_t *tsi = tsu->tsu_instance;
+ sfw_session_t *sn = tsi->tsi_batch->bat_session;
+ srpc_msg_t *msg = &rpc->crpc_replymsg;
+ srpc_brw_reply_t *reply = &msg->msg_body.brw_reply;
+ srpc_brw_reqst_t *reqst = &rpc->crpc_reqstmsg.msg_body.brw_reqst;
+
+ LASSERT(sn != NULL);
+
+ if (rpc->crpc_status != 0) {
+ CERROR("BRW RPC to %s failed with %d\n",
+ libcfs_id2str(rpc->crpc_dest), rpc->crpc_status);
+ if (!tsi->tsi_stopping) /* rpc could have been aborted */
+ atomic_inc(&sn->sn_brw_errors);
+ goto out;
+ }
+
+ if (msg->msg_magic != SRPC_MSG_MAGIC) {
+ __swab64s(&magic);
+ __swab32s(&reply->brw_status);
+ }
+
+ CDEBUG(reply->brw_status ? D_WARNING : D_NET,
+ "BRW RPC to %s finished with brw_status: %d\n",
+ libcfs_id2str(rpc->crpc_dest), reply->brw_status);
+
+ if (reply->brw_status != 0) {
+ atomic_inc(&sn->sn_brw_errors);
+ rpc->crpc_status = -(int)reply->brw_status;
+ goto out;
+ }
+
+ if (reqst->brw_rw == LST_BRW_WRITE)
+ goto out;
+
+ if (brw_check_bulk(&rpc->crpc_bulk, reqst->brw_flags, magic) != 0) {
+ CERROR("Bulk data from %s is corrupted!\n",
+ libcfs_id2str(rpc->crpc_dest));
+ atomic_inc(&sn->sn_brw_errors);
+ rpc->crpc_status = -EBADMSG;
+ }
+
+out:
+ return;
+}
+
+static void
+brw_server_rpc_done(srpc_server_rpc_t *rpc)
+{
+ srpc_bulk_t *blk = rpc->srpc_bulk;
+
+ if (blk == NULL)
+ return;
+
+ if (rpc->srpc_status != 0)
+ CERROR("Bulk transfer %s %s has failed: %d\n",
+ blk->bk_sink ? "from" : "to",
+ libcfs_id2str(rpc->srpc_peer), rpc->srpc_status);
+ else
+ CDEBUG(D_NET, "Transferred %d pages bulk data %s %s\n",
+ blk->bk_niov, blk->bk_sink ? "from" : "to",
+ libcfs_id2str(rpc->srpc_peer));
+
+ sfw_free_pages(rpc);
+}
+
+static int
+brw_bulk_ready(srpc_server_rpc_t *rpc, int status)
+{
+ __u64 magic = BRW_MAGIC;
+ srpc_brw_reply_t *reply = &rpc->srpc_replymsg.msg_body.brw_reply;
+ srpc_brw_reqst_t *reqst;
+ srpc_msg_t *reqstmsg;
+
+ LASSERT(rpc->srpc_bulk != NULL);
+ LASSERT(rpc->srpc_reqstbuf != NULL);
+
+ reqstmsg = &rpc->srpc_reqstbuf->buf_msg;
+ reqst = &reqstmsg->msg_body.brw_reqst;
+
+ if (status != 0) {
+ CERROR("BRW bulk %s failed for RPC from %s: %d\n",
+ reqst->brw_rw == LST_BRW_READ ? "READ" : "WRITE",
+ libcfs_id2str(rpc->srpc_peer), status);
+ return -EIO;
+ }
+
+ if (reqst->brw_rw == LST_BRW_READ)
+ return 0;
+
+ if (reqstmsg->msg_magic != SRPC_MSG_MAGIC)
+ __swab64s(&magic);
+
+ if (brw_check_bulk(rpc->srpc_bulk, reqst->brw_flags, magic) != 0) {
+ CERROR("Bulk data from %s is corrupted!\n",
+ libcfs_id2str(rpc->srpc_peer));
+ reply->brw_status = EBADMSG;
+ }
+
+ return 0;
+}
+
+static int
+brw_server_handle(struct srpc_server_rpc *rpc)
+{
+ struct srpc_service *sv = rpc->srpc_scd->scd_svc;
+ srpc_msg_t *replymsg = &rpc->srpc_replymsg;
+ srpc_msg_t *reqstmsg = &rpc->srpc_reqstbuf->buf_msg;
+ srpc_brw_reply_t *reply = &replymsg->msg_body.brw_reply;
+ srpc_brw_reqst_t *reqst = &reqstmsg->msg_body.brw_reqst;
+ int npg;
+ int rc;
+
+ LASSERT(sv->sv_id == SRPC_SERVICE_BRW);
+
+ if (reqstmsg->msg_magic != SRPC_MSG_MAGIC) {
+ LASSERT(reqstmsg->msg_magic == __swab32(SRPC_MSG_MAGIC));
+
+ __swab32s(&reqst->brw_rw);
+ __swab32s(&reqst->brw_len);
+ __swab32s(&reqst->brw_flags);
+ __swab64s(&reqst->brw_rpyid);
+ __swab64s(&reqst->brw_bulkid);
+ }
+ LASSERT(reqstmsg->msg_type == (__u32)srpc_service2request(sv->sv_id));
+
+ reply->brw_status = 0;
+ rpc->srpc_done = brw_server_rpc_done;
+
+ if ((reqst->brw_rw != LST_BRW_READ && reqst->brw_rw != LST_BRW_WRITE) ||
+ (reqst->brw_flags != LST_BRW_CHECK_NONE &&
+ reqst->brw_flags != LST_BRW_CHECK_FULL &&
+ reqst->brw_flags != LST_BRW_CHECK_SIMPLE)) {
+ reply->brw_status = EINVAL;
+ return 0;
+ }
+
+ if ((reqstmsg->msg_ses_feats & ~LST_FEATS_MASK) != 0) {
+ replymsg->msg_ses_feats = LST_FEATS_MASK;
+ reply->brw_status = EPROTO;
+ return 0;
+ }
+
+ if ((reqstmsg->msg_ses_feats & LST_FEAT_BULK_LEN) == 0) {
+ /* compat with old version */
+ if ((reqst->brw_len & ~CFS_PAGE_MASK) != 0) {
+ reply->brw_status = EINVAL;
+ return 0;
+ }
+ npg = reqst->brw_len >> PAGE_CACHE_SHIFT;
+
+ } else {
+ npg = (reqst->brw_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ }
+
+ replymsg->msg_ses_feats = reqstmsg->msg_ses_feats;
+
+ if (reqst->brw_len == 0 || npg > LNET_MAX_IOV) {
+ reply->brw_status = EINVAL;
+ return 0;
+ }
+
+ rc = sfw_alloc_pages(rpc, rpc->srpc_scd->scd_cpt, npg,
+ reqst->brw_len,
+ reqst->brw_rw == LST_BRW_WRITE);
+ if (rc != 0)
+ return rc;
+
+ if (reqst->brw_rw == LST_BRW_READ)
+ brw_fill_bulk(rpc->srpc_bulk, reqst->brw_flags, BRW_MAGIC);
+ else
+ brw_fill_bulk(rpc->srpc_bulk, reqst->brw_flags, BRW_POISON);
+
+ return 0;
+}
+
+sfw_test_client_ops_t brw_test_client;
+void brw_init_test_client(void)
+{
+ brw_test_client.tso_init = brw_client_init;
+ brw_test_client.tso_fini = brw_client_fini;
+ brw_test_client.tso_prep_rpc = brw_client_prep_rpc;
+ brw_test_client.tso_done_rpc = brw_client_done_rpc;
+};
+
+srpc_service_t brw_test_service;
+void brw_init_test_service(void)
+{
+
+ brw_test_service.sv_id = SRPC_SERVICE_BRW;
+ brw_test_service.sv_name = "brw_test";
+ brw_test_service.sv_handler = brw_server_handle;
+ brw_test_service.sv_bulk_ready = brw_bulk_ready;
+ brw_test_service.sv_wi_total = brw_srv_workitems;
+}
diff --git a/kernel/drivers/staging/lustre/lnet/selftest/conctl.c b/kernel/drivers/staging/lustre/lnet/selftest/conctl.c
new file mode 100644
index 000000000..045fe295a
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lnet/selftest/conctl.c
@@ -0,0 +1,929 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/conctl.c
+ *
+ * IOC handle in kernel
+ *
+ * Author: Liang Zhen <liangzhen@clusterfs.com>
+ */
+
+#include "../../include/linux/libcfs/libcfs.h"
+#include "../../include/linux/lnet/lib-lnet.h"
+#include "../../include/linux/lnet/lnetst.h"
+#include "console.h"
+
+static int
+lst_session_new_ioctl(lstio_session_new_args_t *args)
+{
+ char *name;
+ int rc;
+
+ if (args->lstio_ses_idp == NULL || /* address for output sid */
+ args->lstio_ses_key == 0 || /* no key is specified */
+ args->lstio_ses_namep == NULL || /* session name */
+ args->lstio_ses_nmlen <= 0 ||
+ args->lstio_ses_nmlen > LST_NAME_SIZE)
+ return -EINVAL;
+
+ LIBCFS_ALLOC(name, args->lstio_ses_nmlen + 1);
+ if (name == NULL)
+ return -ENOMEM;
+
+ if (copy_from_user(name,
+ args->lstio_ses_namep,
+ args->lstio_ses_nmlen)) {
+ LIBCFS_FREE(name, args->lstio_ses_nmlen + 1);
+ return -EFAULT;
+ }
+
+ name[args->lstio_ses_nmlen] = 0;
+
+ rc = lstcon_session_new(name,
+ args->lstio_ses_key,
+ args->lstio_ses_feats,
+ args->lstio_ses_force,
+ args->lstio_ses_timeout,
+ args->lstio_ses_idp);
+
+ LIBCFS_FREE(name, args->lstio_ses_nmlen + 1);
+ return rc;
+}
+
+static int
+lst_session_end_ioctl(lstio_session_end_args_t *args)
+{
+ if (args->lstio_ses_key != console_session.ses_key)
+ return -EACCES;
+
+ return lstcon_session_end();
+}
+
+static int
+lst_session_info_ioctl(lstio_session_info_args_t *args)
+{
+ /* no checking of key */
+
+ if (args->lstio_ses_idp == NULL || /* address for output sid */
+ args->lstio_ses_keyp == NULL || /* address for output key */
+ args->lstio_ses_featp == NULL || /* address for output features */
+ args->lstio_ses_ndinfo == NULL || /* address for output ndinfo */
+ args->lstio_ses_namep == NULL || /* address for output name */
+ args->lstio_ses_nmlen <= 0 ||
+ args->lstio_ses_nmlen > LST_NAME_SIZE)
+ return -EINVAL;
+
+ return lstcon_session_info(args->lstio_ses_idp,
+ args->lstio_ses_keyp,
+ args->lstio_ses_featp,
+ args->lstio_ses_ndinfo,
+ args->lstio_ses_namep,
+ args->lstio_ses_nmlen);
+}
+
+static int
+lst_debug_ioctl(lstio_debug_args_t *args)
+{
+ char *name = NULL;
+ int client = 1;
+ int rc;
+
+ if (args->lstio_dbg_key != console_session.ses_key)
+ return -EACCES;
+
+ if (args->lstio_dbg_resultp == NULL)
+ return -EINVAL;
+
+ if (args->lstio_dbg_namep != NULL && /* name of batch/group */
+ (args->lstio_dbg_nmlen <= 0 ||
+ args->lstio_dbg_nmlen > LST_NAME_SIZE))
+ return -EINVAL;
+
+ if (args->lstio_dbg_namep != NULL) {
+ LIBCFS_ALLOC(name, args->lstio_dbg_nmlen + 1);
+ if (name == NULL)
+ return -ENOMEM;
+
+ if (copy_from_user(name, args->lstio_dbg_namep,
+ args->lstio_dbg_nmlen)) {
+ LIBCFS_FREE(name, args->lstio_dbg_nmlen + 1);
+
+ return -EFAULT;
+ }
+
+ name[args->lstio_dbg_nmlen] = 0;
+ }
+
+ rc = -EINVAL;
+
+ switch (args->lstio_dbg_type) {
+ case LST_OPC_SESSION:
+ rc = lstcon_session_debug(args->lstio_dbg_timeout,
+ args->lstio_dbg_resultp);
+ break;
+
+ case LST_OPC_BATCHSRV:
+ client = 0;
+ case LST_OPC_BATCHCLI:
+ if (name == NULL)
+ goto out;
+
+ rc = lstcon_batch_debug(args->lstio_dbg_timeout,
+ name, client, args->lstio_dbg_resultp);
+ break;
+
+ case LST_OPC_GROUP:
+ if (name == NULL)
+ goto out;
+
+ rc = lstcon_group_debug(args->lstio_dbg_timeout,
+ name, args->lstio_dbg_resultp);
+ break;
+
+ case LST_OPC_NODES:
+ if (args->lstio_dbg_count <= 0 ||
+ args->lstio_dbg_idsp == NULL)
+ goto out;
+
+ rc = lstcon_nodes_debug(args->lstio_dbg_timeout,
+ args->lstio_dbg_count,
+ args->lstio_dbg_idsp,
+ args->lstio_dbg_resultp);
+ break;
+
+ default:
+ break;
+ }
+
+out:
+ if (name != NULL)
+ LIBCFS_FREE(name, args->lstio_dbg_nmlen + 1);
+
+ return rc;
+}
+
+static int
+lst_group_add_ioctl(lstio_group_add_args_t *args)
+{
+ char *name;
+ int rc;
+
+ if (args->lstio_grp_key != console_session.ses_key)
+ return -EACCES;
+
+ if (args->lstio_grp_namep == NULL ||
+ args->lstio_grp_nmlen <= 0 ||
+ args->lstio_grp_nmlen > LST_NAME_SIZE)
+ return -EINVAL;
+
+ LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1);
+ if (name == NULL)
+ return -ENOMEM;
+
+ if (copy_from_user(name,
+ args->lstio_grp_namep,
+ args->lstio_grp_nmlen)) {
+ LIBCFS_FREE(name, args->lstio_grp_nmlen);
+ return -EFAULT;
+ }
+
+ name[args->lstio_grp_nmlen] = 0;
+
+ rc = lstcon_group_add(name);
+
+ LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+
+ return rc;
+}
+
+static int
+lst_group_del_ioctl(lstio_group_del_args_t *args)
+{
+ int rc;
+ char *name;
+
+ if (args->lstio_grp_key != console_session.ses_key)
+ return -EACCES;
+
+ if (args->lstio_grp_namep == NULL ||
+ args->lstio_grp_nmlen <= 0 ||
+ args->lstio_grp_nmlen > LST_NAME_SIZE)
+ return -EINVAL;
+
+ LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1);
+ if (name == NULL)
+ return -ENOMEM;
+
+ if (copy_from_user(name,
+ args->lstio_grp_namep,
+ args->lstio_grp_nmlen)) {
+ LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+ return -EFAULT;
+ }
+
+ name[args->lstio_grp_nmlen] = 0;
+
+ rc = lstcon_group_del(name);
+
+ LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+
+ return rc;
+}
+
+static int
+lst_group_update_ioctl(lstio_group_update_args_t *args)
+{
+ int rc;
+ char *name;
+
+ if (args->lstio_grp_key != console_session.ses_key)
+ return -EACCES;
+
+ if (args->lstio_grp_resultp == NULL ||
+ args->lstio_grp_namep == NULL ||
+ args->lstio_grp_nmlen <= 0 ||
+ args->lstio_grp_nmlen > LST_NAME_SIZE)
+ return -EINVAL;
+
+ LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1);
+ if (name == NULL)
+ return -ENOMEM;
+
+ if (copy_from_user(name,
+ args->lstio_grp_namep,
+ args->lstio_grp_nmlen)) {
+ LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+ return -EFAULT;
+ }
+
+ name[args->lstio_grp_nmlen] = 0;
+
+ switch (args->lstio_grp_opc) {
+ case LST_GROUP_CLEAN:
+ rc = lstcon_group_clean(name, args->lstio_grp_args);
+ break;
+
+ case LST_GROUP_REFRESH:
+ rc = lstcon_group_refresh(name, args->lstio_grp_resultp);
+ break;
+
+ case LST_GROUP_RMND:
+ if (args->lstio_grp_count <= 0 ||
+ args->lstio_grp_idsp == NULL) {
+ rc = -EINVAL;
+ break;
+ }
+ rc = lstcon_nodes_remove(name, args->lstio_grp_count,
+ args->lstio_grp_idsp,
+ args->lstio_grp_resultp);
+ break;
+
+ default:
+ rc = -EINVAL;
+ break;
+ }
+
+ LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+
+ return rc;
+}
+
+static int
+lst_nodes_add_ioctl(lstio_group_nodes_args_t *args)
+{
+ unsigned feats;
+ int rc;
+ char *name;
+
+ if (args->lstio_grp_key != console_session.ses_key)
+ return -EACCES;
+
+ if (args->lstio_grp_idsp == NULL || /* array of ids */
+ args->lstio_grp_count <= 0 ||
+ args->lstio_grp_resultp == NULL ||
+ args->lstio_grp_featp == NULL ||
+ args->lstio_grp_namep == NULL ||
+ args->lstio_grp_nmlen <= 0 ||
+ args->lstio_grp_nmlen > LST_NAME_SIZE)
+ return -EINVAL;
+
+ LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1);
+ if (name == NULL)
+ return -ENOMEM;
+
+ if (copy_from_user(name, args->lstio_grp_namep,
+ args->lstio_grp_nmlen)) {
+ LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+
+ return -EFAULT;
+ }
+
+ name[args->lstio_grp_nmlen] = 0;
+
+ rc = lstcon_nodes_add(name, args->lstio_grp_count,
+ args->lstio_grp_idsp, &feats,
+ args->lstio_grp_resultp);
+
+ LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+ if (rc == 0 &&
+ copy_to_user(args->lstio_grp_featp, &feats, sizeof(feats))) {
+ return -EINVAL;
+ }
+
+ return rc;
+}
+
+static int
+lst_group_list_ioctl(lstio_group_list_args_t *args)
+{
+ if (args->lstio_grp_key != console_session.ses_key)
+ return -EACCES;
+
+ if (args->lstio_grp_idx < 0 ||
+ args->lstio_grp_namep == NULL ||
+ args->lstio_grp_nmlen <= 0 ||
+ args->lstio_grp_nmlen > LST_NAME_SIZE)
+ return -EINVAL;
+
+ return lstcon_group_list(args->lstio_grp_idx,
+ args->lstio_grp_nmlen,
+ args->lstio_grp_namep);
+}
+
+static int
+lst_group_info_ioctl(lstio_group_info_args_t *args)
+{
+ char *name;
+ int ndent;
+ int index;
+ int rc;
+
+ if (args->lstio_grp_key != console_session.ses_key)
+ return -EACCES;
+
+ if (args->lstio_grp_namep == NULL ||
+ args->lstio_grp_nmlen <= 0 ||
+ args->lstio_grp_nmlen > LST_NAME_SIZE)
+ return -EINVAL;
+
+ if (args->lstio_grp_entp == NULL && /* output: group entry */
+ args->lstio_grp_dentsp == NULL) /* output: node entry */
+ return -EINVAL;
+
+ if (args->lstio_grp_dentsp != NULL) { /* have node entry */
+ if (args->lstio_grp_idxp == NULL || /* node index */
+ args->lstio_grp_ndentp == NULL) /* # of node entry */
+ return -EINVAL;
+
+ if (copy_from_user(&ndent, args->lstio_grp_ndentp,
+ sizeof(ndent)) ||
+ copy_from_user(&index, args->lstio_grp_idxp,
+ sizeof(index)))
+ return -EFAULT;
+
+ if (ndent <= 0 || index < 0)
+ return -EINVAL;
+ }
+
+ LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1);
+ if (name == NULL)
+ return -ENOMEM;
+
+ if (copy_from_user(name,
+ args->lstio_grp_namep,
+ args->lstio_grp_nmlen)) {
+ LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+ return -EFAULT;
+ }
+
+ name[args->lstio_grp_nmlen] = 0;
+
+ rc = lstcon_group_info(name, args->lstio_grp_entp,
+ &index, &ndent, args->lstio_grp_dentsp);
+
+ LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+
+ if (rc != 0)
+ return rc;
+
+ if (args->lstio_grp_dentsp != NULL &&
+ (copy_to_user(args->lstio_grp_idxp, &index, sizeof(index)) ||
+ copy_to_user(args->lstio_grp_ndentp, &ndent, sizeof(ndent))))
+ rc = -EFAULT;
+
+ return 0;
+}
+
+static int
+lst_batch_add_ioctl(lstio_batch_add_args_t *args)
+{
+ int rc;
+ char *name;
+
+ if (args->lstio_bat_key != console_session.ses_key)
+ return -EACCES;
+
+ if (args->lstio_bat_namep == NULL ||
+ args->lstio_bat_nmlen <= 0 ||
+ args->lstio_bat_nmlen > LST_NAME_SIZE)
+ return -EINVAL;
+
+ LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1);
+ if (name == NULL)
+ return -ENOMEM;
+
+ if (copy_from_user(name,
+ args->lstio_bat_namep,
+ args->lstio_bat_nmlen)) {
+ LIBCFS_FREE(name, args->lstio_bat_nmlen + 1);
+ return -EFAULT;
+ }
+
+ name[args->lstio_bat_nmlen] = 0;
+
+ rc = lstcon_batch_add(name);
+
+ LIBCFS_FREE(name, args->lstio_bat_nmlen + 1);
+
+ return rc;
+}
+
+static int
+lst_batch_run_ioctl(lstio_batch_run_args_t *args)
+{
+ int rc;
+ char *name;
+
+ if (args->lstio_bat_key != console_session.ses_key)
+ return -EACCES;
+
+ if (args->lstio_bat_namep == NULL ||
+ args->lstio_bat_nmlen <= 0 ||
+ args->lstio_bat_nmlen > LST_NAME_SIZE)
+ return -EINVAL;
+
+ LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1);
+ if (name == NULL)
+ return -ENOMEM;
+
+ if (copy_from_user(name,
+ args->lstio_bat_namep,
+ args->lstio_bat_nmlen)) {
+ LIBCFS_FREE(name, args->lstio_bat_nmlen + 1);
+ return -EFAULT;
+ }
+
+ name[args->lstio_bat_nmlen] = 0;
+
+ rc = lstcon_batch_run(name, args->lstio_bat_timeout,
+ args->lstio_bat_resultp);
+
+ LIBCFS_FREE(name, args->lstio_bat_nmlen + 1);
+
+ return rc;
+}
+
+static int
+lst_batch_stop_ioctl(lstio_batch_stop_args_t *args)
+{
+ int rc;
+ char *name;
+
+ if (args->lstio_bat_key != console_session.ses_key)
+ return -EACCES;
+
+ if (args->lstio_bat_resultp == NULL ||
+ args->lstio_bat_namep == NULL ||
+ args->lstio_bat_nmlen <= 0 ||
+ args->lstio_bat_nmlen > LST_NAME_SIZE)
+ return -EINVAL;
+
+ LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1);
+ if (name == NULL)
+ return -ENOMEM;
+
+ if (copy_from_user(name,
+ args->lstio_bat_namep,
+ args->lstio_bat_nmlen)) {
+ LIBCFS_FREE(name, args->lstio_bat_nmlen + 1);
+ return -EFAULT;
+ }
+
+ name[args->lstio_bat_nmlen] = 0;
+
+ rc = lstcon_batch_stop(name, args->lstio_bat_force,
+ args->lstio_bat_resultp);
+
+ LIBCFS_FREE(name, args->lstio_bat_nmlen + 1);
+
+ return rc;
+}
+
+static int
+lst_batch_query_ioctl(lstio_batch_query_args_t *args)
+{
+ char *name;
+ int rc;
+
+ if (args->lstio_bat_key != console_session.ses_key)
+ return -EACCES;
+
+ if (args->lstio_bat_resultp == NULL ||
+ args->lstio_bat_namep == NULL ||
+ args->lstio_bat_nmlen <= 0 ||
+ args->lstio_bat_nmlen > LST_NAME_SIZE)
+ return -EINVAL;
+
+ if (args->lstio_bat_testidx < 0)
+ return -EINVAL;
+
+ LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1);
+ if (name == NULL)
+ return -ENOMEM;
+
+ if (copy_from_user(name,
+ args->lstio_bat_namep,
+ args->lstio_bat_nmlen)) {
+ LIBCFS_FREE(name, args->lstio_bat_nmlen + 1);
+ return -EFAULT;
+ }
+
+ name[args->lstio_bat_nmlen] = 0;
+
+ rc = lstcon_test_batch_query(name,
+ args->lstio_bat_testidx,
+ args->lstio_bat_client,
+ args->lstio_bat_timeout,
+ args->lstio_bat_resultp);
+
+ LIBCFS_FREE(name, args->lstio_bat_nmlen + 1);
+
+ return rc;
+}
+
+static int
+lst_batch_list_ioctl(lstio_batch_list_args_t *args)
+{
+ if (args->lstio_bat_key != console_session.ses_key)
+ return -EACCES;
+
+ if (args->lstio_bat_idx < 0 ||
+ args->lstio_bat_namep == NULL ||
+ args->lstio_bat_nmlen <= 0 ||
+ args->lstio_bat_nmlen > LST_NAME_SIZE)
+ return -EINVAL;
+
+ return lstcon_batch_list(args->lstio_bat_idx,
+ args->lstio_bat_nmlen,
+ args->lstio_bat_namep);
+}
+
+static int
+lst_batch_info_ioctl(lstio_batch_info_args_t *args)
+{
+ char *name;
+ int rc;
+ int index;
+ int ndent;
+
+ if (args->lstio_bat_key != console_session.ses_key)
+ return -EACCES;
+
+ if (args->lstio_bat_namep == NULL || /* batch name */
+ args->lstio_bat_nmlen <= 0 ||
+ args->lstio_bat_nmlen > LST_NAME_SIZE)
+ return -EINVAL;
+
+ if (args->lstio_bat_entp == NULL && /* output: batch entry */
+ args->lstio_bat_dentsp == NULL) /* output: node entry */
+ return -EINVAL;
+
+ if (args->lstio_bat_dentsp != NULL) { /* have node entry */
+ if (args->lstio_bat_idxp == NULL || /* node index */
+ args->lstio_bat_ndentp == NULL) /* # of node entry */
+ return -EINVAL;
+
+ if (copy_from_user(&index, args->lstio_bat_idxp,
+ sizeof(index)) ||
+ copy_from_user(&ndent, args->lstio_bat_ndentp,
+ sizeof(ndent)))
+ return -EFAULT;
+
+ if (ndent <= 0 || index < 0)
+ return -EINVAL;
+ }
+
+ LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1);
+ if (name == NULL)
+ return -ENOMEM;
+
+ if (copy_from_user(name,
+ args->lstio_bat_namep, args->lstio_bat_nmlen)) {
+ LIBCFS_FREE(name, args->lstio_bat_nmlen + 1);
+ return -EFAULT;
+ }
+
+ name[args->lstio_bat_nmlen] = 0;
+
+ rc = lstcon_batch_info(name,
+ args->lstio_bat_entp, args->lstio_bat_server,
+ args->lstio_bat_testidx, &index, &ndent,
+ args->lstio_bat_dentsp);
+
+ LIBCFS_FREE(name, args->lstio_bat_nmlen + 1);
+
+ if (rc != 0)
+ return rc;
+
+ if (args->lstio_bat_dentsp != NULL &&
+ (copy_to_user(args->lstio_bat_idxp, &index, sizeof(index)) ||
+ copy_to_user(args->lstio_bat_ndentp, &ndent, sizeof(ndent))))
+ rc = -EFAULT;
+
+ return rc;
+}
+
+static int
+lst_stat_query_ioctl(lstio_stat_args_t *args)
+{
+ int rc;
+ char *name;
+
+ /* TODO: not finished */
+ if (args->lstio_sta_key != console_session.ses_key)
+ return -EACCES;
+
+ if (args->lstio_sta_resultp == NULL ||
+ (args->lstio_sta_namep == NULL &&
+ args->lstio_sta_idsp == NULL) ||
+ args->lstio_sta_nmlen <= 0 ||
+ args->lstio_sta_nmlen > LST_NAME_SIZE)
+ return -EINVAL;
+
+ if (args->lstio_sta_idsp != NULL &&
+ args->lstio_sta_count <= 0)
+ return -EINVAL;
+
+ LIBCFS_ALLOC(name, args->lstio_sta_nmlen + 1);
+ if (name == NULL)
+ return -ENOMEM;
+
+ if (copy_from_user(name, args->lstio_sta_namep,
+ args->lstio_sta_nmlen)) {
+ LIBCFS_FREE(name, args->lstio_sta_nmlen + 1);
+ return -EFAULT;
+ }
+
+ if (args->lstio_sta_idsp == NULL) {
+ rc = lstcon_group_stat(name, args->lstio_sta_timeout,
+ args->lstio_sta_resultp);
+ } else {
+ rc = lstcon_nodes_stat(args->lstio_sta_count,
+ args->lstio_sta_idsp,
+ args->lstio_sta_timeout,
+ args->lstio_sta_resultp);
+ }
+
+ LIBCFS_FREE(name, args->lstio_sta_nmlen + 1);
+
+ return rc;
+}
+
+static int lst_test_add_ioctl(lstio_test_args_t *args)
+{
+ char *batch_name;
+ char *src_name = NULL;
+ char *dst_name = NULL;
+ void *param = NULL;
+ int ret = 0;
+ int rc = -ENOMEM;
+
+ if (args->lstio_tes_resultp == NULL ||
+ args->lstio_tes_retp == NULL ||
+ args->lstio_tes_bat_name == NULL || /* no specified batch */
+ args->lstio_tes_bat_nmlen <= 0 ||
+ args->lstio_tes_bat_nmlen > LST_NAME_SIZE ||
+ args->lstio_tes_sgrp_name == NULL || /* no source group */
+ args->lstio_tes_sgrp_nmlen <= 0 ||
+ args->lstio_tes_sgrp_nmlen > LST_NAME_SIZE ||
+ args->lstio_tes_dgrp_name == NULL || /* no target group */
+ args->lstio_tes_dgrp_nmlen <= 0 ||
+ args->lstio_tes_dgrp_nmlen > LST_NAME_SIZE)
+ return -EINVAL;
+
+ if (args->lstio_tes_loop == 0 || /* negative is infinite */
+ args->lstio_tes_concur <= 0 ||
+ args->lstio_tes_dist <= 0 ||
+ args->lstio_tes_span <= 0)
+ return -EINVAL;
+
+ /* have parameter, check if parameter length is valid */
+ if (args->lstio_tes_param != NULL &&
+ (args->lstio_tes_param_len <= 0 ||
+ args->lstio_tes_param_len > PAGE_CACHE_SIZE - sizeof(lstcon_test_t)))
+ return -EINVAL;
+
+ LIBCFS_ALLOC(batch_name, args->lstio_tes_bat_nmlen + 1);
+ if (batch_name == NULL)
+ return rc;
+
+ LIBCFS_ALLOC(src_name, args->lstio_tes_sgrp_nmlen + 1);
+ if (src_name == NULL)
+ goto out;
+
+ LIBCFS_ALLOC(dst_name, args->lstio_tes_dgrp_nmlen + 1);
+ if (dst_name == NULL)
+ goto out;
+
+ if (args->lstio_tes_param != NULL) {
+ LIBCFS_ALLOC(param, args->lstio_tes_param_len);
+ if (param == NULL)
+ goto out;
+ }
+
+ rc = -EFAULT;
+ if (copy_from_user(batch_name, args->lstio_tes_bat_name,
+ args->lstio_tes_bat_nmlen) ||
+ copy_from_user(src_name, args->lstio_tes_sgrp_name,
+ args->lstio_tes_sgrp_nmlen) ||
+ copy_from_user(dst_name, args->lstio_tes_dgrp_name,
+ args->lstio_tes_dgrp_nmlen) ||
+ copy_from_user(param, args->lstio_tes_param,
+ args->lstio_tes_param_len))
+ goto out;
+
+ rc = lstcon_test_add(batch_name,
+ args->lstio_tes_type,
+ args->lstio_tes_loop,
+ args->lstio_tes_concur,
+ args->lstio_tes_dist, args->lstio_tes_span,
+ src_name, dst_name, param,
+ args->lstio_tes_param_len,
+ &ret, args->lstio_tes_resultp);
+
+ if (ret != 0)
+ rc = (copy_to_user(args->lstio_tes_retp, &ret,
+ sizeof(ret))) ? -EFAULT : 0;
+out:
+ if (batch_name != NULL)
+ LIBCFS_FREE(batch_name, args->lstio_tes_bat_nmlen + 1);
+
+ if (src_name != NULL)
+ LIBCFS_FREE(src_name, args->lstio_tes_sgrp_nmlen + 1);
+
+ if (dst_name != NULL)
+ LIBCFS_FREE(dst_name, args->lstio_tes_dgrp_nmlen + 1);
+
+ if (param != NULL)
+ LIBCFS_FREE(param, args->lstio_tes_param_len);
+
+ return rc;
+}
+
+int
+lstcon_ioctl_entry(unsigned int cmd, struct libcfs_ioctl_data *data)
+{
+ char *buf;
+ int opc = data->ioc_u32[0];
+ int rc;
+
+ if (cmd != IOC_LIBCFS_LNETST)
+ return -EINVAL;
+
+ if (data->ioc_plen1 > PAGE_CACHE_SIZE)
+ return -EINVAL;
+
+ LIBCFS_ALLOC(buf, data->ioc_plen1);
+ if (buf == NULL)
+ return -ENOMEM;
+
+ /* copy in parameter */
+ if (copy_from_user(buf, data->ioc_pbuf1, data->ioc_plen1)) {
+ LIBCFS_FREE(buf, data->ioc_plen1);
+ return -EFAULT;
+ }
+
+ mutex_lock(&console_session.ses_mutex);
+
+ console_session.ses_laststamp = get_seconds();
+
+ if (console_session.ses_shutdown) {
+ rc = -ESHUTDOWN;
+ goto out;
+ }
+
+ if (console_session.ses_expired)
+ lstcon_session_end();
+
+ if (opc != LSTIO_SESSION_NEW &&
+ console_session.ses_state == LST_SESSION_NONE) {
+ CDEBUG(D_NET, "LST no active session\n");
+ rc = -ESRCH;
+ goto out;
+ }
+
+ memset(&console_session.ses_trans_stat, 0, sizeof(lstcon_trans_stat_t));
+
+ switch (opc) {
+ case LSTIO_SESSION_NEW:
+ rc = lst_session_new_ioctl((lstio_session_new_args_t *)buf);
+ break;
+ case LSTIO_SESSION_END:
+ rc = lst_session_end_ioctl((lstio_session_end_args_t *)buf);
+ break;
+ case LSTIO_SESSION_INFO:
+ rc = lst_session_info_ioctl((lstio_session_info_args_t *)buf);
+ break;
+ case LSTIO_DEBUG:
+ rc = lst_debug_ioctl((lstio_debug_args_t *)buf);
+ break;
+ case LSTIO_GROUP_ADD:
+ rc = lst_group_add_ioctl((lstio_group_add_args_t *)buf);
+ break;
+ case LSTIO_GROUP_DEL:
+ rc = lst_group_del_ioctl((lstio_group_del_args_t *)buf);
+ break;
+ case LSTIO_GROUP_UPDATE:
+ rc = lst_group_update_ioctl((lstio_group_update_args_t *)buf);
+ break;
+ case LSTIO_NODES_ADD:
+ rc = lst_nodes_add_ioctl((lstio_group_nodes_args_t *)buf);
+ break;
+ case LSTIO_GROUP_LIST:
+ rc = lst_group_list_ioctl((lstio_group_list_args_t *)buf);
+ break;
+ case LSTIO_GROUP_INFO:
+ rc = lst_group_info_ioctl((lstio_group_info_args_t *)buf);
+ break;
+ case LSTIO_BATCH_ADD:
+ rc = lst_batch_add_ioctl((lstio_batch_add_args_t *)buf);
+ break;
+ case LSTIO_BATCH_START:
+ rc = lst_batch_run_ioctl((lstio_batch_run_args_t *)buf);
+ break;
+ case LSTIO_BATCH_STOP:
+ rc = lst_batch_stop_ioctl((lstio_batch_stop_args_t *)buf);
+ break;
+ case LSTIO_BATCH_QUERY:
+ rc = lst_batch_query_ioctl((lstio_batch_query_args_t *)buf);
+ break;
+ case LSTIO_BATCH_LIST:
+ rc = lst_batch_list_ioctl((lstio_batch_list_args_t *)buf);
+ break;
+ case LSTIO_BATCH_INFO:
+ rc = lst_batch_info_ioctl((lstio_batch_info_args_t *)buf);
+ break;
+ case LSTIO_TEST_ADD:
+ rc = lst_test_add_ioctl((lstio_test_args_t *)buf);
+ break;
+ case LSTIO_STAT_QUERY:
+ rc = lst_stat_query_ioctl((lstio_stat_args_t *)buf);
+ break;
+ default:
+ rc = -EINVAL;
+ }
+
+ if (copy_to_user(data->ioc_pbuf2, &console_session.ses_trans_stat,
+ sizeof(lstcon_trans_stat_t)))
+ rc = -EFAULT;
+out:
+ mutex_unlock(&console_session.ses_mutex);
+
+ LIBCFS_FREE(buf, data->ioc_plen1);
+
+ return rc;
+}
+
+EXPORT_SYMBOL(lstcon_ioctl_entry);
diff --git a/kernel/drivers/staging/lustre/lnet/selftest/conrpc.c b/kernel/drivers/staging/lustre/lnet/selftest/conrpc.c
new file mode 100644
index 000000000..77f02b761
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lnet/selftest/conrpc.c
@@ -0,0 +1,1396 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/conctl.c
+ *
+ * Console framework rpcs
+ *
+ * Author: Liang Zhen <liang@whamcloud.com>
+ */
+
+
+#include "../../include/linux/libcfs/libcfs.h"
+#include "../../include/linux/lnet/lib-lnet.h"
+#include "timer.h"
+#include "conrpc.h"
+#include "console.h"
+
+void lstcon_rpc_stat_reply(lstcon_rpc_trans_t *, srpc_msg_t *,
+ lstcon_node_t *, lstcon_trans_stat_t *);
+
+static void
+lstcon_rpc_done(srpc_client_rpc_t *rpc)
+{
+ lstcon_rpc_t *crpc = (lstcon_rpc_t *)rpc->crpc_priv;
+
+ LASSERT(crpc != NULL && rpc == crpc->crp_rpc);
+ LASSERT(crpc->crp_posted && !crpc->crp_finished);
+
+ spin_lock(&rpc->crpc_lock);
+
+ if (crpc->crp_trans == NULL) {
+ /* Orphan RPC is not in any transaction,
+ * I'm just a poor body and nobody loves me */
+ spin_unlock(&rpc->crpc_lock);
+
+ /* release it */
+ lstcon_rpc_put(crpc);
+ return;
+ }
+
+ /* not an orphan RPC */
+ crpc->crp_finished = 1;
+
+ if (crpc->crp_stamp == 0) {
+ /* not aborted */
+ LASSERT(crpc->crp_status == 0);
+
+ crpc->crp_stamp = cfs_time_current();
+ crpc->crp_status = rpc->crpc_status;
+ }
+
+ /* wakeup (transaction)thread if I'm the last RPC in the transaction */
+ if (atomic_dec_and_test(&crpc->crp_trans->tas_remaining))
+ wake_up(&crpc->crp_trans->tas_waitq);
+
+ spin_unlock(&rpc->crpc_lock);
+}
+
+static int
+lstcon_rpc_init(lstcon_node_t *nd, int service, unsigned feats,
+ int bulk_npg, int bulk_len, int embedded, lstcon_rpc_t *crpc)
+{
+ crpc->crp_rpc = sfw_create_rpc(nd->nd_id, service,
+ feats, bulk_npg, bulk_len,
+ lstcon_rpc_done, (void *)crpc);
+ if (crpc->crp_rpc == NULL)
+ return -ENOMEM;
+
+ crpc->crp_trans = NULL;
+ crpc->crp_node = nd;
+ crpc->crp_posted = 0;
+ crpc->crp_finished = 0;
+ crpc->crp_unpacked = 0;
+ crpc->crp_status = 0;
+ crpc->crp_stamp = 0;
+ crpc->crp_embedded = embedded;
+ INIT_LIST_HEAD(&crpc->crp_link);
+
+ atomic_inc(&console_session.ses_rpc_counter);
+
+ return 0;
+}
+
+static int
+lstcon_rpc_prep(lstcon_node_t *nd, int service, unsigned feats,
+ int bulk_npg, int bulk_len, lstcon_rpc_t **crpcpp)
+{
+ lstcon_rpc_t *crpc = NULL;
+ int rc;
+
+ spin_lock(&console_session.ses_rpc_lock);
+
+ if (!list_empty(&console_session.ses_rpc_freelist)) {
+ crpc = list_entry(console_session.ses_rpc_freelist.next,
+ lstcon_rpc_t, crp_link);
+ list_del_init(&crpc->crp_link);
+ }
+
+ spin_unlock(&console_session.ses_rpc_lock);
+
+ if (crpc == NULL) {
+ LIBCFS_ALLOC(crpc, sizeof(*crpc));
+ if (crpc == NULL)
+ return -ENOMEM;
+ }
+
+ rc = lstcon_rpc_init(nd, service, feats, bulk_npg, bulk_len, 0, crpc);
+ if (rc == 0) {
+ *crpcpp = crpc;
+ return 0;
+ }
+
+ LIBCFS_FREE(crpc, sizeof(*crpc));
+
+ return rc;
+}
+
+void
+lstcon_rpc_put(lstcon_rpc_t *crpc)
+{
+ srpc_bulk_t *bulk = &crpc->crp_rpc->crpc_bulk;
+ int i;
+
+ LASSERT(list_empty(&crpc->crp_link));
+
+ for (i = 0; i < bulk->bk_niov; i++) {
+ if (bulk->bk_iovs[i].kiov_page == NULL)
+ continue;
+
+ __free_page(bulk->bk_iovs[i].kiov_page);
+ }
+
+ srpc_client_rpc_decref(crpc->crp_rpc);
+
+ if (crpc->crp_embedded) {
+ /* embedded RPC, don't recycle it */
+ memset(crpc, 0, sizeof(*crpc));
+ crpc->crp_embedded = 1;
+
+ } else {
+ spin_lock(&console_session.ses_rpc_lock);
+
+ list_add(&crpc->crp_link,
+ &console_session.ses_rpc_freelist);
+
+ spin_unlock(&console_session.ses_rpc_lock);
+ }
+
+ /* RPC is not alive now */
+ atomic_dec(&console_session.ses_rpc_counter);
+}
+
+static void
+lstcon_rpc_post(lstcon_rpc_t *crpc)
+{
+ lstcon_rpc_trans_t *trans = crpc->crp_trans;
+
+ LASSERT(trans != NULL);
+
+ atomic_inc(&trans->tas_remaining);
+ crpc->crp_posted = 1;
+
+ sfw_post_rpc(crpc->crp_rpc);
+}
+
+static char *
+lstcon_rpc_trans_name(int transop)
+{
+ if (transop == LST_TRANS_SESNEW)
+ return "SESNEW";
+
+ if (transop == LST_TRANS_SESEND)
+ return "SESEND";
+
+ if (transop == LST_TRANS_SESQRY)
+ return "SESQRY";
+
+ if (transop == LST_TRANS_SESPING)
+ return "SESPING";
+
+ if (transop == LST_TRANS_TSBCLIADD)
+ return "TSBCLIADD";
+
+ if (transop == LST_TRANS_TSBSRVADD)
+ return "TSBSRVADD";
+
+ if (transop == LST_TRANS_TSBRUN)
+ return "TSBRUN";
+
+ if (transop == LST_TRANS_TSBSTOP)
+ return "TSBSTOP";
+
+ if (transop == LST_TRANS_TSBCLIQRY)
+ return "TSBCLIQRY";
+
+ if (transop == LST_TRANS_TSBSRVQRY)
+ return "TSBSRVQRY";
+
+ if (transop == LST_TRANS_STATQRY)
+ return "STATQRY";
+
+ return "Unknown";
+}
+
+int
+lstcon_rpc_trans_prep(struct list_head *translist,
+ int transop, lstcon_rpc_trans_t **transpp)
+{
+ lstcon_rpc_trans_t *trans;
+
+ if (translist != NULL) {
+ list_for_each_entry(trans, translist, tas_link) {
+ /* Can't enqueue two private transaction on
+ * the same object */
+ if ((trans->tas_opc & transop) == LST_TRANS_PRIVATE)
+ return -EPERM;
+ }
+ }
+
+ /* create a trans group */
+ LIBCFS_ALLOC(trans, sizeof(*trans));
+ if (trans == NULL)
+ return -ENOMEM;
+
+ trans->tas_opc = transop;
+
+ if (translist == NULL)
+ INIT_LIST_HEAD(&trans->tas_olink);
+ else
+ list_add_tail(&trans->tas_olink, translist);
+
+ list_add_tail(&trans->tas_link, &console_session.ses_trans_list);
+
+ INIT_LIST_HEAD(&trans->tas_rpcs_list);
+ atomic_set(&trans->tas_remaining, 0);
+ init_waitqueue_head(&trans->tas_waitq);
+
+ spin_lock(&console_session.ses_rpc_lock);
+ trans->tas_features = console_session.ses_features;
+ spin_unlock(&console_session.ses_rpc_lock);
+
+ *transpp = trans;
+ return 0;
+}
+
+void
+lstcon_rpc_trans_addreq(lstcon_rpc_trans_t *trans, lstcon_rpc_t *crpc)
+{
+ list_add_tail(&crpc->crp_link, &trans->tas_rpcs_list);
+ crpc->crp_trans = trans;
+}
+
+void
+lstcon_rpc_trans_abort(lstcon_rpc_trans_t *trans, int error)
+{
+ srpc_client_rpc_t *rpc;
+ lstcon_rpc_t *crpc;
+ lstcon_node_t *nd;
+
+ list_for_each_entry(crpc, &trans->tas_rpcs_list, crp_link) {
+ rpc = crpc->crp_rpc;
+
+ spin_lock(&rpc->crpc_lock);
+
+ if (!crpc->crp_posted || /* not posted */
+ crpc->crp_stamp != 0) { /* rpc done or aborted already */
+ if (crpc->crp_stamp == 0) {
+ crpc->crp_stamp = cfs_time_current();
+ crpc->crp_status = -EINTR;
+ }
+ spin_unlock(&rpc->crpc_lock);
+ continue;
+ }
+
+ crpc->crp_stamp = cfs_time_current();
+ crpc->crp_status = error;
+
+ spin_unlock(&rpc->crpc_lock);
+
+ sfw_abort_rpc(rpc);
+
+ if (error != ETIMEDOUT)
+ continue;
+
+ nd = crpc->crp_node;
+ if (cfs_time_after(nd->nd_stamp, crpc->crp_stamp))
+ continue;
+
+ nd->nd_stamp = crpc->crp_stamp;
+ nd->nd_state = LST_NODE_DOWN;
+ }
+}
+
+static int
+lstcon_rpc_trans_check(lstcon_rpc_trans_t *trans)
+{
+ if (console_session.ses_shutdown &&
+ !list_empty(&trans->tas_olink)) /* Not an end session RPC */
+ return 1;
+
+ return (atomic_read(&trans->tas_remaining) == 0) ? 1 : 0;
+}
+
+int
+lstcon_rpc_trans_postwait(lstcon_rpc_trans_t *trans, int timeout)
+{
+ lstcon_rpc_t *crpc;
+ int rc;
+
+ if (list_empty(&trans->tas_rpcs_list))
+ return 0;
+
+ if (timeout < LST_TRANS_MIN_TIMEOUT)
+ timeout = LST_TRANS_MIN_TIMEOUT;
+
+ CDEBUG(D_NET, "Transaction %s started\n",
+ lstcon_rpc_trans_name(trans->tas_opc));
+
+ /* post all requests */
+ list_for_each_entry(crpc, &trans->tas_rpcs_list, crp_link) {
+ LASSERT(!crpc->crp_posted);
+
+ lstcon_rpc_post(crpc);
+ }
+
+ mutex_unlock(&console_session.ses_mutex);
+
+ rc = wait_event_interruptible_timeout(trans->tas_waitq,
+ lstcon_rpc_trans_check(trans),
+ cfs_time_seconds(timeout));
+ rc = (rc > 0) ? 0 : ((rc < 0) ? -EINTR : -ETIMEDOUT);
+
+ mutex_lock(&console_session.ses_mutex);
+
+ if (console_session.ses_shutdown)
+ rc = -ESHUTDOWN;
+
+ if (rc != 0 || atomic_read(&trans->tas_remaining) != 0) {
+ /* treat short timeout as canceled */
+ if (rc == -ETIMEDOUT && timeout < LST_TRANS_MIN_TIMEOUT * 2)
+ rc = -EINTR;
+
+ lstcon_rpc_trans_abort(trans, rc);
+ }
+
+ CDEBUG(D_NET, "Transaction %s stopped: %d\n",
+ lstcon_rpc_trans_name(trans->tas_opc), rc);
+
+ lstcon_rpc_trans_stat(trans, lstcon_trans_stat());
+
+ return rc;
+}
+
+static int
+lstcon_rpc_get_reply(lstcon_rpc_t *crpc, srpc_msg_t **msgpp)
+{
+ lstcon_node_t *nd = crpc->crp_node;
+ srpc_client_rpc_t *rpc = crpc->crp_rpc;
+ srpc_generic_reply_t *rep;
+
+ LASSERT(nd != NULL && rpc != NULL);
+ LASSERT(crpc->crp_stamp != 0);
+
+ if (crpc->crp_status != 0) {
+ *msgpp = NULL;
+ return crpc->crp_status;
+ }
+
+ *msgpp = &rpc->crpc_replymsg;
+ if (!crpc->crp_unpacked) {
+ sfw_unpack_message(*msgpp);
+ crpc->crp_unpacked = 1;
+ }
+
+ if (cfs_time_after(nd->nd_stamp, crpc->crp_stamp))
+ return 0;
+
+ nd->nd_stamp = crpc->crp_stamp;
+ rep = &(*msgpp)->msg_body.reply;
+
+ if (rep->sid.ses_nid == LNET_NID_ANY)
+ nd->nd_state = LST_NODE_UNKNOWN;
+ else if (lstcon_session_match(rep->sid))
+ nd->nd_state = LST_NODE_ACTIVE;
+ else
+ nd->nd_state = LST_NODE_BUSY;
+
+ return 0;
+}
+
+void
+lstcon_rpc_trans_stat(lstcon_rpc_trans_t *trans, lstcon_trans_stat_t *stat)
+{
+ lstcon_rpc_t *crpc;
+ srpc_msg_t *rep;
+ int error;
+
+ LASSERT(stat != NULL);
+
+ memset(stat, 0, sizeof(*stat));
+
+ list_for_each_entry(crpc, &trans->tas_rpcs_list, crp_link) {
+ lstcon_rpc_stat_total(stat, 1);
+
+ LASSERT(crpc->crp_stamp != 0);
+
+ error = lstcon_rpc_get_reply(crpc, &rep);
+ if (error != 0) {
+ lstcon_rpc_stat_failure(stat, 1);
+ if (stat->trs_rpc_errno == 0)
+ stat->trs_rpc_errno = -error;
+
+ continue;
+ }
+
+ lstcon_rpc_stat_success(stat, 1);
+
+ lstcon_rpc_stat_reply(trans, rep, crpc->crp_node, stat);
+ }
+
+ if (trans->tas_opc == LST_TRANS_SESNEW && stat->trs_fwk_errno == 0) {
+ stat->trs_fwk_errno =
+ lstcon_session_feats_check(trans->tas_features);
+ }
+
+ CDEBUG(D_NET, "transaction %s : success %d, failure %d, total %d, RPC error(%d), Framework error(%d)\n",
+ lstcon_rpc_trans_name(trans->tas_opc),
+ lstcon_rpc_stat_success(stat, 0),
+ lstcon_rpc_stat_failure(stat, 0),
+ lstcon_rpc_stat_total(stat, 0),
+ stat->trs_rpc_errno, stat->trs_fwk_errno);
+
+ return;
+}
+
+int
+lstcon_rpc_trans_interpreter(lstcon_rpc_trans_t *trans,
+ struct list_head *head_up,
+ lstcon_rpc_readent_func_t readent)
+{
+ struct list_head tmp;
+ struct list_head *next;
+ lstcon_rpc_ent_t *ent;
+ srpc_generic_reply_t *rep;
+ lstcon_rpc_t *crpc;
+ srpc_msg_t *msg;
+ lstcon_node_t *nd;
+ long dur;
+ struct timeval tv;
+ int error;
+
+ LASSERT(head_up != NULL);
+
+ next = head_up;
+
+ list_for_each_entry(crpc, &trans->tas_rpcs_list, crp_link) {
+ if (copy_from_user(&tmp, next,
+ sizeof(struct list_head)))
+ return -EFAULT;
+
+ if (tmp.next == head_up)
+ return 0;
+
+ next = tmp.next;
+
+ ent = list_entry(next, lstcon_rpc_ent_t, rpe_link);
+
+ LASSERT(crpc->crp_stamp != 0);
+
+ error = lstcon_rpc_get_reply(crpc, &msg);
+
+ nd = crpc->crp_node;
+
+ dur = (long)cfs_time_sub(crpc->crp_stamp,
+ (unsigned long)console_session.ses_id.ses_stamp);
+ cfs_duration_usec(dur, &tv);
+
+ if (copy_to_user(&ent->rpe_peer,
+ &nd->nd_id, sizeof(lnet_process_id_t)) ||
+ copy_to_user(&ent->rpe_stamp, &tv, sizeof(tv)) ||
+ copy_to_user(&ent->rpe_state,
+ &nd->nd_state, sizeof(nd->nd_state)) ||
+ copy_to_user(&ent->rpe_rpc_errno, &error,
+ sizeof(error)))
+ return -EFAULT;
+
+ if (error != 0)
+ continue;
+
+ /* RPC is done */
+ rep = (srpc_generic_reply_t *)&msg->msg_body.reply;
+
+ if (copy_to_user(&ent->rpe_sid,
+ &rep->sid, sizeof(lst_sid_t)) ||
+ copy_to_user(&ent->rpe_fwk_errno,
+ &rep->status, sizeof(rep->status)))
+ return -EFAULT;
+
+ if (readent == NULL)
+ continue;
+
+ error = readent(trans->tas_opc, msg, ent);
+
+ if (error != 0)
+ return error;
+ }
+
+ return 0;
+}
+
+void
+lstcon_rpc_trans_destroy(lstcon_rpc_trans_t *trans)
+{
+ srpc_client_rpc_t *rpc;
+ lstcon_rpc_t *crpc;
+ lstcon_rpc_t *tmp;
+ int count = 0;
+
+ list_for_each_entry_safe(crpc, tmp, &trans->tas_rpcs_list,
+ crp_link) {
+ rpc = crpc->crp_rpc;
+
+ spin_lock(&rpc->crpc_lock);
+
+ /* free it if not posted or finished already */
+ if (!crpc->crp_posted || crpc->crp_finished) {
+ spin_unlock(&rpc->crpc_lock);
+
+ list_del_init(&crpc->crp_link);
+ lstcon_rpc_put(crpc);
+
+ continue;
+ }
+
+ /* rpcs can be still not callbacked (even LNetMDUnlink is called)
+ * because huge timeout for inaccessible network, don't make
+ * user wait for them, just abandon them, they will be recycled
+ * in callback */
+
+ LASSERT(crpc->crp_status != 0);
+
+ crpc->crp_node = NULL;
+ crpc->crp_trans = NULL;
+ list_del_init(&crpc->crp_link);
+ count++;
+
+ spin_unlock(&rpc->crpc_lock);
+
+ atomic_dec(&trans->tas_remaining);
+ }
+
+ LASSERT(atomic_read(&trans->tas_remaining) == 0);
+
+ list_del(&trans->tas_link);
+ if (!list_empty(&trans->tas_olink))
+ list_del(&trans->tas_olink);
+
+ CDEBUG(D_NET, "Transaction %s destroyed with %d pending RPCs\n",
+ lstcon_rpc_trans_name(trans->tas_opc), count);
+
+ LIBCFS_FREE(trans, sizeof(*trans));
+
+ return;
+}
+
+int
+lstcon_sesrpc_prep(lstcon_node_t *nd, int transop,
+ unsigned feats, lstcon_rpc_t **crpc)
+{
+ srpc_mksn_reqst_t *msrq;
+ srpc_rmsn_reqst_t *rsrq;
+ int rc;
+
+ switch (transop) {
+ case LST_TRANS_SESNEW:
+ rc = lstcon_rpc_prep(nd, SRPC_SERVICE_MAKE_SESSION,
+ feats, 0, 0, crpc);
+ if (rc != 0)
+ return rc;
+
+ msrq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.mksn_reqst;
+ msrq->mksn_sid = console_session.ses_id;
+ msrq->mksn_force = console_session.ses_force;
+ strncpy(msrq->mksn_name, console_session.ses_name,
+ strlen(console_session.ses_name));
+ break;
+
+ case LST_TRANS_SESEND:
+ rc = lstcon_rpc_prep(nd, SRPC_SERVICE_REMOVE_SESSION,
+ feats, 0, 0, crpc);
+ if (rc != 0)
+ return rc;
+
+ rsrq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.rmsn_reqst;
+ rsrq->rmsn_sid = console_session.ses_id;
+ break;
+
+ default:
+ LBUG();
+ }
+
+ return 0;
+}
+
+int
+lstcon_dbgrpc_prep(lstcon_node_t *nd, unsigned feats, lstcon_rpc_t **crpc)
+{
+ srpc_debug_reqst_t *drq;
+ int rc;
+
+ rc = lstcon_rpc_prep(nd, SRPC_SERVICE_DEBUG, feats, 0, 0, crpc);
+ if (rc != 0)
+ return rc;
+
+ drq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.dbg_reqst;
+
+ drq->dbg_sid = console_session.ses_id;
+ drq->dbg_flags = 0;
+
+ return rc;
+}
+
+int
+lstcon_batrpc_prep(lstcon_node_t *nd, int transop, unsigned feats,
+ lstcon_tsb_hdr_t *tsb, lstcon_rpc_t **crpc)
+{
+ lstcon_batch_t *batch;
+ srpc_batch_reqst_t *brq;
+ int rc;
+
+ rc = lstcon_rpc_prep(nd, SRPC_SERVICE_BATCH, feats, 0, 0, crpc);
+ if (rc != 0)
+ return rc;
+
+ brq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.bat_reqst;
+
+ brq->bar_sid = console_session.ses_id;
+ brq->bar_bid = tsb->tsb_id;
+ brq->bar_testidx = tsb->tsb_index;
+ brq->bar_opc = transop == LST_TRANS_TSBRUN ? SRPC_BATCH_OPC_RUN :
+ (transop == LST_TRANS_TSBSTOP ? SRPC_BATCH_OPC_STOP :
+ SRPC_BATCH_OPC_QUERY);
+
+ if (transop != LST_TRANS_TSBRUN &&
+ transop != LST_TRANS_TSBSTOP)
+ return 0;
+
+ LASSERT(tsb->tsb_index == 0);
+
+ batch = (lstcon_batch_t *)tsb;
+ brq->bar_arg = batch->bat_arg;
+
+ return 0;
+}
+
+int
+lstcon_statrpc_prep(lstcon_node_t *nd, unsigned feats, lstcon_rpc_t **crpc)
+{
+ srpc_stat_reqst_t *srq;
+ int rc;
+
+ rc = lstcon_rpc_prep(nd, SRPC_SERVICE_QUERY_STAT, feats, 0, 0, crpc);
+ if (rc != 0)
+ return rc;
+
+ srq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.stat_reqst;
+
+ srq->str_sid = console_session.ses_id;
+ srq->str_type = 0; /* XXX remove it */
+
+ return 0;
+}
+
+static lnet_process_id_packed_t *
+lstcon_next_id(int idx, int nkiov, lnet_kiov_t *kiov)
+{
+ lnet_process_id_packed_t *pid;
+ int i;
+
+ i = idx / SFW_ID_PER_PAGE;
+
+ LASSERT(i < nkiov);
+
+ pid = (lnet_process_id_packed_t *)page_address(kiov[i].kiov_page);
+
+ return &pid[idx % SFW_ID_PER_PAGE];
+}
+
+static int
+lstcon_dstnodes_prep(lstcon_group_t *grp, int idx,
+ int dist, int span, int nkiov, lnet_kiov_t *kiov)
+{
+ lnet_process_id_packed_t *pid;
+ lstcon_ndlink_t *ndl;
+ lstcon_node_t *nd;
+ int start;
+ int end;
+ int i = 0;
+
+ LASSERT(dist >= 1);
+ LASSERT(span >= 1);
+ LASSERT(grp->grp_nnode >= 1);
+
+ if (span > grp->grp_nnode)
+ return -EINVAL;
+
+ start = ((idx / dist) * span) % grp->grp_nnode;
+ end = ((idx / dist) * span + span - 1) % grp->grp_nnode;
+
+ list_for_each_entry(ndl, &grp->grp_ndl_list, ndl_link) {
+ nd = ndl->ndl_node;
+ if (i < start) {
+ i++;
+ continue;
+ }
+
+ if (i > (end >= start ? end : grp->grp_nnode))
+ break;
+
+ pid = lstcon_next_id((i - start), nkiov, kiov);
+ pid->nid = nd->nd_id.nid;
+ pid->pid = nd->nd_id.pid;
+ i++;
+ }
+
+ if (start <= end) /* done */
+ return 0;
+
+ list_for_each_entry(ndl, &grp->grp_ndl_list, ndl_link) {
+ if (i > grp->grp_nnode + end)
+ break;
+
+ nd = ndl->ndl_node;
+ pid = lstcon_next_id((i - start), nkiov, kiov);
+ pid->nid = nd->nd_id.nid;
+ pid->pid = nd->nd_id.pid;
+ i++;
+ }
+
+ return 0;
+}
+
+static int
+lstcon_pingrpc_prep(lst_test_ping_param_t *param, srpc_test_reqst_t *req)
+{
+ test_ping_req_t *prq = &req->tsr_u.ping;
+
+ prq->png_size = param->png_size;
+ prq->png_flags = param->png_flags;
+ /* TODO dest */
+ return 0;
+}
+
+static int
+lstcon_bulkrpc_v0_prep(lst_test_bulk_param_t *param, srpc_test_reqst_t *req)
+{
+ test_bulk_req_t *brq = &req->tsr_u.bulk_v0;
+
+ brq->blk_opc = param->blk_opc;
+ brq->blk_npg = (param->blk_size + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE;
+ brq->blk_flags = param->blk_flags;
+
+ return 0;
+}
+
+static int
+lstcon_bulkrpc_v1_prep(lst_test_bulk_param_t *param, srpc_test_reqst_t *req)
+{
+ test_bulk_req_v1_t *brq = &req->tsr_u.bulk_v1;
+
+ brq->blk_opc = param->blk_opc;
+ brq->blk_flags = param->blk_flags;
+ brq->blk_len = param->blk_size;
+ brq->blk_offset = 0; /* reserved */
+
+ return 0;
+}
+
+int
+lstcon_testrpc_prep(lstcon_node_t *nd, int transop, unsigned feats,
+ lstcon_test_t *test, lstcon_rpc_t **crpc)
+{
+ lstcon_group_t *sgrp = test->tes_src_grp;
+ lstcon_group_t *dgrp = test->tes_dst_grp;
+ srpc_test_reqst_t *trq;
+ srpc_bulk_t *bulk;
+ int i;
+ int npg = 0;
+ int nob = 0;
+ int rc = 0;
+
+ if (transop == LST_TRANS_TSBCLIADD) {
+ npg = sfw_id_pages(test->tes_span);
+ nob = (feats & LST_FEAT_BULK_LEN) == 0 ?
+ npg * PAGE_CACHE_SIZE :
+ sizeof(lnet_process_id_packed_t) * test->tes_span;
+ }
+
+ rc = lstcon_rpc_prep(nd, SRPC_SERVICE_TEST, feats, npg, nob, crpc);
+ if (rc != 0)
+ return rc;
+
+ trq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.tes_reqst;
+
+ if (transop == LST_TRANS_TSBSRVADD) {
+ int ndist = (sgrp->grp_nnode + test->tes_dist - 1) / test->tes_dist;
+ int nspan = (dgrp->grp_nnode + test->tes_span - 1) / test->tes_span;
+ int nmax = (ndist + nspan - 1) / nspan;
+
+ trq->tsr_ndest = 0;
+ trq->tsr_loop = nmax * test->tes_dist * test->tes_concur;
+
+ } else {
+ bulk = &(*crpc)->crp_rpc->crpc_bulk;
+
+ for (i = 0; i < npg; i++) {
+ int len;
+
+ LASSERT(nob > 0);
+
+ len = (feats & LST_FEAT_BULK_LEN) == 0 ?
+ PAGE_CACHE_SIZE : min_t(int, nob, PAGE_CACHE_SIZE);
+ nob -= len;
+
+ bulk->bk_iovs[i].kiov_offset = 0;
+ bulk->bk_iovs[i].kiov_len = len;
+ bulk->bk_iovs[i].kiov_page =
+ alloc_page(GFP_IOFS);
+
+ if (bulk->bk_iovs[i].kiov_page == NULL) {
+ lstcon_rpc_put(*crpc);
+ return -ENOMEM;
+ }
+ }
+
+ bulk->bk_sink = 0;
+
+ LASSERT(transop == LST_TRANS_TSBCLIADD);
+
+ rc = lstcon_dstnodes_prep(test->tes_dst_grp,
+ test->tes_cliidx++,
+ test->tes_dist,
+ test->tes_span,
+ npg, &bulk->bk_iovs[0]);
+ if (rc != 0) {
+ lstcon_rpc_put(*crpc);
+ return rc;
+ }
+
+ trq->tsr_ndest = test->tes_span;
+ trq->tsr_loop = test->tes_loop;
+ }
+
+ trq->tsr_sid = console_session.ses_id;
+ trq->tsr_bid = test->tes_hdr.tsb_id;
+ trq->tsr_concur = test->tes_concur;
+ trq->tsr_is_client = (transop == LST_TRANS_TSBCLIADD) ? 1 : 0;
+ trq->tsr_stop_onerr = !!test->tes_stop_onerr;
+
+ switch (test->tes_type) {
+ case LST_TEST_PING:
+ trq->tsr_service = SRPC_SERVICE_PING;
+ rc = lstcon_pingrpc_prep((lst_test_ping_param_t *)
+ &test->tes_param[0], trq);
+ break;
+
+ case LST_TEST_BULK:
+ trq->tsr_service = SRPC_SERVICE_BRW;
+ if ((feats & LST_FEAT_BULK_LEN) == 0) {
+ rc = lstcon_bulkrpc_v0_prep((lst_test_bulk_param_t *)
+ &test->tes_param[0], trq);
+ } else {
+ rc = lstcon_bulkrpc_v1_prep((lst_test_bulk_param_t *)
+ &test->tes_param[0], trq);
+ }
+
+ break;
+ default:
+ LBUG();
+ break;
+ }
+
+ return rc;
+}
+
+static int
+lstcon_sesnew_stat_reply(lstcon_rpc_trans_t *trans,
+ lstcon_node_t *nd, srpc_msg_t *reply)
+{
+ srpc_mksn_reply_t *mksn_rep = &reply->msg_body.mksn_reply;
+ int status = mksn_rep->mksn_status;
+
+ if (status == 0 &&
+ (reply->msg_ses_feats & ~LST_FEATS_MASK) != 0) {
+ mksn_rep->mksn_status = EPROTO;
+ status = EPROTO;
+ }
+
+ if (status == EPROTO) {
+ CNETERR("session protocol error from %s: %u\n",
+ libcfs_nid2str(nd->nd_id.nid),
+ reply->msg_ses_feats);
+ }
+
+ if (status != 0)
+ return status;
+
+ if (!trans->tas_feats_updated) {
+ trans->tas_feats_updated = 1;
+ trans->tas_features = reply->msg_ses_feats;
+ }
+
+ if (reply->msg_ses_feats != trans->tas_features) {
+ CNETERR("Framework features %x from %s is different with features on this transaction: %x\n",
+ reply->msg_ses_feats, libcfs_nid2str(nd->nd_id.nid),
+ trans->tas_features);
+ status = mksn_rep->mksn_status = EPROTO;
+ }
+
+ if (status == 0) {
+ /* session timeout on remote node */
+ nd->nd_timeout = mksn_rep->mksn_timeout;
+ }
+
+ return status;
+}
+
+void
+lstcon_rpc_stat_reply(lstcon_rpc_trans_t *trans, srpc_msg_t *msg,
+ lstcon_node_t *nd, lstcon_trans_stat_t *stat)
+{
+ srpc_rmsn_reply_t *rmsn_rep;
+ srpc_debug_reply_t *dbg_rep;
+ srpc_batch_reply_t *bat_rep;
+ srpc_test_reply_t *test_rep;
+ srpc_stat_reply_t *stat_rep;
+ int rc = 0;
+
+ switch (trans->tas_opc) {
+ case LST_TRANS_SESNEW:
+ rc = lstcon_sesnew_stat_reply(trans, nd, msg);
+ if (rc == 0) {
+ lstcon_sesop_stat_success(stat, 1);
+ return;
+ }
+
+ lstcon_sesop_stat_failure(stat, 1);
+ break;
+
+ case LST_TRANS_SESEND:
+ rmsn_rep = &msg->msg_body.rmsn_reply;
+ /* ESRCH is not an error for end session */
+ if (rmsn_rep->rmsn_status == 0 ||
+ rmsn_rep->rmsn_status == ESRCH) {
+ lstcon_sesop_stat_success(stat, 1);
+ return;
+ }
+
+ lstcon_sesop_stat_failure(stat, 1);
+ rc = rmsn_rep->rmsn_status;
+ break;
+
+ case LST_TRANS_SESQRY:
+ case LST_TRANS_SESPING:
+ dbg_rep = &msg->msg_body.dbg_reply;
+
+ if (dbg_rep->dbg_status == ESRCH) {
+ lstcon_sesqry_stat_unknown(stat, 1);
+ return;
+ }
+
+ if (lstcon_session_match(dbg_rep->dbg_sid))
+ lstcon_sesqry_stat_active(stat, 1);
+ else
+ lstcon_sesqry_stat_busy(stat, 1);
+ return;
+
+ case LST_TRANS_TSBRUN:
+ case LST_TRANS_TSBSTOP:
+ bat_rep = &msg->msg_body.bat_reply;
+
+ if (bat_rep->bar_status == 0) {
+ lstcon_tsbop_stat_success(stat, 1);
+ return;
+ }
+
+ if (bat_rep->bar_status == EPERM &&
+ trans->tas_opc == LST_TRANS_TSBSTOP) {
+ lstcon_tsbop_stat_success(stat, 1);
+ return;
+ }
+
+ lstcon_tsbop_stat_failure(stat, 1);
+ rc = bat_rep->bar_status;
+ break;
+
+ case LST_TRANS_TSBCLIQRY:
+ case LST_TRANS_TSBSRVQRY:
+ bat_rep = &msg->msg_body.bat_reply;
+
+ if (bat_rep->bar_active != 0)
+ lstcon_tsbqry_stat_run(stat, 1);
+ else
+ lstcon_tsbqry_stat_idle(stat, 1);
+
+ if (bat_rep->bar_status == 0)
+ return;
+
+ lstcon_tsbqry_stat_failure(stat, 1);
+ rc = bat_rep->bar_status;
+ break;
+
+ case LST_TRANS_TSBCLIADD:
+ case LST_TRANS_TSBSRVADD:
+ test_rep = &msg->msg_body.tes_reply;
+
+ if (test_rep->tsr_status == 0) {
+ lstcon_tsbop_stat_success(stat, 1);
+ return;
+ }
+
+ lstcon_tsbop_stat_failure(stat, 1);
+ rc = test_rep->tsr_status;
+ break;
+
+ case LST_TRANS_STATQRY:
+ stat_rep = &msg->msg_body.stat_reply;
+
+ if (stat_rep->str_status == 0) {
+ lstcon_statqry_stat_success(stat, 1);
+ return;
+ }
+
+ lstcon_statqry_stat_failure(stat, 1);
+ rc = stat_rep->str_status;
+ break;
+
+ default:
+ LBUG();
+ }
+
+ if (stat->trs_fwk_errno == 0)
+ stat->trs_fwk_errno = rc;
+
+ return;
+}
+
+int
+lstcon_rpc_trans_ndlist(struct list_head *ndlist,
+ struct list_head *translist, int transop,
+ void *arg, lstcon_rpc_cond_func_t condition,
+ lstcon_rpc_trans_t **transpp)
+{
+ lstcon_rpc_trans_t *trans;
+ lstcon_ndlink_t *ndl;
+ lstcon_node_t *nd;
+ lstcon_rpc_t *rpc;
+ unsigned feats;
+ int rc;
+
+ /* Creating session RPG for list of nodes */
+
+ rc = lstcon_rpc_trans_prep(translist, transop, &trans);
+ if (rc != 0) {
+ CERROR("Can't create transaction %d: %d\n", transop, rc);
+ return rc;
+ }
+
+ feats = trans->tas_features;
+ list_for_each_entry(ndl, ndlist, ndl_link) {
+ rc = condition == NULL ? 1 :
+ condition(transop, ndl->ndl_node, arg);
+
+ if (rc == 0)
+ continue;
+
+ if (rc < 0) {
+ CDEBUG(D_NET, "Condition error while creating RPC for transaction %d: %d\n",
+ transop, rc);
+ break;
+ }
+
+ nd = ndl->ndl_node;
+
+ switch (transop) {
+ case LST_TRANS_SESNEW:
+ case LST_TRANS_SESEND:
+ rc = lstcon_sesrpc_prep(nd, transop, feats, &rpc);
+ break;
+ case LST_TRANS_SESQRY:
+ case LST_TRANS_SESPING:
+ rc = lstcon_dbgrpc_prep(nd, feats, &rpc);
+ break;
+ case LST_TRANS_TSBCLIADD:
+ case LST_TRANS_TSBSRVADD:
+ rc = lstcon_testrpc_prep(nd, transop, feats,
+ (lstcon_test_t *)arg, &rpc);
+ break;
+ case LST_TRANS_TSBRUN:
+ case LST_TRANS_TSBSTOP:
+ case LST_TRANS_TSBCLIQRY:
+ case LST_TRANS_TSBSRVQRY:
+ rc = lstcon_batrpc_prep(nd, transop, feats,
+ (lstcon_tsb_hdr_t *)arg, &rpc);
+ break;
+ case LST_TRANS_STATQRY:
+ rc = lstcon_statrpc_prep(nd, feats, &rpc);
+ break;
+ default:
+ rc = -EINVAL;
+ break;
+ }
+
+ if (rc != 0) {
+ CERROR("Failed to create RPC for transaction %s: %d\n",
+ lstcon_rpc_trans_name(transop), rc);
+ break;
+ }
+
+ lstcon_rpc_trans_addreq(trans, rpc);
+ }
+
+ if (rc == 0) {
+ *transpp = trans;
+ return 0;
+ }
+
+ lstcon_rpc_trans_destroy(trans);
+
+ return rc;
+}
+
+static void
+lstcon_rpc_pinger(void *arg)
+{
+ stt_timer_t *ptimer = (stt_timer_t *)arg;
+ lstcon_rpc_trans_t *trans;
+ lstcon_rpc_t *crpc;
+ srpc_msg_t *rep;
+ srpc_debug_reqst_t *drq;
+ lstcon_ndlink_t *ndl;
+ lstcon_node_t *nd;
+ time_t intv;
+ int count = 0;
+ int rc;
+
+ /* RPC pinger is a special case of transaction,
+ * it's called by timer at 8 seconds interval.
+ */
+ mutex_lock(&console_session.ses_mutex);
+
+ if (console_session.ses_shutdown || console_session.ses_expired) {
+ mutex_unlock(&console_session.ses_mutex);
+ return;
+ }
+
+ if (!console_session.ses_expired &&
+ get_seconds() - console_session.ses_laststamp >
+ (time_t)console_session.ses_timeout)
+ console_session.ses_expired = 1;
+
+ trans = console_session.ses_ping;
+
+ LASSERT(trans != NULL);
+
+ list_for_each_entry(ndl, &console_session.ses_ndl_list, ndl_link) {
+ nd = ndl->ndl_node;
+
+ if (console_session.ses_expired) {
+ /* idle console, end session on all nodes */
+ if (nd->nd_state != LST_NODE_ACTIVE)
+ continue;
+
+ rc = lstcon_sesrpc_prep(nd, LST_TRANS_SESEND,
+ trans->tas_features, &crpc);
+ if (rc != 0) {
+ CERROR("Out of memory\n");
+ break;
+ }
+
+ lstcon_rpc_trans_addreq(trans, crpc);
+ lstcon_rpc_post(crpc);
+
+ continue;
+ }
+
+ crpc = &nd->nd_ping;
+
+ if (crpc->crp_rpc != NULL) {
+ LASSERT(crpc->crp_trans == trans);
+ LASSERT(!list_empty(&crpc->crp_link));
+
+ spin_lock(&crpc->crp_rpc->crpc_lock);
+
+ LASSERT(crpc->crp_posted);
+
+ if (!crpc->crp_finished) {
+ /* in flight */
+ spin_unlock(&crpc->crp_rpc->crpc_lock);
+ continue;
+ }
+
+ spin_unlock(&crpc->crp_rpc->crpc_lock);
+
+ lstcon_rpc_get_reply(crpc, &rep);
+
+ list_del_init(&crpc->crp_link);
+
+ lstcon_rpc_put(crpc);
+ }
+
+ if (nd->nd_state != LST_NODE_ACTIVE)
+ continue;
+
+ intv = cfs_duration_sec(cfs_time_sub(cfs_time_current(),
+ nd->nd_stamp));
+ if (intv < (time_t)nd->nd_timeout / 2)
+ continue;
+
+ rc = lstcon_rpc_init(nd, SRPC_SERVICE_DEBUG,
+ trans->tas_features, 0, 0, 1, crpc);
+ if (rc != 0) {
+ CERROR("Out of memory\n");
+ break;
+ }
+
+ drq = &crpc->crp_rpc->crpc_reqstmsg.msg_body.dbg_reqst;
+
+ drq->dbg_sid = console_session.ses_id;
+ drq->dbg_flags = 0;
+
+ lstcon_rpc_trans_addreq(trans, crpc);
+ lstcon_rpc_post(crpc);
+
+ count++;
+ }
+
+ if (console_session.ses_expired) {
+ mutex_unlock(&console_session.ses_mutex);
+ return;
+ }
+
+ CDEBUG(D_NET, "Ping %d nodes in session\n", count);
+
+ ptimer->stt_expires = (unsigned long)(get_seconds() + LST_PING_INTERVAL);
+ stt_add_timer(ptimer);
+
+ mutex_unlock(&console_session.ses_mutex);
+}
+
+int
+lstcon_rpc_pinger_start(void)
+{
+ stt_timer_t *ptimer;
+ int rc;
+
+ LASSERT(list_empty(&console_session.ses_rpc_freelist));
+ LASSERT(atomic_read(&console_session.ses_rpc_counter) == 0);
+
+ rc = lstcon_rpc_trans_prep(NULL, LST_TRANS_SESPING,
+ &console_session.ses_ping);
+ if (rc != 0) {
+ CERROR("Failed to create console pinger\n");
+ return rc;
+ }
+
+ ptimer = &console_session.ses_ping_timer;
+ ptimer->stt_expires = (unsigned long)(get_seconds() + LST_PING_INTERVAL);
+
+ stt_add_timer(ptimer);
+
+ return 0;
+}
+
+void
+lstcon_rpc_pinger_stop(void)
+{
+ LASSERT(console_session.ses_shutdown);
+
+ stt_del_timer(&console_session.ses_ping_timer);
+
+ lstcon_rpc_trans_abort(console_session.ses_ping, -ESHUTDOWN);
+ lstcon_rpc_trans_stat(console_session.ses_ping, lstcon_trans_stat());
+ lstcon_rpc_trans_destroy(console_session.ses_ping);
+
+ memset(lstcon_trans_stat(), 0, sizeof(lstcon_trans_stat_t));
+
+ console_session.ses_ping = NULL;
+}
+
+void
+lstcon_rpc_cleanup_wait(void)
+{
+ lstcon_rpc_trans_t *trans;
+ lstcon_rpc_t *crpc;
+ struct list_head *pacer;
+ struct list_head zlist;
+
+ /* Called with hold of global mutex */
+
+ LASSERT(console_session.ses_shutdown);
+
+ while (!list_empty(&console_session.ses_trans_list)) {
+ list_for_each(pacer, &console_session.ses_trans_list) {
+ trans = list_entry(pacer, lstcon_rpc_trans_t,
+ tas_link);
+
+ CDEBUG(D_NET, "Session closed, wakeup transaction %s\n",
+ lstcon_rpc_trans_name(trans->tas_opc));
+
+ wake_up(&trans->tas_waitq);
+ }
+
+ mutex_unlock(&console_session.ses_mutex);
+
+ CWARN("Session is shutting down, waiting for termination of transactions\n");
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(cfs_time_seconds(1));
+
+ mutex_lock(&console_session.ses_mutex);
+ }
+
+ spin_lock(&console_session.ses_rpc_lock);
+
+ lst_wait_until((atomic_read(&console_session.ses_rpc_counter) == 0),
+ console_session.ses_rpc_lock,
+ "Network is not accessible or target is down, waiting for %d console RPCs to being recycled\n",
+ atomic_read(&console_session.ses_rpc_counter));
+
+ list_add(&zlist, &console_session.ses_rpc_freelist);
+ list_del_init(&console_session.ses_rpc_freelist);
+
+ spin_unlock(&console_session.ses_rpc_lock);
+
+ while (!list_empty(&zlist)) {
+ crpc = list_entry(zlist.next, lstcon_rpc_t, crp_link);
+
+ list_del(&crpc->crp_link);
+ LIBCFS_FREE(crpc, sizeof(lstcon_rpc_t));
+ }
+}
+
+int
+lstcon_rpc_module_init(void)
+{
+ INIT_LIST_HEAD(&console_session.ses_ping_timer.stt_list);
+ console_session.ses_ping_timer.stt_func = lstcon_rpc_pinger;
+ console_session.ses_ping_timer.stt_data = &console_session.ses_ping_timer;
+
+ console_session.ses_ping = NULL;
+
+ spin_lock_init(&console_session.ses_rpc_lock);
+ atomic_set(&console_session.ses_rpc_counter, 0);
+ INIT_LIST_HEAD(&console_session.ses_rpc_freelist);
+
+ return 0;
+}
+
+void
+lstcon_rpc_module_fini(void)
+{
+ LASSERT(list_empty(&console_session.ses_rpc_freelist));
+ LASSERT(atomic_read(&console_session.ses_rpc_counter) == 0);
+}
diff --git a/kernel/drivers/staging/lustre/lnet/selftest/conrpc.h b/kernel/drivers/staging/lustre/lnet/selftest/conrpc.h
new file mode 100644
index 000000000..2353889c6
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lnet/selftest/conrpc.h
@@ -0,0 +1,146 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * /lnet/selftest/conrpc.h
+ *
+ * Console rpc
+ *
+ * Author: Liang Zhen <liang@whamcloud.com>
+ */
+
+#ifndef __LST_CONRPC_H__
+#define __LST_CONRPC_H__
+
+#include "../../include/linux/libcfs/libcfs.h"
+#include "../../include/linux/lnet/lnet.h"
+#include "../../include/linux/lnet/lib-types.h"
+#include "../../include/linux/lnet/lnetst.h"
+#include "rpc.h"
+#include "selftest.h"
+
+/* Console rpc and rpc transaction */
+#define LST_TRANS_TIMEOUT 30
+#define LST_TRANS_MIN_TIMEOUT 3
+
+#define LST_VALIDATE_TIMEOUT(t) min(max(t, LST_TRANS_MIN_TIMEOUT), LST_TRANS_TIMEOUT)
+
+#define LST_PING_INTERVAL 8
+
+struct lstcon_rpc_trans;
+struct lstcon_tsb_hdr;
+struct lstcon_test;
+struct lstcon_node;
+
+typedef struct lstcon_rpc {
+ struct list_head crp_link; /* chain on rpc transaction */
+ srpc_client_rpc_t *crp_rpc; /* client rpc */
+ struct lstcon_node *crp_node; /* destination node */
+ struct lstcon_rpc_trans *crp_trans; /* conrpc transaction */
+
+ unsigned int crp_posted:1; /* rpc is posted */
+ unsigned int crp_finished:1; /* rpc is finished */
+ unsigned int crp_unpacked:1; /* reply is unpacked */
+ /** RPC is embedded in other structure and can't free it */
+ unsigned int crp_embedded:1;
+ int crp_status; /* console rpc errors */
+ unsigned long crp_stamp; /* replied time stamp */
+} lstcon_rpc_t;
+
+typedef struct lstcon_rpc_trans {
+ struct list_head tas_olink; /* link chain on owner list */
+ struct list_head tas_link; /* link chain on global list */
+ int tas_opc; /* operation code of transaction */
+ /* features mask is uptodate */
+ unsigned tas_feats_updated;
+ /* test features mask */
+ unsigned tas_features;
+ wait_queue_head_t tas_waitq; /* wait queue head */
+ atomic_t tas_remaining; /* # of un-scheduled rpcs */
+ struct list_head tas_rpcs_list; /* queued requests */
+} lstcon_rpc_trans_t;
+
+#define LST_TRANS_PRIVATE 0x1000
+
+#define LST_TRANS_SESNEW (LST_TRANS_PRIVATE | 0x01)
+#define LST_TRANS_SESEND (LST_TRANS_PRIVATE | 0x02)
+#define LST_TRANS_SESQRY 0x03
+#define LST_TRANS_SESPING 0x04
+
+#define LST_TRANS_TSBCLIADD (LST_TRANS_PRIVATE | 0x11)
+#define LST_TRANS_TSBSRVADD (LST_TRANS_PRIVATE | 0x12)
+#define LST_TRANS_TSBRUN (LST_TRANS_PRIVATE | 0x13)
+#define LST_TRANS_TSBSTOP (LST_TRANS_PRIVATE | 0x14)
+#define LST_TRANS_TSBCLIQRY 0x15
+#define LST_TRANS_TSBSRVQRY 0x16
+
+#define LST_TRANS_STATQRY 0x21
+
+typedef int (* lstcon_rpc_cond_func_t)(int, struct lstcon_node *, void *);
+typedef int (* lstcon_rpc_readent_func_t)(int, srpc_msg_t *, lstcon_rpc_ent_t *);
+
+int lstcon_sesrpc_prep(struct lstcon_node *nd, int transop,
+ unsigned version, lstcon_rpc_t **crpc);
+int lstcon_dbgrpc_prep(struct lstcon_node *nd,
+ unsigned version, lstcon_rpc_t **crpc);
+int lstcon_batrpc_prep(struct lstcon_node *nd, int transop, unsigned version,
+ struct lstcon_tsb_hdr *tsb, lstcon_rpc_t **crpc);
+int lstcon_testrpc_prep(struct lstcon_node *nd, int transop, unsigned version,
+ struct lstcon_test *test, lstcon_rpc_t **crpc);
+int lstcon_statrpc_prep(struct lstcon_node *nd, unsigned version,
+ lstcon_rpc_t **crpc);
+void lstcon_rpc_put(lstcon_rpc_t *crpc);
+int lstcon_rpc_trans_prep(struct list_head *translist,
+ int transop, lstcon_rpc_trans_t **transpp);
+int lstcon_rpc_trans_ndlist(struct list_head *ndlist,
+ struct list_head *translist, int transop,
+ void *arg, lstcon_rpc_cond_func_t condition,
+ lstcon_rpc_trans_t **transpp);
+void lstcon_rpc_trans_stat(lstcon_rpc_trans_t *trans,
+ lstcon_trans_stat_t *stat);
+int lstcon_rpc_trans_interpreter(lstcon_rpc_trans_t *trans,
+ struct list_head *head_up,
+ lstcon_rpc_readent_func_t readent);
+void lstcon_rpc_trans_abort(lstcon_rpc_trans_t *trans, int error);
+void lstcon_rpc_trans_destroy(lstcon_rpc_trans_t *trans);
+void lstcon_rpc_trans_addreq(lstcon_rpc_trans_t *trans, lstcon_rpc_t *req);
+int lstcon_rpc_trans_postwait(lstcon_rpc_trans_t *trans, int timeout);
+int lstcon_rpc_pinger_start(void);
+void lstcon_rpc_pinger_stop(void);
+void lstcon_rpc_cleanup_wait(void);
+int lstcon_rpc_module_init(void);
+void lstcon_rpc_module_fini(void);
+
+
+#endif
diff --git a/kernel/drivers/staging/lustre/lnet/selftest/console.c b/kernel/drivers/staging/lustre/lnet/selftest/console.c
new file mode 100644
index 000000000..2b5f53c7a
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lnet/selftest/console.c
@@ -0,0 +1,2096 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/conctl.c
+ *
+ * Infrastructure of LST console
+ *
+ * Author: Liang Zhen <liangzhen@clusterfs.com>
+ */
+
+
+#include "../../include/linux/libcfs/libcfs.h"
+#include "../../include/linux/lnet/lib-lnet.h"
+#include "console.h"
+#include "conrpc.h"
+
+#define LST_NODE_STATE_COUNTER(nd, p) \
+do { \
+ if ((nd)->nd_state == LST_NODE_ACTIVE) \
+ (p)->nle_nactive++; \
+ else if ((nd)->nd_state == LST_NODE_BUSY) \
+ (p)->nle_nbusy++; \
+ else if ((nd)->nd_state == LST_NODE_DOWN) \
+ (p)->nle_ndown++; \
+ else \
+ (p)->nle_nunknown++; \
+ (p)->nle_nnode++; \
+} while (0)
+
+lstcon_session_t console_session;
+
+static void
+lstcon_node_get(lstcon_node_t *nd)
+{
+ LASSERT(nd->nd_ref >= 1);
+
+ nd->nd_ref++;
+}
+
+static int
+lstcon_node_find(lnet_process_id_t id, lstcon_node_t **ndpp, int create)
+{
+ lstcon_ndlink_t *ndl;
+ unsigned int idx = LNET_NIDADDR(id.nid) % LST_GLOBAL_HASHSIZE;
+
+ LASSERT(id.nid != LNET_NID_ANY);
+
+ list_for_each_entry(ndl, &console_session.ses_ndl_hash[idx], ndl_hlink) {
+ if (ndl->ndl_node->nd_id.nid != id.nid ||
+ ndl->ndl_node->nd_id.pid != id.pid)
+ continue;
+
+ lstcon_node_get(ndl->ndl_node);
+ *ndpp = ndl->ndl_node;
+ return 0;
+ }
+
+ if (!create)
+ return -ENOENT;
+
+ LIBCFS_ALLOC(*ndpp, sizeof(lstcon_node_t) + sizeof(lstcon_ndlink_t));
+ if (*ndpp == NULL)
+ return -ENOMEM;
+
+ ndl = (lstcon_ndlink_t *)(*ndpp + 1);
+
+ ndl->ndl_node = *ndpp;
+
+ ndl->ndl_node->nd_ref = 1;
+ ndl->ndl_node->nd_id = id;
+ ndl->ndl_node->nd_stamp = cfs_time_current();
+ ndl->ndl_node->nd_state = LST_NODE_UNKNOWN;
+ ndl->ndl_node->nd_timeout = 0;
+ memset(&ndl->ndl_node->nd_ping, 0, sizeof(lstcon_rpc_t));
+
+ /* queued in global hash & list, no refcount is taken by
+ * global hash & list, if caller release his refcount,
+ * node will be released */
+ list_add_tail(&ndl->ndl_hlink, &console_session.ses_ndl_hash[idx]);
+ list_add_tail(&ndl->ndl_link, &console_session.ses_ndl_list);
+
+ return 0;
+}
+
+static void
+lstcon_node_put(lstcon_node_t *nd)
+{
+ lstcon_ndlink_t *ndl;
+
+ LASSERT(nd->nd_ref > 0);
+
+ if (--nd->nd_ref > 0)
+ return;
+
+ ndl = (lstcon_ndlink_t *)(nd + 1);
+
+ LASSERT(!list_empty(&ndl->ndl_link));
+ LASSERT(!list_empty(&ndl->ndl_hlink));
+
+ /* remove from session */
+ list_del(&ndl->ndl_link);
+ list_del(&ndl->ndl_hlink);
+
+ LIBCFS_FREE(nd, sizeof(lstcon_node_t) + sizeof(lstcon_ndlink_t));
+}
+
+static int
+lstcon_ndlink_find(struct list_head *hash,
+ lnet_process_id_t id, lstcon_ndlink_t **ndlpp, int create)
+{
+ unsigned int idx = LNET_NIDADDR(id.nid) % LST_NODE_HASHSIZE;
+ lstcon_ndlink_t *ndl;
+ lstcon_node_t *nd;
+ int rc;
+
+ if (id.nid == LNET_NID_ANY)
+ return -EINVAL;
+
+ /* search in hash */
+ list_for_each_entry(ndl, &hash[idx], ndl_hlink) {
+ if (ndl->ndl_node->nd_id.nid != id.nid ||
+ ndl->ndl_node->nd_id.pid != id.pid)
+ continue;
+
+ *ndlpp = ndl;
+ return 0;
+ }
+
+ if (create == 0)
+ return -ENOENT;
+
+ /* find or create in session hash */
+ rc = lstcon_node_find(id, &nd, (create == 1) ? 1 : 0);
+ if (rc != 0)
+ return rc;
+
+ LIBCFS_ALLOC(ndl, sizeof(lstcon_ndlink_t));
+ if (ndl == NULL) {
+ lstcon_node_put(nd);
+ return -ENOMEM;
+ }
+
+ *ndlpp = ndl;
+
+ ndl->ndl_node = nd;
+ INIT_LIST_HEAD(&ndl->ndl_link);
+ list_add_tail(&ndl->ndl_hlink, &hash[idx]);
+
+ return 0;
+}
+
+static void
+lstcon_ndlink_release(lstcon_ndlink_t *ndl)
+{
+ LASSERT(list_empty(&ndl->ndl_link));
+ LASSERT(!list_empty(&ndl->ndl_hlink));
+
+ list_del(&ndl->ndl_hlink); /* delete from hash */
+ lstcon_node_put(ndl->ndl_node);
+
+ LIBCFS_FREE(ndl, sizeof(*ndl));
+}
+
+static int
+lstcon_group_alloc(char *name, lstcon_group_t **grpp)
+{
+ lstcon_group_t *grp;
+ int i;
+
+ LIBCFS_ALLOC(grp, offsetof(lstcon_group_t,
+ grp_ndl_hash[LST_NODE_HASHSIZE]));
+ if (grp == NULL)
+ return -ENOMEM;
+
+ grp->grp_ref = 1;
+ if (name != NULL)
+ strcpy(grp->grp_name, name);
+
+ INIT_LIST_HEAD(&grp->grp_link);
+ INIT_LIST_HEAD(&grp->grp_ndl_list);
+ INIT_LIST_HEAD(&grp->grp_trans_list);
+
+ for (i = 0; i < LST_NODE_HASHSIZE; i++)
+ INIT_LIST_HEAD(&grp->grp_ndl_hash[i]);
+
+ *grpp = grp;
+
+ return 0;
+}
+
+static void
+lstcon_group_addref(lstcon_group_t *grp)
+{
+ grp->grp_ref++;
+}
+
+static void lstcon_group_ndlink_release(lstcon_group_t *, lstcon_ndlink_t *);
+
+static void
+lstcon_group_drain(lstcon_group_t *grp, int keep)
+{
+ lstcon_ndlink_t *ndl;
+ lstcon_ndlink_t *tmp;
+
+ list_for_each_entry_safe(ndl, tmp, &grp->grp_ndl_list, ndl_link) {
+ if ((ndl->ndl_node->nd_state & keep) == 0)
+ lstcon_group_ndlink_release(grp, ndl);
+ }
+}
+
+static void
+lstcon_group_decref(lstcon_group_t *grp)
+{
+ int i;
+
+ if (--grp->grp_ref > 0)
+ return;
+
+ if (!list_empty(&grp->grp_link))
+ list_del(&grp->grp_link);
+
+ lstcon_group_drain(grp, 0);
+
+ for (i = 0; i < LST_NODE_HASHSIZE; i++) {
+ LASSERT(list_empty(&grp->grp_ndl_hash[i]));
+ }
+
+ LIBCFS_FREE(grp, offsetof(lstcon_group_t,
+ grp_ndl_hash[LST_NODE_HASHSIZE]));
+}
+
+static int
+lstcon_group_find(const char *name, lstcon_group_t **grpp)
+{
+ lstcon_group_t *grp;
+
+ list_for_each_entry(grp, &console_session.ses_grp_list, grp_link) {
+ if (strncmp(grp->grp_name, name, LST_NAME_SIZE) != 0)
+ continue;
+
+ lstcon_group_addref(grp); /* +1 ref for caller */
+ *grpp = grp;
+ return 0;
+ }
+
+ return -ENOENT;
+}
+
+static void
+lstcon_group_put(lstcon_group_t *grp)
+{
+ lstcon_group_decref(grp);
+}
+
+static int
+lstcon_group_ndlink_find(lstcon_group_t *grp, lnet_process_id_t id,
+ lstcon_ndlink_t **ndlpp, int create)
+{
+ int rc;
+
+ rc = lstcon_ndlink_find(&grp->grp_ndl_hash[0], id, ndlpp, create);
+ if (rc != 0)
+ return rc;
+
+ if (!list_empty(&(*ndlpp)->ndl_link))
+ return 0;
+
+ list_add_tail(&(*ndlpp)->ndl_link, &grp->grp_ndl_list);
+ grp->grp_nnode++;
+
+ return 0;
+}
+
+static void
+lstcon_group_ndlink_release(lstcon_group_t *grp, lstcon_ndlink_t *ndl)
+{
+ list_del_init(&ndl->ndl_link);
+ lstcon_ndlink_release(ndl);
+ grp->grp_nnode --;
+}
+
+static void
+lstcon_group_ndlink_move(lstcon_group_t *old,
+ lstcon_group_t *new, lstcon_ndlink_t *ndl)
+{
+ unsigned int idx = LNET_NIDADDR(ndl->ndl_node->nd_id.nid) %
+ LST_NODE_HASHSIZE;
+
+ list_del(&ndl->ndl_hlink);
+ list_del(&ndl->ndl_link);
+ old->grp_nnode --;
+
+ list_add_tail(&ndl->ndl_hlink, &new->grp_ndl_hash[idx]);
+ list_add_tail(&ndl->ndl_link, &new->grp_ndl_list);
+ new->grp_nnode++;
+
+ return;
+}
+
+static void
+lstcon_group_move(lstcon_group_t *old, lstcon_group_t *new)
+{
+ lstcon_ndlink_t *ndl;
+
+ while (!list_empty(&old->grp_ndl_list)) {
+ ndl = list_entry(old->grp_ndl_list.next,
+ lstcon_ndlink_t, ndl_link);
+ lstcon_group_ndlink_move(old, new, ndl);
+ }
+}
+
+static int
+lstcon_sesrpc_condition(int transop, lstcon_node_t *nd, void *arg)
+{
+ lstcon_group_t *grp = (lstcon_group_t *)arg;
+
+ switch (transop) {
+ case LST_TRANS_SESNEW:
+ if (nd->nd_state == LST_NODE_ACTIVE)
+ return 0;
+ break;
+
+ case LST_TRANS_SESEND:
+ if (nd->nd_state != LST_NODE_ACTIVE)
+ return 0;
+
+ if (grp != NULL && nd->nd_ref > 1)
+ return 0;
+ break;
+
+ case LST_TRANS_SESQRY:
+ break;
+
+ default:
+ LBUG();
+ }
+
+ return 1;
+}
+
+static int
+lstcon_sesrpc_readent(int transop, srpc_msg_t *msg,
+ lstcon_rpc_ent_t *ent_up)
+{
+ srpc_debug_reply_t *rep;
+
+ switch (transop) {
+ case LST_TRANS_SESNEW:
+ case LST_TRANS_SESEND:
+ return 0;
+
+ case LST_TRANS_SESQRY:
+ rep = &msg->msg_body.dbg_reply;
+
+ if (copy_to_user(&ent_up->rpe_priv[0],
+ &rep->dbg_timeout, sizeof(int)) ||
+ copy_to_user(&ent_up->rpe_payload[0],
+ &rep->dbg_name, LST_NAME_SIZE))
+ return -EFAULT;
+
+ return 0;
+
+ default:
+ LBUG();
+ }
+
+ return 0;
+}
+
+static int
+lstcon_group_nodes_add(lstcon_group_t *grp,
+ int count, lnet_process_id_t *ids_up,
+ unsigned *featp, struct list_head *result_up)
+{
+ lstcon_rpc_trans_t *trans;
+ lstcon_ndlink_t *ndl;
+ lstcon_group_t *tmp;
+ lnet_process_id_t id;
+ int i;
+ int rc;
+
+ rc = lstcon_group_alloc(NULL, &tmp);
+ if (rc != 0) {
+ CERROR("Out of memory\n");
+ return -ENOMEM;
+ }
+
+ for (i = 0 ; i < count; i++) {
+ if (copy_from_user(&id, &ids_up[i], sizeof(id))) {
+ rc = -EFAULT;
+ break;
+ }
+
+ /* skip if it's in this group already */
+ rc = lstcon_group_ndlink_find(grp, id, &ndl, 0);
+ if (rc == 0)
+ continue;
+
+ /* add to tmp group */
+ rc = lstcon_group_ndlink_find(tmp, id, &ndl, 1);
+ if (rc != 0) {
+ CERROR("Can't create ndlink, out of memory\n");
+ break;
+ }
+ }
+
+ if (rc != 0) {
+ lstcon_group_put(tmp);
+ return rc;
+ }
+
+ rc = lstcon_rpc_trans_ndlist(&tmp->grp_ndl_list,
+ &tmp->grp_trans_list, LST_TRANS_SESNEW,
+ tmp, lstcon_sesrpc_condition, &trans);
+ if (rc != 0) {
+ CERROR("Can't create transaction: %d\n", rc);
+ lstcon_group_put(tmp);
+ return rc;
+ }
+
+ /* post all RPCs */
+ lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+
+ rc = lstcon_rpc_trans_interpreter(trans, result_up,
+ lstcon_sesrpc_readent);
+ *featp = trans->tas_features;
+
+ /* destroy all RPGs */
+ lstcon_rpc_trans_destroy(trans);
+
+ lstcon_group_move(tmp, grp);
+ lstcon_group_put(tmp);
+
+ return rc;
+}
+
+static int
+lstcon_group_nodes_remove(lstcon_group_t *grp,
+ int count, lnet_process_id_t *ids_up,
+ struct list_head *result_up)
+{
+ lstcon_rpc_trans_t *trans;
+ lstcon_ndlink_t *ndl;
+ lstcon_group_t *tmp;
+ lnet_process_id_t id;
+ int rc;
+ int i;
+
+ /* End session and remove node from the group */
+
+ rc = lstcon_group_alloc(NULL, &tmp);
+ if (rc != 0) {
+ CERROR("Out of memory\n");
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < count; i++) {
+ if (copy_from_user(&id, &ids_up[i], sizeof(id))) {
+ rc = -EFAULT;
+ goto error;
+ }
+
+ /* move node to tmp group */
+ if (lstcon_group_ndlink_find(grp, id, &ndl, 0) == 0)
+ lstcon_group_ndlink_move(grp, tmp, ndl);
+ }
+
+ rc = lstcon_rpc_trans_ndlist(&tmp->grp_ndl_list,
+ &tmp->grp_trans_list, LST_TRANS_SESEND,
+ tmp, lstcon_sesrpc_condition, &trans);
+ if (rc != 0) {
+ CERROR("Can't create transaction: %d\n", rc);
+ goto error;
+ }
+
+ lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+
+ rc = lstcon_rpc_trans_interpreter(trans, result_up, NULL);
+
+ lstcon_rpc_trans_destroy(trans);
+ /* release nodes anyway, because we can't rollback status */
+ lstcon_group_put(tmp);
+
+ return rc;
+error:
+ lstcon_group_move(tmp, grp);
+ lstcon_group_put(tmp);
+
+ return rc;
+}
+
+int
+lstcon_group_add(char *name)
+{
+ lstcon_group_t *grp;
+ int rc;
+
+ rc = (lstcon_group_find(name, &grp) == 0)? -EEXIST: 0;
+ if (rc != 0) {
+ /* find a group with same name */
+ lstcon_group_put(grp);
+ return rc;
+ }
+
+ rc = lstcon_group_alloc(name, &grp);
+ if (rc != 0) {
+ CERROR("Can't allocate descriptor for group %s\n", name);
+ return -ENOMEM;
+ }
+
+ list_add_tail(&grp->grp_link, &console_session.ses_grp_list);
+
+ return rc;
+}
+
+int
+lstcon_nodes_add(char *name, int count, lnet_process_id_t *ids_up,
+ unsigned *featp, struct list_head *result_up)
+{
+ lstcon_group_t *grp;
+ int rc;
+
+ LASSERT(count > 0);
+ LASSERT(ids_up != NULL);
+
+ rc = lstcon_group_find(name, &grp);
+ if (rc != 0) {
+ CDEBUG(D_NET, "Can't find group %s\n", name);
+ return rc;
+ }
+
+ if (grp->grp_ref > 2) {
+ /* referred by other threads or test */
+ CDEBUG(D_NET, "Group %s is busy\n", name);
+ lstcon_group_put(grp);
+
+ return -EBUSY;
+ }
+
+ rc = lstcon_group_nodes_add(grp, count, ids_up, featp, result_up);
+
+ lstcon_group_put(grp);
+
+ return rc;
+}
+
+int
+lstcon_group_del(char *name)
+{
+ lstcon_rpc_trans_t *trans;
+ lstcon_group_t *grp;
+ int rc;
+
+ rc = lstcon_group_find(name, &grp);
+ if (rc != 0) {
+ CDEBUG(D_NET, "Can't find group: %s\n", name);
+ return rc;
+ }
+
+ if (grp->grp_ref > 2) {
+ /* referred by others threads or test */
+ CDEBUG(D_NET, "Group %s is busy\n", name);
+ lstcon_group_put(grp);
+ return -EBUSY;
+ }
+
+ rc = lstcon_rpc_trans_ndlist(&grp->grp_ndl_list,
+ &grp->grp_trans_list, LST_TRANS_SESEND,
+ grp, lstcon_sesrpc_condition, &trans);
+ if (rc != 0) {
+ CERROR("Can't create transaction: %d\n", rc);
+ lstcon_group_put(grp);
+ return rc;
+ }
+
+ lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+
+ lstcon_rpc_trans_destroy(trans);
+
+ lstcon_group_put(grp);
+ /* -ref for session, it's destroyed,
+ * status can't be rolled back, destroy group anyway */
+ lstcon_group_put(grp);
+
+ return rc;
+}
+
+int
+lstcon_group_clean(char *name, int args)
+{
+ lstcon_group_t *grp = NULL;
+ int rc;
+
+ rc = lstcon_group_find(name, &grp);
+ if (rc != 0) {
+ CDEBUG(D_NET, "Can't find group %s\n", name);
+ return rc;
+ }
+
+ if (grp->grp_ref > 2) {
+ /* referred by test */
+ CDEBUG(D_NET, "Group %s is busy\n", name);
+ lstcon_group_put(grp);
+ return -EBUSY;
+ }
+
+ args = (LST_NODE_ACTIVE | LST_NODE_BUSY |
+ LST_NODE_DOWN | LST_NODE_UNKNOWN) & ~args;
+
+ lstcon_group_drain(grp, args);
+
+ lstcon_group_put(grp);
+ /* release empty group */
+ if (list_empty(&grp->grp_ndl_list))
+ lstcon_group_put(grp);
+
+ return 0;
+}
+
+int
+lstcon_nodes_remove(char *name, int count,
+ lnet_process_id_t *ids_up, struct list_head *result_up)
+{
+ lstcon_group_t *grp = NULL;
+ int rc;
+
+ rc = lstcon_group_find(name, &grp);
+ if (rc != 0) {
+ CDEBUG(D_NET, "Can't find group: %s\n", name);
+ return rc;
+ }
+
+ if (grp->grp_ref > 2) {
+ /* referred by test */
+ CDEBUG(D_NET, "Group %s is busy\n", name);
+ lstcon_group_put(grp);
+ return -EBUSY;
+ }
+
+ rc = lstcon_group_nodes_remove(grp, count, ids_up, result_up);
+
+ lstcon_group_put(grp);
+ /* release empty group */
+ if (list_empty(&grp->grp_ndl_list))
+ lstcon_group_put(grp);
+
+ return rc;
+}
+
+int
+lstcon_group_refresh(char *name, struct list_head *result_up)
+{
+ lstcon_rpc_trans_t *trans;
+ lstcon_group_t *grp;
+ int rc;
+
+ rc = lstcon_group_find(name, &grp);
+ if (rc != 0) {
+ CDEBUG(D_NET, "Can't find group: %s\n", name);
+ return rc;
+ }
+
+ if (grp->grp_ref > 2) {
+ /* referred by test */
+ CDEBUG(D_NET, "Group %s is busy\n", name);
+ lstcon_group_put(grp);
+ return -EBUSY;
+ }
+
+ /* re-invite all inactive nodes int the group */
+ rc = lstcon_rpc_trans_ndlist(&grp->grp_ndl_list,
+ &grp->grp_trans_list, LST_TRANS_SESNEW,
+ grp, lstcon_sesrpc_condition, &trans);
+ if (rc != 0) {
+ /* local error, return */
+ CDEBUG(D_NET, "Can't create transaction: %d\n", rc);
+ lstcon_group_put(grp);
+ return rc;
+ }
+
+ lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+
+ rc = lstcon_rpc_trans_interpreter(trans, result_up, NULL);
+
+ lstcon_rpc_trans_destroy(trans);
+ /* -ref for me */
+ lstcon_group_put(grp);
+
+ return rc;
+}
+
+int
+lstcon_group_list(int index, int len, char *name_up)
+{
+ lstcon_group_t *grp;
+
+ LASSERT(index >= 0);
+ LASSERT(name_up != NULL);
+
+ list_for_each_entry(grp, &console_session.ses_grp_list, grp_link) {
+ if (index-- == 0) {
+ return copy_to_user(name_up, grp->grp_name, len) ?
+ -EFAULT : 0;
+ }
+ }
+
+ return -ENOENT;
+}
+
+static int
+lstcon_nodes_getent(struct list_head *head, int *index_p,
+ int *count_p, lstcon_node_ent_t *dents_up)
+{
+ lstcon_ndlink_t *ndl;
+ lstcon_node_t *nd;
+ int count = 0;
+ int index = 0;
+
+ LASSERT(index_p != NULL && count_p != NULL);
+ LASSERT(dents_up != NULL);
+ LASSERT(*index_p >= 0);
+ LASSERT(*count_p > 0);
+
+ list_for_each_entry(ndl, head, ndl_link) {
+ if (index++ < *index_p)
+ continue;
+
+ if (count >= *count_p)
+ break;
+
+ nd = ndl->ndl_node;
+ if (copy_to_user(&dents_up[count].nde_id,
+ &nd->nd_id, sizeof(nd->nd_id)) ||
+ copy_to_user(&dents_up[count].nde_state,
+ &nd->nd_state, sizeof(nd->nd_state)))
+ return -EFAULT;
+
+ count++;
+ }
+
+ if (index <= *index_p)
+ return -ENOENT;
+
+ *count_p = count;
+ *index_p = index;
+
+ return 0;
+}
+
+int
+lstcon_group_info(char *name, lstcon_ndlist_ent_t *gents_p,
+ int *index_p, int *count_p, lstcon_node_ent_t *dents_up)
+{
+ lstcon_ndlist_ent_t *gentp;
+ lstcon_group_t *grp;
+ lstcon_ndlink_t *ndl;
+ int rc;
+
+ rc = lstcon_group_find(name, &grp);
+ if (rc != 0) {
+ CDEBUG(D_NET, "Can't find group %s\n", name);
+ return rc;
+ }
+
+ if (dents_up) {
+ /* verbose query */
+ rc = lstcon_nodes_getent(&grp->grp_ndl_list,
+ index_p, count_p, dents_up);
+ lstcon_group_put(grp);
+
+ return rc;
+ }
+
+ /* non-verbose query */
+ LIBCFS_ALLOC(gentp, sizeof(lstcon_ndlist_ent_t));
+ if (gentp == NULL) {
+ CERROR("Can't allocate ndlist_ent\n");
+ lstcon_group_put(grp);
+
+ return -ENOMEM;
+ }
+
+ list_for_each_entry(ndl, &grp->grp_ndl_list, ndl_link)
+ LST_NODE_STATE_COUNTER(ndl->ndl_node, gentp);
+
+ rc = copy_to_user(gents_p, gentp,
+ sizeof(lstcon_ndlist_ent_t)) ? -EFAULT: 0;
+
+ LIBCFS_FREE(gentp, sizeof(lstcon_ndlist_ent_t));
+
+ lstcon_group_put(grp);
+
+ return 0;
+}
+
+static int
+lstcon_batch_find(const char *name, lstcon_batch_t **batpp)
+{
+ lstcon_batch_t *bat;
+
+ list_for_each_entry(bat, &console_session.ses_bat_list, bat_link) {
+ if (strncmp(bat->bat_name, name, LST_NAME_SIZE) == 0) {
+ *batpp = bat;
+ return 0;
+ }
+ }
+
+ return -ENOENT;
+}
+
+int
+lstcon_batch_add(char *name)
+{
+ lstcon_batch_t *bat;
+ int i;
+ int rc;
+
+ rc = (lstcon_batch_find(name, &bat) == 0)? -EEXIST: 0;
+ if (rc != 0) {
+ CDEBUG(D_NET, "Batch %s already exists\n", name);
+ return rc;
+ }
+
+ LIBCFS_ALLOC(bat, sizeof(lstcon_batch_t));
+ if (bat == NULL) {
+ CERROR("Can't allocate descriptor for batch %s\n", name);
+ return -ENOMEM;
+ }
+
+ LIBCFS_ALLOC(bat->bat_cli_hash,
+ sizeof(struct list_head) * LST_NODE_HASHSIZE);
+ if (bat->bat_cli_hash == NULL) {
+ CERROR("Can't allocate hash for batch %s\n", name);
+ LIBCFS_FREE(bat, sizeof(lstcon_batch_t));
+
+ return -ENOMEM;
+ }
+
+ LIBCFS_ALLOC(bat->bat_srv_hash,
+ sizeof(struct list_head) * LST_NODE_HASHSIZE);
+ if (bat->bat_srv_hash == NULL) {
+ CERROR("Can't allocate hash for batch %s\n", name);
+ LIBCFS_FREE(bat->bat_cli_hash, LST_NODE_HASHSIZE);
+ LIBCFS_FREE(bat, sizeof(lstcon_batch_t));
+
+ return -ENOMEM;
+ }
+
+ strcpy(bat->bat_name, name);
+ bat->bat_hdr.tsb_index = 0;
+ bat->bat_hdr.tsb_id.bat_id = ++console_session.ses_id_cookie;
+
+ bat->bat_ntest = 0;
+ bat->bat_state = LST_BATCH_IDLE;
+
+ INIT_LIST_HEAD(&bat->bat_cli_list);
+ INIT_LIST_HEAD(&bat->bat_srv_list);
+ INIT_LIST_HEAD(&bat->bat_test_list);
+ INIT_LIST_HEAD(&bat->bat_trans_list);
+
+ for (i = 0; i < LST_NODE_HASHSIZE; i++) {
+ INIT_LIST_HEAD(&bat->bat_cli_hash[i]);
+ INIT_LIST_HEAD(&bat->bat_srv_hash[i]);
+ }
+
+ list_add_tail(&bat->bat_link, &console_session.ses_bat_list);
+
+ return rc;
+}
+
+int
+lstcon_batch_list(int index, int len, char *name_up)
+{
+ lstcon_batch_t *bat;
+
+ LASSERT(name_up != NULL);
+ LASSERT(index >= 0);
+
+ list_for_each_entry(bat, &console_session.ses_bat_list, bat_link) {
+ if (index-- == 0) {
+ return copy_to_user(name_up, bat->bat_name, len) ?
+ -EFAULT: 0;
+ }
+ }
+
+ return -ENOENT;
+}
+
+int
+lstcon_batch_info(char *name, lstcon_test_batch_ent_t *ent_up, int server,
+ int testidx, int *index_p, int *ndent_p,
+ lstcon_node_ent_t *dents_up)
+{
+ lstcon_test_batch_ent_t *entp;
+ struct list_head *clilst;
+ struct list_head *srvlst;
+ lstcon_test_t *test = NULL;
+ lstcon_batch_t *bat;
+ lstcon_ndlink_t *ndl;
+ int rc;
+
+ rc = lstcon_batch_find(name, &bat);
+ if (rc != 0) {
+ CDEBUG(D_NET, "Can't find batch %s\n", name);
+ return -ENOENT;
+ }
+
+ if (testidx > 0) {
+ /* query test, test index start from 1 */
+ list_for_each_entry(test, &bat->bat_test_list, tes_link) {
+ if (testidx-- == 1)
+ break;
+ }
+
+ if (testidx > 0) {
+ CDEBUG(D_NET, "Can't find specified test in batch\n");
+ return -ENOENT;
+ }
+ }
+
+ clilst = (test == NULL) ? &bat->bat_cli_list :
+ &test->tes_src_grp->grp_ndl_list;
+ srvlst = (test == NULL) ? &bat->bat_srv_list :
+ &test->tes_dst_grp->grp_ndl_list;
+
+ if (dents_up != NULL) {
+ rc = lstcon_nodes_getent((server ? srvlst: clilst),
+ index_p, ndent_p, dents_up);
+ return rc;
+ }
+
+ /* non-verbose query */
+ LIBCFS_ALLOC(entp, sizeof(lstcon_test_batch_ent_t));
+ if (entp == NULL)
+ return -ENOMEM;
+
+ if (test == NULL) {
+ entp->u.tbe_batch.bae_ntest = bat->bat_ntest;
+ entp->u.tbe_batch.bae_state = bat->bat_state;
+
+ } else {
+
+ entp->u.tbe_test.tse_type = test->tes_type;
+ entp->u.tbe_test.tse_loop = test->tes_loop;
+ entp->u.tbe_test.tse_concur = test->tes_concur;
+ }
+
+ list_for_each_entry(ndl, clilst, ndl_link)
+ LST_NODE_STATE_COUNTER(ndl->ndl_node, &entp->tbe_cli_nle);
+
+ list_for_each_entry(ndl, srvlst, ndl_link)
+ LST_NODE_STATE_COUNTER(ndl->ndl_node, &entp->tbe_srv_nle);
+
+ rc = copy_to_user(ent_up, entp,
+ sizeof(lstcon_test_batch_ent_t)) ? -EFAULT : 0;
+
+ LIBCFS_FREE(entp, sizeof(lstcon_test_batch_ent_t));
+
+ return rc;
+}
+
+static int
+lstcon_batrpc_condition(int transop, lstcon_node_t *nd, void *arg)
+{
+ switch (transop) {
+ case LST_TRANS_TSBRUN:
+ if (nd->nd_state != LST_NODE_ACTIVE)
+ return -ENETDOWN;
+ break;
+
+ case LST_TRANS_TSBSTOP:
+ if (nd->nd_state != LST_NODE_ACTIVE)
+ return 0;
+ break;
+
+ case LST_TRANS_TSBCLIQRY:
+ case LST_TRANS_TSBSRVQRY:
+ break;
+ }
+
+ return 1;
+}
+
+static int
+lstcon_batch_op(lstcon_batch_t *bat, int transop,
+ struct list_head *result_up)
+{
+ lstcon_rpc_trans_t *trans;
+ int rc;
+
+ rc = lstcon_rpc_trans_ndlist(&bat->bat_cli_list,
+ &bat->bat_trans_list, transop,
+ bat, lstcon_batrpc_condition, &trans);
+ if (rc != 0) {
+ CERROR("Can't create transaction: %d\n", rc);
+ return rc;
+ }
+
+ lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+
+ rc = lstcon_rpc_trans_interpreter(trans, result_up, NULL);
+
+ lstcon_rpc_trans_destroy(trans);
+
+ return rc;
+}
+
+int
+lstcon_batch_run(char *name, int timeout, struct list_head *result_up)
+{
+ lstcon_batch_t *bat;
+ int rc;
+
+ if (lstcon_batch_find(name, &bat) != 0) {
+ CDEBUG(D_NET, "Can't find batch %s\n", name);
+ return -ENOENT;
+ }
+
+ bat->bat_arg = timeout;
+
+ rc = lstcon_batch_op(bat, LST_TRANS_TSBRUN, result_up);
+
+ /* mark batch as running if it's started in any node */
+ if (lstcon_tsbop_stat_success(lstcon_trans_stat(), 0) != 0)
+ bat->bat_state = LST_BATCH_RUNNING;
+
+ return rc;
+}
+
+int
+lstcon_batch_stop(char *name, int force, struct list_head *result_up)
+{
+ lstcon_batch_t *bat;
+ int rc;
+
+ if (lstcon_batch_find(name, &bat) != 0) {
+ CDEBUG(D_NET, "Can't find batch %s\n", name);
+ return -ENOENT;
+ }
+
+ bat->bat_arg = force;
+
+ rc = lstcon_batch_op(bat, LST_TRANS_TSBSTOP, result_up);
+
+ /* mark batch as stopped if all RPCs finished */
+ if (lstcon_tsbop_stat_failure(lstcon_trans_stat(), 0) == 0)
+ bat->bat_state = LST_BATCH_IDLE;
+
+ return rc;
+}
+
+static void
+lstcon_batch_destroy(lstcon_batch_t *bat)
+{
+ lstcon_ndlink_t *ndl;
+ lstcon_test_t *test;
+ int i;
+
+ list_del(&bat->bat_link);
+
+ while (!list_empty(&bat->bat_test_list)) {
+ test = list_entry(bat->bat_test_list.next,
+ lstcon_test_t, tes_link);
+ LASSERT(list_empty(&test->tes_trans_list));
+
+ list_del(&test->tes_link);
+
+ lstcon_group_put(test->tes_src_grp);
+ lstcon_group_put(test->tes_dst_grp);
+
+ LIBCFS_FREE(test, offsetof(lstcon_test_t,
+ tes_param[test->tes_paramlen]));
+ }
+
+ LASSERT(list_empty(&bat->bat_trans_list));
+
+ while (!list_empty(&bat->bat_cli_list)) {
+ ndl = list_entry(bat->bat_cli_list.next,
+ lstcon_ndlink_t, ndl_link);
+ list_del_init(&ndl->ndl_link);
+
+ lstcon_ndlink_release(ndl);
+ }
+
+ while (!list_empty(&bat->bat_srv_list)) {
+ ndl = list_entry(bat->bat_srv_list.next,
+ lstcon_ndlink_t, ndl_link);
+ list_del_init(&ndl->ndl_link);
+
+ lstcon_ndlink_release(ndl);
+ }
+
+ for (i = 0; i < LST_NODE_HASHSIZE; i++) {
+ LASSERT(list_empty(&bat->bat_cli_hash[i]));
+ LASSERT(list_empty(&bat->bat_srv_hash[i]));
+ }
+
+ LIBCFS_FREE(bat->bat_cli_hash,
+ sizeof(struct list_head) * LST_NODE_HASHSIZE);
+ LIBCFS_FREE(bat->bat_srv_hash,
+ sizeof(struct list_head) * LST_NODE_HASHSIZE);
+ LIBCFS_FREE(bat, sizeof(lstcon_batch_t));
+}
+
+static int
+lstcon_testrpc_condition(int transop, lstcon_node_t *nd, void *arg)
+{
+ lstcon_test_t *test;
+ lstcon_batch_t *batch;
+ lstcon_ndlink_t *ndl;
+ struct list_head *hash;
+ struct list_head *head;
+
+ test = (lstcon_test_t *)arg;
+ LASSERT(test != NULL);
+
+ batch = test->tes_batch;
+ LASSERT(batch != NULL);
+
+ if (test->tes_oneside &&
+ transop == LST_TRANS_TSBSRVADD)
+ return 0;
+
+ if (nd->nd_state != LST_NODE_ACTIVE)
+ return -ENETDOWN;
+
+ if (transop == LST_TRANS_TSBCLIADD) {
+ hash = batch->bat_cli_hash;
+ head = &batch->bat_cli_list;
+
+ } else {
+ LASSERT(transop == LST_TRANS_TSBSRVADD);
+
+ hash = batch->bat_srv_hash;
+ head = &batch->bat_srv_list;
+ }
+
+ LASSERT(nd->nd_id.nid != LNET_NID_ANY);
+
+ if (lstcon_ndlink_find(hash, nd->nd_id, &ndl, 1) != 0)
+ return -ENOMEM;
+
+ if (list_empty(&ndl->ndl_link))
+ list_add_tail(&ndl->ndl_link, head);
+
+ return 1;
+}
+
+static int
+lstcon_test_nodes_add(lstcon_test_t *test, struct list_head *result_up)
+{
+ lstcon_rpc_trans_t *trans;
+ lstcon_group_t *grp;
+ int transop;
+ int rc;
+
+ LASSERT(test->tes_src_grp != NULL);
+ LASSERT(test->tes_dst_grp != NULL);
+
+ transop = LST_TRANS_TSBSRVADD;
+ grp = test->tes_dst_grp;
+again:
+ rc = lstcon_rpc_trans_ndlist(&grp->grp_ndl_list,
+ &test->tes_trans_list, transop,
+ test, lstcon_testrpc_condition, &trans);
+ if (rc != 0) {
+ CERROR("Can't create transaction: %d\n", rc);
+ return rc;
+ }
+
+ lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+
+ if (lstcon_trans_stat()->trs_rpc_errno != 0 ||
+ lstcon_trans_stat()->trs_fwk_errno != 0) {
+ lstcon_rpc_trans_interpreter(trans, result_up, NULL);
+
+ lstcon_rpc_trans_destroy(trans);
+ /* return if any error */
+ CDEBUG(D_NET, "Failed to add test %s, RPC error %d, framework error %d\n",
+ transop == LST_TRANS_TSBCLIADD ? "client" : "server",
+ lstcon_trans_stat()->trs_rpc_errno,
+ lstcon_trans_stat()->trs_fwk_errno);
+
+ return rc;
+ }
+
+ lstcon_rpc_trans_destroy(trans);
+
+ if (transop == LST_TRANS_TSBCLIADD)
+ return rc;
+
+ transop = LST_TRANS_TSBCLIADD;
+ grp = test->tes_src_grp;
+ test->tes_cliidx = 0;
+
+ /* requests to test clients */
+ goto again;
+}
+
+static int
+lstcon_verify_batch(const char *name, lstcon_batch_t **batch)
+{
+ int rc;
+
+ rc = lstcon_batch_find(name, batch);
+ if (rc != 0) {
+ CDEBUG(D_NET, "Can't find batch %s\n", name);
+ return rc;
+ }
+
+ if ((*batch)->bat_state != LST_BATCH_IDLE) {
+ CDEBUG(D_NET, "Can't change running batch %s\n", name);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int
+lstcon_verify_group(const char *name, lstcon_group_t **grp)
+{
+ int rc;
+ lstcon_ndlink_t *ndl;
+
+ rc = lstcon_group_find(name, grp);
+ if (rc != 0) {
+ CDEBUG(D_NET, "can't find group %s\n", name);
+ return rc;
+ }
+
+ list_for_each_entry(ndl, &(*grp)->grp_ndl_list, ndl_link) {
+ if (ndl->ndl_node->nd_state == LST_NODE_ACTIVE)
+ return 0;
+ }
+
+ CDEBUG(D_NET, "Group %s has no ACTIVE nodes\n", name);
+
+ return -EINVAL;
+}
+
+int
+lstcon_test_add(char *batch_name, int type, int loop,
+ int concur, int dist, int span,
+ char *src_name, char *dst_name,
+ void *param, int paramlen, int *retp,
+ struct list_head *result_up)
+{
+ lstcon_test_t *test = NULL;
+ int rc;
+ lstcon_group_t *src_grp = NULL;
+ lstcon_group_t *dst_grp = NULL;
+ lstcon_batch_t *batch = NULL;
+
+ /*
+ * verify that a batch of the given name exists, and the groups
+ * that will be part of the batch exist and have at least one
+ * active node
+ */
+ rc = lstcon_verify_batch(batch_name, &batch);
+ if (rc != 0)
+ goto out;
+
+ rc = lstcon_verify_group(src_name, &src_grp);
+ if (rc != 0)
+ goto out;
+
+ rc = lstcon_verify_group(dst_name, &dst_grp);
+ if (rc != 0)
+ goto out;
+
+ if (dst_grp->grp_userland)
+ *retp = 1;
+
+ LIBCFS_ALLOC(test, offsetof(lstcon_test_t, tes_param[paramlen]));
+ if (!test) {
+ CERROR("Can't allocate test descriptor\n");
+ rc = -ENOMEM;
+
+ goto out;
+ }
+
+ test->tes_hdr.tsb_id = batch->bat_hdr.tsb_id;
+ test->tes_batch = batch;
+ test->tes_type = type;
+ test->tes_oneside = 0; /* TODO */
+ test->tes_loop = loop;
+ test->tes_concur = concur;
+ test->tes_stop_onerr = 1; /* TODO */
+ test->tes_span = span;
+ test->tes_dist = dist;
+ test->tes_cliidx = 0; /* just used for creating RPC */
+ test->tes_src_grp = src_grp;
+ test->tes_dst_grp = dst_grp;
+ INIT_LIST_HEAD(&test->tes_trans_list);
+
+ if (param != NULL) {
+ test->tes_paramlen = paramlen;
+ memcpy(&test->tes_param[0], param, paramlen);
+ }
+
+ rc = lstcon_test_nodes_add(test, result_up);
+
+ if (rc != 0)
+ goto out;
+
+ if (lstcon_trans_stat()->trs_rpc_errno != 0 ||
+ lstcon_trans_stat()->trs_fwk_errno != 0)
+ CDEBUG(D_NET, "Failed to add test %d to batch %s\n", type,
+ batch_name);
+
+ /* add to test list anyway, so user can check what's going on */
+ list_add_tail(&test->tes_link, &batch->bat_test_list);
+
+ batch->bat_ntest++;
+ test->tes_hdr.tsb_index = batch->bat_ntest;
+
+ /* hold groups so nobody can change them */
+ return rc;
+out:
+ if (test != NULL)
+ LIBCFS_FREE(test, offsetof(lstcon_test_t, tes_param[paramlen]));
+
+ if (dst_grp != NULL)
+ lstcon_group_put(dst_grp);
+
+ if (src_grp != NULL)
+ lstcon_group_put(src_grp);
+
+ return rc;
+}
+
+static int
+lstcon_test_find(lstcon_batch_t *batch, int idx, lstcon_test_t **testpp)
+{
+ lstcon_test_t *test;
+
+ list_for_each_entry(test, &batch->bat_test_list, tes_link) {
+ if (idx == test->tes_hdr.tsb_index) {
+ *testpp = test;
+ return 0;
+ }
+ }
+
+ return -ENOENT;
+}
+
+static int
+lstcon_tsbrpc_readent(int transop, srpc_msg_t *msg,
+ lstcon_rpc_ent_t *ent_up)
+{
+ srpc_batch_reply_t *rep = &msg->msg_body.bat_reply;
+
+ LASSERT(transop == LST_TRANS_TSBCLIQRY ||
+ transop == LST_TRANS_TSBSRVQRY);
+
+ /* positive errno, framework error code */
+ if (copy_to_user(&ent_up->rpe_priv[0],
+ &rep->bar_active, sizeof(rep->bar_active)))
+ return -EFAULT;
+
+ return 0;
+}
+
+int
+lstcon_test_batch_query(char *name, int testidx, int client,
+ int timeout, struct list_head *result_up)
+{
+ lstcon_rpc_trans_t *trans;
+ struct list_head *translist;
+ struct list_head *ndlist;
+ lstcon_tsb_hdr_t *hdr;
+ lstcon_batch_t *batch;
+ lstcon_test_t *test = NULL;
+ int transop;
+ int rc;
+
+ rc = lstcon_batch_find(name, &batch);
+ if (rc != 0) {
+ CDEBUG(D_NET, "Can't find batch: %s\n", name);
+ return rc;
+ }
+
+ if (testidx == 0) {
+ translist = &batch->bat_trans_list;
+ ndlist = &batch->bat_cli_list;
+ hdr = &batch->bat_hdr;
+
+ } else {
+ /* query specified test only */
+ rc = lstcon_test_find(batch, testidx, &test);
+ if (rc != 0) {
+ CDEBUG(D_NET, "Can't find test: %d\n", testidx);
+ return rc;
+ }
+
+ translist = &test->tes_trans_list;
+ ndlist = &test->tes_src_grp->grp_ndl_list;
+ hdr = &test->tes_hdr;
+ }
+
+ transop = client ? LST_TRANS_TSBCLIQRY : LST_TRANS_TSBSRVQRY;
+
+ rc = lstcon_rpc_trans_ndlist(ndlist, translist, transop, hdr,
+ lstcon_batrpc_condition, &trans);
+ if (rc != 0) {
+ CERROR("Can't create transaction: %d\n", rc);
+ return rc;
+ }
+
+ lstcon_rpc_trans_postwait(trans, timeout);
+
+ if (testidx == 0 && /* query a batch, not a test */
+ lstcon_rpc_stat_failure(lstcon_trans_stat(), 0) == 0 &&
+ lstcon_tsbqry_stat_run(lstcon_trans_stat(), 0) == 0) {
+ /* all RPCs finished, and no active test */
+ batch->bat_state = LST_BATCH_IDLE;
+ }
+
+ rc = lstcon_rpc_trans_interpreter(trans, result_up,
+ lstcon_tsbrpc_readent);
+ lstcon_rpc_trans_destroy(trans);
+
+ return rc;
+}
+
+static int
+lstcon_statrpc_readent(int transop, srpc_msg_t *msg,
+ lstcon_rpc_ent_t *ent_up)
+{
+ srpc_stat_reply_t *rep = &msg->msg_body.stat_reply;
+ sfw_counters_t *sfwk_stat;
+ srpc_counters_t *srpc_stat;
+ lnet_counters_t *lnet_stat;
+
+ if (rep->str_status != 0)
+ return 0;
+
+ sfwk_stat = (sfw_counters_t *)&ent_up->rpe_payload[0];
+ srpc_stat = (srpc_counters_t *)((char *)sfwk_stat + sizeof(*sfwk_stat));
+ lnet_stat = (lnet_counters_t *)((char *)srpc_stat + sizeof(*srpc_stat));
+
+ if (copy_to_user(sfwk_stat, &rep->str_fw, sizeof(*sfwk_stat)) ||
+ copy_to_user(srpc_stat, &rep->str_rpc, sizeof(*srpc_stat)) ||
+ copy_to_user(lnet_stat, &rep->str_lnet, sizeof(*lnet_stat)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int
+lstcon_ndlist_stat(struct list_head *ndlist,
+ int timeout, struct list_head *result_up)
+{
+ struct list_head head;
+ lstcon_rpc_trans_t *trans;
+ int rc;
+
+ INIT_LIST_HEAD(&head);
+
+ rc = lstcon_rpc_trans_ndlist(ndlist, &head,
+ LST_TRANS_STATQRY, NULL, NULL, &trans);
+ if (rc != 0) {
+ CERROR("Can't create transaction: %d\n", rc);
+ return rc;
+ }
+
+ lstcon_rpc_trans_postwait(trans, LST_VALIDATE_TIMEOUT(timeout));
+
+ rc = lstcon_rpc_trans_interpreter(trans, result_up,
+ lstcon_statrpc_readent);
+ lstcon_rpc_trans_destroy(trans);
+
+ return rc;
+}
+
+int
+lstcon_group_stat(char *grp_name, int timeout, struct list_head *result_up)
+{
+ lstcon_group_t *grp;
+ int rc;
+
+ rc = lstcon_group_find(grp_name, &grp);
+ if (rc != 0) {
+ CDEBUG(D_NET, "Can't find group %s\n", grp_name);
+ return rc;
+ }
+
+ rc = lstcon_ndlist_stat(&grp->grp_ndl_list, timeout, result_up);
+
+ lstcon_group_put(grp);
+
+ return rc;
+}
+
+int
+lstcon_nodes_stat(int count, lnet_process_id_t *ids_up,
+ int timeout, struct list_head *result_up)
+{
+ lstcon_ndlink_t *ndl;
+ lstcon_group_t *tmp;
+ lnet_process_id_t id;
+ int i;
+ int rc;
+
+ rc = lstcon_group_alloc(NULL, &tmp);
+ if (rc != 0) {
+ CERROR("Out of memory\n");
+ return -ENOMEM;
+ }
+
+ for (i = 0 ; i < count; i++) {
+ if (copy_from_user(&id, &ids_up[i], sizeof(id))) {
+ rc = -EFAULT;
+ break;
+ }
+
+ /* add to tmp group */
+ rc = lstcon_group_ndlink_find(tmp, id, &ndl, 2);
+ if (rc != 0) {
+ CDEBUG((rc == -ENOMEM) ? D_ERROR : D_NET,
+ "Failed to find or create %s: %d\n",
+ libcfs_id2str(id), rc);
+ break;
+ }
+ }
+
+ if (rc != 0) {
+ lstcon_group_put(tmp);
+ return rc;
+ }
+
+ rc = lstcon_ndlist_stat(&tmp->grp_ndl_list, timeout, result_up);
+
+ lstcon_group_put(tmp);
+
+ return rc;
+}
+
+static int
+lstcon_debug_ndlist(struct list_head *ndlist,
+ struct list_head *translist,
+ int timeout, struct list_head *result_up)
+{
+ lstcon_rpc_trans_t *trans;
+ int rc;
+
+ rc = lstcon_rpc_trans_ndlist(ndlist, translist, LST_TRANS_SESQRY,
+ NULL, lstcon_sesrpc_condition, &trans);
+ if (rc != 0) {
+ CERROR("Can't create transaction: %d\n", rc);
+ return rc;
+ }
+
+ lstcon_rpc_trans_postwait(trans, LST_VALIDATE_TIMEOUT(timeout));
+
+ rc = lstcon_rpc_trans_interpreter(trans, result_up,
+ lstcon_sesrpc_readent);
+ lstcon_rpc_trans_destroy(trans);
+
+ return rc;
+}
+
+int
+lstcon_session_debug(int timeout, struct list_head *result_up)
+{
+ return lstcon_debug_ndlist(&console_session.ses_ndl_list,
+ NULL, timeout, result_up);
+}
+
+int
+lstcon_batch_debug(int timeout, char *name,
+ int client, struct list_head *result_up)
+{
+ lstcon_batch_t *bat;
+ int rc;
+
+ rc = lstcon_batch_find(name, &bat);
+ if (rc != 0)
+ return -ENOENT;
+
+ rc = lstcon_debug_ndlist(client ? &bat->bat_cli_list :
+ &bat->bat_srv_list,
+ NULL, timeout, result_up);
+
+ return rc;
+}
+
+int
+lstcon_group_debug(int timeout, char *name,
+ struct list_head *result_up)
+{
+ lstcon_group_t *grp;
+ int rc;
+
+ rc = lstcon_group_find(name, &grp);
+ if (rc != 0)
+ return -ENOENT;
+
+ rc = lstcon_debug_ndlist(&grp->grp_ndl_list, NULL,
+ timeout, result_up);
+ lstcon_group_put(grp);
+
+ return rc;
+}
+
+int
+lstcon_nodes_debug(int timeout,
+ int count, lnet_process_id_t *ids_up,
+ struct list_head *result_up)
+{
+ lnet_process_id_t id;
+ lstcon_ndlink_t *ndl;
+ lstcon_group_t *grp;
+ int i;
+ int rc;
+
+ rc = lstcon_group_alloc(NULL, &grp);
+ if (rc != 0) {
+ CDEBUG(D_NET, "Out of memory\n");
+ return rc;
+ }
+
+ for (i = 0; i < count; i++) {
+ if (copy_from_user(&id, &ids_up[i], sizeof(id))) {
+ rc = -EFAULT;
+ break;
+ }
+
+ /* node is added to tmp group */
+ rc = lstcon_group_ndlink_find(grp, id, &ndl, 1);
+ if (rc != 0) {
+ CERROR("Can't create node link\n");
+ break;
+ }
+ }
+
+ if (rc != 0) {
+ lstcon_group_put(grp);
+ return rc;
+ }
+
+ rc = lstcon_debug_ndlist(&grp->grp_ndl_list, NULL,
+ timeout, result_up);
+
+ lstcon_group_put(grp);
+
+ return rc;
+}
+
+int
+lstcon_session_match(lst_sid_t sid)
+{
+ return (console_session.ses_id.ses_nid == sid.ses_nid &&
+ console_session.ses_id.ses_stamp == sid.ses_stamp) ? 1: 0;
+}
+
+static void
+lstcon_new_session_id(lst_sid_t *sid)
+{
+ lnet_process_id_t id;
+
+ LASSERT(console_session.ses_state == LST_SESSION_NONE);
+
+ LNetGetId(1, &id);
+ sid->ses_nid = id.nid;
+ sid->ses_stamp = cfs_time_current();
+}
+
+extern srpc_service_t lstcon_acceptor_service;
+
+int
+lstcon_session_new(char *name, int key, unsigned feats,
+ int timeout, int force, lst_sid_t *sid_up)
+{
+ int rc = 0;
+ int i;
+
+ if (console_session.ses_state != LST_SESSION_NONE) {
+ /* session exists */
+ if (!force) {
+ CNETERR("Session %s already exists\n",
+ console_session.ses_name);
+ return -EEXIST;
+ }
+
+ rc = lstcon_session_end();
+
+ /* lstcon_session_end() only return local error */
+ if (rc != 0)
+ return rc;
+ }
+
+ if ((feats & ~LST_FEATS_MASK) != 0) {
+ CNETERR("Unknown session features %x\n",
+ (feats & ~LST_FEATS_MASK));
+ return -EINVAL;
+ }
+
+ for (i = 0; i < LST_GLOBAL_HASHSIZE; i++)
+ LASSERT(list_empty(&console_session.ses_ndl_hash[i]));
+
+ lstcon_new_session_id(&console_session.ses_id);
+
+ console_session.ses_key = key;
+ console_session.ses_state = LST_SESSION_ACTIVE;
+ console_session.ses_force = !!force;
+ console_session.ses_features = feats;
+ console_session.ses_feats_updated = 0;
+ console_session.ses_timeout = (timeout <= 0) ?
+ LST_CONSOLE_TIMEOUT : timeout;
+ strcpy(console_session.ses_name, name);
+
+ rc = lstcon_batch_add(LST_DEFAULT_BATCH);
+ if (rc != 0)
+ return rc;
+
+ rc = lstcon_rpc_pinger_start();
+ if (rc != 0) {
+ lstcon_batch_t *bat = NULL;
+
+ lstcon_batch_find(LST_DEFAULT_BATCH, &bat);
+ lstcon_batch_destroy(bat);
+
+ return rc;
+ }
+
+ if (copy_to_user(sid_up, &console_session.ses_id,
+ sizeof(lst_sid_t)) == 0)
+ return rc;
+
+ lstcon_session_end();
+
+ return -EFAULT;
+}
+
+int
+lstcon_session_info(lst_sid_t *sid_up, int *key_up, unsigned *featp,
+ lstcon_ndlist_ent_t *ndinfo_up, char *name_up, int len)
+{
+ lstcon_ndlist_ent_t *entp;
+ lstcon_ndlink_t *ndl;
+ int rc = 0;
+
+ if (console_session.ses_state != LST_SESSION_ACTIVE)
+ return -ESRCH;
+
+ LIBCFS_ALLOC(entp, sizeof(*entp));
+ if (entp == NULL)
+ return -ENOMEM;
+
+ list_for_each_entry(ndl, &console_session.ses_ndl_list, ndl_link)
+ LST_NODE_STATE_COUNTER(ndl->ndl_node, entp);
+
+ if (copy_to_user(sid_up, &console_session.ses_id,
+ sizeof(lst_sid_t)) ||
+ copy_to_user(key_up, &console_session.ses_key,
+ sizeof(*key_up)) ||
+ copy_to_user(featp, &console_session.ses_features,
+ sizeof(*featp)) ||
+ copy_to_user(ndinfo_up, entp, sizeof(*entp)) ||
+ copy_to_user(name_up, console_session.ses_name, len))
+ rc = -EFAULT;
+
+ LIBCFS_FREE(entp, sizeof(*entp));
+
+ return rc;
+}
+
+int
+lstcon_session_end(void)
+{
+ lstcon_rpc_trans_t *trans;
+ lstcon_group_t *grp;
+ lstcon_batch_t *bat;
+ int rc = 0;
+
+ LASSERT(console_session.ses_state == LST_SESSION_ACTIVE);
+
+ rc = lstcon_rpc_trans_ndlist(&console_session.ses_ndl_list,
+ NULL, LST_TRANS_SESEND, NULL,
+ lstcon_sesrpc_condition, &trans);
+ if (rc != 0) {
+ CERROR("Can't create transaction: %d\n", rc);
+ return rc;
+ }
+
+ console_session.ses_shutdown = 1;
+
+ lstcon_rpc_pinger_stop();
+
+ lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+
+ lstcon_rpc_trans_destroy(trans);
+ /* User can do nothing even rpc failed, so go on */
+
+ /* waiting for orphan rpcs to die */
+ lstcon_rpc_cleanup_wait();
+
+ console_session.ses_id = LST_INVALID_SID;
+ console_session.ses_state = LST_SESSION_NONE;
+ console_session.ses_key = 0;
+ console_session.ses_force = 0;
+ console_session.ses_feats_updated = 0;
+
+ /* destroy all batches */
+ while (!list_empty(&console_session.ses_bat_list)) {
+ bat = list_entry(console_session.ses_bat_list.next,
+ lstcon_batch_t, bat_link);
+
+ lstcon_batch_destroy(bat);
+ }
+
+ /* destroy all groups */
+ while (!list_empty(&console_session.ses_grp_list)) {
+ grp = list_entry(console_session.ses_grp_list.next,
+ lstcon_group_t, grp_link);
+ LASSERT(grp->grp_ref == 1);
+
+ lstcon_group_put(grp);
+ }
+
+ /* all nodes should be released */
+ LASSERT(list_empty(&console_session.ses_ndl_list));
+
+ console_session.ses_shutdown = 0;
+ console_session.ses_expired = 0;
+
+ return rc;
+}
+
+int
+lstcon_session_feats_check(unsigned feats)
+{
+ int rc = 0;
+
+ if ((feats & ~LST_FEATS_MASK) != 0) {
+ CERROR("Can't support these features: %x\n",
+ (feats & ~LST_FEATS_MASK));
+ return -EPROTO;
+ }
+
+ spin_lock(&console_session.ses_rpc_lock);
+
+ if (!console_session.ses_feats_updated) {
+ console_session.ses_feats_updated = 1;
+ console_session.ses_features = feats;
+ }
+
+ if (console_session.ses_features != feats)
+ rc = -EPROTO;
+
+ spin_unlock(&console_session.ses_rpc_lock);
+
+ if (rc != 0) {
+ CERROR("remote features %x do not match with session features %x of console\n",
+ feats, console_session.ses_features);
+ }
+
+ return rc;
+}
+
+static int
+lstcon_acceptor_handle(srpc_server_rpc_t *rpc)
+{
+ srpc_msg_t *rep = &rpc->srpc_replymsg;
+ srpc_msg_t *req = &rpc->srpc_reqstbuf->buf_msg;
+ srpc_join_reqst_t *jreq = &req->msg_body.join_reqst;
+ srpc_join_reply_t *jrep = &rep->msg_body.join_reply;
+ lstcon_group_t *grp = NULL;
+ lstcon_ndlink_t *ndl;
+ int rc = 0;
+
+ sfw_unpack_message(req);
+
+ mutex_lock(&console_session.ses_mutex);
+
+ jrep->join_sid = console_session.ses_id;
+
+ if (console_session.ses_id.ses_nid == LNET_NID_ANY) {
+ jrep->join_status = ESRCH;
+ goto out;
+ }
+
+ if (lstcon_session_feats_check(req->msg_ses_feats) != 0) {
+ jrep->join_status = EPROTO;
+ goto out;
+ }
+
+ if (jreq->join_sid.ses_nid != LNET_NID_ANY &&
+ !lstcon_session_match(jreq->join_sid)) {
+ jrep->join_status = EBUSY;
+ goto out;
+ }
+
+ if (lstcon_group_find(jreq->join_group, &grp) != 0) {
+ rc = lstcon_group_alloc(jreq->join_group, &grp);
+ if (rc != 0) {
+ CERROR("Out of memory\n");
+ goto out;
+ }
+
+ list_add_tail(&grp->grp_link,
+ &console_session.ses_grp_list);
+ lstcon_group_addref(grp);
+ }
+
+ if (grp->grp_ref > 2) {
+ /* Group in using */
+ jrep->join_status = EBUSY;
+ goto out;
+ }
+
+ rc = lstcon_group_ndlink_find(grp, rpc->srpc_peer, &ndl, 0);
+ if (rc == 0) {
+ jrep->join_status = EEXIST;
+ goto out;
+ }
+
+ rc = lstcon_group_ndlink_find(grp, rpc->srpc_peer, &ndl, 1);
+ if (rc != 0) {
+ CERROR("Out of memory\n");
+ goto out;
+ }
+
+ ndl->ndl_node->nd_state = LST_NODE_ACTIVE;
+ ndl->ndl_node->nd_timeout = console_session.ses_timeout;
+
+ if (grp->grp_userland == 0)
+ grp->grp_userland = 1;
+
+ strcpy(jrep->join_session, console_session.ses_name);
+ jrep->join_timeout = console_session.ses_timeout;
+ jrep->join_status = 0;
+
+out:
+ rep->msg_ses_feats = console_session.ses_features;
+ if (grp != NULL)
+ lstcon_group_put(grp);
+
+ mutex_unlock(&console_session.ses_mutex);
+
+ return rc;
+}
+
+srpc_service_t lstcon_acceptor_service;
+static void lstcon_init_acceptor_service(void)
+{
+ /* initialize selftest console acceptor service table */
+ lstcon_acceptor_service.sv_name = "join session";
+ lstcon_acceptor_service.sv_handler = lstcon_acceptor_handle;
+ lstcon_acceptor_service.sv_id = SRPC_SERVICE_JOIN;
+ lstcon_acceptor_service.sv_wi_total = SFW_FRWK_WI_MAX;
+}
+
+extern int lstcon_ioctl_entry(unsigned int cmd, struct libcfs_ioctl_data *data);
+
+static DECLARE_IOCTL_HANDLER(lstcon_ioctl_handler, lstcon_ioctl_entry);
+
+/* initialize console */
+int
+lstcon_console_init(void)
+{
+ int i;
+ int rc;
+
+ memset(&console_session, 0, sizeof(lstcon_session_t));
+
+ console_session.ses_id = LST_INVALID_SID;
+ console_session.ses_state = LST_SESSION_NONE;
+ console_session.ses_timeout = 0;
+ console_session.ses_force = 0;
+ console_session.ses_expired = 0;
+ console_session.ses_feats_updated = 0;
+ console_session.ses_features = LST_FEATS_MASK;
+ console_session.ses_laststamp = get_seconds();
+
+ mutex_init(&console_session.ses_mutex);
+
+ INIT_LIST_HEAD(&console_session.ses_ndl_list);
+ INIT_LIST_HEAD(&console_session.ses_grp_list);
+ INIT_LIST_HEAD(&console_session.ses_bat_list);
+ INIT_LIST_HEAD(&console_session.ses_trans_list);
+
+ LIBCFS_ALLOC(console_session.ses_ndl_hash,
+ sizeof(struct list_head) * LST_GLOBAL_HASHSIZE);
+ if (console_session.ses_ndl_hash == NULL)
+ return -ENOMEM;
+
+ for (i = 0; i < LST_GLOBAL_HASHSIZE; i++)
+ INIT_LIST_HEAD(&console_session.ses_ndl_hash[i]);
+
+
+ /* initialize acceptor service table */
+ lstcon_init_acceptor_service();
+
+ rc = srpc_add_service(&lstcon_acceptor_service);
+ LASSERT(rc != -EBUSY);
+ if (rc != 0) {
+ LIBCFS_FREE(console_session.ses_ndl_hash,
+ sizeof(struct list_head) * LST_GLOBAL_HASHSIZE);
+ return rc;
+ }
+
+ rc = srpc_service_add_buffers(&lstcon_acceptor_service,
+ lstcon_acceptor_service.sv_wi_total);
+ if (rc != 0) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ rc = libcfs_register_ioctl(&lstcon_ioctl_handler);
+
+ if (rc == 0) {
+ lstcon_rpc_module_init();
+ return 0;
+ }
+
+out:
+ srpc_shutdown_service(&lstcon_acceptor_service);
+ srpc_remove_service(&lstcon_acceptor_service);
+
+ LIBCFS_FREE(console_session.ses_ndl_hash,
+ sizeof(struct list_head) * LST_GLOBAL_HASHSIZE);
+
+ srpc_wait_service_shutdown(&lstcon_acceptor_service);
+
+ return rc;
+}
+
+int
+lstcon_console_fini(void)
+{
+ int i;
+
+ libcfs_deregister_ioctl(&lstcon_ioctl_handler);
+
+ mutex_lock(&console_session.ses_mutex);
+
+ srpc_shutdown_service(&lstcon_acceptor_service);
+ srpc_remove_service(&lstcon_acceptor_service);
+
+ if (console_session.ses_state != LST_SESSION_NONE)
+ lstcon_session_end();
+
+ lstcon_rpc_module_fini();
+
+ mutex_unlock(&console_session.ses_mutex);
+
+ LASSERT(list_empty(&console_session.ses_ndl_list));
+ LASSERT(list_empty(&console_session.ses_grp_list));
+ LASSERT(list_empty(&console_session.ses_bat_list));
+ LASSERT(list_empty(&console_session.ses_trans_list));
+
+ for (i = 0; i < LST_NODE_HASHSIZE; i++) {
+ LASSERT(list_empty(&console_session.ses_ndl_hash[i]));
+ }
+
+ LIBCFS_FREE(console_session.ses_ndl_hash,
+ sizeof(struct list_head) * LST_GLOBAL_HASHSIZE);
+
+ srpc_wait_service_shutdown(&lstcon_acceptor_service);
+
+ return 0;
+}
diff --git a/kernel/drivers/staging/lustre/lnet/selftest/console.h b/kernel/drivers/staging/lustre/lnet/selftest/console.h
new file mode 100644
index 000000000..e41ca89f1
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lnet/selftest/console.h
@@ -0,0 +1,235 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/console.h
+ *
+ * kernel structure for LST console
+ *
+ * Author: Liang Zhen <liangzhen@clusterfs.com>
+ */
+
+#ifndef __LST_CONSOLE_H__
+#define __LST_CONSOLE_H__
+
+
+#include "../../include/linux/libcfs/libcfs.h"
+#include "../../include/linux/lnet/lnet.h"
+#include "../../include/linux/lnet/lib-types.h"
+#include "../../include/linux/lnet/lnetst.h"
+#include "selftest.h"
+#include "conrpc.h"
+
+typedef struct lstcon_node {
+ lnet_process_id_t nd_id; /* id of the node */
+ int nd_ref; /* reference count */
+ int nd_state; /* state of the node */
+ int nd_timeout; /* session timeout */
+ unsigned long nd_stamp; /* timestamp of last replied RPC */
+ struct lstcon_rpc nd_ping; /* ping rpc */
+} lstcon_node_t; /*** node descriptor */
+
+typedef struct {
+ struct list_head ndl_link; /* chain on list */
+ struct list_head ndl_hlink; /* chain on hash */
+ lstcon_node_t *ndl_node; /* pointer to node */
+} lstcon_ndlink_t; /*** node link descriptor */
+
+typedef struct {
+ struct list_head grp_link; /* chain on global group list */
+ int grp_ref; /* reference count */
+ int grp_userland; /* has userland nodes */
+ int grp_nnode; /* # of nodes */
+ char grp_name[LST_NAME_SIZE]; /* group name */
+
+ struct list_head grp_trans_list; /* transaction list */
+ struct list_head grp_ndl_list; /* nodes list */
+ struct list_head grp_ndl_hash[0];/* hash table for nodes */
+} lstcon_group_t; /*** (alias of nodes) group descriptor */
+
+#define LST_BATCH_IDLE 0xB0 /* idle batch */
+#define LST_BATCH_RUNNING 0xB1 /* running batch */
+
+typedef struct lstcon_tsb_hdr {
+ lst_bid_t tsb_id; /* batch ID */
+ int tsb_index; /* test index */
+} lstcon_tsb_hdr_t;
+
+typedef struct {
+ lstcon_tsb_hdr_t bat_hdr; /* test_batch header */
+ struct list_head bat_link; /* chain on session's batches list */
+ int bat_ntest; /* # of test */
+ int bat_state; /* state of the batch */
+ int bat_arg; /* parameter for run|stop, timeout for run, force for stop */
+ char bat_name[LST_NAME_SIZE]; /* name of batch */
+
+ struct list_head bat_test_list; /* list head of tests (lstcon_test_t) */
+ struct list_head bat_trans_list; /* list head of transaction */
+ struct list_head bat_cli_list; /* list head of client nodes (lstcon_node_t) */
+ struct list_head *bat_cli_hash; /* hash table of client nodes */
+ struct list_head bat_srv_list; /* list head of server nodes */
+ struct list_head *bat_srv_hash; /* hash table of server nodes */
+} lstcon_batch_t; /*** (tests ) batch descriptor */
+
+typedef struct lstcon_test {
+ lstcon_tsb_hdr_t tes_hdr; /* test batch header */
+ struct list_head tes_link; /* chain on batch's tests list */
+ lstcon_batch_t *tes_batch; /* pointer to batch */
+
+ int tes_type; /* type of the test, i.e: bulk, ping */
+ int tes_stop_onerr; /* stop on error */
+ int tes_oneside; /* one-sided test */
+ int tes_concur; /* concurrency */
+ int tes_loop; /* loop count */
+ int tes_dist; /* nodes distribution of target group */
+ int tes_span; /* nodes span of target group */
+ int tes_cliidx; /* client index, used for RPC creating */
+
+ struct list_head tes_trans_list; /* transaction list */
+ lstcon_group_t *tes_src_grp; /* group run the test */
+ lstcon_group_t *tes_dst_grp; /* target group */
+
+ int tes_paramlen; /* test parameter length */
+ char tes_param[0]; /* test parameter */
+} lstcon_test_t; /*** a single test descriptor */
+
+#define LST_GLOBAL_HASHSIZE 503 /* global nodes hash table size */
+#define LST_NODE_HASHSIZE 239 /* node hash table (for batch or group) */
+
+#define LST_SESSION_NONE 0x0 /* no session */
+#define LST_SESSION_ACTIVE 0x1 /* working session */
+
+#define LST_CONSOLE_TIMEOUT 300 /* default console timeout */
+
+typedef struct {
+ struct mutex ses_mutex; /* only 1 thread in session */
+ lst_sid_t ses_id; /* global session id */
+ int ses_key; /* local session key */
+ int ses_state; /* state of session */
+ int ses_timeout; /* timeout in seconds */
+ time_t ses_laststamp; /* last operation stamp (seconds) */
+ /** tests features of the session */
+ unsigned ses_features;
+ /** features are synced with remote test nodes */
+ unsigned ses_feats_updated:1;
+ /** force creating */
+ unsigned ses_force:1;
+ /** session is shutting down */
+ unsigned ses_shutdown:1;
+ /** console is timedout */
+ unsigned ses_expired:1;
+ __u64 ses_id_cookie; /* batch id cookie */
+ char ses_name[LST_NAME_SIZE]; /* session name */
+ lstcon_rpc_trans_t *ses_ping; /* session pinger */
+ stt_timer_t ses_ping_timer; /* timer for pinger */
+ lstcon_trans_stat_t ses_trans_stat; /* transaction stats */
+
+ struct list_head ses_trans_list; /* global list of transaction */
+ struct list_head ses_grp_list; /* global list of groups */
+ struct list_head ses_bat_list; /* global list of batches */
+ struct list_head ses_ndl_list; /* global list of nodes */
+ struct list_head *ses_ndl_hash; /* hash table of nodes */
+
+ spinlock_t ses_rpc_lock; /* serialize */
+ atomic_t ses_rpc_counter;/* # of initialized RPCs */
+ struct list_head ses_rpc_freelist; /* idle console rpc */
+} lstcon_session_t; /*** session descriptor */
+
+extern lstcon_session_t console_session;
+
+static inline lstcon_trans_stat_t *
+lstcon_trans_stat(void)
+{
+ return &console_session.ses_trans_stat;
+}
+
+static inline struct list_head *
+lstcon_id2hash (lnet_process_id_t id, struct list_head *hash)
+{
+ unsigned int idx = LNET_NIDADDR(id.nid) % LST_NODE_HASHSIZE;
+
+ return &hash[idx];
+}
+
+int lstcon_console_init(void);
+int lstcon_ioctl_entry(unsigned int cmd, struct libcfs_ioctl_data *data);
+int lstcon_console_fini(void);
+extern int lstcon_session_match(lst_sid_t sid);
+extern int lstcon_session_new(char *name, int key, unsigned version,
+ int timeout, int flags, lst_sid_t *sid_up);
+extern int lstcon_session_info(lst_sid_t *sid_up, int *key, unsigned *verp,
+ lstcon_ndlist_ent_t *entp, char *name_up, int len);
+extern int lstcon_session_end(void);
+extern int lstcon_session_debug(int timeout, struct list_head *result_up);
+extern int lstcon_session_feats_check(unsigned feats);
+extern int lstcon_batch_debug(int timeout, char *name,
+ int client, struct list_head *result_up);
+extern int lstcon_group_debug(int timeout, char *name,
+ struct list_head *result_up);
+extern int lstcon_nodes_debug(int timeout, int nnd, lnet_process_id_t *nds_up,
+ struct list_head *result_up);
+extern int lstcon_group_add(char *name);
+extern int lstcon_group_del(char *name);
+extern int lstcon_group_clean(char *name, int args);
+extern int lstcon_group_refresh(char *name, struct list_head *result_up);
+extern int lstcon_nodes_add(char *name, int nnd, lnet_process_id_t *nds_up,
+ unsigned *featp, struct list_head *result_up);
+extern int lstcon_nodes_remove(char *name, int nnd, lnet_process_id_t *nds_up,
+ struct list_head *result_up);
+extern int lstcon_group_info(char *name, lstcon_ndlist_ent_t *gent_up,
+ int *index_p, int *ndent_p, lstcon_node_ent_t *ndents_up);
+extern int lstcon_group_list(int idx, int len, char *name_up);
+extern int lstcon_batch_add(char *name);
+extern int lstcon_batch_run(char *name, int timeout,
+ struct list_head *result_up);
+extern int lstcon_batch_stop(char *name, int force,
+ struct list_head *result_up);
+extern int lstcon_test_batch_query(char *name, int testidx,
+ int client, int timeout,
+ struct list_head *result_up);
+extern int lstcon_batch_del(char *name);
+extern int lstcon_batch_list(int idx, int namelen, char *name_up);
+extern int lstcon_batch_info(char *name, lstcon_test_batch_ent_t *ent_up,
+ int server, int testidx, int *index_p,
+ int *ndent_p, lstcon_node_ent_t *dents_up);
+extern int lstcon_group_stat(char *grp_name, int timeout,
+ struct list_head *result_up);
+extern int lstcon_nodes_stat(int count, lnet_process_id_t *ids_up,
+ int timeout, struct list_head *result_up);
+extern int lstcon_test_add(char *batch_name, int type, int loop,
+ int concur, int dist, int span,
+ char *src_name, char *dst_name,
+ void *param, int paramlen, int *retp,
+ struct list_head *result_up);
+#endif
diff --git a/kernel/drivers/staging/lustre/lnet/selftest/framework.c b/kernel/drivers/staging/lustre/lnet/selftest/framework.c
new file mode 100644
index 000000000..a93a90de0
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lnet/selftest/framework.c
@@ -0,0 +1,1804 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/framework.c
+ *
+ * Author: Isaac Huang <isaac@clusterfs.com>
+ * Author: Liang Zhen <liangzhen@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include "selftest.h"
+
+lst_sid_t LST_INVALID_SID = {LNET_NID_ANY, -1};
+
+static int session_timeout = 100;
+module_param(session_timeout, int, 0444);
+MODULE_PARM_DESC(session_timeout, "test session timeout in seconds (100 by default, 0 == never)");
+
+static int rpc_timeout = 64;
+module_param(rpc_timeout, int, 0644);
+MODULE_PARM_DESC(rpc_timeout, "rpc timeout in seconds (64 by default, 0 == never)");
+
+#define sfw_unpack_id(id) \
+do { \
+ __swab64s(&(id).nid); \
+ __swab32s(&(id).pid); \
+} while (0)
+
+#define sfw_unpack_sid(sid) \
+do { \
+ __swab64s(&(sid).ses_nid); \
+ __swab64s(&(sid).ses_stamp); \
+} while (0)
+
+#define sfw_unpack_fw_counters(fc) \
+do { \
+ __swab32s(&(fc).running_ms); \
+ __swab32s(&(fc).active_batches); \
+ __swab32s(&(fc).zombie_sessions); \
+ __swab32s(&(fc).brw_errors); \
+ __swab32s(&(fc).ping_errors); \
+} while (0)
+
+#define sfw_unpack_rpc_counters(rc) \
+do { \
+ __swab32s(&(rc).errors); \
+ __swab32s(&(rc).rpcs_sent); \
+ __swab32s(&(rc).rpcs_rcvd); \
+ __swab32s(&(rc).rpcs_dropped); \
+ __swab32s(&(rc).rpcs_expired); \
+ __swab64s(&(rc).bulk_get); \
+ __swab64s(&(rc).bulk_put); \
+} while (0)
+
+#define sfw_unpack_lnet_counters(lc) \
+do { \
+ __swab32s(&(lc).errors); \
+ __swab32s(&(lc).msgs_max); \
+ __swab32s(&(lc).msgs_alloc); \
+ __swab32s(&(lc).send_count); \
+ __swab32s(&(lc).recv_count); \
+ __swab32s(&(lc).drop_count); \
+ __swab32s(&(lc).route_count); \
+ __swab64s(&(lc).send_length); \
+ __swab64s(&(lc).recv_length); \
+ __swab64s(&(lc).drop_length); \
+ __swab64s(&(lc).route_length); \
+} while (0)
+
+#define sfw_test_active(t) (atomic_read(&(t)->tsi_nactive) != 0)
+#define sfw_batch_active(b) (atomic_read(&(b)->bat_nactive) != 0)
+
+static struct smoketest_framework {
+ struct list_head fw_zombie_rpcs; /* RPCs to be recycled */
+ struct list_head fw_zombie_sessions; /* stopping sessions */
+ struct list_head fw_tests; /* registered test cases */
+ atomic_t fw_nzombies; /* # zombie sessions */
+ spinlock_t fw_lock; /* serialise */
+ sfw_session_t *fw_session; /* _the_ session */
+ int fw_shuttingdown; /* shutdown in progress */
+ srpc_server_rpc_t *fw_active_srpc; /* running RPC */
+} sfw_data;
+
+/* forward ref's */
+int sfw_stop_batch(sfw_batch_t *tsb, int force);
+void sfw_destroy_session(sfw_session_t *sn);
+
+static inline sfw_test_case_t *
+sfw_find_test_case(int id)
+{
+ sfw_test_case_t *tsc;
+
+ LASSERT(id <= SRPC_SERVICE_MAX_ID);
+ LASSERT(id > SRPC_FRAMEWORK_SERVICE_MAX_ID);
+
+ list_for_each_entry(tsc, &sfw_data.fw_tests, tsc_list) {
+ if (tsc->tsc_srv_service->sv_id == id)
+ return tsc;
+ }
+
+ return NULL;
+}
+
+static int
+sfw_register_test(srpc_service_t *service, sfw_test_client_ops_t *cliops)
+{
+ sfw_test_case_t *tsc;
+
+ if (sfw_find_test_case(service->sv_id) != NULL) {
+ CERROR("Failed to register test %s (%d)\n",
+ service->sv_name, service->sv_id);
+ return -EEXIST;
+ }
+
+ LIBCFS_ALLOC(tsc, sizeof(sfw_test_case_t));
+ if (tsc == NULL)
+ return -ENOMEM;
+
+ tsc->tsc_cli_ops = cliops;
+ tsc->tsc_srv_service = service;
+
+ list_add_tail(&tsc->tsc_list, &sfw_data.fw_tests);
+ return 0;
+}
+
+static void
+sfw_add_session_timer(void)
+{
+ sfw_session_t *sn = sfw_data.fw_session;
+ stt_timer_t *timer = &sn->sn_timer;
+
+ LASSERT(!sfw_data.fw_shuttingdown);
+
+ if (sn == NULL || sn->sn_timeout == 0)
+ return;
+
+ LASSERT(!sn->sn_timer_active);
+
+ sn->sn_timer_active = 1;
+ timer->stt_expires = cfs_time_add(sn->sn_timeout,
+ get_seconds());
+ stt_add_timer(timer);
+ return;
+}
+
+static int
+sfw_del_session_timer(void)
+{
+ sfw_session_t *sn = sfw_data.fw_session;
+
+ if (sn == NULL || !sn->sn_timer_active)
+ return 0;
+
+ LASSERT(sn->sn_timeout != 0);
+
+ if (stt_del_timer(&sn->sn_timer)) { /* timer defused */
+ sn->sn_timer_active = 0;
+ return 0;
+ }
+
+ return EBUSY; /* racing with sfw_session_expired() */
+}
+
+static void
+sfw_deactivate_session(void)
+ __must_hold(&sfw_data.fw_lock)
+{
+ sfw_session_t *sn = sfw_data.fw_session;
+ int nactive = 0;
+ sfw_batch_t *tsb;
+ sfw_test_case_t *tsc;
+
+ if (sn == NULL) return;
+
+ LASSERT(!sn->sn_timer_active);
+
+ sfw_data.fw_session = NULL;
+ atomic_inc(&sfw_data.fw_nzombies);
+ list_add(&sn->sn_list, &sfw_data.fw_zombie_sessions);
+
+ spin_unlock(&sfw_data.fw_lock);
+
+ list_for_each_entry(tsc, &sfw_data.fw_tests, tsc_list) {
+ srpc_abort_service(tsc->tsc_srv_service);
+ }
+
+ spin_lock(&sfw_data.fw_lock);
+
+ list_for_each_entry(tsb, &sn->sn_batches, bat_list) {
+ if (sfw_batch_active(tsb)) {
+ nactive++;
+ sfw_stop_batch(tsb, 1);
+ }
+ }
+
+ if (nactive != 0)
+ return; /* wait for active batches to stop */
+
+ list_del_init(&sn->sn_list);
+ spin_unlock(&sfw_data.fw_lock);
+
+ sfw_destroy_session(sn);
+
+ spin_lock(&sfw_data.fw_lock);
+}
+
+
+static void
+sfw_session_expired(void *data)
+{
+ sfw_session_t *sn = data;
+
+ spin_lock(&sfw_data.fw_lock);
+
+ LASSERT(sn->sn_timer_active);
+ LASSERT(sn == sfw_data.fw_session);
+
+ CWARN("Session expired! sid: %s-%llu, name: %s\n",
+ libcfs_nid2str(sn->sn_id.ses_nid),
+ sn->sn_id.ses_stamp, &sn->sn_name[0]);
+
+ sn->sn_timer_active = 0;
+ sfw_deactivate_session();
+
+ spin_unlock(&sfw_data.fw_lock);
+}
+
+static inline void
+sfw_init_session(sfw_session_t *sn, lst_sid_t sid,
+ unsigned features, const char *name)
+{
+ stt_timer_t *timer = &sn->sn_timer;
+
+ memset(sn, 0, sizeof(sfw_session_t));
+ INIT_LIST_HEAD(&sn->sn_list);
+ INIT_LIST_HEAD(&sn->sn_batches);
+ atomic_set(&sn->sn_refcount, 1); /* +1 for caller */
+ atomic_set(&sn->sn_brw_errors, 0);
+ atomic_set(&sn->sn_ping_errors, 0);
+ strlcpy(&sn->sn_name[0], name, sizeof(sn->sn_name));
+
+ sn->sn_timer_active = 0;
+ sn->sn_id = sid;
+ sn->sn_features = features;
+ sn->sn_timeout = session_timeout;
+ sn->sn_started = cfs_time_current();
+
+ timer->stt_data = sn;
+ timer->stt_func = sfw_session_expired;
+ INIT_LIST_HEAD(&timer->stt_list);
+}
+
+/* completion handler for incoming framework RPCs */
+static void
+sfw_server_rpc_done(struct srpc_server_rpc *rpc)
+{
+ struct srpc_service *sv = rpc->srpc_scd->scd_svc;
+ int status = rpc->srpc_status;
+
+ CDEBUG(D_NET,
+ "Incoming framework RPC done: service %s, peer %s, status %s:%d\n",
+ sv->sv_name, libcfs_id2str(rpc->srpc_peer),
+ swi_state2str(rpc->srpc_wi.swi_state),
+ status);
+
+ if (rpc->srpc_bulk != NULL)
+ sfw_free_pages(rpc);
+ return;
+}
+
+static void
+sfw_client_rpc_fini(srpc_client_rpc_t *rpc)
+{
+ LASSERT(rpc->crpc_bulk.bk_niov == 0);
+ LASSERT(list_empty(&rpc->crpc_list));
+ LASSERT(atomic_read(&rpc->crpc_refcount) == 0);
+
+ CDEBUG(D_NET,
+ "Outgoing framework RPC done: service %d, peer %s, status %s:%d:%d\n",
+ rpc->crpc_service, libcfs_id2str(rpc->crpc_dest),
+ swi_state2str(rpc->crpc_wi.swi_state),
+ rpc->crpc_aborted, rpc->crpc_status);
+
+ spin_lock(&sfw_data.fw_lock);
+
+ /* my callers must finish all RPCs before shutting me down */
+ LASSERT(!sfw_data.fw_shuttingdown);
+ list_add(&rpc->crpc_list, &sfw_data.fw_zombie_rpcs);
+
+ spin_unlock(&sfw_data.fw_lock);
+}
+
+static sfw_batch_t *
+sfw_find_batch(lst_bid_t bid)
+{
+ sfw_session_t *sn = sfw_data.fw_session;
+ sfw_batch_t *bat;
+
+ LASSERT(sn != NULL);
+
+ list_for_each_entry(bat, &sn->sn_batches, bat_list) {
+ if (bat->bat_id.bat_id == bid.bat_id)
+ return bat;
+ }
+
+ return NULL;
+}
+
+static sfw_batch_t *
+sfw_bid2batch(lst_bid_t bid)
+{
+ sfw_session_t *sn = sfw_data.fw_session;
+ sfw_batch_t *bat;
+
+ LASSERT(sn != NULL);
+
+ bat = sfw_find_batch(bid);
+ if (bat != NULL)
+ return bat;
+
+ LIBCFS_ALLOC(bat, sizeof(sfw_batch_t));
+ if (bat == NULL)
+ return NULL;
+
+ bat->bat_error = 0;
+ bat->bat_session = sn;
+ bat->bat_id = bid;
+ atomic_set(&bat->bat_nactive, 0);
+ INIT_LIST_HEAD(&bat->bat_tests);
+
+ list_add_tail(&bat->bat_list, &sn->sn_batches);
+ return bat;
+}
+
+static int
+sfw_get_stats(srpc_stat_reqst_t *request, srpc_stat_reply_t *reply)
+{
+ sfw_session_t *sn = sfw_data.fw_session;
+ sfw_counters_t *cnt = &reply->str_fw;
+ sfw_batch_t *bat;
+ struct timeval tv;
+
+ reply->str_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id;
+
+ if (request->str_sid.ses_nid == LNET_NID_ANY) {
+ reply->str_status = EINVAL;
+ return 0;
+ }
+
+ if (sn == NULL || !sfw_sid_equal(request->str_sid, sn->sn_id)) {
+ reply->str_status = ESRCH;
+ return 0;
+ }
+
+ lnet_counters_get(&reply->str_lnet);
+ srpc_get_counters(&reply->str_rpc);
+
+ /* send over the msecs since the session was started
+ - with 32 bits to send, this is ~49 days */
+ cfs_duration_usec(cfs_time_sub(cfs_time_current(),
+ sn->sn_started), &tv);
+
+ cnt->running_ms = (__u32)(tv.tv_sec * 1000 + tv.tv_usec / 1000);
+ cnt->brw_errors = atomic_read(&sn->sn_brw_errors);
+ cnt->ping_errors = atomic_read(&sn->sn_ping_errors);
+ cnt->zombie_sessions = atomic_read(&sfw_data.fw_nzombies);
+
+ cnt->active_batches = 0;
+ list_for_each_entry(bat, &sn->sn_batches, bat_list) {
+ if (atomic_read(&bat->bat_nactive) > 0)
+ cnt->active_batches++;
+ }
+
+ reply->str_status = 0;
+ return 0;
+}
+
+int
+sfw_make_session(srpc_mksn_reqst_t *request, srpc_mksn_reply_t *reply)
+{
+ sfw_session_t *sn = sfw_data.fw_session;
+ srpc_msg_t *msg = container_of(request, srpc_msg_t,
+ msg_body.mksn_reqst);
+ int cplen = 0;
+
+ if (request->mksn_sid.ses_nid == LNET_NID_ANY) {
+ reply->mksn_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id;
+ reply->mksn_status = EINVAL;
+ return 0;
+ }
+
+ if (sn != NULL) {
+ reply->mksn_status = 0;
+ reply->mksn_sid = sn->sn_id;
+ reply->mksn_timeout = sn->sn_timeout;
+
+ if (sfw_sid_equal(request->mksn_sid, sn->sn_id)) {
+ atomic_inc(&sn->sn_refcount);
+ return 0;
+ }
+
+ if (!request->mksn_force) {
+ reply->mksn_status = EBUSY;
+ cplen = strlcpy(&reply->mksn_name[0], &sn->sn_name[0],
+ sizeof(reply->mksn_name));
+ if (cplen >= sizeof(reply->mksn_name))
+ return -E2BIG;
+ return 0;
+ }
+ }
+
+ /* reject the request if it requires unknown features
+ * NB: old version will always accept all features because it's not
+ * aware of srpc_msg_t::msg_ses_feats, it's a defect but it's also
+ * harmless because it will return zero feature to console, and it's
+ * console's responsibility to make sure all nodes in a session have
+ * same feature mask. */
+ if ((msg->msg_ses_feats & ~LST_FEATS_MASK) != 0) {
+ reply->mksn_status = EPROTO;
+ return 0;
+ }
+
+ /* brand new or create by force */
+ LIBCFS_ALLOC(sn, sizeof(sfw_session_t));
+ if (sn == NULL) {
+ CERROR("Dropping RPC (mksn) under memory pressure.\n");
+ return -ENOMEM;
+ }
+
+ sfw_init_session(sn, request->mksn_sid,
+ msg->msg_ses_feats, &request->mksn_name[0]);
+
+ spin_lock(&sfw_data.fw_lock);
+
+ sfw_deactivate_session();
+ LASSERT(sfw_data.fw_session == NULL);
+ sfw_data.fw_session = sn;
+
+ spin_unlock(&sfw_data.fw_lock);
+
+ reply->mksn_status = 0;
+ reply->mksn_sid = sn->sn_id;
+ reply->mksn_timeout = sn->sn_timeout;
+ return 0;
+}
+
+static int
+sfw_remove_session(srpc_rmsn_reqst_t *request, srpc_rmsn_reply_t *reply)
+{
+ sfw_session_t *sn = sfw_data.fw_session;
+
+ reply->rmsn_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id;
+
+ if (request->rmsn_sid.ses_nid == LNET_NID_ANY) {
+ reply->rmsn_status = EINVAL;
+ return 0;
+ }
+
+ if (sn == NULL || !sfw_sid_equal(request->rmsn_sid, sn->sn_id)) {
+ reply->rmsn_status = (sn == NULL) ? ESRCH : EBUSY;
+ return 0;
+ }
+
+ if (!atomic_dec_and_test(&sn->sn_refcount)) {
+ reply->rmsn_status = 0;
+ return 0;
+ }
+
+ spin_lock(&sfw_data.fw_lock);
+ sfw_deactivate_session();
+ spin_unlock(&sfw_data.fw_lock);
+
+ reply->rmsn_status = 0;
+ reply->rmsn_sid = LST_INVALID_SID;
+ LASSERT(sfw_data.fw_session == NULL);
+ return 0;
+}
+
+static int
+sfw_debug_session(srpc_debug_reqst_t *request, srpc_debug_reply_t *reply)
+{
+ sfw_session_t *sn = sfw_data.fw_session;
+
+ if (sn == NULL) {
+ reply->dbg_status = ESRCH;
+ reply->dbg_sid = LST_INVALID_SID;
+ return 0;
+ }
+
+ reply->dbg_status = 0;
+ reply->dbg_sid = sn->sn_id;
+ reply->dbg_timeout = sn->sn_timeout;
+ if (strlcpy(reply->dbg_name, &sn->sn_name[0], sizeof(reply->dbg_name))
+ >= sizeof(reply->dbg_name))
+ return -E2BIG;
+
+ return 0;
+}
+
+static void
+sfw_test_rpc_fini(srpc_client_rpc_t *rpc)
+{
+ sfw_test_unit_t *tsu = rpc->crpc_priv;
+ sfw_test_instance_t *tsi = tsu->tsu_instance;
+
+ /* Called with hold of tsi->tsi_lock */
+ LASSERT(list_empty(&rpc->crpc_list));
+ list_add(&rpc->crpc_list, &tsi->tsi_free_rpcs);
+}
+
+static inline int
+sfw_test_buffers(sfw_test_instance_t *tsi)
+{
+ struct sfw_test_case *tsc = sfw_find_test_case(tsi->tsi_service);
+ struct srpc_service *svc = tsc->tsc_srv_service;
+ int nbuf;
+
+ nbuf = min(svc->sv_wi_total, tsi->tsi_loop) / svc->sv_ncpts;
+ return max(SFW_TEST_WI_MIN, nbuf + SFW_TEST_WI_EXTRA);
+}
+
+static int
+sfw_load_test(struct sfw_test_instance *tsi)
+{
+ struct sfw_test_case *tsc;
+ struct srpc_service *svc;
+ int nbuf;
+ int rc;
+
+ LASSERT(tsi != NULL);
+ tsc = sfw_find_test_case(tsi->tsi_service);
+ nbuf = sfw_test_buffers(tsi);
+ LASSERT(tsc != NULL);
+ svc = tsc->tsc_srv_service;
+
+ if (tsi->tsi_is_client) {
+ tsi->tsi_ops = tsc->tsc_cli_ops;
+ return 0;
+ }
+
+ rc = srpc_service_add_buffers(svc, nbuf);
+ if (rc != 0) {
+ CWARN("Failed to reserve enough buffers: service %s, %d needed: %d\n",
+ svc->sv_name, nbuf, rc);
+ /* NB: this error handler is not strictly correct, because
+ * it may release more buffers than already allocated,
+ * but it doesn't matter because request portal should
+ * be lazy portal and will grow buffers if necessary. */
+ srpc_service_remove_buffers(svc, nbuf);
+ return -ENOMEM;
+ }
+
+ CDEBUG(D_NET, "Reserved %d buffers for test %s\n",
+ nbuf * (srpc_serv_is_framework(svc) ?
+ 1 : cfs_cpt_number(cfs_cpt_table)), svc->sv_name);
+ return 0;
+}
+
+static void
+sfw_unload_test(struct sfw_test_instance *tsi)
+{
+ struct sfw_test_case *tsc = sfw_find_test_case(tsi->tsi_service);
+
+ LASSERT(tsc != NULL);
+
+ if (tsi->tsi_is_client)
+ return;
+
+ /* shrink buffers, because request portal is lazy portal
+ * which can grow buffers at runtime so we may leave
+ * some buffers behind, but never mind... */
+ srpc_service_remove_buffers(tsc->tsc_srv_service,
+ sfw_test_buffers(tsi));
+ return;
+}
+
+static void
+sfw_destroy_test_instance(sfw_test_instance_t *tsi)
+{
+ srpc_client_rpc_t *rpc;
+ sfw_test_unit_t *tsu;
+
+ if (!tsi->tsi_is_client) goto clean;
+
+ tsi->tsi_ops->tso_fini(tsi);
+
+ LASSERT(!tsi->tsi_stopping);
+ LASSERT(list_empty(&tsi->tsi_active_rpcs));
+ LASSERT(!sfw_test_active(tsi));
+
+ while (!list_empty(&tsi->tsi_units)) {
+ tsu = list_entry(tsi->tsi_units.next,
+ sfw_test_unit_t, tsu_list);
+ list_del(&tsu->tsu_list);
+ LIBCFS_FREE(tsu, sizeof(*tsu));
+ }
+
+ while (!list_empty(&tsi->tsi_free_rpcs)) {
+ rpc = list_entry(tsi->tsi_free_rpcs.next,
+ srpc_client_rpc_t, crpc_list);
+ list_del(&rpc->crpc_list);
+ LIBCFS_FREE(rpc, srpc_client_rpc_size(rpc));
+ }
+
+clean:
+ sfw_unload_test(tsi);
+ LIBCFS_FREE(tsi, sizeof(*tsi));
+ return;
+}
+
+static void
+sfw_destroy_batch(sfw_batch_t *tsb)
+{
+ sfw_test_instance_t *tsi;
+
+ LASSERT(!sfw_batch_active(tsb));
+ LASSERT(list_empty(&tsb->bat_list));
+
+ while (!list_empty(&tsb->bat_tests)) {
+ tsi = list_entry(tsb->bat_tests.next,
+ sfw_test_instance_t, tsi_list);
+ list_del_init(&tsi->tsi_list);
+ sfw_destroy_test_instance(tsi);
+ }
+
+ LIBCFS_FREE(tsb, sizeof(sfw_batch_t));
+ return;
+}
+
+void
+sfw_destroy_session(sfw_session_t *sn)
+{
+ sfw_batch_t *batch;
+
+ LASSERT(list_empty(&sn->sn_list));
+ LASSERT(sn != sfw_data.fw_session);
+
+ while (!list_empty(&sn->sn_batches)) {
+ batch = list_entry(sn->sn_batches.next,
+ sfw_batch_t, bat_list);
+ list_del_init(&batch->bat_list);
+ sfw_destroy_batch(batch);
+ }
+
+ LIBCFS_FREE(sn, sizeof(*sn));
+ atomic_dec(&sfw_data.fw_nzombies);
+ return;
+}
+
+static void
+sfw_unpack_addtest_req(srpc_msg_t *msg)
+{
+ srpc_test_reqst_t *req = &msg->msg_body.tes_reqst;
+
+ LASSERT(msg->msg_type == SRPC_MSG_TEST_REQST);
+ LASSERT(req->tsr_is_client);
+
+ if (msg->msg_magic == SRPC_MSG_MAGIC)
+ return; /* no flipping needed */
+
+ LASSERT(msg->msg_magic == __swab32(SRPC_MSG_MAGIC));
+
+ if (req->tsr_service == SRPC_SERVICE_BRW) {
+ if ((msg->msg_ses_feats & LST_FEAT_BULK_LEN) == 0) {
+ test_bulk_req_t *bulk = &req->tsr_u.bulk_v0;
+
+ __swab32s(&bulk->blk_opc);
+ __swab32s(&bulk->blk_npg);
+ __swab32s(&bulk->blk_flags);
+
+ } else {
+ test_bulk_req_v1_t *bulk = &req->tsr_u.bulk_v1;
+
+ __swab16s(&bulk->blk_opc);
+ __swab16s(&bulk->blk_flags);
+ __swab32s(&bulk->blk_offset);
+ __swab32s(&bulk->blk_len);
+ }
+
+ return;
+ }
+
+ if (req->tsr_service == SRPC_SERVICE_PING) {
+ test_ping_req_t *ping = &req->tsr_u.ping;
+
+ __swab32s(&ping->png_size);
+ __swab32s(&ping->png_flags);
+ return;
+ }
+
+ LBUG();
+ return;
+}
+
+static int
+sfw_add_test_instance(sfw_batch_t *tsb, srpc_server_rpc_t *rpc)
+{
+ srpc_msg_t *msg = &rpc->srpc_reqstbuf->buf_msg;
+ srpc_test_reqst_t *req = &msg->msg_body.tes_reqst;
+ srpc_bulk_t *bk = rpc->srpc_bulk;
+ int ndest = req->tsr_ndest;
+ sfw_test_unit_t *tsu;
+ sfw_test_instance_t *tsi;
+ int i;
+ int rc;
+
+ LIBCFS_ALLOC(tsi, sizeof(*tsi));
+ if (tsi == NULL) {
+ CERROR("Can't allocate test instance for batch: %llu\n",
+ tsb->bat_id.bat_id);
+ return -ENOMEM;
+ }
+
+ spin_lock_init(&tsi->tsi_lock);
+ atomic_set(&tsi->tsi_nactive, 0);
+ INIT_LIST_HEAD(&tsi->tsi_units);
+ INIT_LIST_HEAD(&tsi->tsi_free_rpcs);
+ INIT_LIST_HEAD(&tsi->tsi_active_rpcs);
+
+ tsi->tsi_stopping = 0;
+ tsi->tsi_batch = tsb;
+ tsi->tsi_loop = req->tsr_loop;
+ tsi->tsi_concur = req->tsr_concur;
+ tsi->tsi_service = req->tsr_service;
+ tsi->tsi_is_client = !!(req->tsr_is_client);
+ tsi->tsi_stoptsu_onerr = !!(req->tsr_stop_onerr);
+
+ rc = sfw_load_test(tsi);
+ if (rc != 0) {
+ LIBCFS_FREE(tsi, sizeof(*tsi));
+ return rc;
+ }
+
+ LASSERT(!sfw_batch_active(tsb));
+
+ if (!tsi->tsi_is_client) {
+ /* it's test server, just add it to tsb */
+ list_add_tail(&tsi->tsi_list, &tsb->bat_tests);
+ return 0;
+ }
+
+ LASSERT(bk != NULL);
+ LASSERT(bk->bk_niov * SFW_ID_PER_PAGE >= (unsigned int)ndest);
+ LASSERT((unsigned int)bk->bk_len >=
+ sizeof(lnet_process_id_packed_t) * ndest);
+
+ sfw_unpack_addtest_req(msg);
+ memcpy(&tsi->tsi_u, &req->tsr_u, sizeof(tsi->tsi_u));
+
+ for (i = 0; i < ndest; i++) {
+ lnet_process_id_packed_t *dests;
+ lnet_process_id_packed_t id;
+ int j;
+
+ dests = page_address(bk->bk_iovs[i / SFW_ID_PER_PAGE].kiov_page);
+ LASSERT(dests != NULL); /* my pages are within KVM always */
+ id = dests[i % SFW_ID_PER_PAGE];
+ if (msg->msg_magic != SRPC_MSG_MAGIC)
+ sfw_unpack_id(id);
+
+ for (j = 0; j < tsi->tsi_concur; j++) {
+ LIBCFS_ALLOC(tsu, sizeof(sfw_test_unit_t));
+ if (tsu == NULL) {
+ rc = -ENOMEM;
+ CERROR("Can't allocate tsu for %d\n",
+ tsi->tsi_service);
+ goto error;
+ }
+
+ tsu->tsu_dest.nid = id.nid;
+ tsu->tsu_dest.pid = id.pid;
+ tsu->tsu_instance = tsi;
+ tsu->tsu_private = NULL;
+ list_add_tail(&tsu->tsu_list, &tsi->tsi_units);
+ }
+ }
+
+ rc = tsi->tsi_ops->tso_init(tsi);
+ if (rc == 0) {
+ list_add_tail(&tsi->tsi_list, &tsb->bat_tests);
+ return 0;
+ }
+
+error:
+ LASSERT(rc != 0);
+ sfw_destroy_test_instance(tsi);
+ return rc;
+}
+
+static void
+sfw_test_unit_done(sfw_test_unit_t *tsu)
+{
+ sfw_test_instance_t *tsi = tsu->tsu_instance;
+ sfw_batch_t *tsb = tsi->tsi_batch;
+ sfw_session_t *sn = tsb->bat_session;
+
+ LASSERT(sfw_test_active(tsi));
+
+ if (!atomic_dec_and_test(&tsi->tsi_nactive))
+ return;
+
+ /* the test instance is done */
+ spin_lock(&tsi->tsi_lock);
+
+ tsi->tsi_stopping = 0;
+
+ spin_unlock(&tsi->tsi_lock);
+
+ spin_lock(&sfw_data.fw_lock);
+
+ if (!atomic_dec_and_test(&tsb->bat_nactive) ||/* tsb still active */
+ sn == sfw_data.fw_session) { /* sn also active */
+ spin_unlock(&sfw_data.fw_lock);
+ return;
+ }
+
+ LASSERT(!list_empty(&sn->sn_list)); /* I'm a zombie! */
+
+ list_for_each_entry(tsb, &sn->sn_batches, bat_list) {
+ if (sfw_batch_active(tsb)) {
+ spin_unlock(&sfw_data.fw_lock);
+ return;
+ }
+ }
+
+ list_del_init(&sn->sn_list);
+ spin_unlock(&sfw_data.fw_lock);
+
+ sfw_destroy_session(sn);
+ return;
+}
+
+static void
+sfw_test_rpc_done(srpc_client_rpc_t *rpc)
+{
+ sfw_test_unit_t *tsu = rpc->crpc_priv;
+ sfw_test_instance_t *tsi = tsu->tsu_instance;
+ int done = 0;
+
+ tsi->tsi_ops->tso_done_rpc(tsu, rpc);
+
+ spin_lock(&tsi->tsi_lock);
+
+ LASSERT(sfw_test_active(tsi));
+ LASSERT(!list_empty(&rpc->crpc_list));
+
+ list_del_init(&rpc->crpc_list);
+
+ /* batch is stopping or loop is done or get error */
+ if (tsi->tsi_stopping ||
+ tsu->tsu_loop == 0 ||
+ (rpc->crpc_status != 0 && tsi->tsi_stoptsu_onerr))
+ done = 1;
+
+ /* dec ref for poster */
+ srpc_client_rpc_decref(rpc);
+
+ spin_unlock(&tsi->tsi_lock);
+
+ if (!done) {
+ swi_schedule_workitem(&tsu->tsu_worker);
+ return;
+ }
+
+ sfw_test_unit_done(tsu);
+ return;
+}
+
+int
+sfw_create_test_rpc(sfw_test_unit_t *tsu, lnet_process_id_t peer,
+ unsigned features, int nblk, int blklen,
+ srpc_client_rpc_t **rpcpp)
+{
+ srpc_client_rpc_t *rpc = NULL;
+ sfw_test_instance_t *tsi = tsu->tsu_instance;
+
+ spin_lock(&tsi->tsi_lock);
+
+ LASSERT(sfw_test_active(tsi));
+
+ if (!list_empty(&tsi->tsi_free_rpcs)) {
+ /* pick request from buffer */
+ rpc = list_entry(tsi->tsi_free_rpcs.next,
+ srpc_client_rpc_t, crpc_list);
+ LASSERT(nblk == rpc->crpc_bulk.bk_niov);
+ list_del_init(&rpc->crpc_list);
+ }
+
+ spin_unlock(&tsi->tsi_lock);
+
+ if (rpc == NULL) {
+ rpc = srpc_create_client_rpc(peer, tsi->tsi_service, nblk,
+ blklen, sfw_test_rpc_done,
+ sfw_test_rpc_fini, tsu);
+ } else {
+ srpc_init_client_rpc(rpc, peer, tsi->tsi_service, nblk,
+ blklen, sfw_test_rpc_done,
+ sfw_test_rpc_fini, tsu);
+ }
+
+ if (rpc == NULL) {
+ CERROR("Can't create rpc for test %d\n", tsi->tsi_service);
+ return -ENOMEM;
+ }
+
+ rpc->crpc_reqstmsg.msg_ses_feats = features;
+ *rpcpp = rpc;
+
+ return 0;
+}
+
+static int
+sfw_run_test(swi_workitem_t *wi)
+{
+ sfw_test_unit_t *tsu = wi->swi_workitem.wi_data;
+ sfw_test_instance_t *tsi = tsu->tsu_instance;
+ srpc_client_rpc_t *rpc = NULL;
+
+ LASSERT(wi == &tsu->tsu_worker);
+
+ if (tsi->tsi_ops->tso_prep_rpc(tsu, tsu->tsu_dest, &rpc) != 0) {
+ LASSERT(rpc == NULL);
+ goto test_done;
+ }
+
+ LASSERT(rpc != NULL);
+
+ spin_lock(&tsi->tsi_lock);
+
+ if (tsi->tsi_stopping) {
+ list_add(&rpc->crpc_list, &tsi->tsi_free_rpcs);
+ spin_unlock(&tsi->tsi_lock);
+ goto test_done;
+ }
+
+ if (tsu->tsu_loop > 0)
+ tsu->tsu_loop--;
+
+ list_add_tail(&rpc->crpc_list, &tsi->tsi_active_rpcs);
+ spin_unlock(&tsi->tsi_lock);
+
+ rpc->crpc_timeout = rpc_timeout;
+
+ spin_lock(&rpc->crpc_lock);
+ srpc_post_rpc(rpc);
+ spin_unlock(&rpc->crpc_lock);
+ return 0;
+
+test_done:
+ /*
+ * No one can schedule me now since:
+ * - previous RPC, if any, has done and
+ * - no new RPC is initiated.
+ * - my batch is still active; no one can run it again now.
+ * Cancel pending schedules and prevent future schedule attempts:
+ */
+ swi_exit_workitem(wi);
+ sfw_test_unit_done(tsu);
+ return 1;
+}
+
+static int
+sfw_run_batch(sfw_batch_t *tsb)
+{
+ swi_workitem_t *wi;
+ sfw_test_unit_t *tsu;
+ sfw_test_instance_t *tsi;
+
+ if (sfw_batch_active(tsb)) {
+ CDEBUG(D_NET, "Batch already active: %llu (%d)\n",
+ tsb->bat_id.bat_id, atomic_read(&tsb->bat_nactive));
+ return 0;
+ }
+
+ list_for_each_entry(tsi, &tsb->bat_tests, tsi_list) {
+ if (!tsi->tsi_is_client) /* skip server instances */
+ continue;
+
+ LASSERT(!tsi->tsi_stopping);
+ LASSERT(!sfw_test_active(tsi));
+
+ atomic_inc(&tsb->bat_nactive);
+
+ list_for_each_entry(tsu, &tsi->tsi_units, tsu_list) {
+ atomic_inc(&tsi->tsi_nactive);
+ tsu->tsu_loop = tsi->tsi_loop;
+ wi = &tsu->tsu_worker;
+ swi_init_workitem(wi, tsu, sfw_run_test,
+ lst_sched_test[\
+ lnet_cpt_of_nid(tsu->tsu_dest.nid)]);
+ swi_schedule_workitem(wi);
+ }
+ }
+
+ return 0;
+}
+
+int
+sfw_stop_batch(sfw_batch_t *tsb, int force)
+{
+ sfw_test_instance_t *tsi;
+ srpc_client_rpc_t *rpc;
+
+ if (!sfw_batch_active(tsb)) {
+ CDEBUG(D_NET, "Batch %llu inactive\n", tsb->bat_id.bat_id);
+ return 0;
+ }
+
+ list_for_each_entry(tsi, &tsb->bat_tests, tsi_list) {
+ spin_lock(&tsi->tsi_lock);
+
+ if (!tsi->tsi_is_client ||
+ !sfw_test_active(tsi) || tsi->tsi_stopping) {
+ spin_unlock(&tsi->tsi_lock);
+ continue;
+ }
+
+ tsi->tsi_stopping = 1;
+
+ if (!force) {
+ spin_unlock(&tsi->tsi_lock);
+ continue;
+ }
+
+ /* abort launched rpcs in the test */
+ list_for_each_entry(rpc, &tsi->tsi_active_rpcs, crpc_list) {
+ spin_lock(&rpc->crpc_lock);
+
+ srpc_abort_rpc(rpc, -EINTR);
+
+ spin_unlock(&rpc->crpc_lock);
+ }
+
+ spin_unlock(&tsi->tsi_lock);
+ }
+
+ return 0;
+}
+
+static int
+sfw_query_batch(sfw_batch_t *tsb, int testidx, srpc_batch_reply_t *reply)
+{
+ sfw_test_instance_t *tsi;
+
+ if (testidx < 0)
+ return -EINVAL;
+
+ if (testidx == 0) {
+ reply->bar_active = atomic_read(&tsb->bat_nactive);
+ return 0;
+ }
+
+ list_for_each_entry(tsi, &tsb->bat_tests, tsi_list) {
+ if (testidx-- > 1)
+ continue;
+
+ reply->bar_active = atomic_read(&tsi->tsi_nactive);
+ return 0;
+ }
+
+ return -ENOENT;
+}
+
+void
+sfw_free_pages(srpc_server_rpc_t *rpc)
+{
+ srpc_free_bulk(rpc->srpc_bulk);
+ rpc->srpc_bulk = NULL;
+}
+
+int
+sfw_alloc_pages(struct srpc_server_rpc *rpc, int cpt, int npages, int len,
+ int sink)
+{
+ LASSERT(rpc->srpc_bulk == NULL);
+ LASSERT(npages > 0 && npages <= LNET_MAX_IOV);
+
+ rpc->srpc_bulk = srpc_alloc_bulk(cpt, npages, len, sink);
+ if (rpc->srpc_bulk == NULL)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static int
+sfw_add_test(srpc_server_rpc_t *rpc)
+{
+ sfw_session_t *sn = sfw_data.fw_session;
+ srpc_test_reply_t *reply = &rpc->srpc_replymsg.msg_body.tes_reply;
+ srpc_test_reqst_t *request;
+ int rc;
+ sfw_batch_t *bat;
+
+ request = &rpc->srpc_reqstbuf->buf_msg.msg_body.tes_reqst;
+ reply->tsr_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id;
+
+ if (request->tsr_loop == 0 ||
+ request->tsr_concur == 0 ||
+ request->tsr_sid.ses_nid == LNET_NID_ANY ||
+ request->tsr_ndest > SFW_MAX_NDESTS ||
+ (request->tsr_is_client && request->tsr_ndest == 0) ||
+ request->tsr_concur > SFW_MAX_CONCUR ||
+ request->tsr_service > SRPC_SERVICE_MAX_ID ||
+ request->tsr_service <= SRPC_FRAMEWORK_SERVICE_MAX_ID) {
+ reply->tsr_status = EINVAL;
+ return 0;
+ }
+
+ if (sn == NULL || !sfw_sid_equal(request->tsr_sid, sn->sn_id) ||
+ sfw_find_test_case(request->tsr_service) == NULL) {
+ reply->tsr_status = ENOENT;
+ return 0;
+ }
+
+ bat = sfw_bid2batch(request->tsr_bid);
+ if (bat == NULL) {
+ CERROR("Dropping RPC (%s) from %s under memory pressure.\n",
+ rpc->srpc_scd->scd_svc->sv_name,
+ libcfs_id2str(rpc->srpc_peer));
+ return -ENOMEM;
+ }
+
+ if (sfw_batch_active(bat)) {
+ reply->tsr_status = EBUSY;
+ return 0;
+ }
+
+ if (request->tsr_is_client && rpc->srpc_bulk == NULL) {
+ /* rpc will be resumed later in sfw_bulk_ready */
+ int npg = sfw_id_pages(request->tsr_ndest);
+ int len;
+
+ if ((sn->sn_features & LST_FEAT_BULK_LEN) == 0) {
+ len = npg * PAGE_CACHE_SIZE;
+
+ } else {
+ len = sizeof(lnet_process_id_packed_t) *
+ request->tsr_ndest;
+ }
+
+ return sfw_alloc_pages(rpc, CFS_CPT_ANY, npg, len, 1);
+ }
+
+ rc = sfw_add_test_instance(bat, rpc);
+ CDEBUG(rc == 0 ? D_NET : D_WARNING,
+ "%s test: sv %d %s, loop %d, concur %d, ndest %d\n",
+ rc == 0 ? "Added" : "Failed to add", request->tsr_service,
+ request->tsr_is_client ? "client" : "server",
+ request->tsr_loop, request->tsr_concur, request->tsr_ndest);
+
+ reply->tsr_status = (rc < 0) ? -rc : rc;
+ return 0;
+}
+
+static int
+sfw_control_batch(srpc_batch_reqst_t *request, srpc_batch_reply_t *reply)
+{
+ sfw_session_t *sn = sfw_data.fw_session;
+ int rc = 0;
+ sfw_batch_t *bat;
+
+ reply->bar_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id;
+
+ if (sn == NULL || !sfw_sid_equal(request->bar_sid, sn->sn_id)) {
+ reply->bar_status = ESRCH;
+ return 0;
+ }
+
+ bat = sfw_find_batch(request->bar_bid);
+ if (bat == NULL) {
+ reply->bar_status = ENOENT;
+ return 0;
+ }
+
+ switch (request->bar_opc) {
+ case SRPC_BATCH_OPC_RUN:
+ rc = sfw_run_batch(bat);
+ break;
+
+ case SRPC_BATCH_OPC_STOP:
+ rc = sfw_stop_batch(bat, request->bar_arg);
+ break;
+
+ case SRPC_BATCH_OPC_QUERY:
+ rc = sfw_query_batch(bat, request->bar_testidx, reply);
+ break;
+
+ default:
+ return -EINVAL; /* drop it */
+ }
+
+ reply->bar_status = (rc < 0) ? -rc : rc;
+ return 0;
+}
+
+static int
+sfw_handle_server_rpc(struct srpc_server_rpc *rpc)
+{
+ struct srpc_service *sv = rpc->srpc_scd->scd_svc;
+ srpc_msg_t *reply = &rpc->srpc_replymsg;
+ srpc_msg_t *request = &rpc->srpc_reqstbuf->buf_msg;
+ unsigned features = LST_FEATS_MASK;
+ int rc = 0;
+
+ LASSERT(sfw_data.fw_active_srpc == NULL);
+ LASSERT(sv->sv_id <= SRPC_FRAMEWORK_SERVICE_MAX_ID);
+
+ spin_lock(&sfw_data.fw_lock);
+
+ if (sfw_data.fw_shuttingdown) {
+ spin_unlock(&sfw_data.fw_lock);
+ return -ESHUTDOWN;
+ }
+
+ /* Remove timer to avoid racing with it or expiring active session */
+ if (sfw_del_session_timer() != 0) {
+ CERROR("Dropping RPC (%s) from %s: racing with expiry timer.",
+ sv->sv_name, libcfs_id2str(rpc->srpc_peer));
+ spin_unlock(&sfw_data.fw_lock);
+ return -EAGAIN;
+ }
+
+ sfw_data.fw_active_srpc = rpc;
+ spin_unlock(&sfw_data.fw_lock);
+
+ sfw_unpack_message(request);
+ LASSERT(request->msg_type == srpc_service2request(sv->sv_id));
+
+ /* rpc module should have checked this */
+ LASSERT(request->msg_version == SRPC_MSG_VERSION);
+
+ if (sv->sv_id != SRPC_SERVICE_MAKE_SESSION &&
+ sv->sv_id != SRPC_SERVICE_DEBUG) {
+ sfw_session_t *sn = sfw_data.fw_session;
+
+ if (sn != NULL &&
+ sn->sn_features != request->msg_ses_feats) {
+ CNETERR("Features of framework RPC don't match features of current session: %x/%x\n",
+ request->msg_ses_feats, sn->sn_features);
+ reply->msg_body.reply.status = EPROTO;
+ reply->msg_body.reply.sid = sn->sn_id;
+ goto out;
+ }
+
+ } else if ((request->msg_ses_feats & ~LST_FEATS_MASK) != 0) {
+ /* NB: at this point, old version will ignore features and
+ * create new session anyway, so console should be able
+ * to handle this */
+ reply->msg_body.reply.status = EPROTO;
+ goto out;
+ }
+
+ switch (sv->sv_id) {
+ default:
+ LBUG();
+ case SRPC_SERVICE_TEST:
+ rc = sfw_add_test(rpc);
+ break;
+
+ case SRPC_SERVICE_BATCH:
+ rc = sfw_control_batch(&request->msg_body.bat_reqst,
+ &reply->msg_body.bat_reply);
+ break;
+
+ case SRPC_SERVICE_QUERY_STAT:
+ rc = sfw_get_stats(&request->msg_body.stat_reqst,
+ &reply->msg_body.stat_reply);
+ break;
+
+ case SRPC_SERVICE_DEBUG:
+ rc = sfw_debug_session(&request->msg_body.dbg_reqst,
+ &reply->msg_body.dbg_reply);
+ break;
+
+ case SRPC_SERVICE_MAKE_SESSION:
+ rc = sfw_make_session(&request->msg_body.mksn_reqst,
+ &reply->msg_body.mksn_reply);
+ break;
+
+ case SRPC_SERVICE_REMOVE_SESSION:
+ rc = sfw_remove_session(&request->msg_body.rmsn_reqst,
+ &reply->msg_body.rmsn_reply);
+ break;
+ }
+
+ if (sfw_data.fw_session != NULL)
+ features = sfw_data.fw_session->sn_features;
+ out:
+ reply->msg_ses_feats = features;
+ rpc->srpc_done = sfw_server_rpc_done;
+ spin_lock(&sfw_data.fw_lock);
+
+ if (!sfw_data.fw_shuttingdown)
+ sfw_add_session_timer();
+
+ sfw_data.fw_active_srpc = NULL;
+ spin_unlock(&sfw_data.fw_lock);
+ return rc;
+}
+
+static int
+sfw_bulk_ready(struct srpc_server_rpc *rpc, int status)
+{
+ struct srpc_service *sv = rpc->srpc_scd->scd_svc;
+ int rc;
+
+ LASSERT(rpc->srpc_bulk != NULL);
+ LASSERT(sv->sv_id == SRPC_SERVICE_TEST);
+ LASSERT(sfw_data.fw_active_srpc == NULL);
+ LASSERT(rpc->srpc_reqstbuf->buf_msg.msg_body.tes_reqst.tsr_is_client);
+
+ spin_lock(&sfw_data.fw_lock);
+
+ if (status != 0) {
+ CERROR("Bulk transfer failed for RPC: service %s, peer %s, status %d\n",
+ sv->sv_name, libcfs_id2str(rpc->srpc_peer), status);
+ spin_unlock(&sfw_data.fw_lock);
+ return -EIO;
+ }
+
+ if (sfw_data.fw_shuttingdown) {
+ spin_unlock(&sfw_data.fw_lock);
+ return -ESHUTDOWN;
+ }
+
+ if (sfw_del_session_timer() != 0) {
+ CERROR("Dropping RPC (%s) from %s: racing with expiry timer",
+ sv->sv_name, libcfs_id2str(rpc->srpc_peer));
+ spin_unlock(&sfw_data.fw_lock);
+ return -EAGAIN;
+ }
+
+ sfw_data.fw_active_srpc = rpc;
+ spin_unlock(&sfw_data.fw_lock);
+
+ rc = sfw_add_test(rpc);
+
+ spin_lock(&sfw_data.fw_lock);
+
+ if (!sfw_data.fw_shuttingdown)
+ sfw_add_session_timer();
+
+ sfw_data.fw_active_srpc = NULL;
+ spin_unlock(&sfw_data.fw_lock);
+ return rc;
+}
+
+srpc_client_rpc_t *
+sfw_create_rpc(lnet_process_id_t peer, int service,
+ unsigned features, int nbulkiov, int bulklen,
+ void (*done)(srpc_client_rpc_t *), void *priv)
+{
+ srpc_client_rpc_t *rpc = NULL;
+
+ spin_lock(&sfw_data.fw_lock);
+
+ LASSERT(!sfw_data.fw_shuttingdown);
+ LASSERT(service <= SRPC_FRAMEWORK_SERVICE_MAX_ID);
+
+ if (nbulkiov == 0 && !list_empty(&sfw_data.fw_zombie_rpcs)) {
+ rpc = list_entry(sfw_data.fw_zombie_rpcs.next,
+ srpc_client_rpc_t, crpc_list);
+ list_del(&rpc->crpc_list);
+
+ srpc_init_client_rpc(rpc, peer, service, 0, 0,
+ done, sfw_client_rpc_fini, priv);
+ }
+
+ spin_unlock(&sfw_data.fw_lock);
+
+ if (rpc == NULL) {
+ rpc = srpc_create_client_rpc(peer, service,
+ nbulkiov, bulklen, done,
+ nbulkiov != 0 ? NULL :
+ sfw_client_rpc_fini,
+ priv);
+ }
+
+ if (rpc != NULL) /* "session" is concept in framework */
+ rpc->crpc_reqstmsg.msg_ses_feats = features;
+
+ return rpc;
+}
+
+void
+sfw_unpack_message(srpc_msg_t *msg)
+{
+ if (msg->msg_magic == SRPC_MSG_MAGIC)
+ return; /* no flipping needed */
+
+ /* srpc module should guarantee I wouldn't get crap */
+ LASSERT(msg->msg_magic == __swab32(SRPC_MSG_MAGIC));
+
+ if (msg->msg_type == SRPC_MSG_STAT_REQST) {
+ srpc_stat_reqst_t *req = &msg->msg_body.stat_reqst;
+
+ __swab32s(&req->str_type);
+ __swab64s(&req->str_rpyid);
+ sfw_unpack_sid(req->str_sid);
+ return;
+ }
+
+ if (msg->msg_type == SRPC_MSG_STAT_REPLY) {
+ srpc_stat_reply_t *rep = &msg->msg_body.stat_reply;
+
+ __swab32s(&rep->str_status);
+ sfw_unpack_sid(rep->str_sid);
+ sfw_unpack_fw_counters(rep->str_fw);
+ sfw_unpack_rpc_counters(rep->str_rpc);
+ sfw_unpack_lnet_counters(rep->str_lnet);
+ return;
+ }
+
+ if (msg->msg_type == SRPC_MSG_MKSN_REQST) {
+ srpc_mksn_reqst_t *req = &msg->msg_body.mksn_reqst;
+
+ __swab64s(&req->mksn_rpyid);
+ __swab32s(&req->mksn_force);
+ sfw_unpack_sid(req->mksn_sid);
+ return;
+ }
+
+ if (msg->msg_type == SRPC_MSG_MKSN_REPLY) {
+ srpc_mksn_reply_t *rep = &msg->msg_body.mksn_reply;
+
+ __swab32s(&rep->mksn_status);
+ __swab32s(&rep->mksn_timeout);
+ sfw_unpack_sid(rep->mksn_sid);
+ return;
+ }
+
+ if (msg->msg_type == SRPC_MSG_RMSN_REQST) {
+ srpc_rmsn_reqst_t *req = &msg->msg_body.rmsn_reqst;
+
+ __swab64s(&req->rmsn_rpyid);
+ sfw_unpack_sid(req->rmsn_sid);
+ return;
+ }
+
+ if (msg->msg_type == SRPC_MSG_RMSN_REPLY) {
+ srpc_rmsn_reply_t *rep = &msg->msg_body.rmsn_reply;
+
+ __swab32s(&rep->rmsn_status);
+ sfw_unpack_sid(rep->rmsn_sid);
+ return;
+ }
+
+ if (msg->msg_type == SRPC_MSG_DEBUG_REQST) {
+ srpc_debug_reqst_t *req = &msg->msg_body.dbg_reqst;
+
+ __swab64s(&req->dbg_rpyid);
+ __swab32s(&req->dbg_flags);
+ sfw_unpack_sid(req->dbg_sid);
+ return;
+ }
+
+ if (msg->msg_type == SRPC_MSG_DEBUG_REPLY) {
+ srpc_debug_reply_t *rep = &msg->msg_body.dbg_reply;
+
+ __swab32s(&rep->dbg_nbatch);
+ __swab32s(&rep->dbg_timeout);
+ sfw_unpack_sid(rep->dbg_sid);
+ return;
+ }
+
+ if (msg->msg_type == SRPC_MSG_BATCH_REQST) {
+ srpc_batch_reqst_t *req = &msg->msg_body.bat_reqst;
+
+ __swab32s(&req->bar_opc);
+ __swab64s(&req->bar_rpyid);
+ __swab32s(&req->bar_testidx);
+ __swab32s(&req->bar_arg);
+ sfw_unpack_sid(req->bar_sid);
+ __swab64s(&req->bar_bid.bat_id);
+ return;
+ }
+
+ if (msg->msg_type == SRPC_MSG_BATCH_REPLY) {
+ srpc_batch_reply_t *rep = &msg->msg_body.bat_reply;
+
+ __swab32s(&rep->bar_status);
+ sfw_unpack_sid(rep->bar_sid);
+ return;
+ }
+
+ if (msg->msg_type == SRPC_MSG_TEST_REQST) {
+ srpc_test_reqst_t *req = &msg->msg_body.tes_reqst;
+
+ __swab64s(&req->tsr_rpyid);
+ __swab64s(&req->tsr_bulkid);
+ __swab32s(&req->tsr_loop);
+ __swab32s(&req->tsr_ndest);
+ __swab32s(&req->tsr_concur);
+ __swab32s(&req->tsr_service);
+ sfw_unpack_sid(req->tsr_sid);
+ __swab64s(&req->tsr_bid.bat_id);
+ return;
+ }
+
+ if (msg->msg_type == SRPC_MSG_TEST_REPLY) {
+ srpc_test_reply_t *rep = &msg->msg_body.tes_reply;
+
+ __swab32s(&rep->tsr_status);
+ sfw_unpack_sid(rep->tsr_sid);
+ return;
+ }
+
+ if (msg->msg_type == SRPC_MSG_JOIN_REQST) {
+ srpc_join_reqst_t *req = &msg->msg_body.join_reqst;
+
+ __swab64s(&req->join_rpyid);
+ sfw_unpack_sid(req->join_sid);
+ return;
+ }
+
+ if (msg->msg_type == SRPC_MSG_JOIN_REPLY) {
+ srpc_join_reply_t *rep = &msg->msg_body.join_reply;
+
+ __swab32s(&rep->join_status);
+ __swab32s(&rep->join_timeout);
+ sfw_unpack_sid(rep->join_sid);
+ return;
+ }
+
+ LBUG();
+ return;
+}
+
+void
+sfw_abort_rpc(srpc_client_rpc_t *rpc)
+{
+ LASSERT(atomic_read(&rpc->crpc_refcount) > 0);
+ LASSERT(rpc->crpc_service <= SRPC_FRAMEWORK_SERVICE_MAX_ID);
+
+ spin_lock(&rpc->crpc_lock);
+ srpc_abort_rpc(rpc, -EINTR);
+ spin_unlock(&rpc->crpc_lock);
+ return;
+}
+
+void
+sfw_post_rpc(srpc_client_rpc_t *rpc)
+{
+ spin_lock(&rpc->crpc_lock);
+
+ LASSERT(!rpc->crpc_closed);
+ LASSERT(!rpc->crpc_aborted);
+ LASSERT(list_empty(&rpc->crpc_list));
+ LASSERT(!sfw_data.fw_shuttingdown);
+
+ rpc->crpc_timeout = rpc_timeout;
+ srpc_post_rpc(rpc);
+
+ spin_unlock(&rpc->crpc_lock);
+ return;
+}
+
+static srpc_service_t sfw_services[] = {
+ {
+ /* sv_id */ SRPC_SERVICE_DEBUG,
+ /* sv_name */ "debug",
+ 0
+ },
+ {
+ /* sv_id */ SRPC_SERVICE_QUERY_STAT,
+ /* sv_name */ "query stats",
+ 0
+ },
+ {
+ /* sv_id */ SRPC_SERVICE_MAKE_SESSION,
+ /* sv_name */ "make session",
+ 0
+ },
+ {
+ /* sv_id */ SRPC_SERVICE_REMOVE_SESSION,
+ /* sv_name */ "remove session",
+ 0
+ },
+ {
+ /* sv_id */ SRPC_SERVICE_BATCH,
+ /* sv_name */ "batch service",
+ 0
+ },
+ {
+ /* sv_id */ SRPC_SERVICE_TEST,
+ /* sv_name */ "test service",
+ 0
+ },
+ {
+ /* sv_id */ 0,
+ /* sv_name */ NULL,
+ 0
+ }
+};
+
+extern sfw_test_client_ops_t ping_test_client;
+extern srpc_service_t ping_test_service;
+extern void ping_init_test_client(void);
+extern void ping_init_test_service(void);
+
+extern sfw_test_client_ops_t brw_test_client;
+extern srpc_service_t brw_test_service;
+extern void brw_init_test_client(void);
+extern void brw_init_test_service(void);
+
+
+int
+sfw_startup(void)
+{
+ int i;
+ int rc;
+ int error;
+ srpc_service_t *sv;
+ sfw_test_case_t *tsc;
+
+
+ if (session_timeout < 0) {
+ CERROR("Session timeout must be non-negative: %d\n",
+ session_timeout);
+ return -EINVAL;
+ }
+
+ if (rpc_timeout < 0) {
+ CERROR("RPC timeout must be non-negative: %d\n",
+ rpc_timeout);
+ return -EINVAL;
+ }
+
+ if (session_timeout == 0)
+ CWARN("Zero session_timeout specified - test sessions never expire.\n");
+
+ if (rpc_timeout == 0)
+ CWARN("Zero rpc_timeout specified - test RPC never expire.\n");
+
+ memset(&sfw_data, 0, sizeof(struct smoketest_framework));
+
+ sfw_data.fw_session = NULL;
+ sfw_data.fw_active_srpc = NULL;
+ spin_lock_init(&sfw_data.fw_lock);
+ atomic_set(&sfw_data.fw_nzombies, 0);
+ INIT_LIST_HEAD(&sfw_data.fw_tests);
+ INIT_LIST_HEAD(&sfw_data.fw_zombie_rpcs);
+ INIT_LIST_HEAD(&sfw_data.fw_zombie_sessions);
+
+ brw_init_test_client();
+ brw_init_test_service();
+ rc = sfw_register_test(&brw_test_service, &brw_test_client);
+ LASSERT(rc == 0);
+
+ ping_init_test_client();
+ ping_init_test_service();
+ rc = sfw_register_test(&ping_test_service, &ping_test_client);
+ LASSERT(rc == 0);
+
+ error = 0;
+ list_for_each_entry(tsc, &sfw_data.fw_tests, tsc_list) {
+ sv = tsc->tsc_srv_service;
+
+ rc = srpc_add_service(sv);
+ LASSERT(rc != -EBUSY);
+ if (rc != 0) {
+ CWARN("Failed to add %s service: %d\n",
+ sv->sv_name, rc);
+ error = rc;
+ }
+ }
+
+ for (i = 0; ; i++) {
+ sv = &sfw_services[i];
+ if (sv->sv_name == NULL) break;
+
+ sv->sv_bulk_ready = NULL;
+ sv->sv_handler = sfw_handle_server_rpc;
+ sv->sv_wi_total = SFW_FRWK_WI_MAX;
+ if (sv->sv_id == SRPC_SERVICE_TEST)
+ sv->sv_bulk_ready = sfw_bulk_ready;
+
+ rc = srpc_add_service(sv);
+ LASSERT(rc != -EBUSY);
+ if (rc != 0) {
+ CWARN("Failed to add %s service: %d\n",
+ sv->sv_name, rc);
+ error = rc;
+ }
+
+ /* about to sfw_shutdown, no need to add buffer */
+ if (error) continue;
+
+ rc = srpc_service_add_buffers(sv, sv->sv_wi_total);
+ if (rc != 0) {
+ CWARN("Failed to reserve enough buffers: service %s, %d needed: %d\n",
+ sv->sv_name, sv->sv_wi_total, rc);
+ error = -ENOMEM;
+ }
+ }
+
+ if (error != 0)
+ sfw_shutdown();
+ return error;
+}
+
+void
+sfw_shutdown(void)
+{
+ srpc_service_t *sv;
+ sfw_test_case_t *tsc;
+ int i;
+
+ spin_lock(&sfw_data.fw_lock);
+
+ sfw_data.fw_shuttingdown = 1;
+ lst_wait_until(sfw_data.fw_active_srpc == NULL, sfw_data.fw_lock,
+ "waiting for active RPC to finish.\n");
+
+ if (sfw_del_session_timer() != 0)
+ lst_wait_until(sfw_data.fw_session == NULL, sfw_data.fw_lock,
+ "waiting for session timer to explode.\n");
+
+ sfw_deactivate_session();
+ lst_wait_until(atomic_read(&sfw_data.fw_nzombies) == 0,
+ sfw_data.fw_lock,
+ "waiting for %d zombie sessions to die.\n",
+ atomic_read(&sfw_data.fw_nzombies));
+
+ spin_unlock(&sfw_data.fw_lock);
+
+ for (i = 0; ; i++) {
+ sv = &sfw_services[i];
+ if (sv->sv_name == NULL)
+ break;
+
+ srpc_shutdown_service(sv);
+ srpc_remove_service(sv);
+ }
+
+ list_for_each_entry(tsc, &sfw_data.fw_tests, tsc_list) {
+ sv = tsc->tsc_srv_service;
+ srpc_shutdown_service(sv);
+ srpc_remove_service(sv);
+ }
+
+ while (!list_empty(&sfw_data.fw_zombie_rpcs)) {
+ srpc_client_rpc_t *rpc;
+
+ rpc = list_entry(sfw_data.fw_zombie_rpcs.next,
+ srpc_client_rpc_t, crpc_list);
+ list_del(&rpc->crpc_list);
+
+ LIBCFS_FREE(rpc, srpc_client_rpc_size(rpc));
+ }
+
+ for (i = 0; ; i++) {
+ sv = &sfw_services[i];
+ if (sv->sv_name == NULL)
+ break;
+
+ srpc_wait_service_shutdown(sv);
+ }
+
+ while (!list_empty(&sfw_data.fw_tests)) {
+ tsc = list_entry(sfw_data.fw_tests.next,
+ sfw_test_case_t, tsc_list);
+
+ srpc_wait_service_shutdown(tsc->tsc_srv_service);
+
+ list_del(&tsc->tsc_list);
+ LIBCFS_FREE(tsc, sizeof(*tsc));
+ }
+
+ return;
+}
diff --git a/kernel/drivers/staging/lustre/lnet/selftest/module.c b/kernel/drivers/staging/lustre/lnet/selftest/module.c
new file mode 100644
index 000000000..7ad62f167
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lnet/selftest/module.c
@@ -0,0 +1,159 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include "selftest.h"
+
+enum {
+ LST_INIT_NONE = 0,
+ LST_INIT_WI_SERIAL,
+ LST_INIT_WI_TEST,
+ LST_INIT_RPC,
+ LST_INIT_FW,
+ LST_INIT_CONSOLE
+};
+
+extern int lstcon_console_init(void);
+extern int lstcon_console_fini(void);
+
+static int lst_init_step = LST_INIT_NONE;
+
+struct cfs_wi_sched *lst_sched_serial;
+struct cfs_wi_sched **lst_sched_test;
+
+static void
+lnet_selftest_fini(void)
+{
+ int i;
+
+ switch (lst_init_step) {
+ case LST_INIT_CONSOLE:
+ lstcon_console_fini();
+ case LST_INIT_FW:
+ sfw_shutdown();
+ case LST_INIT_RPC:
+ srpc_shutdown();
+ case LST_INIT_WI_TEST:
+ for (i = 0;
+ i < cfs_cpt_number(lnet_cpt_table()); i++) {
+ if (lst_sched_test[i] == NULL)
+ continue;
+ cfs_wi_sched_destroy(lst_sched_test[i]);
+ }
+ LIBCFS_FREE(lst_sched_test,
+ sizeof(lst_sched_test[0]) *
+ cfs_cpt_number(lnet_cpt_table()));
+ lst_sched_test = NULL;
+
+ case LST_INIT_WI_SERIAL:
+ cfs_wi_sched_destroy(lst_sched_serial);
+ lst_sched_serial = NULL;
+ case LST_INIT_NONE:
+ break;
+ default:
+ LBUG();
+ }
+}
+
+static int
+lnet_selftest_init(void)
+{
+ int nscheds;
+ int rc;
+ int i;
+
+ rc = cfs_wi_sched_create("lst_s", lnet_cpt_table(), CFS_CPT_ANY,
+ 1, &lst_sched_serial);
+ if (rc != 0) {
+ CERROR("Failed to create serial WI scheduler for LST\n");
+ return rc;
+ }
+ lst_init_step = LST_INIT_WI_SERIAL;
+
+ nscheds = cfs_cpt_number(lnet_cpt_table());
+ LIBCFS_ALLOC(lst_sched_test, sizeof(lst_sched_test[0]) * nscheds);
+ if (lst_sched_test == NULL)
+ goto error;
+
+ lst_init_step = LST_INIT_WI_TEST;
+ for (i = 0; i < nscheds; i++) {
+ int nthrs = cfs_cpt_weight(lnet_cpt_table(), i);
+
+ /* reserve at least one CPU for LND */
+ nthrs = max(nthrs - 1, 1);
+ rc = cfs_wi_sched_create("lst_t", lnet_cpt_table(), i,
+ nthrs, &lst_sched_test[i]);
+ if (rc != 0) {
+ CERROR("Failed to create CPT affinity WI scheduler %d for LST\n",
+ i);
+ goto error;
+ }
+ }
+
+ rc = srpc_startup();
+ if (rc != 0) {
+ CERROR("LST can't startup rpc\n");
+ goto error;
+ }
+ lst_init_step = LST_INIT_RPC;
+
+ rc = sfw_startup();
+ if (rc != 0) {
+ CERROR("LST can't startup framework\n");
+ goto error;
+ }
+ lst_init_step = LST_INIT_FW;
+
+ rc = lstcon_console_init();
+ if (rc != 0) {
+ CERROR("LST can't startup console\n");
+ goto error;
+ }
+ lst_init_step = LST_INIT_CONSOLE;
+ return 0;
+error:
+ lnet_selftest_fini();
+ return rc;
+}
+
+
+MODULE_DESCRIPTION("LNet Selftest");
+MODULE_LICENSE("GPL");
+MODULE_VERSION("0.9.0");
+
+module_init(lnet_selftest_init);
+module_exit(lnet_selftest_fini);
diff --git a/kernel/drivers/staging/lustre/lnet/selftest/ping_test.c b/kernel/drivers/staging/lustre/lnet/selftest/ping_test.c
new file mode 100644
index 000000000..644069a9f
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lnet/selftest/ping_test.c
@@ -0,0 +1,230 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/conctl.c
+ *
+ * Test client & Server
+ *
+ * Author: Liang Zhen <liangzhen@clusterfs.com>
+ */
+
+#include "selftest.h"
+
+#define LST_PING_TEST_MAGIC 0xbabeface
+
+static int ping_srv_workitems = SFW_TEST_WI_MAX;
+module_param(ping_srv_workitems, int, 0644);
+MODULE_PARM_DESC(ping_srv_workitems, "# PING server workitems");
+
+typedef struct {
+ spinlock_t pnd_lock; /* serialize */
+ int pnd_counter; /* sequence counter */
+} lst_ping_data_t;
+
+static lst_ping_data_t lst_ping_data;
+
+static int
+ping_client_init(sfw_test_instance_t *tsi)
+{
+ sfw_session_t *sn = tsi->tsi_batch->bat_session;
+
+ LASSERT(tsi->tsi_is_client);
+ LASSERT(sn != NULL && (sn->sn_features & ~LST_FEATS_MASK) == 0);
+
+ spin_lock_init(&lst_ping_data.pnd_lock);
+ lst_ping_data.pnd_counter = 0;
+
+ return 0;
+}
+
+static void
+ping_client_fini(sfw_test_instance_t *tsi)
+{
+ sfw_session_t *sn = tsi->tsi_batch->bat_session;
+ int errors;
+
+ LASSERT(sn != NULL);
+ LASSERT(tsi->tsi_is_client);
+
+ errors = atomic_read(&sn->sn_ping_errors);
+ if (errors)
+ CWARN("%d pings have failed.\n", errors);
+ else
+ CDEBUG(D_NET, "Ping test finished OK.\n");
+}
+
+static int
+ping_client_prep_rpc(sfw_test_unit_t *tsu,
+ lnet_process_id_t dest, srpc_client_rpc_t **rpc)
+{
+ srpc_ping_reqst_t *req;
+ sfw_test_instance_t *tsi = tsu->tsu_instance;
+ sfw_session_t *sn = tsi->tsi_batch->bat_session;
+ struct timeval tv;
+ int rc;
+
+ LASSERT(sn != NULL);
+ LASSERT((sn->sn_features & ~LST_FEATS_MASK) == 0);
+
+ rc = sfw_create_test_rpc(tsu, dest, sn->sn_features, 0, 0, rpc);
+ if (rc != 0)
+ return rc;
+
+ req = &(*rpc)->crpc_reqstmsg.msg_body.ping_reqst;
+
+ req->pnr_magic = LST_PING_TEST_MAGIC;
+
+ spin_lock(&lst_ping_data.pnd_lock);
+ req->pnr_seq = lst_ping_data.pnd_counter++;
+ spin_unlock(&lst_ping_data.pnd_lock);
+
+ cfs_fs_timeval(&tv);
+ req->pnr_time_sec = tv.tv_sec;
+ req->pnr_time_usec = tv.tv_usec;
+
+ return rc;
+}
+
+static void
+ping_client_done_rpc(sfw_test_unit_t *tsu, srpc_client_rpc_t *rpc)
+{
+ sfw_test_instance_t *tsi = tsu->tsu_instance;
+ sfw_session_t *sn = tsi->tsi_batch->bat_session;
+ srpc_ping_reqst_t *reqst = &rpc->crpc_reqstmsg.msg_body.ping_reqst;
+ srpc_ping_reply_t *reply = &rpc->crpc_replymsg.msg_body.ping_reply;
+ struct timeval tv;
+
+ LASSERT(sn != NULL);
+
+ if (rpc->crpc_status != 0) {
+ if (!tsi->tsi_stopping) /* rpc could have been aborted */
+ atomic_inc(&sn->sn_ping_errors);
+ CERROR("Unable to ping %s (%d): %d\n",
+ libcfs_id2str(rpc->crpc_dest),
+ reqst->pnr_seq, rpc->crpc_status);
+ return;
+ }
+
+ if (rpc->crpc_replymsg.msg_magic != SRPC_MSG_MAGIC) {
+ __swab32s(&reply->pnr_seq);
+ __swab32s(&reply->pnr_magic);
+ __swab32s(&reply->pnr_status);
+ }
+
+ if (reply->pnr_magic != LST_PING_TEST_MAGIC) {
+ rpc->crpc_status = -EBADMSG;
+ atomic_inc(&sn->sn_ping_errors);
+ CERROR("Bad magic %u from %s, %u expected.\n",
+ reply->pnr_magic, libcfs_id2str(rpc->crpc_dest),
+ LST_PING_TEST_MAGIC);
+ return;
+ }
+
+ if (reply->pnr_seq != reqst->pnr_seq) {
+ rpc->crpc_status = -EBADMSG;
+ atomic_inc(&sn->sn_ping_errors);
+ CERROR("Bad seq %u from %s, %u expected.\n",
+ reply->pnr_seq, libcfs_id2str(rpc->crpc_dest),
+ reqst->pnr_seq);
+ return;
+ }
+
+ cfs_fs_timeval(&tv);
+ CDEBUG(D_NET, "%d reply in %u usec\n", reply->pnr_seq,
+ (unsigned)((tv.tv_sec - (unsigned)reqst->pnr_time_sec) * 1000000
+ + (tv.tv_usec - reqst->pnr_time_usec)));
+ return;
+}
+
+static int
+ping_server_handle(struct srpc_server_rpc *rpc)
+{
+ struct srpc_service *sv = rpc->srpc_scd->scd_svc;
+ srpc_msg_t *reqstmsg = &rpc->srpc_reqstbuf->buf_msg;
+ srpc_msg_t *replymsg = &rpc->srpc_replymsg;
+ srpc_ping_reqst_t *req = &reqstmsg->msg_body.ping_reqst;
+ srpc_ping_reply_t *rep = &rpc->srpc_replymsg.msg_body.ping_reply;
+
+ LASSERT(sv->sv_id == SRPC_SERVICE_PING);
+
+ if (reqstmsg->msg_magic != SRPC_MSG_MAGIC) {
+ LASSERT(reqstmsg->msg_magic == __swab32(SRPC_MSG_MAGIC));
+
+ __swab32s(&req->pnr_seq);
+ __swab32s(&req->pnr_magic);
+ __swab64s(&req->pnr_time_sec);
+ __swab64s(&req->pnr_time_usec);
+ }
+ LASSERT(reqstmsg->msg_type == srpc_service2request(sv->sv_id));
+
+ if (req->pnr_magic != LST_PING_TEST_MAGIC) {
+ CERROR("Unexpected magic %08x from %s\n",
+ req->pnr_magic, libcfs_id2str(rpc->srpc_peer));
+ return -EINVAL;
+ }
+
+ rep->pnr_seq = req->pnr_seq;
+ rep->pnr_magic = LST_PING_TEST_MAGIC;
+
+ if ((reqstmsg->msg_ses_feats & ~LST_FEATS_MASK) != 0) {
+ replymsg->msg_ses_feats = LST_FEATS_MASK;
+ rep->pnr_status = EPROTO;
+ return 0;
+ }
+
+ replymsg->msg_ses_feats = reqstmsg->msg_ses_feats;
+
+ CDEBUG(D_NET, "Get ping %d from %s\n",
+ req->pnr_seq, libcfs_id2str(rpc->srpc_peer));
+ return 0;
+}
+
+sfw_test_client_ops_t ping_test_client;
+void ping_init_test_client(void)
+{
+ ping_test_client.tso_init = ping_client_init;
+ ping_test_client.tso_fini = ping_client_fini;
+ ping_test_client.tso_prep_rpc = ping_client_prep_rpc;
+ ping_test_client.tso_done_rpc = ping_client_done_rpc;
+}
+
+srpc_service_t ping_test_service;
+void ping_init_test_service(void)
+{
+ ping_test_service.sv_id = SRPC_SERVICE_PING;
+ ping_test_service.sv_name = "ping_test";
+ ping_test_service.sv_handler = ping_server_handle;
+ ping_test_service.sv_wi_total = ping_srv_workitems;
+}
diff --git a/kernel/drivers/staging/lustre/lnet/selftest/rpc.c b/kernel/drivers/staging/lustre/lnet/selftest/rpc.c
new file mode 100644
index 000000000..080788ab7
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lnet/selftest/rpc.c
@@ -0,0 +1,1673 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/rpc.c
+ *
+ * Author: Isaac Huang <isaac@clusterfs.com>
+ *
+ * 2012-05-13: Liang Zhen <liang@whamcloud.com>
+ * - percpt data for service to improve smp performance
+ * - code cleanup
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include "selftest.h"
+
+typedef enum {
+ SRPC_STATE_NONE,
+ SRPC_STATE_NI_INIT,
+ SRPC_STATE_EQ_INIT,
+ SRPC_STATE_RUNNING,
+ SRPC_STATE_STOPPING,
+} srpc_state_t;
+
+static struct smoketest_rpc {
+ spinlock_t rpc_glock; /* global lock */
+ srpc_service_t *rpc_services[SRPC_SERVICE_MAX_ID + 1];
+ lnet_handle_eq_t rpc_lnet_eq; /* _the_ LNet event queue */
+ srpc_state_t rpc_state;
+ srpc_counters_t rpc_counters;
+ __u64 rpc_matchbits; /* matchbits counter */
+} srpc_data;
+
+static inline int
+srpc_serv_portal(int svc_id)
+{
+ return svc_id < SRPC_FRAMEWORK_SERVICE_MAX_ID ?
+ SRPC_FRAMEWORK_REQUEST_PORTAL : SRPC_REQUEST_PORTAL;
+}
+
+/* forward ref's */
+int srpc_handle_rpc(swi_workitem_t *wi);
+
+void srpc_get_counters(srpc_counters_t *cnt)
+{
+ spin_lock(&srpc_data.rpc_glock);
+ *cnt = srpc_data.rpc_counters;
+ spin_unlock(&srpc_data.rpc_glock);
+}
+
+void srpc_set_counters(const srpc_counters_t *cnt)
+{
+ spin_lock(&srpc_data.rpc_glock);
+ srpc_data.rpc_counters = *cnt;
+ spin_unlock(&srpc_data.rpc_glock);
+}
+
+static int
+srpc_add_bulk_page(srpc_bulk_t *bk, struct page *pg, int i, int nob)
+{
+ nob = min(nob, (int)PAGE_CACHE_SIZE);
+
+ LASSERT(nob > 0);
+ LASSERT(i >= 0 && i < bk->bk_niov);
+
+ bk->bk_iovs[i].kiov_offset = 0;
+ bk->bk_iovs[i].kiov_page = pg;
+ bk->bk_iovs[i].kiov_len = nob;
+ return nob;
+}
+
+void
+srpc_free_bulk(srpc_bulk_t *bk)
+{
+ int i;
+ struct page *pg;
+
+ LASSERT(bk != NULL);
+
+ for (i = 0; i < bk->bk_niov; i++) {
+ pg = bk->bk_iovs[i].kiov_page;
+ if (pg == NULL)
+ break;
+
+ __free_page(pg);
+ }
+
+ LIBCFS_FREE(bk, offsetof(srpc_bulk_t, bk_iovs[bk->bk_niov]));
+ return;
+}
+
+srpc_bulk_t *
+srpc_alloc_bulk(int cpt, unsigned bulk_npg, unsigned bulk_len, int sink)
+{
+ srpc_bulk_t *bk;
+ int i;
+
+ LASSERT(bulk_npg > 0 && bulk_npg <= LNET_MAX_IOV);
+
+ LIBCFS_CPT_ALLOC(bk, lnet_cpt_table(), cpt,
+ offsetof(srpc_bulk_t, bk_iovs[bulk_npg]));
+ if (bk == NULL) {
+ CERROR("Can't allocate descriptor for %d pages\n", bulk_npg);
+ return NULL;
+ }
+
+ memset(bk, 0, offsetof(srpc_bulk_t, bk_iovs[bulk_npg]));
+ bk->bk_sink = sink;
+ bk->bk_len = bulk_len;
+ bk->bk_niov = bulk_npg;
+
+ for (i = 0; i < bulk_npg; i++) {
+ struct page *pg;
+ int nob;
+
+ pg = alloc_pages_node(cfs_cpt_spread_node(lnet_cpt_table(), cpt),
+ GFP_IOFS, 0);
+ if (pg == NULL) {
+ CERROR("Can't allocate page %d of %d\n", i, bulk_npg);
+ srpc_free_bulk(bk);
+ return NULL;
+ }
+
+ nob = srpc_add_bulk_page(bk, pg, i, bulk_len);
+ bulk_len -= nob;
+ }
+
+ return bk;
+}
+
+static inline __u64
+srpc_next_id(void)
+{
+ __u64 id;
+
+ spin_lock(&srpc_data.rpc_glock);
+ id = srpc_data.rpc_matchbits++;
+ spin_unlock(&srpc_data.rpc_glock);
+ return id;
+}
+
+static void
+srpc_init_server_rpc(struct srpc_server_rpc *rpc,
+ struct srpc_service_cd *scd,
+ struct srpc_buffer *buffer)
+{
+ memset(rpc, 0, sizeof(*rpc));
+ swi_init_workitem(&rpc->srpc_wi, rpc, srpc_handle_rpc,
+ srpc_serv_is_framework(scd->scd_svc) ?
+ lst_sched_serial : lst_sched_test[scd->scd_cpt]);
+
+ rpc->srpc_ev.ev_fired = 1; /* no event expected now */
+
+ rpc->srpc_scd = scd;
+ rpc->srpc_reqstbuf = buffer;
+ rpc->srpc_peer = buffer->buf_peer;
+ rpc->srpc_self = buffer->buf_self;
+ LNetInvalidateHandle(&rpc->srpc_replymdh);
+}
+
+static void
+srpc_service_fini(struct srpc_service *svc)
+{
+ struct srpc_service_cd *scd;
+ struct srpc_server_rpc *rpc;
+ struct srpc_buffer *buf;
+ struct list_head *q;
+ int i;
+
+ if (svc->sv_cpt_data == NULL)
+ return;
+
+ cfs_percpt_for_each(scd, i, svc->sv_cpt_data) {
+ while (1) {
+ if (!list_empty(&scd->scd_buf_posted))
+ q = &scd->scd_buf_posted;
+ else if (!list_empty(&scd->scd_buf_blocked))
+ q = &scd->scd_buf_blocked;
+ else
+ break;
+
+ while (!list_empty(q)) {
+ buf = list_entry(q->next,
+ struct srpc_buffer,
+ buf_list);
+ list_del(&buf->buf_list);
+ LIBCFS_FREE(buf, sizeof(*buf));
+ }
+ }
+
+ LASSERT(list_empty(&scd->scd_rpc_active));
+
+ while (!list_empty(&scd->scd_rpc_free)) {
+ rpc = list_entry(scd->scd_rpc_free.next,
+ struct srpc_server_rpc,
+ srpc_list);
+ list_del(&rpc->srpc_list);
+ LIBCFS_FREE(rpc, sizeof(*rpc));
+ }
+ }
+
+ cfs_percpt_free(svc->sv_cpt_data);
+ svc->sv_cpt_data = NULL;
+}
+
+static int
+srpc_service_nrpcs(struct srpc_service *svc)
+{
+ int nrpcs = svc->sv_wi_total / svc->sv_ncpts;
+
+ return srpc_serv_is_framework(svc) ?
+ max(nrpcs, SFW_FRWK_WI_MIN) : max(nrpcs, SFW_TEST_WI_MIN);
+}
+
+int srpc_add_buffer(struct swi_workitem *wi);
+
+static int
+srpc_service_init(struct srpc_service *svc)
+{
+ struct srpc_service_cd *scd;
+ struct srpc_server_rpc *rpc;
+ int nrpcs;
+ int i;
+ int j;
+
+ svc->sv_shuttingdown = 0;
+
+ svc->sv_cpt_data = cfs_percpt_alloc(lnet_cpt_table(),
+ sizeof(struct srpc_service_cd));
+ if (svc->sv_cpt_data == NULL)
+ return -ENOMEM;
+
+ svc->sv_ncpts = srpc_serv_is_framework(svc) ?
+ 1 : cfs_cpt_number(lnet_cpt_table());
+ nrpcs = srpc_service_nrpcs(svc);
+
+ cfs_percpt_for_each(scd, i, svc->sv_cpt_data) {
+ scd->scd_cpt = i;
+ scd->scd_svc = svc;
+ spin_lock_init(&scd->scd_lock);
+ INIT_LIST_HEAD(&scd->scd_rpc_free);
+ INIT_LIST_HEAD(&scd->scd_rpc_active);
+ INIT_LIST_HEAD(&scd->scd_buf_posted);
+ INIT_LIST_HEAD(&scd->scd_buf_blocked);
+
+ scd->scd_ev.ev_data = scd;
+ scd->scd_ev.ev_type = SRPC_REQUEST_RCVD;
+
+ /* NB: don't use lst_sched_serial for adding buffer,
+ * see details in srpc_service_add_buffers() */
+ swi_init_workitem(&scd->scd_buf_wi, scd,
+ srpc_add_buffer, lst_sched_test[i]);
+
+ if (i != 0 && srpc_serv_is_framework(svc)) {
+ /* NB: framework service only needs srpc_service_cd for
+ * one partition, but we allocate for all to make
+ * it easier to implement, it will waste a little
+ * memory but nobody should care about this */
+ continue;
+ }
+
+ for (j = 0; j < nrpcs; j++) {
+ LIBCFS_CPT_ALLOC(rpc, lnet_cpt_table(),
+ i, sizeof(*rpc));
+ if (rpc == NULL) {
+ srpc_service_fini(svc);
+ return -ENOMEM;
+ }
+ list_add(&rpc->srpc_list, &scd->scd_rpc_free);
+ }
+ }
+
+ return 0;
+}
+
+int
+srpc_add_service(struct srpc_service *sv)
+{
+ int id = sv->sv_id;
+
+ LASSERT(0 <= id && id <= SRPC_SERVICE_MAX_ID);
+
+ if (srpc_service_init(sv) != 0)
+ return -ENOMEM;
+
+ spin_lock(&srpc_data.rpc_glock);
+
+ LASSERT(srpc_data.rpc_state == SRPC_STATE_RUNNING);
+
+ if (srpc_data.rpc_services[id] != NULL) {
+ spin_unlock(&srpc_data.rpc_glock);
+ goto failed;
+ }
+
+ srpc_data.rpc_services[id] = sv;
+ spin_unlock(&srpc_data.rpc_glock);
+
+ CDEBUG(D_NET, "Adding service: id %d, name %s\n", id, sv->sv_name);
+ return 0;
+
+ failed:
+ srpc_service_fini(sv);
+ return -EBUSY;
+}
+
+int
+srpc_remove_service(srpc_service_t *sv)
+{
+ int id = sv->sv_id;
+
+ spin_lock(&srpc_data.rpc_glock);
+
+ if (srpc_data.rpc_services[id] != sv) {
+ spin_unlock(&srpc_data.rpc_glock);
+ return -ENOENT;
+ }
+
+ srpc_data.rpc_services[id] = NULL;
+ spin_unlock(&srpc_data.rpc_glock);
+ return 0;
+}
+
+static int
+srpc_post_passive_rdma(int portal, int local, __u64 matchbits, void *buf,
+ int len, int options, lnet_process_id_t peer,
+ lnet_handle_md_t *mdh, srpc_event_t *ev)
+{
+ int rc;
+ lnet_md_t md;
+ lnet_handle_me_t meh;
+
+ rc = LNetMEAttach(portal, peer, matchbits, 0, LNET_UNLINK,
+ local ? LNET_INS_LOCAL : LNET_INS_AFTER, &meh);
+ if (rc != 0) {
+ CERROR("LNetMEAttach failed: %d\n", rc);
+ LASSERT(rc == -ENOMEM);
+ return -ENOMEM;
+ }
+
+ md.threshold = 1;
+ md.user_ptr = ev;
+ md.start = buf;
+ md.length = len;
+ md.options = options;
+ md.eq_handle = srpc_data.rpc_lnet_eq;
+
+ rc = LNetMDAttach(meh, md, LNET_UNLINK, mdh);
+ if (rc != 0) {
+ CERROR("LNetMDAttach failed: %d\n", rc);
+ LASSERT(rc == -ENOMEM);
+
+ rc = LNetMEUnlink(meh);
+ LASSERT(rc == 0);
+ return -ENOMEM;
+ }
+
+ CDEBUG(D_NET,
+ "Posted passive RDMA: peer %s, portal %d, matchbits %#llx\n",
+ libcfs_id2str(peer), portal, matchbits);
+ return 0;
+}
+
+static int
+srpc_post_active_rdma(int portal, __u64 matchbits, void *buf, int len,
+ int options, lnet_process_id_t peer, lnet_nid_t self,
+ lnet_handle_md_t *mdh, srpc_event_t *ev)
+{
+ int rc;
+ lnet_md_t md;
+
+ md.user_ptr = ev;
+ md.start = buf;
+ md.length = len;
+ md.eq_handle = srpc_data.rpc_lnet_eq;
+ md.threshold = ((options & LNET_MD_OP_GET) != 0) ? 2 : 1;
+ md.options = options & ~(LNET_MD_OP_PUT | LNET_MD_OP_GET);
+
+ rc = LNetMDBind(md, LNET_UNLINK, mdh);
+ if (rc != 0) {
+ CERROR("LNetMDBind failed: %d\n", rc);
+ LASSERT(rc == -ENOMEM);
+ return -ENOMEM;
+ }
+
+ /* this is kind of an abuse of the LNET_MD_OP_{PUT,GET} options.
+ * they're only meaningful for MDs attached to an ME (i.e. passive
+ * buffers... */
+ if ((options & LNET_MD_OP_PUT) != 0) {
+ rc = LNetPut(self, *mdh, LNET_NOACK_REQ, peer,
+ portal, matchbits, 0, 0);
+ } else {
+ LASSERT((options & LNET_MD_OP_GET) != 0);
+
+ rc = LNetGet(self, *mdh, peer, portal, matchbits, 0);
+ }
+
+ if (rc != 0) {
+ CERROR("LNet%s(%s, %d, %lld) failed: %d\n",
+ ((options & LNET_MD_OP_PUT) != 0) ? "Put" : "Get",
+ libcfs_id2str(peer), portal, matchbits, rc);
+
+ /* The forthcoming unlink event will complete this operation
+ * with failure, so fall through and return success here.
+ */
+ rc = LNetMDUnlink(*mdh);
+ LASSERT(rc == 0);
+ } else {
+ CDEBUG(D_NET,
+ "Posted active RDMA: peer %s, portal %u, matchbits %#llx\n",
+ libcfs_id2str(peer), portal, matchbits);
+ }
+ return 0;
+}
+
+static int
+srpc_post_active_rqtbuf(lnet_process_id_t peer, int service, void *buf,
+ int len, lnet_handle_md_t *mdh, srpc_event_t *ev)
+{
+ return srpc_post_active_rdma(srpc_serv_portal(service), service,
+ buf, len, LNET_MD_OP_PUT, peer,
+ LNET_NID_ANY, mdh, ev);
+}
+
+static int
+srpc_post_passive_rqtbuf(int service, int local, void *buf, int len,
+ lnet_handle_md_t *mdh, srpc_event_t *ev)
+{
+ lnet_process_id_t any = {0};
+
+ any.nid = LNET_NID_ANY;
+ any.pid = LNET_PID_ANY;
+
+ return srpc_post_passive_rdma(srpc_serv_portal(service),
+ local, service, buf, len,
+ LNET_MD_OP_PUT, any, mdh, ev);
+}
+
+static int
+srpc_service_post_buffer(struct srpc_service_cd *scd, struct srpc_buffer *buf)
+ __must_hold(&scd->scd_lock)
+{
+ struct srpc_service *sv = scd->scd_svc;
+ struct srpc_msg *msg = &buf->buf_msg;
+ int rc;
+
+ LNetInvalidateHandle(&buf->buf_mdh);
+ list_add(&buf->buf_list, &scd->scd_buf_posted);
+ scd->scd_buf_nposted++;
+ spin_unlock(&scd->scd_lock);
+
+ rc = srpc_post_passive_rqtbuf(sv->sv_id,
+ !srpc_serv_is_framework(sv),
+ msg, sizeof(*msg), &buf->buf_mdh,
+ &scd->scd_ev);
+
+ /* At this point, a RPC (new or delayed) may have arrived in
+ * msg and its event handler has been called. So we must add
+ * buf to scd_buf_posted _before_ dropping scd_lock */
+
+ spin_lock(&scd->scd_lock);
+
+ if (rc == 0) {
+ if (!sv->sv_shuttingdown)
+ return 0;
+
+ spin_unlock(&scd->scd_lock);
+ /* srpc_shutdown_service might have tried to unlink me
+ * when my buf_mdh was still invalid */
+ LNetMDUnlink(buf->buf_mdh);
+ spin_lock(&scd->scd_lock);
+ return 0;
+ }
+
+ scd->scd_buf_nposted--;
+ if (sv->sv_shuttingdown)
+ return rc; /* don't allow to change scd_buf_posted */
+
+ list_del(&buf->buf_list);
+ spin_unlock(&scd->scd_lock);
+
+ LIBCFS_FREE(buf, sizeof(*buf));
+
+ spin_lock(&scd->scd_lock);
+ return rc;
+}
+
+int
+srpc_add_buffer(struct swi_workitem *wi)
+{
+ struct srpc_service_cd *scd = wi->swi_workitem.wi_data;
+ struct srpc_buffer *buf;
+ int rc = 0;
+
+ /* it's called by workitem scheduler threads, these threads
+ * should have been set CPT affinity, so buffers will be posted
+ * on CPT local list of Portal */
+ spin_lock(&scd->scd_lock);
+
+ while (scd->scd_buf_adjust > 0 &&
+ !scd->scd_svc->sv_shuttingdown) {
+ scd->scd_buf_adjust--; /* consume it */
+ scd->scd_buf_posting++;
+
+ spin_unlock(&scd->scd_lock);
+
+ LIBCFS_ALLOC(buf, sizeof(*buf));
+ if (buf == NULL) {
+ CERROR("Failed to add new buf to service: %s\n",
+ scd->scd_svc->sv_name);
+ spin_lock(&scd->scd_lock);
+ rc = -ENOMEM;
+ break;
+ }
+
+ spin_lock(&scd->scd_lock);
+ if (scd->scd_svc->sv_shuttingdown) {
+ spin_unlock(&scd->scd_lock);
+ LIBCFS_FREE(buf, sizeof(*buf));
+
+ spin_lock(&scd->scd_lock);
+ rc = -ESHUTDOWN;
+ break;
+ }
+
+ rc = srpc_service_post_buffer(scd, buf);
+ if (rc != 0)
+ break; /* buf has been freed inside */
+
+ LASSERT(scd->scd_buf_posting > 0);
+ scd->scd_buf_posting--;
+ scd->scd_buf_total++;
+ scd->scd_buf_low = max(2, scd->scd_buf_total / 4);
+ }
+
+ if (rc != 0) {
+ scd->scd_buf_err_stamp = get_seconds();
+ scd->scd_buf_err = rc;
+
+ LASSERT(scd->scd_buf_posting > 0);
+ scd->scd_buf_posting--;
+ }
+
+ spin_unlock(&scd->scd_lock);
+ return 0;
+}
+
+int
+srpc_service_add_buffers(struct srpc_service *sv, int nbuffer)
+{
+ struct srpc_service_cd *scd;
+ int rc = 0;
+ int i;
+
+ LASSERTF(nbuffer > 0, "nbuffer must be positive: %d\n", nbuffer);
+
+ cfs_percpt_for_each(scd, i, sv->sv_cpt_data) {
+ spin_lock(&scd->scd_lock);
+
+ scd->scd_buf_err = 0;
+ scd->scd_buf_err_stamp = 0;
+ scd->scd_buf_posting = 0;
+ scd->scd_buf_adjust = nbuffer;
+ /* start to post buffers */
+ swi_schedule_workitem(&scd->scd_buf_wi);
+ spin_unlock(&scd->scd_lock);
+
+ /* framework service only post buffer for one partition */
+ if (srpc_serv_is_framework(sv))
+ break;
+ }
+
+ cfs_percpt_for_each(scd, i, sv->sv_cpt_data) {
+ spin_lock(&scd->scd_lock);
+ /*
+ * NB: srpc_service_add_buffers() can be called inside
+ * thread context of lst_sched_serial, and we don't normally
+ * allow to sleep inside thread context of WI scheduler
+ * because it will block current scheduler thread from doing
+ * anything else, even worse, it could deadlock if it's
+ * waiting on result from another WI of the same scheduler.
+ * However, it's safe at here because scd_buf_wi is scheduled
+ * by thread in a different WI scheduler (lst_sched_test),
+ * so we don't have any risk of deadlock, though this could
+ * block all WIs pending on lst_sched_serial for a moment
+ * which is not good but not fatal.
+ */
+ lst_wait_until(scd->scd_buf_err != 0 ||
+ (scd->scd_buf_adjust == 0 &&
+ scd->scd_buf_posting == 0),
+ scd->scd_lock, "waiting for adding buffer\n");
+
+ if (scd->scd_buf_err != 0 && rc == 0)
+ rc = scd->scd_buf_err;
+
+ spin_unlock(&scd->scd_lock);
+ }
+
+ return rc;
+}
+
+void
+srpc_service_remove_buffers(struct srpc_service *sv, int nbuffer)
+{
+ struct srpc_service_cd *scd;
+ int num;
+ int i;
+
+ LASSERT(!sv->sv_shuttingdown);
+
+ cfs_percpt_for_each(scd, i, sv->sv_cpt_data) {
+ spin_lock(&scd->scd_lock);
+
+ num = scd->scd_buf_total + scd->scd_buf_posting;
+ scd->scd_buf_adjust -= min(nbuffer, num);
+
+ spin_unlock(&scd->scd_lock);
+ }
+}
+
+/* returns 1 if sv has finished, otherwise 0 */
+int
+srpc_finish_service(struct srpc_service *sv)
+{
+ struct srpc_service_cd *scd;
+ struct srpc_server_rpc *rpc;
+ int i;
+
+ LASSERT(sv->sv_shuttingdown); /* srpc_shutdown_service called */
+
+ cfs_percpt_for_each(scd, i, sv->sv_cpt_data) {
+ spin_lock(&scd->scd_lock);
+ if (!swi_deschedule_workitem(&scd->scd_buf_wi)) {
+ spin_unlock(&scd->scd_lock);
+ return 0;
+ }
+
+ if (scd->scd_buf_nposted > 0) {
+ CDEBUG(D_NET, "waiting for %d posted buffers to unlink",
+ scd->scd_buf_nposted);
+ spin_unlock(&scd->scd_lock);
+ return 0;
+ }
+
+ if (list_empty(&scd->scd_rpc_active)) {
+ spin_unlock(&scd->scd_lock);
+ continue;
+ }
+
+ rpc = list_entry(scd->scd_rpc_active.next,
+ struct srpc_server_rpc, srpc_list);
+ CNETERR("Active RPC %p on shutdown: sv %s, peer %s, wi %s scheduled %d running %d, ev fired %d type %d status %d lnet %d\n",
+ rpc, sv->sv_name, libcfs_id2str(rpc->srpc_peer),
+ swi_state2str(rpc->srpc_wi.swi_state),
+ rpc->srpc_wi.swi_workitem.wi_scheduled,
+ rpc->srpc_wi.swi_workitem.wi_running,
+ rpc->srpc_ev.ev_fired, rpc->srpc_ev.ev_type,
+ rpc->srpc_ev.ev_status, rpc->srpc_ev.ev_lnet);
+ spin_unlock(&scd->scd_lock);
+ return 0;
+ }
+
+ /* no lock needed from now on */
+ srpc_service_fini(sv);
+ return 1;
+}
+
+/* called with sv->sv_lock held */
+static void
+srpc_service_recycle_buffer(struct srpc_service_cd *scd, srpc_buffer_t *buf)
+ __must_hold(&scd->scd_lock)
+{
+ if (!scd->scd_svc->sv_shuttingdown && scd->scd_buf_adjust >= 0) {
+ if (srpc_service_post_buffer(scd, buf) != 0) {
+ CWARN("Failed to post %s buffer\n",
+ scd->scd_svc->sv_name);
+ }
+ return;
+ }
+
+ /* service is shutting down, or we want to recycle some buffers */
+ scd->scd_buf_total--;
+
+ if (scd->scd_buf_adjust < 0) {
+ scd->scd_buf_adjust++;
+ if (scd->scd_buf_adjust < 0 &&
+ scd->scd_buf_total == 0 && scd->scd_buf_posting == 0) {
+ CDEBUG(D_INFO,
+ "Try to recycle %d buffers but nothing left\n",
+ scd->scd_buf_adjust);
+ scd->scd_buf_adjust = 0;
+ }
+ }
+
+ spin_unlock(&scd->scd_lock);
+ LIBCFS_FREE(buf, sizeof(*buf));
+ spin_lock(&scd->scd_lock);
+}
+
+void
+srpc_abort_service(struct srpc_service *sv)
+{
+ struct srpc_service_cd *scd;
+ struct srpc_server_rpc *rpc;
+ int i;
+
+ CDEBUG(D_NET, "Aborting service: id %d, name %s\n",
+ sv->sv_id, sv->sv_name);
+
+ cfs_percpt_for_each(scd, i, sv->sv_cpt_data) {
+ spin_lock(&scd->scd_lock);
+
+ /* schedule in-flight RPCs to notice the abort, NB:
+ * racing with incoming RPCs; complete fix should make test
+ * RPCs carry session ID in its headers */
+ list_for_each_entry(rpc, &scd->scd_rpc_active, srpc_list) {
+ rpc->srpc_aborted = 1;
+ swi_schedule_workitem(&rpc->srpc_wi);
+ }
+
+ spin_unlock(&scd->scd_lock);
+ }
+}
+
+void
+srpc_shutdown_service(srpc_service_t *sv)
+{
+ struct srpc_service_cd *scd;
+ struct srpc_server_rpc *rpc;
+ srpc_buffer_t *buf;
+ int i;
+
+ CDEBUG(D_NET, "Shutting down service: id %d, name %s\n",
+ sv->sv_id, sv->sv_name);
+
+ cfs_percpt_for_each(scd, i, sv->sv_cpt_data)
+ spin_lock(&scd->scd_lock);
+
+ sv->sv_shuttingdown = 1; /* i.e. no new active RPC */
+
+ cfs_percpt_for_each(scd, i, sv->sv_cpt_data)
+ spin_unlock(&scd->scd_lock);
+
+ cfs_percpt_for_each(scd, i, sv->sv_cpt_data) {
+ spin_lock(&scd->scd_lock);
+
+ /* schedule in-flight RPCs to notice the shutdown */
+ list_for_each_entry(rpc, &scd->scd_rpc_active, srpc_list)
+ swi_schedule_workitem(&rpc->srpc_wi);
+
+ spin_unlock(&scd->scd_lock);
+
+ /* OK to traverse scd_buf_posted without lock, since no one
+ * touches scd_buf_posted now */
+ list_for_each_entry(buf, &scd->scd_buf_posted, buf_list)
+ LNetMDUnlink(buf->buf_mdh);
+ }
+}
+
+static int
+srpc_send_request(srpc_client_rpc_t *rpc)
+{
+ srpc_event_t *ev = &rpc->crpc_reqstev;
+ int rc;
+
+ ev->ev_fired = 0;
+ ev->ev_data = rpc;
+ ev->ev_type = SRPC_REQUEST_SENT;
+
+ rc = srpc_post_active_rqtbuf(rpc->crpc_dest, rpc->crpc_service,
+ &rpc->crpc_reqstmsg, sizeof(srpc_msg_t),
+ &rpc->crpc_reqstmdh, ev);
+ if (rc != 0) {
+ LASSERT(rc == -ENOMEM);
+ ev->ev_fired = 1; /* no more event expected */
+ }
+ return rc;
+}
+
+static int
+srpc_prepare_reply(srpc_client_rpc_t *rpc)
+{
+ srpc_event_t *ev = &rpc->crpc_replyev;
+ __u64 *id = &rpc->crpc_reqstmsg.msg_body.reqst.rpyid;
+ int rc;
+
+ ev->ev_fired = 0;
+ ev->ev_data = rpc;
+ ev->ev_type = SRPC_REPLY_RCVD;
+
+ *id = srpc_next_id();
+
+ rc = srpc_post_passive_rdma(SRPC_RDMA_PORTAL, 0, *id,
+ &rpc->crpc_replymsg, sizeof(srpc_msg_t),
+ LNET_MD_OP_PUT, rpc->crpc_dest,
+ &rpc->crpc_replymdh, ev);
+ if (rc != 0) {
+ LASSERT(rc == -ENOMEM);
+ ev->ev_fired = 1; /* no more event expected */
+ }
+ return rc;
+}
+
+static int
+srpc_prepare_bulk(srpc_client_rpc_t *rpc)
+{
+ srpc_bulk_t *bk = &rpc->crpc_bulk;
+ srpc_event_t *ev = &rpc->crpc_bulkev;
+ __u64 *id = &rpc->crpc_reqstmsg.msg_body.reqst.bulkid;
+ int rc;
+ int opt;
+
+ LASSERT(bk->bk_niov <= LNET_MAX_IOV);
+
+ if (bk->bk_niov == 0)
+ return 0; /* nothing to do */
+
+ opt = bk->bk_sink ? LNET_MD_OP_PUT : LNET_MD_OP_GET;
+ opt |= LNET_MD_KIOV;
+
+ ev->ev_fired = 0;
+ ev->ev_data = rpc;
+ ev->ev_type = SRPC_BULK_REQ_RCVD;
+
+ *id = srpc_next_id();
+
+ rc = srpc_post_passive_rdma(SRPC_RDMA_PORTAL, 0, *id,
+ &bk->bk_iovs[0], bk->bk_niov, opt,
+ rpc->crpc_dest, &bk->bk_mdh, ev);
+ if (rc != 0) {
+ LASSERT(rc == -ENOMEM);
+ ev->ev_fired = 1; /* no more event expected */
+ }
+ return rc;
+}
+
+static int
+srpc_do_bulk(srpc_server_rpc_t *rpc)
+{
+ srpc_event_t *ev = &rpc->srpc_ev;
+ srpc_bulk_t *bk = rpc->srpc_bulk;
+ __u64 id = rpc->srpc_reqstbuf->buf_msg.msg_body.reqst.bulkid;
+ int rc;
+ int opt;
+
+ LASSERT(bk != NULL);
+
+ opt = bk->bk_sink ? LNET_MD_OP_GET : LNET_MD_OP_PUT;
+ opt |= LNET_MD_KIOV;
+
+ ev->ev_fired = 0;
+ ev->ev_data = rpc;
+ ev->ev_type = bk->bk_sink ? SRPC_BULK_GET_RPLD : SRPC_BULK_PUT_SENT;
+
+ rc = srpc_post_active_rdma(SRPC_RDMA_PORTAL, id,
+ &bk->bk_iovs[0], bk->bk_niov, opt,
+ rpc->srpc_peer, rpc->srpc_self,
+ &bk->bk_mdh, ev);
+ if (rc != 0)
+ ev->ev_fired = 1; /* no more event expected */
+ return rc;
+}
+
+/* only called from srpc_handle_rpc */
+static void
+srpc_server_rpc_done(srpc_server_rpc_t *rpc, int status)
+{
+ struct srpc_service_cd *scd = rpc->srpc_scd;
+ struct srpc_service *sv = scd->scd_svc;
+ srpc_buffer_t *buffer;
+
+ LASSERT(status != 0 || rpc->srpc_wi.swi_state == SWI_STATE_DONE);
+
+ rpc->srpc_status = status;
+
+ CDEBUG_LIMIT(status == 0 ? D_NET : D_NETERROR,
+ "Server RPC %p done: service %s, peer %s, status %s:%d\n",
+ rpc, sv->sv_name, libcfs_id2str(rpc->srpc_peer),
+ swi_state2str(rpc->srpc_wi.swi_state), status);
+
+ if (status != 0) {
+ spin_lock(&srpc_data.rpc_glock);
+ srpc_data.rpc_counters.rpcs_dropped++;
+ spin_unlock(&srpc_data.rpc_glock);
+ }
+
+ if (rpc->srpc_done != NULL)
+ (*rpc->srpc_done) (rpc);
+ LASSERT(rpc->srpc_bulk == NULL);
+
+ spin_lock(&scd->scd_lock);
+
+ if (rpc->srpc_reqstbuf != NULL) {
+ /* NB might drop sv_lock in srpc_service_recycle_buffer, but
+ * sv won't go away for scd_rpc_active must not be empty */
+ srpc_service_recycle_buffer(scd, rpc->srpc_reqstbuf);
+ rpc->srpc_reqstbuf = NULL;
+ }
+
+ list_del(&rpc->srpc_list); /* from scd->scd_rpc_active */
+
+ /*
+ * No one can schedule me now since:
+ * - I'm not on scd_rpc_active.
+ * - all LNet events have been fired.
+ * Cancel pending schedules and prevent future schedule attempts:
+ */
+ LASSERT(rpc->srpc_ev.ev_fired);
+ swi_exit_workitem(&rpc->srpc_wi);
+
+ if (!sv->sv_shuttingdown && !list_empty(&scd->scd_buf_blocked)) {
+ buffer = list_entry(scd->scd_buf_blocked.next,
+ srpc_buffer_t, buf_list);
+ list_del(&buffer->buf_list);
+
+ srpc_init_server_rpc(rpc, scd, buffer);
+ list_add_tail(&rpc->srpc_list, &scd->scd_rpc_active);
+ swi_schedule_workitem(&rpc->srpc_wi);
+ } else {
+ list_add(&rpc->srpc_list, &scd->scd_rpc_free);
+ }
+
+ spin_unlock(&scd->scd_lock);
+ return;
+}
+
+/* handles an incoming RPC */
+int
+srpc_handle_rpc(swi_workitem_t *wi)
+{
+ struct srpc_server_rpc *rpc = wi->swi_workitem.wi_data;
+ struct srpc_service_cd *scd = rpc->srpc_scd;
+ struct srpc_service *sv = scd->scd_svc;
+ srpc_event_t *ev = &rpc->srpc_ev;
+ int rc = 0;
+
+ LASSERT(wi == &rpc->srpc_wi);
+
+ spin_lock(&scd->scd_lock);
+
+ if (sv->sv_shuttingdown || rpc->srpc_aborted) {
+ spin_unlock(&scd->scd_lock);
+
+ if (rpc->srpc_bulk != NULL)
+ LNetMDUnlink(rpc->srpc_bulk->bk_mdh);
+ LNetMDUnlink(rpc->srpc_replymdh);
+
+ if (ev->ev_fired) { /* no more event, OK to finish */
+ srpc_server_rpc_done(rpc, -ESHUTDOWN);
+ return 1;
+ }
+ return 0;
+ }
+
+ spin_unlock(&scd->scd_lock);
+
+ switch (wi->swi_state) {
+ default:
+ LBUG();
+ case SWI_STATE_NEWBORN: {
+ srpc_msg_t *msg;
+ srpc_generic_reply_t *reply;
+
+ msg = &rpc->srpc_reqstbuf->buf_msg;
+ reply = &rpc->srpc_replymsg.msg_body.reply;
+
+ if (msg->msg_magic == 0) {
+ /* moaned already in srpc_lnet_ev_handler */
+ srpc_server_rpc_done(rpc, EBADMSG);
+ return 1;
+ }
+
+ srpc_unpack_msg_hdr(msg);
+ if (msg->msg_version != SRPC_MSG_VERSION) {
+ CWARN("Version mismatch: %u, %u expected, from %s\n",
+ msg->msg_version, SRPC_MSG_VERSION,
+ libcfs_id2str(rpc->srpc_peer));
+ reply->status = EPROTO;
+ /* drop through and send reply */
+ } else {
+ reply->status = 0;
+ rc = (*sv->sv_handler)(rpc);
+ LASSERT(reply->status == 0 || !rpc->srpc_bulk);
+ if (rc != 0) {
+ srpc_server_rpc_done(rpc, rc);
+ return 1;
+ }
+ }
+
+ wi->swi_state = SWI_STATE_BULK_STARTED;
+
+ if (rpc->srpc_bulk != NULL) {
+ rc = srpc_do_bulk(rpc);
+ if (rc == 0)
+ return 0; /* wait for bulk */
+
+ LASSERT(ev->ev_fired);
+ ev->ev_status = rc;
+ }
+ }
+ case SWI_STATE_BULK_STARTED:
+ LASSERT(rpc->srpc_bulk == NULL || ev->ev_fired);
+
+ if (rpc->srpc_bulk != NULL) {
+ rc = ev->ev_status;
+
+ if (sv->sv_bulk_ready != NULL)
+ rc = (*sv->sv_bulk_ready) (rpc, rc);
+
+ if (rc != 0) {
+ srpc_server_rpc_done(rpc, rc);
+ return 1;
+ }
+ }
+
+ wi->swi_state = SWI_STATE_REPLY_SUBMITTED;
+ rc = srpc_send_reply(rpc);
+ if (rc == 0)
+ return 0; /* wait for reply */
+ srpc_server_rpc_done(rpc, rc);
+ return 1;
+
+ case SWI_STATE_REPLY_SUBMITTED:
+ if (!ev->ev_fired) {
+ CERROR("RPC %p: bulk %p, service %d\n",
+ rpc, rpc->srpc_bulk, sv->sv_id);
+ CERROR("Event: status %d, type %d, lnet %d\n",
+ ev->ev_status, ev->ev_type, ev->ev_lnet);
+ LASSERT(ev->ev_fired);
+ }
+
+ wi->swi_state = SWI_STATE_DONE;
+ srpc_server_rpc_done(rpc, ev->ev_status);
+ return 1;
+ }
+
+ return 0;
+}
+
+static void
+srpc_client_rpc_expired(void *data)
+{
+ srpc_client_rpc_t *rpc = data;
+
+ CWARN("Client RPC expired: service %d, peer %s, timeout %d.\n",
+ rpc->crpc_service, libcfs_id2str(rpc->crpc_dest),
+ rpc->crpc_timeout);
+
+ spin_lock(&rpc->crpc_lock);
+
+ rpc->crpc_timeout = 0;
+ srpc_abort_rpc(rpc, -ETIMEDOUT);
+
+ spin_unlock(&rpc->crpc_lock);
+
+ spin_lock(&srpc_data.rpc_glock);
+ srpc_data.rpc_counters.rpcs_expired++;
+ spin_unlock(&srpc_data.rpc_glock);
+}
+
+inline void
+srpc_add_client_rpc_timer(srpc_client_rpc_t *rpc)
+{
+ stt_timer_t *timer = &rpc->crpc_timer;
+
+ if (rpc->crpc_timeout == 0)
+ return;
+
+ INIT_LIST_HEAD(&timer->stt_list);
+ timer->stt_data = rpc;
+ timer->stt_func = srpc_client_rpc_expired;
+ timer->stt_expires = cfs_time_add(rpc->crpc_timeout,
+ get_seconds());
+ stt_add_timer(timer);
+ return;
+}
+
+/*
+ * Called with rpc->crpc_lock held.
+ *
+ * Upon exit the RPC expiry timer is not queued and the handler is not
+ * running on any CPU. */
+static void
+srpc_del_client_rpc_timer(srpc_client_rpc_t *rpc)
+{
+ /* timer not planted or already exploded */
+ if (rpc->crpc_timeout == 0)
+ return;
+
+ /* timer successfully defused */
+ if (stt_del_timer(&rpc->crpc_timer))
+ return;
+
+ /* timer detonated, wait for it to explode */
+ while (rpc->crpc_timeout != 0) {
+ spin_unlock(&rpc->crpc_lock);
+
+ schedule();
+
+ spin_lock(&rpc->crpc_lock);
+ }
+}
+
+static void
+srpc_client_rpc_done(srpc_client_rpc_t *rpc, int status)
+{
+ swi_workitem_t *wi = &rpc->crpc_wi;
+
+ LASSERT(status != 0 || wi->swi_state == SWI_STATE_DONE);
+
+ spin_lock(&rpc->crpc_lock);
+
+ rpc->crpc_closed = 1;
+ if (rpc->crpc_status == 0)
+ rpc->crpc_status = status;
+
+ srpc_del_client_rpc_timer(rpc);
+
+ CDEBUG_LIMIT((status == 0) ? D_NET : D_NETERROR,
+ "Client RPC done: service %d, peer %s, status %s:%d:%d\n",
+ rpc->crpc_service, libcfs_id2str(rpc->crpc_dest),
+ swi_state2str(wi->swi_state), rpc->crpc_aborted, status);
+
+ /*
+ * No one can schedule me now since:
+ * - RPC timer has been defused.
+ * - all LNet events have been fired.
+ * - crpc_closed has been set, preventing srpc_abort_rpc from
+ * scheduling me.
+ * Cancel pending schedules and prevent future schedule attempts:
+ */
+ LASSERT(!srpc_event_pending(rpc));
+ swi_exit_workitem(wi);
+
+ spin_unlock(&rpc->crpc_lock);
+
+ (*rpc->crpc_done)(rpc);
+ return;
+}
+
+/* sends an outgoing RPC */
+int
+srpc_send_rpc(swi_workitem_t *wi)
+{
+ int rc = 0;
+ srpc_client_rpc_t *rpc;
+ srpc_msg_t *reply;
+ int do_bulk;
+
+ LASSERT(wi != NULL);
+
+ rpc = wi->swi_workitem.wi_data;
+
+ LASSERT(rpc != NULL);
+ LASSERT(wi == &rpc->crpc_wi);
+
+ reply = &rpc->crpc_replymsg;
+ do_bulk = rpc->crpc_bulk.bk_niov > 0;
+
+ spin_lock(&rpc->crpc_lock);
+
+ if (rpc->crpc_aborted) {
+ spin_unlock(&rpc->crpc_lock);
+ goto abort;
+ }
+
+ spin_unlock(&rpc->crpc_lock);
+
+ switch (wi->swi_state) {
+ default:
+ LBUG();
+ case SWI_STATE_NEWBORN:
+ LASSERT(!srpc_event_pending(rpc));
+
+ rc = srpc_prepare_reply(rpc);
+ if (rc != 0) {
+ srpc_client_rpc_done(rpc, rc);
+ return 1;
+ }
+
+ rc = srpc_prepare_bulk(rpc);
+ if (rc != 0)
+ break;
+
+ wi->swi_state = SWI_STATE_REQUEST_SUBMITTED;
+ rc = srpc_send_request(rpc);
+ break;
+
+ case SWI_STATE_REQUEST_SUBMITTED:
+ /* CAVEAT EMPTOR: rqtev, rpyev, and bulkev may come in any
+ * order; however, they're processed in a strict order:
+ * rqt, rpy, and bulk. */
+ if (!rpc->crpc_reqstev.ev_fired)
+ break;
+
+ rc = rpc->crpc_reqstev.ev_status;
+ if (rc != 0)
+ break;
+
+ wi->swi_state = SWI_STATE_REQUEST_SENT;
+ /* perhaps more events, fall thru */
+ case SWI_STATE_REQUEST_SENT: {
+ srpc_msg_type_t type = srpc_service2reply(rpc->crpc_service);
+
+ if (!rpc->crpc_replyev.ev_fired)
+ break;
+
+ rc = rpc->crpc_replyev.ev_status;
+ if (rc != 0)
+ break;
+
+ srpc_unpack_msg_hdr(reply);
+ if (reply->msg_type != type ||
+ (reply->msg_magic != SRPC_MSG_MAGIC &&
+ reply->msg_magic != __swab32(SRPC_MSG_MAGIC))) {
+ CWARN("Bad message from %s: type %u (%d expected), magic %u (%d expected).\n",
+ libcfs_id2str(rpc->crpc_dest),
+ reply->msg_type, type,
+ reply->msg_magic, SRPC_MSG_MAGIC);
+ rc = -EBADMSG;
+ break;
+ }
+
+ if (do_bulk && reply->msg_body.reply.status != 0) {
+ CWARN("Remote error %d at %s, unlink bulk buffer in case peer didn't initiate bulk transfer\n",
+ reply->msg_body.reply.status,
+ libcfs_id2str(rpc->crpc_dest));
+ LNetMDUnlink(rpc->crpc_bulk.bk_mdh);
+ }
+
+ wi->swi_state = SWI_STATE_REPLY_RECEIVED;
+ }
+ case SWI_STATE_REPLY_RECEIVED:
+ if (do_bulk && !rpc->crpc_bulkev.ev_fired)
+ break;
+
+ rc = do_bulk ? rpc->crpc_bulkev.ev_status : 0;
+
+ /* Bulk buffer was unlinked due to remote error. Clear error
+ * since reply buffer still contains valid data.
+ * NB rpc->crpc_done shouldn't look into bulk data in case of
+ * remote error. */
+ if (do_bulk && rpc->crpc_bulkev.ev_lnet == LNET_EVENT_UNLINK &&
+ rpc->crpc_status == 0 && reply->msg_body.reply.status != 0)
+ rc = 0;
+
+ wi->swi_state = SWI_STATE_DONE;
+ srpc_client_rpc_done(rpc, rc);
+ return 1;
+ }
+
+ if (rc != 0) {
+ spin_lock(&rpc->crpc_lock);
+ srpc_abort_rpc(rpc, rc);
+ spin_unlock(&rpc->crpc_lock);
+ }
+
+abort:
+ if (rpc->crpc_aborted) {
+ LNetMDUnlink(rpc->crpc_reqstmdh);
+ LNetMDUnlink(rpc->crpc_replymdh);
+ LNetMDUnlink(rpc->crpc_bulk.bk_mdh);
+
+ if (!srpc_event_pending(rpc)) {
+ srpc_client_rpc_done(rpc, -EINTR);
+ return 1;
+ }
+ }
+ return 0;
+}
+
+srpc_client_rpc_t *
+srpc_create_client_rpc(lnet_process_id_t peer, int service,
+ int nbulkiov, int bulklen,
+ void (*rpc_done)(srpc_client_rpc_t *),
+ void (*rpc_fini)(srpc_client_rpc_t *), void *priv)
+{
+ srpc_client_rpc_t *rpc;
+
+ LIBCFS_ALLOC(rpc, offsetof(srpc_client_rpc_t,
+ crpc_bulk.bk_iovs[nbulkiov]));
+ if (rpc == NULL)
+ return NULL;
+
+ srpc_init_client_rpc(rpc, peer, service, nbulkiov,
+ bulklen, rpc_done, rpc_fini, priv);
+ return rpc;
+}
+
+/* called with rpc->crpc_lock held */
+void
+srpc_abort_rpc(srpc_client_rpc_t *rpc, int why)
+{
+ LASSERT(why != 0);
+
+ if (rpc->crpc_aborted || /* already aborted */
+ rpc->crpc_closed) /* callback imminent */
+ return;
+
+ CDEBUG(D_NET,
+ "Aborting RPC: service %d, peer %s, state %s, why %d\n",
+ rpc->crpc_service, libcfs_id2str(rpc->crpc_dest),
+ swi_state2str(rpc->crpc_wi.swi_state), why);
+
+ rpc->crpc_aborted = 1;
+ rpc->crpc_status = why;
+ swi_schedule_workitem(&rpc->crpc_wi);
+ return;
+}
+
+/* called with rpc->crpc_lock held */
+void
+srpc_post_rpc(srpc_client_rpc_t *rpc)
+{
+ LASSERT(!rpc->crpc_aborted);
+ LASSERT(srpc_data.rpc_state == SRPC_STATE_RUNNING);
+
+ CDEBUG(D_NET, "Posting RPC: peer %s, service %d, timeout %d\n",
+ libcfs_id2str(rpc->crpc_dest), rpc->crpc_service,
+ rpc->crpc_timeout);
+
+ srpc_add_client_rpc_timer(rpc);
+ swi_schedule_workitem(&rpc->crpc_wi);
+ return;
+}
+
+
+int
+srpc_send_reply(struct srpc_server_rpc *rpc)
+{
+ srpc_event_t *ev = &rpc->srpc_ev;
+ struct srpc_msg *msg = &rpc->srpc_replymsg;
+ struct srpc_buffer *buffer = rpc->srpc_reqstbuf;
+ struct srpc_service_cd *scd = rpc->srpc_scd;
+ struct srpc_service *sv = scd->scd_svc;
+ __u64 rpyid;
+ int rc;
+
+ LASSERT(buffer != NULL);
+ rpyid = buffer->buf_msg.msg_body.reqst.rpyid;
+
+ spin_lock(&scd->scd_lock);
+
+ if (!sv->sv_shuttingdown && !srpc_serv_is_framework(sv)) {
+ /* Repost buffer before replying since test client
+ * might send me another RPC once it gets the reply */
+ if (srpc_service_post_buffer(scd, buffer) != 0)
+ CWARN("Failed to repost %s buffer\n", sv->sv_name);
+ rpc->srpc_reqstbuf = NULL;
+ }
+
+ spin_unlock(&scd->scd_lock);
+
+ ev->ev_fired = 0;
+ ev->ev_data = rpc;
+ ev->ev_type = SRPC_REPLY_SENT;
+
+ msg->msg_magic = SRPC_MSG_MAGIC;
+ msg->msg_version = SRPC_MSG_VERSION;
+ msg->msg_type = srpc_service2reply(sv->sv_id);
+
+ rc = srpc_post_active_rdma(SRPC_RDMA_PORTAL, rpyid, msg,
+ sizeof(*msg), LNET_MD_OP_PUT,
+ rpc->srpc_peer, rpc->srpc_self,
+ &rpc->srpc_replymdh, ev);
+ if (rc != 0)
+ ev->ev_fired = 1; /* no more event expected */
+ return rc;
+}
+
+/* when in kernel always called with LNET_LOCK() held, and in thread context */
+static void
+srpc_lnet_ev_handler(lnet_event_t *ev)
+{
+ struct srpc_service_cd *scd;
+ srpc_event_t *rpcev = ev->md.user_ptr;
+ srpc_client_rpc_t *crpc;
+ srpc_server_rpc_t *srpc;
+ srpc_buffer_t *buffer;
+ srpc_service_t *sv;
+ srpc_msg_t *msg;
+ srpc_msg_type_t type;
+
+ LASSERT(!in_interrupt());
+
+ if (ev->status != 0) {
+ spin_lock(&srpc_data.rpc_glock);
+ srpc_data.rpc_counters.errors++;
+ spin_unlock(&srpc_data.rpc_glock);
+ }
+
+ rpcev->ev_lnet = ev->type;
+
+ switch (rpcev->ev_type) {
+ default:
+ CERROR("Unknown event: status %d, type %d, lnet %d\n",
+ rpcev->ev_status, rpcev->ev_type, rpcev->ev_lnet);
+ LBUG();
+ case SRPC_REQUEST_SENT:
+ if (ev->status == 0 && ev->type != LNET_EVENT_UNLINK) {
+ spin_lock(&srpc_data.rpc_glock);
+ srpc_data.rpc_counters.rpcs_sent++;
+ spin_unlock(&srpc_data.rpc_glock);
+ }
+ case SRPC_REPLY_RCVD:
+ case SRPC_BULK_REQ_RCVD:
+ crpc = rpcev->ev_data;
+
+ if (rpcev != &crpc->crpc_reqstev &&
+ rpcev != &crpc->crpc_replyev &&
+ rpcev != &crpc->crpc_bulkev) {
+ CERROR("rpcev %p, crpc %p, reqstev %p, replyev %p, bulkev %p\n",
+ rpcev, crpc, &crpc->crpc_reqstev,
+ &crpc->crpc_replyev, &crpc->crpc_bulkev);
+ CERROR("Bad event: status %d, type %d, lnet %d\n",
+ rpcev->ev_status, rpcev->ev_type, rpcev->ev_lnet);
+ LBUG();
+ }
+
+ spin_lock(&crpc->crpc_lock);
+
+ LASSERT(rpcev->ev_fired == 0);
+ rpcev->ev_fired = 1;
+ rpcev->ev_status = (ev->type == LNET_EVENT_UNLINK) ?
+ -EINTR : ev->status;
+ swi_schedule_workitem(&crpc->crpc_wi);
+
+ spin_unlock(&crpc->crpc_lock);
+ break;
+
+ case SRPC_REQUEST_RCVD:
+ scd = rpcev->ev_data;
+ sv = scd->scd_svc;
+
+ LASSERT(rpcev == &scd->scd_ev);
+
+ spin_lock(&scd->scd_lock);
+
+ LASSERT(ev->unlinked);
+ LASSERT(ev->type == LNET_EVENT_PUT ||
+ ev->type == LNET_EVENT_UNLINK);
+ LASSERT(ev->type != LNET_EVENT_UNLINK ||
+ sv->sv_shuttingdown);
+
+ buffer = container_of(ev->md.start, srpc_buffer_t, buf_msg);
+ buffer->buf_peer = ev->initiator;
+ buffer->buf_self = ev->target.nid;
+
+ LASSERT(scd->scd_buf_nposted > 0);
+ scd->scd_buf_nposted--;
+
+ if (sv->sv_shuttingdown) {
+ /* Leave buffer on scd->scd_buf_nposted since
+ * srpc_finish_service needs to traverse it. */
+ spin_unlock(&scd->scd_lock);
+ break;
+ }
+
+ if (scd->scd_buf_err_stamp != 0 &&
+ scd->scd_buf_err_stamp < get_seconds()) {
+ /* re-enable adding buffer */
+ scd->scd_buf_err_stamp = 0;
+ scd->scd_buf_err = 0;
+ }
+
+ if (scd->scd_buf_err == 0 && /* adding buffer is enabled */
+ scd->scd_buf_adjust == 0 &&
+ scd->scd_buf_nposted < scd->scd_buf_low) {
+ scd->scd_buf_adjust = max(scd->scd_buf_total / 2,
+ SFW_TEST_WI_MIN);
+ swi_schedule_workitem(&scd->scd_buf_wi);
+ }
+
+ list_del(&buffer->buf_list); /* from scd->scd_buf_posted */
+ msg = &buffer->buf_msg;
+ type = srpc_service2request(sv->sv_id);
+
+ if (ev->status != 0 || ev->mlength != sizeof(*msg) ||
+ (msg->msg_type != type &&
+ msg->msg_type != __swab32(type)) ||
+ (msg->msg_magic != SRPC_MSG_MAGIC &&
+ msg->msg_magic != __swab32(SRPC_MSG_MAGIC))) {
+ CERROR("Dropping RPC (%s) from %s: status %d mlength %d type %u magic %u.\n",
+ sv->sv_name, libcfs_id2str(ev->initiator),
+ ev->status, ev->mlength,
+ msg->msg_type, msg->msg_magic);
+
+ /* NB can't call srpc_service_recycle_buffer here since
+ * it may call LNetM[DE]Attach. The invalid magic tells
+ * srpc_handle_rpc to drop this RPC */
+ msg->msg_magic = 0;
+ }
+
+ if (!list_empty(&scd->scd_rpc_free)) {
+ srpc = list_entry(scd->scd_rpc_free.next,
+ struct srpc_server_rpc,
+ srpc_list);
+ list_del(&srpc->srpc_list);
+
+ srpc_init_server_rpc(srpc, scd, buffer);
+ list_add_tail(&srpc->srpc_list,
+ &scd->scd_rpc_active);
+ swi_schedule_workitem(&srpc->srpc_wi);
+ } else {
+ list_add_tail(&buffer->buf_list,
+ &scd->scd_buf_blocked);
+ }
+
+ spin_unlock(&scd->scd_lock);
+
+ spin_lock(&srpc_data.rpc_glock);
+ srpc_data.rpc_counters.rpcs_rcvd++;
+ spin_unlock(&srpc_data.rpc_glock);
+ break;
+
+ case SRPC_BULK_GET_RPLD:
+ LASSERT(ev->type == LNET_EVENT_SEND ||
+ ev->type == LNET_EVENT_REPLY ||
+ ev->type == LNET_EVENT_UNLINK);
+
+ if (!ev->unlinked)
+ break; /* wait for final event */
+
+ case SRPC_BULK_PUT_SENT:
+ if (ev->status == 0 && ev->type != LNET_EVENT_UNLINK) {
+ spin_lock(&srpc_data.rpc_glock);
+
+ if (rpcev->ev_type == SRPC_BULK_GET_RPLD)
+ srpc_data.rpc_counters.bulk_get += ev->mlength;
+ else
+ srpc_data.rpc_counters.bulk_put += ev->mlength;
+
+ spin_unlock(&srpc_data.rpc_glock);
+ }
+ case SRPC_REPLY_SENT:
+ srpc = rpcev->ev_data;
+ scd = srpc->srpc_scd;
+
+ LASSERT(rpcev == &srpc->srpc_ev);
+
+ spin_lock(&scd->scd_lock);
+
+ rpcev->ev_fired = 1;
+ rpcev->ev_status = (ev->type == LNET_EVENT_UNLINK) ?
+ -EINTR : ev->status;
+ swi_schedule_workitem(&srpc->srpc_wi);
+
+ spin_unlock(&scd->scd_lock);
+ break;
+ }
+}
+
+
+int
+srpc_startup(void)
+{
+ int rc;
+
+ memset(&srpc_data, 0, sizeof(struct smoketest_rpc));
+ spin_lock_init(&srpc_data.rpc_glock);
+
+ /* 1 second pause to avoid timestamp reuse */
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(cfs_time_seconds(1));
+ srpc_data.rpc_matchbits = ((__u64) get_seconds()) << 48;
+
+ srpc_data.rpc_state = SRPC_STATE_NONE;
+
+ rc = LNetNIInit(LUSTRE_SRV_LNET_PID);
+ if (rc < 0) {
+ CERROR("LNetNIInit() has failed: %d\n", rc);
+ return rc;
+ }
+
+ srpc_data.rpc_state = SRPC_STATE_NI_INIT;
+
+ LNetInvalidateHandle(&srpc_data.rpc_lnet_eq);
+ rc = LNetEQAlloc(0, srpc_lnet_ev_handler, &srpc_data.rpc_lnet_eq);
+ if (rc != 0) {
+ CERROR("LNetEQAlloc() has failed: %d\n", rc);
+ goto bail;
+ }
+
+ rc = LNetSetLazyPortal(SRPC_FRAMEWORK_REQUEST_PORTAL);
+ LASSERT(rc == 0);
+ rc = LNetSetLazyPortal(SRPC_REQUEST_PORTAL);
+ LASSERT(rc == 0);
+
+ srpc_data.rpc_state = SRPC_STATE_EQ_INIT;
+
+ rc = stt_startup();
+
+bail:
+ if (rc != 0)
+ srpc_shutdown();
+ else
+ srpc_data.rpc_state = SRPC_STATE_RUNNING;
+
+ return rc;
+}
+
+void
+srpc_shutdown(void)
+{
+ int i;
+ int rc;
+ int state;
+
+ state = srpc_data.rpc_state;
+ srpc_data.rpc_state = SRPC_STATE_STOPPING;
+
+ switch (state) {
+ default:
+ LBUG();
+ case SRPC_STATE_RUNNING:
+ spin_lock(&srpc_data.rpc_glock);
+
+ for (i = 0; i <= SRPC_SERVICE_MAX_ID; i++) {
+ srpc_service_t *sv = srpc_data.rpc_services[i];
+
+ LASSERTF(sv == NULL,
+ "service not empty: id %d, name %s\n",
+ i, sv->sv_name);
+ }
+
+ spin_unlock(&srpc_data.rpc_glock);
+
+ stt_shutdown();
+
+ case SRPC_STATE_EQ_INIT:
+ rc = LNetClearLazyPortal(SRPC_FRAMEWORK_REQUEST_PORTAL);
+ rc = LNetClearLazyPortal(SRPC_REQUEST_PORTAL);
+ LASSERT(rc == 0);
+ rc = LNetEQFree(srpc_data.rpc_lnet_eq);
+ LASSERT(rc == 0); /* the EQ should have no user by now */
+
+ case SRPC_STATE_NI_INIT:
+ LNetNIFini();
+ }
+
+ return;
+}
diff --git a/kernel/drivers/staging/lustre/lnet/selftest/rpc.h b/kernel/drivers/staging/lustre/lnet/selftest/rpc.h
new file mode 100644
index 000000000..fbeb75fe5
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lnet/selftest/rpc.h
@@ -0,0 +1,302 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __SELFTEST_RPC_H__
+#define __SELFTEST_RPC_H__
+
+#include "../../include/linux/lnet/lnetst.h"
+
+/*
+ * LST wired structures
+ *
+ * XXX: *REPLY == *REQST + 1
+ */
+typedef enum {
+ SRPC_MSG_MKSN_REQST = 0,
+ SRPC_MSG_MKSN_REPLY = 1,
+ SRPC_MSG_RMSN_REQST = 2,
+ SRPC_MSG_RMSN_REPLY = 3,
+ SRPC_MSG_BATCH_REQST = 4,
+ SRPC_MSG_BATCH_REPLY = 5,
+ SRPC_MSG_STAT_REQST = 6,
+ SRPC_MSG_STAT_REPLY = 7,
+ SRPC_MSG_TEST_REQST = 8,
+ SRPC_MSG_TEST_REPLY = 9,
+ SRPC_MSG_DEBUG_REQST = 10,
+ SRPC_MSG_DEBUG_REPLY = 11,
+ SRPC_MSG_BRW_REQST = 12,
+ SRPC_MSG_BRW_REPLY = 13,
+ SRPC_MSG_PING_REQST = 14,
+ SRPC_MSG_PING_REPLY = 15,
+ SRPC_MSG_JOIN_REQST = 16,
+ SRPC_MSG_JOIN_REPLY = 17,
+} srpc_msg_type_t;
+
+
+/* CAVEAT EMPTOR:
+ * All srpc_*_reqst_t's 1st field must be matchbits of reply buffer,
+ * and 2nd field matchbits of bulk buffer if any.
+ *
+ * All srpc_*_reply_t's 1st field must be a __u32 status, and 2nd field
+ * session id if needed.
+ */
+typedef struct {
+ __u64 rpyid; /* reply buffer matchbits */
+ __u64 bulkid; /* bulk buffer matchbits */
+} WIRE_ATTR srpc_generic_reqst_t;
+
+typedef struct {
+ __u32 status;
+ lst_sid_t sid;
+} WIRE_ATTR srpc_generic_reply_t;
+
+/* FRAMEWORK RPCs */
+typedef struct {
+ __u64 mksn_rpyid; /* reply buffer matchbits */
+ lst_sid_t mksn_sid; /* session id */
+ __u32 mksn_force; /* use brute force */
+ char mksn_name[LST_NAME_SIZE];
+} WIRE_ATTR srpc_mksn_reqst_t; /* make session request */
+
+typedef struct {
+ __u32 mksn_status; /* session status */
+ lst_sid_t mksn_sid; /* session id */
+ __u32 mksn_timeout; /* session timeout */
+ char mksn_name[LST_NAME_SIZE];
+} WIRE_ATTR srpc_mksn_reply_t; /* make session reply */
+
+typedef struct {
+ __u64 rmsn_rpyid; /* reply buffer matchbits */
+ lst_sid_t rmsn_sid; /* session id */
+} WIRE_ATTR srpc_rmsn_reqst_t; /* remove session request */
+
+typedef struct {
+ __u32 rmsn_status;
+ lst_sid_t rmsn_sid; /* session id */
+} WIRE_ATTR srpc_rmsn_reply_t; /* remove session reply */
+
+typedef struct {
+ __u64 join_rpyid; /* reply buffer matchbits */
+ lst_sid_t join_sid; /* session id to join */
+ char join_group[LST_NAME_SIZE]; /* group name */
+} WIRE_ATTR srpc_join_reqst_t;
+
+typedef struct {
+ __u32 join_status; /* returned status */
+ lst_sid_t join_sid; /* session id */
+ __u32 join_timeout; /* # seconds' inactivity to expire */
+ char join_session[LST_NAME_SIZE]; /* session name */
+} WIRE_ATTR srpc_join_reply_t;
+
+typedef struct {
+ __u64 dbg_rpyid; /* reply buffer matchbits */
+ lst_sid_t dbg_sid; /* session id */
+ __u32 dbg_flags; /* bitmap of debug */
+} WIRE_ATTR srpc_debug_reqst_t;
+
+typedef struct {
+ __u32 dbg_status; /* returned code */
+ lst_sid_t dbg_sid; /* session id */
+ __u32 dbg_timeout; /* session timeout */
+ __u32 dbg_nbatch; /* # of batches in the node */
+ char dbg_name[LST_NAME_SIZE]; /* session name */
+} WIRE_ATTR srpc_debug_reply_t;
+
+#define SRPC_BATCH_OPC_RUN 1
+#define SRPC_BATCH_OPC_STOP 2
+#define SRPC_BATCH_OPC_QUERY 3
+
+typedef struct {
+ __u64 bar_rpyid; /* reply buffer matchbits */
+ lst_sid_t bar_sid; /* session id */
+ lst_bid_t bar_bid; /* batch id */
+ __u32 bar_opc; /* create/start/stop batch */
+ __u32 bar_testidx; /* index of test */
+ __u32 bar_arg; /* parameters */
+} WIRE_ATTR srpc_batch_reqst_t;
+
+typedef struct {
+ __u32 bar_status; /* status of request */
+ lst_sid_t bar_sid; /* session id */
+ __u32 bar_active; /* # of active tests in batch/test */
+ __u32 bar_time; /* remained time */
+} WIRE_ATTR srpc_batch_reply_t;
+
+typedef struct {
+ __u64 str_rpyid; /* reply buffer matchbits */
+ lst_sid_t str_sid; /* session id */
+ __u32 str_type; /* type of stat */
+} WIRE_ATTR srpc_stat_reqst_t;
+
+typedef struct {
+ __u32 str_status;
+ lst_sid_t str_sid;
+ sfw_counters_t str_fw;
+ srpc_counters_t str_rpc;
+ lnet_counters_t str_lnet;
+} WIRE_ATTR srpc_stat_reply_t;
+
+typedef struct {
+ __u32 blk_opc; /* bulk operation code */
+ __u32 blk_npg; /* # of pages */
+ __u32 blk_flags; /* reserved flags */
+} WIRE_ATTR test_bulk_req_t;
+
+typedef struct {
+ /** bulk operation code */
+ __u16 blk_opc;
+ /** data check flags */
+ __u16 blk_flags;
+ /** data length */
+ __u32 blk_len;
+ /** reserved: offset */
+ __u32 blk_offset;
+} WIRE_ATTR test_bulk_req_v1_t;
+
+typedef struct {
+ __u32 png_size; /* size of ping message */
+ __u32 png_flags; /* reserved flags */
+} WIRE_ATTR test_ping_req_t;
+
+typedef struct {
+ __u64 tsr_rpyid; /* reply buffer matchbits */
+ __u64 tsr_bulkid; /* bulk buffer matchbits */
+ lst_sid_t tsr_sid; /* session id */
+ lst_bid_t tsr_bid; /* batch id */
+ __u32 tsr_service; /* test type: bulk|ping|... */
+ /* test client loop count or # server buffers needed */
+ __u32 tsr_loop;
+ __u32 tsr_concur; /* concurrency of test */
+ __u8 tsr_is_client; /* is test client or not */
+ __u8 tsr_stop_onerr; /* stop on error */
+ __u32 tsr_ndest; /* # of dest nodes */
+
+ union {
+ test_ping_req_t ping;
+ test_bulk_req_t bulk_v0;
+ test_bulk_req_v1_t bulk_v1;
+ } tsr_u;
+} WIRE_ATTR srpc_test_reqst_t;
+
+typedef struct {
+ __u32 tsr_status; /* returned code */
+ lst_sid_t tsr_sid;
+} WIRE_ATTR srpc_test_reply_t;
+
+/* TEST RPCs */
+typedef struct {
+ __u64 pnr_rpyid;
+ __u32 pnr_magic;
+ __u32 pnr_seq;
+ __u64 pnr_time_sec;
+ __u64 pnr_time_usec;
+} WIRE_ATTR srpc_ping_reqst_t;
+
+typedef struct {
+ __u32 pnr_status;
+ __u32 pnr_magic;
+ __u32 pnr_seq;
+} WIRE_ATTR srpc_ping_reply_t;
+
+typedef struct {
+ __u64 brw_rpyid; /* reply buffer matchbits */
+ __u64 brw_bulkid; /* bulk buffer matchbits */
+ __u32 brw_rw; /* read or write */
+ __u32 brw_len; /* bulk data len */
+ __u32 brw_flags; /* bulk data patterns */
+} WIRE_ATTR srpc_brw_reqst_t; /* bulk r/w request */
+
+typedef struct {
+ __u32 brw_status;
+} WIRE_ATTR srpc_brw_reply_t; /* bulk r/w reply */
+
+#define SRPC_MSG_MAGIC 0xeeb0f00d
+#define SRPC_MSG_VERSION 1
+
+typedef struct srpc_msg {
+ /** magic number */
+ __u32 msg_magic;
+ /** message version number */
+ __u32 msg_version;
+ /** type of message body: srpc_msg_type_t */
+ __u32 msg_type;
+ __u32 msg_reserved0;
+ __u32 msg_reserved1;
+ /** test session features */
+ __u32 msg_ses_feats;
+ union {
+ srpc_generic_reqst_t reqst;
+ srpc_generic_reply_t reply;
+
+ srpc_mksn_reqst_t mksn_reqst;
+ srpc_mksn_reply_t mksn_reply;
+ srpc_rmsn_reqst_t rmsn_reqst;
+ srpc_rmsn_reply_t rmsn_reply;
+ srpc_debug_reqst_t dbg_reqst;
+ srpc_debug_reply_t dbg_reply;
+ srpc_batch_reqst_t bat_reqst;
+ srpc_batch_reply_t bat_reply;
+ srpc_stat_reqst_t stat_reqst;
+ srpc_stat_reply_t stat_reply;
+ srpc_test_reqst_t tes_reqst;
+ srpc_test_reply_t tes_reply;
+ srpc_join_reqst_t join_reqst;
+ srpc_join_reply_t join_reply;
+
+ srpc_ping_reqst_t ping_reqst;
+ srpc_ping_reply_t ping_reply;
+ srpc_brw_reqst_t brw_reqst;
+ srpc_brw_reply_t brw_reply;
+ } msg_body;
+} WIRE_ATTR srpc_msg_t;
+
+static inline void
+srpc_unpack_msg_hdr(srpc_msg_t *msg)
+{
+ if (msg->msg_magic == SRPC_MSG_MAGIC)
+ return; /* no flipping needed */
+
+ /* We do not swap the magic number here as it is needed to
+ determine whether the body needs to be swapped. */
+ /* __swab32s(&msg->msg_magic); */
+ __swab32s(&msg->msg_type);
+ __swab32s(&msg->msg_version);
+ __swab32s(&msg->msg_ses_feats);
+ __swab32s(&msg->msg_reserved0);
+ __swab32s(&msg->msg_reserved1);
+}
+
+#endif /* __SELFTEST_RPC_H__ */
diff --git a/kernel/drivers/staging/lustre/lnet/selftest/selftest.h b/kernel/drivers/staging/lustre/lnet/selftest/selftest.h
new file mode 100644
index 000000000..d48701834
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lnet/selftest/selftest.h
@@ -0,0 +1,624 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ * copy of GPLv2].
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/selftest.h
+ *
+ * Author: Isaac Huang <isaac@clusterfs.com>
+ */
+#ifndef __SELFTEST_SELFTEST_H__
+#define __SELFTEST_SELFTEST_H__
+
+#define LNET_ONLY
+
+#include "../../include/linux/libcfs/libcfs.h"
+#include "../../include/linux/lnet/lnet.h"
+#include "../../include/linux/lnet/lib-lnet.h"
+#include "../../include/linux/lnet/lib-types.h"
+#include "../../include/linux/lnet/lnetst.h"
+
+#include "rpc.h"
+#include "timer.h"
+
+#ifndef MADE_WITHOUT_COMPROMISE
+#define MADE_WITHOUT_COMPROMISE
+#endif
+
+
+#define SWI_STATE_NEWBORN 0
+#define SWI_STATE_REPLY_SUBMITTED 1
+#define SWI_STATE_REPLY_SENT 2
+#define SWI_STATE_REQUEST_SUBMITTED 3
+#define SWI_STATE_REQUEST_SENT 4
+#define SWI_STATE_REPLY_RECEIVED 5
+#define SWI_STATE_BULK_STARTED 6
+#define SWI_STATE_DONE 10
+
+/* forward refs */
+struct srpc_service;
+struct srpc_service_cd;
+struct sfw_test_unit;
+struct sfw_test_instance;
+
+/* services below SRPC_FRAMEWORK_SERVICE_MAX_ID are framework
+ * services, e.g. create/modify session.
+ */
+#define SRPC_SERVICE_DEBUG 0
+#define SRPC_SERVICE_MAKE_SESSION 1
+#define SRPC_SERVICE_REMOVE_SESSION 2
+#define SRPC_SERVICE_BATCH 3
+#define SRPC_SERVICE_TEST 4
+#define SRPC_SERVICE_QUERY_STAT 5
+#define SRPC_SERVICE_JOIN 6
+#define SRPC_FRAMEWORK_SERVICE_MAX_ID 10
+/* other services start from SRPC_FRAMEWORK_SERVICE_MAX_ID+1 */
+#define SRPC_SERVICE_BRW 11
+#define SRPC_SERVICE_PING 12
+#define SRPC_SERVICE_MAX_ID 12
+
+#define SRPC_REQUEST_PORTAL 50
+/* a lazy portal for framework RPC requests */
+#define SRPC_FRAMEWORK_REQUEST_PORTAL 51
+/* all reply/bulk RDMAs go to this portal */
+#define SRPC_RDMA_PORTAL 52
+
+static inline srpc_msg_type_t
+srpc_service2request (int service)
+{
+ switch (service) {
+ default:
+ LBUG ();
+ case SRPC_SERVICE_DEBUG:
+ return SRPC_MSG_DEBUG_REQST;
+
+ case SRPC_SERVICE_MAKE_SESSION:
+ return SRPC_MSG_MKSN_REQST;
+
+ case SRPC_SERVICE_REMOVE_SESSION:
+ return SRPC_MSG_RMSN_REQST;
+
+ case SRPC_SERVICE_BATCH:
+ return SRPC_MSG_BATCH_REQST;
+
+ case SRPC_SERVICE_TEST:
+ return SRPC_MSG_TEST_REQST;
+
+ case SRPC_SERVICE_QUERY_STAT:
+ return SRPC_MSG_STAT_REQST;
+
+ case SRPC_SERVICE_BRW:
+ return SRPC_MSG_BRW_REQST;
+
+ case SRPC_SERVICE_PING:
+ return SRPC_MSG_PING_REQST;
+
+ case SRPC_SERVICE_JOIN:
+ return SRPC_MSG_JOIN_REQST;
+ }
+}
+
+static inline srpc_msg_type_t
+srpc_service2reply (int service)
+{
+ return srpc_service2request(service) + 1;
+}
+
+typedef enum {
+ SRPC_BULK_REQ_RCVD = 1, /* passive bulk request(PUT sink/GET source) received */
+ SRPC_BULK_PUT_SENT = 2, /* active bulk PUT sent (source) */
+ SRPC_BULK_GET_RPLD = 3, /* active bulk GET replied (sink) */
+ SRPC_REPLY_RCVD = 4, /* incoming reply received */
+ SRPC_REPLY_SENT = 5, /* outgoing reply sent */
+ SRPC_REQUEST_RCVD = 6, /* incoming request received */
+ SRPC_REQUEST_SENT = 7, /* outgoing request sent */
+} srpc_event_type_t;
+
+/* RPC event */
+typedef struct {
+ srpc_event_type_t ev_type; /* what's up */
+ lnet_event_kind_t ev_lnet; /* LNet event type */
+ int ev_fired; /* LNet event fired? */
+ int ev_status; /* LNet event status */
+ void *ev_data; /* owning server/client RPC */
+} srpc_event_t;
+
+typedef struct {
+ int bk_len; /* len of bulk data */
+ lnet_handle_md_t bk_mdh;
+ int bk_sink; /* sink/source */
+ int bk_niov; /* # iov in bk_iovs */
+ lnet_kiov_t bk_iovs[0];
+} srpc_bulk_t; /* bulk descriptor */
+
+/* message buffer descriptor */
+typedef struct srpc_buffer {
+ struct list_head buf_list; /* chain on srpc_service::*_msgq */
+ srpc_msg_t buf_msg;
+ lnet_handle_md_t buf_mdh;
+ lnet_nid_t buf_self;
+ lnet_process_id_t buf_peer;
+} srpc_buffer_t;
+
+struct swi_workitem;
+typedef int (*swi_action_t) (struct swi_workitem *);
+
+typedef struct swi_workitem {
+ struct cfs_wi_sched *swi_sched;
+ cfs_workitem_t swi_workitem;
+ swi_action_t swi_action;
+ int swi_state;
+} swi_workitem_t;
+
+/* server-side state of a RPC */
+typedef struct srpc_server_rpc {
+ /* chain on srpc_service::*_rpcq */
+ struct list_head srpc_list;
+ struct srpc_service_cd *srpc_scd;
+ swi_workitem_t srpc_wi;
+ srpc_event_t srpc_ev; /* bulk/reply event */
+ lnet_nid_t srpc_self;
+ lnet_process_id_t srpc_peer;
+ srpc_msg_t srpc_replymsg;
+ lnet_handle_md_t srpc_replymdh;
+ srpc_buffer_t *srpc_reqstbuf;
+ srpc_bulk_t *srpc_bulk;
+
+ unsigned int srpc_aborted; /* being given up */
+ int srpc_status;
+ void (*srpc_done)(struct srpc_server_rpc *);
+} srpc_server_rpc_t;
+
+/* client-side state of a RPC */
+typedef struct srpc_client_rpc {
+ struct list_head crpc_list; /* chain on user's lists */
+ spinlock_t crpc_lock; /* serialize */
+ int crpc_service;
+ atomic_t crpc_refcount;
+ int crpc_timeout; /* # seconds to wait for reply */
+ stt_timer_t crpc_timer;
+ swi_workitem_t crpc_wi;
+ lnet_process_id_t crpc_dest;
+
+ void (*crpc_done)(struct srpc_client_rpc *);
+ void (*crpc_fini)(struct srpc_client_rpc *);
+ int crpc_status; /* completion status */
+ void *crpc_priv; /* caller data */
+
+ /* state flags */
+ unsigned int crpc_aborted:1; /* being given up */
+ unsigned int crpc_closed:1; /* completed */
+
+ /* RPC events */
+ srpc_event_t crpc_bulkev; /* bulk event */
+ srpc_event_t crpc_reqstev; /* request event */
+ srpc_event_t crpc_replyev; /* reply event */
+
+ /* bulk, request(reqst), and reply exchanged on wire */
+ srpc_msg_t crpc_reqstmsg;
+ srpc_msg_t crpc_replymsg;
+ lnet_handle_md_t crpc_reqstmdh;
+ lnet_handle_md_t crpc_replymdh;
+ srpc_bulk_t crpc_bulk;
+} srpc_client_rpc_t;
+
+#define srpc_client_rpc_size(rpc) \
+offsetof(srpc_client_rpc_t, crpc_bulk.bk_iovs[(rpc)->crpc_bulk.bk_niov])
+
+#define srpc_client_rpc_addref(rpc) \
+do { \
+ CDEBUG(D_NET, "RPC[%p] -> %s (%d)++\n", \
+ (rpc), libcfs_id2str((rpc)->crpc_dest), \
+ atomic_read(&(rpc)->crpc_refcount)); \
+ LASSERT(atomic_read(&(rpc)->crpc_refcount) > 0); \
+ atomic_inc(&(rpc)->crpc_refcount); \
+} while (0)
+
+#define srpc_client_rpc_decref(rpc) \
+do { \
+ CDEBUG(D_NET, "RPC[%p] -> %s (%d)--\n", \
+ (rpc), libcfs_id2str((rpc)->crpc_dest), \
+ atomic_read(&(rpc)->crpc_refcount)); \
+ LASSERT(atomic_read(&(rpc)->crpc_refcount) > 0); \
+ if (atomic_dec_and_test(&(rpc)->crpc_refcount)) \
+ srpc_destroy_client_rpc(rpc); \
+} while (0)
+
+#define srpc_event_pending(rpc) ((rpc)->crpc_bulkev.ev_fired == 0 || \
+ (rpc)->crpc_reqstev.ev_fired == 0 || \
+ (rpc)->crpc_replyev.ev_fired == 0)
+
+/* CPU partition data of srpc service */
+struct srpc_service_cd {
+ /** serialize */
+ spinlock_t scd_lock;
+ /** backref to service */
+ struct srpc_service *scd_svc;
+ /** event buffer */
+ srpc_event_t scd_ev;
+ /** free RPC descriptors */
+ struct list_head scd_rpc_free;
+ /** in-flight RPCs */
+ struct list_head scd_rpc_active;
+ /** workitem for posting buffer */
+ swi_workitem_t scd_buf_wi;
+ /** CPT id */
+ int scd_cpt;
+ /** error code for scd_buf_wi */
+ int scd_buf_err;
+ /** timestamp for scd_buf_err */
+ unsigned long scd_buf_err_stamp;
+ /** total # request buffers */
+ int scd_buf_total;
+ /** # posted request buffers */
+ int scd_buf_nposted;
+ /** in progress of buffer posting */
+ int scd_buf_posting;
+ /** allocate more buffers if scd_buf_nposted < scd_buf_low */
+ int scd_buf_low;
+ /** increase/decrease some buffers */
+ int scd_buf_adjust;
+ /** posted message buffers */
+ struct list_head scd_buf_posted;
+ /** blocked for RPC descriptor */
+ struct list_head scd_buf_blocked;
+};
+
+/* number of server workitems (mini-thread) for testing service */
+#define SFW_TEST_WI_MIN 256
+#define SFW_TEST_WI_MAX 2048
+/* extra buffers for tolerating buggy peers, or unbalanced number
+ * of peers between partitions */
+#define SFW_TEST_WI_EXTRA 64
+
+/* number of server workitems (mini-thread) for framework service */
+#define SFW_FRWK_WI_MIN 16
+#define SFW_FRWK_WI_MAX 256
+
+typedef struct srpc_service {
+ int sv_id; /* service id */
+ const char *sv_name; /* human readable name */
+ int sv_wi_total; /* total server workitems */
+ int sv_shuttingdown;
+ int sv_ncpts;
+ /* percpt data for srpc_service */
+ struct srpc_service_cd **sv_cpt_data;
+ /* Service callbacks:
+ * - sv_handler: process incoming RPC request
+ * - sv_bulk_ready: notify bulk data
+ */
+ int (*sv_handler) (srpc_server_rpc_t *);
+ int (*sv_bulk_ready) (srpc_server_rpc_t *, int);
+} srpc_service_t;
+
+typedef struct {
+ struct list_head sn_list; /* chain on fw_zombie_sessions */
+ lst_sid_t sn_id; /* unique identifier */
+ unsigned int sn_timeout; /* # seconds' inactivity to expire */
+ int sn_timer_active;
+ unsigned int sn_features;
+ stt_timer_t sn_timer;
+ struct list_head sn_batches; /* list of batches */
+ char sn_name[LST_NAME_SIZE];
+ atomic_t sn_refcount;
+ atomic_t sn_brw_errors;
+ atomic_t sn_ping_errors;
+ unsigned long sn_started;
+} sfw_session_t;
+
+#define sfw_sid_equal(sid0, sid1) ((sid0).ses_nid == (sid1).ses_nid && \
+ (sid0).ses_stamp == (sid1).ses_stamp)
+
+typedef struct {
+ struct list_head bat_list; /* chain on sn_batches */
+ lst_bid_t bat_id; /* batch id */
+ int bat_error; /* error code of batch */
+ sfw_session_t *bat_session; /* batch's session */
+ atomic_t bat_nactive; /* # of active tests */
+ struct list_head bat_tests; /* test instances */
+} sfw_batch_t;
+
+typedef struct {
+ int (*tso_init)(struct sfw_test_instance *tsi); /* initialize test client */
+ void (*tso_fini)(struct sfw_test_instance *tsi); /* finalize test client */
+ int (*tso_prep_rpc)(struct sfw_test_unit *tsu,
+ lnet_process_id_t dest,
+ srpc_client_rpc_t **rpc); /* prep a tests rpc */
+ void (*tso_done_rpc)(struct sfw_test_unit *tsu,
+ srpc_client_rpc_t *rpc); /* done a test rpc */
+} sfw_test_client_ops_t;
+
+typedef struct sfw_test_instance {
+ struct list_head tsi_list; /* chain on batch */
+ int tsi_service; /* test type */
+ sfw_batch_t *tsi_batch; /* batch */
+ sfw_test_client_ops_t *tsi_ops; /* test client operations */
+
+ /* public parameter for all test units */
+ unsigned int tsi_is_client:1; /* is test client */
+ unsigned int tsi_stoptsu_onerr:1; /* stop tsu on error */
+ int tsi_concur; /* concurrency */
+ int tsi_loop; /* loop count */
+
+ /* status of test instance */
+ spinlock_t tsi_lock; /* serialize */
+ unsigned int tsi_stopping:1; /* test is stopping */
+ atomic_t tsi_nactive; /* # of active test unit */
+ struct list_head tsi_units; /* test units */
+ struct list_head tsi_free_rpcs; /* free rpcs */
+ struct list_head tsi_active_rpcs; /* active rpcs */
+
+ union {
+ test_ping_req_t ping; /* ping parameter */
+ test_bulk_req_t bulk_v0; /* bulk parameter */
+ test_bulk_req_v1_t bulk_v1; /* bulk v1 parameter */
+ } tsi_u;
+} sfw_test_instance_t;
+
+/* XXX: trailing (PAGE_CACHE_SIZE % sizeof(lnet_process_id_t)) bytes at
+ * the end of pages are not used */
+#define SFW_MAX_CONCUR LST_MAX_CONCUR
+#define SFW_ID_PER_PAGE (PAGE_CACHE_SIZE / sizeof(lnet_process_id_packed_t))
+#define SFW_MAX_NDESTS (LNET_MAX_IOV * SFW_ID_PER_PAGE)
+#define sfw_id_pages(n) (((n) + SFW_ID_PER_PAGE - 1) / SFW_ID_PER_PAGE)
+
+typedef struct sfw_test_unit {
+ struct list_head tsu_list; /* chain on lst_test_instance */
+ lnet_process_id_t tsu_dest; /* id of dest node */
+ int tsu_loop; /* loop count of the test */
+ sfw_test_instance_t *tsu_instance; /* pointer to test instance */
+ void *tsu_private; /* private data */
+ swi_workitem_t tsu_worker; /* workitem of the test unit */
+} sfw_test_unit_t;
+
+typedef struct sfw_test_case {
+ struct list_head tsc_list; /* chain on fw_tests */
+ srpc_service_t *tsc_srv_service; /* test service */
+ sfw_test_client_ops_t *tsc_cli_ops; /* ops of test client */
+} sfw_test_case_t;
+
+srpc_client_rpc_t *
+sfw_create_rpc(lnet_process_id_t peer, int service,
+ unsigned features, int nbulkiov, int bulklen,
+ void (*done) (srpc_client_rpc_t *), void *priv);
+int sfw_create_test_rpc(sfw_test_unit_t *tsu,
+ lnet_process_id_t peer, unsigned features,
+ int nblk, int blklen, srpc_client_rpc_t **rpc);
+void sfw_abort_rpc(srpc_client_rpc_t *rpc);
+void sfw_post_rpc(srpc_client_rpc_t *rpc);
+void sfw_client_rpc_done(srpc_client_rpc_t *rpc);
+void sfw_unpack_message(srpc_msg_t *msg);
+void sfw_free_pages(srpc_server_rpc_t *rpc);
+void sfw_add_bulk_page(srpc_bulk_t *bk, struct page *pg, int i);
+int sfw_alloc_pages(srpc_server_rpc_t *rpc, int cpt, int npages, int len,
+ int sink);
+int sfw_make_session (srpc_mksn_reqst_t *request, srpc_mksn_reply_t *reply);
+
+srpc_client_rpc_t *
+srpc_create_client_rpc(lnet_process_id_t peer, int service,
+ int nbulkiov, int bulklen,
+ void (*rpc_done)(srpc_client_rpc_t *),
+ void (*rpc_fini)(srpc_client_rpc_t *), void *priv);
+void srpc_post_rpc(srpc_client_rpc_t *rpc);
+void srpc_abort_rpc(srpc_client_rpc_t *rpc, int why);
+void srpc_free_bulk(srpc_bulk_t *bk);
+srpc_bulk_t *srpc_alloc_bulk(int cpt, unsigned bulk_npg, unsigned bulk_len,
+ int sink);
+int srpc_send_rpc(swi_workitem_t *wi);
+int srpc_send_reply(srpc_server_rpc_t *rpc);
+int srpc_add_service(srpc_service_t *sv);
+int srpc_remove_service(srpc_service_t *sv);
+void srpc_shutdown_service(srpc_service_t *sv);
+void srpc_abort_service(srpc_service_t *sv);
+int srpc_finish_service(srpc_service_t *sv);
+int srpc_service_add_buffers(srpc_service_t *sv, int nbuffer);
+void srpc_service_remove_buffers(srpc_service_t *sv, int nbuffer);
+void srpc_get_counters(srpc_counters_t *cnt);
+void srpc_set_counters(const srpc_counters_t *cnt);
+
+extern struct cfs_wi_sched *lst_sched_serial;
+extern struct cfs_wi_sched **lst_sched_test;
+
+static inline int
+srpc_serv_is_framework(struct srpc_service *svc)
+{
+ return svc->sv_id < SRPC_FRAMEWORK_SERVICE_MAX_ID;
+}
+
+static inline int
+swi_wi_action(cfs_workitem_t *wi)
+{
+ swi_workitem_t *swi = container_of(wi, swi_workitem_t, swi_workitem);
+
+ return swi->swi_action(swi);
+}
+
+static inline void
+swi_init_workitem(swi_workitem_t *swi, void *data,
+ swi_action_t action, struct cfs_wi_sched *sched)
+{
+ swi->swi_sched = sched;
+ swi->swi_action = action;
+ swi->swi_state = SWI_STATE_NEWBORN;
+ cfs_wi_init(&swi->swi_workitem, data, swi_wi_action);
+}
+
+static inline void
+swi_schedule_workitem(swi_workitem_t *wi)
+{
+ cfs_wi_schedule(wi->swi_sched, &wi->swi_workitem);
+}
+
+static inline void
+swi_exit_workitem(swi_workitem_t *swi)
+{
+ cfs_wi_exit(swi->swi_sched, &swi->swi_workitem);
+}
+
+static inline int
+swi_deschedule_workitem(swi_workitem_t *swi)
+{
+ return cfs_wi_deschedule(swi->swi_sched, &swi->swi_workitem);
+}
+
+
+int sfw_startup(void);
+int srpc_startup(void);
+void sfw_shutdown(void);
+void srpc_shutdown(void);
+
+static inline void
+srpc_destroy_client_rpc (srpc_client_rpc_t *rpc)
+{
+ LASSERT (rpc != NULL);
+ LASSERT (!srpc_event_pending(rpc));
+ LASSERT (atomic_read(&rpc->crpc_refcount) == 0);
+
+ if (rpc->crpc_fini == NULL) {
+ LIBCFS_FREE(rpc, srpc_client_rpc_size(rpc));
+ } else {
+ (*rpc->crpc_fini) (rpc);
+ }
+
+ return;
+}
+
+static inline void
+srpc_init_client_rpc (srpc_client_rpc_t *rpc, lnet_process_id_t peer,
+ int service, int nbulkiov, int bulklen,
+ void (*rpc_done)(srpc_client_rpc_t *),
+ void (*rpc_fini)(srpc_client_rpc_t *), void *priv)
+{
+ LASSERT (nbulkiov <= LNET_MAX_IOV);
+
+ memset(rpc, 0, offsetof(srpc_client_rpc_t,
+ crpc_bulk.bk_iovs[nbulkiov]));
+
+ INIT_LIST_HEAD(&rpc->crpc_list);
+ swi_init_workitem(&rpc->crpc_wi, rpc, srpc_send_rpc,
+ lst_sched_test[lnet_cpt_of_nid(peer.nid)]);
+ spin_lock_init(&rpc->crpc_lock);
+ atomic_set(&rpc->crpc_refcount, 1); /* 1 ref for caller */
+
+ rpc->crpc_dest = peer;
+ rpc->crpc_priv = priv;
+ rpc->crpc_service = service;
+ rpc->crpc_bulk.bk_len = bulklen;
+ rpc->crpc_bulk.bk_niov = nbulkiov;
+ rpc->crpc_done = rpc_done;
+ rpc->crpc_fini = rpc_fini;
+ LNetInvalidateHandle(&rpc->crpc_reqstmdh);
+ LNetInvalidateHandle(&rpc->crpc_replymdh);
+ LNetInvalidateHandle(&rpc->crpc_bulk.bk_mdh);
+
+ /* no event is expected at this point */
+ rpc->crpc_bulkev.ev_fired =
+ rpc->crpc_reqstev.ev_fired =
+ rpc->crpc_replyev.ev_fired = 1;
+
+ rpc->crpc_reqstmsg.msg_magic = SRPC_MSG_MAGIC;
+ rpc->crpc_reqstmsg.msg_version = SRPC_MSG_VERSION;
+ rpc->crpc_reqstmsg.msg_type = srpc_service2request(service);
+ return;
+}
+
+static inline const char *
+swi_state2str (int state)
+{
+#define STATE2STR(x) case x: return #x
+ switch(state) {
+ default:
+ LBUG();
+ STATE2STR(SWI_STATE_NEWBORN);
+ STATE2STR(SWI_STATE_REPLY_SUBMITTED);
+ STATE2STR(SWI_STATE_REPLY_SENT);
+ STATE2STR(SWI_STATE_REQUEST_SUBMITTED);
+ STATE2STR(SWI_STATE_REQUEST_SENT);
+ STATE2STR(SWI_STATE_REPLY_RECEIVED);
+ STATE2STR(SWI_STATE_BULK_STARTED);
+ STATE2STR(SWI_STATE_DONE);
+ }
+#undef STATE2STR
+}
+
+#define selftest_wait_events() \
+ do { \
+ set_current_state(TASK_UNINTERRUPTIBLE); \
+ schedule_timeout(cfs_time_seconds(1) / 10); \
+ } while (0)
+
+
+#define lst_wait_until(cond, lock, fmt, ...) \
+do { \
+ int __I = 2; \
+ while (!(cond)) { \
+ CDEBUG(IS_PO2(++__I) ? D_WARNING : D_NET, \
+ fmt, ## __VA_ARGS__); \
+ spin_unlock(&(lock)); \
+ \
+ selftest_wait_events(); \
+ \
+ spin_lock(&(lock)); \
+ } \
+} while (0)
+
+static inline void
+srpc_wait_service_shutdown(srpc_service_t *sv)
+{
+ int i = 2;
+
+ LASSERT(sv->sv_shuttingdown);
+
+ while (srpc_finish_service(sv) == 0) {
+ i++;
+ CDEBUG (((i & -i) == i) ? D_WARNING : D_NET,
+ "Waiting for %s service to shutdown...\n",
+ sv->sv_name);
+ selftest_wait_events();
+ }
+}
+
+extern sfw_test_client_ops_t brw_test_client;
+void brw_init_test_client(void);
+
+extern srpc_service_t brw_test_service;
+void brw_init_test_service(void);
+
+extern sfw_test_client_ops_t ping_test_client;
+void ping_init_test_client(void);
+
+extern srpc_service_t ping_test_service;
+void ping_init_test_service(void);
+
+#endif /* __SELFTEST_SELFTEST_H__ */
diff --git a/kernel/drivers/staging/lustre/lnet/selftest/timer.c b/kernel/drivers/staging/lustre/lnet/selftest/timer.c
new file mode 100644
index 000000000..441f9472a
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lnet/selftest/timer.c
@@ -0,0 +1,248 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/timer.c
+ *
+ * Author: Isaac Huang <isaac@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include "selftest.h"
+
+
+/*
+ * Timers are implemented as a sorted queue of expiry times. The queue
+ * is slotted, with each slot holding timers which expire in a
+ * 2**STTIMER_MINPOLL (8) second period. The timers in each slot are
+ * sorted by increasing expiry time. The number of slots is 2**7 (128),
+ * to cover a time period of 1024 seconds into the future before wrapping.
+ */
+#define STTIMER_MINPOLL 3 /* log2 min poll interval (8 s) */
+#define STTIMER_SLOTTIME (1 << STTIMER_MINPOLL)
+#define STTIMER_SLOTTIMEMASK (~(STTIMER_SLOTTIME - 1))
+#define STTIMER_NSLOTS (1 << 7)
+#define STTIMER_SLOT(t) (&stt_data.stt_hash[(((t) >> STTIMER_MINPOLL) & \
+ (STTIMER_NSLOTS - 1))])
+
+static struct st_timer_data {
+ spinlock_t stt_lock;
+ /* start time of the slot processed previously */
+ unsigned long stt_prev_slot;
+ struct list_head stt_hash[STTIMER_NSLOTS];
+ int stt_shuttingdown;
+ wait_queue_head_t stt_waitq;
+ int stt_nthreads;
+} stt_data;
+
+void
+stt_add_timer(stt_timer_t *timer)
+{
+ struct list_head *pos;
+
+ spin_lock(&stt_data.stt_lock);
+
+ LASSERT(stt_data.stt_nthreads > 0);
+ LASSERT(!stt_data.stt_shuttingdown);
+ LASSERT(timer->stt_func != NULL);
+ LASSERT(list_empty(&timer->stt_list));
+ LASSERT(cfs_time_after(timer->stt_expires, get_seconds()));
+
+ /* a simple insertion sort */
+ list_for_each_prev(pos, STTIMER_SLOT(timer->stt_expires)) {
+ stt_timer_t *old = list_entry(pos, stt_timer_t, stt_list);
+
+ if (cfs_time_aftereq(timer->stt_expires, old->stt_expires))
+ break;
+ }
+ list_add(&timer->stt_list, pos);
+
+ spin_unlock(&stt_data.stt_lock);
+}
+
+/*
+ * The function returns whether it has deactivated a pending timer or not.
+ * (ie. del_timer() of an inactive timer returns 0, del_timer() of an
+ * active timer returns 1.)
+ *
+ * CAVEAT EMPTOR:
+ * When 0 is returned, it is possible that timer->stt_func _is_ running on
+ * another CPU.
+ */
+int
+stt_del_timer(stt_timer_t *timer)
+{
+ int ret = 0;
+
+ spin_lock(&stt_data.stt_lock);
+
+ LASSERT(stt_data.stt_nthreads > 0);
+ LASSERT(!stt_data.stt_shuttingdown);
+
+ if (!list_empty(&timer->stt_list)) {
+ ret = 1;
+ list_del_init(&timer->stt_list);
+ }
+
+ spin_unlock(&stt_data.stt_lock);
+ return ret;
+}
+
+/* called with stt_data.stt_lock held */
+static int
+stt_expire_list(struct list_head *slot, unsigned long now)
+{
+ int expired = 0;
+ stt_timer_t *timer;
+
+ while (!list_empty(slot)) {
+ timer = list_entry(slot->next, stt_timer_t, stt_list);
+
+ if (cfs_time_after(timer->stt_expires, now))
+ break;
+
+ list_del_init(&timer->stt_list);
+ spin_unlock(&stt_data.stt_lock);
+
+ expired++;
+ (*timer->stt_func) (timer->stt_data);
+
+ spin_lock(&stt_data.stt_lock);
+ }
+
+ return expired;
+}
+
+static int
+stt_check_timers(unsigned long *last)
+{
+ int expired = 0;
+ unsigned long now;
+ unsigned long this_slot;
+
+ now = get_seconds();
+ this_slot = now & STTIMER_SLOTTIMEMASK;
+
+ spin_lock(&stt_data.stt_lock);
+
+ while (cfs_time_aftereq(this_slot, *last)) {
+ expired += stt_expire_list(STTIMER_SLOT(this_slot), now);
+ this_slot = cfs_time_sub(this_slot, STTIMER_SLOTTIME);
+ }
+
+ *last = now & STTIMER_SLOTTIMEMASK;
+ spin_unlock(&stt_data.stt_lock);
+ return expired;
+}
+
+
+static int
+stt_timer_main(void *arg)
+{
+ cfs_block_allsigs();
+
+ while (!stt_data.stt_shuttingdown) {
+ stt_check_timers(&stt_data.stt_prev_slot);
+
+ wait_event_timeout(stt_data.stt_waitq,
+ stt_data.stt_shuttingdown,
+ cfs_time_seconds(STTIMER_SLOTTIME));
+ }
+
+ spin_lock(&stt_data.stt_lock);
+ stt_data.stt_nthreads--;
+ spin_unlock(&stt_data.stt_lock);
+ return 0;
+}
+
+static int
+stt_start_timer_thread(void)
+{
+ struct task_struct *task;
+
+ LASSERT(!stt_data.stt_shuttingdown);
+
+ task = kthread_run(stt_timer_main, NULL, "st_timer");
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+
+ spin_lock(&stt_data.stt_lock);
+ stt_data.stt_nthreads++;
+ spin_unlock(&stt_data.stt_lock);
+ return 0;
+}
+
+
+int
+stt_startup(void)
+{
+ int rc = 0;
+ int i;
+
+ stt_data.stt_shuttingdown = 0;
+ stt_data.stt_prev_slot = get_seconds() & STTIMER_SLOTTIMEMASK;
+
+ spin_lock_init(&stt_data.stt_lock);
+ for (i = 0; i < STTIMER_NSLOTS; i++)
+ INIT_LIST_HEAD(&stt_data.stt_hash[i]);
+
+ stt_data.stt_nthreads = 0;
+ init_waitqueue_head(&stt_data.stt_waitq);
+ rc = stt_start_timer_thread();
+ if (rc != 0)
+ CERROR("Can't spawn timer thread: %d\n", rc);
+
+ return rc;
+}
+
+void
+stt_shutdown(void)
+{
+ int i;
+
+ spin_lock(&stt_data.stt_lock);
+
+ for (i = 0; i < STTIMER_NSLOTS; i++)
+ LASSERT(list_empty(&stt_data.stt_hash[i]));
+
+ stt_data.stt_shuttingdown = 1;
+
+ wake_up(&stt_data.stt_waitq);
+ lst_wait_until(stt_data.stt_nthreads == 0, stt_data.stt_lock,
+ "waiting for %d threads to terminate\n",
+ stt_data.stt_nthreads);
+
+ spin_unlock(&stt_data.stt_lock);
+}
diff --git a/kernel/drivers/staging/lustre/lnet/selftest/timer.h b/kernel/drivers/staging/lustre/lnet/selftest/timer.h
new file mode 100644
index 000000000..d727c1e2b
--- /dev/null
+++ b/kernel/drivers/staging/lustre/lnet/selftest/timer.h
@@ -0,0 +1,53 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/timer.h
+ *
+ * Author: Isaac Huang <isaac@clusterfs.com>
+ */
+#ifndef __SELFTEST_TIMER_H__
+#define __SELFTEST_TIMER_H__
+
+typedef struct {
+ struct list_head stt_list;
+ unsigned long stt_expires;
+ void (*stt_func) (void *);
+ void *stt_data;
+} stt_timer_t;
+
+void stt_add_timer (stt_timer_t *timer);
+int stt_del_timer (stt_timer_t *timer);
+int stt_startup (void);
+void stt_shutdown (void);
+
+#endif /* __SELFTEST_TIMER_H__ */