summaryrefslogtreecommitdiffstats
path: root/kernel/net/sunrpc/xprtrdma
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/net/sunrpc/xprtrdma')
-rw-r--r--kernel/net/sunrpc/xprtrdma/Makefile15
-rw-r--r--kernel/net/sunrpc/xprtrdma/backchannel.c394
-rw-r--r--kernel/net/sunrpc/xprtrdma/fmr_ops.c120
-rw-r--r--kernel/net/sunrpc/xprtrdma/frwr_ops.c343
-rw-r--r--kernel/net/sunrpc/xprtrdma/module.c46
-rw-r--r--kernel/net/sunrpc/xprtrdma/physical_ops.c31
-rw-r--r--kernel/net/sunrpc/xprtrdma/rpc_rdma.c353
-rw-r--r--kernel/net/sunrpc/xprtrdma/svc_rdma.c14
-rw-r--r--kernel/net/sunrpc/xprtrdma/svc_rdma_marshal.c140
-rw-r--r--kernel/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c141
-rw-r--r--kernel/net/sunrpc/xprtrdma/svc_rdma_sendto.c117
-rw-r--r--kernel/net/sunrpc/xprtrdma/svc_rdma_transport.c256
-rw-r--r--kernel/net/sunrpc/xprtrdma/transport.c149
-rw-r--r--kernel/net/sunrpc/xprtrdma/verbs.c1029
-rw-r--r--kernel/net/sunrpc/xprtrdma/xprt_rdma.h129
15 files changed, 1869 insertions, 1408 deletions
diff --git a/kernel/net/sunrpc/xprtrdma/Makefile b/kernel/net/sunrpc/xprtrdma/Makefile
index 579f72bbc..33f99d300 100644
--- a/kernel/net/sunrpc/xprtrdma/Makefile
+++ b/kernel/net/sunrpc/xprtrdma/Makefile
@@ -1,9 +1,8 @@
-obj-$(CONFIG_SUNRPC_XPRT_RDMA_CLIENT) += xprtrdma.o
+obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o
-xprtrdma-y := transport.o rpc_rdma.o verbs.o \
- fmr_ops.o frwr_ops.o physical_ops.o
-
-obj-$(CONFIG_SUNRPC_XPRT_RDMA_SERVER) += svcrdma.o
-
-svcrdma-y := svc_rdma.o svc_rdma_transport.o \
- svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o
+rpcrdma-y := transport.o rpc_rdma.o verbs.o \
+ fmr_ops.o frwr_ops.o physical_ops.o \
+ svc_rdma.o svc_rdma_transport.o \
+ svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o \
+ module.o
+rpcrdma-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel.o
diff --git a/kernel/net/sunrpc/xprtrdma/backchannel.c b/kernel/net/sunrpc/xprtrdma/backchannel.c
new file mode 100644
index 000000000..2dcb44f69
--- /dev/null
+++ b/kernel/net/sunrpc/xprtrdma/backchannel.c
@@ -0,0 +1,394 @@
+/*
+ * Copyright (c) 2015 Oracle. All rights reserved.
+ *
+ * Support for backward direction RPCs on RPC/RDMA.
+ */
+
+#include <linux/module.h>
+#include <linux/sunrpc/xprt.h>
+#include <linux/sunrpc/svc.h>
+#include <linux/sunrpc/svc_xprt.h>
+
+#include "xprt_rdma.h"
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+# define RPCDBG_FACILITY RPCDBG_TRANS
+#endif
+
+#define RPCRDMA_BACKCHANNEL_DEBUG
+
+/* Release one backchannel rpc_rqst: unlink the rpcrdma_req behind it
+ * from the rl_all list (under rb_reqslock), destroy that req, then
+ * free the rpc_rqst itself.
+ */
+static void rpcrdma_bc_free_rqst(struct rpcrdma_xprt *r_xprt,
+				 struct rpc_rqst *rqst)
+{
+	struct rpcrdma_buffer *buffers = &r_xprt->rx_buf;
+	struct rpcrdma_req *bc_req = rpcr_to_rdmar(rqst);
+
+	spin_lock(&buffers->rb_reqslock);
+	list_del(&bc_req->rl_all);
+	spin_unlock(&buffers->rb_reqslock);
+
+	rpcrdma_destroy_req(&r_xprt->rx_ia, bc_req);
+	kfree(rqst);
+}
+
+/* Attach transport resources to one pre-allocated backchannel rpc_rqst.
+ *
+ * Creates an rpcrdma_req flagged for backchannel use plus two registered
+ * buffers: rl_rdmabuf (RPCRDMA_INLINE_WRITE_THRESHOLD bytes) and
+ * rl_sendbuf (that size plus RPCRDMA_INLINE_READ_THRESHOLD bytes).
+ * rq_buffer and rq_snd_buf are then pointed at the send buffer.
+ *
+ * Returns 0 on success. On any failure returns -ENOMEM and @rqst has
+ * been freed, together with whatever was allocated for it; the caller
+ * must not touch @rqst again.
+ */
+static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt,
+				 struct rpc_rqst *rqst)
+{
+	struct rpcrdma_buffer *buffers = &r_xprt->rx_buf;
+	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+	struct rpcrdma_regbuf *rb;
+	struct rpcrdma_req *req;
+	struct xdr_buf *buf;
+	size_t size;
+
+	/* The failure convention of rpcrdma_create_req() is not visible
+	 * here; the sibling allocators used in this file report failure
+	 * with ERR_PTR, so reject both NULL and ERR_PTR values.
+	 */
+	req = rpcrdma_create_req(r_xprt);
+	if (IS_ERR_OR_NULL(req))
+		goto out_free_rqst;
+	req->rl_backchannel = true;
+
+	size = RPCRDMA_INLINE_WRITE_THRESHOLD(rqst);
+	rb = rpcrdma_alloc_regbuf(ia, size, GFP_KERNEL);
+	if (IS_ERR(rb))
+		goto out_fail;
+	req->rl_rdmabuf = rb;
+
+	size += RPCRDMA_INLINE_READ_THRESHOLD(rqst);
+	rb = rpcrdma_alloc_regbuf(ia, size, GFP_KERNEL);
+	if (IS_ERR(rb))
+		goto out_fail;
+	rb->rg_owner = req;
+	req->rl_sendbuf = rb;
+	/* so that rpcr_to_rdmar works when receiving a request */
+	rqst->rq_buffer = (void *)req->rl_sendbuf->rg_base;
+
+	buf = &rqst->rq_snd_buf;
+	buf->head[0].iov_base = rqst->rq_buffer;
+	buf->head[0].iov_len = 0;
+	buf->tail[0].iov_base = NULL;
+	buf->tail[0].iov_len = 0;
+	buf->page_len = 0;
+	buf->len = 0;
+	buf->buflen = size;
+
+	return 0;
+
+out_fail:
+	/* rq_buffer has not been set yet, so rpcr_to_rdmar(rqst) cannot
+	 * find "req" and rpcrdma_bc_free_rqst() must not be used here.
+	 * Perform the same teardown with the req pointer already in hand.
+	 * NOTE(review): assumes rpcrdma_destroy_req() tolerates a req
+	 * whose rl_rdmabuf/rl_sendbuf are still unset - confirm.
+	 */
+	spin_lock(&buffers->rb_reqslock);
+	list_del(&req->rl_all);
+	spin_unlock(&buffers->rb_reqslock);
+	rpcrdma_destroy_req(ia, req);
+out_free_rqst:
+	/* The caller only frees rqsts that reached bc_pa_list */
+	kfree(rqst);
+	return -ENOMEM;
+}
+
+/* Grow the transport's pool of receive buffers by @count reps, pushing
+ * each new rep onto rb_recv_bufs. Stops at the first allocation
+ * failure and returns that error; otherwise returns 0. Reps added
+ * here are released when the transport is destroyed.
+ */
+static int rpcrdma_bc_setup_reps(struct rpcrdma_xprt *r_xprt,
+				 unsigned int count)
+{
+	struct rpcrdma_buffer *pool = &r_xprt->rx_buf;
+	unsigned long irqflags;
+	unsigned int remaining;
+
+	for (remaining = count; remaining; remaining--) {
+		struct rpcrdma_rep *new_rep = rpcrdma_create_rep(r_xprt);
+
+		if (IS_ERR(new_rep)) {
+			pr_err("RPC: %s: reply buffer alloc failed\n",
+			       __func__);
+			return PTR_ERR(new_rep);
+		}
+
+		spin_lock_irqsave(&pool->rb_lock, irqflags);
+		list_add(&new_rep->rr_list, &pool->rb_recv_bufs);
+		spin_unlock_irqrestore(&pool->rb_lock, irqflags);
+	}
+
+	return 0;
+}
+
+/**
+ * xprt_rdma_bc_setup - Pre-allocate resources for handling backchannel requests
+ * @xprt: transport associated with these backchannel resources
+ * @reqs: number of concurrent incoming requests to expect
+ *
+ * Allocates twice @reqs rpc_rqsts onto xprt->bc_pa_list, adds @reqs
+ * receive buffers, posts @reqs extra receives, and records @reqs as
+ * rb_bc_srv_max_requests.
+ *
+ * Returns 0 on success; otherwise a negative errno
+ * (every failure path in this function reports -ENOMEM).
+ */
+int xprt_rdma_bc_setup(struct rpc_xprt *xprt, unsigned int reqs)
+{
+ struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+ struct rpcrdma_buffer *buffer = &r_xprt->rx_buf;
+ struct rpc_rqst *rqst;
+ unsigned int i;
+ int rc;
+
+ /* The backchannel reply path returns each rpc_rqst to the
+ * bc_pa_list _after_ the reply is sent. If the server is
+ * faster than the client, it can send another backward
+ * direction request before the rpc_rqst is returned to the
+ * list. The client rejects the request in this case.
+ *
+ * Twice as many rpc_rqsts are prepared to ensure there is
+ * always an rpc_rqst available as soon as a reply is sent.
+ */
+ /* Because of the doubling, @reqs is capped at half of
+ * RPCRDMA_BACKWARD_WRS.
+ */
+ if (reqs > RPCRDMA_BACKWARD_WRS >> 1)
+ goto out_err;
+
+ for (i = 0; i < (reqs << 1); i++) {
+ rqst = kzalloc(sizeof(*rqst), GFP_KERNEL);
+ if (!rqst) {
+ pr_err("RPC: %s: Failed to create bc rpc_rqst\n",
+ __func__);
+ goto out_free;
+ }
+
+ rqst->rq_xprt = &r_xprt->rx_xprt;
+ INIT_LIST_HEAD(&rqst->rq_list);
+ INIT_LIST_HEAD(&rqst->rq_bc_list);
+
+ /* rqst is not on bc_pa_list yet, so out_free will not see it */
+ if (rpcrdma_bc_setup_rqst(r_xprt, rqst))
+ goto out_free;
+
+ spin_lock_bh(&xprt->bc_pa_lock);
+ list_add(&rqst->rq_bc_pa_list, &xprt->bc_pa_list);
+ spin_unlock_bh(&xprt->bc_pa_lock);
+ }
+
+ rc = rpcrdma_bc_setup_reps(r_xprt, reqs);
+ if (rc)
+ goto out_free;
+
+ rc = rpcrdma_ep_post_extra_recv(r_xprt, reqs);
+ if (rc)
+ goto out_free;
+
+ buffer->rb_bc_srv_max_requests = reqs;
+ /* The result of request_module() is not checked */
+ request_module("svcrdma");
+
+ return 0;
+
+out_free:
+ /* Frees every rqst that made it onto bc_pa_list */
+ xprt_rdma_bc_destroy(xprt, reqs);
+
+out_err:
+ /* NOTE(review): rc from the helpers above is discarded here */
+ pr_err("RPC: %s: setup backchannel transport failed\n", __func__);
+ return -ENOMEM;
+}
+
+/**
+ * xprt_rdma_bc_up - Create transport endpoint for backchannel service
+ * @serv: server endpoint
+ * @net: network namespace
+ *
+ * The "xprt" is an implied argument: it supplies the name of the
+ * backchannel transport class ("rdma-bc").
+ *
+ * Returns zero on success, negative errno on failure
+ */
+int xprt_rdma_bc_up(struct svc_serv *serv, struct net *net)
+{
+	int status = svc_create_xprt(serv, "rdma-bc", net, PF_INET, 0, 0);
+
+	/* Any non-negative result is reported to the caller as zero */
+	return status < 0 ? status : 0;
+}
+
+/**
+ * rpcrdma_bc_marshal_reply - Marshal a backwards direction reply
+ * @rqst: buffer containing RPC reply data
+ *
+ * Builds a minimal RPC/RDMA header (rdma_msg with three empty chunk
+ * lists) in the req's rl_rdmabuf, then describes the message with two
+ * send SGEs: the header (RPCRDMA_HDRLEN_MIN bytes), followed by the
+ * first rq_svec[0].iov_len bytes of rl_sendbuf. Nothing is posted here.
+ *
+ * Returns zero on success.
+ */
+int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst)
+{
+	struct rpc_xprt *xprt = rqst->rq_xprt;
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+	struct rpcrdma_msg *headerp;
+	size_t rpclen;
+
+	headerp = rdmab_to_msg(req->rl_rdmabuf);
+	headerp->rm_xid = rqst->rq_xid;
+	headerp->rm_vers = rpcrdma_version;
+	headerp->rm_credit =
+			cpu_to_be32(r_xprt->rx_buf.rb_bc_srv_max_requests);
+	headerp->rm_type = rdma_msg;
+	headerp->rm_body.rm_chunks[0] = xdr_zero;
+	headerp->rm_body.rm_chunks[1] = xdr_zero;
+	headerp->rm_body.rm_chunks[2] = xdr_zero;
+
+	rpclen = rqst->rq_svec[0].iov_len;
+
+	/* Hex dumps of every reply are debugging aids; gate them the same
+	 * way as the other dumps in this file.
+	 */
+#ifdef RPCRDMA_BACKCHANNEL_DEBUG
+	pr_info("RPC: %s: rpclen %zu headerp 0x%p lkey 0x%x\n",
+		__func__, rpclen, headerp, rdmab_lkey(req->rl_rdmabuf));
+	pr_info("RPC: %s: RPC/RDMA: %*ph\n",
+		__func__, (int)RPCRDMA_HDRLEN_MIN, headerp);
+	pr_info("RPC: %s: RPC: %*ph\n",
+		__func__, (int)rpclen, rqst->rq_svec[0].iov_base);
+#endif
+
+	/* SGE 0: the RPC/RDMA header */
+	req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf);
+	req->rl_send_iov[0].length = RPCRDMA_HDRLEN_MIN;
+	req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf);
+
+	/* SGE 1: the RPC reply held in the send buffer */
+	req->rl_send_iov[1].addr = rdmab_addr(req->rl_sendbuf);
+	req->rl_send_iov[1].length = rpclen;
+	req->rl_send_iov[1].lkey = rdmab_lkey(req->rl_sendbuf);
+
+	req->rl_niovs = 2;
+	return 0;
+}
+
+/**
+ * xprt_rdma_bc_destroy - Release resources for handling backchannel requests
+ * @xprt: transport associated with these backchannel resources
+ * @reqs: number of incoming requests to destroy; ignored
+ *
+ * Removes and frees every rpc_rqst currently on xprt->bc_pa_list.
+ */
+void xprt_rdma_bc_destroy(struct rpc_xprt *xprt, unsigned int reqs)
+{
+ struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+ struct rpc_rqst *rqst, *tmp;
+
+ spin_lock_bh(&xprt->bc_pa_lock);
+ list_for_each_entry_safe(rqst, tmp, &xprt->bc_pa_list, rq_bc_pa_list) {
+ list_del(&rqst->rq_bc_pa_list);
+ /* bc_pa_lock is dropped while the rqst is freed ... */
+ spin_unlock_bh(&xprt->bc_pa_lock);
+
+ rpcrdma_bc_free_rqst(r_xprt, rqst);
+
+ /* ... and re-taken before the walk continues */
+ spin_lock_bh(&xprt->bc_pa_lock);
+ }
+ spin_unlock_bh(&xprt->bc_pa_lock);
+}
+
+/**
+ * xprt_rdma_bc_free_rqst - Release a backchannel rqst
+ * @rqst: request to release
+ *
+ * Clears RPC_BC_PA_IN_USE (warning once if it was not set) and puts
+ * the rqst back on the tail of its transport's bc_pa_list. No memory
+ * is freed here.
+ */
+void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst)
+{
+ struct rpc_xprt *xprt = rqst->rq_xprt;
+
+ /* The bit clear is bracketed by explicit memory barriers */
+ smp_mb__before_atomic();
+ WARN_ON_ONCE(!test_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state));
+ clear_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state);
+ smp_mb__after_atomic();
+
+ spin_lock_bh(&xprt->bc_pa_lock);
+ list_add_tail(&rqst->rq_bc_pa_list, &xprt->bc_pa_list);
+ spin_unlock_bh(&xprt->bc_pa_lock);
+}
+
+/**
+ * rpcrdma_bc_receive_call - Handle a backward direction call
+ * @xprt: transport receiving the call
+ * @rep: receive buffer containing the call
+ *
+ * Called in the RPC reply handler, which runs in a tasklet.
+ * Be quick about it.
+ *
+ * Takes a free rpc_rqst from bc_pa_list, points its rq_rcv_buf at the
+ * bytes following the RPC/RDMA header in @rep, attaches @rep to the
+ * rqst's rpcrdma_req, and queues the rqst on the callback service's
+ * sv_cb_list before waking sv_cb_waitq.
+ *
+ * Operational assumptions:
+ * o Backchannel credits are ignored, just as the NFS server
+ * forechannel currently does
+ * o The ULP manages a replay cache (eg, NFSv4.1 sessions).
+ * No replay detection is done at the transport level
+ */
+void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_rep *rep)
+{
+ struct rpc_xprt *xprt = &r_xprt->rx_xprt;
+ struct rpcrdma_msg *headerp;
+ struct svc_serv *bc_serv;
+ struct rpcrdma_req *req;
+ struct rpc_rqst *rqst;
+ struct xdr_buf *buf;
+ size_t size;
+ __be32 *p;
+
+ headerp = rdmab_to_msg(rep->rr_rdmabuf);
+#ifdef RPCRDMA_BACKCHANNEL_DEBUG
+ /* NOTE(review): this dump runs before rr_len is sanity checked */
+ pr_info("RPC: %s: callback XID %08x, length=%u\n",
+ __func__, be32_to_cpu(headerp->rm_xid), rep->rr_len);
+ pr_info("RPC: %s: %*ph\n", __func__, rep->rr_len, headerp);
+#endif
+
+ /* Sanity check:
+ * Need at least enough bytes for RPC/RDMA header, as code
+ * here references the header fields by array offset. Also,
+ * backward calls are always inline, so ensure there
+ * are some bytes beyond the RPC/RDMA header.
+ */
+ if (rep->rr_len < RPCRDMA_HDRLEN_MIN + 24)
+ goto out_short;
+ /* p and size describe the payload that follows the header */
+ p = (__be32 *)((unsigned char *)headerp + RPCRDMA_HDRLEN_MIN);
+ size = rep->rr_len - RPCRDMA_HDRLEN_MIN;
+
+ /* Grab a free bc rqst */
+ spin_lock(&xprt->bc_pa_lock);
+ if (list_empty(&xprt->bc_pa_list)) {
+ spin_unlock(&xprt->bc_pa_lock);
+ goto out_overflow;
+ }
+ rqst = list_first_entry(&xprt->bc_pa_list,
+ struct rpc_rqst, rq_bc_pa_list);
+ list_del(&rqst->rq_bc_pa_list);
+ spin_unlock(&xprt->bc_pa_lock);
+#ifdef RPCRDMA_BACKCHANNEL_DEBUG
+ pr_info("RPC: %s: using rqst %p\n", __func__, rqst);
+#endif
+
+ /* Prepare rqst */
+ rqst->rq_reply_bytes_recvd = 0;
+ rqst->rq_bytes_sent = 0;
+ rqst->rq_xid = headerp->rm_xid;
+ set_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state);
+
+ /* The receive xdr_buf is a single head kvec aimed into @rep */
+ buf = &rqst->rq_rcv_buf;
+ memset(buf, 0, sizeof(*buf));
+ buf->head[0].iov_base = p;
+ buf->head[0].iov_len = size;
+ buf->len = size;
+
+ /* The receive buffer has to be hooked to the rpcrdma_req
+ * so that it can be reposted after the server is done
+ * parsing it but just before sending the backward
+ * direction reply.
+ */
+ req = rpcr_to_rdmar(rqst);
+#ifdef RPCRDMA_BACKCHANNEL_DEBUG
+ pr_info("RPC: %s: attaching rep %p to req %p\n",
+ __func__, rep, req);
+#endif
+ req->rl_reply = rep;
+
+ /* Defeat the retransmit detection logic in send_request */
+ req->rl_connect_cookie = 0;
+
+ /* Queue rqst for ULP's callback service */
+ bc_serv = xprt->bc_serv;
+ spin_lock(&bc_serv->sv_cb_lock);
+ list_add(&rqst->rq_bc_list, &bc_serv->sv_cb_list);
+ spin_unlock(&bc_serv->sv_cb_lock);
+
+ wake_up(&bc_serv->sv_cb_waitq);
+
+ r_xprt->rx_stats.bcall_count++;
+ return;
+
+out_overflow:
+ /* No free rqst was available on bc_pa_list */
+ pr_warn("RPC/RDMA backchannel overflow\n");
+ xprt_disconnect_done(xprt);
+ /* This receive buffer gets reposted automatically
+ * when the connection is re-established.
+ */
+ return;
+
+out_short:
+ pr_warn("RPC/RDMA short backward direction call\n");
+
+ /* Repost the buffer; if the repost fails, xprt_disconnect_done() is called */
+ if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep))
+ xprt_disconnect_done(xprt);
+ else
+ pr_warn("RPC: %s: reposting rep %p\n",
+ __func__, rep);
+}
diff --git a/kernel/net/sunrpc/xprtrdma/fmr_ops.c b/kernel/net/sunrpc/xprtrdma/fmr_ops.c
index 302d4ebf6..f1e8dafbd 100644
--- a/kernel/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/kernel/net/sunrpc/xprtrdma/fmr_ops.c
@@ -11,6 +11,21 @@
* can take tens of usecs to complete.
*/
+/* Normal operation
+ *
+ * A Memory Region is prepared for RDMA READ or WRITE using the
+ * ib_map_phys_fmr verb (fmr_op_map). When the RDMA operation is
+ * finished, the Memory Region is unmapped using the ib_unmap_fmr
+ * verb (fmr_op_unmap).
+ */
+
+/* Transport recovery
+ *
+ * After a transport reconnect, fmr_op_map re-uses the MR already
+ * allocated for the RPC, but generates a fresh rkey then maps the
+ * MR again. This process is synchronous.
+ */
+
#include "xprt_rdma.h"
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
@@ -50,19 +65,28 @@ fmr_op_init(struct rpcrdma_xprt *r_xprt)
struct rpcrdma_mw *r;
int i, rc;
+ spin_lock_init(&buf->rb_mwlock);
INIT_LIST_HEAD(&buf->rb_mws);
INIT_LIST_HEAD(&buf->rb_all);
- i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
- dprintk("RPC: %s: initializing %d FMRs\n", __func__, i);
+ i = max_t(int, RPCRDMA_MAX_DATA_SEGS / RPCRDMA_MAX_FMR_SGES, 1);
+ i += 2; /* head + tail */
+ i *= buf->rb_max_requests; /* one set for each RPC slot */
+ dprintk("RPC: %s: initalizing %d FMRs\n", __func__, i);
+ rc = -ENOMEM;
while (i--) {
r = kzalloc(sizeof(*r), GFP_KERNEL);
if (!r)
- return -ENOMEM;
+ goto out;
- r->r.fmr = ib_alloc_fmr(pd, mr_access_flags, &fmr_attr);
- if (IS_ERR(r->r.fmr))
+ r->r.fmr.physaddrs = kmalloc(RPCRDMA_MAX_FMR_SGES *
+ sizeof(u64), GFP_KERNEL);
+ if (!r->r.fmr.physaddrs)
+ goto out_free;
+
+ r->r.fmr.fmr = ib_alloc_fmr(pd, mr_access_flags, &fmr_attr);
+ if (IS_ERR(r->r.fmr.fmr))
goto out_fmr_err;
list_add(&r->mw_list, &buf->rb_mws);
@@ -71,12 +95,24 @@ fmr_op_init(struct rpcrdma_xprt *r_xprt)
return 0;
out_fmr_err:
- rc = PTR_ERR(r->r.fmr);
+ rc = PTR_ERR(r->r.fmr.fmr);
dprintk("RPC: %s: ib_alloc_fmr status %i\n", __func__, rc);
+ kfree(r->r.fmr.physaddrs);
+out_free:
kfree(r);
+out:
return rc;
}
+/* Unmap one FMR by handing ib_unmap_fmr() a list that holds only it.
+ * Returns whatever ib_unmap_fmr() returns.
+ */
+static int
+__fmr_unmap(struct rpcrdma_mw *r)
+{
+	LIST_HEAD(unmap_list);
+
+	list_add(&r->r.fmr.fmr->list, &unmap_list);
+	return ib_unmap_fmr(&unmap_list);
+}
+
/* Use the ib_map_phys_fmr() verb to register a memory region
* for remote access via RDMA READ or RDMA WRITE.
*/
@@ -85,12 +121,24 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
int nsegs, bool writing)
{
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct ib_device *device = ia->ri_id->device;
+ struct ib_device *device = ia->ri_device;
enum dma_data_direction direction = rpcrdma_data_dir(writing);
struct rpcrdma_mr_seg *seg1 = seg;
- struct rpcrdma_mw *mw = seg1->rl_mw;
- u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
int len, pageoff, i, rc;
+ struct rpcrdma_mw *mw;
+
+ mw = seg1->rl_mw;
+ seg1->rl_mw = NULL;
+ if (!mw) {
+ mw = rpcrdma_get_mw(r_xprt);
+ if (!mw)
+ return -ENOMEM;
+ } else {
+ /* this is a retransmit; generate a fresh rkey */
+ rc = __fmr_unmap(mw);
+ if (rc)
+ return rc;
+ }
pageoff = offset_in_page(seg1->mr_offset);
seg1->mr_offset -= pageoff; /* start of page */
@@ -100,7 +148,7 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
nsegs = RPCRDMA_MAX_FMR_SGES;
for (i = 0; i < nsegs;) {
rpcrdma_map_one(device, seg, direction);
- physaddrs[i] = seg->mr_dma;
+ mw->r.fmr.physaddrs[i] = seg->mr_dma;
len += seg->mr_len;
++seg;
++i;
@@ -110,11 +158,13 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
break;
}
- rc = ib_map_phys_fmr(mw->r.fmr, physaddrs, i, seg1->mr_dma);
+ rc = ib_map_phys_fmr(mw->r.fmr.fmr, mw->r.fmr.physaddrs,
+ i, seg1->mr_dma);
if (rc)
goto out_maperr;
- seg1->mr_rkey = mw->r.fmr->rkey;
+ seg1->rl_mw = mw;
+ seg1->mr_rkey = mw->r.fmr.fmr->rkey;
seg1->mr_base = seg1->mr_dma + pageoff;
seg1->mr_nsegs = i;
seg1->mr_len = len;
@@ -137,48 +187,28 @@ fmr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
{
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
struct rpcrdma_mr_seg *seg1 = seg;
- struct ib_device *device;
+ struct rpcrdma_mw *mw = seg1->rl_mw;
int rc, nsegs = seg->mr_nsegs;
- LIST_HEAD(l);
- list_add(&seg1->rl_mw->r.fmr->list, &l);
- rc = ib_unmap_fmr(&l);
- read_lock(&ia->ri_qplock);
- device = ia->ri_id->device;
+ dprintk("RPC: %s: FMR %p\n", __func__, mw);
+
+ seg1->rl_mw = NULL;
while (seg1->mr_nsegs--)
- rpcrdma_unmap_one(device, seg++);
- read_unlock(&ia->ri_qplock);
+ rpcrdma_unmap_one(ia->ri_device, seg++);
+ rc = __fmr_unmap(mw);
if (rc)
goto out_err;
+ rpcrdma_put_mw(r_xprt, mw);
return nsegs;
out_err:
+ /* The FMR is abandoned, but remains in rb_all. fmr_op_destroy
+ * will attempt to release it when the transport is destroyed.
+ */
dprintk("RPC: %s: ib_unmap_fmr status %i\n", __func__, rc);
return nsegs;
}
-/* After a disconnect, unmap all FMRs.
- *
- * This is invoked only in the transport connect worker in order
- * to serialize with rpcrdma_register_fmr_external().
- */
-static void
-fmr_op_reset(struct rpcrdma_xprt *r_xprt)
-{
- struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
- struct rpcrdma_mw *r;
- LIST_HEAD(list);
- int rc;
-
- list_for_each_entry(r, &buf->rb_all, mw_all)
- list_add(&r->r.fmr->list, &list);
-
- rc = ib_unmap_fmr(&list);
- if (rc)
- dprintk("RPC: %s: ib_unmap_fmr failed %i\n",
- __func__, rc);
-}
-
static void
fmr_op_destroy(struct rpcrdma_buffer *buf)
{
@@ -188,10 +218,13 @@ fmr_op_destroy(struct rpcrdma_buffer *buf)
while (!list_empty(&buf->rb_all)) {
r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
list_del(&r->mw_all);
- rc = ib_dealloc_fmr(r->r.fmr);
+ kfree(r->r.fmr.physaddrs);
+
+ rc = ib_dealloc_fmr(r->r.fmr.fmr);
if (rc)
dprintk("RPC: %s: ib_dealloc_fmr failed %i\n",
__func__, rc);
+
kfree(r);
}
}
@@ -202,7 +235,6 @@ const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
.ro_open = fmr_op_open,
.ro_maxpages = fmr_op_maxpages,
.ro_init = fmr_op_init,
- .ro_reset = fmr_op_reset,
.ro_destroy = fmr_op_destroy,
.ro_displayname = "fmr",
};
diff --git a/kernel/net/sunrpc/xprtrdma/frwr_ops.c b/kernel/net/sunrpc/xprtrdma/frwr_ops.c
index dff0481db..88cf9e726 100644
--- a/kernel/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/kernel/net/sunrpc/xprtrdma/frwr_ops.c
@@ -11,12 +11,136 @@
* but most complex memory registration mode.
*/
+/* Normal operation
+ *
+ * A Memory Region is prepared for RDMA READ or WRITE using a FAST_REG
+ * Work Request (frwr_op_map). When the RDMA operation is finished, this
+ * Memory Region is invalidated using a LOCAL_INV Work Request
+ * (frwr_op_unmap).
+ *
+ * Typically these Work Requests are not signaled, and neither are RDMA
+ * SEND Work Requests (with the exception of signaling occasionally to
+ * prevent provider work queue overflows). This greatly reduces HCA
+ * interrupt workload.
+ *
+ * As an optimization, frwr_op_unmap marks MRs INVALID before the
+ * LOCAL_INV WR is posted. If posting succeeds, the MR is placed on
+ * rb_mws immediately so that no work (like managing a linked list
+ * under a spinlock) is needed in the completion upcall.
+ *
+ * But this means that frwr_op_map() can occasionally encounter an MR
+ * that is INVALID but the LOCAL_INV WR has not completed. Work Queue
+ * ordering prevents a subsequent FAST_REG WR from executing against
+ * that MR while it is still being invalidated.
+ */
+
+/* Transport recovery
+ *
+ * ->op_map and the transport connect worker cannot run at the same
+ * time, but ->op_unmap can fire while the transport connect worker
+ * is running. Thus MR recovery is handled in ->op_map, to guarantee
+ * that recovered MRs are owned by a sending RPC, and not one where
+ * ->op_unmap could fire at the same time transport reconnect is
+ * being done.
+ *
+ * When the underlying transport disconnects, MRs are left in one of
+ * three states:
+ *
+ * INVALID: The MR was not in use before the QP entered ERROR state.
+ * (Or, the LOCAL_INV WR has not completed or flushed yet).
+ *
+ * STALE: The MR was being registered or unregistered when the QP
+ * entered ERROR state, and the pending WR was flushed.
+ *
+ * VALID: The MR was registered before the QP entered ERROR state.
+ *
+ * When frwr_op_map encounters STALE and VALID MRs, they are recovered
+ * with ib_dereg_mr and then are re-initialized. Because MR recovery
+ * allocates fresh resources, it is deferred to a workqueue, and the
+ * recovered MRs are placed back on the rb_mws list when recovery is
+ * complete. frwr_op_map allocates another MR for the current RPC while
+ * the broken MR is reset.
+ *
+ * To ensure that frwr_op_map doesn't encounter an MR that is marked
+ * INVALID but that is about to be flushed due to a previous transport
+ * disconnect, the transport connect worker attempts to drain all
+ * pending send queue WRs before the transport is reconnected.
+ */
+
#include "xprt_rdma.h"
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
# define RPCDBG_FACILITY RPCDBG_TRANS
#endif
+static struct workqueue_struct *frwr_recovery_wq;
+
+#define FRWR_RECOVERY_WQ_FLAGS (WQ_UNBOUND | WQ_MEM_RECLAIM)
+
+/* Create the workqueue on which deferred FRMR recovery runs.
+ * Returns 0 on success, or -ENOMEM if the workqueue can't be allocated.
+ */
+int
+frwr_alloc_recovery_wq(void)
+{
+	frwr_recovery_wq = alloc_workqueue("frwr_recovery",
+					   FRWR_RECOVERY_WQ_FLAGS, 0);
+	if (!frwr_recovery_wq)
+		return -ENOMEM;
+	return 0;
+}
+
+/* Destroy the FRMR recovery workqueue, if one exists. The global
+ * pointer is cleared before the workqueue itself is destroyed.
+ */
+void
+frwr_destroy_recovery_wq(void)
+{
+	struct workqueue_struct *doomed = frwr_recovery_wq;
+
+	if (doomed) {
+		frwr_recovery_wq = NULL;
+		destroy_workqueue(doomed);
+	}
+}
+
+/* Deferred reset of a single FRMR. Generate a fresh rkey by
+ * replacing the MR.
+ *
+ * There's no recovery if this fails. The FRMR is abandoned, but
+ * remains in rb_all. It will be cleaned up when the transport is
+ * destroyed.
+ *
+ * NOTE(review): when ib_alloc_mr() fails, fr_mr is left holding an
+ * ERR_PTR value; confirm that transport teardown never hands that
+ * value to ib_dereg_mr().
+ */
+static void
+__frwr_recovery_worker(struct work_struct *work)
+{
+ struct rpcrdma_mw *r = container_of(work, struct rpcrdma_mw,
+ r.frmr.fr_work);
+ struct rpcrdma_xprt *r_xprt = r->r.frmr.fr_xprt;
+ unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
+ struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
+
+ /* Drop the old MR; a non-zero status abandons the FRMR */
+ if (ib_dereg_mr(r->r.frmr.fr_mr))
+ goto out_fail;
+
+ r->r.frmr.fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, depth);
+ if (IS_ERR(r->r.frmr.fr_mr))
+ goto out_fail;
+
+ /* Mark the replacement INVALID and hand it to rpcrdma_put_mw() */
+ dprintk("RPC: %s: recovered FRMR %p\n", __func__, r);
+ r->r.frmr.fr_state = FRMR_IS_INVALID;
+ rpcrdma_put_mw(r_xprt, r);
+ return;
+
+out_fail:
+ pr_warn("RPC: %s: FRMR %p unrecovered\n",
+ __func__, r);
+}
+
+/* A broken MR was discovered in a context that can't sleep.
+ * Defer recovery to the recovery worker.
+ *
+ * (Re)initializes the MR's embedded work item to run
+ * __frwr_recovery_worker and queues it on frwr_recovery_wq.
+ */
+static void
+__frwr_queue_recovery(struct rpcrdma_mw *r)
+{
+ INIT_WORK(&r->r.frmr.fr_work, __frwr_recovery_worker);
+ queue_work(frwr_recovery_wq, &r->r.frmr.fr_work);
+}
+
static int
__frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device,
unsigned int depth)
@@ -24,24 +148,28 @@ __frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device,
struct rpcrdma_frmr *f = &r->r.frmr;
int rc;
- f->fr_mr = ib_alloc_fast_reg_mr(pd, depth);
+ f->fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, depth);
if (IS_ERR(f->fr_mr))
goto out_mr_err;
- f->fr_pgl = ib_alloc_fast_reg_page_list(device, depth);
- if (IS_ERR(f->fr_pgl))
+
+ f->sg = kcalloc(depth, sizeof(*f->sg), GFP_KERNEL);
+ if (!f->sg)
goto out_list_err;
+
+ sg_init_table(f->sg, depth);
+
return 0;
out_mr_err:
rc = PTR_ERR(f->fr_mr);
- dprintk("RPC: %s: ib_alloc_fast_reg_mr status %i\n",
+ dprintk("RPC: %s: ib_alloc_mr status %i\n",
__func__, rc);
return rc;
out_list_err:
- rc = PTR_ERR(f->fr_pgl);
- dprintk("RPC: %s: ib_alloc_fast_reg_page_list status %i\n",
- __func__, rc);
+ rc = -ENOMEM;
+ dprintk("RPC: %s: sg allocation failure\n",
+ __func__);
ib_dereg_mr(f->fr_mr);
return rc;
}
@@ -55,7 +183,7 @@ __frwr_release(struct rpcrdma_mw *r)
if (rc)
dprintk("RPC: %s: ib_dereg_mr status %i\n",
__func__, rc);
- ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
+ kfree(r->r.frmr.sg);
}
static int
@@ -128,8 +256,11 @@ frwr_sendcompletion(struct ib_wc *wc)
/* WARNING: Only wr_id and status are reliable at this point */
r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
- dprintk("RPC: %s: frmr %p (stale), status %d\n",
- __func__, r, wc->status);
+ if (wc->status == IB_WC_WR_FLUSH_ERR)
+ dprintk("RPC: %s: frmr %p flushed\n", __func__, r);
+ else
+ pr_warn("RPC: %s: frmr %p error, status %s (%d)\n",
+ __func__, r, ib_wc_status_msg(wc->status), wc->status);
r->r.frmr.fr_state = FRMR_IS_STALE;
}
@@ -137,16 +268,19 @@ static int
frwr_op_init(struct rpcrdma_xprt *r_xprt)
{
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
- struct ib_device *device = r_xprt->rx_ia.ri_id->device;
+ struct ib_device *device = r_xprt->rx_ia.ri_device;
unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
int i;
+ spin_lock_init(&buf->rb_mwlock);
INIT_LIST_HEAD(&buf->rb_mws);
INIT_LIST_HEAD(&buf->rb_all);
- i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
- dprintk("RPC: %s: initializing %d FRMRs\n", __func__, i);
+ i = max_t(int, RPCRDMA_MAX_DATA_SEGS / depth, 1);
+ i += 2; /* head + tail */
+ i *= buf->rb_max_requests; /* one set for each RPC slot */
+ dprintk("RPC: %s: initalizing %d FRMRs\n", __func__, i);
while (i--) {
struct rpcrdma_mw *r;
@@ -165,6 +299,7 @@ frwr_op_init(struct rpcrdma_xprt *r_xprt)
list_add(&r->mw_list, &buf->rb_mws);
list_add(&r->mw_all, &buf->rb_all);
r->mw_sendcompletion = frwr_sendcompletion;
+ r->r.frmr.fr_xprt = r_xprt;
}
return 0;
@@ -178,78 +313,103 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
int nsegs, bool writing)
{
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct ib_device *device = ia->ri_id->device;
+ struct ib_device *device = ia->ri_device;
enum dma_data_direction direction = rpcrdma_data_dir(writing);
struct rpcrdma_mr_seg *seg1 = seg;
- struct rpcrdma_mw *mw = seg1->rl_mw;
- struct rpcrdma_frmr *frmr = &mw->r.frmr;
- struct ib_mr *mr = frmr->fr_mr;
- struct ib_send_wr fastreg_wr, *bad_wr;
+ struct rpcrdma_mw *mw;
+ struct rpcrdma_frmr *frmr;
+ struct ib_mr *mr;
+ struct ib_reg_wr reg_wr;
+ struct ib_send_wr *bad_wr;
+ int rc, i, n, dma_nents;
u8 key;
- int len, pageoff;
- int i, rc;
- int seg_len;
- u64 pa;
- int page_no;
-
- pageoff = offset_in_page(seg1->mr_offset);
- seg1->mr_offset -= pageoff; /* start of page */
- seg1->mr_len += pageoff;
- len = -pageoff;
+
+ mw = seg1->rl_mw;
+ seg1->rl_mw = NULL;
+ do {
+ if (mw)
+ __frwr_queue_recovery(mw);
+ mw = rpcrdma_get_mw(r_xprt);
+ if (!mw)
+ return -ENOMEM;
+ } while (mw->r.frmr.fr_state != FRMR_IS_INVALID);
+ frmr = &mw->r.frmr;
+ frmr->fr_state = FRMR_IS_VALID;
+ mr = frmr->fr_mr;
+
if (nsegs > ia->ri_max_frmr_depth)
nsegs = ia->ri_max_frmr_depth;
- for (page_no = i = 0; i < nsegs;) {
- rpcrdma_map_one(device, seg, direction);
- pa = seg->mr_dma;
- for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
- frmr->fr_pgl->page_list[page_no++] = pa;
- pa += PAGE_SIZE;
- }
- len += seg->mr_len;
+
+ for (i = 0; i < nsegs;) {
+ if (seg->mr_page)
+ sg_set_page(&frmr->sg[i],
+ seg->mr_page,
+ seg->mr_len,
+ offset_in_page(seg->mr_offset));
+ else
+ sg_set_buf(&frmr->sg[i], seg->mr_offset,
+ seg->mr_len);
+
++seg;
++i;
+
/* Check for holes */
if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
break;
}
- dprintk("RPC: %s: Using frmr %p to map %d segments (%d bytes)\n",
- __func__, mw, i, len);
+ frmr->sg_nents = i;
- frmr->fr_state = FRMR_IS_VALID;
+ dma_nents = ib_dma_map_sg(device, frmr->sg, frmr->sg_nents, direction);
+ if (!dma_nents) {
+ pr_err("RPC: %s: failed to dma map sg %p sg_nents %u\n",
+ __func__, frmr->sg, frmr->sg_nents);
+ return -ENOMEM;
+ }
+
+ n = ib_map_mr_sg(mr, frmr->sg, frmr->sg_nents, PAGE_SIZE);
+ if (unlikely(n != frmr->sg_nents)) {
+ pr_err("RPC: %s: failed to map mr %p (%u/%u)\n",
+ __func__, frmr->fr_mr, n, frmr->sg_nents);
+ rc = n < 0 ? n : -EINVAL;
+ goto out_senderr;
+ }
+
+ dprintk("RPC: %s: Using frmr %p to map %u segments (%u bytes)\n",
+ __func__, mw, frmr->sg_nents, mr->length);
- memset(&fastreg_wr, 0, sizeof(fastreg_wr));
- fastreg_wr.wr_id = (unsigned long)(void *)mw;
- fastreg_wr.opcode = IB_WR_FAST_REG_MR;
- fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma + pageoff;
- fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl;
- fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
- fastreg_wr.wr.fast_reg.page_list_len = page_no;
- fastreg_wr.wr.fast_reg.length = len;
- fastreg_wr.wr.fast_reg.access_flags = writing ?
- IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
- IB_ACCESS_REMOTE_READ;
key = (u8)(mr->rkey & 0x000000FF);
ib_update_fast_reg_key(mr, ++key);
- fastreg_wr.wr.fast_reg.rkey = mr->rkey;
+
+ reg_wr.wr.next = NULL;
+ reg_wr.wr.opcode = IB_WR_REG_MR;
+ reg_wr.wr.wr_id = (uintptr_t)mw;
+ reg_wr.wr.num_sge = 0;
+ reg_wr.wr.send_flags = 0;
+ reg_wr.mr = mr;
+ reg_wr.key = mr->rkey;
+ reg_wr.access = writing ?
+ IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
+ IB_ACCESS_REMOTE_READ;
DECR_CQCOUNT(&r_xprt->rx_ep);
- rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr);
+ rc = ib_post_send(ia->ri_id->qp, &reg_wr.wr, &bad_wr);
if (rc)
goto out_senderr;
+ seg1->mr_dir = direction;
+ seg1->rl_mw = mw;
seg1->mr_rkey = mr->rkey;
- seg1->mr_base = seg1->mr_dma + pageoff;
- seg1->mr_nsegs = i;
- seg1->mr_len = len;
- return i;
+ seg1->mr_base = mr->iova;
+ seg1->mr_nsegs = frmr->sg_nents;
+ seg1->mr_len = mr->length;
+
+ return frmr->sg_nents;
out_senderr:
dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc);
- ib_update_fast_reg_key(mr, --key);
- frmr->fr_state = FRMR_IS_INVALID;
- while (i--)
- rpcrdma_unmap_one(device, --seg);
+ ib_dma_unmap_sg(device, frmr->sg, dma_nents, direction);
+ __frwr_queue_recovery(mw);
return rc;
}
@@ -261,78 +421,46 @@ frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
{
struct rpcrdma_mr_seg *seg1 = seg;
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ struct rpcrdma_mw *mw = seg1->rl_mw;
+ struct rpcrdma_frmr *frmr = &mw->r.frmr;
struct ib_send_wr invalidate_wr, *bad_wr;
int rc, nsegs = seg->mr_nsegs;
- struct ib_device *device;
- seg1->rl_mw->r.frmr.fr_state = FRMR_IS_INVALID;
+ dprintk("RPC: %s: FRMR %p\n", __func__, mw);
+
+ seg1->rl_mw = NULL;
+ frmr->fr_state = FRMR_IS_INVALID;
memset(&invalidate_wr, 0, sizeof(invalidate_wr));
- invalidate_wr.wr_id = (unsigned long)(void *)seg1->rl_mw;
+ invalidate_wr.wr_id = (unsigned long)(void *)mw;
invalidate_wr.opcode = IB_WR_LOCAL_INV;
- invalidate_wr.ex.invalidate_rkey = seg1->rl_mw->r.frmr.fr_mr->rkey;
+ invalidate_wr.ex.invalidate_rkey = frmr->fr_mr->rkey;
DECR_CQCOUNT(&r_xprt->rx_ep);
+ ib_dma_unmap_sg(ia->ri_device, frmr->sg, frmr->sg_nents, seg1->mr_dir);
read_lock(&ia->ri_qplock);
- device = ia->ri_id->device;
- while (seg1->mr_nsegs--)
- rpcrdma_unmap_one(device, seg++);
rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
read_unlock(&ia->ri_qplock);
if (rc)
goto out_err;
+
+ rpcrdma_put_mw(r_xprt, mw);
return nsegs;
out_err:
- /* Force rpcrdma_buffer_get() to retry */
- seg1->rl_mw->r.frmr.fr_state = FRMR_IS_STALE;
dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc);
+ __frwr_queue_recovery(mw);
return nsegs;
}
-/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in
- * an unusable state. Find FRMRs in this state and dereg / reg
- * each. FRMRs that are VALID and attached to an rpcrdma_req are
- * also torn down.
- *
- * This gives all in-use FRMRs a fresh rkey and leaves them INVALID.
- *
- * This is invoked only in the transport connect worker in order
- * to serialize with rpcrdma_register_frmr_external().
- */
-static void
-frwr_op_reset(struct rpcrdma_xprt *r_xprt)
-{
- struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
- struct ib_device *device = r_xprt->rx_ia.ri_id->device;
- unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
- struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
- struct rpcrdma_mw *r;
- int rc;
-
- list_for_each_entry(r, &buf->rb_all, mw_all) {
- if (r->r.frmr.fr_state == FRMR_IS_INVALID)
- continue;
-
- __frwr_release(r);
- rc = __frwr_init(r, pd, device, depth);
- if (rc) {
- dprintk("RPC: %s: mw %p left %s\n",
- __func__, r,
- (r->r.frmr.fr_state == FRMR_IS_STALE ?
- "stale" : "valid"));
- continue;
- }
-
- r->r.frmr.fr_state = FRMR_IS_INVALID;
- }
-}
-
static void
frwr_op_destroy(struct rpcrdma_buffer *buf)
{
struct rpcrdma_mw *r;
+ /* Ensure stale MWs for "buf" are no longer in flight */
+ flush_workqueue(frwr_recovery_wq);
+
while (!list_empty(&buf->rb_all)) {
r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
list_del(&r->mw_all);
@@ -347,7 +475,6 @@ const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
.ro_open = frwr_op_open,
.ro_maxpages = frwr_op_maxpages,
.ro_init = frwr_op_init,
- .ro_reset = frwr_op_reset,
.ro_destroy = frwr_op_destroy,
.ro_displayname = "frwr",
};
diff --git a/kernel/net/sunrpc/xprtrdma/module.c b/kernel/net/sunrpc/xprtrdma/module.c
new file mode 100644
index 000000000..560712bd9
--- /dev/null
+++ b/kernel/net/sunrpc/xprtrdma/module.c
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2015 Oracle. All rights reserved.
+ */
+
+/* rpcrdma.ko module initialization
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sunrpc/svc_rdma.h>
+#include "xprt_rdma.h"
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+# define RPCDBG_FACILITY RPCDBG_TRANS
+#endif
+
+MODULE_AUTHOR("Open Grid Computing and Network Appliance, Inc.");
+MODULE_DESCRIPTION("RPC/RDMA Transport");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_ALIAS("svcrdma");
+MODULE_ALIAS("xprtrdma");
+
+static void __exit rpc_rdma_cleanup(void)
+{
+ xprt_rdma_cleanup();
+ svc_rdma_cleanup();
+}
+
+static int __init rpc_rdma_init(void)
+{
+ int rc;
+
+ rc = svc_rdma_init();
+ if (rc)
+ goto out;
+
+ rc = xprt_rdma_init();
+ if (rc)
+ svc_rdma_cleanup();
+
+out:
+ return rc;
+}
+
+module_init(rpc_rdma_init);
+module_exit(rpc_rdma_cleanup);
diff --git a/kernel/net/sunrpc/xprtrdma/physical_ops.c b/kernel/net/sunrpc/xprtrdma/physical_ops.c
index ba518af16..617b76f22 100644
--- a/kernel/net/sunrpc/xprtrdma/physical_ops.c
+++ b/kernel/net/sunrpc/xprtrdma/physical_ops.c
@@ -23,6 +23,21 @@ static int
physical_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
struct rpcrdma_create_data_internal *cdata)
{
+ struct ib_mr *mr;
+
+ /* Obtain an rkey to use for RPC data payloads.
+ */
+ mr = ib_get_dma_mr(ia->ri_pd,
+ IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_WRITE |
+ IB_ACCESS_REMOTE_READ);
+ if (IS_ERR(mr)) {
+ pr_err("%s: ib_get_dma_mr for failed with %lX\n",
+ __func__, PTR_ERR(mr));
+ return -ENOMEM;
+ }
+
+ ia->ri_dma_mr = mr;
return 0;
}
@@ -50,9 +65,8 @@ physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
{
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- rpcrdma_map_one(ia->ri_id->device, seg,
- rpcrdma_data_dir(writing));
- seg->mr_rkey = ia->ri_bind_mem->rkey;
+ rpcrdma_map_one(ia->ri_device, seg, rpcrdma_data_dir(writing));
+ seg->mr_rkey = ia->ri_dma_mr->rkey;
seg->mr_base = seg->mr_dma;
seg->mr_nsegs = 1;
return 1;
@@ -65,19 +79,11 @@ physical_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
{
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- read_lock(&ia->ri_qplock);
- rpcrdma_unmap_one(ia->ri_id->device, seg);
- read_unlock(&ia->ri_qplock);
-
+ rpcrdma_unmap_one(ia->ri_device, seg);
return 1;
}
static void
-physical_op_reset(struct rpcrdma_xprt *r_xprt)
-{
-}
-
-static void
physical_op_destroy(struct rpcrdma_buffer *buf)
{
}
@@ -88,7 +94,6 @@ const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = {
.ro_open = physical_op_open,
.ro_maxpages = physical_op_maxpages,
.ro_init = physical_op_init,
- .ro_reset = physical_op_reset,
.ro_destroy = physical_op_destroy,
.ro_displayname = "physical",
};
diff --git a/kernel/net/sunrpc/xprtrdma/rpc_rdma.c b/kernel/net/sunrpc/xprtrdma/rpc_rdma.c
index 2c53ea9e1..c10d96994 100644
--- a/kernel/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/kernel/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -71,6 +71,67 @@ static const char transfertypes[][12] = {
};
#endif
+/* The client can send a request inline as long as the RPCRDMA header
+ * plus the RPC call fit under the transport's inline limit. If the
+ * combined call message size exceeds that limit, the client must use
+ * the read chunk list for this operation.
+ */
+static bool rpcrdma_args_inline(struct rpc_rqst *rqst)
+{
+ unsigned int callsize = RPCRDMA_HDRLEN_MIN + rqst->rq_snd_buf.len;
+
+ return callsize <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst);
+}
+
+/* The client can't know how large the actual reply will be. Thus it
+ * plans for the largest possible reply for that particular ULP
+ * operation. If the maximum combined reply message size exceeds that
+ * limit, the client must provide a write list or a reply chunk for
+ * this request.
+ */
+static bool rpcrdma_results_inline(struct rpc_rqst *rqst)
+{
+ unsigned int repsize = RPCRDMA_HDRLEN_MIN + rqst->rq_rcv_buf.buflen;
+
+ return repsize <= RPCRDMA_INLINE_READ_THRESHOLD(rqst);
+}
+
+static int
+rpcrdma_tail_pullup(struct xdr_buf *buf)
+{
+ size_t tlen = buf->tail[0].iov_len;
+ size_t skip = tlen & 3;
+
+ /* Do not include the tail if it is only an XDR pad */
+ if (tlen < 4)
+ return 0;
+
+ /* xdr_write_pages() adds a pad at the beginning of the tail
+ * if the content in "buf->pages" is unaligned. Force the
+ * tail's actual content to land at the next XDR position
+ * after the head instead.
+ */
+ if (skip) {
+ unsigned char *src, *dst;
+ unsigned int count;
+
+ src = buf->tail[0].iov_base;
+ dst = buf->head[0].iov_base;
+ dst += buf->head[0].iov_len;
+
+ src += skip;
+ tlen -= skip;
+
+ dprintk("RPC: %s: skip=%zu, memmove(%p, %p, %zu)\n",
+ __func__, skip, dst, src, tlen);
+
+ for (count = tlen; count; count--)
+ *dst++ = *src++;
+ }
+
+ return tlen;
+}
+
/*
* Chunk assembly from upper layer xdr_buf.
*
@@ -122,6 +183,10 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
if (len && n == nsegs)
return -EIO;
+ /* When encoding the read list, the tail is always sent inline */
+ if (type == rpcrdma_readch)
+ return n;
+
if (xdrbuf->tail[0].iov_len) {
/* the rpcrdma protocol allows us to omit any trailing
* xdr pad bytes, saving the server an RDMA operation. */
@@ -284,9 +349,6 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
return (unsigned char *)iptr - (unsigned char *)headerp;
out:
- if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR)
- return n;
-
for (pos = 0; nchunks--;)
pos += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt,
&req->rl_segments[pos]);
@@ -300,8 +362,7 @@ out:
* pre-registered memory buffer for this request. For small amounts
* of data, this is efficient. The cutoff value is tunable.
*/
-static int
-rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad)
+static void rpcrdma_inline_pullup(struct rpc_rqst *rqst)
{
int i, npages, curlen;
int copy_len;
@@ -313,16 +374,9 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad)
destp = rqst->rq_svec[0].iov_base;
curlen = rqst->rq_svec[0].iov_len;
destp += curlen;
- /*
- * Do optional padding where it makes sense. Alignment of write
- * payload can help the server, if our setting is accurate.
- */
- pad -= (curlen + 36/*sizeof(struct rpcrdma_msg_padded)*/);
- if (pad < 0 || rqst->rq_slen - curlen < RPCRDMA_INLINE_PAD_THRESH)
- pad = 0; /* don't pad this request */
- dprintk("RPC: %s: pad %d destp 0x%p len %d hdrlen %d\n",
- __func__, pad, destp, rqst->rq_slen, curlen);
+ dprintk("RPC: %s: destp 0x%p len %d hdrlen %d\n",
+ __func__, destp, rqst->rq_slen, curlen);
copy_len = rqst->rq_snd_buf.page_len;
@@ -358,7 +412,6 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad)
page_base = 0;
}
/* header now contains entire send message */
- return pad;
}
/*
@@ -383,11 +436,16 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
char *base;
- size_t rpclen, padlen;
+ size_t rpclen;
ssize_t hdrlen;
enum rpcrdma_chunktype rtype, wtype;
struct rpcrdma_msg *headerp;
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+ if (test_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state))
+ return rpcrdma_bc_marshal_reply(rqst);
+#endif
+
/*
* rpclen gets amount of data in first buffer, which is the
* pre-registered buffer.
@@ -405,28 +463,15 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
/*
* Chunks needed for results?
*
+ * o Read ops return data as write chunk(s), header as inline.
* o If the expected result is under the inline threshold, all ops
- * return as inline (but see later).
+ * return as inline.
* o Large non-read ops return as a single reply chunk.
- * o Large read ops return data as write chunk(s), header as inline.
- *
- * Note: the NFS code sending down multiple result segments implies
- * the op is one of read, readdir[plus], readlink or NFSv4 getacl.
- */
-
- /*
- * This code can handle read chunks, write chunks OR reply
- * chunks -- only one type. If the request is too big to fit
- * inline, then we will choose read chunks. If the request is
- * a READ, then use write chunks to separate the file data
- * into pages; otherwise use reply chunks.
*/
- if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst))
- wtype = rpcrdma_noch;
- else if (rqst->rq_rcv_buf.page_len == 0)
- wtype = rpcrdma_replych;
- else if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
+ if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
wtype = rpcrdma_writech;
+ else if (rpcrdma_results_inline(rqst))
+ wtype = rpcrdma_noch;
else
wtype = rpcrdma_replych;
@@ -435,21 +480,25 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
*
* o If the total request is under the inline threshold, all ops
* are sent as inline.
- * o Large non-write ops are sent with the entire message as a
- * single read chunk (protocol 0-position special case).
* o Large write ops transmit data as read chunk(s), header as
* inline.
+ * o Large non-write ops are sent with the entire message as a
+ * single read chunk (protocol 0-position special case).
*
- * Note: the NFS code sending down multiple argument segments
- * implies the op is a write.
- * TBD check NFSv4 setacl
+ * This assumes that the upper layer does not present a request
+ * that both has a data payload, and whose non-data arguments
+ * by themselves are larger than the inline threshold.
*/
- if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst))
+ if (rpcrdma_args_inline(rqst)) {
rtype = rpcrdma_noch;
- else if (rqst->rq_snd_buf.page_len == 0)
- rtype = rpcrdma_areadch;
- else
+ } else if (rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
rtype = rpcrdma_readch;
+ } else {
+ r_xprt->rx_stats.nomsg_call_count++;
+ headerp->rm_type = htonl(RDMA_NOMSG);
+ rtype = rpcrdma_areadch;
+ rpclen = 0;
+ }
/* The following simplification is not true forever */
if (rtype != rpcrdma_noch && wtype == rpcrdma_replych)
@@ -461,7 +510,6 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
}
hdrlen = RPCRDMA_HDRLEN_MIN;
- padlen = 0;
/*
* Pull up any extra send data into the preregistered buffer.
@@ -470,45 +518,15 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
*/
if (rtype == rpcrdma_noch) {
- padlen = rpcrdma_inline_pullup(rqst,
- RPCRDMA_INLINE_PAD_VALUE(rqst));
-
- if (padlen) {
- headerp->rm_type = rdma_msgp;
- headerp->rm_body.rm_padded.rm_align =
- cpu_to_be32(RPCRDMA_INLINE_PAD_VALUE(rqst));
- headerp->rm_body.rm_padded.rm_thresh =
- cpu_to_be32(RPCRDMA_INLINE_PAD_THRESH);
- headerp->rm_body.rm_padded.rm_pempty[0] = xdr_zero;
- headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero;
- headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero;
- hdrlen += 2 * sizeof(u32); /* extra words in padhdr */
- if (wtype != rpcrdma_noch) {
- dprintk("RPC: %s: invalid chunk list\n",
- __func__);
- return -EIO;
- }
- } else {
- headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero;
- headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero;
- headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero;
- /* new length after pullup */
- rpclen = rqst->rq_svec[0].iov_len;
- /*
- * Currently we try to not actually use read inline.
- * Reply chunks have the desirable property that
- * they land, packed, directly in the target buffers
- * without headers, so they require no fixup. The
- * additional RDMA Write op sends the same amount
- * of data, streams on-the-wire and adds no overhead
- * on receive. Therefore, we request a reply chunk
- * for non-writes wherever feasible and efficient.
- */
- if (wtype == rpcrdma_noch)
- wtype = rpcrdma_replych;
- }
- }
+ rpcrdma_inline_pullup(rqst);
+ headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero;
+ headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero;
+ headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero;
+ /* new length after pullup */
+ rpclen = rqst->rq_svec[0].iov_len;
+ } else if (rtype == rpcrdma_readch)
+ rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf);
if (rtype != rpcrdma_noch) {
hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf,
headerp, rtype);
@@ -521,9 +539,9 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
if (hdrlen < 0)
return hdrlen;
- dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd"
+ dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd"
" headerp 0x%p base 0x%p lkey 0x%x\n",
- __func__, transfertypes[wtype], hdrlen, rpclen, padlen,
+ __func__, transfertypes[wtype], hdrlen, rpclen,
headerp, base, rdmab_lkey(req->rl_rdmabuf));
/*
@@ -537,26 +555,15 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
req->rl_send_iov[0].length = hdrlen;
req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf);
+ req->rl_niovs = 1;
+ if (rtype == rpcrdma_areadch)
+ return 0;
+
req->rl_send_iov[1].addr = rdmab_addr(req->rl_sendbuf);
req->rl_send_iov[1].length = rpclen;
req->rl_send_iov[1].lkey = rdmab_lkey(req->rl_sendbuf);
req->rl_niovs = 2;
-
- if (padlen) {
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
-
- req->rl_send_iov[2].addr = rdmab_addr(ep->rep_padbuf);
- req->rl_send_iov[2].length = padlen;
- req->rl_send_iov[2].lkey = rdmab_lkey(ep->rep_padbuf);
-
- req->rl_send_iov[3].addr = req->rl_send_iov[1].addr + rpclen;
- req->rl_send_iov[3].length = rqst->rq_slen - rpclen;
- req->rl_send_iov[3].lkey = rdmab_lkey(req->rl_sendbuf);
-
- req->rl_niovs = 4;
- }
-
return 0;
}
@@ -709,6 +716,37 @@ rpcrdma_connect_worker(struct work_struct *work)
spin_unlock_bh(&xprt->transport_lock);
}
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+/* By convention, backchannel calls arrive via rdma_msg type
+ * messages, and never populate the chunk lists. This makes
+ * the RPC/RDMA header small and fixed in size, so it is
+ * straightforward to check the RPC header's direction field.
+ */
+static bool
+rpcrdma_is_bcall(struct rpcrdma_msg *headerp)
+{
+ __be32 *p = (__be32 *)headerp;
+
+ if (headerp->rm_type != rdma_msg)
+ return false;
+ if (headerp->rm_body.rm_chunks[0] != xdr_zero)
+ return false;
+ if (headerp->rm_body.rm_chunks[1] != xdr_zero)
+ return false;
+ if (headerp->rm_body.rm_chunks[2] != xdr_zero)
+ return false;
+
+ /* sanity */
+ if (p[7] != headerp->rm_xid)
+ return false;
+ /* call direction */
+ if (p[8] != cpu_to_be32(RPC_CALL))
+ return false;
+
+ return true;
+}
+#endif /* CONFIG_SUNRPC_BACKCHANNEL */
+
/*
* This function is called when an async event is posted to
* the connection which changes the connection state. All it
@@ -721,8 +759,8 @@ rpcrdma_conn_func(struct rpcrdma_ep *ep)
schedule_delayed_work(&ep->rep_connect_worker, 0);
}
-/*
- * Called as a tasklet to do req/reply match and complete a request
+/* Process received RPC/RDMA messages.
+ *
* Errors must result in the RPC task either being awakened, or
* allowed to timeout, to discover the errors at that time.
*/
@@ -732,60 +770,39 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
struct rpcrdma_msg *headerp;
struct rpcrdma_req *req;
struct rpc_rqst *rqst;
- struct rpc_xprt *xprt = rep->rr_xprt;
- struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+ struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
+ struct rpc_xprt *xprt = &r_xprt->rx_xprt;
__be32 *iptr;
int rdmalen, status;
unsigned long cwnd;
u32 credits;
- /* Check status. If bad, signal disconnect and return rep to pool */
- if (rep->rr_len == ~0U) {
- rpcrdma_recv_buffer_put(rep);
- if (r_xprt->rx_ep.rep_connected == 1) {
- r_xprt->rx_ep.rep_connected = -EIO;
- rpcrdma_conn_func(&r_xprt->rx_ep);
- }
- return;
- }
- if (rep->rr_len < RPCRDMA_HDRLEN_MIN) {
- dprintk("RPC: %s: short/invalid reply\n", __func__);
- goto repost;
- }
+ dprintk("RPC: %s: incoming rep %p\n", __func__, rep);
+
+ if (rep->rr_len == RPCRDMA_BAD_LEN)
+ goto out_badstatus;
+ if (rep->rr_len < RPCRDMA_HDRLEN_MIN)
+ goto out_shortreply;
+
headerp = rdmab_to_msg(rep->rr_rdmabuf);
- if (headerp->rm_vers != rpcrdma_version) {
- dprintk("RPC: %s: invalid version %d\n",
- __func__, be32_to_cpu(headerp->rm_vers));
- goto repost;
- }
+ if (headerp->rm_vers != rpcrdma_version)
+ goto out_badversion;
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+ if (rpcrdma_is_bcall(headerp))
+ goto out_bcall;
+#endif
- /* Get XID and try for a match. */
- spin_lock(&xprt->transport_lock);
+ /* Match incoming rpcrdma_rep to an rpcrdma_req to
+ * get context for handling any incoming chunks.
+ */
+ spin_lock_bh(&xprt->transport_lock);
rqst = xprt_lookup_rqst(xprt, headerp->rm_xid);
- if (rqst == NULL) {
- spin_unlock(&xprt->transport_lock);
- dprintk("RPC: %s: reply 0x%p failed "
- "to match any request xid 0x%08x len %d\n",
- __func__, rep, be32_to_cpu(headerp->rm_xid),
- rep->rr_len);
-repost:
- r_xprt->rx_stats.bad_reply_count++;
- rep->rr_func = rpcrdma_reply_handler;
- if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep))
- rpcrdma_recv_buffer_put(rep);
-
- return;
- }
+ if (!rqst)
+ goto out_nomatch;
- /* get request object */
req = rpcr_to_rdmar(rqst);
- if (req->rl_reply) {
- spin_unlock(&xprt->transport_lock);
- dprintk("RPC: %s: duplicate reply 0x%p to RPC "
- "request 0x%p: xid 0x%08x\n", __func__, rep, req,
- be32_to_cpu(headerp->rm_xid));
- goto repost;
- }
+ if (req->rl_reply)
+ goto out_duplicate;
dprintk("RPC: %s: reply 0x%p completes request 0x%p\n"
" RPC request 0x%p xid 0x%08x\n",
@@ -882,8 +899,50 @@ badheader:
if (xprt->cwnd > cwnd)
xprt_release_rqst_cong(rqst->rq_task);
+ xprt_complete_rqst(rqst->rq_task, status);
+ spin_unlock_bh(&xprt->transport_lock);
dprintk("RPC: %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n",
__func__, xprt, rqst, status);
- xprt_complete_rqst(rqst->rq_task, status);
- spin_unlock(&xprt->transport_lock);
+ return;
+
+out_badstatus:
+ rpcrdma_recv_buffer_put(rep);
+ if (r_xprt->rx_ep.rep_connected == 1) {
+ r_xprt->rx_ep.rep_connected = -EIO;
+ rpcrdma_conn_func(&r_xprt->rx_ep);
+ }
+ return;
+
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+out_bcall:
+ rpcrdma_bc_receive_call(r_xprt, rep);
+ return;
+#endif
+
+out_shortreply:
+ dprintk("RPC: %s: short/invalid reply\n", __func__);
+ goto repost;
+
+out_badversion:
+ dprintk("RPC: %s: invalid version %d\n",
+ __func__, be32_to_cpu(headerp->rm_vers));
+ goto repost;
+
+out_nomatch:
+ spin_unlock_bh(&xprt->transport_lock);
+ dprintk("RPC: %s: no match for incoming xid 0x%08x len %d\n",
+ __func__, be32_to_cpu(headerp->rm_xid),
+ rep->rr_len);
+ goto repost;
+
+out_duplicate:
+ spin_unlock_bh(&xprt->transport_lock);
+ dprintk("RPC: %s: "
+ "duplicate reply %p to RPC request %p: xid 0x%08x\n",
+ __func__, rep, req, be32_to_cpu(headerp->rm_xid));
+
+repost:
+ r_xprt->rx_stats.bad_reply_count++;
+ if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep))
+ rpcrdma_recv_buffer_put(rep);
}
diff --git a/kernel/net/sunrpc/xprtrdma/svc_rdma.c b/kernel/net/sunrpc/xprtrdma/svc_rdma.c
index c1b627026..1b7051bdb 100644
--- a/kernel/net/sunrpc/xprtrdma/svc_rdma.c
+++ b/kernel/net/sunrpc/xprtrdma/svc_rdma.c
@@ -38,8 +38,7 @@
*
* Author: Tom Tucker <tom@opengridcomputing.com>
*/
-#include <linux/module.h>
-#include <linux/init.h>
+
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/sysctl.h>
@@ -240,6 +239,9 @@ void svc_rdma_cleanup(void)
unregister_sysctl_table(svcrdma_table_header);
svcrdma_table_header = NULL;
}
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+ svc_unreg_xprt_class(&svc_rdma_bc_class);
+#endif
svc_unreg_xprt_class(&svc_rdma_class);
kmem_cache_destroy(svc_rdma_map_cachep);
kmem_cache_destroy(svc_rdma_ctxt_cachep);
@@ -287,6 +289,9 @@ int svc_rdma_init(void)
/* Register RDMA with the SVC transport switch */
svc_reg_xprt_class(&svc_rdma_class);
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+ svc_reg_xprt_class(&svc_rdma_bc_class);
+#endif
return 0;
err1:
kmem_cache_destroy(svc_rdma_map_cachep);
@@ -295,8 +300,3 @@ int svc_rdma_init(void)
destroy_workqueue(svc_rdma_wq);
return -ENOMEM;
}
-MODULE_AUTHOR("Tom Tucker <tom@opengridcomputing.com>");
-MODULE_DESCRIPTION("SVC RDMA Transport");
-MODULE_LICENSE("Dual BSD/GPL");
-module_init(svc_rdma_init);
-module_exit(svc_rdma_cleanup);
diff --git a/kernel/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/kernel/net/sunrpc/xprtrdma/svc_rdma_marshal.c
index b681855cf..e2fca7617 100644
--- a/kernel/net/sunrpc/xprtrdma/svc_rdma_marshal.c
+++ b/kernel/net/sunrpc/xprtrdma/svc_rdma_marshal.c
@@ -50,12 +50,12 @@
/*
* Decodes a read chunk list. The expected format is as follows:
* descrim : xdr_one
- * position : u32 offset into XDR stream
- * handle : u32 RKEY
+ * position : __be32 offset into XDR stream
+ * handle : __be32 RKEY
* . . .
* end-of-list: xdr_zero
*/
-static u32 *decode_read_list(u32 *va, u32 *vaend)
+static __be32 *decode_read_list(__be32 *va, __be32 *vaend)
{
struct rpcrdma_read_chunk *ch = (struct rpcrdma_read_chunk *)va;
@@ -67,20 +67,20 @@ static u32 *decode_read_list(u32 *va, u32 *vaend)
}
ch++;
}
- return (u32 *)&ch->rc_position;
+ return &ch->rc_position;
}
/*
* Decodes a write chunk list. The expected format is as follows:
* descrim : xdr_one
* nchunks : <count>
- * handle : u32 RKEY ---+
- * length : u32 <len of segment> |
+ * handle : __be32 RKEY ---+
+ * length : __be32 <len of segment> |
* offset : remove va + <count>
* . . . |
* ---+
*/
-static u32 *decode_write_list(u32 *va, u32 *vaend)
+static __be32 *decode_write_list(__be32 *va, __be32 *vaend)
{
unsigned long start, end;
int nchunks;
@@ -90,14 +90,14 @@ static u32 *decode_write_list(u32 *va, u32 *vaend)
/* Check for not write-array */
if (ary->wc_discrim == xdr_zero)
- return (u32 *)&ary->wc_nchunks;
+ return &ary->wc_nchunks;
if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) >
(unsigned long)vaend) {
dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend);
return NULL;
}
- nchunks = ntohl(ary->wc_nchunks);
+ nchunks = be32_to_cpu(ary->wc_nchunks);
start = (unsigned long)&ary->wc_array[0];
end = (unsigned long)vaend;
@@ -112,10 +112,10 @@ static u32 *decode_write_list(u32 *va, u32 *vaend)
* rs_length is the 2nd 4B field in wc_target and taking its
* address skips the list terminator
*/
- return (u32 *)&ary->wc_array[nchunks].wc_target.rs_length;
+ return &ary->wc_array[nchunks].wc_target.rs_length;
}
-static u32 *decode_reply_array(u32 *va, u32 *vaend)
+static __be32 *decode_reply_array(__be32 *va, __be32 *vaend)
{
unsigned long start, end;
int nchunks;
@@ -124,14 +124,14 @@ static u32 *decode_reply_array(u32 *va, u32 *vaend)
/* Check for no reply-array */
if (ary->wc_discrim == xdr_zero)
- return (u32 *)&ary->wc_nchunks;
+ return &ary->wc_nchunks;
if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) >
(unsigned long)vaend) {
dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend);
return NULL;
}
- nchunks = ntohl(ary->wc_nchunks);
+ nchunks = be32_to_cpu(ary->wc_nchunks);
start = (unsigned long)&ary->wc_array[0];
end = (unsigned long)vaend;
@@ -142,15 +142,14 @@ static u32 *decode_reply_array(u32 *va, u32 *vaend)
ary, nchunks, vaend);
return NULL;
}
- return (u32 *)&ary->wc_array[nchunks];
+ return (__be32 *)&ary->wc_array[nchunks];
}
int svc_rdma_xdr_decode_req(struct rpcrdma_msg **rdma_req,
struct svc_rqst *rqstp)
{
struct rpcrdma_msg *rmsgp = NULL;
- u32 *va;
- u32 *vaend;
+ __be32 *va, *vaend;
u32 hdr_len;
rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base;
@@ -162,22 +161,17 @@ int svc_rdma_xdr_decode_req(struct rpcrdma_msg **rdma_req,
return -EINVAL;
}
- /* Decode the header */
- rmsgp->rm_xid = ntohl(rmsgp->rm_xid);
- rmsgp->rm_vers = ntohl(rmsgp->rm_vers);
- rmsgp->rm_credit = ntohl(rmsgp->rm_credit);
- rmsgp->rm_type = ntohl(rmsgp->rm_type);
-
- if (rmsgp->rm_vers != RPCRDMA_VERSION)
+ if (rmsgp->rm_vers != rpcrdma_version)
return -ENOSYS;
/* Pull in the extra for the padded case and bump our pointer */
- if (rmsgp->rm_type == RDMA_MSGP) {
+ if (rmsgp->rm_type == rdma_msgp) {
int hdrlen;
+
rmsgp->rm_body.rm_padded.rm_align =
- ntohl(rmsgp->rm_body.rm_padded.rm_align);
+ be32_to_cpu(rmsgp->rm_body.rm_padded.rm_align);
rmsgp->rm_body.rm_padded.rm_thresh =
- ntohl(rmsgp->rm_body.rm_padded.rm_thresh);
+ be32_to_cpu(rmsgp->rm_body.rm_padded.rm_thresh);
va = &rmsgp->rm_body.rm_padded.rm_pempty[4];
rqstp->rq_arg.head[0].iov_base = va;
@@ -192,7 +186,7 @@ int svc_rdma_xdr_decode_req(struct rpcrdma_msg **rdma_req,
* chunk list and a reply chunk list.
*/
va = &rmsgp->rm_body.rm_chunks[0];
- vaend = (u32 *)((unsigned long)rmsgp + rqstp->rq_arg.len);
+ vaend = (__be32 *)((unsigned long)rmsgp + rqstp->rq_arg.len);
va = decode_read_list(va, vaend);
if (!va)
return -EINVAL;
@@ -211,76 +205,20 @@ int svc_rdma_xdr_decode_req(struct rpcrdma_msg **rdma_req,
return hdr_len;
}
-int svc_rdma_xdr_decode_deferred_req(struct svc_rqst *rqstp)
-{
- struct rpcrdma_msg *rmsgp = NULL;
- struct rpcrdma_read_chunk *ch;
- struct rpcrdma_write_array *ary;
- u32 *va;
- u32 hdrlen;
-
- dprintk("svcrdma: processing deferred RDMA header on rqstp=%p\n",
- rqstp);
- rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base;
-
- /* Pull in the extra for the padded case and bump our pointer */
- if (rmsgp->rm_type == RDMA_MSGP) {
- va = &rmsgp->rm_body.rm_padded.rm_pempty[4];
- rqstp->rq_arg.head[0].iov_base = va;
- hdrlen = (u32)((unsigned long)va - (unsigned long)rmsgp);
- rqstp->rq_arg.head[0].iov_len -= hdrlen;
- return hdrlen;
- }
-
- /*
- * Skip all chunks to find RPC msg. These were previously processed
- */
- va = &rmsgp->rm_body.rm_chunks[0];
-
- /* Skip read-list */
- for (ch = (struct rpcrdma_read_chunk *)va;
- ch->rc_discrim != xdr_zero; ch++);
- va = (u32 *)&ch->rc_position;
-
- /* Skip write-list */
- ary = (struct rpcrdma_write_array *)va;
- if (ary->wc_discrim == xdr_zero)
- va = (u32 *)&ary->wc_nchunks;
- else
- /*
- * rs_length is the 2nd 4B field in wc_target and taking its
- * address skips the list terminator
- */
- va = (u32 *)&ary->wc_array[ary->wc_nchunks].wc_target.rs_length;
-
- /* Skip reply-array */
- ary = (struct rpcrdma_write_array *)va;
- if (ary->wc_discrim == xdr_zero)
- va = (u32 *)&ary->wc_nchunks;
- else
- va = (u32 *)&ary->wc_array[ary->wc_nchunks];
-
- rqstp->rq_arg.head[0].iov_base = va;
- hdrlen = (unsigned long)va - (unsigned long)rmsgp;
- rqstp->rq_arg.head[0].iov_len -= hdrlen;
-
- return hdrlen;
-}
-
int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt,
struct rpcrdma_msg *rmsgp,
- enum rpcrdma_errcode err, u32 *va)
+ enum rpcrdma_errcode err, __be32 *va)
{
- u32 *startp = va;
+ __be32 *startp = va;
- *va++ = htonl(rmsgp->rm_xid);
- *va++ = htonl(rmsgp->rm_vers);
- *va++ = htonl(xprt->sc_max_requests);
- *va++ = htonl(RDMA_ERROR);
- *va++ = htonl(err);
+ *va++ = rmsgp->rm_xid;
+ *va++ = rmsgp->rm_vers;
+ *va++ = cpu_to_be32(xprt->sc_max_requests);
+ *va++ = rdma_error;
+ *va++ = cpu_to_be32(err);
if (err == ERR_VERS) {
- *va++ = htonl(RPCRDMA_VERSION);
- *va++ = htonl(RPCRDMA_VERSION);
+ *va++ = rpcrdma_version;
+ *va++ = rpcrdma_version;
}
return (int)((unsigned long)va - (unsigned long)startp);
@@ -297,7 +235,7 @@ int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *rmsgp)
&rmsgp->rm_body.rm_chunks[1];
if (wr_ary->wc_discrim)
wr_ary = (struct rpcrdma_write_array *)
- &wr_ary->wc_array[ntohl(wr_ary->wc_nchunks)].
+ &wr_ary->wc_array[be32_to_cpu(wr_ary->wc_nchunks)].
wc_target.rs_length;
else
wr_ary = (struct rpcrdma_write_array *)
@@ -306,7 +244,7 @@ int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *rmsgp)
/* skip reply array */
if (wr_ary->wc_discrim)
wr_ary = (struct rpcrdma_write_array *)
- &wr_ary->wc_array[ntohl(wr_ary->wc_nchunks)];
+ &wr_ary->wc_array[be32_to_cpu(wr_ary->wc_nchunks)];
else
wr_ary = (struct rpcrdma_write_array *)
&wr_ary->wc_nchunks;
@@ -325,7 +263,7 @@ void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *rmsgp, int chunks)
ary = (struct rpcrdma_write_array *)
&rmsgp->rm_body.rm_chunks[1];
ary->wc_discrim = xdr_one;
- ary->wc_nchunks = htonl(chunks);
+ ary->wc_nchunks = cpu_to_be32(chunks);
/* write-list terminator */
ary->wc_array[chunks].wc_target.rs_handle = xdr_zero;
@@ -338,7 +276,7 @@ void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *ary,
int chunks)
{
ary->wc_discrim = xdr_one;
- ary->wc_nchunks = htonl(chunks);
+ ary->wc_nchunks = cpu_to_be32(chunks);
}
void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *ary,
@@ -350,7 +288,7 @@ void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *ary,
struct rpcrdma_segment *seg = &ary->wc_array[chunk_no].wc_target;
seg->rs_handle = rs_handle;
seg->rs_offset = rs_offset;
- seg->rs_length = htonl(write_len);
+ seg->rs_length = cpu_to_be32(write_len);
}
void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *xprt,
@@ -358,10 +296,10 @@ void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *xprt,
struct rpcrdma_msg *rdma_resp,
enum rpcrdma_proc rdma_type)
{
- rdma_resp->rm_xid = htonl(rdma_argp->rm_xid);
- rdma_resp->rm_vers = htonl(rdma_argp->rm_vers);
- rdma_resp->rm_credit = htonl(xprt->sc_max_requests);
- rdma_resp->rm_type = htonl(rdma_type);
+ rdma_resp->rm_xid = rdma_argp->rm_xid;
+ rdma_resp->rm_vers = rdma_argp->rm_vers;
+ rdma_resp->rm_credit = cpu_to_be32(xprt->sc_max_requests);
+ rdma_resp->rm_type = cpu_to_be32(rdma_type);
/* Encode <nul> chunks lists */
rdma_resp->rm_body.rm_chunks[0] = xdr_zero;
diff --git a/kernel/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/kernel/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index f9f13a32d..ff4f01e52 100644
--- a/kernel/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/kernel/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -85,7 +85,7 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
/* RDMA_NOMSG: RDMA READ data should land just after RDMA RECV data */
rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base;
- if (be32_to_cpu(rmsgp->rm_type) == RDMA_NOMSG)
+ if (rmsgp->rm_type == rdma_nomsg)
rqstp->rq_arg.pages = &rqstp->rq_pages[0];
else
rqstp->rq_arg.pages = &rqstp->rq_pages[1];
@@ -115,15 +115,6 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
rqstp->rq_arg.tail[0].iov_len = 0;
}
-static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count)
-{
- if (rdma_node_get_transport(xprt->sc_cm_id->device->node_type) ==
- RDMA_TRANSPORT_IWARP)
- return 1;
- else
- return min_t(int, sge_count, xprt->sc_max_sge);
-}
-
/* Issue an RDMA_READ using the local lkey to map the data sink */
int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,
struct svc_rqst *rqstp,
@@ -135,7 +126,7 @@ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,
u64 rs_offset,
bool last)
{
- struct ib_send_wr read_wr;
+ struct ib_rdma_wr read_wr;
int pages_needed = PAGE_ALIGN(*page_offset + rs_length) >> PAGE_SHIFT;
struct svc_rdma_op_ctxt *ctxt = svc_rdma_get_context(xprt);
int ret, read, pno;
@@ -144,9 +135,9 @@ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,
ctxt->direction = DMA_FROM_DEVICE;
ctxt->read_hdr = head;
- pages_needed =
- min_t(int, pages_needed, rdma_read_max_sge(xprt, pages_needed));
- read = min_t(int, pages_needed << PAGE_SHIFT, rs_length);
+ pages_needed = min_t(int, pages_needed, xprt->sc_max_sge_rd);
+ read = min_t(int, (pages_needed << PAGE_SHIFT) - *page_offset,
+ rs_length);
for (pno = 0; pno < pages_needed; pno++) {
int len = min_t(int, rs_length, PAGE_SIZE - pg_off);
@@ -189,16 +180,16 @@ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,
clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
memset(&read_wr, 0, sizeof(read_wr));
- read_wr.wr_id = (unsigned long)ctxt;
- read_wr.opcode = IB_WR_RDMA_READ;
- ctxt->wr_op = read_wr.opcode;
- read_wr.send_flags = IB_SEND_SIGNALED;
- read_wr.wr.rdma.rkey = rs_handle;
- read_wr.wr.rdma.remote_addr = rs_offset;
- read_wr.sg_list = ctxt->sge;
- read_wr.num_sge = pages_needed;
-
- ret = svc_rdma_send(xprt, &read_wr);
+ read_wr.wr.wr_id = (unsigned long)ctxt;
+ read_wr.wr.opcode = IB_WR_RDMA_READ;
+ ctxt->wr_op = read_wr.wr.opcode;
+ read_wr.wr.send_flags = IB_SEND_SIGNALED;
+ read_wr.rkey = rs_handle;
+ read_wr.remote_addr = rs_offset;
+ read_wr.wr.sg_list = ctxt->sge;
+ read_wr.wr.num_sge = pages_needed;
+
+ ret = svc_rdma_send(xprt, &read_wr.wr);
if (ret) {
pr_err("svcrdma: Error %d posting RDMA_READ\n", ret);
set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
@@ -228,14 +219,14 @@ int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt,
u64 rs_offset,
bool last)
{
- struct ib_send_wr read_wr;
+ struct ib_rdma_wr read_wr;
struct ib_send_wr inv_wr;
- struct ib_send_wr fastreg_wr;
+ struct ib_reg_wr reg_wr;
u8 key;
- int pages_needed = PAGE_ALIGN(*page_offset + rs_length) >> PAGE_SHIFT;
+ int nents = PAGE_ALIGN(*page_offset + rs_length) >> PAGE_SHIFT;
struct svc_rdma_op_ctxt *ctxt = svc_rdma_get_context(xprt);
struct svc_rdma_fastreg_mr *frmr = svc_rdma_get_frmr(xprt);
- int ret, read, pno;
+ int ret, read, pno, dma_nents, n;
u32 pg_off = *page_offset;
u32 pg_no = *page_no;
@@ -244,16 +235,14 @@ int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt,
ctxt->direction = DMA_FROM_DEVICE;
ctxt->frmr = frmr;
- pages_needed = min_t(int, pages_needed, xprt->sc_frmr_pg_list_len);
- read = min_t(int, pages_needed << PAGE_SHIFT, rs_length);
+ nents = min_t(unsigned int, nents, xprt->sc_frmr_pg_list_len);
+ read = min_t(int, (nents << PAGE_SHIFT) - *page_offset, rs_length);
- frmr->kva = page_address(rqstp->rq_arg.pages[pg_no]);
frmr->direction = DMA_FROM_DEVICE;
frmr->access_flags = (IB_ACCESS_LOCAL_WRITE|IB_ACCESS_REMOTE_WRITE);
- frmr->map_len = pages_needed << PAGE_SHIFT;
- frmr->page_list_len = pages_needed;
+ frmr->sg_nents = nents;
- for (pno = 0; pno < pages_needed; pno++) {
+ for (pno = 0; pno < nents; pno++) {
int len = min_t(int, rs_length, PAGE_SIZE - pg_off);
head->arg.pages[pg_no] = rqstp->rq_arg.pages[pg_no];
@@ -261,17 +250,12 @@ int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt,
head->arg.len += len;
if (!pg_off)
head->count++;
+
+ sg_set_page(&frmr->sg[pno], rqstp->rq_arg.pages[pg_no],
+ len, pg_off);
+
rqstp->rq_respages = &rqstp->rq_arg.pages[pg_no+1];
rqstp->rq_next_page = rqstp->rq_respages + 1;
- frmr->page_list->page_list[pno] =
- ib_dma_map_page(xprt->sc_cm_id->device,
- head->arg.pages[pg_no], 0,
- PAGE_SIZE, DMA_FROM_DEVICE);
- ret = ib_dma_mapping_error(xprt->sc_cm_id->device,
- frmr->page_list->page_list[pno]);
- if (ret)
- goto err;
- atomic_inc(&xprt->sc_dma_used);
/* adjust offset and wrap to next page if needed */
pg_off += len;
@@ -287,43 +271,57 @@ int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt,
else
clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
+ dma_nents = ib_dma_map_sg(xprt->sc_cm_id->device,
+ frmr->sg, frmr->sg_nents,
+ frmr->direction);
+ if (!dma_nents) {
+ pr_err("svcrdma: failed to dma map sg %p\n",
+ frmr->sg);
+ return -ENOMEM;
+ }
+ atomic_inc(&xprt->sc_dma_used);
+
+ n = ib_map_mr_sg(frmr->mr, frmr->sg, frmr->sg_nents, PAGE_SIZE);
+ if (unlikely(n != frmr->sg_nents)) {
+ pr_err("svcrdma: failed to map mr %p (%d/%d elements)\n",
+ frmr->mr, n, frmr->sg_nents);
+ return n < 0 ? n : -EINVAL;
+ }
+
/* Bump the key */
key = (u8)(frmr->mr->lkey & 0x000000FF);
ib_update_fast_reg_key(frmr->mr, ++key);
- ctxt->sge[0].addr = (unsigned long)frmr->kva + *page_offset;
+ ctxt->sge[0].addr = frmr->mr->iova;
ctxt->sge[0].lkey = frmr->mr->lkey;
- ctxt->sge[0].length = read;
+ ctxt->sge[0].length = frmr->mr->length;
ctxt->count = 1;
ctxt->read_hdr = head;
- /* Prepare FASTREG WR */
- memset(&fastreg_wr, 0, sizeof(fastreg_wr));
- fastreg_wr.opcode = IB_WR_FAST_REG_MR;
- fastreg_wr.send_flags = IB_SEND_SIGNALED;
- fastreg_wr.wr.fast_reg.iova_start = (unsigned long)frmr->kva;
- fastreg_wr.wr.fast_reg.page_list = frmr->page_list;
- fastreg_wr.wr.fast_reg.page_list_len = frmr->page_list_len;
- fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
- fastreg_wr.wr.fast_reg.length = frmr->map_len;
- fastreg_wr.wr.fast_reg.access_flags = frmr->access_flags;
- fastreg_wr.wr.fast_reg.rkey = frmr->mr->lkey;
- fastreg_wr.next = &read_wr;
+ /* Prepare REG WR */
+ reg_wr.wr.opcode = IB_WR_REG_MR;
+ reg_wr.wr.wr_id = 0;
+ reg_wr.wr.send_flags = IB_SEND_SIGNALED;
+ reg_wr.wr.num_sge = 0;
+ reg_wr.mr = frmr->mr;
+ reg_wr.key = frmr->mr->lkey;
+ reg_wr.access = frmr->access_flags;
+ reg_wr.wr.next = &read_wr.wr;
/* Prepare RDMA_READ */
memset(&read_wr, 0, sizeof(read_wr));
- read_wr.send_flags = IB_SEND_SIGNALED;
- read_wr.wr.rdma.rkey = rs_handle;
- read_wr.wr.rdma.remote_addr = rs_offset;
- read_wr.sg_list = ctxt->sge;
- read_wr.num_sge = 1;
+ read_wr.wr.send_flags = IB_SEND_SIGNALED;
+ read_wr.rkey = rs_handle;
+ read_wr.remote_addr = rs_offset;
+ read_wr.wr.sg_list = ctxt->sge;
+ read_wr.wr.num_sge = 1;
if (xprt->sc_dev_caps & SVCRDMA_DEVCAP_READ_W_INV) {
- read_wr.opcode = IB_WR_RDMA_READ_WITH_INV;
- read_wr.wr_id = (unsigned long)ctxt;
- read_wr.ex.invalidate_rkey = ctxt->frmr->mr->lkey;
+ read_wr.wr.opcode = IB_WR_RDMA_READ_WITH_INV;
+ read_wr.wr.wr_id = (unsigned long)ctxt;
+ read_wr.wr.ex.invalidate_rkey = ctxt->frmr->mr->lkey;
} else {
- read_wr.opcode = IB_WR_RDMA_READ;
- read_wr.next = &inv_wr;
+ read_wr.wr.opcode = IB_WR_RDMA_READ;
+ read_wr.wr.next = &inv_wr;
/* Prepare invalidate */
memset(&inv_wr, 0, sizeof(inv_wr));
inv_wr.wr_id = (unsigned long)ctxt;
@@ -331,10 +329,10 @@ int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt,
inv_wr.send_flags = IB_SEND_SIGNALED | IB_SEND_FENCE;
inv_wr.ex.invalidate_rkey = frmr->mr->lkey;
}
- ctxt->wr_op = read_wr.opcode;
+ ctxt->wr_op = read_wr.wr.opcode;
/* Post the chain */
- ret = svc_rdma_send(xprt, &fastreg_wr);
+ ret = svc_rdma_send(xprt, &reg_wr.wr);
if (ret) {
pr_err("svcrdma: Error %d posting RDMA_READ\n", ret);
set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
@@ -348,7 +346,8 @@ int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt,
atomic_inc(&rdma_stat_read);
return ret;
err:
- svc_rdma_unmap_dma(ctxt);
+ ib_dma_unmap_sg(xprt->sc_cm_id->device,
+ frmr->sg, frmr->sg_nents, frmr->direction);
svc_rdma_put_context(ctxt, 0);
svc_rdma_put_frmr(xprt, frmr);
return ret;
@@ -541,7 +540,7 @@ static int rdma_read_complete(struct svc_rqst *rqstp,
rqstp->rq_arg.page_base = head->arg.page_base;
/* rq_respages starts after the last arg page */
- rqstp->rq_respages = &rqstp->rq_arg.pages[page_no];
+ rqstp->rq_respages = &rqstp->rq_pages[page_no];
rqstp->rq_next_page = rqstp->rq_respages + 1;
/* Rebuild rq_arg head and tail. */
diff --git a/kernel/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/kernel/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 7de33d1af..969a1ab75 100644
--- a/kernel/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/kernel/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -136,6 +136,79 @@ static dma_addr_t dma_map_xdr(struct svcxprt_rdma *xprt,
return dma_addr;
}
+/* Returns the address of the first read chunk, or NULL if no read chunk
+ * is present.
+ *
+ * The read list is taken to start at rm_chunks[0]; a leading
+ * discriminator equal to xdr_zero means the list is empty.
+ */
+struct rpcrdma_read_chunk *
+svc_rdma_get_read_chunk(struct rpcrdma_msg *rmsgp)
+{
+ struct rpcrdma_read_chunk *ch =
+ (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
+
+ if (ch->rc_discrim == xdr_zero)
+ return NULL;
+ return ch;
+}
+
+/* Returns the address of the write array, or NULL if no write array
+ * is present.
+ *
+ * Only a message with an empty read list (rm_chunks[0] is xdr_zero) is
+ * handled; the write array is then looked for at rm_chunks[1]. NULL is
+ * also returned when the read list is not empty.
+ */
+static struct rpcrdma_write_array *
+svc_rdma_get_write_array(struct rpcrdma_msg *rmsgp)
+{
+ if (rmsgp->rm_body.rm_chunks[0] != xdr_zero ||
+ rmsgp->rm_body.rm_chunks[1] == xdr_zero)
+ return NULL;
+ return (struct rpcrdma_write_array *)&rmsgp->rm_body.rm_chunks[1];
+}
+
+/* Returns the address of the reply array, or NULL if no reply array is
+ * present.
+ *
+ * NULL is also returned whenever rm_chunks[0] or rm_chunks[1] is not
+ * xdr_zero; see the XXX note below.
+ */
+static struct rpcrdma_write_array *
+svc_rdma_get_reply_array(struct rpcrdma_msg *rmsgp)
+{
+ struct rpcrdma_read_chunk *rch;
+ struct rpcrdma_write_array *wr_ary;
+ struct rpcrdma_write_array *rp_ary;
+
+ /* XXX: Need to fix when reply chunk may occur with read list
+ * and/or write list.
+ */
+ if (rmsgp->rm_body.rm_chunks[0] != xdr_zero ||
+ rmsgp->rm_body.rm_chunks[1] != xdr_zero)
+ return NULL;
+
+ /* NOTE(review): the guard above only lets through messages whose
+ * rm_chunks[0] and rm_chunks[1] are both xdr_zero. In that case
+ * svc_rdma_get_write_array() returns NULL, and
+ * svc_rdma_get_read_chunk() presumably does too (assuming
+ * rc_discrim overlays rm_chunks[0] -- confirm), so the two
+ * branches below look unreachable until the XXX is resolved.
+ */
+ rch = svc_rdma_get_read_chunk(rmsgp);
+ if (rch) {
+ /* Skip to the entry that terminates the read list */
+ while (rch->rc_discrim != xdr_zero)
+ rch++;
+
+ /* The reply chunk follows an empty write array located
+ * at 'rc_position' here. The reply array is at rc_target.
+ */
+ rp_ary = (struct rpcrdma_write_array *)&rch->rc_target;
+ goto found_it;
+ }
+
+ wr_ary = svc_rdma_get_write_array(rmsgp);
+ if (wr_ary) {
+ int chunk = be32_to_cpu(wr_ary->wc_nchunks);
+
+ /* Index one past the last write chunk counted by wc_nchunks */
+ rp_ary = (struct rpcrdma_write_array *)
+ &wr_ary->wc_array[chunk].wc_target.rs_length;
+ goto found_it;
+ }
+
+ /* No read list, no write list */
+ rp_ary = (struct rpcrdma_write_array *)&rmsgp->rm_body.rm_chunks[2];
+
+ found_it:
+ /* A zero discriminator here means no reply array was provided */
+ if (rp_ary->wc_discrim == xdr_zero)
+ return NULL;
+ return rp_ary;
+}
+
/* Assumptions:
* - The specified write_len can be represented in sc_max_sge * PAGE_SIZE
*/
@@ -144,7 +217,7 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
u32 xdr_off, int write_len,
struct svc_rdma_req_map *vec)
{
- struct ib_send_wr write_wr;
+ struct ib_rdma_wr write_wr;
struct ib_sge *sge;
int xdr_sge_no;
int sge_no;
@@ -209,17 +282,17 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
/* Prepare WRITE WR */
memset(&write_wr, 0, sizeof write_wr);
ctxt->wr_op = IB_WR_RDMA_WRITE;
- write_wr.wr_id = (unsigned long)ctxt;
- write_wr.sg_list = &sge[0];
- write_wr.num_sge = sge_no;
- write_wr.opcode = IB_WR_RDMA_WRITE;
- write_wr.send_flags = IB_SEND_SIGNALED;
- write_wr.wr.rdma.rkey = rmr;
- write_wr.wr.rdma.remote_addr = to;
+ write_wr.wr.wr_id = (unsigned long)ctxt;
+ write_wr.wr.sg_list = &sge[0];
+ write_wr.wr.num_sge = sge_no;
+ write_wr.wr.opcode = IB_WR_RDMA_WRITE;
+ write_wr.wr.send_flags = IB_SEND_SIGNALED;
+ write_wr.rkey = rmr;
+ write_wr.remote_addr = to;
/* Post It */
atomic_inc(&rdma_stat_write);
- if (svc_rdma_send(xprt, &write_wr))
+ if (svc_rdma_send(xprt, &write_wr.wr))
goto err;
return write_len - bc;
err:
@@ -240,6 +313,7 @@ static int send_write_chunks(struct svcxprt_rdma *xprt,
u32 xdr_off;
int chunk_off;
int chunk_no;
+ int nchunks;
struct rpcrdma_write_array *arg_ary;
struct rpcrdma_write_array *res_ary;
int ret;
@@ -251,14 +325,15 @@ static int send_write_chunks(struct svcxprt_rdma *xprt,
&rdma_resp->rm_body.rm_chunks[1];
/* Write chunks start at the pagelist */
+ nchunks = be32_to_cpu(arg_ary->wc_nchunks);
for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0;
- xfer_len && chunk_no < arg_ary->wc_nchunks;
+ xfer_len && chunk_no < nchunks;
chunk_no++) {
struct rpcrdma_segment *arg_ch;
u64 rs_offset;
arg_ch = &arg_ary->wc_array[chunk_no].wc_target;
- write_len = min(xfer_len, ntohl(arg_ch->rs_length));
+ write_len = min(xfer_len, be32_to_cpu(arg_ch->rs_length));
/* Prepare the response chunk given the length actually
* written */
@@ -270,7 +345,7 @@ static int send_write_chunks(struct svcxprt_rdma *xprt,
chunk_off = 0;
while (write_len) {
ret = send_write(xprt, rqstp,
- ntohl(arg_ch->rs_handle),
+ be32_to_cpu(arg_ch->rs_handle),
rs_offset + chunk_off,
xdr_off,
write_len,
@@ -318,13 +393,13 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt,
&rdma_resp->rm_body.rm_chunks[2];
/* xdr offset starts at RPC message */
- nchunks = ntohl(arg_ary->wc_nchunks);
+ nchunks = be32_to_cpu(arg_ary->wc_nchunks);
for (xdr_off = 0, chunk_no = 0;
xfer_len && chunk_no < nchunks;
chunk_no++) {
u64 rs_offset;
ch = &arg_ary->wc_array[chunk_no].wc_target;
- write_len = min(xfer_len, htonl(ch->rs_length));
+ write_len = min(xfer_len, be32_to_cpu(ch->rs_length));
/* Prepare the reply chunk given the length actually
* written */
@@ -335,7 +410,7 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt,
chunk_off = 0;
while (write_len) {
ret = send_write(xprt, rqstp,
- ntohl(ch->rs_handle),
+ be32_to_cpu(ch->rs_handle),
rs_offset + chunk_off,
xdr_off,
write_len,
@@ -382,6 +457,7 @@ static int send_reply(struct svcxprt_rdma *rdma,
int byte_count)
{
struct ib_send_wr send_wr;
+ u32 xdr_off;
int sge_no;
int sge_bytes;
int page_no;
@@ -416,8 +492,8 @@ static int send_reply(struct svcxprt_rdma *rdma,
ctxt->direction = DMA_TO_DEVICE;
/* Map the payload indicated by 'byte_count' */
+ xdr_off = 0;
for (sge_no = 1; byte_count && sge_no < vec->count; sge_no++) {
- int xdr_off = 0;
sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count);
byte_count -= sge_bytes;
ctxt->sge[sge_no].addr =
@@ -455,6 +531,13 @@ static int send_reply(struct svcxprt_rdma *rdma,
}
rqstp->rq_next_page = rqstp->rq_respages + 1;
+ /* The loop above bumps sc_dma_used for each sge. The
+ * xdr_buf.tail gets a separate sge, but resides in the
+ * same page as xdr_buf.head. Don't count it twice.
+ */
+ if (sge_no > ctxt->count)
+ atomic_dec(&rdma->sc_dma_used);
+
if (sge_no > rdma->sc_max_sge) {
pr_err("svcrdma: Too many sges (%d)\n", sge_no);
goto err;
@@ -515,7 +598,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
inline_bytes = rqstp->rq_res.len;
/* Create the RDMA response header */
- res_page = svc_rdma_get_page();
+ res_page = alloc_page(GFP_KERNEL | __GFP_NOFAIL);
rdma_resp = page_address(res_page);
reply_ary = svc_rdma_get_reply_array(rdma_argp);
if (reply_ary)
diff --git a/kernel/net/sunrpc/xprtrdma/svc_rdma_transport.c b/kernel/net/sunrpc/xprtrdma/svc_rdma_transport.c
index f609c1c2d..b348b4ade 100644
--- a/kernel/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/kernel/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -56,6 +56,7 @@
#define RPCDBG_FACILITY RPCDBG_SVCXPRT
+static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *, int);
static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
struct net *net,
struct sockaddr *sa, int salen,
@@ -95,16 +96,69 @@ struct svc_xprt_class svc_rdma_class = {
.xcl_ident = XPRT_TRANSPORT_RDMA,
};
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+static struct svc_xprt *svc_rdma_bc_create(struct svc_serv *, struct net *,
+ struct sockaddr *, int, int);
+static void svc_rdma_bc_detach(struct svc_xprt *);
+static void svc_rdma_bc_free(struct svc_xprt *);
+
+/* Transport operations for the "rdma-bc" transport class. Create, detach
+ * and free use the svc_rdma_bc_* functions declared above; the reply
+ * header and secure port hooks are svc_rdma_prep_reply_hdr and
+ * svc_rdma_secure_port. No other operations are set here.
+ */
+static struct svc_xprt_ops svc_rdma_bc_ops = {
+ .xpo_create = svc_rdma_bc_create,
+ .xpo_detach = svc_rdma_bc_detach,
+ .xpo_free = svc_rdma_bc_free,
+ .xpo_prep_reply_hdr = svc_rdma_prep_reply_hdr,
+ .xpo_secure_port = svc_rdma_secure_port,
+};
+
+/* Transport class named "rdma-bc", backed by svc_rdma_bc_ops. The
+ * maximum payload is 1024 bytes less RPCRDMA_HDRLEN_MIN.
+ */
+struct svc_xprt_class svc_rdma_bc_class = {
+ .xcl_name = "rdma-bc",
+ .xcl_owner = THIS_MODULE,
+ .xcl_ops = &svc_rdma_bc_ops,
+ .xcl_max_payload = (1024 - RPCRDMA_HDRLEN_MIN)
+};
+
+/* Create the "rdma-bc" transport for @serv.
+ *
+ * Allocates a svcxprt_rdma with rdma_create_xprt(serv, 0), initializes
+ * its embedded svc_xprt against svc_rdma_bc_class in @net, and records
+ * it in serv->sv_bc_xprt. The @sa, @salen and @flags arguments are not
+ * used.
+ *
+ * Returns the new svc_xprt, or ERR_PTR(-ENOMEM) if rdma_create_xprt()
+ * returns NULL.
+ */
+static struct svc_xprt *svc_rdma_bc_create(struct svc_serv *serv,
+ struct net *net,
+ struct sockaddr *sa, int salen,
+ int flags)
+{
+ struct svcxprt_rdma *cma_xprt;
+ struct svc_xprt *xprt;
+
+ cma_xprt = rdma_create_xprt(serv, 0);
+ if (!cma_xprt)
+ return ERR_PTR(-ENOMEM);
+ xprt = &cma_xprt->sc_xprt;
+
+ svc_xprt_init(net, &svc_rdma_bc_class, xprt, serv);
+ serv->sv_bc_xprt = xprt;
+
+ dprintk("svcrdma: %s(%p)\n", __func__, xprt);
+ return xprt;
+}
+
+/* Detach hook for the "rdma-bc" transport: only emits a debug message. */
+static void svc_rdma_bc_detach(struct svc_xprt *xprt)
+{
+ dprintk("svcrdma: %s(%p)\n", __func__, xprt);
+}
+
+/* Free hook for the "rdma-bc" transport: releases the svcxprt_rdma that
+ * embeds @xprt with kfree(). Nothing else is released here.
+ */
+static void svc_rdma_bc_free(struct svc_xprt *xprt)
+{
+ /* NOTE(review): container_of() is evaluated before the NULL check
+ * below, so 'rdma' is only meaningful when @xprt is non-NULL.
+ */
+ struct svcxprt_rdma *rdma =
+ container_of(xprt, struct svcxprt_rdma, sc_xprt);
+
+ dprintk("svcrdma: %s(%p)\n", __func__, xprt);
+ if (xprt)
+ kfree(rdma);
+}
+#endif /* CONFIG_SUNRPC_BACKCHANNEL */
+
struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
{
struct svc_rdma_op_ctxt *ctxt;
- while (1) {
- ctxt = kmem_cache_alloc(svc_rdma_ctxt_cachep, GFP_KERNEL);
- if (ctxt)
- break;
- schedule_timeout_uninterruptible(msecs_to_jiffies(500));
- }
+ ctxt = kmem_cache_alloc(svc_rdma_ctxt_cachep,
+ GFP_KERNEL | __GFP_NOFAIL);
ctxt->xprt = xprt;
INIT_LIST_HEAD(&ctxt->dto_q);
ctxt->count = 0;
@@ -156,12 +210,8 @@ void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
struct svc_rdma_req_map *svc_rdma_get_req_map(void)
{
struct svc_rdma_req_map *map;
- while (1) {
- map = kmem_cache_alloc(svc_rdma_map_cachep, GFP_KERNEL);
- if (map)
- break;
- schedule_timeout_uninterruptible(msecs_to_jiffies(500));
- }
+ map = kmem_cache_alloc(svc_rdma_map_cachep,
+ GFP_KERNEL | __GFP_NOFAIL);
map->count = 0;
return map;
}
@@ -175,8 +225,8 @@ void svc_rdma_put_req_map(struct svc_rdma_req_map *map)
static void cq_event_handler(struct ib_event *event, void *context)
{
struct svc_xprt *xprt = context;
- dprintk("svcrdma: received CQ event id=%d, context=%p\n",
- event->event, context);
+ dprintk("svcrdma: received CQ event %s (%d), context=%p\n",
+ ib_event_msg(event->event), event->event, context);
set_bit(XPT_CLOSE, &xprt->xpt_flags);
}
@@ -191,8 +241,9 @@ static void qp_event_handler(struct ib_event *event, void *context)
case IB_EVENT_COMM_EST:
case IB_EVENT_SQ_DRAINED:
case IB_EVENT_QP_LAST_WQE_REACHED:
- dprintk("svcrdma: QP event %d received for QP=%p\n",
- event->event, event->element.qp);
+ dprintk("svcrdma: QP event %s (%d) received for QP=%p\n",
+ ib_event_msg(event->event), event->event,
+ event->element.qp);
break;
/* These are considered fatal events */
case IB_EVENT_PATH_MIG_ERR:
@@ -201,9 +252,10 @@ static void qp_event_handler(struct ib_event *event, void *context)
case IB_EVENT_QP_ACCESS_ERR:
case IB_EVENT_DEVICE_FATAL:
default:
- dprintk("svcrdma: QP ERROR event %d received for QP=%p, "
+ dprintk("svcrdma: QP ERROR event %s (%d) received for QP=%p, "
"closing transport\n",
- event->event, event->element.qp);
+ ib_event_msg(event->event), event->event,
+ event->element.qp);
set_bit(XPT_CLOSE, &xprt->xpt_flags);
break;
}
@@ -402,7 +454,8 @@ static void sq_cq_reap(struct svcxprt_rdma *xprt)
for (i = 0; i < ret; i++) {
wc = &wc_a[i];
if (wc->status != IB_WC_SUCCESS) {
- dprintk("svcrdma: sq wc err status %d\n",
+ dprintk("svcrdma: sq wc err status %s (%d)\n",
+ ib_wc_status_msg(wc->status),
wc->status);
/* Close the transport */
@@ -490,18 +543,6 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
return cma_xprt;
}
-struct page *svc_rdma_get_page(void)
-{
- struct page *page;
-
- while ((page = alloc_page(GFP_KERNEL)) == NULL) {
- /* If we can't get memory, wait a bit and try again */
- printk(KERN_INFO "svcrdma: out of memory...retrying in 1s\n");
- schedule_timeout_uninterruptible(msecs_to_jiffies(1000));
- }
- return page;
-}
-
int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
{
struct ib_recv_wr recv_wr, *bad_recv_wr;
@@ -520,7 +561,7 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
pr_err("svcrdma: Too many sges (%d)\n", sge_no);
goto err_put_ctxt;
}
- page = svc_rdma_get_page();
+ page = alloc_page(GFP_KERNEL | __GFP_NOFAIL);
ctxt->pages[sge_no] = page;
pa = ib_dma_map_page(xprt->sc_cm_id->device,
page, 0, PAGE_SIZE,
@@ -616,7 +657,8 @@ static int rdma_listen_handler(struct rdma_cm_id *cma_id,
switch (event->event) {
case RDMA_CM_EVENT_CONNECT_REQUEST:
dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, "
- "event=%d\n", cma_id, cma_id->context, event->event);
+ "event = %s (%d)\n", cma_id, cma_id->context,
+ rdma_event_msg(event->event), event->event);
handle_connect_req(cma_id,
event->param.conn.initiator_depth);
break;
@@ -636,7 +678,8 @@ static int rdma_listen_handler(struct rdma_cm_id *cma_id,
default:
dprintk("svcrdma: Unexpected event on listening endpoint %p, "
- "event=%d\n", cma_id, event->event);
+ "event = %s (%d)\n", cma_id,
+ rdma_event_msg(event->event), event->event);
break;
}
@@ -669,15 +712,18 @@ static int rdma_cma_handler(struct rdma_cm_id *cma_id,
break;
case RDMA_CM_EVENT_DEVICE_REMOVAL:
dprintk("svcrdma: Device removal cma_id=%p, xprt = %p, "
- "event=%d\n", cma_id, xprt, event->event);
+ "event = %s (%d)\n", cma_id, xprt,
+ rdma_event_msg(event->event), event->event);
if (xprt) {
set_bit(XPT_CLOSE, &xprt->xpt_flags);
svc_xprt_enqueue(xprt);
+ svc_xprt_put(xprt);
}
break;
default:
dprintk("svcrdma: Unexpected event on DTO endpoint %p, "
- "event=%d\n", cma_id, event->event);
+ "event = %s (%d)\n", cma_id,
+ rdma_event_msg(event->event), event->event);
break;
}
return 0;
@@ -704,8 +750,8 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
if (!cma_xprt)
return ERR_PTR(-ENOMEM);
- listen_id = rdma_create_id(rdma_listen_handler, cma_xprt, RDMA_PS_TCP,
- IB_QPT_RC);
+ listen_id = rdma_create_id(&init_net, rdma_listen_handler, cma_xprt,
+ RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(listen_id)) {
ret = PTR_ERR(listen_id);
dprintk("svcrdma: rdma_create_id failed = %d\n", ret);
@@ -744,24 +790,27 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
static struct svc_rdma_fastreg_mr *rdma_alloc_frmr(struct svcxprt_rdma *xprt)
{
struct ib_mr *mr;
- struct ib_fast_reg_page_list *pl;
+ struct scatterlist *sg;
struct svc_rdma_fastreg_mr *frmr;
+ u32 num_sg;
frmr = kmalloc(sizeof(*frmr), GFP_KERNEL);
if (!frmr)
goto err;
- mr = ib_alloc_fast_reg_mr(xprt->sc_pd, RPCSVC_MAXPAGES);
+ num_sg = min_t(u32, RPCSVC_MAXPAGES, xprt->sc_frmr_pg_list_len);
+ mr = ib_alloc_mr(xprt->sc_pd, IB_MR_TYPE_MEM_REG, num_sg);
if (IS_ERR(mr))
goto err_free_frmr;
- pl = ib_alloc_fast_reg_page_list(xprt->sc_cm_id->device,
- RPCSVC_MAXPAGES);
- if (IS_ERR(pl))
+ sg = kcalloc(RPCSVC_MAXPAGES, sizeof(*sg), GFP_KERNEL);
+ if (!sg)
goto err_free_mr;
+ sg_init_table(sg, RPCSVC_MAXPAGES);
+
frmr->mr = mr;
- frmr->page_list = pl;
+ frmr->sg = sg;
INIT_LIST_HEAD(&frmr->frmr_list);
return frmr;
@@ -781,8 +830,8 @@ static void rdma_dealloc_frmr_q(struct svcxprt_rdma *xprt)
frmr = list_entry(xprt->sc_frmr_q.next,
struct svc_rdma_fastreg_mr, frmr_list);
list_del_init(&frmr->frmr_list);
+ kfree(frmr->sg);
ib_dereg_mr(frmr->mr);
- ib_free_fast_reg_page_list(frmr->page_list);
kfree(frmr);
}
}
@@ -796,8 +845,7 @@ struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *rdma)
frmr = list_entry(rdma->sc_frmr_q.next,
struct svc_rdma_fastreg_mr, frmr_list);
list_del_init(&frmr->frmr_list);
- frmr->map_len = 0;
- frmr->page_list_len = 0;
+ frmr->sg_nents = 0;
}
spin_unlock_bh(&rdma->sc_frmr_q_lock);
if (frmr)
@@ -806,25 +854,13 @@ struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *rdma)
return rdma_alloc_frmr(rdma);
}
-static void frmr_unmap_dma(struct svcxprt_rdma *xprt,
- struct svc_rdma_fastreg_mr *frmr)
-{
- int page_no;
- for (page_no = 0; page_no < frmr->page_list_len; page_no++) {
- dma_addr_t addr = frmr->page_list->page_list[page_no];
- if (ib_dma_mapping_error(frmr->mr->device, addr))
- continue;
- atomic_dec(&xprt->sc_dma_used);
- ib_dma_unmap_page(frmr->mr->device, addr, PAGE_SIZE,
- frmr->direction);
- }
-}
-
void svc_rdma_put_frmr(struct svcxprt_rdma *rdma,
struct svc_rdma_fastreg_mr *frmr)
{
if (frmr) {
- frmr_unmap_dma(rdma, frmr);
+ ib_dma_unmap_sg(rdma->sc_cm_id->device,
+ frmr->sg, frmr->sg_nents, frmr->direction);
+ atomic_dec(&rdma->sc_dma_used);
spin_lock_bh(&rdma->sc_frmr_q_lock);
WARN_ON_ONCE(!list_empty(&frmr->frmr_list));
list_add(&frmr->frmr_list, &rdma->sc_frmr_q);
@@ -848,10 +884,11 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
struct svcxprt_rdma *listen_rdma;
struct svcxprt_rdma *newxprt = NULL;
struct rdma_conn_param conn_param;
+ struct ib_cq_init_attr cq_attr = {};
struct ib_qp_init_attr qp_attr;
struct ib_device_attr devattr;
int uninitialized_var(dma_mr_acc);
- int need_dma_mr;
+ int need_dma_mr = 0;
int ret;
int i;
@@ -884,6 +921,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
* capabilities of this particular device */
newxprt->sc_max_sge = min((size_t)devattr.max_sge,
(size_t)RPCSVC_MAXPAGES);
+ newxprt->sc_max_sge_rd = min_t(size_t, devattr.max_sge_rd,
+ RPCSVC_MAXPAGES);
newxprt->sc_max_requests = min((size_t)devattr.max_qp_wr,
(size_t)svcrdma_max_requests);
newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_max_requests;
@@ -900,22 +939,22 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
dprintk("svcrdma: error creating PD for connect request\n");
goto errout;
}
+ cq_attr.cqe = newxprt->sc_sq_depth;
newxprt->sc_sq_cq = ib_create_cq(newxprt->sc_cm_id->device,
sq_comp_handler,
cq_event_handler,
newxprt,
- newxprt->sc_sq_depth,
- 0);
+ &cq_attr);
if (IS_ERR(newxprt->sc_sq_cq)) {
dprintk("svcrdma: error creating SQ CQ for connect request\n");
goto errout;
}
+ cq_attr.cqe = newxprt->sc_max_requests;
newxprt->sc_rq_cq = ib_create_cq(newxprt->sc_cm_id->device,
rq_comp_handler,
cq_event_handler,
newxprt,
- newxprt->sc_max_requests,
- 0);
+ &cq_attr);
if (IS_ERR(newxprt->sc_rq_cq)) {
dprintk("svcrdma: error creating RQ CQ for connect request\n");
goto errout;
@@ -985,35 +1024,26 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
/*
* Determine if a DMA MR is required and if so, what privs are required
*/
- switch (rdma_node_get_transport(newxprt->sc_cm_id->device->node_type)) {
- case RDMA_TRANSPORT_IWARP:
- newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV;
- if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)) {
- need_dma_mr = 1;
- dma_mr_acc =
- (IB_ACCESS_LOCAL_WRITE |
- IB_ACCESS_REMOTE_WRITE);
- } else if (!(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) {
- need_dma_mr = 1;
- dma_mr_acc = IB_ACCESS_LOCAL_WRITE;
- } else
- need_dma_mr = 0;
- break;
- case RDMA_TRANSPORT_IB:
- if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)) {
- need_dma_mr = 1;
- dma_mr_acc = IB_ACCESS_LOCAL_WRITE;
- } else if (!(devattr.device_cap_flags &
- IB_DEVICE_LOCAL_DMA_LKEY)) {
- need_dma_mr = 1;
- dma_mr_acc = IB_ACCESS_LOCAL_WRITE;
- } else
- need_dma_mr = 0;
- break;
- default:
+ if (!rdma_protocol_iwarp(newxprt->sc_cm_id->device,
+ newxprt->sc_cm_id->port_num) &&
+ !rdma_ib_or_roce(newxprt->sc_cm_id->device,
+ newxprt->sc_cm_id->port_num))
goto errout;
+
+ if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG) ||
+ !(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) {
+ need_dma_mr = 1;
+ dma_mr_acc = IB_ACCESS_LOCAL_WRITE;
+ if (rdma_protocol_iwarp(newxprt->sc_cm_id->device,
+ newxprt->sc_cm_id->port_num) &&
+ !(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG))
+ dma_mr_acc |= IB_ACCESS_REMOTE_WRITE;
}
+ if (rdma_protocol_iwarp(newxprt->sc_cm_id->device,
+ newxprt->sc_cm_id->port_num))
+ newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV;
+
/* Create the DMA MR if needed, otherwise, use the DMA LKEY */
if (need_dma_mr) {
/* Register all of physical memory */
@@ -1067,6 +1097,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
" remote_ip : %pI4\n"
" remote_port : %d\n"
" max_sge : %d\n"
+ " max_sge_rd : %d\n"
" sq_depth : %d\n"
" max_requests : %d\n"
" ord : %d\n",
@@ -1080,6 +1111,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id->
route.addr.dst_addr)->sin_port),
newxprt->sc_max_sge,
+ newxprt->sc_max_sge_rd,
newxprt->sc_sq_depth,
newxprt->sc_max_requests,
newxprt->sc_ord);
@@ -1222,40 +1254,6 @@ static int svc_rdma_secure_port(struct svc_rqst *rqstp)
return 1;
}
-/*
- * Attempt to register the kvec representing the RPC memory with the
- * device.
- *
- * Returns:
- * NULL : The device does not support fastreg or there were no more
- * fastreg mr.
- * frmr : The kvec register request was successfully posted.
- * <0 : An error was encountered attempting to register the kvec.
- */
-int svc_rdma_fastreg(struct svcxprt_rdma *xprt,
- struct svc_rdma_fastreg_mr *frmr)
-{
- struct ib_send_wr fastreg_wr;
- u8 key;
-
- /* Bump the key */
- key = (u8)(frmr->mr->lkey & 0x000000FF);
- ib_update_fast_reg_key(frmr->mr, ++key);
-
- /* Prepare FASTREG WR */
- memset(&fastreg_wr, 0, sizeof fastreg_wr);
- fastreg_wr.opcode = IB_WR_FAST_REG_MR;
- fastreg_wr.send_flags = IB_SEND_SIGNALED;
- fastreg_wr.wr.fast_reg.iova_start = (unsigned long)frmr->kva;
- fastreg_wr.wr.fast_reg.page_list = frmr->page_list;
- fastreg_wr.wr.fast_reg.page_list_len = frmr->page_list_len;
- fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
- fastreg_wr.wr.fast_reg.length = frmr->map_len;
- fastreg_wr.wr.fast_reg.access_flags = frmr->access_flags;
- fastreg_wr.wr.fast_reg.rkey = frmr->mr->lkey;
- return svc_rdma_send(xprt, &fastreg_wr);
-}
-
int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
{
struct ib_send_wr *bad_wr, *n_wr;
@@ -1319,11 +1317,11 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
struct ib_send_wr err_wr;
struct page *p;
struct svc_rdma_op_ctxt *ctxt;
- u32 *va;
+ __be32 *va;
int length;
int ret;
- p = svc_rdma_get_page();
+ p = alloc_page(GFP_KERNEL | __GFP_NOFAIL);
va = page_address(p);
/* XDR encode error */
diff --git a/kernel/net/sunrpc/xprtrdma/transport.c b/kernel/net/sunrpc/xprtrdma/transport.c
index 54f23b1be..8c545f7d7 100644
--- a/kernel/net/sunrpc/xprtrdma/transport.c
+++ b/kernel/net/sunrpc/xprtrdma/transport.c
@@ -48,7 +48,6 @@
*/
#include <linux/module.h>
-#include <linux/init.h>
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <linux/sunrpc/addr.h>
@@ -59,11 +58,6 @@
# define RPCDBG_FACILITY RPCDBG_TRANS
#endif
-MODULE_LICENSE("Dual BSD/GPL");
-
-MODULE_DESCRIPTION("RPC/RDMA Transport for Linux kernel NFS");
-MODULE_AUTHOR("Network Appliance, Inc.");
-
/*
* tunables
*/
@@ -181,10 +175,8 @@ xprt_rdma_format_addresses6(struct rpc_xprt *xprt, struct sockaddr *sap)
}
static void
-xprt_rdma_format_addresses(struct rpc_xprt *xprt)
+xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap)
{
- struct sockaddr *sap = (struct sockaddr *)
- &rpcx_to_rdmad(xprt).addr;
char buf[128];
switch (sap->sa_family) {
@@ -246,6 +238,16 @@ xprt_rdma_connect_worker(struct work_struct *work)
xprt_clear_connecting(xprt);
}
+/* Force a disconnect of the RDMA connection behind @xprt.
+ *
+ * Logs the event at info level, then calls rdma_disconnect() on the
+ * transport's connection identifier (rx_ia.ri_id). The return value of
+ * rdma_disconnect() is not checked.
+ */
+static void
+xprt_rdma_inject_disconnect(struct rpc_xprt *xprt)
+{
+ struct rpcrdma_xprt *r_xprt = container_of(xprt, struct rpcrdma_xprt,
+ rx_xprt);
+
+ pr_info("rpcrdma: injecting transport disconnect on xprt=%p\n", xprt);
+ rdma_disconnect(r_xprt->rx_ia.ri_id);
+}
+
/*
* xprt_rdma_destroy
*
@@ -268,8 +270,8 @@ xprt_rdma_destroy(struct rpc_xprt *xprt)
xprt_clear_connected(xprt);
- rpcrdma_buffer_destroy(&r_xprt->rx_buf);
rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia);
+ rpcrdma_buffer_destroy(&r_xprt->rx_buf);
rpcrdma_ia_close(&r_xprt->rx_ia);
xprt_rdma_free_addresses(xprt);
@@ -298,7 +300,7 @@ xprt_setup_rdma(struct xprt_create *args)
struct rpc_xprt *xprt;
struct rpcrdma_xprt *new_xprt;
struct rpcrdma_ep *new_ep;
- struct sockaddr_in *sin;
+ struct sockaddr *sap;
int rc;
if (args->addrlen > sizeof(xprt->addr)) {
@@ -329,26 +331,20 @@ xprt_setup_rdma(struct xprt_create *args)
* Set up RDMA-specific connect data.
*/
- /* Put server RDMA address in local cdata */
- memcpy(&cdata.addr, args->dstaddr, args->addrlen);
+ sap = (struct sockaddr *)&cdata.addr;
+ memcpy(sap, args->dstaddr, args->addrlen);
/* Ensure xprt->addr holds valid server TCP (not RDMA)
* address, for any side protocols which peek at it */
xprt->prot = IPPROTO_TCP;
xprt->addrlen = args->addrlen;
- memcpy(&xprt->addr, &cdata.addr, xprt->addrlen);
+ memcpy(&xprt->addr, sap, xprt->addrlen);
- sin = (struct sockaddr_in *)&cdata.addr;
- if (ntohs(sin->sin_port) != 0)
+ if (rpc_get_port(sap))
xprt_set_bound(xprt);
- dprintk("RPC: %s: %pI4:%u\n",
- __func__, &sin->sin_addr.s_addr, ntohs(sin->sin_port));
-
- /* Set max requests */
cdata.max_requests = xprt->max_reqs;
- /* Set some length limits */
cdata.rsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA write max */
cdata.wsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA read max */
@@ -371,8 +367,7 @@ xprt_setup_rdma(struct xprt_create *args)
new_xprt = rpcx_to_rdmax(xprt);
- rc = rpcrdma_ia_open(new_xprt, (struct sockaddr *) &cdata.addr,
- xprt_rdma_memreg_strategy);
+ rc = rpcrdma_ia_open(new_xprt, sap, xprt_rdma_memreg_strategy);
if (rc)
goto out1;
@@ -405,7 +400,7 @@ xprt_setup_rdma(struct xprt_create *args)
INIT_DELAYED_WORK(&new_xprt->rx_connect_worker,
xprt_rdma_connect_worker);
- xprt_rdma_format_addresses(xprt);
+ xprt_rdma_format_addresses(xprt, sap);
xprt->max_payload = new_xprt->rx_ia.ri_ops->ro_maxpages(new_xprt);
if (xprt->max_payload == 0)
goto out4;
@@ -416,6 +411,9 @@ xprt_setup_rdma(struct xprt_create *args)
if (!try_module_get(THIS_MODULE))
goto out4;
+ dprintk("RPC: %s: %s:%s\n", __func__,
+ xprt->address_strings[RPC_DISPLAY_ADDR],
+ xprt->address_strings[RPC_DISPLAY_PORT]);
return xprt;
out4:
@@ -618,12 +616,6 @@ xprt_rdma_send_request(struct rpc_task *task)
if (req->rl_reply == NULL) /* e.g. reconnection */
rpcrdma_recv_buffer_get(req);
- if (req->rl_reply) {
- req->rl_reply->rr_func = rpcrdma_reply_handler;
- /* this need only be done once, but... */
- req->rl_reply->rr_xprt = xprt;
- }
-
/* Must suppress retransmit to maintain credits */
if (req->rl_connect_cookie == xprt->connect_cookie)
goto drop_connection;
@@ -655,31 +647,41 @@ static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
if (xprt_connected(xprt))
idle_time = (long)(jiffies - xprt->last_used) / HZ;
- seq_printf(seq,
- "\txprt:\trdma %u %lu %lu %lu %ld %lu %lu %lu %Lu %Lu "
- "%lu %lu %lu %Lu %Lu %Lu %Lu %lu %lu %lu\n",
-
- 0, /* need a local port? */
- xprt->stat.bind_count,
- xprt->stat.connect_count,
- xprt->stat.connect_time,
- idle_time,
- xprt->stat.sends,
- xprt->stat.recvs,
- xprt->stat.bad_xids,
- xprt->stat.req_u,
- xprt->stat.bklog_u,
-
- r_xprt->rx_stats.read_chunk_count,
- r_xprt->rx_stats.write_chunk_count,
- r_xprt->rx_stats.reply_chunk_count,
- r_xprt->rx_stats.total_rdma_request,
- r_xprt->rx_stats.total_rdma_reply,
- r_xprt->rx_stats.pullup_copy_count,
- r_xprt->rx_stats.fixup_copy_count,
- r_xprt->rx_stats.hardway_register_count,
- r_xprt->rx_stats.failed_marshal_count,
- r_xprt->rx_stats.bad_reply_count);
+ seq_puts(seq, "\txprt:\trdma ");
+ seq_printf(seq, "%u %lu %lu %lu %ld %lu %lu %lu %llu %llu ",
+ 0, /* need a local port? */
+ xprt->stat.bind_count,
+ xprt->stat.connect_count,
+ xprt->stat.connect_time,
+ idle_time,
+ xprt->stat.sends,
+ xprt->stat.recvs,
+ xprt->stat.bad_xids,
+ xprt->stat.req_u,
+ xprt->stat.bklog_u);
+ seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %lu %lu %lu %lu\n",
+ r_xprt->rx_stats.read_chunk_count,
+ r_xprt->rx_stats.write_chunk_count,
+ r_xprt->rx_stats.reply_chunk_count,
+ r_xprt->rx_stats.total_rdma_request,
+ r_xprt->rx_stats.total_rdma_reply,
+ r_xprt->rx_stats.pullup_copy_count,
+ r_xprt->rx_stats.fixup_copy_count,
+ r_xprt->rx_stats.hardway_register_count,
+ r_xprt->rx_stats.failed_marshal_count,
+ r_xprt->rx_stats.bad_reply_count,
+ r_xprt->rx_stats.nomsg_call_count);
+}
+
+static int
+xprt_rdma_enable_swap(struct rpc_xprt *xprt)
+{
+ return 0;
+}
+
+static void
+xprt_rdma_disable_swap(struct rpc_xprt *xprt)
+{
}
/*
@@ -700,7 +702,16 @@ static struct rpc_xprt_ops xprt_rdma_procs = {
.send_request = xprt_rdma_send_request,
.close = xprt_rdma_close,
.destroy = xprt_rdma_destroy,
- .print_stats = xprt_rdma_print_stats
+ .print_stats = xprt_rdma_print_stats,
+ .enable_swap = xprt_rdma_enable_swap,
+ .disable_swap = xprt_rdma_disable_swap,
+ .inject_disconnect = xprt_rdma_inject_disconnect,
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+ .bc_setup = xprt_rdma_bc_setup,
+ .bc_up = xprt_rdma_bc_up,
+ .bc_free_rqst = xprt_rdma_bc_free_rqst,
+ .bc_destroy = xprt_rdma_bc_destroy,
+#endif
};
static struct xprt_class xprt_rdma = {
@@ -711,7 +722,7 @@ static struct xprt_class xprt_rdma = {
.setup = xprt_setup_rdma,
};
-static void __exit xprt_rdma_cleanup(void)
+void xprt_rdma_cleanup(void)
{
int rc;
@@ -726,17 +737,32 @@ static void __exit xprt_rdma_cleanup(void)
if (rc)
dprintk("RPC: %s: xprt_unregister returned %i\n",
__func__, rc);
+
+ rpcrdma_destroy_wq();
+ frwr_destroy_recovery_wq();
}
-static int __init xprt_rdma_init(void)
+int xprt_rdma_init(void)
{
int rc;
- rc = xprt_register_transport(&xprt_rdma);
-
+ rc = frwr_alloc_recovery_wq();
if (rc)
return rc;
+ rc = rpcrdma_alloc_wq();
+ if (rc) {
+ frwr_destroy_recovery_wq();
+ return rc;
+ }
+
+ rc = xprt_register_transport(&xprt_rdma);
+ if (rc) {
+ rpcrdma_destroy_wq();
+ frwr_destroy_recovery_wq();
+ return rc;
+ }
+
dprintk("RPCRDMA Module Init, register RPC RDMA transport\n");
dprintk("Defaults:\n");
@@ -753,6 +779,3 @@ static int __init xprt_rdma_init(void)
#endif
return 0;
}
-
-module_init(xprt_rdma_init);
-module_exit(xprt_rdma_cleanup);
diff --git a/kernel/net/sunrpc/xprtrdma/verbs.c b/kernel/net/sunrpc/xprtrdma/verbs.c
index 4870d272e..eadd16551 100644
--- a/kernel/net/sunrpc/xprtrdma/verbs.c
+++ b/kernel/net/sunrpc/xprtrdma/verbs.c
@@ -52,6 +52,7 @@
#include <linux/prefetch.h>
#include <linux/sunrpc/addr.h>
#include <asm/bitops.h>
+#include <linux/module.h> /* try_module_get()/module_put() */
#include "xprt_rdma.h"
@@ -67,79 +68,33 @@
* internal functions
*/
-/*
- * handle replies in tasklet context, using a single, global list
- * rdma tasklet function -- just turn around and call the func
- * for all replies on the list
- */
-
-static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
-static LIST_HEAD(rpcrdma_tasklets_g);
+static struct workqueue_struct *rpcrdma_receive_wq;
-static void
-rpcrdma_run_tasklet(unsigned long data)
+int
+rpcrdma_alloc_wq(void)
{
- struct rpcrdma_rep *rep;
- void (*func)(struct rpcrdma_rep *);
- unsigned long flags;
+ struct workqueue_struct *recv_wq;
- data = data;
- spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
- while (!list_empty(&rpcrdma_tasklets_g)) {
- rep = list_entry(rpcrdma_tasklets_g.next,
- struct rpcrdma_rep, rr_list);
- list_del(&rep->rr_list);
- func = rep->rr_func;
- rep->rr_func = NULL;
- spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
-
- if (func)
- func(rep);
- else
- rpcrdma_recv_buffer_put(rep);
-
- spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
- }
- spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
-}
+ recv_wq = alloc_workqueue("xprtrdma_receive",
+ WQ_MEM_RECLAIM | WQ_UNBOUND | WQ_HIGHPRI,
+ 0);
+ if (!recv_wq)
+ return -ENOMEM;
-static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
-
-static const char * const async_event[] = {
- "CQ error",
- "QP fatal error",
- "QP request error",
- "QP access error",
- "communication established",
- "send queue drained",
- "path migration successful",
- "path mig error",
- "device fatal error",
- "port active",
- "port error",
- "LID change",
- "P_key change",
- "SM change",
- "SRQ error",
- "SRQ limit reached",
- "last WQE reached",
- "client reregister",
- "GID change",
-};
-
-#define ASYNC_MSG(status) \
- ((status) < ARRAY_SIZE(async_event) ? \
- async_event[(status)] : "unknown async error")
+ rpcrdma_receive_wq = recv_wq;
+ return 0;
+}
-static void
-rpcrdma_schedule_tasklet(struct list_head *sched_list)
+void
+rpcrdma_destroy_wq(void)
{
- unsigned long flags;
+ struct workqueue_struct *wq;
- spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
- list_splice_tail(sched_list, &rpcrdma_tasklets_g);
- spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
- tasklet_schedule(&rpcrdma_tasklet_g);
+ if (rpcrdma_receive_wq) {
+ wq = rpcrdma_receive_wq;
+ rpcrdma_receive_wq = NULL;
+ destroy_workqueue(wq);
+ }
}
static void
@@ -148,7 +103,7 @@ rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
struct rpcrdma_ep *ep = context;
pr_err("RPC: %s: %s on device %s ep %p\n",
- __func__, ASYNC_MSG(event->event),
+ __func__, ib_event_msg(event->event),
event->device->name, context);
if (ep->rep_connected == 1) {
ep->rep_connected = -EIO;
@@ -163,7 +118,7 @@ rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
struct rpcrdma_ep *ep = context;
pr_err("RPC: %s: %s on device %s ep %p\n",
- __func__, ASYNC_MSG(event->event),
+ __func__, ib_event_msg(event->event),
event->device->name, context);
if (ep->rep_connected == 1) {
ep->rep_connected = -EIO;
@@ -172,35 +127,6 @@ rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
}
}
-static const char * const wc_status[] = {
- "success",
- "local length error",
- "local QP operation error",
- "local EE context operation error",
- "local protection error",
- "WR flushed",
- "memory management operation error",
- "bad response error",
- "local access error",
- "remote invalid request error",
- "remote access error",
- "remote operation error",
- "transport retry counter exceeded",
- "RNR retry counter exceeded",
- "local RDD violation error",
- "remove invalid RD request",
- "operation aborted",
- "invalid EE context number",
- "invalid EE context state",
- "fatal error",
- "response timeout error",
- "general error",
-};
-
-#define COMPLETION_MSG(status) \
- ((status) < ARRAY_SIZE(wc_status) ? \
- wc_status[(status)] : "unexpected completion error")
-
static void
rpcrdma_sendcq_process_wc(struct ib_wc *wc)
{
@@ -209,7 +135,7 @@ rpcrdma_sendcq_process_wc(struct ib_wc *wc)
if (wc->status != IB_WC_SUCCESS &&
wc->status != IB_WC_WR_FLUSH_ERR)
pr_err("RPC: %s: SEND: %s\n",
- __func__, COMPLETION_MSG(wc->status));
+ __func__, ib_wc_status_msg(wc->status));
} else {
struct rpcrdma_mw *r;
@@ -218,63 +144,54 @@ rpcrdma_sendcq_process_wc(struct ib_wc *wc)
}
}
-static int
-rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
+/* The common case is a single send completion is waiting. By
+ * passing two WC entries to ib_poll_cq, a return code of 1
+ * means there is exactly one WC waiting and no more. We don't
+ * have to invoke ib_poll_cq again to know that the CQ has been
+ * properly drained.
+ */
+static void
+rpcrdma_sendcq_poll(struct ib_cq *cq)
{
- struct ib_wc *wcs;
- int budget, count, rc;
+ struct ib_wc *pos, wcs[2];
+ int count, rc;
- budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
do {
- wcs = ep->rep_send_wcs;
+ pos = wcs;
- rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
- if (rc <= 0)
- return rc;
+ rc = ib_poll_cq(cq, ARRAY_SIZE(wcs), pos);
+ if (rc < 0)
+ break;
count = rc;
while (count-- > 0)
- rpcrdma_sendcq_process_wc(wcs++);
- } while (rc == RPCRDMA_POLLSIZE && --budget);
- return 0;
+ rpcrdma_sendcq_process_wc(pos++);
+ } while (rc == ARRAY_SIZE(wcs));
+ return;
}
-/*
- * Handle send, fast_reg_mr, and local_inv completions.
- *
- * Send events are typically suppressed and thus do not result
- * in an upcall. Occasionally one is signaled, however. This
- * prevents the provider's completion queue from wrapping and
- * losing a completion.
+/* Handle provider send completion upcalls.
*/
static void
rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
{
- struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
- int rc;
-
- rc = rpcrdma_sendcq_poll(cq, ep);
- if (rc) {
- dprintk("RPC: %s: ib_poll_cq failed: %i\n",
- __func__, rc);
- return;
- }
+ do {
+ rpcrdma_sendcq_poll(cq);
+ } while (ib_req_notify_cq(cq, IB_CQ_NEXT_COMP |
+ IB_CQ_REPORT_MISSED_EVENTS) > 0);
+}
- rc = ib_req_notify_cq(cq,
- IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
- if (rc == 0)
- return;
- if (rc < 0) {
- dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
- __func__, rc);
- return;
- }
+static void
+rpcrdma_receive_worker(struct work_struct *work)
+{
+ struct rpcrdma_rep *rep =
+ container_of(work, struct rpcrdma_rep, rr_work);
- rpcrdma_sendcq_poll(cq, ep);
+ rpcrdma_reply_handler(rep);
}
static void
-rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list)
+rpcrdma_recvcq_process_wc(struct ib_wc *wc)
{
struct rpcrdma_rep *rep =
(struct rpcrdma_rep *)(unsigned long)wc->wr_id;
@@ -291,126 +208,70 @@ rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list)
__func__, rep, wc->byte_len);
rep->rr_len = wc->byte_len;
- ib_dma_sync_single_for_cpu(rdmab_to_ia(rep->rr_buffer)->ri_id->device,
+ ib_dma_sync_single_for_cpu(rep->rr_device,
rdmab_addr(rep->rr_rdmabuf),
rep->rr_len, DMA_FROM_DEVICE);
prefetch(rdmab_to_msg(rep->rr_rdmabuf));
out_schedule:
- list_add_tail(&rep->rr_list, sched_list);
+ queue_work(rpcrdma_receive_wq, &rep->rr_work);
return;
+
out_fail:
if (wc->status != IB_WC_WR_FLUSH_ERR)
pr_err("RPC: %s: rep %p: %s\n",
- __func__, rep, COMPLETION_MSG(wc->status));
- rep->rr_len = ~0U;
+ __func__, rep, ib_wc_status_msg(wc->status));
+ rep->rr_len = RPCRDMA_BAD_LEN;
goto out_schedule;
}
-static int
-rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
+/* The wc array is on stack: automatic memory is always CPU-local.
+ *
+ * struct ib_wc is 64 bytes, making the poll array potentially
+ * large. But this is at the bottom of the call chain. Further
+ * substantial work is done in another thread.
+ */
+static void
+rpcrdma_recvcq_poll(struct ib_cq *cq)
{
- struct list_head sched_list;
- struct ib_wc *wcs;
- int budget, count, rc;
+ struct ib_wc *pos, wcs[4];
+ int count, rc;
- INIT_LIST_HEAD(&sched_list);
- budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
do {
- wcs = ep->rep_recv_wcs;
+ pos = wcs;
- rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
- if (rc <= 0)
- goto out_schedule;
+ rc = ib_poll_cq(cq, ARRAY_SIZE(wcs), pos);
+ if (rc < 0)
+ break;
count = rc;
while (count-- > 0)
- rpcrdma_recvcq_process_wc(wcs++, &sched_list);
- } while (rc == RPCRDMA_POLLSIZE && --budget);
- rc = 0;
-
-out_schedule:
- rpcrdma_schedule_tasklet(&sched_list);
- return rc;
+ rpcrdma_recvcq_process_wc(pos++);
+ } while (rc == ARRAY_SIZE(wcs));
}
-/*
- * Handle receive completions.
- *
- * It is reentrant but processes single events in order to maintain
- * ordering of receives to keep server credits.
- *
- * It is the responsibility of the scheduled tasklet to return
- * recv buffers to the pool. NOTE: this affects synchronization of
- * connection shutdown. That is, the structures required for
- * the completion of the reply handler must remain intact until
- * all memory has been reclaimed.
+/* Handle provider receive completion upcalls.
*/
static void
rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
{
- struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
- int rc;
-
- rc = rpcrdma_recvcq_poll(cq, ep);
- if (rc) {
- dprintk("RPC: %s: ib_poll_cq failed: %i\n",
- __func__, rc);
- return;
- }
-
- rc = ib_req_notify_cq(cq,
- IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
- if (rc == 0)
- return;
- if (rc < 0) {
- dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
- __func__, rc);
- return;
- }
-
- rpcrdma_recvcq_poll(cq, ep);
+ do {
+ rpcrdma_recvcq_poll(cq);
+ } while (ib_req_notify_cq(cq, IB_CQ_NEXT_COMP |
+ IB_CQ_REPORT_MISSED_EVENTS) > 0);
}
static void
rpcrdma_flush_cqs(struct rpcrdma_ep *ep)
{
struct ib_wc wc;
- LIST_HEAD(sched_list);
while (ib_poll_cq(ep->rep_attr.recv_cq, 1, &wc) > 0)
- rpcrdma_recvcq_process_wc(&wc, &sched_list);
- if (!list_empty(&sched_list))
- rpcrdma_schedule_tasklet(&sched_list);
+ rpcrdma_recvcq_process_wc(&wc);
while (ib_poll_cq(ep->rep_attr.send_cq, 1, &wc) > 0)
rpcrdma_sendcq_process_wc(&wc);
}
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
-static const char * const conn[] = {
- "address resolved",
- "address error",
- "route resolved",
- "route error",
- "connect request",
- "connect response",
- "connect error",
- "unreachable",
- "rejected",
- "established",
- "disconnected",
- "device removal",
- "multicast join",
- "multicast error",
- "address change",
- "timewait exit",
-};
-
-#define CONNECTION_MSG(status) \
- ((status) < ARRAY_SIZE(conn) ? \
- conn[(status)] : "unrecognized connection error")
-#endif
-
static int
rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
@@ -476,7 +337,7 @@ connected:
default:
dprintk("RPC: %s: %pIS:%u (ep 0x%p): %s\n",
__func__, sap, rpc_get_port(sap), ep,
- CONNECTION_MSG(event->event));
+ rdma_event_msg(event->event));
break;
}
@@ -487,7 +348,7 @@ connected:
pr_info("rpcrdma: connection to %pIS:%u on %s, memreg '%s', %d credits, %d responders%s\n",
sap, rpc_get_port(sap),
- ia->ri_id->device->name,
+ ia->ri_device->name,
ia->ri_ops->ro_displayname,
xprt->rx_buf.rb_max_requests,
ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
@@ -500,6 +361,14 @@ connected:
return 0;
}
+static void rpcrdma_destroy_id(struct rdma_cm_id *id)
+{
+ if (id) {
+ module_put(id->device->owner);
+ rdma_destroy_id(id);
+ }
+}
+
static struct rdma_cm_id *
rpcrdma_create_id(struct rpcrdma_xprt *xprt,
struct rpcrdma_ia *ia, struct sockaddr *addr)
@@ -509,7 +378,8 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt,
init_completion(&ia->ri_done);
- id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC);
+ id = rdma_create_id(&init_net, rpcrdma_conn_upcall, xprt, RDMA_PS_TCP,
+ IB_QPT_RC);
if (IS_ERR(id)) {
rc = PTR_ERR(id);
dprintk("RPC: %s: rdma_create_id() failed %i\n",
@@ -526,6 +396,17 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt,
}
wait_for_completion_interruptible_timeout(&ia->ri_done,
msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
+
+ /* FIXME:
+ * Until xprtrdma supports DEVICE_REMOVAL, the provider must
+ * be pinned while there are active NFS/RDMA mounts to prevent
+ * hangs and crashes at umount time.
+ */
+ if (!ia->ri_async_rc && !try_module_get(id->device->owner)) {
+ dprintk("RPC: %s: Failed to get device module\n",
+ __func__);
+ ia->ri_async_rc = -ENODEV;
+ }
rc = ia->ri_async_rc;
if (rc)
goto out;
@@ -535,16 +416,17 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt,
if (rc) {
dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
__func__, rc);
- goto out;
+ goto put;
}
wait_for_completion_interruptible_timeout(&ia->ri_done,
msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
rc = ia->ri_async_rc;
if (rc)
- goto out;
+ goto put;
return id;
-
+put:
+ module_put(id->device->owner);
out:
rdma_destroy_id(id);
return ERR_PTR(rc);
@@ -579,17 +461,20 @@ rpcrdma_clean_cq(struct ib_cq *cq)
int
rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
{
- int rc, mem_priv;
struct rpcrdma_ia *ia = &xprt->rx_ia;
struct ib_device_attr *devattr = &ia->ri_devattr;
+ int rc;
+
+ ia->ri_dma_mr = NULL;
ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
if (IS_ERR(ia->ri_id)) {
rc = PTR_ERR(ia->ri_id);
goto out1;
}
+ ia->ri_device = ia->ri_id->device;
- ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
+ ia->ri_pd = ib_alloc_pd(ia->ri_device);
if (IS_ERR(ia->ri_pd)) {
rc = PTR_ERR(ia->ri_pd);
dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
@@ -597,69 +482,39 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
goto out2;
}
- rc = ib_query_device(ia->ri_id->device, devattr);
+ rc = ib_query_device(ia->ri_device, devattr);
if (rc) {
dprintk("RPC: %s: ib_query_device failed %d\n",
__func__, rc);
goto out3;
}
- if (devattr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
- ia->ri_have_dma_lkey = 1;
- ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
- }
-
if (memreg == RPCRDMA_FRMR) {
- /* Requires both frmr reg and local dma lkey */
- if (((devattr->device_cap_flags &
- (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
- (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) ||
- (devattr->max_fast_reg_page_list_len == 0)) {
+ if (!(devattr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) ||
+ (devattr->max_fast_reg_page_list_len == 0)) {
dprintk("RPC: %s: FRMR registration "
"not supported by HCA\n", __func__);
memreg = RPCRDMA_MTHCAFMR;
}
}
if (memreg == RPCRDMA_MTHCAFMR) {
- if (!ia->ri_id->device->alloc_fmr) {
+ if (!ia->ri_device->alloc_fmr) {
dprintk("RPC: %s: MTHCAFMR registration "
"not supported by HCA\n", __func__);
- memreg = RPCRDMA_ALLPHYSICAL;
+ rc = -EINVAL;
+ goto out3;
}
}
- /*
- * Optionally obtain an underlying physical identity mapping in
- * order to do a memory window-based bind. This base registration
- * is protected from remote access - that is enabled only by binding
- * for the specific bytes targeted during each RPC operation, and
- * revoked after the corresponding completion similar to a storage
- * adapter.
- */
switch (memreg) {
case RPCRDMA_FRMR:
ia->ri_ops = &rpcrdma_frwr_memreg_ops;
break;
case RPCRDMA_ALLPHYSICAL:
ia->ri_ops = &rpcrdma_physical_memreg_ops;
- mem_priv = IB_ACCESS_LOCAL_WRITE |
- IB_ACCESS_REMOTE_WRITE |
- IB_ACCESS_REMOTE_READ;
- goto register_setup;
+ break;
case RPCRDMA_MTHCAFMR:
ia->ri_ops = &rpcrdma_fmr_memreg_ops;
- if (ia->ri_have_dma_lkey)
- break;
- mem_priv = IB_ACCESS_LOCAL_WRITE;
- register_setup:
- ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
- if (IS_ERR(ia->ri_bind_mem)) {
- printk(KERN_ALERT "%s: ib_get_dma_mr for "
- "phys register failed with %lX\n",
- __func__, PTR_ERR(ia->ri_bind_mem));
- rc = -ENOMEM;
- goto out3;
- }
break;
default:
printk(KERN_ERR "RPC: Unsupported memory "
@@ -670,9 +525,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
dprintk("RPC: %s: memory registration strategy is '%s'\n",
__func__, ia->ri_ops->ro_displayname);
- /* Else will do memory reg/dereg for each chunk */
- ia->ri_memreg_strategy = memreg;
-
rwlock_init(&ia->ri_qplock);
return 0;
@@ -680,7 +532,7 @@ out3:
ib_dealloc_pd(ia->ri_pd);
ia->ri_pd = NULL;
out2:
- rdma_destroy_id(ia->ri_id);
+ rpcrdma_destroy_id(ia->ri_id);
ia->ri_id = NULL;
out1:
return rc;
@@ -694,25 +546,17 @@ out1:
void
rpcrdma_ia_close(struct rpcrdma_ia *ia)
{
- int rc;
-
dprintk("RPC: %s: entering\n", __func__);
- if (ia->ri_bind_mem != NULL) {
- rc = ib_dereg_mr(ia->ri_bind_mem);
- dprintk("RPC: %s: ib_dereg_mr returned %i\n",
- __func__, rc);
- }
if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
if (ia->ri_id->qp)
rdma_destroy_qp(ia->ri_id);
- rdma_destroy_id(ia->ri_id);
+ rpcrdma_destroy_id(ia->ri_id);
ia->ri_id = NULL;
}
- if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
- rc = ib_dealloc_pd(ia->ri_pd);
- dprintk("RPC: %s: ib_dealloc_pd returned %i\n",
- __func__, rc);
- }
+
+ /* If the pd is still busy, xprtrdma missed freeing a resource */
+ if (ia->ri_pd && !IS_ERR(ia->ri_pd))
+ ib_dealloc_pd(ia->ri_pd);
}
/*
@@ -724,35 +568,44 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
{
struct ib_device_attr *devattr = &ia->ri_devattr;
struct ib_cq *sendcq, *recvcq;
+ struct ib_cq_init_attr cq_attr = {};
+ unsigned int max_qp_wr;
int rc, err;
+ if (devattr->max_sge < RPCRDMA_MAX_IOVS) {
+ dprintk("RPC: %s: insufficient sge's available\n",
+ __func__);
+ return -ENOMEM;
+ }
+
+ if (devattr->max_qp_wr <= RPCRDMA_BACKWARD_WRS) {
+ dprintk("RPC: %s: insufficient wqe's available\n",
+ __func__);
+ return -ENOMEM;
+ }
+ max_qp_wr = devattr->max_qp_wr - RPCRDMA_BACKWARD_WRS;
+
/* check provider's send/recv wr limits */
- if (cdata->max_requests > devattr->max_qp_wr)
- cdata->max_requests = devattr->max_qp_wr;
+ if (cdata->max_requests > max_qp_wr)
+ cdata->max_requests = max_qp_wr;
ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
ep->rep_attr.qp_context = ep;
ep->rep_attr.srq = NULL;
ep->rep_attr.cap.max_send_wr = cdata->max_requests;
+ ep->rep_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS;
rc = ia->ri_ops->ro_open(ia, ep, cdata);
if (rc)
return rc;
ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
- ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
+ ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
+ ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_IOVS;
ep->rep_attr.cap.max_recv_sge = 1;
ep->rep_attr.cap.max_inline_data = 0;
ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
ep->rep_attr.qp_type = IB_QPT_RC;
ep->rep_attr.port_num = ~0;
- if (cdata->padding) {
- ep->rep_padbuf = rpcrdma_alloc_regbuf(ia, cdata->padding,
- GFP_KERNEL);
- if (IS_ERR(ep->rep_padbuf))
- return PTR_ERR(ep->rep_padbuf);
- } else
- ep->rep_padbuf = NULL;
-
dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
"iovs: send %d recv %d\n",
__func__,
@@ -771,9 +624,9 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
init_waitqueue_head(&ep->rep_connect_wait);
INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
- sendcq = ib_create_cq(ia->ri_id->device, rpcrdma_sendcq_upcall,
- rpcrdma_cq_async_error_upcall, ep,
- ep->rep_attr.cap.max_send_wr + 1, 0);
+ cq_attr.cqe = ep->rep_attr.cap.max_send_wr + 1;
+ sendcq = ib_create_cq(ia->ri_device, rpcrdma_sendcq_upcall,
+ rpcrdma_cq_async_error_upcall, NULL, &cq_attr);
if (IS_ERR(sendcq)) {
rc = PTR_ERR(sendcq);
dprintk("RPC: %s: failed to create send CQ: %i\n",
@@ -788,9 +641,9 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
goto out2;
}
- recvcq = ib_create_cq(ia->ri_id->device, rpcrdma_recvcq_upcall,
- rpcrdma_cq_async_error_upcall, ep,
- ep->rep_attr.cap.max_recv_wr + 1, 0);
+ cq_attr.cqe = ep->rep_attr.cap.max_recv_wr + 1;
+ recvcq = ib_create_cq(ia->ri_device, rpcrdma_recvcq_upcall,
+ rpcrdma_cq_async_error_upcall, NULL, &cq_attr);
if (IS_ERR(recvcq)) {
rc = PTR_ERR(recvcq);
dprintk("RPC: %s: failed to create recv CQ: %i\n",
@@ -835,7 +688,8 @@ out2:
dprintk("RPC: %s: ib_destroy_cq returned %i\n",
__func__, err);
out1:
- rpcrdma_free_regbuf(ia, ep->rep_padbuf);
+ if (ia->ri_dma_mr)
+ ib_dereg_mr(ia->ri_dma_mr);
return rc;
}
@@ -856,25 +710,32 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
cancel_delayed_work_sync(&ep->rep_connect_worker);
- if (ia->ri_id->qp) {
+ if (ia->ri_id->qp)
rpcrdma_ep_disconnect(ep, ia);
+
+ rpcrdma_clean_cq(ep->rep_attr.recv_cq);
+ rpcrdma_clean_cq(ep->rep_attr.send_cq);
+
+ if (ia->ri_id->qp) {
rdma_destroy_qp(ia->ri_id);
ia->ri_id->qp = NULL;
}
- rpcrdma_free_regbuf(ia, ep->rep_padbuf);
-
- rpcrdma_clean_cq(ep->rep_attr.recv_cq);
rc = ib_destroy_cq(ep->rep_attr.recv_cq);
if (rc)
dprintk("RPC: %s: ib_destroy_cq returned %i\n",
__func__, rc);
- rpcrdma_clean_cq(ep->rep_attr.send_cq);
rc = ib_destroy_cq(ep->rep_attr.send_cq);
if (rc)
dprintk("RPC: %s: ib_destroy_cq returned %i\n",
__func__, rc);
+
+ if (ia->ri_dma_mr) {
+ rc = ib_dereg_mr(ia->ri_dma_mr);
+ dprintk("RPC: %s: ib_dereg_mr returned %i\n",
+ __func__, rc);
+ }
}
/*
@@ -896,8 +757,6 @@ retry:
rpcrdma_flush_cqs(ep);
xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
- ia->ri_ops->ro_reset(xprt);
-
id = rpcrdma_create_id(xprt, ia,
(struct sockaddr *)&xprt->rx_data.addr);
if (IS_ERR(id)) {
@@ -911,10 +770,10 @@ retry:
* More stuff I haven't thought of!
* Rrrgh!
*/
- if (ia->ri_id->device != id->device) {
+ if (ia->ri_device != id->device) {
printk("RPC: %s: can't reconnect on "
"different device!\n", __func__);
- rdma_destroy_id(id);
+ rpcrdma_destroy_id(id);
rc = -ENETUNREACH;
goto out;
}
@@ -923,7 +782,7 @@ retry:
if (rc) {
dprintk("RPC: %s: rdma_create_qp failed %i\n",
__func__, rc);
- rdma_destroy_id(id);
+ rpcrdma_destroy_id(id);
rc = -ENETUNREACH;
goto out;
}
@@ -934,7 +793,7 @@ retry:
write_unlock(&ia->ri_qplock);
rdma_destroy_qp(old);
- rdma_destroy_id(old);
+ rpcrdma_destroy_id(old);
} else {
dprintk("RPC: %s: connecting...\n", __func__);
rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
@@ -983,7 +842,21 @@ retry:
}
rc = ep->rep_connected;
} else {
+ struct rpcrdma_xprt *r_xprt;
+ unsigned int extras;
+
dprintk("RPC: %s: connected\n", __func__);
+
+ r_xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
+ extras = r_xprt->rx_buf.rb_bc_srv_max_requests;
+
+ if (extras) {
+ rc = rpcrdma_ep_post_extra_recv(r_xprt, extras);
+ if (rc)
+ pr_warn("%s: rpcrdma_ep_post_extra_recv: %i\n",
+ __func__, rc);
+ rc = 0;
+ }
}
out:
@@ -1020,20 +893,25 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
}
}
-static struct rpcrdma_req *
+struct rpcrdma_req *
rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
{
+ struct rpcrdma_buffer *buffer = &r_xprt->rx_buf;
struct rpcrdma_req *req;
req = kzalloc(sizeof(*req), GFP_KERNEL);
if (req == NULL)
return ERR_PTR(-ENOMEM);
+ INIT_LIST_HEAD(&req->rl_free);
+ spin_lock(&buffer->rb_reqslock);
+ list_add(&req->rl_all, &buffer->rb_allreqs);
+ spin_unlock(&buffer->rb_reqslock);
req->rl_buffer = &r_xprt->rx_buf;
return req;
}
-static struct rpcrdma_rep *
+struct rpcrdma_rep *
rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
{
struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
@@ -1053,7 +931,9 @@ rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
goto out_free;
}
- rep->rr_buffer = &r_xprt->rx_buf;
+ rep->rr_device = ia->ri_device;
+ rep->rr_rxprt = r_xprt;
+ INIT_WORK(&rep->rr_work, rpcrdma_receive_worker);
return rep;
out_free:
@@ -1067,44 +947,21 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
{
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
- char *p;
- size_t len;
int i, rc;
- buf->rb_max_requests = cdata->max_requests;
+ buf->rb_max_requests = r_xprt->rx_data.max_requests;
+ buf->rb_bc_srv_max_requests = 0;
spin_lock_init(&buf->rb_lock);
- /* Need to allocate:
- * 1. arrays for send and recv pointers
- * 2. arrays of struct rpcrdma_req to fill in pointers
- * 3. array of struct rpcrdma_rep for replies
- * Send/recv buffers in req/rep need to be registered
- */
- len = buf->rb_max_requests *
- (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
-
- p = kzalloc(len, GFP_KERNEL);
- if (p == NULL) {
- dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
- __func__, len);
- rc = -ENOMEM;
- goto out;
- }
- buf->rb_pool = p; /* for freeing it later */
-
- buf->rb_send_bufs = (struct rpcrdma_req **) p;
- p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
- buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
- p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
-
rc = ia->ri_ops->ro_init(r_xprt);
if (rc)
goto out;
+ INIT_LIST_HEAD(&buf->rb_send_bufs);
+ INIT_LIST_HEAD(&buf->rb_allreqs);
+ spin_lock_init(&buf->rb_reqslock);
for (i = 0; i < buf->rb_max_requests; i++) {
struct rpcrdma_req *req;
- struct rpcrdma_rep *rep;
req = rpcrdma_create_req(r_xprt);
if (IS_ERR(req)) {
@@ -1113,7 +970,13 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
rc = PTR_ERR(req);
goto out;
}
- buf->rb_send_bufs[i] = req;
+ req->rl_backchannel = false;
+ list_add(&req->rl_free, &buf->rb_send_bufs);
+ }
+
+ INIT_LIST_HEAD(&buf->rb_recv_bufs);
+ for (i = 0; i < buf->rb_max_requests + 2; i++) {
+ struct rpcrdma_rep *rep;
rep = rpcrdma_create_rep(r_xprt);
if (IS_ERR(rep)) {
@@ -1122,7 +985,7 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
rc = PTR_ERR(rep);
goto out;
}
- buf->rb_recv_bufs[i] = rep;
+ list_add(&rep->rr_list, &buf->rb_recv_bufs);
}
return 0;
@@ -1131,22 +994,38 @@ out:
return rc;
}
+static struct rpcrdma_req *
+rpcrdma_buffer_get_req_locked(struct rpcrdma_buffer *buf)
+{
+ struct rpcrdma_req *req;
+
+ req = list_first_entry(&buf->rb_send_bufs,
+ struct rpcrdma_req, rl_free);
+ list_del(&req->rl_free);
+ return req;
+}
+
+static struct rpcrdma_rep *
+rpcrdma_buffer_get_rep_locked(struct rpcrdma_buffer *buf)
+{
+ struct rpcrdma_rep *rep;
+
+ rep = list_first_entry(&buf->rb_recv_bufs,
+ struct rpcrdma_rep, rr_list);
+ list_del(&rep->rr_list);
+ return rep;
+}
+
static void
rpcrdma_destroy_rep(struct rpcrdma_ia *ia, struct rpcrdma_rep *rep)
{
- if (!rep)
- return;
-
rpcrdma_free_regbuf(ia, rep->rr_rdmabuf);
kfree(rep);
}
-static void
+void
rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
{
- if (!req)
- return;
-
rpcrdma_free_regbuf(ia, req->rl_sendbuf);
rpcrdma_free_regbuf(ia, req->rl_rdmabuf);
kfree(req);
@@ -1156,220 +1035,88 @@ void
rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
{
struct rpcrdma_ia *ia = rdmab_to_ia(buf);
- int i;
-
- /* clean up in reverse order from create
- * 1. recv mr memory (mr free, then kfree)
- * 2. send mr memory (mr free, then kfree)
- * 3. MWs
- */
- dprintk("RPC: %s: entering\n", __func__);
-
- for (i = 0; i < buf->rb_max_requests; i++) {
- if (buf->rb_recv_bufs)
- rpcrdma_destroy_rep(ia, buf->rb_recv_bufs[i]);
- if (buf->rb_send_bufs)
- rpcrdma_destroy_req(ia, buf->rb_send_bufs[i]);
- }
-
- ia->ri_ops->ro_destroy(buf);
- kfree(buf->rb_pool);
-}
+ while (!list_empty(&buf->rb_recv_bufs)) {
+ struct rpcrdma_rep *rep;
-/* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving
- * some req segments uninitialized.
- */
-static void
-rpcrdma_buffer_put_mr(struct rpcrdma_mw **mw, struct rpcrdma_buffer *buf)
-{
- if (*mw) {
- list_add_tail(&(*mw)->mw_list, &buf->rb_mws);
- *mw = NULL;
+ rep = rpcrdma_buffer_get_rep_locked(buf);
+ rpcrdma_destroy_rep(ia, rep);
}
-}
-/* Cycle mw's back in reverse order, and "spin" them.
- * This delays and scrambles reuse as much as possible.
- */
-static void
-rpcrdma_buffer_put_mrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
-{
- struct rpcrdma_mr_seg *seg = req->rl_segments;
- struct rpcrdma_mr_seg *seg1 = seg;
- int i;
+ spin_lock(&buf->rb_reqslock);
+ while (!list_empty(&buf->rb_allreqs)) {
+ struct rpcrdma_req *req;
- for (i = 1, seg++; i < RPCRDMA_MAX_SEGS; seg++, i++)
- rpcrdma_buffer_put_mr(&seg->rl_mw, buf);
- rpcrdma_buffer_put_mr(&seg1->rl_mw, buf);
-}
+ req = list_first_entry(&buf->rb_allreqs,
+ struct rpcrdma_req, rl_all);
+ list_del(&req->rl_all);
-static void
-rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
-{
- buf->rb_send_bufs[--buf->rb_send_index] = req;
- req->rl_niovs = 0;
- if (req->rl_reply) {
- buf->rb_recv_bufs[--buf->rb_recv_index] = req->rl_reply;
- req->rl_reply->rr_func = NULL;
- req->rl_reply = NULL;
+ spin_unlock(&buf->rb_reqslock);
+ rpcrdma_destroy_req(ia, req);
+ spin_lock(&buf->rb_reqslock);
}
-}
-
-/* rpcrdma_unmap_one() was already done during deregistration.
- * Redo only the ib_post_send().
- */
-static void
-rpcrdma_retry_local_inv(struct rpcrdma_mw *r, struct rpcrdma_ia *ia)
-{
- struct rpcrdma_xprt *r_xprt =
- container_of(ia, struct rpcrdma_xprt, rx_ia);
- struct ib_send_wr invalidate_wr, *bad_wr;
- int rc;
-
- dprintk("RPC: %s: FRMR %p is stale\n", __func__, r);
-
- /* When this FRMR is re-inserted into rb_mws, it is no longer stale */
- r->r.frmr.fr_state = FRMR_IS_INVALID;
-
- memset(&invalidate_wr, 0, sizeof(invalidate_wr));
- invalidate_wr.wr_id = (unsigned long)(void *)r;
- invalidate_wr.opcode = IB_WR_LOCAL_INV;
- invalidate_wr.ex.invalidate_rkey = r->r.frmr.fr_mr->rkey;
- DECR_CQCOUNT(&r_xprt->rx_ep);
-
- dprintk("RPC: %s: frmr %p invalidating rkey %08x\n",
- __func__, r, r->r.frmr.fr_mr->rkey);
+ spin_unlock(&buf->rb_reqslock);
- read_lock(&ia->ri_qplock);
- rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
- read_unlock(&ia->ri_qplock);
- if (rc) {
- /* Force rpcrdma_buffer_get() to retry */
- r->r.frmr.fr_state = FRMR_IS_STALE;
- dprintk("RPC: %s: ib_post_send failed, %i\n",
- __func__, rc);
- }
+ ia->ri_ops->ro_destroy(buf);
}
-static void
-rpcrdma_retry_flushed_linv(struct list_head *stale,
- struct rpcrdma_buffer *buf)
+struct rpcrdma_mw *
+rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt)
{
- struct rpcrdma_ia *ia = rdmab_to_ia(buf);
- struct list_head *pos;
- struct rpcrdma_mw *r;
- unsigned long flags;
-
- list_for_each(pos, stale) {
- r = list_entry(pos, struct rpcrdma_mw, mw_list);
- rpcrdma_retry_local_inv(r, ia);
- }
-
- spin_lock_irqsave(&buf->rb_lock, flags);
- list_splice_tail(stale, &buf->rb_mws);
- spin_unlock_irqrestore(&buf->rb_lock, flags);
-}
+ struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+ struct rpcrdma_mw *mw = NULL;
-static struct rpcrdma_req *
-rpcrdma_buffer_get_frmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf,
- struct list_head *stale)
-{
- struct rpcrdma_mw *r;
- int i;
-
- i = RPCRDMA_MAX_SEGS - 1;
- while (!list_empty(&buf->rb_mws)) {
- r = list_entry(buf->rb_mws.next,
- struct rpcrdma_mw, mw_list);
- list_del(&r->mw_list);
- if (r->r.frmr.fr_state == FRMR_IS_STALE) {
- list_add(&r->mw_list, stale);
- continue;
- }
- req->rl_segments[i].rl_mw = r;
- if (unlikely(i-- == 0))
- return req; /* Success */
+ spin_lock(&buf->rb_mwlock);
+ if (!list_empty(&buf->rb_mws)) {
+ mw = list_first_entry(&buf->rb_mws,
+ struct rpcrdma_mw, mw_list);
+ list_del_init(&mw->mw_list);
}
+ spin_unlock(&buf->rb_mwlock);
- /* Not enough entries on rb_mws for this req */
- rpcrdma_buffer_put_sendbuf(req, buf);
- rpcrdma_buffer_put_mrs(req, buf);
- return NULL;
+ if (!mw)
+ pr_err("RPC: %s: no MWs available\n", __func__);
+ return mw;
}
-static struct rpcrdma_req *
-rpcrdma_buffer_get_fmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
+void
+rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw)
{
- struct rpcrdma_mw *r;
- int i;
-
- i = RPCRDMA_MAX_SEGS - 1;
- while (!list_empty(&buf->rb_mws)) {
- r = list_entry(buf->rb_mws.next,
- struct rpcrdma_mw, mw_list);
- list_del(&r->mw_list);
- req->rl_segments[i].rl_mw = r;
- if (unlikely(i-- == 0))
- return req; /* Success */
- }
+ struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
- /* Not enough entries on rb_mws for this req */
- rpcrdma_buffer_put_sendbuf(req, buf);
- rpcrdma_buffer_put_mrs(req, buf);
- return NULL;
+ spin_lock(&buf->rb_mwlock);
+ list_add_tail(&mw->mw_list, &buf->rb_mws);
+ spin_unlock(&buf->rb_mwlock);
}
/*
* Get a set of request/reply buffers.
*
- * Reply buffer (if needed) is attached to send buffer upon return.
- * Rule:
- * rb_send_index and rb_recv_index MUST always be pointing to the
- * *next* available buffer (non-NULL). They are incremented after
- * removing buffers, and decremented *before* returning them.
+ * Reply buffer (if available) is attached to send buffer upon return.
*/
struct rpcrdma_req *
rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
{
- struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
- struct list_head stale;
struct rpcrdma_req *req;
- unsigned long flags;
-
- spin_lock_irqsave(&buffers->rb_lock, flags);
- if (buffers->rb_send_index == buffers->rb_max_requests) {
- spin_unlock_irqrestore(&buffers->rb_lock, flags);
- dprintk("RPC: %s: out of request buffers\n", __func__);
- return ((struct rpcrdma_req *)NULL);
- }
- req = buffers->rb_send_bufs[buffers->rb_send_index];
- if (buffers->rb_send_index < buffers->rb_recv_index) {
- dprintk("RPC: %s: %d extra receives outstanding (ok)\n",
- __func__,
- buffers->rb_recv_index - buffers->rb_send_index);
- req->rl_reply = NULL;
- } else {
- req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
- buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
- }
- buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
+ spin_lock(&buffers->rb_lock);
+ if (list_empty(&buffers->rb_send_bufs))
+ goto out_reqbuf;
+ req = rpcrdma_buffer_get_req_locked(buffers);
+ if (list_empty(&buffers->rb_recv_bufs))
+ goto out_repbuf;
+ req->rl_reply = rpcrdma_buffer_get_rep_locked(buffers);
+ spin_unlock(&buffers->rb_lock);
+ return req;
- INIT_LIST_HEAD(&stale);
- switch (ia->ri_memreg_strategy) {
- case RPCRDMA_FRMR:
- req = rpcrdma_buffer_get_frmrs(req, buffers, &stale);
- break;
- case RPCRDMA_MTHCAFMR:
- req = rpcrdma_buffer_get_fmrs(req, buffers);
- break;
- default:
- break;
- }
- spin_unlock_irqrestore(&buffers->rb_lock, flags);
- if (!list_empty(&stale))
- rpcrdma_retry_flushed_linv(&stale, buffers);
+out_reqbuf:
+ spin_unlock(&buffers->rb_lock);
+ pr_warn("RPC: %s: out of request buffers\n", __func__);
+ return NULL;
+out_repbuf:
+ spin_unlock(&buffers->rb_lock);
+ pr_warn("RPC: %s: out of reply buffers\n", __func__);
+ req->rl_reply = NULL;
return req;
}
@@ -1381,39 +1128,31 @@ void
rpcrdma_buffer_put(struct rpcrdma_req *req)
{
struct rpcrdma_buffer *buffers = req->rl_buffer;
- struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
- unsigned long flags;
+ struct rpcrdma_rep *rep = req->rl_reply;
- spin_lock_irqsave(&buffers->rb_lock, flags);
- rpcrdma_buffer_put_sendbuf(req, buffers);
- switch (ia->ri_memreg_strategy) {
- case RPCRDMA_FRMR:
- case RPCRDMA_MTHCAFMR:
- rpcrdma_buffer_put_mrs(req, buffers);
- break;
- default:
- break;
- }
- spin_unlock_irqrestore(&buffers->rb_lock, flags);
+ req->rl_niovs = 0;
+ req->rl_reply = NULL;
+
+ spin_lock(&buffers->rb_lock);
+ list_add_tail(&req->rl_free, &buffers->rb_send_bufs);
+ if (rep)
+ list_add_tail(&rep->rr_list, &buffers->rb_recv_bufs);
+ spin_unlock(&buffers->rb_lock);
}
/*
* Recover reply buffers from pool.
- * This happens when recovering from error conditions.
- * Post-increment counter/array index.
+ * This happens when recovering from disconnect.
*/
void
rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
{
struct rpcrdma_buffer *buffers = req->rl_buffer;
- unsigned long flags;
- spin_lock_irqsave(&buffers->rb_lock, flags);
- if (buffers->rb_recv_index < buffers->rb_max_requests) {
- req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
- buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
- }
- spin_unlock_irqrestore(&buffers->rb_lock, flags);
+ spin_lock(&buffers->rb_lock);
+ if (!list_empty(&buffers->rb_recv_bufs))
+ req->rl_reply = rpcrdma_buffer_get_rep_locked(buffers);
+ spin_unlock(&buffers->rb_lock);
}
/*
@@ -1423,13 +1162,11 @@ rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
void
rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
{
- struct rpcrdma_buffer *buffers = rep->rr_buffer;
- unsigned long flags;
+ struct rpcrdma_buffer *buffers = &rep->rr_rxprt->rx_buf;
- rep->rr_func = NULL;
- spin_lock_irqsave(&buffers->rb_lock, flags);
- buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
- spin_unlock_irqrestore(&buffers->rb_lock, flags);
+ spin_lock(&buffers->rb_lock);
+ list_add_tail(&rep->rr_list, &buffers->rb_recv_bufs);
+ spin_unlock(&buffers->rb_lock);
}
/*
@@ -1444,75 +1181,6 @@ rpcrdma_mapping_error(struct rpcrdma_mr_seg *seg)
(unsigned long long)seg->mr_dma, seg->mr_dmalen);
}
-static int
-rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
- struct ib_mr **mrp, struct ib_sge *iov)
-{
- struct ib_phys_buf ipb;
- struct ib_mr *mr;
- int rc;
-
- /*
- * All memory passed here was kmalloc'ed, therefore phys-contiguous.
- */
- iov->addr = ib_dma_map_single(ia->ri_id->device,
- va, len, DMA_BIDIRECTIONAL);
- if (ib_dma_mapping_error(ia->ri_id->device, iov->addr))
- return -ENOMEM;
-
- iov->length = len;
-
- if (ia->ri_have_dma_lkey) {
- *mrp = NULL;
- iov->lkey = ia->ri_dma_lkey;
- return 0;
- } else if (ia->ri_bind_mem != NULL) {
- *mrp = NULL;
- iov->lkey = ia->ri_bind_mem->lkey;
- return 0;
- }
-
- ipb.addr = iov->addr;
- ipb.size = iov->length;
- mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
- IB_ACCESS_LOCAL_WRITE, &iov->addr);
-
- dprintk("RPC: %s: phys convert: 0x%llx "
- "registered 0x%llx length %d\n",
- __func__, (unsigned long long)ipb.addr,
- (unsigned long long)iov->addr, len);
-
- if (IS_ERR(mr)) {
- *mrp = NULL;
- rc = PTR_ERR(mr);
- dprintk("RPC: %s: failed with %i\n", __func__, rc);
- } else {
- *mrp = mr;
- iov->lkey = mr->lkey;
- rc = 0;
- }
-
- return rc;
-}
-
-static int
-rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
- struct ib_mr *mr, struct ib_sge *iov)
-{
- int rc;
-
- ib_dma_unmap_single(ia->ri_id->device,
- iov->addr, iov->length, DMA_BIDIRECTIONAL);
-
- if (NULL == mr)
- return 0;
-
- rc = ib_dereg_mr(mr);
- if (rc)
- dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc);
- return rc;
-}
-
/**
* rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers
* @ia: controlling rpcrdma_ia
@@ -1532,26 +1200,29 @@ struct rpcrdma_regbuf *
rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags)
{
struct rpcrdma_regbuf *rb;
- int rc;
+ struct ib_sge *iov;
- rc = -ENOMEM;
rb = kmalloc(sizeof(*rb) + size, flags);
if (rb == NULL)
goto out;
- rb->rg_size = size;
- rb->rg_owner = NULL;
- rc = rpcrdma_register_internal(ia, rb->rg_base, size,
- &rb->rg_mr, &rb->rg_iov);
- if (rc)
+ iov = &rb->rg_iov;
+ iov->addr = ib_dma_map_single(ia->ri_device,
+ (void *)rb->rg_base, size,
+ DMA_BIDIRECTIONAL);
+ if (ib_dma_mapping_error(ia->ri_device, iov->addr))
goto out_free;
+ iov->length = size;
+ iov->lkey = ia->ri_pd->local_dma_lkey;
+ rb->rg_size = size;
+ rb->rg_owner = NULL;
return rb;
out_free:
kfree(rb);
out:
- return ERR_PTR(rc);
+ return ERR_PTR(-ENOMEM);
}
/**
@@ -1562,10 +1233,15 @@ out:
void
rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
{
- if (rb) {
- rpcrdma_deregister_internal(ia, rb->rg_mr, &rb->rg_iov);
- kfree(rb);
- }
+ struct ib_sge *iov;
+
+ if (!rb)
+ return;
+
+ iov = &rb->rg_iov;
+ ib_dma_unmap_single(ia->ri_device,
+ iov->addr, iov->length, DMA_BIDIRECTIONAL);
+ kfree(rb);
}
/*
@@ -1578,9 +1254,11 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
struct rpcrdma_ep *ep,
struct rpcrdma_req *req)
{
+ struct ib_device *device = ia->ri_device;
struct ib_send_wr send_wr, *send_wr_fail;
struct rpcrdma_rep *rep = req->rl_reply;
- int rc;
+ struct ib_sge *iov = req->rl_send_iov;
+ int i, rc;
if (rep) {
rc = rpcrdma_ep_post_recv(ia, ep, rep);
@@ -1591,19 +1269,15 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
send_wr.next = NULL;
send_wr.wr_id = RPCRDMA_IGNORE_COMPLETION;
- send_wr.sg_list = req->rl_send_iov;
+ send_wr.sg_list = iov;
send_wr.num_sge = req->rl_niovs;
send_wr.opcode = IB_WR_SEND;
- if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */
- ib_dma_sync_single_for_device(ia->ri_id->device,
- req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
- DMA_TO_DEVICE);
- ib_dma_sync_single_for_device(ia->ri_id->device,
- req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
- DMA_TO_DEVICE);
- ib_dma_sync_single_for_device(ia->ri_id->device,
- req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
- DMA_TO_DEVICE);
+
+ for (i = 0; i < send_wr.num_sge; i++)
+ ib_dma_sync_single_for_device(device, iov[i].addr,
+ iov[i].length, DMA_TO_DEVICE);
+ dprintk("RPC: %s: posting %d s/g entries\n",
+ __func__, send_wr.num_sge);
if (DECR_CQCOUNT(ep) > 0)
send_wr.send_flags = 0;
@@ -1636,7 +1310,7 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
recv_wr.num_sge = 1;
- ib_dma_sync_single_for_cpu(ia->ri_id->device,
+ ib_dma_sync_single_for_cpu(ia->ri_device,
rdmab_addr(rep->rr_rdmabuf),
rdmab_length(rep->rr_rdmabuf),
DMA_BIDIRECTIONAL);
@@ -1649,6 +1323,47 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
return rc;
}
+/**
+ * rpcrdma_ep_post_extra_recv - Post buffers for incoming backchannel requests
+ * @r_xprt: transport associated with these backchannel resources
+ * @count: minimum number of incoming requests expected
+ *
+ * Returns zero if all requested buffers were posted, or a negative errno.
+ */
+int
+rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *r_xprt, unsigned int count)
+{
+ struct rpcrdma_buffer *buffers = &r_xprt->rx_buf;
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ struct rpcrdma_ep *ep = &r_xprt->rx_ep;
+ struct rpcrdma_rep *rep;
+ unsigned long flags;
+ int rc;
+
+ while (count--) {
+ spin_lock_irqsave(&buffers->rb_lock, flags);
+ if (list_empty(&buffers->rb_recv_bufs))
+ goto out_reqbuf;
+ rep = rpcrdma_buffer_get_rep_locked(buffers);
+ spin_unlock_irqrestore(&buffers->rb_lock, flags);
+
+ rc = rpcrdma_ep_post_recv(ia, ep, rep);
+ if (rc)
+ goto out_rc;
+ }
+
+ return 0;
+
+out_reqbuf:
+ spin_unlock_irqrestore(&buffers->rb_lock, flags);
+ pr_warn("%s: no extra receive buffers\n", __func__);
+ return -ENOMEM;
+
+out_rc:
+ rpcrdma_recv_buffer_put(rep);
+ return rc;
+}
+
/* How many chunk list items fit within our inline buffers?
*/
unsigned int
diff --git a/kernel/net/sunrpc/xprtrdma/xprt_rdma.h b/kernel/net/sunrpc/xprtrdma/xprt_rdma.h
index 78e0b8bea..ac7f8d4f6 100644
--- a/kernel/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/kernel/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -51,7 +51,6 @@
#include <linux/sunrpc/clnt.h> /* rpc_xprt */
#include <linux/sunrpc/rpc_rdma.h> /* RPC/RDMA protocol */
#include <linux/sunrpc/xprtrdma.h> /* xprt parameters */
-#include <linux/sunrpc/svc.h> /* RPCSVC_MAXPAYLOAD */
#define RDMA_RESOLVE_TIMEOUT (5000) /* 5 seconds */
#define RDMA_CONNECT_RETRY_MAX (2) /* retries if no listener backlog */
@@ -62,14 +61,12 @@
struct rpcrdma_ia {
const struct rpcrdma_memreg_ops *ri_ops;
rwlock_t ri_qplock;
+ struct ib_device *ri_device;
struct rdma_cm_id *ri_id;
struct ib_pd *ri_pd;
- struct ib_mr *ri_bind_mem;
- u32 ri_dma_lkey;
- int ri_have_dma_lkey;
+ struct ib_mr *ri_dma_mr;
struct completion ri_done;
int ri_async_rc;
- enum rpcrdma_memreg ri_memreg_strategy;
unsigned int ri_max_frmr_depth;
struct ib_device_attr ri_devattr;
struct ib_qp_attr ri_qp_attr;
@@ -80,21 +77,15 @@ struct rpcrdma_ia {
* RDMA Endpoint -- one per transport instance
*/
-#define RPCRDMA_WC_BUDGET (128)
-#define RPCRDMA_POLLSIZE (16)
-
struct rpcrdma_ep {
atomic_t rep_cqcount;
int rep_cqinit;
int rep_connected;
struct ib_qp_init_attr rep_attr;
wait_queue_head_t rep_connect_wait;
- struct rpcrdma_regbuf *rep_padbuf;
struct rdma_conn_param rep_remote_cma;
struct sockaddr_storage rep_remote_addr;
struct delayed_work rep_connect_worker;
- struct ib_wc rep_send_wcs[RPCRDMA_POLLSIZE];
- struct ib_wc rep_recv_wcs[RPCRDMA_POLLSIZE];
};
/*
@@ -110,6 +101,16 @@ struct rpcrdma_ep {
*/
#define RPCRDMA_IGNORE_COMPLETION (0ULL)
+/* Pre-allocate extra Work Requests for handling backward receives
+ * and sends. This is a fixed value because the Work Queues are
+ * allocated when the forward channel is set up.
+ */
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+#define RPCRDMA_BACKWARD_WRS (8)
+#else
+#define RPCRDMA_BACKWARD_WRS (0)
+#endif
+
/* Registered buffer -- registered kmalloc'd memory for RDMA SEND/RECV
*
* The below structure appears at the front of a large region of kmalloc'd
@@ -119,7 +120,6 @@ struct rpcrdma_ep {
struct rpcrdma_regbuf {
size_t rg_size;
struct rpcrdma_req *rg_owner;
- struct ib_mr *rg_mr;
struct ib_sge rg_iov;
__be32 rg_base[0] __attribute__ ((aligned(256)));
};
@@ -165,21 +165,22 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
* struct rpcrdma_buffer. N is the max number of outstanding requests.
*/
-/* temporary static scatter/gather max */
-#define RPCRDMA_MAX_DATA_SEGS (64) /* max scatter/gather */
+#define RPCRDMA_MAX_DATA_SEGS ((1 * 1024 * 1024) / PAGE_SIZE)
#define RPCRDMA_MAX_SEGS (RPCRDMA_MAX_DATA_SEGS + 2) /* head+tail = 2 */
struct rpcrdma_buffer;
struct rpcrdma_rep {
unsigned int rr_len;
- struct rpcrdma_buffer *rr_buffer;
- struct rpc_xprt *rr_xprt;
- void (*rr_func)(struct rpcrdma_rep *);
+ struct ib_device *rr_device;
+ struct rpcrdma_xprt *rr_rxprt;
+ struct work_struct rr_work;
struct list_head rr_list;
struct rpcrdma_regbuf *rr_rdmabuf;
};
+#define RPCRDMA_BAD_LEN (~0U)
+
/*
* struct rpcrdma_mw - external memory region metadata
*
@@ -200,14 +201,22 @@ enum rpcrdma_frmr_state {
};
struct rpcrdma_frmr {
- struct ib_fast_reg_page_list *fr_pgl;
+ struct scatterlist *sg;
+ int sg_nents;
struct ib_mr *fr_mr;
enum rpcrdma_frmr_state fr_state;
+ struct work_struct fr_work;
+ struct rpcrdma_xprt *fr_xprt;
+};
+
+struct rpcrdma_fmr {
+ struct ib_fmr *fmr;
+ u64 *physaddrs;
};
struct rpcrdma_mw {
union {
- struct ib_fmr *fmr;
+ struct rpcrdma_fmr fmr;
struct rpcrdma_frmr frmr;
} r;
void (*mw_sendcompletion)(struct ib_wc *);
@@ -252,16 +261,22 @@ struct rpcrdma_mr_seg { /* chunk descriptors */
char *mr_offset; /* kva if no page, else offset */
};
+#define RPCRDMA_MAX_IOVS (2)
+
struct rpcrdma_req {
- unsigned int rl_niovs; /* 0, 2 or 4 */
- unsigned int rl_nchunks; /* non-zero if chunks */
- unsigned int rl_connect_cookie; /* retry detection */
- struct rpcrdma_buffer *rl_buffer; /* home base for this structure */
+ struct list_head rl_free;
+ unsigned int rl_niovs;
+ unsigned int rl_nchunks;
+ unsigned int rl_connect_cookie;
+ struct rpcrdma_buffer *rl_buffer;
struct rpcrdma_rep *rl_reply;/* holder for reply buffer */
- struct ib_sge rl_send_iov[4]; /* for active requests */
- struct rpcrdma_regbuf *rl_rdmabuf;
- struct rpcrdma_regbuf *rl_sendbuf;
- struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];
+ struct ib_sge rl_send_iov[RPCRDMA_MAX_IOVS];
+ struct rpcrdma_regbuf *rl_rdmabuf;
+ struct rpcrdma_regbuf *rl_sendbuf;
+ struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];
+
+ struct list_head rl_all;
+ bool rl_backchannel;
};
static inline struct rpcrdma_req *
@@ -281,15 +296,19 @@ rpcr_to_rdmar(struct rpc_rqst *rqst)
* One of these is associated with a transport instance
*/
struct rpcrdma_buffer {
- spinlock_t rb_lock; /* protects indexes */
- u32 rb_max_requests;/* client max requests */
- struct list_head rb_mws; /* optional memory windows/fmrs/frmrs */
- struct list_head rb_all;
- int rb_send_index;
- struct rpcrdma_req **rb_send_bufs;
- int rb_recv_index;
- struct rpcrdma_rep **rb_recv_bufs;
- char *rb_pool;
+ spinlock_t rb_mwlock; /* protect rb_mws list */
+ struct list_head rb_mws;
+ struct list_head rb_all;
+ char *rb_pool;
+
+ spinlock_t rb_lock; /* protect buf lists */
+ struct list_head rb_send_bufs;
+ struct list_head rb_recv_bufs;
+ u32 rb_max_requests;
+
+ u32 rb_bc_srv_max_requests;
+ spinlock_t rb_reqslock; /* protect rb_allreqs */
+ struct list_head rb_allreqs;
};
#define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia)
@@ -334,6 +353,8 @@ struct rpcrdma_stats {
unsigned long hardway_register_count;
unsigned long failed_marshal_count;
unsigned long bad_reply_count;
+ unsigned long nomsg_call_count;
+ unsigned long bcall_count;
};
/*
@@ -350,7 +371,6 @@ struct rpcrdma_memreg_ops {
struct rpcrdma_create_data_internal *);
size_t (*ro_maxpages)(struct rpcrdma_xprt *);
int (*ro_init)(struct rpcrdma_xprt *);
- void (*ro_reset)(struct rpcrdma_xprt *);
void (*ro_destroy)(struct rpcrdma_buffer *);
const char *ro_displayname;
};
@@ -410,9 +430,14 @@ int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_ep *,
/*
* Buffer calls - xprtrdma/verbs.c
*/
+struct rpcrdma_req *rpcrdma_create_req(struct rpcrdma_xprt *);
+struct rpcrdma_rep *rpcrdma_create_rep(struct rpcrdma_xprt *);
+void rpcrdma_destroy_req(struct rpcrdma_ia *, struct rpcrdma_req *);
int rpcrdma_buffer_create(struct rpcrdma_xprt *);
void rpcrdma_buffer_destroy(struct rpcrdma_buffer *);
+struct rpcrdma_mw *rpcrdma_get_mw(struct rpcrdma_xprt *);
+void rpcrdma_put_mw(struct rpcrdma_xprt *, struct rpcrdma_mw *);
struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *);
void rpcrdma_buffer_put(struct rpcrdma_req *);
void rpcrdma_recv_buffer_get(struct rpcrdma_req *);
@@ -424,6 +449,13 @@ void rpcrdma_free_regbuf(struct rpcrdma_ia *,
struct rpcrdma_regbuf *);
unsigned int rpcrdma_max_segments(struct rpcrdma_xprt *);
+int rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *, unsigned int);
+
+int frwr_alloc_recovery_wq(void);
+void frwr_destroy_recovery_wq(void);
+
+int rpcrdma_alloc_wq(void);
+void rpcrdma_destroy_wq(void);
/*
* Wrappers for chunk registration, shared by read/write chunk code.
@@ -480,6 +512,23 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *);
*/
int rpcrdma_marshal_req(struct rpc_rqst *);
+/* RPC/RDMA module init - xprtrdma/transport.c
+ */
+int xprt_rdma_init(void);
+void xprt_rdma_cleanup(void);
+
+/* Backchannel calls - xprtrdma/backchannel.c
+ */
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+int xprt_rdma_bc_setup(struct rpc_xprt *, unsigned int);
+int xprt_rdma_bc_up(struct svc_serv *, struct net *);
+int rpcrdma_bc_post_recv(struct rpcrdma_xprt *, unsigned int);
+void rpcrdma_bc_receive_call(struct rpcrdma_xprt *, struct rpcrdma_rep *);
+int rpcrdma_bc_marshal_reply(struct rpc_rqst *);
+void xprt_rdma_bc_free_rqst(struct rpc_rqst *);
+void xprt_rdma_bc_destroy(struct rpc_xprt *, unsigned int);
+#endif /* CONFIG_SUNRPC_BACKCHANNEL */
+
/* Temporary NFS request map cache. Created in svc_rdma.c */
extern struct kmem_cache *svc_rdma_map_cachep;
/* WR context cache. Created in svc_rdma.c */
@@ -487,10 +536,4 @@ extern struct kmem_cache *svc_rdma_ctxt_cachep;
/* Workqueue created in svc_rdma.c */
extern struct workqueue_struct *svc_rdma_wq;
-#if RPCSVC_MAXPAYLOAD < (RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT)
-#define RPCSVC_MAXPAYLOAD_RDMA RPCSVC_MAXPAYLOAD
-#else
-#define RPCSVC_MAXPAYLOAD_RDMA (RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT)
-#endif
-
#endif /* _LINUX_SUNRPC_XPRT_RDMA_H */